colly.png
colly 是一款快速優雅的 golang 爬蟲框架，簡單易用，功能完備。
官網地址：http://go-colly.org/
包地址:import "github.com/gocolly/colly"
一個簡單的例子：
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main crawls http://go-colly.org/ and follows the href of every anchor
// element found on each fetched page, printing every URL before it is
// requested.
func main() {
	c := colly.NewCollector()

	// Follow every link on the page.
	c.OnHTML("a", func(e *colly.HTMLElement) {
		// Per-link failures are deliberately ignored so that one bad link
		// does not stop the crawl.
		_ = e.Request.Visit(e.Attr("href"))
	})

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL)
	})

	// Report a failure of the entry request instead of dropping it silently.
	if err := c.Visit("http://go-colly.org/"); err != nil {
		fmt.Println(err.Error())
	}
}
使用方式概括下來主要有三步：
- 創建一個采集器
- 注冊回調函數
- 訪問具體網站
創建采集器時可以指定一些配置參數，如 useragent、爬取深度及日志等
// Collector options used here: a custom User-Agent string, a maximum depth
// of 1, and a log debugger.
colly.NewCollector(colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
colly.MaxDepth(1),
colly.Debugger(&debug.LogDebugger{}))
回調函數共有7種
名稱 | 說明 | 參數1 | 參數2 |
---|---|---|---|
OnRequest | 請求前調用 | *colly.Request | |
OnError | 請求發生錯誤時調用 | *colly.Response | error |
OnResponseHeaders | 收到響應頭后調用 | *colly.Response | |
OnResponse | 收到響應后調用 | *colly.Response | |
OnHTML | 響應內容是HTML時調用 | goquery（CSS）選擇器 | func(e *colly.HTMLElement) |
OnXML | 響應內容是XML時調用 | xpath表達式 | func(e *colly.XMLElement) |
OnScraped | 在OnXML之后調用 | func(r *colly.Response) | |
OnHTML 回調可以注冊多個，匹配不同的選擇器（OnHTML 使用 goquery/CSS 選擇器，xpath 表達式用于 OnXML）
1. 爬取簡書首頁文章列表
通過瀏覽器開發者工具查看 jianshu.com 結構如下
colly-jianshu-dom.png
文章列表為 ul 標簽，中間每一項是 li 標簽，li 中包含 content，content 中包含 title、abstract 和 meta 標簽
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
)
// main fetches the Jianshu home page and prints the title, link and abstract
// of every entry in its article list.
func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Selector of the title anchor inside one list entry; it yields both the
	// article link and the article title.
	const titleSelector = "div[class='content'] > a[class='title']"

	// Article list.
	c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
		// Each entry of the list.
		e.ForEach("li", func(i int, item *colly.HTMLElement) {
			// Article link.
			href := item.ChildAttr(titleSelector, "href")
			// Article title.
			title := item.ChildText(titleSelector)
			// Article abstract.
			summary := item.ChildText("div[class='content'] > p[class='abstract']")

			fmt.Println(title, href)
			fmt.Println(summary)
			fmt.Println()
		})
	})

	// The page structure described above is that of jianshu.com, so that is
	// the site to fetch.
	if err := c.Visit("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
	}
}
2. 爬取文章列表和詳情
文章列表和 1 的方式一樣，文章詳情通過創建新的采集器訪問詳情頁面
package main
import (
"fmt"
"github.com/gocolly/colly"
"time"
)
// main crawls the Jianshu home page with collector c1 and, for every article
// found in the list, fetches the article page with a second, asynchronous and
// rate-limited collector c2. The data scraped by c1 travels to c2 through a
// colly.Context.
func main() {
	c1 := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
	)
	c2 := c1.Clone()
	// Asynchronous mode: c2.Wait() below blocks until its requests finish.
	c2.Async = true
	// Rate limit. Limit rules are matched against the request host, so the
	// glob must not contain a path component or it never matches.
	if err := c2.Limit(&colly.LimitRule{
		DomainGlob:  "*.jianshu.com",
		Delay:       10 * time.Second,
		Parallelism: 1,
	}); err != nil {
		fmt.Println(err.Error())
		return
	}

	// Collector 1: read the article list.
	c1.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
		e.ForEach("li", func(i int, item *colly.HTMLElement) {
			href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
			if href == "" {
				// No article link in this entry; nothing to fetch.
				return
			}
			title := item.ChildText("div[class='content'] > a[class='title']")
			summary := item.ChildText("div[class='content'] > p[class='abstract']")

			// Hand the data scraped by collector 1 over to collector 2
			// through a Context object.
			ctx := colly.NewContext()
			ctx.Put("href", href)
			ctx.Put("title", title)
			ctx.Put("summary", summary)

			// Resolve the link against the page it was found on instead of
			// concatenating it onto a hard-coded host.
			detailURL := e.Request.AbsoluteURL(href)
			if err := c2.Request("GET", detailURL, nil, ctx, nil); err != nil {
				fmt.Println(err.Error())
			}
		})
	})

	// Collector 2: read the article body.
	c2.OnHTML("article", func(e *colly.HTMLElement) {
		href := e.Request.Ctx.Get("href")
		title := e.Request.Ctx.Get("title")
		summary := e.Request.Ctx.Get("summary")
		detail := e.Text

		fmt.Println("----------" + title + "----------")
		fmt.Println(href)
		fmt.Println(summary)
		fmt.Println(detail)
		fmt.Println()
	})

	c2.OnRequest(func(r *colly.Request) {
		fmt.Println("c2爬取頁面:", r.URL)
	})
	// Without this callback a failed detail request would go unnoticed.
	c2.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	c1.OnRequest(func(r *colly.Request) {
		fmt.Println("c1爬取頁面:", r.URL)
	})
	c1.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c1.Visit("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
	}
	c2.Wait()
}
3. 爬取需要登錄的網頁
官網提供登錄頁處理的例子，但是大多數涉及驗證碼，不好處理，目前方式是手動登錄，復制 cookie 寫到爬蟲請求頭里
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/extensions"
_ "github.com/gocolly/colly/extensions"
"net/http"
)
// main scrapes the Jianshu article list as a logged-in user by installing a
// session cookie copied from a manual browser login before the page is
// requested.
func main() {
	// The cookie below is marked Secure and scoped to .jianshu.com, so the
	// target has to be an HTTPS URL on that domain for the cookie to be sent.
	homeURL := "https://www.jianshu.com"

	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(1),
		colly.Debugger(&debug.LogDebugger{}),
	)

	c.OnHTML("ul[class='note-list']", func(e *colly.HTMLElement) {
		e.ForEach("li", func(i int, item *colly.HTMLElement) {
			href := item.ChildAttr("div[class='content'] > a[class='title']", "href")
			title := item.ChildText("div[class='content'] > a[class='title']")
			summary := item.ChildText("div[class='content'] > p[class='abstract']")

			fmt.Println(title, href)
			fmt.Println(summary)
			fmt.Println()
		})
	})

	// Use a random User-Agent.
	extensions.RandomUserAgent(c)

	// Install the login cookie.
	loginCookies := []*http.Cookie{
		{
			Name:     "remember_user_token",
			Value:    "wNDUxOV0sIiQyYSQxMSRwdkhqWVhHYmxXaDJ6dEU3NzJwbmsuIiwiMTU",
			Path:     "/",
			Domain:   ".jianshu.com",
			Secure:   true,
			HttpOnly: true,
		},
	}
	if err := c.SetCookies(homeURL, loginCookies); err != nil {
		// Without the cookie the crawl would run unauthenticated.
		fmt.Println(err.Error())
		return
	}

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取頁面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := c.Visit(homeURL); err != nil {
		fmt.Println(err.Error())
	}
}
4. 內存任務隊列
將需要爬取的鏈接放入隊列中，設置隊列并發數，可以并行爬取鏈接
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/queue"
)
// main crawls starting from the Jianshu home page using an in-memory request
// queue that is consumed by several goroutines.
func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(3),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// In-memory queue with a size of 10000, consumed by 5 goroutines.
	q, err := queue.New(5, &queue.InMemoryQueueStorage{MaxSize: 10000})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	c.OnHTML("a", func(element *colly.HTMLElement) {
		// Per-link failures are deliberately ignored so that one bad link
		// does not stop the crawl.
		_ = element.Request.Visit(element.Attr("href"))
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取頁面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	if err := q.AddURL("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
		return
	}
	if err := q.Run(c); err != nil {
		fmt.Println(err.Error())
	}
}
5. redis任務隊列
設置 redis 存儲后，隊列中 URL 存儲在 redis 中，訪問頁面的 cookie 及訪問記錄也會保存在 redis 中
package main
import (
"fmt"
"github.com/gocolly/colly"
"github.com/gocolly/colly/debug"
"github.com/gocolly/colly/queue"
"github.com/gocolly/redisstorage"
)
// main crawls starting from the Jianshu home page with a Redis-backed
// storage that is used both as the collector storage and as the request
// queue storage.
func main() {
	c := colly.NewCollector(
		colly.UserAgent("Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"),
		colly.MaxDepth(3),
		colly.Debugger(&debug.LogDebugger{}),
	)

	// Client and Expires are left at their zero values, as in the original
	// explicit nil / 0 settings.
	storage := &redisstorage.Storage{
		Address:  "192.168.1.10:6379",
		Password: "123456",
		DB:       0,
		Prefix:   "colly",
	}

	// Setup failures abort the program: nothing useful can run without the
	// storage.
	if err := c.SetStorage(storage); err != nil {
		panic(err)
	}
	// Start from a clean storage on every run.
	if err := storage.Clear(); err != nil {
		panic(err)
	}
	defer storage.Client.Close()

	q, err := queue.New(5, storage)
	if err != nil {
		panic(err)
	}

	c.OnHTML("a", func(element *colly.HTMLElement) {
		// Per-link failures are deliberately ignored so that one bad link
		// does not stop the crawl.
		_ = element.Request.Visit(element.Attr("href"))
	})
	c.OnRequest(func(r *colly.Request) {
		fmt.Println("爬取頁面:", r.URL)
	})
	c.OnError(func(r *colly.Response, err error) {
		fmt.Println("Request URL:", r.Request.URL, "failed with response:", r, "\nError:", err)
	})

	// Returning (rather than exiting) lets the deferred Close run.
	if err := q.AddURL("https://www.jianshu.com"); err != nil {
		fmt.Println(err.Error())
		return
	}
	if err := q.Run(c); err != nil {
		fmt.Println(err.Error())
	}
}
redis 中數據
6.配置代理
package main
import (
"bytes"
"log"
"github.com/gocolly/colly"
"github.com/gocolly/colly/proxy"
)
// main requests https://httpbin.org/ip through a round-robin pair of proxies
// and logs the response body on a single line.
func main() {
	c := colly.NewCollector()

	// Configure two proxies.
	rp, err := proxy.RoundRobinProxySwitcher("http://127.0.0.1:1080", "socks5://127.0.0.1:1338")
	if err != nil {
		log.Fatal(err)
	}
	c.SetProxyFunc(rp)

	c.OnResponse(func(r *colly.Response) {
		// Strip newlines so the whole body fits on one log line.
		log.Printf("%s\n", bytes.ReplaceAll(r.Body, []byte("\n"), nil))
	})

	// Surface a failed request instead of exiting silently with no output.
	if err := c.Visit("https://httpbin.org/ip"); err != nil {
		log.Fatal(err)
	}
}