go 實現的分散式爬蟲
自學golang期間,編寫gorouting,chan相關程式碼,實現分散式爬去某主站的每個頁面的連結,
暫未想到有什麼好辦法,在抓取完網頁上的連結後,自動退出程式。
抓取效能:記憶體(8G)佔用97%,抓取連結數量:923571個。
最後抓取的連結如下:
923550 https://jobs.51job.com/xian/97026549.html?t=2&s=01
923551 https://jobs.51job.com/xian-ytq/100531159.html?t=2&s=01
923552 https://jobs.51job.com/xian-ytq/co4962830.html
923553 https://jobs.51job.com/xian/102536289.html?t=2&s=01
923554 https://jobs.51job.com/xian/co4541067.html
923555 https://jobs.51job.com/xian-ytq/103266474.html?t=2&s=01
923556 https://jobs.51job.com/xian-ytq/co4427712.html
923557 https://jobs.51job.com/xian/100993960.html?t=2&s=01
923558 https://jobs.51job.com/xian-jjjs/101527308.html?t=2&s=01
923559 https://jobs.51job.com/xian-jjjs/co4429285.html
923560 https://jobs.51job.com/xian/98443048.html?t=2&s=01
923561 https://jobs.51job.com/xian/co4204829.html
923562 https://jobs.51job.com/xian-ytq/101144457.html?t=2&s=01
923563 https://jobs.51job.com/xian-ytq/co4979864.html
923564 https://jobs.51job.com/xian-ytq/91962903.html?t=2&s=01
923565 https://jobs.51job.com/xian-ytq/co4411060.html
923566 https://jobs.51job.com/xian/98252112.html?t=2&s=01
923567 https://jobs.51job.com/xian/co4868985.html
923568 https://jobs.51job.com/xian-lhq/95451726.html?t=2&s=01
923569 https://jobs.51job.com/xian-lhq/co4054606.html
923570 https://jobs.51job.com/xian/103780877.html?t=2&s=01
923571 https://jobs.51job.com/xian/co4301859.html
具體程式碼如下:
package main import ( "fmt" "net/http" "strings" "golang.org/x/net/html" ) func analyseNode(node *html.Node, link string, f func(string)) { if node.Type == html.ElementNode && node.Data == "a" { for _, b := range node.Attr { if b.Key == "href" && b.Val != "javascript:" { v := b.Val if strings.HasPrefix(v, "#") { continue } f(v) } } } for n := node.FirstChild; n != nil; n = n.NextSibling { analyseNode(n, link, f) } } func requestPage(link string, f func(string)) { resp, err := http.Get(link) if err == nil { doc, err := html.Parse(resp.Body) resp.Body.Close() if err == nil { analyseNode(doc, link, f) } } } func main() { var i uint64 = 0 seen := make(map[string]bool) worklist := make(chan string) found := make(chan string) fmt.Println("main runing") go func() { str := "https://www.51job.com" seen[str] = true worklist <- str }() for i := 0; i < 20; i++ { go func() { for url := range worklist { link := url requestPage(link, func(s string) { if strings.HasPrefix(s, "//") { protocol := link[0:strings.Index(link, "//")] s = protocol + s } else if strings.HasPrefix(s, "/") { s = link + s } found <- s }) } }() } for url := range found { if !seen[url] { seen[url] = true i += 1 fmt.Println(i, url) go func(url string) { worklist <- url }(url) } } }
有待進一步優化