go--單任務版爬蟲
阿新 • • 發佈:2018-07-07
bbf str coo find 表達式 fcc 整體 [] expire
1.獲取初始頁面內容
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
)

// main downloads one Tieba thread page and prints its raw HTML to stdout.
//
// It panics if the request or the body read fails. If the server answers with
// a status other than 200 it prints the status code and returns.
func main() {
	// http.Get returns two values: the response and an error.
	res, err := http.Get("https://tieba.baidu.com/p/5524106374?red_tag=0000236673")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	// A status code other than 200 means something went wrong.
	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		return
	}

	// Read the whole body into all. A read error is not ignored: otherwise a
	// truncated page would be printed as if it were the complete document.
	all, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(string(all))

	// httputil.DumpResponse is an alternative: it dumps the response into all
	// (the body is included when the second argument is true), and the dump
	// also contains the status line and headers, as shown below.
	//all, _ = httputil.DumpResponse(res, true)
	//fmt.Println(string(all))
	// HTTP/1.1 200 OK
	// Transfer-Encoding: chunked
	// Connection: keep-alive
	// Content-Type: text/html; charset=UTF-8
	// Date: Sat, 07 Jul 2018 05:20:48 GMT
	// P3p: CP=" OTI DSP COR IVA OUR IND COM "
	// Server: Apache
	// Set-Cookie: TIEBA_USERTYPE=d8c56c898382fa778148475e; expires=Thu, 31-Dec-2020 15:59:59 GMT; path=/; domain=tieba.baidu.com
	// Set-Cookie: wise_device=0; path=/
	// Set-Cookie: BAIDUID=3826C6F501EC8C114AC77215BBE0DA64:FG=1; expires=Sun, 07-Jul-19 05:20:48 GMT; max-age=31536000; path=/; domain=.baidu.com; version=1
	// Tracecode: 12484498910460795914070713
	// Tracecode: 12484498910470965258070713
	// Vary: Accept-Encoding
	// X-Xss-Protection: 1; mode=block
}
2.使用正則表達式解析,並提取url
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// main downloads one Tieba thread page, extracts the image URLs found in
// src="..." attributes with a regular expression, keeps only those whose
// length equals that of a known wanted image URL, and prints the result.
//
// It panics if the request or the body read fails. If the server answers with
// a status other than 200 it prints the status code and returns.
func main() {
	res, err := http.Get("https://tieba.baidu.com/p/5524106374?red_tag=0000236673")
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		return
	}

	// Read the whole body into all. A read error is not ignored: otherwise
	// URLs would be extracted from a truncated page without any warning.
	all, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}

	// Find every image link of the form src="http://xxxxxx.jpg".
	re := regexp.MustCompile(`src="(http[^"]+?(?:jpg|png))"`)

	// FindAllString would only return the whole match (including src="...").
	// FindAllStringSubmatch returns, for every match, a slice holding the
	// whole match followed by the text of each parenthesised group, so the
	// bare URL is element 1 and has to be picked out in a loop.
	matches := re.FindAllStringSubmatch(string(all), -1)

	// Some images match the pattern but are not the ones we want. The wanted
	// images all have URLs of the same length, so anything whose length
	// differs from this sample is skipped.
	const sample = "https://imgsa.baidu.com/forum/w%3D580/sign=605377bf04b30f24359aec0bf897d192/eb55981bb051f8199dc2df94d1b44aed2c73e7d5.jpg"

	urls := make([]string, 0, len(matches))
	for _, m := range matches {
		if len(m[1]) != len(sample) {
			continue
		}
		urls = append(urls, m[1])
	}
	fmt.Println(urls)
}

// Sample output:
//[https://imgsa.baidu.com/forum/w%3D580/sign=51dfed7aafc27d1ea5263bcc2bd7adaf/29aa8064034f78f072d6c52d72310a55b1191cd2.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=b76619654c10b912bfc1f6f6f3fcfcb5/0b74e395d143ad4bab64181e89025aafa50f0669.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=dc2cfe017dc6a7efb926a82ecdfbafe9/6e799a3533fa828b9d23c63df61f4134960a5ab7.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=605377bf04b30f24359aec0bf897d192/eb55981bb051f8199dc2df94d1b44aed2c73e7d5.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=2f3f86cc444a20a4311e3ccfa0539847/644711f33a87e95059032db21b385343faf2b4a4.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=e30f414125dda3cc0be4b82831e83905/4b6297315c6034a8f6e6605fc0134954082376b4.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=50b8de14898ba61edfeec827713597cc/af59f91b0ef41bd54ce3302a5ada81cb38db3d00.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=3f9442bff8deb48ffb69a1d6c01e3aef/2f586a34970a304ee7717824dac8a786c8175c61.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=17f88df316950a7b75354ecc3ad3625c/a98c2146f21fbe09309315d160600c338544adea.jpg
// https://imgsa.baidu.com/forum/w%3D580/sign=4bae33e30224ab18e016e13f05f8e69a/1dbb35178a82b901535815f6788da9773b12efc2.jpg]
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"regexp"
)

// get_pic_url wraps the fetch-and-extract steps into one reusable function.
//
//   - link:   URL of the page to download.
//   - rule:   regular expression applied to the page. If it contains a capture
//     group, the first group is taken as the URL; otherwise the whole
//     match is used.
//   - target: an example of a wanted link. Only its length is used: a match is
//     kept when its length equals len(target).
//
// It returns the kept matches in page order (an empty, non-nil slice when
// there are none).
//
// It panics if the request fails, if the status code is not 200 (after
// printing the code), if the body cannot be read completely, or if rule is not
// a valid regular expression.
func get_pic_url(link, rule, target string) []interface{} {
	res, err := http.Get(link)
	if err != nil {
		panic(err)
	}
	defer res.Body.Close()

	if res.StatusCode != http.StatusOK {
		fmt.Println("err statuscode:", res.StatusCode)
		panic("出錯了")
	}

	// A read error is not ignored: otherwise URLs would be extracted from a
	// truncated page without any warning.
	all, err := ioutil.ReadAll(res.Body)
	if err != nil {
		panic(err)
	}

	re := regexp.MustCompile(rule)

	// Each submatch slice holds the whole match at index 0 followed by one
	// entry per capture group. Index 1 only exists when rule has a group, so
	// fall back to the whole match instead of indexing out of range.
	group := 0
	if re.NumSubexp() > 0 {
		group = 1
	}

	matches := re.FindAllStringSubmatch(string(all), -1)
	urls := make([]interface{}, 0, len(matches))
	for _, m := range matches {
		if len(m[group]) != len(target) {
			continue
		}
		urls = append(urls, m[group])
	}
	return urls
}

// main prints, one per line, the wanted image URLs of one Tieba thread.
func main() {
	link := "https://tieba.baidu.com/p/4244799788?red_tag=2313275030"
	rule := `src="(.+?\.jpg)"`
	target := "https://imgsa.baidu.com/forum/w%3D580/sign=a80a7ab75eee3d6d22c687c373166d41/862df7246b600c337e73b7d81d4c510fd9f9a163.jpg"
	urls := get_pic_url(link, rule, target)
	for _, url := range urls {
		fmt.Println(url)
	}
}

// Sample output:
//https://imgsa.baidu.com/forum/w%3D580/sign=4e328e1f8094a4c20a23e7233ef51bac/0b837a899e510fb3979e1887de33c895d0430ced.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4b14d9aba11ea8d38a22740ca70b30cf/57efcc11728b4710c1fc93dcc4cec3fdfd032399.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=dc1c05ce3a6d55fbc5c6762e5d234f40/1b6a6a600c3387441173466c560fd9f9d72aa03f.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=72361f898526cffc692abfba89004a7d/5e528601a18b87d6e20cc20f000828381f30fd27.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=9627ab23968fa0ec7fc764051696594a/3a847acb0a46f21f4afe3743f1246b600d33aea3.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=9f46baed7c899e51788e3a1c72a6d990/1a78aec379310a5519a5b7e4b04543a9832610ef.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=0dceca0bd30735fa91f04eb1ae500f9f/462a024f78f0f736091524110d55b319eac41381.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=4da8783f92cad1c8d0bbfc2f4f3f67c4/2eb3fd039245d688a9d9dab4a3c27d1ed31b2491.jpg
//https://imgsa.baidu.com/forum/w%3D580/sign=6459fdcc0ffa513d51aa6cd60d6c554c/e708f31fbe096b6303f236400b338744eaf8ac82.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=5cd8795f992f07085f052a08d924b865/675d622762d0f7034a81fecc0ffa513d2697c5ba.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=77e1f18f29dda3cc0be4b82831e83905/7da177c6a7efce1ba1c1a0daa851f3deb58f6582.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=bddeeefe9b82d158bb8259b9b00b19d5/4ac8c8177f3e6709a5ea2def3cc79f3df9dc5582.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=ab1df271f4deb48ffb69a1d6c01e3aef/7f16d009b3de9c8287bf919f6b81800a18d84392.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=c527aa16a8c379317d688621dbc5b784/15578718367adab4f73a73538cd4b31c8601e483.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=0bce68f3d239b6004dce0fbfd9503526/407a5882b2b7d0a212f46f8dccef76094b369aab.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=64b851138044ebf86d716437e9f8d736/d7c9e850352ac65c1fe00e63fcf2b21192138a83.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=2e1d5d1b5cb5c9ea62f303ebe538b622/e199902397dda144b7b207a2b5b7d0a20cf48631.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=727db5e4b04543a9f51bfac42e178a7b/c33d8ad4b31c870160efd9f6207f9e2f0708ff10.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=77dd52d3d0ca7bcb7d7bc7278e086b3f/d9d5023b5bb5c9ea101d68f3d239b6003bf3b386.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=76dc96b5d643ad4ba62e46c8b2035a89/aee78326cffc1e17256de7214d90f603728de99b.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=09f988ea7ccb0a4685228b315b63f63e/ee755ab5c9ea15ce33d98a19b1003af33a87b2bd.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=d41bcbcef71f3a295ac8d5c6a925bce3/3c16cdbf6c81800a4197e31eb63533fa828b477a.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=00cd89a8d33f8794d3ff4826e21a0ead/d8b4e7cd7b899e519d3b500445a7d933c8950d33.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=0ebf913fb9096b6381195e583c328733/f88037d3d539b6003e0a59a7ee50352ac75cb7ac.jpg 
//https://imgsa.baidu.com/forum/w%3D580/sign=b8db0843f1246b607b0eb27cdbf91a35/3877b7003af33a87ef82560ac15c10385243b585.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=e4baea7493eef01f4d1418cdd0fe99e0/71ccd058ccbf6c81d03f7ff8bb3eb13533fa4060.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=6ff190046c600c33f079dec02a4d5134/5b4e3bf33a87e95031b8a07c17385343faf2b4c5.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=099bb22732fae6cd0cb4ab693fb20f9e/ee0179f0f736afc3e7816c75b419ebc4b64512db.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=4b966f04ac18972ba33a00c2d6cc7b9d/a6b5faedab64034fe3709116a8c379310a551d2b.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=44dc433f92cad1c8d0bbfc2f4f3f67c4/2eb3fd039245d688a0ade1b4a3c27d1ed31b24c5.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=4bff61282f381f309e198da199004c67/8f9ef603918fa0ecdd2a371d219759ee3c6ddb84.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=0447538859df8db1bc2e7c6c3923dddb/6fc5a71ea8d3fd1f1913f02a374e251f95ca5f52.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=1d2eae1d77cf3bc7e800cde4e100babd/9050d31b0ef41bd5d749bae456da81cb39db3d52.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=1d1af6afa7ec08fa260013af69ee3d4d/96850b46f21fbe09817a91046c600c338744ad4d.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=57bfc6ab80d6277fe912323018391f63/fc91a8ec8a136327b26e9023968fa0ec09fac7ea.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=a745d5db44166d223877159c76220945/6a0aeaf81a4c510f2b47b00a6759252dd52aa5ea.jpg //https://imgsa.baidu.com/forum/w%3D580/sign=db4dd5319413b07ebdbd50003cd69113/4e827dd98d1001e919be461fbf0e7bec55e797ea.jpg
go--單任務版爬蟲