python爬蟲--re結合xpath爬取圖片
阿新 • • 發佈:2019-01-25
背景:虛擬機器ubuntu16.04,利用xpath與re爬取www.uumnt.cc的圖片
當然,我們要爬取的是動物板塊!
程式分析,將動物板塊一頁一頁分析拿取出來,然後拿取各個動物頁面的連結,然後對連結分析拿取圖片(每個連結拿取4張圖)
效果為:
原始碼如下:
1 # -*- coding:utf-8 -*- 2 3 #準備爬取https://www.uumnt.cc/dongwu/的一些圖片 4 5 import urllib 6 import urllib2 7 import re 8 import random 9 from lxml import etree 10 11 12 def loadPage(url): 13 #拿到每一頁的html原始碼 14 headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"} 15 request = urllib2.Request(url,headers=headers) 16 html = urllib2.urlopen(request).read() 17 #print html 檢測用 18 19 content = etree.HTML(html) 20 #返回所有匹配成功的列表集合 21 link_list = content.xpath('//div[@class="best-pic-c clearfix"]/ul/li/a[@class="best-pic-c-pic"]/@href') 22 23 #print link_list 檢測用 24 for link in link_list: 25 fulllink = 'https://www.uumnt.cc'+link 26 #拿取了相對應的動物的網頁 27 #print fulllink 檢測用 28 loadsunPage(fulllink) 29 30 #拿取子網頁的資訊 31 def loadsunPage(url): 32 # writeImage(url)#提取第一個子網頁 33 url_ = re.match(r"(https://www.uumnt.cc/dongwu/)+(\d*)",url) 34 url_sre =url_.group() 35 #print url_sre 檢測用 36 for i in range(2,6): 37 a = "_%d.html"%i 38 url = url_sre + a 39 #print url 40 writeImage(url) 41 42 def writeImage(url): 43 #拿到html原始碼,xpath提取出圖片連結 44 headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"} 45 request = urllib2.Request(url,headers=headers) 46 html = urllib2.urlopen(request).read() 47 48 content = etree.HTML(html) 49 #返回所有匹配成功的列表集合 50 link_list = content.xpath('//img[@class="center other_cont_pics"]/@src') 51 #print link_list 52 for link in link_list: 53 #print link 54 loadImage(link) 55 56 #下載圖片 57 def loadImage(link): 58 #下載 59 headers = {"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"} 60 request = urllib2.Request(link,headers = headers) 61 image = urllib2.urlopen(request).read() 62 63 a = random.randint(1,100000000) 64 filename = str(a) 65 66 with open('/home/cl/桌面/uumntanimal'+filename+'.jpg',"wb") as f: 67 f.write(image) 68 print "download successful-" +filename+".jpg" 69 70 71 72 if __name__ == "__main__": 73 url = "https://www.uumnt.cc/dongwu/" 
74 #為了方便,從第二頁開始爬取 75 print '請輸入需爬取的頁數:', 76 a = input() 77 for i in range(2,a+1): 78 #print url 79 url = 'https://www.uumnt.cc/dongwu/list_%d.html'%i 80 loadPage(url) 81
程式碼中有很多print link之類的是為了除錯檢測程式,對新手來說很好用!