爬蟲中xpath的用法的一些例項
阿新 • 發佈:2019-01-22
"""Demo: scrape image URLs from sc.chinaz.com with an XPath query and download them.

Walks through shortening a browser-copied absolute XPath into a robust
relative one, then fetches the page, extracts every matching @src2 URL,
and saves each image under ./images/.
"""
import requests
from lxml import etree
import urllib
import urllib.request
import os

url = 'http://sc.chinaz.com/tupian/shamotupian.html'

# Step 1: the absolute XPath copied from the browser dev tools.  It is brittle
# because it encodes the whole DOM path from <html> down to the <img>.
x = '''/html/body/div[@class='all_wrap']/div[@class='index_only']/div[@class='left pic_left_l']/div[@class='text_left text_lefts']/div[@id='container']/div[@class='box picblock col3 masonry-brick'][1]/div/a/img/@src'''

# HTML as the browser renders it (after JavaScript has run):
data = '''<div class="box picblock col3 masonry-brick" style="width: 186px; height: 156px; position: absolute; top: 0px; left: 0px;"> <div> <a target="_blank" href="http://sc.chinaz.com/tupian/180531579563.htm" alt="新疆沙漠風景圖片"><img alt="新疆沙漠風景圖片" src="http://pic2.sc.chinaz.com/Files/pic/pic9/201805/wpic982_s.jpg"></a> </div> <p><a target="_blank" href="http://sc.chinaz.com/tupian/180531579563.htm" alt="新疆沙漠風景圖片">新疆沙漠風景圖片</a></p> </div>'''

# HTML as requests actually receives it (no JS): note the class list differs
# and the attribute is "src2", not "src" — lazy-loading renames it client-side.
data = '''<div class="box picblock col3" style="width:186px;height:311px"> <div> <a target="_blank" href="http://sc.chinaz.com/tupian/180316401614.htm" alt="西北地區荒漠化圖片"><img src2="http://pic2.sc.chinaz.com/Files/pic/pic9/201803/wpic011_s.jpg" alt="西北地區荒漠化圖片"></a> </div> <p><a target="_blank" href="http://sc.chinaz.com/tupian/180316401614.htm" alt="西北地區荒漠化圖片">西北地區荒漠化圖片</a></p> </div>'''

# Step 2: rewrite the XPath relative to the class seen in the raw HTML and
# read @src2 instead of @src.
x = '//div[@class="box picblock col3"]/div/a/img/@src2'
# html_tree = etree.HTML(data)
# src = html_tree.xpath(x)
# print(src)

response = requests.get(url=url)
response.encoding = 'utf-8'  # the page is UTF-8; set it before reading .text
html = response.text
# print(html)

html_etree = etree.HTML(html)
src = html_etree.xpath(x)  # list of image URL strings
print(src)
# Example element: http://pic2.sc.chinaz.com/Files/pic/pic9/201805/wpic982_s.jpg

# BUG FIX: the original printed '下載圖片%s成功!' % img_name right after the GET,
# before img_name existed → NameError.  The message belongs after each download.
# Also ensure the target directory exists, or urlretrieve raises FileNotFoundError.
os.makedirs('./images', exist_ok=True)
for img_url in src:
    img_name = os.path.split(img_url)[-1]  # keep the remote file name
    urllib.request.urlretrieve(img_url, filename='./images/%s' % (img_name))
    print('下載圖片%s成功!' % (img_name))
還有一種更為全面的爬蟲
"""A more complete crawler: download every meme image from doutula.com.

Flow: main() walks the paginated listing, parse_page() fetches one page's
HTML, page_list() extracts the <img> nodes, and each image is saved into
the local images/ directory named after its alt text.
"""
import requests  # fetch web pages
import time
from lxml import etree  # parse HTML
from urllib import request  # download files
import os


def parse_page(url):
    """Fetch one listing page and return its HTML text.

    :param url: absolute URL of a doutula listing page
    :return: the response body as a str
    """
    # Minimal anti-anti-crawler measure: present a real browser User-Agent.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    response = requests.get(url, headers=HEADERS)
    text = response.text
    return text


def page_list(text):
    """Parse a listing page and return the list of <img> elements.

    :param text: HTML of one listing page
    :return: list of lxml element nodes for each meme image
    """
    # Encode before parsing so lxml handles the page's non-ASCII text.
    html_tree = etree.HTML(text.encode('utf-8'))
    image_list = html_tree.xpath(
        '//div[@class="page-content text-center"]//a[@class="col-xs-6 col-sm-3"]/img')
    return image_list


def main(last_page=1889):
    """Crawl listing pages 1..last_page and download every meme found.

    :param last_page: number of listing pages to crawl (default keeps the
        original hard-coded 1889-page behavior)
    """
    # Ensure the output directory exists before any urlretrieve call;
    # the original crashed on a fresh checkout without it.
    os.makedirs('images', exist_ok=True)
    # The original kept a separate `cont` counter that always equaled x;
    # the loop variable is used directly instead.
    for x in range(1, last_page + 1):
        print('--------------當前是:%d 頁' % x)
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        text = parse_page(url)
        imgs = page_list(text)
        time.sleep(1)  # be polite to the server between pages
        for img in imgs:
            # One bad image must not abort the whole crawl, but a bare
            # `except:` (original code) also swallowed KeyboardInterrupt;
            # catch Exception and report what went wrong instead.
            try:
                # Lazy-loaded pages keep the real URL in @data-original.
                imgurl = img.xpath(".//@data-original")
                # Iterate rather than index [0]: the list can contain
                # empty/missing values, so indexing would raise.
                for img_url in imgurl:
                    # Derive the file extension (.jpg/.png); some URLs carry
                    # a "!size" suffix after the extension — strip it.
                    suffix = os.path.splitext(img_url)[1]
                    suffix = suffix.split("!")[0]
                    # The alt text becomes the local file name.
                    alt = img.xpath(".//@alt")[0]
                    # alt = re.sub(r'[,。??,/\\·]','',alt)  # optional: strip characters invalid in file names
                    img_name = alt + suffix
                    request.urlretrieve(img_url, 'images/' + img_name)
                    print(img_name + '下載完畢!')
            except Exception as e:
                print("表情報錯", e)


if __name__ == '__main__':
    main()