python爬蟲-通過bs4和xpath分析html程式碼
阿新 • • 發佈:2018-12-20
我覺得原作者用 xpath 分析程式碼的寫法不是很好,下面是我重新改善後的版本
一、用lxml模組分析程式碼
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download the example CAPTCHA images from captcha.com (lxml / XPath version)."""
import os
import time

import requests
from lxml import etree

BASE_URL = 'https://captcha.com/'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server hangs the script


def get_Page(url, headers):
    """Fetch *url* and return the response text, or None on a non-200 status."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def _save_image(url, path, headers):
    """Download *url* into *path*; return True only if the file was written."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable image should not abort the remaining downloads.
        print('Failed to download {}: {}'.format(url, exc))
        return False
    if response.status_code != 200:
        return False
    with open(path, 'wb') as f:
        f.write(response.content)
    return True


def parse_Page(html, headers, save_dir='qcode'):
    """Extract every CAPTCHA name/image pair from *html* and save the images.

    html     -- page source as returned by get_Page(); nothing is done when
                it is None or empty.
    headers  -- HTTP headers sent with every image request.
    save_dir -- folder the images are written to (created when missing).
    """
    if not html:
        return

    tree = etree.HTML(html)
    # In XPath "|" selects the union of several paths.
    containers = tree.xpath(
        './/div[@class="captcha_images_left"]'
        '|.//div[@class="captcha_images_right"]'
    )

    # Write into save_dir by path instead of os.chdir(): changing the working
    # directory is process-wide and would nest the folder on a second call.
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)

    images = {}
    for container in containers:
        titles = container.xpath('.//h3')            # CAPTCHA names (elements)
        sources = container.xpath('.//div/img/@src')  # CAPTCHA links (strings)
        # zip() stops at the shorter list, so a missing <img> cannot raise
        # IndexError the way indexing both lists by position did.
        for title, src in zip(titles, sources):
            name = (title.text or '').strip()
            if not name:
                continue
            # lstrip('/') avoids a double slash for root-relative links.
            images[name + '.jpg'] = BASE_URL + src.lstrip('/')

    count = 0
    for imgname, imgurl in images.items():
        if _save_image(imgurl, os.path.join(save_dir, imgname), headers):
            count += 1
            print('儲存第{}張驗證碼成功'.format(count))
        time.sleep(1)  # be polite to the server between requests


def main():
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    html = get_Page(url, headers)
    if html is None:
        raise SystemExit('Failed to fetch {}'.format(url))
    parse_Page(html, headers)


if __name__ == '__main__':
    main()
二、用bs4模組分析程式碼
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Download the example CAPTCHA images from captcha.com (BeautifulSoup version)."""
import os
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://captcha.com/'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server hangs the script


def get_Page(url, headers):
    """Fetch *url* and return the response text, or None on a non-200 status."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def _save_image(url, path, headers):
    """Download *url* into *path*; return True only if the file was written."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # One unreachable image should not abort the remaining downloads.
        print('Failed to download {}: {}'.format(url, exc))
        return False
    if response.status_code != 200:
        return False
    # Reuse the body that was just fetched instead of requesting the image a
    # second time through urlretrieve (which also dropped the custom headers).
    with open(path, 'wb') as f:
        f.write(response.content)
    return True


def parse_Page(html, headers=None, save_dir='qcode'):
    """Extract every CAPTCHA name/image pair from *html* and save the images.

    html     -- page source as returned by get_Page(); nothing is done when
                it is None or empty.
    headers  -- HTTP headers sent with every image request. Passed explicitly
                so the function no longer depends on a module-level global.
    save_dir -- folder the images are written to (created when missing).
    """
    if not html:
        return

    # html is already text, so no encode()/from_encoding round trip is needed.
    soup = BeautifulSoup(html, 'html.parser')
    # Handle the left and right columns independently; pairing them with
    # zip() dropped containers whenever the two columns differed in length.
    containers = (list(soup.select('#main .captcha_images_left'))
                  + list(soup.select('#main .captcha_images_right')))

    # Write into save_dir by path instead of os.chdir(): changing the working
    # directory is process-wide and would nest the folder on a second call.
    os.makedirs(save_dir, exist_ok=True)

    images = {}
    for container in containers:
        names = container.select('h3')     # every <h3> tag in this column
        pictures = container.select('img')  # every <img> tag in this column
        for tag_h, tag_img in zip(names, pictures):
            name = tag_h.text.strip()
            src = tag_img.get('src')
            if not name or not src:
                continue  # skip incomplete entries instead of raising
            # lstrip('/') avoids a double slash for root-relative links.
            images[name + '.jpg'] = BASE_URL + src.lstrip('/')

    for imgname, imgurl in images.items():
        _save_image(imgurl, os.path.join(save_dir, imgname), headers)
        time.sleep(1)  # be polite to the server between requests


def main():
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    html = get_Page(url, headers)
    if html is None:
        raise SystemExit('Failed to fetch {}'.format(url))
    parse_Page(html, headers)


if __name__ == '__main__':
    main()
改善為多執行緒爬蟲,以下是 Python 3 的程式碼
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# author:Mr Yang
"""Download the example CAPTCHA images from captcha.com with worker threads."""
import os
import queue
import threading
import time

import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://captcha.com/'
REQUEST_TIMEOUT = 10  # seconds; without it a stalled server hangs a worker


def get_Page(url, headers):
    """Fetch *url* and return the response text, or None on a non-200 status."""
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    if response.status_code == 200:
        return response.text
    return None


def parse_Page(html, urlQueue, save_dir='qcode'):
    """Queue one ``{file path: image URL}`` dict per CAPTCHA found in *html*.

    html     -- page source as returned by get_Page(); nothing is queued when
                it is None or empty.
    urlQueue -- queue.Queue that receives the download jobs.
    save_dir -- folder the images will be written to (created when missing);
                it is already part of every queued file path.
    """
    if not html:
        return

    # html is already text, so no encode()/from_encoding round trip is needed.
    soup = BeautifulSoup(html, 'html.parser')
    # Handle the left and right columns independently; pairing them with
    # zip() dropped containers whenever the two columns differed in length.
    containers = (list(soup.select('#main .captcha_images_left'))
                  + list(soup.select('#main .captcha_images_right')))

    # Queue full paths instead of calling os.chdir(): changing the working
    # directory is process-wide and would nest the folder on a second call.
    os.makedirs(save_dir, exist_ok=True)

    for container in containers:
        names = container.select('h3')     # every <h3> tag in this column
        pictures = container.select('img')  # every <img> tag in this column
        for tag_h, tag_img in zip(names, pictures):
            name = tag_h.text.strip()
            src = tag_img.get('src')
            if not name or not src:
                continue  # skip incomplete entries instead of raising
            # lstrip('/') avoids a double slash for root-relative links.
            urlQueue.put({os.path.join(save_dir, name + '.jpg'):
                          BASE_URL + src.lstrip('/')})


def dowloadimg(urlQueue, headers):
    """Worker loop: take jobs from *urlQueue* until it is empty and save them.

    Each job is a dict mapping a destination file path to an image URL.
    """
    while True:
        try:
            job = urlQueue.get_nowait()  # non-blocking read
        except queue.Empty:
            # Only "queue drained" ends the worker; other errors stay visible.
            break
        for imgname, imgurl in job.items():
            try:
                response = requests.get(imgurl, headers=headers,
                                        timeout=REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # One unreachable image should not kill the worker thread.
                print('Failed to download {}: {}'.format(imgurl, exc))
                continue
            if response.status_code == 200:
                # Reuse the fetched body rather than downloading the image a
                # second time without the custom headers.
                with open(imgname, 'wb') as f:
                    f.write(response.content)
            time.sleep(1)  # be polite to the server between requests


def main():
    url = 'https://captcha.com/captcha-examples.html?cst=corg'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36'}
    urlQueue = queue.Queue()
    html = get_Page(url, headers)
    if html is None:
        raise SystemExit('Failed to fetch {}'.format(url))
    parse_Page(html, urlQueue)

    threadNum = 7
    workers = [threading.Thread(target=dowloadimg, args=(urlQueue, headers))
               for _ in range(threadNum)]
    for worker in workers:
        worker.start()
    # Wait for every worker so main() returns only after all downloads finish.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()