Python如何使用佇列方式實現多執行緒爬蟲
阿新 • • 發佈:2020-05-13
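說明:本文以糗事百科段子的爬取為例,採用佇列和多執行緒的方式實現爬蟲。其中的關鍵點是 Queue.task_done() 和 Queue.join():工作執行緒每處理完一個從佇列取出的任務就呼叫一次 task_done(),主執行緒則透過 join() 阻塞,直到佇列中所有任務都被標記為完成後才繼續執行,從而保證各個階段的工作全部做完後程式才退出。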
程式碼如下
import json
import logging
import threading
from queue import Queue
from urllib.parse import urljoin

import requests
from lxml import etree

logger = logging.getLogger(__name__)

# Site root; relative and protocol-relative links are resolved against it.
_BASE_URL = "https://www.qiushibaike.com/"


def _first(values, default=None):
    """Return the first element of *values*, or *default* if it is empty."""
    return values[0] if values else default


def _parse_item(div):
    """Extract one joke entry from a listing ``<div>`` element.

    Every field is taken from the first XPath match and is ``None`` when
    the XPath matches nothing.

    Args:
        div: An lxml element for a single entry of the listing page.

    Returns:
        A dict with the keys ``author_name``, ``author_age``,
        ``author_gender``, ``author_img``, ``author_href``, ``content``,
        ``content_vote`` and ``content_comment_numbers``.
    """
    author_img = _first(div.xpath(".//a[@rel='nofollow']/img/@src"))
    if author_img is not None:
        # urljoin copes with protocol-relative ("//host/..."), absolute and
        # relative sources; plain "https" + src loses the colon.
        author_img = urljoin(_BASE_URL, author_img)

    author_href = _first(div.xpath("./a/@href"))
    if author_href is not None:
        # Avoids the doubled slash that string concatenation produces for
        # root-relative hrefs such as "/article/123".
        author_href = urljoin(_BASE_URL, author_href)

    author_gender = _first(div.xpath("./div[1]/div/@class"))
    if author_gender is not None:
        # The last CSS class, with its "Icon" part removed, is used as the
        # gender value.
        author_gender = author_gender.split(" ")[-1].replace("Icon", "").strip()

    content = _first(div.xpath("./a/div/span/text()"))
    if content is not None:
        content = content.strip()

    return {
        "author_name": _first(div.xpath(".//a[@rel='nofollow']/img/@alt")),
        "author_age": _first(div.xpath("./div[1]/div/text()")),
        "author_gender": author_gender,
        "author_img": author_img,
        "author_href": author_href,
        "content": content,
        "content_vote": _first(
            div.xpath("./div[@class='stats']/span[@class='stats-vote']/i/text()")
        ),
        "content_comment_numbers": _first(
            div.xpath(
                "./div[@class='stats']/span[@class='stats-comments']/a/i/text()"
            )
        ),
    }


class Qsbk:
    """Multi-threaded scraper for qiushibaike.com text posts.

    The work is a three-stage pipeline connected by queues:
    ``url_queue`` (page URLs) -> ``html_queue`` (parsed HTML trees) ->
    ``content_queue`` (lists of extracted item dicts).  Each stage marks an
    input as done only after its output has been enqueued, so joining the
    queues in pipeline order waits for all of the work.
    """

    def __init__(self):
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML,like Gecko) Chrome/81.0.4044.138 Safari/537.36"
            ),
            "Referer": "https://www.qiushibaike.com/",
        }
        # One queue per pipeline stage.
        self.url_queue = Queue()
        self.html_queue = Queue()
        self.content_queue = Queue()

    def get_total_url(self):
        """Put the URLs of listing pages 1 through 12 on ``url_queue``."""
        url_temp = "https://www.qiushibaike.com/text/page/{}/"
        for page in range(1, 13):
            self.url_queue.put(url_temp.format(page))

    def parse_url(self):
        """Worker loop: download each queued URL and enqueue its HTML tree.

        Never returns; intended to run in a daemon thread.  ``Queue.get()``
        blocks while the queue is empty.
        """
        # Queue.not_empty is a Condition object and is always truthy, so
        # testing it never ended the loop; make the infinite loop explicit.
        while True:
            url = self.url_queue.get()
            try:
                print("parsing url:", url)
                response = requests.get(url, headers=self.headers, timeout=10)
                html = etree.HTML(response.content.decode())
                if html is not None:
                    self.html_queue.put(html)
            except Exception:
                # Thread boundary: log and keep the worker alive so one bad
                # page does not stop the remaining URLs from being handled.
                logger.exception("failed to fetch or parse %s", url)
            finally:
                # Always mark the URL as handled; otherwise a failure would
                # leave url_queue.join() blocked forever.
                self.url_queue.task_done()

    def get_content(self):
        """Worker loop: turn each queued HTML tree into a list of item dicts.

        Never returns; intended to run in a daemon thread.
        """
        while True:
            html = self.html_queue.get()
            try:
                total_div = html.xpath("//div[@class='col1 old-style-col1']/div")
                self.content_queue.put([_parse_item(div) for div in total_div])
            except Exception:
                # Thread boundary: this is the only extraction worker, so it
                # must survive a malformed page.
                logger.exception("failed to extract items from a page")
            finally:
                self.html_queue.task_done()

    def save_items(self):
        """Worker loop: append each queued batch of items to the output file.

        Items are written as indented JSON objects to ``quishibaike.txt``.
        Never returns; intended to run in a daemon thread.
        """
        while True:
            items = self.content_queue.get()
            try:
                with open("quishibaike.txt", 'a', encoding='utf-8') as f:
                    for item in items:
                        json.dump(item, f, ensure_ascii=False, indent=2)
            except Exception:
                # Thread boundary: this is the only writer, so it must
                # survive a failed write.
                logger.exception("failed to save a batch of items")
            finally:
                self.content_queue.task_done()

    def run(self):
        """Run the whole pipeline and return once every queue is drained."""
        # Fill the URL queue before waiting on it.  If this ran in a thread
        # started together with the workers, url_queue.join() could be
        # reached while the queue was still empty and return immediately,
        # ending the program before any page had been fetched.
        self.get_total_url()

        # Daemon threads: the workers loop forever and are discarded when
        # the main thread exits.
        thread_list = [
            threading.Thread(target=self.parse_url, daemon=True)
            for _ in range(10)
        ]
        thread_list.append(threading.Thread(target=self.get_content, daemon=True))
        thread_list.append(threading.Thread(target=self.save_items, daemon=True))

        for worker in thread_list:
            worker.start()

        # Join in pipeline order: a stage enqueues its output before calling
        # task_done(), so when one queue is finished the next queue already
        # holds all the work derived from it.
        self.url_queue.join()
        self.html_queue.join()
        self.content_queue.join()


if __name__ == "__main__":
    obj = Qsbk()
    obj.run()
以上就是本文的全部內容,希望對大家的學習有所幫助,也希望大家多多支援我們。