多執行緒、程序池、協程
阿新 • • 發佈:2019-02-19
程序池
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape the Douban Top 250 movie list with a thread pool.

Every result page is fetched by a worker of ``multiprocessing.dummy.Pool``;
the parsed "score<TAB>title" rows are collected in a queue and printed once
all workers have finished.  Runs on both Python 2 and Python 3.
"""

import time
# multiprocessing.dummy offers the multiprocessing API on top of threads,
# so this Pool is a thread pool, not a process pool.
from multiprocessing.dummy import Pool

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3

import requests
from lxml import etree


class Douban(object):
    """Fetch every Douban Top 250 page and print each movie's score and title."""

    # Seconds to wait for the server.  Without a timeout a stalled
    # connection would block its worker, and therefore pool.map(), forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per result page: start=0, 25, ..., 225 (ten pages).
        self.url_list = [
            self.base_url + str(page) for page in range(0, 225 + 1, 25)
        ]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Queue that collects the parsed rows produced by all workers.
        self.data_queue = Queue.Queue()
        # Number of rows printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its raw HTML to parse_page().

        Raises whatever ``requests.get`` raises, including a timeout error
        when the server does not answer within REQUEST_TIMEOUT seconds.
        """
        print(url)
        html = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        ).content
        # Kept from the original; presumably a politeness delay.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into the queue."""
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title: the first <span> text inside the header link.
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # Movie rating.
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]
            # Store the row; it is printed later by start_work().
            self.data_queue.put(score + "\t" + title)

    def start_work(self):
        """Fetch all pages concurrently, then print the rows and their total."""
        # One worker thread per URL.
        pool = Pool(len(self.url_list))
        try:
            pool.map(self.send_request, self.url_list)
        finally:
            # Always release the workers, even if a request raised.
            pool.close()
            # Block until every worker thread has finished.
            pool.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()
    print("[INFO]: Useing %f secend" % (time.time() - start))
多執行緒
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape the Douban Top 250 movie list with one thread per page.

A ``threading.Thread`` is started for every result page; the parsed
"score<TAB>title" rows are collected in a queue and printed once all
threads have been joined.  Runs on both Python 2 and Python 3.
"""

import threading
import time

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3

import requests
from lxml import etree


class Douban(object):
    """Fetch every Douban Top 250 page and print each movie's score and title."""

    # Seconds to wait for the server.  Without a timeout a stalled
    # connection would block its thread, and therefore join(), forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per result page: start=0, 25, ..., 225 (ten pages).
        self.url_list = [
            self.base_url + str(page) for page in range(0, 225 + 1, 25)
        ]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Queue that collects the parsed rows produced by all threads.
        self.data_queue = Queue.Queue()
        # Number of rows printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its raw HTML to parse_page().

        Raises whatever ``requests.get`` raises, including a timeout error
        when the server does not answer within REQUEST_TIMEOUT seconds.
        """
        print(url)
        html = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        ).content
        # Kept from the original; presumably a politeness delay.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into the queue."""
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title: the first <span> text inside the header link.
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # Movie rating.
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]
            # Store the row; it is printed later by start_work().
            self.data_queue.put(score + "\t" + title)

    def start_work(self):
        """Fetch all pages concurrently, then print the rows and their total."""
        thread_list = []
        for url in self.url_list:
            thread = threading.Thread(target=self.send_request, args=(url,))
            thread.start()
            thread_list.append(thread)

        # Make the main thread wait until every worker thread has finished
        # before the collected rows are printed.
        for thread in thread_list:
            thread.join()

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()
    print("[INFO]: Useing %f secend" % (time.time() - start))
協程
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape the Douban Top 250 movie list with gevent coroutines.

A greenlet is spawned for every result page; the parsed "score<TAB>title"
rows are collected in a queue and printed once all greenlets have finished.
Runs on both Python 2 and Python 3.
"""

# gevent lets asynchronous programs be written in a synchronous style.
# monkey.patch_all() replaces the blocking standard-library modules
# (socket, select, ...) at run time so network operations become cooperative.
# It must run before anything else is imported: patching after requests has
# already pulled in socket/ssl is too late and makes gevent emit a
# MonkeyPatchWarning.
from gevent import monkey
monkey.patch_all()

import time

try:
    import Queue  # Python 2
except ImportError:
    import queue as Queue  # Python 3

import gevent
import requests
from lxml import etree


class Douban(object):
    """Fetch every Douban Top 250 page and print each movie's score and title."""

    # Seconds to wait for the server.  Without a timeout a stalled
    # connection would keep its greenlet, and therefore joinall(), waiting
    # forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.base_url = "https://movie.douban.com/top250?start="
        # One URL per result page: start=0, 25, ..., 225 (ten pages).
        self.url_list = [
            self.base_url + str(page) for page in range(0, 225 + 1, 25)
        ]
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko"}
        # Queue that collects the parsed rows produced by all greenlets.
        self.data_queue = Queue.Queue()
        # Number of rows printed by start_work().
        self.count = 0

    def send_request(self, url):
        """Download one list page and pass its raw HTML to parse_page().

        Raises whatever ``requests.get`` raises, including a timeout error
        when the server does not answer within REQUEST_TIMEOUT seconds.
        """
        print(url)
        html = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        ).content
        # Kept from the original; presumably a politeness delay.
        time.sleep(1)
        self.parse_page(html)

    def parse_page(self, html):
        """Extract score and title of every movie on a page into the queue."""
        html_obj = etree.HTML(html)
        node_list = html_obj.xpath("//div[@class='info']")
        for node in node_list:
            # Movie title: the first <span> text inside the header link.
            title = node.xpath("./div[@class='hd']/a/span/text()")[0]
            # Movie rating.
            score = node.xpath("./div[@class='bd']//span[@class='rating_num']/text()")[0]
            # Store the row; it is printed later by start_work().
            self.data_queue.put(score + "\t" + title)

    def start_work(self):
        """Fetch all pages concurrently, then print the rows and their total."""
        # Spawn one greenlet per URL and keep the handles so they can be joined.
        job_list = [
            gevent.spawn(self.send_request, url) for url in self.url_list
        ]
        # Block until every greenlet has finished.
        gevent.joinall(job_list)

        while not self.data_queue.empty():
            print(self.data_queue.get())
            self.count += 1
        print("\n%d" % self.count)


if __name__ == "__main__":
    douban = Douban()
    start = time.time()
    douban.start_work()
    print("[INFO]: Useing %f secend" % (time.time() - start))