python - 多線程/多進程
阿新 • • 發佈:2018-02-23
標籤:多線程、多進程、進程間通信、隊列、爬蟲
多線程:
# Threaded demo: worker threads pull scraped image links from a shared
# queue and print them, one item per thread.
#
# Fixes over the original:
#   * full-width quotes (‘...‘) replaced with ASCII quotes — the original
#     text did not even parse as Python;
#   * the regex is compiled once, outside the tag loop;
#   * empty findall() results are no longer pushed into the queue (a later
#     queue.get() would hand a worker an empty list);
#   * the busy-wait "while not workQueue.empty(): pass" is dropped —
#     join()ing every thread is the correct way to wait for completion;
#   * daemon is left False so join() actually guards thread shutdown
#     (daemon=True plus join() was contradictory).
import re
import threading
from multiprocessing import Queue
from time import sleep

from bs4 import BeautifulSoup
from requests import get


class MyThread(threading.Thread):
    """Worker thread: consume one item from *queue* under *qlock*."""

    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)


def process(qlock, queue):
    """Pop one link list from the queue (mutex-protected) and print it."""
    qlock.acquire()  # mutex: one consumer touches the queue at a time
    try:
        data = queue.get()
        print(data)
    finally:
        qlock.release()  # always release, even if get()/print raises
    sleep(1)


if __name__ == '__main__':
    workQueue = Queue(50)
    qlock = threading.Lock()

    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')

    pattern = re.compile(r'data-src="(.+?)"')  # compile once, not per tag
    links = []
    for tag in soup.find_all('img'):
        link = pattern.findall(str(tag))
        if link:  # skip tags without a data-src attribute
            workQueue.put(link)
            links.append(link)

    threads = []
    for _ in links:  # one consumer thread per queued item
        thread = MyThread(qlock, workQueue)
        thread.start()
        threads.append(thread)

    for t in threads:
        t.join()
多進程:
1.使用Pool模塊創建進程池:
# Process-pool demo: fan scraped links out to a pool of worker processes.
#
# Fixes over the original:
#   * full-width quotes (‘...‘) replaced with ASCII quotes — the original
#     text did not even parse as Python;
#   * the dot before "jpg" is escaped: ".+?.jpg" let ANY character precede
#     "jpg", so e.g. "fooXjpg" matched too;
#   * the regex is compiled once, outside the tag loop;
#   * empty findall() results are filtered out.
import os
import re
from multiprocessing import Pool

from bs4 import BeautifulSoup
from requests import get


def run_process(url):
    """Pool worker: just echo the link list it was handed."""
    print(url)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')

    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # compile once
    links = []
    for tag in soup.find_all('img'):
        link = pattern.findall(str(tag))
        if link:  # skip tags without a .jpg data-src
            links.append(link)

    pool = Pool(os.cpu_count())  # one worker per CPU core
    for link in links:
        pool.apply_async(run_process, args=(link,))
    pool.close()  # no more tasks will be submitted
    pool.join()   # wait for every worker to finish
2.Process模塊、Queue模塊進行進程間的通信(但我的寫入隊列沒有用多進程):
# Process + Queue demo: one process per queued item, each consuming and
# printing one link list (the queue is filled by the parent, not by
# worker processes — same as the original post notes).
#
# Fixes over the original:
#   * full-width quotes (‘...‘) replaced with ASCII quotes — the original
#     text did not even parse as Python;
#   * ALL processes are joined: the original reused one loop variable and
#     only joined the last process it started;
#   * the busy-wait "while not queue.empty(): pass" is dropped — joining
#     the processes already waits for the queue to drain;
#   * the dot before "jpg" is escaped and the regex is compiled once;
#   * empty findall() results are no longer queued (queue.get() handing a
#     consumer an empty list would be useless).
import re
from multiprocessing import Process, Queue

from bs4 import BeautifulSoup
from requests import get


class MyProcess(Process):
    """Worker process: consume one item from *queue* and print it."""

    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)


def run_process(queue):
    """Pop one link list from the queue and print it."""
    data = queue.get()
    print(data)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')

    queue = Queue(50)
    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # compile once
    links = []
    for tag in soup.find_all('img'):
        link = pattern.findall(str(tag))
        if link:  # skip tags without a .jpg data-src
            queue.put(link)
            links.append(link)

    processes = []
    for _ in links:  # one consumer process per queued item
        p = MyProcess(queue)
        p.start()
        processes.append(p)

    for p in processes:  # join every process, not just the last one
        p.join()
第2個比第1個明顯慢了很多——原因應該是第2種寫法為每個任務都新建並銷毀一個進程,而進程的創建/銷毀開銷很大;進程池(Pool)則會復用固定數量的進程,省掉了這部分開銷。
但上面只是cpu密集型,測試一下用io密集型的小爬蟲來看看效果:
1.多線程:
# Threaded I/O-bound demo: worker threads each download one image from a
# shared queue of scraped links.
#
# Fixes over the original:
#   * full-width quotes (‘...‘) replaced with ASCII quotes — the original
#     text did not even parse as Python;
#   * the mutex is held only while taking from the queue; the original
#     held it across the whole network download, which serialized every
#     thread and defeated the point of using threads for I/O;
#   * the busy-wait "while not workQueue.empty(): pass" is dropped —
#     join()ing every thread is the correct way to wait for completion;
#   * the dot before "jpg" is escaped and the regex is compiled once;
#   * empty findall() results are no longer queued (the worker's
#     queue.get()[0] would raise IndexError on an empty list).
import re
import threading
from multiprocessing import Queue
from time import sleep

from bs4 import BeautifulSoup
from requests import get


class MyThread(threading.Thread):
    """Worker thread: download one queued image."""

    def __init__(self, qlock, queue):
        threading.Thread.__init__(self)
        self.qlock = qlock
        self.queue = queue

    def run(self):
        process(self.qlock, self.queue)


def process(qlock, queue):
    """Take one URL from the queue and save the image it points to."""
    # NOTE: multiprocessing.Queue is already thread-safe, so the lock is
    # kept only to preserve the demo's structure — and only around get(),
    # never around the slow network call.
    qlock.acquire()
    try:
        url = queue.get()[0]
    finally:
        qlock.release()
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]  # first 8 chars of the filename serve as the image id
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)


if __name__ == '__main__':
    workQueue = Queue(50)
    qlock = threading.Lock()

    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')

    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # compile once
    links = []
    for tag in soup.find_all('img'):
        link = pattern.findall(str(tag))
        if link:  # skip tags without a .jpg data-src
            workQueue.put(link)
            links.append(link)

    threads = []
    for _ in links:  # one downloader thread per queued item
        thread = MyThread(qlock, workQueue)
        thread.start()
        threads.append(thread)

    for t in threads:
        t.join()
2.多進程:
# Multiprocess I/O-bound demo: one worker process per queued link, each
# downloading and saving one image.
#
# Fixes over the original:
#   * full-width quotes (‘...‘) replaced with ASCII quotes — the original
#     text did not even parse as Python;
#   * ALL processes are joined: the original reused one loop variable and
#     only joined the last process it started;
#   * the busy-wait "while not queue.empty(): pass" is dropped — joining
#     the processes already waits for the queue to drain;
#   * the dot before "jpg" is escaped and the regex is compiled once;
#   * empty findall() results are no longer queued (the worker's
#     queue.get()[0] would raise IndexError on an empty list).
import re
from multiprocessing import Process, Queue

from bs4 import BeautifulSoup
from requests import get


class MyProcess(Process):
    """Worker process: download one queued image."""

    def __init__(self, queue):
        Process.__init__(self)
        self.queue = queue

    def run(self):
        run_process(self.queue)


def run_process(queue):
    """Take one URL from the queue and save the image it points to."""
    url = queue.get()[0]
    img = get(url, timeout=1).content
    name = url.split('/')[-1]
    imgid = name[:8]  # first 8 chars of the filename serve as the image id
    with open('C:/Users/adimin/Desktop/video/{}.jpg'.format(imgid), 'wb') as fp:
        fp.write(img)
    print('download: ' + url)


if __name__ == '__main__':
    url = 'https://www.pixiv.net/ranking.php?mode=daily'
    html = get(url, timeout=1).text
    soup = BeautifulSoup(html, 'lxml')

    queue = Queue(50)
    pattern = re.compile(r'data-src="(.+?\.jpg)"')  # compile once
    links = []
    for tag in soup.find_all('img'):
        link = pattern.findall(str(tag))
        if link:  # skip tags without a .jpg data-src
            queue.put(link)
            links.append(link)

    processes = []
    for _ in links:  # one downloader process per queued item
        p = MyProcess(queue)
        p.start()
        processes.append(p)

    for p in processes:  # join every process, not just the last one
        p.join()
最後,感覺運行時間都差不多...還是看不太出來差距。
python - 多線程/多進程