90分鐘掌握Python多執行緒爬蟲(全程實戰)
阿新 • • 發佈:2021-07-07
https://edu.csdn.net/learn/20379?spm=1002.2001.3001.4157
# encoding: utf-8
# Multi-threaded producer/consumer crawler for doutula.com meme images.
#
# Producers take listing-page URLs from PAGE_URLS, fetch each page and queue
# the image URLs found on it in IMG_URLS.  Consumers take image URLs from
# IMG_URLS and download them into the local "images" directory.
import os
import threading
from urllib import request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Disguise the request as coming from a regular browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# Listing pages still waiting to be fetched by a producer.
PAGE_URLS = []
# Image URLs waiting to be downloaded by a consumer.
IMG_URLS = []

# gLock guards PAGE_URLS, IMG_URLS and _pages_in_flight.
gLock = threading.Lock()
# Condition sharing gLock; consumers sleep on it instead of spinning.
_gWork = threading.Condition(gLock)
# Number of pages that were popped from PAGE_URLS but whose image URLs have
# not been published to IMG_URLS yet.  Without it a consumer cannot tell
# "no work left" apart from "a producer is still fetching the last page".
_pages_in_flight = 0

_IMAGE_DIR = "images"
_REQUEST_TIMEOUT = 10  # seconds; requests waits forever without a timeout


def _extract_img_urls(page_url):
    """Fetch *page_url* and return the absolute image URLs found on it."""
    response = requests.get(page_url, headers=headers, timeout=_REQUEST_TIMEOUT)
    soup = BeautifulSoup(response.text, 'lxml')
    img_list = soup.find_all("img", attrs={"class": "img-responsive lazy image_dta"})
    # Some image URLs come without the "http(s):" prefix; urljoin resolves
    # them against the page URL and leaves absolute URLs untouched.
    # Tags lacking the data-original attribute are skipped.
    return [
        urljoin(page_url, img['data-original'])
        for img in img_list
        if img.get('data-original')
    ]


def _download(img_url):
    """Download one image into the images directory (best effort)."""
    # https://ws2.sinaimg.cn/bmiddle/9150e4e5gy1g0saavmreuj20250250sh.jpg
    # -> ['https:', '', 'ws2.sinaimg.cn', 'bmiddle', '9150e4e5gy1g0saavmreuj20250250sh.jpg']
    filename = img_url.split("/")[-1]
    # os.path.join keeps the path valid on Windows as well as Mac/Linux/Unix.
    fullpath = os.path.join(_IMAGE_DIR, filename)
    try:
        request.urlretrieve(img_url, fullpath)
    except Exception as exc:
        # One failed download must not kill the worker thread; report the
        # URL and the reason, then carry on with the next image.
        print("=" * 30)
        print(img_url)
        print(repr(exc))
        print("=" * 30)
    else:
        print("%s下載完成" % filename)


def producer():
    """Producer: turn listing pages from PAGE_URLS into entries of IMG_URLS.

    Returns once PAGE_URLS is empty.  A page that cannot be fetched is
    reported and skipped.
    """
    global _pages_in_flight
    while True:
        with gLock:
            if not PAGE_URLS:
                break
            page_url = PAGE_URLS.pop()
            _pages_in_flight += 1

        found = []
        try:
            found = _extract_img_urls(page_url)
        except requests.RequestException as exc:
            print("=" * 30)
            print(page_url)
            print(repr(exc))
            print("=" * 30)
        finally:
            # Always publish the result and clear the in-flight mark, even
            # on failure, so waiting consumers are never left hanging.
            with _gWork:
                IMG_URLS.extend(found)
                _pages_in_flight -= 1
                _gWork.notify_all()


def consumer():
    """Consumer: download images listed in IMG_URLS.

    Returns only when IMG_URLS is empty, PAGE_URLS is empty and no producer
    is still working on a page.
    """
    while True:
        with _gWork:
            # Sleep while there is nothing to download but more may arrive.
            # The timeout is only a safety net against a missed wake-up.
            while not IMG_URLS and (PAGE_URLS or _pages_in_flight):
                _gWork.wait(timeout=1.0)
            if not IMG_URLS:
                break
            img_url = IMG_URLS.pop()
        _download(img_url)


def main():
    """Queue listing pages 1-99, then run five producers and five consumers."""
    # 1. Collect the URL of every listing page first.
    for x in range(1, 100):
        page_url = "https://www.doutula.com/photo/list/?page=" + str(x)
        PAGE_URLS.append(page_url)

    # The download target must exist before any consumer writes into it.
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    workers = []
    # Five producer threads, then five consumer threads.
    for target in (producer, consumer):
        for _ in range(5):
            th = threading.Thread(target=target)
            th.start()
            workers.append(th)

    # Wait for all workers so that main() returns only when crawling is done.
    for th in workers:
        th.join()


if __name__ == '__main__':
    main()
多執行緒優化:
# encoding: utf-8
# Threading demo: sequential vs. threaded calls, and a producer/consumer pair
# that updates a shared balance under a lock.
import random
import threading
import time

# Shared balance modified by the producer and consumer threads.
gMoney = 0
# Any access to a global from multiple threads has to happen under this lock.
gLock = threading.Lock()


def greet(index):
    """Print a greeting tagged with *index*, then sleep for half a second."""
    print("helloworld-%d" % index)
    time.sleep(0.5)


def line_run():
    """Call greet() five times, one after another."""
    for x in range(5):
        greet(x)


def thread_run():
    """Call greet() five times, each call in its own thread."""
    for x in range(5):
        th = threading.Thread(target=greet, args=[x])
        th.start()


def produter(rounds=None, interval=0.5):
    """Repeatedly add a random amount (0-100) to the shared balance.

    Args:
        rounds: number of deposits to make; None (the default) loops forever.
        interval: seconds to sleep after each deposit.
    """
    global gMoney
    done = 0
    while rounds is None or done < rounds:
        money = random.randint(0, 100)
        with gLock:
            gMoney += money
            # Snapshot the balance while still holding the lock; reading
            # gMoney afterwards could report another thread's update.
            balance = gMoney
        print("%s生產者生產了%s元錢,剩餘%s元錢" % (threading.current_thread(), money, balance))
        time.sleep(interval)
        done += 1


def consumer(rounds=None, interval=0.5):
    """Repeatedly try to withdraw a random amount (0-100) from the balance.

    A withdrawal larger than the current balance is refused and reported.

    Args:
        rounds: number of attempts to make; None (the default) loops forever.
        interval: seconds to sleep after each attempt.
    """
    global gMoney
    done = 0
    while rounds is None or done < rounds:
        money = random.randint(0, 100)
        with gLock:
            # Check and update in one critical section so the balance can
            # never go negative.
            enough = gMoney >= money
            if enough:
                gMoney -= money
            balance = gMoney
        if enough:
            print("%s消費者消費了%s元錢,剩餘%s元錢" % (threading.current_thread(), money, balance))
        else:
            print("%s消費者想消費%s元錢,但是餘額不足!剩餘%s元錢!" % (threading.current_thread(), money, balance))
        time.sleep(interval)
        done += 1


if __name__ == '__main__':
    # Alternative demos, enable one at a time:
    # line_run()
    # thread_run()
    for x in range(5):
        th = threading.Thread(target=produter)
        th.start()

    for x in range(5):
        th = threading.Thread(target=consumer)
        th.start()