1. 程式人生 > 其它 >90分鐘掌握Python多執行緒爬蟲(全程實戰)

90分鐘掌握Python多執行緒爬蟲(全程實戰)

https://edu.csdn.net/learn/20379?spm=1002.2001.3001.4157

#encoding: utf-8

import requests
from bs4 import BeautifulSoup
from urllib import request
import os
import threading

# 首先先要對請求的身份進行偽裝。
headers = {
    # Present a desktop-browser User-Agent on every listing-page request.
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36"
}

# Listing-page URLs that still have to be fetched by a producer.
PAGE_URLS = []
# Image URLs found by producers and not yet downloaded by a consumer.
IMG_URLS = []

# Guards PAGE_URLS, IMG_URLS and _pages_in_flight.
gLock = threading.Lock()
# The same lock exposed as a condition, so an idle consumer can sleep until
# a producer publishes more work instead of spinning on the lock.
_work_changed = threading.Condition(gLock)
# Pages already popped from PAGE_URLS whose images are not yet in IMG_URLS.
_pages_in_flight = 0

# Seconds to wait for a listing page before giving up on it.
_REQUEST_TIMEOUT = 10
# Directory (relative to the working directory) that receives the images.
_IMAGE_DIR = "images"


def producer():
    """Fetch listing pages and publish the image URLs found on them.

    Producer: repeatedly takes one URL from PAGE_URLS, downloads the page,
    collects the ``data-original`` attribute of every matching ``<img>`` tag
    and appends the results to IMG_URLS. Returns when PAGE_URLS is empty.
    A page that cannot be fetched is reported and skipped.
    """
    global _pages_in_flight
    while True:
        with gLock:
            if not PAGE_URLS:
                break
            page_url = PAGE_URLS.pop()
            # Counted in the same critical section as the pop, so a consumer
            # never sees "no pages left" while this page is still pending.
            _pages_in_flight += 1

        found = []
        try:
            response = requests.get(
                page_url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
            soup = BeautifulSoup(response.text, 'lxml')
            img_list = soup.find_all(
                "img", attrs={"class": "img-responsive lazy image_dta"}
            )
            for img in img_list:
                img_url = img.get('data-original')
                if not img_url:
                    # Tag without a lazy-load source: nothing to download.
                    continue
                if img_url.startswith("//"):
                    # Some image URLs come without the scheme prefix.
                    img_url = "https:" + img_url
                found.append(img_url)
        except requests.RequestException as exc:
            print("failed to fetch %s: %s" % (page_url, exc))
        finally:
            # Publish the results and retire the page atomically, even when
            # the fetch failed, so waiting consumers can make progress.
            with _work_changed:
                IMG_URLS.extend(found)
                _pages_in_flight -= 1
                _work_changed.notify_all()


def consumer():
    """Download every image URL that appears in IMG_URLS.

    Consumer: takes URLs from IMG_URLS and saves each one under the
    ``images`` directory, named after the last path segment of the URL.
    Returns once IMG_URLS is empty, PAGE_URLS is empty and no producer is
    still working on a page. A failed download is reported and skipped.
    """
    # urlretrieve does not create missing directories.
    os.makedirs(_IMAGE_DIR, exist_ok=True)
    while True:
        with _work_changed:
            while not IMG_URLS:
                if not PAGE_URLS and _pages_in_flight == 0:
                    return
                # The timeout re-checks the state in case a caller changed
                # the lists without notifying the condition.
                _work_changed.wait(timeout=1.0)
            img_url = IMG_URLS.pop()

        if not img_url:
            continue

        # e.g. https://ws2.sinaimg.cn/bmiddle/9150e4e5gy1g0saavmreuj20250250sh.jpg
        # splits into ['https:', '', 'ws2.sinaimg.cn', 'bmiddle', '<name>.jpg'];
        # os.path.join keeps the target path portable across Windows and Unix.
        try:
            filename = img_url.split("/")[-1]
            fullpath = os.path.join(_IMAGE_DIR, filename)
            request.urlretrieve(img_url, fullpath)
        except Exception as exc:
            # Best effort: one bad image must not stop this worker.
            print("=" * 30)
            print(img_url)
            print(exc)
            print("=" * 30)
        else:
            print("%s下載完成" % filename)


def main():
    """Queue listing pages 1-99, then start five producers and five consumers."""
    # 1. Collect the URL of every listing page first.
    for x in range(1, 100):
        page_url = "https://www.doutula.com/photo/list/?page=" + str(x)
        PAGE_URLS.append(page_url)

    # Five producer threads.
    for x in range(5):
        th = threading.Thread(target=producer)
        th.start()

    # Five consumer threads.
    for x in range(5):
        th = threading.Thread(target=consumer)
        th.start()


if __name__ == '__main__':
    main()

多執行緒優化(以鎖保護全域性變數的生產者/消費者範例):

#encoding: utf-8
import time
import threading
import random

# Shared balance that the producer and consumer threads below add to and
# subtract from.
gMoney = 0
# Any thread that operates on a global variable must hold this lock while
# doing so.
gLock = threading.Lock()

def greet(index):
    """Print ``helloworld-<index>`` and then pause for half a second."""
    message = "helloworld-%d" % index
    print(message)
    time.sleep(0.5)


def line_run():
    """Call greet for indexes 0 through 4, one after another, on this thread."""
    index = 0
    while index < 5:
        greet(index)
        index += 1

def thread_run():
    """Start five threads, each running greet with its own index (0 through 4)."""
    for index in range(5):
        worker = threading.Thread(target=greet, args=(index,))
        worker.start()

def produter():
    """Deposit a random amount (0-100) into gMoney every half second, forever.

    Each round adds the amount to the shared balance while holding gLock and
    then prints the depositing thread, the amount and the resulting balance.
    The function never returns; it is meant to run as a thread target.
    """
    global gMoney
    while True:
        money = random.randint(0, 100)
        with gLock:
            gMoney += money
            # Snapshot the balance while the lock is still held, so the
            # figure printed below is the result of this deposit and not of
            # whatever other threads did after the lock was released.
            balance = gMoney
        print("%s生產者生產了%s元錢,剩餘%s元錢"%(threading.current_thread(),money,balance))
        time.sleep(0.5)

def consumer():
    """Try to withdraw a random amount (0-100) from gMoney every half second.

    Each round holds gLock while it checks the shared balance: if the balance
    covers the amount it is deducted and the withdrawal is printed, otherwise
    an insufficient-balance message is printed and the balance is left alone.
    The function never returns; it is meant to run as a thread target.
    """
    global gMoney
    while True:
        money = random.randint(0, 100)
        # The context manager releases the lock even if something inside the
        # critical section raises, so other threads cannot be blocked forever.
        with gLock:
            if gMoney >= money:
                gMoney -= money
                print("%s消費者消費了%s元錢,剩餘%s元錢"%(threading.current_thread(),money,gMoney))
            else:
                print("%s消費者想消費%s元錢,但是餘額不足!剩餘%s元錢!"%(threading.current_thread(),money,gMoney))
        time.sleep(0.5)



if __name__ == '__main__':
    # Alternative demos, disabled: line_run() greets sequentially and
    # thread_run() greets from five threads.
    # Start the five producer threads first, then the five consumer threads.
    for worker in (produter, consumer):
        for _ in range(5):
            threading.Thread(target=worker).start()