1. 程式人生 > >python使用多執行緒爬取表情包

python使用多執行緒爬取表情包

爬取資料屬於 I/O 密集型工作,大部分時間都花在等待網路回應上,因此使用多執行緒同時發出多個請求,可以顯著提高爬取效率。

編輯環境:pycharm
目標:爬取表情包庫的所有表情包
首先在指令碼所在的目錄下建立一個名為 images 的資料夾,用來存放下載下來的表情包。
(此處原為一張截圖,圖片未能保留。)
程式碼如下

#coding:utf8

import os
import threading
import requests
import urllib
from bs4 import BeautifulSoup

base_page_url = 'https://www.doutula.com/photo/list/?page='
# Listing-page URLs that still have to be crawled (work queue for producers).
page_url_list = []
# Image URLs discovered by producers and not downloaded yet (work queue for
# consumers).
face_url_list = []
# Global lock guarding both lists above.
glock = threading.Lock()
# Set by main() once every producer thread has finished, so that consumers
# finding an empty queue know no more work will arrive and can exit.
producers_done = threading.Event()
# Seconds to wait on a single HTTP request before giving up on it.
REQUEST_TIMEOUT = 10
# How long an idle consumer sleeps before re-checking the queue.
IDLE_WAIT = 0.1

# Queue listing pages 1..869.
for x in range(1, 870):
    page_url_list.append(base_page_url + str(x))


def _take(url_list):
    """Pop one URL from *url_list* under the global lock.

    Returns None when the list is empty.
    """
    with glock:
        if url_list:
            return url_list.pop()
    return None


def _normalize_face_url(url):
    """Prefix 'http:' when *url* does not already start with 'http'
    (e.g. a protocol-relative '//host/path' link)."""
    if not url.startswith('http'):
        url = 'http:' + url
    return url


def _local_path(face_url):
    """Return the path under 'images' named after the last URL segment."""
    filename = face_url.split('/').pop()
    return os.path.join('images', filename)


def _download(face_url, path):
    """Fetch *face_url* and write the response body to *path*.

    Uses requests (already a dependency of this script) instead of
    urllib.urlretrieve, which only exists on Python 2 and has no timeout.
    Raises requests.RequestException on network/HTTP errors and
    IOError/OSError when the file cannot be written.
    """
    response = requests.get(face_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    with open(path, 'wb') as image_file:
        image_file.write(response.content)


def procuder():
    """Producer: crawl listing pages and queue the image URLs found on them.

    Repeatedly takes a page URL from ``page_url_list``, downloads the page and
    appends every matching image URL to ``face_url_list``.  Returns once
    ``page_url_list`` is empty.  A page that cannot be fetched is reported
    and skipped instead of killing the thread.
    """
    while True:
        page_url = _take(page_url_list)
        if page_url is None:
            break
        try:
            response = requests.get(page_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as err:
            print('skipping page %s: %s' % (page_url, err))
            continue
        soup = BeautifulSoup(response.content, 'lxml')
        img_list = soup.find_all(
            'img', attrs={'class': 'img-responsive lazy image_dta'})
        # Build the batch outside the lock; tags without a 'data-original'
        # attribute are ignored rather than raising while the lock is held.
        found = []
        for img in img_list:
            url = img.get('data-original')
            if url:
                found.append(_normalize_face_url(url))
        with glock:
            face_url_list.extend(found)


def customer():
    """Consumer: download queued image URLs into the 'images' directory.

    Repeatedly takes a URL from ``face_url_list`` and saves it under
    ``images/<last URL segment>``.  When the queue is empty the thread sleeps
    briefly instead of spinning, and it returns once the queue is empty and
    ``producers_done`` is set.  If ``producers_done`` is never set (for
    example when this function is used without main()), it keeps polling.
    A failed download is reported and skipped.
    """
    while True:
        # Read the flag BEFORE looking at the queue: producers only append
        # before the flag is set, so "flag set, then queue empty" really
        # means there is nothing left to do.
        finished = producers_done.is_set()
        face_url = _take(face_url_list)
        if face_url is None:
            if finished:
                break
            producers_done.wait(IDLE_WAIT)
            continue
        path = _local_path(face_url)
        try:
            _download(face_url, path)
        except (requests.RequestException, IOError, OSError) as err:
            print('failed to download %s: %s' % (face_url, err))


def main():
    """Run 4 producer and 5 consumer threads and wait for all of them.

    Creates the 'images' directory if it is missing, then blocks until every
    page has been crawled and every queued image has been processed.
    """
    producers_done.clear()
    if not os.path.isdir('images'):
        os.makedirs('images')

    # 4 producer threads crawl the listing pages for image URLs.
    producers = [threading.Thread(target=procuder) for _ in range(4)]
    # 5 consumer threads download the images.
    consumers = [threading.Thread(target=customer) for _ in range(5)]
    for th in producers + consumers:
        th.start()

    for th in producers:
        th.join()
    # No producer can append any more: let idle consumers drain and exit.
    producers_done.set()
    for th in consumers:
        th.join()


if __name__ == '__main__':
    main()

執行完畢後,一共爬取到 4 萬多張表情包,美滋滋。