python使用多執行緒爬取表情包
阿新 • • 發佈:2019-02-14
使用多執行緒爬取資料可以顯著提高效率
編輯環境:pycharm
目標:爬取表情包庫的所有表情包
首先在同目錄下建一個images資料夾
程式碼如下
#coding:utf8
import logging
import os
import threading
import urllib

import requests
from bs4 import BeautifulSoup
base_page_url = 'https://www.doutula.com/photo/list/?page='
# Listing-page URLs still waiting to be crawled: page numbers 1 through 869
# (the range end is exclusive). Workers take entries from the end of the list.
page_url_list = [base_page_url + str(page) for page in range(1, 870)]
# Image URLs discovered by the producers and not yet downloaded.
face_url_list = []
# Global lock guarding both lists above.
glock = threading.Lock()
# Set by main() once every producer thread has finished, so that consumers
# can tell a temporarily empty image queue from a permanently empty one.
_producers_done = threading.Event()

# Seconds to wait for a listing page before giving up on it.
_PAGE_TIMEOUT = 10

# Seconds an idle consumer waits before re-checking the image queue.
_IDLE_WAIT = 0.2


def procuder():
    """Crawl listing pages and queue the image URLs found on them.

    Repeatedly pops a page URL from ``page_url_list``, downloads and parses
    it, and appends the image URLs it contains to ``face_url_list``.
    Returns once ``page_url_list`` is empty. Both lists are only touched
    while ``glock`` is held; the network request and HTML parsing happen
    outside the lock.

    A page that cannot be fetched is logged and skipped so that a single
    bad response does not end the worker thread.
    """
    while True:
        with glock:
            if not page_url_list:
                return
            page_url = page_url_list.pop()

        try:
            response = requests.get(page_url, timeout=_PAGE_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as exc:
            logging.warning('skipping page %s: %s', page_url, exc)
            continue

        soup = BeautifulSoup(response.content, 'lxml')
        img_list = soup.find_all(
            'img', attrs={'class': 'img-responsive lazy image_dta'})

        found = []
        for img in img_list:
            url = img.get('data-original')
            if not url:
                # Tag without the lazy-load attribute: nothing to download.
                continue
            if not url.startswith('http'):
                url = 'http:' + url
            found.append(url)

        # Publish the whole page's results under a single lock acquisition.
        with glock:
            face_url_list.extend(found)


def _download_face(face_url):
    """Save one image into ``images/``; log and return on any failure."""
    # Function-scope import: urlretrieve lives in different modules on
    # Python 2 and Python 3.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    filename = face_url.split('/').pop()
    if not filename:
        logging.warning('no file name in url %s', face_url)
        return
    path = os.path.join('images', filename)
    try:
        urlretrieve(face_url, filename=path)
    except Exception as exc:
        # Worker-thread boundary: one failed download must not kill the
        # thread, so report the error and move on to the next image.
        logging.warning('failed to download %s: %s', face_url, exc)


def customer():
    """Download queued images until the queue is drained for good.

    Pops URLs from ``face_url_list`` (under ``glock``) and saves each one
    into the ``images`` directory. While the queue is empty the thread
    waits briefly instead of spinning. It returns once the queue is empty
    and ``main()`` has signalled that all producers are finished; if that
    signal is never given, the function keeps waiting for new URLs.
    """
    while True:
        # Read the flag BEFORE inspecting the queue. If it is already set,
        # no producer can append afterwards, so an empty queue is final.
        producers_finished = _producers_done.is_set()
        with glock:
            face_url = face_url_list.pop() if face_url_list else None

        if face_url is None:
            if producers_finished:
                return
            # Sleep until the timeout elapses or the producers finish.
            _producers_done.wait(_IDLE_WAIT)
            continue

        _download_face(face_url)


def main(producer_count=4, consumer_count=5):
    """Run the crawl and return when every image has been processed.

    Args:
        producer_count: number of threads that crawl listing pages.
        consumer_count: number of threads that download images.
    """
    if not os.path.isdir('images'):
        os.makedirs('images')
    _producers_done.clear()

    # Producer threads crawl the listing pages for image URLs.
    producers = [threading.Thread(target=procuder)
                 for _ in range(producer_count)]
    # Consumer threads download the images.
    consumers = [threading.Thread(target=customer)
                 for _ in range(consumer_count)]
    for th in producers + consumers:
        th.start()

    for th in producers:
        th.join()
    # All URLs that will ever be queued are queued now; let consumers exit
    # as soon as they have drained the queue.
    _producers_done.set()
    for th in consumers:
        th.join()
# Start the crawl only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
一共爬到 4 萬多張表情包,美滋滋