# Source: 程式人生 > 其它 > Python 爬取圖片
#
# Python image scraper (multiprocess producer/consumer).

import os
from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Process, Queue
from threading import Thread
from urllib import parse

import requests
from lxml import etree

# TODO: exception handling is still rough; improve later.
# Open issue 1: only the images listed on each crawled page are fetched; images nested inside detail pages are not handled yet.
# Open issue 2: crawling too many pages raises an error; root cause not yet identified.

# HTTP headers sent with every request in this script.
headers = {
"User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Mobile Safari/537.36",
# Anti-hotlinking: the Referer tells the server which page this request came from
"Referer": "https://xxx"
}


def get_img_src(q, page_count=4):
    """Producer: collect image src URLs from listing pages and put them on *q*.

    Walks *page_count* listing pages (page 1 is ``index.html``, page ``i``
    is ``i.html``), gathers every detail-page link, extracts the first
    image ``src`` from each detail page, and pushes it onto the queue.
    A sentinel string is pushed last so the consumer knows production
    is finished.

    :param q: queue shared with the downloader process
    :param page_count: number of listing pages to crawl (default 4,
        matching the original hard-coded ``range(1, 5)``)
    """
    # Build the listing-page URLs; the first page has a special file name.
    urls = ["https://xxx/index.html"]
    urls += [f"https://xxx/{i}.html" for i in range(2, page_count + 1)]

    # Collect detail-page links from every listing page into one flat list
    # (the original built a list of lists and nested the loops).
    hrefs = []
    for url in urls:
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.encoding = 'utf-8'
            tree = etree.HTML(resp.text)
            hrefs.extend(tree.xpath("//div[@class='list-box-p']/ul/li/a/@href"))
        except requests.RequestException as err:
            # Best effort: skip a broken listing page instead of aborting.
            print(f"listing page failed: {url} ({err})")

    for href in hrefs:
        try:
            child_resp = requests.get(href, headers=headers, timeout=10)
            child_resp.encoding = 'utf-8'
            child_tree = etree.HTML(child_resp.text)
            # xpath returns a list; guard against pages with no matching img
            # (the original indexed [0] unconditionally and could raise).
            srcs = child_tree.xpath("//div[@class='img_box']/a/img/@src")
            if not srcs:
                continue
            src = srcs[0]
            q.put(src)  # hand the URL to the downloader process
            print(f"---------------------------------------------------被塞進佇列--------------------->{src}")
        except requests.RequestException as err:
            print(f"detail page failed: {href} ({err})")

    # Sentinel: tells the consumer that production is done.
    q.put("完事了")


def download(src):
    """Download one image URL *src* into ``./image/``, named by its basename.

    Fetches the bytes *before* opening the target file, so a failed request
    no longer leaves an empty file behind (the original opened the file
    first, then fetched inside the ``with`` block).
    """
    print('開始下載------------>', src)
    name = src.split('/')[-1]
    # Ensure the target directory exists; the original crashed on first run
    # if ./image was missing.
    os.makedirs("./image", exist_ok=True)
    resp = requests.get(src, headers=headers, timeout=10)
    with open("./image/" + name, mode='wb') as f:
        f.write(resp.content)
    print('下載完畢------------>', src)


def download_img(q):
    """Consumer: pull image URLs off *q* and download each on a thread pool.

    Blocks on ``q.get()`` until data arrives; returns when the producer's
    sentinel string is received.  Leaving the ``with`` block waits for all
    submitted downloads to finish.
    """
    with ThreadPoolExecutor(5) as pool:
        while True:
            src = q.get()  # blocks until the producer supplies a URL
            if src == "完事了":
                return  # sentinel: production finished, drain the pool
            pool.submit(download, src)


if __name__ == '__main__':
    # Queue shared between the producer (URL scraper) and the consumer
    # (downloader) processes.
    q = Queue()
    producer = Process(target=get_img_src, args=(q,))
    consumer = Process(target=download_img, args=(q,))
    producer.start()
    consumer.start()
    # Join both children: the original never joined, so the parent exited
    # immediately and its exit did not reflect the crawl's completion.
    producer.join()
    consumer.join()