程式人生 > 目前學習的爬取小資料圖片zzz

目前學習的爬取小資料圖片zzz

import os
import re
import threading
import time

import requests
from lxml import etree


# Producer/consumer image scraper: Producer threads pull listing pages and
# extract image URLs; DownPic threads download the images to xiaohua/.

all_urls = []        # listing-page URLs still waiting to be scraped
all_img_urls = []    # {image name: image URL} entries waiting to be downloaded

g_lock = threading.Lock()   # guards both shared lists across worker threads

# Shared request headers (the original duplicated this dict in every class).
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    "Host": "eclick.baidu.com",
}


class Spider(object):
    """Builds the list of listing-page URLs that the producers will scrape."""

    def __init__(self, target_url, headers):
        # target_url is a printf-style template, e.g. ".../list-1-%d.html".
        self.target_url = target_url
        self.headers = headers

    def getUrls(self, start_page, page_num):
        """Append the URLs for pages [start_page, page_num) to all_urls."""
        for i in range(start_page, page_num):
            all_urls.append(self.target_url % i)


class Producer(threading.Thread):
    """Pops listing pages off all_urls and queues image name/URL pairs."""

    def run(self):
        while True:
            # Take the next page under the lock so two threads never scrape
            # the same URL (the original checked len() outside the lock,
            # which could pop from an already-empty list).
            with g_lock:
                if not all_urls:
                    return          # nothing left to scrape
                url = all_urls.pop()
            # BUG FIX: headers must be a keyword argument -- the original
            # requests.get(url, headers) sent them as query parameters.
            response = requests.get(url, headers=HEADERS).text
            selector = etree.HTML(response)
            for mod in selector.xpath("//div[@class='item_t']"):
                img_link = mod.xpath("div[@class='img']/a/img/@src")
                name = mod.xpath("div[@class='img']/span/text()")
                if not img_link or not name:
                    continue        # malformed item -- skip instead of IndexError
                link = img_link[0]
                # Site-relative links need the host prepended.
                if link.startswith("/"):
                    link = "http://www.xiaohuar.com" + link
                # Queue exactly once (the original appended every entry twice).
                with g_lock:
                    all_img_urls.append({name[0]: link})


class DownPic(threading.Thread):
    """Pops queued image entries off all_img_urls and saves them to disk."""

    def run(self):
        while True:
            with g_lock:
                if not all_img_urls:
                    # Queue drained: stop.  The original spun forever here
                    # (while True + continue), so the script never finished.
                    return
                img = all_img_urls.pop()
            for name, link in img.items():
                response = requests.get(link, headers=HEADERS)
                path = "xiaohua/%s.jpg" % name
                with open(path, "wb") as f:
                    f.write(response.content)


if __name__ == '__main__':
    spider = Spider("http://www.xiaohuar.com/list-1-%d.html", HEADERS)
    spider.getUrls(0, 14)   # queue listing pages 0..13

    # The downloads failed with FileNotFoundError unless this existed.
    os.makedirs("xiaohua", exist_ok=True)

    # start()/join() runs the workers concurrently; the original called
    # run() directly, which executed everything sequentially on the main
    # thread.  Producers are joined first so the download queue is fully
    # populated before the (terminating) downloaders drain it.
    producers = [Producer() for _ in range(10)]
    for t in producers:
        t.start()
    for t in producers:
        t.join()

    downloaders = [DownPic() for _ in range(10)]
    for t in downloaders:
        t.start()
    for t in downloaders:
        t.join()