Scrapy框架自定義pipeline兩層下載路徑去下載圖片,關於item傳值的問題
阿新 • • 發佈:2018-12-19
自定義兩層下載路徑時,兩層目錄的名稱(大分類名與小分類名)必須經由 Request 的 meta 逐層傳遞,最後存入 item,pipeline 才能用它們組出儲存路徑。爬蟲程式碼如下:
from urllib.parse import urljoin

import scrapy

from ..items import OffmymindspiderItem


class OffmymindSpider(scrapy.Spider):
    """Crawl www.biaobaiju.com and yield one item per image found.

    The crawl has three levels: category -> image set -> image. The category
    name and the image-set name are carried from callback to callback through
    ``Request.meta`` and finally stored on the item, so that a pipeline can
    build a two-level download path from them.
    """

    name = 'offmymind'
    allowed_domains = ['www.biaobaiju.com']
    start_urls = ['http://www.biaobaiju.com/']

    def parse(self, response):
        """Yield one request per category link in the navigation bar.

        :param response: response for a start URL.
        :return: requests handled by ``parse_img_type_info``; the category
            name travels in ``meta["img_type_name"]``.
        """
        for a in response.xpath("//ul[@class='nav clearfix']/li/a"):
            img_type_url = a.xpath("@href").extract_first("")
            if not img_type_url:
                # An anchor without href cannot be requested.
                continue
            img_type_name = a.xpath("text()").extract_first("")
            yield scrapy.Request(
                # Resolve relative links against the page URL.
                url=response.urljoin(img_type_url),
                dont_filter=False,
                callback=self.parse_img_type_info,
                meta={"img_type_name": img_type_name},
            )

    def parse_img_type_info(self, response):
        """Yield one request per image set listed on a category page.

        :param response: response for a category page; its meta carries
            ``img_type_name``.
        :return: requests handled by ``parse_every_small_type_info``; meta
            carries both ``img_type_name`` and ``img_small_type_name``.
        """
        # Forward only the value we own instead of the whole response.meta,
        # which also holds Scrapy's per-request bookkeeping keys.
        img_type_name = response.meta.get("img_type_name")

        for div in response.xpath("//ul[@id='container']/li/div[2]"):
            img_small_type_href = div.xpath("a/@href").extract_first("")
            if not img_small_type_href:
                continue
            img_small_type_name = div.xpath("a/text()").extract_first("")
            yield scrapy.Request(
                url=response.urljoin(img_small_type_href),
                dont_filter=True,
                callback=self.parse_every_small_type_info,
                # A fresh dict per request: the second path level is added
                # next to the first one.
                meta={
                    "img_type_name": img_type_name,
                    "img_small_type_name": img_small_type_name,
                },
            )

        # Pagination is intentionally disabled (there are many pages, so only
        # the first page of each category is downloaded). To enable it,
        # re-enter this callback for the next page:
        #
        # href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").extract_first("")
        # if href:
        #     yield scrapy.Request(url=urljoin(response.url, href),
        #                          dont_filter=True,
        #                          callback=self.parse_img_type_info,
        #                          meta={"img_type_name": img_type_name})

    def parse_every_small_type_info(self, response):
        """Yield one item per image on the first page of an image set.

        :param response: response for an image-set page; its meta carries
            ``img_type_name`` and ``img_small_type_name``.
        :return: ``OffmymindspiderItem`` objects whose ``img_url`` is a
            one-element list holding the absolute image URL.
        """
        for p in response.xpath("//div[@class='content tag_blue']/p"):
            img_url = p.xpath("img/@src").extract_first("")
            if not img_url:
                # Paragraph without an image.
                continue
            item = OffmymindspiderItem()
            # Stored as a list because the pipeline iterates over it; made
            # absolute so it can be requested directly.
            item["img_url"] = [response.urljoin(img_url)]
            # Both names were placed in meta by the previous callbacks.
            item["img_small_type_name"] = response.meta.get("img_small_type_name")
            item["img_type_name"] = response.meta.get("img_type_name")
            yield item
items.py中的程式碼
import scrapy
class OffmymindspiderItem(scrapy.Item):
    """One downloaded image together with the two directory levels it is stored under."""

    # NOTE(review): not set by the spider shown alongside this file; kept for
    # compatibility with the original declaration.
    name = scrapy.Field()
    # List of image URLs to download for this item.
    img_url = scrapy.Field()
    # Storage path of the downloaded image, filled in by the images pipeline.
    img_path = scrapy.Field()
    # First path level: name of the top-level category.
    img_type_name = scrapy.Field()
    # Second path level: name of the image set inside the category.
    img_small_type_name = scrapy.Field()


# Backward-compatible alias for the name this class was originally declared under.
ZhanzhangsucaispiderItem = OffmymindspiderItem
settings.py中修改的內容
# Around line 22 of the generated settings.py: must be changed to False.
ROBOTSTXT_OBEY = False

# Around line 67 of the generated settings.py: enable the custom images pipeline.
ITEM_PIPELINES = {
    'OffMyMindSpider.pipelines.CustomImagesPipeline': 300,
}

# Root directory for downloaded images; the pipeline's relative file paths
# are created underneath it.
IMAGES_STORE = "imgs"
pipelines.py中的程式碼
import scrapy
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class CustomImagesPipeline(ImagesPipeline):
    """Images pipeline that stores each file under ``<img_type_name>/<img_small_type_name>/``."""

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item["img_url"]``.

        The item is attached to each request's meta so that ``file_path`` can
        read the two directory names from it.
        """
        # "or []" keeps a missing/None img_url from raising TypeError.
        for img_download_url in item.get("img_url") or []:
            yield scrapy.Request(url=img_download_url, meta={"item": item})

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to IMAGES_STORE) for ``request``.

        :param request: the image download request.
        :param item: the item being processed; newer Scrapy releases pass it
            as a keyword argument. When absent, the item attached to
            ``request.meta`` by ``get_media_requests`` is used.
        :return: ``"<img_type_name>/<img_small_type_name>/<file name>"``.
        """
        if item is None:
            item = request.meta["item"]
        # Name the file after the URL actually being downloaded, so an item
        # with several URLs does not write every image to the same path.
        file_name = request.url.split("/")[-1]
        return "%s/%s/%s" % (
            item["img_type_name"],
            item["img_small_type_name"],
            file_name,
        )

    def item_completed(self, results, item, info):
        """Record the stored path on the item, or drop it if nothing was downloaded.

        :param results: list of ``(success, detail)`` pairs, one per request;
            ``detail`` is a dict with a ``"path"`` key on success and a
            failure object otherwise.
        :raises DropItem: when no image of the item was downloaded.
        """
        # Only successful entries carry a dict; failed ones must be skipped
        # rather than indexed into.
        img_paths = [detail["path"] for ok, detail in results if ok]
        if not img_paths:
            raise DropItem("Image download failed, delete the corresponding item value, do not let it return out")
        # img_path stays a single string: the first successfully stored image.
        item["img_path"] = img_paths[0]
        return item
具體內容的註釋可以參考連結:https://mp.csdn.net/postedit/84668344