爬蟲學習之-文件管道重寫
阿新 • • 發佈:2018-09-24
.org url The brush 學習 zoom 保存 raw files
如果要讓文件管道以原有的文件名保存下載的文件，需要重寫文件管道（FilesPipeline）的 file_path 方法。
pipelines 文件（pipelines.py）
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from urllib.parse import urlsplit

from scrapy.pipelines.files import FilesPipeline


class OveridePipeline(FilesPipeline):
    """Files pipeline that stores each download under its original file name."""

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path, relative to FILES_STORE, for ``request``.

        The name is the last segment of the URL path. When that segment has
        no ``.`` in it, ``.png`` is appended. Everything is stored under the
        ``pexels/`` sub-directory.

        ``item`` is accepted (keyword-only, optional) so the override matches
        the signature used by newer Scrapy releases; it is not used here.
        """
        # Use only the path component so a query string or fragment never
        # leaks into the file name (and a "/" inside a query string cannot
        # be mistaken for a path separator).
        file_name = urlsplit(request.url).path.split('/')[-1]
        if "." not in file_name:
            file_name = file_name + '.png'
        return "pexels/" + file_name


class ImagesPipeline(object):
    """Item pipeline that strips query strings from ``item['file_urls']``."""

    def process_item(self, item, spider):
        """Replace every URL in ``file_urls`` with its part before ``?``.

        The same clean-up applies to ``image_urls`` if the project is
        switched to Scrapy's images pipeline.  Returns the (mutated) item.
        """
        # str.split returns the whole string when "?" is absent, so no
        # separate branch is needed for URLs without a query string.
        item['file_urls'] = [url.split("?")[0] for url in item['file_urls']]
        print("下載圖片:", item['file_urls'])
        return item
settings 配置（settings.py）
# Enabled item pipelines, keyed by import path; the integer is the order.
# Scrapy runs lower values first, so ImagesPipeline (1) cleans the URLs in
# file_urls before OveridePipeline (3) downloads them.
ITEM_PIPELINES = {
    # 'scrapy.pipelines.images.ImagesPipeline': 2,
    # 'scrapy.pipelines.files.FilesPipeline': 3,
    'images.pipelines.OveridePipeline': 3,
    'images.pipelines.ImagesPipeline': 1,
}

# Root directory under which the files pipeline stores downloads.
FILES_STORE = 'd:/crawl'
spider文件
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ImagesItem
from scrapy.pipelines.images import ImagesPipeline
from scrapy.pipelines.files import FilesPipeline
from scrapy.pipelines.media import MediaPipeline


class PexSpider(CrawlSpider):
    """Crawl pexels.com photo pages and collect the main image URL of each."""

    name = 'pex'
    allowed_domains = ['www.pexels.com']
    start_urls = ['https://www.pexels.com/photo/vehicle-on-road-along-green-grass-during-night-714023/']

    # Follow every link whose URL contains /photo/ and hand each matching
    # page to parse_item.
    rules = (
        Rule(LinkExtractor(allow=r'/photo/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Build an ImagesItem whose ``file_urls`` holds the photo's ``src``.

        The XPath selects ``<img>`` elements whose class attribute is exactly
        ``image-section__image js-photo-zoom``.  To use the images pipeline
        instead, store the same list under ``image_urls``.
        """
        item = ImagesItem()
        item['file_urls'] = response.xpath(
            "//img[@class='image-section__image js-photo-zoom']/@src"
        ).extract()
        return item
items 文件（items.py）
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class ImagesItem(scrapy.Item):
    """Item carrying the URLs to download through the files pipeline."""

    # For Scrapy's images pipeline the equivalent pair of fields would be
    # ``image_urls`` and ``images``.

    # URLs the files pipeline should download.
    file_urls = scrapy.Field()
    # Field conventionally filled in by the files pipeline with the
    # download results.
    files = scrapy.Field()
爬蟲學習之-文件管道重寫