Python Project Day 46: Downloading Images with Scrapy's ImagesPipeline
阿新 • Published: 2018-12-11
Project code
- The Item definition (myspiders/items.py):
```python
import scrapy
from scrapy.item import Item, Field


class ZiyuanItem(scrapy.Item):
    href = Field()           # link to the resource's detail page
    title = Field()
    date = Field()
    resourse_type = Field()
    img = Field()            # image URL scraped from the listing
    image_paths = Field()    # filled in by the images pipeline
```
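A Scrapy Item behaves like a dict whose keys are restricted to the declared fields, which is why the spider below can assign to `item['title']` and friends while a typo fails loudly. A quick sketch with made-up values:

```python
item = ZiyuanItem()
item['title'] = 'demo resource'
print(item['title'])   # demo resource

# Only declared fields are accepted; anything else raises KeyError:
# item['author'] = 'x'   # KeyError: ZiyuanItem does not support field: author
```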
- The spider (ziyuanSpider):
```python
import scrapy
from urllib import parse
import string
from myspiders.items import ZiyuanItem


class ZiyuanspiderSpider(scrapy.Spider):
    name = 'ziyuanSpider'
    allowed_domains = ['ziyuan.tv']
    start_urls = ["https://www.ziyuan.tv/yuanma/"]
    custom_settings = {
        "ITEM_PIPELINES": {
            # 'myspiders.pipelines.ResPipeline': 400,
            'myspiders.imagePipelines.ImagePipeline': 400,
            'myspiders.imagePipelines.GetImagePipeline': 500,
        }
    }

    def start_requests(self):
        url_s = "https://www.ziyuan.tv/"
        query = {
            "search": "資源",  # the search keyword ("resources")
        }
        for i in range(1, 3):
            url = url_s + "search" + '/' + query["search"] + '/page/' + str(i)
            # percent-encode the non-ASCII keyword, leave the rest of the URL alone
            url = parse.quote(url, safe=string.printable)
            print(url)
            # no form data is passed, so this FormRequest is an ordinary GET
            yield scrapy.FormRequest(url=url, callback=self.parse)

    def parse(self, response):
        print("parsing the result page")
        links = response.xpath('//div[@class="card-item"]')
        for link in links:
            item = ZiyuanItem()
            item['title'] = link.xpath('p/text()').extract()[0]
            item['href'] = link.xpath('div/a/@href').extract()[1]
            item['date'] = link.xpath('div[@class="cardpricebtn"]/text()').extract()
            item['resourse_type'] = link.xpath('.//a[@class="metacat"]/text()').extract()
            # xpath(), not the long-removed select()
            item['img'] = link.xpath('div/a/img/@data-original').extract()[0]
            print("item:", item)
            # each yielded item is handed to the pipelines configured above
            yield item
```
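The `parse.quote(url, safe=string.printable)` call is what keeps the URL usable: every printable ASCII character is declared safe, so the scheme and slashes survive untouched, while the Chinese keyword is UTF-8 percent-encoded. A standalone demonstration:

```python
from urllib import parse
import string

url = "https://www.ziyuan.tv/search/資源/page/1"
print(parse.quote(url, safe=string.printable))
# https://www.ziyuan.tv/search/%E8%B3%87%E6%BA%90/page/1
```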
- Image download settings (settings.py):
```python
# Root directory for downloaded images
IMAGES_STORE = r'D:\images'

# Skip images downloaded within the last 30 days
IMAGES_EXPIRES = 30

# Generate two thumbnail sizes for every image
IMAGES_THUMBS = {
    'big': (270, 270),
    'small': (80, 80),
}
```
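Given these settings and the `file_path()` override below, the layout on disk should look roughly like this (a sketch; the thumbnail names come from `ImagesPipeline`'s default `thumb_path()`, which hashes the image URL with SHA1):

```
D:\images\
├── full\
│   └── <first 5 characters of title>\
│       └── <image_guid>.jpg
└── thumbs\
    ├── big\<sha1 of image URL>.jpg
    └── small\<sha1 of image URL>.jpg
```

Note that `ImagesPipeline` (and `IMAGES_THUMBS` in particular) needs Pillow installed; the pipeline refuses to start without it.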
- The pipelines (myspiders/imagePipelines.py):
```python
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

# An earlier pipeline that dumped each href to JSON, kept for reference:
# import json
# class MyspidersPipeline(object):
#     def process_item(self, item, spider):
#         print("**********pipelines******")
#         print(1, item)
#         with open("href.json", 'w') as f:
#             json.dump({"href": item["href"]}, f)
#         return item
#
#     def open_spider(self, spider):
#         print('open--------------')
#
#     def close_spider(self, spider):
#         print('close--------------')
#
# json.dump(..., ensure_ascii=False)  # ensure_ascii=False keeps non-ASCII text readable


# Logs that the resource's basic information arrived and passes the item through.
class ImagePipeline(object):
    def process_item(self, item, spider):
        if spider.name == "ziyuanSpider":
            print("process_item ran")
        return item


class GetImagePipeline(ImagesPipeline):
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': '',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/52.0.2743.116 Safari/537.36',
    }

    def file_path(self, request, response=None, info=None):
        print("file_path ran")
        item = request.meta['item']
        folder = item['title'].strip()[0:5]   # first five characters of the title
        # (?:jpg|png) matches the actual extension; the original [jpg,png] was a
        # character class. Take the last match, which is the file name itself.
        image_guid = re.findall(r'/(\w*)\.(?:jpg|png)', request.url)[-1]
        filename = u'full/{0}/{1}'.format(folder, image_guid + ".jpg")
        print("filename", filename)
        return filename

    def get_media_requests(self, item, info):
        print("get_media_requests ran")
        referer = item['img']
        self.default_headers['referer'] = referer
        yield Request(item['img'],
                      headers=self.default_headers,
                      meta={'item': item, 'referer': referer})

    def item_completed(self, results, item, info):
        print("results:", results)
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
```
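`item_completed()` receives one `(success, info)` tuple per request yielded by `get_media_requests()`. Per the Scrapy documentation, `info` is a dict with `url`, `path`, and `checksum` keys on success and a Twisted `Failure` on error. A sketch with made-up values:

```python
results = [
    (True, {
        'url': 'https://www.ziyuan.tv/img/demo.jpg',   # hypothetical image URL
        'path': 'full/demo_/demo.jpg',                 # whatever file_path() returned
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',
    }),
    # (False, Failure(...)),   # one of these per failed download
]
print([info['path'] for ok, info in results if ok])    # ['full/demo_/demo.jpg']
```

With everything in place, start the crawl from the project root with `scrapy crawl ziyuanSpider`.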