scrapy 初體驗
阿新 • • 發佈:2018-12-17
scrapy 爬蟲
目標:把 gank.io 上的圖片爬下來
// 初始化專案
scrapy startproject demo
修改items物件
import scrapy
import os
import requests


class GankItem(scrapy.Item):
    """Item for one gank.io page: its title, image URL and page URL."""
    name = scrapy.Field()      # page title
    imageurl = scrapy.Field()  # URL of the image to download
    url = scrapy.Field()       # URL of the page the image came from

    def canParse(self):
        """Return True when both the title and the image URL were scraped.

        Uses ``.get()`` so a field that was never assigned yields None
        instead of raising KeyError (scrapy Items raise on unset keys).
        """
        return bool(self.get('name')) and bool(self.get('imageurl'))

    def downLoad(self, imagepath):
        """Download the image into *imagepath*, named after the page URL.

        The filename joins the last three URL path segments (e.g.
        ``2018-10-22``) with the image's extension; an already-existing
        file is never re-downloaded.
        """
        segments = self['url'].split("/")
        # e.g. https://gank.io/2018/10/22 -> "2018-10-22"
        filename = "-".join(segments[-3:]) if len(segments) > 3 else 'file'
        # extension from the image URL, defaulting to jpg
        parts = self['imageurl'].rsplit(".", 1)
        suffix = parts[1] if len(parts) == 2 else "jpg"
        path = os.path.join(imagepath, filename + "." + suffix)
        if not os.path.exists(path):
            print('下載檔案')
            # fetch before opening the file so a failed request
            # does not leave an empty file behind
            r = requests.get(self['imageurl'])
            with open(path, 'wb') as fp:
                fp.write(r.content)
修改 pipelines
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# directory the downloaded images are saved into
imags = "./images"


class GankPipeline(object):
    """Pipeline that saves each scraped item's image to disk."""

    def process_item(self, item, spider):
        """Download the item's image when it parsed cleanly.

        Returns the item so later pipelines keep receiving it — the
        original returned None, which would feed None down the chain.
        """
        if item.canParse():
            item.downLoad(imags)
        return item
新建 gank spider
import scrapy
# GankItem is defined in the project's items module, not in the
# spiders package (the original imported from demo.spiders.gank).
from demo.items import GankItem


class GankSpider(scrapy.Spider):
    """Spider that walks gank.io day pages and yields one image item each.

    NOTE: the original declared ``class GankSpider(scrapy.Spider, count=1)``;
    the stray ``count=1`` class keyword raises TypeError at class creation
    and has been removed.
    """
    name = "gank"
    allowed_domains = ["gank.io"]
    start_urls = ["https://gank.io/2018/10/22"]

    def parse(self, response):
        """Yield the page's image item, then follow the previous-day link."""
        item = GankItem()
        item['url'] = response.url
        item['name'] = response.xpath(
            '//div[@class="container content"]/h1/text()').extract()[0]
        item['imageurl'] = response.xpath(
            '//div[@class="container content"]'
            '/div[@class="outlink"]//p/img/@src').extract()[0]
        # yield (not return!) so the pagination request below is reachable —
        # the original returned here, making the rest of the method dead code
        yield item
        newcontent = response.xpath(
            '//div[@class="container content"]/div[@class="row"]'
            '/div[@class="six columns"]/p[@style="text-align: right"]'
            '/a/@href').extract_first()
        if newcontent:
            newurl = "https://gank.io" + newcontent
            print(newurl)
            yield scrapy.Request(newurl, callback=self.parse)
修改 settings,開啟 item pipeline
ITEM_PIPELINES = {
'demo.pipelines.GankPipeline': 300,
}
就跑起來了
scrapy crawl gank