1. 程式人生 > scrapy爬取詩詞網

scrapy爬取詩詞網

入門scrapy不久,嘗試著寫一段程式碼爬取詩詞網. spider如下:

class ShiciSpider(scrapy.Spider):
    """Crawl Xin Qiji's poems from shicimingju.com.

    Starts at the author's listing page, follows every poem link, and
    paginates through the listing by incrementing ``page``.
    """
    name = 'shici'
    url = 'http://www.shicimingju.com/chaxun/zuozhe/44_'
    page = 1
    start_urls = [url + str(page) + '.html']

    def parse(self, response):
        # Distinguish the listing page from a poem detail page by the
        # exact <title> text of the listing.
        judge = response.xpath('//head/title/text()').extract_first()
        if judge == '辛棄疾的詩詞全集、詩集(816首全)_詩詞名句網':
            # Listing page: follow every poem link on it ...
            for each in response.xpath('//h3/a/@href').extract():
                html = 'http://www.shicimingju.com' + each
                yield scrapy.Request(html, callback=self.parse)
            # ... then queue the next listing page.  NOTE(review): this
            # never stops incrementing; relies on the duplicate filter /
            # 404s to end the crawl — confirm intended.
            self.page += 1
            yield scrapy.Request(self.url + str(self.page) + '.html',
                                 callback=self.parse)
        else:
            # Detail page: extract the poem title and body text.
            item = MyspiderItem()
            item['name'] = response.xpath(
                '//h1[@class="shici-title"]/text()').extract_first()
            # Guard against a missing content div: extract_first() may
            # return None, and None.strip() would raise AttributeError.
            info = response.xpath(
                'string(//div[@class="shici-content"])').extract_first()
            item['info'] = info.strip() if info is not None else info
            yield item

items如下

class MyspiderItem(scrapy.Item):
    """Container for one scraped poem."""
    # Poem title (taken from the <h1 class="shici-title"> element).
    name = scrapy.Field()
    # Poem body text (stripped string content of the shici-content div).
    info = scrapy.Field()

pipelines如下

import json

class MyspiderPipeline(object):
    """Pipeline that appends each scraped item to a JSON-lines file."""

    def __init__(self):
        # Open with an explicit UTF-8 encoding: process_item dumps with
        # ensure_ascii=False, so non-ASCII characters are written verbatim
        # and the platform default encoding (e.g. cp1252 on Windows) would
        # raise UnicodeEncodeError on the Chinese text.
        self.file = open('詩詞.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line and pass the item through."""
        jsontext = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(jsontext)
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()