scrapy爬取詩詞網
阿新 • 發佈:2018-12-10
入門scrapy不久,嘗試著寫一段程式碼爬取詩詞網. spider如下:
class ShiciSpider(scrapy.Spider):
    """Crawl Xin Qiji's poems from shicimingju.com.

    Listing pages (.../chaxun/zuozhe/44_<page>.html) contain links to
    individual poem pages; each poem page yields one MyspiderItem with
    the poem title and text.
    """

    name = 'shici'
    url = 'http://www.shicimingju.com/chaxun/zuozhe/44_'
    page = 1
    start_urls = [url + str(page) + '.html']

    def parse(self, response):
        """Dispatch on page type: listing page vs. poem detail page."""
        # Identify listing pages by their URL path instead of comparing
        # against an exact <title> string, which breaks as soon as the
        # site tweaks its wording or poem count.
        if '/chaxun/zuozhe/' in response.url:
            links = response.xpath('//h3/a/@href').extract()
            for href in links:
                # urljoin handles both relative and absolute hrefs.
                yield scrapy.Request(response.urljoin(href), callback=self.parse)
            # Only follow the next listing page while the current one
            # still has poem links; the original version incremented and
            # requested unconditionally, so pagination never terminated.
            if links:
                self.page += 1
                yield scrapy.Request(self.url + str(self.page) + '.html',
                                     callback=self.parse)
        else:
            # Poem detail page: extract title and body text.
            item = MyspiderItem()
            item['name'] = response.xpath(
                '//h1[@class="shici-title"]/text()').extract_first()
            content = response.xpath(
                'string(//div[@class="shici-content"])').extract_first()
            # extract_first() may return None on an unexpected layout;
            # guard instead of crashing on None.strip().
            item['info'] = content.strip() if content else None
            yield item
items如下
class MyspiderItem(scrapy.Item):
    """Container for one scraped poem."""
    # Poem title, filled by the spider from the <h1 class="shici-title"> node.
    name = scrapy.Field()
    # Poem text, filled from string(//div[@class="shici-content"]), stripped.
    info = scrapy.Field()
pipelines如下
import json
class MyspiderPipeline(object):
    """Write each scraped poem as one JSON object per line to 詩詞.json."""

    def __init__(self):
        # Explicit utf-8: the default locale encoding (e.g. cp936 on a
        # Chinese Windows, cp1252 elsewhere) can corrupt or fail on the
        # Chinese poem text being written.
        self.file = open('詩詞.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Serialize *item* as one JSON line; must return the item."""
        # ensure_ascii=False keeps the Chinese text readable in the file.
        jsontext = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(jsontext)
        return item

    def close_spider(self, spider):
        """Called once when the spider finishes; release the file handle."""
        self.file.close()