爬取起點小說並存入資料庫
阿新 • 發佈:2018-12-11
最終效果如下:
······················主程式:·······································
# -*- coding: utf-8 -*- import scrapy import requests import json from qidian.items import QidianItem class MyqidianSpider(scrapy.Spider): name = 'myqidian' allowed_domains = ['qidian.com'] start_urls = ['http://www.qidian.com/all?chanId=21&orderId=&page=1&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0'] def parse(self, response): # print(response.text) bookList = response.xpath('//ul[@class="all-img-list cf"]/li') for i in bookList: bookId = i.xpath('./div[@class="book-img-box"]/a/@data-bid').extract()[0] bookUrl = 'http:'+ i.xpath('./div[@class="book-img-box"]/a/@href').extract()[0] yield scrapy.Request(bookUrl,callback=self.get_url,meta={"bookId":bookId})#把 url , bookId 傳到下一個方法 #構建翻頁 page = response.xpath('//@data-pagemax)').extract_first() page = int(page) for i in range(2, page + 1): url = "http://www.qidian.com/all?chanId=21&orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page={}".format( i) yield scrapy.Request(url, callback=self.parse) def get_url(self,response): meta = response.meta bookId = response.meta['bookId'] jsonurl = 'https://book.qidian.com/ajax/book/category?_csrfToken=OFmDKzipSh4trLG5YRG79dFXcFYAEZgV0cjNceDd&bookId=' + bookId bookName = response.xpath('//div[@class="book-info "]/h1/em/text()').extract()[0] writerName = response.xpath('//div[@class="book-info "]/h1/span/a/text()').extract()[0] xinxi = response.xpath('//div[@class="book-intro"]/p/text()').extract()[0].strip() meta = { "bookName" : bookName,"writerName" : writerName, "xinxi" : xinxi } yield scrapy.Request(jsonurl,callback=self.get_zhangjie,meta = meta) def get_zhangjie(self,response): meta = response.meta bookName = meta['bookName'] writerName = meta['writerName'] xinxi = meta['xinxi'] html = requests.get(response.url).content.decode('utf-8') data = json.loads(html)['data'] vs = data.get('vs') for i in vs: cs = i.get('cs') for i in cs: cN = i.get('cN') cU = i.get('cU') curl = 'https://read.qidian.com/chapter/'+cU uT = i.get('uT') cnt = i.get('cnt') meta = { 
"bookName": bookName, "writerName": writerName, "xinxi": xinxi, "cN" : cN, "curl" : curl,"uT" : uT,"cnt":cnt } yield scrapy.Request(curl,callback=self.Lett_text,meta = meta) def Lett_text(self,response): item = QidianItem() meta = response.meta item['bookName'] = meta['bookName'] item['writerName'] = meta['writerName'] item['xinxi'] = meta['xinxi'] item['cN'] = meta['cN'] item['curl'] = meta['curl'] item['uT'] = meta['uT'] item['cnt'] = meta['cnt'] textList = response.xpath('//div[@class="read-content j_readContent"]') for text in textList: text = text.xpath('//p/text()').extract()[1:] item['text'] = ''.join(text).strip().replace('\u3000','') yield item ··············item檔案:··························
import scrapy


class QidianItem(scrapy.Item):
    """One crawled chapter together with its book's metadata."""
    bookName = scrapy.Field()    # book title
    writerName = scrapy.Field()  # author name
    xinxi = scrapy.Field()       # book introduction / blurb
    cN = scrapy.Field()          # chapter name
    curl = scrapy.Field()        # chapter URL
    uT = scrapy.Field()          # chapter update time
    cnt = scrapy.Field()         # chapter word count
    text = scrapy.Field()        # chapter body text
················寫入資料庫 import pymysql class QidianPipeline(object): def __init__(self): self.conn = None self.cur = None def open_spider(self, spider): self.conn = pymysql.connect( host='127.0.0.1', port=3306, user='root', password='密碼', db='pydata201806', charset='utf8' ) self.cur = self.conn.cursor() def process_item(self, item, spider): # if not hasattr(item, 'table_name'): # return item cols, values = zip(*item.items()) sql = "INSERT INTO `%s` (%s) VALUES (%s)" % \ ( 'qidianbook', ','.join(cols), ','.join(['%s'] * len(values)) ) self.cur.execute(sql, values) self.conn.commit() print(self.cur._last_executed) return item def close_spider(self, spider): self.cur.close() self.conn.close()