
Scraping the novel Daomu Biji (盜墓筆記) with Scrapy


# -*- coding: utf-8 -*-
import scrapy
import requests
from daomu.items import DaomuItem
from pyquery import PyQuery as pq

class DaomuspiderSpider(scrapy.Spider):
    name = "daomuspider"
    # allowed_domains = ["www.daomubiji.com"]
    start_urls = ['http://www.daomubiji.com/']
    index_url = 'http://www.daomubiji.com/'

    def start_requests(self):
        yield scrapy.Request(url=self.index_url, callback=self.parse_book)

    def parse_book(self, response):
        for url in response.css('.article-content a'):
            book_url = url.css('a::attr(href)').extract_first()
            yield scrapy.Request(url=book_url, callback=self.parse_chapter)

    def parse_chapter(self, response):
        item = DaomuItem()
        book_title = response.css('.focusbox .container h1::text').extract_first()
        book_info = response.css('.focusbox .container .focusbox-text::text').extract_first()
        book_url = response.url
        for chapter in response.css('.excerpts-wrapper .excerpts .excerpt'):
            chapter_title = chapter.css('a::text').extract_first().split(' ')[1] + ':' + chapter.css('a::text').extract_first().split(' ')[-1]
            chapter_url = chapter.css('a::attr(href)').extract_first()
            content = self.parse_detail(chapter_url)
            item['book_title'] = book_title
            item['book_info'] = book_info
            item['book_url'] = book_url
            item['chapter_title'] = chapter_title
            item['chapter_url'] = chapter_url
            item['content'] = content
            yield item

    def parse_detail(self, url):
        response = requests.get(url)
        doc = pq(response.text)
        content = doc('.article-content p').text()
        return content
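
The spider fills a DaomuItem with six fields, but the items module itself is not shown in the post. A minimal sketch of what daomu/items.py would need to contain, with the field names inferred from the assignments above:

# daomu/items.py -- minimal sketch; field names inferred from the spider above
import scrapy

class DaomuItem(scrapy.Item):
    book_title = scrapy.Field()
    book_info = scrapy.Field()
    book_url = scrapy.Field()
    chapter_title = scrapy.Field()
    chapter_url = scrapy.Field()
    content = scrapy.Field()

One design note: parse_detail fetches each chapter body with a blocking requests.get call instead of yielding a scrapy.Request, which is simpler to write but stalls Scrapy's asynchronous downloader while each chapter page loads.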
import pymongo

class DaomuPipeline(object):

    def __init__(self):
        self.mongo_uri = 'localhost'
        self.mongo_db = 'daomu'

    # @classmethod
    # def from_crawler(cls, crawler):
    #     return cls(
    #         mongo_uri = crawler.settings.get('MONGO_URI'),
    #         mongo_db = crawler.settings.get('MONGO_DB')
    #     )

    def open_spider(self,spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))  # note: the item must be converted to a dict here
        return item

    def close_spider(self, spider):
        self.client.close()
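
The pipeline only runs if it is registered in the project's settings.py. A minimal sketch, assuming the project is laid out as daomu/pipelines.py per the imports above; the priority value 300 is an arbitrary but conventional choice, and the MONGO_URI/MONGO_DB keys are only read if the commented-out from_crawler classmethod is restored:

# settings.py -- a sketch; the pipeline path assumes the layout daomu/pipelines.py
ITEM_PIPELINES = {
    'daomu.pipelines.DaomuPipeline': 300,
}

# Only needed if the from_crawler classmethod is uncommented:
MONGO_URI = 'localhost'
MONGO_DB = 'daomu'

With this in place, the crawl is started from the project root with: scrapy crawl daomuspider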
