
Scrapy Spider and Its Subclasses


1.Passing spider arguments

  When running the crawl command, you can add -a to pass arguments to the spider:

scrapy crawl myspider -a category=electronics

  The spider receives the arguments in its constructor:

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def __init__(self, category=None, *args, **kwargs):
        super(MySpider, self).__init__(*args, **kwargs)
        self.start_urls = ['http://www.example.com/categories/%s' % category]
        # ...
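  Note that the default Spider constructor also copies any -a arguments onto the spider as attributes, so for simple cases you can read them directly instead of overriding __init__. A minimal sketch (the category name and URL pattern are just illustrative):

import scrapy

class MySpider(scrapy.Spider):
    name = 'myspider'

    def start_requests(self):
        # 'category' was passed on the command line:
        #   scrapy crawl myspider -a category=electronics
        # the base Spider.__init__ stores it as self.category
        category = getattr(self, 'category', 'default')
        yield scrapy.Request('http://www.example.com/categories/%s' % category)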

2.class scrapy.spider.Spider

  Commonly used attributes and methods (a short illustrative sketch follows this list):

       name

       allowed_domains

       start_urls

       custom_settings

       crawler

       settings

       from_crawler(crawler, *args, **kwargs)

       start_requests()

       make_requests_from_url(url)

       parse(response)

       log(message[, level, component])

       closed(reason)
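  The examples below only exercise a few of these, so here is a minimal, hedged sketch of how custom_settings, from_crawler/settings and closed fit together (the spider name and setting values are made up):

import scrapy

class SettingsAwareSpider(scrapy.Spider):
    name = 'settings_aware'
    start_urls = ['http://www.example.com']

    # custom_settings overrides project-wide settings for this spider only
    custom_settings = {'DOWNLOAD_DELAY': 2}

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        # from_crawler is where Scrapy binds the spider to its crawler;
        # after this call the crawler and settings attributes are available
        spider = super(SettingsAwareSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.log('USER_AGENT is %s' % spider.settings.get('USER_AGENT'))
        return spider

    def parse(self, response):
        self.log('Visited %s' % response.url)

    def closed(self, reason):
        # called when the spider finishes; reason is e.g. 'finished'
        self.log('Spider closed: %s' % reason)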

    Form request (these methods go inside a Spider subclass that starts by logging in):

def start_requests(self):
    return [scrapy.FormRequest("http://www.example.com/login",
                               formdata={'user': 'john', 'pass': 'secret'},
                               callback=self.logged_in)]

def logged_in(self, response):
    # here you would extract links to follow and return Requests for
    # each of them, with another callback
    pass
  A complete Spider example:

import scrapy

class MySpider(scrapy.Spider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = [
        'http://www.example.com/1.html',
        'http://www.example.com/2.html',
        'http://www.example.com/3.html',
    ]

    def parse(self, response):
        self.log('A response from %s just arrived!' % response.url)

3.class scrapy.contrib.spiders.CrawlSpider

  New attribute: rules

  New method: parse_start_url(response)

 class scrapy.contrib.spiders.Rule(link_extractor, callback=None, cb_kwargs=None, follow=None, process_links=None, process_request=None)

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class MySpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        # Extract links matching 'category.php' (but not 'subsection.php') and follow them
        # (no callback means follow defaults to True)
        Rule(LinkExtractor(allow=('category\.php', ), deny=('subsection\.php', ))),

        # Extract links matching 'item.php' and parse them with the spider's parse_item method
        Rule(LinkExtractor(allow=('item\.php', )), callback='parse_item'),
    )

    def parse_item(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        item = scrapy.Item()
        item['id'] = response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)')
        item['name'] = response.xpath('//td[@id="item_name"]/text()').extract()
        item['description'] = response.xpath('//td[@id="item_description"]/text()').extract()
        return item
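  The example above does not exercise parse_start_url or the extra Rule parameters (cb_kwargs, process_links). A hedged sketch of how they can be combined, where the drop_logout_links filter is hypothetical:

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor

class MyRuleSpider(CrawlSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    rules = (
        Rule(LinkExtractor(allow=('item\.php', )),
             callback='parse_item',
             cb_kwargs={'source': 'rule'},       # extra keyword arguments forwarded to the callback
             process_links='drop_logout_links'), # filter/rewrite links before they are followed
    )

    def drop_logout_links(self, links):
        # hypothetical filter: skip any extracted link whose URL mentions 'logout'
        return [link for link in links if 'logout' not in link.url]

    def parse_start_url(self, response):
        # called for the responses of the start_urls themselves
        self.log('Start URL fetched: %s' % response.url)
        return []

    def parse_item(self, response, source):
        self.log('Item page (%s): %s' % (source, response.url))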

4.XMLFeedSpider

class scrapy.contrib.spiders.XMLFeedSpider

from scrapy import log
from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem

class MySpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.xml']
    iterator = 'iternodes'  # This is actually unnecessary, since it's the default value
    itertag = 'item'

    def parse_node(self, response, node):
        log.msg('Hi, this is a <%s> node!: %s' % (self.itertag, ''.join(node.extract())))

        item = TestItem()
        item['id'] = node.xpath('@id').extract()
        item['name'] = node.xpath('name').extract()
        item['description'] = node.xpath('description').extract()
        return item
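  Besides itertag, XMLFeedSpider also supports the 'xml' and 'html' iterators and a namespaces attribute for feeds that use XML namespaces. A minimal sketch, assuming a sitemap-style feed (the namespace prefix and node names are illustrative):

from scrapy.contrib.spiders import XMLFeedSpider
from myproject.items import TestItem

class NamespacedFeedSpider(XMLFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/sitemap.xml']
    iterator = 'xml'  # full XML iterator instead of the default 'iternodes'
    # register the prefix so it can be used in itertag and in xpath queries
    namespaces = [('n', 'http://www.sitemaps.org/schemas/sitemap/0.9')]
    itertag = 'n:url'

    def parse_node(self, response, node):
        item = TestItem()
        item['id'] = node.xpath('n:loc/text()').extract()
        return item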

5.CSVFeedSpider

class scrapy.contrib.spiders.CSVFeedSpider

from scrapy import log
from scrapy.contrib.spiders import CSVFeedSpider
from myproject.items import TestItem

class MySpider(CSVFeedSpider):
    name = 'example.com'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/feed.csv']
    delimiter = ';'
    quotechar = "'"
    headers = ['id', 'name', 'description']

    def parse_row(self, response, row):
        log.msg('Hi, this is a row!: %r' % row)

        item = TestItem()
        item['id'] = row['id']
        item['name'] = row['name']
        item['description'] = row['description']
        return item

6.SitemapSpider

class scrapy.contrib.spiders.SitemapSpider

from scrapy.contrib.spiders import SitemapSpider

class MySpider(SitemapSpider):
    sitemap_urls = ['http://www.example.com/sitemap.xml']
    sitemap_rules = [
        ('/product/', 'parse_product'),
        ('/category/', 'parse_category'),
    ]

    def parse_product(self, response):
        pass # ... scrape product ...

    def parse_category(self, response):
        pass # ... scrape category ...
  SitemapSpider can also be combined with other sources of URLs:

import scrapy
from scrapy.contrib.spiders import SitemapSpider

class MySpider(SitemapSpider):
    sitemap_urls = ['http://www.example.com/robots.txt']
    sitemap_rules = [
        ('/shop/', 'parse_shop'),
    ]

    other_urls = ['http://www.example.com/about']

    def start_requests(self):
        requests = list(super(MySpider, self).start_requests())
        requests += [scrapy.Request(x, self.parse_other) for x in self.other_urls]
        return requests

    def parse_shop(self, response):
        pass # ... scrape shop here ...

    def parse_other(self, response):
        pass # ... scrape other here ...
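  SitemapSpider also exposes sitemap_follow (regexes used to choose which sitemaps to follow from a sitemap index) and sitemap_alternate_links. A minimal, hedged sketch (the index URL and pattern are illustrative):

from scrapy.contrib.spiders import SitemapSpider

class MySpider(SitemapSpider):
    sitemap_urls = ['http://www.example.com/sitemap_index.xml']
    # only follow sitemaps listed in the index whose URL matches '/products'
    sitemap_follow = ['/products']
    # also process alternate-language links declared in the sitemap
    sitemap_alternate_links = True
    sitemap_rules = [
        ('/product/', 'parse_product'),
    ]

    def parse_product(self, response):
        pass  # ... scrape product ...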
