
Scrapy Framework: Request/Response

Request

Partial source code of Request:

# partial code
class Request(object_ref):

    def __init__(self, url, callback=None, method='GET', headers=None, body=None,
                 cookies=None, meta=None, encoding='utf-8', priority=0,
                 dont_filter=False, errback=None):

        self._encoding = encoding  # this one has to be set first
        self.method = str(method).upper()
        self._set_url(url)
        self._set_body(body)
        assert isinstance(priority, int), "Request priority not an integer: %r" % priority
        self.priority = priority

        assert callback or not errback, "Cannot use errback without a callback"
        self.callback = callback
        self.errback = errback

        self.cookies = cookies or {}
        self.headers = Headers(headers or {}, encoding=encoding)
        self.dont_filter = dont_filter

        self._meta = dict(meta) if meta else None

    @property
    def meta(self):
        if self._meta is None:
            self._meta = {}
        return self._meta

The most commonly used parameters:

url: the URL to request and then process in the next step.

callback: specifies which function handles the Response returned by this request.

method: usually does not need to be specified; it defaults to GET. It can be set to "GET", "POST", "PUT", etc., and the string must be uppercase.

headers: the headers sent with the request. Usually not needed. The content typically looks like this:
        # anyone who has written a crawler will recognize these
        Host: media.readthedocs.org
        User-Agent: Mozilla/5.0 (Windows NT 6.2; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0
        Accept: text/css,*/*;q=0.1
        Accept-Language: zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3
        Accept-Encoding: gzip, deflate
        Referer: http://scrapy-chs.readthedocs.org/zh_CN/0.24/
        Cookie: _ga=GA1.2.1612165614.1415584110;
        Connection: keep-alive
        If-Modified-Since: Mon, 25 Aug 2014 21:59:35 GMT
        Cache-Control: max-age=0

meta: commonly used; a dict used to pass data between different requests.

        request_with_cookies = Request(
            url="http://www.example.com",
            cookies={'currency': 'USD', 'country': 'UY'},
            meta={'dont_merge_cookies': True}
        )

encoding: the default 'utf-8' is fine.

dont_filter: indicates that this request should not be filtered out by the scheduler. Use it when you want to send the same request multiple times and ignore the duplicates filter. Defaults to False.

errback: specifies the error handling function.

Response

# partial code
class Response(object_ref):

    def __init__(self, url, status=200, headers=None, body='', flags=None, request=None):
        self.headers = Headers(headers or {})
        self.status = int(status)
        self._set_body(body)
        self._set_url(url)
        self.request = request
        self.flags = [] if flags is None else list(flags)

    @property
    def meta(self):
        try:
            return self.request.meta
        except AttributeError:
            raise AttributeError("Response.meta not available, this response "
                                 "is not tied to any request")

Most of the parameters are similar to those of Request:


status: the response status code
_set_body(body): the response body
_set_url(url): the response URL
self.request = request: the Request object that produced this response
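
As a quick illustration (a sketch, not part of the original text; the spider name and URL are assumptions), a callback typically reads these attributes directly from the response it receives:

import scrapy

class ResponseDemoSpider(scrapy.Spider):
    name = "response_demo"
    start_urls = ["http://www.example.com/"]

    def parse(self, response):
        # url, status and headers come straight from the Response object
        self.log("url: %s  status: %d" % (response.url, response.status))
        if response.status == 200:
            # response.meta is a shortcut for response.request.meta
            self.log("meta: %r" % response.meta)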

Sending POST requests

  • You can send a POST request with yield scrapy.FormRequest(url, formdata, callback); a short sketch of this usage follows the example below.

  • If you want POST requests to be sent as soon as the program starts, override the Spider class's start_requests(self) method; the URLs in start_urls will then no longer be requested.

class mySpider(scrapy.Spider):
    # start_urls = ["http://www.example.com/"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'

        # FormRequest is how Scrapy sends POST requests
        yield scrapy.FormRequest(
            url = url,
            formdata = {"email" : "[email protected]", "password" : "axxxxxxxe"},
            callback = self.parse_page
        )

    def parse_page(self, response):
        # do something
        pass
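
For the first bullet above (sending a POST from an ordinary callback in the middle of a crawl rather than at startup), a minimal sketch might look like the following; the search URL and form field names are illustrative assumptions, not part of the original article:

import scrapy

class SearchSpider(scrapy.Spider):
    name = "search_demo"
    start_urls = ["http://www.example.com/"]

    def parse(self, response):
        # submit a POST form from a regular callback
        yield scrapy.FormRequest(
            url="http://www.example.com/search",
            formdata={"keyword": "scrapy"},   # formdata values must be strings
            callback=self.parse_results
        )

    def parse_results(self, response):
        # the POST response is handled here
        self.log("got %d bytes" % len(response.body))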

Simulating a login

Use the FormRequest.from_response() method to simulate a user login.

Websites usually pre-populate certain form fields (such as session data or the authentication token on a login page) through <input type="hidden"> elements.

When scraping pages with Scrapy, if you want to pre-populate or override form fields such as the user name and password, you can do so with the FormRequest.from_response() method.

Here is a spider example using this approach:

import logging

import scrapy

class LoginSpider(scrapy.Spider):
    name = 'example.com'
    start_urls = ['http://www.example.com/users/login.php']

    def parse(self, response):
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'john', 'password': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # check login succeeded before going on
        if "authentication failed" in response.body:
            # logging.ERROR is used here in place of the old scrapy.log.ERROR level
            self.log("Login failed", level=logging.ERROR)
            return

        # continue scraping with authenticated session...

Zhihu spider example for reference:

zhihuSpider.py spider code

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from scrapy.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.linkextractors import LinkExtractor
from scrapy import Request, FormRequest
from zhihu.items import ZhihuItem

class ZhihuSipder(CrawlSpider):
    name = "zhihu"
    allowed_domains = ["www.zhihu.com"]
    start_urls = ["http://www.zhihu.com"]
    rules = (
        Rule(LinkExtractor(allow=('/question/\d+#.*?', )), callback='parse_page', follow=True),
        Rule(LinkExtractor(allow=('/question/\d+', )), callback='parse_page', follow=True),
    )
    headers = {
        "Accept": "*/*",
        "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "en-US,en;q=0.8,zh-TW;q=0.6,zh;q=0.4",
        "Connection": "keep-alive",
        "Content-Type": " application/x-www-form-urlencoded; charset=UTF-8",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36",
        "Referer": "http://www.zhihu.com/"
    }

    # override the spider's start method to issue a custom request;
    # the callback is invoked after the request succeeds
    def start_requests(self):
        return [Request("https://www.zhihu.com/login",
                        meta={'cookiejar': 1},
                        callback=self.post_login)]

    def post_login(self, response):
        print 'Preparing login'
        # extract the _xsrf field from the returned page; it is required for the form submission to succeed
        xsrf = Selector(response).xpath('//input[@name="_xsrf"]/@value').extract()[0]
        print xsrf
        # FormRequest.from_response is a helper provided by Scrapy for posting forms;
        # after a successful login, the after_login callback is invoked
        return [FormRequest.from_response(response,  # "http://www.zhihu.com/login",
                                          meta={'cookiejar': response.meta['cookiejar']},
                                          headers=self.headers,  # note the custom headers here
                                          formdata={
                                              '_xsrf': xsrf,
                                              'email': '[email protected]',
                                              'password': '123456'
                                          },
                                          callback=self.after_login,
                                          dont_filter=True
                                          )]

    def after_login(self, response):
        for url in self.start_urls:
            yield self.make_requests_from_url(url)

    def parse_page(self, response):
        problem = Selector(response)
        item = ZhihuItem()
        item['url'] = response.url
        item['name'] = problem.xpath('//span[@class="name"]/text()').extract()
        print item['name']
        item['title'] = problem.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
        item['description'] = problem.xpath('//div[@class="zm-editable-content"]/text()').extract()
        item['answer'] = problem.xpath('//div[@class=" zm-editable-content clearfix"]/text()').extract()
        return item

Item class definition

from scrapy.item import Item, Field

class ZhihuItem(Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    url = Field()          # URL of the scraped question
    title = Field()        # title of the question
    description = Field()  # description of the question
    answer = Field()       # answer to the question
    name = Field()         # name of the user

settings.py: set the crawl delay

BOT_NAME = 'zhihu'

SPIDER_MODULES = ['zhihu.spiders']
NEWSPIDER_MODULE = 'zhihu.spiders'
DOWNLOAD_DELAY = 0.25   # set the download delay to 250 ms