
Cookies and Proxy Operations in the Scrapy Framework

1. Sending POST requests with Scrapy

The Scrapy framework sends GET requests by default. From the source:

The start_requests method of the scrapy.Spider parent class:
    def start_requests(self):
        cls = self.__class__
        if method_is_overridden(cls, Spider, 'make_requests_from_url'):
            warnings.warn(
                "Spider.make_requests_from_url method is deprecated; it "
                "won't be called in future Scrapy releases. Please "
                "override Spider.start_requests method instead (see %s.%s)." % (
                    cls.__module__, cls.__name__
                ),
            )
            for url in self.start_urls:
                yield self.make_requests_from_url(url)
        else:
            for url in self.start_urls:
                yield Request(url, dont_filter=True)

So, to send a POST request, we need to override the parent class's start_requests method in our spider file.

See the code:

Example: sending a POST request to Baidu Translate from a spider

import scrapy


class PosttestSpider(scrapy.Spider):
    name = 'postTest'
    # allowed_domains = ['www.qiubai.com']
    start_urls = ['http://www.qiubai.com/']

    def start_requests(self):
        url = 'https://fanyi.baidu.com/sug'
        data = {
            "kw": "hello"
        }
        # FormRequest submits the data as a POST form
        yield scrapy.FormRequest(url=url, formdata=data, callback=self.parse)

    def parse(self, response):
        print(response.text)
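FormRequest sends the payload form-encoded. If an endpoint expects a raw body instead (JSON, for example), plain scrapy.Request works too, by setting method and body explicitly. A minimal sketch; the httpbin URL is just a stand-in endpoint for illustration:

import json

import scrapy


class JsonPostSpider(scrapy.Spider):
    name = 'jsonPost'

    def start_requests(self):
        # stand-in endpoint that echoes back whatever is posted to it
        url = 'https://httpbin.org/post'
        payload = {"kw": "hello"}
        yield scrapy.Request(
            url=url,
            method='POST',
            body=json.dumps(payload),
            headers={'Content-Type': 'application/json'},
            callback=self.parse,
        )

    def parse(self, response):
        print(response.text)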

2. Cookies

Requests sent by the Scrapy framework retain cookies by default.
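This behavior is controlled in settings.py; the two standard Scrapy settings involved are shown below (COOKIES_ENABLED already defaults to True):

# settings.py
COOKIES_ENABLED = True   # the cookies middleware stores and resends session cookies (default)
COOKIES_DEBUG = True     # log every Cookie / Set-Cookie header, handy when debugging logins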

Example: logging in to Douban Movie with a spider and fetching the personal homepage

# -*- coding: utf-8 -*-
import scrapy
from cookieDemo.utils.YMD import YDMHttp


class DoubanSpider(scrapy.Spider):
    name = 'douban'
    # allowed_domains = ['www.douban.com']
    start_urls = ['https://www.douban.com/']

    def parse(self, response):
        img_code_url = response.xpath("//*[@id='captcha_image']/@src").extract_first()
        captcha_id = response.xpath("//*[@id='lzform']/fieldset/div[3]/div/input[2]/@value").extract_first()
        print(img_code_url)
        print(captcha_id)
        yield scrapy.Request(url=img_code_url, callback=self.parse_img, meta={"captcha_id": captcha_id})  # pass captcha_id to the next callback via meta

    def parse_img(self, response):
        with open("code.png", "wb") as f:
            f.write(response.body)

        # send the login request
        login_url = "https://www.douban.com/accounts/login"
        img_code = self.get_code("code.png")
        captcha_id = response.meta.get("captcha_id")  # read the value passed along via request meta
        data = {
            'redir': 'https://movie.douban.com/',
            "source": "movie",
            "form_email": "[email protected]",
            "form_password": "goulonghui371379.",
            "captcha-solution": img_code,
            "captcha-id": captcha_id,
            'login': '登入',
        }
        yield scrapy.FormRequest(url=login_url, formdata=data, callback=self.parse_login)

    def parse_login(self, response):
        people_url = "https://www.douban.com/people/186597252/"
        yield scrapy.Request(url=people_url, callback=self.get_people_page)

    def get_people_page(self, response):
        with open("people.html", "w", encoding="utf-8") as f:
            f.write(response.text)
            print("over...............................")

    def get_code(self, img_path):
        # username
        username = 'EksYiQiang'
        # password
        password = 'xyq19990113'

        # software ID, a required parameter for the developer revenue share; found under "My Software" in the developer console
        appid = 6041

        # software key, a required parameter for the developer revenue share; found under "My Software" in the developer console
        appkey = 'c9f0265f96d9e97118aeb8eff629da64'

        # image file
        filename = img_path

        # captcha type; e.g. 1004 means 4 alphanumeric characters. Different types are billed differently, so fill this in accurately or recognition accuracy suffers. Full list: http://www.yundama.com/price.html
        codetype = 3000

        # timeout, in seconds
        timeout = 60

        # sanity check
        if username == 'username':
            print('Please configure the parameters before testing')
            return
        else:
            # initialize the client
            yundama = YDMHttp(username, password, appid, appkey)

            # log in to the Yundama captcha-solving platform
            uid = yundama.login()
            print('uid: %s' % uid)

            # check the account balance
            balance = yundama.balance()
            print('balance: %s' % balance)

            # start recognition: (image path, captcha type ID, timeout in seconds) -> (cid, result)
            cid, result = yundama.decode(filename, codetype, timeout)
            print('cid: %s, result: %s' % (cid, result))
        return result
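In the spider above, captcha_id travels from parse to parse_img through request meta. Since Scrapy 1.7, cb_kwargs does the same hand-off and delivers the value as a real callback argument; a minimal sketch of just those two callbacks, everything else unchanged:

    def parse(self, response):
        img_code_url = response.xpath("//*[@id='captcha_image']/@src").extract_first()
        captcha_id = response.xpath("//*[@id='lzform']/fieldset/div[3]/div/input[2]/@value").extract_first()
        yield scrapy.Request(
            url=img_code_url,
            callback=self.parse_img,
            cb_kwargs={"captcha_id": captcha_id},  # delivered as a keyword argument
        )

    def parse_img(self, response, captcha_id):
        # captcha_id arrives as a parameter instead of via response.meta
        with open("code.png", "wb") as f:
            f.write(response.body)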

3. Proxy operations

Example: searching for "ip" on Baidu shows the machine's public IP, which makes it easy to verify that the proxy is applied

import scrapy


class IpdemoSpider(scrapy.Spider):
    name = 'IPdemo'
    allowed_domains = ['www.baidu.com']
    start_urls = ['https://www.baidu.com/s?wd=ip']

    def parse(self, response):
        with open("ip.html", "w", encoding="utf-8") as f:
            f.write(response.text)
            print("over")

Downloader middleware (in middlewares.py):

class MyProxyMiddleware(object):

    def process_request(self, request, spider):
        # request is the request object intercepted by the middleware
        # this method can intercept every outgoing request,
        # e.g. to spoof the request's User-Agent
        # or to tamper with the request's URL
        request.meta["proxy"] = "https://151.106.15.12:1080"

settings.py:

DOWNLOADER_MIDDLEWARES = {
   # 'ipDemo.middlewares.IpdemoDownloaderMiddleware': 543,
   'ipDemo.middlewares.MyProxyMiddleware': 543,
}
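The value 543 is the middleware's priority: for downloader middlewares, process_request hooks run in ascending order of this number, and setting a middleware's value to None disables it entirely.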