1. 程式人生 > >爬蟲登入Django, csrf攔截, token驗證, scrapy模擬登入

爬蟲登入Django, csrf攔截, token驗證, scrapy模擬登入

思路:先在登入頁面獲取令牌,再傳送POST請求,並附帶上已獲取的令牌

參考:https://www.jianshu.com/p/d73e971da41c

import requests
from lxml import etree


# Log in to a Django site with requests: fetch the login page, read the CSRF
# token from its form, then POST the credentials together with that token.
# A single Session is used for both requests so the cookies set by the GET
# are sent back with the POST.

# The User-Agent is assembled from adjacent string literals. A backslash
# continuation *inside* the quotes would embed the next line's leading spaces
# in the header value ("AppleWe    bKit").
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    ),
    # Placeholder. NOTE(review): Django checks the Referer on HTTPS requests,
    # so this presumably needs to be the site URL - confirm for the target.
    'Referer': '',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
}
url = ''  # placeholder: the login page URL goes here
sss = requests.Session()
response = sss.get(url, headers=headers)

page = etree.HTML(response.text)
# Pick the token field by name rather than by position, so an unrelated
# <input> placed earlier in the page cannot be mistaken for the token.
tokens = page.xpath('//input[@name="csrfmiddlewaretoken"]/@value')
if not tokens:
    raise RuntimeError(
        'csrfmiddlewaretoken input not found in the login page'
    )
token = tokens[0]
data = {
    'csrfmiddlewaretoken': token,
    'username': 'zuolibing', 'password': 'zuolibing'}

r = sss.post(url, headers=headers, data=data)
print(r.text)

scrapy版本:

參考:https://www.jianshu.com/p/9d1e00dc40e4

from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest, Request


class LoginSpider(CrawlSpider):
    """Spider that requests a login page and submits its form."""

    name = 'login'
    allowed_domains = ['web']

    def start_requests(self):
        """Return the initial request for the login page.

        The response to that request is handed to ``parse_welcome``.
        """
        login_page = Request('url', callback=self.parse_welcome)
        return [login_page]

    def parse_welcome(self, response):
        """Build the login request from the page that was received.

        The response carries the form's hidden fields; ``from_response``
        takes the form from it and the username and password supplied
        here are added as form data before the request is sent.
        """
        credentials = {'username': 'zuolibing', 'password': 'zuolibing'}
        return FormRequest.from_response(response, formdata=credentials)

第三版:

import scrapy

# Build the login form from the hidden fields returned in the response and
# submit it.
class LoginSpider(scrapy.Spider):
    """Log in through the form on the start page, then read a heading."""

    name = 'login'
    start_urls = ['']

    def parse(self, response):
        """Submit the login form found in ``response``.

        The form is taken from the response and the credentials are added
        as form data; the result of the submission goes to ``parse_link``.
        """
        return scrapy.FormRequest.from_response(
            response,
            formdata={
                'username': 'zuolibing',
                'password': 'zuolibing'
            },
            callback=self.parse_link,
        )

    def parse_link(self, response):
        """Print the text of the first matching ``<h5>`` on the page.

        If nothing matches (for example because the login did not succeed
        and a different page came back), a warning is logged instead of
        raising ``IndexError``.
        """
        item = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()'
        ).extract_first()
        if item is None:
            self.logger.warning(
                'Expected element not found on %s', response.url
            )
            return
        print(item)

scrapy使用cookie登入:

# coding=utf-8
import scrapy


class LoginSpider(scrapy.Spider):
    """Request a page with pre-set session cookies instead of logging in."""

    name = 'login'
    start_urls = ['url']

    # Cookie values only: the ';' that separates pairs in a raw Cookie header
    # is not part of a value and must not be included here.
    cookies = {
        'csrftoken': 'V6uSztzBUbnGScC3ds8pbLqhdnsoc4Wj',
        'sessionid': '74zm6gfnevp24nf15174ei9uqa6d01jh',
    }

    def start_requests(self):
        """Yield a request for the first start URL carrying the cookies.

        No form data is submitted, so a plain ``Request`` is used.
        """
        yield scrapy.Request(
            url=self.start_urls[0],
            cookies=self.cookies,
            callback=self.parse_url,
        )

    def parse_url(self, response):
        """Print the text of the first matching ``<h5>`` on the page.

        If nothing matches (for example because the cookies were not
        accepted and a different page came back), a warning is logged
        instead of raising ``IndexError``.
        """
        element = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()'
        ).extract_first()
        if element is None:
            self.logger.warning(
                'Expected element not found on %s', response.url
            )
            return
        print(element)