爬蟲登入Django, scrf攔截, token驗證, scrapy模擬登入
阿新 • • 發佈:2018-12-22
思路:在登入頁面獲取令牌,傳送發帖,附帶上已獲取的令牌
參考:https://www.jianshu.com/p/d73e971da41c
# Log in to a Django site with requests: GET the login page to obtain the
# CSRF token, then POST the credentials together with that token, reusing
# the same Session so the csrftoken/sessionid cookies carry over.
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/66.0.3359.181 Safari/537.36',
    'Referer': '',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
}
url = ''  # TODO: fill in the login-page URL

# A Session keeps cookies between the GET and the POST — required, because
# Django validates the posted token against the csrftoken cookie.
sss = requests.Session()
response = sss.get(url, headers=headers)
page = etree.HTML(response.text)
# Fix: select the hidden CSRF field by name rather than taking the value of
# the first <input> on the page, which breaks if the form layout changes.
tokens = page.xpath('//input[@name="csrfmiddlewaretoken"]/@value')
token = tokens[0]
data = {
    'csrfmiddlewaretoken': token,
    'username': 'zuolibing',
    'password': 'zuolibing',
}
r = sss.post(url, headers=headers, data=data)
print(r.text)
scrapy版本:
參考:https://www.jianshu.com/p/9d1e00dc40e4
# Scrapy login, version 1: fetch the login page, then let
# FormRequest.from_response copy every hidden form field (including the
# CSRF token) and submit the credentials on top of them.
from scrapy.spiders import CrawlSpider
from scrapy.http import FormRequest, Request


class LoginSpider(CrawlSpider):
    name = 'login'
    allowed_domains = ['web']

    def start_requests(self):
        """Kick off the crawl by requesting the login page."""
        return [Request('url', callback=self.parse_welcome)]

    def parse_welcome(self, response):
        """Build and return the login form submission.

        The response contains all of the form's hidden fields;
        from_response copies them and merges in the username/password
        before posting the form back.
        """
        credentials = {'username': 'zuolibing', 'password': 'zuolibing'}
        return FormRequest.from_response(response, formdata=credentials)
第三版:
import scrapy


# Scrapy login, version 2: a plain Spider. The framework fetches start_urls,
# parse() submits the login form (hidden fields are copied automatically
# from the page), and parse_link runs on the post-login page.
class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['']  # TODO: fill in the login-page URL

    def parse(self, response):
        """Submit the login form built from the login page's response."""
        return scrapy.FormRequest.from_response(
            response,
            formdata={'username': 'zuolibing', 'password': 'zuolibing'},
            callback=self.parse_link,
        )

    def parse_link(self, response):
        """Print a marker element from the post-login page."""
        # Fix: extract_first() returns None when nothing matches, whereas
        # the original extract()[0] raised IndexError on an empty result.
        item = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()'
        ).extract_first()
        print(item)
scrapy使用cookie登入:
# coding=utf-8
# Scrapy login, version 3: skip the login form entirely and reuse an
# already-authenticated Django session by sending its cookies directly.
import scrapy


class LoginSpider(scrapy.Spider):
    name = 'login'
    start_urls = ['url']  # TODO: fill in the target (post-login) URL
    # Cookies copied from a logged-in browser session.
    # Fix: the csrftoken value previously ended with a stray ';' pasted in
    # from the raw Cookie header, which would have sent a corrupted token.
    cookies = {
        'csrftoken': 'V6uSztzBUbnGScC3ds8pbLqhdnsoc4Wj',
        'sessionid': '74zm6gfnevp24nf15174ei9uqa6d01jh',
    }

    def start_requests(self):
        """Request the target page directly, authenticated via cookies."""
        yield scrapy.FormRequest(
            url=self.start_urls[0],
            cookies=self.cookies,
            callback=self.parse_url,
        )

    def parse_url(self, response):
        """Print a marker element showing the cookie login worked."""
        # extract_first() avoids an IndexError when the element is absent.
        element = response.xpath(
            '//*[@class="tab-pane fade in active"]/h5/text()'
        ).extract_first()
        print(element)