python-spider 第10題
阿新 • • 發佈:2020-08-16
# Minimal Flask backend that serves as the local practice target.
from flask import Flask
from flask import request

app = Flask(__name__)


@app.route('/data', methods=['GET', 'POST'])
def hello_world():
    """Log the incoming request's details and return a fixed JSON body."""
    if request.method == "GET":
        print(request)
    if request.method == 'POST':
        # Dump form fields and headers so the anti-scraping check can be studied.
        print(request.form.to_dict())
        print(request.headers)
        print(request.accept_charsets)
    return {'data': 'data'}


if __name__ == '__main__':
    app.run(debug=True)
<!doctype html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, user-scalable=no, initial-scale=1.0, maximum-scale=1.0, minimum-scale=1.0">
    <meta http-equiv="X-UA-Compatible" content="ie=edge">
    <title> 第十題---煙霧繚繞【難度:簡單】 </title>
    <script src="https://cdn.bootcdn.net/ajax/libs/jquery/3.4.1/jquery.min.js"></script>
</head>
<body>
<h1> 目標:採集100頁的全部數字,並計算所有資料加和。當然了,有一個並不太明顯的反爬手段存在 </h1>
<button id="id">按鈕</button>
</body>
<script type="text/javascript">
    var url = "http://127.0.0.1:5000/data";
    // POST a request for one page of data and log the result.
    call = function (num) {
        var list = {
            // BUG FIX: the original hard-coded String(1), so every call
            // fetched page 1 no matter what `num` was passed in.
            "page": String(num),
        };
        $.ajax({
            url: url,
            dataType: "json",
            async: true,
            data: list,
            type: "POST",
            beforeSend: function (request) {
                // Deliberately empty IIFE — it performs no work at all.
                (function () {
                })()
            },
            success: function (data) {
                datas = data.data;
                console.log(datas)
            }
        })
    };
    call(1);
</script>
</html>
copy 網頁的程式碼主要是想分析 beforeSend 這個回呼到底做了什麼,最後找了一圈也沒有發現有啥,再翻了下 js 基礎,發現是我想多了。
這個函式什麼東西都沒有幹
最後把請求頭替換掉成功過關 具體檢測的應該是請求頭中的某一個 ,想要知道具體檢測的是什麼就把每一個請求頭打上備註 看下少了哪個請求頭訪問會失敗就完事了~
這個貼一個正則替換headers 跟一個爬蟲老師學來的 具體是誰忘了。
(.*): (.*) 替換成 "$1": "$2",
最後貼上程式碼
"""Scrape all 100 pages of challenge 10 and sum the collected numbers.

The server rejects requests that lack a convincing browser header set, so the
full header block captured from Chrome is replayed, together with the Chrome
session cookies (loaded via browsercookie).
"""
import json
from typing import Dict, List

import browsercookie
import requests
from requests.cookies import RequestsCookieJar

# Shared HTTP session and a snapshot of the local Chrome cookie store.
session = requests.session()
chrome_cookie = browsercookie.chrome()
s = []
url = "http://www.python-spider.com/api/challenge10"
# url = 'http://127.0.0.1:5000/data'


def get_cookie_from_chrome(domain: str) -> List[Dict]:
    """Collect Chrome cookies whose domain contains *domain*.

    :param domain: the domain you want to get the cookies from.
    :return: a list of ``{'name': ..., 'value': ...}`` dicts for that domain.
    """
    matches = []
    for cookie in chrome_cookie:
        if domain in cookie.domain:
            matches.append({'name': cookie.name, 'value': cookie.value})
    return matches


def set_cookie(domain: str) -> RequestsCookieJar:
    """Build a cookie jar holding the Chrome cookies for *domain*.

    :param domain: the domain whose cookies should be loaded.
    :return: a RequestsCookieJar populated with the cookies of that domain.
    """
    cookie_jar = RequestsCookieJar()
    for cookie in get_cookie_from_chrome(domain=domain):
        cookie_jar.set(cookie['name'], cookie['value'], domain=domain)
    return cookie_jar


if __name__ == '__main__':
    # Header block captured from Chrome. NOTE: the original also pinned
    # "Content-Length": "6", which is wrong for pages >= 10 ("page=100" is
    # 8 bytes) — it is omitted here so requests computes the correct value
    # for every request body.
    header = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Dnt": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "Origin": "http://www.python-spider.com/api/challenge10",
        "Sec-Fetch-Site": "cross-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "http://www.python-spider.com/api/challenge10",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh,en;q=0.9,zh-CN;q=0.8",
    }
    cookie_jar = set_cookie('www.python-spider.com')
    for page in range(1, 101):
        response = session.post(
            url,
            headers=header,
            cookies=cookie_jar,
            # fixed: the original built a `data` dict and then ignored it,
            # constructing the same literal inline a second time.
            data={"page": str(page)},
        )
        print(response.text)
        rows = json.loads(response.text)['data']
        # Each row's 'value' carries a trailing '\r' that must be stripped
        # before the string can be parsed as an int.
        s.extend(row['value'].strip('\r') for row in rows)
    print(s)
    print(sum(int(v) for v in s))