1. 程式人生 > >百度翻譯爬蟲-Web版(自動生成sign)

百度翻譯爬蟲-Web版(自動生成sign)

ads ner bstr contex token open 發送請求 run mozilla

# Object-oriented implementation
# Baidu Translate -- web version (token and sign are obtained automatically)
import json
import re

import js2py
import requests
 7 
 8 
 9 class WebFanyi:
10     """百度翻譯網頁版爬蟲"""
11     def __init__(self,query_str):
12         self.session = requests.session()
13         headers = {
14             "User-Agent
": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36", 15 } 16 self.session.headers = headers 17 self.baidu_url = "https://www.baidu.com/" 18 self.root_url = "https://fanyi.baidu.com/" 19 self.lang_url = "
https://fanyi.baidu.com/langdetect" 20 self.trans_url = "https://fanyi.baidu.com/v2transapi" 21 self.query_str = query_str 22 23 def get_token_gtk(self): 24 ‘‘‘獲取token和gtk(用於合成Sign)‘‘‘ 25 self.session.get(self.root_url) 26 resp = self.session.get(self.root_url)
27 html_str = resp.content.decode() 28 token = re.findall(r"token: ‘(.*?)‘", html_str)[0] 29 gtk = re.findall(r"window.gtk = ‘(.*?)‘", html_str)[0] 30 return token,gtk 31 32 def generate_sign(self,gtk): 33 """生成sign""" 34 # 1. 準備js編譯環境 35 context = js2py.EvalJs() 36 with open(webtrans.js, encoding=utf8) as f: 37 js_data = f.read() 38 js_data = re.sub("window\[l\]","+gtk+",js_data) 39 # js_data = re.sub("window\[l\]", "\"{}\"".format(gtk), js_data) 40 # print(js_data) 41 context.execute(js_data) 42 sign = context.e(self.query_str) 43 return sign 44 45 def lang_detect(self): 46 ‘‘‘獲取語言轉換類型.eg: zh-->en‘‘‘ 47 lang_resp = self.session.post(self.lang_url,data={"query":self.query_str}) 48 lang_json_str = lang_resp.content.decode() # {"error":0,"msg":"success","lan":"zh"} 49 lan = json.loads(lang_json_str)[lan] 50 to = "en" if lan == "zh" else "zh" 51 return lan,to 52 53 54 def parse_url(self,post_data): 55 trans_resp = self.session.post(self.trans_url,data=post_data) 56 trans_json_str = trans_resp.content.decode() 57 trans_json = json.loads(trans_json_str) 58 result = trans_json["trans_result"]["data"][0]["dst"] 59 print("{}: {}".format(self.query_str,result)) 60 61 def run(self): 62 """實現邏輯""" 63 # 1.獲取百度的cookie,(缺乏百度首頁的cookie會始終報錯998) 64 self.session.get(self.baidu_url) 65 # 2. 獲取百度翻譯的token和gtk(用於合成sign) 66 token, gtk = self.get_token_gtk() 67 # 3. 生成sign 68 sign = self.generate_sign(gtk) 69 # 4. 獲取語言轉換類型.eg: zh-->en 70 lan, to = self.lang_detect() 71 # 5. 發送請求,獲取響應,輸出結果 72 post_data = { 73 "from": lan, 74 "to": to, 75 "query": self.query_str, 76 "transtype": "realtime", 77 "simple_means_flag": 3, 78 "sign": sign, 79 "token": token 80 } 81 self.parse_url(post_data) 82 83 if __name__ == __main__: 84 webfanyi = WebFanyi(lover) 85 webfanyi.run()

上述代碼中用於生成sign的 webtrans.js 文件具體代碼如下(可以自己抓包,在js中打斷點獲取):

 1 // webtrans.js
 2 
 3 function n(r, o) {
 4     for (var t = 0; t < o.length - 2; t += 3) {
 5         var a = o.charAt(t + 2);
 6         a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
 7         a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
 8         r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
 9     }
10     return r
11 }
/**
 * Compute the `sign` string for the query text `r`.
 *
 * Steps:
 *   1. If the text is longer than 30 symbols it is shortened to its first
 *      10, middle 10 and last 10 symbols. When the text contains surrogate
 *      pairs, each pair counts as one symbol.
 *   2. The gtk seed ("<m>.<s>") is split into its two numeric halves.
 *   3. The text is encoded as UTF-8 bytes; every byte is added to an
 *      accumulator that starts at `m` and is scrambled with `n`.
 *   4. The accumulator is scrambled once more, XORed with `s`, made
 *      non-negative, reduced modulo 1e6 and returned as
 *      "<p>.<p XOR m>".
 *
 * The seed lookup below only exists in a browser. The Python caller replaces
 * that expression textually with the quoted gtk value before running this
 * file, so the expression must be kept exactly as written.
 *
 * Written in ES5 (`var` only), as the rest of this file is.
 *
 * @param {string} r Text to sign.
 * @returns {string} The sign.
 */
function e(r) {
    var pairPattern = /[\uD800-\uDBFF][\uDC00-\uDFFF]/;
    var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);

    if (null === pairs) {
        // No surrogate pairs: plain UTF-16 code units are the symbols.
        var len = r.length;
        if (len > 30) {
            r = "" + r.substr(0, 10) + r.substr(Math.floor(len / 2) - 5, 10) + r.substr(-10, 10);
        }
    } else {
        // Rebuild the text as a list of symbols, keeping each surrogate
        // pair together as one entry.
        var segments = r.split(pairPattern);
        var symbols = [];
        for (var k = 0, count = segments.length; k < count; k++) {
            if ("" !== segments[k]) {
                // The original wrapped this array in a helper `a(...)` that
                // is not defined in this file; the array is used directly.
                symbols.push.apply(symbols, segments[k].split(""));
            }
            if (k !== count - 1) {
                symbols.push(pairs[k]);
            }
        }
        var total = symbols.length;
        if (total > 30) {
            var mid = Math.floor(total / 2);
            r = symbols.slice(0, 10).join("") + symbols.slice(mid - 5, mid + 5).join("") + symbols.slice(-10).join("");
        }
    }

    // gtk seed; `l` stands for the string "gtk" (see the note above).
    var seed = window[l] || "";
    var seedParts = seed.split(".");
    var m = Number(seedParts[0]) || 0;
    var s = Number(seedParts[1]) || 0;

    // Encode the (possibly shortened) text as UTF-8 bytes.
    var bytes = [];
    var c = 0;
    for (var v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        if (A < 128) {
            bytes[c++] = A;
        } else {
            if (A < 2048) {
                bytes[c++] = A >> 6 | 192;
            } else {
                if (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1))) {
                    // Surrogate pair: combine into one code point (4 bytes).
                    A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v));
                    bytes[c++] = A >> 18 | 240;
                    bytes[c++] = A >> 12 & 63 | 128;
                } else {
                    bytes[c++] = A >> 12 | 224;
                }
                bytes[c++] = A >> 6 & 63 | 128;
            }
            bytes[c++] = 63 & A | 128;
        }
    }

    // Fold every byte into the accumulator, scrambling after each one.
    var p = m;
    for (var b = 0; b < bytes.length; b++) {
        p += bytes[b];
        p = n(p, "+-a^+6");
    }
    p = n(p, "+-3^+b+-f");
    p ^= s;
    if (p < 0) {
        // Reinterpret the negative 32-bit value as unsigned.
        p = (2147483647 & p) + 2147483648;
    }
    p %= 1e6;
    return p.toString() + "." + (p ^ m);
}

實際上,除了用 js2py 作為 Python 中執行 JS 代碼的環境外,還可以使用另一個方法 execjs,不過要先通過 pip install PyExecJS 安裝 PyExecJS 模塊。具體實現代碼如下:

# Alternative to js2py: compute the sign with PyExecJS.
# `gtk` and `query_str` are not defined in this snippet; they must be
# supplied by the surrounding code.
import json

import execjs

with open("webtrans.js", encoding="utf8") as f:
    js_data = f.read()
# Replace the browser-only lookup with the gtk value as a quoted string
# literal; json.dumps takes care of the quoting and escaping.
js_data = js_data.replace("window[l]", json.dumps(gtk))
# Call the function `e` defined in webtrans.js with query_str as its argument.
sign = execjs.compile(js_data).call("e", query_str)
print(sign)

百度翻譯爬蟲-Web版(自動生成sign)