爬蟲採集全國工商系統的資料(外接打碼平臺)
阿新 • • 發佈:2018-11-17
JavaScript 的程式碼是用 Java 的 JavaScript 引擎來執行的,再由 Python 透過 JPype 去呼叫 Java 的物件。因為試了好多 Python 的 JS 庫,PyV8 裝起來太麻煩,而且對 JS 的 eval 函式支援也不是很好,後來就改用 Java 的 JS 引擎。
Java 程式碼需要打包成 jar 或者 class 檔案,使用的 Java 版本是 1.8。
# coding: UTF-8
"""Search www.gsxt.gov.cn for a keyword and print the result links.

Flow implemented below (one ``SearchItem`` thread per search):

1. fetch ``/SearchItemCaptcha``; the first answer is a JavaScript challenge
   that is handed to the Java class ``com.GovTest`` (through JPype) to
   compute the ``__jsl_clearance`` cookie;
2. fetch and decode two helper resources to derive the ``token`` field;
3. send the geetest ``gt``/``challenge`` pair to an external solving service;
4. post the search form and walk through the result pages.

The syntax used here is accepted by both Python 2 (which the script was
written for) and Python 3.
"""
import json
import re
import threading
import time

import jpype
import redis
import requests
from bs4 import BeautifulSoup
from jpype import *

# The JVM must be running before any thread calls into com.GovTest.
jpype.startJVM(jpype.getDefaultJVMPath(), "-ea",
               "-Djava.class.path=/code/java/forpython/target/classes/")


def _exc_text(exc):
    """Return ``exc.message`` when the attribute exists, else ``exc`` itself.

    ``message`` only exists on Python 2 exceptions; falling back to the
    exception object keeps the log line meaningful on Python 3.
    """
    return getattr(exc, "message", exc)


class SearchItem(threading.Thread):
    """Thread that runs one complete keyword search.

    Args:
        keyword: search word sent to the site.
        proxy: ``host:port`` of an HTTP proxy, or a falsy value for none.
        semaphore: semaphore acquired by the creator of the thread; it is
            released when ``run`` finishes, whatever the outcome.
    """

    session = None
    keyword = ""
    proxy = ""
    semaphore = None

    def __init__(self, keyword, proxy, semaphore):
        threading.Thread.__init__(self)
        self.keyword = keyword
        self.proxy = proxy
        self.semaphore = semaphore
        # One session per thread.  A class-level session would be shared by
        # every instance, reused after run() closed it, and would carry the
        # cookies of the previous search into the next one.
        self.session = requests.session()

    def getGTChallenge(self):
        """Pass the JavaScript challenge and return the geetest JSON.

        Returns:
            The decoded JSON body of the second ``SearchItemCaptcha``
            request (``jianYan`` reads its ``gt`` and ``challenge`` keys).

        Raises:
            Exception: if the first answer is not the expected challenge
                page, or the Java helper did not return ``name=value``.
        """
        print("getGTChallenge start")
        loginurl = "http://www.gsxt.gov.cn/SearchItemCaptcha"
        result = self.session.get(loginurl)
        if "y.replace(" not in result.text:
            raise Exception("被遮蔽了")

        # Every Python thread has to be attached before it uses the JVM.
        if not jpype.isThreadAttachedToJVM():
            jpype.attachThreadToJVM()
        self.Aobj = jpype.JClass("com.GovTest")()

        fu = self.Aobj.challenge(result.text)
        print("fu=%s" % fu)
        # challenge() returns the script error text when evaluation fails,
        # so make sure the answer really looks like "name=value".
        if not fu or "=" not in fu:
            raise Exception("unexpected challenge() result: %s" % fu)
        self.session.cookies['__jsl_clearance'] = fu.split("=")[1]

        result = self.session.get(loginurl)
        return json.loads(result.text)

    def getImageGif(self):
        """Fetch the pseudo "gif" resource and extract ``location_info``.

        Returns:
            The digits of ``location_info`` as a string.

        Raises:
            Exception: if the decoded text contains no ``location_info``.
        """
        print("getImageGif start")
        now = time.localtime(time.time())
        # The v= value is simply minutes + seconds of the local time.
        url = ("http://www.gsxt.gov.cn/corp-query-custom-geetest-image.gif?v="
               + str(now.tm_min + now.tm_sec))
        resp = self.session.get(url)
        decoded = self.Aobj.getImageGif(resp.text)
        match = re.search(r'location_info = (\d+);', decoded)
        if not match:
            # The exception used to be built but never raised, which made
            # the method return None and fail later with a TypeError.
            raise Exception("沒有找到location_info")
        return match.group(1)

    def getValidateInput(self, location_info):
        """Fetch the validate-input resource and compute the form token.

        Args:
            location_info: value returned by ``getImageGif``.

        Returns:
            The token as a decimal string.

        Raises:
            Exception: if the decoded text contains no ``value: <digits>}``.
        """
        print("getValidateInput start")
        url = ("http://www.gsxt.gov.cn/corp-query-geetest-validate-input.html"
               "?token=" + location_info)
        resp = self.session.get(url)
        decoded = self.Aobj.getImageGif(resp.text)
        match = re.search(r'value: (\d+)}', decoded)
        if not match:
            raise Exception("沒有找到location_info")
        # 536870911 == 2**29 - 1; the token is the value XOR-ed with it.
        token = int(match.group(1)) ^ 536870911
        print("token= %s" % token)
        return str(token)

    def searchTest(self, keyword):
        """Call the search-test endpoint for ``keyword`` and print the answer."""
        print("searchTest start")
        url = ("http://www.gsxt.gov.cn/corp-query-search-test.html?searchword="
               + keyword)
        resp = self.session.get(url)
        print("searchTest  %s" % resp.text)

    def jianYan(self, challengeJson):
        """Ask the external solving service to solve the geetest challenge.

        Args:
            challengeJson: dict with the ``gt`` and ``challenge`` keys.

        Returns:
            The decoded JSON answer of the service (``querySearch`` reads
            its ``challenge`` and ``validate`` keys).
        """
        print("jianYan start")
        # Replace the account/password placeholders with real credentials.
        # The parameter name is "gt"; "&gt=" must not be HTML-decoded to ">=".
        url = ("http://jiyanapi.c2567.com/shibie?user=你的賬號&pass=你的密碼&gt="
               + challengeJson["gt"]
               + "&challenge=" + challengeJson["challenge"]
               + "&referer=http://www.gsxt.gov.cn&return=json&format=utf8")
        # A one-off request: no site cookies, headers or proxy are sent, and
        # no throw-away session object is left open.
        resp = requests.get(url)
        jiyanJson = json.loads(resp.text)
        print(resp.text)
        return jiyanJson

    def querySearch(self, jiYanJson, token, keyword):
        """Post the search form.

        Returns:
            A ``(html, postData)`` tuple: the body of the answer and the
            form fields that were sent.
        """
        print("querySearch start")
        url = "http://www.gsxt.gov.cn/corp-query-search-1.html"
        postData = {
            'tab': 'ent_tab',
            'province': '',
            'geetest_challenge': jiYanJson['challenge'],
            'geetest_validate': jiYanJson['validate'],
            'geetest_seccode': jiYanJson['validate'] + '|jordan',
            'token': token,
            'searchword': keyword,
        }
        resp = self.session.post(url, postData)
        return resp.text, postData

    def dealPageUrl(self, html):
        """Print the result links of the first page and count the pages.

        Returns:
            The number of numeric links inside the ``pageForm`` element, or
            0 when there is at most one result or no such element.
        """
        print("dealPageUrl start")
        soup = BeautifulSoup(html, "html.parser")
        urlsItem = soup.find_all("a", class_="search_list_item db")
        for urlItem in urlsItem:
            print("urlItem['href']= %s" % urlItem['href'])

        pageNums = 0
        if len(urlsItem) > 1:
            pageForm = soup.find_all(id="pageForm")
            if pageForm:  # avoid an IndexError when the form is missing
                tabAs = pageForm[0].find_all("a", text=re.compile(r"\d+"))
                pageNums = len(tabAs)
        return pageNums

    def dealPageUrlNum(self, pageNums, postData):
        """Request result pages ``1..pageNums`` and print their links.

        Args:
            pageNums: number of pages to request.
            postData: form fields of the search; sent as query parameters
                together with the ``page`` number.
        """
        print("dealPageUrlNum start")
        url = "http://www.gsxt.gov.cn/corp-query-search-advancetest.html"
        for page in range(1, pageNums + 1):
            params = dict(postData)  # leave the caller's dict untouched
            params['page'] = page
            resp = self.session.get(url, params=params)
            # Same explicit parser as dealPageUrl, so both behave alike.
            soup = BeautifulSoup(resp.text, "html.parser")
            for urlItem in soup.find_all("a", class_="search_list_item db"):
                print("urlItem['href']= %s" % urlItem['href'])

    def getCorpUrl(self):
        """Run the whole search for ``self.keyword``; return 1 on success."""
        # NOTE(review): requests does not read a ``timeout`` attribute from
        # the session, so this assignment has no effect; a timeout has to be
        # passed to each request. Left as is to avoid changing the timing.
        self.session.timeout = 1
        self.session.max_redirects = 1
        if self.proxy:
            self.session.proxies = {
                "http": "http://" + self.proxy,
                "https": "http://" + self.proxy,
            }
        headers = {
            'Host': 'www.gsxt.gov.cn',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
            'Accept-Encoding': 'gzip, deflate',
            'Referer': 'http://www.gsxt.gov.cn/SearchItemCaptcha',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0, no-cache',
        }
        self.session.headers = headers

        challengeJson = self.getGTChallenge()
        location_info = self.getImageGif()
        token = self.getValidateInput(location_info)
        self.searchTest(self.keyword)
        jiyanJson = self.jianYan(challengeJson)
        html, postData = self.querySearch(jiyanJson, token, self.keyword)
        pageNums = self.dealPageUrl(html)
        print('pageNums= %s' % pageNums)
        self.dealPageUrlNum(pageNums, postData)
        return 1

    def run(self):
        """Thread body: run the search, then always clean up."""
        try:
            self.getCorpUrl()
        except Exception as e:
            print("run exception  %s" % (_exc_text(e),))
        finally:
            # Release the semaphore even on unexpected errors, otherwise
            # the main loop would block forever on its next acquire().
            self.session.close()
            self.semaphore.release()
            print("search Item run finish")


# At most one search runs at a time: the semaphore is taken here and given
# back by SearchItem.run().
semaphore = threading.Semaphore(1)
while 1:
    semaphore.acquire()
    try:
        t1 = SearchItem("百度", None, semaphore)
        t1.start()
    except Exception as e:
        # The thread never started, so nobody else will release it.
        semaphore.release()
        print('main e.message:\t %s' % (_exc_text(e),))
    # NOTE(review): the collapsed original does not show whether this pause
    # belonged to the except branch only; it is applied on every iteration.
    time.sleep(1)
以下是用 Java 執行 JS 引擎的程式碼:
package com;
import javax.script.ScriptEngine;
import javax.script.ScriptEngineManager;
import javax.script.ScriptException;
public class GovTest {
/** Script engine used by every method of this instance; globals set by one evaluation stay visible to the next. */
private final ScriptEngine scriptEngine;

/**
 * Creates the helper and looks up the engine registered under the name "JavaScript".
 *
 * @throws IllegalStateException if the running JVM provides no such engine
 *         (the lookup returns null in that case), instead of failing later
 *         with a NullPointerException on first use
 */
public GovTest() {
    ScriptEngine engine = new ScriptEngineManager().getEngineByName("JavaScript");
    if (engine == null) {
        throw new IllegalStateException("No script engine named \"JavaScript\" is available on this JVM");
    }
    this.scriptEngine = engine;
}
public String challenge(String resp){
resp = resp.substring(8);
String tmp[] = resp.split("</script");
resp = tmp[0];
resp = resp.replace("eval(y.replace", "var aaa=(y.replace");
resp = resp + "aaa=aaa.replace("h=document.createElement('div');","");aaa=aaa.replace("h.innerHTML='<a href=\\\'/\\\'","");\n" +
"aaa=aaa.replace(">x</a>';","");aaa=aaa.replace("h=h.firstChild.href;","h='http://www.gsxt.gov.cn/';");aaa=aaa.replace("while(window._phantom||window.__phantomas){};","");bbb=aaa.split("setTimeout");\n" +
" aaa=bbb[0]+"return dc;}}";\n" +
" aaa=aaa.replace("var l=","{fa:");\n" +
" var ffa=eval("("+aaa+")");\n" +
" var fffa=ffa.fa();";
System.out.println(resp);
String script = resp;
try {
scriptEngine.eval(script);
} catch (ScriptException e) {
return e.getMessage();
}
String bbb = (String) scriptEngine.get("fffa");
System.out.println(bbb);
return bbb;
}
/**
 * Turns a JavaScript array of character codes into the text it spells.
 *
 * {@code resp} is pasted into a small script as the value of {@code json};
 * each element is mapped through {@code String.fromCharCode} and the pieces
 * are joined into one string, stored in the script variable {@code ggg}.
 *
 * @param resp source text of the array, inserted into the script verbatim
 * @return the value of {@code ggg}, or the error message of the
 *         {@link ScriptException} when evaluation fails
 */
public String getImageGif(String resp){
    StringBuilder source = new StringBuilder();
    source.append("function dd(){var json=");
    source.append(resp);
    source.append(";return json.map( function(item){ return String.fromCharCode(item);}).join('');}");
    source.append("var ggg=dd();");

    // NOTE(review): resp comes from a remote server and is evaluated as code.
    try {
        scriptEngine.eval(source.toString());
    } catch (ScriptException failure) {
        return failure.getMessage();
    }
    return (String) scriptEngine.get("ggg");
}
public static void main(String[] s){
new GovTest().challenge("<script>var x=" [email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected][email protected]@[email protected][email protected] @[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected][email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@[email protected]@onreadystatechange".replace(/@*$/,"").split("@"),y="1a d=3d(){1(6.19||6.30){};1a 23,e='16=1d.20|14|';1a 1b=[3d(36){3f 32('c.26('+36+')')},(3d(){1a 34=2b.2a('2');34.2e='<11 12=\\\'/\\\'>36</11>';34=34.28.12;1a 3b=34.1f(/37?:\\\\/\\\\//)[14];34=34.3(3b.15).18();3f 3d(36){39(1a 31=14;31<36.15;31++){36[31]=34.27(36[31])};3f 
36.21('')}})()];23=[[[3c+8]+[-~-~(+[])],(22+[]+[[]][~~[]])+[3c+8],(8+[[]][~~[]])+[(+[])],[3c+8]+[(+[])],[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+[(+[])],(8+[[]][~~[]])+[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]],(8+[[]][~~[]])+[3c+8],[3c+8]+[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]],(22+[]+[[]][~~[]])+[3c+8],(8+[[]][~~[]])+[(+[])],[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]]+[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]],[3c+8]+[(+[])],[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]]+(8+[[]][~~[]]),(-~(+[])+[[]][~~[]])+[(+[])]+[17],[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+(8+[[]][~~[]]),(7+[])+[(+[])]],[[(+[])]],[(-~(+[])+[[]][~~[]])+[(+[])]+[-~-~(+[])],(8+[[]][~~[]])+[(+[])],[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+(22+[]+[[]][~~[]])],[[3c+8]],[[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+[-~-~(+[])]],[[(+[])],[17],[17]],[[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]]+[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]],[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+[17],(-~(+[])+[[]][~~[]])+(-~(+[])+[[]][~~[]])+[17],(-~(+[])+[[]][~~[]])+[(+[])]+(7+[]),[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]+[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]],[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]]+(8+[[]][~~[]]),(22+[]+[[]][~~[]])+[3c+8],(8+[[]][~~[]])+(-~(+[])+[[]][~~[]]),[-~[(-~(+[])<<-~(+[]))]-~[(-~(+[])<<-~(+[]))]]+[-~-~(+[])-~[([(-~(+[])<<-~(+[]))]+~~![]>>(-~(+[])<<-~(+[])))]]]];39(1a 31=14;31<23.15;31++){23[31]=1b.1e()[(-~(+[])+[[]][~~[]])](23[31])};23=23.21('');e+=23;4('a.12=a.12.35(/[\\\\?|&]33-1c/,\\\'\\\')',40);2b.b=(e+';2c=24, 2d-38-3e 9:5:f 10;29=/;');};13((3d(){3a{3f !!6.2f;}25(41){3f 42;}})()){2b.2f('43',d,42);}44{2b.45('46',d);}",z=0,f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return 
c},g=y.match(/\\b\\w+\\b/g).sort(function(x,y){return f(x)-f(y)}).pop();while(f(g,++z)-x.length){};eval(y.replace(/\\b\\w+\\b/g, function(y){return x[f(y,z)-1]}));</script>");
}
}
注:本文僅供參考學習,請勿做其它非法用途!