基於Java的阿里媽媽資料抓取技術
基於Java的阿里媽媽資料抓取技術
前言:
對於需要登入的網站,爬蟲最大的困難就是必須先登入,然後才能獲取到資料,如微博,阿里媽媽,webqq等。之前也看過先使用瀏覽器登入網站,再直接從瀏覽器中獲取cookie的文章,這不失為一種解決方案,但是當cookie失效時就需要再次手動獲取,比較麻煩。那有沒有能夠自動登入,然後再爬取資料的技術呢?這就是本文研究的重點。好啦,不扯淡了,開始進入正題吧。
技術要點:Java + PhantomJS + HttpClient
思路:
1.需要藉助一些工具獲取登入後的cookie,再做後續操作,這裡就使用phantomjs來獲取。其實phantomjs就是一個沒有介面的瀏覽器,它提供了一些可以用指令碼操作頁面的API。
2.雖然phantomjs也可以直接傳送post,get請求,但是解析起來比較慢,所以這裡只在登入時使用phantomjs來獲取cookie,然後使用Java的httpclient來發送請求獲取資料。
引入maven庫:
<!-- https://mvnrepository.com/artifact/htmlunit/htmlunit -->
<dependency>
<groupId>net.sourceforge.htmlunit</groupId>
<artifactId>htmlunit</artifactId>
<version>2.15</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.4</version>
</dependency>
<dependency>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
<version>2.53.0</version>
</dependency>
<dependency>
<groupId>com.codeborne</groupId>
<artifactId>phantomjsdriver</artifactId>
<version>1.2.1</version>
<exclusions>
<exclusion>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-remote-driver</artifactId>
</exclusion>
<exclusion>
<groupId>org.seleniumhq.selenium</groupId>
<artifactId>selenium-java</artifactId>
</exclusion>
</exclusions>
</dependency>
編寫頁面處理的工具類
/***
* 頁面操作的工具類
*
* @authoradmin
*
*/
publicclass PageUtils {
publicstatic PhantomJSDriver driver = null;
publicstatic String PhantomPath = "E:/phantomjs/bin/phantomjs.exe";
//頁面中的cookie
publicstatic Set<Cookie>cookies=null;
//頁面中的token
publicstatic String token=null;
// 初始化phantom先關的操作
static {
// 設定必要引數
DesiredCapabilities dcaps = new DesiredCapabilities();
// ssl證書支援
dcaps.setCapability("acceptSslCerts", true);
// 截圖支援
dcaps.setCapability("takesScreenshot", true);
// css搜尋支援
dcaps.setCapability("cssSelectorsEnabled", true);
// js支援
dcaps.setJavascriptEnabled(true);
// 驅動支援
dcaps.setCapability(PhantomJSDriverService.PHANTOMJS_EXECUTABLE_PATH_PROPERTY, PhantomPath);
// 建立無介面瀏覽器物件
driver = new PhantomJSDriver(dcaps);
}
publicstaticboolean loadLogin(String loginUrl, String mainUrl) throws Exception {
try {
driver.get(loginUrl);
File scrFile = ((TakesScreenshot) driver).getScreenshotAs(OutputType.FILE);
File file = new File("baidu_selenium.png");
FileUtils.copyFile(scrFile, file);
System.out.println("二維碼已經儲存為:" + file.getAbsolutePath());
System.out.println("請開啟淘寶掃描該圖片登入");
String current = "";
while (!current.equals(mainUrl)) {
current = driver.getCurrentUrl();
Thread.sleep(2000);
}
System.out.println("登入成功");
returntrue;
} catch (Exception e) {
//當出現異常時關閉
//driver.close();
//driver.quit();
returnfalse;
}
}
//開啟一個執行緒,定時訪問頁面,防止頁面太久沒有操作導致session超時
publicstaticvoid keepCookies() {
new Thread() {
publicvoid run() {
//淘寶聯盟頁面
String url="http://pub.alimama.com/myunion.htm";
while(true) {
try {
driver.get(url);
cookies=driver.manage().getCookies();
Thread.sleep(8*1000);
} catch (InterruptedException e) {
}
}
};
}.start();
}
//獲取token
publicstatic String getToken() {
//等待cookie中有資料
while(cookies==null) {
try {
Thread.sleep(1*1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
}
for(Cookie cook:cookies) {
String name=cook.getName();
if(name!=null&&name.equals("_tb_token_")) {
returntoken= cook.getValue();
}
}
returntoken;
}
/**
* 封裝get請求
* @param url
* @return
*/
publicstatic String httpGet(String url) {
try {
//獲取cookie字串
String cookieStr="";
for(Cookie cookie:cookies) {
cookieStr+=cookie.getName()+"="+cookie.getValue()+";";
}
// 根據地址獲取請求
HttpGet request = new HttpGet(url);//這裡傳送get請求
//新增請求頭
request.addHeader("Pragma", "no-cache");
//設定瀏覽器型別
request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1");
request.addHeader("Accept-Language", "zh-CN,en,*");
//設定cookie
request.addHeader("Cookie", cookieStr);
// 獲取當前客戶端物件
HttpClient httpClient = HttpClients.custom().build();
// 通過請求物件獲取響應物件
HttpResponse response = httpClient.execute(request);
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
return EntityUtils.toString(response.getEntity(),"utf-8");
}
} catch (Exception e) {
e.printStackTrace();
}
returnnull;
}
/**
* 封裝post請求
* @param url
* @return
*/
publicstatic String httpPost(String url) {
try {
//獲取cookie字串
String cookieStr="";
for(Cookie cookie:cookies) {
cookieStr+=cookie.getName()+"="+cookie.getValue()+";";
}
// 根據地址獲取請求
HttpPost request = new HttpPost(url);//這裡傳送get請求
//新增請求頭
request.addHeader("Pragma", "no-cache");
//設定瀏覽器型別
request.addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/538.1 (KHTML, like Gecko) PhantomJS/2.1.1 Safari/538.1");
request.addHeader("Accept-Language", "zh-CN,en,*");
//設定cookie
request.addHeader("Cookie", cookieStr);
// 獲取當前客戶端物件
HttpClient httpClient = HttpClients.custom().build();
// 通過請求物件獲取響應物件
HttpResponse response = httpClient.execute(request);
if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
return EntityUtils.toString(response.getEntity(),"utf-8");
}
} catch (Exception e) {
e.printStackTrace();
}
returnnull;
}
/**
* 獲取導購推廣位
* @param time
* @param pvid
* @param token
* @return
*/
publicstatic String getShoppingGuide(String pvid,String token) {
String url="https://pub.alimama.com/common/adzone/adzoneManage.json?"
+ "spm=a219t.7900221/1.1998910419.dbb742793.5bcb8cdcYsmdCb&"
+ "tab=3&"
+ "toPage=1&"
+ "perPageSize=40&"
+ "gcid=8&"
+ "t="+getCirrentTime()+"&"
+ "pvid="+pvid+"&"
//"pvid=60_59.172.110.203_862_1517744052508&"
+ "_tb_token_="+token+"&"
+ "_input_charset=utf-8";
returnhttpGet(url);
}
//獲取當前的時間戳
publicstaticlong getCirrentTime() {
return System.currentTimeMillis();
}
/**
* 獲取1000以內的隨機數
* @return
*/
publicstaticint getRandom() {
Random random=new Random();
intresult=(int)(random.nextFloat()*1000);
returnresult;
}
}
編寫主類:
publicclass Test {
publicstatic Gson gson=new Gson();
publicstaticvoid main(String[] args) {
//阿里媽媽錄頁面
String taobao="https://login.taobao.com/member/login.jhtml?style=mini&newMini2=true&css_style=alimama_index&from=alimama&redirectURL=http://www.alimama.com&full_redirect=true&disableQuickLogin=true";
//阿里媽媽主頁面
String loginUrl="https://www.alimama.com/index.htm";
//獲取個人資訊的介面
String userUrl="http://pub.alimama.com/common/getUnionPubContextInfo.json";
try {
PageUtils.loadLogin(taobao, loginUrl);
PageUtils.keepCookies();
String token=PageUtils.getToken();
System.out.println("token:"+token);
String result=PageUtils.httpGet(userUrl);
LoginInfo info=gson.fromJson(result, LoginInfo.class);
System.out.println("獲取個人資訊:"+info);
//格式60_59.172.110.203_862_1517744052508
String pvid="60"
+"_"+info.getData().getIp()
+"_"+PageUtils.getRandom()
+"_"+PageUtils.getCirrentTime();
//獲取導購推廣位
String shopping=PageUtils.getShoppingGuide(pvid,PageUtils.getToken());
System.out.println("導購位資訊:"+shopping);
} catch (Exception e) {
e.printStackTrace();
}
}
效果展示:
總結:通過本次爬蟲,對http中的cookie有了一定的瞭解。有一些細節沒有在文中直接展示出來,但是程式碼中對一些踩過的坑作了解釋,希望能夠幫助到大家。