使用HttpClient和Jsoup定向抓取資料
阿新 • • 發佈:2019-02-13
1.業務需求:
從指定外網抓點貨,冷啟動
2.站點分析:
.限制IP…
.需要登入……
.對登入賬號有抓取頻率限制……….
.抓取頻率稍高,就直接跳驗證碼頁面…………..
.驗證碼長度、模樣(純數字&字母數字混合)TM不固定………………..
“我們能不能不抓了?“
“不行!必須得抓…”
“……”
這麼說,此前寫的爬蟲,多執行緒、生產者—>消費者 併發抓取壓根行不通。多執行緒毫無意義。
3.使用技術:
1.HttpClient
:讀取指定URL網頁內容
2.Jsoup
:解析所要的頁面資料——省得寫噁心的正則表示式
3.Swing
:繪製使用者操作介面
4.Tess4J
:驗證碼圖片OCR識別
5.Exe4J
:生成可獨立執行的exe程式——給每人機器安裝一個,大家一起監控抓~
4.實現要點:
1.代理IP
從一些網站上抓取代理IP,並檢測是否可以使用,如下:
package com.ydj.zhuaqu.proxy;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import com.ydj.common.kit.MyLog;
/**
*
* @author : Ares.yi
* @createTime : 2014-11-10 上午11:13:42
* @version : 1.0
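* @description : Pool of proxy IPs: periodically fetches candidates, validates them and lends them to worker threads.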
*
*/
public class ProxyIpPool {
/** Maximum number of proxies kept in the pool; no new proxies are crawled once this size is reached. */
private static final int MAX_IP = 100;
/** Intended minimum number of proxies (ideally equal to the number of consumer threads); not used yet. */
@SuppressWarnings("unused")
private static final int MIN_IP = 10;
/** Proxies that are currently available for checkout; wrapped in a synchronized list. */
public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP));
/**
 * Proxies that recently failed validation (ip -> port), capped at 2000 entries.
 * LRUMap itself is not thread-safe and this map is touched both by the crawl
 * thread and by the public static addIP/removeIP, so it is wrapped in a
 * synchronized view.
 */
@SuppressWarnings("unchecked")
private static final java.util.Map<Object, Object> notCanUseIPsTemp = Collections.synchronizedMap(new LRUMap(2000));
/** Number of proxies requested from the provider per crawl. */
private static final int NUM = 20;
/** Order id sent to the proxy provider API. */
private static final String ORDER_ID = "904557733280949";
/** Provider endpoint that returns a JSON list of proxies. */
private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json";
/** Static utility class; not meant to be instantiated. */
private ProxyIpPool(){
}
/**
 * Starts the background task that crawls proxy IPs.
 * <p>
 * A single-threaded scheduler calls {@code produceIP(int)} one minute after
 * this method is invoked and then every three minutes. The argument is a
 * running counter of executions that is only used in log output.
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 17:58:54
 */
public static void startCrawl(){
    final int period = 3;
    ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);
    scheduledExecutorService.scheduleAtFixedRate(new Runnable() {
        int i = 0 ;
        @Override
        public void run() {
            try {
                produceIP(i);
            } catch (RuntimeException e) {
                // scheduleAtFixedRate suppresses every later execution once a run
                // throws, so a single bad crawl must not escape from here.
                MyLog.logError(i+":crawl proxy ip failed:"+e);
                e.printStackTrace();
            } finally {
                i++;
            }
        }
    }, 1, period,TimeUnit.MINUTES);
}
/**
 * Runs one crawl round: when the pool still has room, fetches a batch of
 * candidates from the provider and validates them; otherwise only logs.
 *
 * @param i sequence number of this round, used as a log prefix
 */
private static void produceIP(int i){
    final int poolSize = canUseIpList.size();
    if (poolSize < MAX_IP) {
        produceIP(getIPFromKuaiDaiLi());
        MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+notCanUseIPsTemp.size());
        return;
    }
    // Pool is already full: skip the crawl and just report the sizes.
    MyLog.logInfo(i+":current proxyPool size is:"+poolSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+notCanUseIPsTemp.size());
}
/**
 * Validates every {@code host:port} entry of the given array and files it
 * either into the usable pool or into the rejected cache.
 * <p>
 * Entries that are not of the form {@code host:port} with a numeric port are
 * logged and skipped so that one malformed item cannot abort the whole batch.
 *
 * @param ips candidate proxies as returned by the provider; may be null or empty
 */
private static void produceIP(JSONArray ips){
    if(ips == null || ips.isEmpty()){
        return ;
    }
    for(int i = 0 ;i < ips.size() ;i++ ){
        String entry = String.valueOf(ips.get(i)).trim();
        String[] parts = entry.split(":");
        if(parts.length < 2 || parts[0].length() == 0){
            MyLog.logInfo("skip malformed proxy entry:"+entry);
            continue;
        }
        String ip = parts[0];
        int port;
        try {
            port = Integer.parseInt(parts[1].trim());
        } catch (NumberFormatException e) {
            MyLog.logInfo("skip proxy entry with invalid port:"+entry);
            continue;
        }
        ProxyIp proxyIp = new ProxyIp(ip, port);
        if(isCanUse(ip, port)){
            addIP(proxyIp);
        }else{
            removeIP(proxyIp);
        }
    }
}
/**
 * Checks out the first proxy of the sorted pool and marks it as used.
 * <p>
 * While the pool is empty the calling thread sleeps two minutes at a time
 * until a proxy becomes available. Sorting and removal happen while holding
 * the list's own lock, so two callers cannot interleave between the sort and
 * the {@code remove(0)}.
 *
 * @return the proxy that was removed from the pool
 * @throws IllegalStateException if the thread is interrupted while waiting
 *         (the interrupt flag is restored before throwing)
 */
public static ProxyIp useOneProxyIp(){
    while (true) {
        // Collections.synchronizedList locks on the wrapper itself, so this
        // makes the compound isEmpty/sort/remove sequence atomic.
        synchronized (canUseIpList) {
            if(!canUseIpList.isEmpty()){
                Collections.sort(canUseIpList);
                ProxyIp proxyIp = canUseIpList.remove(0);
                proxyIp.useThis();
                return proxyIp;
            }
        }
        MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP.");
        try {
            Thread.sleep(2 * 60 * 1000);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new IllegalStateException("interrupted while waiting for a proxy ip", e);
        }
    }
}
/**
 * Gives a previously checked-out proxy back to the pool.
 *
 * @param proxyIp proxy to release; its in-use flag is cleared before it is re-added
 */
public static void returnProxyIp(ProxyIp proxyIp){
    final ProxyIp released = proxyIp;
    released.setUseing(false);
    canUseIpList.add(released);
}
/**
 * Fetches a batch of proxy IPs from the kuaidaili API.
 * <p>
 * The response is parsed as JSON and {@code data.proxy_list} is returned.
 * Any failure is logged and results in an empty array.
 *
 * @return the proxies reported by the provider, or an empty array on failure
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 14:36:05
 */
private static JSONArray getIPFromKuaiDaiLi(){
    JSONArray ips = new JSONArray();
    HttpClient client = new HttpClient();
    GetMethod method = new GetMethod(KDL_URL);
    HttpMethodParams param = method.getParams();
    param.setContentCharset("UTF-8");
    try {
        client.executeMethod(method);
        String res = method.getResponseBodyAsString();
        JSONObject json = JSONObject.fromObject(res);
        if(json != null && json.containsKey("data")){
            ips = json.getJSONObject("data").getJSONArray("proxy_list");
            MyLog.logInfo(ips);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // commons-httpclient keeps the connection checked out until it is
        // released explicitly; this method runs on every crawl round.
        method.releaseConnection();
    }
    return ips;
}
/**
 * Template for fetching proxy IPs from a further provider.
 * <p>
 * The URL is the placeholder {@code "XXX"}; the response is handled the same
 * way as for kuaidaili ({@code data.proxy_list} of a JSON document). Any
 * failure is logged and results in an empty array.
 *
 * @return the proxies reported by the provider, or an empty array on failure
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 14:46:40
 */
@SuppressWarnings("unused")
private static JSONArray getIPFromXXX(){
    JSONArray ips = new JSONArray();
    HttpClient client = new HttpClient();
    GetMethod method = new GetMethod("XXX");
    HttpMethodParams param = method.getParams();
    param.setContentCharset("UTF-8");
    try {
        client.executeMethod(method);
        String res = method.getResponseBodyAsString();
        JSONObject json = JSONObject.fromObject(res);
        if(json != null && json.containsKey("data")){
            ips = json.getJSONObject("data").getJSONArray("proxy_list");
            MyLog.logInfo(ips);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always hand the connection back, even when the request or parsing fails.
        method.releaseConnection();
    }
    return ips;
}
/**
 * Decides whether a proxy is usable.
 * <p>
 * A proxy is rejected when its address is missing, its port is outside
 * 1..65535, it is present in the rejected cache, a plain TCP connect fails,
 * or the target-site check through the proxy fails.
 *
 * @param ip   proxy host
 * @param port proxy port
 * @return {@code true} only if every check passes
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 14:37:22
 */
private static boolean isCanUse(String ip,int port){
    if(ip == null || ip.length() == 0){
        return false;
    }
    // InetSocketAddress throws IllegalArgumentException for ports above 65535,
    // so out-of-range values are rejected before any socket is created.
    if(port <= 0 || port > 65535){
        return false;
    }
    if(notCanUseIPsTemp.containsKey(ip)){
        MyLog.logInfo(ip+":"+port+" can't use again.");
        return false;
    }
    if(!checkIp(ip, port)){
        return false;
    }
    return checkIpUseTargetSite(ip, port);
}
/**
 * Checks whether a TCP connection to the proxy can be opened within three seconds.
 * <p>
 * The probe socket is always closed before returning.
 *
 * @param ip   proxy host
 * @param port proxy port
 * @return {@code true} if the connect succeeded, {@code false} otherwise
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 12:35:28
 */
private static boolean checkIp(String ip,int port){
    Socket server = new Socket();
    try {
        InetSocketAddress address = new InetSocketAddress(ip,port);
        server.connect(address, 3000);
        MyLog.logInfo(ip+":"+port+" is ok!");
        return true;
    }catch (UnknownHostException e) {
        MyLog.logInfo(ip+":"+port+" is wrong!");
    } catch (IOException e) {
        MyLog.logInfo(ip+":"+port+" is wrong!!");
    } finally {
        try {
            server.close();
        } catch (IOException ignored) {
            // The probe result is already known; a failed close changes nothing.
        }
    }
    return false;
}
/**
 * Verifies the proxy against the target site itself.
 * <p>
 * Requests a page of the target site through the proxy (3 s connect and
 * socket timeouts) and reports success only if the returned HTML contains a
 * {@code div} with class {@code "header fix"}. Response and client are closed
 * before returning.
 *
 * @param ip   proxy host
 * @param port proxy port
 * @return {@code true} if the expected page came back through the proxy
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 12:06:03
 */
private static boolean checkIpUseTargetSite(String ip,int port){
    CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
    CloseableHttpResponse response = null;
    try {
        HttpHost proxy = new HttpHost(ip,port, "http");
        RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();
        HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/");
        httpGet.setConfig(config);
        response = closeableHttpClient.execute(httpGet);
        HttpEntity httpentity = response.getEntity();
        String html = EntityUtils.toString(httpentity, "UTF-8");
        return Jsoup.parse(html).select("div[class=header fix]").first() != null;
    } catch (Exception exc){
        MyLog.logError(exc.getMessage());
        return false;
    } finally {
        if(response != null){
            try {
                response.close();
            } catch (IOException ignored) {
                // Result is already determined; nothing to recover here.
            }
        }
        try {
            closeableHttpClient.close();
        } catch (IOException ignored) {
            // Result is already determined; nothing to recover here.
        }
    }
}
/**
 * Takes a proxy out of the usable pool and records it in the rejected cache.
 *
 * @param proxyIp proxy to reject; its ip becomes the cache key and its port the value
 */
public static void removeIP(ProxyIp proxyIp){
    canUseIpList.remove(proxyIp);
    final Object rejectedIp = proxyIp.getIp();
    final Object rejectedPort = proxyIp.getPort();
    notCanUseIPsTemp.put(rejectedIp, rejectedPort);
}
/**
 * Adds a proxy to the usable pool and clears it from the rejected cache.
 * <p>
 * A proxy that is already in the pool is not added a second time, so crawl
 * rounds that return the same address again do not create duplicates.
 * NOTE(review): this relies on {@code ProxyIp.equals} comparing by address,
 * as {@code canUseIpList.remove(proxyIp)} in removeIP already does - confirm.
 *
 * @param proxyIp proxy that passed validation
 */
public static void addIP(ProxyIp proxyIp){
    // Hold the list's own lock so the contains/add pair is atomic.
    synchronized (canUseIpList) {
        if(!canUseIpList.contains(proxyIp)){
            canUseIpList.add(proxyIp);
        }
    }
    notCanUseIPsTemp.remove(proxyIp.getIp());
}
/**
 * Manual smoke test for checking proxies out of the pool and returning them.
 * <p>
 * Submits 20 jobs to a pool of 10 threads; each job takes a proxy, sleeps a
 * random 0-9 seconds to simulate work and returns the proxy. The executor is
 * shut down after submission so its threads end once all jobs are done.
 *
 * @author : Ares.yi
 * @createTime : 2015-10-29 18:00:16
 */
private static void testUseProxyIp(){
    ExecutorService threadPool = Executors.newFixedThreadPool(10);
    for(int i=0 ;i <20 ;i++){
        final int flag = i;
        threadPool.execute(new Runnable() {
            @Override
            public void run() {
                ProxyIp proxyIp = useOneProxyIp();
                MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString());
                long millis = new Random().nextInt(10) * 1000;
                try {
                    Thread.sleep(millis);// each job sleeps a random number of seconds to simulate work
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the executor, but still return the proxy below.
                    Thread.currentThread().interrupt();
                }
                returnProxyIp(proxyIp);
                MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool.");
            }
        });
    }
    // Already submitted jobs still run; without this the worker threads stay alive indefinitely.
    threadPool.shutdown();
}
}
使用代理IP:
/**
 * Fetches the content of a page through an HTTP proxy.
 * <p>
 * Uses 3 s connect and socket timeouts. Client and response are closed on
 * every path, including the early returns.
 * NOTE(review): the request is sent as POST although the failure message
 * talks about a GET - confirm which method is intended.
 *
 * @param url       address to request
 * @param proxyIp   proxy host
 * @param proxyPort proxy port
 * @return the response body decoded as UTF-8; {@code "cannot connect"} if the
 *         request failed or the response has no entity; an empty string if
 *         the body could not be read
 * @throws ParseException declared for callers; not thrown by the visible code paths
 * @throws IOException if closing the response or the client fails
 *
 * @author : Ares.yi
 * @createTime : 2015-10-30 09:55:21
 */
public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException {
    CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
    try {
        HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http");
        RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();
        HttpPost request = new HttpPost(url);
        request.setConfig(config);
        CloseableHttpResponse response;
        try {
            response = closeableHttpClient.execute(request);
        }catch(Exception exc){
            exc.printStackTrace();
            System.out.println("get請求失敗!");
            return "cannot connect";
        }
        try {
            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                return "cannot connect";
            }
            try{
                return EntityUtils.toString(httpEntity, "UTF-8");
            }catch(Exception excep){
                // Body could not be read: report which url failed and fall back to "".
                System.out.println(url);
                return "";
            }
        } finally {
            response.close();
        }
    } finally {
        closeableHttpClient.close();
    }
}
2.模擬登入
提取登入Cookie和User-Agent:
程式碼片段,如下:
/**
 * Sends a form POST carrying the configured cookie and user agent.
 * <p>
 * The request is built completely before the HTTP client is created, so a
 * bad charset or URL cannot leave an unclosed client behind.
 *
 * @param url          address to post to
 * @param parameterMap form fields
 * @param charSet      charset used to encode the form body
 * @return {@code "SUCCESS"} if the response has a non-empty {@code Location}
 *         header; otherwise whatever {@code printResponse} returns for the
 *         response; an empty string if an I/O error occurred
 * @throws UnsupportedEncodingException if {@code charSet} is not supported
 */
public static String postRequest(String url,
        Map<String, String> parameterMap, String charSet)
        throws UnsupportedEncodingException {
    UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet);
    HttpPost httpPost = new HttpPost(url);
    httpPost.setEntity(postEntity);
    httpPost.addHeader("HOST", "sec.1688.com");
    httpPost.addHeader("User-Agent", Constant.userAgent);
    httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    httpPost.addHeader("Cookie", Constant.cookie);
    MyLog.logInfo("request line:" + httpPost.getRequestLine());
    CloseableHttpClient client = HttpClients.createDefault();
    try {
        // Execute the POST; a Location header is treated as a successful submit.
        HttpResponse httpResponse = client.execute(httpPost);
        Header header = httpResponse.getFirstHeader("Location");
        if (header != null && Toolbox.isNotEmpty(header.getValue())) {
            MyLog.logInfo("location:" + header.getValue());
            return "SUCCESS";
        } else {
            String html = printResponse(httpResponse);
            return html;
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            client.close();
        } catch (IOException ignored) {
            // The result has already been decided; a failed close is not actionable.
        }
    }
    return "";
}
3.驗證碼
獲取輸入驗證碼頁面資訊:
/**
 * Extracts the hidden form fields and the captcha session id from the HTML of
 * a captcha page.
 *
 * @param url               passed through unchanged as the last constructor argument
 * @param checkCodePageHtml HTML of the captcha page
 * @return the collected form data, or {@code null} if the HTML is empty
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
    if(Toolbox.isEmptyString(checkCodePageHtml)){
        return null;
    }
    Document doc = Jsoup.parse(checkCodePageHtml);
    String action = inputValue(doc, "action");
    String event_submit_do_query = inputValue(doc, "event_submit_do_query");
    String smPolicy = inputValue(doc, "smPolicy");
    String smReturn = inputValue(doc, "smReturn");
    String smApp = inputValue(doc, "smApp");
    String smCharset = inputValue(doc, "smCharset");
    String smTag = inputValue(doc, "smTag");
    String smSign = inputValue(doc, "smSign");
    String identity = inputValue(doc, "identity");
    String captcha = inputValue(doc, "captcha");
    String sessionid = extractSessionId(doc.select("img[id=checkcodeImg]").attr("src"));
    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
}

/** Returns the {@code value} attribute of the input with the given name, as reported by Jsoup. */
private static String inputValue(Document doc, String name){
    return doc.select("input[name=" + name + "]").attr("value");
}

/**
 * Returns the value of the {@code sessionid} query parameter of the captcha
 * image URL: the text after {@code "sessionid="} up to the next {@code '&'},
 * or to the end of the string if none follows. Returns an empty string when
 * the parameter is absent instead of failing on a bad substring range.
 */
private static String extractSessionId(String src){
    final String key = "sessionid=";
    int start = src.indexOf(key);
    if(start < 0){
        return "";
    }
    start += key.length();
    // Search from the value onwards; an earlier '&' belongs to another parameter.
    int end = src.indexOf('&', start);
    return end < 0 ? src.substring(start) : src.substring(start, end);
}
提交驗證碼:
/**
 * Submits a captcha answer together with the form fields captured from the
 * captcha page.
 * <p>
 * The query string of the form action is built from individually URL-encoded
 * values, so that the {@code =} and {@code &} separators stay intact. When
 * the submit does not succeed, the response is parsed as a new captcha page
 * and stored in {@code Constant.ali1688CheckCodeFormData}.
 * NOTE(review): that stored value becomes {@code null} when the response is
 * empty, which makes the next call fail - confirm callers handle this.
 *
 * @param checkcode captcha text to submit
 * @return {@code "SUCCESS"} if the post reported success, otherwise an empty string
 * @throws UnsupportedEncodingException if UTF-8 is not supported or the post rejects the charset
 * @throws IOException declared for callers
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{
    String smApp = Constant.ali1688CheckCodeFormData.getSmApp();
    String smPolicy = Constant.ali1688CheckCodeFormData.getSmPolicy();
    String smCharset = Constant.ali1688CheckCodeFormData.getSmCharset();
    String smTag = Constant.ali1688CheckCodeFormData.getSmTag();
    String smReturn = Constant.ali1688CheckCodeFormData.getSmReturn();
    String smSign = Constant.ali1688CheckCodeFormData.getSmSign();
    // Encode each value on its own; encoding the assembled string would also
    // escape the '=' and '&' delimiters and collapse it into one parameter.
    String query = "smApp="+encodeQueryValue(smApp)
            +"&smPolicy="+encodeQueryValue(smPolicy)
            +"&smCharset="+encodeQueryValue(smCharset)
            +"&smTag="+encodeQueryValue(smTag)
            +"&smReturn="+encodeQueryValue(smReturn)
            +"&smSign="+encodeQueryValue(smSign);
    String formAction = "https://sec.1688.com/query.htm?"+query;
    Map<String,String> parameterMap = new HashMap<String,String>();
    parameterMap.put("action", Constant.ali1688CheckCodeFormData.getAction());
    parameterMap.put("event_submit_do_query", Constant.ali1688CheckCodeFormData.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", Constant.ali1688CheckCodeFormData.getIdentity());
    parameterMap.put("captcha", Constant.ali1688CheckCodeFormData.getCaptcha());
    parameterMap.put("checkcode", checkcode);
    String res = HttpKit.postRequest(formAction, parameterMap, "UTF-8");
    if ("SUCCESS".equals(res)) {
        return "SUCCESS";
    }
    // Not accepted: the response is treated as a fresh captcha page.
    Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn,res);
    return "";
}

/** URL-encodes one query value as UTF-8; a missing value becomes an empty string. */
private static String encodeQueryValue(String value) throws UnsupportedEncodingException {
    return value == null ? "" : java.net.URLEncoder.encode(value, "utf-8");
}
4.exe4j操作:
5.部分介面: