1. 程式人生 > >使用HttpClient和Jsoup定向抓取資料

使用HttpClient和Jsoup定向抓取資料

1.業務需求:

從指定外網抓點貨,冷啟動

2.站點分析:

.限制IP…
.需要登入……
.對登入賬號有抓取頻率限制……….
.允許的抓取頻率過低,稍微抓快一點就直接跳驗證碼頁面…………..
.驗證碼長度、模樣(純數字&字母數字混合)TM不固定………………..

“我們能不能不抓了?“
“不行!必須得抓…”
“……”

這麼說,此前寫的爬蟲,多執行緒、生產者—>消費者 併發抓取壓根行不通。多執行緒毫無意義。

3.使用技術:

1.HttpClient:讀取指定URL網頁內容
2.Jsoup:解析所要的頁面資料——省得寫噁心的正則表示式
3.Swing:繪製使用者操作介面
4.Tess4J:自動識別驗證碼(http://tess4j.sourceforge.net/)
5.Exe4J:生成可獨立執行的exe程式——給每人機器安裝一個,大家一起監控抓~

4.實現要點:

1.代理IP
從一些網站上抓取代理IP,並檢測是否可以使用,如下:

package com.ydj.zhuaqu.proxy;

import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import
java.util.Collections; import java.util.List; import java.util.Random; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; import net.sf.json.JSONArray; import net.sf.json.JSONObject; import
org.apache.commons.collections.map.LRUMap; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.methods.GetMethod; import org.apache.commons.httpclient.params.HttpMethodParams; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.client.config.RequestConfig; import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClientBuilder; import org.apache.http.util.EntityUtils; import org.jsoup.Jsoup; import com.ydj.common.kit.MyLog; /** * * @author : Ares.yi * @createTime : 2014-11-10 上午11:13:42 * @version : 1.0 * @description : * */ public class ProxyIpPool { /**設定最多IP數*/ private static final int MAX_IP = 100; /**設定最少IP數(最好控制和外部使用執行緒數一致)*/ @SuppressWarnings("unused") private static final int MIN_IP = 10; // public static ConcurrentHashMap<Integer,Integer> canUseIPs = new ConcurrentHashMap<Integer,Integer>(); public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP)); private static LRUMap notCanUseIPsTemp = new LRUMap(2000); /**每次抓取IP數*/ private static final int NUM = 20; private static final String ORDER_ID = "904557733280949"; private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid="+ORDER_ID+"&num="+NUM+"&quality=1&an_ha=1&dedup=1&format=json"; private ProxyIpPool(){ } /** * 啟動抓取代理IP執行緒 * * @author : Ares.yi * @createTime : 2015年10月29日 下午5:58:54 */ public static void startCrawl(){ final int period = 3; ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1); scheduledExecutorService.scheduleAtFixedRate(new Runnable() { int i = 0 ; @Override public void run() { produceIP(i); i++; } }, 1, period,TimeUnit.MINUTES); } private static void produceIP(int i){ int currentSize = canUseIpList.size(); if( currentSize >= MAX_IP){ MyLog.logInfo(i+":current proxyPool 
size is:"+currentSize+",no need crawl new ip.NotCanUseIPsTemp size is:"+notCanUseIPsTemp.size()); return ; } JSONArray ips = getIPFromKuaiDaiLi(); produceIP(ips); MyLog.logInfo(i+":current proxyPool size is:"+canUseIpList.size()+",notCanUseIPsTemp size is:"+notCanUseIPsTemp.size()); } private static void produceIP(JSONArray ips){ if(ips == null || ips.isEmpty()){ return ; } for(int i = 0 ;i < ips.size() ;i++ ){ Object one = ips.get(i); String s[] = one.toString().split(":"); String ip = s[0]; int port = Integer.valueOf(s[1]); ProxyIp proxyIp = new ProxyIp(ip, port); if(isCanUse(ip, port)){ addIP(proxyIp); }else{ removeIP(proxyIp); } } } public static ProxyIp useOneProxyIp(){ if(canUseIpList.isEmpty()){ MyLog.logInfo(Thread.currentThread().getName()+" useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP."); try { Thread.sleep(2 * 60 * 1000); } catch (InterruptedException e) { e.printStackTrace(); } } Collections.sort(canUseIpList); ProxyIp proxyIp = canUseIpList.remove(0); proxyIp.useThis(); return proxyIp; } public static void returnProxyIp(ProxyIp proxyIp){ proxyIp.setUseing(false); canUseIpList.add(proxyIp); return ; } /** * 從快代理網站獲取代理IP * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:36:05 */ private static JSONArray getIPFromKuaiDaiLi(){ JSONArray ips = new JSONArray(); HttpClient client = new HttpClient(); GetMethod method = new GetMethod(KDL_URL); HttpMethodParams param = method.getParams(); param.setContentCharset("UTF-8"); try { client.executeMethod(method); String res = method.getResponseBodyAsString(); JSONObject json = JSONObject.fromObject(res); if(json != null && json.containsKey("data")){ ips = json.getJSONObject("data").getJSONArray("proxy_list"); MyLog.logInfo(ips); } } catch (Exception e) { e.printStackTrace(); } return ips; } /** * 從更多的網站獲取代理IP * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:46:40 */ @SuppressWarnings("unused") private static JSONArray getIPFromXXX(){ JSONArray ips = new JSONArray(); 
HttpClient client = new HttpClient(); GetMethod method = new GetMethod("XXX"); HttpMethodParams param = method.getParams(); param.setContentCharset("UTF-8"); try { client.executeMethod(method); String res = method.getResponseBodyAsString(); JSONObject json = JSONObject.fromObject(res); if(json != null && json.containsKey("data")){ ips = json.getJSONObject("data").getJSONArray("proxy_list"); MyLog.logInfo(ips); } } catch (Exception e) { e.printStackTrace(); } return ips; } /** * 檢測代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午2:37:22 */ private static boolean isCanUse(String ip,int port){ if(port < 0 ){ return false; } if(notCanUseIPsTemp.containsKey(ip)){ MyLog.logInfo(ip+":"+port+" can't use again."); return false; } if(!checkIp(ip, port)){ return false; } return checkIpUseTargetSite(ip, port); } /** * 檢測代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午12:35:28 */ private static boolean checkIp(String ip,int port){ Socket server = null; try { server = new Socket(); InetSocketAddress address = new InetSocketAddress(ip,port); server.connect(address, 3000); MyLog.logInfo(ip+":"+port+" is ok!"); return true; }catch (UnknownHostException e) { //e.printStackTrace(); MyLog.logInfo(ip+":"+port+" is wrong!"); } catch (IOException e) { //e.printStackTrace(); MyLog.logInfo(ip+":"+port+" is wrong!!"); } return false; } /** * 到目標網站準確檢測代理IP是否可用 * * @param ip * @param port * @return * * @author : Ares.yi * @createTime : 2015年10月29日 下午12:06:03 */ private static boolean checkIpUseTargetSite(String ip,int port){ HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); CloseableHttpClient closeableHttpClient = httpClientBuilder.build(); HttpHost proxy = new HttpHost(ip,port, "http"); RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build(); HttpGet httpGet = new HttpGet("http://www.autozi.com/partCategory.html/"); 
httpGet.setConfig(config); try { CloseableHttpResponse response = closeableHttpClient.execute(httpGet); HttpEntity httpentity = response.getEntity(); String html = EntityUtils.toString(httpentity, "UTF-8"); if(Jsoup.parse(html).select("div[class=header fix]").first() != null){ return true; } } catch (Exception exc){ // exc.printStackTrace(); MyLog.logError(exc.getMessage()); } return false; } public static void removeIP(ProxyIp proxyIp){ canUseIpList.remove(proxyIp); notCanUseIPsTemp.put(proxyIp.getIp(),proxyIp.getPort()); } public static void addIP(ProxyIp proxyIp){ canUseIpList.add(proxyIp); notCanUseIPsTemp.remove(proxyIp.getIp()); } /** * 測試使用代理IP * * @author : Ares.yi * @createTime : 2015年10月29日 下午6:00:16 */ private static void testUseProxyIp(){ ExecutorService threadPool = Executors.newFixedThreadPool(10); for(int i=0 ;i <20 ;i++){ final int flag = i; threadPool.execute(new Runnable() { @Override public void run() { ProxyIp proxyIp = useOneProxyIp(); MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" get proxyIp is : "+proxyIp.toString()); long millis = new Random().nextInt(10) * 1000; try { Thread.sleep(millis);//每個執行緒隨機sleep N秒,模擬執行緒在工作 } catch (InterruptedException e) { e.printStackTrace(); } returnProxyIp(proxyIp); MyLog.logInfo(flag+" job "+Thread.currentThread().getName()+" use proxyIp is : "+proxyIp.toString()+",work use time "+millis+" end and return to pool."); } }); } } }

使用代理IP:

    /**
     * Fetches the content of a page through an HTTP proxy.
     *
     * <p>The HTTP client and response are always closed, including on the
     * failure paths.
     *
     * @param url address of the page to fetch
     * @param proxyIp proxy host
     * @param proxyPort proxy port
     * @return the response body decoded as UTF-8; {@code "cannot connect"} if the
     *         request could not be executed or the response has no entity; an
     *         empty string if reading the body failed
     * @throws ParseException declared for compatibility with existing callers
     * @throws IOException declared for compatibility with existing callers
     *
     * @author : Ares.yi
     * @createTime : 2015-10-30 09:55:21
     */
    public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException {

        HttpClientBuilder httpClientBuilder = HttpClientBuilder.create();
        CloseableHttpClient closeableHttpClient = httpClientBuilder.build();
        CloseableHttpResponse response = null;

        try {
            HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http");
            RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build();

            // NOTE(review): the request is sent as POST although the variable name and
            // the log message below say GET - confirm which one the target site expects.
            HttpPost httpGet = new HttpPost(url);
            httpGet.setConfig(config);

            try {
                response = closeableHttpClient.execute(httpGet);
            }catch(Exception exc){
                exc.printStackTrace();
                System.out.println("get請求失敗!");
                return "cannot connect";
            }

            HttpEntity httpEntity = response.getEntity();
            if (httpEntity == null) {
                return "cannot connect";
            }

            try{
                return EntityUtils.toString(httpEntity, "UTF-8");
            }catch(Exception excep){
                // Reading the body failed: report which URL and why, then fall back to "".
                System.out.println(url);
                excep.printStackTrace();
                return "";
            }
        } finally {
            if (response != null) {
                try {
                    response.close();
                } catch (IOException ignored) {
                    // cleanup only; the result has already been decided
                }
            }
            try {
                closeableHttpClient.close();
            } catch (IOException ignored) {
                // cleanup only; the result has already been decided
            }
        }
    }

2.模擬登入
提取登入Cookie和User-Agent:
這裡寫圖片描述

程式碼片段,如下:

/**
 * Sends a form-encoded POST request with the stored login cookie and user agent.
 *
 * @param url address to post to
 * @param parameterMap form fields, converted by {@code getParam}
 * @param charSet charset used to encode the form body
 * @return {@code "SUCCESS"} when the response carries a non-empty
 *         {@code Location} header; otherwise the value produced by
 *         {@code printResponse}; an empty string if an {@link IOException} occurs
 * @throws UnsupportedEncodingException if {@code charSet} is not supported
 */
public static String postRequest(String url,
        Map<String, String> parameterMap, String charSet)
        throws UnsupportedEncodingException {
    CloseableHttpClient client = HttpClients.createDefault();

    HttpPost request = new HttpPost(url);
    request.setEntity(new UrlEncodedFormEntity(getParam(parameterMap), charSet));

    request.addHeader("HOST", "sec.1688.com");
    request.addHeader("User-Agent", Constant.userAgent);
    request.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
    request.addHeader("Cookie", Constant.cookie);

    MyLog.logInfo("request line:" + request.getRequestLine());

    // Stays empty when the request fails with an IOException.
    String result = "";

    try {
        HttpResponse reply = client.execute(request);

        Header location = reply.getFirstHeader("Location");
        boolean redirected = location != null && Toolbox.isNotEmpty(location.getValue());

        if (redirected) {
            // A redirect target in the response is treated as success.
            MyLog.logInfo("location:" + location.getValue());
            result = "SUCCESS";
        } else {
            result = printResponse(reply);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        try {
            client.close();
        } catch (IOException e) {
        }
    }

    return result;
}

3.驗證碼
獲取輸入驗證碼頁面資訊:

/**
 * Extracts the hidden form fields and the captcha session id from the HTML of
 * the verification-code page.
 *
 * @param url value stored as the last constructor argument of the form data
 * @param checkCodePageHtml HTML of the verification-code page
 * @return the populated form data, or {@code null} if {@code checkCodePageHtml}
 *         is empty according to {@code Toolbox.isEmptyString}
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url,String checkCodePageHtml){
    if(Toolbox.isEmptyString(checkCodePageHtml)){
        return null;
    }

    Document doc = Jsoup.parse(checkCodePageHtml);

    String action = doc.select("input[name=action]").attr("value");
    String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
    String smPolicy = doc.select("input[name=smPolicy]").attr("value");
    String smReturn = doc.select("input[name=smReturn]").attr("value");
    String smApp = doc.select("input[name=smApp]").attr("value");
    String smCharset = doc.select("input[name=smCharset]").attr("value");
    String smTag = doc.select("input[name=smTag]").attr("value");
    String smSign = doc.select("input[name=smSign]").attr("value");
    String identity = doc.select("input[name=identity]").attr("value");
    String captcha = doc.select("input[name=captcha]").attr("value");

    // The session id is carried as a query parameter of the captcha image URL.
    String sessionid = extractSessionId(doc.select("img[id=checkcodeImg]").attr("src"));

    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset, smTag, smSign, identity, captcha, sessionid,url);
}

/**
 * Returns the value of the {@code sessionid} parameter inside {@code src}.
 *
 * <p>The value ends at the first {@code &} that follows {@code sessionid=}, or at
 * the end of the string when there is none. An empty string is returned when
 * {@code src} is null or does not contain {@code sessionid=}, instead of failing
 * with a StringIndexOutOfBoundsException.
 */
private static String extractSessionId(String src){
    final String key = "sessionid=";

    if(src == null){
        return "";
    }

    int keyAt = src.indexOf(key);
    if(keyAt < 0){
        return "";
    }

    int from = keyAt + key.length();
    int to = src.indexOf('&', from);

    return to < 0 ? src.substring(from) : src.substring(from, to);
}

提交驗證碼:

/**
 * Submits the verification code together with the hidden form fields captured
 * in {@code Constant.ali1688CheckCodeFormData}.
 *
 * <p>On failure the response HTML is parsed again so that the stored form data
 * matches the newly issued verification page.
 *
 * @param checkcode the verification code to submit
 * @return {@code "SUCCESS"} if {@code HttpKit.postRequest} reports success;
 *         an empty string otherwise, including when no form data is stored
 * @throws UnsupportedEncodingException if UTF-8 encoding is unavailable
 * @throws IOException declared for compatibility with existing callers
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException{

    // getCheckCodeFormData may have stored null after a previous failed attempt.
    if (Constant.ali1688CheckCodeFormData == null) {
        return "";
    }

    String smApp = Constant.ali1688CheckCodeFormData.getSmApp();
    String smPolicy = Constant.ali1688CheckCodeFormData.getSmPolicy();
    String smCharset = Constant.ali1688CheckCodeFormData.getSmCharset();
    String smTag = Constant.ali1688CheckCodeFormData.getSmTag();
    String smReturn = Constant.ali1688CheckCodeFormData.getSmReturn();
    String smSign = Constant.ali1688CheckCodeFormData.getSmSign();

    // Encode each value on its own: encoding the assembled query string would
    // also escape the '=' and '&' separators and corrupt the parameters.
    StringBuilder query = new StringBuilder();
    appendQueryParam(query, "smApp", smApp);
    appendQueryParam(query, "smPolicy", smPolicy);
    appendQueryParam(query, "smCharset", smCharset);
    appendQueryParam(query, "smTag", smTag);
    appendQueryParam(query, "smReturn", smReturn);
    appendQueryParam(query, "smSign", smSign);

    String formAction = "https://sec.1688.com/query.htm?" + query;

    Map<String,String> parameterMap = new HashMap<String,String>();
    parameterMap.put("action", Constant.ali1688CheckCodeFormData.getAction());
    parameterMap.put("event_submit_do_query", Constant.ali1688CheckCodeFormData.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", Constant.ali1688CheckCodeFormData.getIdentity());
    parameterMap.put("captcha", Constant.ali1688CheckCodeFormData.getCaptcha());
    parameterMap.put("checkcode", checkcode);

    String res = HttpKit.postRequest(formAction, parameterMap,  "UTF-8");

    if ("SUCCESS".equals(res)) {
        return "SUCCESS";
    }

    // Not accepted: the response is treated as a fresh verification page.
    Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn, res);

    return "";
}

/** Appends {@code name=value} to {@code query}, URL-encoding the value and adding '&' between pairs. */
private static void appendQueryParam(StringBuilder query, String name, String value) throws UnsupportedEncodingException {
    if (query.length() > 0) {
        query.append('&');
    }
    query.append(name).append('=').append(java.net.URLEncoder.encode(value == null ? "" : value, "UTF-8"));
}

4.exe4j操作:
這裡寫圖片描述

5.部分介面:

這裡寫圖片描述
這裡寫圖片描述
這裡寫圖片描述

6.原始碼: