1. 程式人生 > Java爬蟲——代理IP

Java爬蟲——代理IP

1 核心程式碼-爬蟲類
package cn.tyoui.httpclient;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import 
org.apache.http.impl.client.CloseableHttpClient; import org.apache.http.impl.client.HttpClients; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Random; /** * 爬蟲網頁 */ public class HttpCrawler { private
CloseableHttpClient httpClient = HttpClients.createDefault(); private List<ProxyIP> list = null; //儲存爬取的網頁 private String dir = null; /** * 代理初始化 * * @throws Exception */ public void proxyInit(String proxyText) throws Exception { list = new ArrayList<>(); List<String> listIP = FileUtils.readLines
(new File(proxyText)); for (String str : listIP) { String ip = str.split(":")[0]; int port = Integer.parseInt(str.split(":")[1]); ProxyIP proxyIp = new ProxyIP(ip, port); list.add(proxyIp); } } /** * 開始爬取* * @param webURL 要爬取的網址 * @throws Exception 爬取失敗 */ public void startCrawler(String webURL) throws Exception { String path = dir + File.separator + webURL.substring(webURL.lastIndexOf("/")) + ".html"; File file = new File(path); if (file.exists() && file.length() > 20_000) return; if (list == null) { crawler(webURL, path, null, 0); } else { int index = new Random().nextInt(list.size()); crawler(webURL, path, list.get(index), index); } } /** * 爬蟲 * * @param url 要爬的網址 * @param path 儲存的路徑 * @param proxy 代理ip的物件 * @param index 第幾個代理ip * @throws CloneNotSupportedException 關閉流失敗 * @throws IOException 關閉流失敗 */ private void crawler(String url, String path, ProxyIP proxy, int index) throws CloneNotSupportedException, IOException { CloseableHttpResponse response = null; HttpGet httpGet = null; try { httpGet = new HttpGet(url); httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36"); RequestConfig requestConfig = null; if (proxy == null) { requestConfig = RequestConfig.custom().setConnectTimeout(2000).setSocketTimeout(1000).build(); } else { HttpHost httpHost = new HttpHost(proxy.getIp(), proxy.getPort()); requestConfig = RequestConfig.custom().setProxy(httpHost).setConnectTimeout(2000).setSocketTimeout(1000).build(); } httpGet.setConfig(requestConfig); response = httpClient.execute(httpGet); int status = response.getStatusLine().getStatusCode(); if (status == 200) { HttpEntity entity = response.getEntity(); entity.writeTo(new FileOutputStream(path)); System.out.println("下載成功!" + url); } else { if (list != null) list.remove(index); throw new Exception("爬取到的網頁非正常!"); } } catch (Exception e) { System.err.println(e); System.err.println("下載失敗!" 
+ url); } finally { if (httpGet != null) httpGet.clone(); if (response != null) response.close(); } } /** * 儲存爬取網頁發的資料夾 * * @param dir 資料夾 */ public void setDir(String dir) { this.dir = dir; File file=new File(dir); if(!file.exists()) file.mkdirs(); } /** * 關閉爬取流 */ public void close() { try { httpClient.close(); } catch (IOException e) { e.printStackTrace(); } } /** * 獲取代理ip連結串列 * * @return */ public List<ProxyIP> getList() { return list; }
/**
 * 測試
*/
  public static void main(String[] args) throws Exception {
    HttpCrawler httpCrawler = new HttpCrawler();
    httpCrawler.setDir("D:\\baidu");//新增儲存網頁資料夾
//  httpCrawler.proxyInit("E:\\IDECode\\StringUtils\\text\\代理ip.txt");//代理ip文字路徑
httpCrawler.startCrawler("http://www.baidu.com");//要爬取的網址
httpCrawler.close();//關閉爬蟲流
  }
}
2 代理IP實體類（ProxyIP）
package cn.tyoui.httpclient;

/**
 * Simple mutable holder for one proxy server address (host IP and port).
 */
class ProxyIP {

    private String ip;   // proxy host address
    private int port;    // proxy port number

    /**
     * Creates a proxy entry.
     *
     * @param ip   proxy host address
     * @param port proxy port number
     */
    ProxyIP(String ip, int port) {
        this.ip = ip;
        this.port = port;
    }

    /** @return the proxy host address */
    public String getIp() {
        return ip;
    }

    /** @param ip the new proxy host address */
    public void setIp(String ip) {
        this.ip = ip;
    }

    /** @return the proxy port number */
    public int getPort() {
        return port;
    }

    /** @param port the new proxy port number */
    public void setPort(int port) {
        this.port = port;
    }
}
3 maven pom.xml
<properties>
    <!-- org.apache.httpcomponents:httpclient（非舊版 commons-httpclient 3.x） -->
    <httpclient.version>4.5.3</httpclient.version>
    <commons-io.version>2.4</commons-io.version>
</properties>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>${httpclient.version}</version>
</dependency>
<dependency>
    <groupId>commons-io</groupId>
    <artifactId>commons-io</artifactId>
    <version>${commons-io.version}</version>
</dependency>
</dependencies>
4 JDK 1.8
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-resources-plugin</artifactId>
            <version>2.7</version>
            <configuration>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>

5 代理IP格式