Java爬蟲——代理IP
阿新 • 發佈:2019-02-01
1 核心程式碼-爬蟲類
package cn.tyoui.httpclient;

import org.apache.commons.io.FileUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;

/**
 * Simple web-page crawler that saves pages to disk and can optionally route
 * requests through a pool of proxy IPs loaded from a text file
 * (one {@code ip:port} entry per line).
 */
public class HttpCrawler {

    /** Pretend to be a desktop Chrome browser so servers serve normal pages. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                    + "(KHTML, like Gecko) Chrome/44.0.2403.157 Safari/537.36";

    private final CloseableHttpClient httpClient = HttpClients.createDefault();

    /** Reused generator for picking a random proxy (no need to re-seed per call). */
    private final Random random = new Random();

    /** Proxy pool; {@code null} means connect directly. */
    private List<ProxyIP> list = null;

    /** Directory where downloaded pages are stored. */
    private String dir = null;

    /**
     * Loads the proxy pool from a text file, one {@code ip:port} entry per line.
     *
     * @param proxyText path of the proxy-list text file
     * @throws Exception if the file cannot be read or an entry is malformed
     */
    public void proxyInit(String proxyText) throws Exception {
        list = new ArrayList<>();
        List<String> listIP = FileUtils.readLines(new File(proxyText));
        for (String str : listIP) {
            String[] parts = str.split(":");
            list.add(new ProxyIP(parts[0], Integer.parseInt(parts[1])));
        }
    }

    /**
     * Crawls a single URL and saves it under the directory set via
     * {@link #setDir(String)}. A page already on disk and larger than 20 KB
     * is treated as complete and skipped.
     *
     * @param webURL the URL to crawl
     * @throws Exception if crawling fails
     */
    public void startCrawler(String webURL) throws Exception {
        // BUGFIX: +1 so the leading '/' is not copied into the file name.
        String path = dir + File.separator
                + webURL.substring(webURL.lastIndexOf("/") + 1) + ".html";
        File file = new File(path);
        if (file.exists() && file.length() > 20_000)
            return;
        if (list == null || list.isEmpty()) {
            // BUGFIX: also fall back to a direct connection when eviction has
            // emptied the pool (Random.nextInt(0) would throw).
            crawler(webURL, path, null, 0);
        } else {
            int index = random.nextInt(list.size());
            crawler(webURL, path, list.get(index), index);
        }
    }

    /**
     * Performs one HTTP GET and writes the response body to {@code path}.
     * Failures are logged and swallowed (best effort, as in the original
     * design); a non-200 response additionally evicts the proxy that
     * produced it from the pool.
     *
     * @param url   URL to fetch
     * @param path  file the response body is written to
     * @param proxy proxy to route through, or {@code null} for a direct request
     * @param index position of {@code proxy} in the pool (used for eviction)
     * @throws IOException if closing the response fails
     */
    private void crawler(String url, String path, ProxyIP proxy, int index) throws IOException {
        CloseableHttpResponse response = null;
        HttpGet httpGet = null;
        try {
            httpGet = new HttpGet(url);
            httpGet.setHeader("User-Agent", USER_AGENT);
            RequestConfig.Builder config = RequestConfig.custom()
                    .setConnectTimeout(2000)
                    .setSocketTimeout(1000);
            if (proxy != null)
                config.setProxy(new HttpHost(proxy.getIp(), proxy.getPort()));
            httpGet.setConfig(config.build());

            response = httpClient.execute(httpGet);
            int status = response.getStatusLine().getStatusCode();
            if (status == 200) {
                HttpEntity entity = response.getEntity();
                // BUGFIX: try-with-resources closes the output stream
                // (it was leaked in the original).
                try (OutputStream out = new FileOutputStream(path)) {
                    entity.writeTo(out);
                }
                System.out.println("下載成功!" + url);
            } else {
                // BUGFIX: the original threw an Exception only to catch it
                // itself two lines below; handle the bad status inline.
                if (list != null)
                    list.remove(index);
                System.err.println("爬取到的網頁非正常!");
                System.err.println("下載失敗!" + url);
            }
        } catch (Exception e) {
            System.err.println(e);
            System.err.println("下載失敗!" + url);
        } finally {
            // BUGFIX: the original called httpGet.clone() here, which neither
            // releases the connection nor does anything useful (and forced a
            // bogus `throws CloneNotSupportedException`); release instead.
            if (httpGet != null)
                httpGet.releaseConnection();
            if (response != null)
                response.close();
        }
    }

    /**
     * Sets (and creates, if necessary) the directory downloaded pages go to.
     *
     * @param dir target directory
     */
    public void setDir(String dir) {
        this.dir = dir;
        File file = new File(dir);
        if (!file.exists())
            file.mkdirs();
    }

    /** Closes the underlying HTTP client. */
    public void close() {
        try {
            httpClient.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the proxy pool, or {@code null} if {@link #proxyInit(String)}
     *         was never called
     */
    public List<ProxyIP> getList() {
        return list;
    }

    /** Demo entry point. */
    public static void main(String[] args) throws Exception {
        HttpCrawler httpCrawler = new HttpCrawler();
        httpCrawler.setDir("D:\\baidu"); // folder the downloaded pages go to
        // httpCrawler.proxyInit("E:\\IDECode\\StringUtils\\text\\代理ip.txt"); // path of the proxy-ip text file
        httpCrawler.startCrawler("http://www.baidu.com"); // URL to crawl
        httpCrawler.close(); // shut down the crawler's HTTP client
    }
}
2 代理IP類
package cn.tyoui.httpclient;

/**
 * Mutable value holder for a single proxy endpoint: an IP address plus a port.
 */
class ProxyIP {

    private String ip;
    private int port;

    /**
     * @param ip   proxy host address
     * @param port proxy port
     */
    ProxyIP(String ip, int port) {
        this.ip = ip;
        this.port = port;
    }

    public String getIp() {
        return ip;
    }

    public int getPort() {
        return port;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public void setPort(int port) {
        this.port = port;
    }
}
3 maven pom.xml
<properties>
    <!-- NOTE(review): despite the property name, this is the version of the
         org.apache.httpcomponents:httpclient artifact used below, not the
         legacy commons-httpclient artifact. -->
    <commons-httpclient>4.5.3</commons-httpclient>
    <commons-io.version>2.4</commons-io.version>
</properties>

<dependencies>
    <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>${commons-httpclient}</version>
    </dependency>
    <!-- Provides FileUtils.readLines used by HttpCrawler.proxyInit. -->
    <dependency>
        <groupId>commons-io</groupId>
        <artifactId>commons-io</artifactId>
        <version>${commons-io.version}</version>
    </dependency>
</dependencies>

4 JDK 1.8
<build>
    <plugins>
        <!-- Copy resources as UTF-8 so the Chinese strings survive the build. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-resources-plugin</artifactId>
            <version>2.7</version>
            <configuration>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
        <!-- Compile for Java 8 with UTF-8 sources. -->
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <version>3.6.0</version>
            <configuration>
                <source>1.8</source>
                <target>1.8</target>
                <encoding>UTF-8</encoding>
            </configuration>
        </plugin>
    </plugins>
</build>
5 代理IP格式