1. 程式人生 > 實用技巧 >網路動態代理反反爬

網路動態代理反反爬

前些天寫了一篇爬蟲的部落格,但當時沒有實現「使用動態代理來反反爬」,今天補充一下。如果需要大量爬取資料,建議還是付費購買代理。

pom檔案見:https://www.cnblogs.com/yhc-910/p/13440456.html

package com.paic.ocss.fps.client.jsoup;

import java.io.File;
import java.io.FileOutputStream;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.commons.compress.utils.Lists;
import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.ss.usermodel.Cell; import org.apache.poi.ss.usermodel.Row; import org.apache.poi.ss.usermodel.Sheet; import org.apache.poi.ss.usermodel.Workbook; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element;
import org.jsoup.select.Elements; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author yhc * @date 2020/8/4 */ public class AnjukeData { private final static Logger log = LoggerFactory.getLogger(AnjukeData.class);
   // 一些已知的免費代理,但是除錯後基本都無法使用,下面有實時獲取動態代理IP的地址,每天可免費獲取十個,需註冊賬號後生成連結:
http://h.etdaili.com/
private final static String[] proxy = { "112.65.53.167:24631", "113.195.171.58:9999", "112.95.22.78:8888", "175.44.109.219:9999", "113.195.18.104:9999", "163.125.30.227:8118", "118.212.105.115:9999", "175.44.109.239:9999", "112.111.77.41:9999", "125.108.84.68:9000", "36.250.156.213:9999", "36.249.53.29:9999", "121.232.148.222:9000", "36.249.109.19:9999", "163.125.31.3:8118", "115.218.209.27:9000", "120.83.106.218:9999", "120.83.109.228:9999" }; private static int proxyIndex = 0;
// 一些agent
static String[] ua = { "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36 OPR/37.0.2178.32", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2486.0 Safari/537.36 Edge/13.10586", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko", "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)", "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.106 BIDUBrowser/8.3 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36 Core/1.47.277.400 QQBrowser/9.4.7658.400", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 UBrowser/5.6.12150.8 Safari/537.36", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36 TheWorld 7", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/60.0" }; // 讀取的資料 private static List<List<String>> data = Lists.newArrayList(); private static String proxyIp = "0.0.0.0"; private static int proxyPort = 80; private static boolean isFirstCall = true; private static void refreshProxy() { isFirstCall = false; String proxy = OkHttpUtil.doGet( 
"http://47.106.160.121/Index-generate_api_url.html?packid=1&fa=0&qty=1&port=1&format=txt&ss=1&css=&ipport=1&pro=%E4%B8%8A%E6%B5%B7%E7%9B%B4%E8%BE%96%E5%B8%82&city=%E4%B8%8A%E6%B5%B7%E5%B8%82&usertype=13"); log.info("# 獲取代理IP:{}", proxy); String[] proxys = proxy.split(":"); proxyIp = proxys[0]; proxyPort = Integer.parseInt(proxys[1]); } public static void getData(String urls) throws Exception { // 讀取資料 List<List<String>> pageData = Lists.newArrayList(); if (isFirstCall) { refreshProxy(); } String agent = ua[new Random().nextInt(ua.length - 1)]; Document doc = Jsoup.connect(urls).timeout(600000).proxy(proxyIp, proxyPort).userAgent(agent) .ignoreContentType(true).ignoreHttpErrors(true) // .header("referer", "https://hanchuanshi.anjuke.com/sale/rd1/?kw=&from=sugg") .get(); String html = doc.outerHtml(); if (html.contains("訪問驗證-安居客")) { log.error("# 請求被攔截,重新設定代理請求"); refreshProxy(); getData(urls); return; } else { log.info("# 請求成功,獲取資料"); } Elements els = doc.body().getElementsByClass("list-item"); for (Element el : els) { List<String> rowData = Lists.newArrayList(); Elements titleEls = el.getElementsByClass("house-title"); log.info("# 標題:{}", titleEls.get(0).getElementsByTag("a").text()); rowData.add(titleEls.get(0).getElementsByTag("a").text()); Elements itemEls = el.getElementsByClass("details-item"); Elements itemSpanEls = itemEls.get(0).getElementsByTag("span"); log.info("# 戶型:{}", itemSpanEls.get(0).text()); rowData.add(itemSpanEls.get(0).text()); log.info("# 面積:{}", itemSpanEls.get(1).text()); rowData.add(itemSpanEls.get(1).text()); log.info("# 樓層:{}", itemSpanEls.get(2).text()); rowData.add(itemSpanEls.get(2).text()); log.info("# 年限:{}", itemSpanEls.get(3).text()); rowData.add(itemSpanEls.get(3).text()); if (itemEls.size() == 1) { continue; } String[] address = itemEls.get(1).getElementsByTag("span").text().split(" "); log.info("# 樓盤:{}", address[0]); log.info("# 地址:{}", address[1]); rowData.add(address[0]); rowData.add(address[1]); Elements priceEls = 
el.getElementsByClass("pro-price"); Elements priceSpanEls = priceEls.get(0).getElementsByTag("span"); log.info("# 總價:{}", priceSpanEls.get(0).getElementsByTag("strong").text()); rowData.add(priceSpanEls.get(0).getElementsByTag("strong").text()); log.info("# 單價:{}", priceSpanEls.get(1).text()); rowData.add(priceSpanEls.get(1).text()); pageData.add(rowData); } data.addAll(pageData); } public static void writeExcel(List<String> titleList, List<List<String>> dataList) throws Exception { // open file. File excel = new File("C:\\Users\\Administrator\\Desktop\\data.xls"); excel.deleteOnExit(); excel.createNewFile(); FileOutputStream fos = new FileOutputStream(excel); Workbook book = new HSSFWorkbook(); // create Sheet named "Sheet_1". 0 means this is 1st page. Sheet sheet = book.createSheet("安居客房源資訊"); // 寫入標題 Row titleRow = sheet.createRow(0); for (int x = 0; x < titleList.size(); x++) { Cell cell0 = titleRow.createCell(x); cell0.setCellValue(titleList.get(x)); } // 寫入資料 for (int i = 0; i < dataList.size(); i++) { int row = i + 1; Row dataRow = sheet.createRow(row); List<String> rowData = dataList.get(i); for (int j = 0; j < titleList.size(); j++) { Cell dataCell = dataRow.createCell(j); dataCell.setCellValue(rowData.get(j)); } } book.write(fos); book.close(); log.info("# write data success"); } public static void main(String[] args) { try { String url = "file:///D:/Users/YUHUCHENG693/Desktop/test.html"; // url = "https://hanchuanshi.anjuke.com/sale/p1-rd1/#filtersort"; for (int i = 1; i <= 50; i++) { url = "https://hanchuanshi.anjuke.com/sale/p" + i + "-rd1/#filtersort"; getData(url); log.info("# ===>>> 獲取{}頁資料成功", i); Thread.sleep(10000); } List<String> titleList = Arrays.asList("標題", "戶型", "面積", "樓層", "年限", "樓盤", "地址", "總價", "單價"); writeExcel(titleList, data); } catch (Exception e) { e.printStackTrace(); } } }