java爬取天眼查並存入excel中
阿新 • • 發佈:2018-11-02
功能:
自動讀取company.txt檔案中的公司名進行搜尋
把搜尋到含有公司詳細資訊的html儲存在info資料夾
把html檔案中的資訊提取到excel表格中
判斷是否出現機器人驗證
斷點續查(關了再開啟不會重複查詢)
缺點:無法跳過機器人驗證
程式分為兩個執行檔案,不是一鍵完成
偶爾會卡住(請求沒有響應)
問題記錄
登入問題:一開始沒有登入,查詢的時候總是被攔截跳轉到登入頁面。在瀏覽器上登入,複製cookie資訊,在程式碼中設定即可。
機器人驗證:據我觀察,同一個IP呼叫天眼查網站上的介面大約100次就會出現一次機器人驗證。雖然很想自動完成,但是能力有限,實現不了,後來想想採取了一個折中的方法,在程式碼裡面檢測是否出現機器人驗證。當出現機器人驗證的時候,列印驗證的地址,程式暫停。等待人工完成驗證後,輸入OK再繼續往下執行。
程式卡住:不知道是程式碼的問題還是網站的問題。每查詢一百多個公司的時候,總會有個請求等不到響應,一直在等待。雖然做了處理,把程式關了再開啟還是會繼續往後查詢,但是挺糾結。以後再處理。
圖片編碼:試著破解機器人驗證的過程中發現一個挺有意思的地方。天眼查網站的機器人驗證是點選漢字的方式,頁面中有兩張圖片。這兩張圖片有個比較有意思的地方是採用Base64編碼的方式進行傳輸的。也就是把圖片的二進位資料轉為字串的形式,以前不知道還有這種操作。
依賴jar包
httpclient:模擬發起HTTP請求
jsoup:解析HTML
poi-ooxml:Excel表格操作
關鍵程式碼
設定請求頭:非常關鍵,需要把登入後的cookie資訊複製在這裡設定,不然呼叫介面的時候會被攔截跳轉到登入頁面。
/** * 設定請求頭 * @param httpGet */ public static void setHttpHeaders(HttpGet httpGet) { //設定預設請求頭 在瀏覽器登陸後,把cookie的內容複製到這裡設定cookie,不然無法查詢 httpGet.setHeader("Cookie",""); httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8"); httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9"); httpGet.setHeader("Connection", "keep-alive"); httpGet.setHeader("Host", "www.tianyancha.com"); httpGet.setHeader("Referer", "https://www.tianyancha.com/"); httpGet.setHeader("Upgrade-Insecure-Requests", "1"); httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"); }
下面是全部的程式碼:
package cn.xiaoyanol.crawler;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Scanner;

/**
 * Looks up the business registration details of the companies listed in
 * {@code company.txt} on the Tianyancha web site and stores each company's
 * detail page as {@code info/<company name>.html}.
 *
 * <p>Companies that already have a saved page are skipped, so an interrupted
 * run can simply be restarted. When the site answers with a non-200 status the
 * program treats it as a robot verification, prints the URL to open and waits
 * for the operator to type {@code ok}.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 * @Time: 5:09 PM
 */
public class TianYanChaCrawler {

    /** Text file holding one company name per line. */
    private static final String COMPANY_LIST_FILE = "company.txt";

    /** Directory that receives the downloaded detail pages. */
    private static final String OUTPUT_DIRECTORY = "info";

    /** Suffix of the saved detail pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Site root; also used as base URI when resolving links of the search page. */
    private static final String SITE_URL = "https://www.tianyancha.com/";

    /** System property that overrides the built-in cookie, e.g. {@code -Dtyc.cookie=...}. */
    private static final String COOKIE_PROPERTY = "tyc.cookie";

    /**
     * Cookie used when {@link #COOKIE_PROPERTY} is not set.
     *
     * NOTE(review): this is a captured login session (it embeds an auth token and a
     * phone number) and should not live in source control; prefer the system property.
     */
    private static final String DEFAULT_COOKIE = "TYCID=3f2b49d0cd4111e8a5e549f497d021fa; undefined=3f2b49d0cd4111e8a5e549f497d021fa; ssuid=5123032640; _ga=GA1.2.1228278129.1539254092; _gid=GA1.2.1873227241.1539582139; tyc-user-info=%257B%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUyMjg4NzcxMyIsImlhdCI6MTUzOTU4MjI4MSwiZXhwIjoxNTU1MTM0MjgxfQ.0uq_J8GQo2sJSX_xaV1COHURUqbdCdBrRXdjRlwAyH0EuuR3xxLd1R5VtnQAzuZPVlf-9GJN2wRyYBDktKfHNg%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252243%2522%252C%2522onum%2522%253A%25220%2522%252C%2522mobile%2522%253A%252215522887713%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNTUyMjg4NzcxMyIsImlhdCI6MTUzOTU4MjI4MSwiZXhwIjoxNTU1MTM0MjgxfQ.0uq_J8GQo2sJSX_xaV1COHURUqbdCdBrRXdjRlwAyH0EuuR3xxLd1R5VtnQAzuZPVlf-9GJN2wRyYBDktKfHNg; RTYCID=5ecd4dc1fe2a41ff8c5337236243115a; CT_TYCID=e65b84b666fe47febe8d3d669a99d899; aliyungf_tc=AQAAAKGn5XMvqAkAxZFsyv2ebLRydO1v; csrfToken=eGbxdFydN_eDMCrU8Pxv5JJm";

    /** Result of handling one company. */
    private enum Outcome {
        /** The detail page was saved and its business information printed. */
        SAVED,
        /** Nothing usable was found (or the request failed); move on to the next company. */
        SKIPPED,
        /** The robot verification was not completed; stop the whole run. */
        ABORTED
    }

    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException if the company list cannot be read
     */
    public static void main(String[] args) throws IOException {
        // Names of the companies to look up.
        List<String> companyNameList = readCompanyNames(new File(COMPANY_LIST_FILE));

        // Directory for the saved detail pages; existing pages mark finished companies.
        File directory = new File(OUTPUT_DIRECTORY);
        if (!directory.exists()) {
            directory.mkdir();
        } else {
            removeAlreadyFetched(directory, companyNameList);
        }

        if (companyNameList.isEmpty()) {
            System.out.println("沒有要搜尋的公司,程式即將關閉。。。");
            return;
        }
        System.out.println("程式將要搜尋 " + companyNameList.size() + " 個公司的資訊。。。");
        System.out.println();

        HttpClient httpClient = HttpClientBuilder.create().build();
        HttpClientContext context = HttpClientContext.create();
        Scanner scanner = new Scanner(System.in);
        // Bound every phase of a request. With only a connect timeout, a stalled
        // response or an exhausted connection pool blocks the program forever.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(5000)
                .setConnectionRequestTimeout(5000)
                .setSocketTimeout(30000)
                .build();
        Random random = new Random();

        for (int index = 0; index < companyNameList.size(); index++) {
            String companyName = companyNameList.get(index);
            Outcome outcome;
            try {
                outcome = crawlCompany(httpClient, context, requestConfig, scanner,
                        directory, index + 1, companyName);
            } catch (IOException e) {
                // A failed company is not saved, so the next run retries it.
                System.err.println("獲取 " + companyName + " 的資訊失敗:" + e);
                outcome = Outcome.SKIPPED;
            }
            if (outcome == Outcome.ABORTED) {
                break;
            }
            if (outcome == Outcome.SAVED) {
                System.out.println();
                System.out.println((index + 1) + "、提取:" + companyName + " 資訊完成");
                System.out.println();
            }

            if (index == companyNameList.size() - 1) {
                System.out.println("搜尋完成,程式即將結束。。。");
            } else if (!pause(random.nextInt(2) + 1)) {
                break;
            }
        }
    }

    /**
     * Reads the company names, one per line, dropping spaces, blank lines and duplicates.
     *
     * NOTE(review): the file is decoded with the platform default charset, as before;
     * confirm the encoding of company.txt if names come out garbled.
     */
    private static List<String> readCompanyNames(File companyFile) throws IOException {
        List<String> names = new ArrayList<String>();
        BufferedReader reader = new BufferedReader(new FileReader(companyFile));
        try {
            String line;
            while ((line = reader.readLine()) != null) {
                String name = line.trim().replaceAll(" ", "");
                if (name.length() > 0 && !names.contains(name)) {
                    names.add(name);
                }
            }
        } finally {
            reader.close();
        }
        return names;
    }

    /** Removes from the list every company that already has a saved page in the directory. */
    private static void removeAlreadyFetched(File directory, List<String> companyNameList) {
        String[] fileNames = directory.list();
        if (fileNames == null) {
            // Not a directory, or it cannot be listed: nothing to filter.
            return;
        }
        for (String fileName : fileNames) {
            if (!fileName.endsWith(HTML_SUFFIX)) {
                continue;
            }
            String fetched = fileName.substring(0, fileName.length() - HTML_SUFFIX.length());
            if (companyNameList.remove(fetched)) {
                System.out.println(fetched + " 已經搜尋過,該公司將被跳過。。。");
                System.out.println();
            }
        }
    }

    /** Searches one company, saves its detail page and prints the business information. */
    private static Outcome crawlCompany(HttpClient httpClient, HttpClientContext context,
            RequestConfig requestConfig, Scanner scanner, File directory,
            int number, String companyName) throws IOException {
        System.out.println(number + "、正在獲取 " + companyName + " 的資訊。。。");
        System.out.println();

        // Encode the name so characters such as '&', '#' or '+' stay part of the key.
        HttpGet searchRequest = newRequest(
                SITE_URL + "search?key=" + URLEncoder.encode(companyName, "UTF-8"), requestConfig);
        HttpResponse response = executeWithVerification(httpClient, context, searchRequest, scanner);
        if (response == null) {
            return Outcome.ABORTED;
        }
        System.out.println(number + "、搜尋 " + companyName + " 資訊完成");
        System.out.println();

        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return Outcome.SKIPPED;
        }

        // Pull the URL of the first hit out of the search page. The base URI lets
        // "abs:href" resolve relative links as well.
        String html = EntityUtils.toString(entity, "UTF-8");
        Document searchPage = Jsoup.parse(html, SITE_URL);
        String url = searchPage.select("a.name").attr("abs:href");
        if ("".equals(url)) {
            System.out.println("無法在天眼查網站查詢到:" + companyName + " 的資訊");
            System.out.println();
            return Outcome.SKIPPED;
        }
        System.out.println(number + "、獲取 " + companyName + " 詳情URL成功。。。");
        System.out.println();

        System.out.println("準備獲取詳細資訊。。。");
        HttpGet detailRequest = newRequest(url, requestConfig);
        response = executeWithVerification(httpClient, context, detailRequest, scanner);
        if (response == null) {
            return Outcome.ABORTED;
        }
        entity = response.getEntity();
        if (entity == null) {
            return Outcome.SKIPPED;
        }

        File htmlFile = new File(directory, companyName + HTML_SUFFIX);
        saveEntity(entity, htmlFile);
        return printBusinessInfo(htmlFile, number, companyName) ? Outcome.SAVED : Outcome.SKIPPED;
    }

    /** Builds a GET request carrying the default headers and timeouts. */
    private static HttpGet newRequest(String url, RequestConfig requestConfig) {
        HttpGet httpGet = new HttpGet(url);
        setHttpHeaders(httpGet);
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Executes the request; on a robot verification asks the operator to solve it and retries once.
     *
     * @return the accepted response, or {@code null} when the verification was not completed
     */
    private static HttpResponse executeWithVerification(HttpClient httpClient,
            HttpClientContext context, HttpGet httpGet, Scanner scanner) throws IOException {
        HttpResponse response = httpClient.execute(httpGet, context);
        System.out.println("HTTP請求執行完成。。。");
        if (!checkRobotVerification(response, context)) {
            return response;
        }

        List<URI> redirectLocations = context.getRedirectLocations();
        System.out.println("注意!出現機器人驗證,請點選下面的連結,在驗證完後輸入 ok 繼續執行。。。");
        System.out.println();
        // Without a recorded redirect, fall back to the URL that was rejected.
        if (redirectLocations != null && !redirectLocations.isEmpty()) {
            System.out.println(redirectLocations.get(0));
        } else {
            System.out.println(httpGet.getURI());
        }
        System.out.println();
        // Give the connection back to the pool; an unconsumed response keeps it
        // leased, and after a few verifications execute() would wait forever.
        EntityUtils.consumeQuietly(response.getEntity());

        if (!waitForOk(scanner)) {
            return null;
        }
        System.out.println();

        // Run the same request again now that the verification is (supposedly) done.
        response = httpClient.execute(httpGet, context);
        if (checkRobotVerification(response, context)) {
            EntityUtils.consumeQuietly(response.getEntity());
            System.out.println("沒有完成機器人驗證,程式結束執行。。。。");
            System.out.println();
            return null;
        }
        return response;
    }

    /**
     * Prompts until the operator enters "ok" (case-insensitive).
     *
     * @return {@code false} if standard input ends before "ok" was entered
     */
    private static boolean waitForOk(Scanner scanner) {
        System.out.print("完成驗證後,請在此處輸入OK:");
        while (scanner.hasNextLine()) {
            if ("ok".equalsIgnoreCase(scanner.nextLine().trim())) {
                return true;
            }
            System.out.print("完成驗證後,請在此處輸入OK:");
        }
        return false;
    }

    /** Streams the response body into the target file; a partial file is removed on failure. */
    private static void saveEntity(HttpEntity entity, File target) throws IOException {
        boolean complete = false;
        InputStream content = entity.getContent();
        try {
            FileOutputStream output = new FileOutputStream(target);
            try {
                byte[] buff = new byte[2048];
                int length;
                while ((length = content.read(buff, 0, buff.length)) != -1) {
                    output.write(buff, 0, length);
                }
                complete = true;
            } finally {
                output.close();
            }
        } finally {
            content.close();
            if (!complete) {
                // A truncated page would otherwise mark the company as finished.
                target.delete();
            }
        }
    }

    /**
     * Prints the label/value pairs of the second {@code tbody} of the saved page.
     *
     * @return {@code false} when the page has fewer than two {@code tbody} elements
     */
    private static boolean printBusinessInfo(File htmlFile, int number, String companyName)
            throws IOException {
        Document document = Jsoup.parse(htmlFile, "UTF-8");
        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            System.out.println("注意!" + companyName + " 無法查詢到工商資訊。。。");
            return false;
        }
        Elements rows = tbodys.get(1).select("tr");
        System.out.println(number + "、" + companyName + " 的工商資訊如下:");
        System.out.println();
        for (Element row : rows) {
            Elements tds = row.select("td");
            for (int j = 0; j < tds.size(); j++) {
                if (j % 2 == 0) {
                    // Even cells are labels; keep only the first word.
                    System.out.print(tds.get(j).text().split(" ")[0] + " : ");
                } else {
                    System.out.print(tds.get(j).text() + "\t\t\t");
                }
            }
            System.out.println();
            System.out.println();
        }
        return true;
    }

    /**
     * Sleeps for the given number of seconds between two companies.
     *
     * @return {@code false} if the thread was interrupted while sleeping
     */
    private static boolean pause(int seconds) {
        System.out.println("系統暫停:" + seconds + "秒");
        System.out.println();
        try {
            Thread.sleep(seconds * 1000L);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /**
     * Checks whether the response indicates a robot verification.
     *
     * <p>Any status other than 200 is treated as a verification.
     *
     * @param response the response to inspect
     * @param context  the execution context of the request (currently not inspected)
     * @return {@code true} if the status code is not 200
     */
    public static boolean checkRobotVerification(HttpResponse response, HttpClientContext context) {
        return response.getStatusLine().getStatusCode() != HttpStatus.SC_OK;
    }

    /**
     * Applies the default, browser-like request headers.
     *
     * <p>The cookie of a logged-in browser session is required, otherwise requests are
     * redirected to the login page. It is taken from the {@code tyc.cookie} system
     * property when set, and from the built-in value otherwise.
     *
     * @param httpGet the request whose headers are set
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        String cookie = System.getProperty(COOKIE_PROPERTY);
        if (cookie == null || cookie.trim().length() == 0) {
            cookie = DEFAULT_COOKIE;
        }
        httpGet.setHeader("Cookie", cookie);
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

    /**
     * Placeholder with no behavior; kept so existing callers still compile.
     *
     * @param response unused
     * @param context  unused
     */
    public static void UITips(HttpResponse response, HttpClientContext context) {
    }
}
下面是生成excel的程式碼:
package cn.xiaoyanol.crawler;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Extracts the data of the saved company pages ({@code info/*.html}) into the
 * Excel workbook {@code 查詢結果.xlsx}.
 *
 * <p>One row is written per page, starting at row 1 below the header row. The
 * workbook is created with its header row when the file does not exist yet.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 * @Time: 2:15 PM
 */
public class GetExcel {

    /** Suffix of the saved company pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Name of the sheet that receives the rows. */
    private static final String SHEET_NAME = "sheet1";

    /** Column titles of the header row, in column order. */
    private static final String[] HEADERS = {
        "搜尋公司名", "實際公司名", "工商註冊號", "組織機構程式碼", "統一信用程式碼",
        "公司型別", "納稅人識別號", "行業", "營業期限", "核准日期",
        "納稅人資質", "人員規模", "實繳資本", "登記機關", "參保人數",
        "英文名稱", "註冊地址", "經營範圍", "法定代表人"
    };

    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException            if the workbook cannot be read or written
     * @throws InvalidFormatException declared for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {
        File directory = new File("info");
        if (!directory.isDirectory()) {
            System.out.println("資料夾不存在,程式結束執行");
            return;
        }

        // Collect the pages to extract. Only names ending in ".html" qualify,
        // because the company name is derived by stripping exactly that suffix.
        String[] files = directory.list();
        List<String> fileNames = new ArrayList<String>();
        if (files != null) {
            for (String file : files) {
                if (file.endsWith(HTML_SUFFIX)) {
                    fileNames.add(file);
                }
            }
        }

        File xlsxFile = new File("查詢結果.xlsx");
        Workbook workbook = openOrCreateWorkbook(xlsxFile);
        try {
            Sheet sheet = workbook.getSheet(SHEET_NAME);
            if (sheet == null) {
                // An existing workbook without the expected sheet gets a fresh one.
                sheet = workbook.createSheet(SHEET_NAME);
                writeHeader(sheet);
            }

            int rowNum = 1;
            for (String fileName : fileNames) {
                try {
                    List<String> messageList = extractCompanyInfo(directory, fileName);
                    Row row = sheet.createRow(rowNum++);
                    for (int i = 0; i < messageList.size(); i++) {
                        row.createCell(i).setCellValue(messageList.get(i));
                    }
                    System.out.println("rowNum:" + (rowNum - 1) + " " + fileName);
                } catch (Exception e) {
                    // Per-file boundary: one unreadable page must not stop the
                    // export, but the reason is reported instead of being dropped.
                    System.out.println(fileName + "-------------------");
                    System.err.println(fileName + " 提取失敗:" + e);
                }
            }

            // Write the workbook once, after all rows are in place.
            FileOutputStream outputStream = new FileOutputStream(xlsxFile);
            try {
                workbook.write(outputStream);
            } finally {
                outputStream.close();
            }
        } finally {
            workbook.close();
        }
    }

    /** Loads the workbook from the file, or creates one with the header row if the file is missing. */
    private static Workbook openOrCreateWorkbook(File xlsxFile) throws IOException {
        if (!xlsxFile.exists()) {
            Workbook created = new XSSFWorkbook();
            writeHeader(created.createSheet(SHEET_NAME));
            return created;
        }
        FileInputStream fileInputStream = new FileInputStream(xlsxFile);
        try {
            return new XSSFWorkbook(fileInputStream);
        } finally {
            fileInputStream.close();
        }
    }

    /** Writes the column titles into row 0 of the sheet. */
    private static void writeHeader(Sheet sheet) {
        Row header = sheet.createRow(0);
        for (int column = 0; column < HEADERS.length; column++) {
            header.createCell(column).setCellValue(HEADERS[column]);
        }
    }

    /**
     * Reads one saved page and returns its row values: the searched name (file name
     * without suffix), the text of the {@code h1} elements, every odd-indexed
     * {@code td} of the second {@code tbody}, and the text of the first link of the
     * first {@code tbody}.
     *
     * @throws IOException if the page cannot be read or lacks the expected elements
     */
    private static List<String> extractCompanyInfo(File directory, String fileName)
            throws IOException {
        List<String> messageList = new ArrayList<String>();
        // The searched company name.
        messageList.add(fileName.substring(0, fileName.length() - HTML_SUFFIX.length()));

        Document document = Jsoup.parse(new File(directory, fileName), "UTF-8");
        // The company name actually found.
        messageList.add(document.select("h1").text());

        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            throw new IOException("expected at least 2 tbody elements, found " + tbodys.size());
        }
        // In each row the even cells are labels and the odd cells are values.
        for (Element row : tbodys.get(1).select("tr")) {
            Elements tds = row.select("td");
            for (int j = 1; j < tds.size(); j += 2) {
                messageList.add(tds.get(j).text());
            }
        }

        Elements links = tbodys.get(0).select("a");
        if (links.isEmpty()) {
            throw new IOException("no link found in the first tbody");
        }
        messageList.add(links.get(0).text());
        return messageList;
    }
}