1. 程式人生 > >java爬取天眼查並存入excel中

java爬取天眼查並存入excel中

功能:

自動讀取company.txt檔案中的公司名進行搜尋
把搜尋到含有公司詳細資訊的html儲存在info資料夾
把html檔案中的資訊提取到excel表格中
判斷是否出現機器人驗證
斷點續查(關了再開啟不會重複查詢)


缺點:

無法跳過機器人驗證
程式分為兩個執行檔案,不是一鍵完成
偶爾會卡住(請求沒有響應)

問題記錄
 

登入問題:一開始沒有登入,查詢的時候總是被攔截跳轉到登入頁面。在瀏覽器上登入,複製cookie資訊,在程式碼中設定即可。
機器人驗證:據我觀察,同一個IP呼叫天眼查網站上的介面大約100次就會出現一次機器人驗證。雖然很想自動完成,但是能力有限,實現不了,後來想想採取了一個折中的方法,在程式碼裡面檢測是否出現機器人驗證。當出現機器人驗證的時候,列印驗證的地址,程式暫停。等待人工完成驗證後,輸入OK再繼續往下執行。
程式卡住:不知道是程式碼的問題還是網站的問題。每查詢一百多個公司的時候,總會有個請求等不到響應,一直在等待。雖然做了處理,把程式關了再開啟還是會繼續往後查詢,但是挺糾結。以後再處理。
圖片編碼:試著破解機器人驗證的過程中發現一個挺有意思的地方。天眼查網站的機器人驗證是點選漢字的方式,頁面中有兩張圖片。這兩張圖片有個比較有意思的地方是採用Base64編碼的方式進行傳輸的。也就是把圖片轉為字串的形式,以前不知道還有這種操作。


依賴jar包


httpclient:模擬發起HTTP請求
jsoup:解析HTML
poi-ooxml:Excel表格操作


關鍵程式碼


設定請求頭:非常關鍵,需要把登入後的cookie資訊複製在這裡設定,不然呼叫介面的時候會被攔截跳轉到登入頁面。

/**
     * 設定請求頭
     * @param httpGet
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        //設定預設請求頭 在瀏覽器登陸後,把cookie的內容複製到這裡設定cookie,不然無法查詢
        httpGet.setHeader("Cookie","");
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

下面是全部的程式碼:

package cn.xiaoyanol.crawler;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.StatusLine;
import org.apache.http.client.HttpClient;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.*;
import java.net.URI;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import java.util.Scanner;

/**
 * Looks up the business-registration details of a list of companies on tianyancha.com.
 *
 * <p>Company names are read from {@code company.txt} (one per line). For every company the
 * search page is fetched, the first result's detail page is saved as
 * {@code info/<company>.html}, and the registration table of that page is printed.
 * Companies that already have an HTML file in {@code info} are skipped, so an interrupted
 * run can simply be restarted.
 *
 * <p>The login cookie is read from the {@code TIANYANCHA_COOKIE} environment variable
 * instead of being hard-coded, so that session tokens never end up in source control.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 */
public class TianYanChaCrawler {

    /** Search endpoint; the URL-encoded company name is appended as the {@code key} value. */
    private static final String SEARCH_URL = "https://www.tianyancha.com/search?key=";

    /** Base URI used to resolve relative links found in the search result page. */
    private static final String SITE_BASE_URL = "https://www.tianyancha.com/";

    /** Extension of the saved detail pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Environment variable holding the cookie of a logged-in browser session. */
    private static final String COOKIE_ENV = "TIANYANCHA_COOKIE";

    /** Time allowed for establishing a connection. */
    private static final int CONNECT_TIMEOUT_MS = 5000;

    /** Maximum silence tolerated while waiting for response data. */
    private static final int SOCKET_TIMEOUT_MS = 30000;

    /** Time allowed for obtaining a connection from the client's connection pool. */
    private static final int POOL_TIMEOUT_MS = 5000;

    /** Result of handling one company; decides how the main loop proceeds. */
    private enum Outcome {
        /** Finished normally; print the summary and pause before the next company. */
        COMPLETED,
        /** Nothing usable was found or the request failed; go straight to the next company. */
        SKIPPED,
        /** Robot verification was not completed; stop the whole run. */
        ABORTED
    }

    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException if {@code company.txt} cannot be read or the HTTP client cannot be closed
     */
    public static void main(String[] args) throws IOException {
        // Companies whose information should be fetched.
        List<String> companyNameList = readCompanyNames(new File("company.txt"));

        // Directory holding the saved detail pages.
        File directory = new File("info");
        if (!directory.exists()) {
            directory.mkdir();
        } else {
            // Drop the companies that were already fetched by a previous run.
            removeAlreadyFetched(directory, companyNameList);
        }

        if (companyNameList.isEmpty()) {
            System.out.println("沒有要搜尋的公司,程式即將關閉。。。");
            return;
        }
        System.out.println("程式將要搜尋 "+ companyNameList.size()+" 個公司的資訊。。。");
        System.out.println();

        if (loginCookie().isEmpty()) {
            System.out.println("注意!未設定環境變數 " + COOKIE_ENV + ",查詢可能會被攔截跳轉到登入頁面。。。");
            System.out.println();
        }

        HttpClient httpClient = HttpClientBuilder.create().build();
        HttpClientContext context = HttpClientContext.create();
        // Deliberately never closed: closing it would close System.in and break later prompts.
        Scanner scanner = new Scanner(System.in);
        // Bound every waiting phase; without the socket and pool timeouts a single stalled
        // response (or an exhausted connection pool) blocks the program forever.
        RequestConfig requestConfig = RequestConfig.custom()
                .setConnectTimeout(CONNECT_TIMEOUT_MS)
                .setSocketTimeout(SOCKET_TIMEOUT_MS)
                .setConnectionRequestTimeout(POOL_TIMEOUT_MS)
                .build();
        Random random = new Random();

        try {
            for (int index = 0; index < companyNameList.size(); index++) {
                String companyName = companyNameList.get(index);

                Outcome outcome;
                try {
                    outcome = crawlCompany(httpClient, context, requestConfig, scanner,
                            directory, index, companyName);
                } catch (IOException e) {
                    // Best effort: no HTML file was kept, so the next run retries this company.
                    System.out.println("注意!獲取 " + companyName + " 的資訊時請求失敗,該公司將被跳過。。。");
                    System.out.println("原因:" + e);
                    System.out.println();
                    outcome = Outcome.SKIPPED;
                }

                if (outcome == Outcome.ABORTED) {
                    break;
                }
                if (outcome == Outcome.SKIPPED) {
                    continue;
                }

                System.out.println();
                System.out.println((index+1)+"、提取:"+companyName+" 資訊完成");
                System.out.println();
                if (index == companyNameList.size() - 1) {
                    System.out.println("搜尋完成,程式即將結束。。。");
                } else {
                    // Random 1-2 second pause between companies.
                    int time = random.nextInt(2) + 1;
                    System.out.println("系統暫停:" + time + "秒");
                    System.out.println();
                    pause(time);
                }
            }
        } finally {
            // The builder returns a closeable client; release its connections when done.
            if (httpClient instanceof Closeable) {
                ((Closeable) httpClient).close();
            }
        }
    }

    /**
     * Reads the company names, one per line, stripping surrounding whitespace and inner spaces.
     * Blank lines and repeated names are ignored. The file is read with the platform default
     * charset, as {@link FileReader} does.
     */
    private static List<String> readCompanyNames(File companyFile) throws IOException {
        List<String> names = new ArrayList<String>();
        try (BufferedReader reader = new BufferedReader(new FileReader(companyFile))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String name = line.trim().replace(" ", "");
                if (!name.isEmpty() && !names.contains(name)) {
                    names.add(name);
                }
            }
        }
        return names;
    }

    /** Removes from {@code companyNameList} every company that already has a saved HTML page. */
    private static void removeAlreadyFetched(File directory, List<String> companyNameList) {
        String[] fileNameList = directory.list();
        if (fileNameList == null) {
            // "info" is not a directory or could not be listed: nothing to filter.
            return;
        }
        for (String fileName : fileNameList) {
            if (!fileName.endsWith(HTML_SUFFIX)) {
                continue;
            }
            String fetched = fileName.substring(0, fileName.length() - HTML_SUFFIX.length());
            if (companyNameList.remove(fetched)) {
                System.out.println(fetched+" 已經搜尋過,該公司將被跳過。。。");
                System.out.println();
            }
        }
    }

    /**
     * Searches one company, saves its detail page and prints the registration table.
     *
     * @return how the main loop should continue
     * @throws IOException if a request or writing the page fails
     */
    private static Outcome crawlCompany(HttpClient httpClient, HttpClientContext context,
            RequestConfig requestConfig, Scanner scanner, File directory,
            int index, String companyName) throws IOException {
        System.out.println((index+1)+"、正在獲取 "+ companyName +" 的資訊。。。");
        System.out.println();

        // The keyword must be percent-encoded, otherwise characters such as '&' or '#'
        // change the meaning of the URL.
        HttpGet httpGet = newRequest(SEARCH_URL + URLEncoder.encode(companyName, "UTF-8"), requestConfig);
        HttpResponse response = httpClient.execute(httpGet, context);
        System.out.println("HTTP請求執行完成。。。");

        response = passRobotVerification(httpClient, httpGet, context, scanner, response);
        if (response == null) {
            return Outcome.ABORTED;
        }

        System.out.println((index+1)+"、搜尋 "+companyName+" 資訊完成");
        System.out.println();
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return Outcome.COMPLETED;
        }

        // Extract the URL of the detail page from the search result page. Reading the
        // entity as a string also consumes it, which returns the connection to the pool.
        String html = EntityUtils.toString(entity, "UTF-8");
        String url = Jsoup.parse(html, SITE_BASE_URL).select("a.name").attr("abs:href");

        // No search result: short pause, then go on with the next company.
        if ("".equals(url)) {
            System.out.println("無法在天眼查網站查詢到:"+companyName+" 的資訊");
            System.out.println();
            System.out.println("系統暫停1秒。。。。");
            pause(1);
            return Outcome.SKIPPED;
        }
        System.out.println((index+1)+"、獲取 "+companyName+" 詳情URL成功。。。");
        System.out.println();

        System.out.println("準備獲取詳細資訊。。。");
        httpGet = newRequest(url, requestConfig);
        response = httpClient.execute(httpGet, context);
        System.out.println("HTTP請求執行完成");

        response = passRobotVerification(httpClient, httpGet, context, scanner, response);
        if (response == null) {
            return Outcome.ABORTED;
        }

        entity = response.getEntity();
        if (entity == null) {
            System.out.println("注意!"+companyName+" 無法查詢到工商資訊。。。");
            return Outcome.SKIPPED;
        }

        // Save the detail page; its existence is what marks the company as "already searched".
        File htmlFile = new File(directory, companyName + HTML_SUFFIX);
        saveEntity(entity, htmlFile);

        // Extract the registration information from the saved page.
        Document document = Jsoup.parse(htmlFile, "UTF-8");
        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            System.out.println("注意!"+companyName+" 無法查詢到工商資訊。。。");
            return Outcome.SKIPPED;
        }
        printBusinessInfo(index, companyName, tbodys.get(1));
        return Outcome.COMPLETED;
    }

    /** Builds a GET request carrying the default headers and the shared timeout settings. */
    private static HttpGet newRequest(String url, RequestConfig requestConfig) {
        HttpGet httpGet = new HttpGet(url);
        setHttpHeaders(httpGet);
        httpGet.setConfig(requestConfig);
        return httpGet;
    }

    /**
     * Returns {@code response} unchanged when it is not a robot-verification response.
     * Otherwise prints the verification link, waits until the user types "ok", and repeats
     * the request once.
     *
     * @return a usable response, or {@code null} if the verification was not completed
     */
    private static HttpResponse passRobotVerification(HttpClient httpClient, HttpGet httpGet,
            HttpClientContext context, Scanner scanner, HttpResponse response) throws IOException {
        if (!checkRobotVerification(response, context)) {
            return response;
        }
        // Discard the blocked response so its connection goes back to the pool.
        EntityUtils.consumeQuietly(response.getEntity());

        // Without any redirect the best link we can offer is the requested URL itself.
        List<URI> redirectLocations = context.getRedirectLocations();
        URI verificationUrl = (redirectLocations == null || redirectLocations.isEmpty())
                ? httpGet.getURI()
                : redirectLocations.get(0);
        System.out.println("注意!出現機器人驗證,請點選下面的連結,在驗證完後輸入 ok 繼續執行。。。");
        System.out.println();
        System.out.println(verificationUrl);
        System.out.println();

        boolean confirmed = false;
        while (!confirmed) {
            System.out.print("完成驗證後,請在此處輸入OK:");
            if (!scanner.hasNextLine()) {
                // Standard input was closed; nobody can confirm any more.
                break;
            }
            confirmed = "ok".equalsIgnoreCase(scanner.nextLine().trim());
        }
        System.out.println();

        if (confirmed) {
            // Run the same request again now that the verification should be done.
            HttpResponse retried = httpClient.execute(httpGet, context);
            if (!checkRobotVerification(retried, context)) {
                return retried;
            }
            EntityUtils.consumeQuietly(retried.getEntity());
        }
        System.out.println("沒有完成機器人驗證,程式結束執行。。。。");
        System.out.println();
        return null;
    }

    /**
     * Streams the entity content into {@code target}. If copying fails, the partial file is
     * deleted so that the company is not mistaken for an already fetched one.
     */
    private static void saveEntity(HttpEntity entity, File target) throws IOException {
        boolean complete = false;
        try (InputStream content = entity.getContent();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buff = new byte[2048];
            int length;
            while ((length = content.read(buff, 0, buff.length)) != -1) {
                out.write(buff, 0, length);
            }
            complete = true;
        } finally {
            if (!complete) {
                target.delete();
            }
        }
    }

    /**
     * Prints the rows of the registration table. Cells at even positions are treated as
     * labels (only the text before the first space is shown), cells at odd positions as values.
     */
    private static void printBusinessInfo(int index, String companyName, Element tbody) {
        Elements rows = tbody.select("tr");

        System.out.println((index+1)+"、"+companyName+" 的工商資訊如下:");
        System.out.println();
        for (int i = 0; i < rows.size(); i++) {
            Elements tds = rows.get(i).select("td");
            for (int j = 0; j < tds.size(); j++) {
                if (j % 2 == 0) {
                    System.out.print(tds.get(j).text().split(" ")[0]+" : ");
                } else {
                    System.out.print(tds.get(j).text()+"\t\t\t");
                }
            }
            System.out.println();
            System.out.println();
        }
    }

    /** Sleeps for the given number of seconds, keeping the interrupt flag if interrupted. */
    private static void pause(int seconds) {
        try {
            Thread.sleep(seconds * 1000L);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    /** Returns the login cookie from the environment, or an empty string when it is not set. */
    private static String loginCookie() {
        String cookie = System.getenv(COOKIE_ENV);
        return cookie == null ? "" : cookie.trim();
    }

    /**
     * Checks whether the response indicates a robot verification. Every status other than
     * 200 OK is treated as one.
     *
     * @param response the response to inspect
     * @param context  the request context (currently not inspected)
     * @return {@code true} if the status code is not 200
     */
    public static boolean checkRobotVerification(HttpResponse response , HttpClientContext context) {
        StatusLine statusLine = response.getStatusLine();
        return statusLine.getStatusCode() != HttpStatus.SC_OK;
    }

    /**
     * Applies the default, browser-like request headers. The cookie of a logged-in browser
     * session is taken from the {@code TIANYANCHA_COOKIE} environment variable; without it
     * the site redirects queries to its login page.
     *
     * @param httpGet the request that receives the headers
     */
    public static void setHttpHeaders(HttpGet httpGet) {
        httpGet.setHeader("Cookie", loginCookie());
        httpGet.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        httpGet.setHeader("Accept-Language", "zh-CN,zh;q=0.9");
        httpGet.setHeader("Connection", "keep-alive");
        httpGet.setHeader("Host", "www.tianyancha.com");
        httpGet.setHeader("Referer", "https://www.tianyancha.com/");
        httpGet.setHeader("Upgrade-Insecure-Requests", "1");
        httpGet.setHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36");
    }

    /**
     * Placeholder that currently does nothing.
     *
     * @param response unused
     * @param context  unused
     */
    public static void UITips(HttpResponse response, HttpClientContext context) {

    }
}

下面是生成excel的程式碼:

package cn.xiaoyanol.crawler;


import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Extracts the data of the saved detail pages in the {@code info} directory into the
 * Excel workbook {@code 查詢結果.xlsx}.
 *
 * <p>If the workbook does not exist yet it is created with a header row. Data rows are
 * written starting at row index 1, one row per HTML file, and the workbook is saved once
 * after all files have been processed.
 *
 * @Author: jenrey
 * @Date: 2018-10-16
 */
public class GetExcel {

    /** Extension of the saved detail pages. */
    private static final String HTML_SUFFIX = ".html";

    /** Name of the sheet that receives the data. */
    private static final String SHEET_NAME = "sheet1";

    /** Column titles of the header row, in column order. */
    private static final String[] HEADERS = {
        "搜尋公司名", "實際公司名", "工商註冊號", "組織機構程式碼", "統一信用程式碼",
        "公司型別", "納稅人識別號", "行業", "營業期限", "核准日期",
        "納稅人資質", "人員規模", "實繳資本", "登記機關", "參保人數",
        "英文名稱", "註冊地址", "經營範圍", "法定代表人"
    };

    /**
     * Program entry point.
     *
     * @param args unused
     * @throws IOException            if the workbook cannot be read or written
     * @throws InvalidFormatException kept for compatibility with existing callers
     */
    public static void main(String[] args) throws IOException, InvalidFormatException {

        File directory = new File("info");

        if (!directory.exists()) {
            System.out.println("資料夾不存在,程式結束執行");
            return;
        }

        // Collect the saved pages; list() returns null when "info" is not a directory.
        String[] files = directory.list();
        List<String> fileNames = new ArrayList<String>();
        if (files != null) {
            for (String file : files) {
                if (file.endsWith(HTML_SUFFIX)) {
                    fileNames.add(file);
                }
            }
        }

        File xlsxFile = new File("查詢結果.xlsx");
        Workbook workbook = xlsxFile.exists() ? openWorkbook(xlsxFile) : new XSSFWorkbook();
        try {
            Sheet sheet = workbook.getSheet(SHEET_NAME);
            if (sheet == null) {
                // New workbook, or an existing one without the expected sheet.
                sheet = workbook.createSheet(SHEET_NAME);
                writeHeader(sheet);
            }

            // Row 0 is the header; data rows start at 1.
            int rowNum = 1;
            for (String fileName : fileNames) {
                try {
                    List<String> messageList = extractCompanyInfo(new File(directory, fileName), fileName);
                    Row row1 = sheet.createRow(rowNum++);
                    for (int i = 0; i < messageList.size(); i++) {
                        row1.createCell(i).setCellValue(messageList.get(i));
                    }
                    System.out.println("rowNum:"+(rowNum - 1)+" "+ fileName);
                } catch (IOException | RuntimeException e) {
                    // Best effort: report the page that could not be processed and carry on.
                    System.out.println(fileName+"-------------------");
                    System.out.println("原因:" + e);
                }
            }

            // Save once, after all rows are in place.
            try (FileOutputStream outputStream = new FileOutputStream(xlsxFile)) {
                workbook.write(outputStream);
            }
        } finally {
            workbook.close();
        }
    }

    /** Loads the existing workbook; the input stream is closed as soon as it has been read. */
    private static Workbook openWorkbook(File xlsxFile) throws IOException {
        try (FileInputStream fileInputStream = new FileInputStream(xlsxFile)) {
            return new XSSFWorkbook(fileInputStream);
        }
    }

    /** Writes the column titles into row 0 of the sheet. */
    private static void writeHeader(Sheet sheet) {
        Row row = sheet.createRow(0);
        for (int i = 0; i < HEADERS.length; i++) {
            row.createCell(i).setCellValue(HEADERS[i]);
        }
    }

    /**
     * Reads one saved detail page and returns its values in column order: searched name
     * (the file name without extension), the page's {@code h1} text, every odd-positioned
     * cell of the second {@code tbody}, and the text of the first link in the first
     * {@code tbody}.
     *
     * @throws IOException           if the file cannot be read
     * @throws IllegalStateException if the page lacks the expected table or link
     */
    private static List<String> extractCompanyInfo(File file, String fileName) throws IOException {
        List<String> messageList = new ArrayList<String>();
        // The company name that was searched for.
        messageList.add(fileName.substring(0, fileName.length() - HTML_SUFFIX.length()));

        Document document = Jsoup.parse(file, "UTF-8");

        // The company name actually shown on the page.
        messageList.add(document.select("h1").text());

        Elements tbodys = document.select("tbody");
        if (tbodys.size() < 2) {
            throw new IllegalStateException("頁面缺少工商資訊表格");
        }

        // Cells alternate label/value; only the values (odd positions) are kept.
        Elements rows = tbodys.get(1).select("tr");
        for (int i = 0; i < rows.size(); i++) {
            Elements tds = rows.get(i).select("td");
            for (int j = 1; j < tds.size(); j += 2) {
                messageList.add(tds.get(j).text());
            }
        }

        Elements links = tbodys.get(0).select("a");
        if (links.isEmpty()) {
            throw new IllegalStateException("頁面缺少法定代表人連結");
        }
        messageList.add(links.get(0).text());
        return messageList;
    }
}