1. 程式人生 > >天眼查pc端公司資訊抓取

天眼查pc端公司資訊抓取

本篇抓取的是以「人工智慧」為關鍵字搜尋時,前5頁結果中各公司的資訊:

#主要是異常處理和反爬處理
1.異常處理:有些公司的資料沒有公開,頁面上抓不到對應的欄位,所以需要先判斷,不然程式會出錯
2.反爬處理:頁面瀏覽多了,網站會要求登入,這邊在請求標頭中帶上登入後的cookie來處理

在這裡插入圖片描述

import requests
from lxml import etree



# Accumulates one ["公司名:<name>", "法人:<legal representative>"] pair per
# company that was scraped successfully; printed once the crawl finishes.
gs=[]
# Browser-like request headers sent with every request made by down_load().
# NOTE(review): the Cookie value embeds a logged-in session (auth token and a
# mobile number). It is used to get past the site's login wall, but credentials
# hard-coded in source should be moved to configuration or an environment
# variable, and this one should be treated as leaked.
headers={
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": "TYCID=10575bf0d29911e8a3bed1727775aa44; undefined=10575bf0d29911e8a3bed1727775aa44; ssuid=8797202500; _ga=GA1.2.2092555821.1539841878; aliyungf_tc=AQAAALe70WELCAUAe//wcukbb8+nCOzQ; csrfToken=vhjj7ig76QAPNz5tbfy2Wa9T; Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1539920329,1540453718,1542189999; _gid=GA1.2.34407757.1542189999; RTYCID=8dd86f4c818942549de776bedef42b6a; CT_TYCID=440a4500eae1484a9f9b2ef0addbc6f9; cloud_token=db9b2e0efc154b7195d1c5cfe42e855e; token=ec9c932a784c4fd68b333fa257277f11; _utm=23855cc22e2744ea8b84d227befcf23c; tyc-user-info=%257B%2522myQuestionCount%2522%253A%25220%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252227%2522%252C%2522discussCommendCount%2522%253A%25221%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYwMDIyMDg0MCIsImlhdCI6MTU0MjE5MjU3MSwiZXhwIjoxNTU3NzQ0NTcxfQ.deqDqs_4y39XlZpmkduY-lFRPJZ-LeKyTsCTRNccPNbmyAW6DzYaCcp-XKpCOOmbMj5O41j8oFZvS-lHP7Ca5A%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522pleaseAnswerCount%2522%253A%25221%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522mobile%2522%253A%252217600220840%2522%257D; auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYwMDIyMDg0MCIsImlhdCI6MTU0MjE5MjU3MSwiZXhwIjoxNTU3NzQ0NTcxfQ.deqDqs_4y39XlZpmkduY-lFRPJZ-LeKyTsCTRNccPNbmyAW6DzYaCcp-XKpCOOmbMj5O41j8oFZvS-lHP7Ca5A; _gat_gtag_UA_123487620_1=1; Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1542193086",
    "Host": "www.tianyancha.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",

}
def down_load(url, timeout=10):
    """Fetch *url* with the module-level ``headers`` and return its body as text.

    Args:
        url: URL of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection, which would hang the whole crawl.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout elapses.
    """
    cc = requests.get(url=url, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on the charset requests infers from the
    # response headers.
    cc.encoding = "utf-8"
    return cc.text

# Crawl search-result pages 1-5 for the keyword "人工智能" (URL-encoded in the
# query string) and record each company's name and legal representative.
for i in range(1, 6):
    first_url = "https://www.tianyancha.com/search/p{}?key=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD".format(i)
    a = etree.HTML(down_load(first_url))
    # Link to each company's detail page, one per search result.
    detail_url = a.xpath('//div[@class="search-item"]//div[@class="header"]/a/@href')
    # First info field of each search result; used to detect companies whose
    # information is not public so their detail page is not fetched.
    kk = a.xpath('//div[@class="search-item"]//div[@class="info"]/div[1]/text()')
    print(detail_url)

    # zip() pairs each link with its marker and stops at the shorter list, so
    # a result without a marker is skipped rather than raising IndexError.
    for link, status in zip(detail_url, kk):
        # NOTE(review): the query key above is Simplified Chinese, so the site
        # presumably renders the marker as "未公开"; both spellings are
        # accepted here -- confirm against the live page.
        if status in ("未公開", "未公开"):
            continue

        try:
            bb = etree.HTML(down_load(link))
        except (requests.RequestException, etree.LxmlError, ValueError) as exc:
            # Best effort: one unreachable or unparsable detail page must not
            # abort the crawl, but the failure is reported instead of hidden.
            print("skip {}: {}".format(link, exc))
            continue
        if bb is None:
            continue

        names = bb.xpath('//div[@class="box"]//div[@class="header"]/h1/text()')
        bosses = bb.xpath('//table[@class="table"]//div[@class="name"]/a/text()')
        # Pages without a name or legal representative are skipped explicitly
        # rather than by catching an IndexError.
        if not names or not bosses:
            continue
        company, boss = names[0], bosses[0]

        new = ["公司名:" + company, "法人:" + boss]
        print(new)
        # Explicit UTF-8 so writing Chinese text does not depend on the
        # platform's default locale encoding.
        with open("gs1.txt", "a", encoding="utf-8") as f:
            f.write("公司名:" + company + "   " + "法人:" + boss + "\n")
        gs.append(new)

print(gs)