天眼查pc端公司資訊抓取
阿新 • • 發佈:2018-11-19
本篇查詢的是人工智慧前5頁相關公司的資訊:
#主要是異常處理和反爬處理
1. 異常處理:有些公司的資料並未公開,頁面上取不到對應的欄位,所以抓取前需要先判斷,不然程式會出錯。
2. 反爬處理:頁面瀏覽多了之後,網站會要求登入,這裡透過在請求標頭中帶上登入後的 cookie 來處理。
import logging
import os

import requests
from lxml import etree

logger = logging.getLogger(__name__)

# Accumulates one ["公司名:<name>", "法人:<person>"] record per company scraped.
gs = []

# NOTE(security): this is a captured, logged-in browser session cookie (it
# embeds an auth token and a phone number). It is kept only as a fallback so
# the script behaves as before; prefer supplying a fresh value through the
# TYC_COOKIE environment variable instead of committing credentials.
_DEFAULT_COOKIE = (
    "TYCID=10575bf0d29911e8a3bed1727775aa44; "
    "undefined=10575bf0d29911e8a3bed1727775aa44; "
    "ssuid=8797202500; "
    "_ga=GA1.2.2092555821.1539841878; "
    "aliyungf_tc=AQAAALe70WELCAUAe//wcukbb8+nCOzQ; "
    "csrfToken=vhjj7ig76QAPNz5tbfy2Wa9T; "
    "Hm_lvt_e92c8d65d92d534b0fc290df538b4758=1539920329,1540453718,1542189999; "
    "_gid=GA1.2.34407757.1542189999; "
    "RTYCID=8dd86f4c818942549de776bedef42b6a; "
    "CT_TYCID=440a4500eae1484a9f9b2ef0addbc6f9; "
    "cloud_token=db9b2e0efc154b7195d1c5cfe42e855e; "
    "token=ec9c932a784c4fd68b333fa257277f11; "
    "_utm=23855cc22e2744ea8b84d227befcf23c; "
    "tyc-user-info=%257B%2522myQuestionCount%2522%253A%25220%2522%252C%2522integrity%2522%253A%25220%2525%2522%252C%2522state%2522%253A%25220%2522%252C%2522vipManager%2522%253A%25220%2522%252C%2522onum%2522%253A%25220%2522%252C%2522monitorUnreadCount%2522%253A%252227%2522%252C%2522discussCommendCount%2522%253A%25221%2522%252C%2522token%2522%253A%2522eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYwMDIyMDg0MCIsImlhdCI6MTU0MjE5MjU3MSwiZXhwIjoxNTU3NzQ0NTcxfQ.deqDqs_4y39XlZpmkduY-lFRPJZ-LeKyTsCTRNccPNbmyAW6DzYaCcp-XKpCOOmbMj5O41j8oFZvS-lHP7Ca5A%2522%252C%2522redPoint%2522%253A%25220%2522%252C%2522pleaseAnswerCount%2522%253A%25221%2522%252C%2522vnum%2522%253A%25220%2522%252C%2522bizCardUnread%2522%253A%25220%2522%252C%2522mobile%2522%253A%252217600220840%2522%257D; "
    "auth_token=eyJhbGciOiJIUzUxMiJ9.eyJzdWIiOiIxNzYwMDIyMDg0MCIsImlhdCI6MTU0MjE5MjU3MSwiZXhwIjoxNTU3NzQ0NTcxfQ.deqDqs_4y39XlZpmkduY-lFRPJZ-LeKyTsCTRNccPNbmyAW6DzYaCcp-XKpCOOmbMj5O41j8oFZvS-lHP7Ca5A; "
    "_gat_gtag_UA_123487620_1=1; "
    "Hm_lpvt_e92c8d65d92d534b0fc290df538b4758=1542193086"
)

# Browser-like request headers sent with every request; the Cookie entry is
# what lets the crawl continue once the site starts demanding a login.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    "Cookie": os.environ.get("TYC_COOKIE", _DEFAULT_COOKIE),
    "Host": "www.tianyancha.com",
    "Upgrade-Insecure-Requests": "1",
    # The original literal was broken across two physical lines; rejoined here.
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36",
}

# Search-result URL template; the key is the URL-encoded query "人工智能".
_SEARCH_URL = "https://www.tianyancha.com/search/p{}?key=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD"

# Listing text that marks a company whose details are not public. The script
# only checked the Traditional form; the Simplified form is accepted as well
# because the site is requested with Accept-Language zh-CN.
# NOTE(review): confirm the exact marker text against a live listing page.
_UNDISCLOSED_MARKERS = ("未公開", "未公开")

# Failures that mean "this one page could not be fetched or parsed": they are
# logged and skipped so a single bad page does not end the whole crawl.
_SKIPPABLE_ERRORS = (
    IndexError,
    ValueError,
    requests.RequestException,
    etree.LxmlError,
)


def down_load(url, timeout=10):
    """Fetch *url* with the shared ``headers`` and return the body as text.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Returns:
        The response body, decoded as UTF-8.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.encoding = "utf-8"
    return response.text


def _parse_html(markup):
    """Parse *markup* into an lxml tree, raising ValueError if nothing parses."""
    tree = etree.HTML(markup)
    if tree is None:
        raise ValueError("empty or unparsable HTML document")
    return tree


def _extract_company(detail_html):
    """Return ``(company_name, legal_representative)`` from a detail page.

    Raises:
        IndexError: If either field is missing from the page.
        ValueError: If the page cannot be parsed at all.
    """
    tree = _parse_html(detail_html)
    company = tree.xpath('//div[@class="box"]//div[@class="header"]/h1/text()')[0]
    boss = tree.xpath('//table[@class="table"]//div[@class="name"]/a/text()')[0]
    return company, boss


def main(pages=range(1, 6), output_path="gs1.txt"):
    """Crawl the search result pages and record each company's name and boss.

    For every listed company whose info is not marked undisclosed, the detail
    page is fetched, the record is printed, appended to *output_path* and to
    the module-level ``gs`` list. Pages or companies that cannot be fetched or
    parsed are logged and skipped.

    Args:
        pages: Iterable of 1-based search result page numbers to visit.
        output_path: Text file that receives one line per company (appended).
    """
    for page in pages:
        first_url = _SEARCH_URL.format(page)
        try:
            listing = _parse_html(down_load(first_url))
        except _SKIPPABLE_ERRORS as exc:
            logger.warning("skipping listing page %s: %r", first_url, exc)
            continue

        detail_urls = listing.xpath(
            '//div[@class="search-item"]//div[@class="header"]/a/@href'
        )
        # Used to detect companies whose information is not public.
        statuses = listing.xpath(
            '//div[@class="search-item"]//div[@class="info"]/div[1]/text()'
        )
        print(detail_urls)

        # The two lists are paired by position, exactly as the original
        # index-based loop did; links without a matching status are skipped.
        for link, status in zip(detail_urls, statuses):
            if status in _UNDISCLOSED_MARKERS:
                continue
            try:
                company, boss = _extract_company(down_load(link))
            except _SKIPPABLE_ERRORS as exc:
                logger.warning("skipping company page %s: %r", link, exc)
                continue

            record = ["公司名:" + company, "法人:" + boss]
            print(record)
            # Append per record so earlier results survive an interrupted run.
            with open(output_path, "a", encoding="utf-8") as out:
                out.write("公司名:" + company + " " + "法人:" + boss + "\n")
            gs.append(record)

    print(gs)


if __name__ == "__main__":
    main()