22.天眼查cookie模擬登陸采集數據
阿新 • • 發佈:2018-11-05
cat from undefined 地址 answer Language ase 圖片 count
通過賬號登錄獲取cookies,模擬登錄(前提有天眼查賬號),會員賬號可查看5000家,普通只是100家,同時也要設置一定的反爬措施以防賬號被封。
用有權限的賬號獲取cookies,再帶著cookies去訪問頁面信息。不過這種做法感覺還是不太合適:之前采集數據時都是設法避開登錄和驗證碼的,因為歸根結底,這些數據只是人家網站讓不讓你拿、以及該怎樣去拿的問題。
這裏只是簡單地做一下測試,實際采集會遇到各種問題的,這裏只是個解題思路僅供參考。
若不設置反爬措施,就會被網站檢測到,如圖:
# coding:utf-8
"""Scrape company names from Tianyancha search results using a logged-in cookie.

For every search keyword, a few result pages are fetched, the company detail
links on each page are followed, and the company name found on each detail
page is printed.
"""
import os
import re
import time

import requests
from lxml import etree

# Request address (base search endpoint; kept for reference).
target_url = 'https://www.tianyancha.com/search?key='

# The logged-in session cookie is read from the environment instead of being
# hard-coded: it carries an auth token and the account's phone number, so it
# must not be committed to source.  Copy the "Cookie" request header of a
# logged-in browser session into TIANYANCHA_COOKIE before running.
cookie = os.environ.get('TIANYANCHA_COOKIE', '')
if not cookie:
    raise SystemExit(
        'Set TIANYANCHA_COOKIE to the Cookie header of a logged-in session.'
    )

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an optional Brotli package is installed, otherwise response.text would
    # be undecoded bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': cookie,
    'Host': 'www.tianyancha.com',
    'Referer': 'https://www.tianyancha.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}

# Search keywords.
keywords = ['佛山']

# Seconds to wait for a response before giving up on a request.
request_timeout = 10
# Pause between detail-page requests so the account is less likely to be
# flagged by the site's anti-crawling checks.
request_delay = 2

# NOTE(review): the original iterated range(6), i.e. started at "p0".
# Result pages are presumably numbered from 1 - confirm against the site.
for page in range(1, 6):
    for keyword in keywords:
        url = 'https://www.tianyancha.com/search/p{}'.format(page)
        # The keyword is sent as a query parameter; requests URL-encodes it.
        response = requests.get(
            url,
            params={'key': keyword},
            headers=headers,
            timeout=request_timeout,
        )
        if not response.ok:
            print('search page failed: {} ({})'.format(response.url, response.status_code))
            continue
        html = etree.HTML(response.text)
        # Links to the company detail pages listed on this search page.
        link_urls = html.xpath("//div[@class='content']/div[@class='header']/a/@href")
        for link_url in link_urls:
            time.sleep(request_delay)
            response = requests.get(link_url, headers=headers, timeout=request_timeout)
            if not response.ok:
                print('detail page failed: {} ({})'.format(link_url, response.status_code))
                continue
            html2 = etree.HTML(response.text)
            # Company name.  lxml's xpath() returns a plain list, so take the
            # first text node (or None when the heading is missing).
            names = html2.xpath("//h1[@class='name']/text()")
            company = names[0].strip() if names else None
            print(company)
            print('*' * 100)
# coding:utf-8
import os
import re
import time

import requests
from lxml import etree
# Scrape company names from Tianyancha search results using a logged-in
# cookie: for every search keyword, fetch a few result pages, follow the
# company detail links on each page and print the company name found there.

# Request address (base search endpoint; kept for reference).
target_url = 'https://www.tianyancha.com/search?key='

# The logged-in session cookie is read from the environment instead of being
# hard-coded: it carries an auth token and the account's phone number, so it
# must not be committed to source.  Copy the "Cookie" request header of a
# logged-in browser session into TIANYANCHA_COOKIE before running.
cookie = os.environ.get('TIANYANCHA_COOKIE', '')
if not cookie:
    raise SystemExit(
        'Set TIANYANCHA_COOKIE to the Cookie header of a logged-in session.'
    )

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # "br" is deliberately not advertised: requests only decodes Brotli when
    # an optional Brotli package is installed, otherwise response.text would
    # be undecoded bytes.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Cookie': cookie,
    'Host': 'www.tianyancha.com',
    'Referer': 'https://www.tianyancha.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
}

# Search keywords.
keywords = ['佛山']

# Seconds to wait for a response before giving up on a request.
request_timeout = 10
# Pause between detail-page requests so the account is less likely to be
# flagged by the site's anti-crawling checks.
request_delay = 2

# NOTE(review): the original iterated range(6), i.e. started at "p0".
# Result pages are presumably numbered from 1 - confirm against the site.
for page in range(1, 6):
    for keyword in keywords:
        url = 'https://www.tianyancha.com/search/p{}'.format(page)
        # The keyword is sent as a query parameter; requests URL-encodes it.
        response = requests.get(
            url,
            params={'key': keyword},
            headers=headers,
            timeout=request_timeout,
        )
        if not response.ok:
            print('search page failed: {} ({})'.format(response.url, response.status_code))
            continue
        html = etree.HTML(response.text)
        # Links to the company detail pages listed on this search page.
        link_urls = html.xpath("//div[@class='content']/div[@class='header']/a/@href")
        for link_url in link_urls:
            time.sleep(request_delay)
            response = requests.get(link_url, headers=headers, timeout=request_timeout)
            if not response.ok:
                print('detail page failed: {} ({})'.format(link_url, response.status_code))
                continue
            html2 = etree.HTML(response.text)
            # Company name.  lxml's xpath() returns a plain list, so take the
            # first text node (or None when the heading is missing).
            names = html2.xpath("//h1[@class='name']/text()")
            company = names[0].strip() if names else None
            print(company)
            print('*' * 100)
22.天眼查cookie模擬登陸采集數據