Python 爬蟲爬取今日頭條 APP 資料(無需破解 as、cp、_cp_signature 引數)
阿新 • • 發佈:2018-12-01
# -*- coding: utf-8 -*-
# Scrape one Toutiao channel through the mobile-app feed endpoint and dump the
# collected articles to a CSV file.
import json
import math
import random
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Requests below are sent with verify=False, so silence the SSL warning.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# App feed endpoint queried for every page.
_FEED_URL = 'http://is.snssdk.com/api/news/feed/v51/'

# Headers sent with every request (app mode).
_APP_HEADERS = {
    'Accept': 'image/webp,image/*;q=0.8',
    'User-Agent': 'News/6.9.8.36 CFNetwork/975.0.3 Darwin/18.2.0',
    'Accept-Language': 'zh-cn',
}

# CSV header, in the same order as the values produced by _build_row().
_CSV_COLUMNS = (
    'abstract 簡報', 'title 標題', 'keywords 關鍵詞', 'read_count 閱讀量',
    'share_count 分享數量', 'ban_comment 可評論', 'publish_time 推送時間',
    'share_url url 連結', 'user_info_name 使用者名稱', 'user_id 使用者 id',
    'description 使用者描述', 'user_verified 官方賬號', 'time 抓取時間',
    'category 頻道',
)

_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

# Seconds to wait for the server before giving up on a request.
_REQUEST_TIMEOUT = 30


def _lookup(container, *keys):
    """Return container[k1][k2]... or '' when any step of the path is missing."""
    value = container
    try:
        for key in keys:
            value = value[key]
    except (KeyError, IndexError, TypeError):
        return ''
    return value


def _format_publish_time(content):
    """Return content['publish_time'] as a local 'Y-m-d H:M:S' string, or ''."""
    try:
        return time.strftime(_TIME_FORMAT, time.localtime(content['publish_time']))
    except (KeyError, IndexError, TypeError, ValueError, OverflowError, OSError):
        return ''


def _build_row(content, channel):
    """Turn one decoded article into a list of values ordered like _CSV_COLUMNS."""
    return [
        _lookup(content, 'abstract'),                      # abstract
        _lookup(content, 'title'),                         # title
        _lookup(content, 'keywords'),                      # keywords
        _lookup(content, 'read_count'),                    # read count
        _lookup(content, 'share_count'),                   # share count
        _lookup(content, 'ban_comment'),                   # 0 = comments allowed, 1 = not allowed
        _format_publish_time(content),                     # publish time
        _lookup(content, 'share_url'),                     # share URL
        _lookup(content, 'user_info', 'name'),             # author name
        _lookup(content, 'user_info', 'user_id'),          # author id
        _lookup(content, 'user_info', 'description'),      # author description
        _lookup(content, 'user_info', 'user_verified'),    # whether the account is official
        time.strftime(_TIME_FORMAT, time.localtime()),     # time of scraping
        channel,                                           # channel
    ]


def _build_params(channel, previous_request, now):
    """Build the query string for one feed request."""
    return {
        'category': channel,                               # channel name
        'refer': '1',                                      # meaning unknown; fixed value 1
        'count': '20',                                     # number of items to return, default 20
        'min_behot_time': previous_request,                # timestamp of the previous request
        'last_refresh_sub_entrance_interval': now - 10,    # timestamp of this request
        'loc_time': int(now / 1000) * 1000,                # local time, rounded down to a multiple of 1000 s
        'latitude': '',                                    # latitude (left empty)
        'longitude': '',                                   # longitude (left empty)
        'city': '',                                        # current city (left empty)
        'iid': '1234876543',                               # some unique id, 10 characters long
        'device_id': '42433242851',                        # device id, 11 characters long
        'abflag': '3',
        'ssmix': 'a',
        'language': 'zh',
        'openudid': '1b8d5bf69dc4a561',                    # some unique id, 16 characters long
    }


def ttapi(url, pages=10, delay=3, output_path='tt.csv'):
    """Scrape a Toutiao channel via the app feed API and write the result to CSV.

    Args:
        url: Channel page URL containing a ``ch/<channel>/`` segment, e.g.
            ``https://www.toutiao.com/ch/news_tech/``. Only the channel name
            is taken from it; the requests go to the app feed endpoint.
        pages: Number of feed pages to request.
        delay: Seconds to sleep before each request.
        output_path: Destination CSV file (written with GB18030 encoding).

    Raises:
        ValueError: If ``url`` contains no ``ch/<channel>/`` segment.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    match = re.search('ch/(.*?)/', url)
    if match is None:
        raise ValueError("cannot find a 'ch/<channel>/' segment in %r" % (url,))
    channel = match.group(1)

    rows = []
    previous_request = int(time.time()) - 500

    # The context manager closes the session even if a request or parse fails.
    with requests.Session() as session:
        session.headers.update(_APP_HEADERS)
        for _ in range(pages):
            time.sleep(delay)
            now = int(time.time())
            app = session.get(
                url=_FEED_URL,
                params=_build_params(channel, previous_request, now),
                verify=False,
                timeout=_REQUEST_TIMEOUT,
            ).json()
            print(app)
            previous_request = now - 10

            if not isinstance(app, dict):
                continue
            entries = app.get('data') or []
            declared = app.get('total_number')
            # Never read past the entries actually returned, even when the
            # declared total is larger than the payload.
            if isinstance(declared, int):
                limit = max(declared, 0)
            else:
                limit = len(entries)

            for entry in entries[:limit]:
                try:
                    content = json.loads(entry['content'])
                except (KeyError, TypeError, ValueError):
                    # Skip a malformed entry rather than aborting the whole run.
                    continue
                rows.append(_build_row(content, channel))

    # Build the frame once; dtype=object keeps values exactly as scraped.
    frame = pd.DataFrame(rows, columns=list(_CSV_COLUMNS), dtype=object)
    frame.to_csv(output_path, index=False, encoding="GB18030")


if __name__ == '__main__':
    ttapi('https://www.toutiao.com/ch/news_tech/')
網頁版 as、cp、_cp_signature 引數破解:
https://blog.csdn.net/weixin_39416561/article/details/82111455