
Scraping Toutiao with Python

I've recently been working on word segmentation for news articles. To keep the tags attached to each article accurate, I decided to build a tag library. Of the sites that tag news, only Toutiao does it well; NetEase is mediocre and the rest are unusable. So I decided to write a crawler that scrapes article tags from Toutiao.
Part 1: Analyzing the parameters
All of Toutiao's feed data is loaded asynchronously via Ajax. In Chrome, press F12, open the Network tab and filter by XHR, and you will see the feed requests. The following parameters in the request URL change from one request to the next:
① category
② max_behot_time
③ max_behot_time_tmp
④ as
⑤ cp
⑥ _signature
Of these, only three are actually needed to get data: category, max_behot_time and _signature. I verified this myself.
category depends on the channel you request: for the Tech channel it is news_tech, and for the Hot channel it is news_hot.
max_behot_time changes dynamically. It starts at 0; each subsequent request uses the max_behot_time value returned in the JSON of the previous response. For example, if the current response returns max_behot_time 1544445969, the next request is sent with max_behot_time=1544445969.
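Here is a minimal sketch of that chaining; the response below is a trimmed-down, made-up example, and only the field names match what the feed actually returns:

import json

# made-up, trimmed-down feed response; only the field names are real
resp_text = '{"data": [], "next": {"max_behot_time": 1544445969}}'
page = json.loads(resp_text)
next_behot_time = page["next"]["max_behot_time"]
print(next_behot_time)  # 1544445969 -> used as max_behot_time in the next request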
The third parameter is _signature. It is generated by a very convoluted piece of JavaScript on the page: calling TAC.sign(max_behot_time) produces it, where the argument is the max_behot_time value described above. Look closely: the signatures generated for different max_behot_time values are not the same.
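Because the signature comes from the page's own JavaScript, the simplest way to get it is to let a browser compute it. A minimal sketch with headless Chrome (the chromedriver path is a placeholder, change it for your machine):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.set_headless()
browser = webdriver.Chrome(chrome_options=options,
                           executable_path=r'C:\path\to\chromedriver.exe')  # placeholder path
browser.get('https://www.toutiao.com/ch/news_hot/')
# let the page's own JS compute the signature; 0 is the first page's max_behot_time
signature = browser.execute_script('return TAC.sign(0)')
print(signature)
browser.quit()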
That completes the analysis of the three parameters.
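Putting them together, the feed request is the same URL template the script below sends; here it is assembled with illustrative values (the _signature value is just a placeholder):

category = 'news_hot'
max_behot_time = 0
signature = '<result of TAC.sign(0)>'  # placeholder
url = ('https://www.toutiao.com/api/pc/feed/?category={t}&utm_source=toutiao'
       '&widen=1&max_behot_time={time}&_signature={s}').format(
           t=category, time=max_behot_time, s=signature)
print(url)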
Next comes the code. Copy and paste it, change a couple of things (mainly the chromedriver path), and it should work.
pacong.py

#coding:utf-8
from selenium import webdriver
from time import ctime,sleep
import threading
import requests
import time
import json
import sys
import random
import Two
reload(sys)
sys.setdefaultencoding('utf-8')
# browser setup

def run(ajax):
    name = "word-{a}-".format(a=ajax) + time.strftime("%Y-%m-%d") + ".txt"
    print name
    options = webdriver.ChromeOptions()
    # set the browser language to Chinese
    agent=Two.get_agent()
    options.set_headless()
    options.add_argument('lang=zh_CN.UTF-8')
    options.add_argument(
        'user-agent={}'.format(agent))
    # -- I use a headless browser to obtain the _signature value; change the chromedriver path below for your own machine
    brower = webdriver.Chrome(chrome_options=options,
                              executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
    #brower.get('https://www.toutiao.com/ch/news_hot/')

    brower.get('https://www.toutiao.com/ch/{t}/'.format(t=ajax))
    print 'https://www.toutiao.com/ch/{t}/'.format(t=ajax)
    sinature = brower.execute_script('return TAC.sign(0)')
    print(sinature)
    # get the cookies from the browser session
    cookie = brower.get_cookies()
    print cookie
    cookie = [item['name'] + "=" + item['value'] for item in cookie]
    cookiestr = '; '.join(item for item in cookie)
    time1=0
    last=0
    while 1:
        header1 = {
            'Host': 'www.toutiao.com',
            'User-Agent': agent,
            'Referer': 'https://www.toutiao.com/ch/{}/'.format(ajax),
            "Cookie": cookiestr
        }
        #print cookiestr

        url = 'https://www.toutiao.com/api/pc/feed/?category={t}&utm_source=toutiao&widen=1&max_behot_time={time}&_signature={s}'.format(t=ajax,time=time1,s=sinature)
        print(url)
        # rotating through these proxies did not seem to help much
        o_g = ["213.162.218.75:55230",
               "180.180.152.25:51460",
               "79.173.124.194:47832",
               "50.112.160.137:53910",
               "211.159.140.111:8080",
               "95.189.112.214:35508",
               "168.232.207.145:46342",
               "181.129.139.202:32885",
               "78.47.157.159:80",
               "112.25.6.15:80",
               "46.209.135.201:30418",
               "187.122.224.69:60331",
               "188.0.190.75:59378",
               "114.234.76.131:8060",
               "125.209.78.80:32431",
               "183.203.13.135:80",
               "168.232.207.145:46342",
               "190.152.5.46:53281",
               "89.250.149.114:60981",
               "183.232.113.51:80",
               "213.109.5.230:33138",
               "85.158.186.12:41258",
               "142.93.51.134:8080",
               "181.129.181.250:53539"]
        a = 0
        for a in range(0, 1):
            # 17 threads are running; requesting too fast will get you banned
            sleep(30)
            c = random.randint(0, 23)
            proxies_l = {'http': o_g[c],}
            try:
                
                html = requests.get(url, headers=header1, verify=False,proxies=proxies_l)
                print html.cookies
                data = html.content
                print(data)
                # a response body of exactly 51 bytes is the empty "blocked" reply
                if(len(data)==51):
                    print "blocked"
                    sleep(3600)
                try:
                    s1 = json.loads(data)
                    try:
                        time1 = s1["next"]["max_behot_time"]
                    except Exception as e:
                        print e
                    print time1
                    # get a fresh signature for this max_behot_time
                    sinature = brower.execute_script('return TAC.sign({})'.format(time1))
                    print(sinature)
                    f = open(name, 'a')
                    res = ""
                    for i in range(len(s1["data"])):
                        try:
                            # the article's label list is what we want here
                            l = s1["data"][i]["label"]
                        except Exception as e:
                            print e
                            continue
                        for j in range(len(l)):
                            res = l[j] + "\n"
                            f.write(res)
                            #print l[j]
                    f.close()
                    #last=time1
                    break
                except Exception as e:
                    print("parse error")
                    continue
            except Exception as e:
                print('no proxies')
                continue

            #print html.content
threads = []
# Hot news_hot
t1 = threading.Thread(target=run,args=("news_hot",))
threads.append(t1)

# Tech https://www.toutiao.com/ch/news_tech/
t2 = threading.Thread(target=run,args=("news_tech",))
threads.append(t2)
# Entertainment https://www.toutiao.com/ch/news_entertainment/
t3 = threading.Thread(target=run,args=("news_entertainment",))
threads.append(t3)
# Games https://www.toutiao.com/ch/news_game/ (not much use)
t4 = threading.Thread(target=run,args=("news_game",))
threads.append(t4)
# Sports https://www.toutiao.com/ch/news_sports/
t5 = threading.Thread(target=run,args=("news_sports",))
threads.append(t5)
# Cars https://www.toutiao.com/ch/news_car/
t6= threading.Thread(target=run,args=("news_car",))
threads.append(t6)
# Finance https://www.toutiao.com/ch/news_finance/
t7= threading.Thread(target=run,args=("news_finance",))
threads.append(t7)
# Military https://www.toutiao.com/ch/news_military/
t8= threading.Thread(target=run,args=("news_military",))
threads.append(t8)
# Fashion https://www.toutiao.com/ch/news_fashion/
t9= threading.Thread(target=run,args=("news_fashion",))
threads.append(t9)
# World https://www.toutiao.com/ch/news_world/
t10= threading.Thread(target=run,args=("news_world",))
threads.append(t10)
# Discovery https://www.toutiao.com/ch/news_discovery/
t11= threading.Thread(target=run,args=("news_discovery",))
threads.append(t11)
# Health https://www.toutiao.com/ch/news_regimen/
t12= threading.Thread(target=run,args=("news_regimen",))
threads.append(t12)
# History https://www.toutiao.com/ch/news_history/
t13= threading.Thread(target=run,args=("news_history",))
threads.append(t13)
# Food https://www.toutiao.com/ch/news_food/
t14= threading.Thread(target=run,args=("news_food",))
threads.append(t14)
# Travel https://www.toutiao.com/ch/news_travel/
t15= threading.Thread(target=run,args=("news_travel",))
threads.append(t15)
# Parenting https://www.toutiao.com/ch/news_baby/
t16= threading.Thread(target=run,args=("news_baby",))
threads.append(t16)
# Essays https://www.toutiao.com/ch/news_essay/
t17= threading.Thread(target=run,args=("news_essay",))
threads.append(t17)

if __name__ == '__main__':
    for t in threads:
        t.setDaemon(True)
        t.start()
    time.sleep(1080000)
    print "all over %s" %ctime()

Two.py

#coding:utf-8
import requests
import random
import json
import re
def get_agent():
    ua_list = ["Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
    ]
    user_agent = random.choice(ua_list)
    print user_agent
    return user_agent

The module above returns a random User-Agent so the crawler is less likely to get banned.
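A quick usage check (assuming Two.py sits next to the crawler script):

import Two
print(Two.get_agent())  # one randomly chosen User-Agent string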
The crawler writes its results into the word-<channel>-<date>.txt files, one label per line.
Hope this helps anyone who needs it.