微博帶cookie訪問抓取熱搜
阿新 • • 發佈:2018-11-26
# -*- coding: utf-8 -*-
'''
@author: Yalei Meng    E-mail: [email protected]
@license: (C) Copyright 2017, HUST Corporation Limited.
@desc: Fetch the text of Sina Weibo "hot" posts and save it to a txt file.
@DateTime: Created on 2017/10/3,at 15:48
'''
from bs4 import BeautifulSoup as bs
import requests as rq
import os
import time
import random
import json
import csv

# Cookie header of a logged-in m.weibo.cn session. Either export it as the
# WEIBO_COOKIE environment variable or paste the value here directly.
cookie = os.environ.get('WEIBO_COOKIE', '')

# File that the collected post texts are appended to, one post per line.
OUTPUT_PATH = 'E:/weibo1010.txt'


def request_page(Page):
    """Fetch one API page with the logged-in cookie and return its JSON.

    Args:
        Page: URL of the m.weibo.cn API page to request.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    head = {
        # Media types must not contain spaces around the slash.
        'Accept': 'application/json, text/plain, */*',
        'Cookie': cookie,
        'Referer': 'https://m.weibo.cn/p/index?containerid=102803',
        'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) '
                      'Version/9.0 Mobile/13B143 Safari/601.1',
        'X-Requested-With': 'XMLHttpRequest',
    }
    r = rq.get(Page, headers=head, timeout=5)
    # Surface HTTP-level failures (e.g. being blocked) instead of trying to
    # parse an error page as JSON.
    r.raise_for_status()
    return r.json()


def _extract_texts(js):
    """Return the leading plain-text part of every post in a parsed page.

    Cards that carry no ``mblog``/``text`` entry are reported and skipped.
    """
    # Either level may be missing or null in the response; treat both as
    # "no cards" instead of failing on None.
    data = js.get('data') or {}
    cards = data.get('cards') or []
    print(len(cards))

    texts = []
    for card in cards:
        try:
            # Keep only the text that precedes the first HTML tag.
            texts.append(card['mblog']['text'].split('<')[0])
        except (KeyError, TypeError, AttributeError) as ex:
            print(ex)
    return texts


def get_text_from(page):
    """Download one page and return the list of post texts found on it.

    Args:
        page: URL of the API page to download.

    Returns:
        A list of strings, one per post; empty when the page has no cards.
    """
    return _extract_texts(request_page(page))


# Pages are fetched with a delay between them, so the same post may show up
# more than once; de-duplicate the output if that matters.
url_list = ['https://m.weibo.cn/api/container/getIndex?containerid=102803&since_id={}'.format(i)
            for i in range(0, 1000)]
# Alternative source (keyword search result pages):
# 'https://weibo.cn/search/mblog?hideSearchFrame=&keyword=%E5%8F%B0%E9%A3%8E%E5%B1%B1%E7%AB%B9&page={}'


def main():
    """Crawl every URL in ``url_list`` and append the post texts to a file."""
    # Keep the pause between requests reasonably long: requesting too
    # quickly risks getting the account banned.
    for i, url in enumerate(url_list, 1):
        mylist = get_text_from(url)
        print('當前第%d頁' % i, mylist)
        # Append after every page so that already-fetched posts survive a
        # failure on a later page.
        with open(OUTPUT_PATH, 'a', encoding='utf-8') as f:
            f.writelines(text + '\n' for text in mylist)
        time.sleep(random.uniform(1.2, 3.0))
    print('恭喜,程式執行完畢!')


if __name__ == '__main__':
    main()
本文程式碼引用自 GitHub 上某位程式設計師的程式,待修改完成(改為抓取特定熱點話題)後再整理上傳。