利用twitter官網提供的api 及tweepy庫 爬取tweets
阿新 • • 發佈:2019-02-02
利用twitter官網提供的api及tweepy庫爬取tweets
思路:
1.以使用者為中心,爬取使用者的所有推文資料
2.根據使用者id尋找使用者朋友的twitter id,擴充套件待爬使用者表
3.迴圈1,2
幾點說明:
1.爬推特資料需要翻牆,推薦用ss。程式碼翻牆需要http,https代理。如果是socks的話會發現瀏覽器能翻牆,但是程式碼會提示
tweepy.error.TweepError: Failed to send request: HTTPSConnectionPool(host='api.twitter.com', port=443): Max retries exceeded with url: ....
說明https連線失敗。如果需要終端翻牆參考Mac命令列終端下使用shadowsocks翻牆
然後在tweepy.API中加入代理資訊,埠為你設定的代理埠。
api = tweepy.API(auth, proxy="127.0.0.1:1080",)
2.使用官方api需要先申請一個應用程式以獲得授權,申請地址:Twitter應用程式。名字、描述什麼的隨便寫就好,沒有稽核時間,填寫後即可獲得consumer_key,consumer_secret,access_token,access_token_secret,這些在請求資料時需要用到。
3.官方API有速率限制,具體參見[Rate limits-Twitter Development]。授權使用者和授權應用的請求視窗數有差異。我用的user_timeline()和user_friends()(程式碼中對應api.friends_ids())
限制如下:
所以需要協調兩個介面的呼叫頻率。
4.當請求次數超過上限時會丟擲異常,然後退出程式,解決方法是在tweepy.API中將引數wait_on_rate_limit,wait_on_rate_limit_notify設定為True
到達上限時,程式將自動等待,並輸出提示資訊。
api = tweepy.API(auth, proxy="127.0.0.1:1080" , wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
5.api請求返回json格式資料如圖:
6.有些使用者設定不允許取資料時會提示Not authorized.
可以在異常部分處理異常,跳過該使用者即可。tweepy.error資訊也可以在上面的官方文件連結中查到。
程式碼
import tweepy
import time
import csv
import threading
# Twitter application credentials. The values below are placeholders and must
# be replaced with the keys issued for your own Twitter application.
consumer_key = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
consumer_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_token = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
access_token_secret = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"

# Held by get_tweets() and get_friends() for the whole of their run so the two
# never touch the shared ID lists at the same time.
lock = threading.Lock()
def _fetch_timeline(api, user_id):
    """Download a user's timeline page by page and return all tweets found.

    Pages of up to 200 tweets are requested until an empty page comes back.
    An empty timeline yields an empty list instead of raising IndexError.
    """
    tweets = []
    page = api.user_timeline(user_id, count=200)
    while page:
        tweets.extend(page)
        print('%s tweets downloaded' % (len(tweets)))
        # Only ask for tweets with an id strictly below the last one received,
        # so the next page does not repeat it.
        max_id = page[-1].id - 1
        page = api.user_timeline(user_id=user_id, count=200, max_id=max_id)
    return tweets


def _tweet_row(tweet):
    """Flatten one tweet and its author into a list of CSV cell values."""
    author = tweet.user
    return [
        tweet.id, tweet.text, tweet.created_at, tweet.lang, tweet.place,
        tweet.geo, tweet.source, tweet.truncated, tweet.favorite_count,
        tweet.favorited, tweet.in_reply_to_screen_name,
        tweet.in_reply_to_status_id, tweet.in_reply_to_user_id,
        tweet.is_quote_status, tweet.retweet_count, tweet.retweeted,
        author.id, author.name, author.screen_name, author.statuses_count,
        author.time_zone, author.url, author.notifications,
        author.profile_background_image_url, author.profile_image_url,
        author.profile_image_url_https, author.location,
        author.contributors_enabled, author.created_at,
        author.default_profile, author.default_profile_image,
        author.description, author.favourites_count,
        author.follow_request_sent, author.followers_count,
        author.following, author.friends_count, author.geo_enabled,
    ]


def get_tweets():
    """Crawl the timeline of every queued user and save each one as a CSV.

    Works through the module-level ``user_ids`` queue until it is empty.
    Every processed user is moved from ``user_ids`` to ``old_ids`` whether or
    not the download succeeded, so one failing account can never stall the
    loop. Successful downloads are written to ``./data1/<user_id>_tweets.csv``.
    The module-level ``lock`` is held for the whole run.
    """
    import os  # local import: only needed to create the output directory

    with lock:
        # One authenticated client is enough for the whole run.
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth, proxy="127.0.0.1:1080", wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)
        os.makedirs('./data1', exist_ok=True)

        while user_ids:
            user_id = user_ids[0]
            print('crawling user %s data...' % user_id)
            try:
                tweets = _fetch_timeline(api, user_id)
            except tweepy.TweepError as e:
                if e.reason == 'Not authorized.':
                    print('this user not authorized.')
                else:
                    print(e)
                tweets = None

            # Dequeue the user in every case; retrying a failing user in
            # place would spin forever on a persistent error.
            user_ids.remove(user_id)
            old_ids.append(user_id)
            if tweets is None:
                continue

            out_tweets = [_tweet_row(tweet) for tweet in tweets]
            # newline='' stops the csv module emitting blank rows on Windows.
            with open('./data1/%s_tweets.csv' % user_id, 'w',
                      encoding='utf-8', newline='') as file:
                csv.writer(file).writerows(out_tweets)
            print('saved data')
def get_friends():
    """Expand the crawl queue with friends of already-crawled users.

    Takes up to the first ten entries of the module-level ``old_ids`` list,
    requests each one's friend IDs, and appends every ID not seen before
    (not in ``old_ids``, ``user_ids`` or ``oldest``) to ``user_ids``. Each
    processed user is moved from ``old_ids`` to ``oldest`` whether or not the
    request succeeded. Finally the whole ``oldest`` list is written,
    space-separated, to ``crawled and expened user.txt``.

    Requires the module-level lists ``user_ids``, ``old_ids`` and ``oldest``
    to exist; the module-level ``lock`` is held for the whole run.
    """
    with lock:
        print('getting user friends id...')
        auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_token_secret)
        api = tweepy.API(auth, proxy="127.0.0.1:1080", wait_on_rate_limit=True,
                         wait_on_rate_limit_notify=True)

        # One set for all membership tests; it also prevents queuing an ID
        # twice when several users share the same friend.
        known = set(old_ids) | set(user_ids) | set(oldest)
        new_ids = []

        # Slice (a copy) rather than index: old_ids is mutated inside the loop.
        for user in old_ids[:10]:
            try:
                friends = api.friends_ids(user)
            except tweepy.TweepError as e:
                if e.reason == 'Not authorized.':
                    print('this user not authorized.')
                else:
                    print(e)
            else:
                for friend_id in friends:
                    if friend_id not in known:
                        known.add(friend_id)
                        new_ids.append(friend_id)
            # The user counts as expanded regardless of the outcome.
            old_ids.remove(user)
            oldest.append(user)

        user_ids.extend(new_ids)
        print('done!')
        with open('crawled and expened user.txt', 'w', encoding='utf-8') as file:
            file.write(''.join(str(x) + ' ' for x in oldest))
if __name__ == '__main__':
    # Seed users whose timelines are crawled first.
    user_ids = [25073877, 198599889]
    # Users whose friends have already been expanded; get_friends() appends
    # to this list, so it must exist before the first call.
    oldest = []
    # Users already crawled but not yet expanded. split() without an argument
    # ignores trailing whitespace/newlines, and numeric tokens are converted
    # to int so they compare equal to the integer IDs used elsewhere.
    with open('old_ids.txt', 'r', encoding='utf-8') as file:
        old_ids = [int(token) if token.isdigit() else token
                   for token in file.read().split()]

    while user_ids:
        # The two phases run strictly one after the other: crawl every queued
        # user, then expand the queue with their friends.
        t1 = threading.Thread(target=get_tweets)
        t2 = threading.Thread(target=get_friends)
        t1.start()
        t1.join()
        t2.start()
        t2.join()