
Yang Chaoyue Weibo Crawler (Weibo Text + Images); Follower Info to Be Continued

The complete script: it pulls Yang Chaoyue's profile, post text, and post images through the m.weibo.cn mobile API, rotating requests through a small proxy pool.

# -*- coding: utf-8 -*-
import urllib.request
import json
import time
import random
import re
import os

import requests

user_id = '5644764907'  # Weibo uid to crawl. Yang Chaoyue's Weibo: https://m.weibo.cn/u/5644764907

# Proxy IP pool (public proxies like these go stale quickly; replace with live ones before running)
proxy = [
    {'http': '106.14.47.5:80'}, {'http': '61.135.217.7:80'},
    {'http': '58.53.128.83:3128'}, {'http': '58.118.228.7:1080'},
    {'http': '221.212.117.10:808'}, {'http': '115.159.116.98:8118'},
    {'http': '121.33.220.158:808'}, {'http': '124.243.226.18:8888'},
    {'http': '124.235.135.87:80'}, {'http': '14.118.135.10:808'},
    {'http': '119.176.51.135:53281'}, {'http': '114.94.10.232:43376'},
    {'http': '218.79.86.236:54166'}, {'http': '221.224.136.211:35101'},
    {'http': '58.56.149.198:53281'},
]


# Fetch a URL through the given proxy and return the decoded response body
def use_proxy(url, proxy_addr):
    req = urllib.request.Request(url)
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy_handler = urllib.request.ProxyHandler(proxy_addr)
    opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
    return data


# Fetch the user's basic profile: screen name, homepage, avatar,
# following/follower counts, gender, level, etc.
def get_userInfo(user_id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + user_id  # profile API
    proxy_addr = random.choice(proxy)
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    user_info = content.get('userInfo')
    profile_image_url = user_info.get('profile_image_url')
    description = user_info.get('description')
    profile_url = user_info.get('profile_url')
    verified = user_info.get('verified')
    follow_count = user_info.get('follow_count')
    name = user_info.get('screen_name')
    followers_count = user_info.get('followers_count')
    gender = user_info.get('gender')
    urank = user_info.get('urank')
    print("Screen name: " + name + "\n"
          + "Homepage: " + profile_url + "\n"
          + "Avatar URL: " + profile_image_url + "\n"
          + "Verified: " + str(verified) + "\n"
          + "Description: " + description + "\n"
          + "Following: " + str(follow_count) + "\n"
          + "Followers: " + str(followers_count) + "\n"
          + "Gender: " + gender + "\n"
          + "Weibo level: " + str(urank) + "\n")


# Download every image attached to one post; returns the next unused image index
# (the original incremented m only once per post, so later images overwrote earlier ones)
def save_pics(pics_info, m):
    print("pic_save start")
    for pic_info in pics_info:
        pic_url = pic_info['large']['url']  # full-resolution image
        # pic_url = pic_info['url']         # low-resolution thumbnail
        pic_path = os.path.join(pics_dir, '%d.jpg' % m)
        try:
            # stream the image to disk
            with open(pic_path, 'wb') as f:
                for chunk in requests.get(pic_url, stream=True).iter_content(1024):
                    f.write(chunk)
        except Exception:
            print(pic_path + ' save failed')
        else:
            print(pic_path + ' saved')
        m += 1
    return m


# Get the containerid of the user's "weibo" tab; required when paging through posts
def get_containerid(url, proxy_addr):
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    for tab in content.get('tabsInfo').get('tabs'):
        if tab.get('tab_type') == 'weibo':
            return tab.get('containerid')
    return None


# Crawl the posts and save them to text files: post text, detail-page URL,
# like count, comment count, repost count, etc.
def get_weibo(user_id, file, file_content):
    i = 1   # page counter
    m = 0   # image counter
    while True:
        proxy_addr = random.choice(proxy)
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + user_id
        weibo_url = (url + '&containerid=' + get_containerid(url, proxy_addr)
                     + '&page=' + str(i))
        print(url)
        print(weibo_url)
        try:
            data = use_proxy(weibo_url, proxy_addr)
            content = json.loads(data).get('data')
            cards = content.get('cards')
            if cards:
                for j in range(len(cards)):
                    print("page " + str(i) + ", post " + str(j))
                    card_type = cards[j].get('card_type')
                    if card_type == 9:  # card_type 9 is an ordinary post
                        mblog = cards[j].get('mblog')
                        attitudes_count = mblog.get('attitudes_count')
                        comments_count = mblog.get('comments_count')
                        created_at = mblog.get('created_at')
                        reposts_count = mblog.get('reposts_count')
                        scheme = cards[j].get('scheme')
                        # post text arrives with embedded HTML tags; strip them
                        try:
                            text = re.sub(r"<.*?>", "", mblog.get('text'))
                        except Exception:
                            return None
                        with open(file_content, 'a+', encoding='utf-8') as f1:
                            f1.write(str(text) + "\n")
                        # download attached images, if any
                        pics_info = mblog.get('pics')
                        if pics_info:
                            print("have pics")
                            m = save_pics(pics_info, m)
                        with open(file, 'a+', encoding='utf-8') as fh:
                            fh.write("page " + str(i) + ", post " + str(j) + "\n")
                            fh.write("URL: " + str(scheme) + "\n"
                                     + "Posted at: " + str(created_at) + "\n"
                                     + "Text: " + text + "\n"
                                     + "Likes: " + str(attitudes_count) + "\n"
                                     + "Comments: " + str(comments_count) + "\n"
                                     + "Reposts: " + str(reposts_count) + "\n")
                i += 1
                time.sleep(random.randint(1, 3))  # throttle requests between pages
            else:
                break  # no more pages
        except Exception as e:
            print(e)  # on failure, retry the same page with a different proxy


if __name__ == "__main__":
    print('start---')
    pics_dir = r"D:\software_study\my_jupyter_notebook\scrawl\pics_origin"
    os.makedirs(pics_dir, exist_ok=True)  # save_pics reads pics_dir as a module-level name
    file_all = "ycy_all.txt"
    file_content = "ycy_content.txt"
    get_userInfo(user_id)
    get_weibo(user_id, file_all, file_content)
    print('finished---')

Results:

Weibo text:

Weibo images:

GO! Charge ahead!!! Chaoyue (surpass) everything!