Yang Chaoyue Weibo Crawler (Weibo Text + Images) — Follower Info to Be Continued
阿新 · Published: 2018-12-14
# -*- coding: utf-8 -*-
import urllib.request
import json
import time
import random
import re
import os
import requests

id = '5644764907'  # Weibo uid to crawl. Yang Chaoyue's Weibo: https://m.weibo.cn/u/5644764907

# Pool of proxy IPs (public proxies; many may no longer work)
proxy = [
    {'http': '106.14.47.5:80'},
    {'http': '61.135.217.7:80'},
    {'http': '58.53.128.83:3128'},
    {'http': '58.118.228.7:1080'},
    {'http': '221.212.117.10:808'},
    {'http': '115.159.116.98:8118'},
    {'http': '121.33.220.158:808'},
    {'http': '124.243.226.18:8888'},
    {'http': '124.235.135.87:80'},
    {'http': '14.118.135.10:808'},
    {'http': '119.176.51.135:53281'},
    {'http': '114.94.10.232:43376'},
    {'http': '218.79.86.236:54166'},
    {'http': '221.224.136.211:35101'},
    {'http': '58.56.149.198:53281'}]

# Open a page through the given proxy and return the decoded body
def use_proxy(url, proxy_addr):
    req = urllib.request.Request(url)
    req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy_handler = urllib.request.ProxyHandler(proxy_addr)
    opener = urllib.request.build_opener(proxy_handler, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    data = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
    return data

# Fetch the user's basic profile: nickname, profile URL, avatar,
# follow count, follower count, gender, level, etc.
def get_userInfo(id):
    url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id  # profile API
    proxy_addr = random.choice(proxy)
    data = use_proxy(url, proxy_addr)
    userInfo = json.loads(data).get('data').get('userInfo')
    profile_image_url = userInfo.get('profile_image_url')
    description = userInfo.get('description')
    profile_url = userInfo.get('profile_url')
    verified = userInfo.get('verified')
    guanzhu = userInfo.get('follow_count')    # accounts followed
    name = userInfo.get('screen_name')
    fensi = userInfo.get('followers_count')   # followers
    gender = userInfo.get('gender')
    urank = userInfo.get('urank')             # Weibo user level
    print("Nickname: " + name + "\n"
          + "Profile URL: " + profile_url + "\n"
          + "Avatar URL: " + profile_image_url + "\n"
          + "Verified: " + str(verified) + "\n"
          + "Description: " + str(description) + "\n"
          + "Following: " + str(guanzhu) + "\n"
          + "Followers: " + str(fensi) + "\n"
          + "Gender: " + gender + "\n"
          + "Level: " + str(urank) + "\n")

# Download every picture of one post; returns the next free picture index
# so consecutive posts do not overwrite each other's files
def save_pics(pics_info, m):
    print("pic_save start")
    for pic_info in pics_info:
        pic_url = pic_info['large']['url']  # full-size image
        # pic_url = pic_info['url']         # thumbnail instead
        pic_path = os.path.join(pics_dir, '%d.jpg' % m)
        try:
            with open(pic_path, 'wb') as f:
                for chunk in requests.get(pic_url, stream=True).iter_content(1024):
                    f.write(chunk)
        except Exception:
            print(pic_path + ' save failed')
        else:
            print(pic_path + ' saved')
        m += 1
    return m

# Get the containerid of the user's Weibo tab; needed when crawling posts
def get_containerid(url, proxy_addr):
    data = use_proxy(url, proxy_addr)
    content = json.loads(data).get('data')
    for tab in content.get('tabsInfo').get('tabs'):
        if tab.get('tab_type') == 'weibo':
            return tab.get('containerid')
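# --- Optional proxy health check (editor's sketch, not part of the original script) ---
# The public proxies above are likely stale by now. This hypothetical helper keeps
# only the ones that still answer within a timeout, probing a plain-HTTP echo
# service (http://httpbin.org/ip is assumed here) since the dicts only configure
# an 'http' proxy.
def check_proxies(proxies, timeout=5):
    alive = []
    for p in proxies:
        try:
            # requests wants a scheme on the proxy URL; the list stores bare host:port
            r = requests.get('http://httpbin.org/ip',
                             proxies={'http': 'http://' + p['http']},
                             timeout=timeout)
            if r.status_code == 200:
                alive.append(p)
        except requests.RequestException:
            pass
    return alive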
# Crawl the user's posts and save them to text files: post text, detail-page
# URL, like / comment / repost counts, etc.
def get_weibo(id, file, file_content):
    i = 1  # page counter
    m = 0  # picture index
    while True:
        proxy_addr = random.choice(proxy)
        url = 'https://m.weibo.cn/api/container/getIndex?type=uid&value=' + id
        weibo_url = url + '&containerid=' + get_containerid(url, proxy_addr) + '&page=' + str(i)
        print(url)
        print(weibo_url)
        try:
            data = use_proxy(weibo_url, proxy_addr)
            content = json.loads(data).get('data')
            cards = content.get('cards')
            if not cards:
                break  # no more pages
            for j in range(len(cards)):
                print("Page " + str(i) + ", post " + str(j))
                card_type = cards[j].get('card_type')
                if card_type != 9:  # 9 = an ordinary post card
                    continue
                mblog = cards[j].get('mblog')
                attitudes_count = mblog.get('attitudes_count')
                comments_count = mblog.get('comments_count')
                created_at = mblog.get('created_at')
                reposts_count = mblog.get('reposts_count')
                scheme = cards[j].get('scheme')
                # Post text: strip the inline HTML tags
                text = re.sub(r"<.*?>", "", mblog.get('text') or '')
                with open(file_content, 'a+', encoding='utf-8') as f1:
                    f1.write(text + "\n")
                # Download attached pictures, if any
                pics_info = mblog.get('pics')
                if pics_info:
                    print("have pics")
                    m = save_pics(pics_info, m)
                with open(file, 'a+', encoding='utf-8') as fh:
                    fh.write("Page " + str(i) + ", post " + str(j) + "\n")
                    fh.write("Post URL: " + str(scheme) + "\n"
                             + "Posted at: " + str(created_at) + "\n"
                             + "Content: " + text + "\n"
                             + "Likes: " + str(attitudes_count) + "\n"
                             + "Comments: " + str(comments_count) + "\n"
                             + "Reposts: " + str(reposts_count) + "\n")
            i += 1
            time.sleep(random.randint(1, 3))  # be polite between pages
        except Exception as e:
            print(e)
            break  # stop instead of retrying the same page forever

if __name__ == "__main__":
    print('start---')
    pics_dir = r"D:\software_study\my_jupyter_notebook\scrawl\pics_origin"
    os.makedirs(pics_dir, exist_ok=True)  # make sure the picture folder exists
    file_all = "ycy_all.txt"
    file_content = "ycy_content.txt"
    get_userInfo(id)
    get_weibo(id, file_all, file_content)
    print('done---')
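The script picks one random proxy per request and gives up on the first failure. A minimal retry sketch (a hypothetical helper, not part of the original post) that cycles through several proxies from the list above before raising, reusing the script's own use_proxy:

def fetch_with_retries(url, proxies, max_tries=5):
    # Try up to max_tries distinct proxies before re-raising the last error.
    last_err = None
    for proxy_addr in random.sample(proxies, min(max_tries, len(proxies))):
        try:
            return use_proxy(url, proxy_addr)
        except Exception as e:  # dead proxy, timeout, HTTP error, ...
            last_err = e
    raise last_err

For example, data = fetch_with_retries(weibo_url, proxy) could replace the direct data = use_proxy(weibo_url, proxy_addr) call inside get_weibo, so a single dead proxy no longer aborts the page loop.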
Results:
Weibo content (screenshot):
Weibo images (screenshot):
GO! Charge ahead!!! Chaoyue above all