Scraping Sina Weibo with Scrapy and storing the results in MongoDB
spider.py
import json

from scrapy import Request, Spider

from weibo.items import *


class WeiboSpider(Spider):
    name = 'weibocn'
    allowed_domains = ['m.weibo.cn']
    user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
    weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
    start_users = ['3217179555']

    def start_requests(self):
        for uid in self.start_users:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        """
        Parse user profile information.
        :param response: Response object
        """
        self.logger.debug(response)
        result = json.loads(response.text)
        if result.get('data').get('userInfo'):
            user_info = result.get('data').get('userInfo')
            user_item = UserItem()
            field_map = {
                'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
                'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
                'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
                'verified_reason': 'verified_reason', 'verified_type': 'verified_type'
            }
            for field, attr in field_map.items():
                user_item[field] = user_info.get(attr)
            yield user_item
            uid = user_info.get('id')
            # Follows
            yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                          meta={'page': 1, 'uid': uid})
            # Fans
            yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                          meta={'page': 1, 'uid': uid})
            # Weibos
            yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                          meta={'page': 1, 'uid': uid})

    def parse_follows(self, response):
        """
        Parse the users this user follows.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                and result.get('data').get('cards')[-1].get('card_group'):
            # Request each followed user's profile
            follows = result.get('data').get('cards')[-1].get('card_group')
            for follow in follows:
                if follow.get('user'):
                    uid = follow.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta.get('uid')
            # Follow list
            user_relation_item = UserRelationItem()
            follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')}
                       for follow in follows]
            user_relation_item['id'] = uid
            user_relation_item['follows'] = follows
            user_relation_item['fans'] = []
            yield user_relation_item
            # Next page of follows
            page = response.meta.get('page') + 1
            yield Request(self.follow_url.format(uid=uid, page=page), callback=self.parse_follows,
                          meta={'page': page, 'uid': uid})

    def parse_fans(self, response):
        """
        Parse the user's fans (followers).
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                and result.get('data').get('cards')[-1].get('card_group'):
            # Request each fan's profile
            fans = result.get('data').get('cards')[-1].get('card_group')
            for fan in fans:
                if fan.get('user'):
                    uid = fan.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta.get('uid')
            # Fan list
            user_relation_item = UserRelationItem()
            fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')}
                    for fan in fans]
            user_relation_item['id'] = uid
            user_relation_item['fans'] = fans
            user_relation_item['follows'] = []
            yield user_relation_item
            # Next page of fans
            page = response.meta.get('page') + 1
            yield Request(self.fan_url.format(uid=uid, page=page), callback=self.parse_fans,
                          meta={'page': page, 'uid': uid})

    def parse_weibos(self, response):
        """
        Parse the user's weibo (post) list.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards'):
            weibos = result.get('data').get('cards')
            for weibo in weibos:
                mblog = weibo.get('mblog')
                if mblog:
                    weibo_item = WeiboItem()
                    field_map = {
                        'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
                        'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
                        'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
                        'thumbnail': 'thumbnail_pic',
                    }
                    for field, attr in field_map.items():
                        weibo_item[field] = mblog.get(attr)
                    weibo_item['user'] = response.meta.get('uid')
                    yield weibo_item
            # Next page of weibos
            uid = response.meta.get('uid')
            page = response.meta.get('page') + 1
            yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                          meta={'uid': uid, 'page': page})
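With the spider saved under the project's spiders package, the crawl is started from the project root by spider name (weibocn, as set in the name attribute above):

scrapy crawl weibocn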
items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html from scrapy import Item, Field class UserItem(Item): collection = 'users' id = Field() name = Field() avatar = Field() cover = Field() gender = Field() description = Field() fans_count = Field() follows_count = Field() weibos_count = Field() verified = Field() verified_reason = Field() verified_type = Field() follows = Field() fans = Field() crawled_at = Field() class UserRelationItem(Item): collection = 'users' id = Field() follows = Field() fans = Field() class WeiboItem(Item): collection = 'weibos' id = Field() attitudes_count = Field() comments_count = Field() reposts_count = Field() picture = Field() pictures = Field() source = Field() text = Field() raw_text = Field() thumbnail = Field() user = Field() created_at = Field() crawled_at = Field()
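The collection attribute on each item class tells the MongoDB pipeline (shown below) which collection to write to. Purely for illustration, and with placeholder values rather than real crawl output, a document in the users collection ends up shaped roughly like this:

{
    'id': 3217179555,            # uid, used as the upsert key
    'name': '...',
    'gender': '...',
    'fans_count': 0,
    'follows_count': 0,
    'weibos_count': 0,
    'verified': True,
    'follows': [{'id': '...', 'name': '...'}],
    'fans': [{'id': '...', 'name': '...'}],
    'crawled_at': '2018-12-26 12:00'
}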
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import time

import pymongo

from weibo.items import *


class TimePipeline(object):
    def process_item(self, item, spider):
        # Stamp every user/weibo item with the time it was crawled
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
            item['crawled_at'] = now
        return item


class WeiboPipeline(object):
    def parse_time(self, date):
        # The API returns relative timestamps in (simplified) Chinese,
        # e.g. "刚刚" (just now) or "5分钟前" (5 minutes ago); normalize them to absolute dates
        if re.match('刚刚', date):
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
        if re.match(r'\d+分钟前', date):
            minute = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        if re.match(r'\d+小时前', date):
            hour = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
        if re.match('昨天.*', date):  # "昨天" = yesterday
            date = re.match('昨天(.*)', date).group(1).strip()
            date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date
        if re.match(r'\d{2}-\d{2}', date):
            date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
        return date

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('created_at'):
                item['created_at'] = item['created_at'].strip()
                item['created_at'] = self.parse_time(item.get('created_at'))
            if item.get('pictures'):
                item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            # Upsert keyed on id so re-crawls update documents instead of duplicating them
            self.db[item.collection].update_one(
                {'id': item.get('id')}, {'$set': dict(item)}, upsert=True)
        if isinstance(item, UserRelationItem):
            # Merge follow/fan lists without duplicating entries already stored
            self.db[item.collection].update_one(
                {'id': item.get('id')},
                {'$addToSet': {
                    'follows': {'$each': item['follows']},
                    'fans': {'$each': item['fans']}
                }},
                upsert=True)
        return item
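The time normalization in parse_time can be sanity-checked on its own, independent of a running crawl. A minimal sketch, assuming the Chinese relative formats the mobile API returns:

from weibo.pipelines import WeiboPipeline

pipeline = WeiboPipeline()
print(pipeline.parse_time('刚刚'))        # current date and time, e.g. '2018-12-26 10:30'
print(pipeline.parse_time('5分钟前'))     # five minutes before now
print(pipeline.parse_time('昨天 08:00'))  # yesterday's date followed by '08:00'
print(pipeline.parse_time('05-20'))       # current year prepended, e.g. '2018-05-20 00:00'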
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for weibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'weibo'
SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'weibo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
'Connection': 'keep-alive',
'Host': 'm.weibo.cn',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'weibo.middlewares.WeiboSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'weibo.middlewares.RandomUserAgent': 1,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'weibo.pipelines.TimePipeline': 300,
'weibo.pipelines.WeiboPipeline': 301,
'weibo.pipelines.MongoPipeline': 302,
}
# DOWNLOAD_DELAY = 1
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo1'
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
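Note that DOWNLOADER_MIDDLEWARES above enables weibo.middlewares.RandomUserAgent, but middlewares.py is not reproduced in this post. A minimal sketch of such a middleware, assuming it simply picks a random entry from the USER_AGENTS list in settings.py for each request (the original implementation may differ):

# middlewares.py (sketch; the original file is not shown in this post)
import random


class RandomUserAgent(object):
    """Downloader middleware that sets a random User-Agent header per request."""

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the USER_AGENTS list defined in settings.py
        return cls(crawler.settings.get('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)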