Scraping Sina Weibo with Scrapy and storing the results in MongoDB
spider.py
import json

from scrapy import Request, Spider

from weibo.items import *


class WeiboSpider(Spider):
    name = 'weibocn'
    allowed_domains = ['m.weibo.cn']
    user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}'
    follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}'
    fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}'
    weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}'
    start_users = ['3217179555']

    def start_requests(self):
        for uid in self.start_users:
            yield Request(self.user_url.format(uid=uid), callback=self.parse_user)

    def parse_user(self, response):
        """
        Parse user profile information.
        :param response: Response object
        """
        self.logger.debug(response)
        result = json.loads(response.text)
        if result.get('data').get('userInfo'):
            user_info = result.get('data').get('userInfo')
            user_item = UserItem()
            field_map = {
                'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone',
                'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count',
                'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified',
                'verified_reason': 'verified_reason', 'verified_type': 'verified_type'
            }
            for field, attr in field_map.items():
                user_item[field] = user_info.get(attr)
            yield user_item
            uid = user_info.get('id')
            # Follows
            yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows,
                          meta={'page': 1, 'uid': uid})
            # Fans
            yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans,
                          meta={'page': 1, 'uid': uid})
            # Weibos
            yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos,
                          meta={'page': 1, 'uid': uid})

    def parse_follows(self, response):
        """
        Parse the users this user follows.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                and result.get('data').get('cards')[-1].get('card_group'):
            # Request each followed user's profile
            follows = result.get('data').get('cards')[-1].get('card_group')
            for follow in follows:
                if follow.get('user'):
                    uid = follow.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta.get('uid')
            # Follow list
            user_relation_item = UserRelationItem()
            follows = [{'id': follow.get('user').get('id'), 'name': follow.get('user').get('screen_name')}
                       for follow in follows]
            user_relation_item['id'] = uid
            user_relation_item['follows'] = follows
            user_relation_item['fans'] = []
            yield user_relation_item
            # Next page of follows
            page = response.meta.get('page') + 1
            yield Request(self.follow_url.format(uid=uid, page=page), callback=self.parse_follows,
                          meta={'page': page, 'uid': uid})

    def parse_fans(self, response):
        """
        Parse the user's fans (followers).
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) \
                and result.get('data').get('cards')[-1].get('card_group'):
            # Request each fan's profile
            fans = result.get('data').get('cards')[-1].get('card_group')
            for fan in fans:
                if fan.get('user'):
                    uid = fan.get('user').get('id')
                    yield Request(self.user_url.format(uid=uid), callback=self.parse_user)
            uid = response.meta.get('uid')
            # Fan list
            user_relation_item = UserRelationItem()
            fans = [{'id': fan.get('user').get('id'), 'name': fan.get('user').get('screen_name')}
                    for fan in fans]
            user_relation_item['id'] = uid
            user_relation_item['fans'] = fans
            user_relation_item['follows'] = []
            yield user_relation_item
            # Next page of fans
            page = response.meta.get('page') + 1
            yield Request(self.fan_url.format(uid=uid, page=page), callback=self.parse_fans,
                          meta={'page': page, 'uid': uid})

    def parse_weibos(self, response):
        """
        Parse the user's weibo (post) list.
        :param response: Response object
        """
        result = json.loads(response.text)
        if result.get('ok') and result.get('data').get('cards'):
            weibos = result.get('data').get('cards')
            for weibo in weibos:
                mblog = weibo.get('mblog')
                if mblog:
                    weibo_item = WeiboItem()
                    field_map = {
                        'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count',
                        'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics',
                        'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text',
                        'thumbnail': 'thumbnail_pic',
                    }
                    for field, attr in field_map.items():
                        weibo_item[field] = mblog.get(attr)
                    weibo_item['user'] = response.meta.get('uid')
                    yield weibo_item
            # Next page of weibos
            uid = response.meta.get('uid')
            page = response.meta.get('page') + 1
            yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos,
                          meta={'uid': uid, 'page': page})
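With the spider saved under the project's spiders package, the crawl is started from the project root by spider name (weibocn, as set in the name attribute above):

scrapy crawl weibocn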
items.py
# -*- coding: utf-8 -*- # Define here the models for your scraped items # # See documentation in: # http://doc.scrapy.org/en/latest/topics/items.html from scrapy import Item, Field class UserItem(Item): collection = 'users' id = Field() name = Field() avatar = Field() cover = Field() gender = Field() description = Field() fans_count = Field() follows_count = Field() weibos_count = Field() verified = Field() verified_reason = Field() verified_type = Field() follows = Field() fans = Field() crawled_at = Field() class UserRelationItem(Item): collection = 'users' id = Field() follows = Field() fans = Field() class WeiboItem(Item): collection = 'weibos' id = Field() attitudes_count = Field() comments_count = Field() reposts_count = Field() picture = Field() pictures = Field() source = Field() text = Field() raw_text = Field() thumbnail = Field() user = Field() created_at = Field() crawled_at = Field()
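The collection attribute on each item class tells the MongoDB pipeline (shown below) which collection to write to. Purely for illustration, and with placeholder values rather than real crawl output, a document in the users collection ends up shaped roughly like this:

{
    'id': 3217179555,            # uid, used as the upsert key
    'name': '...',
    'gender': '...',
    'fans_count': 0,
    'follows_count': 0,
    'weibos_count': 0,
    'verified': True,
    'follows': [{'id': '...', 'name': '...'}],
    'fans': [{'id': '...', 'name': '...'}],
    'crawled_at': '2018-12-26 12:00'
}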
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import re
import time

import pymongo

from weibo.items import *


class TimePipeline(object):
    def process_item(self, item, spider):
        # Stamp every user/weibo item with the time it was crawled
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            now = time.strftime('%Y-%m-%d %H:%M', time.localtime())
            item['crawled_at'] = now
        return item


class WeiboPipeline(object):
    def parse_time(self, date):
        # The API returns relative timestamps in (simplified) Chinese,
        # e.g. "刚刚" (just now) or "5分钟前" (5 minutes ago); normalize them to absolute dates
        if re.match('刚刚', date):
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time()))
        if re.match(r'\d+分钟前', date):
            minute = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(minute) * 60))
        if re.match(r'\d+小时前', date):
            hour = re.match(r'(\d+)', date).group(1)
            date = time.strftime('%Y-%m-%d %H:%M', time.localtime(time.time() - float(hour) * 60 * 60))
        if re.match('昨天.*', date):  # "昨天" = yesterday
            date = re.match('昨天(.*)', date).group(1).strip()
            date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date
        if re.match(r'\d{2}-\d{2}', date):
            date = time.strftime('%Y-', time.localtime()) + date + ' 00:00'
        return date

    def process_item(self, item, spider):
        if isinstance(item, WeiboItem):
            if item.get('created_at'):
                item['created_at'] = item['created_at'].strip()
                item['created_at'] = self.parse_time(item.get('created_at'))
            if item.get('pictures'):
                item['pictures'] = [pic.get('url') for pic in item.get('pictures')]
        return item


class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)])
        self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)])

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        if isinstance(item, UserItem) or isinstance(item, WeiboItem):
            # Upsert keyed on id so re-crawls update documents instead of duplicating them
            self.db[item.collection].update_one(
                {'id': item.get('id')}, {'$set': dict(item)}, upsert=True)
        if isinstance(item, UserRelationItem):
            # Merge follow/fan lists without duplicating entries already stored
            self.db[item.collection].update_one(
                {'id': item.get('id')},
                {'$addToSet': {
                    'follows': {'$each': item['follows']},
                    'fans': {'$each': item['fans']}
                }},
                upsert=True)
        return item
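The time normalization in parse_time can be sanity-checked on its own, independent of a running crawl. A minimal sketch, assuming the Chinese relative formats the mobile API returns:

from weibo.pipelines import WeiboPipeline

pipeline = WeiboPipeline()
print(pipeline.parse_time('刚刚'))        # current date and time, e.g. '2018-12-26 10:30'
print(pipeline.parse_time('5分钟前'))     # five minutes before now
print(pipeline.parse_time('昨天 08:00'))  # yesterday's date followed by '08:00'
print(pipeline.parse_time('05-20'))       # current year prepended, e.g. '2018-05-20 00:00'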
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for weibo project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'weibo'
SPIDER_MODULES = ['weibo.spiders']
NEWSPIDER_MODULE = 'weibo.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'weibo (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
DEFAULT_REQUEST_HEADERS = {
'Accept': 'application/json, text/plain, */*',
'Accept-Encoding': 'gzip, deflate, sdch',
'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2',
'Connection': 'keep-alive',
'Host': 'm.weibo.cn',
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
# Configure maximum concurrent requests performed by Scrapy (default: 16)
CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
# COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False
# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }
# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'weibo.middlewares.WeiboSpiderMiddleware': 543,
# }
# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
'weibo.middlewares.RandomUserAgent': 1,
}
# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
# }
# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
'weibo.pipelines.TimePipeline': 300,
'weibo.pipelines.WeiboPipeline': 301,
'weibo.pipelines.MongoPipeline': 302,
}
# DOWNLOAD_DELAY = 1
# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
MONGO_URI = 'localhost'
MONGO_DATABASE = 'weibo1'
USER_AGENTS = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
"(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
"(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
"(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
"(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
"(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
"(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
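Note that DOWNLOADER_MIDDLEWARES above enables weibo.middlewares.RandomUserAgent, but middlewares.py is not reproduced in this post. A minimal sketch of such a middleware, assuming it simply picks a random entry from the USER_AGENTS list in settings.py for each request (the original implementation may differ):

# middlewares.py (sketch; the original file is not shown in this post)
import random


class RandomUserAgent(object):
    """Downloader middleware that sets a random User-Agent header per request."""

    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # Read the USER_AGENTS list defined in settings.py
        return cls(crawler.settings.get('USER_AGENTS'))

    def process_request(self, request, spider):
        request.headers['User-Agent'] = random.choice(self.user_agents)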