python scrapy Douban crawler and word cloud
Posted by 阿新 on 2020-12-28
Just something I put together for fun. It scrapes the reviews of the movie A Street Cat Named Bob (《流浪貓鮑勃》). The page claims more than 10,000 reviews, but only around 500 could actually be fetched; I suspect quoted/reposted reviews are counted in that total as well.
It uses the Python Scrapy framework; I'll skip the installation steps.
import time

import scrapy
from scrapy.selector import Selector

from ..items import DoubanItem

# Fake browser request headers
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36"}


class QuotesSpider(scrapy.Spider):
    name = "quotes"
    allowed_domains = ['movie.douban.com']
    start_urls = ['https://movie.douban.com/subject/26685451/comments?start=500&limit=20&status=P&sort=new_score']
    number = 0

    def parse(self, response):
        for v in response.xpath(
                '//div[@class="comment-item "]/div[@class="comment"]'):
            # create a fresh item for every comment block
            item = DoubanItem()
            item['name'] = v.xpath('h3/span[@class="comment-info"]/a/text()').get()
            item['time'] = v.xpath(
                'h3/span[@class="comment-info"]/span[@class="comment-time "]/text()').get().strip()
            item['evaluate'] = v.xpath('p[@class=" comment-content"]/span[@class="short"]/text()').get()
            item['star'] = v.css("span").xpath('@title').get()
            yield item
        # follow the "next page" link until it disappears
        next_link_end = response.xpath("//div[@class='center']/a[@class='next']/@href").get()
        next_link = response.xpath("//div[@class='center']/a[@class='next']/text()").get()
        if next_link == '後頁 >':
            time.sleep(1)
            self.number = self.number + 20
            next_url = 'https://movie.douban.com/subject/26685451/comments' + next_link_end
            yield scrapy.Request(url=next_url, callback=self.parse, headers={
                **headers,
                "Referer": "https://movie.douban.com/subject/26685451/comments?start={}&limit=20&status=P&sort=new_score".format(
                    self.number),
                "Cookie": 'your cookie'})  # Douban caps anonymous access: without logging in you only get about 200 comments
The items.py file:
import scrapy


class DoubanItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()
    time = scrapy.Field()
    star = scrapy.Field()
    evaluate = scrapy.Field()
Start it with the command scrapy crawl quotes -o list.csv, which saves the output directly to list.csv.
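If the exported CSV shows garbled Chinese or the crawl gets blocked by robots.txt, a couple of lines in settings.py usually sort it out. This is only a sketch of standard Scrapy settings, not something from the original project, and the values are my own choices:

# settings.py (sketch; values are assumptions)
ROBOTSTXT_OBEY = False              # only if robots.txt is stopping the comment pages from being crawled
DOWNLOAD_DELAY = 1                  # does the same job as the time.sleep(1) in parse(), but lets Scrapy handle the throttling
FEED_EXPORT_ENCODING = 'utf-8-sig'  # keeps the Chinese text readable when list.csv is opened in Excel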
Later I loaded the data into a database and read it from there for the word cloud.
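The post doesn't show that step, so here is a rough sketch of how list.csv could be pushed into the douban table that the script below reads. The table schema and connection details are assumptions; only the column names (name, time, star, evaluate) come from the item definition above.

# load_csv_to_mysql.py -- a sketch, not the author's original loader
import csv
import pymysql

conn = pymysql.connect(host='localhost', user='root', passwd='*', db='demo', port=3306, charset='utf8')
cur = conn.cursor()
# assumed schema: four text columns matching the DoubanItem fields
cur.execute("""CREATE TABLE IF NOT EXISTS douban (
    name VARCHAR(255), time VARCHAR(64), star VARCHAR(64), evaluate TEXT)""")
with open('list.csv', encoding='utf-8') as f:
    for row in csv.DictReader(f):  # Scrapy's CSV export writes a header row with the field names
        cur.execute("INSERT INTO douban (name, time, star, evaluate) VALUES (%s, %s, %s, %s)",
                    (row['name'], row['time'], row['star'], row['evaluate']))
conn.commit()
cur.close()
conn.close()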
#! /usr/bin/env python
# -*- coding:utf-8 -*-
import pymysql, jieba, re
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud

conn = pymysql.connect(host='*', user='root', passwd="*", db='demo', port=3306, charset='utf8')
cur = conn.cursor(cursor=pymysql.cursors.DictCursor)
sql = "select * from douban"
cur.execute(sql)
rows = cur.fetchall()  # fetch the rows before closing the cursor and connection
cur.close()
conn.close()

# Join all comment texts from the result set into one string
allComment = ''
for v in rows:
    allComment = allComment + v['evaluate'].strip(" ")

# Keep only Chinese characters, which strips punctuation and other symbols
pattern = re.compile(r'[\u4e00-\u9fa5]+')
filterdata = re.findall(pattern, allComment)
cleaned_comments = ''.join(filterdata)

# Segment the Chinese text with jieba
segment = jieba.lcut(cleaned_comments)
comment = pd.DataFrame({'segment': segment})

# Drop stop words; download chineseStopWords.txt yourself
stopwords = pd.read_csv("./chineseStopWords.txt", index_col=False, quoting=3, sep="\t",
                        names=['stopword'], encoding='GBK')
comment = comment[~comment.segment.isin(stopwords.stopword)]

# Count word frequencies
comment_fre = comment.groupby(by='segment').agg(
    計數=pd.NamedAgg(column='segment', aggfunc='size')).reset_index().sort_values(
    by='計數', ascending=False)

# Render the word cloud from the 1000 most frequent words
wordcloud = WordCloud(
    font_path="your-font-path/simhei.ttf",
    background_color="white", max_font_size=80)
word_frequence = {x[0]: x[1] for x in comment_fre.head(1000).values}
wordcloud = wordcloud.fit_words(word_frequence)
plt.imshow(wordcloud)
plt.show()
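If you also want the image on disk rather than just the matplotlib window, WordCloud can write it out directly (the filename here is only an example):

wordcloud.to_file("douban_wordcloud.png")  # saves the rendered cloud as a PNG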
Final result: