影評詞雲製作
資料準備:python3 爬取電影影評
我們以春宵苦短,少女前進吧! 夜は短し歩けよ乙女 這部電影為例。
URL:https://movie.douban.com/subject/26935251/
程式碼:
建立資料夾存入爬取的圖片
import json
import os
import time

import requests
from lxml import etree

# Create the folder that stores the crawled data, then work inside it.
dirs = r'F:\爬蟲\詞雲'
if not os.path.exists(dirs):
    os.makedirs(dirs)
os.chdir(dirs)


def get_comments(url, headers, start, max_restart_num, movie_name, collection):
    """Crawl Douban short comments page by page (20 per page).

    Each comment is appended as one JSON line to test1.txt. The function
    recurses to fetch the next page and stops after 5000 comments, or when
    ``max_restart_num`` retries are exhausted on a failed page.
    ``collection`` is unused here (placeholder for a DB collection).
    """
    if start >= 5000:
        print("已爬取5000條評論,結束爬取")
        return
    data = {
        'start': start,
        'limit': 20,
        'sort': 'new_score',
        'status': 'P',
    }
    response = requests.get(url=url, headers=headers, params=data)
    tree = etree.HTML(response.text)
    comment_item = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"]')
    len_comments = len(comment_item)
    if len_comments > 0:
        for i in range(1, len_comments + 1):
            votes = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="votes"]'.format(i))
            commenters = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/a'.format(i))
            ratings = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[contains(@class,"rating")]/@title'.format(i))
            comments_time = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[@class="comment-time "]'.format(i))
            comments = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]/div[@class="comment"]/p/span'.format(i))
            vote = votes[0].text.strip()
            commenter = commenters[0].text.strip()
            try:
                rating = str(ratings[0])
            except IndexError:
                # Some comments carry no star rating at all.
                rating = 'null'
            comment_time = comments_time[0].text.strip()
            comment = comments[0].text.strip()
            comment_dict = {
                'vote': vote,
                'commenter': commenter,
                'rating': rating,
                'comments_time': comment_time,
                'comments': comment,
                'movie_name': movie_name,
            }
            print("正在存取第{}條資料".format(start + i))
            print(comment_dict)
            jsObj = json.dumps(comment_dict, ensure_ascii=False)
            # One JSON object per line; `with` closes the file automatically.
            with open('test1.txt', 'a+', encoding='utf-8') as file:
                file.writelines(jsObj + '\n')
        headers['Referer'] = response.url
        start += 20
        data['start'] = start
        time.sleep(5)  # throttle requests to avoid being blocked
        return get_comments(url, headers, start, max_restart_num, movie_name, collection)
    else:
        if max_restart_num > 0:
            if response.status_code != 200:
                print("fail to crawl ,waiting 10s to restart continuing crawl...")
                time.sleep(10)
                return get_comments(url, headers, start, max_restart_num - 1, movie_name, collection)
            else:
                print("finished crawling")
                return
        else:
            print("max_restart_num has run out")
            # Log the last offset so the crawl can be resumed manually.
            with open('log.txt', "a") as fp:
                fp.write('\n{}--latest start:{}'.format(
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), start))
            return


if __name__ == '__main__':
    base_url = 'https://movie.douban.com/subject/26935251'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',  # duplicate key removed
        'Host': 'movie.douban.com',
    }
    start = 0
    # BUG FIX: requests.get's second positional parameter is `params`,
    # so headers must be passed by keyword to be sent as HTTP headers.
    response = requests.get(base_url, headers=headers)
    tree = etree.HTML(response.text)
    movie_name = tree.xpath('//div[@id="content"]/h1/span')[0].text.strip()
    url = base_url + '/comments'
    try:
        get_comments(url, headers, start, 5, movie_name, None)
    finally:
        pass
資料處理
import json
import os

import jieba
from collections import Counter
from wordcloud import WordCloud, ImageColorGenerator
from scipy.misc import imread
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

# Load the crawled comments: one JSON object per line.
with open('test1.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()

# Keep only the comments rated 力薦 or 推薦.
# BUG FIX: the original `temp['rating']=="力薦"or"推薦"` was always truthy
# (a non-empty string literal), so every comment was kept regardless of rating.
array = []
for line in data:
    # json.loads instead of eval: the file was written with json.dumps,
    # and eval would execute arbitrary expressions from the file.
    temp = json.loads(line)
    if temp['rating'] in ("力薦", "推薦"):
        array.append(temp)

# Segment each comment with jieba; count words whose length is 2-4.
words_list = []
for doc in array:
    for word in jieba.lcut(str(doc['comments']), cut_all=False):
        if 5 > len(word) > 1:
            words_list.append(word)
words_dict = dict(Counter(words_list))
# Keep words that occur at least 5 times.
num = 5 - 1
dict1 = {k: v for k, v in words_dict.items() if v > num}

# --- Generate the word cloud ---
mask_color_path = "bg_1.png"                 # mask/background image path
font_path = r'C:\Windows\Fonts\simkai.ttf'   # Chinese font for matplotlib/wordcloud
imgname1 = "color_by_defualut.png"           # output name 1 (shape only)
imgname2 = "color_by_img.png"                # output name 2 (colored from the mask)
width = 1000
height = 860
margin = 2
mask_coloring = imread(mask_color_path)
wc = WordCloud(
    font_path=font_path,
    background_color="white",   # canvas color
    max_words=150,              # maximum number of words drawn
    mask=mask_coloring,         # shape mask image
    max_font_size=200,          # largest font size
    width=width, height=height, margin=margin,
)
wc.generate_from_frequencies(dict1)
# Re-color the words to follow the mask image's color layout.
bg_color = ImageColorGenerator(mask_coloring)
wc.recolor(color_func=bg_color)
myfont = FontProperties(fname=font_path)
plt.figure()
title = 'nihao'
plt.title(title, fontproperties=myfont)
plt.imshow(wc)
plt.axis("off")
plt.show()
save = True
if save is True:
    os.chdir(dirs)  # `dirs` is defined by the crawler snippet above
    wc.to_file(imgname2)
![背景圖片](https://img-blog.csdn.net/201810171447324?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3l3ZDE5OTUwOTAx/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70) ![得到的詞雲](https://img-blog.csdn.net/20181017144809494?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3l3ZDE5OTUwOTAx/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70) ## 程式碼:
# Standard library
import json
import os
import time  # duplicate `import time` removed; requests split onto its own line

# Third-party
import jieba
import matplotlib.pyplot as plt
import requests
from collections import Counter
from lxml import etree
from matplotlib.font_manager import FontProperties
# NOTE(review): scipy.misc.imread was removed in scipy >= 1.2;
# consider imageio.v2.imread as a drop-in replacement — verify scipy version.
from scipy.misc import imread
from wordcloud import WordCloud, ImageColorGenerator
# Working directory for crawled data and the generated word-cloud image.
# BUG FIX: typographic quotes (‘…’) are a Python syntax error; raw string
# avoids any backslash-escape surprises in the Windows path.
dirs = r'F:\爬蟲\詞雲'
if not os.path.exists(dirs):
    os.makedirs(dirs)
os.chdir(dirs)
def get_comments(url, headers, start, max_restart_num, movie_name, collection):
    """Crawl Douban short comments page by page (20 per page).

    Appends each comment as one JSON line to test1.txt in the current
    directory, then recurses with ``start += 20`` for the next page.

    Parameters:
        url: the /comments endpoint of the movie page.
        headers: request headers dict; mutated to set a Referer per page.
        start: pagination offset; crawling stops once it reaches 5000.
        max_restart_num: retries remaining after a failed (non-200) page.
        movie_name: stored with every comment record.
        collection: unused placeholder (presumably a DB collection — TODO confirm).
    """
    if start >= 5000:
        print("已爬取5000條評論,結束爬取")
        return
    data = {
        'start': start,
        'limit': 20,
        'sort': 'new_score',
        'status': 'P',
    }
    response = requests.get(url=url, headers=headers, params=data)
    tree = etree.HTML(response.text)
    comment_item = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"]')
    len_comments = len(comment_item)
    if len_comments > 0:
        # XPath positions are 1-based, hence range(1, len + 1).
        for i in range(1, len_comments + 1):
            votes = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="votes"]'.format(i))
            commenters = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/a'.format(i))
            ratings = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[contains(@class,"rating")]/@title'.format(i))
            comments_time = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[@class="comment-time "]'.format(i))
            comments = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]/div[@class="comment"]/p/span'.format(i))
            vote = votes[0].text.strip()
            commenter = commenters[0].text.strip()
            try:
                rating = str(ratings[0])
            except IndexError:
                # Narrowed from bare `except:`: only a missing rating is expected.
                rating = 'null'
            comment_time = comments_time[0].text.strip()
            comment = comments[0].text.strip()
            comment_dict = {}
            comment_dict['vote'] = vote
            comment_dict['commenter'] = commenter
            comment_dict['rating'] = rating
            comment_dict['comments_time'] = comment_time
            comment_dict['comments'] = comment
            comment_dict['movie_name'] = movie_name
            # Persist the record (file acts as the "database" here).
            print("正在存取第{}條資料".format(start + i))
            print(comment_dict)
            jsObj = json.dumps(comment_dict, ensure_ascii=False)
            # Redundant file.close() removed: `with` already closes the file.
            with open('test1.txt', 'a+', encoding='utf-8') as file:
                file.writelines(jsObj + '\n')
        headers['Referer'] = response.url
        start += 20
        data['start'] = start
        time.sleep(5)  # be polite: throttle between pages
        return get_comments(url, headers, start, max_restart_num, movie_name, collection)
    else:
        if max_restart_num > 0:
            if response.status_code != 200:
                print("fail to crawl ,waiting 10s to restart continuing crawl...")
                time.sleep(10)
                return get_comments(url, headers, start, max_restart_num - 1, movie_name, collection)
            else:
                # 200 with no comment items: assume we reached the end.
                print("finished crawling")
                return
        else:
            print("max_restart_num has run out")
            # Log the last offset so the crawl can be resumed manually.
            with open('log.txt', "a") as fp:
                fp.write('\n{}--latest start:{}'.format(
                    time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), start))
            return
def get_dict(filename, number):
    """Build a word-frequency dict from the crawled comments file.

    Reads one JSON comment record per line from *filename*, keeps only
    comments rated 力薦 or 推薦, segments the comment text with jieba,
    and returns a dict of words (length 2-4) occurring at least *number*
    times.
    """
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()
    # BUG FIX: the original condition `temp['rating']=="力薦"or"推薦"` was
    # always truthy ("推薦" is a non-empty string), so the rating filter
    # never applied.
    array = []
    for line in data:
        # json.loads instead of eval: the lines were written with
        # json.dumps, and eval would execute arbitrary file content.
        temp = json.loads(line)
        if temp['rating'] in ("力薦", "推薦"):
            array.append(temp)
    words_list = []
    for doc in array:
        # Only words of length 2-4 count toward the frequency statistics.
        for word in jieba.lcut(str(doc['comments']), cut_all=False):
            if 5 > len(word) > 1:
                words_list.append(word)
    words_dict = dict(Counter(words_list))
    # Keep words occurring at least `number` times (frequency > number - 1).
    num = number - 1
    dict1 = {k: v for k, v in words_dict.items() if v > num}
    return dict1
def get_wordcloud(dict1, save=False):
    """Render and show a word cloud from the {word: frequency} dict *dict1*.

    The cloud's shape and colors come from the mask image bg_1.png in the
    current directory; the figure is displayed with matplotlib and, when
    *save* is True, written to color_by_img.png inside the module-level
    `dirs` directory.
    """
    # Word-cloud settings (typographic quotes from the original were a
    # syntax error and are replaced with plain ASCII quotes).
    mask_color_path = "bg_1.png"                 # mask/background image path
    font_path = r'C:\Windows\Fonts\simkai.ttf'   # Chinese font for matplotlib
    imgname1 = "color_by_defualut.png"           # output 1: shape only (unused)
    imgname2 = "color_by_img.png"                # output 2: colored from mask
    width = 1000
    height = 860
    margin = 2
    # Load the mask image.
    mask_coloring = imread(mask_color_path)
    # Configure the WordCloud. With a mask set, the saved image follows the
    # mask's size; width/height only matter without one.
    wc = WordCloud(
        font_path=font_path,
        background_color="white",   # canvas color
        max_words=150,              # maximum number of words drawn
        mask=mask_coloring,         # shape mask
        max_font_size=200,          # largest font size
        width=width, height=height, margin=margin,
    )
    # Generate the cloud from the precomputed frequencies.
    wc.generate_from_frequencies(dict1)
    bg_color = ImageColorGenerator(mask_coloring)
    # Re-color words to follow the mask image's color layout.
    wc.recolor(color_func=bg_color)
    # Use the same font so the matplotlib title can render Chinese.
    myfont = FontProperties(fname=font_path)
    plt.figure()
    title = '詞雲'
    plt.title(title, fontproperties=myfont)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
    if save is True:
        os.chdir(dirs)  # module-level output directory set at import time
        wc.to_file(imgname2)
if __name__ == '__main__':  # BUG FIX: original `if name =='main':` lost the dunders
    base_url = 'https://movie.douban.com/subject/26935251'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',  # duplicate key removed
        'Host': 'movie.douban.com',
    }
    start = 0
    # BUG FIX: requests.get's second positional parameter is `params`, so
    # the original call sent the headers as query-string parameters instead
    # of HTTP headers; pass them by keyword.
    response = requests.get(base_url, headers=headers)
    tree = etree.HTML(response.text)
    # Movie title from the subject page, stored with every comment record.
    movie_name = tree.xpath('//div[@id="content"]/h1/span')[0].text.strip()
    url = base_url + '/comments'
    filename = 'test1.txt'
    try:
        get_comments(url, headers, start, 5, movie_name, None)
        dict1 = get_dict(filename, 5)
        get_wordcloud(dict1, save=True)
    finally:
        pass