
Making a Word Cloud from Movie Reviews

Data preparation: crawling the movie reviews with Python 3

We will use the film 春宵苦短,少女前進吧! (夜は短し歩けよ乙女, English title: Night Is Short, Walk On Girl) as the example.
URL:https://movie.douban.com/subject/26935251/
Code:

First, create a folder to hold the crawled data and the generated images:

import os

dirs = r'F:\爬蟲\詞雲'   # raw string, so the backslashes are not treated as escapes
if not os.path.exists(dirs):
    os.makedirs(dirs)
os.chdir(dirs)

import json
import time

import requests
from lxml import etree

os.chdir(dirs)
def get_comments(url,headers,start,max_restart_num,movie_name,collection):
    if start >= 5000:
        print("已爬取5000條評論,結束爬取")
        return
    data = {
        'start': start,
        'limit': 20,
        'sort': 'new_score',
        'status': 'P',
    }
    response = requests.get(url=url, headers=headers, params=data)
    tree = etree.HTML(response.text)
    comment_item = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"]')
    len_comments = len(comment_item)
    if len_comments > 0:
        for i in range(1, len_comments + 1):
            votes = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="votes"]'.format(i))
            commenters = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/a'.format(i))
            ratings = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[contains(@class,"rating")]/@title'.format(
                    i))
            comments_time = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[@class="comment-time "]'.format(
                    i))
            comments = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]/div[@class="comment"]/p/span'.format(i))

            vote = votes[0].text.strip()
            commenter = commenters[0].text.strip()
            try:
                rating = str(ratings[0])
            except IndexError:
                rating = 'null'   # some comments carry no star rating
            comment_time = comments_time[0].text.strip()
            comment = comments[0].text.strip()

            comment_dict = {}
            comment_dict['vote'] = vote
            comment_dict['commenter'] = commenter
            comment_dict['rating'] = rating
            comment_dict['comments_time'] = comment_time
            comment_dict['comments'] = comment

            comment_dict['movie_name'] = movie_name
            # store in a database here if desired (the unused `collection` argument is reserved for that)
            

            print("正在存取第{}條資料".format(start+i))
            print(comment_dict)
            
            jsObj = json.dumps(comment_dict, ensure_ascii=False)
            # the with-block closes the file automatically; no file.close() is needed
            with open('test1.txt', 'a+', encoding='utf-8') as file:
                file.writelines(jsObj + '\n')
        headers['Referer'] = response.url
        start += 20
        data['start'] = start
        time.sleep(5)
        return get_comments(url, headers, start, max_restart_num,movie_name,collection)
    else:
        # print(response.status_code)
        if max_restart_num>0 :
            if response.status_code != 200:
                print("fail to crawl ,waiting 10s to restart continuing crawl...")
                time.sleep(10)
                # headers['User-Agent'] = Headers.getUA()
                # print(start)
                return get_comments(url, headers, start, max_restart_num-1, movie_name, collection)
            else:
                print("finished crawling")
                return
        else:
            print("max_restart_num has run out")
            with open('log.txt',"a") as fp:
                fp.write('\n{}--latest start:{}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), start))
            return
if __name__ =='__main__':
    base_url = 'https://movie.douban.com/subject/26935251'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
    }
    start = 0
    response = requests.get(base_url, headers=headers)  # headers must be a keyword argument; passed positionally it would be treated as params
    tree = etree.HTML(response.text)
    movie_name = tree.xpath('//div[@id="content"]/h1/span')[0].text.strip()
    # print(movie_name)

    url = base_url+'/comments'

    try:
        get_comments(url, headers,start, 5, movie_name,None)
    finally:
        pass
The crawled data is written to test1.txt.
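Each line of test1.txt is a standalone JSON object with the keys vote, commenter, rating, comments_time, comments, and movie_name. As a quick sanity check before the processing step, you can read one record back (a minimal sketch; it assumes the crawl above has already produced test1.txt):

import json

# Each line was written with json.dumps, so json.loads parses it back.
with open('test1.txt', encoding='utf-8') as f:
    first = json.loads(f.readline())
print(first['rating'], first['comments'])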

Data processing

with open('test1.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()

# Keep only the comments rated 力薦 ("highly recommend") or 推薦 ("recommend").
# Each line was written with json.dumps, so json.loads is safer than eval.
# Note: the original condition `temp['rating']=="力薦"or"推薦"` was always true,
# because `or "推薦"` evaluates a non-empty string as truthy; `in` fixes that.
array = []
for line in data:
    temp = json.loads(line)
    if temp['rating'] in ("力薦", "推薦"):
        array.append(temp)
# Build word frequencies: keep words of length 2-4 that occur at least 5 times.
import jieba
from collections import Counter

words_list = []
for doc in array:
    comment = doc['comments']
    # Segment the comment into words.
    t_list = jieba.lcut(str(comment), cut_all=False)
    for word in t_list:
        # Count words of length 2-4 (ideally also excluding stopwords; see the sketch below).
        if 5 > len(word) > 1:
            words_list.append(word)
# Build the counter once, after the loop (the original rebuilt it on every iteration).
words_dict = dict(Counter(words_list))
num = 5 - 1
dict1 = {k: v for k, v in words_dict.items() if v > num}
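The in-code comment above mentions a stopword set, but the snippet never applies one. A minimal sketch of that filtering step, assuming a hypothetical stopwords.txt with one word per line:

import jieba
from collections import Counter

# stopwords.txt is a hypothetical file: one stopword per line.
with open('stopwords.txt', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}

words_list = [word
              for doc in array
              for word in jieba.lcut(str(doc['comments']), cut_all=False)
              if 5 > len(word) > 1 and word not in stopwords]
words_dict = dict(Counter(words_list))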
 
## Generating the word cloud
from wordcloud import WordCloud,ImageColorGenerator
from scipy.misc import imread
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
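Note that scipy.misc.imread was deprecated and removed in SciPy 1.2, so on a newer environment the import above will fail; imageio offers an equivalent reader (a drop-in swap, assuming imageio is installed):

# On SciPy >= 1.2, scipy.misc.imread no longer exists;
# imageio's imread reads the mask image the same way.
from imageio import imread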


# Word cloud settings
mask_color_path = "bg_1.png"                    # path to the mask/background image
font_path = r'C:\Windows\Fonts\simkai.ttf'      # a Chinese font, used by both WordCloud and matplotlib
imgname1 = "color_by_default.png"               # output 1: mask shape only, default colors
imgname2 = "color_by_img.png"                   # output 2: colors sampled from the mask image
width = 1000
height = 860
margin = 2
# Load the mask image
mask_coloring = imread(mask_color_path)
# Configure the WordCloud
wc = WordCloud(font_path=font_path,
               background_color="white",   # background color
               max_words=150,              # maximum number of words displayed
               mask=mask_coloring,         # mask image that defines the shape
               max_font_size=200,          # largest font size
               # random_state=42,
               width=width, height=height, margin=margin,
               # width/height set the default canvas size; with a mask, the saved
               # image follows the mask's size instead. margin is the spacing between words.
               )
# Generate the word cloud from the frequency dict
wc.generate_from_frequencies(dict1)

bg_color = ImageColorGenerator(mask_coloring)
# Recolor the words using the colors of the mask image
wc.recolor(color_func=bg_color)
# Use the same Chinese font for the matplotlib title
myfont = FontProperties(fname=font_path)
plt.figure()
title = '詞雲'
plt.title(title, fontproperties=myfont)
plt.imshow(wc)
plt.axis("off")
plt.show()
save = True
if save is True:  # save the recolored image
    os.chdir(dirs)
    wc.to_file(imgname2)
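The code defines imgname1 but never writes it. If you also want the default-colored rendering (mask shape only), save it right after generate_from_frequencies and before recolor, e.g.:

# Save the default-colored version before recolor() replaces the colors.
wc.to_file(imgname1)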
The resulting word cloud:
![Mask image](https://img-blog.csdn.net/201810171447324?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3l3ZDE5OTUwOTAx/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70) ![Resulting word cloud](https://img-blog.csdn.net/20181017144809494?watermark/2/text/aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3l3ZDE5OTUwOTAx/font/5a6L5L2T/fontsize/400/fill/I0JBQkFCMA==/dissolve/70)
## Full code

import json
import os
import time

import jieba
import matplotlib.pyplot as plt
import requests
from collections import Counter
from lxml import etree
from matplotlib.font_manager import FontProperties
from scipy.misc import imread   # removed in SciPy >= 1.2; use imageio's imread there
from wordcloud import WordCloud, ImageColorGenerator

dirs = r'F:\爬蟲\詞雲'
if not os.path.exists(dirs):
    os.makedirs(dirs)
os.chdir(dirs)


def get_comments(url, headers, start, max_restart_num, movie_name, collection):
    if start >= 5000:
        print("5000 comments crawled; stopping")
        return
    data = {
        'start': start,
        'limit': 20,
        'sort': 'new_score',
        'status': 'P',
    }
    response = requests.get(url=url, headers=headers, params=data)
    tree = etree.HTML(response.text)
    comment_item = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"]')
    len_comments = len(comment_item)
    if len_comments > 0:
        for i in range(1, len_comments + 1):
            votes = tree.xpath('//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="votes"]'.format(i))
            commenters = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/a'.format(i))
            ratings = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[contains(@class,"rating")]/@title'.format(
                    i))
            comments_time = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]//span[@class="comment-info"]/span[@class="comment-time "]'.format(
                    i))
            comments = tree.xpath(
                '//div[@id ="comments"]/div[@class="comment-item"][{}]/div[@class="comment"]/p/span'.format(i))

            vote = votes[0].text.strip()
            commenter = commenters[0].text.strip()
            try:
                rating = str(ratings[0])
            except IndexError:
                rating = 'null'   # some comments carry no star rating
            comment_time = comments_time[0].text.strip()
            comment = comments[0].text.strip()

            comment_dict = {}
            comment_dict['vote'] = vote
            comment_dict['commenter'] = commenter
            comment_dict['rating'] = rating
            comment_dict['comments_time'] = comment_time
            comment_dict['comments'] = comment
            comment_dict['movie_name'] = movie_name
            # store in a database here if desired (`collection` is reserved for that)

            print("Saving comment {}".format(start + i))
            print(comment_dict)

            jsObj = json.dumps(comment_dict, ensure_ascii=False)
            with open('test1.txt', 'a+', encoding='utf-8') as file:
                file.writelines(jsObj + '\n')

        headers['Referer'] = response.url
        start += 20
        data['start'] = start
        time.sleep(5)
        return get_comments(url, headers, start, max_restart_num, movie_name, collection)
    else:
        if max_restart_num > 0:
            if response.status_code != 200:
                print("Request failed; waiting 10 s before retrying...")
                time.sleep(10)
                return get_comments(url, headers, start, max_restart_num - 1, movie_name, collection)
            else:
                print("finished crawling")
                return
        else:
            print("max_restart_num has run out")
            with open('log.txt', "a") as fp:
                fp.write('\n{}--latest start:{}'.format(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())), start))
            return


def get_dict(filename, number):
    with open(filename, 'r', encoding='utf-8') as f:
        data = f.readlines()

    # Keep only comments rated 力薦 or 推薦 (the original
    # `=="力薦"or"推薦"` was always true; `in` fixes that).
    array = []
    for line in data:
        temp = json.loads(line)
        if temp['rating'] in ("力薦", "推薦"):
            array.append(temp)

    words_list = []
    for doc in array:
        comment = doc['comments']
        t_list = jieba.lcut(str(comment), cut_all=False)
        for word in t_list:
            # keep words of length 2-4 as candidates for the frequency count
            if 5 > len(word) > 1:
                words_list.append(word)
    words_dict = dict(Counter(words_list))

    num = number - 1
    dict1 = {k: v for k, v in words_dict.items() if v > num}
    return dict1


def get_wordcloud(dict1, save=False):
    # Word cloud settings
    mask_color_path = "bg_1.png"                # mask/background image
    font_path = r'C:\Windows\Fonts\simkai.ttf'  # Chinese font for WordCloud and matplotlib
    imgname1 = "color_by_default.png"           # mask shape only, default colors
    imgname2 = "color_by_img.png"               # colors sampled from the mask image
    width = 1000
    height = 860
    margin = 2
    # Load the mask image
    mask_coloring = imread(mask_color_path)
    # Configure the WordCloud
    wc = WordCloud(font_path=font_path,
                   background_color="white",
                   max_words=150,
                   mask=mask_coloring,
                   max_font_size=200,
                   # random_state=42,
                   width=width, height=height, margin=margin)
    # Generate the word cloud from the frequency dict
    wc.generate_from_frequencies(dict1)

    bg_color = ImageColorGenerator(mask_coloring)
    # Recolor the words using the colors of the mask image
    wc.recolor(color_func=bg_color)
    # Use the same Chinese font for the matplotlib title
    myfont = FontProperties(fname=font_path)
    plt.figure()
    title = '詞雲'
    plt.title(title, fontproperties=myfont)
    plt.imshow(wc)
    plt.axis("off")
    plt.show()

    if save is True:
        os.chdir(dirs)
        wc.to_file(imgname2)


if __name__ == '__main__':
    base_url = 'https://movie.douban.com/subject/26935251'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
        'Host': 'movie.douban.com',
    }
    start = 0
    response = requests.get(base_url, headers=headers)
    tree = etree.HTML(response.text)
    movie_name = tree.xpath('//div[@id="content"]/h1/span')[0].text.strip()
    # print(movie_name)

    url = base_url + '/comments'
    filename = 'test1.txt'

    try:
        get_comments(url, headers, start, 5, movie_name, None)
        dict1 = get_dict(filename, 5)
        get_wordcloud(dict1, save=True)
    finally:
        pass