1. 程式人生 > >豆瓣評論資料詞雲畫像()

豆瓣評論資料詞雲畫像()

'''
#這段程式碼是從一個網站借用過來的,具體哪個網址一下子忘記了。
#可以直接執行。
'''
from selenium import webdriver
import time
import codecs
import jieba
import jieba.analyse as analyse
from wordcloud import WordCloud
from scipy.misc import imread
from os import path

def get_douban_comments(url, user_name='15527546531',
                        password='15898405110ABCD',
                        output_file="pjl_comment.txt"):
    """Log in to Douban with Firefox and append every comment under *url* to a file.

    The function opens the login page, fills in the credentials, asks the
    operator (on stdin) for the captcha shown in the browser, then walks the
    comment pages by clicking the ``next`` element until it can no longer be
    clicked or a page contains no comments.

    Args:
        url: Address of the first comment page to scrape.
        user_name: Douban account name. Replace the default with your own;
            prefer passing it in over editing the source.
        password: Douban password. Same advice as for ``user_name``.
        output_file: UTF-8 text file the comments are written to, one per
            line. It is opened in append mode, so clear it (or choose another
            name) before starting a fresh word cloud.

    Returns:
        The number of comments written during this run.
    """
    # Imported here so the block brings its own dependency into scope; the
    # file only imports ``webdriver`` from selenium at the top.
    from selenium.common.exceptions import WebDriverException

    driver = webdriver.Firefox()
    count = 0  # comments collected so far
    try:
        _login_douban(driver, user_name, password)
        time.sleep(5)  # give the site time to redirect after login
        driver.get(url)
        driver.implicitly_wait(3)  # element lookups retry for up to 3 seconds

        page = 1
        with codecs.open(output_file, mode='a', encoding='utf-8') as file:
            while True:
                results = driver.find_elements_by_class_name('comment')
                print("results:", len(results))
                if not results:
                    # Nothing to read on this page: stop instead of looping.
                    break

                comments_list = []
                for result in results:
                    try:
                        comment = result.find_element_by_tag_name('p').text
                    except WebDriverException as e:
                        # Best effort: report the malformed entry and move on.
                        print(e)
                        continue
                    print(comment)
                    comments_list.append(comment + u'\n')
                    count += 1
                    print(u"查詢到第%d個評論" % count)

                # Persist before navigating so the last page is not lost when
                # the "next" click fails.
                file.writelines(comments_list)
                print(u'第%d頁查詢完畢!' % page)

                try:
                    driver.find_element_by_class_name('next').click()
                except WebDriverException as e:
                    # No usable "next" element: treat this as the last page.
                    print(e)
                    break
                page += 1
                time.sleep(2)  # let the next page load
    finally:
        # Always release the browser, even when scraping fails midway.
        driver.quit()
    return count


def _login_douban(driver, user_name, password):
    # Fill in and submit the Douban login form; the captcha is typed by the operator.
    login_url = 'https://accounts.douban.com/login?source=movie'
    driver.get(login_url)

    email_box = driver.find_element_by_id('email')
    email_box.clear()
    email_box.send_keys(user_name)

    password_box = driver.find_element_by_id('password')
    password_box.clear()
    password_box.send_keys(password)

    # The captcha cannot be solved automatically; read it from stdin.
    captcha_field = input('請開啟瀏覽器輸入驗證碼:')
    driver.find_element_by_id('captcha_field').send_keys(captcha_field)
    driver.find_element_by_class_name('btn-submit').click()


# 得到所有關鍵詞
def get_all_keywords(file_name, output_file='count_word.txt'):
    """Count how often each jieba token occurs in *file_name*.

    Every line of the UTF-8 input file is segmented with ``jieba.cut``.
    Tokens that consist only of whitespace (including the newline that ends
    each line) are skipped because they would break the line-oriented report.
    The counts are printed and written to *output_file*, most frequent first,
    one ``word:count次`` entry per line.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
        output_file: Path of the UTF-8 report file; it is overwritten.

    Returns:
        A ``collections.Counter`` mapping each token to its frequency.
    """
    # Local import: the file's import header does not provide ``collections``.
    from collections import Counter

    word_counts = Counter()
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        for line in f:
            word_counts.update(
                word for word in jieba.cut(line) if word.strip()
            )

    print(u"共有%d個關鍵詞" % len(word_counts))

    sort_count = []
    for k, (w, times) in enumerate(word_counts.most_common(), 1):
        sort_count.append(u"%s:%d次\n" % (w, times))
        print(u"%d---%s:%d次" % (k, w, times))

    with codecs.open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(sort_count)
    return word_counts

def get_top_keywords(file_name, top_k=20):
    """Print the TextRank keywords of *file_name* together with their weights.

    The whole file is read as one string and passed to
    ``jieba.analyse.textrank`` with ``withWeight=True`` and ``withFlag=True``.
    For each ranked entry the function prints its 1-based position, each part
    of the keyword item on its own line, and the weight.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
        top_k: Maximum number of keywords requested from TextRank.

    Returns:
        The list returned by ``textrank`` (keyword item, weight) so callers
        can reuse the ranking instead of re-parsing the printed output.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()  # the whole file as a single string

    ranked = analyse.textrank(texts, topK=top_k, withWeight=True, withFlag=True)
    for n, (keyword, weight) in enumerate(ranked, 1):
        print(u"%d:" % n)
        # Per the original note, the first item holds the word and its
        # part-of-speech tag; iterating yields them one by one.
        for C in keyword:
            print(C, u"  ")
        print(u"權重:" + str(weight))
    return ranked



# 繪製詞雲
def draw_wordcloud(comment_file='pjl_comment.txt',
                   resource_dir="E:\\pythonStudy_2\\machine-learning",
                   output_file="pjl_cloud2.jpg"):
    """Render a word cloud image from the comments stored in *comment_file*.

    The text is segmented with jieba and joined with spaces before being
    handed to ``WordCloud``. The mask image ``tmp.png`` and the font
    ``simsun.ttc`` are both loaded from *resource_dir*.

    Args:
        comment_file: UTF-8 text file holding the scraped comments.
        resource_dir: Directory that contains ``tmp.png`` (mask image) and
            ``simsun.ttc`` (font file).
        output_file: Path the rendered image is saved to.
    """
    with codecs.open(comment_file, encoding='utf-8') as f:
        comment_text = f.read()

    # WordCloud is given space-separated tokens, so join the jieba output.
    cut_text = " ".join(jieba.cut(comment_text))

    # NOTE(review): scipy.misc.imread is no longer available in recent SciPy
    # releases; confirm the installed version before running.
    color_mask = imread(path.join(resource_dir, 'tmp.png'))
    cloud = WordCloud(
        font_path=path.join(resource_dir, 'simsun.ttc'),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    word_cloud = cloud.generate(cut_text)
    word_cloud.to_file(output_file)



if __name__ == '__main__':
    # Stage 1: scrape the Douban comment pages for the film 我不是藥神.
    target_url = "https://movie.douban.com/subject/26752088/comments?status=P"
    get_douban_comments(target_url)

    # Stages 2 and 3 are enabled by hand once pjl_comment.txt has been filled:
    # get_top_keywords('pjl_comment.txt')
    # draw_wordcloud()