豆瓣評論資料詞雲畫像()
阿新 • • 發佈:2018-11-25
"""Scrape Douban movie comments, count their keywords and draw a word cloud.

This code was borrowed from a website whose exact address has been
forgotten. It can be run directly as a script.

NOTE(review): the script targets the library versions current in 2018:
  * the Selenium ``find_element_by_*`` helpers were removed in Selenium 4.3;
  * ``scipy.misc.imread`` was removed in SciPy 1.2.
Pin older releases or port these calls before running on a modern stack.
"""
import codecs
import time
from collections import Counter
from os import path

import jieba
import jieba.analyse as analyse
from scipy.misc import imread  # NOTE(review): removed in SciPy >= 1.2
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from wordcloud import WordCloud


def get_douban_comments(url):
    """Log in to Douban and append every comment found at *url* to a file.

    The comments of each result page are appended, one per line, to
    ``pjl_comment.txt`` (UTF-8).  The file is opened in append mode, so give
    it a new name or empty it before collecting data for another word cloud.

    The captcha has to be typed by the user on the console.  Scraping stops
    when a page yields no comments or when Selenium raises (for example
    because there is no "next" element to click).

    Args:
        url: Address of the first comment page to scrape.
    """
    login_url = 'https://accounts.douban.com/login?source=movie'
    # Replace these with your own Douban credentials; do not commit real ones.
    user_name = '15527546531'
    password = '15898405110ABCD'

    driver = webdriver.Firefox()
    try:
        driver.get(login_url)

        email_box = driver.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(user_name)

        password_box = driver.find_element_by_id('password')
        password_box.clear()
        password_box.send_keys(password)

        # The captcha cannot be solved automatically: ask the user for it.
        captcha_field = input('請開啟瀏覽器輸入驗證碼:')
        driver.find_element_by_id('captcha_field').send_keys(captcha_field)
        driver.find_element_by_class_name('btn-submit').click()
        time.sleep(5)  # wait for the redirect that follows the login

        driver.get(url)
        driver.implicitly_wait(3)

        page = 1
        count = 0
        with codecs.open("pjl_comment.txt", mode='a',
                         encoding='utf-8') as out:
            while True:
                try:
                    results = driver.find_elements_by_class_name('comment')
                    print("results:", len(results))
                    if not results:
                        # Nothing left to read: stop instead of spinning.
                        break
                    for result in results:
                        comment = result.find_element_by_tag_name('p').text
                        print(comment)
                        # Written immediately so that an error later on the
                        # page (or on the "next" click) loses nothing.
                        out.write(comment + u'\n')
                        count += 1
                        print(u"查詢到第%d個評論" % count)
                    print(u'第%d頁查詢完畢!' % page)
                    driver.find_element_by_class_name('next').click()
                except WebDriverException as e:
                    # Typically raised when there is no further page.
                    print(e)
                    break
                page += 1
                time.sleep(2)  # give the next page time to load
    finally:
        # Always release the browser, even when the scrape fails half-way.
        driver.quit()


def get_all_keywords(file_name):
    """Count every token jieba cuts out of *file_name*.

    The distinct tokens are printed, most frequent first, and written to
    ``count_word.txt`` (UTF-8) as ``<token>:<count>次`` lines.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        words = [word for line in f.readlines() for word in jieba.cut(line)]

    # Counter tallies in one pass instead of calling list.count per token.
    counts = Counter(words)
    print(u"共有%d個關鍵詞" % len(counts))

    report = []
    for k, (word, freq) in enumerate(counts.most_common(), start=1):
        report.append(u"%s:%d次\n" % (word, freq))
        print(u"%d---%s:%d次" % (k, word, freq))

    with codecs.open('count_word.txt', 'w', encoding='utf-8') as f:
        f.writelines(report)


def get_top_keywords(file_name):
    """Print the 20 top TextRank keywords of *file_name* with their weights.

    Args:
        file_name: Path of the UTF-8 text file to analyse.
    """
    with codecs.open(file_name, 'r', encoding='utf-8') as f:
        texts = f.read()  # the whole file as a single string

    ranked = analyse.textrank(texts, topK=20, withWeight=True, withFlag=True)
    for n, (tagged_word, weight) in enumerate(ranked, start=1):
        print(u"%d:" % n)
        # With withFlag=True the first item carries the keyword and its
        # part-of-speech flag; print each part on its own line.
        for part in tagged_word:
            print(part, u" ")
        print(u"權重:" + str(weight))


def draw_wordcloud(comment_file='pjl_comment.txt',
                   resource_dir="E:\\pythonStudy_2\\machine-learning",
                   mask_name='tmp.png',
                   font_name='simsun.ttc',
                   output_file="pjl_cloud2.jpg"):
    """Render a word cloud image from the scraped comments.

    Args:
        comment_file: UTF-8 text file holding the comments.
        resource_dir: Directory containing the mask image and the font.
        mask_name: File name of the mask (background shape) image.
        font_name: File name of the font used to draw the words.
        output_file: Path of the image to write.
    """
    with codecs.open(comment_file, encoding='utf-8') as f:
        comment_text = f.read()

    # WordCloud splits on whitespace, so join jieba's tokens with spaces.
    cut_text = " ".join(jieba.cut(comment_text))

    color_mask = imread(path.join(resource_dir, mask_name))
    cloud = WordCloud(
        font_path=path.join(resource_dir, font_name),
        background_color='white',
        mask=color_mask,
        max_words=2000,
        max_font_size=40,
    )
    cloud.generate(cut_text).to_file(output_file)


if __name__ == '__main__':
    # Comment page of the movie "Dying to Survive".
    url = "https://movie.douban.com/subject/26752088/comments?status=P"
    get_douban_comments(url)
    # Follow-up steps, enable once the comments have been collected:
    # file_name = 'pjl_comment.txt'
    # get_top_keywords(file_name)
    # draw_wordcloud()