2022.01.01 - Group Chat Keyword Statistics
阿新 · Published: 2022-01-02
New year, new start. I signed up for a cnblogs account, and I hope to post more often in the coming year~
First, three LeetCode problems:
2022. Convert 1D Array Into 2D Array
from typing import List

class Solution:
    def construct2DArray(self, original: List[int], m: int, n: int) -> List[List[int]]:
        if len(original) != m * n:
            return []
        # Note: do not build this as [[0] * n] * m (all rows would alias the same list),
        # and do not swap the order of m and n.
        res = [[0] * n for i in range(m)]
        for i in range(m):
            for j in range(n):
                print(i, j, i * n + j)  # debug output
                res[i][j] = original[i * n + j]
        print(res)  # debug output
        return res
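To illustrate the warning in the comment above, here is a quick sketch (not part of the solution itself) of why the `*` replication trick is dangerous: every row ends up aliasing the same inner list, while the list comprehension builds independent rows.

# Hypothetical demo of the row-aliasing pitfall mentioned above.
m, n = 2, 3
bad = [[0] * n] * m                  # m references to one and the same row
good = [[0] * n for _ in range(m)]   # m independent rows
bad[0][0] = 1
good[0][0] = 1
print(bad)   # [[1, 0, 0], [1, 0, 0]] -- both rows changed
print(good)  # [[1, 0, 0], [0, 0, 0]] -- only the first row changed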
劍指 Offer 09. Implement a Queue Using Two Stacks
class CQueue {
    stack<int> sa, sb;
public:
    CQueue() {
        // how to clear a stack in C++
        while (!sa.empty()) { sa.pop(); }
        while (!sb.empty()) { sb.pop(); }
    }

    void appendTail(int value) {
        sa.push(value);
    }

    int deleteHead() {
        // move elements from sa to sb only when sb is empty,
        // so that the oldest element ends up on top of sb
        if (sb.empty()) {
            while (!sa.empty()) {
                sb.push(sa.top());
                sa.pop();
            }
        }
        if (!sb.empty()) {
            int res = sb.top();
            sb.pop();
            return res;
        } else {
            return -1;
        }
    }
};

/**
 * Your CQueue object will be instantiated and called as such:
 * CQueue* obj = new CQueue();
 * obj->appendTail(value);
 * int param_2 = obj->deleteHead();
 */
劍指 Offer 30. Stack with a min() Function
class MinStack {
    stack<int> s, sm;  // s: data stack, sm: stack of current minimums
public:
    /** initialize your data structure here. */
    MinStack() {
        while (!s.empty()) { s.pop(); }
        while (!sm.empty()) { sm.pop(); }
    }

    void push(int x) {
        s.push(x);
        // keep sm non-increasing from bottom to top (<= handles duplicates)
        if (sm.empty() || x <= sm.top()) {
            sm.push(x);
        }
    }

    void pop() {
        if (s.top() == sm.top()) {
            sm.pop();
        }
        s.pop();
    }

    int top() {
        return s.top();
    }

    int min() {
        return sm.top();
    }
};

/**
 * Your MinStack object will be instantiated and called as such:
 * MinStack* obj = new MinStack();
 * obj->push(x);
 * obj->pop();
 * int param_3 = obj->top();
 * int param_4 = obj->min();
 */
Today I did something fairly interesting: I summarized an active group chat's words of the year as a word cloud. Here is how I did it:
Step 1: Export the WeChat group chat history. With the phone connected to the computer over USB, you can back it up directly with iTunes.
Step 2: Use the wxbackup tool to extract the WeChat chat history from that backup.
Step 3: Read the JSON, tokenize with jieba, and draw the picture with wordcloud (a sketch of the assumed message.js layout follows below).
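For reference, this is the rough shape of the wxbackup output that the parsing code below assumes. The field names (m_uiMessageType, m_uiCreateTime, m_nsContent) come from the code itself; the concrete values are invented examples.

# message.js is a JS file wrapping one big JSON object, roughly:
# {
#     "message": [
#         {
#             "m_uiMessageType": 1,           # 1 = plain text message (the type kept by loadJson below)
#             "m_uiCreateTime": 1612345678,   # Unix timestamp of the message
#             "m_nsContent": "新年快樂!"      # message body
#         },
#         ...
#     ]
# }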
The code mainly draws on these blog posts:
https://pythondict.com/python-paintings/python-qixi-wechat-wordcloud/#lwptoc1
https://www.cnblogs.com/huzihu/p/9675304.html
# coding:utf-8
import json
import jieba
import numpy
import codecs
import pandas
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def loadJson():
    # message.js wraps a JSON object; strip the JS wrapper and parse the JSON part
    with open('message.js', encoding='utf-8') as dataFile:
        data = dataFile.read()
        obj = data[data.find('{'): data.rfind('}') + 1]
        jsonObj = json.loads(obj)
    data = open("聊天記錄.txt", 'w+', encoding='utf-8')
    msg = jsonObj["message"]
    for i in range(len(msg)):
        mi = msg[i]
        # only keep user text messages (type 1) from 2021 onward
        if mi['m_uiMessageType'] == 1 and mi['m_uiCreateTime'] > 1609430400:
            data.write(jsonObj["message"][i]['m_nsContent'] + '\n')
    data.close()
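As a side note, the magic number 1609430400 is the Unix timestamp of 2021-01-01 00:00:00 in UTC+8, which can be checked with a couple of lines:

import datetime

# 2021-01-01 00:00:00 in UTC+8 (China Standard Time)
cutoff = datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone(datetime.timedelta(hours=8)))
print(int(cutoff.timestamp()))  # 1609430400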
def load_file_segment():
    # read the text file and tokenize it
    jieba.load_userdict("mywords.txt")  # load our own user dictionary
    f = codecs.open(u"聊天記錄.txt", 'r', encoding='utf-8')  # open the file
    content = f.read()  # read the whole file into content
    f.close()  # close the file
    segment = []  # holds the tokenization result
    segs = jieba.cut(content)  # tokenize the whole text
    for seg in segs:
        # keep a token only if it is longer than one character and is not a line break
        if len(seg) != 1 and seg != '\r\n':
            segment.append(seg)
    return segment
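If you want to sanity-check the tokenization step in isolation, a minimal sketch looks like this (the sample sentence is made up, and the exact split depends on the jieba version and on any loaded user dictionary):

import jieba

# tokenize one invented sample sentence
sample = "祝大家新年快樂,2021年辛苦了"
print(list(jieba.cut(sample)))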
def get_words_count_dict():
    segment = load_file_segment()  # get the tokenization result
    df = pandas.DataFrame({'segment': segment})  # wrap the token list in a pandas DataFrame
    # load the stop-word list (one stop word per line)
    stopwords = pandas.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'],
                                encoding="utf-8")
    df = df[~df.segment.isin(stopwords.stopword)]  # drop tokens that are stop words
    # group by token and count how often each one occurs ("計數" = count)
    words_count = df.groupby(by=['segment'])['segment'].agg([("計數", numpy.size)])
    # reset_index keeps the segment column; sort so the largest counts come first
    words_count = words_count.reset_index().sort_values(by="計數", ascending=False)
    return words_count
# loadJson()  # run this once first to regenerate 聊天記錄.txt from message.js
words_count = get_words_count_dict()  # get the words and their frequencies
words = words_count.set_index("segment").to_dict()  # counts after stop-word filtering
word_counts = words['計數']
# wc = {k: v for k, v in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:200]}
# print(wc)
# draw the word cloud
my_cloud = WordCloud(
    background_color='white',  # background color, default is black
    width=2000, height=1000,
    max_words=200,  # maximum number of words shown in the cloud
    font_path='/System/Library/Fonts/Hiragino Sans GB.ttc',  # a font that can render Chinese (macOS path)
    max_font_size=140,  # maximum font size
    min_font_size=40,   # minimum font size
    random_state=100    # random state, i.e. which of the possible color layouts is drawn
).generate_from_frequencies(word_counts)

# show the generated word-cloud image
plt.imshow(my_cloud)
# hide the axes of the word-cloud figure
plt.axis('off')
plt.show()
my_cloud.to_file(r"cloud.png")
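To run the whole thing end to end: place message.js (from wxbackup), mywords.txt, and stopwords.txt next to the script, uncomment the loadJson() call once to produce 聊天記錄.txt, then run the script again to generate cloud.png. On systems other than macOS, font_path needs to point to some locally installed font that covers Chinese characters, otherwise the words render as boxes.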