2022.01.01 - Group Chat Keyword Statistics
阿新 · Published: 2022-01-02
New year, new start. I signed up for a cnblogs account, and I hope to post more often in the coming year~
First, three LeetCode problems:
2022. Convert 1D Array Into 2D Array
from typing import List

class Solution:
    def construct2DArray(self, original: List[int], m: int, n: int) -> List[List[int]]:
        if len(original) != m * n:
            return []
        # Note: do not build this as [[0] * n] * m (all rows would alias the same list),
        # and do not swap the order of m and n.
        res = [[0] * n for i in range(m)]
        for i in range(m):
            for j in range(n):
                print(i, j, i * n + j)  # debug output
                res[i][j] = original[i * n + j]
        print(res)  # debug output
        return res
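To illustrate the warning in the comment above, here is a quick sketch (not part of the solution itself) of why the `*` replication trick is dangerous: every row ends up aliasing the same inner list, while the list comprehension builds independent rows.

# Hypothetical demo of the row-aliasing pitfall mentioned above.
m, n = 2, 3
bad = [[0] * n] * m                  # m references to one and the same row
good = [[0] * n for _ in range(m)]   # m independent rows
bad[0][0] = 1
good[0][0] = 1
print(bad)   # [[1, 0, 0], [1, 0, 0]] -- both rows changed
print(good)  # [[1, 0, 0], [0, 0, 0]] -- only the first row changed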
劍指 Offer 09. Implement a Queue Using Two Stacks
class CQueue {
    stack<int> sa, sb;
public:
    CQueue() {
        // how to clear a stack in C++
        while (!sa.empty()) { sa.pop(); }
        while (!sb.empty()) { sb.pop(); }
    }

    void appendTail(int value) {
        sa.push(value);
    }

    int deleteHead() {
        // move elements from sa to sb only when sb is empty,
        // so that the oldest element ends up on top of sb
        if (sb.empty()) {
            while (!sa.empty()) {
                sb.push(sa.top());
                sa.pop();
            }
        }
        if (!sb.empty()) {
            int res = sb.top();
            sb.pop();
            return res;
        } else {
            return -1;
        }
    }
};

/**
 * Your CQueue object will be instantiated and called as such:
 * CQueue* obj = new CQueue();
 * obj->appendTail(value);
 * int param_2 = obj->deleteHead();
 */
劍指 Offer 30. Stack with a min() Function
class MinStack {
    stack<int> s, sm;  // s: data stack, sm: stack of current minimums
public:
    /** initialize your data structure here. */
    MinStack() {
        while (!s.empty()) { s.pop(); }
        while (!sm.empty()) { sm.pop(); }
    }

    void push(int x) {
        s.push(x);
        // keep sm non-increasing from bottom to top (<= handles duplicates)
        if (sm.empty() || x <= sm.top()) {
            sm.push(x);
        }
    }

    void pop() {
        if (s.top() == sm.top()) {
            sm.pop();
        }
        s.pop();
    }

    int top() {
        return s.top();
    }

    int min() {
        return sm.top();
    }
};

/**
 * Your MinStack object will be instantiated and called as such:
 * MinStack* obj = new MinStack();
 * obj->push(x);
 * obj->pop();
 * int param_3 = obj->top();
 * int param_4 = obj->min();
 */
Today I did something fairly interesting: I summarized an active group chat's words of the year as a word cloud. Here is how I did it:
Step 1: Export the WeChat group chat history. With the phone connected to the computer over USB, you can back it up directly with iTunes.
Step 2: Use the wxbackup tool to extract the WeChat chat history from that backup.
Step 3: Read the JSON, tokenize with jieba, and draw the picture with wordcloud (a sketch of the assumed message.js layout follows below).
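For reference, this is the rough shape of the wxbackup output that the parsing code below assumes. The field names (m_uiMessageType, m_uiCreateTime, m_nsContent) come from the code itself; the concrete values are invented examples.

# message.js is a JS file wrapping one big JSON object, roughly:
# {
#     "message": [
#         {
#             "m_uiMessageType": 1,           # 1 = plain text message (the type kept by loadJson below)
#             "m_uiCreateTime": 1612345678,   # Unix timestamp of the message
#             "m_nsContent": "新年快樂!"      # message body
#         },
#         ...
#     ]
# }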
The code mainly draws on these blog posts:
https://pythondict.com/python-paintings/python-qixi-wechat-wordcloud/#lwptoc1
https://www.cnblogs.com/huzihu/p/9675304.html
# coding:utf-8
import json
import jieba
import numpy
import codecs
import pandas
import matplotlib.pyplot as plt
from wordcloud import WordCloud
def loadJson():
    # message.js wraps a JSON object; strip the JS wrapper and parse the JSON part
    with open('message.js', encoding='utf-8') as dataFile:
        data = dataFile.read()
        obj = data[data.find('{'): data.rfind('}') + 1]
        jsonObj = json.loads(obj)
    data = open("聊天記錄.txt", 'w+', encoding='utf-8')
    msg = jsonObj["message"]
    for i in range(len(msg)):
        mi = msg[i]
        # only keep user text messages (type 1) from 2021 onward
        if mi['m_uiMessageType'] == 1 and mi['m_uiCreateTime'] > 1609430400:
            data.write(jsonObj["message"][i]['m_nsContent'] + '\n')
    data.close()
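As a side note, the magic number 1609430400 is the Unix timestamp of 2021-01-01 00:00:00 in UTC+8, which can be checked with a couple of lines:

import datetime

# 2021-01-01 00:00:00 in UTC+8 (China Standard Time)
cutoff = datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone(datetime.timedelta(hours=8)))
print(int(cutoff.timestamp()))  # 1609430400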
def load_file_segment():
    # read the text file and tokenize it
    jieba.load_userdict("mywords.txt")  # load our own user dictionary
    f = codecs.open(u"聊天記錄.txt", 'r', encoding='utf-8')  # open the file
    content = f.read()  # read the whole file into content
    f.close()  # close the file
    segment = []  # holds the tokenization result
    segs = jieba.cut(content)  # tokenize the whole text
    for seg in segs:
        # keep a token only if it is longer than one character and is not a line break
        if len(seg) != 1 and seg != '\r\n':
            segment.append(seg)
    return segment
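If you want to sanity-check the tokenization step in isolation, a minimal sketch looks like this (the sample sentence is made up, and the exact split depends on the jieba version and on any loaded user dictionary):

import jieba

# tokenize one invented sample sentence
sample = "祝大家新年快樂,2021年辛苦了"
print(list(jieba.cut(sample)))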
def get_words_count_dict():
    segment = load_file_segment()  # get the tokenization result
    df = pandas.DataFrame({'segment': segment})  # wrap the token list in a pandas DataFrame
    # load the stop-word list (one stop word per line)
    stopwords = pandas.read_csv("stopwords.txt", index_col=False, quoting=3, sep="\t", names=['stopword'],
                                encoding="utf-8")
    df = df[~df.segment.isin(stopwords.stopword)]  # drop tokens that are stop words
    # group by token and count how often each one occurs ("計數" = count)
    words_count = df.groupby(by=['segment'])['segment'].agg([("計數", numpy.size)])
    # reset_index keeps the segment column; sort so the largest counts come first
    words_count = words_count.reset_index().sort_values(by="計數", ascending=False)
    return words_count
# loadJson()  # run this once first to regenerate 聊天記錄.txt from message.js
words_count = get_words_count_dict()  # get the words and their frequencies
words = words_count.set_index("segment").to_dict()  # counts after stop-word filtering
word_counts = words['計數']
# wc = {k: v for k, v in sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:200]}
# print(wc)
# draw the word cloud
my_cloud = WordCloud(
    background_color='white',  # background color, default is black
    width=2000, height=1000,
    max_words=200,  # maximum number of words shown in the cloud
    font_path='/System/Library/Fonts/Hiragino Sans GB.ttc',  # a font that can render Chinese (macOS path)
    max_font_size=140,  # maximum font size
    min_font_size=40,   # minimum font size
    random_state=100    # random state, i.e. which of the possible color layouts is drawn
).generate_from_frequencies(word_counts)

# show the generated word-cloud image
plt.imshow(my_cloud)
# hide the axes of the word-cloud figure
plt.axis('off')
plt.show()
my_cloud.to_file(r"cloud.png")
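To run the whole thing end to end: place message.js (from wxbackup), mywords.txt, and stopwords.txt next to the script, uncomment the loadJson() call once to produce 聊天記錄.txt, then run the script again to generate cloud.png. On systems other than macOS, font_path needs to point to some locally installed font that covers Chinese characters, otherwise the words render as boxes.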