1. 程式人生 > 其它 > jieba 分詞(西遊記)

jieba 分詞(西遊記)

import jieba

# Maps every alias that the novel uses for a character (or creature) to the
# single canonical name under which its occurrences are tallied.
ALIASES = {
    "大聖": "孫悟空",
    "老孫": "孫悟空",
    "行者": "孫悟空",
    "孫大聖": "孫悟空",
    "孫行者": "孫悟空",
    "猴王": "孫悟空",
    "悟空": "孫悟空",
    "齊天大聖": "孫悟空",
    "猴子": "孫悟空",
    "師父": "唐僧",
    "三藏": "唐僧",
    "聖僧": "唐僧",
    "呆子": "豬八戒",
    "八戒": "豬八戒",
    "老豬": "豬八戒",
    "沙和尚": "沙僧",
    "妖精": "妖怪",
    "妖魔": "妖怪",
    "妖道": "妖怪",
    "佛祖": "如來",
    "三太子": "白馬",
}


def normalize_word(word):
    """Return the canonical name for *word*, or *word* itself if it is not an alias."""
    return ALIASES.get(word, word)


def count_words(words):
    """Count occurrences of each word in *words* after alias normalisation.

    Single-character tokens are skipped. Every remaining token is mapped
    through ``normalize_word`` so that all aliases of one character are
    accumulated under the same key.

    Args:
        words: An iterable of string tokens (e.g. the list returned by
            ``jieba.lcut``).

    Returns:
        A dict mapping each (normalised) word to the number of times it
        occurred, in first-seen order.
    """
    counts = {}  # word -> number of occurrences
    for word in words:
        if len(word) == 1:
            continue
        rword = normalize_word(word)
        counts[rword] = counts.get(rword, 0) + 1
    return counts


def top_words(counts, limit=20):
    """Return up to *limit* ``(word, count)`` pairs, most frequent first.

    The sort is stable, so words with equal counts keep the order in which
    they were first inserted into *counts*. Slicing (rather than indexing
    ``range(limit)``) means a text with fewer than *limit* distinct words
    simply yields a shorter list instead of raising ``IndexError``.
    """
    items = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    return items[:limit]


if __name__ == "__main__":
    with open('./西遊記.txt', 'r', encoding='utf_8') as f:
        words = jieba.lcut(f.read())  # segment the text using jieba's precise mode
    counts = count_words(words)

    # Print the 20 most frequent words: word left-aligned, count right-aligned.
    for word, count in top_words(counts):
        print("{0:<10}{1:>5}".format(word, count))