# jieba分詞 西遊記 (jieba word segmentation of "Journey to the West")
from collections import Counter

import jieba
# Number of most-frequent words to print.
TOP_N = 20

# Canonical name -> every alias in the text that should be counted under it.
ALIAS_GROUPS = {
    "孫悟空": ("大聖", "老孫", "行者", "孫大聖", "孫行者", "猴王", "悟空", "齊天大聖", "猴子"),
    "唐僧": ("師父", "三藏", "聖僧"),
    "豬八戒": ("呆子", "八戒", "老豬"),
    "沙僧": ("沙和尚",),
    "妖怪": ("妖精", "妖魔", "妖道"),
    "如來": ("佛祖",),
    "白馬": ("三太子",),
}

# Flat alias -> canonical-name lookup derived from ALIAS_GROUPS, so each word
# needs a single dict lookup instead of a long chain of comparisons.
ALIASES = {
    alias: name
    for name, aliases in ALIAS_GROUPS.items()
    for alias in aliases
}


def count_words(words):
    """Count word occurrences, merging known aliases into one canonical name.

    Words of length 1 are skipped. Any word found in ``ALIASES`` is counted
    under its canonical name; every other word is counted as itself.

    Args:
        words: Iterable of word strings (e.g. the list returned by
            ``jieba.lcut``).

    Returns:
        A ``collections.Counter`` mapping each (canonical) word to the number
        of times it occurred. Keys appear in first-occurrence order.
    """
    return Counter(ALIASES.get(word, word) for word in words if len(word) != 1)


if __name__ == "__main__":
    # Use a context manager so the file handle is closed as soon as the text
    # has been read, rather than being left open.
    with open("《西遊記》.txt", "r", encoding="utf-8") as book:
        txt = book.read()

    words = jieba.lcut(txt)  # segment the text (precise mode)
    counts = count_words(words)

    # Every (word, count) pair, highest count first; ties keep the order in
    # which the words were first seen.
    items = counts.most_common()

    # Slicing never raises, so a text with fewer than TOP_N distinct words
    # prints what it has instead of failing with IndexError.
    for word, count in items[:TOP_N]:
        print(f"{word:<10}{count:>5}")