Java開發框架之Spring JDBC
阿新 • • 發佈:2021-11-10
一.對西遊記文件的分詞
程式碼如下:
import jieba
import jieba
def takeSecond(elem):
return elem[1]
def main():
path = "西遊記.txt"
file = open(path, "r", encoding="utf-8")
text = file.read()
file.close()
words = jieba.lcut(text)
counts = {}
for word in words:
if len(word) == 1:
continue
elif word == "大聖" or word == "老孫" or word == "行者" or word == "孫大聖" or word == "孫行者" or word == "猴王" or word == "悟空" or word == "齊天大聖" or word == "猴子":
rword = "孫悟空"
elif word == "師父" or word == "三藏" or word == "聖僧":
rword = "唐僧"
elif word == " 呆子" or word == "八戒" or word == "老豬":
rword = "豬八戒"
elif word == "沙和尚":
rword = "沙僧"
elif word == "妖精" or word == "妖魔" or word == "妖道":
rword = "妖怪"
elif word == "佛祖":
rword = "如來"
elif word == "三太子":
rword = " 白馬"
else:
rword = word
counts[word] = counts.get(word, 0) + 1
items = list(counts.items())
items.sort(key=takeSecond, reverse=True)
for i in range(20):
item = items[i]
keyWord = item[0]
count = item[1]
print("{0:<10}{1:>5}".format(keyWord, count))
main()
執行結果如下:
一部 35
三藏 30
行者 28
如來 20
唐僧 18
聖僧 18
八戒 15
師父 14
佛祖 14
大仙 12
沙僧 10
正是 9
金剛 9
四眾 8
山門 8
東土 8
菩薩 8
論經 8
傳經 7
長老 7