1. 程式人生 > NLTK統計中文詞頻並輸出

NLTK統計中文詞頻並輸出

# -*- coding: utf-8 -*-
'''
使用NLTK對中文進行詞頻統計並輸出
'''
from collections import Counter

from nltk import FreqDist


def delblankline(infile, outfile, top_n=15000):
    """Write the most frequent lines of *infile* to *outfile*.

    Every line of *infile* is treated as one token (the input is expected
    to hold one pre-segmented word per line). Tokens are counted and the
    *top_n* most frequent ones are written to *outfile*, one per line,
    from most to least frequent. Ties keep first-seen order.

    Line terminators are stripped before counting, so a final line without
    a trailing newline is counted together with its earlier occurrences.
    Empty lines are dropped and never appear in the output.

    Args:
        infile: Path of the UTF-8 text file to read.
        outfile: Path of the UTF-8 text file to write. It may be the same
            path as *infile*; the input is read completely before the
            output is opened.
        top_n: Maximum number of tokens to write. ``None`` writes all of
            them. Defaults to 15000.

    Raises:
        OSError: If either file cannot be opened.
        UnicodeDecodeError: If *infile* is not valid UTF-8.
    """
    # Finish reading before the output is opened: opening with 'w'
    # truncates, which would wipe the input when both paths are the same.
    with open(infile, 'r', encoding="utf-8") as src:
        counts = Counter(
            token
            for token in (line.rstrip("\n") for line in src)
            if token
        )

    # Emit only the top_n most common tokens, one per line.
    with open(outfile, 'w', encoding="utf-8") as dst:
        dst.writelines(
            token + "\n" for token, _ in counts.most_common(top_n)
        )


if __name__ == "__main__":
    # Placeholder paths: replace with the real source and target files.
    delblankline("原始檔路徑", "目標檔案路徑")