程式人生 > python統計中文單詞

python統計中文單詞

#coding:UTF-8
import sys   
# Raise the interpreter's recursion limit far above its default.
# NOTE(review): presumably this is here for the recursive wordsort() helper;
# a limit this large removes the usual guard against exhausting the C stack,
# so confirm it is still required.
sys.setrecursionlimit(100000000) 
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of every Chinese character in the inputs.

    All files listed in ``inIo`` are read as UTF-8 and concatenated. Each
    distinct character whose code point lies in 19968..40869 (U+4E00..U+9FA5)
    is written to ``outIo`` as one ``<char>--><frequency>`` line, where the
    frequency is the character's occurrence count divided by the total number
    of characters read (of any kind). Lines are ordered by descending
    frequency; ties are ordered by code point so the output is deterministic.

    Args:
        inIo: Iterable of paths of UTF-8 encoded text files.
        outIo: Path of the report file, written as UTF-8.
        writing: Mode used to open ``outIo`` (for example ``'w'`` or ``'a'``).

    If the inputs contain no character in the range, the report is opened
    (and truncated when ``writing`` is ``'w'``) but nothing is written.
    """
    # Local imports keep this function self-contained; both names exist on
    # Python 2.7 and Python 3.
    import io
    from collections import Counter

    pieces = []
    for path in inIo:
        # io.open decodes while reading and the context manager closes the
        # handle, so no file descriptor is leaked per input.
        with io.open(path, 'r', encoding='utf-8') as source:
            pieces.append(source.read())
    text = u''.join(pieces)
    total = len(text)
    print("一共 %d 單詞" % total)

    # One pass over the text instead of one str.count() scan per character.
    counts = Counter(ch for ch in text if 19968 <= ord(ch) <= 40869)
    ranked = sorted(counts.items(), key=lambda item: (-item[1], item[0]))

    # An explicit encoding requires a text-mode handle, so drop any 'b' flag
    # that callers of the old byte-oriented version may still pass.
    mode = writing.replace('b', '')
    with io.open(outIo, mode, encoding='utf-8') as report:
        for ch, occurrences in ranked:
            # total is non-zero here: ranked is empty when no text was read.
            report.write(u"%s-->%s\n" % (ch, str(occurrences * 1.0 / total)))
    
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place, largest ``float(item[1])`` first, and return ``m``.

    Args:
        x: Unused; kept so existing ``wordsort(m[i], m, i)`` calls still work.
        m: Mutable list of indexable items whose second element can be
            converted with ``float``.
        i: Index of the first position to sort; ``m[:i]`` is left untouched.

    Returns:
        The same list object ``m``, with its tail ordered by descending value.
        Items with equal values keep their relative input order.

    The sort is iterative, so it does not depend on the recursion limit, and
    an empty list or an ``i`` at or past the end is a no-op rather than an
    ``IndexError``.
    """
    if i < len(m) - 1:
        # Slice assignment keeps the caller's list object and mutates it in
        # place, as the original element swaps did.
        m[i:] = sorted(m[i:], key=lambda item: float(item[1]), reverse=True)
    return m
 
if __name__ == '__main__':
    # Script entry point: build the Chinese-character report from the two
    # sample files, then run the English counter on the first one.
    # NOTE(review): wordEn is not defined in the visible part of this file;
    # confirm it exists elsewhere before running this as a script.
    han_sources = ['test1.txt', 'test2.txt']
    wordHan(han_sources, writing='w')
    wordEn('test1.txt', writing='w')