Python 統計中文單詞
阿新 • • 發佈:2019-01-30
#coding:UTF-8
import sys
# Raise the interpreter's recursion limit far above its default so that deeply
# recursive code in this module does not stop with a recursion-depth error.
# NOTE(review): presumably needed by the recursive sort helper - confirm.
# A limit this large effectively removes the safety net: runaway recursion can
# overflow the C stack and crash the process instead of raising an exception.
sys.setrecursionlimit(100000000)
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of each Chinese character in the inputs.

    Every file named in ``inIo`` is read, decoded as UTF-8 and concatenated.
    For each distinct character whose code point lies in U+4E00..U+9FA5
    (19968..40869) one line of the form ``<char>--><frequency>`` is written
    to ``outIo``. The frequency is the character's occurrence count divided
    by the total number of characters read (Chinese or not). The total
    character count is also printed to standard output.

    Args:
        inIo: Iterable of paths of UTF-8 encoded text files to read.
        outIo: Path of the report file to write.
        writing: Text mode used to open ``outIo`` (for example ``'w'`` to
            overwrite or ``'a'`` to append).

    Raises:
        IOError/OSError: If a file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Function-scope imports keep this block self-contained.
    import io
    from collections import Counter

    # Read raw bytes and decode once per file; ``with`` guarantees the handle
    # is closed, and joining avoids repeated string concatenation.
    pieces = []
    for fg in inIo:
        with open(fg, 'rb') as src:
            pieces.append(src.read().decode('utf-8'))
    s = u''.join(pieces)

    total = len(s)
    # Single-argument call form prints the same text on Python 2 and 3.
    print("一共 %d 單詞" % total)

    # Count every character in one pass instead of one s.count() scan per
    # distinct character.
    counts = Counter(s)
    m = [
        [ch + u"-->", str(n * 1.0 / total)]
        for ch, n in counts.items()
        if 19968 <= ord(ch) <= 40869
    ]

    # With no Chinese characters there is nothing to sort and m[0] would
    # raise IndexError; an empty report is written instead.
    if m:
        m = wordsort(m[0], m, 0)

    # An explicit UTF-8 text stream is required to write the non-ASCII
    # characters regardless of the platform's default encoding.
    with io.open(outIo, writing, encoding='utf-8') as w:
        for entry in m:
            w.write(u"".join(entry) + u"\n")
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place by descending numeric value of item 1.

    Each element of ``m`` is a sequence whose second item is a number or a
    numeric string (``wordHan`` passes ``[label, str(frequency)]`` pairs).
    Elements before index ``i`` are left untouched.

    Args:
        x: Unused; kept so existing callers that pass ``m[i]`` keep working.
        m: List of entries to reorder. It is modified in place.
        i: Index of the first element that takes part in the sort.

    Returns:
        The same list object ``m``, with ``m[i:]`` ordered from the highest
        to the lowest ``float(entry[1])``. Entries with equal values keep
        their relative input order.

    Raises:
        ValueError: If an entry's second item cannot be converted to float.
    """
    # A single stable library sort replaces the recursive selection sort:
    # no stack frame per element, and an empty tail is simply a no-op.
    # Slice assignment keeps the reordering in place on the caller's list.
    m[i:] = sorted(m[i:], key=lambda entry: float(entry[1]), reverse=True)
    return m
if __name__ == '__main__':
    # Report Chinese-character frequencies for the two sample files,
    # overwriting the default report file.
    wordHan(['test1.txt', 'test2.txt'], writing='w')
    # NOTE(review): wordEn is not defined in the visible part of this file -
    # confirm it exists elsewhere, otherwise this call raises NameError.
    wordEn('test1.txt', writing='w')
import sys
# Raise the interpreter's recursion limit far above its default so that deeply
# recursive code in this module does not stop with a recursion-depth error.
# NOTE(review): presumably needed by the recursive sort helper - confirm.
# A limit this large effectively removes the safety net: runaway recursion can
# overflow the C stack and crash the process instead of raising an exception.
sys.setrecursionlimit(100000000)
def wordHan(inIo, outIo='wordcountHAN.txt', writing='w'):
    """Write the relative frequency of each Chinese character in the inputs.

    Every file named in ``inIo`` is read, decoded as UTF-8 and concatenated.
    For each distinct character whose code point lies in U+4E00..U+9FA5
    (19968..40869) one line of the form ``<char>--><frequency>`` is written
    to ``outIo``. The frequency is the character's occurrence count divided
    by the total number of characters read (Chinese or not). The total
    character count is also printed to standard output.

    Args:
        inIo: Iterable of paths of UTF-8 encoded text files to read.
        outIo: Path of the report file to write.
        writing: Text mode used to open ``outIo`` (for example ``'w'`` to
            overwrite or ``'a'`` to append).

    Raises:
        IOError/OSError: If a file cannot be opened.
        UnicodeDecodeError: If an input file is not valid UTF-8.
    """
    # Function-scope imports keep this block self-contained.
    import io
    from collections import Counter

    # Read raw bytes and decode once per file; ``with`` guarantees the handle
    # is closed, and joining avoids repeated string concatenation.
    pieces = []
    for fg in inIo:
        with open(fg, 'rb') as src:
            pieces.append(src.read().decode('utf-8'))
    s = u''.join(pieces)

    total = len(s)
    # Single-argument call form prints the same text on Python 2 and 3.
    print("一共 %d 單詞" % total)

    # Count every character in one pass instead of one s.count() scan per
    # distinct character.
    counts = Counter(s)
    m = [
        [ch + u"-->", str(n * 1.0 / total)]
        for ch, n in counts.items()
        if 19968 <= ord(ch) <= 40869
    ]

    # With no Chinese characters there is nothing to sort and m[0] would
    # raise IndexError; an empty report is written instead.
    if m:
        m = wordsort(m[0], m, 0)

    # An explicit UTF-8 text stream is required to write the non-ASCII
    # characters regardless of the platform's default encoding.
    with io.open(outIo, writing, encoding='utf-8') as w:
        for entry in m:
            w.write(u"".join(entry) + u"\n")
def wordsort(x, m, i):
    """Sort ``m[i:]`` in place by descending numeric value of item 1.

    Each element of ``m`` is a sequence whose second item is a number or a
    numeric string (``wordHan`` passes ``[label, str(frequency)]`` pairs).
    Elements before index ``i`` are left untouched.

    Args:
        x: Unused; kept so existing callers that pass ``m[i]`` keep working.
        m: List of entries to reorder. It is modified in place.
        i: Index of the first element that takes part in the sort.

    Returns:
        The same list object ``m``, with ``m[i:]`` ordered from the highest
        to the lowest ``float(entry[1])``. Entries with equal values keep
        their relative input order.

    Raises:
        ValueError: If an entry's second item cannot be converted to float.
    """
    # A single stable library sort replaces the recursive selection sort:
    # no stack frame per element, and an empty tail is simply a no-op.
    # Slice assignment keeps the reordering in place on the caller's list.
    m[i:] = sorted(m[i:], key=lambda entry: float(entry[1]), reverse=True)
    return m
if __name__ == '__main__':
    # Report Chinese-character frequencies for the two sample files,
    # overwriting the default report file.
    wordHan(['test1.txt', 'test2.txt'], writing='w')
    # NOTE(review): wordEn is not defined in the visible part of this file -
    # confirm it exists elsewhere, otherwise this call raises NameError.
    wordEn('test1.txt', writing='w')