1. 程式人生 > 其它 > 08 分散式計算MapReduce--詞頻統計

08 分散式計算MapReduce--詞頻統計

import collections
import re
def count_word(path, encoding="utf-8"):
    """Count how often each word occurs in a text file.

    The text is lower-cased, the punctuation marks ``" , . ! ?`` are
    replaced by spaces, and the result is split on whitespace.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8
            so the result does not depend on the platform's locale.

    Returns:
        A plain ``dict`` mapping each word to its number of occurrences,
        with keys in order of first appearance in the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, encoding=encoding) as file_process:
        texts = file_process.read()

    # Lower-case so that "The" and "the" are counted as the same word.
    texts = texts.lower()

    # Replace punctuation with spaces so it does not stick to the words.
    # A raw string avoids the invalid "\." / "\?" escape sequences.
    texts = re.sub(r'[",.!?]', " ", texts)

    # Counter keeps first-seen order; convert back to a plain dict so the
    # return type is the same as before.
    return dict(collections.Counter(texts.split()))

def sort_by_count(d):
    """Order a word-count mapping from most to least frequent.

    Args:
        d: Mapping of word to count.

    Returns:
        A ``collections.OrderedDict`` holding the same items as ``d``,
        sorted by count in descending order. Items with equal counts keep
        the relative order they had in ``d``.
    """
    # reverse=True keeps the sort stable, so ties stay in input order,
    # exactly as sorting ascending on the negated count would.
    ranked = sorted(d.items(), key=lambda item: item[1], reverse=True)
    return collections.OrderedDict(ranked)
# Input file whose words are counted.
file_name = "./english.txt"

# Count the words, then order them from most to least frequent.
dword = count_word(file_name)
dword = sort_by_count(dword)

# Print one "word:count" line per word.
for key, value in dword.items():
    print(key, value, sep=":")