spark wordcount 單詞統計
阿新 • • 發佈:2018-12-17
spark wordcount 單詞統計
輸入檔案 1.txt 的內容如下:
hello world
hello tom
hello lucy
tom lucy
hello python
# -*- coding:utf-8 -*-
"""Count word occurrences in a text file with PySpark.

Reads ``inputpath``, counts how often each whitespace-separated word
occurs, prints the counts, and writes them to the ``outputpath`` directory
(an existing output directory is removed first).
"""
import os
import shutil

inputpath = '1.txt'
outputpath = 'result'


def split_words(line):
    """Return the words of *line*, split on runs of whitespace.

    Unlike ``line.split(' ')`` this never yields empty strings, so blank
    lines and repeated or trailing spaces are not counted as a "word".
    """
    return line.split()


def main():
    """Run the word count job on a local Spark context."""
    # Imported here so this module (and split_words) stays importable on a
    # machine without Spark installed.
    from pyspark import SparkContext

    sc = SparkContext('local', 'wordcount')
    try:
        # Read the input file as an RDD of lines.
        lines = sc.textFile(inputpath)
        # Split every line into words.
        words = lines.flatMap(split_words)
        # Map each word to a (word, 1) pair and sum the ones per word.
        counts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

        # Print the results: first the raw list, then one "word count" per line.
        result = counts.collect()
        print(result)
        for word, count in result:
            # %-formatting prints "word count" identically on Python 2 and 3.
            print('%s %s' % (word, count))

        # Spark refuses to write into an existing output directory, so drop
        # any stale results first (best effort, errors are ignored).
        if os.path.exists(outputpath):
            shutil.rmtree(outputpath, True)
        # Write the counts to the result directory.
        counts.saveAsTextFile(outputpath)
    finally:
        # Always release the Spark context, even if the job fails.
        sc.stop()


if __name__ == '__main__':
    main()