1. 程式人生 > spark wordcount 單詞統計

spark wordcount 單詞統計

spark wordcount 單詞統計

輸入檔案 1.txt 的內容如下:

hello world
hello tom
hello lucy
tom lucy
hello python
# -*- coding:utf-8 -*-
import os
import shutil

from pyspark import SparkContext

# Word-count script: reads a text file, counts how often each
# whitespace-separated word occurs, prints the counts, and writes them to an
# output directory.
inputpath = '1.txt'
outputpath = 'result'

# Run Spark in local mode under the application name 'wordcount'.
sc = SparkContext('local', 'wordcount')

try:
    # Read the input file as an RDD of lines.
    lines = sc.textFile(inputpath)
    # Split each line into words. split() with no argument splits on any run
    # of whitespace and yields nothing for blank lines, so empty strings are
    # never counted as words (split(' ') would emit '' in those cases).
    words = lines.flatMap(lambda line: line.split())
    # Turn every word into a (word, 1) pair and sum the ones per word.
    counts = words.map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)

    # Bring the results back to the driver and print them.
    result = counts.collect()
    # Parenthesised single-argument print and %-formatting behave the same on
    # Python 2 and Python 3, unlike the bare `print x, y` statement.
    print(result)
    for word, count in result:
        print('%s %s' % (word, count))

    # Remove any previous output directory first; by default Spark refuses to
    # write into an output path that already exists. Removal is best-effort.
    if os.path.exists(outputpath):
        shutil.rmtree(outputpath, ignore_errors=True)

    # Write the counts to the output directory.
    counts.saveAsTextFile(outputpath)
finally:
    # Always shut the context down, even if a step above failed.
    sc.stop()