Python 讀取 HDFS 並返回 DataFrame
阿新 • • 發佈:2019-01-29
不多說,直接上程式碼
from hdfs import Client
import pandas as pd
# Base URL handed to hdfs.Client (placeholder host).
HDFSHOST = "http://xxx:50070"
# Path of the file on HDFS.
FILENAME = "/tmp/preprocess/part-00000"
# Column names passed to pandas.read_csv(names=...); placeholder value.
# The original literal `[xx']` had an unbalanced quote and did not parse.
COLUMNNAMES = ['xx']
def readHDFS():
    """Read the HDFS file at FILENAME and return it as a DataFrame.

    The data is fetched in three steps:
      1. read the raw byte stream from HDFS,
      2. save the decoded text locally as ``data/tmp/data.csv``,
      3. load that CSV file with pandas.

    Returns:
        pandas.DataFrame: the HDFS data, with columns named by COLUMNNAMES.

    Raises:
        UnicodeDecodeError: if the HDFS content is not valid UTF-8.
    """
    import os  # local import: only needed to prepare the output directory

    local_path = "data/tmp/data.csv"
    client = Client(HDFSHOST)

    # Step 1: pull the whole file from HDFS as bytes and decode it.
    with client.read(FILENAME) as reader:
        content = reader.read()
    text = str(content, 'utf-8')

    # Step 2: write the local copy. The directory is created if missing.
    # The `with` block guarantees the file is flushed and closed before
    # pandas opens it; previously the handle was left open, so read_csv
    # could see an empty or truncated file.
    # encoding matches the decode above; newline="" writes the text as-is,
    # with no platform line-ending translation.
    os.makedirs(os.path.dirname(local_path), exist_ok=True)
    with open(local_path, "w", encoding="utf-8", newline="") as csv_file:
        csv_file.write(text)

    # Step 3: parse the local CSV with the configured column names.
    return pd.read_csv(local_path, names=COLUMNNAMES)