1. 程式人生 > python讀取hdfs並返回dataframe

python讀取hdfs並返回dataframe

不多說,直接上程式碼

from hdfs import Client

import pandas as pd

# Base URL of the HDFS namenode HTTP endpoint passed to hdfs.Client.
HDFSHOST = "http://xxx:50070"

# Path of the file to read on HDFS.
FILENAME = "/tmp/preprocess/part-00000"

# Column names applied to the CSV when it is loaded with pandas
# (placeholder value; the original line was missing its opening quote).
COLUMNNAMES = ['xx']

def readHDFS():
    """Read a file from HDFS and return it as a pandas DataFrame.

    The file at ``FILENAME`` is fetched through an ``hdfs.Client`` connected
    to ``HDFSHOST``, decoded as UTF-8, saved locally as
    ``data/tmp/data.csv`` and then parsed by pandas using ``COLUMNNAMES`` as
    the column names.

    Note: the local directory ``data/tmp`` is not created here; it must
    already exist or ``open`` will raise ``FileNotFoundError``.

    Returns:
        df: DataFrame holding the HDFS data.
    """
    client = Client(HDFSHOST)

    # Current approach for reading an HDFS file:
    #   1. read the raw binary stream from HDFS
    #   2. save that content locally as a .csv file
    #   3. load the .csv file with pandas
    with client.read(FILENAME) as fs:
        content = fs.read()

    s = str(content, 'utf-8')

    # Use a context manager so the file is flushed and closed before pandas
    # opens the same path; write with the same encoding used for decoding.
    with open("data/tmp/data.csv", "w", encoding="utf-8") as csv_file:
        csv_file.write(s)

    df = pd.read_csv("data/tmp/data.csv", names=COLUMNNAMES)
    return df