python使用dataframe統計的一個小指令碼:
阿新 • • 發佈:2018-12-15
此指令碼讀取 JSON 檔案，將每一行解析為字典並載入為 DataFrame，再依列名逐欄進行統計，最後將統計結果輸出到結果檔案。
# Data-quality summary script.
#
# Reads the input directory and the output path from the [config] section of
# a config file, walks the input directory, parses every file as JSON Lines
# (one JSON object per line), and for each column of each file records the
# row count, the number of '' / 'NULL' values and their percentage, the
# distinct-value count, and the five most frequent values with their counts.
# The collected rows are appended to the output CSV.
import io
import json
import os

import pandas as pd

try:  # Python 3 module name
    import configparser
except ImportError:  # Python 2 name, as used by the original script
    import ConfigParser as configparser

# NOTE: the original had `from __future__ import division` after the other
# imports. A __future__ import that is not the first statement of a module is
# a SyntaxError, so it is dropped; the ratio below uses explicit float
# arithmetic and therefore gives true division on Python 2 and 3 alike.

# NOTE(review): probably meant to be `__metaclass__ = type`; as spelled it is
# just a module-level variable. Kept so the module's names are unchanged.
_metaclass_ = type

# Column labels of the result frame (not written out: header=False below).
COLUMNS = ['tablename', 'columnName', '總行數', '無效記錄數', '無效佔比', '去重行數', '樣例資料', '樣例資料對應條數']

# Human-readable header labels, emitted as the first *data* row of the CSV.
HEADER_ROW = ['表名稱', '欄位名稱', '總行數', '空值行數', '空值佔比', '去重後行數', '樣例資料', '樣例資料對應條數']


def collect_files(root):
    """Return the paths of all files found under *root*, recursively."""
    return [
        os.path.join(maindir, filename)
        for maindir, _subdirs, filenames in os.walk(root)
        for filename in filenames
    ]


def load_records(path):
    """Parse the JSON Lines file at *path* into a DataFrame, one row per line.

    Blank lines are skipped. A line that is not valid JSON raises the
    ValueError produced by ``json.loads``.
    """
    records = []
    # io.open accepts `encoding` on both Python 2 and 3, and the context
    # manager closes the handle (the original never closed it).
    with io.open(path, encoding="utf-8") as handle:
        for line in handle:
            if line.strip():
                records.append(json.loads(line))
    # Build the frame once instead of appending row by row, which is
    # quadratic and relies on DataFrame.append (removed in pandas 2.0).
    return pd.DataFrame(records)


def summarize_table(df, tablename):
    """Return one statistics row per column of *df*.

    Each row is a list ordered like ``COLUMNS``: table name, column name,
    total rows, count of '' / 'NULL' values, that count as a percentage of
    total rows, distinct-value count, the (up to) five most frequent values,
    and the counts of those values.
    """
    sumcounts = len(df)
    rows = []
    for name in df.columns:
        top = df[name].value_counts().head(5)
        # Read values and counts straight off the Series; the column names
        # produced by value_counts().reset_index() differ between pandas
        # versions.
        ylsj = top.index.tolist()
        ylsjcount = top.tolist()
        discount = len(df.drop_duplicates([name]))
        # Only the literal strings '' and 'NULL' are treated as invalid.
        # NOTE(review): JSON null (None/NaN) is not counted - confirm intent.
        nullcounts = len(df[df[name].isin(['', 'NULL'])])
        # Guard the zero-row case; 100.0 forces float division on Python 2.
        ratio = nullcounts * 100.0 / sumcounts if sumcounts else 0.0
        rows.append([tablename, name, sumcounts, nullcounts, ratio,
                     discount, ylsj, ylsjcount])
    return rows


def main(config_path="config.conf"):
    """Run the summary for the paths configured in *config_path*.

    The config file must have a ``[config]`` section with ``inpath`` (the
    directory to scan) and ``outpath`` (the CSV file to append to).
    """
    cf = configparser.ConfigParser()
    cf.read(config_path)
    inpath = cf.get("config", "inpath")
    outpath = cf.get("config", "outpath")
    print(inpath)

    file_paths = collect_files(inpath)
    print(file_paths)

    all_rows = [HEADER_ROW]
    for path in file_paths:
        # Skip marker files whose path contains "_SUCCESS".
        if "_SUCCESS" in path:
            continue
        # A fresh frame per file: the original reused a single frame across
        # files, so every file's statistics also included all rows of the
        # files processed before it.
        df = load_records(path)
        # basename works with either path separator; the original split on
        # '\\' only, which yields the whole path on non-Windows systems.
        tablename = os.path.basename(path)
        print(len(df))
        for row in summarize_table(df, tablename):
            print(pd.DataFrame([row], columns=COLUMNS))
            all_rows.append(row)

    results = pd.DataFrame(all_rows, columns=COLUMNS)
    results.to_csv(outpath, index=False, mode='a', header=False,
                   encoding="utf-8")


if __name__ == "__main__":
    main()
版權所有!