單因子分析的 Python 實現
阿新 • • 發佈:2019-01-05
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Single-factor (univariate) exploratory analysis of the HR data set.

The script walks through each column of ``HR.csv`` in turn: missing-value
handling, outlier removal with the interquartile-range (IQR) rule,
distribution checks, and finally a simple per-department comparison.
"""
import numpy as np
import pandas as pd

DATA_PATH = "./HR.csv"
IQR_FACTOR = 1.5


def iqr_bounds(series, k=IQR_FACTOR):
    """Return the ``(lower, upper)`` outlier fences of *series*.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` where
    ``IQR = Q3 - Q1``.

    Args:
        series: Numeric ``pandas.Series``.
        k: Fence multiplier applied to the interquartile range.

    Returns:
        A ``(lower, upper)`` tuple of floats.
    """
    q_low = series.quantile(q=0.25)
    q_high = series.quantile(q=0.75)
    q_interval = q_high - q_low
    return q_low - k * q_interval, q_high + k * q_interval


def filter_by_iqr(series, k=IQR_FACTOR):
    """Return the values of *series* lying strictly inside its IQR fences.

    Missing values are dropped as well, because every comparison against
    NaN evaluates to False.
    """
    lower, upper = iqr_bounds(series, k)
    # One combined mask: chaining ``s[m1][m2]`` would apply ``m2`` to an
    # already-shrunk object whose index no longer matches the mask.
    return series[(series > lower) & (series < upper)]


def clean_hr_data(df, k=IQR_FACTOR):
    """Return a cleaned copy of the HR frame.

    Steps, in order:
      1. drop every row that contains a missing value;
      2. drop rows whose ``last_evaluation`` exceeds 1 or whose ``salary``
         is the bogus category ``"nme"``;
      3. drop rows whose ``last_evaluation`` falls outside the IQR fences
         computed on the data remaining after step 2.

    Args:
        df: Frame with at least ``last_evaluation`` and ``salary`` columns.
        k: Fence multiplier for the IQR rule.

    Returns:
        A new, filtered ``pandas.DataFrame``; *df* itself is not modified.
    """
    df = df.dropna(axis=0, how="any")
    # The result must be kept: otherwise the quantiles below would still be
    # computed on data that contains the gross outliers.
    df = df[(df["last_evaluation"] <= 1) & (df["salary"] != "nme")]
    le_s = df["last_evaluation"]
    lower, upper = iqr_bounds(le_s, k)
    return df[(le_s > lower) & (le_s < upper)]


def main():
    """Run the step-by-step exploration and print each intermediate result."""
    # --- Load the data -----------------------------------------------------
    df = pd.read_csv(DATA_PATH, header=0)

    # Overview of the data structure.
    print(df.describe())

    # Means; numeric_only skips the string columns (salary, department).
    print(df.mean(axis=1, numeric_only=True))
    print(df.mean(numeric_only=True))

    # --- Selecting data ----------------------------------------------------
    # By column / by positional row slice.
    print(df["satisfaction_level"].head())
    print(df[0:3])
    # By label (``loc`` slices include the end label).
    print(df.loc[0:3])
    print(df.loc[0, ["satisfaction_level"]])

    # --- 1. satisfaction_level: missing values -----------------------------
    sl_l = df["satisfaction_level"]
    print(df[df["satisfaction_level"].isnull()])
    print(sl_l.isnull().sum())
    print(sl_l[sl_l.isnull()])
    # Missing values could instead be filled, e.g. ``sl_l.fillna(value=5)``;
    # here they are simply discarded.
    sl_l = sl_l.dropna(how="any")
    print(sl_l.describe())

    # --- 2. last_evaluation: values that are too large / too small ---------
    le_s = df["last_evaluation"]
    print(le_s[le_s.isnull()])
    print(le_s.isnull().sum())
    print(le_s.skew())  # skewness
    print(le_s.kurt())  # kurtosis
    # Remove the gross outliers first, then apply the quartile fences.
    le_s = le_s[le_s <= 1]
    le_s = filter_by_iqr(le_s)
    # Distribution over ten equal-width bins on [0, 1].
    print(np.histogram(le_s.values, bins=np.arange(0.0, 1.1, 0.1)))

    # --- 3. number_project: proportions, sorted by value -------------------
    np_s = df["number_project"]
    print(np_s.value_counts(normalize=True).sort_index())

    # --- 4. promotion_last_5years: distribution ----------------------------
    pl5_s = df["promotion_last_5years"]
    print(pl5_s.value_counts())
    print(pl5_s.value_counts(normalize=True))

    # --- 5. salary: conditional filtering ----------------------------------
    s_s = df["salary"]
    print(s_s.where(s_s != "nme").dropna())

    # --- Summary: the whole cleaning pipeline in one go --------------------
    df = clean_hr_data(pd.read_csv(DATA_PATH, header=0))
    print(df)

    # --- Simple comparison analysis ----------------------------------------
    print(df.groupby("department").mean(numeric_only=True))


if __name__ == "__main__":
    main()
待續……