特徵重要度展示
阿新 • 發佈:2018-12-13
RF評價特徵重要度,畫出特徵排行
"""Train a RandomForest classifier, report feature importances, and plot the top ranks.

Loads a pickled DataFrame with a binary "KILLED" target, fits a random forest,
prints a classification report, and draws a horizontal bar chart of the most
important features.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report


def read_data():
    """Load the pickled dataset and return it with a 70/30 train/test split.

    Returns:
        (df, X_train, X_test, y_train, y_test) where "KILLED" is the target
        column and the remaining columns are the features.
    """
    df = pd.read_pickle("./output/killed_collision_normal2class.pkl")
    # random_state fixed so the split (and downstream scores) are reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        df.drop(columns=["KILLED"]), df["KILLED"], test_size=0.3, random_state=0
    )
    return df, X_train, X_test, y_train, y_test


# Load the dataset at import time (side effect kept from the original script);
# the fitted splits are shared module-level state used by feature_importance().
pd_data, X_train, X_test, y_train, y_test = read_data()


def feature_importance(features_num=20):
    """Fit a random forest and print/plot the top `features_num` importances.

    Args:
        features_num: how many of the highest-ranked features to display.
            If it exceeds the number of available features, a message is
            printed and nothing is computed.
    """
    if features_num > X_train.shape[1]:
        print("the features num is too big for the trainData")
        return
    # Cap max_features at the actual feature count: a hard-coded 20 would make
    # fit() raise on datasets with fewer than 20 columns.
    forest = RandomForestClassifier(
        n_estimators=500,
        random_state=0,
        n_jobs=-1,
        max_features=min(20, X_train.shape[1]),
    )
    forest.fit(X_train, y_train)
    y_true, y_pred = y_test, forest.predict(X_test)
    print(classification_report(y_true, y_pred))

    importance = forest.feature_importances_
    # argsort ascending, reversed -> indices of features from most to least important.
    indices = np.argsort(importance)[::-1]
    print("----the importance of features and its importance_score------")

    features_names = []
    im_list = []
    for rank, i in enumerate(indices[:features_num], start=1):
        f_name = X_train.columns.values[i]
        print(rank, f_name, importance[i])
        features_names.append(f_name)
        im_list.append(importance[i])
    draw_importance(features_names, im_list)


def draw_importance(features, importances):
    """Draw a horizontal bar chart of feature importances, least to greatest.

    Args:
        features: sequence of feature names.
        importances: matching sequence of importance scores.
    """
    # Ascending order so the largest bar ends up at the top of the barh chart.
    order = np.argsort(importances)
    print(order)
    print(features)
    plt.title('Feature Importances')
    plt.barh(range(len(order)), np.array(importances)[order],
             color='b', align='center')
    plt.yticks(range(len(order)), np.array(features)[order])
    plt.xlabel('Relative Importance')
    plt.show()


if __name__ == "__main__":
    feature_importance()