Xgboost篩選特徵重要性
阿新 • • 發佈:2019-01-10
基本思想
根據結構分數的增益情況計算出來選擇哪個特徵的哪個分割點,某個特徵的重要性,就是它在所有樹中出現的次數之和。
使用程式碼
import pandas as pd
import xgboost as xgb
import operator
from matplotlib import pylab as plt
def ceate_feature_map(features):
    """Write an XGBoost feature-map file ('xgb.fmap') for *features*.

    Each line has the form '<index>\\t<name>\\tq'; the trailing 'q' marks
    the feature as quantitative, which Booster.get_fscore(fmap=...) uses
    to map split indices back to feature names.

    NOTE(review): the name keeps the original typo ('ceate' vs 'create')
    so the existing call site below keeps working.
    """
    # 'with' guarantees the file is closed even if a write raises;
    # enumerate replaces the original hand-rolled counter.
    with open('xgb.fmap', 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))
def get_data():
    """Load the training CSV and target-encode its categorical columns.

    Returns (features, x_train, y_train):
      features -- list of column names from the third column onward,
      x_train  -- DataFrame of those columns with string columns replaced
                  by the mean 'Hazard' of each category (target encoding),
      y_train  -- the 'Hazard' target column.

    # assumes ../input/train.csv has an id-like column, then 'Hazard',
    # then the feature columns — TODO confirm against the actual file.
    """
    train = pd.read_csv("../input/train.csv")
    features = list(train.columns[2:])
    y_train = train.Hazard
    # Replace each category with the mean Hazard for that category.
    # Plain assignment instead of the original
    # `train[feat].replace(m, inplace=True)`: an inplace replace on a
    # column selection can silently operate on a copy
    # (SettingWithCopyWarning) and inplace= is deprecated in modern pandas.
    for feat in train.select_dtypes(include=['object']).columns:
        m = train.groupby([feat])['Hazard'].mean()
        train[feat] = train[feat].replace(m)
    x_train = train[features]
    return features, x_train, y_train
def get_data2():
    """Return (features, x_train, y_train) built from sklearn's iris dataset."""
    from sklearn.datasets import load_iris

    # Fetch the bundled iris data.
    iris = load_iris()
    features = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    # Name the columns up front instead of assigning them afterwards.
    x_train = pd.DataFrame(iris.data, columns=features)
    y_train = pd.DataFrame(iris.target)
    return features, x_train, y_train
#features, x_train, y_train = get_data()
features, x_train, y_train = get_data2()
ceate_feature_map(features)

# NOTE(review): 'reg:linear' was renamed 'reg:squarederror' in XGBoost >= 0.83
# and will warn/fail on newer releases; kept as-is so behavior on the
# original install is unchanged — confirm your XGBoost version.
xgb_params = {"objective": "reg:linear", "eta": 0.01, "max_depth": 8, "seed": 42, "silent": 1}
num_rounds = 1000
dtrain = xgb.DMatrix(x_train, label=y_train)
xgb_model = xgb.train(xgb_params, dtrain, num_rounds)

# Importance = number of times each feature appears as a split across all
# trees, mapped back to names via the feature-map file, ascending order.
importance = xgb_model.get_fscore(fmap='xgb.fmap')
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalize to relative importance (fractions summing to 1).
df['fscore'] = df['fscore'] / df['fscore'].sum()

# Bug fix: the original called plt.figure() and a bare df.plot() first,
# which opened a stray, unlabeled extra figure before the bar chart.
df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(16, 10))
plt.title('XGBoost Feature Importance')
plt.xlabel('relative importance')
plt.gcf().savefig('feature_importance_xgb.png')
# Second, stand-alone example: plotting importances from a fitted
# sklearn-wrapper model.  NOTE(review): `clf` is not defined anywhere in
# this file — presumably a fitted xgboost.XGBClassifier/XGBRegressor from
# another snippet; confirm before running.
import pandas as pd
import matplotlib.pylab as plt
# Old (pre-0.8x) wrapper API: booster() was a method returning the Booster.
feat_imp = pd.Series(clf.booster().get_fscore()).sort_values(ascending=False)
# Newer versions need get_booster() and conversion to a dict or list:
#feat_imp = pd.Series(dict(clf.get_booster().get_fscore())).sort_values(ascending=False)
#plt.bar(feat_imp.index, feat_imp)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()