1. 程式人生 > >特徵選擇 python實踐

特徵選擇 python實踐

下面介紹兩種之前競賽使用到的特徵選擇方案

方案一

  • 流程圖
    （原文此處為流程圖圖片，轉載抓取時已遺失）

以上方法使用方差、卡方檢驗、決策樹模型輸出特徵重要性方法綜合起來進行特徵選擇,該方案在馬上AI全球挑戰賽中發揮了比較大的作用。我們的解決方案原文附有連結（轉載時連結已遺失），開源是一種精神,僅供大家共同學習交流。

  • python程式碼實現
# coding=utf-8
"""Feature selection combining several scoring strategies.

Each selection method (variance filter, chi-squared scores, SVC-based
recursive feature elimination, extra-trees importances) returns a set of
feature names; the enabled methods' sets are intersected to produce the
final feature list.
"""

import numpy as np
import pandas as pd

# Univariate feature selection (chi-squared scores).
from sklearn.feature_selection import SelectKBest, chi2
# Drop features whose variance is below a threshold.
from sklearn.feature_selection import VarianceThreshold
# Recursive feature elimination with an SVC estimator.
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
# Tree-ensemble feature importances.
from sklearn.ensemble import ExtraTreesClassifier


class FeatureSelection(object):
    """Select the top ``feature_num`` features by several criteria.

    Every ``*_select`` method returns a ``set`` of feature names so the
    individual selections can be combined with set intersection in
    :meth:`return_feature_set`.
    """

    def __init__(self, feature_num):
        # Number of features each individual method should keep.
        self.feature_num = feature_num
        self.train_test, self.label, self.test = self.read_data()
        # Column names of the training features.
        self.feature_name = list(self.train_test.columns)

    def read_data(self):
        """Read the train/test feature CSV files from the working directory.

        Returns:
            (train_test, label, test): training feature frame, target
            frame and test feature frame.
        """
        test = pd.read_csv(r'test_feature.csv', encoding='utf-8')
        train_test = pd.read_csv(r'train_test_feature.csv', encoding='utf-8')
        print('讀取資料完畢。。。')
        label = train_test[['target']]
        # First column of the test frame and first two columns of the
        # train frame are assumed to be id/target columns — TODO confirm
        # against the actual CSV layout.
        test = test.iloc[:, 1:]
        train_test = train_test.iloc[:, 2:]
        return train_test, label, test

    def _top_features(self, scores):
        """Return the names of the ``feature_num`` highest-scoring features.

        Shared by the variance, chi-squared and tree-importance methods
        (the original duplicated this sort-and-slice logic three times).
        """
        ranked = sorted(zip(self.feature_name, scores), key=lambda t: t[1])
        return {name for name, _ in ranked[-self.feature_num:]}

    def variance_threshold(self):
        """Rank features by variance; keep the most variable ones."""
        sel = VarianceThreshold()
        sel.fit_transform(self.train_test)
        return self._top_features(list(sel.variances_))

    def select_k_best(self):
        """Rank features by chi-squared score against the target."""
        ch2 = SelectKBest(chi2, k=self.feature_num)
        ch2.fit(self.train_test, self.label)
        return self._top_features(list(ch2.scores_))

    def svc_select(self):
        """Recursive feature elimination with an RBF-kernel SVC.

        Bug fixes vs. the original:
        - it returned ``rfe.ranking_`` (an integer array) which can never
          intersect with the feature-name sets of the other methods;
          return the names of the selected features instead;
        - ``self.label.ravel()`` raised AttributeError (DataFrames have
          no ``ravel``); use ``.values.ravel()`` to pass a 1-D target.
        """
        svc = SVC(kernel='rbf', C=1, random_state=2018)
        rfe = RFE(estimator=svc, n_features_to_select=self.feature_num, step=1)
        rfe.fit(self.train_test, self.label.values.ravel())
        print(rfe.ranking_)
        # Rank 1 marks the features RFE kept.
        return {name for name, rank
                in zip(self.feature_name, rfe.ranking_) if rank == 1}

    def tree_select(self):
        """Rank features by extra-trees feature importances."""
        clf = ExtraTreesClassifier(n_estimators=300, max_depth=7, n_jobs=4)
        # 1-D target avoids the sklearn column-vector warning.
        clf.fit(self.train_test, self.label.values.ravel())
        return self._top_features(list(clf.feature_importances_))

    def return_feature_set(self, variance_threshold=False, select_k_best=False,
                           svc_select=False, tree_select=False):
        """Intersect the feature sets produced by the enabled methods.

        Bug fix: the original seeded ``names`` only from the variance
        method; whenever ``variance_threshold`` was False every later
        ``intersection`` was taken against the empty set, so the result
        was always ``[]``. Now the first enabled method seeds the set.

        Returns:
            list: feature names kept by every enabled method (empty if
            no method is enabled).
        """
        selected = []
        if variance_threshold:
            selected.append(self.variance_threshold())
        if select_k_best:
            selected.append(self.select_k_best())
        if svc_select:
            selected.append(self.svc_select())
        if tree_select:
            selected.append(self.tree_select())

        names = None
        for feature_set in selected:
            names = feature_set if names is None else names & feature_set
        names = names if names is not None else set()
        print(names)
        return list(names)


selection = FeatureSelection(100)
selection.return_feature_set(variance_threshold=True, select_k_best=True,
                             svc_select=False, tree_select=True)

由於使用SVC方法速度太慢,我就沒有使用它

方案二

方案二是使用遺傳演算法做特徵選擇,演算法原理我就不多闡述了,可以見我另一篇博文,雖然是用遺傳演算法解決Tsp問題,但除了編碼方式不一樣外其它幾乎差不多。

End:如有不當之處,還望不吝賜教