構建決策樹和svm模型(某金融資料集)
阿新 • 發佈:2018-12-22
根據金融資料集作出的決策樹和svm模型
# 匯入需要的包
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, f1_score
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn. svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
# The file is not UTF-8; read it with GBK encoding, otherwise decoding fails.
data = pd.read_csv('./data.csv', index_col=0, encoding='gbk')
# Separate the target column `status` (y) from the remaining 88 feature columns (X).
y = data['status']
X = data.drop(columns='status')
# Report the feature-matrix dimensions and the class distribution of y.
print('X.shape:', X.shape)
print('y 的分佈\n', y.value_counts())
X.shape: (4754, 88)
y 的分佈
0 3561
1 1193
Name: status, dtype: int64
# Drop identifier-like columns first — they carry no predictive signal.
id_like_columns = ['id_name', 'custid', 'trade_no', 'bank_card_no']
X.drop(columns=id_like_columns, inplace=True)
print(X.shape)
(4754, 84)
# Keep only the numeric feature columns.
X_num = X.select_dtypes(include='number').copy()
print(X_num.shape)
# Column-wise mean of a DataFrame is a pandas Series.
type(X_num.mean())
(4754, 80)
pandas.core.series.Series
# Impute missing numeric values with each column's mean.
column_means = X_num.mean()
X_num.fillna(column_means, inplace=True)
# Inspect the remaining non-numeric columns.
X_str = X.select_dtypes(exclude='number').copy()
X_str.describe()
| | reg_preference_for_trad | source | latest_query_time | loans_latest_time |
|---|---|---|---|---|
| count | 4752 | 4754 | 4450 | 4457 |
| unique | 5 | 1 | 207 | 232 |
| top | 一線城市 | xs | 2018-04-14 | 2018-05-03 |
| freq | 3403 | 4754 | 423 | 134 |
# One-hot encode reg_preference_for_trad; the other three string columns are discarded.
pref = X_str['reg_preference_for_trad']
# Fill the two missing entries with the most frequent category before encoding.
X_str['reg_preference_for_trad'] = pref.fillna(pref.mode()[0])
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()
| | 一線城市 | 三線城市 | 二線城市 | 其他城市 | 境外 |
|---|---|---|---|---|---|
| 5 | 1 | 0 | 0 | 0 | 0 |
| 10 | 1 | 0 | 0 | 0 | 0 |
| 12 | 1 | 0 | 0 | 0 | 0 |
| 13 | 0 | 1 | 0 | 0 | 0 |
| 14 | 1 | 0 | 0 | 0 | 0 |
# Recombine the imputed numeric features with the one-hot city columns.
X_cl = pd.concat([X_num, X_str_dummy], axis='columns', sort=False)
X_cl.shape
(4754, 85)
# Hold out 30% of the samples as a test set (70/30 split) with a fixed seed.
random_state = 1115
splits = train_test_split(X_cl, y, test_size=0.3, random_state=random_state)
X_train, X_test, y_train, y_test = splits
print(X_train.shape)
print(X_test.shape)
(3327, 85)
(1427, 85)
# NOTE(review): the SVC/LinearSVC attempts below produced f1 = 0. SVMs are
# scale-sensitive: with unstandardized features the RBF kernel degenerates and
# the model predicts only the majority class, so the positive-class f1 is 0.
# Standardize X (e.g. sklearn.preprocessing.StandardScaler) before enabling.
"""
svc = SVC(C=1.0, kernel='rbf', gamma=0.1)
svc.fit(X_train, y_train)
# LinearSVC variant
Lin_SVC = LinearSVC()
Lin_SVC.fit(X_train,y_train)
"""
# Decision-tree model. FIX: pass random_state so the fitted tree (and the
# metrics computed from it) are reproducible, consistent with the fixed seed
# already used for the train/test split.
clf = DecisionTreeClassifier(max_depth=4, random_state=random_state)
clf.fit(X_train, y_train)
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
max_features=None, max_leaf_nodes=None,
min_impurity_split=1e-07, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best')
# --- Evaluation ---
# Commented-out SVM prediction paths, kept for reference:
"""
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)
"""
"""
# LinearSVC
y_train_pred = Lin_SVC.predict(X_train)
y_test_pred = Lin_SVC.predict(X_test)
"""
# Decision-tree hard predictions (for f1) and class-1 probabilities (for AUC).
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)
# FIX: roc_auc_score should rank by a continuous score, not hard 0/1 labels —
# feeding predictions understates AUC. Use predict_proba for the positive class.
y_train_score = clf.predict_proba(X_train)[:, 1]
y_test_score = clf.predict_proba(X_test)[:, 1]
# FIX: the original label said 準確性 (accuracy) but f1_score is what is
# computed; label the metric correctly.
print('F1-score:')
print('訓練集:{:.4f}'.format(f1_score(y_train, y_train_pred)))
print('測試集:{:.4f}'.format(f1_score(y_test, y_test_pred)))
print('ROC AUC:')
print('訓練集:{:.4f}'.format(roc_auc_score(y_train, y_train_score)))
print('測試集:{:.4f}'.format(roc_auc_score(y_test, y_test_score)))
準確性:
訓練集:0.4083
測試集:0.3992
ROC AUC:
訓練集:0.6227
測試集:0.6166
問題:
不明白svm以及線性svm作出的預測f1-score為什麼是0,只是因為資料沒有歸一化麼?
(答:是的。SVM 對特徵尺度非常敏感;特徵未標準化時,RBF 核會退化,模型幾乎只預測多數類(0),正類的 f1-score 因此為 0。先用 StandardScaler 標準化特徵後再訓練即可得到非零的 f1。)