
Building decision tree and SVM models (a financial dataset)

Decision tree and SVM models built on a financial dataset.

# Import the required packages
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score,  f1_score
from matplotlib import pyplot as plt
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
# The data is not UTF-8 encoded; read it with gbk encoding, otherwise pd.read_csv raises an error
data = pd.read_csv('./data.csv', index_col=0, encoding='gbk')


# Pull out the y label column on its own, and keep the remaining 88 columns as X
y = data['status']
X = data.drop('status', axis=1)
# Shape of X, and the class distribution of y
print('X.shape:', X.shape)
print('Distribution of y\n', y.value_counts())
X.shape: (4754, 88)
Distribution of y
 0    3561
1    1193
Name: status, dtype: int64
# First drop some obviously useless features: id_name, custid, trade_no, bank_card_no
X.drop(['id_name', 'custid', 'trade_no', 'bank_card_no'], axis=1, inplace=True)
print(X.shape)
(4754, 84)
# Select the numeric features
X_num = X.select_dtypes('number').copy()
print(X_num.shape)
type(X_num.mean())
(4754, 80)
pandas.core.series.Series
# Fill missing values with the column means
X_num.fillna(X_num.mean(), inplace=True)
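Filling with the column means works; as a sketch (my own addition, assuming scikit-learn >= 0.20), sklearn's SimpleImputer does the same thing but can later be reused on new data inside a pipeline:

from sklearn.impute import SimpleImputer

# Learn the per-column means on X_num and fill NaNs with them
imputer = SimpleImputer(strategy='mean')
X_num_imputed = pd.DataFrame(imputer.fit_transform(X_num),
                             columns=X_num.columns, index=X_num.index)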

# Look at the non-numeric variables
X_str = X.select_dtypes(exclude='number').copy()
X_str.describe()
reg_preference_for_trad source latest_query_time loans_latest_time
count 4752 4754 4450 4457
unique 5 1 207 232
top 一線城市 xs 2018-04-14 2018-05-03
freq 3403 4754 423 134
# Replace reg_preference with dummy variables; the other three columns are dropped
X_str['reg_preference_for_trad'] = X_str['reg_preference_for_trad'].fillna(X_str['reg_preference_for_trad'].mode()[0])
X_str_dummy = pd.get_dummies(X_str['reg_preference_for_trad'])
X_str_dummy.head()
#X_str.drop(['latest_query_time'],axis=1,inplace=True)
一線城市 三線城市 二線城市 其他城市 境外
5 1 0 0 0 0
10 1 0 0 0 0
12 1 0 0 0 0
13 0 1 0 0 0
14 1 0 0 0 0
X_cl = pd.concat([X_num, X_str_dummy], axis=1, sort=False)
X_cl.shape

(4754, 85)
# Split into training and test sets at a 70/30 ratio
random_state = 1115
X_train, X_test, y_train, y_test = train_test_split(X_cl, y, test_size=0.3, random_state=random_state)
print(X_train.shape)
print(X_test.shape)

(3327, 85)
(1427, 85)
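Since status is imbalanced (roughly 3:1, as shown above), a stratified split keeps the class ratio consistent across the training and test sets; a minimal sketch of that option:

# Stratify on y so both sets keep the same 0/1 proportion
X_train, X_test, y_train, y_test = train_test_split(
    X_cl, y, test_size=0.3, random_state=random_state, stratify=y)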
# SVC model. I don't understand why both SVC and LinearSVC produced predictions with an F1 of 0. Is it just because the data wasn't normalized during preprocessing?
"""
svc = SVC(C=1.0, kernel='rbf', gamma=0.1)
svc.fit(X_train, y_train)

# LinearSVC model
Lin_SVC = LinearSVC()
Lin_SVC.fit(X_train,y_train)
"""
# Decision tree model

clf = DecisionTreeClassifier(max_depth=4)
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
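GridSearchCV is imported above but never used; a minimal sketch of tuning the tree with it (the parameter ranges here are my own guess, not values from the original run):

param_grid = {'max_depth': [3, 4, 5, 6, 8],
              'min_samples_leaf': [1, 5, 10, 20]}
grid = GridSearchCV(DecisionTreeClassifier(random_state=random_state),
                    param_grid, scoring='f1', cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_, grid.best_score_)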
# Evaluation
# F1 score and ROC AUC on the training and test predictions
"""
y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)
"""
"""
# LinearSVC
y_train_pred = Lin_SVC.predict(X_train)
y_test_pred = Lin_SVC.predict(X_test)
"""
# Decision tree

y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print('F1 score:')
print('Training set: {:.4f}'.format(f1_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(f1_score(y_test, y_test_pred)))
print('ROC AUC:')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_pred)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_pred)))

F1 score:
Training set: 0.4083
Test set: 0.3992
ROC AUC:
Training set: 0.6227
Test set: 0.6166
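One note on the evaluation: roc_auc_score above is given hard 0/1 predictions; feeding it the predicted probability of the positive class is usually more informative, since AUC measures ranking quality. A minimal sketch:

# Probability of class 1 from the fitted tree
y_train_proba = clf.predict_proba(X_train)[:, 1]
y_test_proba = clf.predict_proba(X_test)[:, 1]
print('ROC AUC (probabilities):')
print('Training set: {:.4f}'.format(roc_auc_score(y_train, y_train_proba)))
print('Test set: {:.4f}'.format(roc_auc_score(y_test, y_test_proba)))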

Question:
I don't understand why the SVM and linear SVM predictions get an F1 score of 0. Is it just because the data wasn't normalized?
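A likely explanation (my assumption, not verified on this dataset): SVMs are sensitive to feature scale, and without standardization an RBF-kernel SVC on these wide-ranging financial features tends to predict only the majority class, which makes F1 for the positive class 0. A minimal sketch of retrying SVC with scaling, plus class weighting for the imbalance:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features, then fit an RBF SVC; class_weight='balanced' offsets the ~3:1 imbalance
svc_pipe = make_pipeline(StandardScaler(),
                         SVC(C=1.0, kernel='rbf', gamma='scale', class_weight='balanced'))
svc_pipe.fit(X_train, y_train)
print('Test F1: {:.4f}'.format(f1_score(y_test, svc_pipe.predict(X_test))))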