9.邏輯迴歸-下采樣、過取樣、交叉驗證
阿新 • • 發佈:2018-12-16
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, recall_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# ---------------------------------------------------------------------------
# Load the data and look at the class balance.
# ---------------------------------------------------------------------------
data = pd.read_csv('creditcard.csv')
print(data.shape)
print(data.columns)

# Bar chart of how many rows carry each 'Class' value.
count_classes = data['Class'].value_counts(sort=True)
count_classes.plot(kind='bar')
plt.title('Fraud class histogram')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.show()

# ---------------------------------------------------------------------------
# Feature preparation.
# ---------------------------------------------------------------------------
# Standardize 'Amount'. The scaler wants a 2-D (n, 1) input and returns the
# same shape, so flatten the result before storing it as a single column.
data['new_Amount'] = StandardScaler().fit_transform(
    data['Amount'].values.reshape(-1, 1)
).ravel()

# Drop the columns that are not used as features any more.
data = data.drop(['Time', 'Amount'], axis=1)

# Features: every column except 'Class'. Labels: a one-column DataFrame.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# ---------------------------------------------------------------------------
# Undersampling: keep every Class == 1 row and draw an equally sized random
# subset of the Class == 0 rows.
# ---------------------------------------------------------------------------
# Count and index labels of the Class == 1 (fraud) rows.
number_records_fraud = len(data[data.Class == 1])
fraud_index = np.array(data[data.Class == 1].index)

# Count and index labels of the Class == 0 (normal) rows.
number_records_normal = len(data[data.Class == 0])
normal_index = data[data.Class == 0].index

# Draw as many normal rows as there are fraud rows, without replacement.
# NOTE: no seed is set, so the drawn subset differs from run to run.
random_normal_index = np.random.choice(
    normal_index, number_records_fraud, replace=False
)
random_normal_index = np.array(random_normal_index)

# Combine both sets of labels (the original post reports 492 + 492 = 984 rows).
under_sample_index = np.concatenate([random_normal_index, fraud_index])

# These values are index *labels*, so select with .loc; .iloc would only give
# the same rows while the index happens to be the default 0..n-1 range.
under_sample_data = data.loc[under_sample_index]

X_under_sample_data = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_under_sample_data = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# ---------------------------------------------------------------------------
# Train/test splits.
# ---------------------------------------------------------------------------
# Split of the whole dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Split of the undersampled dataset.
(
    X_train_undersample,
    X_test_undersample,
    y_train_undersample,
    y_test_undersample,
) = train_test_split(
    X_under_sample_data, y_under_sample_data, test_size=0.3, random_state=0
)


def printing_Kfold_scores(x_train_data, y_train_data):
    """Print 5-fold recall for several regularization strengths.

    For each C in [0.01, 0.1, 1, 10, 100] an L1-penalized logistic regression
    is fitted on four folds and scored with recall on the remaining fold; the
    per-fold recall and the mean recall are printed.

    Args:
        x_train_data: Feature DataFrame (rows are selected with ``.iloc``).
        y_train_data: Label DataFrame with the same row order; it is indexed
            two-dimensionally, so a one-column DataFrame is expected rather
            than a Series.

    Returns:
        The C value with the highest mean recall (the first one on ties).
    """
    # Unshuffled 5-fold split; each item is (train_positions, test_positions).
    kfold = KFold(n_splits=5, shuffle=False)

    # Candidate inverse regularization strengths.
    c_param_range = [0.01, 0.1, 1, 10, 100]

    mean_recall_by_c = {}
    for c_param in c_param_range:
        print('-----------------------------------')
        print('C Parameter:', c_param)
        print('-----------------------------------')
        print('')

        recall_accs = []
        for iteration, (train_pos, valid_pos) in enumerate(
            kfold.split(x_train_data), start=1
        ):
            # The L1 penalty needs a solver that supports it; 'liblinear' is
            # named explicitly because the library default solver does not
            # accept penalty='l1' in current scikit-learn releases.
            lr = LogisticRegression(C=c_param, penalty='l1', solver='liblinear')

            # Fit on the training folds.
            lr.fit(
                x_train_data.iloc[train_pos, :].values,
                y_train_data.iloc[train_pos, :].values.ravel(),
            )

            # Predict the held-out fold and score it with recall.
            y_predicted_undersample = lr.predict(
                x_train_data.iloc[valid_pos, :].values
            )
            recall_acc = recall_score(
                y_train_data.iloc[valid_pos, :].values, y_predicted_undersample
            )
            recall_accs.append(recall_acc)
            print('Iteration:', iteration, ': Recall Score = ', recall_acc)

        mean_recall = np.mean(recall_accs)
        mean_recall_by_c[c_param] = mean_recall
        print('Mean Recall Score:', mean_recall)

    return max(mean_recall_by_c, key=mean_recall_by_c.get)


# To run the C search on the undersampled training split:
#   best_c = printing_Kfold_scores(X_train_undersample, y_train_undersample)

# ---------------------------------------------------------------------------
# Train on folds of the undersampled training split with C = 0.01 and measure
# recall on the hold-out part of the FULL dataset.
# ---------------------------------------------------------------------------
kfold = KFold(n_splits=5, shuffle=False)
recall_accs = []
for train_pos, _ in kfold.split(X_train_undersample):
    lr = LogisticRegression(C=0.01, penalty='l1', solver='liblinear')
    lr.fit(
        X_train_undersample.iloc[train_pos, :].values,
        y_train_undersample.iloc[train_pos, :].values.ravel(),
    )

    # Alternative from the original post: evaluate on the undersampled
    # hold-out set instead of the full one:
    #   y_predicted_labels = lr.predict(X_test_undersample.values)
    #   recall_acc = recall_score(y_test_undersample, y_predicted_labels)

    # Evaluate on the hold-out part of the whole dataset.
    y_predicted_labels = lr.predict(X_test.values)
    recall_acc = recall_score(y_test, y_predicted_labels)

    # Oversampling variant sketched (not executed) in the original post:
    # oversample the full training split with SMOTE before evaluating, e.g.
    #   os_X, os_y = SMOTE(random_state=0).fit_resample(X_train, y_train)
    # (the post used the older method name `fit_sample` - verify against the
    # installed imbalanced-learn version).

    print('Recall:', recall_acc)
    recall_accs.append(recall_acc)

print('Recall Means:', np.mean(recall_accs))