1. 程式人生 > 實用技巧 > 基於邏輯迴歸的信用卡欺詐檢測

基於邏輯迴歸信用卡欺詐檢測

檔案讀取

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# The full dataset is large, so only read the first 1000 rows.
data = pd.read_csv("creditcard.csv",nrows=1000)
# Preview the first few rows to inspect the columns.
data.head()

資料預處理

缺失值、異常值的處理、刪除多餘列

# Check for missing values (boolean DataFrame; notebook displays it).
data.isnull()
# Forward-fill any missing values.
# FIX: fillna returns a new DataFrame — the original discarded the result,
# so the fill was a no-op; assign it back.
data = data.fillna(method='ffill')
# Spot outliers in a few features with a box plot.
data[['V1','V2','V3']].boxplot()
# Outliers can be dropped or corrected depending on the situation.
# The time column carries no predictive signal here, so drop it.
# FIX: drop() defaults to the index axis and returns a copy — the original
# data.drop(['time']) was both a KeyError (wrong axis/name: the Kaggle
# creditcard.csv column is 'Time') and a discarded result.
data = data.drop(columns=['Time'])

特徵標準化

主要解決特徵因為數值差距過大而導致迭代收斂過慢或者影響結果的問題

# Standardize the 'Amount' column (zero mean, unit variance) into a new column.
from sklearn.preprocessing import StandardScaler
# fit_transform expects 2-D input, hence the reshape(-1, 1) to (n_samples, 1).
data['normAmount'] = StandardScaler().fit_transform(data['Amount'].values.reshape(-1, 1))

資料分析階段

# Count how many samples fall in each class (0 = normal, 1 = fraud).
# FIX: the top-level pd.value_counts is deprecated (removed in pandas 2.x);
# call the Series method instead — same result.
count_classes = data['Class'].value_counts(sort=True)
# A bar chart of count_classes would visualize the class imbalance.

取樣方法

因為當資料類別不平衡的時候需要採取上取樣或者下取樣。從上圖分析可知,兩個類別的樣本數量相差很多;為了預測的正確性,下面的程式碼採用下取樣的方法,從多數類別中隨機抽取與少數類別相同數量的樣本,使兩類樣本數量相等

# Feature matrix (all columns except the label) and label vector.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# --- Undersampling: balance the classes by sampling as many normal
# --- records as there are fraud records.
# FIX: the original cell was collapsed onto a single physical line
# (a SyntaxError in a plain .py file); reformatted into statements.

# Number of data points in the minority (fraud) class, and their indices.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Indices of the majority (normal) class.
normal_indices = data[data.Class == 0].index

# Randomly pick as many normal indices as there are fraud records.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
print("random_normal_indices:",random_normal_indices)

# Combine the fraud indices with the sampled normal indices.
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

# Build the balanced (undersampled) dataset.
under_sample_data = data.iloc[under_sample_indices,:]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Show the resulting class ratio (should be 50/50).
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
random_normal_indices: [979 686]
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  4
建立預測模型
from sklearn.model_selection import train_test_split
# FIX: these two were used below without being imported (NameError).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Whole-dataset split (kept for comparison with the undersampled split).
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)

print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

# Undersampled dataset split.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(X_undersample
                                                                                                   ,y_undersample
                                                                                                   ,test_size = 0.3
                                                                                                   ,random_state = 0)
# FIX: best_c was never defined in this excerpt (NameError). It is the
# regularization strength, ideally chosen by cross-validation in a step
# not shown here; 1.0 is scikit-learn's default.
best_c = 1.0
# FIX: penalty='l1' requires solver='liblinear' in modern scikit-learn
# (the default lbfgs solver only supports l2/none).
lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
# ravel() flattens the (n, 1) label frame into the 1-D array fit() expects.
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Compute the confusion matrix on the undersampled test set.
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)

# Recall = TP / (TP + FN): the fraction of fraud cases actually caught.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))