基於邏輯迴歸信用卡欺詐檢測
阿新 • • 發佈:2020-07-04
檔案讀取
import pandas as pd import matplotlib.pyplot as plt import numpy as np %matplotlib inline #由於資料太多,只讀取前1000行 data = pd.read_csv("creditcard.csv",nrows=1000) data.head()
資料預處理
缺失值、異常值的處理、刪除多餘列
# --- Missing-value and outlier handling (notebook cell) ---
# Inspect which cells are missing (displayed by the notebook).
data.isnull()
# Forward-fill any missing values. BUG FIX: fillna returns a NEW frame;
# the original call discarded the result, so it was a no-op. Assign it back.
data = data.fillna(method='ffill')
# Box plots of a few features to eyeball outliers; outliers can then be
# dropped or corrected on a case-by-case basis.
data[['V1','V2','V3']].boxplot()
# The time column carries no predictive signal here, so drop it.
# (An alternative is to let PCA pick the most informative columns.)
# BUG FIX: the original `data.drop(['time'])` had two problems:
#   1) the default axis=0 tries to drop a ROW labelled 'time' (KeyError);
#   2) the result was never assigned back, so nothing changed either way.
# NOTE(review): the standard Kaggle creditcard.csv names this column 'Time'
# (capital T) — confirm the actual column name before running.
data = data.drop(['time'], axis=1)
特徵標準化
主要解決特徵之間數值差距過大而導致迭代收斂過慢或者結果不準確的問題
# --- Standardize the Amount column (notebook cell) ---
# Scale Amount to zero mean / unit variance so its large raw scale does
# not dominate the other features during training.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform requires a 2-D array, hence the (-1, 1) reshape.
amount_column = data['Amount'].values.reshape(-1, 1)
data['normAmount'] = scaler.fit_transform(amount_column)
資料分析階段
# --- Class distribution (notebook cell) ---
# Count how many samples fall in each Class label (0 = normal, 1 = fraud);
# the counts can be plotted as a bar chart to visualize the imbalance.
# FIX: pd.value_counts(...) is deprecated in modern pandas — call the
# method on the Series instead (identical result).
count_classes = data['Class'].value_counts(sort=True)
取樣方法
當類別不平衡的時候需要採取上取樣或者下取樣。由上圖分析可知,兩個類別的樣本數量相差很多,為了預測的正確性,這裡採用下取樣(欠取樣)的方法,減少多數類的樣本數量,使其與少數類數量相等
# --- Undersampling the majority class (notebook cell) ---
# Split features and label.
X = data.loc[:, data.columns != 'Class']
y = data.loc[:, data.columns == 'Class']

# Number of data points in the minority (fraud) class, and their indices.
# BUG FIX: in the original flattened text this assignment was fused onto
# the end of a comment line, so number_records_fraud was never defined.
number_records_fraud = len(data[data.Class == 1])
fraud_indices = np.array(data[data.Class == 1].index)

# Indices of the majority (normal) class.
normal_indices = data[data.Class == 0].index

# Randomly pick exactly number_records_fraud normal samples, without
# replacement — this UNDER-samples the majority class to balance the data.
random_normal_indices = np.random.choice(normal_indices, number_records_fraud, replace = False)
random_normal_indices = np.array(random_normal_indices)
print("random_normal_indices:",random_normal_indices)

# Combine the fraud indices with the sampled normal indices.
under_sample_indices = np.concatenate([fraud_indices,random_normal_indices])

# Build the balanced dataset.
# NOTE(review): iloc treats these as POSITIONS; this matches the labels only
# while the default RangeIndex is intact (no rows dropped) — verify upstream.
under_sample_data = data.iloc[under_sample_indices,:]

X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'Class']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'Class']

# Sanity check: the resampled set should be an exact 50/50 split.
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.Class == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.Class == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
random_normal_indices: [979 686] Percentage of normal transactions: 0.5 Percentage of fraud transactions: 0.5 Total number of transactions in resampled data: 4
建立預測模型
# --- Build and evaluate the prediction model (notebook cell) ---
from sklearn.model_selection import train_test_split
# BUG FIX: the original used LogisticRegression and confusion_matrix
# without ever importing them (NameError at runtime).
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Split the whole dataset.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))

# Split the undersampled dataset the same way.
X_train_undersample, X_test_undersample, y_train_undersample, y_test_undersample = train_test_split(
    X_undersample, y_undersample, test_size=0.3, random_state=0)

# BUG FIX: best_c was referenced but never defined. It should be chosen by
# cross-validating over candidate C values; use the sklearn default until then.
best_c = 1.0  # TODO: select via cross-validation (e.g. GridSearchCV over C)

# BUG FIX: modern sklearn requires an L1-capable solver for penalty='l1';
# the default 'lbfgs' solver only supports L2.
lr = LogisticRegression(C = best_c, penalty = 'l1', solver = 'liblinear')
# ravel() flattens the (n, 1) label frame into the 1-D array fit expects.
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)

# Compute the confusion matrix; recall = TP / (TP + FN) matters most here
# because missing a fraud (false negative) is the costly error.
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))