# Python 良/惡性腫瘤預測 - LogisticRegression 以及 SGDClassifier
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 12 16:56:56 2018

@author: fengjuan
"""
import numpy as np
import pandas as pd
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Column names for the data set; the file at the URL has no header row.
column_names = [
    'Sample code number',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nulclei',
    'Bland Chromatin',
    'Nomal Nucleoli',
    'Mitoses',
    'Class',
]

# Read the data straight from the web.
data = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data',
    names=column_names,
)

# Replace the '?' placeholders in the data with the standard missing value.
data = data.replace(to_replace='?', value=np.nan)
# Drop every row that has at least one missing value.
data = data.dropna(how='any')
data.info()

# The source data ships without a separate test set, so split it here:
# a random 25% sample becomes the test set, the rest the training set.
# Columns 1-9 are the features, column 10 ('Class') is the target;
# column 0 (the sample code number) is left out.
X_train, X_test, y_train, y_test = train_test_split(
    data[column_names[1:10]],
    data[column_names[10]],
    test_size=0.25,
    random_state=33,
)

# Check the size and class distribution of the training and test sets.
print(y_train.value_counts())
print(y_test.value_counts())
'''
輸出的結果是:
2    344
4    168
Name: Class, dtype: int64
2    100
4     71
Name: Class, dtype: int64
'''
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler


def _print_report(heading, model, predictions):
    """Print the model's score on the test set, then its per-class report."""
    print(heading, model.score(X_test, y_test))
    print(classification_report(y_test, predictions,
                                target_names=['Benign', 'Malignant']))


# Standardise the features: the scaler is fitted on the training data only
# and the same transformation is then applied to the test data.
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

# Fit the logistic-regression model and keep its test-set predictions
# in lr_y_predict.
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

# Fit the SGD classifier and keep its test-set predictions
# in sgdc_y_predict.
sgdc = SGDClassifier()
sgdc.fit(X_train, y_train)
sgdc_y_predict = sgdc.predict(X_test)

# Performance evaluation of both classifiers.
_print_report('Accuracy of LR Classifier:', lr, lr_y_predict)
_print_report('Accuracy of SGD Classifier:', sgdc, sgdc_y_predict)
'''結果:
Accuracy of LR Classifier: 0.9883040935672515
             precision    recall  f1-score   support

     Benign       0.99      0.99      0.99       100
  Malignant       0.99      0.99      0.99        71

avg / total       0.99      0.99      0.99       171

Accuracy of SGD Classifier: 0.9766081871345029
             precision    recall  f1-score   support

     Benign       0.99      0.97      0.98       100
  Malignant       0.96      0.99      0.97        71

avg / total       0.98      0.98      0.98       171
'''