糖尿病模型預測
阿新 • • 發佈:2020-09-06
diabetes model prediction
"""
Diabetes prediction on the Pima-style ``data/diabetes.csv`` dataset
using logistic regression.

# @Time    :  2020/9/6
# @Author  :  Jimou Chen
"""
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import numpy as np
import missingno as msn
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


def label_distribution(data):
    """Show three diagnostic plots for ``data``.

    1. A bar chart of the ``Outcome`` label counts.
    2. A seaborn pair plot of all columns, coloured by ``Outcome``.
    3. A missingno bar chart of the non-null count per column.

    Each figure is displayed with ``plt.show()`` before the next is drawn.

    :param data: DataFrame containing an ``Outcome`` column.
    """
    # Label balance as a bar chart.
    data.Outcome.value_counts().plot(kind='bar')
    plt.show()

    # Visualise the data distribution; some values that should never be 0
    # are recorded as 0, which really means "missing".
    seaborn.pairplot(data, hue='Outcome')
    plt.show()

    # Bar chart of how complete each column is.
    msn.bar(data)
    plt.show()


def handle_data():
    """Load ``data/diabetes.csv`` and clean its missing values.

    Zeros in the glucose, blood pressure, skin thickness, insulin and BMI
    columns are treated as missing. Columns with fewer than 80% non-missing
    values are dropped, and the remaining gaps in the cleaned columns are
    filled with the column mean.

    :return: the cleaned DataFrame (still containing ``Outcome``).
    """
    data = pd.read_csv('data/diabetes.csv')

    # Inspect the label distribution.
    print(data.Outcome.value_counts())

    # Replace 0 with NaN in glucose, blood pressure, skin thickness,
    # insulin and body-mass index.
    handle_col = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
    data[handle_col] = data[handle_col].replace(0, np.nan)

    # Threshold: a column must have at least this many non-NaN values,
    # i.e. a column with more than 20% missing data is dropped.
    thresh_count = data.shape[0] * 0.8
    data = data.dropna(thresh=thresh_count, axis=1)

    # Mean-fill whichever cleaned columns survived the threshold, instead of
    # hard-coding the survivors: avoids a KeyError when a column was dropped
    # and avoids leaving NaNs behind when an unexpected one was kept.
    for col in handle_col:
        if col in data.columns:
            data[col] = data[col].fillna(data[col].mean())

    return data


if __name__ == '__main__':
    new_data = handle_data()
    label_distribution(new_data)

    # Split the dataset (stratified so both splits keep the label ratio).
    x_data = new_data.drop('Outcome', axis=1)
    y_data = new_data.Outcome
    x_train, x_test, y_train, y_test = train_test_split(
        x_data, y_data, test_size=0.3, stratify=y_data)

    # Build and fit the model.
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Predict.
    pred = model.predict(x_test)

    # Evaluate. classification_report expects (y_true, y_pred); passing them
    # the other way round swaps precision with recall and reports the
    # support of the predictions instead of the true labels.
    print(classification_report(y_test, pred))
D:\Anaconda\Anaconda3\python.exe D:/Appication/PyCharm/Git/kaggle-project/DiabetesPrediction/diabetes_predict.py
0    500
1    268
Name: Outcome, dtype: int64
              precision    recall  f1-score   support

           0       0.90      0.80      0.85       169
           1       0.58      0.76      0.66        62

    accuracy                           0.79       231
   macro avg       0.74      0.78      0.75       231
weighted avg       0.81      0.79      0.80       231

Process finished with exit code 0