A small data-analysis example: the Wine dataset
阿新 · Published: 2018-12-04
A small example from when I was first learning data analysis, copied over from a notebook and kept here as a memento~
The data is the Wine dataset downloaded from the UCI repository. It is a multi-class problem with class labels 1, 2, and 3.
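If you don't have a local copy, a quick way is to read the data straight from the UCI repository; a sketch, assuming the standard UCI file layout in which the class label is the first column (the local CSV used below stores it last):

import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
raw = pd.read_csv(url, header=None)  # 178 rows x 14 columns, label first
raw = raw[list(range(1, 14)) + [0]]  # move the label column to the end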
First, let's take a look at the data:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression      # logistic regression, linear classifier
from sklearn.linear_model import SGDClassifier           # stochastic gradient descent
from sklearn.svm import LinearSVC                        # linear support vector machine
from sklearn.naive_bayes import MultinomialNB            # multinomial naive Bayes
from sklearn.naive_bayes import GaussianNB               # Gaussian naive Bayes
from sklearn.neighbors import KNeighborsClassifier       # k-nearest neighbors
from sklearn.tree import DecisionTreeClassifier          # decision tree
from sklearn.ensemble import RandomForestClassifier      # random forest
from sklearn.ensemble import GradientBoostingClassifier  # gradient boosted trees
from sklearn.ensemble import ExtraTreesClassifier        # extremely randomized trees
from sklearn.preprocessing import MinMaxScaler           # min-max scaling
from sklearn.preprocessing import StandardScaler         # standardization
from scipy.stats import pearsonr                         # Pearson correlation coefficient
from sklearn.model_selection import train_test_split     # train/test split
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import seaborn as sns
# for enumerating feature combinations and counting them
from itertools import combinations
from scipy.special import comb

columns = ['0Alcohol', '1Malic acid', '2Ash', '3Alcalinity of ash',
           '4Magnesium', '5Total phenols', '6Flavanoid',
           '7Nonflavanoid phenols', '8Proanthocyanins', '9Color intensity',
           '10Hue', '11OD280/OD315 of diluted wines', '12Proline', '13category']
data = pd.read_csv("G:/feature_code/wine_data.csv", header=None, names=columns)
data.shape
(178, 14)
Show the first five rows:
data.head()
| | 0Alcohol | 1Malic acid | 2Ash | 3Alcalinity of ash | 4Magnesium | 5Total phenols | 6Flavanoid | 7Nonflavanoid phenols | 8Proanthocyanins | 9Color intensity | 10Hue | 11OD280/OD315 of diluted wines | 12Proline | 13category |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 1 |
| 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 1 |
| 2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 1 |
| 3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 1 |
| 4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 1 |
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
0Alcohol                          178 non-null float64
1Malic acid                       178 non-null float64
2Ash                              178 non-null float64
3Alcalinity of ash                178 non-null float64
4Magnesium                        178 non-null int64
5Total phenols                    178 non-null float64
6Flavanoid                        178 non-null float64
7Nonflavanoid phenols             178 non-null float64
8Proanthocyanins                  178 non-null float64
9Color intensity                  178 non-null float64
10Hue                             178 non-null float64
11OD280/OD315 of diluted wines    178 non-null float64
12Proline                         178 non-null int64
13category                        178 non-null int64
dtypes: float64(11), int64(3)
memory usage: 19.5 KB
Data summary: there are 178 records with no missing values. You can also inspect the data with describe(); the mean row in particular gives a rough sense of how each feature is distributed. Then check whether the classes are balanced.
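For example, a quick way to eyeball the scales:

# Per-feature summary statistics; the very different scales (e.g. Proline in the
# hundreds vs. Hue around 1) suggest that feature scaling will matter later
print(data.describe().loc[['mean', 'std', 'min', 'max']])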
data['13category'].value_counts()
2    71
1    59
3    48
Name: 13category, dtype: int64
The classes are fairly balanced; the differences are small.
# For each feature, draw a box plot plus jittered points, grouped by class
for i in data.iloc[:, 0:13].columns:
    ax = sns.boxplot(x='13category', y=i, data=data)
    ax = sns.stripplot(x='13category', y=i, data=data, jitter=True, edgecolor="gray")
    plt.show()
Judging from the box plots and strip plots of each feature against the label, the class separation is not strong, so it is hard to screen features visually. Next, compute the Pearson correlation between each feature and the class label.
def pearsonar(X, y):
    # absolute Pearson correlation between each feature and the label
    pearson = []
    for col in X.columns.values:
        pearson.append(abs(pearsonr(X[col].values, y)[0]))
    pearsonr_X = pd.DataFrame({'col': X.columns, 'corr_value': pearson})
    pearsonr_X = pearsonr_X.sort_values(by='corr_value', ascending=False)
    print(pearsonr_X)

X = data.iloc[:, :13]
y = data.iloc[:, 13]
pearsonar(X, y)
The result:
                               col  corr_value
6                       6Flavanoid    0.847498
11  11OD280/OD315 of diluted wines    0.788230
5                   5Total phenols    0.719163
12                       12Proline    0.633717
10                           10Hue    0.617369
3               3Alcalinity of ash    0.517859
8                 8Proanthocyanins    0.499130
7            7Nonflavanoid phenols    0.489109
1                      1Malic acid    0.437776
0                         0Alcohol    0.328222
9                 9Color intensity    0.265668
4                       4Magnesium    0.209179
2                             2Ash    0.049643
Only feature 2 (Ash) shows a weak linear relationship with the label. Next, compute the pairwise linear correlation between the features themselves.
# enumerate all pairs of the 13 features and compute |Pearson r| for each pair
c = list(combinations(range(13), 2))
p = []
for i in range(len(c)):
    p.append(abs(pearsonr(X.iloc[:, c[i][0]], X.iloc[:, c[i][1]])[0]))
pearsonr_ = pd.DataFrame({'col': c, 'corr_value': p})
pearsonr_ = pearsonr_.sort_values(by='corr_value', ascending=False)
print(pearsonr_)
         col  corr_value
50    (5, 6)    0.864564
61   (6, 11)    0.787194
55   (5, 11)    0.699949
58    (6, 8)    0.652692
11   (0, 12)    0.643720
52    (5, 8)    0.612413
75  (10, 11)    0.565468
20   (1, 10)    0.561296
8     (0, 9)    0.546364
60   (6, 10)    0.543479
57    (6, 7)    0.537900
72   (9, 10)    0.521813
70   (8, 11)    0.519067
66   (7, 11)    0.503270
56   (5, 12)    0.498115
62   (6, 12)    0.494193
51    (5, 7)    0.449935
23    (2, 3)    0.443367
41   (3, 12)    0.440597
54   (5, 10)    0.433681
73   (9, 11)    0.428815
16    (1, 6)    0.411007
49   (4, 12)    0.393351
21   (1, 11)    0.368710
63    (7, 8)    0.365845
36    (3, 7)    0.361922
35    (3, 6)    0.351370
15    (1, 5)    0.335167
71   (8, 12)    0.330417
34    (3, 5)    0.321113
..       ...         ...
76  (10, 12)    0.236183
32   (2, 12)    0.223626
18    (1, 8)    0.220746
42    (4, 5)    0.214401
1     (0, 2)    0.211545
46    (4, 9)    0.199950
37    (3, 8)    0.197327
43    (4, 6)    0.195784
22   (1, 12)    0.192011
27    (2, 7)    0.186230
59    (6, 9)    0.172379
12    (1, 2)    0.164045
6     (0, 7)    0.155929
64    (7, 9)    0.139057
7     (0, 8)    0.136698
25    (2, 5)    0.128980
26    (2, 6)    0.115077
0     (0, 1)    0.094397
33    (3, 4)    0.083333
30   (2, 10)    0.074667
10   (0, 11)    0.072343
9    (0, 10)    0.071747
48   (4, 11)    0.066004
47   (4, 10)    0.055398
53    (5, 9)    0.055136
14    (1, 4)    0.054575
68    (8, 9)    0.025250
38    (3, 9)    0.018732
28    (2, 8)    0.009652
31   (2, 11)    0.003911

[78 rows x 2 columns]
Features 5, 6, and 11 are strongly correlated with one another, so there may be redundant features.
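As a visual cross-check, a correlation heatmap makes these clusters easier to spot; a minimal sketch reusing the seaborn import from above:

# 13x13 matrix of pairwise Pearson correlations between the features;
# the strongest block ties together features 5, 6 and 11
corr = data.iloc[:, 0:13].corr()
sns.heatmap(corr, cmap='coolwarm', center=0, square=True)
plt.show()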
# Screen features by random forest feature importance
def randomF_importfeat(X, y):
    features_list = X.columns
    forest = RandomForestClassifier(oob_score=True, n_estimators=10000)
    forest.fit(X, y)
    feature_importance = forest.feature_importances_
    # scale importances so the largest equals 100
    feature_importance = 100.0 * (feature_importance / feature_importance.max())
    fi_threshold = 0
    important_idx = np.where(feature_importance > fi_threshold)[0]
    important_features = features_list[important_idx]
    print("\n", important_features.shape[0],
          "important features (>", fi_threshold, "% of max importance)\n")
    sorted_idx = np.argsort(feature_importance[important_idx])[::-1]
    # plot the important features as a horizontal bar chart
    pos = np.arange(sorted_idx.shape[0]) + .5
    plt.subplot(1, 2, 2)
    plt.title('Feature Importance')
    plt.barh(pos, feature_importance[important_idx][sorted_idx[::-1]],
             color='r', align='center')
    plt.yticks(pos, important_features[sorted_idx[::-1]])
    plt.xlabel('Relative Importance')
    plt.draw()
    plt.show()

randomF_importfeat(X, y)
13 important features (> 0 % of max importance)
Feature 2 (Ash) is a candidate for removal.
First, see how PCA dimensionality reduction performs.
# Apply PCA directly, then benchmark a list of models on the reduced data
def _PCA(X, y):
    ss = MinMaxScaler()
    X = ss.fit_transform(X)
    pca = PCA(n_components='mle')  # choose the number of components by MLE
    X_new = pca.fit_transform(X)
    clfs = [LogisticRegression(), SGDClassifier(), LinearSVC(), KNeighborsClassifier(),
            DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(),
            GaussianNB()]
    for model in clfs:
        print("Model:")
        print(str(model))
        print("Accuracy:")
        print(np.mean(cross_val_score(model, X_new, y, cv=10)))

_PCA(X, y)
10-fold cross-validated accuracy after PCA (all models at scikit-learn defaults):

| Model | Accuracy |
|---|---|
| LogisticRegression | 0.983333333333 |
| SGDClassifier | 0.967251461988 |
| LinearSVC | 0.983333333333 |
| KNeighborsClassifier | 0.971200980392 |
| DecisionTreeClassifier | 0.927048933609 |
| RandomForestClassifier | 0.971895424837 |
| GradientBoostingClassifier | 0.972222222222 |
| GaussianNB | 0.977743378053 |
The data is very clean; even a quick PCA yields scores this high.
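To quantify that, one can check how much variance the MLE-selected components actually retain; a small sketch mirroring the preprocessing inside _PCA:

# Fit the same MinMaxScaler + PCA pipeline and report what the automatically
# chosen number of components preserves
ss = MinMaxScaler()
X_scaled = ss.fit_transform(X)
pca = PCA(n_components='mle').fit(X_scaled)
print(pca.n_components_)                    # number of components kept
print(pca.explained_variance_ratio_.sum())  # fraction of variance retained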
Next, see how the raw features perform with each model at default parameters.
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(data.iloc[:, :13], data.iloc[:, 13],
                                                    test_size=0.2, random_state=0)
# Min-max scaling is used here. StandardScaler() would also work, but then
# MultinomialNB() must be dropped: it requires non-negative inputs, which
# standardization does not guarantee.
ss = MinMaxScaler()
# ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# List of models (default parameters)
clfs = [LogisticRegression(), SGDClassifier(), LinearSVC(), MultinomialNB(), KNeighborsClassifier(),
        DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(),
        GaussianNB(), ExtraTreesClassifier()]
# Print each model and its test-set accuracy
for model in clfs:
    print("Model:")
    print(str(model))
    model.fit(X_train, y_train)
    print("Accuracy:")
    print(model.score(X_test, y_test))
Test-set accuracy on the raw features (all models at scikit-learn defaults):

| Model | Accuracy |
|---|---|
| LogisticRegression | 0.972222222222 |
| SGDClassifier | 1.0 |
| LinearSVC | 1.0 |
| MultinomialNB | 0.944444444444 |
| KNeighborsClassifier | 0.972222222222 |
| DecisionTreeClassifier | 0.972222222222 |
| RandomForestClassifier | 1.0 |
| GradientBoostingClassifier | 0.944444444444 |
| GaussianNB | 0.916666666667 |
| ExtraTreesClassifier | 0.972222222222 |
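Note that with test_size=0.2 the held-out set contains only 36 of the 178 samples, so a single misclassification shifts accuracy by roughly 2.8% (0.972222 is exactly 35/36); the perfect scores above should be read with that granularity in mind.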
Now drop feature 2 (Ash) and try again:
# keep every feature except column 2 (Ash)
params = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
X = data.iloc[:, params]
# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Min-max scaling, as above; again, switching to StandardScaler() means
# dropping MultinomialNB()
ss = MinMaxScaler()
# ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)
# List of models (default parameters)
clfs = [LogisticRegression(), SGDClassifier(), LinearSVC(), MultinomialNB(), KNeighborsClassifier(),
        DecisionTreeClassifier(), RandomForestClassifier(), GradientBoostingClassifier(),
        GaussianNB(), ExtraTreesClassifier()]
# Print each model and its test-set accuracy
for model in clfs:
    print("Model:")
    print(str(model))
    model.fit(X_train, y_train)
    print("Accuracy:")
    print(model.score(X_test, y_test))
Test-set accuracy with feature 2 removed (all models at scikit-learn defaults):

| Model | Accuracy |
|---|---|
| LogisticRegression | 0.916666666667 |
| SGDClassifier | 0.944444444444 |
| LinearSVC | 1.0 |
| MultinomialNB | 0.944444444444 |
| KNeighborsClassifier | 0.944444444444 |
| DecisionTreeClassifier | 0.972222222222 |
| RandomForestClassifier | 0.972222222222 |
| GradientBoostingClassifier | 0.944444444444 |
| GaussianNB | 0.916666666667 |
| ExtraTreesClassifier | 1.0 |
Since the dataset is small, I also tried searching over every possible feature combination (the function is named greed, but it actually enumerates all subsets exhaustively rather than growing one greedily):
def greed(X, y):
    ss = MinMaxScaler()
    X = ss.fit_transform(X)
    X = pd.DataFrame(X)
    jilu = pd.DataFrame(columns=['m', 'feature', 'score'])  # log of each improvement
    params = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    model = LinearSVC()
    best_score = 0
    n = 13
    i = 1
    index = 0
    while i <= n:
        test_params = list(combinations(params, i))  # all subsets of size i
        j = int(comb(n, i))                          # number of such subsets
        i = i + 1
        for m in range(j):
            z = list(test_params[m])
            # mean accuracy over 10-fold cross-validation
            score = np.mean(cross_val_score(model, X[z], y, cv=10))
            if score > best_score:
                best_score = score
                best_feature = z
                jilu.loc[index, ['m']] = m
                jilu.loc[index, ['feature']] = str(z)
                jilu.loc[index, ['score']] = score
                index = index + 1
    print(jilu)
    print("best_feature =", best_feature, "best_score =", best_score)
greed(data.iloc[:,:13],data.iloc[:,13])
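With 13 features this enumerates 2^13 - 1 = 8191 subsets, each scored with 10-fold cross-validation, i.e. on the order of 80,000 LinearSVC fits. That is feasible for 178 samples, but the brute-force approach will not scale to much wider feature sets.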