
Regression Analysis Walkthrough (Practice)


By: HEHE

This example is a regression analysis of concrete compressive strength.

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import os

1. Basic Data Overview

# path
path_dir = os.path.dirname(os.path.dirname(os.getcwd()))

path_data = path_dir +  r'\concrete_data.xls'
# load_data
data = pd.read_excel(path_data)
# Inspect the first few rows
data.head()
Cement (component 1)(kg in a m^3 mixture) Blast Furnace Slag (component 2)(kg in a m^3 mixture) Fly Ash (component 3)(kg in a m^3 mixture) Water (component 4)(kg in a m^3 mixture) Superplasticizer (component 5)(kg in a m^3 mixture) Coarse Aggregate (component 6)(kg in a m^3 mixture) Fine Aggregate (component 7)(kg in a m^3 mixture) Age (day) Concrete compressive strength(MPa, megapascals)
0 540.0 0.0 0.0 162.0 2.5 1040.0 676.0 28 79.986111
1 540.0 0.0 0.0 162.0 2.5 1055.0 676.0 28 61.887366
2 332.5 142.5 0.0 228.0 0.0 932.0 594.0 270 40.269535
3 332.5 142.5 0.0 228.0 0.0 932.0 594.0 365 41.052780
4 198.6 132.4 0.0 192.0 0.0 978.4 825.5 360 44.296075
# Rename columns to shorter identifiers
data.columns = ['cement_component', 'furnace_slag', 'flay_ash', 'water_component', 'superplasticizer',
                'coarse_aggregate', 'fine_aggregate', 'age', 'concrete_strength']
data.head()
cement_component furnace_slag flay_ash water_component superplasticizer coarse_aggregate fine_aggregate age concrete_strength
0 540.0 0.0 0.0 162.0 2.5 1040.0 676.0 28 79.986111
1 540.0 0.0 0.0 162.0 2.5 1055.0 676.0 28 61.887366
2 332.5 142.5 0.0 228.0 0.0 932.0 594.0 270 40.269535
3 332.5 142.5 0.0 228.0 0.0 932.0 594.0 365 41.052780
4 198.6 132.4 0.0 192.0 0.0 978.4 825.5 360 44.296075
# Check dtypes and missing values
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
cement_component     1030 non-null float64
furnace_slag         1030 non-null float64
flay_ash             1030 non-null float64
water_component      1030 non-null float64
superplasticizer     1030 non-null float64
coarse_aggregate     1030 non-null float64
fine_aggregate       1030 non-null float64
age                  1030 non-null int64
concrete_strength    1030 non-null float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB
# Summary statistics
data.describe()
cement_component furnace_slag flay_ash water_component superplasticizer coarse_aggregate fine_aggregate age concrete_strength
count 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000 1030.000000
mean 281.165631 73.895485 54.187136 181.566359 6.203112 972.918592 773.578883 45.662136 35.817836
std 104.507142 86.279104 63.996469 21.355567 5.973492 77.753818 80.175427 63.169912 16.705679
min 102.000000 0.000000 0.000000 121.750000 0.000000 801.000000 594.000000 1.000000 2.331808
25% 192.375000 0.000000 0.000000 164.900000 0.000000 932.000000 730.950000 7.000000 23.707115
50% 272.900000 22.000000 0.000000 185.000000 6.350000 968.000000 779.510000 28.000000 34.442774
75% 350.000000 142.950000 118.270000 192.000000 10.160000 1029.400000 824.000000 56.000000 46.136287
max 540.000000 359.400000 200.100000 247.000000 32.200000 1145.000000 992.600000 365.000000 82.599225

Summary of the data:

  1. The dataset has 1030 rows and 8 features; the target is concrete_strength.
  2. There are no missing values and every column is numeric (a quick check follows below).
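
A minimal sketch verifying point 2 directly from the loaded DataFrame:

# Count missing values per column and list the dtypes;
# both confirm there is nothing to impute or encode
print(data.isnull().sum())
print(data.dtypes.value_counts())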

2. EDA (Exploratory Data Analysis)

2.1 concrete_strength

# Note: sns.distplot is deprecated since seaborn 0.11; sns.histplot(..., kde=True) is the modern equivalent
sns.distplot(data['concrete_strength'], bins = 20, color = 'red')

[Figure: distribution of concrete_strength]

concrete_strength: the distribution looks roughly normal, with a slight right skew.
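
A minimal sketch to quantify that skew (how much counts as "slight" is a judgment call):

# Sample skewness of the target; values modestly above 0 indicate a right skew
print(data['concrete_strength'].skew())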

2.2 features

plt.figure(figsize = (15,10.5))
plot_count = 1

for feature in list(data.columns)[:-1]:
    plt.subplot(3,3, plot_count)
    plt.scatter(data[feature], data['concrete_strength'])
    plt.xlabel(feature.replace('_',' ').title())
    plt.ylabel('Concrete strength')
    plot_count +=1

plt.show()

[Figure: scatter plots of each feature against concrete strength]

plt.figure(figsize=(9,9))
corrmat = data.corr()
sns.heatmap(corrmat, vmax= 0.8, square = True, )

[Figure: correlation heatmap of all variables]
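
The heatmap is easier to read as a ranked list of correlations with the target; a minimal sketch:

# Pearson correlation of every feature with the target, strongest first
print(data.corr()['concrete_strength'].drop('concrete_strength').sort_values(ascending=False))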

EDA summary:

  1. No feature is strongly correlated with the target.
  2. cement_component, water_component, superplasticizer and age look slightly more correlated than the rest.
  3. Since there are only a few features, try both this four-feature subset and the full feature set.
  4. No obvious outliers were found.
  5. Whether to standardize the data is still undecided (see the sketch after this list).
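
If standardization does turn out to help (it mainly matters for the linear models and the SVR, not the tree-based booster), wrapping the scaler in a Pipeline keeps it from leaking validation statistics across cross-validation folds. A minimal sketch, assuming the train_x / train_y created in section 3:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

# The scaler is refit inside every CV fold, so no validation information leaks into training
scaled_svr = make_pipeline(StandardScaler(), SVR())
# Example usage once train_x / train_y exist (section 3):
# print(cross_val_score(scaled_svr, train_x, train_y, cv=5, scoring='r2').mean())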

3. Models

Experiment: predict concrete strength with the feature subset identified above and, separately, with all features, comparing several regression algorithms.

from sklearn.model_selection import train_test_split
# Split the data into train/test sets; `features` optionally restricts the columns used,
# otherwise every column except the target is kept
def split_train_test(data, features=None, test_ratio=0.2):
    y = data['concrete_strength']
    if features is not None:
        x = data[features]
    else:
        x = data.drop(['concrete_strength'], axis=1)
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size = test_ratio)
    return train_x, test_x, train_y, test_y
# Train/test split; test_ratio=0 keeps all rows for cross-validation
# (some scikit-learn versions reject test_size=0; use a small positive value if that happens)
train_x, test_x, train_y, test_y = split_train_test(data, test_ratio = 0)
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score
# Report the mean cross-validated R2 score for each candidate regressor
def data_cross_val(x, y, clfs, clfs_name, cv=5):
    for i, clf in enumerate(clfs):
        scores = cross_val_score(estimator=clf, X=x, y=y, cv=cv, scoring='r2')
        print(clfs_name[i])
        print('the R2 score: %f' % np.mean(scores))

3.1 Regression with all features

clfs = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), GradientBoostingRegressor(), SVR()]
clfs_name = ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'GradientBoostingRegressor', 'SVR']
data_cross_val(train_x, train_y, clfs,clfs_name, cv = 5)
LinearRegression
the R2 score: 0.604974
Ridge
the R2 score: 0.604974
Lasso
the R2 score: 0.605090
ElasticNet
the R2 score: 0.605220
GradientBoostingRegressor
the R2 score: 0.908837
SVR
the R2 score: 0.023249

Conclusion: none of the single regressors matches the gradient boosting machine. Bagging and stacking are worth trying next, as is adding engineered features; a sketch of both ensembles follows.
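
A minimal sketch of both ideas with scikit-learn's built-in ensembles (StackingRegressor requires scikit-learn >= 0.22; the estimator choices here are illustrative, not tuned):

from sklearn.ensemble import BaggingRegressor, StackingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR

# Bagging: average many boosters fit on bootstrap resamples of the training data
bagged = BaggingRegressor(GradientBoostingRegressor(), n_estimators=10)

# Stacking: a Ridge meta-learner combines out-of-fold predictions of the base regressors
stacked = StackingRegressor(
    estimators=[('gbr', GradientBoostingRegressor()), ('svr', SVR())],
    final_estimator=Ridge())

data_cross_val(train_x, train_y, [bagged, stacked], ['BaggingRegressor', 'StackingRegressor'])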

3.2 Regression with a subset of correlated features

# Train/test split restricted to the four more-correlated features
features = ['cement_component','water_component','superplasticizer','age']
train_x, test_x, train_y, test_y = split_train_test(data, features, test_ratio = 0)
clfs = [LinearRegression(), Ridge(), Lasso(), ElasticNet(), GradientBoostingRegressor(), SVR()]
clfs_name = ['LinearRegression', 'Ridge', 'Lasso', 'ElasticNet', 'GradientBoostingRegressor', 'SVR']
data_cross_val(train_x, train_y, clfs,clfs_name, cv = 5)
LinearRegression
the R2 score: 0.485046
Ridge
the R2 score: 0.485045
Lasso
the R2 score: 0.484828
ElasticNet
the R2 score: 0.484840
GradientBoostingRegressor
the R2 score: 0.830816
SVR
the R2 score: 0.043992

Summary: with so few features available, regression on this hand-picked subset performs worse than simply using all features. A more systematic way to choose a subset is sketched below.
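
One alternative to eyeballing the heatmap is to rank features by the booster's own importances; a minimal sketch fit on the full dataset:

X_all = data.drop('concrete_strength', axis=1)
y_all = data['concrete_strength']

gbr = GradientBoostingRegressor()
gbr.fit(X_all, y_all)

# Impurity-based importances, one per feature, largest first
importances = pd.Series(gbr.feature_importances_, index=X_all.columns).sort_values(ascending=False)
print(importances)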

3.3 Single-feature linear regression

plt.figure(figsize=(15,7))
plot_count = 1

for feature in ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']:
    data_tr = data[['concrete_strength', feature]]
    
    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Create linear regression object
    regr = LinearRegression()

    # Train the model using the training sets
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)
    
    # Plot outputs
    plt.subplot(2,3,plot_count)
    
    plt.scatter(x_test, y_test,  color='black')
    plt.plot(x_test, y_pred, color='blue',
             linewidth=3)
    plt.xlabel(feature.replace('_',' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))
    
    plot_count+=1
        
plt.show()
cement_component 0.24550132796330282
flay_ash 0.012228585601186226
water_component 0.09828887425075417
superplasticizer 0.11471267678235075
coarse_aggregate 0.02046823335033021

[Figure: single-feature linear regression fits for five features]

features = ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']

data_tr = data
# Keep only rows where every column is non-zero (drops mixes with zero slag, fly ash or superplasticizer)
data_tr = data_tr[(data_tr.T != 0).all()]

x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

# Create linear regression object
regr = LinearRegression()

# Train the model using the training sets
regr.fit(x_train, y_train)
y_pred = regr.predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%regr.intercept_)
print('Coefficients: %s'%str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.155569
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

[Figure: LinearRegression predictions vs. actual strength on the test set]

alphas = np.arange(0.1,5,0.1)

model = Ridge()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
# NOTE: regr is still the LinearRegression fitted above, so these are its coefficients,
# not those of the grid-searched Ridge; use cv.best_estimator_ to inspect the tuned model
print('Intercept: %f'%regr.intercept_)
print('Coefficients: %s'%str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.155562
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

[Figure: Ridge (grid-searched alpha) predictions vs. actual strength]
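
To see what the grid search actually chose, a minimal sketch (names as in the block above):

# Best regularization strength found by the grid search, plus the refit Ridge coefficients
print(cv.best_params_)
print(cv.best_estimator_.intercept_, cv.best_estimator_.coef_)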

model = Lasso()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%regr.intercept_)
print('Coefficients: %s'%str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.151682
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

[Figure: Lasso (grid-searched alpha) predictions vs. actual strength]

model = ElasticNet()
cv = GridSearchCV(estimator=model, param_grid=dict(alpha=alphas))

y_pred = cv.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
print('Intercept: %f'%regr.intercept_)
print('Coefficients: %s'%str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.151796
Intercept: 84.481913
Coefficients: [ 0.04304209 -0.02577486 -0.1747249   0.15980663 -0.02633656]

[Figure: ElasticNet (grid-searched alpha) predictions vs. actual strength]

plt.figure(figsize=(15,7))
plot_count = 1

for feature in ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']:
    data_tr = data[['concrete_strength', feature]]
    data_tr=data_tr[(data_tr.T != 0).all()]
    
    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Create gradient boosting regressor object
    regr = GradientBoostingRegressor()

    # Train the model using the training sets
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)
    
    # Plot outputs
    plt.subplot(2,3,plot_count)
    
    plt.scatter(x_test, y_test,  color='black')
    plt.plot(x_test, y_pred, color='blue',
             linewidth=3)
    plt.xlabel(feature.replace('_',' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))
    
    plot_count+=1
        
plt.show()
cement_component 0.35248985320039705
flay_ash 0.17319875701989795
water_component 0.285023360910455
superplasticizer 0.19306275412216778
coarse_aggregate 0.17712532312647877

[Figure: single-feature GradientBoostingRegressor fits for five features]
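
Because plt.plot connects points in the order they appear, the stepwise GBR predictions render more cleanly if the single-feature test values are sorted first; a minimal sketch of just the plotting part (variable names as in the loop above):

# Sort the single-feature test set so the prediction curve is drawn left to right
order = np.argsort(x_test.values.ravel())
plt.scatter(x_test, y_test, color='black')
plt.plot(x_test.values.ravel()[order], y_pred[order], color='blue', linewidth=3)
plt.show()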

model = GradientBoostingRegressor()

# NOTE: x_train/x_test here are still the single-feature (coarse_aggregate) split left over from
# the last loop iteration, so the 'Features:' line printed below is misleading; the same applies
# to the SVR block further down. A corrected sketch follows after the figure.
y_pred = model.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue',
         linewidth=3)


print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
#print('Intercept: %f'%regr.intercept_)
#print('Coefficients: %s'%str(regr.coef_))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.177125

[Figure: GradientBoostingRegressor predictions vs. actual strength]
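
If the goal was to fit the booster on all five listed features, re-split on them before fitting; a minimal sketch using the helpers defined earlier:

# Re-split on the five features so the model actually sees all of them
data_tr = data[(data.T != 0).all()]
x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

model = GradientBoostingRegressor()
y_pred = model.fit(x_train, y_train).predict(x_test)
print('R2 score: %f' % r2_score(y_test, y_pred))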

plt.figure(figsize=(15,7))
plot_count = 1

for feature in ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']:
    data_tr = data[['concrete_strength', feature]]
    data_tr=data_tr[(data_tr.T != 0).all()]
    
    x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

    # Create support vector regressor with a linear kernel
    regr = SVR(kernel='linear')

    # Train the model using the training sets
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)
    
    # Plot outputs
    plt.subplot(2,3,plot_count)
    
    plt.scatter(x_test, y_test,  color='black')
    plt.plot(x_test, y_pred, color='blue', linewidth=3)
    plt.xlabel(feature.replace('_',' ').title())
    plt.ylabel('Concrete strength')

    print(feature, r2_score(y_test, y_pred))
    
    plot_count+=1
        
plt.show()
cement_component 0.2054832593541437
flay_ash -0.044636249705873654
water_component 0.07749271320026574
superplasticizer 0.0671220299245393
coarse_aggregate 0.016036478490831563

[Figure: single-feature linear-kernel SVR fits for five features]

model = SVR(kernel='linear')

y_pred = model.fit(x_train, y_train).predict(x_test)

plt.scatter(range(len(y_test)), y_test,  color='black')
plt.plot(y_pred, color='blue', linewidth=3)

print('Features: %s'%str(features))
print('R2 score: %f'%r2_score(y_test, y_pred))
Features: ['cement_component', 'flay_ash', 'water_component', 'superplasticizer', 'coarse_aggregate']
R2 score: 0.016036

[Figure: SVR predictions vs. actual strength]

4. Predicting concrete_strength from cement_component and water_component

feature = 'cement_component'
cc_new_data = np.array([[213.5]])

data_tr = data[['concrete_strength', feature]]
data_tr=data_tr[(data_tr.T != 0).all()]

x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

regr = GradientBoostingRegressor()

# Train the model using the training sets

regr.fit(x_train, y_train)
cs_pred = regr.predict(cc_new_data)
print('Predicted value of concrete strength: %f' % cs_pred[0])
Predicted value of concrete strength: 36.472380
feature = 'water_component'
wc_new_data = np.array([[200]])

data_tr = data[['concrete_strength', feature]]
data_tr=data_tr[(data_tr.T != 0).all()]

x_train, x_test, y_train, y_test = split_train_test(data_tr, [feature])

regr = GradientBoostingRegressor()

# Train the model using the training sets
regr.fit(x_train, y_train)
cs_pred = regr.predict(wc_new_data)
print('Predicted value of concrete strength: %f' % cs_pred[0])
Predicted value of concrete strength: 32.648425
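
The section title mentions both components, but the two models above use them one at a time. A minimal sketch fitting a single two-feature model instead (213.5 kg cement and 200 kg water describe the same hypothetical mix as above):

features = ['cement_component', 'water_component']
new_mix = pd.DataFrame([[213.5, 200.0]], columns=features)

data_tr = data[(data.T != 0).all()]
x_train, x_test, y_train, y_test = split_train_test(data_tr, features)

regr = GradientBoostingRegressor()
regr.fit(x_train, y_train)
print('Predicted strength: %f' % regr.predict(new_mix)[0])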
