sklearn:隨機森林_迴歸樹_波士頓房價_填補缺失值
阿新 • • 發佈:2021-01-03
- 分類樹和迴歸樹引數差別:
- criterion
- 分類:使用基尼係數(gini,預設)或資訊熵(entropy)衡量不純度,依資訊增益選擇分枝。
- 迴歸:
- 均方誤差MSE,使用均值。mse是父節點與葉子節點之間的均方誤差,用來選擇特徵。同時也是用於衡量模型質量的指標。均方誤差本身恆為非負;但sklearn的scoring遵循「分數越大越好」的慣例,因此交叉驗證回傳的是負的均方誤差(neg_mean_squared_error),取其相反數才是實際的MSE。
- 絕對誤差mae,使用中值。
- 注意:迴歸樹的介面score預設返回的是R方(負無窮到1,越接近1越好),不是mse
# Imports for the cross-validated random-forest regression demo
# (one import statement per line; the fused one-liner was a syntax error).
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

# NOTE: load_boston was removed in scikit-learn 1.2, so this script needs an
# older release.
boston = load_boston()

import sklearn
# Import the submodule explicitly: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.metrics` attribute is available.
import sklearn.metrics

# List every name accepted by the `scoring=` parameter.
# NOTE: SCORERS was removed in scikit-learn 1.3; on newer releases use
# sklearn.metrics.get_scorer_names() instead.
sorted(sklearn.metrics.SCORERS.keys())
['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'balanced_accuracy', 'brier_score_loss', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_weighted', 'max_error', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']
# Random-forest regressor with 100 trees and a fixed seed.
regresor = RandomForestRegressor(random_state=0, n_estimators=100)

# 10-fold cross-validation scored with negative mean squared error.
# When `scoring` is omitted the regressor's default score (R^2) is used;
# sklearn.metrics.SCORERS.keys() lists the accepted scoring names.
cross_val_score(
    regresor,
    boston.data,
    boston.target,
    scoring="neg_mean_squared_error",
    cv=10,
)
# The call returns one score per fold, i.e. 10 values.
array([-10.72900447, -5.36049859, -4.74614178, -20.84946337, -12.23497347, -17.99274635, -6.8952756 , -93.78884428, -29.80411702, -15.25776814])
用隨機森林迴歸填補缺失值
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# Load the dataset bundle returned by load_boston (features in `.data`,
# regression target in `.target`).
dataset = load_boston()
# Inspect the feature-matrix shape as (rows, columns).
dataset.data.shape
(506, 13)
# Keep the complete (hole-free) features and target for later comparison.
x_full = dataset.data
y_full = dataset.target

# Row and column counts of the feature matrix.
n_samples, n_features = x_full.shape
(n_samples, n_features)
(506, 13)
# First decide what fraction of all cells should be turned into missing values.
missing_rate = 0.5

# Seeded generator so the injected holes are the same on every run.
rng = np.random.RandomState(0)

# Number of (row, column) positions to blank out.
total_cells = n_samples * n_features
n_missing_samples = int(np.floor(total_cells * missing_rate))
n_missing_samples
3289
# Build the data set with missing values.
# Draw n_missing_samples column indices in [0, n_features) and then the same
# number of row indices in [0, n_samples). The draw order matters for the
# seeded generator. Draws are independent, so a (row, column) pair can repeat
# and the realised share of NaN cells may be lower than missing_rate.
missing_features = rng.randint(0, n_features, size=n_missing_samples)
missing_samples = rng.randint(0, n_samples, size=n_missing_samples)

# Work on copies so the complete data stays intact.
y_missing = y_full.copy()
x_missing = x_full.copy()

# Blank out the selected cells, then wrap the array in a DataFrame.
x_missing[missing_samples, missing_features] = np.nan
x_missing = pd.DataFrame(data=x_missing)
x_missing
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | NaN | 18.0 | NaN | NaN | 0.538 | NaN | 65.2 | 4.0900 | 1.0 | 296.0 | NaN | NaN | 4.98 |
1 | 0.02731 | 0.0 | NaN | 0.0 | 0.469 | NaN | 78.9 | 4.9671 | 2.0 | NaN | NaN | 396.90 | 9.14 |
2 | 0.02729 | NaN | 7.07 | 0.0 | NaN | 7.185 | 61.1 | NaN | 2.0 | 242.0 | NaN | NaN | NaN |
3 | NaN | NaN | NaN | 0.0 | 0.458 | NaN | 45.8 | NaN | NaN | 222.0 | 18.7 | NaN | NaN |
4 | NaN | 0.0 | 2.18 | 0.0 | NaN | 7.147 | NaN | NaN | NaN | NaN | 18.7 | NaN | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | NaN | NaN | NaN | 0.0 | 0.573 | NaN | 69.1 | NaN | 1.0 | NaN | 21.0 | NaN | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | NaN | 396.90 | 9.08 |
503 | NaN | NaN | 11.93 | NaN | 0.573 | 6.976 | 91.0 | NaN | NaN | NaN | 21.0 | NaN | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | NaN | 0.573 | NaN | 89.3 | NaN | 1.0 | NaN | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | NaN | NaN | 1.0 | NaN | NaN | 396.90 | 7.88 |
506 rows × 13 columns
from sklearn.impute import SimpleImputer # 專門用於填補缺失值的類
# Mean imputation: replace every NaN with the mean of its column.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
x_missing_mean = pd.DataFrame(imp_mean.fit_transform(x_missing))
x_missing_mean
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3.627579 | 18.000000 | 11.163464 | 0.066007 | 0.538000 | 6.305921 | 65.2 | 4.090000 | 1.000000 | 296.000000 | 18.521192 | 352.741952 | 4.980000 |
1 | 0.027310 | 0.000000 | 11.163464 | 0.000000 | 0.469000 | 6.305921 | 78.9 | 4.967100 | 2.000000 | 405.935275 | 18.521192 | 396.900000 | 9.140000 |
2 | 0.027290 | 10.722951 | 7.070000 | 0.000000 | 0.564128 | 7.185000 | 61.1 | 3.856371 | 2.000000 | 242.000000 | 18.521192 | 352.741952 | 12.991767 |
3 | 3.627579 | 10.722951 | 11.163464 | 0.000000 | 0.458000 | 6.305921 | 45.8 | 3.856371 | 9.383871 | 222.000000 | 18.700000 | 352.741952 | 12.991767 |
4 | 3.627579 | 0.000000 | 2.180000 | 0.000000 | 0.564128 | 7.147000 | 67.4 | 3.856371 | 9.383871 | 405.935275 | 18.700000 | 352.741952 | 5.330000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 3.627579 | 10.722951 | 11.163464 | 0.000000 | 0.573000 | 6.305921 | 69.1 | 3.856371 | 1.000000 | 405.935275 | 21.000000 | 352.741952 | 9.670000 |
502 | 0.045270 | 0.000000 | 11.930000 | 0.000000 | 0.573000 | 6.120000 | 76.7 | 2.287500 | 1.000000 | 273.000000 | 18.521192 | 396.900000 | 9.080000 |
503 | 3.627579 | 10.722951 | 11.930000 | 0.066007 | 0.573000 | 6.976000 | 91.0 | 3.856371 | 9.383871 | 405.935275 | 21.000000 | 352.741952 | 5.640000 |
504 | 0.109590 | 0.000000 | 11.930000 | 0.066007 | 0.573000 | 6.305921 | 89.3 | 3.856371 | 1.000000 | 405.935275 | 21.000000 | 393.450000 | 6.480000 |
505 | 0.047410 | 0.000000 | 11.930000 | 0.000000 | 0.573000 | 6.030000 | 67.4 | 3.856371 | 1.000000 | 405.935275 | 18.521192 | 396.900000 | 7.880000 |
506 rows × 13 columns
# Constant imputation: replace every NaN with 0.
imp_0 = SimpleImputer(fill_value=0, strategy='constant', missing_values=np.nan)
x_missing_0 = pd.DataFrame(imp_0.fit_transform(x_missing))
x_missing_0
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00000 | 18.0 | 0.00 | 0.0 | 0.538 | 0.000 | 65.2 | 4.0900 | 1.0 | 296.0 | 0.0 | 0.00 | 4.98 |
1 | 0.02731 | 0.0 | 0.00 | 0.0 | 0.469 | 0.000 | 78.9 | 4.9671 | 2.0 | 0.0 | 0.0 | 396.90 | 9.14 |
2 | 0.02729 | 0.0 | 7.07 | 0.0 | 0.000 | 7.185 | 61.1 | 0.0000 | 2.0 | 242.0 | 0.0 | 0.00 | 0.00 |
3 | 0.00000 | 0.0 | 0.00 | 0.0 | 0.458 | 0.000 | 45.8 | 0.0000 | 0.0 | 222.0 | 18.7 | 0.00 | 0.00 |
4 | 0.00000 | 0.0 | 2.18 | 0.0 | 0.000 | 7.147 | 0.0 | 0.0000 | 0.0 | 0.0 | 18.7 | 0.00 | 5.33 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
501 | 0.00000 | 0.0 | 0.00 | 0.0 | 0.573 | 0.000 | 69.1 | 0.0000 | 1.0 | 0.0 | 21.0 | 0.00 | 9.67 |
502 | 0.04527 | 0.0 | 11.93 | 0.0 | 0.573 | 6.120 | 76.7 | 2.2875 | 1.0 | 273.0 | 0.0 | 396.90 | 9.08 |
503 | 0.00000 | 0.0 | 11.93 | 0.0 | 0.573 | 6.976 | 91.0 | 0.0000 | 0.0 | 0.0 | 21.0 | 0.00 | 5.64 |
504 | 0.10959 | 0.0 | 11.93 | 0.0 | 0.573 | 0.000 | 89.3 | 0.0000 | 1.0 | 0.0 | 21.0 | 393.45 | 6.48 |
505 | 0.04741 | 0.0 | 11.93 | 0.0 | 0.573 | 6.030 | 0.0 | 0.0000 | 1.0 | 0.0 | 0.0 | 396.90 | 7.88 |
506 rows × 13 columns
# Random-forest imputation.
# Idea: regress each incomplete feature on the other features plus the label
# and use the predictions to fill its holes.
# Features with fewer missing values are filled first.
x_missing_reg = x_missing.copy()

# Count the NaNs per column, then order the column positions by that count
# (ascending).
null_counts = x_missing_reg.isnull().sum(axis=0)
sortindex = np.argsort(null_counts).values
sortindex
array([ 6, 12, 8, 7, 9, 0, 2, 1, 5, 4, 3, 10, 11], dtype=int64)
# Fill the holes one column at a time, in the order given by sortindex.
# x_missing_reg is updated in place, so columns handled later see the values
# already imputed for earlier columns.
for i in sortindex:
    df = x_missing_reg
    fillc = df.iloc[:, i]

    # Boolean row mask of the cells to impute in this column. Selecting rows
    # by mask (instead of by index labels) does not depend on the DataFrame
    # having a default 0..n-1 index.
    missing_mask = fillc.isnull().values

    # A column without holes has nothing to predict; calling predict() on an
    # empty matrix would raise, so skip it.
    if not missing_mask.any():
        continue

    # Predictors: every other feature column plus the target values.
    df = pd.concat([df.drop(i, axis=1), pd.DataFrame(y_full)], axis=1)

    # Holes still present in the predictor columns are zero-filled for this
    # iteration only; x_missing_reg itself is not touched by this step.
    df_0 = SimpleImputer(
        missing_values=np.nan,
        strategy='constant',
        fill_value=0,
    ).fit_transform(df)

    # Rows where the column is known are the training set; rows where it is
    # missing are the ones to predict.
    y_train = fillc[~missing_mask]
    y_test = fillc[missing_mask]
    x_train = df_0[~missing_mask, :]
    x_test = df_0[missing_mask, :]

    # Seeded so the imputed values are reproducible, like the rest of the script.
    rfc = RandomForestRegressor(n_estimators=100, random_state=0)
    rfc = rfc.fit(x_train, y_train)
    y_predict = rfc.predict(x_test)

    # Write the predictions back into the missing cells of this column.
    x_missing_reg.loc[missing_mask, i] = y_predict
# Fit the same model on each version of the data and record its MSE.
# Order: complete data, mean-imputed, zero-imputed, forest-imputed.
X = [x_full, x_missing_mean, x_missing_0, x_missing_reg]
std = []
mse = []
for x in X:
    estimator = RandomForestRegressor(n_estimators=100, random_state=0)
    fold_scores = cross_val_score(
        estimator,
        x,
        y_full,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # Average over the 5 folds; the scorer is negated MSE, so flip the sign
    # to store a positive MSE.
    scores = fold_scores.mean()
    mse.append(scores * -1)
# Draw the results as a horizontal bar chart.
# The labels must follow the order in which the data sets were scored
# (X = [x_full, x_missing_mean, x_missing_0, x_missing_reg]): the original
# list had the mean and zero labels swapped, mislabelling two bars.
x_labels = [
    'Full data',
    'Mean Imputation',
    'Zero Imputation',
    'Regressor Imputation',
]
colors = ['r', 'g', 'b', 'orange']

plt.figure(figsize=(12, 6))
ax = plt.subplot(111)

# One bar per data set, at y position 0, 1, 2, ...
for i, (value, color) in enumerate(zip(mse, colors)):
    ax.barh(i, value, color=color, alpha=0.6, align='center')

ax.set_title('Imputation Techniques with Boston Data')
# Pad the x-range by 10% on each side of the observed MSE values.
ax.set_xlim(left=np.min(mse) * 0.9, right=np.max(mse) * 1.1)
ax.set_yticks(range(len(mse)))
ax.set_xlabel('MSE')
ax.set_yticklabels(x_labels)
plt.show()