A simple introduction to model ensembling on Kaggle
阿新 • Published: 2019-02-03
Summary:
This is a basic regression problem on Kaggle, and we mainly experiment with the xgboost model. The tuning shown below is deliberately simple: choosing hyperparameters properly would require GridSearchCV or a similar search, so here we only use simple cross-validation, whose purpose is to compare the different model-ensembling methods.
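The post only mentions GridSearchCV in passing, so as a minimal sketch of what such a search could look like (the alpha grid and the Ridge base model are illustrative assumptions, not the author's choices), something like the following could replace the manual loops used later:

import numpy as np
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Illustrative only: search over alpha for ridge regression, using the same
# 10-fold CV / RMSE setup as the manual loops below.
param_grid = {'alpha': [1, 5, 10, 15, 20, 30, 50]}
search = GridSearchCV(Ridge(), param_grid, cv=10,
                      scoring='neg_mean_squared_error')
# search.fit(train_X, train_y)                 # train_X / train_y as built below
# best_alpha = search.best_params_['alpha']
# best_rmse = np.sqrt(-search.best_score_)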
Feature selection:
# This dataset really has only two kinds of attributes, categorical and numerical,
# so unlike Titanic we can process everything in one uniform pass.
# First, handle attributes that are categorical but stored as numbers:
# MSSubClass is actually a category (the dwelling type), yet its values are integers,
# so we recast the column's type.

import pandas as pd

def processMSSubClass(df):
    df['MSSubClass'] = df['MSSubClass'].astype(str)
    return df

def processOverallQual(df):
    df['OverallQual'] = df['OverallQual'].astype(str)
    return df

def processOverallCond(df):
    df['OverallCond'] = df['OverallCond'].astype(str)
    return df

# There are many categorical attributes here, so we one-hot encode all of them,
# which generates a large number of derived columns.
def processDummies(df):
    df = processMSSubClass(df)
    df = processOverallQual(df)
    df = processOverallCond(df)
    df = pd.get_dummies(df)
    return df

# Handle missing numerical values.
# There are quite a few of them; always check what a missing value actually means.
# Here the missing values carry no special meaning, so we fill them with the mean.
def processMissData(df):
    meancols = df.dropna().mean()   # column means computed from fully complete rows
    df = df.fillna(meancols)
    # df.isnull().sum().sum() should now be 0
    return df

# Standardize the numeric columns:
# numeric_cols = all_df.columns[all_df.dtypes != 'object']
# df.iloc[:, :10].describe() is useful for inspecting the actual values.
def processDataScaled(df):
    numeric_cols = df.columns[df.dtypes != 'object']
    numeric_mean = df.loc[:, numeric_cols].mean()
    numeric_std = df.loc[:, numeric_cols].std()
    df.loc[:, numeric_cols] = (df.loc[:, numeric_cols] - numeric_mean) / numeric_std
    return df

def processData(df):
    df = processDummies(df)
    df = processMissData(df)
    df = processDataScaled(df)
    return df
Model ensembling
The simplest method, Bagging:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
import dataprocess   # the feature-processing module defined above

def BaggingModel():
    input_df = pd.read_csv('train.csv', index_col=0)
    submit_df = pd.read_csv('test.csv', index_col=0)
    train_y = np.log1p(input_df.pop('SalePrice'))   # training labels
    df = pd.concat([input_df, submit_df])
    df = dataprocess.processData(df)
    input_df = df.loc[input_df.index]
    submit_df = df.loc[submit_df.index]
    train_X = input_df.values
    test_X = submit_df.values
    # Here we use the CV score to see how the number of base estimators
    # affects the final result.
    # Note: when setting up Bagging, pass your small base model (ridge)
    # via the base_estimator argument.
    ridge = Ridge(15)   # ridge regression, alpha=15
    params = [1, 10, 15, 20, 25, 30, 40]
    test_scores = []
    for param in params:
        clf = BaggingRegressor(n_estimators=param, base_estimator=ridge)
        test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10,
                                              scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    plt.plot(params, test_scores)
    plt.title("n_estimator vs CV Error");
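The post stops at the CV curve and never fits the final bagged model. As a minimal follow-up sketch, continuing inside BaggingModel (the value 25 is only an assumed pick from the curve, and bagging_result.csv is a hypothetical file name), the final fit and submission could look like this:

    # Hypothetical follow-up: refit on all training data with the n_estimators
    # value chosen from the CV curve above (25 is an assumed pick).
    clf = BaggingRegressor(n_estimators=25, base_estimator=Ridge(15))
    clf.fit(train_X, train_y)
    predictions = np.expm1(clf.predict(test_X))   # undo the log1p on SalePrice
    result = pd.DataFrame({"Id": submit_df.index, "SalePrice": predictions})
    result.to_csv('bagging_result.csv', index=False)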
Next, let's try boosting:
from sklearn.ensemble import AdaBoostRegressor

def BoostModel():
    input_df = pd.read_csv('train.csv', index_col=0)
    submit_df = pd.read_csv('test.csv', index_col=0)
    train_y = np.log1p(input_df.pop('SalePrice'))   # training labels
    df = pd.concat([input_df, submit_df])
    df = dataprocess.processData(df)
    input_df = df.loc[input_df.index]
    submit_df = df.loc[submit_df.index]
    train_X = input_df.values
    test_X = submit_df.values
    params = [10, 15, 20, 25, 30, 35, 40, 45, 50]
    ridge = Ridge(15)   # ridge regression, alpha=15
    test_scores = []
    for param in params:
        clf = AdaBoostRegressor(n_estimators=param, base_estimator=ridge)
        test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10,
                                              scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    plt.plot(params, test_scores)
    plt.show()
Then we use XGBoost and do the model selection the same way:
from xgboost import XGBRegressor
def xgboostModel():
    input_df = pd.read_csv('train.csv', index_col=0)
    submit_df = pd.read_csv('test.csv', index_col=0)
    train_y = np.log1p(input_df.pop('SalePrice'))   # training labels
    df = pd.concat([input_df, submit_df])
    df = dataprocess.processData(df)
    input_df = df.loc[input_df.index]
    submit_df = df.loc[submit_df.index]
    train_X = input_df.values
    test_X = submit_df.values
    # Cross-validate over max_depth to pick the tree depth.
    params = [1, 2, 3, 4, 5, 6]
    test_scores = []
    for param in params:
        clf = XGBRegressor(max_depth=param)
        test_score = np.sqrt(-cross_val_score(clf, train_X, train_y, cv=10,
                                              scoring='neg_mean_squared_error'))
        test_scores.append(np.mean(test_score))
    plt.plot(params, test_scores)
    plt.title("max_depth vs CV Error");
    plt.show()
    # Fit the final model and write the submission file.
    clf = XGBRegressor(max_depth=6)
    clf.fit(train_X, train_y)
    predictions = clf.predict(test_X).astype(np.float64)
    predictions = np.exp(predictions) - 1   # undo the log1p transform
    result = pd.DataFrame({"Id": submit_df.index, "SalePrice": predictions})
    result.to_csv('xgboost_result.csv', index=False)
Stacking:
If we use a stacking model, we need several candidate models; the idea of stacking is to combine the strengths of two or more of them.
First, we take the best parameters found above and build our final base models.
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
#Stacking
def stackModel():
    input_df = pd.read_csv('train.csv', index_col=0)
    submit_df = pd.read_csv('test.csv', index_col=0)
    train_y = np.log1p(input_df.pop('SalePrice')).values   # training labels
    df = pd.concat([input_df, submit_df])
    df = dataprocess.processData(df)
    input_df = df.loc[input_df.index]
    submit_df = df.loc[submit_df.index]
    train_X = input_df.values
    test_X = submit_df.values
    clfs = [RandomForestRegressor(n_estimators=500, max_features=.3),
            XGBRegressor(max_depth=6, n_estimators=500),
            Ridge(15)]
    # First-level training: each base model's predictions become a new feature.
    dataset_stack_train = np.zeros((train_X.shape[0], len(clfs)))
    dataset_stack_test = np.zeros((test_X.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        clf.fit(train_X, train_y)
        y_submission = clf.predict(test_X)
        y_train = clf.predict(train_X)
        dataset_stack_train[:, j] = y_train
        dataset_stack_test[:, j] = y_submission
    print("Start stacking....")
    # Second-level model trained on the base models' predictions.
    clf = RandomForestRegressor(n_estimators=1000, max_depth=8)
    clf.fit(dataset_stack_train, train_y)
    y_submission = clf.predict(dataset_stack_test)
    predictions = np.expm1(y_submission)
    result = pd.DataFrame({"Id": submit_df.index, "SalePrice": predictions})
    result.to_csv('stack_result.csv', index=False)
Let's look at the improvement in the result:
The score goes up considerably. Blending, by contrast, wraps each model's training in a double cross-validation loop: we iterate over the K folds, and each first-level prediction is made on the held-out (test) part of the original training data, so the second-level features are out-of-fold.
The way the models are fused afterwards is the same. If the task were classification, we could use logistic regression (LogisticRegression) as the second-level model. A sketch of this out-of-fold loop follows.
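The post does not show the blending code itself, so here is a minimal sketch of the out-of-fold idea described above, reusing train_X, train_y, test_X and clfs from stackModel (the fold count and the Ridge second-level model are my assumptions, not the author's choices):

from sklearn.model_selection import KFold

def blendModel(train_X, train_y, test_X, clfs, n_folds=5):
    # Out-of-fold (blending) version of the stacking step above.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=0)
    blend_train = np.zeros((train_X.shape[0], len(clfs)))
    blend_test = np.zeros((test_X.shape[0], len(clfs)))
    for j, clf in enumerate(clfs):
        # Collect the test-set predictions from every fold, then average them.
        test_fold_preds = np.zeros((test_X.shape[0], n_folds))
        for i, (tr_idx, val_idx) in enumerate(kf.split(train_X)):
            clf.fit(train_X[tr_idx], train_y[tr_idx])
            # Predict only on the held-out part of train, so there is no leakage.
            blend_train[val_idx, j] = clf.predict(train_X[val_idx])
            test_fold_preds[:, i] = clf.predict(test_X)
        blend_test[:, j] = test_fold_preds.mean(axis=1)
    # Second-level model; Ridge here is an assumption, any regressor would do.
    meta = Ridge(15)
    meta.fit(blend_train, train_y)
    return np.expm1(meta.predict(blend_test))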