
2-3 Dive into Deep Learning: Kaggle House Price Prediction


%matplotlib inline
import gluonbook as gb
from mxnet import autograd,gluon,init,nd
from mxnet.gluon import data as gdata,loss as gloss,nn
import pandas as pd
import numpy as np

# Read in the data
train_data=pd.read_csv('../data/kaggle_house_pred_train.csv')
test_data=pd.read_csv('../data/kaggle_house_pred_test.csv')

# print(train_data.shape)  # (1460, 81)
# print(test_data.shape)   # (1459, 80)
# train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]] indexes by row: the first 4
# features, the last 2 features and the label (SalePrice) of the first 4 samples.
# The first feature is Id. It can help the model memorize each training sample,
# but it does not generalize to the test samples, so we do not use it for
# training. We concatenate the 79 features of all training and test examples
# along the sample dimension.

all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))
# all_features.shape  (2919, 79)


# Preprocess the data
# We standardize the continuous numeric features: let μ be a feature's mean
# over the whole dataset and σ its standard deviation. We subtract μ from each
# value of that feature and then divide by σ to get the standardized values.
# Missing feature values are replaced with the feature's mean.

numeric_features=all_features.dtypes[all_features.dtypes!='object'].index
all_features[numeric_features]=all_features[numeric_features].apply(lambda x:(x-x.mean())/x.std())
# After standardization each numeric feature has mean 0, so replacing missing
# values with the mean amounts to filling with 0 (restricting to the numeric
# columns also avoids taking the mean of the object columns)
all_features[numeric_features] = all_features[numeric_features].fillna(0)
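# A toy check (hypothetical values, not from the dataset): for a column
# [1.0, 2.0, 3.0, NaN], the mean is 2 and the std is 1, so standardization
# yields [-1.0, 0.0, 1.0, NaN]; because the standardized mean is 0, filling
# the missing value with the mean is the same as filling with 0:
# toy = pd.Series([1.0, 2.0, 3.0, None])
# (toy - toy.mean()) / toy.std()  # -> [-1.0, 0.0, 1.0, NaN]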

# Next we turn the discrete values into indicator features; see the toy
# illustration below. For example, suppose the feature MSZoning takes two
# distinct discrete values, RL and RM. This step drops the MSZoning feature
# and adds two new features, MSZoning_RL and MSZoning_RM, whose values are
# 0 or 1. If a sample's original MSZoning value is RL, then MSZoning_RL=1
# and MSZoning_RM=0.

# dummy_na=True treats missing values as a legal feature value and creates an
# indicator feature for them as well.
all_features=pd.get_dummies(all_features,dummy_na=True)

# all_features.shape  (2919, 331)
# This conversion increases the number of features from 79 to 331.
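# A toy illustration of the conversion (hypothetical values):
# pd.get_dummies(pd.DataFrame({'MSZoning': ['RL', 'RM', None]}), dummy_na=True)
# produces one indicator column per value, with a single 1 per row:
#    MSZoning_RL  MSZoning_RM  MSZoning_nan
# 0            1            0             0
# 1            0            1             0
# 2            0            0             1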

# Finally, use the values attribute to get the data in NumPy format and
# convert it to NDArray for the training that follows.
n_train=train_data.shape[0]
train_features=nd.array(all_features[:n_train].values)
test_features=nd.array(all_features[n_train:].values)
train_labels=nd.array(train_data.SalePrice.values).reshape((-1,1))
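# Quick shape check (optional): the arrays should line up with the original
# row counts and the 331 expanded features:
# print(train_features.shape, test_features.shape, train_labels.shape)
# # (1460, 331) (1459, 331) (1460, 1)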

# Train the model
# We train with the squared loss. The book's baseline is a plain linear
# regression model; the network below adds one hidden layer (64 units, ReLU)
# on top of it.
loss = gloss.L2Loss()

def get_net():
    net = nn.Sequential()
    # One hidden layer (64 units, ReLU) followed by a single linear output
    net.add(nn.Dense(64, activation='relu'), nn.Dense(1))
    net.initialize()
    return net

def log_rmse(net, features, labels):
    # Clip predictions to [1, +inf) so that taking the log stays stable
    clipped_preds = nd.clip(net(features), 1, float('inf'))
    # L2Loss computes (1/2)(y_hat - y)^2, hence the factor of 2 under the root
    rmse = nd.sqrt(2 * loss(clipped_preds.log(), labels.log()).mean())
    return rmse.asscalar()
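# Kaggle scores this competition on the RMSE between the logs of the predicted
# and actual prices, which is exactly what log_rmse computes. A toy check with
# made-up numbers (hypothetical, for illustration only):
# y_hat = np.array([100000., 200000.])
# y = np.array([110000., 190000.])
# np.sqrt(np.mean((np.log(y_hat) - np.log(y)) ** 2))  # matches log_rmse's value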
# Adam optimization algorithm: compared with the mini-batch SGD used earlier,
# it is relatively less sensitive to the learning rate.
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = gdata.DataLoader(gdata.ArrayDataset(
        train_features, train_labels), batch_size, shuffle=True)
    # Use the Adam optimizer here; 'wd' is the weight decay strength
    trainer = gluon.Trainer(net.collect_params(), 'adam', {
        'learning_rate': learning_rate, 'wd': weight_decay})
    for epoch in range(num_epochs):
        for X, y in train_iter:
            with autograd.record():
                l = loss(net(X), y)
            l.backward()
            trainer.step(batch_size)
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
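# A quick smoke test of train (hypothetical settings, just to confirm that the
# loop runs and the loss moves; not part of the original tuning):
# net = get_net()
# smoke_ls, _ = train(net, train_features[:128], train_labels[:128],
#                     None, None, 5, 5, 0, 32)
# print(smoke_ls)  # the log rmse should broadly decrease over the 5 epochs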

# K-fold cross-validation. It will be used to select the model design and tune
# the hyperparameters. The following function returns the training and
# validation data needed for the i-th fold.
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k  # integer division: examples per fold
    X_train, y_train = None, None
    for j in range(k):
        # slice() builds a slice object for indexing the j-th fold
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            # the i-th fold becomes the validation set
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            # concatenate the remaining folds into the training set
            X_train = nd.concat(X_train, X_part, dim=0)
            y_train = nd.concat(y_train, y_part, dim=0)
    return X_train, y_train, X_valid, y_valid
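# A sanity check on the fold shapes (a hypothetical quick check): with k=5 and
# 1460 training examples, fold_size is 292, so each call returns 4*292=1168
# training examples and 292 validation examples:
# X_tr, y_tr, X_va, y_va = get_k_fold_data(5, 0, train_features, train_labels)
# print(X_tr.shape, X_va.shape)  # (1168, 331) (292, 331)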


# In K-fold cross-validation we train K times and return the average training
# and validation errors.
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        # *data unpacks the four arrays returned by get_k_fold_data
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == k - 1:  # plot the learning curves for the last fold
            gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                        range(1, num_epochs + 1), valid_ls,
                        ['train', 'valid'])
        print('fold %d, train rmse: %f, valid rmse: %f' % (
            i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                         weight_decay, batch_size)
print('%d-fold validation: avg train rmse: %f, avg valid rmse: %f'
      % (k, train_l, valid_l))
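# K-fold validation is what lets us compare hyperparameter settings. A minimal
# grid-search sketch over a few candidate learning rates (the candidates are
# made up for illustration):
# best_lr, best_valid = None, float('inf')
# for cand_lr in [1, 5, 10]:
#     _, valid_l = k_fold(k, train_features, train_labels, num_epochs,
#                         cand_lr, weight_decay, batch_size)
#     if valid_l < best_valid:
#         best_lr, best_valid = cand_lr, valid_l
# print('best lr:', best_lr)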

# Before predicting, retrain the model on the full training set, and save the
# predictions in the format required for submission.
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    # No held-out set here: train on all of the training data
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    gb.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).asnumpy()
    # Flatten the (n, 1) prediction array into one row so that the Series
    # covers every test example (reshape(-1, 1)[0] would keep only the first)
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)