1. 程式人生 > >時間序列預測之keras

時間序列預測之keras

'''
多變數時間序列預測
'''


import os
os.getcwd()
os.chdir('C:\\Users\\87671\\Desktop\\位元魔方')
from pandas import read_csv
from datetime import datetime
from numpy import concatenate
from matplotlib import pyplot
from pandas import concat
from pandas import DataFrame
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
# 把日期轉換成指定格式 年月日小時
def parse(x):
    return datetime.strptime(x, '%Y %m %d %H')

dataset = read_csv('raw.csv', parse_dates=[['year','month','day','hour']],index_col=0,date_parser=parse)
# parse_dates指定時間轉化的列 ,date_parser指定轉化的形式,index_col指定某列作為索引,一箇中括號在一起表示合併在了一起 
dataset.drop('No',axis =1 ,inplace = True)
# 賦予特殊的列名
dataset.columns = ['pollution', 'dew', 'temp', 'press', 'wnd_dir', 'wnd_spd', 'snow', 'rain']
dataset.index.name = 'date'
# 填充NA為0
dataset['pollution'].fillna(0,inplace = True)
# 刪除前24行的資料
dataset = dataset[24:]
print(dataset.head(5))
#dataset.to_csv('pollution.csv')

dataset = read_csv("pollution.csv",header = 0,index_col = 0)

# 畫圖,顯示每個變數的5年資料
values = dataset.values                                      # 首先提取資料框裡面的values
groups = [0,1,2,3,5,6,7]
i = 1
pyplot.figure()
for group in groups:
    pyplot.subplot(len(groups),1,i)                          # 子圖要幾行幾列,每個圖在第幾個
    pyplot.plot(values[:,group])
    pyplot.title(dataset.columns[group],y = 0.5,loc='right')
    i += 1
pyplot.show()


'''
明確監督問題:根據前1個小時的天氣情況x和y,預測下一個階段的y
將時間序列資料集轉化為監督學習問題
基礎知識
1.df['t-1']=df['t'].shift(1) #建立了和t之後一項的序列,同等長度,前面的第一個變成了NA
  df['t-1']=df['t'].shift(-1) # 提前一項,最後一項為nan
'''


def series_to_supervised(data, n_in=1,n_out=1,dropnan = True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols,names = list(),list()
    # 輸入序列(t-n,...,t-1)
    for i in range(n_in,0,-1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # 預測序列(t ,t+1,...,t+n)
    for i in range(0, n_out):              # 只包括前不包括後,所以range0,1代表只有一個
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    # 將他們放在一起
    agg = concat(cols,axis=1)
    agg.columns = names
    # 去除有na的行
    if dropnan:
        agg.dropna(inplace = True)
    return agg

'''
舉例
## 單變數(就只有一個時間變數)
values = [x for x in range(10)]
data = series_to_supervised(values)
print(data)
data = series_to_supervised(values,3)         # 滯後三項預測一個y
print(data)
data = series_to_supervised(values,2,2)      # 滯後2,預測2
print(data)
## 多變數(以2個時間變數為例子)
raw = DataFrame()
raw['ob1'] = [x for x in range(10)]
raw['ob2'] = [x for x in range(50, 60)]
values = raw.values
data = series_to_supervised(values)        # 2個都滯後一項
print(data)
'''
# 將類別變數(第5列)獨熱編碼
encoder = preprocessing.LabelEncoder()
values[:,4] = encoder.fit_transform(values[:,4])
# 確保所有的data是float形式
values = values.astype('float32')
# 標準化壓縮到0-1之間
scaler = preprocessing.MinMaxScaler(feature_range=(0,1)) 
scaled = scaler.fit_transform(values)
###########################################################################timestep=1
# 使用時間序列變監督學習的函式
reframed1 = series_to_supervised(scaled,1,1) 
# 其實我們只需要預測的是第一個變數,所以其他變數對應的時間序列可以刪去
reframed1.drop(reframed1.columns[[9,10,11,12,13,14,15,]], axis = 1,inplace=True)
print(reframed1.head())  #前八個變數時原來資料集的X和Y,最後一個是Y

'''
現在資料準備好了,開始擬合模型
'''
#劃分訓練集測試集(這裡講第一年當做訓練集,其餘4年當做測試集)
values = reframed1.values
n_train_hours = 365*24
train = values[:n_train_hours,:]
test = values[n_train_hours:,:]
#劃分X和Y
train_X,train_y = train[:,:-1],train[:,-1]
test_X,test_y = test[:,:-1],test[:,-1]
#X轉化成LSTM需要的格式[樣本,時間步長,特徵]
train_X = train_X.reshape((train_X.shape[0],1,train_X.shape[1])) 
test_X = test_X.reshape((test_X.shape[0],1,test_X.shape[1]))
print(train_X.shape,train_y.shape,test_X.shape,test_y.shape)
# 設計網路
model = Sequential()
# 50個神經元,X[1]是一個時間步長,X[2]是8個維度
model.add(LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2])))
model.add(Dense(1))
# 平均絕對誤差mae
model.compile(loss='mae',optimizer = 'adam')
# 擬合network
# 50個批量大小為72的訓練時機
history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'],label='train')
pyplot.plot(history.history['val_loss'],label='test')
pyplot.legend()
pyplot.show()
# 看圖觀察可能過擬合

'''
模型評估
'''
# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]))  # 3W條資料,8個特徵
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:]), axis=1)       # 拼接成y對應X7個變數的格式
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]                                    # 只選擇第一列也就是我們真正的yhat
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))           # 計算mse
print('Test RMSE: %.3f' % rmse)



##################################################################################時間跨度為2呢

reframed = series_to_supervised(scaled,2,1) 
'''
不能再像原來那樣刪這些編號啦,可以這樣做
train_X, train_y = train[:, 0:n_obs], train[:, -n_features]
reframed.drop(reframed.columns[[9,10,11,12,13,14,15,]], axis = 1,inplace=True)
'''
'''
現在資料準備好了,開始擬合模型
'''
#劃分訓練集測試集(這裡講第一年當做訓練集,其餘4年當做測試集)
values = reframed.values
n_train_hours = 365*24
train = values[:n_train_hours,:]
test = values[n_train_hours:,:]
#劃分X和Y
# 和step=1不一樣
n_hours = 2                # timestep=2
n_features = 8
n_obs = n_hours * n_features  # step=2
train_X, train_y = train[:, 0:n_obs], train[:, -n_features]
test_X, test_y = test[:, 0:n_obs], test[:, -n_features]
# reshape input to be 3D [samples, timesteps, features]
train_X = train_X.reshape((train_X.shape[0], n_hours, n_features))
#這裡可以檢查train_X和reframed各自什麼樣子,經驗證是我們需要的樣子!
test_X = test_X.reshape((test_X.shape[0], n_hours, n_features))
print(train_X.shape, train_y.shape, test_X.shape, test_y.shape)

'''
train_X,train_y = train[:,:-1],train[:,-1]
test_X,test_y = test[:,:-1],test[:,-1]
#X轉化成LSTM需要的格式[樣本,時間不長,特徵]
train_X = train_X.reshape((train_X.shape[0],1,train_X.shape[1]))
test_X = test_X.reshape((test_X.shape[0],1,test_X.shape[1]))
'''

# 設計網路
model = Sequential()
# 50個神經元,X是一個時間步長,8個維度
model.add(LSTM(50, input_shape=(train_X.shape[1],train_X.shape[2])))
model.add(Dense(1))
# 平均絕對誤差mae
model.compile(loss='mae',optimizer = 'adam')
# 擬合network
# 50個批量大小為72的訓練時機
history = model.fit(train_X,train_y,epochs=50,batch_size=72,validation_data=(test_X, test_y), verbose=2, shuffle=False)
# plot history
pyplot.plot(history.history['loss'],label='train')
pyplot.plot(history.history['val_loss'],label='test')
pyplot.legend()
pyplot.show()
# 看圖觀察可能過擬合

'''
模型評估
'''

'''
# make a prediction 返回原來的單位
trainPredict = model.predict(train_X)
testPredict = model.predict(test_X)
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])
import math
trainScore = math.sqrt(mean_squared_error(trainY[0],trainPredict[:,0]))
trainScore = math.sqrt(mean_squared_error(trainY[0],trainPredict[:,0]))
print('Train RMSE: %.3f' % trainScore)         
print('Test RMSE: %.3f' % testScore)
'''


yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], test_X.shape[2]*n_hours))  # 3W條資料,16個特徵
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, 1:8]), axis=1)               # 因為之前標準化之前是8列,所以inverse的時候也要是8列,
                                                                     # 當然我們有用的只是第一列,所以1:8,8:15都可以
inv_yhat = scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:,0]                                             # 只選擇第一列也就是我們真正的yhat
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, 1:8]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:,0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))                     # 計算mse
print('Test RMSE: %.3f' % rmse)                                      # 26.335

# 看一下預測的曲線(這裡值取了前面50個)
plt.plot(inv_y[0:50],color='green')
plt.plot(inv_yhat[0:50])
plt.show()