ARIMA 時間序列預測
阿新 • • 發佈:2018-11-25
ARIMA 時間序列預測
(學習資料及程式碼均從網上獲取。)
資料記錄AirPassengers.csv:
Month,#Passengers 1949-01,112 1949-02,118 1949-03,132 1949-04,129 1949-05,121 1949-06,135 1949-07,148 1949-08,148 1949-09,136 1949-10,119 1949-11,104 1949-12,118 1950-01,115 1950-02,126 1950-03,141 1950-04,135 1950-05,125 1950-06,149 1950-07,170 1950-08,170 1950-09,158 1950-10,133 1950-11,114 1950-12,140 1951-01,145 1951-02,150 1951-03,178 1951-04,163 1951-05,172 1951-06,178 1951-07,199 1951-08,199 1951-09,184 1951-10,162 1951-11,146 1951-12,166 1952-01,171 1952-02,180 1952-03,193 1952-04,181 1952-05,183 1952-06,218 1952-07,230 1952-08,242 1952-09,209 1952-10,191 1952-11,172 1952-12,194 1953-01,196 1953-02,196 1953-03,236 1953-04,235 1953-05,229 1953-06,243 1953-07,264 1953-08,272 1953-09,237 1953-10,211 1953-11,180 1953-12,201 1954-01,204 1954-02,188 1954-03,235 1954-04,227 1954-05,234 1954-06,264 1954-07,302 1954-08,293 1954-09,259 1954-10,229 1954-11,203 1954-12,229 1955-01,242 1955-02,233 1955-03,267 1955-04,269 1955-05,270 1955-06,315 1955-07,364 1955-08,347 1955-09,312 1955-10,274 1955-11,237 1955-12,278 1956-01,284 1956-02,277 1956-03,317 1956-04,313 1956-05,318 1956-06,374 1956-07,413 1956-08,405 1956-09,355 1956-10,306 1956-11,271 1956-12,306 1957-01,315 1957-02,301 1957-03,356 1957-04,348 1957-05,355 1957-06,422 1957-07,465 1957-08,467 1957-09,404 1957-10,347 1957-11,305 1957-12,336 1958-01,340 1958-02,318 1958-03,362 1958-04,348 1958-05,363 1958-06,435 1958-07,491 1958-08,505 1958-09,404 1958-10,359 1958-11,310 1958-12,337 1959-01,360 1959-02,342 1959-03,406 1959-04,396 1959-05,420 1959-06,472 1959-07,548 1959-08,559 1959-09,463 1959-10,407 1959-11,362 1959-12,405 1960-01,417 1960-02,391 1960-03,419 1960-04,461 1960-05,472 1960-06,535 1960-07,622 1960-08,606 1960-09,508 1960-10,461 1960-11,390 1960-12,432
ARIMA演算法
# -*- coding: utf-8 -*- import pandas as pd import numpy as np import matplotlib.pylab as plt from matplotlib.pylab import rcParams #rcParams設定好畫布的大小 rcParams['figure.figsize'] = 15, 6 data = pd.read_csv("./AirPassengers.csv") print (data.head()) print ('\n Data types:') print (data.dtypes) dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m') #---其中parse_dates 表明選擇資料中的哪個column作為date-time資訊, #---index_col 告訴pandas以哪個column作為 index #--- date_parser 使用一個function(本文用lambda表示式代替),使一個string轉換為一個datetime變數 data = pd.read_csv('AirPassengers.csv', parse_dates=['Month'], index_col='Month',date_parser=dateparse) print (data.head()) print (data.index) #檢查時序資料的穩定性(Stationarity) from statsmodels.tsa.stattools import adfuller def test_stationarity(timeseries): #這裡以一年為一個視窗,每一個時間t的值由它前面12個月(包括自己)的均值代替,標準差同理。 rolmean = timeseries.rolling(window=12).mean() rolstd = timeseries.rolling(window=12).std() #plot rolling statistics: fig = plt.figure() fig.add_subplot() orig = plt.plot(timeseries, color = 'blue',label='Original') mean = plt.plot(rolmean , color = 'red',label = 'rolling mean') std = plt.plot(rolstd, color = 'black', label= 'Rolling standard deviation') plt.legend(loc = 'best') plt.title('Rolling Mean & Standard Deviation') plt.show(block=False) #Dickey-Fuller test: print ('Results of Dickey-Fuller Test:') dftest = adfuller(timeseries,autolag = 'AIC') #dftest的輸出前一項依次為檢測值,p值,滯後數,使用的觀測數,各個置信度下的臨界值 dfoutput = pd.Series(dftest[0:4],index = ['Test Statistic','p-value','#Lags Used','Number of Observations Used']) for key,value in dftest[4].items(): dfoutput['Critical value (%s)' %key] = value print (dfoutput) ts = data['#Passengers'] test_stationarity(ts) ts_log = np.log(ts) #moving_avg = pd.rolling_mean(ts_log,12) moving_avg = ts_log.rolling(window=12).mean() plt.plot(ts_log ,color = 'blue') plt.plot(moving_avg, color='red') ts_log_moving_avg_diff = ts_log-moving_avg ts_log_moving_avg_diff.dropna(inplace = True) test_stationarity(ts_log_moving_avg_diff) # halflife的值決定了衰減因子alpha: alpha = 1 - exp(log(0.5) / halflife) #expweighted_avg = pd.ewma(ts_log,halflife=12) expweighted_avg = pd.DataFrame.ewm(ts_log,halflife=12).mean() ts_log_ewma_diff = ts_log - expweighted_avg test_stationarity(ts_log_ewma_diff) ts_log_diff = ts_log - ts_log.shift() ts_log_diff.dropna(inplace=True) test_stationarity(ts_log_diff) #分解(decomposing) 可以用來把時序資料中的趨勢和週期性資料都分離出來: from statsmodels.tsa.seasonal import seasonal_decompose def decompose(timeseries): # 返回包含三個部分 trend(趨勢部分) , seasonal(季節性部分) 和residual (殘留部分) decomposition = seasonal_decompose(timeseries) trend = decomposition.trend seasonal = decomposition.seasonal residual = decomposition.resid plt.subplot(411) plt.plot(ts_log, label='Original') plt.legend(loc='best') plt.subplot(412) plt.plot(trend, label='Trend') plt.legend(loc='best') plt.subplot(413) plt.plot(seasonal,label='Seasonality') plt.legend(loc='best') plt.subplot(414) plt.plot(residual, label='Residuals') plt.legend(loc='best') plt.tight_layout() return trend , seasonal, residual #消除了trend 和seasonal之後,只對residual部分作為想要的時序資料進行處理 trend , seasonal, residual = decompose(ts_log) residual.dropna(inplace=True) test_stationarity(residual) # ============================================================================= # 對時序資料進行預測# # ============================================================================= #ACF and PACF plots: from statsmodels.tsa.stattools import acf, pacf lag_acf = acf(ts_log_diff, nlags=20) lag_pacf = pacf(ts_log_diff, nlags=20, method='ols') #Plot ACF: plt.subplot(121) plt.plot(lag_acf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray') plt.title('Autocorrelation Function') #Plot PACF: plt.subplot(122) plt.plot(lag_pacf) plt.axhline(y=0,linestyle='--',color='gray') plt.axhline(y=-1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray') plt.axhline(y=1.96/np.sqrt(len(ts_log_diff)),linestyle='--',color='gray') plt.title('Partial Autocorrelation Function') plt.tight_layout() from statsmodels.tsa.arima_model import ARIMA model = ARIMA(ts_log, order=(2, 1, 0)) results_AR = model.fit(disp=-1) plt.plot(ts_log_diff) plt.plot(results_AR.fittedvalues, color='red') plt.title('RSS: %.4f'% sum((results_AR.fittedvalues-ts_log_diff)**2)) model = ARIMA(ts_log, order=(0, 1, 2)) results_MA = model.fit(disp=-1) plt.plot(ts_log_diff) plt.plot(results_MA.fittedvalues, color='red') plt.title('RSS: %.4f'% sum((results_MA.fittedvalues-ts_log_diff)**2)) model = ARIMA(ts_log, order=(2, 1, 2)) results_ARIMA = model.fit(disp=-1) plt.plot(ts_log_diff) plt.plot(results_ARIMA.fittedvalues, color='red') plt.title('RSS: %.4f'% sum((results_ARIMA.fittedvalues-ts_log_diff)**2)) # ============================================================================= # 預測 # ============================================================================= #ARIMA擬合的其實是一階差分ts_log_diff,predictions_ARIMA_diff[i]是第i個月與i-1個月的ts_log的差值。 #由於差分化有一階滯後,所以第一個月的資料是空的, predictions_ARIMA_diff = pd.Series(results_ARIMA.fittedvalues, copy=True) print (predictions_ARIMA_diff.head()) #累加現有的diff,得到每個值與第一個月的差分(同log底的情況下)。 #即predictions_ARIMA_diff_cumsum[i] 是第i個月與第1個月的ts_log的差值。 predictions_ARIMA_diff_cumsum = predictions_ARIMA_diff.cumsum() #先ts_log_diff => ts_log=>ts_log => ts #先以ts_log的第一個值作為基數,複製給所有值,然後每個時刻的值累加與第一個月對應的差值(這樣就解決了,第一個月diff資料為空的問題了) #然後得到了predictions_ARIMA_log => predictions_ARIMA predictions_ARIMA_log = pd.Series(ts_log.ix[0], index=ts_log.index) predictions_ARIMA_log = predictions_ARIMA_log.add(predictions_ARIMA_diff_cumsum,fill_value=0) predictions_ARIMA = np.exp(predictions_ARIMA_log) plt.figure() plt.plot(ts) plt.plot(predictions_ARIMA) plt.title('RMSE: %.4f'% np.sqrt(sum((predictions_ARIMA-ts)**2)/len(ts)))
執行結果如下:
Month #Passengers 0 1949-01 112 1 1949-02 118 2 1949-03 132 3 1949-04 129 4 1949-05 121 Data types: Month object #Passengers int64 dtype: object #Passengers Month 1949-01-01 112 1949-02-01 118 1949-03-01 132 1949-04-01 129 1949-05-01 121 DatetimeIndex(['1949-01-01', '1949-02-01', '1949-03-01', '1949-04-01', '1949-05-01', '1949-06-01', '1949-07-01', '1949-08-01', '1949-09-01', '1949-10-01', ... '1960-03-01', '1960-04-01', '1960-05-01', '1960-06-01', '1960-07-01', '1960-08-01', '1960-09-01', '1960-10-01', '1960-11-01', '1960-12-01'], dtype='datetime64[ns]', name='Month', length=144, freq=None)
Results of Dickey-Fuller Test:
Test Statistic 0.815369
p-value 0.991880
#Lags Used 13.000000
Number of Observations Used 130.000000
Critical value (1%) -3.481682
Critical value (5%) -2.884042
Critical value (10%) -2.578770
dtype: float64
Results of Dickey-Fuller Test:
Test Statistic -3.162908
p-value 0.022235
#Lags Used 13.000000
Number of Observations Used 119.000000
Critical value (1%) -3.486535
Critical value (5%) -2.886151
Critical value (10%) -2.579896
dtype: float64
Results of Dickey-Fuller Test:
Test Statistic -3.601262
p-value 0.005737
#Lags Used 13.000000
Number of Observations Used 130.000000
Critical value (1%) -3.481682
Critical value (5%) -2.884042
Critical value (10%) -2.578770
dtype: float64
Results of Dickey-Fuller Test:
Test Statistic -2.717131
p-value 0.071121
#Lags Used 14.000000
Number of Observations Used 128.000000
Critical value (1%) -3.482501
Critical value (5%) -2.884398
Critical value (10%) -2.578960
dtype: float64
Results of Dickey-Fuller Test:
Test Statistic -6.332387e+00
p-value 2.885059e-08
#Lags Used 9.000000e+00
Number of Observations Used 1.220000e+02
Critical value (1%) -3.485122e+00
Critical value (5%) -2.885538e+00
Critical value (10%) -2.579569e+00
dtype: float64