第八章 財政收入預測分析
阿新 • • 發佈:2021-05-07
第八章 財政收入預測分析
實訓
實訓1 求取企業所得稅各特徵間的相關係數
import numpy as np
import pandas as pd
# Load the corporate income tax data set and print the pairwise Pearson
# correlation coefficients of all columns, rounded to two decimals.
inputfile = './income_tax.csv'
data = pd.read_csv(inputfile)
corr_matrix = data.corr(method='pearson')
print('相關係數矩陣為:', np.round(corr_matrix, 2))
相關係數矩陣為: year x1 x2 x3 x4 x5 x6 x7 x8 x9 x10 y year 1.00 1.00 0.98 0.97 0.97 0.95 -0.89 0.95 0.98 0.93 0.97 0.99 x1 1.00 1.00 0.99 0.98 0.98 0.95 -0.89 0.97 0.99 0.93 0.98 0.99 x2 0.98 0.99 1.00 0.99 0.98 0.92 -0.89 0.99 0.98 0.90 0.99 0.98 x3 0.97 0.98 0.99 1.00 0.96 0.92 -0.91 0.99 0.98 0.91 0.99 0.97 x4 0.97 0.98 0.98 0.96 1.00 0.90 -0.82 0.95 0.97 0.92 0.96 0.99 x5 0.95 0.95 0.92 0.92 0.90 1.00 -0.91 0.88 0.96 0.92 0.94 0.95 x6 -0.89 -0.89 -0.89 -0.91 -0.82 -0.91 1.00 -0.87 -0.91 -0.84 -0.92 -0.88 x7 0.95 0.97 0.99 0.99 0.95 0.88 -0.87 1.00 0.97 0.86 0.98 0.95 x8 0.98 0.99 0.98 0.98 0.97 0.96 -0.91 0.97 1.00 0.95 0.99 0.99 x9 0.93 0.93 0.90 0.91 0.92 0.92 -0.84 0.86 0.95 1.00 0.91 0.95 x10 0.97 0.98 0.99 0.99 0.96 0.94 -0.92 0.98 0.99 0.91 1.00 0.98 y 0.99 0.99 0.98 0.97 0.99 0.95 -0.88 0.95 0.99 0.95 0.98 1.00
實訓2 選取企業所得稅預測關鍵特徵
from sklearn.linear_model import Lasso
# Fit a Lasso regression of y on the ten feature columns (positions 1..10,
# i.e. everything between the leading year column and the trailing y column)
# and print the fitted coefficients rounded to five decimals.
# NOTE(review): the `normalize` argument is deprecated in newer scikit-learn
# releases - confirm against the installed version before re-running.
features = data.iloc[:, 1:11]
target = data['y']
lasso = Lasso(normalize=True, max_iter=10000)
lasso.fit(features, target)
print('相關係數為:', np.round(lasso.coef_, 5))
相關係數為: [ 1.61300000e-02 -1.05300000e-02 -5.01000000e-03 3.87999184e+03
-1.66000000e-03 -9.72594842e+03 2.42900000e-02 -3.23400000e-02
7.08000000e-03 8.41000000e-03]
# Count how many of the fitted Lasso coefficients are non-zero.
nonzero_count = (lasso.coef_ != 0).sum()
print('相關係數非零個數為:', nonzero_count)
相關係數非零個數為: 10
# Boolean mask with one entry per Lasso feature: True where the coefficient
# is non-zero, i.e. where the feature is kept.
mask = np.not_equal(lasso.coef_, 0)
print('相關係數是否為0:', mask)
相關係數是否為0: [ True True True True True True True True True True]
# Keep only the columns selected by Lasso (plus year and y) and save them.
outputfile = './new_reg_data1.csv'
# `mask` has one entry per fitted feature, i.e. per data column 1..10.
# The frame also has a leading year column and a trailing y column that were
# not part of the fit, so pad with True on BOTH sides. Appending both pads at
# the end would shift every flag one column to the left (year would get x1's
# flag, x1 would get x2's, ...), silently dropping the wrong columns as soon
# as any coefficient is zero.
mask = np.concatenate(([True], mask, [True]))
new_reg_data = data.iloc[:, mask]
# The row index is written as well; it shows up as an "Unnamed: 0" column
# when the file is read back with pd.read_csv.
new_reg_data.to_csv(outputfile)
print('輸出資料的維度為:', new_reg_data.shape)
輸出資料的維度為: (12, 12)
# Notebook cell: display the final boolean column mask used for the selection.
mask
array([ True, True, True, True, True, True, True, True, True,
True, True, True])
實訓3 構建企業所得稅預測模型
import numpy as np
import pandas as pd
from GM11 import GM11
# Reload the raw data set and the feature-selected data set written earlier,
# then display the latter (notebook cell output).
inputfile = './income_tax.csv'
inputfile1 = './new_reg_data1.csv'
data = pd.read_csv(inputfile)
data1 = pd.read_csv(inputfile1)
data1
Unnamed: 0 | year | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2004 | 12113416 | 18895479 | 10092421 | 559.6 | 2075416 | 31.99 | 3733922 | 80922 | 1053156 | 2690984 | 236416 |
1 | 1 | 2005 | 14859261 | 21627825 | 11751668 | 554.5 | 3184744 | 29.87 | 4785787 | 167217 | 1154425 | 3005475 | 268360 |
2 | 2 | 2006 | 17880638 | 25453413 | 13489283 | 566.1 | 3981959 | 30.69 | 5459314 | 154958 | 1434440 | 3384477 | 326556 |
3 | 3 | 2007 | 20452183 | 29787941 | 15191582 | 575.2 | 4048305 | 31.63 | 6331382 | 186678 | 3621757 | 4088545 | 373397 |
4 | 4 | 2008 | 24415160 | 35118425 | 16963824 | 582.1 | 5388451 | 28.95 | 6870406 | 219390 | 4196301 | 4767231 | 455820 |
5 | 5 | 2009 | 28257805 | 41646681 | 18633437 | 599.0 | 7531147 | 24.88 | 7507109 | 376839 | 7068265 | 8389925 | 596693 |
6 | 6 | 2010 | 32278717 | 48903250 | 21055373 | 633.1 | 6930269 | 30.85 | 8754491 | 458096 | 17829885 | 8431405 | 756412 |
7 | 7 | 2011 | 34051588 | 55607710 | 26598516 | 612.8 | 7791165 | 23.16 | 10134050 | 485760 | 17019222 | 11076649 | 732282 |
8 | 8 | 2012 | 40022658 | 65574525 | 32635731 | 632.4 | 10312744 | 20.42 | 12805288 | 653736 | 26192835 | 13991612 | 935248 |
9 | 9 | 2013 | 45769763 | 76419207 | 34122005 | 664.7 | 9585263 | 22.55 | 15613171 | 668043 | 21639131 | 15351387 | 1061594 |
10 | 10 | 2014 | 47206504 | 86167948 | 37583868 | 677.3 | 8256048 | 20.90 | 17417072 | 703733 | 21396742 | 15796804 | 1075045 |
11 | 11 | 2015 | 52273431 | 99643373 | 44545508 | 680.7 | 11053751 | 19.70 | 21828895 | 877889 | 22659148 | 20881374 | 1155923 |
# Notebook cell: display the raw data frame loaded from ./income_tax.csv.
data
year | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | y | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2004 | 12113416 | 18895479 | 10092421 | 559.6 | 2075416 | 31.99 | 3733922 | 80922 | 1053156 | 2690984 | 236416 |
1 | 2005 | 14859261 | 21627825 | 11751668 | 554.5 | 3184744 | 29.87 | 4785787 | 167217 | 1154425 | 3005475 | 268360 |
2 | 2006 | 17880638 | 25453413 | 13489283 | 566.1 | 3981959 | 30.69 | 5459314 | 154958 | 1434440 | 3384477 | 326556 |
3 | 2007 | 20452183 | 29787941 | 15191582 | 575.2 | 4048305 | 31.63 | 6331382 | 186678 | 3621757 | 4088545 | 373397 |
4 | 2008 | 24415160 | 35118425 | 16963824 | 582.1 | 5388451 | 28.95 | 6870406 | 219390 | 4196301 | 4767231 | 455820 |
5 | 2009 | 28257805 | 41646681 | 18633437 | 599.0 | 7531147 | 24.88 | 7507109 | 376839 | 7068265 | 8389925 | 596693 |
6 | 2010 | 32278717 | 48903250 | 21055373 | 633.1 | 6930269 | 30.85 | 8754491 | 458096 | 17829885 | 8431405 | 756412 |
7 | 2011 | 34051588 | 55607710 | 26598516 | 612.8 | 7791165 | 23.16 | 10134050 | 485760 | 17019222 | 11076649 | 732282 |
8 | 2012 | 40022658 | 65574525 | 32635731 | 632.4 | 10312744 | 20.42 | 12805288 | 653736 | 26192835 | 13991612 | 935248 |
9 | 2013 | 45769763 | 76419207 | 34122005 | 664.7 | 9585263 | 22.55 | 15613171 | 668043 | 21639131 | 15351387 | 1061594 |
10 | 2014 | 47206504 | 86167948 | 37583868 | 677.3 | 8256048 | 20.90 | 17417072 | 703733 | 21396742 | 15796804 | 1075045 |
11 | 2015 | 52273431 | 99643373 | 44545508 | 680.7 | 11053751 | 19.70 | 21828895 | 877889 | 22659148 | 20881374 | 1155923 |
# Label the twelve rows by calendar year (2004 through 2015) instead of 0..11,
# then display the resulting index (notebook cell output).
data1.index = pd.RangeIndex(start=2004, stop=2016)
data1.index
RangeIndex(start=2004, stop=2016, step=1)
# Blank out the 2014 and 2015 rows so that their feature values can be
# replaced by grey-model forecasts built from the 2004-2013 observations.
data1.loc[2014] = None
data1.loc[2015] = None
l = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
for i in l:
    # Element [0] of GM11's result is used as the forecast function.
    # `.values` replaces `.as_matrix()`, which pandas deprecated (see the
    # FutureWarning emitted by the original run).
    f = GM11(data1.loc[range(2004, 2014), i].values)[0]
    # data1 has 12 rows and the model was fitted on the first 10, so the two
    # forecast years need two DIFFERENT steps: len-1 for 2014, len for 2015.
    # Using len-1 for both filled 2014 and 2015 with the same number.
    # NOTE(review): assumes f takes a 1-based time step - verify in GM11.
    data1.loc[2014, i] = f(len(data1) - 1)
    data1.loc[2015, i] = f(len(data1))
    data1[i] = data1[i].round(2)
outputfile = './data1_GM11.xls'
# Restore the observed y values (all twelve years) from the raw data frame,
# since the 2014/2015 rows were blanked above.
y = list(data['y'].values)
data1['y'] = y
data1.to_excel(outputfile)
print('預測結果:', data1.loc[2014:2015, :])
預測結果: Unnamed: 0 year x1 x2 x3 x4 \
2014 NaN NaN 52340607.88 89248500.46 40098282.29 668.71
2015 NaN NaN 52340607.88 89248500.46 40098282.29 668.71
x5 x6 x7 x8 x9 x10 \
2014 12106141.48 21.42 17086599.52 892880.8 45963285.04 20247419.64
2015 12106141.48 21.42 17086599.52 892880.8 45963285.04 20247419.64
y
2014 1075045
2015 1155923
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
"""
#支援向量機迴歸預測模型
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVR
import matplotlib.pyplot as plt
from sklearn.metrics import explained_variance_score,\
mean_absolute_error,mean_squared_error,\
median_absolute_error,r2_score
# Load the table whose 2014/2015 feature values were filled by the grey model.
inputfile = './data1_GM11.xls'
data = pd.read_excel(inputfile)
data
feature = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10']
# Rows 0..11 are used for fitting (the file is read back with a default
# 0-based index, so these are all twelve rows of the printed result).
data_train = data.loc[range(0, 12)].copy()
# z-score standardisation with the training rows' own mean and std.
data_mean = data_train.mean()
data_std = data_train.std()
data_train = (data_train - data_mean) / data_std
# `.values` replaces `.as_matrix()`, which pandas deprecated (see the
# FutureWarnings emitted by the original run).
x_train = data_train[feature].values
y_train = data_train['y'].values
# NOTE(review): the original run emitted a ConvergenceWarning here; consider
# raising max_iter and fixing random_state - confirm desired behaviour first.
linearsvr = LinearSVR()
linearsvr.fit(x_train, y_train)
# Standardise every row with the training statistics, predict, then map the
# prediction back to the original scale of y.
x = ((data[feature] - data_mean[feature]) / data_std[feature]).values
data['y_pred'] = linearsvr.predict(x) * data_std['y'] + data_mean['y']
outputfile = './data1_GM11_revenue.xls'
data.to_excel(outputfile)
print('真實值與預測值分別為:\n', data[['y', 'y_pred']])
真實值與預測值分別為:
y y_pred
0 236416 2.367047e+05
1 268360 2.683600e+05
2 326556 3.274633e+05
3 373397 3.794810e+05
4 455820 4.558120e+05
5 596693 5.968044e+05
6 756412 7.534051e+05
7 732282 7.322462e+05
8 935248 9.158043e+05
9 1061594 1.061308e+06
10 1075045 1.155385e+06
11 1155923 1.155385e+06
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:17: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:18: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
"the number of iterations.", ConvergenceWarning)
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
# Draw observed and predicted values in two stacked subplots (blue circles
# for y, red stars for y_pred), ticking every second row on the x axis, and
# print the returned axes objects.
axes = data[['y', 'y_pred']].plot(
    subplots=True,
    style=['b-o', 'r-*'],
    xticks=data.index[::2],
)
print('預測圖:', axes)
預測圖: [<matplotlib.axes._subplots.AxesSubplot object at 0x0000022832C78808>
<matplotlib.axes._subplots.AxesSubplot object at 0x0000022832D26F48>]