
Linear Regression Analysis

Chapter 2

2.12

(1) Fit the model:

> library(openxlsx)                            #load the openxlsx package
> data = read.xlsx("22_data.xlsx",sheet = 2)   #read the data with read.xlsx
> x = data[,1]
> y = data[,2]
> res = lm(y~x)   #fit the simple linear regression model
> res             #print the result

Call:                
lm(formula = y ~ x)

Coefficients:
(Intercept)            x      #fitted regression model: y = -6.332 + 9.208 x
     -6.332        9.208  

> summary(res)     #print the coefficient estimates with their tests, plus the overall model summary

Call:
lm(formula = y ~ x)

Residuals:         #residual summary; the first quartile (1Q) and third quartile (3Q) have roughly the same magnitude, suggesting a fairly symmetric, bell-shaped distribution
    Min      1Q  Median      3Q     Max 
-2.5629 -1.2581 -0.2550  0.8681  4.0581 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -6.33209    1.67005  -3.792  0.00353 **       #point estimate of the intercept and its test
x            9.20847    0.03382 272.255  < 2e-16 ***      #point estimate of the slope and its test
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.946 on 10 degrees of freedom
Multiple R-squared:  0.9999,	Adjusted R-squared:  0.9999     #R-squared and adjusted R-squared
F-statistic: 7.412e+04 on 1 and 10 DF,  p-value: < 2.2e-16      #overall significance test of the model (F test)

(2) From the output above, the regressor is significant and the overall regression is significant.

(3) The management's claim is not supported: according to the fitted model, an increase of 1°F in average ambient temperature increases average monthly steam usage by about 9,208 lb, which falls short of 10,000 lb.
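
To make this quantitative, one can test H0: beta1 = 10 against H1: beta1 < 10 (the text reads the slope 9.208 as 9,208 lb, so the response is recorded in units of 1000 lb and management's 10,000 lb corresponds to a slope of 10). A minimal sketch using the fitted model res from part (1); the one-sided t test is an assumption, not part of the original post:

b1  = coef(summary(res))["x","Estimate"]    #estimated slope, 9.208
se1 = coef(summary(res))["x","Std. Error"]  #its standard error, 0.034
t0  = (b1 - 10)/se1                         #t statistic for H0: beta1 = 10
pt(t0, df = df.residual(res))               #one-sided p-value; a small value means the increase is significantly below 10,000 lb
confint(res, "x", level = 0.95)             #95% confidence interval for the slope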

(4) Construct a 99% prediction interval on steam usage in a month with an average ambient temperature of 58°F:

> library(openxlsx)
> data = read.xlsx("22_data.xlsx",sheet = 2)
> x = data[,1]
> y = data[,2]
> fun = function(x)   #function returning the fitted value
+ {
+   y = -6.332 + 9.208*x
+ }
> y_pred = fun(x)     #fitted values for all observations
> s_y0_pred = function(x0,x,y,n)  #function for the standard error of the estimate at x0
+ {
+   n = 12
+   y_pred = fun(x)
+   sse = sum((y_pred - y)*(y_pred - y))
+   se = sqrt(sse/(n-2))
+   se * sqrt(1/n + ((x0-mean(x))^2)/sum((x-rep(mean(x),n))^2))  #note: without a leading "1 +" inside sqrt() this is the s.e. of the mean response, not of a new observation
+ }
> x0 = 58 ; n = 12
> y0_pred = fun(x0) #predicted response at an ambient temperature of 58°F
> s = s_y0_pred(x0,x,y,n)
> print(c(y0_pred-qt(0.995,n-2)*s,y0_pred+qt(0.995,n-2)*s)) #print the interval
[1] 525.5666 529.8974
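
The interval can also be obtained directly with predict(); a minimal sketch reusing the model res fitted in part (1). Because the hand computation above omits the "1 +" term in the standard error, it actually yields a confidence interval on the mean response at 58°F; predict() with interval = "prediction" includes that term (and uses the unrounded coefficients), so its interval is wider.

predict(res, newdata = data.frame(x = 58), interval = "prediction", level = 0.99)  #99% prediction interval for a new observation
predict(res, newdata = data.frame(x = 58), interval = "confidence", level = 0.99)  #99% interval on the mean response, comparable to the printed result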

2.13

a. Draw the scatter plot

> data = read.xlsx("22_data.xlsx",sheet = 1)
> x = data[,2]
> y = data[,1]
> plot(x,y,main = "Scatter plot",xlab = "index",ylab = "days")
> abline(lm(y~x))

b. Estimate the prediction equation

> lm(y~x)

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
     -193.0         15.3 

The prediction equation is: y = -193.0 + 15.3x

c. Test for significance of regression

> summary(lm(y~x))

Call:
lm(formula = y ~ x)

Residuals:
   Min     1Q Median     3Q    Max 
-41.70 -21.54   2.12  18.56  36.42 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) -192.984    163.503  -1.180    0.258 #p-value greater than 0.05
x             15.296      9.421   1.624    0.127 #p-value greater than 0.05: no significant relationship between the regressor and the response

Residual standard error: 23.79 on 14 degrees of freedom
Multiple R-squared:  0.1585,	Adjusted R-squared:  0.09835 
F-statistic: 2.636 on 1 and 14 DF,  p-value: 0.1267

Based on these results, there is no significant relationship between the index and the number of days.

d. Compute and plot the 95% confidence band and 95% prediction band

> fm = lm(y~x)   #fit the model (needed by predict() below)
> sx = sort(x)
> #compute the confidence band
> conf = predict(fm,data.frame(x = sx),interval = "confidence")
> #compute the prediction band
> pred = predict(fm,data.frame(x=sx),interval = "prediction")
> plot(x,y,ylim = c(0,150),xlab = "index",ylab = "days",main = "95% prediction and confidence bands")
> abline(fm)
> lines(sx,conf[,2],col = "red")
> lines(sx,conf[,3],col = "red")
> lines(sx,pred[,2],col = "blue")
> lines(sx,pred[,3],col = "blue")

2.14

a. Scatter plot
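
No plotting code was shown for this part; a minimal sketch, assuming x (ratio) and y (viscosity) have already been read from the data file as in the other exercises:

plot(x, y, main = "Scatter plot", xlab = "ratio", ylab = "viscosity")
abline(lm(y ~ x))  #overlay the fitted line, as in Exercise 2.13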

b. Estimate the prediction equation

> fm = lm(y~x)
> fm

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
     0.6714      -0.2964 

The prediction equation is: y = 0.6714 - 0.2964x

c. Data analysis

> summary(fm)

Call:
lm(formula = y ~ x)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.20464 -0.10634  0.02196  0.08527  0.20643 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)   0.6714     0.1595   4.209  0.00563 **
x            -0.2964     0.2314  -1.281  0.24754    #p-value greater than 0.05: this regressor is not significant
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.15 on 6 degrees of freedom
Multiple R-squared:  0.2147,	Adjusted R-squared:  0.08382 
F-statistic:  1.64 on 1 and 6 DF,  p-value: 0.2475   #the overall model is not significant

d. Compute and plot the 95% confidence band and 95% prediction band

> plot(x,y,main = "Scatter plot",xlab = "ratio",ylab = "viscosity",ylim = c(-0.1,1))
> sx = sort(x)
> conf = predict(fm,data.frame(x = sx),interval = "confidence")
> pred = predict(fm,data.frame(x = sx),interval = "prediction")
> abline(fm)
> lines(sx,conf[,2],col = "red") #lower confidence limit
> lines(sx,conf[,3],col = "red") #upper confidence limit
> lines(sx,pred[,2],col = "blue") #lower prediction limit
> lines(sx,pred[,3],col = "blue") #upper prediction limit

2.15

a. Estimate the prediction equation

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
   1.281511    -0.008758

The prediction equation is: y = 1.281511 - 0.008758x

b. Analyze the model in full

> fm = lm(y~x)
> summary(fm)

Call:
lm(formula = y ~ x)

Residuals:
      Min        1Q    Median        3Q       Max 
-0.043955 -0.035863 -0.009305  0.019900  0.069559 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  1.2815107  0.0468683   27.34 1.58e-07 *** 
x           -0.0087578  0.0007284  -12.02 2.01e-05 *** #by its p-value, the regressor (temperature) is highly significant
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.04743 on 6 degrees of freedom
Multiple R-squared:  0.9602,	Adjusted R-squared:  0.9535 
F-statistic: 144.6 on 1 and 6 DF,  p-value: 2.007e-05  #by its p-value, the overall regression model is significant

c. Plot the 95% confidence and prediction bands
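
No code was included for this part; a minimal sketch following the same pattern as Exercise 2.14(d), assuming fm, x, and y from part (b) are still in the workspace:

sx = sort(x)
conf = predict(fm,data.frame(x = sx),interval = "confidence")
pred = predict(fm,data.frame(x = sx),interval = "prediction")
plot(x,y,main = "95% confidence and prediction bands")
abline(fm)
lines(sx,conf[,2],col = "red")   #confidence band
lines(sx,conf[,3],col = "red")
lines(sx,pred[,2],col = "blue")  #prediction band
lines(sx,pred[,3],col = "blue")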

2.16

First, draw the scatter plot:
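
The plotting code itself was not included; a minimal sketch, assuming x and y have been loaded as in the neighbouring exercises:

plot(x, y)         #scatter plot
abline(lm(y ~ x))  #fitted line showing the linear trend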

The scatter plot shows a clear linear relationship between capacity and pressure, so we fit a simple linear regression model:

> fm = lm(y~x)
> fm

Call:
lm(formula = y ~ x)

Coefficients:
(Intercept)            x  
   -290.707        2.346  

The estimated prediction model is: y = -290.707 + 2.346x

Next, test the model:

> summary(fm)

Call:
lm(formula = y ~ x)

Residuals:
    Min      1Q  Median      3Q     Max 
-4.3276 -0.9227  0.0773  1.2676  2.9577 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) -2.907e+02  1.355e+00  -214.6   <2e-16 *** 
x            2.346e+00  4.007e-04  5855.4   <2e-16 *** #the regressor is highly significant
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.741 on 31 degrees of freedom
Multiple R-squared:      1,	Adjusted R-squared:      1 
F-statistic: 3.429e+07 on 1 and 31 DF,  p-value: < 2.2e-16 #the overall regression model is highly significant

2.17

> x = data[,2]
> y = data[,1]
> n = length(x)
> plot(x,y)
> 
> fm = lm(y~x) #simple linear regression model
> abline(fm)
> coef(fm)
(Intercept)           x 
 163.930734    1.579647 
> summary(fm) 

Call:
lm(formula = y ~ x)

Residuals:
     Min       1Q   Median       3Q      Max 
-1.41483 -0.91550 -0.05148  0.76941  2.72840 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 163.9307     2.6551   61.74  < 2e-16 ***
x             1.5796     0.1051   15.04 1.88e-10 ***
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.291 on 15 degrees of freedom
Multiple R-squared:  0.9378,	Adjusted R-squared:  0.9336 
F-statistic:   226 on 1 and 15 DF,  p-value: 1.879e-10

> anova(fm) #analysis of variance
Analysis of Variance Table

Response: y
          Df Sum Sq Mean Sq F value    Pr(>F)    
x          1 376.92  376.92  226.04 1.879e-10 ***
Residuals 15  25.01    1.67                      
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> 
> rm(list = ls())

2.18

library(openxlsx)
data = read.xlsx("2.18.xlsx",sheet = 1)
x = data[,2]
y = data[,3]
n = length(x)
plot(x,y)

fm = lm(y~x) #simple linear regression
coef(fm) #print the regression coefficients

summary(fm)
anova(fm)

#construct the 95% confidence and prediction bands for these data
sx = sort(x)
conf = predict(fm,data.frame(x = sx),interval = "confidence")
pred = predict(fm,data.frame(x = sx),interval = "prediction")
plot(x,y,main = "95% confidence and prediction bands",xlab = "amount spent",ylab = "impressions returned per week",ylim=c(-100,200)) #draw the titled scatter plot before adding the bands
abline(fm)
lines(sx,conf[,2],col = 'red')
lines(sx,conf[,3],col = 'red')
lines(sx,pred[,2],col = 'blue')
lines(sx,pred[,3],col = 'blue')

rm(list = ls())

Chapter 3

3.8

#a. Fit a multiple regression model relating CO2 product y to total solvent x6 and hydrogen consumption x7

library(openxlsx)
data = read.xlsx("3.8.xlsx",sheet = 1)
data
y = data[,1]      #response variable
x = data[,c(7,8)] #regressors
fm = lm(y~.,x)    #multiple linear regression
summary(fm)
anova(fm)   #test for significance of regression
summary(fm) #d
confint(fm) #e
x6 = data[,7]
fm1 = lm(y~x6)
summary(fm1)
anova(fm1)
confint(fm1,level = 0.95)
rm(list = ls())

3.9

library(openxlsx)
data = read.xlsx("3.9.xlsx",sheet = 1)
y = data[,1]
x = data[,c(2,5)]

#a. Fit the multiple regression model
fm = lm(y~.,x)
coef(fm)
#b,c Test for significance of regression
anova(fm)
summary(fm)

#e
#check for potential multicollinearity
r2 = 0.6367
vif = 1/(1-r2)
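# The value r2 = 0.6367 above is hard-coded; presumably it is the R^2 from
# regressing one regressor on the other. A small sketch of computing it
# directly (an assumption, not part of the original post), plus the same
# check via car::vif() if that package is available:
r2_check = summary(lm(x[,1] ~ x[,2]))$r.squared  #R^2 of the auxiliary regression
1/(1 - r2_check)                                 #VIF from the auxiliary regression
#library(car); vif(fm)                           #reports the VIF of each regressor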

rm(list = ls())

3.10

> #3.10
> library(openxlsx)
> data = read.xlsx("3.10.xlsx",sheet = 1)
> #a
> y = data[,6]
> x = data[,c(1:5)]
> fm = lm(y~.,x)
> coef(fm)
(Intercept)     Clarity       Aroma        Body      Flavor 
  3.9968648   2.3394535   0.4825505   0.2731612   1.1683238 
   Oakiness 
 -0.6840102 
> #b,c
> summary(fm)

Call:
lm(formula = y ~ ., data = x)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.85552 -0.57448 -0.07092  0.67275  1.68093 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3.9969     2.2318   1.791 0.082775 .  
Clarity       2.3395     1.7348   1.349 0.186958    
Aroma         0.4826     0.2724   1.771 0.086058 .  
Body          0.2732     0.3326   0.821 0.417503    
Flavor        1.1683     0.3045   3.837 0.000552 ***
Oakiness     -0.6840     0.2712  -2.522 0.016833 *  
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.163 on 32 degrees of freedom
Multiple R-squared:  0.7206,	Adjusted R-squared:  0.6769 
F-statistic: 16.51 on 5 and 32 DF,  p-value: 4.703e-08

> #d
> xx = data[,c(2,4)]
> fm1 = lm(y~.,xx)
> summary(fm1)

Call:
lm(formula = y ~ ., data = xx)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.19048 -0.60300 -0.03203  0.66039  2.46287 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   4.3462     1.0091   4.307 0.000127 ***
Aroma         0.5180     0.2759   1.877 0.068849 .  
Flavor        1.1702     0.2905   4.027 0.000288 ***
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.229 on 35 degrees of freedom
Multiple R-squared:  0.6586,	Adjusted R-squared:  0.639 
F-statistic: 33.75 on 2 and 35 DF,  p-value: 6.811e-09

> AIC(fm) #the preferred model is the one with the smaller AIC
[1] 126.7552
> AIC(fm1)
[1] 128.3761
> #e
> conf = confint(fm)
> conf1 = confint(fm1)
> conf = as.matrix(conf)
> conf1 = as.matrix(conf1)
> 
> conf[5,2]-conf[5,1]
[1] 1.240414
> conf1[3,2]-conf[3,1]  #note: the intended comparison was probably conf1[3,2]-conf1[3,1]
[1] 1.83241
> 
> rm(list = ls())
> 

3.11

> #3.11
> library(openxlsx)
> data = read.xlsx("3.11.xlsx",sheet = 1)
> y = data[,6]
> x = data[,c(1:5)]
> #a
> fm = lm(y~.,x)
> coef(fm)
  (Intercept)            x1            x2            x3 
 5.207905e+01  5.555556e-02  2.821429e-01  1.250000e-01 
           x4            x5 
 1.776357e-16 -1.606498e+01 
> #b,c
> summary(fm)

Call:
lm(formula = y ~ ., data = x)

Residuals:
    Min      1Q  Median      3Q     Max 
-12.250  -4.438   0.125   5.250   9.500 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  5.208e+01  1.889e+01   2.757 0.020218 *  
x1           5.556e-02  2.987e-02   1.860 0.092544 .  
x2           2.821e-01  5.761e-02   4.897 0.000625 ***
x3           1.250e-01  4.033e-01   0.310 0.762949    
x4           1.776e-16  2.016e-01   0.000 1.000000    
x5          -1.606e+01  1.456e+00 -11.035  6.4e-07 ***
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 8.065 on 10 degrees of freedom
Multiple R-squared:  0.9372,	Adjusted R-squared:  0.9058 
F-statistic: 29.86 on 5 and 10 DF,  p-value: 1.055e-05

> #d
> xx = data[,c(2,5)]
> fm1 = lm(y~.,xx)
> summary(fm1)

Call:
lm(formula = y ~ ., data = xx)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.375  -4.188  -0.875   3.438  12.625 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  80.13461    5.69146  14.080 3.01e-09 ***
x2            0.28214    0.05883   4.796 0.000349 ***
x5          -16.06498    1.48659 -10.807 7.26e-08 ***
---
Signif. codes:  
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 8.236 on 13 degrees of freedom
Multiple R-squared:  0.9149,	Adjusted R-squared:  0.9018 
F-statistic: 69.89 on 2 and 13 DF,  p-value: 1.107e-07

> AIC(fm)
[1] 118.6885
> AIC(fm1)
[1] 117.5552
> 
> #e
> confint(fm)
                   2.5 %      97.5 %
(Intercept)   9.99688896  94.1612109
x1           -0.01100273   0.1221138
x2            0.15378045   0.4105053
x3           -0.77353688   1.0235369
x4           -0.44926844   0.4492684
x5          -19.30879739 -12.8211665
> confint(fm1) #temperature: x2
                  2.5 %      97.5 %
(Intercept)  67.8389462  92.4302647
x2            0.1550559   0.4092298
x5          -19.2765650 -12.8533989
> 

  

Chapter 4

Example 4.1: Using the data of Example 3.1, output a table of the residuals, standardized residuals, studentized residuals, PRESS residuals, and externally studentized residuals.

library(openxlsx)

#Example 4.1---------------------------------------------------------
#prepare the data
data = read.xlsx("3.1.xlsx",sheet = 1)
data = data[,c(2,3,4)]
names(data)=c("time","cases","distance")
y = data$time
x1 = data$cases
x2 = data$distance

#linear regression
fm = lm(y~x1+x2)

#residuals
ei = residuals(fm) 
View(ei)

#standardized residuals (method 1)
di = rstandard(fm)
View(di)

#function to compute MSE
mse = function(ei,p) #ei: residual vector; p: number of regression parameters
{
  n = length(ei)
  sse = sum(ei**2)
  mse = sse/(n-p)
  return(mse)
}
di_ = ei/sqrt(mse(ei,3)) #standardized residuals (method 2)
View(di_)

#studentized residuals (method 1)
ri = rstudent(fm)
View(ri)

#compute the hat matrix and extract its diagonal elements
H = function(X) #X: the model (design) matrix
{
  h = X%*%solve(t(X)%*%X)%*%t(X)
  hii = diag(h)
  return(hii)
}
X = cbind(1,x1,x2)
hii = H(X) #compute the hii values
View(hii)
ri_ = ei/sqrt(mse(ei,3)*(1-hii)) #studentized residuals (method 2)
View(ri_)

#PRESS residuals
e_i = ei/(1-hii) #compute the PRESS residuals e(i)
View(e_i)

#externally studentized residuals
ti = function(ei,X) #inputs: the residual vector and the model matrix
{
  
  p = ncol(X) #number of regression parameters
  n = length(ei) #number of observations
  hii = H(X)  #diagonal elements of the hat matrix
  s2_i = ((n-p)*mse(ei,p) -(ei**2)/(1-hii)) / (n-p-1) #compute S(i)^2
  ans = ei / sqrt(s2_i*(1-hii))
  return(ans)
}
ti = ti(ei,X) #note: this overwrites the helper function ti with its result; re-run the definition above before calling ti() again (e.g. in Example 4.7)
View(ti)

#function to compute the PRESS statistic
press = function(ei,X)
{
  hii = H(X)
  res = sum((ei/(1-hii))**2)
  return(res)  #return the PRESS statistic explicitly
}
Press = (ei/(1-hii))**2
View(Press)
PRESS = press(ei,X) #output the PRESS statistic

#write all of the residual quantities to a spreadsheet
Num = seq(1,length(ei))
mydata = cbind(Num,ei,di_,ri_,hii,e_i,ti,Press)
class(mydata)
View(mydata)
write.xlsx(mydata,"C:\\Users\\86130\\Desktop\\mydata.xlsx")

  

#Example 4.2
ti                 #externally studentized residuals
View(ti)
n = length(ti)     #number of observations
order = rank(ti)   #rank() returns the rank of each ti in ascending order
Pi = (order-1/2)/n #cumulative probability
plot(ti,Pi,xlim=c(-3,5))  #draw the normal probability plot
fm_tP = lm(Pi~ti)  #regress Pi on ti
abline(fm_tP)      #add the fitted line
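# Base R provides this diagnostic directly; a minimal equivalent sketch
# (note the axes are transposed relative to the manual probability plot above):
qqnorm(ti)  #normal Q-Q plot of the externally studentized residuals
qqline(ti)  #reference line through the quartiles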

#Example 4.3
#plot the residuals against the fitted values
plot(fitted(fm),ti) #fitted(fm) returns the fitted values of model fm
abline(h = 0) #add the horizontal line y = 0

#Example 4.4
#plot the residuals against each regressor
par(mfrow =c(1,2))
plot(x1,ti,xlab = "cases",ylab = "externally studentized residuals")
abline(h=0)       #h: horizontal line; v: vertical line
plot(x2,ti,xlab = "distance",ylab = "externally studentized residuals")
abline(h=0)

#Example 4.5
#partial regression (added-variable) plots
#partial regression plot for regressor x1
lm.y_x2 = lm(y~x2)
lm.x1_x2 = lm(x1~x2)
plot(resid(lm.x1_x2),resid(lm.y_x2),xlab = "cases",ylab = "time")
#partial regression plot for regressor x2
lm.y_x1 = lm(y~x1)
lm.x2_x1 = lm(x2~x1)
plot(resid(lm.x2_x1),resid(lm.y_x1),xlab = "distance",ylab = "time",pch = 10)
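# If the car package is installed, both added-variable (partial regression)
# plots can be drawn in one call; a small sketch, not part of the original example:
#library(car)
#avPlots(fm)  #added-variable plots for x1 and x2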

#Example 4.6
#compute the prediction R^2 based on PRESS
R_pred = function(X,y)
{
  hii = H(X)
  ei = resid(lm(y~X[,2]+X[,3]))
  PRESS = sum((ei/(1-hii))**2)
  sst = sum((y-mean(y))**2)
  ans = 1-PRESS/sst
  return(ans)
}
R_pred(X,y)

#Example 4.7
data = read.xlsx("2.1.xlsx",sheet = 1)
names(data) = c("order","y","x")
x = data$x
y = data$y
X = cbind(1,x)
fm = lm(y~x)
#draw the normal probability plot
plot_ZP = function(ti) #input: the externally studentized residuals
{
  n = length(ti)
  order = rank(ti)   #rank of each t(i) in ascending order
  Pi = (order-1/2)/n #cumulative probability
  plot(ti,Pi,xlim=c(-3,3),xlab = "studentized residual",ylab = "percent")  #draw the normal probability plot
}
ei = resid(fm)
ti = ti(ei,X)       #compute the externally studentized residuals ti
plot_ZP(ti)         #draw the normal probability plot
plot(fitted(fm),ti) #plot the residuals against the fitted values
abline(h = 0)
#redo the plots with observations 5 and 6 removed
data = data[-c(5,6),]
x = data$x
y = data$y
X = cbind(1,x)
fm1 = lm(y~x)   #linear model
ei = resid(fm1) 
ti = ti(ei,X)   #compute the externally studentized residuals ti
plot(fitted(fm1),ti) #plot the residuals against the fitted values
abline(h = 0)

#Example 4.8
data = read.xlsx("4.8.xlsx",sheet = 1)
x = data$x
y = data$y
fm = lm(y~x)  #linear regression
a = anova(fm) #analysis of variance
sst = sum(a[2]) #total sum of squares
ssg = a[1,2]    #regression sum of squares
sse = a[2,2]    #residual sum of squares
level_x = c(table(x)>1) #identify which x values are repeated
#perform the lack-of-fit test
library(rsm)     #load the rsm package for the lack-of-fit test
lm.rsm<-rsm(y~FO(x))
loftest(lm.rsm)  #call the lack-of-fit test function loftest
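# An equivalent lack-of-fit test in base R (an alternative to rsm::loftest,
# not part of the original example): compare the straight-line fit with the
# saturated model that fits a separate mean at each distinct x value.
full = lm(y ~ factor(x))  #pure-error ("saturated") model
anova(fm, full)           #F test for lack of fit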

rm(list = ls())

  

Example 4.10: Estimating pure error from near neighbours

#Example 4.10
data = read.xlsx("3.1.xlsx",sheet = 1) #import the data
names(data)=c("order","time","cases","distance")
y = data$time        #prepare the data
x1 = data$cases
x2 = data$distance
fm = lm(y~x1 + x2)   #linear regression
coef(fm)
b_cases = coef(fm)[2]    #beta1
b_distance = coef(fm)[3] #beta2
y_pred = predict(fm) #fitted values
ei = resid(fm)       #residuals
new_data = cbind(data,y_pred,ei)  #build the working data set
new_data = new_data[order(new_data$y_pred),] #sort by fitted value (ascending)
a = anova(fm)        #analysis of variance
sse = a[3,2]         #residual sum of squares
mse = a[3,3]         #residual mean square

#compute Dii'
Di_i = function(i,i_,mse,beta1,beta2,new_data) #i: the i-th point; i_: the i_-th point; new_data: the data set
{
  one = beta1*(new_data$cases[i]-new_data$cases[i_])/sqrt(mse)
  two = beta2*(new_data$distance[i]-new_data$distance[i_])/sqrt(mse)
  ans = one**2 + two**2
  return(ans)
}

#data frame to store the results
σ_ans = data.frame(
  i = numeric(0),        #observation i
  i_ = numeric(0),       #observation i_
  Dii = numeric(0),      #Di_i
  delta = numeric(0)     #|ei - ei_|
)
  
#for each pair of points k positions apart (k = 1,...,4), compute i, i_, Di_i, and the Delta residual
for (k in c(1:4))
{
  for (i in c(1:24))
  {
    if (i+k>25)
      break
    D = Di_i(i,i+k,mse,b_cases,b_distance,new_data) #Di_i for the pair of points k apart
    E = abs(new_data$ei[i]-new_data$ei[i+k])        #absolute residual difference Delta for the pair
    another = data.frame(
      i = new_data$order[i],
      i_ = new_data$order[i+k],
      Dii = D,
      delta = E
    )
    σ_ans = rbind(σ_ans,another) #append the result to the data frame
  }
}
names(σ_ans) = c("i","i_","Dii^2","Delta")  #rename the columns of the final data frame
σ_ans = σ_ans[order(σ_ans$`Dii^2`),] #sort the data frame by Dii^2
row.names(σ_ans) = c(1:90)         #renumber the rows

#compute the cumulative standard deviation estimate
std = numeric(0)                   #stores the cumulative estimates
sum_Delta = 0                      #running sum of the Delta residuals
for (i in 1:90)
{
  sum_Delta = sum_Delta + σ_ans$Delta[i] #estimate: 0.886/m * sum(Delta)
  res = 0.886/i*sum_Delta
  std[i] = res
}
σ_ans = cbind(std,σ_ans)

  

4.16

####################### Self-written helper functions: run this block once, then call them directly #######################
#function to compute MSE
mse = function(ei,p) #ei: residual vector; p: number of regression parameters
{
  n = length(ei)
  sse = sum(ei**2)
  mse = sse/(n-p)
  return(mse)
}

#compute the hat matrix and extract its diagonal elements
H = function(X) #X: the model (design) matrix
{
  h = X%*%solve(t(X)%*%X)%*%t(X)
  hii = diag(h)
  return(hii)
}

#externally studentized residuals
ti = function(ei,X) #inputs: the residual vector and the model matrix
{
  p = ncol(X) #number of regression parameters
  n = length(ei) #number of observations
  hii = H(X)  #diagonal elements of the hat matrix
  s2_i = ((n-p)*mse(ei,p) -(ei**2)/(1-hii)) / (n-p-1) #compute S(i)^2
  ans = ei / sqrt(s2_i*(1-hii))
  return(ans)
}

#function to compute the PRESS statistic
press = function(ei,X) #X: the design matrix of the regressors
{
  hii = H(X)
  res = sum((ei/(1-hii))**2)
  return(res)  #return the PRESS statistic explicitly
}

#compute the prediction R^2 based on PRESS
R_pred = function(X,y)
{
  hii = H(X)
  ei = resid(lm(y~X[,2]+X[,3]))
  PRESS = sum((ei/(1-hii))**2)
  sst = sum((y-mean(y))**2)
  ans = 1-PRESS/sst
  return(ans)
}

#draw the normal probability plot
plot_ZP = function(ti) #input: the externally studentized residuals
{
  n = length(ti)
  order = rank(ti)   #rank of each t(i) in ascending order
  Pi = (order-1/2)/n #cumulative probability
  plot(ti,Pi,xlim=c(-3,3),xlab = "studentized residual",ylab = "percent")  #draw the normal probability plot
}

#lack-of-fit test (example call; requires x and y to be defined)
library(rsm)     #load the rsm package for the lack-of-fit test
lm.rsm<-rsm(y~FO(x))
loftest(lm.rsm)  #call the lack-of-fit test function loftest

#compute Dii'
Di_i = function(i,i_,mse,beta1,beta2,new_data) #i: the i-th point; i_: the i_-th point; new_data: the data set
{
  one = beta1*(new_data$cases[i]-new_data$cases[i_])/sqrt(mse)
  two = beta2*(new_data$distance[i]-new_data$distance[i_])/sqrt(mse)
  ans = one**2 + two**2
  return(ans)
}

  

#4.16
#a. Normal probability plot of the residuals
data = read.xlsx('3.12.xlsx',sheet = 1) #import the data
y = data$y
x1 = data$x1
x2 = data$x2
X = cbind(1,x1,x2) #build the design matrix
fm = lm(y~x1+x2)   #linear regression model
ei = resid(fm)     #compute the residuals
ti = ti(ei,X)      #ti(): helper above for the externally studentized residuals
plot_ZP(ti)        #plot_ZP(): helper above for the normal probability plot
#Why write helper functions?
#1. These exercises repeat the same code operations.
#2. Retyping the code each time just to get familiar with it is unnecessary; it will be forgotten anyway.
#The normal probability plot shows one outlying point; order(ti) identifies observation 28 as having the smallest residual

#b. Plot of the residuals against the fitted values
plot(fitted(fm),ti)
#the residuals are contained in a horizontal band, so the model has no obvious deficiencies

#c.
#PRESS statistic of model fm
press_fm = press(ei,X)
#PRESS statistic of the reduced model fm1
fm1 = lm(y~x2)
ei = resid(fm1)
X = cbind(1,x2)
press_fm1 = press(ei,X) #press(): helper above for the PRESS statistic
#choose the model with the smaller PRESS statistic
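# A small usage sketch: print the two PRESS statistics side by side and keep
# the model with the smaller value.
c(press_fm, press_fm1)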