diff --git a/M1/General Linear Models/Chapter 1/Code_Chapter1.Rmd b/M1/General Linear Models/Chapter 1/Code_Chapter1.Rmd new file mode 100644 index 0000000..7212c63 --- /dev/null +++ b/M1/General Linear Models/Chapter 1/Code_Chapter1.Rmd @@ -0,0 +1,335 @@ +--- +title: "Linear Models and Their Generalizations" +subtitle : "Code Chapter 1: Linear gaussian regression" +author: "Katia Meziani" +output: +# pdf_document: +# toc: yes +html_document: +code_folding: hide +css: ./style.css +df_print: paged +highlight: tango +number_sections: no +theme: flatly +toc: yes +toc_float: +collapsed: no + +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(warning = FALSE, message = FALSE) +``` + + + + + + + + + +# Motivations + + + **The Ozone Example** (With **Ⓡ** ) + + +This dataset (Air Breizh, Summer 2001) contains 112 observations and 13 variables. + +```{r} +library(rmarkdown) +ozone1 <- read.table("ozone1.txt", header = TRUE, sep = " ", dec = ".") +paged_table(ozone1) +``` + +Let's correct the incorrect declaration of variables: + + + +```{r} +library(dplyr) +ozone1$Date <- as.Date(as.factor(ozone1$Date), format = "%Y%m%d") +ozone1$vent <- as.factor(ozone1$vent) +ozone1$pluie <- as.factor(ozone1$pluie) +ozone1 <- ozone1 %>% mutate(across(where(is.character), as.numeric)) +ozone1 <- ozone1 %>% mutate(across(where(is.integer), as.numeric)) +str(ozone1) +``` + + + + + + + +# 1. Linear regression model + +## Simple linear model + + + + +Let's plot this line (here with **Ⓡ** ) is called the least squares line. + +```{r, out.width = "60%", fig.align = 'center', message = FALSE, warning = FALSE} +library(ggplot2) +library(plotly) +p1 <- ggplot(ozone1) + + aes(x = T12, y = maxO3) + + geom_point(col = 'red', size = 0.5) + + geom_smooth(method = "lm", se = FALSE) +ggplotly(p1) +``` + + +With **Ⓡ** , we can compute $\widehat {\boldsymbol{\beta}}=(\widehat{\beta}_0, \widehat{\beta}_1)$ + +```{r} +library(kableExtra) +mod <- lm(maxO3 ~ T12, data = ozone1) +mod$coefficients %>% kbl(col.names = "Estimation") +``` + + + + + + + + + + + +# 2. Ordinary least squares estimator (OLSE) + + + + + + + +## Declaration of the model with Under **Ⓡ** + +Let's revisit the ozone example and, for this chapter, select only the numerical regressors with **Ⓡ** + +```{r} +names(ozone1) +Ozone <- ozone1[, 2:12] +names(Ozone) +``` + + + +Under **Ⓡ** ,we can declare the linear model in two ways: either by naming the variables or by using a `.`. + +```{r} +mod <- lm(maxO3 ~ T9 + + T12 + + T15 + + Ne9 + + Ne12 + + Ne15 + + Vx9 + + Vx12 + + Vx15 + + maxO3v, data = Ozone) +mod <- lm(maxO3 ~ ., data = Ozone) +``` + +Now, let's display the estimated values $\widehat{\boldsymbol{\beta}}$ of $\boldsymbol{\beta}$. + +```{r} +coefficients(mod) +``` + + + + +# 3. Residuals analysis + + + +```{r} +summary(mod$residuals) +``` + + + + + +We can display Residual standard error with **Ⓡ** +```{r} +cat("Residual standard error:", summary(mod)$sigma, "on", summary(mod)$df[2], "degrees of freedom\n") +``` + + + + + + + + + + + +# 4. Inference in the Gaussian linear regression model + + +## Confidence intervals/regions + + + + + + + +**Ⓡ** Display the confidence interval for $\beta_j$ + + +```{r} +mod <- lm(maxO3 ~ ., data = Ozone) +cbind(mod$coefficient, confint(mod, level = 0.95)) +``` + + +**Ⓡ** Display the confidence interval for +$\sigma^2$ + + +```{r} + +n <- length(Ozone$maxO3) +p <- length(coef(mod)) +alpha <- 0.05 + +sigma2_hat <- summary(mod)$sigma^2 + +IC_sigma2 <- sigma2_hat * c((n - p) / qchisq(1 - alpha / 2, n - p), + (n - p) / qchisq(alpha / 2, n - p)) + +cat("estimation of sigma : ", sigma2_hat, "\n") +cat("Intervalle de confiance pour sigma^2 (95%) : [", IC_sigma2[1], ", ", IC_sigma2[2], "]\n") +``` + + + + + + + + + + + + + + +# 5. Hypothesis testing + +## Test: $\mathscr{H}_0:\,c^\top{\beta}=a$ + + + + +Declare our linear model and display with **Ⓡ** + +```{r} +library(kableExtra) +mod <- lm(maxO3 ~ ., data = Ozone) +summary(mod) %>% coefficients() %>% kbl() +``` + + + + +## Global Fisher test + + + + + +Under **Ⓡ** , the `summary` displays the Global Fisher test + + +```{r, eval=FALSE} +F-statistic: 32.67 on 10 and 101 DF, p-value: < 2.2e-16 +``` + + + +## Fisher test for nested models + + + + +**Ⓡ** Consider 2 nested models +$$ +\left\{\begin{array}{l} +\text{mod}_1:\, \text{maxO3=T9+Ne9+Vx9+maxO3v}+\epsilon,\\ +\text{maxO3=T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v}+\epsilon +\end{array}\right. +$$ +and display the nested Fisher test +$$ +\left\{\begin{array}{l} +\mathscr{H}_0: \, \text{mod}_1 \text{ is adequat}\\ +\mathscr{H}_1: \, \text{mod}_2 \text{ is adequat} +\end{array}\right. +$$ + +```{r} +mod_1<-lm(maxO3~T9+Ne9+Vx9+maxO3v,data=Ozone) +mod_2<-lm(maxO3~T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v,data=Ozone) + +anova(mod_1,mod_2) +``` + + + + +# 6. $R^2$-coefficient + + + + +Let display these coefficient with **Ⓡ** + +```{r} +cat("Multiple R-squared:", summary(mod)$r.squared, ", Adjusted R-squared:", summary(mod)$adj.r.squared, "\n") + +``` + + + + + + +# 7. Prediction + + +## 7.1. Fitted values + + + + +Note that with **Ⓡ** , we can display the fitted values $\widehat Y_i$ + +```{r} +haty<-predict(mod,data=Ozone) +# or equivalently +#haty<-mod$fitted.values +y<-Ozone$maxO3 +cbind.data.frame(y,haty)%>%rmarkdown::paged_table() +``` + + + + + + + + + + diff --git a/M1/General Linear Models/Chapter 1/Code_Chapter1.html b/M1/General Linear Models/Chapter 1/Code_Chapter1.html new file mode 100644 index 0000000..d2a8a38 --- /dev/null +++ b/M1/General Linear Models/Chapter 1/Code_Chapter1.html @@ -0,0 +1,4299 @@ + + + + +
+ + + + + + + + + +The Ozone Example +(With Ⓡ )
+This dataset (Air Breizh, Summer 2001) contains 112 observations and +13 variables.
+library(rmarkdown)
+ozone1 <- read.table("ozone1.txt", header = TRUE, sep = " ", dec = ".")
+paged_table(ozone1)
+Let’s correct the incorrect declaration of variables:
+library(dplyr)
+ozone1$Date <- as.Date(as.factor(ozone1$Date), format = "%Y%m%d")
+ozone1$vent <- as.factor(ozone1$vent)
+ozone1$pluie <- as.factor(ozone1$pluie)
+ozone1 <- ozone1 %>% mutate(across(where(is.character), as.numeric))
+ozone1 <- ozone1 %>% mutate(across(where(is.integer), as.numeric))
+str(ozone1)
+## 'data.frame': 112 obs. of 14 variables:
+## $ Date : Date, format: "2001-06-01" "2001-06-02" ...
+## $ maxO3 : num 87 82 92 114 94 80 79 79 101 106 ...
+## $ T9 : num 15.6 17 15.3 16.2 17.4 17.7 16.8 14.9 16.1 18.3 ...
+## $ T12 : num 18.5 18.4 17.6 19.7 20.5 19.8 15.6 17.5 19.6 21.9 ...
+## $ T15 : num 18.4 17.7 19.5 22.5 20.4 18.3 14.9 18.9 21.4 22.9 ...
+## $ Ne9 : num 4 5 2 1 8 6 7 5 2 5 ...
+## $ Ne12 : num 4 5 5 1 8 6 8 5 4 6 ...
+## $ Ne15 : num 8 7 4 0 7 7 8 4 4 8 ...
+## $ Vx9 : num 0.695 -4.33 2.954 0.985 -0.5 ...
+## $ Vx12 : num -1.71 -4 1.879 0.347 -2.954 ...
+## $ Vx15 : num -0.695 -3 0.521 -0.174 -4.33 ...
+## $ maxO3v: num 84 87 82 92 114 94 80 99 79 101 ...
+## $ vent : Factor w/ 4 levels "Est","Nord","Ouest",..: 2 2 1 2 3 3 3 2 2 3 ...
+## $ pluie : Factor w/ 2 levels "Pluie","Sec": 2 2 2 2 2 1 2 2 2 2 ...
+Let’s plot this line (here with Ⓡ ) is called the +least squares line.
+library(ggplot2)
+library(plotly)
+p1 <- ggplot(ozone1) +
+ aes(x = T12, y = maxO3) +
+ geom_point(col = 'red', size = 0.5) +
+ geom_smooth(method = "lm", se = FALSE)
+ggplotly(p1)
+
+
+With Ⓡ , we can compute +\(\widehat +{\boldsymbol{\beta}}=(\widehat{\beta}_0, \widehat{\beta}_1)\)
+library(kableExtra)
+mod <- lm(maxO3 ~ T12, data = ozone1)
+mod$coefficients %>% kbl(col.names = "Estimation")
+| + | ++Estimation + | +
|---|---|
| +(Intercept) + | ++-27.419636 + | +
| +T12 + | ++5.468685 + | +
Let’s revisit the ozone example and, for this chapter, select only +the numerical regressors with Ⓡ
+names(ozone1)
+## [1] "Date" "maxO3" "T9" "T12" "T15" "Ne9" "Ne12" "Ne15"
+## [9] "Vx9" "Vx12" "Vx15" "maxO3v" "vent" "pluie"
+Ozone <- ozone1[, 2:12]
+names(Ozone)
+## [1] "maxO3" "T9" "T12" "T15" "Ne9" "Ne12" "Ne15" "Vx9"
+## [9] "Vx12" "Vx15" "maxO3v"
+Under Ⓡ ,we can declare
+the linear model in two ways: either by naming the variables or by using
+a ..
mod <- lm(maxO3 ~ T9 +
+ T12 +
+ T15 +
+ Ne9 +
+ Ne12 +
+ Ne15 +
+ Vx9 +
+ Vx12 +
+ Vx15 +
+ maxO3v, data = Ozone)
+mod <- lm(maxO3 ~ ., data = Ozone)
+Now, let’s display the estimated values \(\widehat{\boldsymbol{\beta}}\) of \(\boldsymbol{\beta}\).
+coefficients(mod)
+## (Intercept) T9 T12 T15 Ne9 Ne12
+## 12.24441987 -0.01901425 2.22115189 0.55853087 -2.18909215 -0.42101517
+## Ne15 Vx9 Vx12 Vx15 maxO3v
+## 0.18373097 0.94790917 0.03119824 0.41859252 0.35197646
+summary(mod$residuals)
+## Min. 1st Qu. Median Mean 3rd Qu. Max.
+## -53.566 -8.727 -0.403 0.000 7.599 39.458
+We can display Residual standard error with Ⓡ +
+cat("Residual standard error:", summary(mod)$sigma, "on", summary(mod)$df[2], "degrees of freedom\n")
+## Residual standard error: 14.36002 on 101 degrees of freedom
+Ⓡ Display the +confidence interval for \(\beta_j\)
+mod <- lm(maxO3 ~ ., data = Ozone)
+cbind(mod$coefficient, confint(mod, level = 0.95))
+## 2.5 % 97.5 %
+## (Intercept) 12.24441987 -14.4802083 38.9690481
+## T9 -0.01901425 -2.2510136 2.2129851
+## T12 2.22115189 -0.6214249 5.0637287
+## T15 0.55853087 -1.7121182 2.8291799
+## Ne9 -2.18909215 -4.0502993 -0.3278850
+## Ne12 -0.42101517 -3.1340934 2.2920630
+## Ne15 0.18373097 -1.8055357 2.1729977
+## Vx9 0.94790917 -0.8618028 2.7576211
+## Vx12 0.03119824 -2.0620871 2.1244836
+## Vx15 0.41859252 -1.3978619 2.2350470
+## maxO3v 0.35197646 0.2272237 0.4767292
+Ⓡ Display the +confidence interval for \(\sigma^2\)
+n <- length(Ozone$maxO3)
+p <- length(coef(mod))
+alpha <- 0.05
+
+sigma2_hat <- summary(mod)$sigma^2
+
+IC_sigma2 <- sigma2_hat * c((n - p) / qchisq(1 - alpha / 2, n - p),
+ (n - p) / qchisq(alpha / 2, n - p))
+
+cat("estimation of sigma : ", sigma2_hat, "\n")
+## estimation of sigma : 206.2102
+cat("Intervalle de confiance pour sigma^2 (95%) : [", IC_sigma2[1], ", ", IC_sigma2[2], "]\n")
+## Intervalle de confiance pour sigma^2 (95%) : [ 159.3518 , 277.3877 ]
+Declare our linear model and display with Ⓡ +
+library(kableExtra)
+mod <- lm(maxO3 ~ ., data = Ozone)
+summary(mod) %>% coefficients() %>% kbl()
+| + | ++Estimate + | ++Std. Error + | ++t value + | ++Pr(>|t|) + | +
|---|---|---|---|---|
| +(Intercept) + | ++12.2444199 + | ++13.4719013 + | ++0.9088858 + | ++0.3655741 + | +
| +T9 + | ++-0.0190143 + | ++1.1251522 + | ++-0.0168993 + | ++0.9865503 + | +
| +T12 + | ++2.2211519 + | ++1.4329447 + | ++1.5500612 + | ++0.1242547 + | +
| +T15 + | ++0.5585309 + | ++1.1446356 + | ++0.4879552 + | ++0.6266392 + | +
| +Ne9 + | ++-2.1890921 + | ++0.9382356 + | ++-2.3332008 + | ++0.0216199 + | +
| +Ne12 + | ++-0.4210152 + | ++1.3676644 + | ++-0.3078352 + | ++0.7588417 + | +
| +Ne15 + | ++0.1837310 + | ++1.0027906 + | ++0.1832197 + | ++0.8549930 + | +
| +Vx9 + | ++0.9479092 + | ++0.9122769 + | ++1.0390586 + | ++0.3012586 + | +
| +Vx12 + | ++0.0311982 + | ++1.0552264 + | ++0.0295654 + | ++0.9764720 + | +
| +Vx15 + | ++0.4185925 + | ++0.9156758 + | ++0.4571405 + | ++0.6485518 + | +
| +maxO3v + | ++0.3519765 + | ++0.0628879 + | ++5.5968846 + | ++0.0000002 + | +
Under Ⓡ , the
+summary displays the Global Fisher test
F-statistic: 32.67 on 10 and 101 DF, p-value: < 2.2e-16
+Ⓡ Consider 2 nested +models \[ +\left\{\begin{array}{l} +\text{mod}_1:\, \text{maxO3=T9+Ne9+Vx9+maxO3v}+\epsilon,\\ +\text{maxO3=T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v}+\epsilon +\end{array}\right. +\] and display the nested Fisher test \[ +\left\{\begin{array}{l} +\mathscr{H}_0: \, \text{mod}_1 \text{ is adequat}\\ +\mathscr{H}_1: \, \text{mod}_2 \text{ is adequat} +\end{array}\right. +\]
+mod_1<-lm(maxO3~T9+Ne9+Vx9+maxO3v,data=Ozone)
+mod_2<-lm(maxO3~T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v,data=Ozone)
+
+anova(mod_1,mod_2)
+## Analysis of Variance Table
+##
+## Model 1: maxO3 ~ T9 + Ne9 + Vx9 + maxO3v
+## Model 2: maxO3 ~ T9 + T12 + Ne9 + Ne12 + Vx9 + Vx12 + maxO3v
+## Res.Df RSS Df Sum of Sq F Pr(>F)
+## 1 107 23263
+## 2 104 20914 3 2349.8 3.8951 0.01106 *
+## ---
+## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
+Let display these coefficient with Ⓡ
+cat("Multiple R-squared:", summary(mod)$r.squared, ", Adjusted R-squared:", summary(mod)$adj.r.squared, "\n")
+## Multiple R-squared: 0.7638413 , Adjusted R-squared: 0.7404593
+Note that with Ⓡ , we can display +the fitted values \(\widehat Y_i\)
+haty<-predict(mod,data=Ozone)
+# or equivalently
+#haty<-mod$fitted.values
+y<-Ozone$maxO3
+cbind.data.frame(y,haty)%>%rmarkdown::paged_table()
+