mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-25 00:52:11 +01:00
Add Chapter 1 of GLM
This commit is contained in:
335
M1/General Linear Models/Chapter 1/Code_Chapter1.Rmd
Normal file
335
M1/General Linear Models/Chapter 1/Code_Chapter1.Rmd
Normal file
@@ -0,0 +1,335 @@
|
||||
---
|
||||
title: "Linear Models and Their Generalizations"
|
||||
subtitle : "Code Chapter 1: Linear gaussian regression"
|
||||
author: "Katia Meziani"
|
||||
output:
|
||||
# pdf_document:
|
||||
# toc: yes
|
||||
html_document:
|
||||
code_folding: hide
|
||||
css: ./style.css
|
||||
df_print: paged
|
||||
highlight: tango
|
||||
number_sections: no
|
||||
theme: flatly
|
||||
toc: yes
|
||||
toc_float:
|
||||
collapsed: no
|
||||
|
||||
---
|
||||
|
||||
```{r setup, include=FALSE}
|
||||
knitr::opts_chunk$set(warning = FALSE, message = FALSE)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Motivations
|
||||
|
||||
<span style="color: blue;">
|
||||
<span class="underline">**The Ozone Example**</span> (With <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> )
|
||||
</span>
|
||||
|
||||
This dataset (Air Breizh, Summer 2001) contains 112 observations and 13 variables.
|
||||
|
||||
```{r}
|
||||
library(rmarkdown)
|
||||
ozone1 <- read.table("ozone1.txt", header = TRUE, sep = " ", dec = ".")
|
||||
paged_table(ozone1)
|
||||
```
|
||||
|
||||
Let's correct the incorrect declaration of variables:
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
library(dplyr)
|
||||
ozone1$Date <- as.Date(as.factor(ozone1$Date), format = "%Y%m%d")
|
||||
ozone1$vent <- as.factor(ozone1$vent)
|
||||
ozone1$pluie <- as.factor(ozone1$pluie)
|
||||
ozone1 <- ozone1 %>% mutate(across(where(is.character), as.numeric))
|
||||
ozone1 <- ozone1 %>% mutate(across(where(is.integer), as.numeric))
|
||||
str(ozone1)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 1. Linear regression model
|
||||
|
||||
## Simple linear model
|
||||
|
||||
|
||||
|
||||
|
||||
Let's plot this line (here with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> ) is called the least squares line.
|
||||
|
||||
```{r, out.width = "60%", fig.align = 'center', message = FALSE, warning = FALSE}
|
||||
library(ggplot2)
|
||||
library(plotly)
|
||||
p1 <- ggplot(ozone1) +
|
||||
aes(x = T12, y = maxO3) +
|
||||
geom_point(col = 'red', size = 0.5) +
|
||||
geom_smooth(method = "lm", se = FALSE)
|
||||
ggplotly(p1)
|
||||
```
|
||||
|
||||
|
||||
With <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> , we can compute $\widehat {\boldsymbol{\beta}}=(\widehat{\beta}_0, \widehat{\beta}_1)$
|
||||
|
||||
```{r}
|
||||
library(kableExtra)
|
||||
mod <- lm(maxO3 ~ T12, data = ozone1)
|
||||
mod$coefficients %>% kbl(col.names = "Estimation")
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 2. Ordinary least squares estimator (OLSE)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Declaration of the model with Under <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span>
|
||||
|
||||
Let's revisit the ozone example and, for this chapter, select only the numerical regressors with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span>
|
||||
|
||||
```{r}
|
||||
names(ozone1)
|
||||
Ozone <- ozone1[, 2:12]
|
||||
names(Ozone)
|
||||
```
|
||||
|
||||
|
||||
|
||||
Under <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> ,we can declare the linear model in two ways: either by naming the variables or by using a `.`.
|
||||
|
||||
```{r}
|
||||
mod <- lm(maxO3 ~ T9 +
|
||||
T12 +
|
||||
T15 +
|
||||
Ne9 +
|
||||
Ne12 +
|
||||
Ne15 +
|
||||
Vx9 +
|
||||
Vx12 +
|
||||
Vx15 +
|
||||
maxO3v, data = Ozone)
|
||||
mod <- lm(maxO3 ~ ., data = Ozone)
|
||||
```
|
||||
|
||||
Now, let's display the estimated values $\widehat{\boldsymbol{\beta}}$ of $\boldsymbol{\beta}$.
|
||||
|
||||
```{r}
|
||||
coefficients(mod)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
# 3. Residuals analysis
|
||||
|
||||
|
||||
|
||||
```{r}
|
||||
summary(mod$residuals)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
We can display Residual standard error with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span>
|
||||
```{r}
|
||||
cat("Residual standard error:", summary(mod)$sigma, "on", summary(mod)$df[2], "degrees of freedom\n")
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 4. Inference in the Gaussian linear regression model
|
||||
|
||||
|
||||
## Confidence intervals/regions
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
<span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> Display the confidence interval for $\beta_j$
|
||||
|
||||
|
||||
```{r}
|
||||
mod <- lm(maxO3 ~ ., data = Ozone)
|
||||
cbind(mod$coefficient, confint(mod, level = 0.95))
|
||||
```
|
||||
|
||||
|
||||
<span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> Display the confidence interval for
|
||||
$\sigma^2$
|
||||
|
||||
|
||||
```{r}
|
||||
|
||||
n <- length(Ozone$maxO3)
|
||||
p <- length(coef(mod))
|
||||
alpha <- 0.05
|
||||
|
||||
sigma2_hat <- summary(mod)$sigma^2
|
||||
|
||||
IC_sigma2 <- sigma2_hat * c((n - p) / qchisq(1 - alpha / 2, n - p),
|
||||
(n - p) / qchisq(alpha / 2, n - p))
|
||||
|
||||
cat("estimation of sigma : ", sigma2_hat, "\n")
|
||||
cat("Intervalle de confiance pour sigma^2 (95%) : [", IC_sigma2[1], ", ", IC_sigma2[2], "]\n")
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 5. Hypothesis testing
|
||||
|
||||
## Test: $\mathscr{H}_0:\,c^\top{\beta}=a$
|
||||
|
||||
|
||||
|
||||
|
||||
Declare our linear model and display with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span>
|
||||
|
||||
```{r}
|
||||
library(kableExtra)
|
||||
mod <- lm(maxO3 ~ ., data = Ozone)
|
||||
summary(mod) %>% coefficients() %>% kbl()
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
## Global Fisher test
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Under <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> , the `summary` displays the Global Fisher test
|
||||
|
||||
|
||||
```{r, eval=FALSE}
|
||||
F-statistic: 32.67 on 10 and 101 DF, p-value: < 2.2e-16
|
||||
```
|
||||
|
||||
|
||||
|
||||
## Fisher test for nested models
|
||||
|
||||
|
||||
|
||||
|
||||
<span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> Consider 2 nested models
|
||||
$$
|
||||
\left\{\begin{array}{l}
|
||||
\text{mod}_1:\, \text{maxO3=T9+Ne9+Vx9+maxO3v}+\epsilon,\\
|
||||
\text{maxO3=T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v}+\epsilon
|
||||
\end{array}\right.
|
||||
$$
|
||||
and display the nested Fisher test
|
||||
$$
|
||||
\left\{\begin{array}{l}
|
||||
\mathscr{H}_0: \, \text{mod}_1 \text{ is adequat}\\
|
||||
\mathscr{H}_1: \, \text{mod}_2 \text{ is adequat}
|
||||
\end{array}\right.
|
||||
$$
|
||||
|
||||
```{r}
|
||||
mod_1<-lm(maxO3~T9+Ne9+Vx9+maxO3v,data=Ozone)
|
||||
mod_2<-lm(maxO3~T9+T12+Ne9+Ne12+Vx9+Vx12+maxO3v,data=Ozone)
|
||||
|
||||
anova(mod_1,mod_2)
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
# 6. $R^2$-coefficient
|
||||
|
||||
|
||||
|
||||
|
||||
Let display these coefficient with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span>
|
||||
|
||||
```{r}
|
||||
cat("Multiple R-squared:", summary(mod)$r.squared, ", Adjusted R-squared:", summary(mod)$adj.r.squared, "\n")
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# 7. Prediction
|
||||
|
||||
|
||||
## 7.1. Fitted values
|
||||
|
||||
|
||||
|
||||
|
||||
Note that with <span class="solution"><span style="color: blue;">**Ⓡ**</span> </span> , we can display the fitted values $\widehat Y_i$
|
||||
|
||||
```{r}
|
||||
haty<-predict(mod,data=Ozone)
|
||||
# or equivalently
|
||||
#haty<-mod$fitted.values
|
||||
y<-Ozone$maxO3
|
||||
cbind.data.frame(y,haty)%>%rmarkdown::paged_table()
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user