Files
ArtStudies/M1/General Linear Models/TP1-bis/TP1.Rmd
Arthur DANJOU 03bf0a4db2 Refactor code for improved readability and consistency across R Markdown files
- Updated comments and code formatting in `3-td_ggplot2 - enonce.Rmd` for clarity.
- Enhanced code structure in `4-td_graphiques - enonce.Rmd` by organizing options and library calls.
- Replaced pipe operator `%>%` with `|>` in `Code_Lec3.Rmd` for consistency with modern R syntax.
- Cleaned up commented-out code and ensured consistent spacing in ggplot calls.
2025-11-06 09:26:58 +01:00

140 lines
3.1 KiB
Plaintext

```{r}
setwd("/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/TP1-bis")
library(tidyverse)
options(scipen = 999, digits = 5)
```
```{r}
data <- read.csv("data01.csv", header = TRUE, sep = ",", dec = ".")
head(data, 20)
```
```{r}
ggplot(data, aes(x = cholesterol)) +
geom_histogram(binwidth = 5, color = "black", fill = "gray80") +
labs(x = "Cholesterol", y = "Frequency", title = "Histogram of cholesterol") +
theme_bw(14)
ggplot(data, aes(x = poids)) +
geom_histogram(binwidth = 2.5, color = "black", fill = "gray80") +
labs(x = "Poids", y = "Frequency", title = "Histogram of Poids") +
theme_bw(14)
ggplot(aes(y = cholesterol, x = poids), data = data) +
geom_point() +
labs(y = "Cholesterol (y)", x = "Poids, kg (x)", title = "Scatter plot of cholesterol and poids") +
theme_bw(14)
```
# OLSE
```{r}
x <- data[, "poids"]
y <- data[, "cholesterol"]
Sxy <- sum((x - mean(x)) * y)
Sxx <- sum((x - mean(x))^2)
beta1 <- Sxy / Sxx
beta0 <- mean(y) - beta1 * mean(x)
c(beta0, beta1)
```
Final Equation: y = 18.4470 + 2.0523 * x
```{r}
X <- cbind(1, x)
colnames(X) <- NULL
X_t_X <- t(X) %*% X
inv_X_t_x <- solve(X_t_X)
betas <- inv_X_t_x %*% t(X) %*% y
betas
```
```{r}
model <- lm(data, formula = cholesterol ~ poids)
summary(model)
coef(model)
```
```{r}
data <- data |>
mutate(yhat = beta0 + beta1 * poids) |>
mutate(residuals = cholesterol - yhat)
data
ggplot(data, aes(x = poids, y = cholesterol)) +
geom_point(size = 2, shape = 21, fill = "blue", color = "cyan") +
geom_line(aes(y = yhat), color = "blue") +
labs(x = "Poids", y = "Cholesterol", title = "OLS Regression Line") +
theme_bw(14)
```
```{r}
mean(data[, "cholesterol"])
mean(data[, "yhat"])
mean(data[, "residuals"]) |> round(10)
cov(data[, "residuals"], data[, "poids"]) |> round(10)
(RSS <- sum((data[, "residuals"])^2))
(TSS <- sum((y - mean(y))^2))
TSS - beta1 * Sxy
```
```{r}
dof <- nrow(data) - 2
sigma_hat_2 <- RSS / dof
cov_beta <- sigma_hat_2 * inv_X_t_x
var_beta <- diag(cov_beta)
std_beta <- sqrt(var_beta)
sigma(model)^2
vcov(model)
```
# Hypothesis Testing
```{r}
(t_beta <- beta1 / std_beta[2])
qt(0.975, dof)
2 * pt(abs(t_beta), dof, lower.tail = FALSE)
summary(model)
```
```{r}
MSS <- (TSS - RSS)
(F <- MSS / (RSS / dof))
qf(0.95, 1, dof)
pf(F, 1, dof, lower.tail = FALSE)
anova(model)
```
# Confidence Interval
```{r}
(CI_beta1 <- c(beta1 - qt(0.97, dof) * std_beta[2], beta1 + qt(0.97, dof) * std_beta[2]))
confint(model, 0.95)
t <- qt(0.975, dof)
sigma_hat <- sigma(model)
n <- nrow(data)
data <- data |>
mutate(error = t *
sigma_hat *
sqrt(1 / n + (poids - mean(poids))^2 / RSS)) |>
mutate(conf.low = yhat - error, conf.high = yhat + error, error = NULL)
ggplot(data, aes(x = poids, y = cholesterol)) +
geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.3, fill = "blue") +
geom_point(size = 2, shape = 21, fill = "blue", color = "cyan") +
geom_line(aes(y = yhat), color = "blue") +
labs(x = "Poids", y = "Cholesterol", title = "OLS Regression Line") +
theme_bw(14)
```
# R^2
```{r}
(R2 <- MSS / TSS)
summary(model)$r.squared
cor(data[, "cholesterol"], data[, "poids"])^2
```