```{r} setwd("/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/TP2-bis") library(tidyverse) library(GGally) library(broom) library(scales) library(car) library(qqplotr) options(scipen = 999, digits = 5) ``` ```{r} data <- read.csv("data02.csv", sep = ",", header = TRUE, dec = ".") data |> mutate(type = factor(type, levels = c("maths", "english", "final"), labels = c("maths", "english", "final"))) |> ggplot(aes(x = note)) + facet_wrap(vars(type), scales = "free_x") + geom_histogram(binwidth = 4, color = "black", fill = "grey80") + labs(title = "Histogram of notes", x = "Note") + theme_bw(14) ``` ```{r} data_wide <- pivot_wider(data, names_from = type, values_from = note) data_wide |> select(-id) |> ggpairs() + theme_bw(14) ``` ```{r} model <- lm(data_wide, formula = final ~ maths + english) summary(model) ``` ```{r} tidy(model, conf.int = TRUE, conf.level = 0.95) glance(model) (R2 <- summary(model)$r.squared) (R2 / 2) * 57 / (1 - R2) vcov(model) ``` # Hypothesis testing ```{r} C <- c(0, 1, -1) beta <- cbind(coef(model)) (C_beta <- C %*% beta) X <- model.matrix(model) (inv_XtX <- solve(t(X) %*% X)) q <- 1 numerator <- t(C_beta) %*% solve(t(C) %*% inv_XtX %*% C) %*% C_beta denominator <- sigma(model)^2 F <- (numerator / q) / denominator F dof <- nrow(data_wide) - 3 qf(0.95, q, dof) pf(F, q, dof, lower.tail = FALSE) linearHypothesis(model, "maths - english = 0") ``` # Submodel testing ```{r} data_predict <- predict(model, newdata = expand.grid(maths = seq(70, 90, 2), english = c(75, 85)), interval = "confidence") |> as_tibble() |> bind_cols(expand.grid(maths = seq(70, 90, 2), english = c(75, 85))) data_predict |> mutate(english = as.factor(english)) |> ggplot(aes(x = maths, y = fit, color = english, fill = english, label = round(fit, 1))) + geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, show.legend = FALSE) + geom_point(size = 2) + geom_line(aes(y = fit)) + geom_text(vjust = -1, show.legend = FALSE) + labs(title = "Prediction of final note", x = "Maths note", y = "Final note", color = "English", fill = "English") + theme_bw(14) ``` ```{r} diag_data <- augment(model) ggplot(diag_data, aes(x = .fitted, y = .resid)) + geom_point() + geom_hline(yintercept = 0) + labs(title = "Residuals vs Fitted", x = "Fitted values", y = "Residuals") + theme_bw(14) ``` ```{r} ggplot(diag_data, aes(sample = .resid)) + stat_qq_band(alpha = 0.2, fill = "blue") + stat_qq_line(color = "red") + stat_qq_point(size = 1) + labs(y = "Sample quantile", x = "Theoritical quantile") + theme_minimal(base_size = 14) ggplot(diag_data, aes(x = .resid)) + geom_histogram(fill = "dodgerblue", color = "black", bins = 7) + labs(y = "Count", x = "Résiduals") + scale_y_continuous(expand = expansion(c(0, 0.05))) + scale_x_continuous(breaks = pretty_breaks(n = 10)) + theme_minimal(base_size = 14) mutate(diag_data, obs = row_number()) |> ggplot(aes(x = obs, y = .cooksd)) + geom_segment(aes(x = obs, y = 0, xend = obs, yend = .cooksd)) + geom_point(color = "blue", size = 1) + scale_x_continuous(breaks = seq(0, 60, 10)) + labs(y = "Cook's distance", x = "Index") + theme_minimal(base_size = 14) influenceIndexPlot(model, vars = "cook", id = list(n = 5), main = NULL) ```