diff --git a/M1/General Linear Models/TP1-bis/TP1.Rmd b/M1/General Linear Models/TP1-bis/TP1.Rmd new file mode 100644 index 0000000..3fbd899 --- /dev/null +++ b/M1/General Linear Models/TP1-bis/TP1.Rmd @@ -0,0 +1,139 @@ +```{r} +setwd('/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/TP1-bis') + +library(tidyverse) +options(scipen = 999, digits = 5) +``` +```{r} +data <- read.csv("data01.csv", header = TRUE, sep = ",", dec = ".") +head(data, 20) +``` + +```{r} +ggplot(data, aes(x = cholesterol)) + + geom_histogram(binwidth = 5, color = "black", fill = "gray80") + + labs(x = "Cholesterol", y = "Frequency", title = "Histogram of cholesterol") + + theme_bw(14) + +ggplot(data, aes(x = poids)) + + geom_histogram(binwidth = 2.5, color = "black", fill = "gray80") + + labs(x = "Poids", y = "Frequency", title = "Histogram of Poids") + + theme_bw(14) + +ggplot(aes(y = cholesterol, x = poids), data = data) + + geom_point() + + labs(y = "Cholesterol (y)", x = "Poids, kg (x)", title = "Scatter plot of cholesterol and poids") + + theme_bw(14) +``` + +# OLSE +```{r} +x <- data[, "poids"] +y <- data[, "cholesterol"] +Sxy <- sum((x - mean(x)) * y) +Sxx <- sum((x - mean(x))^2) + +beta1 <- Sxy / Sxx +beta0 <- mean(y) - beta1 * mean(x) + +c(beta0, beta1) +``` + +Final Equation: y = 18.4470 + 2.0523 * x + +```{r} +X <- cbind(1, x) + +colnames(X) <- NULL +X_t_X <- t(X) %*% X +inv_X_t_x <- solve(X_t_X) +betas <- inv_X_t_x %*% t(X) %*% y +betas +``` +```{r} +model <- lm(data, formula = cholesterol ~ poids) +summary(model) +coef(model) +``` +```{r} +data <- data %>% + mutate(yhat = beta0 + beta1 * poids) %>% + mutate(residuals = cholesterol - yhat) + +data + +ggplot(data, aes(x = poids, y = cholesterol)) + + geom_point(size = 2, shape = 21, fill = "blue", color = "cyan") + + geom_line(aes(y = yhat), color = "blue") + + labs(x = "Poids", y = "Cholesterol", title = "OLS Regression Line") + + theme_bw(14) +``` +```{r} +mean(data[, "cholesterol"]) +mean(data[, "yhat"]) +mean(data[, "residuals"]) %>% round(10) +cov(data[, "residuals"], data[, "poids"]) %>% round(10) +(RSS <- sum((data[, "residuals"])^2)) +(TSS <- sum((y - mean(y))^2)) +TSS - beta1 * Sxy +``` + +```{r} +dof <- nrow(data) - 2 + +sigma_hat_2 <- RSS / dof + +cov_beta <- sigma_hat_2 * inv_X_t_x +var_beta <- diag(cov_beta) +std_beta <- sqrt(var_beta) + +sigma(model)^2 +vcov(model) +``` + + +# Hypothesis Testing + +```{r} +(t_beta <- beta1 / std_beta[2]) +qt(0.975, dof) +2 * pt(abs(t_beta), dof, lower.tail = FALSE) +summary(model) +``` +```{r} +MSS <- (TSS - RSS) +(F <- MSS / (RSS / dof)) +qf(0.95, 1, dof) +pf(F, 1, dof, lower.tail = FALSE) +anova(model) +``` + +# Confidence Interval +```{r} +(CI_beta1 <- c(beta1 - qt(0.97, dof) * std_beta[2], beta1 + qt(0.97, dof) * std_beta[2])) +confint(model, 0.95) + +t <- qt(0.975, dof) +sigma_hat <- sigma(model) +n <- nrow(data) + +data <- data %>% + mutate(error = t * + sigma_hat * + sqrt(1 / n + (poids - mean(poids))^2 / RSS)) %>% + mutate(conf.low = yhat - error, conf.high = yhat + error, error = NULL) + +ggplot(data, aes(x = poids, y = cholesterol)) + + geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = 0.3, fill = "blue") + + geom_point(size = 2, shape = 21, fill = "blue", color = "cyan") + + geom_line(aes(y = yhat), color = "blue") + + labs(x = "Poids", y = "Cholesterol", title = "OLS Regression Line") + + theme_bw(14) +``` + +# R^2 +```{r} +(R2 <- MSS / TSS) +summary(model)$r.squared +cor(data[, "cholesterol"], data[, "poids"])^2 +``` diff --git a/M1/General Linear Models/TP1-bis/data01.csv b/M1/General Linear Models/TP1-bis/data01.csv new file mode 100644 index 0000000..d48cdad --- /dev/null +++ b/M1/General Linear Models/TP1-bis/data01.csv @@ -0,0 +1,51 @@ +id,cholesterol,poids +1,144.47452579195235,61.2 +2,145.88924868817446,62.6 +3,161.1126037995184,69.9 +4,155.4458581667524,63.8 +5,147.97655520732974,64 +6,170.24615865999974,70.5 +7,144.19706954005656,65.4 +8,140.1067951922945,58.3 +9,142.7466793355334,60.7 +10,145.2692979993652,61.7 +11,160.3640967819068,68.5 +12,148.56479095579394,65 +13,149.76055898423206,65.1 +14,143.85346030579532,63.9 +15,137.88860216547474,61.2 +16,164.79575052668758,70.8 +17,154.49767048200715,65.5 +18,131.4504444444691,55.5 +19,158.60793029776818,66.4 +20,154.20805689206188,61.6 +21,136.4149514835298,59.1 +22,134.5904701014675,62.6 +23,144.2563545892844,59.3 +24,138.22197617664875,60.5 +25,139.21587117755206,60.9 +26,138.70662904163586,56.6 +27,153.73921419517316,66.9 +28,143.2077515962295,64.1 +29,139.1754465872936,58.8 +30,158.02566076295264,68.6 +31,151.67509002312616,65.2 +32,147.4035408260477,62.3 +33,153.80002424297288,67.1 +34,158.72992406100167,67.1 +35,153.92208543890675,66.8 +36,155.54678399213427,66.3 +37,158.2201910034931,65.8 +38,149.64656232873804,63.2 +39,143.75436129736758,62.2 +40,150.4910110335996,61.9 +41,147.0277746100402,60.7 +42,148.96429207047277,62.6 +43,138.37451986964854,58.3 +44,163.40504312344743,72.3 +45,165.13133732815768,68.4 +46,135.39612367787572,58.9 +47,155.49199962327577,61.8 +48,151.67314985744744,61.6 +49,153.49019996627314,66.7 +50,142.15508998013192,63.1