---
title: "Code: Biased regression"
subtitle: "Methods for Regression and classification"
author: "Katia Meziani"
output:
html_document:
self_contained: true
math_method:
engine: mathjax
url: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
code_folding: hide
css: ./style.css
df_print: paged
highlight: tango
number_sections: no
theme: flatly
toc: yes
toc_float:
collapsed: no
---
<style>
.column {
width: 50%;
float: left;
}
</style>
<style>
.image-wrapper {
text-align: center; /* Centre toutes les images */
}
.image-container {
display: inline-block;
margin-right: 20px; /* Espacement entre les images */
text-align: center; /* Centre le texte sous l'image */
}
.image-container:last-child {
margin-right: 0; /* Pas d'espacement après la dernière image */
}
</style>
<style>
.attention {
font-size: 24px;
color: red;
font-weight: bold;
}
</style>
<style>
.underline {
text-decoration: underline;
}
</style>
<style contenteditable>
.brd {
border: 2px solid black; ; padding: 5px
}
</style>
<style>
.brdpurple {
border: 2px solid black;
padding: 10px; /* Espace interne */
background-color: rgba(128, 0, 128, 0.05); /* Fond violet très très clair avec transparence */
margin: 10px 0; /* Espace autour de l'encadrement */
}
</style>
<style contenteditable>
.brdred {
border: 2px solid #B22222 ;
padding: 10px; /* Ajouter de l'espace autour du texte */
background-color: #f9f9f9; /* Optionnel : couleur de fond pour le texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
<style>
.solution {
font-size: 24px;
color: yellow; /* Couleur jaune pour simuler la lumière */
}
</style>
<style>
.underline {
text-decoration: underline;
}
</style>
<style>
.brdgreen {
border: 2px solid black; /* Bordure verte */
padding: 10px; /* Espace interne */
background-color: rgba(114, 213, 114, 0.03); /* Fond vert pastel très clair avec transparence */
margin: 10px 0; /* Espace autour de l'encadrement */
}
</style>
<style>
.brdblack {
border: 2px solid black; /* Définir la couleur et l'épaisseur de la bordure */
padding: 10px; /* Ajouter de l'espace autour du texte */
background-color: #f9f9f9; /* Optionnel : couleur de fond pour le texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
<style>
.brdblackempty {
border: 2px solid black; /* Définir la couleur et l'épaisseur de la bordure */
padding: 10px; /* Ajouter de l'espace autour du texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
\newcommand{\Lasso}{\widehat{\beta}^{\lambda,L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\1}{\mathbb{1}}
\newcommand{\RE}{\text{RE}}
\newcommand{\sign}{\text{sign}}
\newcommand{\e}{\epsilon}
\renewcommand{\P}{\text{P}}
```{r setup, include=FALSE}
# Global knitr options: center all figures and keep package start-up
# messages/warnings out of the rendered document.
knitr::opts_chunk$set(
  fig.align = "center",
  message = FALSE,
  warning = FALSE
)
```
## Packages
We begin by loading the necessary packages for this analysis.
```{r message=FALSE,warning=FALSE}
library(glmnet) # for regularized regression
library(caret) # for training and evaluating models
library(ggplot2) # for data visualization
library(ggfortify) # to extend ggplot2 features for autoplot
library(reshape2) # for reshaping data
library(Metrics) # for calculating metrics like RMSE
library(vip) # for variable importance visualization
library(dplyr) # for data manipulation
library(tidyverse) # includes ggplot2, dplyr, and other useful packages
```
****
# The Dataset: Cookies
## Upload Datasets
```{r}
# NOTE(review): the original chunk called setwd() with an absolute,
# user-specific path, which makes the document non-portable (and knitr
# resets the working directory between chunks anyway). Resolve the data
# directory once and build the file paths explicitly instead.
data_dir <- "/Users/arthurdanjou/Workspace/studies/M2/Linear Models/Biaised Models"
```
```{r}
# Loading the training dataset (first CSV column holds the row names)
cookie.train <- read.csv(file.path(data_dir, "Cookies_Train.csv"),
  header = TRUE, row.names = 1
)
```
```{r}
# Loading the test dataset
cookie.test <- read.csv(file.path(data_dir, "Cookies_Test.csv"),
  header = TRUE, row.names = 1
)
```
## Custom Control Parameters
```{r}
# Resampling scheme shared by every model below:
# 5-fold cross-validation repeated 3 times.
custom <- trainControl(
  method = "repeatedcv",
  number = 5, # folds
  repeats = 3, # repetitions, for more stable CV estimates
  summaryFunction = defaultSummary, # reports RMSE, R^2 and MAE
  allowParallel = TRUE # use a parallel backend when one is registered
)
```
****
# Models Study
****
## Linear regression analysis
```{r}
set.seed(602)
# Ordinary least squares on centred/scaled predictors, evaluated with the
# shared repeated-CV scheme.
linear.mod <- train(
  sugars ~ .,
  data = cookie.train,
  method = "lm",
  preProc = c("center", "scale"),
  trControl = custom
)
linear.mod$results
```
```{r}
# Fitted vs observed response on the training set
Ytrain <- cookie.train$sugars
dfc_train <- data.frame(ytrain = Ytrain, linear.mod = fitted(linear.mod))
rmarkdown::paged_table(dfc_train)
```
```{r}
ggplot(dfc_train, aes(x = ytrain, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Train Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")
```
```{r}
# Predictions vs observed response on the held-out test set
Ytest <- cookie.test$sugars
dfc_test <- data.frame(
  ytest = Ytest,
  linear.mod = predict(linear.mod, newdata = cookie.test)
)
ggplot(dfc_test, aes(x = ytest, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Test Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")
```
****
## Lasso regression analysis
```{r}
set.seed(602)
# Candidate penalty values for the lasso (alpha = 1 in glmnet).
# NOTE(review): 10^seq(0, 0.1, length = 100) only spans [1, ~1.26]; lasso
# lambda grids are usually log-spaced over several decades, e.g.
# 10^seq(-3, 0, length = 100). Left unchanged so the reported results stay
# reproducible -- confirm the intended range.
grid_Lasso <- 10^seq(0, 0.1, length = 100)
Lasso <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1, lambda = grid_Lasso),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
# Interactive CV profile: RMSE as a function of lambda
library(plotly)
ggplotly(ggplot(Lasso))
```
```{r}
Lasso$results %>% rmarkdown::paged_table()
```
```{r}
# Tuning parameters selected by caret (lowest CV RMSE)
Lasso$bestTune
```
```{r}
Lasso$results[which.min(Lasso$results$RMSE), ]
```
```{r}
# Coefficient paths: shrinkage along lambda and against explained deviance
par(mfrow = c(1, 2))
plot(Lasso$finalModel, xvar = "lambda", label = TRUE)
plot(Lasso$finalModel, xvar = "dev", label = TRUE)
```
```{r}
# Variable importance (vip is already attached in the Packages section,
# so the duplicate library(vip) call was dropped)
vip(Lasso, num_features = 15)
```
```{r}
# Predictors retained by the lasso: coefficients that are non-zero at the
# selected lambda
coef_lasso <- data.frame(
  Variable = rownames(as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))),
  Coefficient = as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))[, 1]
)
coef_lasso %>%
  subset(Coefficient != 0) %>%
  rmarkdown::paged_table()
```
****
## Ridge regression analysis
```{r}
set.seed(602)
# Candidate penalty values for ridge regression (alpha = 0 in glmnet).
# NOTE(review): seq(11, 12, length = 100) is a very narrow grid; it assumes
# the optimum is already known to lie in [11, 12] -- verify with a wider,
# log-spaced scan first.
lambda_ridge <- seq(11, 12, length = 100)
ridge <- train(sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0, lambda = lambda_ridge),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
# Interactive CV profile (plotly is already attached in the Lasso section,
# so the duplicate library(plotly) call was dropped)
ggplotly(ggplot(ridge))
```
```{r}
ridge$results %>% rmarkdown::paged_table()
```
```{r}
# Tuning parameters selected by caret (lowest CV RMSE)
ridge$bestTune
```
```{r}
ridge$results[which.min(ridge$results$RMSE), ]
```
```{r}
# Coefficient paths: shrinkage along lambda and against explained deviance
par(mfrow = c(1, 2))
plot(ridge$finalModel, xvar = "lambda", label = TRUE)
plot(ridge$finalModel, xvar = "dev", label = TRUE)
```
```{r}
vip(ridge, num_features = 15)
```
```{r}
# Ridge keeps every predictor: all coefficients are shrunk but non-zero
data.frame(as.matrix(coef(ridge$finalModel, ridge$bestTune$lambda))) %>%
  rmarkdown::paged_table()
```
****
## ElasticNet regression analysis
```{r}
set.seed(602)
# Joint grid over the mixing parameter alpha (lasso/ridge trade-off) and
# the penalty lambda: 10 x 10 = 100 candidate pairs.
alpha_Enet <- seq(0.5, 0.9, length = 10)
lambda_Enet <- seq(0.01, 0.05, length = 10)
ElNet <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = alpha_Enet, lambda = lambda_Enet),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
ggplotly(ggplot(ElNet))
```
```{r}
ElNet$results %>% rmarkdown::paged_table()
```
```{r}
# Tuning parameters selected by caret (lowest CV RMSE)
ElNet$bestTune
```
```{r}
ElNet$results[which.min(ElNet$results$RMSE), ]
```
```{r}
# Coefficient paths. Use TRUE rather than the reassignable shorthand T.
par(mfrow = c(1, 2))
plot(ElNet$finalModel, xvar = "lambda", label = TRUE)
plot(ElNet$finalModel, xvar = "dev", label = TRUE)
```
```{r}
vip(ElNet, num_features = 20)
```
```{r}
# Predictors retained by the elastic net: non-zero coefficients at the
# selected lambda
coef_elnet <- data.frame(
  Variable = rownames(as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))),
  Coefficient = as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))[, 1]
)
coef_elnet %>%
  subset(Coefficient != 0) %>%
  rmarkdown::paged_table()
```
****
## PLS regression analysis
```{r}
set.seed(602)
# Partial least squares: tune the number of latent components (1 to 20)
pls_mod <- train(
  sugars ~ .,
  data = cookie.train,
  method = "pls",
  tuneLength = 20,
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
# Interactive CV profile: RMSE as a function of the number of components
ggplotly(ggplot(pls_mod))
```
```{r}
rmarkdown::paged_table(pls_mod$results)
```
```{r}
# Number of components selected by caret (lowest CV RMSE)
pls_mod$bestTune
```
```{r}
pls_mod$results[which.min(pls_mod$results$RMSE), ]
```
```{r}
vip(pls_mod, num_features = 20)
```
```{r}
# Regression coefficients of the final PLS fit
rmarkdown::paged_table(data.frame(Coefficients = as.matrix(coef(pls_mod$finalModel))))
```
****
# Models Comparison
****
## Graphical comparison of model performance
### On the training set
```{r}
# Gather in-sample fits from every model next to the observed response
yTrain <- cookie.train$sugars
dTrain <- data.frame(
  yTrain = yTrain,
  linear = fitted(linear.mod),
  Lasso = fitted(Lasso),
  ridge = fitted(ridge),
  ElNet = fitted(ElNet),
  pls = fitted(pls_mod)
)
# Long format: one (model, fitted value) pair per row, for faceting
melt.dTrain <- melt(dTrain, id = "yTrain", variable.name = "model")
ggplot(melt.dTrain, aes(x = yTrain, y = value)) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Train dataset") +
  ylab("Fitted value") +
  xlab("Y")
```
```{r}
dTrain %>% rmarkdown::paged_table()
```
```{r}
melt.dTrain %>% rmarkdown::paged_table()
```
### On the test set
```{r}
# Gather out-of-sample predictions from every model on the test set
yTest <- cookie.test$sugars
dTest <- data.frame(
  yTest = yTest,
  linear = predict(linear.mod, newdata = cookie.test),
  Lasso = predict(Lasso, newdata = cookie.test),
  ridge = predict(ridge, newdata = cookie.test),
  ElNet = predict(ElNet, newdata = cookie.test),
  pls = predict(pls_mod, newdata = cookie.test)
)
# Long format for the faceted predicted-vs-observed plot
melt.dTest <- melt(dTest, id = "yTest", variable.name = "model")
ggplot(melt.dTest, aes(x = yTest, y = value)) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Test dataset") +
  ylab("Fitted value") +
  xlab("Y") +
  theme_bw()
```
****
## RMSE comparison among models
```{r}
# Train/test RMSE side by side for each fitted model
RMSE <- data.frame(
  Train = c(
    rmse(yTrain, dTrain$linear),
    rmse(yTrain, dTrain$Lasso),
    rmse(yTrain, dTrain$ridge),
    rmse(yTrain, dTrain$ElNet),
    rmse(yTrain, dTrain$pls)
  ),
  Test = c(
    rmse(yTest, dTest$linear),
    rmse(yTest, dTest$Lasso),
    rmse(yTest, dTest$ridge),
    rmse(yTest, dTest$ElNet),
    rmse(yTest, dTest$pls)
  ),
  row.names = c("Linear", "Lasso", "Ridge", "ElNet", "PLS")
)
RMSE %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling()
```
```{r}
# Scale of the response, to put the RMSE values above in context
summary(yTrain)
summary(yTest)
```
```{r, echo=FALSE}
# Rounded (2 d.p.) train/test RMSE per model, stored as scalars.
# NOTE(review): these variables are not used elsewhere in the visible code;
# presumably they feed inline `r ...` expressions later in the document --
# confirm before removing.
LM_train <- round(rmse(yTrain, dTrain$linear), 2)
LM_test <- round(rmse(yTest, dTest$linear), 2)
Lasso_train <- round(rmse(yTrain, dTrain$Lasso), 2)
Lasso_test <- round(rmse(yTest, dTest$Lasso), 2)
Ridge_train <- round(rmse(yTrain, dTrain$ridge), 2)
Ridge_test <- round(rmse(yTest, dTest$ridge), 2)
ElNet_train <- round(rmse(yTrain, dTrain$ElNet), 2)
ElNet_test <- round(rmse(yTest, dTest$ElNet), 2)
PLS_train <- round(rmse(yTrain, dTrain$pls), 2)
PLS_test <- round(rmse(yTest, dTest$pls), 2)
```