---
title: "Code: Biased regression"
subtitle: "Methods for Regression and classification"
author: "Katia Meziani"
output:
  html_document:
    self_contained: true
    math_method:
      engine: mathjax
      url: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
    code_folding: hide
    css: ./style.css
    df_print: paged
    highlight: tango
    number_sections: no
    theme: flatly
    toc: yes
    toc_float:
      collapsed: no
---

<style>
.column {
  width: 50%;
  float: left;
}
</style>

<style>
.image-wrapper {
  text-align: center; /* Center all images */
}
.image-container {
  display: inline-block;
  margin-right: 20px; /* Spacing between images */
  text-align: center; /* Center the caption under the image */
}
.image-container:last-child {
  margin-right: 0; /* No spacing after the last image */
}
</style>

<style>
.attention {
  font-size: 24px;
  color: red;
  font-weight: bold;
}
</style>

<style>
.underline {
  text-decoration: underline;
}
</style>

<style>
.brd {
  border: 2px solid black;
  padding: 5px;
}
</style>

<style>
.brdpurple {
  border: 2px solid black;
  padding: 10px; /* Inner spacing */
  background-color: rgba(128, 0, 128, 0.05); /* Very light purple background with transparency */
  margin: 10px 0; /* Spacing around the box */
}
</style>

<style>
.brdred {
  border: 2px solid #B22222;
  padding: 10px; /* Spacing around the text */
  background-color: #f9f9f9; /* Optional: background colour for the text */
  margin: 10px 0; /* Spacing above and below the box */
}
</style>

<style>
.solution {
  font-size: 24px;
  color: yellow; /* Yellow colour to suggest a light bulb */
}
</style>

<style>
.brdgreen {
  border: 2px solid black;
  padding: 10px; /* Inner spacing */
  background-color: rgba(114, 213, 114, 0.03); /* Very light pastel green background with transparency */
  margin: 10px 0; /* Spacing around the box */
}
</style>

<style>
.brdblack {
  border: 2px solid black; /* Border colour and thickness */
  padding: 10px; /* Spacing around the text */
  background-color: #f9f9f9; /* Optional: background colour for the text */
  margin: 10px 0; /* Spacing above and below the box */
}
</style>

<style>
.brdblackempty {
  border: 2px solid black; /* Border colour and thickness */
  padding: 10px; /* Spacing around the text */
  margin: 10px 0; /* Spacing above and below the box */
}
</style>

\newcommand{\Lasso}{\widehat{\beta}^{\lambda,L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\1}{\mathbb{1}}
\newcommand{\RE}{\text{RE}}
\newcommand{\sign}{\text{sign}}
\newcommand{\e}{\epsilon}
\renewcommand{\P}{\text{P}}

```{r setup, include=FALSE}
knitr::opts_chunk$set(fig.align = "center", message = FALSE, warning = FALSE)
```

## Packages

We begin by loading the necessary packages for this analysis.

```{r message=FALSE, warning=FALSE}
library(glmnet) # for regularized regression
library(caret) # for training and evaluating models
library(ggplot2) # for data visualization
library(ggfortify) # to extend ggplot2 features for autoplot
library(reshape2) # for reshaping data
library(Metrics) # for calculating metrics like RMSE
library(vip) # for variable importance visualization
library(dplyr) # for data manipulation
library(tidyverse) # includes ggplot2, dplyr, and other useful packages
```

****

# The Dataset: Cookies

## Loading the Datasets

```{r}
# Note: setwd() only applies within this chunk; knitr restores the working
# directory afterwards, so later chunks read the CSV files relative to the Rmd file.
setwd("/Users/arthurdanjou/Workspace/studies/M2/Linear Models/Biaised Models")
```

```{r}
# Loading the training dataset
cookie.train <- read.csv("Cookies_Train.csv", header = TRUE, row.names = 1)
```

```{r}
# Loading the test dataset
cookie.test <- read.csv("Cookies_Test.csv", header = TRUE, row.names = 1)
```

## Custom Control Parameters

```{r}
custom <- trainControl(
  method = "repeatedcv",
  number = 5, # 5-fold cross-validation
  repeats = 3, # repeated 3 times for more stable estimates
  summaryFunction = defaultSummary, # default regression metrics (RMSE, R-squared, MAE)
  allowParallel = TRUE # use parallel processing if a backend is registered
)
```
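
`allowParallel = TRUE` only has an effect if a parallel backend has been registered beforehand; none is registered in this document, so the models below run sequentially. A minimal sketch, assuming the `doParallel` package is available (it is not loaded above):

```{r, eval=FALSE}
# Optional: register a parallel backend so caret can distribute the CV folds.
library(doParallel)
cl <- parallel::makePSOCKcluster(parallel::detectCores() - 1)
registerDoParallel(cl)
# ... fit the models with train() as below ...
parallel::stopCluster(cl)
```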

****

# Models Study

****

## Linear regression analysis

```{r}
set.seed(602)
linear.mod <- train(sugars ~ ., cookie.train,
  method = "lm",
  preProc = c("center", "scale"),
  trControl = custom
)
linear.mod$results
```

```{r}
Ytrain <- cookie.train$sugars
dfc_train <- data.frame(ytrain = Ytrain, linear.mod = fitted(linear.mod))
dfc_train %>% rmarkdown::paged_table()
```

```{r}
dfc_train %>%
  ggplot(aes(x = ytrain, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Train Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")
```

```{r}
Ytest <- cookie.test$sugars
dfc_test <- data.frame(ytest = Ytest)
dfc_test$linear.mod <- predict(linear.mod, newdata = cookie.test)
# dfc_test %>% rmarkdown::paged_table()

dfc_test %>%
  ggplot(aes(x = ytest, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Test Dataset") +
  ylab("Predicted Values") +
  xlab("Actual Values (Y)")
```

****

## Lasso regression analysis

```{r}
set.seed(602)
# grid_Lasso <- seq(0.001, 0.1, length = 100)
grid_Lasso <- 10^seq(0, 0.1, length = 100) # log-scale grid from 10^0 = 1 to 10^0.1 (about 1.26)
Lasso <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1, lambda = grid_Lasso),
  preProc = c("center", "scale"),
  trControl = custom
)
```
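
With the notation defined in the preamble, `alpha = 1` corresponds to the Lasso estimator, which glmnet computes (up to its internal standardization and the unpenalized intercept) as

$$\Lasso \in \arg\min_{\beta \in \R^p} \; \frac{1}{2n}\|Y - X\beta\|_2^2 + \lambda \|\beta\|_1.$$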

```{r}
library(plotly)
ggplotly(ggplot(Lasso))
```

```{r}
Lasso$results %>% rmarkdown::paged_table()
```

```{r}
Lasso$bestTune
```

```{r}
Lasso$results[which.min(Lasso$results$RMSE), ]
```

```{r}
par(mfrow = c(1, 2))
plot(Lasso$finalModel, xvar = "lambda", label = TRUE)
plot(Lasso$finalModel, xvar = "dev", label = TRUE)
```

```{r}
vip(Lasso, num_features = 15)
```

```{r}
coef_lasso <- data.frame(
  Variable = rownames(as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))),
  Coefficient = as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))[, 1]
)
coef_lasso %>%
  subset(Coefficient != 0) %>% # keep only the variables selected by the Lasso
  rmarkdown::paged_table()
```

****

## Ridge regression analysis

```{r}
set.seed(602)
lambda_ridge <- seq(11, 12, length = 100)
ridge <- train(sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0, lambda = lambda_ridge),
  preProc = c("center", "scale"),
  trControl = custom
)
```
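
Setting `alpha = 0` gives Ridge regression. Up to glmnet's internal scaling of the penalty, the Ridge estimator solves

$$\widehat{\beta}^{\lambda,R} = \arg\min_{\beta \in \R^p} \; \|Y - X\beta\|_2^2 + \lambda \|\beta\|_2^2 = (X^\top X + \lambda I_p)^{-1} X^\top Y,$$

which shrinks all coefficients towards zero but, unlike the Lasso, does not set any of them exactly to zero.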

```{r}
ggplotly(ggplot(ridge))
```

```{r}
ridge$results %>% rmarkdown::paged_table()
```

```{r}
ridge$bestTune
```

```{r}
ridge$results[which.min(ridge$results$RMSE), ]
```

```{r}
par(mfrow = c(1, 2))
plot(ridge$finalModel, xvar = "lambda", label = TRUE)
plot(ridge$finalModel, xvar = "dev", label = TRUE)
```

```{r}
vip(ridge, num_features = 15)
```

```{r}
data.frame(as.matrix(coef(ridge$finalModel, ridge$bestTune$lambda))) %>%
  rmarkdown::paged_table()
```

****

## ElasticNet regression analysis

```{r}
set.seed(602)
alpha_Enet <- seq(0.5, 0.9, length = 10)
lambda_Enet <- seq(0.01, 0.05, length = 10)

ElNet <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = alpha_Enet, lambda = lambda_Enet),
  preProc = c("center", "scale"),
  trControl = custom
)
```
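
The elastic net interpolates between the two previous penalties: for a mixing parameter $\alpha \in (0, 1)$, glmnet minimizes

$$\widehat{\beta}^{\lambda,\alpha} \in \arg\min_{\beta \in \R^p} \; \frac{1}{2n}\|Y - X\beta\|_2^2 + \lambda \left( \alpha \|\beta\|_1 + \frac{1-\alpha}{2} \|\beta\|_2^2 \right),$$

so that `alpha = 1` recovers the Lasso and `alpha = 0` the Ridge; here both `alpha` and `lambda` are tuned by cross-validation.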

```{r}
ggplotly(ggplot(ElNet))
```

```{r}
ElNet$results %>% rmarkdown::paged_table()
```

```{r}
ElNet$bestTune
```

```{r}
ElNet$results[which.min(ElNet$results$RMSE), ]
```

```{r}
par(mfrow = c(1, 2))
plot(ElNet$finalModel, xvar = "lambda", label = TRUE)
plot(ElNet$finalModel, xvar = "dev", label = TRUE)
```

```{r}
vip(ElNet, num_features = 20)
```

```{r}
coef_elnet <- data.frame(
  Variable = rownames(as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))),
  Coefficient = as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))[, 1]
)
coef_elnet %>%
  subset(Coefficient != 0) %>% # keep only the variables retained by the elastic net
  rmarkdown::paged_table()
```

****

## PLS regression analysis

```{r}
set.seed(602)
pls_mod <- train(sugars ~ ., cookie.train,
  method = "pls",
  tuneLength = 20, # tries 1 to 20 PLS components
  preProc = c("center", "scale"),
  trControl = custom
)
```
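
As an optional check that is not part of the original analysis, the proportion of variance explained by each retained component can be read off the underlying `pls` model object. A minimal sketch, assuming the `pls` package is installed (caret loads it for `method = "pls"`):

```{r, eval=FALSE}
# Percentage of predictor variance captured by each PLS component
pls::explvar(pls_mod$finalModel)
# Full summary, including variance explained in the response
summary(pls_mod$finalModel)
```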

```{r}
ggplotly(ggplot(pls_mod))
```

```{r}
pls_mod$results %>% rmarkdown::paged_table()
```

```{r}
pls_mod$bestTune
```

```{r}
pls_mod$results[which.min(pls_mod$results$RMSE), ]
```

```{r}
vip(pls_mod, num_features = 20)
```

```{r}
data.frame(Coefficients = as.matrix(coef(pls_mod$finalModel))) %>%
  rmarkdown::paged_table()
```

****

# Models Comparison

****

## Graphical comparison of model performance

### On the training set

```{r}
yTrain <- cookie.train$sugars
dTrain <- data.frame(yTrain = yTrain)
dTrain$linear <- fitted(linear.mod)
dTrain$Lasso <- fitted(Lasso)
dTrain$ridge <- fitted(ridge)
dTrain$ElNet <- fitted(ElNet)
dTrain$pls <- fitted(pls_mod)
melt.dTrain <- melt(dTrain, id = "yTrain", variable.name = "model")
melt.dTrain %>% ggplot() +
  aes(x = yTrain, y = value) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Train dataset") +
  ylab("Fitted value") +
  xlab("Y")
```

```{r}
dTrain %>% rmarkdown::paged_table()
```

```{r}
melt.dTrain %>% rmarkdown::paged_table()
```

### On the test set

```{r}
yTest <- cookie.test$sugars
dTest <- data.frame(yTest = yTest)
dTest$linear <- predict(linear.mod, newdata = cookie.test)
dTest$Lasso <- predict(Lasso, newdata = cookie.test)
dTest$ridge <- predict(ridge, newdata = cookie.test)
dTest$ElNet <- predict(ElNet, newdata = cookie.test)
dTest$pls <- predict(pls_mod, newdata = cookie.test)
# dTest %>% rmarkdown::paged_table()
melt.dTest <- melt(dTest, id = "yTest", variable.name = "model")
# melt.dTest %>% rmarkdown::paged_table()
melt.dTest %>% ggplot() +
  aes(x = yTest, y = value) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Test dataset") +
  ylab("Predicted value") +
  xlab("Y") +
  theme_bw()
```

****

## RMSE comparison among models

```{r}
RMSE <- rbind.data.frame(
  cbind(rmse(yTrain, dTrain$linear), rmse(yTest, dTest$linear)),
  cbind(rmse(yTrain, dTrain$Lasso), rmse(yTest, dTest$Lasso)),
  cbind(rmse(yTrain, dTrain$ridge), rmse(yTest, dTest$ridge)),
  cbind(rmse(yTrain, dTrain$ElNet), rmse(yTest, dTest$ElNet)),
  cbind(rmse(yTrain, dTrain$pls), rmse(yTest, dTest$pls))
)
names(RMSE) <- c("Train", "Test")
row.names(RMSE) <- c("Linear", "Lasso", "Ridge", "ElNet", "PLS")
RMSE %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling()
```
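
For reference, `Metrics::rmse(actual, predicted)` computes

$$\text{RMSE} = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( Y_i - \widehat{Y}_i \right)^2},$$

so both columns of the table are on the scale of the response `sugars`.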

The summaries of `sugars` on the training and test sets give a sense of scale for the RMSE values above.

```{r}
summary(yTrain)
summary(yTest)
```

```{r, echo=FALSE}
LM_train <- round(rmse(yTrain, dTrain$linear), 2)
LM_test <- round(rmse(yTest, dTest$linear), 2)
Lasso_train <- round(rmse(yTrain, dTrain$Lasso), 2)
Lasso_test <- round(rmse(yTest, dTest$Lasso), 2)
Ridge_train <- round(rmse(yTrain, dTrain$ridge), 2)
Ridge_test <- round(rmse(yTest, dTest$ridge), 2)
ElNet_train <- round(rmse(yTrain, dTrain$ElNet), 2)
ElNet_test <- round(rmse(yTest, dTest$ElNet), 2)
PLS_train <- round(rmse(yTrain, dTrain$pls), 2)
PLS_test <- round(rmse(yTest, dTest$pls), 2)
```