Add initial R project configuration file (studies.Rproj) with default settings

This commit is contained in:
2025-10-08 15:33:39 +02:00
parent 1effa6dc4b
commit 3cacb6be8a
5 changed files with 6298 additions and 0 deletions

View File

@@ -0,0 +1,515 @@
---
title: "Code: Biased regression"
subtitle: "Methods for Regression and classification"
author: "Katia Meziani"
output:
html_document:
self_contained: true
math_method:
engine: mathjax
url: https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js
code_folding: hide
css: ./style.css
df_print: paged
highlight: tango
number_sections: no
theme: flatly
toc: yes
toc_float:
collapsed: no
---
<style>
.column {
width: 50%;
float: left;
}
</style>
<style>
.image-wrapper {
text-align: center; /* Centre toutes les images */
}
.image-container {
display: inline-block;
margin-right: 20px; /* Espacement entre les images */
text-align: center; /* Centre le texte sous l'image */
}
.image-container:last-child {
margin-right: 0; /* Pas d'espacement après la dernière image */
}
</style>
<style>
.attention {
font-size: 24px;
color: red;
font-weight: bold;
}
</style>
<style>
.underline {
text-decoration: underline;
}
</style>
<style contenteditable>
.brd {
border: 2px solid black; padding: 5px;
}
</style>
<style>
.brdpurple {
border: 2px solid black;
padding: 10px; /* Espace interne */
background-color: rgba(128, 0, 128, 0.05); /* Fond violet très très clair avec transparence */
margin: 10px 0; /* Espace autour de l'encadrement */
}
</style>
<style contenteditable>
.brdred {
border: 2px solid #B22222 ;
padding: 10px; /* Ajouter de l'espace autour du texte */
background-color: #f9f9f9; /* Optionnel : couleur de fond pour le texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
<style>
.solution {
font-size: 24px;
color: yellow; /* Couleur jaune pour simuler la lumière */
}
</style>
<style>
.underline {
text-decoration: underline;
}
</style>
<style>
.brdgreen {
border: 2px solid black; /* Black border (the "green" in the class name refers to the pastel-green background below) */
padding: 10px; /* Espace interne */
background-color: rgba(114, 213, 114, 0.03); /* Fond vert pastel très clair avec transparence */
margin: 10px 0; /* Espace autour de l'encadrement */
}
</style>
<style>
.brdblack {
border: 2px solid black; /* Définir la couleur et l'épaisseur de la bordure */
padding: 10px; /* Ajouter de l'espace autour du texte */
background-color: #f9f9f9; /* Optionnel : couleur de fond pour le texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
<style>
.brdblackempty {
border: 2px solid black; /* Définir la couleur et l'épaisseur de la bordure */
padding: 10px; /* Ajouter de l'espace autour du texte */
margin: 10px 0; /* Ajouter de l'espace au-dessus et en-dessous de l'encadrement */
}
</style>
\newcommand{\Lasso}{\widehat{\beta}^{\lambda,L}}
\newcommand{\R}{\mathbb{R}}
\newcommand{\1}{\mathbb{1}}
\newcommand{\RE}{\text{RE}}
\newcommand{\sign}{\text{sign}}
\newcommand{\e}{\epsilon}
\renewcommand{\P}{\text{P}}
```{r setup, include=FALSE}
# Global chunk defaults: center every figure and silence package
# start-up messages and warnings throughout the document.
knitr::opts_chunk$set(
  warning = FALSE,
  message = FALSE,
  fig.align = "center"
)
```
## Packages
We begin by loading the necessary packages for this analysis.
```{r message=FALSE,warning=FALSE}
# All package loads grouped here so the dependency list is visible at a glance.
library(glmnet)    # regularized regression (lasso / ridge / elastic net)
library(caret)     # model training, tuning and cross-validation
library(ggplot2)   # data visualization
library(ggfortify) # autoplot() extensions for ggplot2
library(reshape2)  # data reshaping (melt)
library(Metrics)   # error metrics such as rmse()
library(vip)       # variable importance plots
library(dplyr)     # data manipulation (also attached by tidyverse below)
library(tidyverse) # attaches ggplot2, dplyr, tidyr, purrr, ...
library(plotly)    # interactive graphics (ggplotly), used in later sections
```
****
# The Dataset: Cookies
## Upload Datasets
```{r}
# NOTE(review): calling setwd() inside a knitr chunk is discouraged -- knitr
# manages the working directory per chunk, so a plain setwd() is unreliable
# for making later chunks read files from this folder. root.dir is the
# documented knitr option for changing the evaluation directory of all
# subsequent chunks.
knitr::opts_knit$set(root.dir = "/Users/arthurdanjou/Workspace/studies/M2/Linear Models/Biaised Models")
```
```{r}
# Training set: the first CSV column holds the row identifiers.
cookie.train <- read.csv(file = "Cookies_Train.csv", row.names = 1, header = TRUE)
```
```{r}
# Test set, same layout as the training file.
cookie.test <- read.csv(file = "Cookies_Test.csv", row.names = 1, header = TRUE)
```
## Custom Control Parameters
```{r}
# Resampling scheme shared by every model below:
# 5-fold cross-validation, repeated 3 times for more stable estimates.
custom <- trainControl(
  method = "repeatedcv",
  number = 5,
  repeats = 3,
  summaryFunction = defaultSummary, # reports RMSE, R^2 and MAE
  allowParallel = TRUE              # use a parallel backend when one is registered
)
```
****
# Models Study
****
## Linear regression analysis
```{r}
set.seed(602)
# Ordinary least squares on centered and scaled predictors.
linear.mod <- train(
  sugars ~ .,
  data = cookie.train,
  method = "lm",
  preProc = c("center", "scale"),
  trControl = custom
)
linear.mod$results
```
```{r}
# Observed training response side by side with the OLS fitted values.
Ytrain <- cookie.train$sugars
dfc_train <- data.frame(ytrain = Ytrain, linear.mod = fitted(linear.mod))
rmarkdown::paged_table(dfc_train)
```
```{r}
# Fitted vs observed: points should track the regression line if the fit is good.
ggplot(dfc_train, aes(x = ytrain, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Train Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")
```
```{r}
# Same diagnostic on the held-out data.
Ytest <- cookie.test$sugars
dfc_test <- data.frame(ytest = Ytest)
dfc_test$linear.mod <- predict(linear.mod, newdata = cookie.test)
# rmarkdown::paged_table(dfc_test)
ggplot(dfc_test, aes(x = ytest, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Test Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")
```
****
## Lasso regression analysis
```{r}
set.seed(602)
# Candidate penalties for the lasso (alpha = 1).
# NOTE(review): 10^seq(0, 0.1, length = 100) only spans [1, ~1.26]; a log-spaced
# grid such as 10^seq(-3, 0, ...) is more usual -- confirm this range is intended.
# grid_Lasso <- seq(0.001, 0.1, length = 100)
grid_Lasso <- 10^seq(0, 0.1, length = 100)
Lasso <- train(
  sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1, lambda = grid_Lasso),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
library(plotly)
# Interactive CV profile: RMSE as a function of the penalty.
ggplotly(ggplot(Lasso))
```
```{r}
rmarkdown::paged_table(Lasso$results)
```
```{r}
# Penalty retained by cross-validation.
Lasso$bestTune
```
```{r}
# CV metrics at the RMSE-minimizing lambda.
Lasso$results[which.min(Lasso$results$RMSE), ]
```
```{r}
# Save the graphics state and restore it afterwards so the 1x2 layout
# does not leak into later base-graphics plots.
op <- par(mfrow = c(1, 2))
plot(Lasso$finalModel, xvar = "lambda", label = TRUE) # coefficient paths vs log(lambda)
plot(Lasso$finalModel, xvar = "dev", label = TRUE)    # paths vs fraction of deviance explained
par(op)
```
```{r}
library(vip)
# Fifteen most influential predictors according to the lasso fit.
vip(Lasso, num_features = 15)
```
```{r}
# Non-zero coefficients at the selected penalty (the lasso performs selection).
beta_hat <- as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))
coef_lasso <- data.frame(
  Variable = rownames(beta_hat),
  Coefficient = beta_hat[, 1]
)
rmarkdown::paged_table(subset(coef_lasso, Coefficient != 0))
```
****
## Ridge regression analysis
```{r}
set.seed(602)
# Ridge regression (alpha = 0) over a narrow band of penalties.
lambda_ridge <- seq(11, 12, length = 100)
ridge <- train(
  sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0, lambda = lambda_ridge),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
library(plotly)
# Interactive CV profile for the ridge penalty.
ggplotly(ggplot(ridge))
```
```{r}
rmarkdown::paged_table(ridge$results)
```
```{r}
# Penalty retained by cross-validation.
ridge$bestTune
```
```{r}
# CV metrics at the RMSE-minimizing lambda.
ridge$results[which.min(ridge$results$RMSE), ]
```
```{r}
# Save the graphics state and restore it afterwards so the 1x2 layout
# does not leak into later base-graphics plots.
op <- par(mfrow = c(1, 2))
plot(ridge$finalModel, xvar = "lambda", label = TRUE) # coefficient paths vs log(lambda)
plot(ridge$finalModel, xvar = "dev", label = TRUE)    # paths vs fraction of deviance explained
par(op)
```
```{r}
# Ridge keeps every predictor; rank the 15 largest contributions.
vip(ridge, num_features = 15)
```
```{r}
# Full coefficient vector at the selected penalty (no exact zeros with ridge).
ridge_coefs <- as.matrix(coef(ridge$finalModel, ridge$bestTune$lambda))
rmarkdown::paged_table(data.frame(ridge_coefs))
```
****
## ElasticNet regression analysis
```{r}
set.seed(602)
# Elastic net: joint grid over the mixing parameter (alpha) and the penalty.
alpha_Enet <- seq(0.5, 0.9, length = 10)
lambda_Enet <- seq(0.01, 0.05, length = 10)
ElNet <- train(
  sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = alpha_Enet, lambda = lambda_Enet),
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
# Interactive CV surface over the (alpha, lambda) grid.
ggplotly(ggplot(ElNet))
```
```{r}
rmarkdown::paged_table(ElNet$results)
```
```{r}
# Best (alpha, lambda) pair found by cross-validation.
ElNet$bestTune
```
```{r}
# CV metrics at that best pair.
ElNet$results[which.min(ElNet$results$RMSE), ]
```
```{r}
# Fix: spell out TRUE instead of the reassignable shorthand T, and save /
# restore the graphics state so the 1x2 layout does not leak into later plots.
op <- par(mfrow = c(1, 2))
plot(ElNet$finalModel, xvar = "lambda", label = TRUE) # coefficient paths vs log(lambda)
plot(ElNet$finalModel, xvar = "dev", label = TRUE)    # paths vs fraction of deviance explained
par(op)
```
```{r}
# Twenty most important predictors for the elastic net fit.
vip(ElNet, num_features = 20)
```
```{r}
# Surviving (non-zero) coefficients at the tuned penalty.
enet_beta <- as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))
coef_elnet <- data.frame(
  Variable = rownames(enet_beta),
  Coefficient = enet_beta[, 1]
)
rmarkdown::paged_table(subset(coef_elnet, Coefficient != 0))
```
****
## PLS regression analysis
```{r}
set.seed(602)
# Partial least squares; tuneLength = 20 tries 1..20 components.
pls_mod <- train(
  sugars ~ .,
  data = cookie.train,
  method = "pls",
  tuneLength = 20,
  preProc = c("center", "scale"),
  trControl = custom
)
```
```{r}
# Interactive CV profile: RMSE as a function of the number of components.
ggplotly(ggplot(pls_mod))
```
```{r}
rmarkdown::paged_table(pls_mod$results)
```
```{r}
# Number of components selected by cross-validation.
pls_mod$bestTune
```
```{r}
# CV metrics at that number of components.
pls_mod$results[which.min(pls_mod$results$RMSE), ]
```
```{r}
# Twenty most important predictors for the PLS fit.
vip(pls_mod, num_features = 20)
```
```{r}
# Regression coefficients of the final PLS model.
pls_coefs <- data.frame(Coefficients = as.matrix(coef(pls_mod$finalModel)))
rmarkdown::paged_table(pls_coefs)
```
****
# Models Comparison
****
## Graphical comparison of model performance
### On the training set
```{r}
# Fitted values of every model on the training set, reshaped to long
# format so each model gets its own facet.
yTrain <- cookie.train$sugars
dTrain <- data.frame(yTrain = yTrain)
fits <- list(linear = linear.mod, Lasso = Lasso, ridge = ridge,
             ElNet = ElNet, pls = pls_mod)
for (m in names(fits)) {
  dTrain[[m]] <- fitted(fits[[m]])
}
melt.dTrain <- melt(dTrain, id = "yTrain", variable.name = "model")
ggplot(melt.dTrain, aes(x = yTrain, y = value)) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Train dataset") +
  ylab("Fitted value") +
  xlab("Y")
```
```{r}
# Wide table: actual response next to each model's fitted values.
rmarkdown::paged_table(dTrain)
```
```{r}
# Same data in long (model, value) format.
rmarkdown::paged_table(melt.dTrain)
```
### On the test set
```{r}
# Out-of-sample predictions for each model, faceted as on the training set.
yTest <- cookie.test$sugars
dTest <- data.frame(yTest = yTest)
models <- list(linear = linear.mod, Lasso = Lasso, ridge = ridge,
               ElNet = ElNet, pls = pls_mod)
for (m in names(models)) {
  dTest[[m]] <- predict(models[[m]], newdata = cookie.test)
}
# rmarkdown::paged_table(dTest)
melt.dTest <- melt(dTest, id = "yTest", variable.name = "model")
# rmarkdown::paged_table(melt.dTest)
ggplot(melt.dTest, aes(x = yTest, y = value)) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Test dataset") +
  ylab("Fitted value") +
  xlab("Y") +
  theme_bw()
```
****
## RMSE comparison among models
```{r}
# Train/test RMSE for every fitted model, one row per model.
model_cols <- c(Linear = "linear", Lasso = "Lasso", Ridge = "ridge",
                ElNet = "ElNet", PLS = "pls")
RMSE <- data.frame(
  Train = vapply(model_cols, function(m) rmse(yTrain, dTrain[[m]]), numeric(1)),
  Test  = vapply(model_cols, function(m) rmse(yTest, dTest[[m]]), numeric(1))
)
RMSE %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling()
```
```{r}
# Distribution of the response on both splits, to put the RMSE values in scale.
summary(yTrain)
summary(yTest)
```
```{r, echo=FALSE}
# Rounded RMSE values stored as scalar variables (presumably for inline text).
round_rmse <- function(actual, predicted) round(rmse(actual, predicted), 2)
LM_train    <- round_rmse(yTrain, dTrain$linear)
LM_test     <- round_rmse(yTest, dTest$linear)
Lasso_train <- round_rmse(yTrain, dTrain$Lasso)
Lasso_test  <- round_rmse(yTest, dTest$Lasso)
Ridge_train <- round_rmse(yTrain, dTrain$ridge)
Ridge_test  <- round_rmse(yTest, dTest$ridge)
ElNet_train <- round_rmse(yTrain, dTrain$ElNet)
ElNet_test  <- round_rmse(yTest, dTest$ElNet)
PLS_train   <- round_rmse(yTrain, dTrain$pls)
PLS_test    <- round_rmse(yTest, dTest$pls)
```

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

13
studies.Rproj Normal file
View File

@@ -0,0 +1,13 @@
Version: 1.0
RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX