Packages

We begin by loading the necessary packages for this analysis.

library(glmnet) # for regularized regression
library(caret) # for training and evaluating models
library(ggplot2) # for data visualization
library(ggfortify) # to extend ggplot2 features for autoplot
library(reshape2) # for reshaping data
library(Metrics) # for calculating metrics like RMSE
library(vip) # for variable importance visualization
library(dplyr) # for data manipulation
library(tidyverse) # includes ggplot2, dplyr, and other useful packages

The Dataset: Cookies

Upload Datasets

setwd("/Users/arthurdanjou/Workspace/studies/M2/Linear Models/Biaised Models")
# Loading the training dataset
cookie.train <- read.csv("Cookies_Train.csv", header = TRUE, row.names = 1)
# Loading the test dataset
cookie.test <- read.csv("Cookies_Test.csv", header = TRUE, row.names = 1)

Custom Control Parameters

custom <- trainControl(
  method = "repeatedcv",
  number = 10, # Using 5-fold cross-validation
  repeats = 10, # Repeating 3 times for robustness
  summaryFunction = defaultSummary, # Default metrics (RMSE, MAE)
  allowParallel = TRUE # Use parallel processing if resources allow
)

Models Study


Linear regression analysis

set.seed(602)
linear.mod <- train(sugars ~ ., cookie.train, method = "lm", preProc = c("center", "scale"), trControl = custom)
linear.mod$results
Ytrain <- cookie.train$sugars
dfc_train <- data.frame(ytrain = Ytrain, linear.mod = fitted(linear.mod))
dfc_train %>% rmarkdown::paged_table()
dfc_train %>%
  ggplot(aes(x = ytrain, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Train Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")

Ytest <- cookie.test$sugars
dfc_test <- data.frame(ytest = Ytest)
dfc_test$linear.mod <- predict(linear.mod, newdata = cookie.test)
# dfc_test%>%rmarkdown::paged_table()

dfc_test %>%
  ggplot(aes(x = ytest, y = linear.mod)) +
  geom_point(size = 2, color = "#983399") +
  geom_smooth(method = "lm", color = "#389900") +
  ggtitle("Test Dataset") +
  ylab("Fitted Values") +
  xlab("Actual Values (Y)")


Lasso regression analysis

set.seed(602)
# grid_Lasso <- seq(0.001, 0.1, length = 100)
grid_Lasso <- 10^seq(-4, 1, length = 100)
Lasso <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 1, lambda = grid_Lasso),
  preProc = c("center", "scale"),
  trControl = custom
)
library(plotly)
ggplotly(ggplot(Lasso))
Lasso$results %>% rmarkdown::paged_table()
Lasso$bestTune
Lasso$results[which.min(Lasso$results$RMSE), ]
par(mfrow = c(1, 2))
plot(Lasso$finalModel, xvar = "lambda", label = TRUE)
plot(Lasso$finalModel, xvar = "dev", label = TRUE)

library(vip)
vip(Lasso, num_features = 15)

coef_lasso <- data.frame(
  Variable = rownames(as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))),
  Coefficient = as.matrix(coef(Lasso$finalModel, Lasso$bestTune$lambda))[, 1]
)
coef_lasso %>%
  subset(Coefficient != 0) %>%
  rmarkdown::paged_table()

Ridge regression analysis

set.seed(602)
lambda_ridge <- seq(11, 12, length = 100)
ridge <- train(sugars ~ .,
  data = cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0, lambda = lambda_ridge),
  preProc = c("center", "scale"),
  trControl = custom
)
library(plotly)
ggplotly(ggplot(ridge))
ridge$results %>% rmarkdown::paged_table()
ridge$bestTune
ridge$results[which.min(ridge$results$RMSE), ]
par(mfrow = c(1, 2))
plot(ridge$finalModel, xvar = "lambda", label = TRUE)
plot(ridge$finalModel, xvar = "dev", label = TRUE)

vip(ridge, num_features = 15)

data.frame(as.matrix(coef(ridge$finalModel, ridge$bestTune$lambda))) %>%
  rmarkdown::paged_table()

ElasticNet regression analysis

set.seed(602)
alpha_Enet <- seq(0.5, 0.9, length = 10)
lambda_Enet <- seq(0.01, 0.05, length = 10)

ElNet <- train(sugars ~ ., cookie.train,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = alpha_Enet, lambda = lambda_Enet),
  preProc = c("center", "scale"),
  trControl = custom
)
ggplotly(ggplot(ElNet))
ElNet$results %>% rmarkdown::paged_table()
ElNet$bestTune
ElNet$results[which.min(ElNet$results$RMSE), ]
par(mfrow = c(1, 2))
plot(ElNet$finalModel, xvar = "lambda", label = T)
plot(ElNet$finalModel, xvar = "dev", label = T)

vip(ElNet, num_features = 20)

coef_elnet <- data.frame(
  Variable = rownames(as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))),
  Coefficient = as.matrix(coef(ElNet$finalModel, ElNet$bestTune$lambda))[, 1]
)
coef_elnet %>%
  subset(Coefficient != 0) %>%
  rmarkdown::paged_table()

PLS regression analysis

set.seed(602)
pls_mod <- train(sugars ~ ., cookie.train,
  method = "pls",
  tuneLength = 20,
  preProc = c("center", "scale"),
  trControl = custom
)
ggplotly(ggplot(pls_mod))
pls_mod$results %>% rmarkdown::paged_table()
pls_mod$bestTune
pls_mod$results[which.min(pls_mod$results$RMSE), ]
vip(pls_mod, num_features = 20)

data.frame(Coefficients = as.matrix(coef(pls_mod$finalModel))) %>%
  rmarkdown::paged_table()

Models Comparaison


Graphical comparison of model performance

On the training set

yTrain <- cookie.train$sugars
dTrain <- data.frame(yTrain = yTrain)
dTrain$linear <- fitted(linear.mod)
dTrain$Lasso <- fitted(Lasso)
dTrain$ridge <- fitted(ridge)
dTrain$ElNet <- fitted(ElNet)
dTrain$pls <- fitted(pls_mod)
melt.dTrain <- melt(dTrain, id = "yTrain", variable.name = "model")
melt.dTrain %>% ggplot() +
  aes(x = yTrain, y = value) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Train dataset") +
  ylab("Fitted value") +
  xlab("Y")

dTrain %>% rmarkdown::paged_table()
melt.dTrain %>% rmarkdown::paged_table()

On the test set

yTest <- cookie.test$sugars
dTest <- data.frame(yTest = yTest)
dTest$linear <- predict(linear.mod, newdata = cookie.test)
dTest$Lasso <- predict(Lasso, newdata = cookie.test)
dTest$ridge <- predict(ridge, newdata = cookie.test)
dTest$ElNet <- predict(ElNet, newdata = cookie.test)
dTest$pls <- predict(pls_mod, newdata = cookie.test)
# dTest%>% rmarkdown::paged_table()
melt.dTest <- melt(dTest, id = "yTest", variable.name = "model")
# melt.dTest%>% rmarkdown::paged_table()
melt.dTest %>% ggplot() +
  aes(x = yTest, y = value) +
  geom_smooth(method = "lm") +
  geom_point(size = 1, colour = "#983399") +
  facet_wrap(~model, nrow = 3) +
  ggtitle("Test dataset") +
  ylab("Fitted value") +
  xlab("Y") +
  theme_bw()


RMSE comparaison among models

RMSE <- rbind.data.frame(
  cbind(rmse(yTrain, dTrain$linear), rmse(yTest, dTest$linear)),
  cbind(rmse(yTrain, dTrain$Lasso), rmse(yTest, dTest$Lasso)),
  cbind(rmse(yTrain, dTrain$ridge), rmse(yTest, dTest$ridge)),
  cbind(rmse(yTrain, dTrain$ElNet), rmse(yTest, dTest$ElNet)),
  cbind(rmse(yTrain, dTrain$pls), rmse(yTest, dTest$pls))
)
names(RMSE) <- c("Train", "Test")
row.names(RMSE) <- c("Linear", "Lasso", "Ridge", "ElNet", "PLS")
RMSE %>%
  kableExtra::kbl() %>%
  kableExtra::kable_styling()
Train Test
Linear 0.000000 11.5983578
Lasso 1.125158 1.0607338
Ridge 2.526064 2.5945854
ElNet 1.133296 1.0950125
PLS 2.425705 0.8836003
summary(yTrain)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.95   13.32   16.36   16.54   19.82   23.11
summary(yTest)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   10.12   13.38   16.66   16.66   19.93   23.19