mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-14 15:54:13 +01:00
513 lines
14 KiB
Plaintext
513 lines
14 KiB
Plaintext
---
|
|
title: "Generalized Linear Models - Project by Arthur DANJOU"
|
|
always_allow_html: true
|
|
output:
|
|
html_document:
|
|
toc: true
|
|
toc_depth: 4
|
|
fig_caption: true
|
|
---
|
|
|
|
## Data Analysis
|
|
|
|
```{r}
|
|
setwd('/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/Projet')
|
|
```
|
|
|
|
```{r}
|
|
library(MASS)
|
|
library(AER)
|
|
library(rmarkdown)
|
|
library(car)
|
|
library(corrplot)
|
|
library(carData)
|
|
library(ggfortify)
|
|
library(ggplot2)
|
|
library(gridExtra)
|
|
library(caret)
|
|
```
|
|
|
|
### Data Preprocessing
|
|
```{r}
|
|
data <- read.csv("./projet.csv", header = TRUE, sep = ",", dec = ".")
|
|
|
|
# Factors
|
|
data$saison <- as.factor(data$saison)
|
|
data$mois <- as.factor(data$mois)
|
|
data$jour_mois <- as.factor(data$jour_mois)
|
|
data$jour_semaine <- as.factor(data$jour_semaine)
|
|
data$horaire <- as.factor(data$horaire)
|
|
data$jour_travail <- as.factor(data$jour_travail)
|
|
data$vacances <- as.factor(data$vacances)
|
|
data$meteo <- as.factor(data$meteo)
|
|
|
|
# Quantitative variables
|
|
data$humidite_sqrt <- sqrt(data$humidite)
|
|
data$humidite_square <- data$humidite^2
|
|
|
|
data$temperature1_square <- data$temperature1^2
|
|
data$temperature2_square <- data$temperature2^2
|
|
|
|
data$vent_square <- data$vent^2
|
|
data$vent_sqrt <- sqrt(data$vent)
|
|
|
|
# Remove obs column
|
|
rownames(data) <- data$obs
|
|
data <- data[, -1]
|
|
paged_table(data)
|
|
```
|
|
|
|
```{r}
|
|
colSums(is.na(data))
|
|
sum(duplicated(data))
|
|
```
|
|
|
|
```{r}
|
|
str(data)
|
|
summary(data)
|
|
```
|
|
|
|
### Study of the Quantitative Variables
|
|
|
|
#### Distribution of Variables
|
|
|
|
```{r}
|
|
plot_velos <- ggplot(data, aes(x = velos)) +
|
|
geom_histogram(bins = 25, fill = "blue") +
|
|
labs(title = "Distribution du nombre de vélos loués", x = "Nombre de locations de vélos")
|
|
|
|
plot_temperature1 <- ggplot(data, aes(x = temperature1)) +
|
|
geom_histogram(bins = 30, fill = "green") +
|
|
labs(title = "Distribution de la température1", x = "Température moyenne mesuré (°C)")
|
|
|
|
plot_temperature2 <- ggplot(data, aes(x = temperature2)) +
|
|
geom_histogram(bins = 30, fill = "green") +
|
|
labs(title = "Distribution de la température2", x = "Température moyenne ressentie (°C)")
|
|
|
|
plot_humidite <- ggplot(data, aes(x = humidite)) +
|
|
geom_histogram(bins = 30, fill = "green") +
|
|
labs(title = "Distribution de la humidité", x = "Pourcentage d'humidité")
|
|
|
|
plot_vent <- ggplot(data, aes(x = vent)) +
|
|
geom_histogram(bins = 30, fill = "green") +
|
|
labs(title = "Distribution de la vent", x = "Vitesse du vent (Km/h)")
|
|
|
|
grid.arrange(plot_velos, plot_temperature1, plot_temperature2, plot_humidite, plot_vent, ncol = 3)
|
|
```
|
|
|
|
#### Correlation between Quantitatives Variables
|
|
|
|
```{r}
|
|
#detach(package:arm) # If error, uncomment this line due to duplicate function 'corrplot'
|
|
corr_matrix <- cor(data[, sapply(data, is.numeric)])
|
|
corrplot(corr_matrix, type = "lower", tl.col = "black", tl.srt = 45)
|
|
pairs(data[, sapply(data, is.numeric)])
|
|
cor.test(data$temperature1, data$temperature2)
|
|
```
|
|
|
|
### Study of the Qualitative Variables
|
|
|
|
```{r}
|
|
plot_saison <- ggplot(data, aes(x = saison, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par saison", x = "Saison", y = "Nombre de vélos loués")
|
|
|
|
plot_jour_semaine <- ggplot(data, aes(x = jour_semaine, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par jour de la semaine", x = "Jour de la semaine", y = "Nombre de vélos loués")
|
|
|
|
plot_mois <- ggplot(data, aes(x = mois, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par mois de l'année", x = "Mois de l'année", y = "Nombre de vélos loués")
|
|
|
|
plot_jour_mois <- ggplot(data, aes(x = jour_mois, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par jour du mois", x = "Jour du mois", y = "Nombre de vélos loués")
|
|
|
|
plot_horaire <- ggplot(data, aes(x = horaire, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par horaire de la journée", x = "Horaire de la journée", y = "Nombre de vélos loués")
|
|
|
|
plot_jour_travail <- ggplot(data, aes(x = jour_travail, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par jour travaillé", x = "Jour travaillé", y = "Nombre de vélos loués")
|
|
|
|
plot_vacances <- ggplot(data, aes(x = vacances, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par vacances", x = "Vacances", y = "Nombre de vélos loués")
|
|
|
|
plot_meteo <- ggplot(data, aes(x = meteo, y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Nombre de vélos par météo", x = "Météo", y = "Nombre de vélos loués")
|
|
|
|
grid.arrange(plot_horaire, plot_jour_semaine, plot_jour_mois, plot_mois, plot_saison, plot_jour_travail, plot_vacances, plot_meteo, ncol = 3)
|
|
```
|
|
|
|
```{r}
|
|
chisq.test(data$mois, data$saison)
|
|
chisq.test(data$meteo, data$saison)
|
|
chisq.test(data$meteo, data$mois)
|
|
```
|
|
|
|
### Outliers Detection
|
|
|
|
```{r}
|
|
boxplot_velos <- ggplot(data, aes(x = "", y = velos)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de vélos", y = "Nombre de vélos")
|
|
|
|
boxplot_temperature1 <- ggplot(data, aes(x = "", y = temperature1)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de température1", y = "Température moyenne mesuré (°C)")
|
|
|
|
boxplot_temperature1_sq <- ggplot(data, aes(x = "", y = temperature1_square)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de température1^2", y = "Température moyenne mesuré (°C^2)")
|
|
|
|
boxplot_temperature2 <- ggplot(data, aes(x = "", y = temperature2)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de température2", y = "Température moyenne ressentie (°C)")
|
|
|
|
boxplot_temperature2_sq <- ggplot(data, aes(x = "", y = temperature2_square)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de température2^2", y = "Température moyenne ressentie (°C^2)")
|
|
|
|
boxplot_humidite <- ggplot(data, aes(x = "", y = humidite)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de humidité", y = "Pourcentage d'humidité")
|
|
|
|
boxplot_humidite_sqrt <- ggplot(data, aes(x = "", y = humidite_sqrt)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de sqrt(humidité)", y = "Pourcentage d'humidité (sqrt(%))")
|
|
|
|
boxplot_humidite_square <- ggplot(data, aes(x = "", y = humidite_square)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de humidité^2", y = "Pourcentage d'humidité (%^2)")
|
|
|
|
boxplot_vent <- ggplot(data, aes(x = "", y = vent)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de vent", y = "Vitesse du vent (Km/h)")
|
|
|
|
boxplot_vent_sqrt <- ggplot(data, aes(x = "", y = vent_sqrt)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de sqrt(vent)", y = "Vitesse du vent sqrt(Km/h)")
|
|
|
|
boxplot_vent_square <- ggplot(data, aes(x = "", y = vent_square)) +
|
|
geom_boxplot(fill = "lightblue") +
|
|
labs(title = "Boxplot de vent^2", y = "Vitesse du vent (Km/h)^2")
|
|
|
|
grid.arrange(boxplot_velos, boxplot_temperature1, boxplot_temperature1_sq, boxplot_temperature2, boxplot_temperature2_sq, boxplot_humidite, boxplot_humidite_sqrt, boxplot_humidite_square, boxplot_vent, boxplot_vent_sqrt, boxplot_vent_square, ncol = 4)
|
|
```
|
|
|
|
```{r}
|
|
length_data <- nrow(data)
|
|
numeric_vars <- sapply(data, is.numeric)
|
|
total_outliers <- 0
|
|
|
|
for (var in names(data)[numeric_vars]) {
|
|
data_outlier <- data[[var]]
|
|
Q1 <- quantile(data_outlier, 0.25)
|
|
Q3 <- quantile(data_outlier, 0.75)
|
|
IQR <- Q3 - Q1
|
|
|
|
lower_limit <- Q1 - 1.5 * IQR
|
|
upper_limit <- Q3 + 1.5 * IQR
|
|
|
|
outliers <- data[data_outlier < lower_limit | data_outlier > upper_limit,]
|
|
cat("Number of outliers for the variable", var, ":", nrow(outliers), "\n")
|
|
data <- data[data_outlier >= lower_limit & data_outlier <= upper_limit,]
|
|
}
|
|
|
|
cat("Number of outliers removed :", length_data - nrow(data), "\n")
|
|
cat("Data length after removing outliers :", nrow(data), "\n")
|
|
```
|
|
|
|
## Model Creation and Comparison
|
|
|
|
### Data Split
|
|
|
|
```{r}
|
|
set.seed(123)
|
|
data_split <- rsample::initial_split(data, prop = 0.8)
|
|
data_train <- rsample::training(data_split)
|
|
data_test <- rsample::testing(data_split)
|
|
```
|
|
|
|
### Choice of the Distribution *vélos*
|
|
|
|
```{r}
|
|
model_poisson <- glm(velos ~ ., family = poisson, data = data_train)
|
|
model_nb <- glm.nb(velos ~ ., data = data_train)
|
|
model_gaussian <- glm(velos ~ ., family = gaussian, data = data_train)
|
|
t(AIC(model_poisson, model_nb, model_gaussian)[2])
|
|
|
|
dispersiontest(model_poisson)
|
|
|
|
mean_velos <- mean(data_train$velos)
|
|
var_velos <- var(data_train$velos)
|
|
cat("Mean :", mean_velos, "Variance :", var_velos, "\n")
|
|
```
|
|
|
|
### Model Selection
|
|
|
|
```{r}
|
|
model_full_quantitative <- glm.nb(
|
|
velos ~ vent +
|
|
vent_square +
|
|
vent_sqrt +
|
|
|
|
humidite +
|
|
humidite_square +
|
|
humidite_sqrt +
|
|
|
|
temperature1_square +
|
|
temperature1 +
|
|
|
|
temperature2 +
|
|
temperature2_square
|
|
, data = data_train)
|
|
summary(model_full_quantitative)
|
|
```
|
|
|
|
```{r}
|
|
anova(model_full_quantitative)
|
|
```
|
|
|
|
```{r}
|
|
model_quantitative_quali <- glm.nb(
|
|
velos ~
|
|
vent +
|
|
vent_square +
|
|
vent_sqrt +
|
|
humidite +
|
|
humidite_square +
|
|
humidite_sqrt +
|
|
temperature1 +
|
|
temperature1_square +
|
|
|
|
horaire +
|
|
saison +
|
|
meteo +
|
|
mois +
|
|
vacances +
|
|
jour_travail +
|
|
jour_semaine +
|
|
jour_mois
|
|
, data = data_train)
|
|
summary(model_quantitative_quali)
|
|
```
|
|
|
|
```{r}
|
|
anova(model_quantitative_quali)
|
|
```
|
|
|
|
```{r}
|
|
model_quanti_quali_final <- glm.nb(
|
|
velos ~
|
|
vent +
|
|
vent_square + #
|
|
humidite +
|
|
humidite_square +
|
|
humidite_sqrt + #
|
|
temperature1 +
|
|
temperature1_square +
|
|
|
|
horaire +
|
|
saison +
|
|
meteo +
|
|
mois +
|
|
vacances
|
|
, data = data_train)
|
|
summary(model_quanti_quali_final)
|
|
```
|
|
|
|
```{r}
|
|
model_0 <- glm.nb(velos ~ 1, data = data_train)
|
|
model_forward <- stepAIC(
|
|
model_0,
|
|
vélos ~ vent +
|
|
humidite +
|
|
humidite_square +
|
|
temperature1 +
|
|
temperature1_square +
|
|
|
|
horaire +
|
|
saison +
|
|
meteo +
|
|
mois +
|
|
vacances,
|
|
data = data_train,
|
|
trace = FALSE,
|
|
direction = "forward"
|
|
)
|
|
model_backward <- stepAIC(
|
|
model_quanti_quali_final,
|
|
~1,
|
|
trace = FALSE,
|
|
direction = "backward"
|
|
)
|
|
model_both <- stepAIC(
|
|
model_0,
|
|
vélos ~ vent +
|
|
humidite +
|
|
humidite_square +
|
|
temperature1 +
|
|
temperature1_square +
|
|
|
|
horaire +
|
|
saison +
|
|
meteo +
|
|
mois +
|
|
vacances,
|
|
data = data_train,
|
|
trace = FALSE,
|
|
direction = "both"
|
|
)
|
|
AIC(model_forward, model_both, model_backward)
|
|
summary(model_forward)
|
|
```
|
|
|
|
```{r}
|
|
model_final_without_interaction <- glm.nb(
|
|
velos ~ horaire +
|
|
mois +
|
|
meteo +
|
|
temperature1 +
|
|
saison +
|
|
temperature1_square +
|
|
humidite_square +
|
|
vacances +
|
|
humidite +
|
|
vent
|
|
, data = data_train)
|
|
summary(model_final_without_interaction)
|
|
```
|
|
|
|
### Final Model
|
|
|
|
#### Choice and Validation of the Model
|
|
|
|
```{r}
|
|
model_final <- glm.nb(
|
|
velos ~ horaire +
|
|
mois +
|
|
meteo +
|
|
temperature1 +
|
|
saison +
|
|
temperature1_square +
|
|
humidite_square +
|
|
vacances +
|
|
humidite +
|
|
vent +
|
|
|
|
horaire:temperature1 +
|
|
temperature1_square:mois
|
|
, data = data_train
|
|
)
|
|
summary(model_final)
|
|
```
|
|
|
|
```{r}
|
|
anova(model_final, test = "Chisq")
|
|
```
|
|
|
|
```{r}
|
|
lrtest(model_final, model_final_without_interaction)
|
|
```
|
|
|
|
```{r}
|
|
dispersion_ratio <- sum(residuals(model_final, type = "pearson")^2) / df.residual(model_final)
|
|
print(paste("Dispersion Ratio:", round(dispersion_ratio, 2)))
|
|
|
|
hist(residuals(model_final, type = "deviance"), breaks = 30, freq = FALSE, col = "lightblue", main = "Histogram of Deviance Residuals")
|
|
x <- seq(-5, 5, length = 100)
|
|
curve(dnorm(x), col = "darkblue", lwd = 2, add = TRUE)
|
|
|
|
autoplot(model_final, 1:4)
|
|
```
|
|
|
|
```{r}
|
|
library(arm)
|
|
binnedplot(fitted(model_final), residuals(model_final, type = "deviance"), col.int = "blue", col.pts = 2)
|
|
```
|
|
|
|
## Performance and Limitations of the Final Model
|
|
|
|
### Predictions and *Mean Square Error*
|
|
|
|
```{r}
|
|
data_test$predictions <- predict(model_final, newdata = data_test, type = "response")
|
|
data_train$predictions <- predict(model_final, newdata = data_train, type = "response")
|
|
|
|
predictions_ci <- predict(
|
|
model_final,
|
|
newdata = data_test,
|
|
type = "link",
|
|
se.fit = TRUE
|
|
)
|
|
|
|
data_test$lwr <- exp(predictions_ci$fit - 1.96 * predictions_ci$se.fit)
|
|
data_test$upr <- exp(predictions_ci$fit + 1.96 * predictions_ci$se.fit)
|
|
|
|
MSE <- mean((data_test$velos - data_test$predictions)^2)
|
|
cat("MSE :", MSE, "\n")
|
|
cat("RMSE train :", sqrt(mean((data_train$velos - data_train$predictions)^2)), "\n")
|
|
cat('RMSE :', sqrt(MSE), '\n')
|
|
```
|
|
|
|
### Evaluation of Model Performance
|
|
|
|
```{r}
|
|
bounds <- c(200, 650)
|
|
|
|
cat("Observations vs Prédictions\n")
|
|
cat(nrow(data_test[data_test$velos < bounds[1],]), "vs", sum(data_test$predictions < bounds[1]), "\n")
|
|
cat(nrow(data_test[data_test$velos > bounds[1] & data_test$velos < bounds[2],]), "vs", sum(data_test$predictions > bounds[1] & data_test$predictions < bounds[2]), "\n")
|
|
cat(nrow(data_test[data_test$velos > bounds[2],]), "vs", sum(data_test$predictions > bounds[2]), "\n")
|
|
cat('\n')
|
|
|
|
categories <- c(0, bounds, Inf)
|
|
categories_label <- c("Low", "Mid", "High")
|
|
|
|
data_test$velos_cat <- cut(data_test$velos,
|
|
breaks = categories,
|
|
labels = categories_label,
|
|
include.lowest = TRUE)
|
|
data_test$predictions_cat <- cut(data_test$predictions,
|
|
breaks = categories,
|
|
labels = categories_label,
|
|
include.lowest = TRUE)
|
|
conf_matrix <- confusionMatrix(data_test$velos_cat, data_test$predictions_cat)
|
|
conf_matrix
|
|
```
|
|
|
|
```{r}
|
|
ggplot(data_test, aes(x = velos, y = predictions)) +
|
|
geom_point(color = "blue", alpha = 0.6, size = 2) +
|
|
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
|
|
geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, fill = "grey") +
|
|
labs(
|
|
title = "Comparison of Observed and Predicted Bikes",
|
|
x = "Number of Observed Bikes",
|
|
y = "Number of Predicted Bikes"
|
|
) +
|
|
theme_minimal()
|
|
|
|
ggplot(data_test, aes(x = velos_cat, fill = predictions_cat)) +
|
|
geom_bar(position = "dodge") +
|
|
labs(title = "Predictions vs Observations (by categories)",
|
|
x = "Observed categories", y = "Number of predictions")
|
|
|
|
df_plot <- data.frame(observed = data_test$velos, predicted = data_test$predictions)
|
|
|
|
# Créer l'histogramme avec la courbe de densité
|
|
ggplot(df_plot, aes(x = observed)) +
|
|
geom_histogram(aes(y = ..density..), binwidth = 50, fill = "lightblue", color = "black", alpha = 0.7) + # Histogramme des données observées
|
|
geom_density(aes(x = predicted), color = "red", size = 1) + # Courbe de densité des valeurs prédites
|
|
labs(title = "Observed distribution vs. Predicted distribution",
|
|
x = "Number of vélos",
|
|
y = "Density") +
|
|
theme_bw()
|
|
|
|
```
|