---
title: "Generalized Linear Models - Project by Arthur DANJOU"
always_allow_html: true
output:
html_document:
toc: true
toc_depth: 4
fig_caption: true
---

## Data Analysis

```{r}
setwd('/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/Projet')
```

```{r}
library(MASS)
library(AER)
library(rmarkdown)
library(car)
library(corrplot)
library(carData)
library(ggfortify)
library(ggplot2)
library(gridExtra)
library(caret)
```

### Data Preprocessing
```{r}
data <- read.csv("./projet.csv", header = TRUE, sep = ",", dec = ".")

# Factors
data$saison <- as.factor(data$saison)
data$mois <- as.factor(data$mois)
data$jour_mois <- as.factor(data$jour_mois)
data$jour_semaine <- as.factor(data$jour_semaine)
data$horaire <- as.factor(data$horaire)
data$jour_travail <- as.factor(data$jour_travail)
data$vacances <- as.factor(data$vacances)
data$meteo <- as.factor(data$meteo)

# Quantitative variables
data$humidite_sqrt <- sqrt(data$humidite)
data$humidite_square <- data$humidite^2

data$temperature1_square <- data$temperature1^2
data$temperature2_square <- data$temperature2^2

data$vent_square <- data$vent^2
data$vent_sqrt <- sqrt(data$vent)

# Remove obs column
rownames(data) <- data$obs
data <- data[, -1]
paged_table(data)
```

```{r}
colSums(is.na(data))
sum(duplicated(data))
```

```{r}
str(data)
summary(data)
```

### Study of the Quantitative Variables

#### Distribution of Variables

```{r}
plot_velos <- ggplot(data, aes(x = velos)) +
  geom_histogram(bins = 25, fill = "blue") +
  labs(title = "Distribution du nombre de vélos loués", x = "Nombre de locations de vélos")

plot_temperature1 <- ggplot(data, aes(x = temperature1)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la température1", x = "Température moyenne mesuré (°C)")

plot_temperature2 <- ggplot(data, aes(x = temperature2)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la température2", x = "Température moyenne ressentie (°C)")

plot_humidite <- ggplot(data, aes(x = humidite)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la humidité", x = "Pourcentage d'humidité")

plot_vent <- ggplot(data, aes(x = vent)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la vent", x = "Vitesse du vent (Km/h)")

grid.arrange(plot_velos, plot_temperature1, plot_temperature2, plot_humidite, plot_vent, ncol = 3)
```

#### Correlation between Quantitatives Variables

```{r}
#detach(package:arm) # If error, uncomment this line due to duplicate function 'corrplot'
corr_matrix <- cor(data[, sapply(data, is.numeric)])
corrplot(corr_matrix, type = "lower", tl.col = "black", tl.srt = 45)
pairs(data[, sapply(data, is.numeric)])
cor.test(data$temperature1, data$temperature2)
```

### Study of the Qualitative Variables

```{r}
plot_saison <- ggplot(data, aes(x = saison, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par saison", x = "Saison", y = "Nombre de vélos loués")

plot_jour_semaine <- ggplot(data, aes(x = jour_semaine, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour de la semaine", x = "Jour de la semaine", y = "Nombre de vélos loués")

plot_mois <- ggplot(data, aes(x = mois, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par mois de l'année", x = "Mois de l'année", y = "Nombre de vélos loués")

plot_jour_mois <- ggplot(data, aes(x = jour_mois, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour du mois", x = "Jour du mois", y = "Nombre de vélos loués")

plot_horaire <- ggplot(data, aes(x = horaire, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par horaire de la journée", x = "Horaire de la journée", y = "Nombre de vélos loués")

plot_jour_travail <- ggplot(data, aes(x = jour_travail, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour travaillé", x = "Jour travaillé", y = "Nombre de vélos loués")

plot_vacances <- ggplot(data, aes(x = vacances, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par vacances", x = "Vacances", y = "Nombre de vélos loués")

plot_meteo <- ggplot(data, aes(x = meteo, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par météo", x = "Météo", y = "Nombre de vélos loués")

grid.arrange(plot_horaire, plot_jour_semaine, plot_jour_mois, plot_mois, plot_saison, plot_jour_travail, plot_vacances, plot_meteo, ncol = 3)
```

```{r}
chisq.test(data$mois, data$saison)
chisq.test(data$meteo, data$saison)
chisq.test(data$meteo, data$mois)
```

### Outliers Detection

```{r}
boxplot_velos <- ggplot(data, aes(x = "", y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vélos", y = "Nombre de vélos")

boxplot_temperature1 <- ggplot(data, aes(x = "", y = temperature1)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température1", y = "Température moyenne mesuré (°C)")

boxplot_temperature1_sq <- ggplot(data, aes(x = "", y = temperature1_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température1^2", y = "Température moyenne mesuré (°C^2)")

boxplot_temperature2 <- ggplot(data, aes(x = "", y = temperature2)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température2", y = "Température moyenne ressentie (°C)")

boxplot_temperature2_sq <- ggplot(data, aes(x = "", y = temperature2_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température2^2", y = "Température moyenne ressentie (°C^2)")

boxplot_humidite <- ggplot(data, aes(x = "", y = humidite)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de humidité", y = "Pourcentage d'humidité")

boxplot_humidite_sqrt <- ggplot(data, aes(x = "", y = humidite_sqrt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de sqrt(humidité)", y = "Pourcentage d'humidité (sqrt(%))")

boxplot_humidite_square <- ggplot(data, aes(x = "", y = humidite_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de humidité^2", y = "Pourcentage d'humidité (%^2)")

boxplot_vent <- ggplot(data, aes(x = "", y = vent)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vent", y = "Vitesse du vent (Km/h)")

boxplot_vent_sqrt <- ggplot(data, aes(x = "", y = vent_sqrt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de sqrt(vent)", y = "Vitesse du vent sqrt(Km/h)")

boxplot_vent_square <- ggplot(data, aes(x = "", y = vent_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vent^2", y = "Vitesse du vent (Km/h)^2")

grid.arrange(boxplot_velos, boxplot_temperature1, boxplot_temperature1_sq, boxplot_temperature2, boxplot_temperature2_sq, boxplot_humidite, boxplot_humidite_sqrt, boxplot_humidite_square, boxplot_vent, boxplot_vent_sqrt, boxplot_vent_square, ncol = 4)
```

```{r}
length_data <- nrow(data)
numeric_vars <- sapply(data, is.numeric)
total_outliers <- 0

for (var in names(data)[numeric_vars]) {
  data_outlier <- data[[var]]
  Q1 <- quantile(data_outlier, 0.25)
  Q3 <- quantile(data_outlier, 0.75)
  IQR <- Q3 - Q1

  lower_limit <- Q1 - 1.5 * IQR
  upper_limit <- Q3 + 1.5 * IQR

  outliers <- data[data_outlier < lower_limit | data_outlier > upper_limit,]
  cat("Number of outliers for the variable", var, ":", nrow(outliers), "\n")
  data <- data[data_outlier >= lower_limit & data_outlier <= upper_limit,]
}

cat("Number of outliers removed :", length_data - nrow(data), "\n")
cat("Data length after removing outliers :", nrow(data), "\n")
```

## Model Creation and Comparison

### Data Split

```{r}
set.seed(123)
data_split <- rsample::initial_split(data, prop = 0.8)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)
```

### Choice of the Distribution *vélos*

```{r}
model_poisson <- glm(velos ~ ., family = poisson, data = data_train)
model_nb <- glm.nb(velos ~ ., data = data_train)
model_gaussian <- glm(velos ~ ., family = gaussian, data = data_train)
t(AIC(model_poisson, model_nb, model_gaussian)[2])

dispersiontest(model_poisson)

mean_velos <- mean(data_train$velos)
var_velos <- var(data_train$velos)
cat("Mean :", mean_velos, "Variance :", var_velos, "\n")
```

### Model Selection

```{r}
model_full_quantitative <- glm.nb(
  velos ~ vent +
    vent_square +
    vent_sqrt +

    humidite +
    humidite_square +
    humidite_sqrt +

    temperature1_square +
    temperature1 +

    temperature2 +
    temperature2_square
  , data = data_train)
summary(model_full_quantitative)
```

```{r}
anova(model_full_quantitative)
```

```{r}
model_quantitative_quali <- glm.nb(
  velos ~
    vent +
      vent_square +
      vent_sqrt +
      humidite +
      humidite_square +
      humidite_sqrt +
      temperature1 +
      temperature1_square +

      horaire +
      saison +
      meteo +
      mois +
      vacances +
      jour_travail +
      jour_semaine +
      jour_mois
  , data = data_train)
summary(model_quantitative_quali)
```

```{r}
anova(model_quantitative_quali)
```

```{r}
model_quanti_quali_final <- glm.nb(
  velos ~
    vent +
      vent_square + #
      humidite +
      humidite_square +
      humidite_sqrt + #
      temperature1 +
      temperature1_square +

      horaire +
      saison +
      meteo +
      mois +
      vacances
  , data = data_train)
summary(model_quanti_quali_final)
```

```{r}
model_0 <- glm.nb(velos ~ 1, data = data_train)
model_forward <- stepAIC(
  model_0,
  vélos ~ vent +
    humidite +
    humidite_square +
    temperature1 +
    temperature1_square +

    horaire +
    saison +
    meteo +
    mois +
    vacances,
  data = data_train,
  trace = FALSE,
  direction = "forward"
)
model_backward <- stepAIC(
  model_quanti_quali_final,
  ~1,
  trace = FALSE,
  direction = "backward"
)
model_both <- stepAIC(
  model_0,
  vélos ~ vent +
    humidite +
    humidite_square +
    temperature1 +
    temperature1_square +

    horaire +
    saison +
    meteo +
    mois +
    vacances,
  data = data_train,
  trace = FALSE,
  direction = "both"
)
AIC(model_forward, model_both, model_backward)
summary(model_forward)
```

```{r}
model_final_without_interaction <- glm.nb(
  velos ~ horaire +
    mois +
    meteo +
    temperature1 +
    saison +
    temperature1_square +
    humidite_square +
    vacances +
    humidite +
    vent
  , data = data_train)
summary(model_final_without_interaction)
```

### Final Model

#### Choice and Validation of the Model

```{r}
model_final <- glm.nb(
  velos ~ horaire +
    mois +
    meteo +
    temperature1 +
    saison +
    temperature1_square +
    humidite_square +
    vacances +
    humidite +
    vent +

    horaire:temperature1 +
    temperature1_square:mois
  , data = data_train
)
summary(model_final)
```

```{r}
anova(model_final, test = "Chisq")
```

```{r}
lrtest(model_final, model_final_without_interaction)
```

```{r}
dispersion_ratio <- sum(residuals(model_final, type = "pearson")^2) / df.residual(model_final)
print(paste("Dispersion Ratio:", round(dispersion_ratio, 2)))

hist(residuals(model_final, type = "deviance"), breaks = 30, freq = FALSE, col = "lightblue", main = "Histogram of Deviance Residuals")
x <- seq(-5, 5, length = 100)
curve(dnorm(x), col = "darkblue", lwd = 2, add = TRUE)

autoplot(model_final, 1:4)
```

```{r}
library(arm)
binnedplot(fitted(model_final), residuals(model_final, type = "deviance"), col.int = "blue", col.pts = 2)
```

## Performance and Limitations of the Final Model

### Predictions and *Mean Square Error*

```{r}
data_test$predictions <- predict(model_final, newdata = data_test, type = "response")
data_train$predictions <- predict(model_final, newdata = data_train, type = "response")

predictions_ci <- predict(
  model_final,
  newdata = data_test,
  type = "link",
  se.fit = TRUE
)

data_test$lwr <- exp(predictions_ci$fit - 1.96 * predictions_ci$se.fit)
data_test$upr <- exp(predictions_ci$fit + 1.96 * predictions_ci$se.fit)

MSE <- mean((data_test$velos - data_test$predictions)^2)
cat("MSE :", MSE, "\n")
cat("RMSE train :", sqrt(mean((data_train$velos - data_train$predictions)^2)), "\n")
cat('RMSE :', sqrt(MSE), '\n')
```

### Evaluation of Model Performance

```{r}
bounds <- c(200, 650)

cat("Observations vs Prédictions\n")
cat(nrow(data_test[data_test$velos < bounds[1],]), "vs", sum(data_test$predictions < bounds[1]), "\n")
cat(nrow(data_test[data_test$velos > bounds[1] & data_test$velos < bounds[2],]), "vs", sum(data_test$predictions > bounds[1] & data_test$predictions < bounds[2]), "\n")
cat(nrow(data_test[data_test$velos > bounds[2],]), "vs", sum(data_test$predictions > bounds[2]), "\n")
cat('\n')

categories <- c(0, bounds, Inf)
categories_label <- c("Low", "Mid", "High")

data_test$velos_cat <- cut(data_test$velos,
                           breaks = categories,
                           labels = categories_label,
                           include.lowest = TRUE)
data_test$predictions_cat <- cut(data_test$predictions,
                                 breaks = categories,
                                 labels = categories_label,
                                 include.lowest = TRUE)
conf_matrix <- confusionMatrix(data_test$velos_cat, data_test$predictions_cat)
conf_matrix
```

```{r}
ggplot(data_test, aes(x = velos, y = predictions)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, fill = "grey") +
  labs(
    title = "Comparison of Observed and Predicted Bikes",
    x = "Number of Observed Bikes",
    y = "Number of Predicted Bikes"
  ) +
  theme_minimal()

ggplot(data_test, aes(x = velos_cat, fill = predictions_cat)) +
  geom_bar(position = "dodge") +
  labs(title = "Predictions vs Observations (by categories)",
       x = "Observed categories", y = "Number of predictions")

df_plot <- data.frame(observed = data_test$velos, predicted = data_test$predictions)

# Créer l'histogramme avec la courbe de densité
ggplot(df_plot, aes(x = observed)) +
  geom_histogram(aes(y = ..density..), binwidth = 50, fill = "lightblue", color = "black", alpha = 0.7) + # Histogramme des données observées
  geom_density(aes(x = predicted), color = "red", size = 1) + # Courbe de densité des valeurs prédites
  labs(title = "Observed distribution vs. Predicted distribution",
       x = "Number of vélos",
       y = "Density") +
  theme_bw()

```