Data Analysis

setwd('/Users/arthurdanjou/Workspace/studies/M1/General Linear Models/Projet')
library(MASS)
library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
library(rmarkdown)
library(car)
library(corrplot)
## corrplot 0.95 loaded
library(carData)
library(ggfortify)
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster

Data Preprocessing

data <- read.csv("./projet.csv", header = TRUE, sep = ",", dec = ".")

# Factors
data$saison <- as.factor(data$saison)
data$mois <- as.factor(data$mois)
data$jour_mois <- as.factor(data$jour_mois)
data$jour_semaine <- as.factor(data$jour_semaine)
data$horaire <- as.factor(data$horaire)
data$jour_travail <- as.factor(data$jour_travail)
data$vacances <- as.factor(data$vacances)
data$meteo <- as.factor(data$meteo)

# Quantitative variables
data$humidite_sqrt <- sqrt(data$humidite)
data$humidite_square <- data$humidite^2

data$temperature1_square <- data$temperature1^2
data$temperature2_square <- data$temperature2^2

data$vent_square <- data$vent^2
data$vent_sqrt <- sqrt(data$vent)

# Remove obs column
rownames(data) <- data$obs
data <- data[, -1]
paged_table(data)
colSums(is.na(data))
##              saison                mois           jour_mois        jour_semaine 
##                   0                   0                   0                   0 
##             horaire        jour_travail            vacances               meteo 
##                   0                   0                   0                   0 
##               velos            humidite                vent        temperature1 
##                   0                   0                   0                   0 
##        temperature2       humidite_sqrt     humidite_square temperature1_square 
##                   0                   0                   0                   0 
## temperature2_square         vent_square           vent_sqrt 
##                   0                   0                   0
sum(duplicated(data))
## [1] 0
str(data)
## 'data.frame':    1817 obs. of  19 variables:
##  $ saison             : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mois               : Factor w/ 12 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ jour_mois          : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 2 2 2 2 2 ...
##  $ jour_semaine       : Factor w/ 7 levels "1","2","3","4",..: 7 7 7 7 7 1 1 1 1 1 ...
##  $ horaire            : Factor w/ 5 levels "1","2","3","4",..: 1 2 3 4 5 1 2 3 4 5 ...
##  $ jour_travail       : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ vacances           : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
##  $ meteo              : Factor w/ 3 levels "1","2","3": 1 1 1 2 2 2 2 2 3 1 ...
##  $ velos              : int  105 61 340 305 174 54 82 297 268 100 ...
##  $ humidite           : num  78 78.2 75.5 82.2 88.8 ...
##  $ vent               : num  1.38 6.84 30.98 30.58 26.39 ...
##  $ temperature1       : num  2.88 5.39 11.98 12.21 11.55 ...
##  $ temperature2       : num  2.29 5.25 11.5 12.5 11.8 ...
##  $ humidite_sqrt      : num  8.83 8.85 8.69 9.07 9.42 ...
##  $ humidite_square    : num  6084 6123 5700 6765 7885 ...
##  $ temperature1_square: num  8.29 29.05 143.52 149.08 133.4 ...
##  $ temperature2_square: num  5.24 27.56 132.25 156.25 139.24 ...
##  $ vent_square        : num  1.9 46.8 959.8 935.1 696.4 ...
##  $ vent_sqrt          : num  1.17 2.62 5.57 5.53 5.14 ...
summary(data)
##  saison       mois       jour_mois    jour_semaine horaire jour_travail
##  1:444   3      :155   1      :  60   1:259        1:362   1: 573      
##  2:460   5      :155   2      :  60   2:260        2:363   2:1244      
##  3:468   7      :155   3      :  60   3:258        3:364               
##  4:445   10     :155   4      :  60   4:259        4:365               
##          12     :155   5      :  60   5:257        5:363               
##          8      :153   6      :  60   6:260                            
##          (Other):889   (Other):1457   7:264                            
##  vacances meteo        velos           humidite           vent      
##  1:1767   1:1192   Min.   :   7.0   Min.   :  0.00   Min.   : 0.00  
##  2:  50   2: 464   1st Qu.: 275.0   1st Qu.: 48.40   1st Qu.:12.47  
##           3: 161   Median : 619.0   Median : 62.75   Median :20.00  
##                    Mean   : 684.2   Mean   : 62.94   Mean   :21.23  
##                    3rd Qu.:1041.0   3rd Qu.: 78.00   3rd Qu.:28.28  
##                    Max.   :2036.0   Max.   :100.00   Max.   :68.66  
##                                                                     
##   temperature1    temperature2    humidite_sqrt    humidite_square
##  Min.   :-6.59   Min.   :-14.50   Min.   : 0.000   Min.   :    0  
##  1st Qu.: 7.51   1st Qu.:  5.25   1st Qu.: 6.957   1st Qu.: 2343  
##  Median :15.31   Median : 15.80   Median : 7.921   Median : 3938  
##  Mean   :15.24   Mean   : 15.25   Mean   : 7.828   Mean   : 4323  
##  3rd Qu.:23.15   3rd Qu.: 25.00   3rd Qu.: 8.832   3rd Qu.: 6084  
##  Max.   :36.65   Max.   : 47.25   Max.   :10.000   Max.   :10000  
##                                                                   
##  temperature1_square temperature2_square  vent_square       vent_sqrt    
##  Min.   :   0.0001   Min.   :   0.00     Min.   :   0.0   Min.   :0.000  
##  1st Qu.:  56.4001   1st Qu.:  39.06     1st Qu.: 155.5   1st Qu.:3.531  
##  Median : 234.3961   Median : 249.64     Median : 400.0   Median :4.472  
##  Mean   : 319.6898   Mean   : 369.70     Mean   : 585.5   Mean   :4.400  
##  3rd Qu.: 535.9225   3rd Qu.: 625.00     3rd Qu.: 799.8   3rd Qu.:5.318  
##  Max.   :1343.2225   Max.   :2232.56     Max.   :4714.2   Max.   :8.286  
## 

Study of the Quantitative Variables

Distribution of Variables

plot_velos <- ggplot(data, aes(x = velos)) +
  geom_histogram(bins = 25, fill = "blue") +
  labs(title = "Distribution du nombre de vélos loués", x = "Nombre de locations de vélos")

plot_temperature1 <- ggplot(data, aes(x = temperature1)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la température1", x = "Température moyenne mesuré (°C)")

plot_temperature2 <- ggplot(data, aes(x = temperature2)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la température2", x = "Température moyenne ressentie (°C)")

plot_humidite <- ggplot(data, aes(x = humidite)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la humidité", x = "Pourcentage d'humidité")

plot_vent <- ggplot(data, aes(x = vent)) +
  geom_histogram(bins = 30, fill = "green") +
  labs(title = "Distribution de la vent", x = "Vitesse du vent (Km/h)")

grid.arrange(plot_velos, plot_temperature1, plot_temperature2, plot_humidite, plot_vent, ncol = 3)

Correlation between Quantitatives Variables

#detach(package:arm) # If error, uncomment this line due to duplicate function 'corrplot'
corr_matrix <- cor(data[, sapply(data, is.numeric)])
corrplot(corr_matrix, type = "lower", tl.col = "black", tl.srt = 45)

pairs(data[, sapply(data, is.numeric)])

cor.test(data$temperature1, data$temperature2)
## 
##  Pearson's product-moment correlation
## 
## data:  data$temperature1 and data$temperature2
## t = 384.61, df = 1815, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9933370 0.9944541
## sample estimates:
##      cor 
## 0.993921

Study of the Qualitative Variables

plot_saison <- ggplot(data, aes(x = saison, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par saison", x = "Saison", y = "Nombre de vélos loués")

plot_jour_semaine <- ggplot(data, aes(x = jour_semaine, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour de la semaine", x = "Jour de la semaine", y = "Nombre de vélos loués")

plot_mois <- ggplot(data, aes(x = mois, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par mois de l'année", x = "Mois de l'année", y = "Nombre de vélos loués")

plot_jour_mois <- ggplot(data, aes(x = jour_mois, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour du mois", x = "Jour du mois", y = "Nombre de vélos loués")

plot_horaire <- ggplot(data, aes(x = horaire, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par horaire de la journée", x = "Horaire de la journée", y = "Nombre de vélos loués")

plot_jour_travail <- ggplot(data, aes(x = jour_travail, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par jour travaillé", x = "Jour travaillé", y = "Nombre de vélos loués")

plot_vacances <- ggplot(data, aes(x = vacances, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par vacances", x = "Vacances", y = "Nombre de vélos loués")

plot_meteo <- ggplot(data, aes(x = meteo, y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Nombre de vélos par météo", x = "Météo", y = "Nombre de vélos loués")

grid.arrange(plot_horaire, plot_jour_semaine, plot_jour_mois, plot_mois, plot_saison, plot_jour_travail, plot_vacances, plot_meteo, ncol = 3)

chisq.test(data$mois, data$saison)
## 
##  Pearson's Chi-squared test
## 
## data:  data$mois and data$saison
## X-squared = 4380.8, df = 33, p-value < 2.2e-16
chisq.test(data$meteo, data$saison)
## 
##  Pearson's Chi-squared test
## 
## data:  data$meteo and data$saison
## X-squared = 20.78, df = 6, p-value = 0.002009
chisq.test(data$meteo, data$mois)
## 
##  Pearson's Chi-squared test
## 
## data:  data$meteo and data$mois
## X-squared = 91.211, df = 22, p-value = 2.13e-10

Outliers Detection

boxplot_velos <- ggplot(data, aes(x = "", y = velos)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vélos", y = "Nombre de vélos")

boxplot_temperature1 <- ggplot(data, aes(x = "", y = temperature1)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température1", y = "Température moyenne mesuré (°C)")

boxplot_temperature1_sq <- ggplot(data, aes(x = "", y = temperature1_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température1^2", y = "Température moyenne mesuré (°C^2)")

boxplot_temperature2 <- ggplot(data, aes(x = "", y = temperature2)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température2", y = "Température moyenne ressentie (°C)")

boxplot_temperature2_sq <- ggplot(data, aes(x = "", y = temperature2_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de température2^2", y = "Température moyenne ressentie (°C^2)")

boxplot_humidite <- ggplot(data, aes(x = "", y = humidite)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de humidité", y = "Pourcentage d'humidité")

boxplot_humidite_sqrt <- ggplot(data, aes(x = "", y = humidite_sqrt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de sqrt(humidité)", y = "Pourcentage d'humidité (sqrt(%))")

boxplot_humidite_square <- ggplot(data, aes(x = "", y = humidite_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de humidité^2", y = "Pourcentage d'humidité (%^2)")

boxplot_vent <- ggplot(data, aes(x = "", y = vent)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vent", y = "Vitesse du vent (Km/h)")

boxplot_vent_sqrt <- ggplot(data, aes(x = "", y = vent_sqrt)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de sqrt(vent)", y = "Vitesse du vent sqrt(Km/h)")

boxplot_vent_square <- ggplot(data, aes(x = "", y = vent_square)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Boxplot de vent^2", y = "Vitesse du vent (Km/h)^2")

grid.arrange(boxplot_velos, boxplot_temperature1, boxplot_temperature1_sq, boxplot_temperature2, boxplot_temperature2_sq, boxplot_humidite, boxplot_humidite_sqrt, boxplot_humidite_square, boxplot_vent, boxplot_vent_sqrt, boxplot_vent_square, ncol = 4)

length_data <- nrow(data)
numeric_vars <- sapply(data, is.numeric)
total_outliers <- 0

for (var in names(data)[numeric_vars]) {
  data_outlier <- data[[var]]
  Q1 <- quantile(data_outlier, 0.25)
  Q3 <- quantile(data_outlier, 0.75)
  IQR <- Q3 - Q1

  lower_limit <- Q1 - 1.5 * IQR
  upper_limit <- Q3 + 1.5 * IQR

  outliers <- data[data_outlier < lower_limit | data_outlier > upper_limit,]
  cat("Number of outliers for the variable", var, ":", nrow(outliers), "\n")
  data <- data[data_outlier >= lower_limit & data_outlier <= upper_limit,]
}
## Number of outliers for the variable velos : 0 
## Number of outliers for the variable humidite : 5 
## Number of outliers for the variable vent : 24 
## Number of outliers for the variable temperature1 : 0 
## Number of outliers for the variable temperature2 : 0 
## Number of outliers for the variable humidite_sqrt : 1 
## Number of outliers for the variable humidite_square : 0 
## Number of outliers for the variable temperature1_square : 4 
## Number of outliers for the variable temperature2_square : 11 
## Number of outliers for the variable vent_square : 80 
## Number of outliers for the variable vent_sqrt : 27
cat("Number of outliers removed :", length_data - nrow(data), "\n")
## Number of outliers removed : 152
cat("Data length after removing outliers :", nrow(data), "\n")
## Data length after removing outliers : 1665

Model Creation and Comparison

Data Split

set.seed(123)
data_split <- rsample::initial_split(data, prop = 0.8)
data_train <- rsample::training(data_split)
data_test <- rsample::testing(data_split)

Choice of the Distribution vélos

model_poisson <- glm(velos ~ ., family = poisson, data = data_train)
model_nb <- glm.nb(velos ~ ., data = data_train)
model_gaussian <- glm(velos ~ ., family = gaussian, data = data_train)
t(AIC(model_poisson, model_nb, model_gaussian)[2])
##     model_poisson model_nb model_gaussian
## AIC      82988.75  17532.5       18420.81
dispersiontest(model_poisson)
## 
##  Overdispersion test
## 
## data:  model_poisson
## z = 20.793, p-value < 2.2e-16
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   54.85188
mean_velos <- mean(data_train$velos)
var_velos <- var(data_train$velos)
cat("Mean :", mean_velos, "Variance :", var_velos, "\n")
## Mean : 693.4317 Variance : 214261.1

Model Selection

model_full_quantitative <- glm.nb(
  velos ~ vent +
    vent_square +
    vent_sqrt +

    humidite +
    humidite_square +
    humidite_sqrt +

    temperature1_square +
    temperature1 +

    temperature2 +
    temperature2_square
  , data = data_train)
summary(model_full_quantitative)
## 
## Call:
## glm.nb(formula = velos ~ vent + vent_square + vent_sqrt + humidite + 
##     humidite_square + humidite_sqrt + temperature1_square + temperature1 + 
##     temperature2 + temperature2_square, data = data_train, init.theta = 2.91504998, 
##     link = log)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          8.2125765  2.2378964   3.670 0.000243 ***
## vent                 0.0281498  0.0499554   0.563 0.573096    
## vent_square         -0.0002027  0.0004951  -0.409 0.682254    
## vent_sqrt           -0.1673324  0.2546295  -0.657 0.511078    
## humidite             0.1061905  0.0816427   1.301 0.193370    
## humidite_square     -0.0004965  0.0002360  -2.104 0.035388 *  
## humidite_sqrt       -0.8882801  0.8081472  -1.099 0.271700    
## temperature1_square -0.0010461  0.0009065  -1.154 0.248518    
## temperature1         0.0702146  0.0379386   1.851 0.064206 .  
## temperature2         0.0260605  0.0252172   1.033 0.301397    
## temperature2_square -0.0006738  0.0005534  -1.218 0.223397    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(2.915) family taken to be 1)
## 
##     Null deviance: 2358.2  on 1331  degrees of freedom
## Residual deviance: 1408.9  on 1321  degrees of freedom
## AIC: 19155
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  2.915 
##           Std. Err.:  0.108 
## 
##  2 x log-likelihood:  -19131.246
anova(model_full_quantitative)
## Warning in anova.negbin(model_full_quantitative): tests made without
## re-estimating 'theta'
## Analysis of Deviance Table
## 
## Model: Negative Binomial(2.915), link: log
## 
## Response: velos
## 
## Terms added sequentially (first to last)
## 
## 
##                     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                 1331     2358.2              
## vent                 1     8.84      1330     2349.3  0.002946 ** 
## vent_square          1     4.04      1329     2345.3  0.044442 *  
## vent_sqrt            1     0.33      1328     2345.0  0.568601    
## humidite             1   183.86      1327     2161.1 < 2.2e-16 ***
## humidite_square      1    48.09      1326     2113.0 4.067e-12 ***
## humidite_sqrt        1     5.51      1325     2107.5  0.018912 *  
## temperature1_square  1   431.26      1324     1676.2 < 2.2e-16 ***
## temperature1         1   265.92      1323     1410.3 < 2.2e-16 ***
## temperature2         1     0.02      1322     1410.3  0.898079    
## temperature2_square  1     1.44      1321     1408.9  0.229603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_quantitative_quali <- glm.nb(
  velos ~
    vent +
      vent_square +
      vent_sqrt +
      humidite +
      humidite_square +
      humidite_sqrt +
      temperature1 +
      temperature1_square +

      horaire +
      saison +
      meteo +
      mois +
      vacances +
      jour_travail +
      jour_semaine +
      jour_mois
  , data = data_train)
summary(model_quantitative_quali)
## 
## Call:
## glm.nb(formula = velos ~ vent + vent_square + vent_sqrt + humidite + 
##     humidite_square + humidite_sqrt + temperature1 + temperature1_square + 
##     horaire + saison + meteo + mois + vacances + jour_travail + 
##     jour_semaine + jour_mois, data = data_train, init.theta = 10.11407919, 
##     link = log)
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          5.7578159  1.2735469   4.521 6.15e-06 ***
## vent                 0.0065391  0.0277120   0.236 0.813457    
## vent_square          0.0000256  0.0002747   0.093 0.925750    
## vent_sqrt           -0.0872814  0.1411176  -0.619 0.536245    
## humidite             0.0718573  0.0465415   1.544 0.122603    
## humidite_square     -0.0003074  0.0001352  -2.273 0.023026 *  
## humidite_sqrt       -0.5879792  0.4597808  -1.279 0.200959    
## temperature1         0.0538688  0.0050283  10.713  < 2e-16 ***
## temperature1_square -0.0009887  0.0001523  -6.490 8.59e-11 ***
## horaire2             1.4014996  0.0282339  49.639  < 2e-16 ***
## horaire3             1.2441366  0.0342452  36.330  < 2e-16 ***
## horaire4             1.6669509  0.0340096  49.014  < 2e-16 ***
## horaire5             1.3009922  0.0289632  44.919  < 2e-16 ***
## saison2              0.2113832  0.0569855   3.709 0.000208 ***
## saison3              0.3238081  0.0647808   4.999 5.78e-07 ***
## saison4              0.5120136  0.0557309   9.187  < 2e-16 ***
## meteo2              -0.0388243  0.0227839  -1.704 0.088376 .  
## meteo3              -0.4183265  0.0423535  -9.877  < 2e-16 ***
## mois2                0.1673029  0.0489754   3.416 0.000635 ***
## mois3                0.1690126  0.0550596   3.070 0.002143 ** 
## mois4                0.2656971  0.0844978   3.144 0.001664 ** 
## mois5                0.5093488  0.0894762   5.693 1.25e-08 ***
## mois6                0.4535267  0.0938529   4.832 1.35e-06 ***
## mois7                0.3494709  0.1036055   3.373 0.000743 ***
## mois8                0.3217628  0.1007131   3.195 0.001399 ** 
## mois9                0.3638295  0.0911294   3.992 6.54e-05 ***
## mois10               0.2067528  0.0832300   2.484 0.012987 *  
## mois11               0.1452113  0.0794983   1.827 0.067760 .  
## mois12               0.1929111  0.0622809   3.097 0.001952 ** 
## vacances2           -0.1803626  0.0653453  -2.760 0.005778 ** 
## jour_travail2        0.0234146  0.0328769   0.712 0.476347    
## jour_semaine2       -0.0435844  0.0340130  -1.281 0.200052    
## jour_semaine3       -0.0365816  0.0336789  -1.086 0.277397    
## jour_semaine4       -0.0390571  0.0331463  -1.178 0.238668    
## jour_semaine5       -0.0133017  0.0336982  -0.395 0.693043    
## jour_semaine6               NA         NA      NA       NA    
## jour_semaine7        0.0302082  0.0329428   0.917 0.359148    
## jour_mois2           0.0413982  0.0679635   0.609 0.542442    
## jour_mois3           0.0756509  0.0680085   1.112 0.265978    
## jour_mois4           0.1520220  0.0659987   2.303 0.021256 *  
## jour_mois5           0.1174765  0.0679582   1.729 0.083870 .  
## jour_mois6           0.0194055  0.0678443   0.286 0.774855    
## jour_mois7           0.0515925  0.0695970   0.741 0.458510    
## jour_mois8           0.0088355  0.0679031   0.130 0.896472    
## jour_mois9           0.0763066  0.0669394   1.140 0.254314    
## jour_mois10          0.1225251  0.0661751   1.852 0.064094 .  
## jour_mois11          0.1082326  0.0713126   1.518 0.129085    
## jour_mois12          0.0648527  0.0675506   0.960 0.337024    
## jour_mois13          0.0099991  0.0692432   0.144 0.885181    
## jour_mois14          0.0894572  0.0685495   1.305 0.191892    
## jour_mois15          0.0621761  0.0702483   0.885 0.376108    
## jour_mois16          0.0553908  0.0653521   0.848 0.396674    
## jour_mois17          0.1266634  0.0669505   1.892 0.058505 .  
## jour_mois18          0.0767661  0.0652818   1.176 0.239628    
## jour_mois19          0.0961955  0.0675449   1.424 0.154397    
## jour_mois20          0.1152297  0.0705546   1.633 0.102427    
## jour_mois21          0.1368864  0.0705549   1.940 0.052362 .  
## jour_mois22          0.0723416  0.0677257   1.068 0.285450    
## jour_mois23          0.0672894  0.0678375   0.992 0.321236    
## jour_mois24         -0.0371423  0.0678872  -0.547 0.584298    
## jour_mois25          0.0099320  0.0686310   0.145 0.884935    
## jour_mois26         -0.0009005  0.0681434  -0.013 0.989456    
## jour_mois27         -0.0409743  0.0713894  -0.574 0.565999    
## jour_mois28          0.0344889  0.0671148   0.514 0.607336    
## jour_mois29          0.0385154  0.0673973   0.571 0.567683    
## jour_mois30          0.1637766  0.0670492   2.443 0.014580 *  
## jour_mois31          0.1191179  0.0777575   1.532 0.125544    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(10.1141) family taken to be 1)
## 
##     Null deviance: 8029.0  on 1331  degrees of freedom
## Residual deviance: 1356.7  on 1266  degrees of freedom
## AIC: 17531
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  10.114 
##           Std. Err.:  0.398 
## 
##  2 x log-likelihood:  -17396.536
anova(model_quantitative_quali)
## Warning in anova.negbin(model_quantitative_quali): tests made without
## re-estimating 'theta'
## Analysis of Deviance Table
## 
## Model: Negative Binomial(10.1141), link: log
## 
## Response: velos
## 
## Terms added sequentially (first to last)
## 
## 
##                     Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                 1331     8029.0              
## vent                 1    30.35      1330     7998.6 3.609e-08 ***
## vent_square          1    13.88      1329     7984.8 0.0001951 ***
## vent_sqrt            1     1.12      1328     7983.6 0.2905323    
## humidite             1   630.37      1327     7353.3 < 2.2e-16 ***
## humidite_square      1   164.51      1326     7188.8 < 2.2e-16 ***
## humidite_sqrt        1    18.83      1325     7169.9 1.426e-05 ***
## temperature1         1  2047.02      1324     5122.9 < 2.2e-16 ***
## temperature1_square  1   340.16      1323     4782.7 < 2.2e-16 ***
## horaire              4  2909.21      1319     1873.5 < 2.2e-16 ***
## saison               3   285.86      1316     1587.7 < 2.2e-16 ***
## meteo                2   114.28      1314     1473.4 < 2.2e-16 ***
## mois                11    66.77      1303     1406.6 4.995e-10 ***
## vacances             1    10.42      1302     1396.2 0.0012449 ** 
## jour_travail         1     1.30      1301     1394.9 0.2541936    
## jour_semaine         5     4.18      1296     1390.7 0.5231791    
## jour_mois           30    34.01      1266     1356.7 0.2803690    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model_quanti_quali_final <- glm.nb(
  velos ~
    vent +
      vent_square + #
      humidite +
      humidite_square +
      humidite_sqrt + #
      temperature1 +
      temperature1_square +

      horaire +
      saison +
      meteo +
      mois +
      vacances
  , data = data_train)
summary(model_quanti_quali_final)
## 
## Call:
## glm.nb(formula = velos ~ vent + vent_square + humidite + humidite_square + 
##     humidite_sqrt + temperature1 + temperature1_square + horaire + 
##     saison + meteo + mois + vacances, data = data_train, init.theta = 9.822854355, 
##     link = log)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          5.373e+00  1.248e+00   4.304 1.67e-05 ***
## vent                -9.504e-03  4.036e-03  -2.355 0.018529 *  
## vent_square          1.630e-04  9.481e-05   1.719 0.085660 .  
## humidite             5.968e-02  4.615e-02   1.293 0.195913    
## humidite_square     -2.771e-04  1.341e-04  -2.067 0.038757 *  
## humidite_sqrt       -4.645e-01  4.559e-01  -1.019 0.308254    
## temperature1         5.358e-02  4.882e-03  10.976  < 2e-16 ***
## temperature1_square -1.033e-03  1.469e-04  -7.034 2.00e-12 ***
## horaire2             1.398e+00  2.847e-02  49.083  < 2e-16 ***
## horaire3             1.248e+00  3.442e-02  36.244  < 2e-16 ***
## horaire4             1.668e+00  3.414e-02  48.867  < 2e-16 ***
## horaire5             1.298e+00  2.919e-02  44.454  < 2e-16 ***
## saison2              2.114e-01  5.714e-02   3.700 0.000216 ***
## saison3              3.146e-01  6.485e-02   4.851 1.23e-06 ***
## saison4              4.924e-01  5.589e-02   8.810  < 2e-16 ***
## meteo2              -3.463e-02  2.274e-02  -1.523 0.127822    
## meteo3              -4.184e-01  4.210e-02  -9.938  < 2e-16 ***
## mois2                1.657e-01  4.924e-02   3.366 0.000762 ***
## mois3                1.718e-01  5.498e-02   3.125 0.001777 ** 
## mois4                2.794e-01  8.413e-02   3.321 0.000898 ***
## mois5                5.298e-01  8.927e-02   5.934 2.95e-09 ***
## mois6                4.858e-01  9.293e-02   5.227 1.72e-07 ***
## mois7                3.945e-01  1.026e-01   3.846 0.000120 ***
## mois8                3.642e-01  9.993e-02   3.644 0.000268 ***
## mois9                4.037e-01  9.013e-02   4.479 7.49e-06 ***
## mois10               2.384e-01  8.283e-02   2.878 0.004005 ** 
## mois11               1.710e-01  7.934e-02   2.155 0.031136 *  
## mois12               2.175e-01  6.227e-02   3.493 0.000478 ***
## vacances2           -1.746e-01  5.379e-02  -3.246 0.001170 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.8229) family taken to be 1)
## 
##     Null deviance: 7803.6  on 1331  degrees of freedom
## Residual deviance: 1357.7  on 1303  degrees of freedom
## AIC: 17496
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.823 
##           Std. Err.:  0.386 
## 
##  2 x log-likelihood:  -17436.000
model_0 <- glm.nb(velos ~ 1, data = data_train)
model_forward <- stepAIC(
  model_0,
  vélos ~ vent +
    humidite +
    humidite_square +
    temperature1 +
    temperature1_square +

    horaire +
    saison +
    meteo +
    mois +
    vacances,
  data = data_train,
  trace = FALSE,
  direction = "forward"
)
model_backward <- stepAIC(
  model_quanti_quali_final,
  ~1,
  trace = FALSE,
  direction = "backward"
)
model_both <- stepAIC(
  model_0,
  vélos ~ vent +
    humidite +
    humidite_square +
    temperature1 +
    temperature1_square +

    horaire +
    saison +
    meteo +
    mois +
    vacances,
  data = data_train,
  trace = FALSE,
  direction = "both"
)
AIC(model_forward, model_both, model_backward)
##                df      AIC
## model_forward  28 17496.08
## model_both     28 17496.08
## model_backward 29 17495.00
summary(model_forward)
## 
## Call:
## glm.nb(formula = velos ~ horaire + mois + meteo + temperature1 + 
##     saison + humidite_square + temperature1_square + humidite + 
##     vacances + vent, data = data_train, init.theta = 9.793225092, 
##     link = log)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          4.073e+00  1.134e-01  35.913  < 2e-16 ***
## horaire2             1.397e+00  2.851e-02  49.005  < 2e-16 ***
## horaire3             1.247e+00  3.442e-02  36.232  < 2e-16 ***
## horaire4             1.668e+00  3.419e-02  48.794  < 2e-16 ***
## horaire5             1.296e+00  2.921e-02  44.351  < 2e-16 ***
## mois2                1.675e-01  4.927e-02   3.399 0.000676 ***
## mois3                1.740e-01  5.504e-02   3.161 0.001573 ** 
## mois4                2.850e-01  8.407e-02   3.390 0.000700 ***
## mois5                5.331e-01  8.929e-02   5.971 2.36e-09 ***
## mois6                4.899e-01  9.302e-02   5.266 1.39e-07 ***
## mois7                4.001e-01  1.027e-01   3.896 9.77e-05 ***
## mois8                3.725e-01  9.997e-02   3.726 0.000195 ***
## mois9                4.072e-01  9.024e-02   4.512 6.42e-06 ***
## mois10               2.420e-01  8.290e-02   2.919 0.003510 ** 
## mois11               1.733e-01  7.945e-02   2.181 0.029176 *  
## mois12               2.213e-01  6.234e-02   3.550 0.000385 ***
## meteo2              -3.797e-02  2.272e-02  -1.672 0.094590 .  
## meteo3              -4.297e-01  4.158e-02 -10.334  < 2e-16 ***
## temperature1         5.379e-02  4.887e-03  11.007  < 2e-16 ***
## saison2              2.070e-01  5.679e-02   3.644 0.000268 ***
## saison3              3.105e-01  6.468e-02   4.801 1.58e-06 ***
## saison4              4.910e-01  5.597e-02   8.773  < 2e-16 ***
## humidite_square     -1.372e-04  2.661e-05  -5.157 2.51e-07 ***
## temperature1_square -1.045e-03  1.468e-04  -7.113 1.13e-12 ***
## humidite             1.206e-02  3.387e-03   3.560 0.000371 ***
## vacances2           -1.718e-01  5.383e-02  -3.192 0.001414 ** 
## vent                -2.818e-03  1.028e-03  -2.741 0.006118 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.7932) family taken to be 1)
## 
##     Null deviance: 7780.7  on 1331  degrees of freedom
## Residual deviance: 1357.8  on 1305  degrees of freedom
## AIC: 17496
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.793 
##           Std. Err.:  0.385 
## 
##  2 x log-likelihood:  -17440.082
model_final_without_interaction <- glm.nb(
  velos ~ horaire +
    mois +
    meteo +
    temperature1 +
    saison +
    temperature1_square +
    humidite_square +
    vacances +
    humidite +
    vent
  , data = data_train)
summary(model_final_without_interaction)
## 
## Call:
## glm.nb(formula = velos ~ horaire + mois + meteo + temperature1 + 
##     saison + temperature1_square + humidite_square + vacances + 
##     humidite + vent, data = data_train, init.theta = 9.793225092, 
##     link = log)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          4.073e+00  1.134e-01  35.913  < 2e-16 ***
## horaire2             1.397e+00  2.851e-02  49.005  < 2e-16 ***
## horaire3             1.247e+00  3.442e-02  36.232  < 2e-16 ***
## horaire4             1.668e+00  3.419e-02  48.794  < 2e-16 ***
## horaire5             1.296e+00  2.921e-02  44.351  < 2e-16 ***
## mois2                1.675e-01  4.927e-02   3.399 0.000676 ***
## mois3                1.740e-01  5.504e-02   3.161 0.001573 ** 
## mois4                2.850e-01  8.407e-02   3.390 0.000700 ***
## mois5                5.331e-01  8.929e-02   5.971 2.36e-09 ***
## mois6                4.899e-01  9.302e-02   5.266 1.39e-07 ***
## mois7                4.001e-01  1.027e-01   3.896 9.77e-05 ***
## mois8                3.725e-01  9.997e-02   3.726 0.000195 ***
## mois9                4.072e-01  9.024e-02   4.512 6.42e-06 ***
## mois10               2.420e-01  8.290e-02   2.919 0.003510 ** 
## mois11               1.733e-01  7.945e-02   2.181 0.029176 *  
## mois12               2.213e-01  6.234e-02   3.550 0.000385 ***
## meteo2              -3.797e-02  2.272e-02  -1.672 0.094590 .  
## meteo3              -4.297e-01  4.158e-02 -10.334  < 2e-16 ***
## temperature1         5.379e-02  4.887e-03  11.007  < 2e-16 ***
## saison2              2.070e-01  5.679e-02   3.644 0.000268 ***
## saison3              3.105e-01  6.468e-02   4.801 1.58e-06 ***
## saison4              4.910e-01  5.597e-02   8.773  < 2e-16 ***
## temperature1_square -1.045e-03  1.468e-04  -7.113 1.13e-12 ***
## humidite_square     -1.372e-04  2.661e-05  -5.157 2.51e-07 ***
## vacances2           -1.718e-01  5.383e-02  -3.192 0.001414 ** 
## humidite             1.206e-02  3.387e-03   3.560 0.000371 ***
## vent                -2.818e-03  1.028e-03  -2.741 0.006118 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(9.7932) family taken to be 1)
## 
##     Null deviance: 7780.7  on 1331  degrees of freedom
## Residual deviance: 1357.8  on 1305  degrees of freedom
## AIC: 17496
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  9.793 
##           Std. Err.:  0.385 
## 
##  2 x log-likelihood:  -17440.082

Final Model

Choice and Validation of the Model

model_final <- glm.nb(
  velos ~ horaire +
    mois +
    meteo +
    temperature1 +
    saison +
    temperature1_square +
    humidite_square +
    vacances +
    humidite +
    vent +

    horaire:temperature1 +
    temperature1_square:mois
  , data = data_train
)
summary(model_final)
## 
## Call:
## glm.nb(formula = velos ~ horaire + mois + meteo + temperature1 + 
##     saison + temperature1_square + humidite_square + vacances + 
##     humidite + vent + horaire:temperature1 + temperature1_square:mois, 
##     data = data_train, init.theta = 10.64651881, link = log)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 4.1688082  0.1195673  34.866  < 2e-16 ***
## horaire2                    1.6136959  0.0494090  32.660  < 2e-16 ***
## horaire3                    1.3248624  0.0622321  21.289  < 2e-16 ***
## horaire4                    1.6124217  0.0611669  26.361  < 2e-16 ***
## horaire5                    1.1846038  0.0526976  22.479  < 2e-16 ***
## mois2                       0.1166036  0.0554402   2.103 0.035446 *  
## mois3                       0.0947222  0.0664671   1.425 0.154128    
## mois4                       0.0604694  0.0986704   0.613 0.539980    
## mois5                       0.5272220  0.1174380   4.489 7.14e-06 ***
## mois6                       0.6533283  0.1542792   4.235 2.29e-05 ***
## mois7                       0.7377272  0.1852322   3.983 6.81e-05 ***
## mois8                       0.4846330  0.1807033   2.682 0.007320 ** 
## mois9                       0.4051305  0.1386855   2.921 0.003487 ** 
## mois10                      0.1218501  0.1020733   1.194 0.232575    
## mois11                      0.2258393  0.0942154   2.397 0.016528 *  
## mois12                      0.1426387  0.0728088   1.959 0.050103 .  
## meteo2                     -0.0120673  0.0220598  -0.547 0.584360    
## meteo3                     -0.4136117  0.0404725 -10.220  < 2e-16 ***
## temperature1                0.0509424  0.0074086   6.876 6.15e-12 ***
## saison2                     0.2240149  0.0552593   4.054 5.04e-05 ***
## saison3                     0.3385659  0.0625326   5.414 6.16e-08 ***
## saison4                     0.5050795  0.0538878   9.373  < 2e-16 ***
## temperature1_square        -0.0036795  0.0012369  -2.975 0.002931 ** 
## humidite_square            -0.0001265  0.0000264  -4.792 1.65e-06 ***
## vacances2                  -0.1925533  0.0524129  -3.674 0.000239 ***
## humidite                    0.0101473  0.0033687   3.012 0.002593 ** 
## vent                       -0.0033359  0.0010029  -3.326 0.000880 ***
## horaire2:temperature1      -0.0157180  0.0031003  -5.070 3.98e-07 ***
## horaire3:temperature1      -0.0047919  0.0035916  -1.334 0.182146    
## horaire4:temperature1       0.0028109  0.0035586   0.790 0.429588    
## horaire5:temperature1       0.0067708  0.0031848   2.126 0.033508 *  
## mois2:temperature1_square   0.0027785  0.0011641   2.387 0.016998 *  
## mois3:temperature1_square   0.0032223  0.0011660   2.764 0.005716 ** 
## mois4:temperature1_square   0.0036563  0.0011573   3.159 0.001581 ** 
## mois5:temperature1_square   0.0027471  0.0011715   2.345 0.019033 *  
## mois6:temperature1_square   0.0024540  0.0011820   2.076 0.037883 *  
## mois7:temperature1_square   0.0022636  0.0011880   1.905 0.056727 .  
## mois8:temperature1_square   0.0025486  0.0011881   2.145 0.031948 *  
## mois9:temperature1_square   0.0027241  0.0011786   2.311 0.020818 *  
## mois10:temperature1_square  0.0032170  0.0011638   2.764 0.005706 ** 
## mois11:temperature1_square  0.0022141  0.0011684   1.895 0.058106 .  
## mois12:temperature1_square  0.0034507  0.0012066   2.860 0.004237 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(10.6465) family taken to be 1)
## 
##     Null deviance: 8440.1  on 1331  degrees of freedom
## Residual deviance: 1357.4  on 1290  degrees of freedom
## AIC: 17416
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  10.647 
##           Std. Err.:  0.420 
## 
##  2 x log-likelihood:  -17329.890
anova(model_final, test = "Chisq")
## Warning in anova.negbin(model_final, test = "Chisq"): tests made without
## re-estimating 'theta'
## Analysis of Deviance Table
## 
## Model: Negative Binomial(10.6465), link: log
## 
## Response: velos
## 
## Terms added sequentially (first to last)
## 
## 
##                          Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                                      1331     8440.1              
## horaire                   4   4252.9      1327     4187.3 < 2.2e-16 ***
## mois                     11   2126.6      1316     2060.7 < 2.2e-16 ***
## meteo                     2    306.4      1314     1754.2 < 2.2e-16 ***
## temperature1              1     78.8      1313     1675.4 < 2.2e-16 ***
## saison                    3     72.3      1310     1603.2 1.385e-15 ***
## temperature1_square       1     40.2      1309     1563.0 2.294e-10 ***
## humidite_square           1     60.0      1308     1503.0 9.541e-15 ***
## vacances                  1      9.2      1307     1493.8 0.0024119 ** 
## humidite                  1     13.4      1306     1480.4 0.0002541 ***
## vent                      1      8.1      1305     1472.3 0.0043736 ** 
## horaire:temperature1      4     71.0      1301     1401.2 1.371e-14 ***
## mois:temperature1_square 11     43.8      1290     1357.4 7.908e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(model_final, model_final_without_interaction)
## Likelihood ratio test
## 
## Model 1: velos ~ horaire + mois + meteo + temperature1 + saison + temperature1_square + 
##     humidite_square + vacances + humidite + vent + horaire:temperature1 + 
##     temperature1_square:mois
## Model 2: velos ~ horaire + mois + meteo + temperature1 + saison + temperature1_square + 
##     humidite_square + vacances + humidite + vent
##   #Df  LogLik  Df  Chisq Pr(>Chisq)    
## 1  43 -8664.9                          
## 2  28 -8720.0 -15 110.19  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dispersion_ratio <- sum(residuals(model_final, type = "pearson")^2) / df.residual(model_final)
print(paste("Dispersion Ratio:", round(dispersion_ratio, 2)))
## [1] "Dispersion Ratio: 0.97"
hist(residuals(model_final, type = "deviance"), breaks = 30, freq = FALSE, col = "lightblue", main = "Histogram of Deviance Residuals")
x <- seq(-5, 5, length = 100)
curve(dnorm(x), col = "darkblue", lwd = 2, add = TRUE)

autoplot(model_final, 1:4)

library(arm)
## Loading required package: Matrix
## Loading required package: lme4
## 
## arm (Version 1.14-4, built: 2024-4-1)
## Working directory is /Users/arthurdanjou/Workspace/studies/M1/General Linear Models/Projet
## 
## Attaching package: 'arm'
## The following object is masked from 'package:corrplot':
## 
##     corrplot
## The following object is masked from 'package:car':
## 
##     logit
binnedplot(fitted(model_final), residuals(model_final, type = "deviance"), col.int = "blue", col.pts = 2)

Performance and Limitations of the Final Model

Predictions and Mean Square Error

data_test$predictions <- predict(model_final, newdata = data_test, type = "response")
data_train$predictions <- predict(model_final, newdata = data_train, type = "response")

predictions_ci <- predict(
  model_final,
  newdata = data_test,
  type = "link",
  se.fit = TRUE
)

data_test$lwr <- exp(predictions_ci$fit - 1.96 * predictions_ci$se.fit)
data_test$upr <- exp(predictions_ci$fit + 1.96 * predictions_ci$se.fit)

MSE <- mean((data_test$velos - data_test$predictions)^2)
cat("MSE :", MSE, "\n")
## MSE : 43331
cat("RMSE train :", sqrt(mean((data_train$velos - data_train$predictions)^2)), "\n")
## RMSE train : 195.0359
cat('RMSE :', sqrt(MSE), '\n')
## RMSE : 208.161

Evaluation of Model Performance

bounds <- c(200, 650)

cat("Observations vs Prédictions\n")
## Observations vs Prédictions
cat(nrow(data_test[data_test$velos < bounds[1],]), "vs", sum(data_test$predictions < bounds[1]), "\n")
## 50 vs 39
cat(nrow(data_test[data_test$velos > bounds[1] & data_test$velos < bounds[2],]), "vs", sum(data_test$predictions > bounds[1] & data_test$predictions < bounds[2]), "\n")
## 124 vs 128
cat(nrow(data_test[data_test$velos > bounds[2],]), "vs", sum(data_test$predictions > bounds[2]), "\n")
## 159 vs 166
cat('\n')
categories <- c(0, bounds, Inf)
categories_label <- c("Low", "Mid", "High")

data_test$velos_cat <- cut(data_test$velos,
                           breaks = categories,
                           labels = categories_label,
                           include.lowest = TRUE)
data_test$predictions_cat <- cut(data_test$predictions,
                                 breaks = categories,
                                 labels = categories_label,
                                 include.lowest = TRUE)
conf_matrix <- confusionMatrix(data_test$velos_cat, data_test$predictions_cat)
conf_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Mid High
##       Low   36  14    0
##       Mid    3  95   26
##       High   0  19  140
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8138          
##                  95% CI : (0.7678, 0.8542)
##     No Information Rate : 0.4985          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6903          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Low Class: Mid Class: High
## Sensitivity              0.9231     0.7422      0.8434
## Specificity              0.9524     0.8585      0.8862
## Pos Pred Value           0.7200     0.7661      0.8805
## Neg Pred Value           0.9894     0.8421      0.8506
## Prevalence               0.1171     0.3844      0.4985
## Detection Rate           0.1081     0.2853      0.4204
## Detection Prevalence     0.1502     0.3724      0.4775
## Balanced Accuracy        0.9377     0.8004      0.8648
ggplot(data_test, aes(x = velos, y = predictions)) +
  geom_point(color = "blue", alpha = 0.6, size = 2) +
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +
  geom_ribbon(aes(ymin = lwr, ymax = upr), alpha = 0.2, fill = "grey") +
  labs(
    title = "Comparison of Observed and Predicted Bikes",
    x = "Number of Observed Bikes",
    y = "Number of Predicted Bikes"
  ) +
  theme_minimal()

ggplot(data_test, aes(x = velos_cat, fill = predictions_cat)) +
  geom_bar(position = "dodge") +
  labs(title = "Predictions vs Observations (by categories)",
       x = "Observed categories", y = "Number of predictions")

df_plot <- data.frame(observed = data_test$velos, predicted = data_test$predictions)

# Créer l'histogramme avec la courbe de densité
ggplot(df_plot, aes(x = observed)) +
  geom_histogram(aes(y = ..density..), binwidth = 50, fill = "lightblue", color = "black", alpha = 0.7) + # Histogramme des données observées
  geom_density(aes(x = predicted), color = "red", size = 1) + # Courbe de densité des valeurs prédites
  labs(title = "Observed distribution vs. Predicted distribution",
       x = "Number of vélos",
       y = "Density") +
  theme_bw()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.