From 8f5f2b417c503aa8b1afc044e56fa6612b696c2a Mon Sep 17 00:00:00 2001 From: Arthur DANJOU Date: Thu, 6 Nov 2025 09:13:40 +0100 Subject: [PATCH] Implement code changes to enhance functionality and improve performance --- .../tp1/3-td_ggplot2 - enonce.Rmd | 91 ++++++----- .../tp1/3-td_ggplot2---enonce.html | 145 +++++++++--------- 2 files changed, 113 insertions(+), 123 deletions(-) diff --git a/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd b/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd index 37eef63..46d4688 100644 --- a/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd +++ b/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd @@ -32,7 +32,7 @@ knitr::opts_chunk$set( fig.height = 6, fig.width = 12 ) -options(encoding = 'UTF-8') +options(encoding = "UTF-8") ``` ```{r, echo = FALSE, fig.keep= 'none'} @@ -94,7 +94,7 @@ réalisées sur des manchots sur 3 îles de l'archipelle Palmer. Dans un premier temps, il faut installer le package et le charger. ```{r} -# install.packages("palmerpenguins") +# install.packages("palmerpenguins") #nolint library(palmerpenguins) ``` @@ -297,8 +297,8 @@ On présente ci-dessous un aperçu des données. fold <- getwd() # Load data -load(paste0(fold, "/data/datafreMPTL.RData")) -# load(paste0(fold, "/M2/Data Visualisation/tp1", "/data/datafreMPTL.RData")) +# load(paste0(fold, "/M2/Data Visualisation/tp1", "/data/datafreMPTL.RData")) # VSCode # nolint +load(paste0(fold, "/data/datafreMPTL.RData")) # RStudio paged_table(dat, options = list(rows.print = 15)) ``` @@ -470,7 +470,7 @@ package **ggplot2**. # On regroupe selon les modalites de la DrivAge # l'exposition, le nombre de sinistres et la frequence df_plot <- dat |> - group_by(DrivAge) |> + group_by(DrivAge) |> summarize( exp = sum(Exposure), nb_claims = sum(ClaimNb), @@ -478,23 +478,23 @@ df_plot <- dat |> ) # Histogramme exposition -p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) + +p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) + geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) + labs(x = "Age du conducteur", y = "Exposition en années") + theme_bw() - # Histogramme frequence - p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) + - geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) + - labs(x = "Age du conducteur", y = "Frequence") + - theme_bw() - -plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12) +# Histogramme frequence +p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) + + geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) + + labs(x = "Age du conducteur", y = "Frequence") + + theme_bw() + +plot_grid(p1, p2, labels = c("A", "B"), label_size = 12) ``` ```{r} df_plot <- dat |> - group_by(DrivAge) |> + group_by(DrivAge) |> summarize( exp = sum(Exposure), nb_claims = sum(ClaimNb), @@ -503,9 +503,9 @@ df_plot <- dat |> # Scatter plot frequence p3 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) + - geom_point() + + geom_point() + geom_smooth() + - labs(x = "Age du conducteur", y = "Frequence") + + labs(x = "Age du conducteur", y = "Frequence") + theme_bw() p3 ``` @@ -522,9 +522,9 @@ améliorations en modifiant les variables `DrivAge` et `BonusMalus`. ```{r} # On regroupe selon les modalites de la DrivAge et de Area -# l'exposition, le nombre de sinistres et la frequence +# l'exposition, le nombre de sinistres et la frequence df_plot <- dat |> - group_by(DrivAge, BonusMalus) |> + group_by(DrivAge, BonusMalus) |> summarize( exp = sum(Exposure), nb_claims = sum(ClaimNb), @@ -532,7 +532,7 @@ df_plot <- dat |> ) p4 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) + - geom_point() + + geom_point() + geom_smooth() + labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") + theme_bw() @@ -556,12 +556,14 @@ df_plot <- dat |> filter(DrivAge <= 85, BonusMalus <= 125) |> # regroupement en classes d'ages de 5 ans mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) |> - mutate(BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE)) - + mutate( + BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE) + ) + # On regroupe selon les modalites de la DrivAge et de Area -# l'exposition, le nombre de sinistres et la frequence +# l'exposition, le nombre de sinistres et la frequence df_plot <- df_plot |> - group_by(DrivAge, BonusMalus) |> + group_by(DrivAge, BonusMalus) |> summarize( exp = sum(Exposure), nb_claims = sum(ClaimNb), @@ -570,7 +572,7 @@ df_plot <- df_plot |> # Scatter plot frequence p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) + - geom_point() + + geom_point() + geom_smooth() + labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") + theme_bw() @@ -624,35 +626,28 @@ couples. # df : nom du data.frame # var1 : nom de la variable explicative 1 # var2 : nom de la variable explicative 2 -plot_pairwise_disc <- function(df, var1, var2) -{ - df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2)) - -# replace variable vname by the binning variable - if(is.numeric(df$varx)) - { +plot_pairwise_disc <- function(df, var1, var2) { + df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2)) + + # replace variable vname by the binning variable + if (is.numeric(df$varx)) { df <- df |> mutate(varx = ntile(varx, 5)) } - - if(is.numeric(df$vary)) - { + + if (is.numeric(df$vary)) { df <- df |> - mutate(vary = ntile(vary, 5), - vary = factor(vary)) + mutate(vary = ntile(vary, 5), vary = factor(vary)) } - - df |> - group_by(varx, vary) |> - summarize(exp = sum(Exposure), - nb_claims = sum(ClaimNb), - freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |> - ggplot(aes(x = varx, - y = freq, - colour = vary, - group = vary), alpha = 0.3) + - geom_point() + geom_line() + theme_bw() + - labs(x = var1, y = "Frequence", colour = var2) + + df |> + group_by(varx, vary) |> + summarize(exp = sum(Exposure), + nb_claims = sum(ClaimNb), + freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |> + ggplot(aes(x = varx, y = freq, colour = vary, group = vary), alpha = 0.3) + + geom_point() + geom_line() + theme_bw() + + labs(x = var1, y = "Frequence", colour = var2) } ``` diff --git a/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html b/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html index ac02922..ab69faf 100644 --- a/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html +++ b/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html @@ -4945,17 +4945,17 @@ package ggplot2.

Construire un tableau présentant l’exposition cumulée et le nombre d’observations avec 0 sinistre, 1 sinistre, …

-
dat %>%
-  group_by(ClaimNb) %>%
-  summarise(n = n(), Exposure = round(sum(Exposure), 0)) %>%
+
dat |>
+  group_by(ClaimNb) |>
+  summarise(n = n(), Exposure = round(sum(Exposure), 0)) |>
   kable(
     col_names = c(
       "Nombre de sinistres",
       "Nombres d'observations",
       "Exposition totale"
     )
-  ) %>%
-  kable_styling(full_width = F)
+ ) |> + kable_styling(full_width = FALSE)
@@ -5040,8 +5040,8 @@ package ggplot2.

# On regroupe selon les modalites de la DrivAge
 # l'exposition, le nombre de sinistres et la frequence
-df_plot <- dat %>%
-  group_by(DrivAge) %>% 
+df_plot <- dat |>
+  group_by(DrivAge) |>
   summarize(
     exp = sum(Exposure),
     nb_claims = sum(ClaimNb),
@@ -5049,20 +5049,20 @@ package ggplot2.

) # Histogramme exposition -p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) + +p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) + geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) + labs(x = "Age du conducteur", y = "Exposition en années") + theme_bw() - # Histogramme frequence - p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) + - geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) + - labs(x = "Age du conducteur", y = "Frequence") + - theme_bw() - -plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)
-
df_plot <- dat %>%
-  group_by(DrivAge) %>% 
+# Histogramme frequence
+p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
+  geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
+  labs(x = "Age du conducteur", y = "Frequence") +
+  theme_bw()
+
+plot_grid(p1, p2, labels = c("A", "B"), label_size = 12)
+
df_plot <- dat |>
+  group_by(DrivAge) |>
   summarize(
     exp = sum(Exposure),
     nb_claims = sum(ClaimNb),
@@ -5071,9 +5071,9 @@ package ggplot2.

# Scatter plot frequence p3 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) + - geom_point() + + geom_point() + geom_smooth() + - labs(x = "Age du conducteur", y = "Frequence") + + labs(x = "Age du conducteur", y = "Frequence") + theme_bw() p3

@@ -5087,9 +5087,9 @@ package ggplot2.

améliorations en modifiant les variables DrivAge et BonusMalus.

# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence 
-df_plot <- dat %>%
-  group_by(DrivAge, BonusMalus) %>% 
+# l'exposition, le nombre de sinistres et la frequence
+df_plot <- dat |>
+  group_by(DrivAge, BonusMalus) |>
   summarize(
     exp = sum(Exposure),
     nb_claims = sum(ClaimNb),
@@ -5097,7 +5097,7 @@ améliorations en modifiant les variables DrivAge et BonusMal
   )
 
 p4 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
-  geom_point() + 
+  geom_point() +
   geom_smooth() +
   labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
   theme_bw()
@@ -5115,29 +5115,31 @@ l’exposition est très faible.
 lim_classes <- c(50, 75, 100, 125, Inf)
 
 # Exclusion des donnees "extremes" et faire les regroupement
-df_plot <- dat %>%
-  filter(DrivAge <= 85, BonusMalus <= 125) %>%
+df_plot <- dat |>
+  filter(DrivAge <= 85, BonusMalus <= 125) |>
   # regroupement en classes d'ages de 5 ans
-  mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) %>%
-  mutate(BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE))
-  
-# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence 
-df_plot <- df_plot %>%
-  group_by(DrivAge, BonusMalus) %>% 
-  summarize(
-    exp = sum(Exposure),
-    nb_claims = sum(ClaimNb),
-    freq = sum(ClaimNb) / sum(Exposure)
-  )
-
-# Scatter plot frequence
-p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
-  geom_point() + 
-  geom_smooth() +
-  labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
-  theme_bw()
-p5
+ mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) |> + mutate( + BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE) + ) + +# On regroupe selon les modalites de la DrivAge et de Area +# l'exposition, le nombre de sinistres et la frequence +df_plot <- df_plot |> + group_by(DrivAge, BonusMalus) |> + summarize( + exp = sum(Exposure), + nb_claims = sum(ClaimNb), + freq = sum(ClaimNb) / sum(Exposure) + ) + +# Scatter plot frequence +p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) + + geom_point() + + geom_smooth() + + labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") + + theme_bw() +p5

@@ -5182,36 +5184,29 @@ couples.

# df : nom du data.frame # var1 : nom de la variable explicative 1 # var2 : nom de la variable explicative 2 -plot_pairwise_disc <- function(df, var1, var2) -{ - df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2)) - -# replace variable vname by the binning variable - if(is.numeric(df$varx)) - { - df <- df %>% - mutate(varx = ntile(varx, 5)) - } - - if(is.numeric(df$vary)) - { - df <- df %>% - mutate(vary = ntile(vary, 5), - vary = factor(vary)) - } - - df %>% - group_by(varx, vary) %>% - summarize(exp = sum(Exposure), - nb_claims = sum(ClaimNb), - freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") %>% - ggplot(aes(x = varx, - y = freq, - colour = vary, - group = vary), alpha = 0.3) + - geom_point() + geom_line() + theme_bw() + - labs(x = var1, y = "Frequence", colour = var2) -}
+plot_pairwise_disc <- function(df, var1, var2) { + df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2)) + + # replace variable vname by the binning variable + if (is.numeric(df$varx)) { + df <- df |> + mutate(varx = ntile(varx, 5)) + } + + if (is.numeric(df$vary)) { + df <- df |> + mutate(vary = ntile(vary, 5), vary = factor(vary)) + } + + df |> + group_by(varx, vary) |> + summarize(exp = sum(Exposure), + nb_claims = sum(ClaimNb), + freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |> + ggplot(aes(x = varx, y = freq, colour = vary, group = vary), alpha = 0.3) + + geom_point() + geom_line() + theme_bw() + + labs(x = var1, y = "Frequence", colour = var2) +}
p1 <- plot_pairwise_disc(dat, "DrivAge", "BonusMalus")
 p2 <- plot_pairwise_disc(dat, "VehAge", "BonusMalus")
 p3 <- plot_pairwise_disc(dat, "BonusMalus", "VehBrand")