From 8f5f2b417c503aa8b1afc044e56fa6612b696c2a Mon Sep 17 00:00:00 2001
From: Arthur DANJOU
Date: Thu, 6 Nov 2025 09:13:40 +0100
Subject: [PATCH] Apply lint and formatting cleanups to the ggplot2 TD and
 re-render its HTML (no functional change)
---
.../tp1/3-td_ggplot2 - enonce.Rmd | 91 ++++++-----
.../tp1/3-td_ggplot2---enonce.html | 145 +++++++++---------
2 files changed, 113 insertions(+), 123 deletions(-)
diff --git a/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd b/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd
index 37eef63..46d4688 100644
--- a/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd
+++ b/M2/Data Visualisation/tp1/3-td_ggplot2 - enonce.Rmd
@@ -32,7 +32,7 @@ knitr::opts_chunk$set(
fig.height = 6,
fig.width = 12
)
-options(encoding = 'UTF-8')
+options(encoding = "UTF-8")
```
```{r, echo = FALSE, fig.keep= 'none'}
@@ -94,7 +94,7 @@ réalisées sur des manchots sur 3 îles de l'archipelle Palmer.
Dans un premier temps, il faut installer le package et le charger.
```{r}
-# install.packages("palmerpenguins")
+# install.packages("palmerpenguins") #nolint
library(palmerpenguins)
```
@@ -297,8 +297,8 @@ On présente ci-dessous un aperçu des données.
fold <- getwd()
# Load data
-load(paste0(fold, "/data/datafreMPTL.RData"))
-# load(paste0(fold, "/M2/Data Visualisation/tp1", "/data/datafreMPTL.RData"))
+# load(paste0(fold, "/M2/Data Visualisation/tp1", "/data/datafreMPTL.RData")) # VSCode # nolint
+load(paste0(fold, "/data/datafreMPTL.RData")) # RStudio
paged_table(dat, options = list(rows.print = 15))
```
@@ -470,7 +470,7 @@ package **ggplot2**.
# On regroupe selon les modalites de la DrivAge
# l'exposition, le nombre de sinistres et la frequence
df_plot <- dat |>
- group_by(DrivAge) |>
+ group_by(DrivAge) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -478,23 +478,23 @@ df_plot <- dat |>
)
# Histogramme exposition
-p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) +
+p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) +
geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
labs(x = "Age du conducteur", y = "Exposition en années") +
theme_bw()
- # Histogramme frequence
- p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
- geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
- labs(x = "Age du conducteur", y = "Frequence") +
- theme_bw()
-
-plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)
+# Histogramme frequence
+p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
+ geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
+ labs(x = "Age du conducteur", y = "Frequence") +
+ theme_bw()
+
+plot_grid(p1, p2, labels = c("A", "B"), label_size = 12)
```
```{r}
df_plot <- dat |>
- group_by(DrivAge) |>
+ group_by(DrivAge) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -503,9 +503,9 @@ df_plot <- dat |>
# Scatter plot frequence
p3 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
- geom_point() +
+ geom_point() +
geom_smooth() +
- labs(x = "Age du conducteur", y = "Frequence") +
+ labs(x = "Age du conducteur", y = "Frequence") +
theme_bw()
p3
```
@@ -522,9 +522,9 @@ améliorations en modifiant les variables `DrivAge` et `BonusMalus`.
```{r}
# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence
+# l'exposition, le nombre de sinistres et la frequence
df_plot <- dat |>
- group_by(DrivAge, BonusMalus) |>
+ group_by(DrivAge, BonusMalus) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -532,7 +532,7 @@ df_plot <- dat |>
)
p4 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
- geom_point() +
+ geom_point() +
geom_smooth() +
labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
theme_bw()
@@ -556,12 +556,14 @@ df_plot <- dat |>
filter(DrivAge <= 85, BonusMalus <= 125) |>
# regroupement en classes d'ages de 5 ans
mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) |>
- mutate(BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE))
-
+ mutate(
+ BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE)
+ )
+
# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence
+# l'exposition, le nombre de sinistres et la frequence
df_plot <- df_plot |>
- group_by(DrivAge, BonusMalus) |>
+ group_by(DrivAge, BonusMalus) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -570,7 +572,7 @@ df_plot <- df_plot |>
# Scatter plot frequence
p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
- geom_point() +
+ geom_point() +
geom_smooth() +
labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
theme_bw()
@@ -624,35 +626,28 @@ couples.
# df : nom du data.frame
# var1 : nom de la variable explicative 1
# var2 : nom de la variable explicative 2
-plot_pairwise_disc <- function(df, var1, var2)
-{
- df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2))
-
-# replace variable vname by the binning variable
- if(is.numeric(df$varx))
- {
+plot_pairwise_disc <- function(df, var1, var2) {
+ df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2))
+
+ # replace variable vname by the binning variable
+ if (is.numeric(df$varx)) {
df <- df |>
mutate(varx = ntile(varx, 5))
}
-
- if(is.numeric(df$vary))
- {
+
+ if (is.numeric(df$vary)) {
df <- df |>
- mutate(vary = ntile(vary, 5),
- vary = factor(vary))
+ mutate(vary = ntile(vary, 5), vary = factor(vary))
}
-
- df |>
- group_by(varx, vary) |>
- summarize(exp = sum(Exposure),
- nb_claims = sum(ClaimNb),
- freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |>
- ggplot(aes(x = varx,
- y = freq,
- colour = vary,
- group = vary), alpha = 0.3) +
- geom_point() + geom_line() + theme_bw() +
- labs(x = var1, y = "Frequence", colour = var2)
+
+ df |>
+ group_by(varx, vary) |>
+ summarize(exp = sum(Exposure),
+ nb_claims = sum(ClaimNb),
+ freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |>
+ ggplot(aes(x = varx, y = freq, colour = vary, group = vary), alpha = 0.3) +
+ geom_point() + geom_line() + theme_bw() +
+ labs(x = var1, y = "Frequence", colour = var2)
}
```
diff --git a/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html b/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html
index ac02922..ab69faf 100644
--- a/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html
+++ b/M2/Data Visualisation/tp1/3-td_ggplot2---enonce.html
@@ -4945,17 +4945,17 @@ package ggplot2.
Construire un tableau présentant l’exposition cumulée et le nombre
d’observations avec 0 sinistre, 1 sinistre, …
-dat %>%
- group_by(ClaimNb) %>%
- summarise(n = n(), Exposure = round(sum(Exposure), 0)) %>%
+dat |>
+ group_by(ClaimNb) |>
+ summarise(n = n(), Exposure = round(sum(Exposure), 0)) |>
kable(
col_names = c(
"Nombre de sinistres",
"Nombres d'observations",
"Exposition totale"
)
- ) %>%
- kable_styling(full_width = F)
+ ) |>
+ kable_styling(full_width = FALSE)
@@ -5040,8 +5040,8 @@ package ggplot2.
# On regroupe selon les modalites de la DrivAge
# l'exposition, le nombre de sinistres et la frequence
-df_plot <- dat %>%
- group_by(DrivAge) %>%
+df_plot <- dat |>
+ group_by(DrivAge) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -5049,20 +5049,20 @@ package ggplot2.
)
# Histogramme exposition
-p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) +
+p1 <- ggplot(df_plot, aes(x = DrivAge, y = exp)) +
geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
labs(x = "Age du conducteur", y = "Exposition en années") +
theme_bw()
- # Histogramme frequence
- p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
- geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
- labs(x = "Age du conducteur", y = "Frequence") +
- theme_bw()
-
-plot_grid(p1, p2, labels = c('A', 'B'), label_size = 12)
-df_plot <- dat %>%
- group_by(DrivAge) %>%
+# Histogramme frequence
+p2 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
+ geom_bar(stat = "identity", fill = "grey", color = "black", alpha = 0.5) +
+ labs(x = "Age du conducteur", y = "Frequence") +
+ theme_bw()
+
+plot_grid(p1, p2, labels = c("A", "B"), label_size = 12)
+df_plot <- dat |>
+ group_by(DrivAge) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -5071,9 +5071,9 @@ package ggplot2.
# Scatter plot frequence
p3 <- ggplot(df_plot, aes(x = DrivAge, y = freq)) +
- geom_point() +
+ geom_point() +
geom_smooth() +
- labs(x = "Age du conducteur", y = "Frequence") +
+ labs(x = "Age du conducteur", y = "Frequence") +
theme_bw()
p3

@@ -5087,9 +5087,9 @@ package ggplot2.
améliorations en modifiant les variables DrivAge et BonusMalus.
# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence
-df_plot <- dat %>%
- group_by(DrivAge, BonusMalus) %>%
+# l'exposition, le nombre de sinistres et la frequence
+df_plot <- dat |>
+ group_by(DrivAge, BonusMalus) |>
summarize(
exp = sum(Exposure),
nb_claims = sum(ClaimNb),
@@ -5097,7 +5097,7 @@ améliorations en modifiant les variables DrivAge et BonusMal
)
p4 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
- geom_point() +
+ geom_point() +
geom_smooth() +
labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
theme_bw()
@@ -5115,29 +5115,31 @@ l’exposition est très faible.
lim_classes <- c(50, 75, 100, 125, Inf)
# Exclusion des donnees "extremes" et faire les regroupement
-df_plot <- dat %>%
- filter(DrivAge <= 85, BonusMalus <= 125) %>%
+df_plot <- dat |>
+ filter(DrivAge <= 85, BonusMalus <= 125) |>
# regroupement en classes d'ages de 5 ans
- mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) %>%
- mutate(BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE))
-
-# On regroupe selon les modalites de la DrivAge et de Area
-# l'exposition, le nombre de sinistres et la frequence
-df_plot <- df_plot %>%
- group_by(DrivAge, BonusMalus) %>%
- summarize(
- exp = sum(Exposure),
- nb_claims = sum(ClaimNb),
- freq = sum(ClaimNb) / sum(Exposure)
- )
-
-# Scatter plot frequence
-p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
- geom_point() +
- geom_smooth() +
- labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
- theme_bw()
-p5
+ mutate(DrivAge = ceiling(pmin(DrivAge, 85) / 5) * 5) |>
+ mutate(
+ BonusMalus = cut(BonusMalus, breaks = lim_classes, include.lowest = TRUE)
+ )
+
+# On regroupe selon les modalites de la DrivAge et de Area
+# l'exposition, le nombre de sinistres et la frequence
+df_plot <- df_plot |>
+ group_by(DrivAge, BonusMalus) |>
+ summarize(
+ exp = sum(Exposure),
+ nb_claims = sum(ClaimNb),
+ freq = sum(ClaimNb) / sum(Exposure)
+ )
+
+# Scatter plot frequence
+p5 <- ggplot(df_plot, aes(x = DrivAge, y = freq, color = BonusMalus)) +
+ geom_point() +
+ geom_smooth() +
+ labs(x = "Age du conducteur", y = "Frequence", color = "Bonus-Malus") +
+ theme_bw()
+p5

@@ -5182,36 +5184,29 @@ couples.
# df : nom du data.frame
# var1 : nom de la variable explicative 1
# var2 : nom de la variable explicative 2
-
plot_pairwise_disc <- function(df, var1, var2)
-
{
-
df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2))
-
-
# replace variable vname by the binning variable
-
if(is.numeric(df$varx))
-
{
-
df <- df %>%
-
mutate(varx = ntile(varx, 5))
-
}
-
-
if(is.numeric(df$vary))
-
{
-
df <- df %>%
-
mutate(vary = ntile(vary, 5),
-
vary = factor(vary))
-
}
-
-
df %>%
-
group_by(varx, vary) %>%
-
summarize(exp = sum(Exposure),
-
nb_claims = sum(ClaimNb),
-
freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") %>%
-
ggplot(aes(x = varx,
-
y = freq,
-
colour = vary,
-
group = vary), alpha = 0.3) +
-
geom_point() + geom_line() + theme_bw() +
-
labs(x = var1, y = "Frequence", colour = var2)
-
}
+plot_pairwise_disc <- function(df, var1, var2) {
+ df <- rename(df, "varx" = all_of(var1), "vary" = all_of(var2))
+
+ # replace variable vname by the binning variable
+ if (is.numeric(df$varx)) {
+ df <- df |>
+ mutate(varx = ntile(varx, 5))
+ }
+
+ if (is.numeric(df$vary)) {
+ df <- df |>
+ mutate(vary = ntile(vary, 5), vary = factor(vary))
+ }
+
+ df |>
+ group_by(varx, vary) |>
+ summarize(exp = sum(Exposure),
+ nb_claims = sum(ClaimNb),
+ freq = sum(ClaimNb) / sum(Exposure), .groups = "drop") |>
+ ggplot(aes(x = varx, y = freq, colour = vary, group = vary), alpha = 0.3) +
+ geom_point() + geom_line() + theme_bw() +
+ labs(x = var1, y = "Frequence", colour = var2)
+}
p1 <- plot_pairwise_disc(dat, "DrivAge", "BonusMalus")
p2 <- plot_pairwise_disc(dat, "VehAge", "BonusMalus")
p3 <- plot_pairwise_disc(dat, "BonusMalus", "VehBrand")