diff --git a/M2/Clustering In Practice/Encoding.Rmd b/M2/Clustering In Practice/Encoding.Rmd index b2bc6d4..bc82518 100644 --- a/M2/Clustering In Practice/Encoding.Rmd +++ b/M2/Clustering In Practice/Encoding.Rmd @@ -7,7 +7,7 @@ library(dplyr) ```{r} df <- data.frame( - team = c('A', 'A', 'B', 'B', 'B', 'B', 'C', 'C'), + team = c("A", "A", "B", "B", "B", "B", "C", "C"), points = c(25, 12, 15, 14, 19, 23, 25, 29) ) @@ -22,24 +22,24 @@ one_hot_data ```{r} train <- data.frame( target = c(10, 20, 15), - cat_col1 = c('city1', 'city2', 'city1'), - cat_col2 = c('james', 'adam', 'charles') + cat_col1 = c("city1", "city2", "city1"), + cat_col2 = c("james", "adam", "charles") ) global_mean <- mean(train$target) alpha <- 10 -target_encoding <- train %>% - group_by(cat_col1) %>% +target_encoding <- train |> + group_by(cat_col1) |> summarise( n = n(), sum_target = sum(target), cat_col1_te = (sum_target + (alpha * global_mean)) / (n + alpha), .groups = "drop" - ) %>% + ) |> select(cat_col1, cat_col1_te) -train <- train %>% left_join(target_encoding, by = "cat_col1") +train <- train |> left_join(target_encoding, by = "cat_col1") ``` # Frequential Encoding @@ -47,7 +47,7 @@ train <- train %>% left_join(target_encoding, by = "cat_col1") ```{r} df <- data.frame( - color = c('blue', 'red', 'blue', 'green'), + color = c("blue", "red", "blue", "green"), value = c(10, 20, 10, 30) ) ``` \ No newline at end of file diff --git a/M2/Clustering In Practice/compression_image.R b/M2/Clustering In Practice/compression_image.R index 2bfa38f..9e59947 100644 --- a/M2/Clustering In Practice/compression_image.R +++ b/M2/Clustering In Practice/compression_image.R @@ -1,7 +1,8 @@ # Objectifs pédagogiques # Comprendre la représentation matricielle d'une image. # Interpréter les centroïdes comme une palette de couleurs optimale (résumé). -# Analyser le compromis entre distorsion (perte de qualité) et taux de compression. + +setwd("~/Workspace/studies/M2/Clustering In Practice") library(jpeg) @@ -25,7 +26,6 @@ head(img_matrix) k <- 8 # Application de K-means -# On augmente iter.max car la convergence sur des milliers de pixels peut être lente set.seed(123) km_model <- kmeans(img_matrix, centers = k, iter.max = 20, nstart = 3) @@ -42,16 +42,16 @@ img_compressed <- array(img_compressed_matrix, dim = dims) # Affichage comparatif par(mfrow = c(1, 2), mar = c(1, 1, 1, 1)) -plot(0, 0, type='n', axes=FALSE, ann=FALSE) +plot(0, 0, type = "n", axes = FALSE, ann = FALSE) rasterImage(img, -1, -1, 1, 1) title("Originale (Millions de couleurs)") -plot(0, 0, type='n', axes=FALSE, ann=FALSE) +plot(0, 0, type = "n", axes = FALSE, ann = FALSE) rasterImage(img_compressed, -1, -1, 1, 1) title(paste("Compressée (k =", k, ")")) # 4. Questions : coût de l'information (Distorsion) -# Calculez l'erreur quadratique moyenne (MSE) entre l'image originale et +# Calculez l'erreur quadratique moyenne (MSE) entre l'image originale et # l'image compressée : # Plus $k$ est petit, plus le résumé est ..., plus le MSE ..... @@ -64,7 +64,7 @@ mse_imager <- function(img1, img2) { # Ici, on redimensionne img2 sur la taille d'img1 img2 <- imresize(img2, size_x = width(img1), size_y = height(img1)) if (spectrum(img2) != spectrum(img1)) { - img2 <- grayscale(img2) # fallback simple si nb de canaux diffère + img2 <- grayscale(img2) # fallback simple si nb de canaux diffère img1 <- grayscale(img1) } } @@ -102,71 +102,75 @@ elbow_wss <- function(X, ks = 2:32, nstart = 10, scale_data = FALSE) { X <- scale(X) } wss <- numeric(length(ks)) - + # Cas k = 1 : WSS = TSS (variance totale) - total_ss <- sum(scale(X, scale = FALSE)^2) # TSS + total_ss <- sum(scale(X, scale = FALSE)^2) # TSS for (i in seq_along(ks)) { k <- ks[i] cat(" k =", k, "\n") if (k == 1) { wss[i] <- total_ss } else { - set.seed(123) # reproductible + set.seed(123) # reproductible km <- kmeans(X, centers = k, nstart = nstart, iter.max = 100) wss[i] <- km$tot.withinss } } - - plot(ks, wss, type = "b", pch = 19, xlab = "Nombre de clusters (k)", - ylab = "Inertie intra-classe (WSS)", - main = "Méthode du coude (k-means)") + + plot(ks, wss, + type = "b", pch = 19, xlab = "Nombre de clusters (k)", + ylab = "Inertie intra-classe (WSS)", + main = "Méthode du coude (k-means)" + ) grid() -# invisible(data.frame(k = ks, WSS = wss)) + # invisible(data.frame(k = ks, WSS = wss)) } # Exemple d'utilisation : - res <- elbow_wss(img_compressed, ks = 2:32, nstart = 20, scale_data = FALSE) +res <- elbow_wss(img_compressed, ks = 2:32, nstart = 20, scale_data = FALSE) ############################################################################### - - elbow_wss_safe <- function(X, ks = 2:32, nstart = 20, scale_data = FALSE, seed = 123) { - X <- as.matrix(X) - if (scale_data) X <- scale(X) - set.seed(seed) - - # Nombre de lignes distinctes - n_unique <- nrow(unique(X)) - if (n_unique < 2) stop("Moins de 2 points distincts : k-means n'a pas de sens.") - - # Tronquer ks si nécessaire - ks <- ks[ks <= n_unique] - if (length(ks) == 0) stop("Tous les k demandés dépassent le nombre de points distincts.") - - wss <- numeric(length(ks)) - # TSS (k = 1) - total_ss <- sum(scale(X, scale = FALSE)^2) - - for (i in seq_along(ks)) { - k <- ks[i] - cat(" k =", k, "\n") - if (k == 1) { - wss[i] <- total_ss - } else { - km <- kmeans(X, centers = k, nstart = nstart, iter.max = 100) - wss[i] <- km$tot.withinss - } - } - - plot(ks, wss, type = "b", pch = 19, xlab = "Nombre de clusters (k)", - ylab = "Inertie intra-classe (WSS)", main = "Méthode du coude (k-means)") - axis(1, at = ks) - grid() -# invisible(data.frame(k = ks, WSS = wss)) - } - - # Exemple : - res <- elbow_wss_safe(img_compressed, ks = 2:32, nstart = 20) - + +elbow_wss_safe <- function(X, ks = 2:32, nstart = 20, scale_data = FALSE, seed = 123) { + X <- as.matrix(X) + if (scale_data) X <- scale(X) + set.seed(seed) + + # Nombre de lignes distinctes + n_unique <- nrow(unique(X)) + if (n_unique < 2) stop("Moins de 2 points distincts : k-means n'a pas de sens.") + + # Tronquer ks si nécessaire + ks <- ks[ks <= n_unique] + if (length(ks) == 0) stop("Tous les k demandés dépassent le nombre de points distincts.") + + wss <- numeric(length(ks)) + # TSS (k = 1) + total_ss <- sum(scale(X, scale = FALSE)^2) + + for (i in seq_along(ks)) { + k <- ks[i] + cat(" k =", k, "\n") + if (k == 1) { + wss[i] <- total_ss + } else { + km <- kmeans(X, centers = k, nstart = nstart, iter.max = 100) + wss[i] <- km$tot.withinss + } + } + + plot(ks, wss, + type = "b", pch = 19, xlab = "Nombre de clusters (k)", + ylab = "Inertie intra-classe (WSS)", main = "Méthode du coude (k-means)" + ) + axis(1, at = ks) + grid() + # invisible(data.frame(k = ks, WSS = wss)) +} + +# Exemple : +res <- elbow_wss_safe(img_compressed, ks = 2:32, nstart = 20) + @@ -175,11 +179,11 @@ elbow_wss <- function(X, ks = 2:32, nstart = 10, scale_data = FALSE) { jpeg("./data/image_compressed.jpg") # Afficher l'image compressée dans le fichier -plot(0, 0, type='n', axes=FALSE, ann=FALSE) +plot(0, 0, type = "n", axes = FALSE, ann = FALSE) rasterImage(img_compressed, -1, -1, 1, 1) info <- file.info("./data/PampasGrass.jpg") -(taille_octets_reelle <- info$size/1024) +(taille_octets_reelle <- info$size / 1024) info <- file.info("./data/image_compressed.jpg") -(taille_octets_compresse <- info$size/1024) +(taille_octets_compresse <- info$size / 1024) diff --git a/M2/Data Visualisation/Project/NoticeTechnique.Rmd b/M2/Data Visualisation/Project/NoticeTechnique.Rmd index 0263800..2f45136 100644 --- a/M2/Data Visualisation/Project/NoticeTechnique.Rmd +++ b/M2/Data Visualisation/Project/NoticeTechnique.Rmd @@ -185,9 +185,9 @@ cor_mat <- cor(vars_candidates, use = "pairwise.complete.obs") ``` ```{r} -cor_df <- as.data.frame(cor_mat) %>% - tibble::rownames_to_column(var = "Var1") %>% - pivot_longer(-Var1, names_to = "Var2", values_to = "r") %>% +cor_df <- as.data.frame(cor_mat) |> + tibble::rownames_to_column(var = "Var1") |> + pivot_longer(-Var1, names_to = "Var2", values_to = "r") |> mutate( Var1 = factor(Var1, levels = unique(Var1)), Var2 = factor(Var2, levels = rev(unique(Var1))) @@ -503,7 +503,9 @@ Bien que nous ayons appliqué une transformation logarithmique pour corriger l'a Nous appliquons donc une standardisation (Z-score) : $z = \frac{x - \mu}{\sigma}$ ```{r} -data_scaled <- tb_ready |> select(log_inc, log_mort) |> scale() +data_scaled <- tb_ready |> + select(log_inc, log_mort) |> + scale() check_table <- data.frame( Variable = c("Incidence (Log)", "Mortalité (Log)"), diff --git a/M2/Data Visualisation/Project/app.R b/M2/Data Visualisation/Project/app.R index 822694d..7711860 100644 --- a/M2/Data Visualisation/Project/app.R +++ b/M2/Data Visualisation/Project/app.R @@ -43,13 +43,11 @@ ui <- shinydashboard::dashboardPage( tabName = "methodo", icon = icon("info-circle") ), - menuItem( "Vue d'Ensemble", tabName = "dashboard", icon = icon("dashboard") ), - menuItem("Données Brutes", tabName = "data", icon = icon("table")), # Footer - Informations et crédits @@ -66,7 +64,6 @@ ui <- shinydashboard::dashboardPage( ) ) ), - hr(), # Filtre par Région @@ -147,7 +144,6 @@ ui <- shinydashboard::dashboardPage( plotlyOutput("cluster_scatter", height = "530px") ) ), - fluidRow( # Plot des tendances box( @@ -190,7 +186,6 @@ ui <- shinydashboard::dashboardPage( # Page 3 - Méthodologie tabItem( tabName = "methodo", - fluidRow( # Indicateurs OMS box( @@ -307,7 +302,6 @@ ui <- shinydashboard::dashboardPage( p("Dernière mise à jour du dataset : Octobre 2024.") ), ), - column( width = 6, box( @@ -319,7 +313,6 @@ ui <- shinydashboard::dashboardPage( p( "Afin de synthétiser l'information et de faciliter la prise de décision, j'ai appliqué un algorithme d'apprentissage non supervisé (K-Means) pour regrouper les pays ayant des profils épidémiques similaires sous 4 clusters." ), - h4("Méthodologie"), tags$ul( tags$li( @@ -335,7 +328,6 @@ ui <- shinydashboard::dashboardPage( "Utilisation de `set.seed(123)` pour garantir la reproductibilité des résultats." ) ), - h4("Interprétation des 3 Groupes"), # Tableau des Groupes @@ -395,7 +387,6 @@ ui <- shinydashboard::dashboardPage( status = "primary", solidHeader = TRUE, width = 12, - tags$p( "Ce projet suit une approche Open Science.", style = "font-style: italic;" @@ -403,7 +394,6 @@ ui <- shinydashboard::dashboardPage( tags$p( "L'intégralité du code source (Rmd, App) ainsi que la notice technique (PDF) sont disponibles en libre accès sur le dépôt GitHub." ), - tags$a( href = "https://go.arthurdanjou.fr/datavis-code", target = "_blank", @@ -524,7 +514,9 @@ server <- function(input, output, session) { # KPI - Pire pays output$kpi_worst_country <- shinydashboard::renderValueBox({ data <- filtered_data() - worst <- data |> arrange(desc(e_inc_100k)) |> slice(1) + worst <- data |> + arrange(desc(e_inc_100k)) |> + slice(1) if (nrow(worst) > 0) { valueBox( @@ -743,7 +735,7 @@ server <- function(input, output, session) { output$cluster_scatter <- plotly::renderPlotly({ data <- filtered_data() sel_iso <- selected_country() - highlight_data <- data %>% filter(iso3 == sel_iso) + highlight_data <- data |> filter(iso3 == sel_iso) p <- ggplot(data, aes(x = e_inc_100k, y = e_mort_exc_tbhiv_100k)) + geom_point( diff --git a/M2/Data Visualisation/Project/renv.lock b/M2/Data Visualisation/Project/renv.lock index 0a314b7..3b69858 100644 --- a/M2/Data Visualisation/Project/renv.lock +++ b/M2/Data Visualisation/Project/renv.lock @@ -2056,7 +2056,7 @@ "Type": "Package", "Title": "A Forward-Pipe Operator for R", "Authors@R": "c( person(\"Stefan Milton\", \"Bache\", , \"stefan@stefanbache.dk\", role = c(\"aut\", \"cph\"), comment = \"Original author and creator of magrittr\"), person(\"Hadley\", \"Wickham\", , \"hadley@posit.co\", role = \"aut\"), person(\"Lionel\", \"Henry\", , \"lionel@posit.co\", role = \"cre\"), person(\"Posit Software, PBC\", role = c(\"cph\", \"fnd\"), comment = c(ROR = \"03wc8by49\")) )", - "Description": "Provides a mechanism for chaining commands with a new forward-pipe operator, %>%. This operator will forward a value, or the result of an expression, into the next function call/expression. There is flexible support for the type of right-hand side expressions. For more information, see package vignette. To quote Rene Magritte, \"Ceci n'est pas un pipe.\"", + "Description": "Provides a mechanism for chaining commands with a new forward-pipe operator, |>. This operator will forward a value, or the result of an expression, into the next function call/expression. There is flexible support for the type of right-hand side expressions. For more information, see package vignette. To quote Rene Magritte, \"Ceci n'est pas un pipe.\"", "License": "MIT + file LICENSE", "URL": "https://magrittr.tidyverse.org, https://github.com/tidyverse/magrittr", "BugReports": "https://github.com/tidyverse/magrittr/issues", diff --git a/M2/Data Visualisation/tp3/tp3.Rmd b/M2/Data Visualisation/tp3/tp3.Rmd index 5acb37e..f66498f 100644 --- a/M2/Data Visualisation/tp3/tp3.Rmd +++ b/M2/Data Visualisation/tp3/tp3.Rmd @@ -32,8 +32,8 @@ Note that the `echo = FALSE` parameter was added to the code chunk to prevent pr ```{r} library(kableExtra) -mtcars[1:5, 1:5] %>% - kbl() %>% +mtcars[1:5, 1:5] |> + kbl() |> kable_styling() ```