Add "Clustering In Practice" section: add Encoding.Rmd and data/chiffres.csv; update README

This commit is contained in:
2026-01-08 13:44:01 +01:00
parent 2e2500b509
commit c8c1bf4807
3 changed files with 2062 additions and 0 deletions

View File

@@ -0,0 +1,53 @@
```{r}
library(caret)
library(dplyr)
```
# One Hot Encoding
```{r}
# Toy data set: one categorical column (team) and one numeric column (points).
df <- data.frame(
  team = c("A", "A", "B", "B", "B", "B", "C", "C"),
  points = c(25, 12, 15, 14, 19, 23, 25, 29)
)

# Fit the caret encoder on a one-sided formula naming the columns to keep,
# then apply it to the same rows to obtain the encoded result.
dummies <- dummyVars(~ team + points, data = df)
one_hot_data <- predict(dummies, newdata = df)

# Display the encoded data.
one_hot_data
```
# Target Encoding
```{r}
# Toy training set: a numeric target and two categorical columns.
train <- data.frame(
  target = c(10, 20, 15),
  cat_col1 = c("city1", "city2", "city1"),
  cat_col2 = c("james", "adam", "charles")
)

# Prior (overall target mean) and the smoothing weight given to that prior.
global_mean <- mean(train$target)
alpha <- 10

# Smoothed per-category mean of the target:
#   (sum of targets in the category + alpha * global mean) / (n + alpha)
# Small categories are therefore pulled towards the global mean.
target_encoding <- train %>%
  group_by(cat_col1) %>%
  summarise(
    cat_col1_te = (sum(target) + alpha * global_mean) / (n() + alpha),
    .groups = "drop"
  )

# Attach the encoded value to every training row via its category.
train <- left_join(train, target_encoding, by = "cat_col1")
```
# Frequency Encoding
```{r}
# Toy data set: one categorical column (color) and one numeric column (value).
df <- data.frame(
  color = c("blue", "red", "blue", "green"),
  value = c(10, 20, 10, 30)
)

# Frequency encoding: replace each category by how often it occurs.
# table() counts the occurrences of every color; indexing the table by the
# color of each row maps those counts back onto the rows.
color_counts <- table(df$color)
df$color_count <- as.integer(color_counts[df$color])

# Relative frequency: share of the rows that carry the same color.
df$color_freq <- df$color_count / nrow(df)

# Display the encoded data.
df
```

File diff suppressed because it is too large Load Diff

View File

@@ -29,6 +29,7 @@ The projects are organized into two main sections:
- `Statistical Learning`
- `M2`
- `Clustering In Practice`
- `Data Visualisation`
- `Deep Learning`
- `Generative AI`