Compare commits

...

14 Commits

Author SHA1 Message Date
b8b0024852 Merge branch 'master' of https://github.com/ArthurDanjou/studies 2026-01-13 10:36:14 +01:00
3e1ac18acd Add Lab 4 2026-01-13 10:36:09 +01:00
77feb27b97 Add TP2 2026-01-13 10:36:04 +01:00
bcb8c66a9d add new dependencies 2026-01-13 10:35:56 +01:00
03bc530c3a edit .gitignore 2026-01-13 10:35:41 +01:00
27fd147d0f Implement feature X to enhance user experience and fix bug Y in module Z 2026-01-12 12:54:32 +01:00
56fdd5da45 Add "polars" dependency version 1.37.0 to pyproject.toml and uv.lock 2026-01-12 10:59:50 +01:00
3e6b2e313a Add langchain-text-splitters dependency to pyproject.toml and uv.lock
- Updated pyproject.toml to include langchain-text-splitters version >=1.1.0 in dependencies.
- Modified uv.lock to add langchain-text-splitters in both dependencies and requires-dist sections.
2026-01-12 10:48:31 +01:00
346695212d Add files for computing partial dependence plots: add TP1.ipynb and data/data_pdp.xlsx; update dependencies in pyproject.toml and uv.lock 2026-01-12 10:37:11 +01:00
8e7bbc1fe9 Implement feature X to enhance user experience and optimize performance 2026-01-12 10:37:04 +01:00
c8c1bf4807 Add "Clustering In Practice" section: add Encoding.Rmd and data/chiffres.csv; update README 2026-01-08 13:44:01 +01:00
2e2500b509 Update execution counts and runtime metrics in the Maze Game notebook for consistency and accuracy 2026-01-06 13:09:06 +01:00
5f5bd609d7 Remove unnecessary newline in policy comparison output for clarity in Lab 3 notebook 2026-01-06 13:09:02 +01:00
e56fd6f2af Implement feature X to enhance user experience and optimize performance 2026-01-06 12:32:09 +01:00
16 changed files with 21367 additions and 3840 deletions

.gitignore

@@ -24,4 +24,12 @@ Baudelaire_len_32.p
NoticeTechnique_files
.posit
renv
renv
results/
results_stage_1/
results_stage_2/
*.safetensors
*.pt
*.pth
*.bin


@@ -0,0 +1,208 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "8226e658",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7e95cb09",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.microsoft.datawrangler.viewer.v0+json": {
"columns": [
{
"name": "index",
"rawType": "int64",
"type": "integer"
},
{
"name": "X1",
"rawType": "float64",
"type": "float"
},
{
"name": "X2",
"rawType": "float64",
"type": "float"
},
{
"name": "Y",
"rawType": "float64",
"type": "float"
}
],
"ref": "018727a2-2342-424f-8395-021f40817c5a",
"rows": [
[
"0",
"-0.8363543",
"4.520502",
"-19.868094121443526"
],
[
"1",
"0.4020083",
"3.252834",
"-10.46598545005849"
],
[
"2",
"-0.2492138",
"3.610425",
"-12.91499193423918"
],
[
"3",
"-0.6257167",
"4.58877",
"-20.67839639765537"
],
[
"4",
"-0.9899948",
"4.893924",
"-22.99404413854238"
]
],
"shape": {
"columns": 3,
"rows": 5
}
},
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>X1</th>\n",
" <th>X2</th>\n",
" <th>Y</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>-0.836354</td>\n",
" <td>4.520502</td>\n",
" <td>-19.868094</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0.402008</td>\n",
" <td>3.252834</td>\n",
" <td>-10.465985</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>-0.249214</td>\n",
" <td>3.610425</td>\n",
" <td>-12.914992</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>-0.625717</td>\n",
" <td>4.588770</td>\n",
" <td>-20.678396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>-0.989995</td>\n",
" <td>4.893924</td>\n",
" <td>-22.994044</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" X1 X2 Y\n",
"0 -0.836354 4.520502 -19.868094\n",
"1 0.402008 3.252834 -10.465985\n",
"2 -0.249214 3.610425 -12.914992\n",
"3 -0.625717 4.588770 -20.678396\n",
"4 -0.989995 4.893924 -22.994044"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data = pd.read_excel(\"./data/data_pdp.xlsx\")\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "4e9a9a97",
"metadata": {},
"outputs": [],
"source": [
"def partial_dependant_function(data: pd.DataFrame, model: object, feature: str, grid_points: list) -> list:\n",
" \"\"\"Compute the Partial Dependence Plot (PDP) for a given feature.\"\"\"\n",
" pdp = []\n",
" for val in grid_points:\n",
" data_temp = data.copy()\n",
" data_temp[feature] = val\n",
" preds = model.predict(data_temp)\n",
" pdp.append(preds.mean())\n",
" return pdp"
]
},
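{
"cell_type": "code",
"execution_count": null,
"id": "pdp-usage-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Usage sketch (not part of the original lab): one plausible way to call the PDP helper above.\n",
"# The model choice (LinearRegression) and the grid over X1 are assumptions for illustration only.\n",
"import numpy as np\n",
"from sklearn.linear_model import LinearRegression\n",
"\n",
"X = data[[\"X1\", \"X2\"]]\n",
"model = LinearRegression().fit(X, data[\"Y\"])\n",
"\n",
"# Average prediction as X1 is swept over an evenly spaced grid, the other columns kept as observed.\n",
"grid = np.linspace(X[\"X1\"].min(), X[\"X1\"].max(), num=20)\n",
"pdp_x1 = partial_dependant_function(data=X, model=model, feature=\"X1\", grid_points=list(grid))\n",
"pdp_x1[:5]"
]
},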
{
"cell_type": "code",
"execution_count": null,
"id": "9553a1d8",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "studies",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Binary file not shown.


@@ -0,0 +1,53 @@
```{r}
library(caret)
library(dplyr)
```
# One Hot Encoding
```{r}
df <- data.frame(
team = c('A', 'A', 'B', 'B', 'B', 'B', 'C', 'C'),
points = c(25, 12, 15, 14, 19, 23, 25, 29)
)
dummies <- dummyVars(~team + points, data = df)
one_hot_data <- predict(dummies, newdata = df)
one_hot_data
```
# Target Encoding
```{r}
train <- data.frame(
target = c(10, 20, 15),
cat_col1 = c('city1', 'city2', 'city1'),
cat_col2 = c('james', 'adam', 'charles')
)
global_mean <- mean(train$target)
alpha <- 10
target_encoding <- train %>%
group_by(cat_col1) %>%
summarise(
n = n(),
sum_target = sum(target),
cat_col1_te = (sum_target + (alpha * global_mean)) / (n + alpha),
.groups = "drop"
) %>%
select(cat_col1, cat_col1_te)
train <- train %>% left_join(target_encoding, by = "cat_col1")
```
# Frequency Encoding
```{r}
df <- data.frame(
color = c('blue', 'red', 'blue', 'green'),
value = c(10, 20, 10, 30)
)
```
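A possible completion (not in the original file), assuming the intended encoding is the relative frequency of each category:
```{r}
# Frequency encoding sketch: map each color to its share of the rows, then join it back onto df.
freq_table <- df %>%
  count(color, name = "n") %>%
  mutate(color_freq = n / sum(n)) %>%
  select(color, color_freq)
df <- df %>% left_join(freq_table, by = "color")
df
```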

File diff suppressed because it is too large.


@@ -0,0 +1,851 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "172a7a9f",
"metadata": {},
"source": [
"# TP2 - Automated benchmarking\n",
"\n",
"In this lab we define a function that measures the performance of a language model by running several benchmarks. We saw in class three ways to measure the performance of a language model, which can be summarised as:\n",
"1. **Automatic evaluation**: using a set of questions whose answers are known\n",
"2. **Human evaluation**: a human rates the model's answer to a question\n",
"3. **LLM-as-a-judge evaluation**: another language model grades or compares the answers of one or more models\n",
"\n",
"Here we focus on the first approach, in particular with the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) and [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag) benchmarks.\n",
"Throughout the notebook we use the LangChain library.\n",
"\n",
"Keep in mind that this notebook is purely educational: the field evolves quickly, so it may not be up to date, and the practices shown are not necessarily those validated by industry.\n",
"\n",
"## A uniform interface for benchmarks\n",
"\n",
"For each benchmark we consider, we need several pieces of information:\n",
"* **Dataset**: a function that loads the benchmark questions\n",
"* **Reference**: a function that extracts the expected answer\n",
"* **Prompt**: a prompt that properly asks the model to answer the question\n",
"* **Chain**: a function that returns the LangChain processing chain\n",
"* **Score**: a function that scores the model's performance on a question\n",
"\n",
"We start by creating a class that groups these requirements:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd75374d",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.prompts import PromptTemplate\n",
"from langchain_core.runnables import Runnable\n",
"\n",
"\n",
"class Benchmark:\n",
" \"\"\"Base class for benchmarks.\"\"\"\n",
"\n",
" name: str\n",
"\n",
" def __init__(self, prompt: PromptTemplate) -> None:\n",
" \"\"\"Initialize the benchmark with a prompt template.\"\"\"\n",
" self.prompt = prompt\n",
"\n",
" def load_data(self) -> list:\n",
" \"\"\"Load and return the benchmark data samples.\"\"\"\n",
" raise NotImplementedError\n",
"\n",
" def build_chain(self, model) -> Runnable:\n",
" \"\"\"Build and return the evaluation chain using the provided model.\"\"\"\n",
" raise NotImplementedError\n",
"\n",
" def get_reference(self, sample) -> str:\n",
" \"\"\"Extract and return the reference answer from a data sample.\"\"\"\n",
" raise NotImplementedError\n",
"\n",
" def score(self, prediction, reference) -> float:\n",
" \"\"\"Score the prediction against the reference answer.\"\"\"\n",
" raise NotImplementedError"
]
},
{
"cell_type": "markdown",
"id": "e2ab41df",
"metadata": {},
"source": [
"To make this class more concrete, let us start with the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) benchmark.\n",
"\n",
"### The GSM8K benchmark\n",
"\n",
"We start by loading the dataset and looking at one question.\n",
"\n",
"**Task**: Solve the question *by hand* and check your answer. We recommend exploring several questions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93979ba0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of questions: 1319\n",
"Example of question:\n",
" Janets ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\n",
"And its answer:\n",
" Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\n",
"She makes 9 * 2 = $<<9*2=18>>18 every day at the farmers market.\n",
"#### 18\n"
]
}
],
"source": [
"import numpy as np\n",
"\n",
"from datasets import load_dataset\n",
"\n",
"np.random.seed(42)\n",
"\n",
"dataset = load_dataset(\"gsm8k\", \"main\")\n",
"dataset = dataset[\"test\"]\n",
"\n",
"print(f\"Number of questions: {len(dataset)}\")\n",
"index = 0\n",
"print(\"Example of question:\\n\", dataset[index][\"question\"])\n",
"print(\"And its answer:\\n\", dataset[index][\"answer\"])"
]
},
{
"cell_type": "markdown",
"id": "82d797f0",
"metadata": {},
"source": [
"After inspecting several items of the dataset, we notice that the final answer is placed after the string \"####\".\n",
"\n",
"**Task**: Write a function `get_reference` that takes a GSM8K item (a dictionary with a question and an answer) and returns the expected answer (a string). You may use the [`search`](https://docs.python.org/3/library/re.html#re.search) function from the [`re`](https://docs.python.org/3/library/re.html#) library.\n",
"Then test this function on the previous example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b336056a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reference: 18\n"
]
}
],
"source": [
"from re import search\n",
"\n",
"\n",
"def get_reference(sample: dict) -> str:\n",
" \"\"\"Extract the reference answer from a data sample.\"\"\"\n",
" match = search(r\"#### (\\d+)\", sample[\"answer\"])\n",
" return match.group(1) if match else None\n",
"\n",
"\n",
"index = 0\n",
"reference = get_reference(sample=dataset[index])\n",
"print(f\"Reference: {reference}\")"
]
},
{
"cell_type": "markdown",
"id": "4c137e6a",
"metadata": {},
"source": [
"We still need to define a prompt so that we can call a model and test our machinery."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b899872",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"prompt = PromptTemplate(\n",
" input_variables=[\"question\"],\n",
" template=(\n",
" \"\"\"You are a careful mathematician. Solve the problem step by step, then display your answer in the end.\n",
" Question: {question}\n",
" Answer:\"\"\"\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"id": "36433b53",
"metadata": {},
"source": [
"Calling a model locally through Ollama, we can define the following chain with LangChain:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f0676b6",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Model answer : Here's how we can solve this problem step by step:\n",
"\n",
"1. **Calculate the total number of eggs laid:** Janet's ducks lay 16 eggs per day.\n",
"\n",
"2. **Calculate the number of eggs eaten:** She eats 3 eggs per day.\n",
"\n",
"3. **Calculate the number of eggs remaining after breakfast:** 16 eggs (laid) - 3 eggs (eaten) = 13 eggs\n",
"\n",
"4. **Calculate the number of eggs used for baking:** She uses 4 eggs for baking.\n",
"\n",
"5. **Calculate the number of eggs remaining after baking:** 13 eggs - 4 eggs (baking) = 9 eggs\n",
"\n",
"6. **Calculate the earnings from selling the remaining eggs:** She sells 9 eggs at $2 per egg. So she makes 9 * $2 = $18.\n",
"\n",
"**Answer:** $18\n",
"The answer was : 18\n"
]
}
],
"source": [
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_core.runnables import RunnablePassthrough\n",
"from langchain_ollama import OllamaLLM\n",
"\n",
"model = OllamaLLM(model=\"gemma3:4b\")\n",
"\n",
"chain = {\"question\": RunnablePassthrough()} | prompt | model | StrOutputParser()\n",
"\n",
"index = 0\n",
"\n",
"question = dataset[index][\"question\"]\n",
"answer = get_reference(dataset[index])\n",
"response = chain.invoke(question)\n",
"print(f\"Model answer : {response}\")\n",
"print(f\"The answer was : {answer}\")\n"
]
},
{
"cell_type": "markdown",
"id": "97dd7db7",
"metadata": {},
"source": [
"We need to extract the last numeric value to obtain the model's answer automatically.\n",
"\n",
"**Task**: Define a function `score` that takes the model's answer and the expected answer and returns whether the two match (1 / 0). You may use the [`findall`](https://docs.python.org/3/library/re.html#re.findall) function from the `re` library.\n",
"Then apply it to the previous example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad43cf84",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The model scored 1.0\n"
]
}
],
"source": [
"from re import findall\n",
"\n",
"\n",
"def score(prediction, reference):\n",
" if reference is None:\n",
" return 0.0\n",
"\n",
" numbers = findall(r\"\\d+\", prediction)\n",
" return 1.0 if numbers and numbers[-1] == reference else 0.0\n",
"\n",
"\n",
"value = score(response, answer)\n",
"print(f\"The model scored {value}\")"
]
},
{
"cell_type": "markdown",
"id": "a2ec5088",
"metadata": {},
"source": [
"We now have everything needed to define the `GSM8KBenchmark` class from the `Benchmark` class defined earlier.\n",
"\n",
"**Task**: Define this class as a subclass of `Benchmark`."
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "d83f4394",
"metadata": {},
"outputs": [],
"source": [
"class GSM8KBenchmark(Benchmark):\n",
" name = \"GSM8K\"\n",
"\n",
" def load_data(self):\n",
" return load_dataset(\"gsm8k\", \"main\", split=\"test\")\n",
"\n",
" def build_chain(self, model):\n",
" return (\n",
" {\"question\": lambda x: x[\"question\"]} # pass only the question, not the full sample, so the reference answer is not leaked into the prompt\n",
" | self.prompt\n",
" | model\n",
" | StrOutputParser()\n",
" )\n",
"\n",
" def get_reference(self, sample):\n",
" match = search(r\"#### (\\d+)\", sample[\"answer\"])\n",
" return match.group(1) if match else None\n",
"\n",
" def score(self, prediction, reference):\n",
" if reference is None:\n",
" return 0.0\n",
" numbers = findall(r\"\\d+\", prediction)\n",
" return 1.0 if numbers and numbers[-1] == reference else 0.0"
]
},
{
"cell_type": "markdown",
"id": "dfc3cb78",
"metadata": {},
"source": [
"It is now time to define a function that actually *runs* the benchmark.\n",
"\n",
"**Task**: Define a function `run_benchmark` that takes as parameters:\n",
"* `model_name`: the name of the Ollama model to test\n",
"* `benchmark`: the benchmark instance to evaluate\n",
"* `max_samples`: the maximum number of questions to use\n",
"\n",
"Since the object we work with is a HuggingFace dataset, to select $n$ rows we use \n",
"```python\n",
"dataset = dataset.select(range(max_samples))\n",
"```\n",
"so that the dataset structure is preserved."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d7125af",
"metadata": {},
"outputs": [],
"source": [
"from tqdm import tqdm\n",
"import numpy as np\n",
"\n",
"\n",
"def run_benchmark(\n",
" model_name: str, benchmark: Benchmark, max_samples: int | None = None\n",
") -> dict:\n",
" model = OllamaLLM(model=model_name)\n",
"\n",
" data = benchmark.load_data()\n",
" if max_samples:\n",
" data = data.select(range(max_samples))\n",
" chain = benchmark.build_chain(model)\n",
"\n",
" scores = []\n",
"\n",
" for sample in tqdm(data, desc=f\"Running {benchmark.name}\"):\n",
" prediction = chain.invoke(sample)\n",
" reference = benchmark.get_reference(sample)\n",
" scores.append(benchmark.score(prediction, reference))\n",
"\n",
" results = {\n",
" \"benchmark\": benchmark.name,\n",
" \"model\": model_name,\n",
" \"num_samples\": len(scores),\n",
" \"accuracy\": np.mean(scores),\n",
" }\n",
" return results\n"
]
},
{
"cell_type": "markdown",
"id": "81de8940",
"metadata": {},
"source": [
"**Task**: Use the `run_benchmark` function, defining a prompt for GSM8K."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6bbeb53",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Running GSM8K: 100%|██████████| 5/5 [00:50<00:00, 10.18s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'benchmark': 'GSM8K', 'model': 'gemma3:4b', 'num_samples': 5, 'accuracy': 0.8}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"prompt_GMS8K = PromptTemplate(\n",
" input_variables=[\"question\"],\n",
" template=(\n",
" \"\"\"You are a careful mathematician. Solve the problem step by step, then display your answer in the end.\n",
" Question: {question}\n",
" Answer:\"\"\"\n",
" ),\n",
")\n",
"\n",
"benchmark_GSM8K = GSM8KBenchmark(prompt=prompt_GMS8K)\n",
"results = run_benchmark(\n",
" model_name=\"gemma3:4b\", benchmark=benchmark_GSM8K, max_samples=5\n",
")\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"id": "0c943124",
"metadata": {},
"source": [
"### HellaSwag\n",
"\n",
"Now that this works for the GSM8K dataset, let us tackle [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag).\n",
"\n",
"**Task**: Following the same approach as before, implement a `HellaSwagBenchmark` subclass of `Benchmark`. Then use the `run_benchmark` function to validate your work."
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "32886901",
"metadata": {},
"outputs": [],
"source": [
"class HellaSwagBenchmark(Benchmark):\n",
" name = \"HellaSwag\"\n",
"\n",
" def load_data(self):\n",
" return load_dataset(\"hellaswag\", split=\"validation\")\n",
"\n",
" def build_chain(self, model):\n",
" return (\n",
" {\n",
" \"context\": lambda x: x[\"ctx\"],\n",
" \"choices\": lambda x: \"\\n\".join(\n",
" f\"{index}: {choice}\" for index, choice in enumerate(x[\"endings\"])\n",
" ),\n",
" }\n",
" | self.prompt\n",
" | model\n",
" | StrOutputParser()\n",
" )\n",
"\n",
" def get_reference(self, sample):\n",
" return str(sample[\"label\"])\n",
"\n",
" def score(self, prediction, reference):\n",
" match = search(r\"\\d\", prediction)\n",
" return 1.0 if match and match.group(0) == reference else 0.0\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96a3031a",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Running HellaSwag: 100%|██████████| 5/5 [00:02<00:00, 2.08it/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'benchmark': 'HellaSwag', 'model': 'gemma3:4b', 'num_samples': 5, 'accuracy': 1.0}\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"\n"
]
}
],
"source": [
"prompt_HellaSwag = PromptTemplate(\n",
" input_variables=[\"context\", \"choices\"],\n",
" template=(\n",
" \"\"\"You will be given a context and then different choices. You need to find the most likely continuation to the context. Answer with the number of the most likely choice only.\n",
" Context: {context}\n",
" Choices: {choices}\n",
" Answer:\"\"\"\n",
" ),\n",
")\n",
"\n",
"benchmark_HellaSwag = HellaSwagBenchmark(prompt=prompt_HellaSwag)\n",
"\n",
"results = run_benchmark(\n",
" model_name=\"gemma3:4b\", benchmark=benchmark_HellaSwag, max_samples=5\n",
")\n",
"print(results)"
]
},
{
"cell_type": "markdown",
"id": "c542783c",
"metadata": {},
"source": [
"## Structured responses\n",
"\n",
"On a few examples everything seems to work! But there is at least one weakness in our setup: extracting the answer is unreliable and heavily dependent on the prompts.\n",
"\n",
"\n",
"For GSM8K, for example, we would like the answer to come back as JSON:\n",
"```json\n",
"{\n",
" \"reasoning\": \"step-by-step reasoning\",\n",
" \"final_answer\": 18\n",
"}\n",
"```\n",
"\n",
"That way it would be particularly easy to extract the answer while still keeping the model's *reasoning*. For HellaSwag, on the other hand, an extremely simple JSON is enough:\n",
"```json\n",
"{\n",
" \"choice\": 2\n",
"}\n",
"```\n",
"\n",
"To force the model to follow these formats, we will use the [Pydantic](https://docs.langchain.com/oss/python/langchain/structured-output) output option. For GSM8K it is used as follows:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "988dbca3",
"metadata": {},
"outputs": [],
"source": [
"from pydantic import BaseModel, Field\n",
"\n",
"\n",
"class GSM8KOutput(BaseModel):\n",
" reasoning: str = Field(description=\"Step-by-step reasoning\")\n",
" final_answer: float = Field(description=\"Final numeric answer\")\n"
]
},
{
"cell_type": "markdown",
"id": "d855adfe",
"metadata": {},
"source": [
"As for integrating it into the prompt:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f25afddc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The output should be formatted as a JSON instance that conforms to the JSON schema below.\n",
"\n",
"As an example, for the schema {\"properties\": {\"foo\": {\"title\": \"Foo\", \"description\": \"a list of strings\", \"type\": \"array\", \"items\": {\"type\": \"string\"}}}, \"required\": [\"foo\"]}\n",
"the object {\"foo\": [\"bar\", \"baz\"]} is a well-formatted instance of the schema. The object {\"properties\": {\"foo\": [\"bar\", \"baz\"]}} is not well-formatted.\n",
"\n",
"Here is the output schema:\n",
"```\n",
"{\"properties\": {\"reasoning\": {\"description\": \"Step-by-step reasoning\", \"title\": \"Reasoning\", \"type\": \"string\"}, \"final_answer\": {\"description\": \"Final numeric answer\", \"title\": \"Final Answer\", \"type\": \"number\"}}, \"required\": [\"reasoning\", \"final_answer\"]}\n",
"```\n"
]
}
],
"source": [
"from langchain.output_parsers import PydanticOutputParser\n",
"\n",
"parser_gsm8k = PydanticOutputParser(pydantic_object=GSM8KOutput)\n",
"\n",
"prompt_gsm8k = PromptTemplate(\n",
" input_variables=[\"question\"],\n",
" partial_variables={\"format_instructions\": parser_gsm8k.get_format_instructions()},\n",
" template=(\n",
" \"\"\"You are a careful mathematician. Solve the problem step by step.\n",
" Question: {question}\n",
" {format_instructions}\"\"\"\n",
" ),\n",
")\n",
"\n",
"print(parser_gsm8k.get_format_instructions())"
]
},
{
"cell_type": "markdown",
"id": "d1dcc480",
"metadata": {},
"source": [
"**Task**: Modify the `Benchmark` class and the `GSM8KBenchmark` subclass to incorporate these changes."
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "542a31d6",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.runnables import Runnable\n",
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"\n",
"class Benchmark:\n",
" name: str\n",
"\n",
" def __init__(self, prompt: PromptTemplate, parser: PydanticOutputParser):\n",
" self.prompt = prompt\n",
" self.parser = parser\n",
"\n",
" def load_data(self):\n",
" raise NotImplementedError\n",
"\n",
" def build_chain(self, model) -> Runnable:\n",
" raise NotImplementedError\n",
"\n",
" def get_reference(self, sample):\n",
" raise NotImplementedError\n",
"\n",
" def score(self, prediction, reference):\n",
" raise NotImplementedError"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94f1dd1",
"metadata": {},
"outputs": [],
"source": [
"class GSM8KBenchmark(Benchmark):\n",
" name = \"GSM8K\"\n",
"\n",
" def load_data(self):\n",
" return load_dataset(\"gsm8k\", \"main\", split=\"test\")\n",
"\n",
" def build_chain(self, model):\n",
" # pass only the question, not the full sample, so the reference answer is not leaked into the prompt\n",
" return {\"question\": lambda x: x[\"question\"]} | self.prompt | model | self.parser\n",
"\n",
" def get_reference(self, sample):\n",
" match = search(r\"#### (\\d+)\", sample[\"answer\"])\n",
" return float(match.group(1)) if match else None\n",
"\n",
" def score(self, prediction: GSM8KOutput, reference: float | None):\n",
" if reference is None:\n",
" return 0.0\n",
" return 1.0 if prediction.final_answer == reference else 0.0"
]
},
{
"cell_type": "markdown",
"id": "b2076f24",
"metadata": {},
"source": [
"**Task**: Use the `run_benchmark` function and check that everything works."
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "31e433b0",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Running GSM8K: 100%|██████████| 5/5 [01:01<00:00, 12.25s/it]\n"
]
},
{
"data": {
"text/plain": [
"{'benchmark': 'GSM8K', 'model': 'gemma3:4b', 'num_samples': 5, 'accuracy': 0.8}"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gsm8k = GSM8KBenchmark(\n",
" prompt=prompt_gsm8k,\n",
" parser=parser_gsm8k,\n",
")\n",
"\n",
"run_benchmark(\"gemma3:4b\", gsm8k, max_samples=5)"
]
},
{
"cell_type": "markdown",
"id": "b7ed90cd",
"metadata": {},
"source": [
"**Task**: Make the same modification for HellaSwag and check that it works."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e678bed2",
"metadata": {},
"outputs": [],
"source": [
"class HellaSwagOutput(BaseModel):\n",
" choice: int = Field(description=\"Index of the chosen continuation\")\n",
"\n",
"\n",
"class HellaSwagBenchmark(Benchmark):\n",
" name = \"HellaSwag\"\n",
"\n",
" def load_data(self):\n",
" return load_dataset(\"hellaswag\", split=\"validation\")\n",
"\n",
" def build_chain(self, model):\n",
" return (\n",
" {\n",
" \"context\": lambda x: x[\"ctx\"],\n",
" \"choices\": lambda x: \"\\n\".join(\n",
" f\"{index}: {choice}\" for index, choice in enumerate(x[\"endings\"])\n",
" ),\n",
" }\n",
" | self.prompt\n",
" | model\n",
" | self.parser\n",
" )\n",
"\n",
" def get_reference(self, sample):\n",
" return str(sample[\"label\"])\n",
"\n",
" def score(self, prediction: HellaSwagOutput, reference: str) -> float:\n",
" return 1.0 if str(prediction.choice) == reference else 0.0\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2455f816",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Running HellaSwag: 100%|██████████| 5/5 [00:15<00:00, 3.12s/it]\n"
]
},
{
"data": {
"text/plain": [
"{'benchmark': 'HellaSwag',\n",
" 'model': 'gemma3:4b',\n",
" 'num_samples': 5,\n",
" 'accuracy': 1.0}"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parser_hellaswag = PydanticOutputParser(pydantic_object=HellaSwagOutput)\n",
"\n",
"prompt_HellaSwag = PromptTemplate(\n",
" input_variables=[\"context\", \"choices\"],\n",
" partial_variables={\n",
" \"format_instructions\": parser_hellaswag.get_format_instructions()\n",
" },\n",
" template=(\n",
" \"\"\"You will be given a context and then different choices. You need to find the most likely continuation to the context.\n",
" Context: {context}\n",
" Choices: {choices}\n",
" {format_instructions}\"\"\"\n",
" ),\n",
")\n",
"\n",
"hella_swag = HellaSwagBenchmark(\n",
" prompt=prompt_HellaSwag,\n",
" parser=parser_hellaswag,\n",
")\n",
"\n",
"run_benchmark(\"gemma3:4b\", hella_swag, max_samples=5)"
]
},
{
"cell_type": "markdown",
"id": "ba9acd54",
"metadata": {},
"source": [
"## Going further\n",
"\n",
"We could implement other benchmarks, properly compare models against each other, or compare prompts against each other, as sketched in the cell below."
]
}
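,
{
"cell_type": "code",
"execution_count": null,
"id": "going-further-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Sketch only (not part of the original TP): one way to compare several models on the same benchmark.\n",
"# The model names below are assumptions; replace them with models actually available in your Ollama install.\n",
"for name in [\"gemma3:4b\", \"llama3.2:3b\"]:\n",
"    print(run_benchmark(name, gsm8k, max_samples=5))"
]
}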
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long


@@ -1,456 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "172a7a9f",
"metadata": {},
"source": [
"# TP2 - Automated benchmarking\n",
"\n",
"In this lab we define a function that measures the performance of a language model by running several benchmarks. We saw in class three ways to measure the performance of a language model, which can be summarised as:\n",
"1. **Automatic evaluation**: using a set of questions whose answers are known\n",
"2. **Human evaluation**: a human rates the model's answer to a question\n",
"3. **LLM-as-a-judge evaluation**: another language model grades or compares the answers of one or more models\n",
"\n",
"Here we focus on the first approach, in particular with the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) and [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag) benchmarks.\n",
"Throughout the notebook we use the LangChain library.\n",
"\n",
"Keep in mind that this notebook is purely educational: the field evolves quickly, so it may not be up to date, and the practices shown are not necessarily those validated by industry.\n",
"\n",
"## A uniform interface for benchmarks\n",
"\n",
"For each benchmark we consider, we need several pieces of information:\n",
"* **Dataset**: a function that loads the benchmark questions\n",
"* **Reference**: a function that extracts the expected answer\n",
"* **Prompt**: a prompt that properly asks the model to answer the question\n",
"* **Chain**: a function that returns the LangChain processing chain\n",
"* **Score**: a function that scores the model's performance on a question\n",
"\n",
"We start by creating a class that groups these requirements:\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd75374d",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.runnables import Runnable\n",
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"\n",
"class Benchmark:\n",
" name: str\n",
"\n",
" def __init__(self, prompt: PromptTemplate):\n",
" self.prompt = prompt\n",
"\n",
" def load_data(self):\n",
" raise NotImplementedError\n",
"\n",
" def build_chain(self, model) -> Runnable:\n",
" raise NotImplementedError\n",
"\n",
" def get_reference(self, sample):\n",
" raise NotImplementedError\n",
"\n",
" def score(self, prediction, reference):\n",
" raise NotImplementedError"
]
},
{
"cell_type": "markdown",
"id": "e2ab41df",
"metadata": {},
"source": [
"To make this class more concrete, let us start with the [GSM8K](https://huggingface.co/datasets/openai/gsm8k) benchmark.\n",
"\n",
"### The GSM8K benchmark\n",
"\n",
"We start by loading the dataset and looking at one question.\n",
"\n",
"**Task**: Solve the question *by hand* and check your answer. We recommend exploring several questions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "93979ba0",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np; np.random.seed(42)\n",
"from datasets import load_dataset\n",
"\n",
"dataset = load_dataset(\"gsm8k\", \"main\")\n",
"dataset = dataset[\"test\"]\n",
"\n",
"print(f\"Number of questions: {len(dataset)}\")\n",
"index = 0\n",
"print(\"Example of question:\\n\", dataset[index][\"question\"])\n",
"print(\"And its answer:\\n\", dataset[index][\"answer\"])"
]
},
{
"cell_type": "markdown",
"id": "82d797f0",
"metadata": {},
"source": [
"After inspecting several items of the dataset, we notice that the final answer is placed after the string \"####\".\n",
"\n",
"**Task**: Write a function `get_reference` that takes a GSM8K item (a dictionary with a question and an answer) and returns the expected answer (a string). You may use the [`search`](https://docs.python.org/3/library/re.html#re.search) function from the [`re`](https://docs.python.org/3/library/re.html#) library.\n",
"Then test this function on the previous example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b336056a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "4c137e6a",
"metadata": {},
"source": [
"We still need to define a prompt so that we can call a model and test our machinery."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b899872",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.prompts import PromptTemplate\n",
"\n",
"prompt = PromptTemplate(\n",
" input_variables=[\"question\"],\n",
" template=(\n",
" \"\"\"You are a careful mathematician. Solve the problem step by step, then display your answer in the end.\n",
" Question: {question}\n",
" Answer:\"\"\"\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"id": "36433b53",
"metadata": {},
"source": [
"Calling a model locally through Ollama, we can define the following chain with LangChain:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2f0676b6",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.runnables import RunnablePassthrough\n",
"from langchain_core.output_parsers import StrOutputParser\n",
"from langchain_ollama import OllamaLLM\n",
"\n",
"model = OllamaLLM(model=\"gemma3:4b\")\n",
"\n",
"chain = (\n",
" {\"question\": RunnablePassthrough()}\n",
" | prompt\n",
" | model\n",
" | StrOutputParser()\n",
")\n",
"\n",
"index = 0\n",
"\n",
"question = dataset[index][\"question\"]\n",
"answer = get_reference(dataset[index])\n",
"response = chain.invoke(question)\n",
"print(f\"Model answer : {response}\")\n",
"print(f\"The answer was : {answer}\")\n"
]
},
{
"cell_type": "markdown",
"id": "97dd7db7",
"metadata": {},
"source": [
"We need to extract the last numeric value to obtain the model's answer automatically.\n",
"\n",
"**Task**: Define a function `score` that takes the model's answer and the expected answer and returns whether the two match (1 / 0). You may use the [`findall`](https://docs.python.org/3/library/re.html#re.findall) function from the `re` library.\n",
"Then apply it to the previous example."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad43cf84",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "a2ec5088",
"metadata": {},
"source": [
"We now have everything needed to define the `GSM8KBenchmark` class from the `Benchmark` class defined earlier.\n",
"\n",
"**Task**: Define this class as a subclass of `Benchmark`."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d83f4394",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "dfc3cb78",
"metadata": {},
"source": [
"It is now time to define a function that actually *runs* the benchmark.\n",
"\n",
"**Task**: Define a function `run_benchmark` that takes as parameters:\n",
"* `model_name`: the name of the Ollama model to test\n",
"* `benchmark`: the benchmark instance to evaluate\n",
"* `max_samples`: the maximum number of questions to use\n",
"\n",
"Since the object we work with is a HuggingFace dataset, to select $n$ rows we use \n",
"```python\n",
"dataset = dataset.select(range(max_samples))\n",
"```\n",
"so that the dataset structure is preserved."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2d7125af",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "81de8940",
"metadata": {},
"source": [
"**Task**: Use the `run_benchmark` function, defining a prompt for GSM8K."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6bbeb53",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "0c943124",
"metadata": {},
"source": [
"### HellaSwag\n",
"\n",
"Now that this works for the GSM8K dataset, let us tackle [HellaSwag](https://huggingface.co/datasets/Rowan/hellaswag).\n",
"\n",
"**Task**: Following the same approach as before, implement a `HellaSwagBenchmark` subclass of `Benchmark`. Then use the `run_benchmark` function to validate your work."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "32886901",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "96a3031a",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "c542783c",
"metadata": {},
"source": [
"## Structured responses\n",
"\n",
"On a few examples everything seems to work! But there is at least one weakness in our setup: extracting the answer is unreliable and heavily dependent on the prompts.\n",
"\n",
"\n",
"For GSM8K, for example, we would like the answer to come back as JSON:\n",
"```json\n",
"{\n",
" \"reasoning\": \"step-by-step reasoning\",\n",
" \"final_answer\": 18\n",
"}\n",
"```\n",
"\n",
"That way it would be particularly easy to extract the answer while still keeping the model's *reasoning*. For HellaSwag, on the other hand, an extremely simple JSON is enough:\n",
"```json\n",
"{\n",
" \"choice\": 2\n",
"}\n",
"```\n",
"\n",
"To force the model to follow these formats, we will use the [Pydantic](https://docs.langchain.com/oss/python/langchain/structured-output) output option. For GSM8K it is used as follows:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "988dbca3",
"metadata": {},
"outputs": [],
"source": [
"from pydantic import BaseModel, Field\n",
"\n",
"class GSM8KOutput(BaseModel):\n",
" reasoning: str = Field(description=\"Step-by-step reasoning\")\n",
" final_answer: float = Field(description=\"Final numeric answer\")\n"
]
},
{
"cell_type": "markdown",
"id": "d855adfe",
"metadata": {},
"source": [
"As for integrating it into the prompt:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f25afddc",
"metadata": {},
"outputs": [],
"source": [
"from langchain.output_parsers import PydanticOutputParser\n",
"\n",
"parser_gsm8k = PydanticOutputParser(pydantic_object=GSM8KOutput)\n",
"\n",
"prompt_gsm8k = PromptTemplate(\n",
" input_variables=[\"question\"],\n",
" partial_variables={\"format_instructions\": parser_gsm8k.get_format_instructions()},\n",
" template=(\n",
" \"\"\"You are a careful mathematician. Solve the problem step by step.\n",
" Question: {question}\n",
" {format_instructions}\"\"\"\n",
" ),\n",
")\n",
"\n",
"print(parser_gsm8k.get_format_instructions())"
]
},
{
"cell_type": "markdown",
"id": "d1dcc480",
"metadata": {},
"source": [
"**Task**: Modify the `Benchmark` class and the `GSM8KBenchmark` subclass to incorporate these changes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "542a31d6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "c94f1dd1",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b2076f24",
"metadata": {},
"source": [
"**Task**: Use the `run_benchmark` function and check that everything works."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "31e433b0",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "b7ed90cd",
"metadata": {},
"source": [
"**Task**: Make the same modification for HellaSwag and check that it works."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e678bed2",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "2455f816",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"id": "ba9acd54",
"metadata": {},
"source": [
"## Going further\n",
"\n",
"We could implement other benchmarks, properly compare models against each other, or compare prompts against each other..."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -29,6 +29,7 @@ The projects are organized into two main sections:
- `Statistical Learning`
- `M2`
- `Clustering In Practice`
- `Data Visualisation`
- `Deep Learning`
- `Generative AI`


@@ -5,7 +5,9 @@ description = "A curated collection of mathematics and data science projects dev
readme = "README.md"
requires-python = ">= 3.11"
dependencies = [
"accelerate>=1.12.0",
"catboost>=1.2.8",
"datasets>=4.4.2",
"faiss-cpu>=1.13.2",
"imblearn>=0.0",
"ipykernel>=6.29.5",
@@ -13,13 +15,16 @@ dependencies = [
"langchain-community>=0.4.1",
"langchain-huggingface>=1.2.0",
"langchain-ollama>=1.0.1",
"langchain-text-splitters>=1.1.0",
"matplotlib>=3.10.1",
"nbformat>=5.10.4",
"numpy>=2.2.5",
"opencv-python>=4.11.0.86",
"openpyxl>=3.1.5",
"pandas>=2.2.3",
"pandas-stubs>=2.3.2.250926",
"plotly>=6.3.0",
"polars>=1.37.0",
"pypdf>=6.5.0",
"scikit-learn>=1.6.1",
"scipy>=1.15.2",

uv.lock

File diff suppressed because it is too large.