From a0b0a9f8bd31fc19e23d76c9183bc600a7c16111 Mon Sep 17 00:00:00 2001 From: Arthur DANJOU Date: Mon, 13 Oct 2025 19:58:58 +0200 Subject: [PATCH] Refactor code in 2025_TP_3_M2_ISF.ipynb: - Updated execution counts for multiple code cells to maintain consistency. - Removed redundant imports and organized import statements. - Improved formatting for better readability in train-test split section. - Added markdown explanations for model performance metrics (MAE, RMSE). - Enhanced cross-validation training loop with detailed output for each fold's metrics. --- .../TP_3/2025_TP_3_M2_ISF.ipynb | 266 ++++++++++++------ 1 file changed, 174 insertions(+), 92 deletions(-) diff --git a/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb b/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb index d594d08..7e51dd0 100644 --- a/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb +++ b/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 56, "id": "97d58527", "metadata": {}, "outputs": [], @@ -61,18 +61,17 @@ "sns.set()\n", "import plotly.express as px\n", "import plotly.graph_objects as gp\n", + "import sklearn.metrics as metrics\n", "import sklearn.preprocessing as preproc\n", "\n", "#Statistiques\n", "from scipy.stats import chi2_contingency\n", - "from sklearn import metrics\n", "\n", "# Machine Learning\n", "from sklearn.cluster import KMeans\n", - "import sklearn.metrics as metrics\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.model_selection import KFold, train_test_split\n", - "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor" + "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n" ] }, { @@ -101,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 158, + "execution_count": 57, "id": "c9597b48", "metadata": {}, "outputs": [], @@ -120,7 +119,7 @@ }, { "cell_type": "code", - "execution_count": 159, + "execution_count": 58, "id": "8051b5f4", "metadata": {}, "outputs": [], @@ -164,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 160, + "execution_count": 59, "id": "c427a4b8", "metadata": {}, "outputs": [ @@ -174,7 +173,7 @@ "(824, 14)" ] }, - "execution_count": 160, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -201,7 +200,7 @@ }, { "cell_type": "code", - "execution_count": 161, + "execution_count": 60, "id": "c8fd3ee1", "metadata": {}, "outputs": [ @@ -285,7 +284,7 @@ "type": "float" } ], - "ref": "e80a8f38-8160-41fb-bbfa-ae1f7b39de11", + "ref": "8fcd0abc-8334-4a0d-96b7-b6d7e17b3fb7", "rows": [ [ "count", @@ -761,7 +760,7 @@ "max NaN 83421.850000 " ] }, - "execution_count": 161, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } @@ -796,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 162, + "execution_count": 61, "id": "1b156435", "metadata": {}, "outputs": [ @@ -806,7 +805,7 @@ "(824, 13)" ] }, - "execution_count": 162, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } @@ -818,7 +817,7 @@ }, { "cell_type": "code", - "execution_count": 163, + "execution_count": 62, "id": "0ef0fcc0", "metadata": {}, "outputs": [], @@ -854,7 +853,7 @@ }, { "cell_type": "code", - "execution_count": 164, + "execution_count": 63, "id": "e130aae5", "metadata": {}, "outputs": [], @@ -864,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 165, + "execution_count": 64, "id": "c39e2ad0", "metadata": {}, "outputs": [ @@ -1812,7 +1811,7 @@ }, { "cell_type": "code", - "execution_count": 166, + "execution_count": 65, "id": "a16215ab", "metadata": {}, "outputs": [], @@ -1822,7 +1821,7 @@ }, { "cell_type": "code", - "execution_count": 167, + "execution_count": 66, "id": "532ca6c4", "metadata": {}, "outputs": [ @@ -2765,7 +2764,7 @@ }, { "cell_type": "code", - "execution_count": 168, + "execution_count": 67, "id": "b8530717", "metadata": {}, "outputs": [], @@ -2786,7 +2785,7 @@ }, { "cell_type": "code", - "execution_count": 169, + "execution_count": 68, "id": "4ff3847d", "metadata": {}, "outputs": [], @@ -2815,7 +2814,7 @@ }, { "cell_type": "code", - "execution_count": 170, + "execution_count": 69, "id": "6a1c7907", "metadata": {}, "outputs": [], @@ -2823,7 +2822,9 @@ "X = data_model_preprocessed = vars_numeriques_scaled.merge(vars_categorielles_enc, left_index=True, right_index=True) # type: ignore\n", "Y = data_model[\"CM\"]\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)" + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, Y, test_size=0.2, random_state=42\n", + ")" ] }, { @@ -2844,14 +2845,14 @@ }, { "cell_type": "code", - "execution_count": 171, + "execution_count": 70, "id": "053e013c", "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "DecisionTreeRegressor()" ] }, - "execution_count": 171, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } @@ -3292,7 +3293,7 @@ }, { "cell_type": "code", - "execution_count": 172, + "execution_count": 71, "id": "c4ca2cf9", "metadata": {}, "outputs": [ @@ -3321,7 +3322,7 @@ }, { "cell_type": "code", - "execution_count": 173, + "execution_count": 72, "id": "4b739d5b", "metadata": {}, "outputs": [ @@ -3329,9 +3330,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "MAE: 5950.05\n", - "MSE: 160067768.70\n", - "RMSE: 12651.79\n" + "MAE: 5124.14\n", + "MSE: 84535204.52\n", + "RMSE: 9194.30\n" ] } ], @@ -3355,6 +3356,20 @@ "**Question :** que pensez-vous des performances de ce modèle ?" ] }, + { + "cell_type": "markdown", + "id": "bdd7ccd6", + "metadata": {}, + "source": [ + "*Réponse*: \n", + "\n", + "Erreur Absolue Moyenne (MAE)\n", + "La MAE représente l'écart absolu moyen entre les prédictions du modèle et les valeurs réelles. Une MAE de 5950.05 signifie qu'en moyenne, notre modèle commet une erreur de cette magnitude, dans l'unité de la variable cible. C'est l'indicateur le plus direct de l'erreur de prédiction moyenne.\n", + "\n", + "Racine de l'Erreur Quadratique Moyenne (RMSE)\n", + "La RMSE est la racine carrée de la moyenne des erreurs au carré ($RMSE = \\sqrt{MSE}$). En raison de l'opération de mise au carré, cette métrique est particulièrement sensible aux grandes erreurs. La valeur obtenue est de 12651.79." + ] + }, { "cell_type": "markdown", "id": "7ecba832", @@ -3393,11 +3408,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 73, "id": "ab1e1367", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "X = data_model_preprocessed\n", + "Y = data_model[\"CM\"]\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(\n", + " X, Y, test_size=0.2, random_state=42\n", + ")" + ] }, { "cell_type": "markdown", @@ -3417,12 +3439,12 @@ }, { "cell_type": "code", - "execution_count": 174, + "execution_count": 74, "id": "b515460e", "metadata": {}, "outputs": [], "source": [ - "#Initialisation\n", + "# Initialisation\n", "# Nombre de sous-échantillons pour la cross-validation\n", "num_splits = 5\n", "\n", @@ -3440,20 +3462,56 @@ }, { "cell_type": "code", - "execution_count": 175, + "execution_count": 75, "id": "eebb394f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validation croisée terminée avec 5 folds\n" + ] + } + ], "source": [ - "# Entrainement avec cross-validation\n" + "# Entrainement avec cross-validation\n", + "for train_index, val_index in kf.split(X_train):\n", + " X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]\n", + " y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]\n", + "\n", + " rf_regressor.fit(X_train_fold, y_train_fold)\n", + " y_pred_fold = rf_regressor.predict(X_val_fold)\n", + "\n", + " mae = metrics.mean_absolute_error(y_val_fold, y_pred_fold)\n", + " mse = metrics.mean_squared_error(y_val_fold, y_pred_fold)\n", + " rmse = metrics.root_mean_squared_error(y_val_fold, y_pred_fold)\n", + "\n", + " MAE_scores.append(mae)\n", + " MSE_scores.append(mse)\n", + " RMSE_scores.append(rmse)\n", + "\n", + "print(f\"Validation croisée terminée avec {len(MAE_scores)} folds\")" ] }, { "cell_type": "code", - "execution_count": 176, + "execution_count": 76, "id": "b067126c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 MAE: 4472.5486946969695\n", + "Fold 2 MAE: 3859.4743234848484\n", + "Fold 3 MAE: 3633.0231541666662\n", + "Fold 4 MAE: 3888.3910715909087\n", + "Fold 5 MAE: 4808.59621832061\n" + ] + } + ], "source": [ "# Métriques sur tous les folds\n", "\n", @@ -3464,10 +3522,22 @@ }, { "cell_type": "code", - "execution_count": 177, + "execution_count": 77, "id": "6597152c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 MSE: 85464414.44080053\n", + "Fold 2 MSE: 34396997.21755034\n", + "Fold 3 MSE: 55184512.50786593\n", + "Fold 4 MSE: 33191300.80751679\n", + "Fold 5 MSE: 68739370.63588645\n" + ] + } + ], "source": [ "#MSE\n", "for fold, mse in enumerate(MSE_scores, start=1):\n", @@ -3476,10 +3546,22 @@ }, { "cell_type": "code", - "execution_count": 178, + "execution_count": 78, "id": "63ff1c9d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 RMSE: 9244.696557529649\n", + "Fold 2 RMSE: 5864.895328780415\n", + "Fold 3 RMSE: 7428.62790210049\n", + "Fold 4 RMSE: 5761.189183451346\n", + "Fold 5 RMSE: 8290.9209763383\n" + ] + } + ], "source": [ "#RMSE\n", "for fold, rmse in enumerate(RMSE_scores, start=1):\n", @@ -3536,7 +3618,7 @@ }, { "cell_type": "code", - "execution_count": 179, + "execution_count": 79, "id": "6d58dbc2", "metadata": {}, "outputs": [], @@ -3569,7 +3651,7 @@ }, { "cell_type": "code", - "execution_count": 180, + "execution_count": 80, "id": "47da5172", "metadata": {}, "outputs": [], @@ -3579,7 +3661,7 @@ }, { "cell_type": "code", - "execution_count": 181, + "execution_count": 81, "id": "d4936c46", "metadata": {}, "outputs": [ @@ -3600,7 +3682,7 @@ }, { "cell_type": "code", - "execution_count": 182, + "execution_count": 82, "id": "3215c463", "metadata": {}, "outputs": [], @@ -3614,7 +3696,7 @@ }, { "cell_type": "code", - "execution_count": 183, + "execution_count": 83, "id": "bb9a5c9b", "metadata": {}, "outputs": [], @@ -3626,7 +3708,7 @@ }, { "cell_type": "code", - "execution_count": 184, + "execution_count": 84, "id": "0f0768ad", "metadata": {}, "outputs": [],