Refactor code for improved readability and consistency across multiple Jupyter notebooks

- Added trailing commas to multi-line print statements and function calls for consistent formatting.
- Reformatted code for clarity, breaking long calls into one argument per line and aligning parameters.
- Updated function signatures so sigma parameters are typed as float rather than int, for better precision.
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
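
The sigma typing change is not visible in the hunks below; as a minimal sketch of the pattern, assuming a hypothetical gaussian_kernel helper (the name and signature are illustrative, not taken from the notebooks):

    import numpy as np

    # Before (hypothetical): sigma annotated as int, which misleadingly
    # rules out fractional bandwidths such as sigma=0.5.
    # def gaussian_kernel(x: np.ndarray, sigma: int = 1) -> np.ndarray: ...

    # After: sigma annotated as float, matching how the parameter is used.
    def gaussian_kernel(x: np.ndarray, sigma: float = 1.0) -> np.ndarray:
        # Gaussian weights; sigma controls the kernel bandwidth.
        return np.exp(-(x**2) / (2.0 * sigma**2))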
2025-12-13 23:38:17 +01:00
parent f89ff4a016
commit d5a6bfd339
50 changed files with 779 additions and 449 deletions


@@ -115,6 +115,7 @@
 "import numpy as np\n",
 "import pandas as pd\n",
 "import seaborn as sns\n",
+"\n",
 "from catboost import CatBoostClassifier, Pool\n",
 "from sklearn.metrics import (\n",
 "    classification_report,\n",
@@ -163,10 +164,10 @@
 "\n",
 "print(\"=== Loading the Adult Income dataset ===\\n\")\n",
 "print(\n",
-"    \"Classic Kaggle/UCI dataset that perfectly illustrates CatBoost's strengths\"\n",
+"    \"Classic Kaggle/UCI dataset that perfectly illustrates CatBoost's strengths\",\n",
 ")\n",
 "print(\n",
-"    \"Goal: predict whether annual income > $50K from socio-demographic features\\n\"\n",
+"    \"Goal: predict whether annual income > $50K from socio-demographic features\\n\",\n",
 ")\n",
 "\n",
 "# Load from UCI\n",
@@ -192,7 +193,11 @@
 "\n",
 "try:\n",
 "    df = pd.read_csv(\n",
-"        url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
+"        url,\n",
+"        names=column_names,\n",
+"        sep=r\",\\s*\",\n",
+"        engine=\"python\",\n",
+"        na_values=\"?\",\n",
 "    )\n",
 "    print(\"Dataset loaded from the UCI repository\")\n",
 "except:  # noqa: E722\n",
@@ -301,7 +306,7 @@
 "                n_samples,\n",
 "                p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
 "            ),\n",
-"        }\n",
+"        },\n",
 "    )\n",
 "\n",
 "    # Create the target with realistic logic\n",
@@ -647,19 +652,25 @@
 "\n",
 "# Rate of income >50K per category\n",
 "df.groupby(\"education\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 0], color=\"skyblue\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 0],\n",
+"    color=\"skyblue\",\n",
 ")\n",
 "axes[0, 0].set_title(\"Rate of income >50K by education level\")\n",
 "axes[0, 0].set_xlabel(\"Rate\")\n",
 "\n",
 "df.groupby(\"occupation\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 1], color=\"lightcoral\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 1],\n",
+"    color=\"lightcoral\",\n",
 ")\n",
 "axes[0, 1].set_title(\"Rate of income >50K by occupation\")\n",
 "axes[0, 1].set_xlabel(\"Rate\")\n",
 "\n",
 "df.groupby(\"marital_status\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 2], color=\"lightgreen\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 2],\n",
+"    color=\"lightgreen\",\n",
 ")\n",
 "axes[0, 2].set_title(\"Rate of income >50K by marital status\")\n",
 "axes[0, 2].set_xlabel(\"Rate\")\n",
@@ -758,7 +769,11 @@
 "\n",
 "# Stratified train/test split\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
-"    X, y, test_size=0.2, random_state=42, stratify=y\n",
+"    X,\n",
+"    y,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
+"    stratify=y,\n",
 ")"
 ]
 },
@@ -1020,7 +1035,7 @@
 "feature_names = X_train.columns\n",
 "\n",
 "importance_df = pd.DataFrame(\n",
-"    {\"feature\": feature_names, \"importance\": feature_importance}\n",
+"    {\"feature\": feature_names, \"importance\": feature_importance},\n",
 ").sort_values(\"importance\", ascending=False)\n",
 "\n",
 "print(importance_df)\n",
@@ -1283,7 +1298,10 @@
 "y_reg = df[\"montant_defaut\"]\n",
 "\n",
 "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n",
-"    X_reg, y_reg, test_size=0.2, random_state=42\n",
+"    X_reg,\n",
+"    y_reg,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
 ")\n",
 "\n",
 "# Pools\n",
@@ -1292,7 +1310,11 @@
 "\n",
 "# Model\n",
 "model_reg = CatBoostRegressor(\n",
-"    iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=100\n",
+"    iterations=500,\n",
+"    learning_rate=0.1,\n",
+"    depth=6,\n",
+"    random_seed=42,\n",
+"    verbose=100,\n",
 ")\n",
 "\n",
 "model_reg.fit(train_pool_reg, eval_set=test_pool_reg)\n",
@@ -1363,7 +1385,7 @@
 "feature_names = X_reg.columns\n",
 "\n",
 "importance_df = pd.DataFrame(\n",
-"    {\"feature\": feature_names, \"importance\": feature_importance}\n",
+"    {\"feature\": feature_names, \"importance\": feature_importance},\n",
 ").sort_values(\"importance\", ascending=False)\n",
 "\n",
 "print(importance_df)\n",
@@ -1580,7 +1602,7 @@
 "# Mean SHAP importance\n",
 "shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
 "shap_df = pd.DataFrame(\n",
-"    {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
+"    {\"feature\": X_train.columns, \"shap_importance\": shap_importance},\n",
 ").sort_values(\"shap_importance\", ascending=False)\n",
 "\n",
 "print(\"\\nMean SHAP importance:\")\n",