mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-30 07:28:39 +01:00
Refactor code for improved readability and consistency across multiple Jupyter notebooks
- Added missing commas in various print statements and function calls for better syntax.
- Reformatted code to enhance clarity, including breaking long lines and aligning parameters.
- Updated function signatures to use float type for sigma parameters instead of int for better precision.
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
This commit is contained in:
@@ -115,6 +115,7 @@
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import seaborn as sns\n",
|
||||
"\n",
|
||||
"from catboost import CatBoostClassifier, Pool\n",
|
||||
"from sklearn.metrics import (\n",
|
||||
" classification_report,\n",
|
||||
@@ -163,10 +164,10 @@
|
||||
"\n",
|
||||
"print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
|
||||
"print(\n",
|
||||
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\"\n",
|
||||
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\",\n",
|
||||
")\n",
|
||||
"print(\n",
|
||||
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\"\n",
|
||||
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Chargement depuis UCI\n",
|
||||
@@ -192,7 +193,11 @@
|
||||
"\n",
|
||||
"try:\n",
|
||||
" df = pd.read_csv(\n",
|
||||
" url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
|
||||
" url,\n",
|
||||
" names=column_names,\n",
|
||||
" sep=r\",\\s*\",\n",
|
||||
" engine=\"python\",\n",
|
||||
" na_values=\"?\",\n",
|
||||
" )\n",
|
||||
" print(\"Dataset chargé depuis UCI repository\")\n",
|
||||
"except: # noqa: E722\n",
|
||||
@@ -301,7 +306,7 @@
|
||||
" n_samples,\n",
|
||||
" p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
|
||||
" ),\n",
|
||||
" }\n",
|
||||
" },\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" # Création de la cible avec logique réaliste\n",
|
||||
@@ -647,19 +652,25 @@
|
||||
"\n",
|
||||
"# Taux de revenu >50K par catégorie\n",
|
||||
"df.groupby(\"education\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
|
||||
" kind=\"barh\", ax=axes[0, 0], color=\"skyblue\"\n",
|
||||
" kind=\"barh\",\n",
|
||||
" ax=axes[0, 0],\n",
|
||||
" color=\"skyblue\",\n",
|
||||
")\n",
|
||||
"axes[0, 0].set_title(\"Taux de revenu >50K par niveau d'éducation\")\n",
|
||||
"axes[0, 0].set_xlabel(\"Taux\")\n",
|
||||
"\n",
|
||||
"df.groupby(\"occupation\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
|
||||
" kind=\"barh\", ax=axes[0, 1], color=\"lightcoral\"\n",
|
||||
" kind=\"barh\",\n",
|
||||
" ax=axes[0, 1],\n",
|
||||
" color=\"lightcoral\",\n",
|
||||
")\n",
|
||||
"axes[0, 1].set_title(\"Taux de revenu >50K par occupation\")\n",
|
||||
"axes[0, 1].set_xlabel(\"Taux\")\n",
|
||||
"\n",
|
||||
"df.groupby(\"marital_status\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
|
||||
" kind=\"barh\", ax=axes[0, 2], color=\"lightgreen\"\n",
|
||||
" kind=\"barh\",\n",
|
||||
" ax=axes[0, 2],\n",
|
||||
" color=\"lightgreen\",\n",
|
||||
")\n",
|
||||
"axes[0, 2].set_title(\"Taux de revenu >50K par statut marital\")\n",
|
||||
"axes[0, 2].set_xlabel(\"Taux\")\n",
|
||||
@@ -758,7 +769,11 @@
|
||||
"\n",
|
||||
"# Split train/test stratifié\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, y, test_size=0.2, random_state=42, stratify=y\n",
|
||||
" X,\n",
|
||||
" y,\n",
|
||||
" test_size=0.2,\n",
|
||||
" random_state=42,\n",
|
||||
" stratify=y,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -1020,7 +1035,7 @@
|
||||
"feature_names = X_train.columns\n",
|
||||
"\n",
|
||||
"importance_df = pd.DataFrame(\n",
|
||||
" {\"feature\": feature_names, \"importance\": feature_importance}\n",
|
||||
" {\"feature\": feature_names, \"importance\": feature_importance},\n",
|
||||
").sort_values(\"importance\", ascending=False)\n",
|
||||
"\n",
|
||||
"print(importance_df)\n",
|
||||
@@ -1283,7 +1298,10 @@
|
||||
"y_reg = df[\"montant_defaut\"]\n",
|
||||
"\n",
|
||||
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n",
|
||||
" X_reg, y_reg, test_size=0.2, random_state=42\n",
|
||||
" X_reg,\n",
|
||||
" y_reg,\n",
|
||||
" test_size=0.2,\n",
|
||||
" random_state=42,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Pools\n",
|
||||
@@ -1292,7 +1310,11 @@
|
||||
"\n",
|
||||
"# Modèle\n",
|
||||
"model_reg = CatBoostRegressor(\n",
|
||||
" iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=100\n",
|
||||
" iterations=500,\n",
|
||||
" learning_rate=0.1,\n",
|
||||
" depth=6,\n",
|
||||
" random_seed=42,\n",
|
||||
" verbose=100,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model_reg.fit(train_pool_reg, eval_set=test_pool_reg)\n",
|
||||
@@ -1363,7 +1385,7 @@
|
||||
"feature_names = X_reg.columns\n",
|
||||
"\n",
|
||||
"importance_df = pd.DataFrame(\n",
|
||||
" {\"feature\": feature_names, \"importance\": feature_importance}\n",
|
||||
" {\"feature\": feature_names, \"importance\": feature_importance},\n",
|
||||
").sort_values(\"importance\", ascending=False)\n",
|
||||
"\n",
|
||||
"print(importance_df)\n",
|
||||
@@ -1580,7 +1602,7 @@
|
||||
"# Importance SHAP moyenne\n",
|
||||
"shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
|
||||
"shap_df = pd.DataFrame(\n",
|
||||
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
|
||||
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance},\n",
|
||||
").sort_values(\"shap_importance\", ascending=False)\n",
|
||||
"\n",
|
||||
"print(\"\\nImportance SHAP moyenne :\")\n",
|
||||
|
||||
Reference in New Issue
Block a user