Refactor code for improved readability and consistency across multiple Jupyter notebooks

- Added trailing commas to multi-line function calls and print statements for consistent formatting.
- Reformatted code for clarity, breaking long lines and aligning parameters.
- Updated function signatures to use float instead of int for sigma parameters, for better precision.
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
2025-12-13 23:38:17 +01:00
parent f89ff4a016
commit d5a6bfd339
50 changed files with 779 additions and 449 deletions


@@ -58,14 +58,15 @@
"import seaborn as sns\n",
"\n",
"sns.set()\n",
"import plotly.express as px\n",
"\n",
"# Machine Learning\n",
"import sklearn.preprocessing as preproc\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"import plotly.express as px\n",
"\n",
"# Statistiques\n",
"from scipy.stats import chi2_contingency\n",
"\n",
"# Machine Learning\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn import metrics\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.model_selection import (\n",
@@ -93,7 +94,7 @@
"source": [
"def cramers_V(var1, var2):\n",
" crosstab = np.array(\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
" ) # Cross table building\n",
" stat = chi2_contingency(crosstab)[\n",
" 0\n",
@@ -16171,7 +16172,9 @@
"source": [
"# Observation de la distribution\n",
"fig = px.histogram(\n",
" data_model, x=\"SINISTRE\", title=\"Distribution de la variable 'sinistré'\"\n",
" data_model,\n",
" x=\"SINISTRE\",\n",
" title=\"Distribution de la variable 'sinistré'\",\n",
")\n",
"fig.show()"
]
@@ -16221,17 +16224,15 @@
"for col in data_set.columns:\n",
" if True in data_model[col].isna().unique():\n",
" variables_na.append(data_model[col])\n",
" else:\n",
" if str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_numeriques.append(data_model[col])\n",
" elif str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_categorielles.append(data_model[col])"
" variables_numeriques.append(data_model[col])\n",
" elif len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_categorielles.append(data_model[col])"
]
},
{
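
Two observations on this rewritten cascade: the loop iterates data_set.columns but indexes data_model (presumably the two frames share columns), and the final elif and else branches are identical, both appending to variables_categorielles. Assuming that redundancy is intentional, the whole block collapses to a flatter equivalent sketch:

    # Equivalent, flatter classification of each column
    for col in data_model.columns:  # assuming data_set.columns was meant to match
        serie = data_model[col]
        if serie.isna().any():
            variables_na.append(serie)  # columns with missing values
        elif str(serie.dtypes) in ["int32", "int64", "float64"] and serie.nunique() != 2:
            variables_numeriques.append(serie)  # non-binary numeric columns
        else:
            # binary numeric columns and all remaining columns are categorical
            variables_categorielles.append(serie)
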
@@ -16631,7 +16632,8 @@
" col = []\n",
" for var2 in vars_categorielles:\n",
" cramers = cramers_V(\n",
" vars_categorielles[var1], vars_categorielles[var2]\n",
" vars_categorielles[var1],\n",
" vars_categorielles[var2],\n",
" ) # V de Cramer\n",
" col.append(round(cramers, 2)) # arrondi du résultat\n",
" rows.append(col)\n",
@@ -16658,7 +16660,7 @@
" for j in range(i + 1, v_cramer_resultats.shape[0]):\n",
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
" print(\n",
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\",\n",
" )"
]
},
@@ -16856,7 +16858,7 @@
" for j in range(i + 1, nb_variables):\n",
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
" print(\n",
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\",\n",
" )"
]
},
@@ -17520,7 +17522,7 @@
"# One hot encoding des variables catégorielles\n",
"preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
"preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
" vars_categorielles\n",
" vars_categorielles,\n",
")\n",
"\n",
"variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
@@ -17704,7 +17706,8 @@
"\n",
"vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
"vars_numeriques_scaled = pd.DataFrame(\n",
" vars_numeriques_scaled, columns=vars_numeriques.columns\n",
" vars_numeriques_scaled,\n",
" columns=vars_numeriques.columns,\n",
")\n",
"vars_numeriques_scaled.head()"
]
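
The fit step for preproc_scale sits above this hunk and is not shown; the usual pattern looks like this (a sketch assuming a StandardScaler, since the actual scaler class isn't visible in the diff):

    # Fit on the numeric variables, transform, and restore column labels
    preproc_scale = preproc.StandardScaler().fit(vars_numeriques)
    vars_numeriques_scaled = pd.DataFrame(
        preproc_scale.transform(vars_numeriques),
        columns=vars_numeriques.columns,
        index=vars_numeriques.index,
    )
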
@@ -17756,7 +17759,9 @@
"outputs": [],
"source": [
"X_global = vars_numeriques_scaled.merge(\n",
" variables_categorielles_ohe, left_index=True, right_index=True\n",
" variables_categorielles_ohe,\n",
" left_index=True,\n",
" right_index=True,\n",
")\n",
"# Réorganisation des données\n",
"X = X_global.to_numpy()\n",
@@ -17772,7 +17777,11 @@
"source": [
"# Sampling en 80% train et 20% test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=0.2, random_state=42, stratify=Y\n",
" X,\n",
" Y,\n",
" test_size=0.2,\n",
" random_state=42,\n",
" stratify=Y,\n",
")"
]
},
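
stratify=Y keeps the class ratio of the target identical across both splits, which matters for an imbalanced variable like SINISTRE. A quick check (sketch, assuming Y is encoded as 0/1 integers):

    # Class proportions should match across the full set and both splits
    for name, arr in [("full", Y), ("train", y_train), ("test", y_test)]:
        print(name, np.bincount(arr) / len(arr))
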
@@ -17824,7 +17833,9 @@
" estimator=gbc,\n",
" param_grid=param_grid,\n",
" cv=StratifiedKFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" n_splits=num_folds,\n",
" shuffle=True,\n",
" random_state=42,\n",
" ), # Validation croisée avec 5 folds\n",
" scoring=\"recall\", # Métrique d'évaluation (moins c'est mieux)\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -17884,7 +17895,11 @@
"source": [
"# Recall de chaque fold\n",
"recall_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"recall\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -17893,7 +17908,11 @@
"\n",
"# Accuracy de chaque fold\n",
"accuracy_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"accuracy\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -17903,7 +17922,11 @@
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"precision\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52512,7 +52535,9 @@
"# Observation de la distribution sur Y_train\n",
"df = pd.DataFrame(y_train_resampled, columns=[\"SINISTRE\"])\n",
"fig = px.histogram(\n",
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
" df,\n",
" x=\"SINISTRE\",\n",
" title=\"Distribution de la variable Y_train_resampled\",\n",
")\n",
"fig.show()"
]
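
The resampling step itself is not in this hunk; given the RandomOverSampler import at the top of the notebook, the usual pattern is (a sketch, not necessarily the notebook's exact call):

    from imblearn.over_sampling import RandomOverSampler

    # Duplicate minority-class rows until the classes are balanced;
    # resampling only the training split avoids leaking into the test set
    ros = RandomOverSampler(random_state=42)
    X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)
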
@@ -52565,7 +52590,9 @@
" estimator=gb,\n",
" param_grid=param_grid,\n",
" cv=StratifiedKFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" n_splits=num_folds,\n",
" shuffle=True,\n",
" random_state=42,\n",
" ), # Validation croisée stratifiée avec 5 plis\n",
" scoring=\"recall\", # Métrique d'évaluation\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -52618,7 +52645,11 @@
"# Zoom sur la CV\n",
"# Recall de chaque fold\n",
"recall_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"recall\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"recall\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52627,7 +52658,11 @@
"\n",
"# Accuracy de chaque fold\n",
"accuracy_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"accuracy\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"accuracy\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52637,7 +52672,11 @@
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"precision\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"precision\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -53146,7 +53185,8 @@
"# Matrice de confusion\n",
"confusion_matrix = metrics.confusion_matrix(y_test, y_pred)\n",
"cm_display = metrics.ConfusionMatrixDisplay(\n",
" confusion_matrix=confusion_matrix, display_labels=[False, True]\n",
" confusion_matrix=confusion_matrix,\n",
" display_labels=[False, True],\n",
")\n",
"\n",
"cm_display.plot()"