Refactor code formatting and improve readability in Jupyter notebooks for TP_4 and TP_5

- Adjusted indentation and line breaks for better clarity in function definitions and import statements.
- Standardized string quotes for consistency across the codebase.
- Enhanced readability of DataFrame creation and manipulation by breaking long lines into multiple lines.
- Cleaned up print statements and comments for improved understanding.
- Ensured consistent use of whitespace around operators and after commas.
This commit is contained in:
2025-11-25 10:46:16 +01:00
parent 751412c1cd
commit e57995ba85
17 changed files with 11975 additions and 11713 deletions

View File

@@ -69,11 +69,11 @@
"from sklearn import metrics\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.model_selection import (\n",
" GridSearchCV,\n",
" StratifiedKFold,\n",
" cross_val_score,\n",
" train_test_split,\n",
")\n"
" GridSearchCV,\n",
" StratifiedKFold,\n",
" cross_val_score,\n",
" train_test_split,\n",
")"
]
},
{
@@ -91,12 +91,18 @@
"metadata": {},
"outputs": [],
"source": [
"def cramers_V(var1,var2) :\n",
" crosstab = np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building\n",
" stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test\n",
" obs = np.sum(crosstab) # Number of observations\n",
" mini = min(crosstab.shape)-1 # Take the minimum value between the colmns and the rows of the cross table\n",
" return (stat/(obs*mini))"
"def cramers_V(var1, var2):\n",
" crosstab = np.array(\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
" ) # Cross table building\n",
" stat = chi2_contingency(crosstab)[\n",
" 0\n",
" ] # Keeping of the test statistic of the Chi2 test\n",
" obs = np.sum(crosstab) # Number of observations\n",
" mini = (\n",
" min(crosstab.shape) - 1\n",
"    )  # Take the minimum value between the columns and the rows of the cross table\n",
" return stat / (obs * mini)"
]
},
{
@@ -133,7 +139,7 @@
"metadata": {},
"outputs": [],
"source": [
"path = input_path + '/base_retraitee.csv'\n",
"path = input_path + \"/base_retraitee.csv\"\n",
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")"
]
},
@@ -16225,7 +16231,7 @@
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_categorielles.append(data_model[col])\n"
" variables_categorielles.append(data_model[col])"
]
},
{
@@ -16653,7 +16659,7 @@
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
" print(\n",
"            f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.columns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
" )\n"
" )"
]
},
{
@@ -16851,7 +16857,7 @@
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
" print(\n",
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
" )\n"
" )"
]
},
{
@@ -17820,7 +17826,7 @@
" cv=StratifiedKFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" ), # Validation croisée avec 5 folds\n",
" scoring='recall', # Métrique d'évaluation (moins c'est mieux)\n",
"    scoring=\"recall\",  # Métrique d'évaluation (plus c'est mieux)\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
")\n",
"\n",
@@ -17877,14 +17883,18 @@
],
"source": [
"# Recall de chaque fold\n",
"recall_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='recall')\n",
"recall_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
"for i, score in enumerate(recall_scores):\n",
" print(f\"Recall pour le fold {i + 1}: {score}\")\n",
"\n",
"# Accuracy de chaque fold\n",
"accuracy_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='accuracy')\n",
"accuracy_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
"print(\"\\n\")\n",
@@ -17892,12 +17902,14 @@
" print(f\"Accuracy pour le fold {i + 1}: {score}\")\n",
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='precision')\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
"print(\"\\n\")\n",
"for i, score in enumerate(precision_scores):\n",
" print(f\"Precision pour le fold {i + 1}: {score}\")\n"
" print(f\"Precision pour le fold {i + 1}: {score}\")"
]
},
{
@@ -30178,7 +30190,7 @@
"# Observation de la distribution sur Y_train\n",
"df = pd.DataFrame(y_train, columns=[\"SINISTRE\"])\n",
"fig = px.histogram(df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train\")\n",
"fig.show()\n"
"fig.show()"
]
},
{
@@ -52502,7 +52514,7 @@
"fig = px.histogram(\n",
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
")\n",
"fig.show()\n"
"fig.show()"
]
},
{
@@ -52530,7 +52542,7 @@
"num_folds = 5\n",
"\n",
"# Initialisation du modèle GradientBoostingClassifier\n",
"gb = GradientBoostingClassifier(random_state=42)\n"
"gb = GradientBoostingClassifier(random_state=42)"
]
},
{
@@ -52567,7 +52579,7 @@
"print(\"Meilleurs hyperparamètres : \", best_params)\n",
"\n",
"# Initialiser un modèle avec les meilleurs hyperparamètres\n",
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)\n"
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)"
]
},
{