mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-28 18:56:02 +01:00
Refactor code for improved readability and consistency across multiple Jupyter notebooks
- Added missing commas in various print statements and function calls for better syntax.
- Reformatted code to enhance clarity, including breaking long lines and aligning parameters.
- Updated function signatures to use float type for sigma parameters instead of int for better precision.
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
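For context, the pattern applied throughout this diff is the "magic trailing comma" convention used by formatters such as Black and Ruff (the commit does not name its tool, so that attribution is an assumption): once a call carries a trailing comma, the formatter keeps one argument per line. A hypothetical sketch, with an invented function that also shows the float-typed sigma mentioned above:

import math


def gaussian(x: float, mu: float = 0.0, sigma: float = 1.0) -> float:
    # sigma is declared float rather than int, matching the signature fix described above
    return math.exp(-((x - mu) ** 2) / (2 * sigma**2)) / (sigma * math.sqrt(2 * math.pi))


# Compact call: a formatter leaves this on one line.
y1 = gaussian(0.5, 0.0, 1.0)

# With a trailing comma, the same call stays exploded, one argument per line:
y2 = gaussian(
    0.5,
    0.0,
    1.0,
)
assert y1 == y2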
@@ -144,7 +144,8 @@
 ],
 "source": [
 "salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\n",
-"    \"XX\", \"Mardi 19 septembre\"\n",
+"    \"XX\",\n",
+"    \"Mardi 19 septembre\",\n",
 ")\n",
 "print(salutation)"
 ]
@@ -259,7 +260,7 @@
 ],
 "source": [
 "a = 2\n",
-"if 5 > a:\n",
+"if a < 5:\n",
 "    print(\"Cinq!\")\n",
 "else:\n",
 "    print(\"a!\")\n",
@@ -2612,11 +2613,10 @@
 "            binaires.append(col)\n",
 "        else:\n",
 "            quantitatives.append(col)\n",
-"    else:\n",
-"        if len(data_set[col].dropna().unique()) == 2:\n",
-"            binaires.append(col)\n",
-"        else:\n",
-"            categorielles.append(col)\n",
+"    elif len(data_set[col].dropna().unique()) == 2:\n",
+"        binaires.append(col)\n",
+"    else:\n",
+"        categorielles.append(col)\n",
 "\n",
 "print(\"Variables quantitatives :\", quantitatives)\n",
 "print(\"\\nVariables catégorielles :\", categorielles)\n",
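The hunk above collapses a nested `else: if ...` into `elif`, the classic flattening refactor. A self-contained sketch of the same classification logic on a toy frame (column names invented; `pd.api.types.is_numeric_dtype` stands in for the notebook's dtype-string check):

import pandas as pd

df = pd.DataFrame(
    {
        "age": [21, 34, 55, 41],          # quantitative
        "deux_roues": [0, 1, 0, 1],       # binary, numeric dtype
        "region": ["A", "B", "A", "C"],   # categorical
    }
)

quantitatives, binaires, categorielles = [], [], []
for col in df.columns:
    if pd.api.types.is_numeric_dtype(df[col]):
        if len(df[col].dropna().unique()) == 2:
            binaires.append(col)
        else:
            quantitatives.append(col)
    elif len(df[col].dropna().unique()) == 2:
        binaires.append(col)
    else:
        categorielles.append(col)

print(quantitatives, binaires, categorielles)  # ['age'] ['deux_roues'] ['region']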
@@ -3527,7 +3527,7 @@
 "source": [
 "fig = px.histogram(data_set.sort_values(\"ANNEE_CTR\"), x=\"ANNEE_CTR\")\n",
 "fig.update_xaxes(\n",
-"    type=\"category\"\n",
+"    type=\"category\",\n",
 ") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
 "\n",
 "fig.show()"
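`update_xaxes(type="category")`, which recurs in several hunks below, is what stops Plotly from treating a numeric year column as a continuous axis. A minimal runnable sketch on toy data:

import pandas as pd
import plotly.express as px

df = pd.DataFrame({"ANNEE_CTR": [2019, 2021, 2019, 2023, 2021, 2021]})

fig = px.histogram(df.sort_values("ANNEE_CTR"), x="ANNEE_CTR")
fig.update_xaxes(
    type="category",  # discrete ticks: one bar per year, no empty 2020/2022 bins
)
fig.show()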
@@ -18662,7 +18662,7 @@
 "    data_set,\n",
 "    x=\"CONTRAT_ANCIENNETE\",\n",
 "    category_orders={\n",
-"        \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"]\n",
+"        \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"],\n",
 "    },\n",
 ")\n",
 "fig.show()"
@@ -48902,7 +48902,7 @@
 "    data_set,\n",
 "    x=\"GROUPE_KM\",\n",
 "    category_orders={\n",
-"        \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"]\n",
+"        \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"],\n",
 "    },\n",
 ")\n",
 "fig.show()"
@@ -64021,7 +64021,7 @@
 "# Ecrivez votre code ici\n",
 "fig = px.histogram(data_set.sort_values(\"ZONE_RISQUE\"), x=\"ZONE_RISQUE\")\n",
 "fig.update_xaxes(\n",
-"    type=\"category\"\n",
+"    type=\"category\",\n",
 ") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
 "\n",
 "fig.show()"
@@ -64877,10 +64877,11 @@
 "source": [
 "# Ecrivez votre code ici\n",
 "fig = px.histogram(\n",
-"    data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"), x=\"AGE_ASSURE_PRINCIPAL\"\n",
+"    data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"),\n",
+"    x=\"AGE_ASSURE_PRINCIPAL\",\n",
 ")\n",
 "fig.update_xaxes(\n",
-"    type=\"category\"\n",
+"    type=\"category\",\n",
 ") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
 "\n",
 "fig.show()"
@@ -79999,7 +80000,7 @@
 "# Ecrivez votre code ici\n",
 "fig = px.histogram(data_set.sort_values(\"GENRE\"), x=\"GENRE\")\n",
 "fig.update_xaxes(\n",
-"    type=\"category\"\n",
+"    type=\"category\",\n",
 ") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
 "\n",
 "fig.show()"
@@ -80064,7 +80065,8 @@
 "    [\n",
 "        data_h,\n",
 "        pd.DataFrame(\n",
-"            [[13, \"M\", 0]], columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"]\n",
+"            [[13, \"M\", 0]],\n",
+"            columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"],\n",
 "        ),\n",
 "    ],\n",
 "    ignore_index=True,\n",
@@ -82329,17 +82331,17 @@
 "# ANNEE_CONSTRUCTION,VALEUR_DU_BIEN,DEUXIEME_CONDUCTEUR)\n",
 "\n",
 "data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(\n",
-"    data_retraitee[\"GROUPE_KM\"].mode()[0]\n",
+"    data_retraitee[\"GROUPE_KM\"].mode()[0],\n",
 ")\n",
 "data_retraitee[\"GENRE\"] = data_retraitee[\"GENRE\"].fillna(\"M\")\n",
 "data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(\n",
-"    data_retraitee[\"ANNEE_CONSTRUCTION\"].median()\n",
+"    data_retraitee[\"ANNEE_CONSTRUCTION\"].median(),\n",
 ")\n",
 "data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(\n",
-"    data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0]\n",
+"    data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0],\n",
 ")\n",
 "data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(\n",
-"    False\n",
+"    False,\n",
 ")"
 ]
 },
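The hunk above is the usual imputation triad: mode for a categorical column, median for a skewed numeric, a constant default for a boolean. A minimal sketch on an invented frame:

import pandas as pd

df = pd.DataFrame(
    {
        "GROUPE_KM": ["[0;20000[", None, "[0;20000[", "[20000;40000["],
        "ANNEE_CONSTRUCTION": [1998.0, 2005.0, None, 2010.0],
        "DEUXIEME_CONDUCTEUR": [True, None, False, None],
    }
)

df["GROUPE_KM"] = df["GROUPE_KM"].fillna(df["GROUPE_KM"].mode()[0])  # most frequent value
df["ANNEE_CONSTRUCTION"] = df["ANNEE_CONSTRUCTION"].fillna(
    df["ANNEE_CONSTRUCTION"].median(),  # median is robust to outliers
)
df["DEUXIEME_CONDUCTEUR"] = df["DEUXIEME_CONDUCTEUR"].fillna(False)  # constant default
print(df)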
@@ -83750,7 +83752,10 @@
 "source": [
 "# Représentation graphique\n",
 "fig = px.line(\n",
-"    plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\"\n",
+"    plot_data,\n",
+"    x=\"AGE_ASSURE_PRINCIPAL\",\n",
+"    y=\"FREQ\",\n",
+"    title=\"Sinistralité selon l'âge\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -85539,7 +85544,10 @@
 "\n",
 "# Représentation graphique\n",
 "fig = px.scatter(\n",
-"    plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\"\n",
+"    plot_data,\n",
+"    x=\"ENERGIE\",\n",
+"    y=\"FREQ\",\n",
+"    title=\"Sinistralité selon le carburant\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -86416,7 +86424,10 @@
 "\n",
 "# Représentation graphique\n",
 "fig = px.scatter(\n",
-"    plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\"\n",
+"    plot_data,\n",
+"    x=\"VALEUR_DU_BIEN\",\n",
+"    y=\"CM\",\n",
+"    title=\"Coût moyen selon le prix\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -89044,7 +89055,10 @@
 "\n",
 "# Représentation graphique\n",
 "fig = px.scatter(\n",
-"    plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\"\n",
+"    plot_data,\n",
+"    x=\"GENRE\",\n",
+"    y=\"CM\",\n",
+"    title=\"Coût moyen selon l'âge de l'assuré\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -56,16 +56,15 @@
 "import seaborn as sns\n",
 "\n",
 "sns.set()\n",
-"import matplotlib.pyplot as plt  # noqa: E402\n",
+"import matplotlib.pyplot as plt\n",
 "import plotly.express as px\n",
 "import plotly.graph_objects as gp\n",
-"from scipy.cluster.hierarchy import dendrogram, linkage  # noqa: E402\n",
+"from scipy.cluster.hierarchy import dendrogram, linkage\n",
 "\n",
 "# Statistiques\n",
-"from scipy.stats import chi2_contingency  # noqa: E402, F401\n",
+"from scipy.stats import chi2_contingency  # noqa: F401\n",
 "\n",
 "# Machine Learning\n",
-"from sklearn.cluster import AgglomerativeClustering, KMeans  # noqa: E402"
+"from sklearn.cluster import AgglomerativeClustering, KMeans"
 ]
 },
 {
@@ -899,7 +898,9 @@
 "source": [
 "# Calcul de la partition de l'espace\n",
 "hierarchical_cluster = AgglomerativeClustering(\n",
-"    n_clusters=3, metric=\"euclidean\", linkage=\"single\"\n",
+"    n_clusters=3,\n",
+"    metric=\"euclidean\",\n",
+"    linkage=\"single\",\n",
 ")\n",
 "\n",
 "labels = hierarchical_cluster.fit_predict(data)\n",
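This hunk and the next vary only the `linkage` argument (`single` vs `complete`). A runnable sketch on synthetic blobs; note that `metric=` replaced `affinity=` in scikit-learn 1.2, so this assumes a recent version:

import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(42)
data = np.vstack([rng.normal(c, 0.3, size=(30, 2)) for c in (0.0, 3.0, 6.0)])

for linkage in ("single", "complete"):
    labels = AgglomerativeClustering(
        n_clusters=3,
        metric="euclidean",
        linkage=linkage,  # single: nearest-pair merging; complete: compact clusters
    ).fit_predict(data)
    print(linkage, np.bincount(labels))  # cluster sizes per linkage strategy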
@@ -972,7 +973,9 @@
 "source": [
 "# Calcul de la partition de l'espace\n",
 "hierarchical_cluster = AgglomerativeClustering(\n",
-"    n_clusters=3, metric=\"euclidean\", linkage=\"complete\"\n",
+"    n_clusters=3,\n",
+"    metric=\"euclidean\",\n",
+"    linkage=\"complete\",\n",
 ")\n",
 "\n",
 "labels = hierarchical_cluster.fit_predict(data)\n",
@@ -1482,7 +1485,7 @@
 "\n",
 "# Group by ZONE_RISQUE and aggregate the necessary columns\n",
 "data = data_retraitee.groupby([\"ZONE_RISQUE\"], as_index=False).agg(\n",
-"    {\"NB\": \"sum\", \"CHARGE\": \"sum\", \"EXPO\": \"sum\"}\n",
+"    {\"NB\": \"sum\", \"CHARGE\": \"sum\", \"EXPO\": \"sum\"},\n",
 ")\n",
 "\n",
 "# Calculate derived metrics\n",
@@ -1547,7 +1550,11 @@
 "source": [
 "# Initialisation de l'algorithme\n",
 "kmeans_FREQ = KMeans(\n",
-"    init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
+"    init=\"random\",\n",
+"    n_clusters=5,\n",
+"    n_init=1,\n",
+"    random_state=42,\n",
+"    max_iter=300,\n",
 ")\n",
 "\n",
 "# Transformation des données : plusieurs échantillons de 1 dimension\n",
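The trailing comment ("plusieurs échantillons de 1 dimension", i.e. many one-dimensional samples) refers to reshaping a 1-D series into an (n_samples, 1) matrix, which KMeans requires. A sketch with toy frequencies (3 clusters rather than the notebook's 5, to fit the toy data):

import numpy as np
from sklearn.cluster import KMeans

freq = np.array([0.02, 0.03, 0.05, 0.11, 0.12, 0.25, 0.27, 0.51])

kmeans = KMeans(
    init="random",
    n_clusters=3,
    n_init=1,
    random_state=42,
    max_iter=300,
)
labels = kmeans.fit_predict(freq.reshape(-1, 1))  # (n,) -> (n, 1)
print(labels, kmeans.cluster_centers_.ravel())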
@@ -3559,7 +3566,11 @@
 "source": [
 "# Initialisation de l'algorithme\n",
 "kmeans_FREQ_CM = KMeans(\n",
-"    init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
+"    init=\"random\",\n",
+"    n_clusters=5,\n",
+"    n_init=1,\n",
+"    random_state=42,\n",
+"    max_iter=300,\n",
 ")\n",
 "\n",
 "# Transformation des données : plusieurs échantillons de 1 dimension\n",
@@ -4621,7 +4632,9 @@
 "source": [
 "# Calcul de la partition de l'espace\n",
 "hierarchical_cluster = AgglomerativeClustering(\n",
-"    n_clusters=5, metric=\"euclidean\", linkage=\"single\"\n",
+"    n_clusters=5,\n",
+"    metric=\"euclidean\",\n",
+"    linkage=\"single\",\n",
 ")\n",
 "\n",
 "labels = hierarchical_cluster.fit_predict(data_x)\n",
@@ -5650,7 +5663,9 @@
 "source": [
 "# Calcul de la partition de l'espace\n",
 "hierarchical_cluster = AgglomerativeClustering(\n",
-"    n_clusters=5, metric=\"euclidean\", linkage=\"single\"\n",
+"    n_clusters=5,\n",
+"    metric=\"euclidean\",\n",
+"    linkage=\"single\",\n",
 ")\n",
 "\n",
 "labels = hierarchical_cluster.fit_predict(data_x)\n",
@@ -60,12 +60,13 @@
 "\n",
 "sns.set()\n",
 "import plotly.express as px\n",
-"import sklearn.metrics as metrics\n",
-"import sklearn.preprocessing as preproc\n",
 "\n",
 "# Statistiques\n",
 "from scipy.stats import chi2_contingency\n",
 "\n",
+"import sklearn.preprocessing as preproc\n",
+"from sklearn import metrics\n",
+"\n",
 "# Machine Learning\n",
 "from sklearn.ensemble import RandomForestRegressor\n",
 "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
@@ -89,7 +90,7 @@
 "source": [
 "def cramers_V(var1, var2):\n",
 "    crosstab = np.array(\n",
-"        pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
+"        pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
 "    ) # Cross table building\n",
 "    stat = chi2_contingency(crosstab)[\n",
 "        0\n",
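For reference, the `cramers_V` helper truncated in this hunk computes Cramér's V = sqrt(chi2 / (n * (min(r, c) - 1))) from the chi-squared statistic of the contingency table. A self-contained version of the bias-uncorrected form, which is what the visible lines suggest:

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


def cramers_V(var1, var2):
    crosstab = np.array(pd.crosstab(var1, var2))  # contingency table
    stat = chi2_contingency(crosstab)[0]          # chi-squared statistic
    n = crosstab.sum()                            # total observations
    mini = min(crosstab.shape) - 1                # min(rows, cols) - 1
    return np.sqrt(stat / (n * mini))


a = pd.Series(list("xxyyzz"))
b = pd.Series(list("uuvvww"))
print(cramers_V(a, b))  # 1.0: the two variables determine each other exactly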
@@ -2027,17 +2028,15 @@
 "for colu in data_set.columns:\n",
 "    if True in data_set[colu].isna().unique():\n",
 "        variables_na.append(data_set[colu])\n",
-"    else:\n",
-"        if str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
-"            if len(data_set[colu].unique()) == 2:\n",
-"                variables_categorielles.append(data_set[colu])\n",
-"            else:\n",
-"                variables_numeriques.append(data_set[colu])\n",
-"        else:\n",
-"            if len(data_set[colu].unique()) == 2:\n",
-"                variables_categorielles.append(data_set[colu])\n",
-"            else:\n",
-"                variables_categorielles.append(data_set[colu])"
+"    elif str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
+"        if len(data_set[colu].unique()) == 2:\n",
+"            variables_categorielles.append(data_set[colu])\n",
+"        else:\n",
+"            variables_numeriques.append(data_set[colu])\n",
+"    elif len(data_set[colu].unique()) == 2:\n",
+"        variables_categorielles.append(data_set[colu])\n",
+"    else:\n",
+"        variables_categorielles.append(data_set[colu])"
 ]
 },
 {
@@ -2437,7 +2436,8 @@
 "    col = []\n",
 "    for var2 in vars_categorielles:\n",
 "        cramers = cramers_V(\n",
-"            vars_categorielles[var1], vars_categorielles[var2]\n",
+"            vars_categorielles[var1],\n",
+"            vars_categorielles[var2],\n",
 "        ) # V de Cramer\n",
 "        col.append(round(cramers, 2)) # arrondi du résultat\n",
 "    rows.append(col)\n",
@@ -2468,7 +2468,7 @@
 "                + \" et \"\n",
 "                + v_cramer_resultats.columns[j]\n",
 "                + \" sont trop dépendantes, V-CRAMER = \"\n",
-"                + str(v_cramer_resultats.iloc[i, j])\n",
+"                + str(v_cramer_resultats.iloc[i, j]),\n",
 "            )"
 ]
 },
@@ -2662,7 +2662,7 @@
 "                + \" et \"\n",
 "                + correlations_num.columns[j]\n",
 "                + \" sont trop dépendantes, corr = \"\n",
-"                + str(correlations_num.iloc[i, j])\n",
+"                + str(correlations_num.iloc[i, j]),\n",
 "            )"
 ]
 },
@@ -3312,7 +3312,7 @@
 "# One hot encoding des variables catégorielles\n",
 "preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
 "preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
-"    vars_categorielles\n",
+"    vars_categorielles,\n",
 ")\n",
 "\n",
 "variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
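Typical usage of the encoder configured in this hunk, with an invented frame; `sparse_output` is the scikit-learn >= 1.2 spelling (earlier versions used `sparse`), and `drop="first"` removes one redundant dummy column per variable:

import pandas as pd
import sklearn.preprocessing as preproc

vars_categorielles = pd.DataFrame(
    {"ENERGIE": ["essence", "diesel", "essence", "electrique"]}
)

preproc_ohe = preproc.OneHotEncoder(drop="first", sparse_output=False).fit(
    vars_categorielles,
)
variables_categorielles_ohe = pd.DataFrame(
    preproc_ohe.transform(vars_categorielles),
    columns=preproc_ohe.get_feature_names_out(),  # ENERGIE_electrique, ENERGIE_essence
)
print(variables_categorielles_ohe)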
@@ -3496,7 +3496,8 @@
 "\n",
 "vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
 "vars_numeriques_scaled = pd.DataFrame(\n",
-"    vars_numeriques_scaled, columns=vars_numeriques.columns\n",
+"    vars_numeriques_scaled,\n",
+"    columns=vars_numeriques.columns,\n",
 ")\n",
 "vars_numeriques_scaled.head()"
 ]
@@ -3525,7 +3526,9 @@
 "outputs": [],
 "source": [
 "X_global = vars_numeriques_scaled.merge(\n",
-"    variables_categorielles_ohe, left_index=True, right_index=True\n",
+"    variables_categorielles_ohe,\n",
+"    left_index=True,\n",
+"    right_index=True,\n",
 ")"
 ]
 },
@@ -3542,7 +3545,10 @@
 "\n",
 "# Sampling en 80% train et 20% test\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
-"    X, Y, test_size=0.2, random_state=42\n",
+"    X,\n",
+"    Y,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
 ")"
 ]
 },
@@ -3707,7 +3713,9 @@
 "outputs": [],
 "source": [
 "X_global = vars_numeriques_scaled.merge(\n",
-"    variables_categorielles_ohe, left_index=True, right_index=True\n",
+"    variables_categorielles_ohe,\n",
+"    left_index=True,\n",
+"    right_index=True,\n",
 ")\n",
 "\n",
 "# Réorganisation des données\n",
@@ -3888,7 +3896,9 @@
 "outputs": [],
 "source": [
 "X_global = vars_numeriques_scaled.merge(\n",
-"    variables_categorielles_ohe, left_index=True, right_index=True\n",
+"    variables_categorielles_ohe,\n",
+"    left_index=True,\n",
+"    right_index=True,\n",
 ")\n",
 "# Réorganisation des données\n",
 "X = X_global.to_numpy()\n",
@@ -4127,6 +4137,7 @@
 "outputs": [],
 "source": [
 "import numpy as np\n",
+"\n",
 "from sklearn.ensemble import RandomForestRegressor\n",
 "from sklearn.model_selection import GridSearchCV, KFold"
 ]
@@ -4140,7 +4151,10 @@
 "source": [
 "# Sampling en 80% train et 20% test\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
-"    X, Y, test_size=0.2, random_state=42\n",
+"    X,\n",
+"    Y,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
 ")"
 ]
 },
@@ -4186,7 +4200,9 @@
 "    estimator=rf,\n",
 "    param_grid=param_grid,\n",
 "    cv=KFold(\n",
-"        n_splits=num_folds, shuffle=True, random_state=42\n",
+"        n_splits=num_folds,\n",
+"        shuffle=True,\n",
+"        random_state=42,\n",
 "    ), # Validation croisée avec 5 folds\n",
 "    scoring=\"neg_mean_squared_error\", # Métrique d'évaluation (moins c'est mieux)\n",
 "    n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
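A condensed, runnable version of the grid search configured above (tiny invented grid and data so it finishes instantly; the notebook's real `param_grid` is not shown in this hunk):

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, KFold

rng = np.random.default_rng(42)
X_train = rng.normal(size=(120, 4))
y_train = X_train @ np.array([1.0, -2.0, 0.5, 0.0]) + rng.normal(scale=0.1, size=120)

num_folds = 5
grid = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid={"n_estimators": [20, 50], "max_depth": [3, None]},
    cv=KFold(
        n_splits=num_folds,
        shuffle=True,
        random_state=42,
    ),
    scoring="neg_mean_squared_error",  # negated so that greater is better for the search
    n_jobs=-1,  # use all CPU cores
)
grid.fit(X_train, y_train)
print(grid.best_params_, -grid.best_score_)  # flip the sign back to a plain MSE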
@@ -4247,7 +4263,11 @@
 "# Cross validation\n",
 "# RMSE de chaque fold\n",
 "rmse_scores = cross_val_score(\n",
-"    best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_root_mean_squared_error\"\n",
+"    best_rf,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"neg_root_mean_squared_error\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
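One detail worth noting about these `cross_val_score` hunks: the `neg_*` scorers return negated errors, so the per-fold values come back negative and are usually sign-flipped before reporting. A standalone sketch on toy data:

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
y = X.sum(axis=1) + rng.normal(scale=0.1, size=100)

rmse_scores = cross_val_score(
    RandomForestRegressor(n_estimators=30, random_state=42),
    X,
    y,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
print(-rmse_scores)         # per-fold RMSE, sign flipped
print(-rmse_scores.mean())  # average RMSE across folds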
@@ -4256,7 +4276,11 @@
 "\n",
 "# MSE de chaque fold\n",
 "mse_scores = cross_val_score(\n",
-"    best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_squared_error\"\n",
+"    best_rf,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"neg_mean_squared_error\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -4266,7 +4290,11 @@
 "\n",
 "# MAE de chaque fold\n",
 "mae_scores = cross_val_score(\n",
-"    best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_absolute_error\"\n",
+"    best_rf,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"neg_mean_absolute_error\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -58,14 +58,15 @@
 "import seaborn as sns\n",
 "\n",
 "sns.set()\n",
-"import plotly.express as px\n",
-"# Machine Learning\n",
-"import sklearn.preprocessing as preproc\n",
 "from imblearn.over_sampling import RandomOverSampler\n",
 "\n",
+"import plotly.express as px\n",
+"\n",
 "# Statistiques\n",
 "from scipy.stats import chi2_contingency\n",
 "\n",
+"# Machine Learning\n",
+"import sklearn.preprocessing as preproc\n",
 "from sklearn import metrics\n",
 "from sklearn.ensemble import GradientBoostingClassifier\n",
 "from sklearn.model_selection import (\n",
@@ -93,7 +94,7 @@
 "source": [
 "def cramers_V(var1, var2):\n",
 "    crosstab = np.array(\n",
-"        pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
+"        pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
 "    ) # Cross table building\n",
 "    stat = chi2_contingency(crosstab)[\n",
 "        0\n",
@@ -16171,7 +16172,9 @@
 "source": [
 "# Observation de la distribution\n",
 "fig = px.histogram(\n",
-"    data_model, x=\"SINISTRE\", title=\"Distribution de la variable 'sinistré'\"\n",
+"    data_model,\n",
+"    x=\"SINISTRE\",\n",
+"    title=\"Distribution de la variable 'sinistré'\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -16221,17 +16224,15 @@
 "for col in data_set.columns:\n",
 "    if True in data_model[col].isna().unique():\n",
 "        variables_na.append(data_model[col])\n",
-"    else:\n",
-"        if str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
-"            if len(data_model[col].unique()) == 2:\n",
-"                variables_categorielles.append(data_model[col])\n",
-"            else:\n",
-"                variables_numeriques.append(data_model[col])\n",
-"        else:\n",
-"            if len(data_model[col].unique()) == 2:\n",
-"                variables_categorielles.append(data_model[col])\n",
-"            else:\n",
-"                variables_categorielles.append(data_model[col])"
+"    elif str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
+"        if len(data_model[col].unique()) == 2:\n",
+"            variables_categorielles.append(data_model[col])\n",
+"        else:\n",
+"            variables_numeriques.append(data_model[col])\n",
+"    elif len(data_model[col].unique()) == 2:\n",
+"        variables_categorielles.append(data_model[col])\n",
+"    else:\n",
+"        variables_categorielles.append(data_model[col])"
 ]
 },
 {
@@ -16631,7 +16632,8 @@
 "    col = []\n",
 "    for var2 in vars_categorielles:\n",
 "        cramers = cramers_V(\n",
-"            vars_categorielles[var1], vars_categorielles[var2]\n",
+"            vars_categorielles[var1],\n",
+"            vars_categorielles[var2],\n",
 "        ) # V de Cramer\n",
 "        col.append(round(cramers, 2)) # arrondi du résultat\n",
 "    rows.append(col)\n",
@@ -16658,7 +16660,7 @@
 "    for j in range(i + 1, v_cramer_resultats.shape[0]):\n",
 "        if v_cramer_resultats.iloc[i, j] > 0.7:\n",
 "            print(\n",
-"                f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
+"                f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\",\n",
 "            )"
 ]
 },
@@ -16856,7 +16858,7 @@
 "    for j in range(i + 1, nb_variables):\n",
 "        if abs(correlations_num.iloc[i, j]) > 0.7:\n",
 "            print(\n",
-"                f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
+"                f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\",\n",
 "            )"
 ]
 },
@@ -17520,7 +17522,7 @@
 "# One hot encoding des variables catégorielles\n",
 "preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
 "preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
-"    vars_categorielles\n",
+"    vars_categorielles,\n",
 ")\n",
 "\n",
 "variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
@@ -17704,7 +17706,8 @@
 "\n",
 "vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
 "vars_numeriques_scaled = pd.DataFrame(\n",
-"    vars_numeriques_scaled, columns=vars_numeriques.columns\n",
+"    vars_numeriques_scaled,\n",
+"    columns=vars_numeriques.columns,\n",
 ")\n",
 "vars_numeriques_scaled.head()"
 ]
@@ -17756,7 +17759,9 @@
 "outputs": [],
 "source": [
 "X_global = vars_numeriques_scaled.merge(\n",
-"    variables_categorielles_ohe, left_index=True, right_index=True\n",
+"    variables_categorielles_ohe,\n",
+"    left_index=True,\n",
+"    right_index=True,\n",
 ")\n",
 "# Réorganisation des données\n",
 "X = X_global.to_numpy()\n",
@@ -17772,7 +17777,11 @@
 "source": [
 "# Sampling en 80% train et 20% test\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
-"    X, Y, test_size=0.2, random_state=42, stratify=Y\n",
+"    X,\n",
+"    Y,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
+"    stratify=Y,\n",
 ")"
 ]
 },
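`stratify=Y` is the one argument that distinguishes this split from the regression notebook's: it preserves the class ratio in both subsets, which matters for an imbalanced SINISTRE target. A sketch with invented 10%-positive labels:

import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(200).reshape(-1, 1)
Y = np.array([0] * 180 + [1] * 20)  # 10% positives, like an imbalanced claims target

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,  # keeps ~10% positives in both train and test
)
print(y_train.mean(), y_test.mean())  # both come out at 0.10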
@@ -17824,7 +17833,9 @@
 "    estimator=gbc,\n",
 "    param_grid=param_grid,\n",
 "    cv=StratifiedKFold(\n",
-"        n_splits=num_folds, shuffle=True, random_state=42\n",
+"        n_splits=num_folds,\n",
+"        shuffle=True,\n",
+"        random_state=42,\n",
 "    ), # Validation croisée avec 5 folds\n",
 "    scoring=\"recall\", # Métrique d'évaluation (moins c'est mieux)\n",
 "    n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -17884,7 +17895,11 @@
 "source": [
 "# Recall de chaque fold\n",
 "recall_scores = cross_val_score(\n",
-"    best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
+"    best_gbc,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"recall\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -17893,7 +17908,11 @@
 "\n",
 "# Accuracy de chaque fold\n",
 "accuracy_scores = cross_val_score(\n",
-"    best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
+"    best_gbc,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"accuracy\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -17903,7 +17922,11 @@
 "\n",
 "# Precision de chaque fold\n",
 "precision_scores = cross_val_score(\n",
-"    best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
+"    best_gbc,\n",
+"    X_train,\n",
+"    y_train,\n",
+"    cv=num_folds,\n",
+"    scoring=\"precision\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -52512,7 +52535,9 @@
 "# Observation de la distribution sur Y_train\n",
 "df = pd.DataFrame(y_train_resampled, columns=[\"SINISTRE\"])\n",
 "fig = px.histogram(\n",
-"    df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
+"    df,\n",
+"    x=\"SINISTRE\",\n",
+"    title=\"Distribution de la variable Y_train_resampled\",\n",
 ")\n",
 "fig.show()"
 ]
@@ -52565,7 +52590,9 @@
 "    estimator=gb,\n",
 "    param_grid=param_grid,\n",
 "    cv=StratifiedKFold(\n",
-"        n_splits=num_folds, shuffle=True, random_state=42\n",
+"        n_splits=num_folds,\n",
+"        shuffle=True,\n",
+"        random_state=42,\n",
 "    ), # Validation croisée stratifiée avec 5 plis\n",
 "    scoring=\"recall\", # Métrique d'évaluation\n",
 "    n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -52618,7 +52645,11 @@
 "# Zoom sur la CV\n",
 "# Recall de chaque fold\n",
 "recall_scores = cross_val_score(\n",
-"    best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"recall\"\n",
+"    best_gbc,\n",
+"    X_train_resampled,\n",
+"    y_train_resampled,\n",
+"    cv=num_folds,\n",
+"    scoring=\"recall\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -52627,7 +52658,11 @@
 "\n",
 "# Accuracy de chaque fold\n",
 "accuracy_scores = cross_val_score(\n",
-"    best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"accuracy\"\n",
+"    best_gbc,\n",
+"    X_train_resampled,\n",
+"    y_train_resampled,\n",
+"    cv=num_folds,\n",
+"    scoring=\"accuracy\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -52637,7 +52672,11 @@
 "\n",
 "# Precision de chaque fold\n",
 "precision_scores = cross_val_score(\n",
-"    best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"precision\"\n",
+"    best_gbc,\n",
+"    X_train_resampled,\n",
+"    y_train_resampled,\n",
+"    cv=num_folds,\n",
+"    scoring=\"precision\",\n",
 ")\n",
 "\n",
 "# Afficher les scores pour chaque fold\n",
@@ -53146,7 +53185,8 @@
 "# Matrice de confusion\n",
 "confusion_matrix = metrics.confusion_matrix(y_test, y_pred)\n",
 "cm_display = metrics.ConfusionMatrixDisplay(\n",
-"    confusion_matrix=confusion_matrix, display_labels=[False, True]\n",
+"    confusion_matrix=confusion_matrix,\n",
+"    display_labels=[False, True],\n",
 ")\n",
 "\n",
 "cm_display.plot()"
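An end-to-end sketch of the confusion-matrix cell that closes this notebook's diff, with invented toy predictions (requires matplotlib for the plot):

import matplotlib.pyplot as plt
from sklearn import metrics

y_test = [0, 0, 1, 1, 0, 1, 0, 1]
y_pred = [0, 0, 1, 0, 0, 1, 1, 1]

confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()  # renders the 2x2 matrix as a heatmap with counts
plt.show()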
@@ -115,6 +115,7 @@
 "import numpy as np\n",
 "import pandas as pd\n",
 "import seaborn as sns\n",
+"\n",
 "from catboost import CatBoostClassifier, Pool\n",
 "from sklearn.metrics import (\n",
 "    classification_report,\n",
@@ -163,10 +164,10 @@
 "\n",
 "print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
 "print(\n",
-"    \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\"\n",
+"    \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\",\n",
 ")\n",
 "print(\n",
-"    \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\"\n",
+"    \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\",\n",
 ")\n",
 "\n",
 "# Chargement depuis UCI\n",
@@ -192,7 +193,11 @@
 "\n",
 "try:\n",
 "    df = pd.read_csv(\n",
-"        url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
+"        url,\n",
+"        names=column_names,\n",
+"        sep=r\",\\s*\",\n",
+"        engine=\"python\",\n",
+"        na_values=\"?\",\n",
 "    )\n",
 "    print(\"Dataset chargé depuis UCI repository\")\n",
 "except: # noqa: E722\n",
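The `read_csv` options above handle the Adult dataset's quirks: `sep=r",\s*"` absorbs the space after each comma (hence `engine="python"`, since the default C engine rejects regex separators), and `na_values="?"` maps the dataset's placeholder to NaN. An equivalent sketch on an inline sample:

import io

import pandas as pd

sample = "39, State-gov, 77516\n50, ?, 83311\n"
df = pd.read_csv(
    io.StringIO(sample),
    names=["age", "workclass", "fnlwgt"],
    sep=r",\s*",
    engine="python",  # the C engine does not support regex separators
    na_values="?",    # '?' is the Adult dataset's missing-value marker
)
print(df)  # the second workclass comes back as NaN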
@@ -301,7 +306,7 @@
 "                n_samples,\n",
 "                p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
 "            ),\n",
-"        }\n",
+"        },\n",
 "    )\n",
 "\n",
 "    # Création de la cible avec logique réaliste\n",
@@ -647,19 +652,25 @@
 "\n",
 "# Taux de revenu >50K par catégorie\n",
 "df.groupby(\"education\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 0], color=\"skyblue\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 0],\n",
+"    color=\"skyblue\",\n",
 ")\n",
 "axes[0, 0].set_title(\"Taux de revenu >50K par niveau d'éducation\")\n",
 "axes[0, 0].set_xlabel(\"Taux\")\n",
 "\n",
 "df.groupby(\"occupation\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 1], color=\"lightcoral\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 1],\n",
+"    color=\"lightcoral\",\n",
 ")\n",
 "axes[0, 1].set_title(\"Taux de revenu >50K par occupation\")\n",
 "axes[0, 1].set_xlabel(\"Taux\")\n",
 "\n",
 "df.groupby(\"marital_status\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
-"    kind=\"barh\", ax=axes[0, 2], color=\"lightgreen\"\n",
+"    kind=\"barh\",\n",
+"    ax=axes[0, 2],\n",
+"    color=\"lightgreen\",\n",
 ")\n",
 "axes[0, 2].set_title(\"Taux de revenu >50K par statut marital\")\n",
 "axes[0, 2].set_xlabel(\"Taux\")\n",
@@ -758,7 +769,11 @@
 "\n",
 "# Split train/test stratifié\n",
 "X_train, X_test, y_train, y_test = train_test_split(\n",
-"    X, y, test_size=0.2, random_state=42, stratify=y\n",
+"    X,\n",
+"    y,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
+"    stratify=y,\n",
 ")"
 ]
 },
@@ -1020,7 +1035,7 @@
 "feature_names = X_train.columns\n",
 "\n",
 "importance_df = pd.DataFrame(\n",
-"    {\"feature\": feature_names, \"importance\": feature_importance}\n",
+"    {\"feature\": feature_names, \"importance\": feature_importance},\n",
 ").sort_values(\"importance\", ascending=False)\n",
 "\n",
 "print(importance_df)\n",
@@ -1283,7 +1298,10 @@
 "y_reg = df[\"montant_defaut\"]\n",
 "\n",
 "X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n",
-"    X_reg, y_reg, test_size=0.2, random_state=42\n",
+"    X_reg,\n",
+"    y_reg,\n",
+"    test_size=0.2,\n",
+"    random_state=42,\n",
 ")\n",
 "\n",
 "# Pools\n",
@@ -1292,7 +1310,11 @@
 "\n",
 "# Modèle\n",
 "model_reg = CatBoostRegressor(\n",
-"    iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=100\n",
+"    iterations=500,\n",
+"    learning_rate=0.1,\n",
+"    depth=6,\n",
+"    random_seed=42,\n",
+"    verbose=100,\n",
 ")\n",
 "\n",
 "model_reg.fit(train_pool_reg, eval_set=test_pool_reg)\n",
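A compact sketch of the regressor configured in this hunk, on invented data (assumes the `catboost` package): `Pool` bundles features with their categorical-column names, and `eval_set` reports validation metrics at each `verbose` interval.

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool

rng = np.random.default_rng(42)
df = pd.DataFrame(
    {
        "num": rng.normal(size=200),
        "cat": rng.choice(["a", "b", "c"], size=200),
    }
)
y = df["num"] * 2 + (df["cat"] == "a") + rng.normal(scale=0.1, size=200)

train_pool_reg = Pool(df[:160], y[:160], cat_features=["cat"])  # declare the cat column
test_pool_reg = Pool(df[160:], y[160:], cat_features=["cat"])

model_reg = CatBoostRegressor(
    iterations=500,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=100,  # log every 100 iterations
)
model_reg.fit(train_pool_reg, eval_set=test_pool_reg)
print(model_reg.predict(test_pool_reg)[:5])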
@@ -1363,7 +1385,7 @@
 "feature_names = X_reg.columns\n",
 "\n",
 "importance_df = pd.DataFrame(\n",
-"    {\"feature\": feature_names, \"importance\": feature_importance}\n",
+"    {\"feature\": feature_names, \"importance\": feature_importance},\n",
 ").sort_values(\"importance\", ascending=False)\n",
 "\n",
 "print(importance_df)\n",
@@ -1580,7 +1602,7 @@
 "# Importance SHAP moyenne\n",
 "shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
 "shap_df = pd.DataFrame(\n",
-"    {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
+"    {\"feature\": X_train.columns, \"shap_importance\": shap_importance},\n",
 ").sort_values(\"shap_importance\", ascending=False)\n",
 "\n",
 "print(\"\\nImportance SHAP moyenne :\")\n",
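The `shap_values[:, :-1]` slice in this last hunk exists because CatBoost's `get_feature_importance(type="ShapValues")` returns one column per feature plus a final column holding the expected (base) value; averaging absolute values over rows then gives a global importance. A sketch, reusing `model_reg`, `train_pool_reg`, and `df` from the hypothetical regression example above:

import numpy as np
import pandas as pd

# shape: (n_objects, n_features + 1); the last column is the base value
shap_values = model_reg.get_feature_importance(train_pool_reg, type="ShapValues")

shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)  # drop the base-value column
shap_df = pd.DataFrame(
    {"feature": df.columns, "shap_importance": shap_importance},
).sort_values("shap_importance", ascending=False)
print(shap_df)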