mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-30 03:28:58 +01:00
Refactor code for improved readability and consistency across multiple Jupyter notebooks
- Added missing commas in various print statements and function calls for better syntax. - Reformatted code to enhance clarity, including breaking long lines and aligning parameters. - Updated function signatures to use float type for sigma parameters instead of int for better precision. - Cleaned up comments and documentation strings for clarity and consistency. - Ensured consistent formatting in plotting functions and data handling.
This commit is contained in:
@@ -60,12 +60,13 @@
|
||||
"\n",
|
||||
"sns.set()\n",
|
||||
"import plotly.express as px\n",
|
||||
"import sklearn.metrics as metrics\n",
|
||||
"import sklearn.preprocessing as preproc\n",
|
||||
"\n",
|
||||
"# Statistiques\n",
|
||||
"from scipy.stats import chi2_contingency\n",
|
||||
"\n",
|
||||
"import sklearn.preprocessing as preproc\n",
|
||||
"from sklearn import metrics\n",
|
||||
"\n",
|
||||
"# Machine Learning\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||||
"from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
|
||||
@@ -89,7 +90,7 @@
|
||||
"source": [
|
||||
"def cramers_V(var1, var2):\n",
|
||||
" crosstab = np.array(\n",
|
||||
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
|
||||
" pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
|
||||
" ) # Cross table building\n",
|
||||
" stat = chi2_contingency(crosstab)[\n",
|
||||
" 0\n",
|
||||
@@ -2027,17 +2028,15 @@
|
||||
"for colu in data_set.columns:\n",
|
||||
" if True in data_set[colu].isna().unique():\n",
|
||||
" variables_na.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" if str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
|
||||
" if len(data_set[colu].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" variables_numeriques.append(data_set[colu])\n",
|
||||
" elif str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
|
||||
" if len(data_set[colu].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" if len(data_set[colu].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" variables_categorielles.append(data_set[colu])"
|
||||
" variables_numeriques.append(data_set[colu])\n",
|
||||
" elif len(data_set[colu].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" variables_categorielles.append(data_set[colu])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2437,7 +2436,8 @@
|
||||
" col = []\n",
|
||||
" for var2 in vars_categorielles:\n",
|
||||
" cramers = cramers_V(\n",
|
||||
" vars_categorielles[var1], vars_categorielles[var2]\n",
|
||||
" vars_categorielles[var1],\n",
|
||||
" vars_categorielles[var2],\n",
|
||||
" ) # V de Cramer\n",
|
||||
" col.append(round(cramers, 2)) # arrondi du résultat\n",
|
||||
" rows.append(col)\n",
|
||||
@@ -2468,7 +2468,7 @@
|
||||
" + \" et \"\n",
|
||||
" + v_cramer_resultats.columns[j]\n",
|
||||
" + \" sont trop dépendantes, V-CRAMER = \"\n",
|
||||
" + str(v_cramer_resultats.iloc[i, j])\n",
|
||||
" + str(v_cramer_resultats.iloc[i, j]),\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
@@ -2662,7 +2662,7 @@
|
||||
" + \" et \"\n",
|
||||
" + correlations_num.columns[j]\n",
|
||||
" + \" sont trop dépendantes, corr = \"\n",
|
||||
" + str(correlations_num.iloc[i, j])\n",
|
||||
" + str(correlations_num.iloc[i, j]),\n",
|
||||
" )"
|
||||
]
|
||||
},
|
||||
@@ -3312,7 +3312,7 @@
|
||||
"# One hot encoding des variables catégorielles\n",
|
||||
"preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
|
||||
"preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
|
||||
" vars_categorielles\n",
|
||||
" vars_categorielles,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
|
||||
@@ -3496,7 +3496,8 @@
|
||||
"\n",
|
||||
"vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
|
||||
"vars_numeriques_scaled = pd.DataFrame(\n",
|
||||
" vars_numeriques_scaled, columns=vars_numeriques.columns\n",
|
||||
" vars_numeriques_scaled,\n",
|
||||
" columns=vars_numeriques.columns,\n",
|
||||
")\n",
|
||||
"vars_numeriques_scaled.head()"
|
||||
]
|
||||
@@ -3525,7 +3526,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_global = vars_numeriques_scaled.merge(\n",
|
||||
" variables_categorielles_ohe, left_index=True, right_index=True\n",
|
||||
" variables_categorielles_ohe,\n",
|
||||
" left_index=True,\n",
|
||||
" right_index=True,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -3542,7 +3545,10 @@
|
||||
"\n",
|
||||
"# Sampling en 80% train et 20% test\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, Y, test_size=0.2, random_state=42\n",
|
||||
" X,\n",
|
||||
" Y,\n",
|
||||
" test_size=0.2,\n",
|
||||
" random_state=42,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -3707,7 +3713,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_global = vars_numeriques_scaled.merge(\n",
|
||||
" variables_categorielles_ohe, left_index=True, right_index=True\n",
|
||||
" variables_categorielles_ohe,\n",
|
||||
" left_index=True,\n",
|
||||
" right_index=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Réorganisation des données\n",
|
||||
@@ -3888,7 +3896,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"X_global = vars_numeriques_scaled.merge(\n",
|
||||
" variables_categorielles_ohe, left_index=True, right_index=True\n",
|
||||
" variables_categorielles_ohe,\n",
|
||||
" left_index=True,\n",
|
||||
" right_index=True,\n",
|
||||
")\n",
|
||||
"# Réorganisation des données\n",
|
||||
"X = X_global.to_numpy()\n",
|
||||
@@ -4127,6 +4137,7 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||||
"from sklearn.model_selection import GridSearchCV, KFold"
|
||||
]
|
||||
@@ -4140,7 +4151,10 @@
|
||||
"source": [
|
||||
"# Sampling en 80% train et 20% test\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, Y, test_size=0.2, random_state=42\n",
|
||||
" X,\n",
|
||||
" Y,\n",
|
||||
" test_size=0.2,\n",
|
||||
" random_state=42,\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
@@ -4186,7 +4200,9 @@
|
||||
" estimator=rf,\n",
|
||||
" param_grid=param_grid,\n",
|
||||
" cv=KFold(\n",
|
||||
" n_splits=num_folds, shuffle=True, random_state=42\n",
|
||||
" n_splits=num_folds,\n",
|
||||
" shuffle=True,\n",
|
||||
" random_state=42,\n",
|
||||
" ), # Validation croisée avec 5 folds\n",
|
||||
" scoring=\"neg_mean_squared_error\", # Métrique d'évaluation (moins c'est mieux)\n",
|
||||
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
|
||||
@@ -4247,7 +4263,11 @@
|
||||
"# Cross validation\n",
|
||||
"# RMSE de chaque fold\n",
|
||||
"rmse_scores = cross_val_score(\n",
|
||||
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_root_mean_squared_error\"\n",
|
||||
" best_rf,\n",
|
||||
" X_train,\n",
|
||||
" y_train,\n",
|
||||
" cv=num_folds,\n",
|
||||
" scoring=\"neg_root_mean_squared_error\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Afficher les scores pour chaque fold\n",
|
||||
@@ -4256,7 +4276,11 @@
|
||||
"\n",
|
||||
"# MSE de chaque fold\n",
|
||||
"mse_scores = cross_val_score(\n",
|
||||
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_squared_error\"\n",
|
||||
" best_rf,\n",
|
||||
" X_train,\n",
|
||||
" y_train,\n",
|
||||
" cv=num_folds,\n",
|
||||
" scoring=\"neg_mean_squared_error\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Afficher les scores pour chaque fold\n",
|
||||
@@ -4266,7 +4290,11 @@
|
||||
"\n",
|
||||
"# MAE de chaque fold\n",
|
||||
"mae_scores = cross_val_score(\n",
|
||||
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_absolute_error\"\n",
|
||||
" best_rf,\n",
|
||||
" X_train,\n",
|
||||
" y_train,\n",
|
||||
" cv=num_folds,\n",
|
||||
" scoring=\"neg_mean_absolute_error\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Afficher les scores pour chaque fold\n",
|
||||
|
||||
Reference in New Issue
Block a user