Refactor code for improved readability and consistency across multiple Jupyter notebooks

- Added trailing commas to multi-line function calls, print statements, and literals for consistent formatting (a minimal illustration of the convention follows the file summary below).
- Reformatted long calls so that each argument sits on its own line, and broke up other overly long lines.
- Updated sigma parameter annotations and docstrings to use float instead of int, matching the fractional values actually passed (e.g. 0.1, 0.25).
- Flattened nested if/else blocks into elif chains and tidied imports (removed unused modules and stale noqa markers).
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
2025-12-13 23:38:17 +01:00
parent f89ff4a016
commit d5a6bfd339
50 changed files with 779 additions and 449 deletions
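For illustration, a minimal sketch of the two main conventions applied across the notebooks: one argument per line with a trailing comma, and float type hints for sigma parameters. The gaussian_kernel helper below is hypothetical and only stands in for the refactored calls; it is not taken from the changed files.

```python
import numpy as np


def gaussian_kernel(x: float, y: float, sigma: float) -> float:
    """Gaussian similarity between two points (sigma annotated as float, not int)."""
    return float(np.exp(-((x - y) ** 2) / (2 * sigma**2)))


# Before the refactor, calls like this sat on a single long line:
# value = gaussian_kernel(0.5, 1.5, 0.25)

# After the refactor, each argument gets its own line and a trailing comma:
value = gaussian_kernel(
    0.5,
    1.5,
    0.25,
)
print(value)
```

The trailing comma keeps the diff to a single added line whenever another argument is appended later, which is also why formatters such as Black and Ruff treat it as a signal to keep the call exploded one argument per line.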

View File

@@ -27,25 +27,32 @@
"\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"import tensorflow as tf\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from tensorflow import keras\n",
"\n",
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
" X_train_full, y_train_full, train_size=0.8\n",
" X_train_full,\n",
" y_train_full,\n",
" train_size=0.8,\n",
")\n",
"\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28\n",
" -1,\n",
" 28,\n",
" 28,\n",
")\n",
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28\n",
" -1,\n",
" 28,\n",
" 28,\n",
")\n",
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28\n",
" -1,\n",
" 28,\n",
" 28,\n",
")"
]
},
@@ -79,13 +86,17 @@
" keras.layers.Input(shape=[28, 28]),\n",
" keras.layers.Flatten(),\n",
" keras.layers.Dense(\n",
" 256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
" 256,\n",
" activation=\"relu\",\n",
" kernel_regularizer=keras.regularizers.l2(0.001),\n",
" ),\n",
" keras.layers.Dense(\n",
" 128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
" 128,\n",
" activation=\"relu\",\n",
" kernel_regularizer=keras.regularizers.l2(0.001),\n",
" ),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
")"
]
},
@@ -174,7 +185,7 @@
" kernel_regularizer=keras.regularizers.l2(lambda_l2),\n",
" ),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
" )\n",
" model.compile(\n",
" loss=\"sparse_categorical_crossentropy\",\n",
@@ -220,7 +231,7 @@
" \"lambda_l2\": lambda_l2,\n",
" \"history\": pd.DataFrame(history.history),\n",
" \"n_epochs\": n_epochs,\n",
" }\n",
" },\n",
" )"
]
},

View File

@@ -58,7 +58,10 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
" X_train_full, y_train_full, test_size=0.2, random_state=42\n",
" X_train_full,\n",
" y_train_full,\n",
" test_size=0.2,\n",
" random_state=42,\n",
")\n",
"print(X_train.shape, y_train.shape)\n",
"print(X_valid.shape, y_valid.shape)"
@@ -181,7 +184,7 @@
" keras.layers.Dense(256, activation=\"relu\"),\n",
" keras.layers.Dense(128, activation=\"relu\"),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
")"
]
},
@@ -563,7 +566,7 @@
" keras.layers.Dense(256, activation=\"relu\"),\n",
" keras.layers.Dense(128, activation=\"relu\"),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
" )\n",
" model.compile(\n",
" loss=\"sparse_categorical_crossentropy\",\n",
@@ -673,7 +676,10 @@
" plt.subplot(1, 2, 1)\n",
" plt.plot(history_df[\"val_loss\"], linestyle=\"--\", color=colors[_])\n",
" plt.plot(\n",
" history_df[\"loss\"], label=f\"LR={learning_rate}\", alpha=0.5, color=colors[_]\n",
" history_df[\"loss\"],\n",
" label=f\"LR={learning_rate}\",\n",
" alpha=0.5,\n",
" color=colors[_],\n",
" )\n",
" plt.xlabel(\"Epochs\")\n",
" plt.ylabel(\"Loss\")\n",

View File

@@ -18,15 +18,12 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"import tensorflow as tf\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from tensorflow import keras\n",
@@ -35,18 +32,29 @@
" keras.datasets.fashion_mnist.load_data()\n",
")\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
" X_train_full, y_train_full, train_size=0.8\n",
" X_train_full,\n",
" y_train_full,\n",
" train_size=0.8,\n",
")\n",
"\n",
"scaler = StandardScaler()\n",
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28, 1\n",
" -1,\n",
" 28,\n",
" 28,\n",
" 1,\n",
")\n",
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28, 1\n",
" -1,\n",
" 28,\n",
" 28,\n",
" 1,\n",
")\n",
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
" -1, 28, 28, 1\n",
" -1,\n",
" 28,\n",
" 28,\n",
" 1,\n",
")"
]
},

View File

@@ -60,7 +60,10 @@
"from sklearn.model_selection import train_test_split\n",
"\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
" X_train_full, y_train_full, test_size=0.2, random_state=42\n",
" X_train_full,\n",
" y_train_full,\n",
" test_size=0.2,\n",
" random_state=42,\n",
")\n",
"print(X_train.shape, y_train.shape)\n",
"print(X_valid.shape, y_valid.shape)"
@@ -178,16 +181,22 @@
" [\n",
" keras.layers.Input(shape=(28, 28, 1)),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2, strides=2),\n",
" keras.layers.Flatten(),\n",
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
")"
]
},
@@ -374,33 +383,45 @@
" [\n",
" keras.layers.Input(shape=(28, 28, 1)),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.BatchNormalization(),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2, strides=2),\n",
" keras.layers.Flatten(),\n",
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
" )\n",
" else:\n",
" model = keras.models.Sequential(\n",
" [\n",
" keras.layers.Input(shape=(28, 28, 1)),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2, strides=2),\n",
" keras.layers.Flatten(),\n",
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
" )\n",
"\n",
" model.compile(\n",
@@ -653,7 +674,9 @@
"outputs": [],
"source": [
"def agregate_result(\n",
" results: list, normalized: bool, metric_name: str = \"accuracy\"\n",
" results: list,\n",
" normalized: bool,\n",
" metric_name: str = \"accuracy\",\n",
") -> pd.DataFrame:\n",
" train_curves = []\n",
" val_curves = []\n",
@@ -699,7 +722,9 @@
" ax = axs[idx]\n",
" for normalized in [True, False]:\n",
" train, val = agregate_result(\n",
" training_curves, normalized=normalized, metric_name=metric\n",
" training_curves,\n",
" normalized=normalized,\n",
" metric_name=metric,\n",
" )\n",
" train_runs = train.reshape(-1, epochs)\n",
" val_runs = val.reshape(-1, epochs)\n",

View File

@@ -20,7 +20,6 @@
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"%matplotlib inline\n",
"import matplotlib.pyplot as plt\n",
@@ -300,23 +299,35 @@
" [\n",
" keras.layers.InputLayer(shape=(32, 32, 3)),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.Dropout(0.2),\n",
" keras.layers.Conv2D(\n",
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=32,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.MaxPooling2D(pool_size=2),\n",
" keras.layers.Conv2D(\n",
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=16,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.Dropout(0.2),\n",
" keras.layers.Conv2D(\n",
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
" filters=16,\n",
" kernel_size=3,\n",
" activation=\"relu\",\n",
" padding=\"same\",\n",
" ),\n",
" keras.layers.Flatten(),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
" )\n",
"\n",
" return model\n",
@@ -348,7 +359,9 @@
"outputs": [],
"source": [
"def compile_train(\n",
" optimizer_function: str, learning_rate: float, **kwargs\n",
" optimizer_function: str,\n",
" learning_rate: float,\n",
" **kwargs,\n",
") -> keras.callbacks.History:\n",
" model = get_model()\n",
" optimizer = optimizer_function(learning_rate=learning_rate)\n",
@@ -401,7 +414,10 @@
"epochs = 5\n",
"batch_size = 64\n",
"history_adam = compile_train(\n",
" keras.optimizers.Adam, learning_rate=0.001, epochs=epochs, batch_size=batch_size\n",
" keras.optimizers.Adam,\n",
" learning_rate=0.001,\n",
" epochs=epochs,\n",
" batch_size=batch_size,\n",
")"
]
},
@@ -557,7 +573,10 @@
"histories = []\n",
"for optimizer in optimizers:\n",
" history = compile_train(\n",
" optimizer, learning_rate=learning_rate, epochs=epochs, batch_size=batch_size\n",
" optimizer,\n",
" learning_rate=learning_rate,\n",
" epochs=epochs,\n",
" batch_size=batch_size,\n",
" )\n",
" name = optimizer.__name__\n",
" label = f\"{name} (lr={learning_rate:.06})\"\n",

View File

@@ -17,10 +17,11 @@
"metadata": {},
"outputs": [],
"source": [
"import keras\n",
"import numpy as np\n",
"import seaborn as sns\n",
"\n",
"import keras\n",
"\n",
"sns.set(style=\"whitegrid\")\n",
"\n",
"\n",
@@ -149,7 +150,7 @@
" keras.layers.Embedding(\n",
" input_dim=vocabulary_size,\n",
" output_dim=dimension,\n",
" )\n",
" ),\n",
" )\n",
" model.add(keras.layers.SimpleRNN(128, return_sequences=False))\n",
" model.add(keras.layers.Dense(vocabulary_size, activation=\"softmax\"))\n",

View File

@@ -120,9 +120,7 @@
},
"outputs": [],
"source": [
"character_to_index = {\n",
" character: index for index, character in enumerate(characters)\n",
"}\n",
"character_to_index = {character: index for index, character in enumerate(characters)}\n",
"index_to_character = dict(enumerate(characters))"
]
},
@@ -317,7 +315,7 @@
" keras.layers.SimpleRNN(128, return_sequences=False),\n",
" # Ajouter une couche Dense\n",
" keras.layers.Dense(n_characters, activation=\"softmax\"),\n",
" ]\n",
" ],\n",
")\n",
"\n",
"model.summary()"
@@ -429,11 +427,14 @@
"print(len(epochs), len(historic[\"loss\"]))\n",
"\n",
"for index, (metric_name, axis) in enumerate(\n",
" zip([\"loss\", \"accuracy\"], [axis_1, axis_2], strict=False)\n",
" zip([\"loss\", \"accuracy\"], [axis_1, axis_2], strict=False),\n",
"):\n",
" color = sns.color_palette()[index]\n",
" axis.plot(\n",
" epochs[: len(historic[metric_name])], historic[metric_name], lw=2, color=color\n",
" epochs[: len(historic[metric_name])],\n",
" historic[metric_name],\n",
" lw=2,\n",
" color=color,\n",
" )\n",
" axis.plot(\n",
" epochs[: len(historic[\"val_\" + metric_name])],\n",
@@ -604,7 +605,8 @@
"outputs": [],
"source": [
"random_index = np.random.multinomial(\n",
" 1, y_test[np.random.randint(0, len(X_test) - 1)].ravel()\n",
" 1,\n",
" y_test[np.random.randint(0, len(X_test) - 1)].ravel(),\n",
").argmax()"
]
},

View File

@@ -144,7 +144,8 @@
],
"source": [
"salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\n",
" \"XX\", \"Mardi 19 septembre\"\n",
" \"XX\",\n",
" \"Mardi 19 septembre\",\n",
")\n",
"print(salutation)"
]
@@ -259,7 +260,7 @@
],
"source": [
"a = 2\n",
"if 5 > a:\n",
"if a < 5:\n",
" print(\"Cinq!\")\n",
"else:\n",
" print(\"a!\")\n",
@@ -2612,11 +2613,10 @@
" binaires.append(col)\n",
" else:\n",
" quantitatives.append(col)\n",
" elif len(data_set[col].dropna().unique()) == 2:\n",
" binaires.append(col)\n",
" else:\n",
" if len(data_set[col].dropna().unique()) == 2:\n",
" binaires.append(col)\n",
" else:\n",
" categorielles.append(col)\n",
" categorielles.append(col)\n",
"\n",
"print(\"Variables quantitatives :\", quantitatives)\n",
"print(\"\\nVariables catégorielles :\", categorielles)\n",
@@ -3527,7 +3527,7 @@
"source": [
"fig = px.histogram(data_set.sort_values(\"ANNEE_CTR\"), x=\"ANNEE_CTR\")\n",
"fig.update_xaxes(\n",
" type=\"category\"\n",
" type=\"category\",\n",
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
"\n",
"fig.show()"
@@ -18662,7 +18662,7 @@
" data_set,\n",
" x=\"CONTRAT_ANCIENNETE\",\n",
" category_orders={\n",
" \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"]\n",
" \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"],\n",
" },\n",
")\n",
"fig.show()"
@@ -48902,7 +48902,7 @@
" data_set,\n",
" x=\"GROUPE_KM\",\n",
" category_orders={\n",
" \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"]\n",
" \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"],\n",
" },\n",
")\n",
"fig.show()"
@@ -64021,7 +64021,7 @@
"# Ecrivez votre code ici\n",
"fig = px.histogram(data_set.sort_values(\"ZONE_RISQUE\"), x=\"ZONE_RISQUE\")\n",
"fig.update_xaxes(\n",
" type=\"category\"\n",
" type=\"category\",\n",
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
"\n",
"fig.show()"
@@ -64877,10 +64877,11 @@
"source": [
"# Ecrivez votre code ici\n",
"fig = px.histogram(\n",
" data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"), x=\"AGE_ASSURE_PRINCIPAL\"\n",
" data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"),\n",
" x=\"AGE_ASSURE_PRINCIPAL\",\n",
")\n",
"fig.update_xaxes(\n",
" type=\"category\"\n",
" type=\"category\",\n",
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
"\n",
"fig.show()"
@@ -79999,7 +80000,7 @@
"# Ecrivez votre code ici\n",
"fig = px.histogram(data_set.sort_values(\"GENRE\"), x=\"GENRE\")\n",
"fig.update_xaxes(\n",
" type=\"category\"\n",
" type=\"category\",\n",
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
"\n",
"fig.show()"
@@ -80064,7 +80065,8 @@
" [\n",
" data_h,\n",
" pd.DataFrame(\n",
" [[13, \"M\", 0]], columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"]\n",
" [[13, \"M\", 0]],\n",
" columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"],\n",
" ),\n",
" ],\n",
" ignore_index=True,\n",
@@ -82329,17 +82331,17 @@
"# ANNEE_CONSTRUCTION,VALEUR_DU_BIEN,DEUXIEME_CONDUCTEUR)\n",
"\n",
"data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(\n",
" data_retraitee[\"GROUPE_KM\"].mode()[0]\n",
" data_retraitee[\"GROUPE_KM\"].mode()[0],\n",
")\n",
"data_retraitee[\"GENRE\"] = data_retraitee[\"GENRE\"].fillna(\"M\")\n",
"data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(\n",
" data_retraitee[\"ANNEE_CONSTRUCTION\"].median()\n",
" data_retraitee[\"ANNEE_CONSTRUCTION\"].median(),\n",
")\n",
"data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(\n",
" data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0]\n",
" data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0],\n",
")\n",
"data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(\n",
" False\n",
" False,\n",
")"
]
},
@@ -83750,7 +83752,10 @@
"source": [
"# Représentation graphique\n",
"fig = px.line(\n",
" plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\"\n",
" plot_data,\n",
" x=\"AGE_ASSURE_PRINCIPAL\",\n",
" y=\"FREQ\",\n",
" title=\"Sinistralité selon l'âge\",\n",
")\n",
"fig.show()"
]
@@ -85539,7 +85544,10 @@
"\n",
"# Représentation graphique\n",
"fig = px.scatter(\n",
" plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\"\n",
" plot_data,\n",
" x=\"ENERGIE\",\n",
" y=\"FREQ\",\n",
" title=\"Sinistralité selon le carburant\",\n",
")\n",
"fig.show()"
]
@@ -86416,7 +86424,10 @@
"\n",
"# Représentation graphique\n",
"fig = px.scatter(\n",
" plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\"\n",
" plot_data,\n",
" x=\"VALEUR_DU_BIEN\",\n",
" y=\"CM\",\n",
" title=\"Coût moyen selon le prix\",\n",
")\n",
"fig.show()"
]
@@ -89044,7 +89055,10 @@
"\n",
"# Représentation graphique\n",
"fig = px.scatter(\n",
" plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\"\n",
" plot_data,\n",
" x=\"GENRE\",\n",
" y=\"CM\",\n",
" title=\"Coût moyen selon l'âge de l'assuré\",\n",
")\n",
"fig.show()"
]

View File

@@ -56,16 +56,15 @@
"import seaborn as sns\n",
"\n",
"sns.set()\n",
"import matplotlib.pyplot as plt # noqa: E402\n",
"import matplotlib.pyplot as plt\n",
"import plotly.express as px\n",
"import plotly.graph_objects as gp\n",
"from scipy.cluster.hierarchy import dendrogram, linkage # noqa: E402\n",
"from scipy.cluster.hierarchy import dendrogram, linkage\n",
"\n",
"# Statistiques\n",
"from scipy.stats import chi2_contingency # noqa: E402, F401\n",
"from scipy.stats import chi2_contingency # noqa: F401\n",
"\n",
"# Machine Learning\n",
"from sklearn.cluster import AgglomerativeClustering, KMeans # noqa: E402"
"from sklearn.cluster import AgglomerativeClustering, KMeans"
]
},
{
@@ -899,7 +898,9 @@
"source": [
"# Calcul de la partition de l'espace\n",
"hierarchical_cluster = AgglomerativeClustering(\n",
" n_clusters=3, metric=\"euclidean\", linkage=\"single\"\n",
" n_clusters=3,\n",
" metric=\"euclidean\",\n",
" linkage=\"single\",\n",
")\n",
"\n",
"labels = hierarchical_cluster.fit_predict(data)\n",
@@ -972,7 +973,9 @@
"source": [
"# Calcul de la partition de l'espace\n",
"hierarchical_cluster = AgglomerativeClustering(\n",
" n_clusters=3, metric=\"euclidean\", linkage=\"complete\"\n",
" n_clusters=3,\n",
" metric=\"euclidean\",\n",
" linkage=\"complete\",\n",
")\n",
"\n",
"labels = hierarchical_cluster.fit_predict(data)\n",
@@ -1482,7 +1485,7 @@
"\n",
"# Group by ZONE_RISQUE and aggregate the necessary columns\n",
"data = data_retraitee.groupby([\"ZONE_RISQUE\"], as_index=False).agg(\n",
" {\"NB\": \"sum\", \"CHARGE\": \"sum\", \"EXPO\": \"sum\"}\n",
" {\"NB\": \"sum\", \"CHARGE\": \"sum\", \"EXPO\": \"sum\"},\n",
")\n",
"\n",
"# Calculate derived metrics\n",
@@ -1547,7 +1550,11 @@
"source": [
"# Initialisation de l'algorithme\n",
"kmeans_FREQ = KMeans(\n",
" init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
" init=\"random\",\n",
" n_clusters=5,\n",
" n_init=1,\n",
" random_state=42,\n",
" max_iter=300,\n",
")\n",
"\n",
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
@@ -3559,7 +3566,11 @@
"source": [
"# Initialisation de l'algorithme\n",
"kmeans_FREQ_CM = KMeans(\n",
" init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
" init=\"random\",\n",
" n_clusters=5,\n",
" n_init=1,\n",
" random_state=42,\n",
" max_iter=300,\n",
")\n",
"\n",
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
@@ -4621,7 +4632,9 @@
"source": [
"# Calcul de la partition de l'espace\n",
"hierarchical_cluster = AgglomerativeClustering(\n",
" n_clusters=5, metric=\"euclidean\", linkage=\"single\"\n",
" n_clusters=5,\n",
" metric=\"euclidean\",\n",
" linkage=\"single\",\n",
")\n",
"\n",
"labels = hierarchical_cluster.fit_predict(data_x)\n",
@@ -5650,7 +5663,9 @@
"source": [
"# Calcul de la partition de l'espace\n",
"hierarchical_cluster = AgglomerativeClustering(\n",
" n_clusters=5, metric=\"euclidean\", linkage=\"single\"\n",
" n_clusters=5,\n",
" metric=\"euclidean\",\n",
" linkage=\"single\",\n",
")\n",
"\n",
"labels = hierarchical_cluster.fit_predict(data_x)\n",

View File

@@ -60,12 +60,13 @@
"\n",
"sns.set()\n",
"import plotly.express as px\n",
"import sklearn.metrics as metrics\n",
"import sklearn.preprocessing as preproc\n",
"\n",
"# Statistiques\n",
"from scipy.stats import chi2_contingency\n",
"\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn import metrics\n",
"\n",
"# Machine Learning\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
@@ -89,7 +90,7 @@
"source": [
"def cramers_V(var1, var2):\n",
" crosstab = np.array(\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
" ) # Cross table building\n",
" stat = chi2_contingency(crosstab)[\n",
" 0\n",
@@ -2027,17 +2028,15 @@
"for colu in data_set.columns:\n",
" if True in data_set[colu].isna().unique():\n",
" variables_na.append(data_set[colu])\n",
" else:\n",
" if str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_set[colu].unique()) == 2:\n",
" variables_categorielles.append(data_set[colu])\n",
" else:\n",
" variables_numeriques.append(data_set[colu])\n",
" elif str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_set[colu].unique()) == 2:\n",
" variables_categorielles.append(data_set[colu])\n",
" else:\n",
" if len(data_set[colu].unique()) == 2:\n",
" variables_categorielles.append(data_set[colu])\n",
" else:\n",
" variables_categorielles.append(data_set[colu])"
" variables_numeriques.append(data_set[colu])\n",
" elif len(data_set[colu].unique()) == 2:\n",
" variables_categorielles.append(data_set[colu])\n",
" else:\n",
" variables_categorielles.append(data_set[colu])"
]
},
{
@@ -2437,7 +2436,8 @@
" col = []\n",
" for var2 in vars_categorielles:\n",
" cramers = cramers_V(\n",
" vars_categorielles[var1], vars_categorielles[var2]\n",
" vars_categorielles[var1],\n",
" vars_categorielles[var2],\n",
" ) # V de Cramer\n",
" col.append(round(cramers, 2)) # arrondi du résultat\n",
" rows.append(col)\n",
@@ -2468,7 +2468,7 @@
" + \" et \"\n",
" + v_cramer_resultats.columns[j]\n",
" + \" sont trop dépendantes, V-CRAMER = \"\n",
" + str(v_cramer_resultats.iloc[i, j])\n",
" + str(v_cramer_resultats.iloc[i, j]),\n",
" )"
]
},
@@ -2662,7 +2662,7 @@
" + \" et \"\n",
" + correlations_num.columns[j]\n",
" + \" sont trop dépendantes, corr = \"\n",
" + str(correlations_num.iloc[i, j])\n",
" + str(correlations_num.iloc[i, j]),\n",
" )"
]
},
@@ -3312,7 +3312,7 @@
"# One hot encoding des variables catégorielles\n",
"preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
"preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
" vars_categorielles\n",
" vars_categorielles,\n",
")\n",
"\n",
"variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
@@ -3496,7 +3496,8 @@
"\n",
"vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
"vars_numeriques_scaled = pd.DataFrame(\n",
" vars_numeriques_scaled, columns=vars_numeriques.columns\n",
" vars_numeriques_scaled,\n",
" columns=vars_numeriques.columns,\n",
")\n",
"vars_numeriques_scaled.head()"
]
@@ -3525,7 +3526,9 @@
"outputs": [],
"source": [
"X_global = vars_numeriques_scaled.merge(\n",
" variables_categorielles_ohe, left_index=True, right_index=True\n",
" variables_categorielles_ohe,\n",
" left_index=True,\n",
" right_index=True,\n",
")"
]
},
@@ -3542,7 +3545,10 @@
"\n",
"# Sampling en 80% train et 20% test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=0.2, random_state=42\n",
" X,\n",
" Y,\n",
" test_size=0.2,\n",
" random_state=42,\n",
")"
]
},
@@ -3707,7 +3713,9 @@
"outputs": [],
"source": [
"X_global = vars_numeriques_scaled.merge(\n",
" variables_categorielles_ohe, left_index=True, right_index=True\n",
" variables_categorielles_ohe,\n",
" left_index=True,\n",
" right_index=True,\n",
")\n",
"\n",
"# Réorganisation des données\n",
@@ -3888,7 +3896,9 @@
"outputs": [],
"source": [
"X_global = vars_numeriques_scaled.merge(\n",
" variables_categorielles_ohe, left_index=True, right_index=True\n",
" variables_categorielles_ohe,\n",
" left_index=True,\n",
" right_index=True,\n",
")\n",
"# Réorganisation des données\n",
"X = X_global.to_numpy()\n",
@@ -4127,6 +4137,7 @@
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.model_selection import GridSearchCV, KFold"
]
@@ -4140,7 +4151,10 @@
"source": [
"# Sampling en 80% train et 20% test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=0.2, random_state=42\n",
" X,\n",
" Y,\n",
" test_size=0.2,\n",
" random_state=42,\n",
")"
]
},
@@ -4186,7 +4200,9 @@
" estimator=rf,\n",
" param_grid=param_grid,\n",
" cv=KFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" n_splits=num_folds,\n",
" shuffle=True,\n",
" random_state=42,\n",
" ), # Validation croisée avec 5 folds\n",
" scoring=\"neg_mean_squared_error\", # Métrique d'évaluation (moins c'est mieux)\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -4247,7 +4263,11 @@
"# Cross validation\n",
"# RMSE de chaque fold\n",
"rmse_scores = cross_val_score(\n",
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_root_mean_squared_error\"\n",
" best_rf,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"neg_root_mean_squared_error\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -4256,7 +4276,11 @@
"\n",
"# MSE de chaque fold\n",
"mse_scores = cross_val_score(\n",
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_squared_error\"\n",
" best_rf,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"neg_mean_squared_error\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -4266,7 +4290,11 @@
"\n",
"# MAE de chaque fold\n",
"mae_scores = cross_val_score(\n",
" best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_absolute_error\"\n",
" best_rf,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"neg_mean_absolute_error\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",

View File

@@ -58,14 +58,15 @@
"import seaborn as sns\n",
"\n",
"sns.set()\n",
"import plotly.express as px\n",
"\n",
"# Machine Learning\n",
"import sklearn.preprocessing as preproc\n",
"from imblearn.over_sampling import RandomOverSampler\n",
"\n",
"import plotly.express as px\n",
"\n",
"# Statistiques\n",
"from scipy.stats import chi2_contingency\n",
"\n",
"# Machine Learning\n",
"import sklearn.preprocessing as preproc\n",
"from sklearn import metrics\n",
"from sklearn.ensemble import GradientBoostingClassifier\n",
"from sklearn.model_selection import (\n",
@@ -93,7 +94,7 @@
"source": [
"def cramers_V(var1, var2):\n",
" crosstab = np.array(\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
" pd.crosstab(var1, var2, rownames=None, colnames=None),\n",
" ) # Cross table building\n",
" stat = chi2_contingency(crosstab)[\n",
" 0\n",
@@ -16171,7 +16172,9 @@
"source": [
"# Observation de la distribution\n",
"fig = px.histogram(\n",
" data_model, x=\"SINISTRE\", title=\"Distribution de la variable 'sinistré'\"\n",
" data_model,\n",
" x=\"SINISTRE\",\n",
" title=\"Distribution de la variable 'sinistré'\",\n",
")\n",
"fig.show()"
]
@@ -16221,17 +16224,15 @@
"for col in data_set.columns:\n",
" if True in data_model[col].isna().unique():\n",
" variables_na.append(data_model[col])\n",
" else:\n",
" if str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_numeriques.append(data_model[col])\n",
" elif str(data_model[col].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" if len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_categorielles.append(data_model[col])"
" variables_numeriques.append(data_model[col])\n",
" elif len(data_model[col].unique()) == 2:\n",
" variables_categorielles.append(data_model[col])\n",
" else:\n",
" variables_categorielles.append(data_model[col])"
]
},
{
@@ -16631,7 +16632,8 @@
" col = []\n",
" for var2 in vars_categorielles:\n",
" cramers = cramers_V(\n",
" vars_categorielles[var1], vars_categorielles[var2]\n",
" vars_categorielles[var1],\n",
" vars_categorielles[var2],\n",
" ) # V de Cramer\n",
" col.append(round(cramers, 2)) # arrondi du résultat\n",
" rows.append(col)\n",
@@ -16658,7 +16660,7 @@
" for j in range(i + 1, v_cramer_resultats.shape[0]):\n",
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
" print(\n",
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\",\n",
" )"
]
},
@@ -16856,7 +16858,7 @@
" for j in range(i + 1, nb_variables):\n",
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
" print(\n",
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\",\n",
" )"
]
},
@@ -17520,7 +17522,7 @@
"# One hot encoding des variables catégorielles\n",
"preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n",
"preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n",
" vars_categorielles\n",
" vars_categorielles,\n",
")\n",
"\n",
"variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n",
@@ -17704,7 +17706,8 @@
"\n",
"vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n",
"vars_numeriques_scaled = pd.DataFrame(\n",
" vars_numeriques_scaled, columns=vars_numeriques.columns\n",
" vars_numeriques_scaled,\n",
" columns=vars_numeriques.columns,\n",
")\n",
"vars_numeriques_scaled.head()"
]
@@ -17756,7 +17759,9 @@
"outputs": [],
"source": [
"X_global = vars_numeriques_scaled.merge(\n",
" variables_categorielles_ohe, left_index=True, right_index=True\n",
" variables_categorielles_ohe,\n",
" left_index=True,\n",
" right_index=True,\n",
")\n",
"# Réorganisation des données\n",
"X = X_global.to_numpy()\n",
@@ -17772,7 +17777,11 @@
"source": [
"# Sampling en 80% train et 20% test\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, Y, test_size=0.2, random_state=42, stratify=Y\n",
" X,\n",
" Y,\n",
" test_size=0.2,\n",
" random_state=42,\n",
" stratify=Y,\n",
")"
]
},
@@ -17824,7 +17833,9 @@
" estimator=gbc,\n",
" param_grid=param_grid,\n",
" cv=StratifiedKFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" n_splits=num_folds,\n",
" shuffle=True,\n",
" random_state=42,\n",
" ), # Validation croisée avec 5 folds\n",
" scoring=\"recall\", # Métrique d'évaluation (moins c'est mieux)\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -17884,7 +17895,11 @@
"source": [
"# Recall de chaque fold\n",
"recall_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"recall\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -17893,7 +17908,11 @@
"\n",
"# Accuracy de chaque fold\n",
"accuracy_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"accuracy\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -17903,7 +17922,11 @@
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
" best_gbc,\n",
" X_train,\n",
" y_train,\n",
" cv=num_folds,\n",
" scoring=\"precision\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52512,7 +52535,9 @@
"# Observation de la distribution sur Y_train\n",
"df = pd.DataFrame(y_train_resampled, columns=[\"SINISTRE\"])\n",
"fig = px.histogram(\n",
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
" df,\n",
" x=\"SINISTRE\",\n",
" title=\"Distribution de la variable Y_train_resampled\",\n",
")\n",
"fig.show()"
]
@@ -52565,7 +52590,9 @@
" estimator=gb,\n",
" param_grid=param_grid,\n",
" cv=StratifiedKFold(\n",
" n_splits=num_folds, shuffle=True, random_state=42\n",
" n_splits=num_folds,\n",
" shuffle=True,\n",
" random_state=42,\n",
" ), # Validation croisée stratifiée avec 5 plis\n",
" scoring=\"recall\", # Métrique d'évaluation\n",
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
@@ -52618,7 +52645,11 @@
"# Zoom sur la CV\n",
"# Recall de chaque fold\n",
"recall_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"recall\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"recall\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52627,7 +52658,11 @@
"\n",
"# Accuracy de chaque fold\n",
"accuracy_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"accuracy\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"accuracy\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -52637,7 +52672,11 @@
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train_resampled, y_train_resampled, cv=num_folds, scoring=\"precision\"\n",
" best_gbc,\n",
" X_train_resampled,\n",
" y_train_resampled,\n",
" cv=num_folds,\n",
" scoring=\"precision\",\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
@@ -53146,7 +53185,8 @@
"# Matrice de confusion\n",
"confusion_matrix = metrics.confusion_matrix(y_test, y_pred)\n",
"cm_display = metrics.ConfusionMatrixDisplay(\n",
" confusion_matrix=confusion_matrix, display_labels=[False, True]\n",
" confusion_matrix=confusion_matrix,\n",
" display_labels=[False, True],\n",
")\n",
"\n",
"cm_display.plot()"

View File

@@ -115,6 +115,7 @@
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"\n",
"from catboost import CatBoostClassifier, Pool\n",
"from sklearn.metrics import (\n",
" classification_report,\n",
@@ -163,10 +164,10 @@
"\n",
"print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
"print(\n",
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\"\n",
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\",\n",
")\n",
"print(\n",
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\"\n",
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\",\n",
")\n",
"\n",
"# Chargement depuis UCI\n",
@@ -192,7 +193,11 @@
"\n",
"try:\n",
" df = pd.read_csv(\n",
" url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
" url,\n",
" names=column_names,\n",
" sep=r\",\\s*\",\n",
" engine=\"python\",\n",
" na_values=\"?\",\n",
" )\n",
" print(\"Dataset chargé depuis UCI repository\")\n",
"except: # noqa: E722\n",
@@ -301,7 +306,7 @@
" n_samples,\n",
" p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
" ),\n",
" }\n",
" },\n",
" )\n",
"\n",
" # Création de la cible avec logique réaliste\n",
@@ -647,19 +652,25 @@
"\n",
"# Taux de revenu >50K par catégorie\n",
"df.groupby(\"education\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
" kind=\"barh\", ax=axes[0, 0], color=\"skyblue\"\n",
" kind=\"barh\",\n",
" ax=axes[0, 0],\n",
" color=\"skyblue\",\n",
")\n",
"axes[0, 0].set_title(\"Taux de revenu >50K par niveau d'éducation\")\n",
"axes[0, 0].set_xlabel(\"Taux\")\n",
"\n",
"df.groupby(\"occupation\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
" kind=\"barh\", ax=axes[0, 1], color=\"lightcoral\"\n",
" kind=\"barh\",\n",
" ax=axes[0, 1],\n",
" color=\"lightcoral\",\n",
")\n",
"axes[0, 1].set_title(\"Taux de revenu >50K par occupation\")\n",
"axes[0, 1].set_xlabel(\"Taux\")\n",
"\n",
"df.groupby(\"marital_status\")[\"income\"].mean().sort_values(ascending=False).plot(\n",
" kind=\"barh\", ax=axes[0, 2], color=\"lightgreen\"\n",
" kind=\"barh\",\n",
" ax=axes[0, 2],\n",
" color=\"lightgreen\",\n",
")\n",
"axes[0, 2].set_title(\"Taux de revenu >50K par statut marital\")\n",
"axes[0, 2].set_xlabel(\"Taux\")\n",
@@ -758,7 +769,11 @@
"\n",
"# Split train/test stratifié\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
" X, y, test_size=0.2, random_state=42, stratify=y\n",
" X,\n",
" y,\n",
" test_size=0.2,\n",
" random_state=42,\n",
" stratify=y,\n",
")"
]
},
@@ -1020,7 +1035,7 @@
"feature_names = X_train.columns\n",
"\n",
"importance_df = pd.DataFrame(\n",
" {\"feature\": feature_names, \"importance\": feature_importance}\n",
" {\"feature\": feature_names, \"importance\": feature_importance},\n",
").sort_values(\"importance\", ascending=False)\n",
"\n",
"print(importance_df)\n",
@@ -1283,7 +1298,10 @@
"y_reg = df[\"montant_defaut\"]\n",
"\n",
"X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(\n",
" X_reg, y_reg, test_size=0.2, random_state=42\n",
" X_reg,\n",
" y_reg,\n",
" test_size=0.2,\n",
" random_state=42,\n",
")\n",
"\n",
"# Pools\n",
@@ -1292,7 +1310,11 @@
"\n",
"# Modèle\n",
"model_reg = CatBoostRegressor(\n",
" iterations=500, learning_rate=0.1, depth=6, random_seed=42, verbose=100\n",
" iterations=500,\n",
" learning_rate=0.1,\n",
" depth=6,\n",
" random_seed=42,\n",
" verbose=100,\n",
")\n",
"\n",
"model_reg.fit(train_pool_reg, eval_set=test_pool_reg)\n",
@@ -1363,7 +1385,7 @@
"feature_names = X_reg.columns\n",
"\n",
"importance_df = pd.DataFrame(\n",
" {\"feature\": feature_names, \"importance\": feature_importance}\n",
" {\"feature\": feature_names, \"importance\": feature_importance},\n",
").sort_values(\"importance\", ascending=False)\n",
"\n",
"print(importance_df)\n",
@@ -1580,7 +1602,7 @@
"# Importance SHAP moyenne\n",
"shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
"shap_df = pd.DataFrame(\n",
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance},\n",
").sort_values(\"shap_importance\", ascending=False)\n",
"\n",
"print(\"\\nImportance SHAP moyenne :\")\n",

View File

@@ -299,14 +299,15 @@
"\n",
" # With probability 1ε: exploit (choose an arm with the highest estimated value).\n",
" max_val = np.max(\n",
" Q\n",
" Q,\n",
" ) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(\n",
" Q, max_val\n",
" Q,\n",
" max_val,\n",
" ) # (see Hint) Find all positions in Q where the value equals max_val.\n",
"\n",
" return np.random.choice(\n",
" candidates\n",
" candidates,\n",
" ) # pick one of those best arms uniformly at random."
]
},
@@ -614,13 +615,16 @@
"\n",
" # record what happens at each time step\n",
" rewards = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
"\n",
" # -------------------------------------------------\n",
@@ -714,7 +718,11 @@
"outputs": [],
"source": [
"def run_many(\n",
" runs: int, T: int, epsilon: float, k: int, update_style: str = \"incremental\"\n",
" runs: int,\n",
" T: int,\n",
" epsilon: float,\n",
" k: int,\n",
" update_style: str = \"incremental\",\n",
"):\n",
" \"\"\"Run multiple independent experiments of T time steps using ε-greedy on a k-armed Bernoulli bandit.\n",
"\n",
@@ -844,7 +852,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_reward\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average reward\")\n",
@@ -856,7 +866,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_opt\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"P(select optimal arm)\")\n",
@@ -881,7 +893,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_cumreg\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average cumulative regret\")\n",
@@ -929,7 +943,7 @@
"# Calculate final performance metrics for each epsilon\n",
"print(\"### Performance Summary for Different ε Values\\n\")\n",
"print(\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\"\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\",\n",
")\n",
"print(\"-\" * 80)\n",
"\n",
@@ -940,7 +954,7 @@
" final_cum_regret = results[eps][\"avg_cumreg\"][-1]\n",
"\n",
" print(\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\"\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\",\n",
" )\n",
"\n",
"# Find the best epsilon based on multiple criteria\n",
@@ -1029,10 +1043,10 @@
"t_incr = time_runner(\"incremental\")\n",
"\n",
"print(\n",
" f\"Naive sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_naive:.3f} s\"\n",
" f\"Naive sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_naive:.3f} s\",\n",
")\n",
"print(\n",
" f\"Incremental sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_incr:.3f} s\"\n",
" f\"Incremental sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_incr:.3f} s\",\n",
")\n",
"print(f\"Speedup (naive / incremental): {t_naive / t_incr:.2f}×\")"
]

View File

@@ -40,11 +40,12 @@
"import numpy as np\n",
"\n",
"np.set_printoptions(\n",
" precision=3, suppress=True\n",
" precision=3,\n",
" suppress=True,\n",
") # (not mandatory) This line is for limiting floats to 3 decimal places, avoiding scientific notation (like 1.23e-04) for small numbers.\n",
"\n",
"# For reproducibility\n",
"rng = np.random.default_rng(seed=42) # This line creates a random number generator.\n"
"rng = np.random.default_rng(seed=42) # This line creates a random number generator."
]
},
{
@@ -110,7 +111,7 @@
" \"#..#..G\",\n",
" \"#..X..#\",\n",
" \"#######\",\n",
"]\n"
"]"
]
},
{
@@ -142,7 +143,7 @@
"n_rows = len(maze_str)\n",
"print(n_rows)\n",
"n_cols = len(maze_str[0])\n",
"print(n_cols)\n"
"print(n_cols)"
]
},
{
@@ -169,7 +170,7 @@
"source": [
"print(\"Maze:\")\n",
"for row in maze_str:\n",
" print(row)\n"
" print(row)"
]
},
{
@@ -207,7 +208,7 @@
" \"S\",\n",
" \"G\",\n",
" \"X\",\n",
"} # The vector Free represents cells that the agent is allowed to move into.\n"
"} # The vector Free represents cells that the agent is allowed to move into."
]
},
{
@@ -277,7 +278,7 @@
"print(\"Number of states (non-wall cells):\", n_states)\n",
"print(\"Start state:\", start_state, \"at\", state_to_pos[start_state])\n",
"print(\"Goal states:\", goal_states, \"at\", state_to_pos[goal_states[0]])\n",
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])\n"
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])"
]
},
{
@@ -304,7 +305,7 @@
],
"source": [
"my_dict = {\"key1\": \"value1\", \"key2\": \"value2\"}\n",
"print(my_dict[\"key2\"])\n"
"print(my_dict[\"key2\"])"
]
},
{
@@ -383,7 +384,7 @@
"def plot_maze_with_states():\n",
" \"\"\"Plot the maze with state indices.\"\"\"\n",
" grid = np.ones(\n",
" (n_rows, n_cols)\n",
" (n_rows, n_cols),\n",
" ) # Start with a matrix of ones. Here 1 means “free cell”\n",
" for i in range(n_rows):\n",
" for j in range(n_cols):\n",
@@ -571,7 +572,7 @@
" # If the next cell is a wall, the robot stays in place.\n",
" return i, j\n",
"\n",
" return candidate_i, candidate_j # Otherwise, return the new position\n"
" return candidate_i, candidate_j # Otherwise, return the new position"
]
},
{
@@ -601,7 +602,7 @@
"outputs": [],
"source": [
"gamma = 0.95\n",
"p_error = 0.1 # probability of the error to a random other direction\n"
"p_error = 0.1 # probability of the error to a random other direction"
]
},
{
@@ -662,7 +663,7 @@
"# Set rewards for each state\n",
"step_penalty = -0.01\n",
"goal_reward = 1.0\n",
"trap_reward = -1.0\n"
"trap_reward = -1.0"
]
},
{
@@ -710,7 +711,7 @@
" elif s in trap_states:\n",
" R[s] = trap_reward\n",
" else:\n",
" R[s] = step_penalty\n"
" R[s] = step_penalty"
]
},
{
@@ -735,7 +736,7 @@
"\n",
"def is_terminal(s: int) -> bool:\n",
" \"\"\"Check if a state is terminal (goal or trap).\"\"\"\n",
" return s in terminal_states\n"
" return s in terminal_states"
]
},
{
@@ -797,9 +798,9 @@
" error_i, error_j = move_deterministic(i, j, a2)\n",
" s_error = pos_to_state[(error_i, error_j)] # get its state index s_error\n",
" P[a, s, s_error] += p_error / len(\n",
" other_actions\n",
" other_actions,\n",
" ) # add p_error / 3 to P[a, s, s_error]\n",
"# So for each (s,a), probabilities over all s_next sum to 1.\n"
"# So for each (s,a), probabilities over all s_next sum to 1."
]
},
{
@@ -843,7 +844,7 @@
" # If everything is correct, they should be very close to 1.\n",
"\n",
" probs = P[a].sum(axis=1)\n",
" print(f\"Action {action_names[a]}:\", probs)\n"
" print(f\"Action {action_names[a]}:\", probs)"
]
},
{
@@ -995,7 +996,7 @@
"\n",
" for _it in range(max_iter): # Main iterative loop\n",
" V_new = np.zeros_like(\n",
" V\n",
" V,\n",
" ) # Create a new value vector and we will compute an updated value for each state.\n",
"\n",
" # Now we update each state using the Bellman expectation equation\n",
@@ -1004,7 +1005,7 @@
" V_new[s] = R[s] + gamma * np.sum(P[a, s, :] * V)\n",
"\n",
" delta = np.max(\n",
" np.abs(V_new - V)\n",
" np.abs(V_new - V),\n",
" ) # This measures how much the value function changed in this iteration:\n",
" # If delta is small, the values start to converge; otherwise, we need to keep iterating.\n",
" V = V_new # Update V, i.e. Set the new values for the next iteration.\n",
@@ -1012,7 +1013,7 @@
" if delta < theta: # Check convergence: When changes are tiny, we stop.\n",
" break\n",
"\n",
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set.\n"
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set."
]
},
{
@@ -1082,7 +1083,7 @@
"source": [
"V_random = policy_evaluation(policy=random_policy, P=P, R=R, gamma=gamma)\n",
"print(\"Value function under random policy:\")\n",
"print(V_random)\n"
"print(V_random)"
]
},
{
@@ -1127,7 +1128,8 @@
"def plot_values(V: np.ndarray, title=\"Value function\") -> None:\n",
" \"\"\"Plot the value function V on the maze as a heatmap.\"\"\"\n",
" grid_values = np.full(\n",
" (n_rows, n_cols), np.nan\n",
" (n_rows, n_cols),\n",
" np.nan,\n",
" ) # Initializes a grid the same size as the maze. Every cell starts as NaN.\n",
" for (\n",
" s,\n",
@@ -1152,7 +1154,13 @@
"\n",
" for s, (i, j) in state_to_pos.items():\n",
" ax.text(\n",
" j, i, f\"{V[s]:.2f}\", ha=\"center\", va=\"center\", color=\"white\", fontsize=9\n",
" j,\n",
" i,\n",
" f\"{V[s]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"center\",\n",
" color=\"white\",\n",
" fontsize=9,\n",
" )\n",
"\n",
" # Remove axis ticks and set title\n",
@@ -1162,7 +1170,7 @@
" plt.show()\n",
"\n",
"\n",
"plot_values(V_random, title=\"Value function: random policy\")\n"
"plot_values(V_random, title=\"Value function: random policy\")"
]
},
{
@@ -1247,7 +1255,7 @@
" ax.set_yticklabels([])\n",
" ax.grid(True)\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -1276,7 +1284,7 @@
}
],
"source": [
"plot_policy(policy=random_policy, title=\"Policy\")\n"
"plot_policy(policy=random_policy, title=\"Policy\")"
]
},
{

View File

@@ -23,11 +23,12 @@
"import numpy as np\n",
"\n",
"np.set_printoptions(\n",
" precision=3, suppress=True\n",
" precision=3,\n",
" suppress=True,\n",
") # (not mandatory) This line is for limiting floats to 3 decimal places, avoiding scientific notation (like 1.23e-04) for small numbers.\n",
"\n",
"# For reproducibility\n",
"rng = np.random.default_rng(seed=42) # This line creates a random number generator.\n"
"rng = np.random.default_rng(seed=42) # This line creates a random number generator."
]
},
{
@@ -102,7 +103,7 @@
" \"S\",\n",
" \"G\",\n",
" \"X\",\n",
"} # The vector Free represents cells that the agent is allowed to move into.\n"
"} # The vector Free represents cells that the agent is allowed to move into."
]
},
{
@@ -164,7 +165,7 @@
"print(\"Number of states (non-wall cells):\", n_states)\n",
"print(\"Start state:\", start_state, \"at\", state_to_pos[start_state])\n",
"print(\"Goal states:\", goal_states, \"at\", state_to_pos[goal_states[0]])\n",
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])\n"
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])"
]
},
{
@@ -188,7 +189,7 @@
"def plot_maze_with_states():\n",
" \"\"\"Plot the maze with state indices.\"\"\"\n",
" grid = np.ones(\n",
" (n_rows, n_cols)\n",
" (n_rows, n_cols),\n",
" ) # Start with a matrix of ones. Here 1 means “free cell”\n",
" for i in range(n_rows):\n",
" for j in range(n_cols):\n",
@@ -316,7 +317,7 @@
" # If the next cell is a wall, the robot stays in place.\n",
" return i, j\n",
"\n",
" return candidate_i, candidate_j # Otherwise, return the new position\n"
" return candidate_i, candidate_j # Otherwise, return the new position"
]
},
{
@@ -335,7 +336,7 @@
"outputs": [],
"source": [
"gamma = 0.95\n",
"p_error = 0.1 # probability of the error to a random other direction\n"
"p_error = 0.1 # probability of the error to a random other direction"
]
},
{
@@ -360,7 +361,7 @@
"# Set rewards for each state\n",
"step_penalty = -0.01\n",
"goal_reward = 1.0\n",
"trap_reward = -1.0\n"
"trap_reward = -1.0"
]
},
{
@@ -376,7 +377,7 @@
" elif s in trap_states:\n",
" R[s] = trap_reward\n",
" else:\n",
" R[s] = step_penalty\n"
" R[s] = step_penalty"
]
},
{
@@ -391,7 +392,7 @@
"\n",
"def is_terminal(s: int) -> bool:\n",
" \"\"\"Check if a state is terminal (goal or trap).\"\"\"\n",
" return s in terminal_states\n"
" return s in terminal_states"
]
},
{
@@ -437,9 +438,9 @@
" error_i, error_j = move_deterministic(i, j, a2)\n",
" s_error = pos_to_state[(error_i, error_j)] # get its state index s_error\n",
" P[a, s, s_error] += p_error / len(\n",
" other_actions\n",
" other_actions,\n",
" ) # add p_error / 3 to P[a, s, s_error]\n",
"# So for each (s,a), probabilities over all s_next sum to 1.\n"
"# So for each (s,a), probabilities over all s_next sum to 1."
]
},
{
@@ -476,7 +477,7 @@
" # If everything is correct, they should be very close to 1.\n",
"\n",
" probs = P[a].sum(axis=1)\n",
" print(f\"Action {action_names[a]}:\", probs)\n"
" print(f\"Action {action_names[a]}:\", probs)"
]
},
{
@@ -520,7 +521,7 @@
"\n",
" for _it in range(max_iter): # Main iterative loop\n",
" V_new = np.zeros_like(\n",
" V\n",
" V,\n",
" ) # Create a new value vector and we will compute an updated value for each state.\n",
"\n",
" # Now we update each state using the Bellman expectation equation\n",
@@ -529,7 +530,7 @@
" V_new[s] = R[s] + gamma * np.sum(P[a, s, :] * V)\n",
"\n",
" delta = np.max(\n",
" np.abs(V_new - V)\n",
" np.abs(V_new - V),\n",
" ) # This measures how much the value function changed in this iteration:\n",
" # If delta is small, the values start to converge; otherwise, we need to keep iterating.\n",
" V = V_new # Update V, i.e. Set the new values for the next iteration.\n",
@@ -537,7 +538,7 @@
" if delta < theta: # Check convergence: When changes are tiny, we stop.\n",
" break\n",
"\n",
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set.\n"
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set."
]
},
{
@@ -550,7 +551,8 @@
"def plot_values(V: np.ndarray, title=\"Value function\") -> None:\n",
" \"\"\"Plot the value function V on the maze as a heatmap.\"\"\"\n",
" grid_values = np.full(\n",
" (n_rows, n_cols), np.nan\n",
" (n_rows, n_cols),\n",
" np.nan,\n",
" ) # Initializes a grid the same size as the maze. Every cell starts as NaN.\n",
" for (\n",
" s,\n",
@@ -575,14 +577,20 @@
"\n",
" for s, (i, j) in state_to_pos.items():\n",
" ax.text(\n",
" j, i, f\"{V[s]:.2f}\", ha=\"center\", va=\"center\", color=\"white\", fontsize=9\n",
" j,\n",
" i,\n",
" f\"{V[s]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"center\",\n",
" color=\"white\",\n",
" fontsize=9,\n",
" )\n",
"\n",
" # Remove axis ticks and set title\n",
" ax.set_xticks([])\n",
" ax.set_yticks([])\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -659,7 +667,7 @@
" ax.set_yticklabels([])\n",
" ax.grid(True)\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -716,7 +724,7 @@
"source": [
"V_random = policy_evaluation(policy=random_policy, P=P, R=R, gamma=gamma)\n",
"print(\"Value function under random policy:\")\n",
"print(V_random)\n"
"print(V_random)"
]
},
{
@@ -748,7 +756,7 @@
],
"source": [
"plot_values(V_random, title=\"Value function: random policy\")\n",
"plot_policy(policy=random_policy, title=\"Random Policy\")\n"
"plot_policy(policy=random_policy, title=\"Random Policy\")"
]
},
{
@@ -847,7 +855,7 @@
"V_my_policy = policy_evaluation(policy=my_policy, P=P, R=R, gamma=gamma)\n",
"\n",
"plot_values(V=V_my_policy, title=\"Value function: my policy\")\n",
"plot_policy(policy=my_policy, title=\"My policy\")\n"
"plot_policy(policy=my_policy, title=\"My policy\")"
]
},
{

View File

@@ -9,6 +9,7 @@
"source": [
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
"from sklearn.cluster import KMeans"
]
},
@@ -77,11 +78,11 @@
" return eigenvalues[idx_L], eigenvectors[:, idx_L]\n",
"\n",
"\n",
"def compute_W_matrix(sigma: int, X: np.ndarray) -> np.ndarray:\n",
"def compute_W_matrix(sigma: float, X: np.ndarray) -> np.ndarray:\n",
" \"\"\"Fill the similarity matrix W.\n",
"\n",
" Args:\n",
" sigma (int): Parameter for the Gaussian kernel.\n",
" sigma (float): Parameter for the Gaussian kernel.\n",
" X (np.ndarray): Input data.\n",
"\n",
" Returns:\n",
@@ -94,7 +95,7 @@
" W[i, j] = (\n",
" 0 if i == j else np.exp(-(np.abs(X[i] - X[j]) ** 2) / (2 * sigma**2))\n",
" )\n",
" return W\n"
" return W"
]
},
{
@@ -104,11 +105,11 @@
"metadata": {},
"outputs": [],
"source": [
"def create_X(sigma: int, n: int, m: int) -> np.ndarray:\n",
"def create_X(sigma: float, n: int, m: int) -> np.ndarray:\n",
" \"\"\"Create a dataset with 4 Gaussian clusters.\n",
"\n",
" Args:\n",
" sigma (int): Standard deviation of the clusters.\n",
" sigma (float): Standard deviation of the clusters.\n",
" n (int): Total number of data points.\n",
" m (int): Number of clusters.\n",
"\n",
@@ -125,11 +126,11 @@
" return np.concatenate([norm_1, norm_2, norm_3, norm_4])\n",
"\n",
"\n",
"def plot_eigenvalues(sigma: int, n: int, m: int, k: int) -> None:\n",
"def plot_eigenvalues(sigma: float, n: int, m: int, k: int) -> None:\n",
" \"\"\"Plot the eigenvalues of the Laplacian for different sigma values.\n",
"\n",
" Args:\n",
" sigma (int): Standard deviation of the clusters.\n",
" sigma (float): Standard deviation of the clusters.\n",
" n (int): Total number of data points.\n",
" m (int): Number of clusters.\n",
" k (int): Number of eigenvalues to compute.\n",
@@ -153,7 +154,7 @@
" plt.xlabel(\"Index\")\n",
" plt.ylabel(\"Eigenvalue\")\n",
"\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -236,7 +237,7 @@
"n = 200\n",
"m = 4\n",
"for sigma in [0.1, 1 / 4, 0.5, 1]:\n",
" plot_eigenvalues(sigma, n=n, m=m, k=k)\n"
" plot_eigenvalues(sigma, n=n, m=m, k=k)"
]
},
{
@@ -246,14 +247,14 @@
"metadata": {},
"outputs": [],
"source": [
"def spectral_clustering(sigma, n: int, m: int, k: int) -> np.ndarray:\n",
"def spectral_clustering(sigma: float, n: int, m: int, k: int) -> np.ndarray:\n",
" \"\"\"Perform spectral clustering on the data.\n",
"\n",
" Args:\n",
" sigma: The sigma value for the similarity matrix.\n",
" n: Number of data points.\n",
" m: Number of clusters.\n",
" k: Number of eigenvectors to use.\n",
" sigma (float): The sigma value for the similarity matrix.\n",
" n (int): Number of data points.\n",
" m (int): Number of clusters.\n",
" k (int): Number of eigenvectors to use.\n",
"\n",
" Returns:\n",
" X: The data points.\n",
@@ -269,7 +270,7 @@
"\n",
" kmeans = KMeans(n_clusters=k)\n",
"\n",
" return X, kmeans.fit_predict(U_normalized)\n"
" return X, kmeans.fit_predict(U_normalized)"
]
},
{
@@ -312,7 +313,7 @@
"\n",
"plt.scatter(X, np.zeros_like(X), c=clusters, cmap=\"magma\")\n",
"plt.title(\"Spectral Clustering Results\")\n",
"plt.xlabel(\"Data Points\")\n"
"plt.xlabel(\"Data Points\")"
]
},
{