Mirror of https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-14 15:54:13 +01:00
Refactor code formatting and improve readability in Jupyter notebooks for TP_4 and TP_5
- Adjusted indentation and line breaks for better clarity in function definitions and import statements.
- Standardized string quotes for consistency across the codebase.
- Enhanced readability of DataFrame creation and manipulation by breaking long lines into multiple lines.
- Cleaned up print statements and comments for improved understanding.
- Ensured consistent use of whitespace around operators and after commas.
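The following before/after sketch condenses the kind of edits visible in the diff below into one hypothetical cell (the snippet is illustrative, not copied from a single notebook; the commit does not name the formatter, but the style matches Black/Ruff defaults):

    import seaborn as sns
    from sklearn.model_selection import train_test_split
    from tensorflow import keras

    # before: single quotes, redundant parentheses, one over-long call
    sns.set(style='whitegrid')
    (X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.mnist.load_data())
    X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size=0.8)

    # after: double quotes, redundant parentheses dropped, long call split across lines
    sns.set(style="whitegrid")
    (X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()
    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train_full, y_train_full, train_size=0.8
    )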
@@ -715,10 +715,7 @@
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
" approx_error = np.abs(I_approx - I_exact)\n",
" approx_errors.append(approx_error)\n",
" print(\n",
" f\"{N:5d} | \"\n",
" + \" \".join(f\"{e:.3f} \" for e in approx_errors)\n",
" )"
" print(f\"{N:5d} | \" + \" \".join(f\"{e:.3f} \" for e in approx_errors))"
]
},
{
@@ -773,10 +770,7 @@
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
" approx_error = np.abs(I_approx - I_exact)\n",
" approx_errors.append(approx_error)\n",
" print(\n",
" f\"{N:5d} | \"\n",
" + \" \".join(f\"{e:.3f} \" for e in approx_errors)\n",
" )"
" print(f\"{N:5d} | \" + \" \".join(f\"{e:.3f} \" for e in approx_errors))"
]
},
{
@@ -333,6 +333,8 @@
"source": [
"def f(x):\n",
" return 1 / (1 + x**2)\n",
"\n",
"\n",
"a, b = -5, 5\n",
"xx = np.linspace(a, b, 200)\n",
"\n",
@@ -375,6 +377,8 @@
"source": [
"def f(x):\n",
" return 1 / (1 + x**2)\n",
"\n",
"\n",
"a, b = -5, 5\n",
"xx = np.linspace(a, b, 200)\n",
"\n",
@@ -65,12 +65,20 @@
"source": [
"def f1(x):\n",
" return np.exp(x) - 1 - x\n",
"\n",
"\n",
"def f2(x):\n",
" return x - np.sin(x)\n",
"\n",
"\n",
"def f3(x):\n",
" return x + np.sin(x)\n",
"\n",
"\n",
"def f4(x):\n",
" return x + np.cos(x) - 1\n",
"\n",
"\n",
"def f5(x):\n",
" return x - np.cos(x) + 1"
]
@@ -159,11 +159,15 @@
"\n",
" for n in range(N - 1):\n",
" p1 = f(vt[n], yn[:, n])\n",
"\n",
" def F1(p2):\n",
" return f(vt[n] + h / 3, yn[:, n] + h / 6 * (p1 + p2)) - p2\n",
"\n",
" p2 = newton(F1, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
"\n",
" def F2(yn1):\n",
" return yn[:, n] + h / 4 * (3 * p2 + f(vt[n + 1], yn1)) - yn1\n",
"\n",
" yn[:, n + 1] = newton(F2, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
" return yn"
]
@@ -66,6 +66,8 @@
"\n",
"def f(x):\n",
" return np.tanh(x)\n",
"\n",
"\n",
"aL, aR = -20, 3\n",
"print(dichotomy(f, aL, aR))"
]
@@ -135,9 +137,15 @@
"\n",
"def f(x):\n",
" return np.log(np.exp(x) + np.exp(-x))\n",
"\n",
"\n",
"x0 = 1.8\n",
"\n",
"\n",
"def df(x):\n",
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
"\n",
"\n",
"print(Newton(f, df, x0))"
]
},
@@ -188,6 +196,8 @@
"\n",
"def f(x):\n",
" return np.log(np.exp(x) + np.exp(-x))\n",
"\n",
"\n",
"xx = [(1, 1.9), (1, 2.3), (1, 2.4)]\n",
"\n",
"for x0, x1 in xx:\n",
@@ -265,8 +275,12 @@
"\n",
"def f(x):\n",
" return np.log(np.exp(x) + np.exp(-x))\n",
"\n",
"\n",
"def df(x):\n",
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
"\n",
"\n",
"print(DichotomyNewton(f, df, -20, 3))"
]
},
@@ -1412,7 +1412,10 @@
"f, axarr = plt.subplots(2, 3, sharex=\"col\", sharey=\"row\", figsize=(15, 12))\n",
"\n",
"for idx, clf, tt in zip(\n",
" product([0, 1, 2], [0, 1, 2]), KNNs, [f\"KNN (k={k})\" for k in nb_neighbors], strict=False\n",
" product([0, 1, 2], [0, 1, 2]),\n",
" KNNs,\n",
" [f\"KNN (k={k})\" for k in nb_neighbors],\n",
" strict=False,\n",
"):\n",
" Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
" Z = Z.reshape(xx.shape)\n",
@@ -2545,7 +2545,9 @@
"\n",
"MSEs = []\n",
"for name, estimator in zip(\n",
" [\"LassoCV\", \"LassoBIC\", \"RidgeCV\", \"OLS\"], [lassoCV, lassoBIC, ridgeCV, linReg], strict=False\n",
" [\"LassoCV\", \"LassoBIC\", \"RidgeCV\", \"OLS\"],\n",
" [lassoCV, lassoBIC, ridgeCV, linReg],\n",
" strict=False,\n",
"):\n",
" y_pred = estimator.predict(Xtest)\n",
" MSE = mean_squared_error(Ytest, y_pred)\n",
@@ -24,20 +24,29 @@
|
||||
"%matplotlib inline\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"sns.set(style='whitegrid')\n",
|
||||
"\n",
|
||||
"sns.set(style=\"whitegrid\")\n",
|
||||
"\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from tensorflow import keras\n",
|
||||
"\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.mnist.load_data())\n",
|
||||
"X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size=0.8)\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
|
||||
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
|
||||
" X_train_full, y_train_full, train_size=0.8\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)\n",
|
||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)\n",
|
||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)"
|
||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28\n",
|
||||
")\n",
|
||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28\n",
|
||||
")\n",
|
||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -69,11 +78,15 @@
|
||||
" [\n",
|
||||
" keras.layers.Input(shape=[28, 28]),\n",
|
||||
" keras.layers.Flatten(),\n",
|
||||
" keras.layers.Dense(256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)),\n",
|
||||
" keras.layers.Dense(128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)),\n",
|
||||
" keras.layers.Dense(\n",
|
||||
" 256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dense(\n",
|
||||
" 128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -150,8 +163,16 @@
|
||||
" [\n",
|
||||
" keras.layers.Input(shape=[28, 28]),\n",
|
||||
" keras.layers.Flatten(),\n",
|
||||
" keras.layers.Dense(256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(lambda_l2)),\n",
|
||||
" keras.layers.Dense(128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(lambda_l2)),\n",
|
||||
" keras.layers.Dense(\n",
|
||||
" 256,\n",
|
||||
" activation=\"relu\",\n",
|
||||
" kernel_regularizer=keras.regularizers.l2(lambda_l2),\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dense(\n",
|
||||
" 128,\n",
|
||||
" activation=\"relu\",\n",
|
||||
" kernel_regularizer=keras.regularizers.l2(lambda_l2),\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
" )\n",
|
||||
@@ -218,20 +239,28 @@
|
||||
" lambda_l2 = result[\"lambda_l2\"]\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 1)\n",
|
||||
" plt.plot(history_df[\"val_loss\"], label=f\"LR={learning_rate}, L2={lambda_l2}\", color=colors[_])\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"val_loss\"],\n",
|
||||
" label=f\"LR={learning_rate}, L2={lambda_l2}\",\n",
|
||||
" color=colors[_],\n",
|
||||
" )\n",
|
||||
" plt.plot(history_df[\"loss\"], linestyle=\"--\", color=colors[_])\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Loss\")\n",
|
||||
" plt.legend()\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 2)\n",
|
||||
" plt.plot(history_df[\"val_accuracy\"], label=f\"LR={learning_rate}, L2={lambda_l2}\", color=colors[_])\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"val_accuracy\"],\n",
|
||||
" label=f\"LR={learning_rate}, L2={lambda_l2}\",\n",
|
||||
" color=colors[_],\n",
|
||||
" )\n",
|
||||
" plt.plot(history_df[\"accuracy\"], linestyle=\"--\", color=colors[_])\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Accuracy\")\n",
|
||||
" plt.legend()\n",
|
||||
"\n",
|
||||
" plt.show()\n"
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -26,11 +26,11 @@
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"\n",
|
||||
"sns.set(style='whitegrid')\n",
|
||||
"sns.set(style=\"whitegrid\")\n",
|
||||
"\n",
|
||||
"from tensorflow import keras\n",
|
||||
"\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.mnist.load_data())"
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -61,7 +61,7 @@
|
||||
" X_train_full, y_train_full, test_size=0.2, random_state=42\n",
|
||||
")\n",
|
||||
"print(X_train.shape, y_train.shape)\n",
|
||||
"print(X_valid.shape, y_valid.shape)\n"
|
||||
"print(X_valid.shape, y_valid.shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -174,13 +174,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = keras.models.Sequential([\n",
|
||||
"model = keras.models.Sequential(\n",
|
||||
" [\n",
|
||||
" keras.layers.Input(shape=[28, 28]),\n",
|
||||
" keras.layers.Flatten(),\n",
|
||||
" keras.layers.Dense(256, activation=\"relu\"),\n",
|
||||
" keras.layers.Dense(128, activation=\"relu\"),\n",
|
||||
" keras.layers.Dense(10, activation=\"softmax\")\n",
|
||||
"])"
|
||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -332,7 +334,7 @@
|
||||
" loss=\"sparse_categorical_crossentropy\",\n",
|
||||
" optimizer=keras.optimizers.SGD(learning_rate=1e-3),\n",
|
||||
" metrics=[\"accuracy\"],\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -379,7 +381,7 @@
|
||||
" epochs=epochs,\n",
|
||||
" batch_size=batch_size,\n",
|
||||
" validation_data=(X_valid, y_valid),\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -435,17 +437,17 @@
|
||||
" plt.figure(figsize=(12, 4))\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 1)\n",
|
||||
" plt.plot(history_df['loss'], label='Training Loss')\n",
|
||||
" plt.plot(history_df[\"loss\"], label=\"Training Loss\")\n",
|
||||
" plt.plot(history_df[\"val_loss\"], label=\"Validation Loss\")\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Loss\")\n",
|
||||
" plt.legend()\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 2)\n",
|
||||
" plt.plot(history_df['accuracy'], label='Accuracy')\n",
|
||||
" plt.plot(history_df[\"accuracy\"], label=\"Accuracy\")\n",
|
||||
" plt.plot(history_df[\"val_accuracy\"], label=\"Validation Accuracy\")\n",
|
||||
" plt.xlabel('Epochs')\n",
|
||||
" plt.ylabel('Accuracy')\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Accuracy\")\n",
|
||||
" plt.legend()"
|
||||
]
|
||||
},
|
||||
@@ -645,7 +647,7 @@
|
||||
" \"n_epochs\": n_epochs,\n",
|
||||
" \"history\": pd.DataFrame(history.history),\n",
|
||||
" }\n",
|
||||
" results.append(result)\n"
|
||||
" results.append(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -669,36 +671,27 @@
|
||||
" learning_rate = result[\"learning_rate\"]\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 1)\n",
|
||||
" plt.plot(history_df[\"val_loss\"], linestyle=\"--\", color=colors[_])\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"val_loss\"],\n",
|
||||
" linestyle=\"--\",\n",
|
||||
" color=colors[_]\n",
|
||||
" )\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"loss\"], label=f\"LR={learning_rate}\", alpha=0.5,\n",
|
||||
" color=colors[_]\n",
|
||||
" history_df[\"loss\"], label=f\"LR={learning_rate}\", alpha=0.5, color=colors[_]\n",
|
||||
" )\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Loss\")\n",
|
||||
" plt.legend()\n",
|
||||
"\n",
|
||||
" plt.subplot(1, 2, 2)\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"val_accuracy\"],\n",
|
||||
" linestyle=\"--\",\n",
|
||||
" color=colors[_]\n",
|
||||
" )\n",
|
||||
" plt.plot(history_df[\"val_accuracy\"], linestyle=\"--\", color=colors[_])\n",
|
||||
" plt.plot(\n",
|
||||
" history_df[\"accuracy\"],\n",
|
||||
" label=f\"LR={learning_rate}\",\n",
|
||||
" alpha=0.5,\n",
|
||||
" color=colors[_]\n",
|
||||
" color=colors[_],\n",
|
||||
" )\n",
|
||||
" plt.xlabel(\"Epochs\")\n",
|
||||
" plt.ylabel(\"Accuracy\")\n",
|
||||
" plt.legend()\n",
|
||||
"\n",
|
||||
" plt.show()\n"
|
||||
" plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -767,7 +760,7 @@
|
||||
" \"n_epochs\": n_epochs,\n",
|
||||
" \"history\": pd.DataFrame(history.history),\n",
|
||||
" }\n",
|
||||
" results.append(result)\n"
|
||||
" results.append(result)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -24,20 +24,30 @@
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"\n",
|
||||
"sns.set(style='whitegrid')\n",
|
||||
"sns.set(style=\"whitegrid\")\n",
|
||||
"\n",
|
||||
"import tensorflow as tf\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from tensorflow import keras\n",
|
||||
"\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.fashion_mnist.load_data())\n",
|
||||
"X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size=0.8)\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (\n",
|
||||
" keras.datasets.fashion_mnist.load_data()\n",
|
||||
")\n",
|
||||
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
|
||||
" X_train_full, y_train_full, train_size=0.8\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"scaler = StandardScaler()\n",
|
||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)\n",
|
||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)\n",
|
||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)"
|
||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28, 1\n",
|
||||
")\n",
|
||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28, 1\n",
|
||||
")\n",
|
||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||
" -1, 28, 28, 1\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -26,11 +26,13 @@
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"import seaborn as sns\n",
|
||||
"\n",
|
||||
"sns.set(style='whitegrid')\n",
|
||||
"sns.set(style=\"whitegrid\")\n",
|
||||
"\n",
|
||||
"from tensorflow import keras\n",
|
||||
"\n",
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.fashion_mnist.load_data())"
|
||||
"(X_train_full, y_train_full), (X_test, y_test) = (\n",
|
||||
" keras.datasets.fashion_mnist.load_data()\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -186,7 +188,7 @@
|
||||
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
|
||||
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -627,10 +629,7 @@
|
||||
" batch_size=batch_size,\n",
|
||||
" validation_data=(X_valid, y_valid),\n",
|
||||
" )\n",
|
||||
" training_curves.append({\n",
|
||||
" 'history': history,\n",
|
||||
" 'normalization': normalized\n",
|
||||
" })"
|
||||
" training_curves.append({\"history\": history, \"normalization\": normalized})"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -653,7 +652,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def agregate_result(results: list, normalized: bool, metric_name: str = 'accuracy') -> pd.DataFrame:\n",
|
||||
"def agregate_result(\n",
|
||||
" results: list, normalized: bool, metric_name: str = \"accuracy\"\n",
|
||||
") -> pd.DataFrame:\n",
|
||||
" train_curves = []\n",
|
||||
" val_curves = []\n",
|
||||
"\n",
|
||||
@@ -663,7 +664,7 @@
|
||||
" train_curves.append(hist_obj.history[metric_name])\n",
|
||||
" val_curves.append(hist_obj.history[f\"val_{metric_name}\"])\n",
|
||||
"\n",
|
||||
" return np.array(train_curves).flatten(), np.array(val_curves).flatten()\n"
|
||||
" return np.array(train_curves).flatten(), np.array(val_curves).flatten()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -697,7 +698,9 @@
|
||||
"for idx, metric in enumerate(metrics):\n",
|
||||
" ax = axs[idx]\n",
|
||||
" for normalized in [True, False]:\n",
|
||||
" train, val = agregate_result(training_curves, normalized=normalized, metric_name=metric)\n",
|
||||
" train, val = agregate_result(\n",
|
||||
" training_curves, normalized=normalized, metric_name=metric\n",
|
||||
" )\n",
|
||||
" train_runs = train.reshape(-1, epochs)\n",
|
||||
" val_runs = val.reshape(-1, epochs)\n",
|
||||
"\n",
|
||||
@@ -710,10 +713,22 @@
|
||||
" label_prefix = \"With BN\" if normalized else \"Without BN\"\n",
|
||||
"\n",
|
||||
" ax.plot(mean_train, label=label_prefix, color=color, linestyle=\"-\")\n",
|
||||
" ax.fill_between(range(epochs), mean_train - std_train, mean_train + std_train, color=color, alpha=0.2)\n",
|
||||
" ax.fill_between(\n",
|
||||
" range(epochs),\n",
|
||||
" mean_train - std_train,\n",
|
||||
" mean_train + std_train,\n",
|
||||
" color=color,\n",
|
||||
" alpha=0.2,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ax.plot(mean_val, color=color, linestyle=\"--\")\n",
|
||||
" ax.fill_between(range(epochs), mean_val - std_val, mean_val + std_val, color=color, alpha=0.2)\n",
|
||||
" ax.fill_between(\n",
|
||||
" range(epochs),\n",
|
||||
" mean_val - std_val,\n",
|
||||
" mean_val + std_val,\n",
|
||||
" color=color,\n",
|
||||
" alpha=0.2,\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
" ax.set_title(f\"Training and Validation {metric.capitalize()}\")\n",
|
||||
" ax.set_xlabel(\"Epochs\")\n",
|
||||
@@ -721,7 +736,7 @@
|
||||
" ax.legend()\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -43,7 +43,7 @@
|
||||
" 7: \"horse\",\n",
|
||||
" 8: \"ship\",\n",
|
||||
" 9: \"truck \",\n",
|
||||
"}\n"
|
||||
"}"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -299,13 +299,21 @@
|
||||
" model = keras.Sequential(\n",
|
||||
" [\n",
|
||||
" keras.layers.InputLayer(shape=(32, 32, 3)),\n",
|
||||
" keras.layers.Conv2D(filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
||||
" keras.layers.Conv2D(\n",
|
||||
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dropout(0.2),\n",
|
||||
" keras.layers.Conv2D(filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
||||
" keras.layers.Conv2D(\n",
|
||||
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||
" ),\n",
|
||||
" keras.layers.MaxPooling2D(pool_size=2),\n",
|
||||
" keras.layers.Conv2D(filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
||||
" keras.layers.Conv2D(\n",
|
||||
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||
" ),\n",
|
||||
" keras.layers.Dropout(0.2),\n",
|
||||
" keras.layers.Conv2D(filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
||||
" keras.layers.Conv2D(\n",
|
||||
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||
" ),\n",
|
||||
" keras.layers.Flatten(),\n",
|
||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||
" ]\n",
|
||||
@@ -316,7 +324,7 @@
|
||||
"\n",
|
||||
"model = get_model()\n",
|
||||
"model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n",
|
||||
"model.summary()\n"
|
||||
"model.summary()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -339,7 +347,9 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def compile_train(optimizer_function: str, learning_rate: float, **kwargs) -> keras.callbacks.History:\n",
|
||||
"def compile_train(\n",
|
||||
" optimizer_function: str, learning_rate: float, **kwargs\n",
|
||||
") -> keras.callbacks.History:\n",
|
||||
" model = get_model()\n",
|
||||
" optimizer = optimizer_function(learning_rate=learning_rate)\n",
|
||||
" model.compile(\n",
|
||||
@@ -390,7 +400,9 @@
|
||||
"source": [
|
||||
"epochs = 5\n",
|
||||
"batch_size = 64\n",
|
||||
"history_adam = compile_train(keras.optimizers.Adam, learning_rate=0.001, epochs=epochs, batch_size=batch_size)"
|
||||
"history_adam = compile_train(\n",
|
||||
" keras.optimizers.Adam, learning_rate=0.001, epochs=epochs, batch_size=batch_size\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -603,7 +615,7 @@
|
||||
"plt.xlabel(\"Epochs\")\n",
|
||||
"plt.ylabel(\"Validation Loss\")\n",
|
||||
"plt.legend()\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -121,11 +121,11 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print('33' + \"42\")\n",
|
||||
"print(\"33\" + \"42\")\n",
|
||||
"a = \"toto est toto\"\n",
|
||||
"a[4:]\n",
|
||||
"# Tout est objet\n",
|
||||
"print(a.split(' '))"
|
||||
"print(a.split(\" \"))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -143,7 +143,9 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\"XX\", \"Mardi 19 septembre\")\n",
|
||||
"salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\n",
|
||||
" \"XX\", \"Mardi 19 septembre\"\n",
|
||||
")\n",
|
||||
"print(salutation)"
|
||||
]
|
||||
},
|
||||
@@ -195,7 +197,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"a = [1,'q',2,3,5,8,'TOTO']\n",
|
||||
"a = [1, \"q\", 2, 3, 5, 8, \"TOTO\"]\n",
|
||||
"print(a[1])\n",
|
||||
"print(a[-1])\n",
|
||||
"print(a[1:3])\n",
|
||||
@@ -409,6 +411,8 @@
|
||||
"source": [
|
||||
"def square(x):\n",
|
||||
" return x * x\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(square(3))"
|
||||
]
|
||||
},
|
||||
@@ -488,12 +492,12 @@
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"\n",
|
||||
"serie = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})\n",
|
||||
"serie = pd.Series({\"a\": 1, \"b\": 2, \"c\": 3, \"d\": 4, \"e\": 5})\n",
|
||||
"\n",
|
||||
"print(serie)\n",
|
||||
"print(serie.index)\n",
|
||||
"print(serie.mean())\n",
|
||||
"print(serie['b'])\n",
|
||||
"print(serie[\"b\"])\n",
|
||||
"print(serie.b)"
|
||||
]
|
||||
},
|
||||
@@ -595,10 +599,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"df = pd.DataFrame({\n",
|
||||
" 's': s,\n",
|
||||
" 't': t\n",
|
||||
"})\n",
|
||||
"df = pd.DataFrame({\"s\": s, \"t\": t})\n",
|
||||
"\n",
|
||||
"print(df)"
|
||||
]
|
||||
@@ -634,7 +635,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"df['SUM'] = df['s'] + df['t']\n",
|
||||
"df[\"SUM\"] = df[\"s\"] + df[\"t\"]\n",
|
||||
"\n",
|
||||
"print(df)"
|
||||
]
|
||||
@@ -663,7 +664,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"print(df['SUM'].mean())"
|
||||
"print(df[\"SUM\"].mean())"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -809,7 +810,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"index = pd.date_range('2012-01-01', periods=250, freq='D')\n",
|
||||
"index = pd.date_range(\"2012-01-01\", periods=250, freq=\"D\")\n",
|
||||
"\n",
|
||||
"serie = pd.Series(X, index=index)\n",
|
||||
"print(serie)"
|
||||
@@ -923,7 +924,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path = input_path + '/base_modelisation.csv'\n",
|
||||
"path = input_path + \"/base_modelisation.csv\"\n",
|
||||
"data_set = pd.read_csv(path, sep=\";\", decimal=\",\")"
|
||||
]
|
||||
},
|
||||
@@ -2606,7 +2607,7 @@
|
||||
"binaires = []\n",
|
||||
"\n",
|
||||
"for col in data_set.columns:\n",
|
||||
" if data_set[col].dtype in ['int64', 'float64']:\n",
|
||||
" if data_set[col].dtype in [\"int64\", \"float64\"]:\n",
|
||||
" if len(data_set[col].dropna().unique()) == 2:\n",
|
||||
" binaires.append(col)\n",
|
||||
" else:\n",
|
||||
@@ -2619,7 +2620,7 @@
|
||||
"\n",
|
||||
"print(\"Variables quantitatives :\", quantitatives)\n",
|
||||
"print(\"\\nVariables catégorielles :\", categorielles)\n",
|
||||
"print(\"\\nVariables binaires :\", binaires)\n"
|
||||
"print(\"\\nVariables binaires :\", binaires)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -3524,8 +3525,10 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fig = px.histogram(data_set.sort_values('ANNEE_CTR'), x=\"ANNEE_CTR\")\n",
|
||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"fig = px.histogram(data_set.sort_values(\"ANNEE_CTR\"), x=\"ANNEE_CTR\")\n",
|
||||
"fig.update_xaxes(\n",
|
||||
" type=\"category\"\n",
|
||||
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
@@ -18655,8 +18658,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fig = px.histogram(data_set, x=\"CONTRAT_ANCIENNETE\",\n",
|
||||
" category_orders={'CONTRAT_ANCIENNETE': ['(-1,0]','(0,1]',\"(1,2]\",\"(2,5]\",\"(5,10]\"]})\n",
|
||||
"fig = px.histogram(\n",
|
||||
" data_set,\n",
|
||||
" x=\"CONTRAT_ANCIENNETE\",\n",
|
||||
" category_orders={\n",
|
||||
" \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"]\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -48890,8 +48898,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fig = px.histogram(data_set, x=\"GROUPE_KM\",\n",
|
||||
" category_orders={'GROUPE_KM': [\"[0;20000[\",\"[20000;40000[\",\"[40000;60000[\",\"[60000;99999[\"]})\n",
|
||||
"fig = px.histogram(\n",
|
||||
" data_set,\n",
|
||||
" x=\"GROUPE_KM\",\n",
|
||||
" category_orders={\n",
|
||||
" \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"]\n",
|
||||
" },\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -64006,8 +64019,10 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"fig = px.histogram(data_set.sort_values('ZONE_RISQUE'), x=\"ZONE_RISQUE\")\n",
|
||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"fig = px.histogram(data_set.sort_values(\"ZONE_RISQUE\"), x=\"ZONE_RISQUE\")\n",
|
||||
"fig.update_xaxes(\n",
|
||||
" type=\"category\"\n",
|
||||
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
@@ -64861,8 +64876,12 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"fig = px.histogram(data_set.sort_values('AGE_ASSURE_PRINCIPAL'), x=\"AGE_ASSURE_PRINCIPAL\")\n",
|
||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"fig = px.histogram(\n",
|
||||
" data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"), x=\"AGE_ASSURE_PRINCIPAL\"\n",
|
||||
")\n",
|
||||
"fig.update_xaxes(\n",
|
||||
" type=\"category\"\n",
|
||||
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
@@ -79978,8 +79997,10 @@
|
||||
],
|
||||
"source": [
|
||||
"# Ecrivez votre code ici\n",
|
||||
"fig = px.histogram(data_set.sort_values('GENRE'), x=\"GENRE\")\n",
|
||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"fig = px.histogram(data_set.sort_values(\"GENRE\"), x=\"GENRE\")\n",
|
||||
"fig.update_xaxes(\n",
|
||||
" type=\"category\"\n",
|
||||
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
]
|
||||
@@ -80011,7 +80032,12 @@
|
||||
],
|
||||
"source": [
|
||||
"# Préparation des données : compter le nombre de femmes et hommes par âge\n",
|
||||
"tmp = data_set[[\"AGE_ASSURE_PRINCIPAL\",\"GENRE\"]].value_counts().to_frame('counts').reset_index()\n",
|
||||
"tmp = (\n",
|
||||
" data_set[[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\"]]\n",
|
||||
" .value_counts()\n",
|
||||
" .to_frame(\"counts\")\n",
|
||||
" .reset_index()\n",
|
||||
")\n",
|
||||
"data_f = tmp[tmp[\"GENRE\"] == \"F\"]\n",
|
||||
"data_h = tmp[tmp[\"GENRE\"] == \"M\"]\n",
|
||||
"\n",
|
||||
@@ -80034,11 +80060,19 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Il faut ajouter l'âge 13 dans la liste 2 (data_h)\n",
|
||||
"data_h = pd.concat([data_h, pd.DataFrame([[13, \"M\",0]],columns=['AGE_ASSURE_PRINCIPAL', 'GENRE','counts'])], ignore_index=True)\n",
|
||||
"data_h = pd.concat(\n",
|
||||
" [\n",
|
||||
" data_h,\n",
|
||||
" pd.DataFrame(\n",
|
||||
" [[13, \"M\", 0]], columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"]\n",
|
||||
" ),\n",
|
||||
" ],\n",
|
||||
" ignore_index=True,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# On ordonne les dataframes\n",
|
||||
"data_h = data_h.sort_values('AGE_ASSURE_PRINCIPAL', ascending = True)\n",
|
||||
"data_f = data_f.sort_values('AGE_ASSURE_PRINCIPAL', ascending = True)"
|
||||
"data_h = data_h.sort_values(\"AGE_ASSURE_PRINCIPAL\", ascending=True)\n",
|
||||
"data_f = data_f.sort_values(\"AGE_ASSURE_PRINCIPAL\", ascending=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -80049,9 +80083,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Graphique\n",
|
||||
"y_age = data_h['AGE_ASSURE_PRINCIPAL']\n",
|
||||
"x_M = data_h['counts']\n",
|
||||
"x_F = data_f['counts'] * -1"
|
||||
"y_age = data_h[\"AGE_ASSURE_PRINCIPAL\"]\n",
|
||||
"x_M = data_h[\"counts\"]\n",
|
||||
"x_F = data_f[\"counts\"] * -1"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -80907,22 +80941,20 @@
|
||||
"fig = gp.Figure()\n",
|
||||
"\n",
|
||||
"# Ajout des données H\n",
|
||||
"fig.add_trace(gp.Bar(y= y_age, x = x_M,\n",
|
||||
" name = 'Hommes',\n",
|
||||
" orientation = 'h'))\n",
|
||||
"fig.add_trace(gp.Bar(y=y_age, x=x_M, name=\"Hommes\", orientation=\"h\"))\n",
|
||||
"\n",
|
||||
"# Ajout des données F\n",
|
||||
"fig.add_trace(gp.Bar(y = y_age, x = x_F,\n",
|
||||
" name = 'Femmes', orientation = 'h'))\n",
|
||||
"fig.add_trace(gp.Bar(y=y_age, x=x_F, name=\"Femmes\", orientation=\"h\"))\n",
|
||||
"\n",
|
||||
"# layout du graphique\n",
|
||||
"fig.update_layout(title = 'Population du portefeuille',\n",
|
||||
" title_font_size = 22, barmode = 'relative',\n",
|
||||
" bargap = 0.0, bargroupgap = 0,\n",
|
||||
" xaxis = {'title': 'Count',\n",
|
||||
" 'title_font_size': 14},\n",
|
||||
" yaxis = {'title': 'Age',\n",
|
||||
" 'title_font_size': 14}\n",
|
||||
"fig.update_layout(\n",
|
||||
" title=\"Population du portefeuille\",\n",
|
||||
" title_font_size=22,\n",
|
||||
" barmode=\"relative\",\n",
|
||||
" bargap=0.0,\n",
|
||||
" bargroupgap=0,\n",
|
||||
" xaxis={\"title\": \"Count\", \"title_font_size\": 14},\n",
|
||||
" yaxis={\"title\": \"Age\", \"title_font_size\": 14},\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"fig.show()"
|
||||
@@ -81795,7 +81827,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"fig = px.histogram(data_set[data_set['CHARGE'] >= 0], x=\"CHARGE\", nbins=50)\n",
|
||||
"fig = px.histogram(data_set[data_set[\"CHARGE\"] >= 0], x=\"CHARGE\", nbins=50)\n",
|
||||
"fig.update_layout(title=\"Distribution des coûts des sinistres\")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
@@ -82271,7 +82303,7 @@
|
||||
"data_retraitee = data_set\n",
|
||||
"\n",
|
||||
"# Option 1 : Suppression des variables avec trop de NA (PUISSANCE_VEHICULE)\n",
|
||||
"data_retraitee = data_retraitee.drop(\"PUISSANCE_VEHICULE\", axis='columns')\n",
|
||||
"data_retraitee = data_retraitee.drop(\"PUISSANCE_VEHICULE\", axis=\"columns\")\n",
|
||||
"data_retraitee.head()"
|
||||
]
|
||||
},
|
||||
@@ -82296,11 +82328,19 @@
|
||||
"# Option 2 : Remplacer par la classe la plus représentée/valeur moyenne (GROUPE_KM,GENRE,\n",
|
||||
"# ANNEE_CONSTRUCTION,VALEUR_DU_BIEN,DEUXIEME_CONDUCTEUR)\n",
|
||||
"\n",
|
||||
"data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(data_retraitee[\"GROUPE_KM\"].mode()[0])\n",
|
||||
"data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(\n",
|
||||
" data_retraitee[\"GROUPE_KM\"].mode()[0]\n",
|
||||
")\n",
|
||||
"data_retraitee[\"GENRE\"] = data_retraitee[\"GENRE\"].fillna(\"M\")\n",
|
||||
"data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(data_retraitee[\"ANNEE_CONSTRUCTION\"].median())\n",
|
||||
"data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0])\n",
|
||||
"data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(False)"
|
||||
"data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(\n",
|
||||
" data_retraitee[\"ANNEE_CONSTRUCTION\"].median()\n",
|
||||
")\n",
|
||||
"data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(\n",
|
||||
" data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0]\n",
|
||||
")\n",
|
||||
"data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(\n",
|
||||
" False\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -82313,7 +82353,9 @@
|
||||
"# Option 3 : Remplacer par une valeur prudente (ZONE_RISQUE,DEUXIEME_CONDUCTEUR)\n",
|
||||
"\n",
|
||||
"# Remplacer par la zone avec le plus de sinistres\n",
|
||||
"zone_plus_sinsitree = data_retraitee[[\"ZONE_RISQUE\", \"NB\"]].groupby([\"ZONE_RISQUE\"]).sum()\n",
|
||||
"zone_plus_sinsitree = (\n",
|
||||
" data_retraitee[[\"ZONE_RISQUE\", \"NB\"]].groupby([\"ZONE_RISQUE\"]).sum()\n",
|
||||
")\n",
|
||||
"zone_plus_sinsitree.sort_values(\"NB\", ascending=False)\n",
|
||||
"\n",
|
||||
"data_retraitee[\"ZONE_RISQUE\"] = data_retraitee[\"ZONE_RISQUE\"].fillna(\"C\")"
|
||||
@@ -83707,7 +83749,9 @@
|
||||
],
|
||||
"source": [
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.line(plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\")\n",
|
||||
"fig = px.line(\n",
|
||||
" plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\"\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -84618,7 +84662,12 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.scatter(plot_data, x=\"ZONE_RISQUE\", y=\"FREQ\", title=\"Sinistralité selon la zone géographique\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" plot_data,\n",
|
||||
" x=\"ZONE_RISQUE\",\n",
|
||||
" y=\"FREQ\",\n",
|
||||
" title=\"Sinistralité selon la zone géographique\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -85489,7 +85538,9 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.scatter(plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\"\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -86355,7 +86406,7 @@
|
||||
"source": [
|
||||
"# Agrégation selon la variable d'intérêt\n",
|
||||
"plot_data = data_retraitee[[\"VALEUR_DU_BIEN\", \"CHARGE\", \"NB\"]]\n",
|
||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
||||
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||
"plot_data = plot_data.groupby([\"VALEUR_DU_BIEN\"], as_index=False).sum()\n",
|
||||
"\n",
|
||||
"# Calcul du CM\n",
|
||||
@@ -86364,7 +86415,9 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.scatter(plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\"\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -87229,7 +87282,7 @@
|
||||
"source": [
|
||||
"# Agrégation selon la variable d'intérêt\n",
|
||||
"plot_data = data_retraitee[[\"ANNEE_CONSTRUCTION\", \"CHARGE\", \"NB\"]]\n",
|
||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
||||
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||
"plot_data = plot_data.groupby([\"ANNEE_CONSTRUCTION\"], as_index=False).sum()\n",
|
||||
"\n",
|
||||
"# Calcul du CM\n",
|
||||
@@ -87238,7 +87291,12 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.line(plot_data, x=\"ANNEE_CONSTRUCTION\", y=\"CM\", title=\"Coût moyen selon l'ancienneté du bien\")\n",
|
||||
"fig = px.line(\n",
|
||||
" plot_data,\n",
|
||||
" x=\"ANNEE_CONSTRUCTION\",\n",
|
||||
" y=\"CM\",\n",
|
||||
" title=\"Coût moyen selon l'ancienneté du bien\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -88100,7 +88158,7 @@
|
||||
"source": [
|
||||
"# Agrégation selon la variable d'intérêt\n",
|
||||
"plot_data = data_retraitee[[\"AGE_ASSURE_PRINCIPAL\", \"CHARGE\", \"NB\"]]\n",
|
||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
||||
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||
"plot_data = plot_data.groupby([\"AGE_ASSURE_PRINCIPAL\"], as_index=False).sum()\n",
|
||||
"\n",
|
||||
"# Calcul du CM\n",
|
||||
@@ -88109,7 +88167,12 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.scatter(plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" plot_data,\n",
|
||||
" x=\"AGE_ASSURE_PRINCIPAL\",\n",
|
||||
" y=\"CM\",\n",
|
||||
" title=\"Coût moyen selon l'âge de l'assuré\",\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
@@ -88971,7 +89034,7 @@
|
||||
"source": [
|
||||
"# Agrégation selon la variable d'intérêt\n",
|
||||
"plot_data = data_retraitee[[\"GENRE\", \"CHARGE\", \"NB\"]]\n",
|
||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
||||
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||
"plot_data = plot_data.groupby([\"GENRE\"], as_index=False).sum()\n",
|
||||
"\n",
|
||||
"# Calcul du CM\n",
|
||||
@@ -88980,7 +89043,9 @@
|
||||
"print(plot_data)\n",
|
||||
"\n",
|
||||
"# Représentation graphique\n",
|
||||
"fig = px.scatter(plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\")\n",
|
||||
"fig = px.scatter(\n",
|
||||
" plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\"\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
|
||||
@@ -65,7 +65,7 @@
|
||||
"from scipy.stats import chi2_contingency # noqa: E402, F401\n",
|
||||
"\n",
|
||||
"# Machine Learning\n",
|
||||
"from sklearn.cluster import AgglomerativeClustering, KMeans # noqa: E402\n"
|
||||
"from sklearn.cluster import AgglomerativeClustering, KMeans # noqa: E402"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -149,9 +149,7 @@
|
||||
"init_points = np.array([1, 2, 18])\n",
|
||||
"\n",
|
||||
"# Itinitialisation algo\n",
|
||||
"kmeans = KMeans(init=init_points.reshape(-1,1),\n",
|
||||
" n_clusters=3,\n",
|
||||
" n_init = 1)"
|
||||
"kmeans = KMeans(init=init_points.reshape(-1, 1), n_clusters=3, n_init=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -650,7 +648,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Représentation Graphique\n",
|
||||
"plt.scatter(x, y, c=labels, cmap='viridis')\n",
|
||||
"plt.scatter(x, y, c=labels, cmap=\"viridis\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -694,9 +692,7 @@
|
||||
"init_points = np.array([18, 20, 31])\n",
|
||||
"\n",
|
||||
"# Itinitialisation algo\n",
|
||||
"kmeans = KMeans(init=init_points.reshape(-1,1),\n",
|
||||
" n_clusters=3,\n",
|
||||
" n_init = 1)\n",
|
||||
"kmeans = KMeans(init=init_points.reshape(-1, 1), n_clusters=3, n_init=1)\n",
|
||||
"\n",
|
||||
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
|
||||
"data_x = np.array(x)\n",
|
||||
@@ -731,7 +727,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Représentation Graphique\n",
|
||||
"plt.scatter(x, y, c=labels, cmap='viridis')\n",
|
||||
"plt.scatter(x, y, c=labels, cmap=\"viridis\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -775,9 +771,7 @@
|
||||
"init_points = np.array([1, np.mean([2, 18]), np.mean([20, 31])])\n",
|
||||
"\n",
|
||||
"# Itinitialisation algo\n",
|
||||
"kmeans = KMeans(init=init_points.reshape(-1,1),\n",
|
||||
" n_clusters=3,\n",
|
||||
" n_init=1)\n",
|
||||
"kmeans = KMeans(init=init_points.reshape(-1, 1), n_clusters=3, n_init=1)\n",
|
||||
"\n",
|
||||
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
|
||||
"data_x = np.array(x)\n",
|
||||
@@ -812,7 +806,7 @@
|
||||
],
|
||||
"source": [
|
||||
"# Représentation Graphique\n",
|
||||
"plt.scatter(x, y, c=labels, cmap='viridis')\n",
|
||||
"plt.scatter(x, y, c=labels, cmap=\"viridis\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -872,7 +866,7 @@
|
||||
"# Calcul de CAH avec lien simple\n",
|
||||
"data = list(zip(x, strict=False))\n",
|
||||
"\n",
|
||||
"linkage_data = linkage(data, method='single', metric='euclidean')\n",
|
||||
"linkage_data = linkage(data, method=\"single\", metric=\"euclidean\")\n",
|
||||
"dendrogram(linkage_data, labels=x)\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
@@ -904,13 +898,15 @@
|
||||
],
|
||||
"source": [
|
||||
"# Calcul de la partition de l'espace\n",
|
||||
"hierarchical_cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='single')\n",
|
||||
"hierarchical_cluster = AgglomerativeClustering(\n",
|
||||
" n_clusters=3, metric=\"euclidean\", linkage=\"single\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"labels = hierarchical_cluster.fit_predict(data)\n",
|
||||
"print(labels)\n",
|
||||
"\n",
|
||||
"# Représentation Graphique\n",
|
||||
"plt.scatter(x, y, c=labels, cmap='viridis')\n",
|
||||
"plt.scatter(x, y, c=labels, cmap=\"viridis\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -943,7 +939,7 @@
|
||||
"# Calcul de CAH avec lien complet\n",
|
||||
"data = list(zip(x, strict=False))\n",
|
||||
"\n",
|
||||
"linkage_data = linkage(data, method='complete', metric='euclidean')\n",
|
||||
"linkage_data = linkage(data, method=\"complete\", metric=\"euclidean\")\n",
|
||||
"dendrogram(linkage_data, labels=x)\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
@@ -975,13 +971,15 @@
|
||||
],
|
||||
"source": [
|
||||
"# Calcul de la partition de l'espace\n",
|
||||
"hierarchical_cluster = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='complete')\n",
|
||||
"hierarchical_cluster = AgglomerativeClustering(\n",
|
||||
" n_clusters=3, metric=\"euclidean\", linkage=\"complete\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"labels = hierarchical_cluster.fit_predict(data)\n",
|
||||
"print(labels)\n",
|
||||
"\n",
|
||||
"# Représentation Graphique\n",
|
||||
"plt.scatter(x, y, c=labels, cmap='viridis')\n",
|
||||
"plt.scatter(x, y, c=labels, cmap=\"viridis\")\n",
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
@@ -1479,15 +1477,13 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"path = input_path + '/base_retraitee.csv'\n",
|
||||
"path = input_path + \"/base_retraitee.csv\"\n",
|
||||
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")\n",
|
||||
"\n",
|
||||
"# Group by ZONE_RISQUE and aggregate the necessary columns\n",
|
||||
"data = data_retraitee.groupby([\"ZONE_RISQUE\"], as_index=False).agg({\n",
|
||||
" 'NB': 'sum',\n",
|
||||
" 'CHARGE': 'sum',\n",
|
||||
" 'EXPO': 'sum'\n",
|
||||
"})\n",
|
||||
"data = data_retraitee.groupby([\"ZONE_RISQUE\"], as_index=False).agg(\n",
|
||||
" {\"NB\": \"sum\", \"CHARGE\": \"sum\", \"EXPO\": \"sum\"}\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Calculate derived metrics\n",
|
||||
"data[\"FREQ\"] = data[\"NB\"] / data[\"EXPO\"]\n",
|
||||
@@ -1550,7 +1546,9 @@
|
||||
],
|
||||
"source": [
|
||||
"# Initialisation de l'algorithme\n",
|
||||
"kmeans_FREQ = KMeans(init='random', n_clusters=5, n_init=1, random_state=42, max_iter=300)\n",
|
||||
"kmeans_FREQ = KMeans(\n",
|
||||
" init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
|
||||
"data_freq = data[\"FREQ\"].to_numpy()\n",
|
||||
@@ -2555,10 +2553,10 @@
|
||||
],
|
||||
"source": [
|
||||
"# Initialisation de l'algorithme\n",
|
||||
"kmeans_CM = KMeans(init='random', n_clusters=5, n_init=1, random_state=42)\n",
|
||||
"kmeans_CM = KMeans(init=\"random\", n_clusters=5, n_init=1, random_state=42)\n",
|
||||
"\n",
|
||||
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
|
||||
"data_cm = data['CM'].to_numpy()\n",
|
||||
"data_cm = data[\"CM\"].to_numpy()\n",
|
||||
"data_cm = data_cm.reshape(-1, 1)\n",
|
||||
"\n",
|
||||
"# Fitting\n",
|
||||
@@ -3560,10 +3558,12 @@
|
||||
],
|
||||
"source": [
|
||||
"# Initialisation de l'algorithme\n",
|
||||
"kmeans_FREQ_CM = KMeans(init='random', n_clusters=5, n_init=1, random_state=42, max_iter=300)\n",
|
||||
"kmeans_FREQ_CM = KMeans(\n",
|
||||
" init=\"random\", n_clusters=5, n_init=1, random_state=42, max_iter=300\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Transformation des données : plusieurs échantillons de 1 dimension\n",
|
||||
"data_x = data['FREQxCHARGE'].to_numpy()\n",
|
||||
"data_x = data[\"FREQxCHARGE\"].to_numpy()\n",
|
||||
"data_x = data_x.reshape(-1, 1)\n",
|
||||
"\n",
|
||||
"# Fitting\n",
|
||||
@@ -4578,11 +4578,11 @@
|
||||
],
|
||||
"source": [
|
||||
"# Calcul de CAH avec lien simple\n",
|
||||
"data_x = data['FREQ'].to_numpy()\n",
|
||||
"data_x = data[\"FREQ\"].to_numpy()\n",
|
||||
"data_x = data_x.reshape(-1, 1)\n",
|
||||
"\n",
|
||||
"linkage_data = linkage(data_x, method='single', metric='euclidean')\n",
|
||||
"dendrogram(linkage_data, labels=np.array(data['ZONE_RISQUE']))\n",
|
||||
"linkage_data = linkage(data_x, method=\"single\", metric=\"euclidean\")\n",
|
||||
"dendrogram(linkage_data, labels=np.array(data[\"ZONE_RISQUE\"]))\n",
|
||||
"\n",
|
||||
"plt.show()"
|
||||
]
|
||||
@@ -4625,7 +4625,7 @@
|
||||
")\n",
|
||||
"\n",
|
||||
"labels = hierarchical_cluster.fit_predict(data_x)\n",
|
||||
"print(pd.DataFrame({\"ZONE_RISQUE\": data['ZONE_RISQUE'], \"Cluster\": labels}))\n"
|
||||
"print(pd.DataFrame({\"ZONE_RISQUE\": data[\"ZONE_RISQUE\"], \"Cluster\": labels}))"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -5613,7 +5613,7 @@
|
||||
"linkage_data = linkage(data_x, method=\"single\", metric=\"euclidean\")\n",
|
||||
"dendrogram(linkage_data, labels=np.array(data[\"ZONE_RISQUE\"]))\n",
|
||||
"\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -6654,7 +6654,7 @@
|
||||
" \"EXPO\",\n",
|
||||
" \"CHARGE\",\n",
|
||||
" ]\n",
|
||||
"]\n"
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -6707,7 +6707,7 @@
|
||||
" \"ANCIENNETE_PERMIS\",\n",
|
||||
" \"ANNEE_CONSTRUCTION\",\n",
|
||||
" ],\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -6736,7 +6736,7 @@
|
||||
"data_source_MP[\"Cluster\"] = kmeans.labels_\n",
|
||||
"data_sinistralite_MP = data_source_MP[[\"Cluster\", \"NB\", \"EXPO\", \"CHARGE\"]]\n",
|
||||
"\n",
|
||||
"data_sinistralite_MP = data_sinistralite_MP.groupby([\"Cluster\"], as_index=False).sum()\n"
|
||||
"data_sinistralite_MP = data_sinistralite_MP.groupby([\"Cluster\"], as_index=False).sum()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -6973,7 +6973,7 @@
|
||||
"base_MP = base_MP.merge(data_sinistralite_MP, left_index=True, right_on=\"Cluster\")\n",
|
||||
"\n",
|
||||
"base_MP = base_MP.iloc[:, [4, 0, 1, 2, 3, 5, 6, 7]]\n",
|
||||
"base_MP.head()\n"
|
||||
"base_MP.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -69,7 +69,7 @@
|
||||
"# Machine Learning\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor\n",
|
||||
"from sklearn.model_selection import KFold, cross_val_score, train_test_split\n",
|
||||
"from sklearn.tree import DecisionTreeRegressor\n"
|
||||
"from sklearn.tree import DecisionTreeRegressor"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -98,7 +98,7 @@
|
||||
" mini = (\n",
|
||||
" min(crosstab.shape) - 1\n",
|
||||
" ) # Take the minimum value between the columns and the rows of the cross table\n",
|
||||
" return stat / (obs * mini)\n"
|
||||
" return stat / (obs * mini)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -135,7 +135,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path =input_path + '/base_retraitee.csv'\n",
|
||||
"path = input_path + \"/base_retraitee.csv\"\n",
|
||||
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")"
|
||||
]
|
||||
},
|
||||
@@ -1111,7 +1111,7 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data_model.describe(include='all')"
|
||||
"data_model.describe(include=\"all\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2037,7 +2037,7 @@
|
||||
" if len(data_set[colu].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n",
|
||||
" else:\n",
|
||||
" variables_categorielles.append(data_set[colu])\n"
|
||||
" variables_categorielles.append(data_set[colu])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2469,7 +2469,7 @@
|
||||
" + v_cramer_resultats.columns[j]\n",
|
||||
" + \" sont trop dépendantes, V-CRAMER = \"\n",
|
||||
" + str(v_cramer_resultats.iloc[i, j])\n",
|
||||
" )\n"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2487,7 +2487,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"vars_numeriques = pd.DataFrame(variables_numeriques).transpose()\n"
|
||||
"vars_numeriques = pd.DataFrame(variables_numeriques).transpose()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -2663,7 +2663,7 @@
|
||||
" + correlations_num.columns[j]\n",
|
||||
" + \" sont trop dépendantes, corr = \"\n",
|
||||
" + str(correlations_num.iloc[i, j])\n",
|
||||
" )\n"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -3498,7 +3498,7 @@
|
||||
"vars_numeriques_scaled = pd.DataFrame(\n",
|
||||
" vars_numeriques_scaled, columns=vars_numeriques.columns\n",
|
||||
")\n",
|
||||
"vars_numeriques_scaled.head()\n"
|
||||
"vars_numeriques_scaled.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -3638,7 +3638,7 @@
|
||||
"\n",
|
||||
"print(f\"MAE: {mae:.2f}\")\n",
|
||||
"print(f\"MSE: {mse:.2f}\")\n",
|
||||
"print(f\"RMSE: {rmse:.2f}\")\n"
|
||||
"print(f\"RMSE: {rmse:.2f}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -3779,7 +3779,7 @@
|
||||
" # Concaténation des résultats\n",
|
||||
" MAE_scores.append(MAE)\n",
|
||||
" MSE_scores.append(MSE)\n",
|
||||
" RMSE_scores.append(RMSE)\n"
|
||||
" RMSE_scores.append(RMSE)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -4005,7 +4005,7 @@
|
||||
" RMSE_best_score = RMSE_scores\n",
|
||||
"\n",
|
||||
" # Sauvegarde du modèle pour l'utiliser directement\n",
|
||||
" best_model_regressor = rf_regressor\n"
|
||||
" best_model_regressor = rf_regressor"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -4052,7 +4052,7 @@
|
||||
"\n",
|
||||
"# RMSE\n",
|
||||
"for fold, rmse in enumerate(RMSE_best_score, start=1):\n",
|
||||
" print(f\"Fold {fold} RMSE:\", rmse)\n"
|
||||
" print(f\"Fold {fold} RMSE:\", rmse)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -4139,7 +4139,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Sampling en 80% train et 20% test\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)"
|
||||
"X_train, X_test, y_train, y_test = train_test_split(\n",
|
||||
" X, Y, test_size=0.2, random_state=42\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -73,7 +73,7 @@
|
||||
" StratifiedKFold,\n",
|
||||
" cross_val_score,\n",
|
||||
" train_test_split,\n",
|
||||
")\n"
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -92,11 +92,17 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def cramers_V(var1, var2):\n",
|
||||
" crosstab = np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building\n",
|
||||
" stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test\n",
|
||||
" crosstab = np.array(\n",
|
||||
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
|
||||
" ) # Cross table building\n",
|
||||
" stat = chi2_contingency(crosstab)[\n",
|
||||
" 0\n",
|
||||
" ] # Keeping of the test statistic of the Chi2 test\n",
|
||||
" obs = np.sum(crosstab) # Number of observations\n",
|
||||
" mini = min(crosstab.shape)-1 # Take the minimum value between the colmns and the rows of the cross table\n",
|
||||
" return (stat/(obs*mini))"
|
||||
" mini = (\n",
|
||||
" min(crosstab.shape) - 1\n",
|
||||
" ) # Take the minimum value between the colmns and the rows of the cross table\n",
|
||||
" return stat / (obs * mini)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -133,7 +139,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"path = input_path + '/base_retraitee.csv'\n",
|
||||
"path = input_path + \"/base_retraitee.csv\"\n",
|
||||
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")"
|
||||
]
|
||||
},
|
||||
@@ -16225,7 +16231,7 @@
|
||||
" if len(data_model[col].unique()) == 2:\n",
|
||||
" variables_categorielles.append(data_model[col])\n",
|
||||
" else:\n",
|
||||
" variables_categorielles.append(data_model[col])\n"
|
||||
" variables_categorielles.append(data_model[col])"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -16653,7 +16659,7 @@
|
||||
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
|
||||
" print(\n",
|
||||
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
|
||||
" )\n"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -16851,7 +16857,7 @@
|
||||
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
|
||||
" print(\n",
|
||||
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
|
||||
" )\n"
|
||||
" )"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -17820,7 +17826,7 @@
|
||||
" cv=StratifiedKFold(\n",
|
||||
" n_splits=num_folds, shuffle=True, random_state=42\n",
|
||||
" ), # Validation croisée avec 5 folds\n",
|
||||
" scoring='recall', # Métrique d'évaluation (moins c'est mieux)\n",
|
||||
" scoring=\"recall\", # Métrique d'évaluation (moins c'est mieux)\n",
|
||||
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
|
||||
")\n",
|
||||
"\n",
|
||||
@@ -17877,14 +17883,18 @@
|
||||
],
|
||||
"source": [
|
||||
"# Recall de chaque fold\n",
|
||||
"recall_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='recall')\n",
|
||||
"recall_scores = cross_val_score(\n",
|
||||
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Afficher les scores pour chaque fold\n",
|
||||
"for i, score in enumerate(recall_scores):\n",
|
||||
" print(f\"Recall pour le fold {i + 1}: {score}\")\n",
|
||||
"\n",
|
||||
"# Accuracy de chaque fold\n",
|
||||
"accuracy_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='accuracy')\n",
|
||||
"accuracy_scores = cross_val_score(\n",
|
||||
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"# Afficher les scores pour chaque fold\n",
|
||||
"print(\"\\n\")\n",
|
||||
@@ -17892,12 +17902,14 @@
" print(f\"Accuracy pour le fold {i + 1}: {score}\")\n",
"\n",
"# Precision de chaque fold\n",
"precision_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='precision')\n",
"precision_scores = cross_val_score(\n",
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
")\n",
"\n",
"# Afficher les scores pour chaque fold\n",
"print(\"\\n\")\n",
"for i, score in enumerate(precision_scores):\n",
" print(f\"Precision pour le fold {i + 1}: {score}\")\n"
" print(f\"Precision pour le fold {i + 1}: {score}\")"
]
},
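
The three separate cross_val_score calls in the cell above each re-run the same k-fold fit just to get a different metric; an equivalent single pass can be sketched with cross_validate. This sketch assumes the best_gbc, X_train, y_train and num_folds objects defined earlier in the notebook:

    from sklearn.model_selection import cross_validate

    # One cross-validation run, scored on all three metrics at once
    cv_results = cross_validate(
        best_gbc,
        X_train,
        y_train,
        cv=num_folds,
        scoring=["recall", "accuracy", "precision"],
        n_jobs=-1,
    )
    for metric in ["recall", "accuracy", "precision"]:
        for i, score in enumerate(cv_results[f"test_{metric}"]):
            print(f"{metric} pour le fold {i + 1}: {score}")
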
{
@@ -30178,7 +30190,7 @@
"# Observation de la distribution sur Y_train\n",
"df = pd.DataFrame(y_train, columns=[\"SINISTRE\"])\n",
"fig = px.histogram(df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train\")\n",
"fig.show()\n"
"fig.show()"
]
},
{
@@ -52502,7 +52514,7 @@
"fig = px.histogram(\n",
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
")\n",
"fig.show()\n"
"fig.show()"
]
},
{
@@ -52530,7 +52542,7 @@
"num_folds = 5\n",
"\n",
"# Initialisation du modèle GradientBoostingClassifier\n",
"gb = GradientBoostingClassifier(random_state=42)\n"
"gb = GradientBoostingClassifier(random_state=42)"
]
},
{
@@ -52567,7 +52579,7 @@
"print(\"Meilleurs hyperparamètres : \", best_params)\n",
"\n",
"# Initialiser un modèle avec les meilleurs hyperparamètres\n",
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)\n"
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)"
]
},
{
@@ -128,7 +128,7 @@
"\n",
"# Configuration des graphiques\n",
"plt.style.use(\"seaborn-v0_8-darkgrid\")\n",
"sns.set_palette(\"husl\")\n"
"sns.set_palette(\"husl\")"
]
},
{
@@ -162,20 +162,38 @@
"## Chargement du dataset Adult Income (Census)\n",
"\n",
"print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
"print(\"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\")\n",
"print(\"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\")\n",
"print(\n",
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\"\n",
")\n",
"print(\n",
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\"\n",
")\n",
"\n",
"# Chargement depuis UCI\n",
"url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
"\n",
"column_names = [\n",
" 'age', 'workclass', 'fnlwgt', 'education', 'education_num',\n",
" 'marital_status', 'occupation', 'relationship', 'race', 'sex',\n",
" 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'\n",
" \"age\",\n",
" \"workclass\",\n",
" \"fnlwgt\",\n",
" \"education\",\n",
" \"education_num\",\n",
" \"marital_status\",\n",
" \"occupation\",\n",
" \"relationship\",\n",
" \"race\",\n",
" \"sex\",\n",
" \"capital_gain\",\n",
" \"capital_loss\",\n",
" \"hours_per_week\",\n",
" \"native_country\",\n",
" \"income\",\n",
"]\n",
"\n",
"try:\n",
" df = pd.read_csv(url, names=column_names, sep=r',\\s*', engine='python', na_values='?')\n",
" df = pd.read_csv(\n",
" url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
" )\n",
" print(\"Dataset chargé depuis UCI repository\")\n",
"except: # noqa: E722\n",
" print(\"Impossible de charger depuis UCI, création d'un dataset simulé similaire...\")\n",
@@ -183,44 +201,121 @@
" np.random.seed(42)\n",
" n_samples = 32561\n",
"\n",
" df = pd.DataFrame({\n",
" 'age': np.random.randint(17, 90, n_samples),\n",
" 'workclass': np.random.choice(['Private', 'Self-emp-not-inc', 'Local-gov', 'State-gov', 'Self-emp-inc',\n",
" 'Federal-gov', 'Without-pay'], n_samples, p=[0.73, 0.08, 0.06, 0.04, 0.03, 0.03, 0.03]),\n",
" 'fnlwgt': np.random.randint(12285, 1484705, n_samples),\n",
" 'education': np.random.choice(\n",
" ['HS-grad', 'Some-college', 'Bachelors', 'Masters', 'Assoc-voc',\n",
" 'Doctorate', '11th', '9th', '7th-8th'], n_samples,\n",
" p=[0.32, 0.22, 0.16, 0.05, 0.04, 0.01, 0.04, 0.03, 0.13]),\n",
" 'education_num': np.random.randint(1, 16, n_samples),\n",
" 'marital_status': np.random.choice(['Married-civ-spouse', 'Never-married', 'Divorced', 'Separated',\n",
" 'Widowed'], n_samples, p=[0.46, 0.33, 0.14, 0.03, 0.04]),\n",
" 'occupation': np.random.choice(['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical',\n",
" 'Sales', 'Other-service', 'Machine-op-inspct', 'Tech-support'], n_samples,\n",
" p=[0.13, 0.13, 0.13, 0.12, 0.11, 0.10, 0.06, 0.22]),\n",
" 'relationship': np.random.choice(['Husband', 'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Other-relative'],\n",
" n_samples, p=[0.40, 0.26, 0.16, 0.10, 0.05, 0.03]),\n",
" 'race': np.random.choice(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'],\n",
" n_samples, p=[0.85, 0.10, 0.03, 0.01, 0.01]),\n",
" 'sex': np.random.choice(['Male', 'Female'], n_samples, p=[0.67, 0.33]),\n",
" 'capital_gain': np.where(np.random.random(n_samples) < 0.92, 0, np.random.randint(1, 99999, n_samples)),\n",
" 'capital_loss': np.where(np.random.random(n_samples) < 0.95, 0, np.random.randint(1, 4356, n_samples)),\n",
" 'hours_per_week': np.random.randint(1, 99, n_samples),\n",
" 'native_country': np.random.choice(['United-States', 'Mexico', 'Philippines', 'Germany', 'Canada',\n",
" 'India', 'Other'], n_samples, p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04])\n",
" })\n",
" df = pd.DataFrame(\n",
" {\n",
" \"age\": np.random.randint(17, 90, n_samples),\n",
" \"workclass\": np.random.choice(\n",
" [\n",
" \"Private\",\n",
" \"Self-emp-not-inc\",\n",
" \"Local-gov\",\n",
" \"State-gov\",\n",
" \"Self-emp-inc\",\n",
" \"Federal-gov\",\n",
" \"Without-pay\",\n",
" ],\n",
" n_samples,\n",
" p=[0.73, 0.08, 0.06, 0.04, 0.03, 0.03, 0.03],\n",
" ),\n",
" \"fnlwgt\": np.random.randint(12285, 1484705, n_samples),\n",
" \"education\": np.random.choice(\n",
" [\n",
" \"HS-grad\",\n",
" \"Some-college\",\n",
" \"Bachelors\",\n",
" \"Masters\",\n",
" \"Assoc-voc\",\n",
" \"Doctorate\",\n",
" \"11th\",\n",
" \"9th\",\n",
" \"7th-8th\",\n",
" ],\n",
" n_samples,\n",
" p=[0.32, 0.22, 0.16, 0.05, 0.04, 0.01, 0.04, 0.03, 0.13],\n",
" ),\n",
" \"education_num\": np.random.randint(1, 16, n_samples),\n",
" \"marital_status\": np.random.choice(\n",
" [\n",
" \"Married-civ-spouse\",\n",
" \"Never-married\",\n",
" \"Divorced\",\n",
" \"Separated\",\n",
" \"Widowed\",\n",
" ],\n",
" n_samples,\n",
" p=[0.46, 0.33, 0.14, 0.03, 0.04],\n",
" ),\n",
" \"occupation\": np.random.choice(\n",
" [\n",
" \"Prof-specialty\",\n",
" \"Craft-repair\",\n",
" \"Exec-managerial\",\n",
" \"Adm-clerical\",\n",
" \"Sales\",\n",
" \"Other-service\",\n",
" \"Machine-op-inspct\",\n",
" \"Tech-support\",\n",
" ],\n",
" n_samples,\n",
" p=[0.13, 0.13, 0.13, 0.12, 0.11, 0.10, 0.06, 0.22],\n",
" ),\n",
" \"relationship\": np.random.choice(\n",
" [\n",
" \"Husband\",\n",
" \"Not-in-family\",\n",
" \"Own-child\",\n",
" \"Unmarried\",\n",
" \"Wife\",\n",
" \"Other-relative\",\n",
" ],\n",
" n_samples,\n",
" p=[0.40, 0.26, 0.16, 0.10, 0.05, 0.03],\n",
" ),\n",
" \"race\": np.random.choice(\n",
" [\"White\", \"Black\", \"Asian-Pac-Islander\", \"Amer-Indian-Eskimo\", \"Other\"],\n",
" n_samples,\n",
" p=[0.85, 0.10, 0.03, 0.01, 0.01],\n",
" ),\n",
" \"sex\": np.random.choice([\"Male\", \"Female\"], n_samples, p=[0.67, 0.33]),\n",
" \"capital_gain\": np.where(\n",
" np.random.random(n_samples) < 0.92,\n",
" 0,\n",
" np.random.randint(1, 99999, n_samples),\n",
" ),\n",
" \"capital_loss\": np.where(\n",
" np.random.random(n_samples) < 0.95,\n",
" 0,\n",
" np.random.randint(1, 4356, n_samples),\n",
" ),\n",
" \"hours_per_week\": np.random.randint(1, 99, n_samples),\n",
" \"native_country\": np.random.choice(\n",
" [\n",
" \"United-States\",\n",
" \"Mexico\",\n",
" \"Philippines\",\n",
" \"Germany\",\n",
" \"Canada\",\n",
" \"India\",\n",
" \"Other\",\n",
" ],\n",
" n_samples,\n",
" p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
" ),\n",
" }\n",
" )\n",
"\n",
|
||||
" # Création de la cible avec logique réaliste\n",
|
||||
" income_score = (\n",
|
||||
" (df['age'] > 35).astype(int) * 20 +\n",
|
||||
" (df['education_num'] > 12).astype(int) * 30 +\n",
|
||||
" (df['hours_per_week'] > 40).astype(int) * 15 +\n",
|
||||
" (df['capital_gain'] > 0).astype(int) * 25 +\n",
|
||||
" (df['marital_status'] == 'Married-civ-spouse').astype(int) * 20 +\n",
|
||||
" (df['occupation'].isin(['Exec-managerial', 'Prof-specialty'])).astype(int) * 15 +\n",
|
||||
" np.random.normal(0, 15, n_samples)\n",
|
||||
" (df[\"age\"] > 35).astype(int) * 20\n",
|
||||
" + (df[\"education_num\"] > 12).astype(int) * 30\n",
|
||||
" + (df[\"hours_per_week\"] > 40).astype(int) * 15\n",
|
||||
" + (df[\"capital_gain\"] > 0).astype(int) * 25\n",
|
||||
" + (df[\"marital_status\"] == \"Married-civ-spouse\").astype(int) * 20\n",
|
||||
" + (df[\"occupation\"].isin([\"Exec-managerial\", \"Prof-specialty\"])).astype(int)\n",
|
||||
" * 15\n",
|
||||
" + np.random.normal(0, 15, n_samples)\n",
|
||||
" )\n",
|
||||
" df['income'] = (income_score > 60).map({True: '>50K', False: '<=50K'})"
|
||||
" df[\"income\"] = (income_score > 60).map({True: \">50K\", False: \"<=50K\"})"
|
||||
]
|
||||
},
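
If the UCI URL is unreachable, another option besides simulating the data would be to pull the same Adult/Census Income table from OpenML; a hedged sketch (sklearn's fetch_openml with dataset name "adult" is an assumption, not something the notebook uses):

    from sklearn.datasets import fetch_openml

    # Adult / Census Income dataset as a pandas DataFrame
    adult = fetch_openml("adult", version=2, as_frame=True)
    df_openml = adult.frame
    print(df_openml.shape)
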
{
@@ -358,7 +453,7 @@
"outputs": [],
"source": [
"# Encodage de la cible en 0/1\n",
"df['income'] = (df['income'] == '>50K').astype(int)"
"df[\"income\"] = (df[\"income\"] == \">50K\").astype(int)"
]
},
{
@@ -468,7 +563,7 @@
" \"race\",\n",
" \"sex\",\n",
" \"native_country\",\n",
"]\n"
"]"
]
},
{
@@ -497,7 +592,7 @@
"print(\"\\n Cardinalité des variables catégorielles :\")\n",
"for col in cat_features:\n",
" print(f\" {col}: {df[col].nunique()} catégories uniques\")\n"
" print(f\" {col}: {df[col].nunique()} catégories uniques\")"
]
},
{
{
|
||||
@@ -526,7 +621,7 @@
|
||||
"# Corrélation avec la cible\n",
|
||||
"print(\"\\n Corrélations avec le revenu >50K :\")\n",
|
||||
"correlations = df[numeric_features].corrwith(df[\"income\"]).sort_values(ascending=False)\n",
|
||||
"print(correlations)\n"
|
||||
"print(correlations)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -583,7 +678,7 @@
|
||||
"axes[1, 2].set_xlabel(\"Revenu >50K\")\n",
|
||||
"\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -647,7 +742,7 @@
|
||||
"\n",
|
||||
"print(\"\\n=== Préparation pour CatBoost ===\\n\")\n",
|
||||
"print(f\"Variables catégorielles : {cat_features}\")\n",
|
||||
"print(f\"Variables numériques : {numeric_features}\")\n"
|
||||
"print(f\"Variables numériques : {numeric_features}\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -861,11 +956,15 @@
|
||||
"# Courbe ROC\n",
|
||||
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_baseline)\n",
|
||||
"plt.figure(figsize=(8, 6))\n",
|
||||
"plt.plot(fpr, tpr, label=f'CatBoost (AUC = {roc_auc_score(y_test, y_pred_proba_baseline):.3f})')\n",
|
||||
"plt.plot([0, 1], [0, 1], 'k--', label='Hasard')\n",
|
||||
"plt.xlabel('Taux de faux positifs')\n",
|
||||
"plt.ylabel('Taux de vrais positifs')\n",
|
||||
"plt.title('Courbe ROC')\n",
|
||||
"plt.plot(\n",
|
||||
" fpr,\n",
|
||||
" tpr,\n",
|
||||
" label=f\"CatBoost (AUC = {roc_auc_score(y_test, y_pred_proba_baseline):.3f})\",\n",
|
||||
")\n",
|
||||
"plt.plot([0, 1], [0, 1], \"k--\", label=\"Hasard\")\n",
|
||||
"plt.xlabel(\"Taux de faux positifs\")\n",
|
||||
"plt.ylabel(\"Taux de vrais positifs\")\n",
|
||||
"plt.title(\"Courbe ROC\")\n",
|
||||
"plt.legend()\n",
|
||||
"plt.grid(True)\n",
|
||||
"plt.show()"
|
||||
@@ -1133,7 +1232,7 @@
|
||||
"df[\"montant_credit\"] = np.random.uniform(3000, 450000)\n",
|
||||
"df[\"defaut\"] = df[\"capital_gain\"] < df[\"capital_loss\"]\n",
|
||||
"\n",
|
||||
"df = df.drop([\"capital_gain\", \"capital_loss\"], axis=1)\n"
|
||||
"df = df.drop([\"capital_gain\", \"capital_loss\"], axis=1)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1211,7 +1310,7 @@
|
||||
"perte_totale_predite = y_pred_reg.sum()\n",
|
||||
"print(f\"\\\\nPerte totale réelle : {perte_totale_reelle:,.2f}€\")\n",
|
||||
"print(f\"Perte totale prédite : {perte_totale_predite:,.2f}€\")\n",
|
||||
"print(f\"Erreur : {abs(perte_totale_reelle - perte_totale_predite):,.2f}€\")\n"
|
||||
"print(f\"Erreur : {abs(perte_totale_reelle - perte_totale_predite):,.2f}€\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1276,7 +1375,7 @@
|
||||
"plt.title(\"Top 15 variables les plus importantes\")\n",
|
||||
"plt.gca().invert_yaxis()\n",
|
||||
"plt.tight_layout()\n",
|
||||
"plt.show()\n"
|
||||
"plt.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -1320,8 +1419,8 @@
|
||||
" depth=6,\n",
|
||||
" random_seed=42,\n",
|
||||
" verbose=0,\n",
|
||||
" auto_class_weights='Balanced',\n",
|
||||
" eval_metric='AUC'\n",
|
||||
" auto_class_weights=\"Balanced\",\n",
|
||||
" eval_metric=\"AUC\",\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"model_weighted.fit(train_pool, eval_set=test_pool)"
|
||||
@@ -1405,7 +1504,7 @@
|
||||
" random_seed=42,\n",
|
||||
" verbose=0,\n",
|
||||
" scale_pos_weight=scale,\n",
|
||||
" eval_metric='AUC'\n",
|
||||
" eval_metric=\"AUC\",\n",
|
||||
")\n",
|
||||
"\n",
@@ -1427,10 +1526,7 @@
"outputs": [],
"source": [
"# CatBoost a un support natif pour SHAP\n",
"shap_values = model_baseline.get_feature_importance(\n",
" train_pool,\n",
" type='ShapValues'\n",
")"
"shap_values = model_baseline.get_feature_importance(train_pool, type=\"ShapValues\")"
]
},
{
{
|
||||
@@ -1483,13 +1579,12 @@
|
||||
"source": [
|
||||
"# Importance SHAP moyenne\n",
|
||||
"shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
|
||||
"shap_df = pd.DataFrame({\n",
|
||||
" 'feature': X_train.columns,\n",
|
||||
" 'shap_importance': shap_importance\n",
|
||||
"}).sort_values('shap_importance', ascending=False)\n",
|
||||
"shap_df = pd.DataFrame(\n",
|
||||
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
|
||||
").sort_values(\"shap_importance\", ascending=False)\n",
|
||||
"\n",
|
||||
"print(\"\\nImportance SHAP moyenne :\")\n",
|
||||
"print(shap_df.head(10))\n"
|
||||
"print(shap_df.head(10))"
|
||||
]
{
@@ -1512,9 +1607,9 @@
"source": [
"# Visualisation\n",
"plt.figure(figsize=(10, 8))\n",
"plt.barh(shap_df['feature'][:15], shap_df['shap_importance'][:15])\n",
"plt.xlabel('|SHAP value| moyen')\n",
"plt.title('Importance des features (SHAP)')\n",
"plt.barh(shap_df[\"feature\"][:15], shap_df[\"shap_importance\"][:15])\n",
"plt.xlabel(\"|SHAP value| moyen\")\n",
"plt.title(\"Importance des features (SHAP)\")\n",
"plt.gca().invert_yaxis()\n",
"plt.tight_layout()\n",
"plt.tight_layout()\n",