mirror of
https://github.com/ArthurDanjou/ArtStudies.git
synced 2026-01-14 15:54:13 +01:00
Refactor code formatting and improve readability in Jupyter notebooks for TP_4 and TP_5
- Adjusted indentation and line breaks for better clarity in function definitions and import statements. - Standardized string quotes for consistency across the codebase. - Enhanced readability of DataFrame creation and manipulation by breaking long lines into multiple lines. - Cleaned up print statements and comments for improved understanding. - Ensured consistent use of whitespace around operators and after commas.
This commit is contained in:
@@ -715,10 +715,7 @@
|
|||||||
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
|
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
|
||||||
" approx_error = np.abs(I_approx - I_exact)\n",
|
" approx_error = np.abs(I_approx - I_exact)\n",
|
||||||
" approx_errors.append(approx_error)\n",
|
" approx_errors.append(approx_error)\n",
|
||||||
" print(\n",
|
" print(f\"{N:5d} | \" + \" \".join(f\"{e:.3f} \" for e in approx_errors))"
|
||||||
" f\"{N:5d} | \"\n",
|
|
||||||
" + \" \".join(f\"{e:.3f} \" for e in approx_errors)\n",
|
|
||||||
" )"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -773,10 +770,7 @@
|
|||||||
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
|
" I_exact = 2 / (k + 1) if k % 2 == 0 else 0\n",
|
||||||
" approx_error = np.abs(I_approx - I_exact)\n",
|
" approx_error = np.abs(I_approx - I_exact)\n",
|
||||||
" approx_errors.append(approx_error)\n",
|
" approx_errors.append(approx_error)\n",
|
||||||
" print(\n",
|
" print(f\"{N:5d} | \" + \" \".join(f\"{e:.3f} \" for e in approx_errors))"
|
||||||
" f\"{N:5d} | \"\n",
|
|
||||||
" + \" \".join(f\"{e:.3f} \" for e in approx_errors)\n",
|
|
||||||
" )"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -333,6 +333,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return 1 / (1 + x**2)\n",
|
" return 1 / (1 + x**2)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"a, b = -5, 5\n",
|
"a, b = -5, 5\n",
|
||||||
"xx = np.linspace(a, b, 200)\n",
|
"xx = np.linspace(a, b, 200)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -375,6 +377,8 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return 1 / (1 + x**2)\n",
|
" return 1 / (1 + x**2)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"a, b = -5, 5\n",
|
"a, b = -5, 5\n",
|
||||||
"xx = np.linspace(a, b, 200)\n",
|
"xx = np.linspace(a, b, 200)\n",
|
||||||
"\n",
|
"\n",
|
||||||
|
|||||||
@@ -65,12 +65,20 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"def f1(x):\n",
|
"def f1(x):\n",
|
||||||
" return np.exp(x) - 1 - x\n",
|
" return np.exp(x) - 1 - x\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def f2(x):\n",
|
"def f2(x):\n",
|
||||||
" return x - np.sin(x)\n",
|
" return x - np.sin(x)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def f3(x):\n",
|
"def f3(x):\n",
|
||||||
" return x + np.sin(x)\n",
|
" return x + np.sin(x)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def f4(x):\n",
|
"def f4(x):\n",
|
||||||
" return x + np.cos(x) - 1\n",
|
" return x + np.cos(x) - 1\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def f5(x):\n",
|
"def f5(x):\n",
|
||||||
" return x - np.cos(x) + 1"
|
" return x - np.cos(x) + 1"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -159,11 +159,15 @@
|
|||||||
"\n",
|
"\n",
|
||||||
" for n in range(N - 1):\n",
|
" for n in range(N - 1):\n",
|
||||||
" p1 = f(vt[n], yn[:, n])\n",
|
" p1 = f(vt[n], yn[:, n])\n",
|
||||||
|
"\n",
|
||||||
" def F1(p2):\n",
|
" def F1(p2):\n",
|
||||||
" return f(vt[n] + h / 3, yn[:, n] + h / 6 * (p1 + p2)) - p2\n",
|
" return f(vt[n] + h / 3, yn[:, n] + h / 6 * (p1 + p2)) - p2\n",
|
||||||
|
"\n",
|
||||||
" p2 = newton(F1, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
|
" p2 = newton(F1, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
|
||||||
|
"\n",
|
||||||
" def F2(yn1):\n",
|
" def F2(yn1):\n",
|
||||||
" return yn[:, n] + h / 4 * (3 * p2 + f(vt[n + 1], yn1)) - yn1\n",
|
" return yn[:, n] + h / 4 * (3 * p2 + f(vt[n + 1], yn1)) - yn1\n",
|
||||||
|
"\n",
|
||||||
" yn[:, n + 1] = newton(F2, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
|
" yn[:, n + 1] = newton(F2, yn[:, n], fprime=None, tol=tol, maxiter=itmax)\n",
|
||||||
" return yn"
|
" return yn"
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -66,6 +66,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return np.tanh(x)\n",
|
" return np.tanh(x)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"aL, aR = -20, 3\n",
|
"aL, aR = -20, 3\n",
|
||||||
"print(dichotomy(f, aL, aR))"
|
"print(dichotomy(f, aL, aR))"
|
||||||
]
|
]
|
||||||
@@ -135,9 +137,15 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return np.log(np.exp(x) + np.exp(-x))\n",
|
" return np.log(np.exp(x) + np.exp(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"x0 = 1.8\n",
|
"x0 = 1.8\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def df(x):\n",
|
"def df(x):\n",
|
||||||
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
|
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"print(Newton(f, df, x0))"
|
"print(Newton(f, df, x0))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -188,6 +196,8 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return np.log(np.exp(x) + np.exp(-x))\n",
|
" return np.log(np.exp(x) + np.exp(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"xx = [(1, 1.9), (1, 2.3), (1, 2.4)]\n",
|
"xx = [(1, 1.9), (1, 2.3), (1, 2.4)]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for x0, x1 in xx:\n",
|
"for x0, x1 in xx:\n",
|
||||||
@@ -265,8 +275,12 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"def f(x):\n",
|
"def f(x):\n",
|
||||||
" return np.log(np.exp(x) + np.exp(-x))\n",
|
" return np.log(np.exp(x) + np.exp(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"def df(x):\n",
|
"def df(x):\n",
|
||||||
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
|
" return (np.exp(x) - np.exp(-x)) / (np.exp(x) + np.exp(-x))\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
"print(DichotomyNewton(f, df, -20, 3))"
|
"print(DichotomyNewton(f, df, -20, 3))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -1412,7 +1412,10 @@
|
|||||||
"f, axarr = plt.subplots(2, 3, sharex=\"col\", sharey=\"row\", figsize=(15, 12))\n",
|
"f, axarr = plt.subplots(2, 3, sharex=\"col\", sharey=\"row\", figsize=(15, 12))\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for idx, clf, tt in zip(\n",
|
"for idx, clf, tt in zip(\n",
|
||||||
" product([0, 1, 2], [0, 1, 2]), KNNs, [f\"KNN (k={k})\" for k in nb_neighbors], strict=False\n",
|
" product([0, 1, 2], [0, 1, 2]),\n",
|
||||||
|
" KNNs,\n",
|
||||||
|
" [f\"KNN (k={k})\" for k in nb_neighbors],\n",
|
||||||
|
" strict=False,\n",
|
||||||
"):\n",
|
"):\n",
|
||||||
" Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
|
" Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])\n",
|
||||||
" Z = Z.reshape(xx.shape)\n",
|
" Z = Z.reshape(xx.shape)\n",
|
||||||
|
|||||||
@@ -2545,7 +2545,9 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"MSEs = []\n",
|
"MSEs = []\n",
|
||||||
"for name, estimator in zip(\n",
|
"for name, estimator in zip(\n",
|
||||||
" [\"LassoCV\", \"LassoBIC\", \"RidgeCV\", \"OLS\"], [lassoCV, lassoBIC, ridgeCV, linReg], strict=False\n",
|
" [\"LassoCV\", \"LassoBIC\", \"RidgeCV\", \"OLS\"],\n",
|
||||||
|
" [lassoCV, lassoBIC, ridgeCV, linReg],\n",
|
||||||
|
" strict=False,\n",
|
||||||
"):\n",
|
"):\n",
|
||||||
" y_pred = estimator.predict(Xtest)\n",
|
" y_pred = estimator.predict(Xtest)\n",
|
||||||
" MSE = mean_squared_error(Ytest, y_pred)\n",
|
" MSE = mean_squared_error(Ytest, y_pred)\n",
|
||||||
|
|||||||
@@ -24,20 +24,29 @@
|
|||||||
"%matplotlib inline\n",
|
"%matplotlib inline\n",
|
||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"sns.set(style='whitegrid')\n",
|
"\n",
|
||||||
|
"sns.set(style=\"whitegrid\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import tensorflow as tf\n",
|
"import tensorflow as tf\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
"from sklearn.preprocessing import StandardScaler\n",
|
||||||
"from tensorflow import keras\n",
|
"from tensorflow import keras\n",
|
||||||
"\n",
|
"\n",
|
||||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.mnist.load_data())\n",
|
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()\n",
|
||||||
"X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size=0.8)\n",
|
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
|
||||||
|
" X_train_full, y_train_full, train_size=0.8\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"scaler = StandardScaler()\n",
|
"scaler = StandardScaler()\n",
|
||||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)\n",
|
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)\n",
|
" -1, 28, 28\n",
|
||||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28)"
|
")\n",
|
||||||
|
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
|
" -1, 28, 28\n",
|
||||||
|
")\n",
|
||||||
|
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
|
" -1, 28, 28\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -69,11 +78,15 @@
|
|||||||
" [\n",
|
" [\n",
|
||||||
" keras.layers.Input(shape=[28, 28]),\n",
|
" keras.layers.Input(shape=[28, 28]),\n",
|
||||||
" keras.layers.Flatten(),\n",
|
" keras.layers.Flatten(),\n",
|
||||||
" keras.layers.Dense(256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)),\n",
|
" keras.layers.Dense(\n",
|
||||||
" keras.layers.Dense(128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)),\n",
|
" 256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
|
||||||
|
" ),\n",
|
||||||
|
" keras.layers.Dense(\n",
|
||||||
|
" 128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(0.001)\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||||
" ]\n",
|
" ]\n",
|
||||||
")\n"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -150,8 +163,16 @@
|
|||||||
" [\n",
|
" [\n",
|
||||||
" keras.layers.Input(shape=[28, 28]),\n",
|
" keras.layers.Input(shape=[28, 28]),\n",
|
||||||
" keras.layers.Flatten(),\n",
|
" keras.layers.Flatten(),\n",
|
||||||
" keras.layers.Dense(256, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(lambda_l2)),\n",
|
" keras.layers.Dense(\n",
|
||||||
" keras.layers.Dense(128, activation=\"relu\", kernel_regularizer=keras.regularizers.l2(lambda_l2)),\n",
|
" 256,\n",
|
||||||
|
" activation=\"relu\",\n",
|
||||||
|
" kernel_regularizer=keras.regularizers.l2(lambda_l2),\n",
|
||||||
|
" ),\n",
|
||||||
|
" keras.layers.Dense(\n",
|
||||||
|
" 128,\n",
|
||||||
|
" activation=\"relu\",\n",
|
||||||
|
" kernel_regularizer=keras.regularizers.l2(lambda_l2),\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||||
" ]\n",
|
" ]\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
@@ -218,20 +239,28 @@
|
|||||||
" lambda_l2 = result[\"lambda_l2\"]\n",
|
" lambda_l2 = result[\"lambda_l2\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 1)\n",
|
" plt.subplot(1, 2, 1)\n",
|
||||||
" plt.plot(history_df[\"val_loss\"], label=f\"LR={learning_rate}, L2={lambda_l2}\", color=colors[_])\n",
|
" plt.plot(\n",
|
||||||
|
" history_df[\"val_loss\"],\n",
|
||||||
|
" label=f\"LR={learning_rate}, L2={lambda_l2}\",\n",
|
||||||
|
" color=colors[_],\n",
|
||||||
|
" )\n",
|
||||||
" plt.plot(history_df[\"loss\"], linestyle=\"--\", color=colors[_])\n",
|
" plt.plot(history_df[\"loss\"], linestyle=\"--\", color=colors[_])\n",
|
||||||
" plt.xlabel(\"Epochs\")\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel(\"Loss\")\n",
|
" plt.ylabel(\"Loss\")\n",
|
||||||
" plt.legend()\n",
|
" plt.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 2)\n",
|
" plt.subplot(1, 2, 2)\n",
|
||||||
" plt.plot(history_df[\"val_accuracy\"], label=f\"LR={learning_rate}, L2={lambda_l2}\", color=colors[_])\n",
|
" plt.plot(\n",
|
||||||
|
" history_df[\"val_accuracy\"],\n",
|
||||||
|
" label=f\"LR={learning_rate}, L2={lambda_l2}\",\n",
|
||||||
|
" color=colors[_],\n",
|
||||||
|
" )\n",
|
||||||
" plt.plot(history_df[\"accuracy\"], linestyle=\"--\", color=colors[_])\n",
|
" plt.plot(history_df[\"accuracy\"], linestyle=\"--\", color=colors[_])\n",
|
||||||
" plt.xlabel(\"Epochs\")\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel(\"Accuracy\")\n",
|
" plt.ylabel(\"Accuracy\")\n",
|
||||||
" plt.legend()\n",
|
" plt.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.show()\n"
|
" plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -26,11 +26,11 @@
|
|||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"sns.set(style='whitegrid')\n",
|
"sns.set(style=\"whitegrid\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from tensorflow import keras\n",
|
"from tensorflow import keras\n",
|
||||||
"\n",
|
"\n",
|
||||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.mnist.load_data())"
|
"(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.mnist.load_data()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -61,7 +61,7 @@
|
|||||||
" X_train_full, y_train_full, test_size=0.2, random_state=42\n",
|
" X_train_full, y_train_full, test_size=0.2, random_state=42\n",
|
||||||
")\n",
|
")\n",
|
||||||
"print(X_train.shape, y_train.shape)\n",
|
"print(X_train.shape, y_train.shape)\n",
|
||||||
"print(X_valid.shape, y_valid.shape)\n"
|
"print(X_valid.shape, y_valid.shape)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -88,9 +88,9 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"plt.figure(figsize=(10,10))\n",
|
"plt.figure(figsize=(10, 10))\n",
|
||||||
"for i in range(25):\n",
|
"for i in range(25):\n",
|
||||||
" plt.subplot(5,5,i+1)\n",
|
" plt.subplot(5, 5, i + 1)\n",
|
||||||
" plt.xticks([])\n",
|
" plt.xticks([])\n",
|
||||||
" plt.yticks([])\n",
|
" plt.yticks([])\n",
|
||||||
" plt.grid(False)\n",
|
" plt.grid(False)\n",
|
||||||
@@ -174,13 +174,15 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"model = keras.models.Sequential([\n",
|
"model = keras.models.Sequential(\n",
|
||||||
" keras.layers.Input(shape=[28, 28]),\n",
|
" [\n",
|
||||||
" keras.layers.Flatten(),\n",
|
" keras.layers.Input(shape=[28, 28]),\n",
|
||||||
" keras.layers.Dense(256, activation=\"relu\"),\n",
|
" keras.layers.Flatten(),\n",
|
||||||
" keras.layers.Dense(128, activation=\"relu\"),\n",
|
" keras.layers.Dense(256, activation=\"relu\"),\n",
|
||||||
" keras.layers.Dense(10, activation=\"softmax\")\n",
|
" keras.layers.Dense(128, activation=\"relu\"),\n",
|
||||||
"])"
|
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||||
|
" ]\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -293,7 +295,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print(28*28)\n",
|
"print(28 * 28)\n",
|
||||||
"print(256)\n",
|
"print(256)\n",
|
||||||
"print(128)\n",
|
"print(128)\n",
|
||||||
"print(10)\n",
|
"print(10)\n",
|
||||||
@@ -332,7 +334,7 @@
|
|||||||
" loss=\"sparse_categorical_crossentropy\",\n",
|
" loss=\"sparse_categorical_crossentropy\",\n",
|
||||||
" optimizer=keras.optimizers.SGD(learning_rate=1e-3),\n",
|
" optimizer=keras.optimizers.SGD(learning_rate=1e-3),\n",
|
||||||
" metrics=[\"accuracy\"],\n",
|
" metrics=[\"accuracy\"],\n",
|
||||||
")\n"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -379,7 +381,7 @@
|
|||||||
" epochs=epochs,\n",
|
" epochs=epochs,\n",
|
||||||
" batch_size=batch_size,\n",
|
" batch_size=batch_size,\n",
|
||||||
" validation_data=(X_valid, y_valid),\n",
|
" validation_data=(X_valid, y_valid),\n",
|
||||||
")\n"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -435,17 +437,17 @@
|
|||||||
" plt.figure(figsize=(12, 4))\n",
|
" plt.figure(figsize=(12, 4))\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 1)\n",
|
" plt.subplot(1, 2, 1)\n",
|
||||||
" plt.plot(history_df['loss'], label='Training Loss')\n",
|
" plt.plot(history_df[\"loss\"], label=\"Training Loss\")\n",
|
||||||
" plt.plot(history_df[\"val_loss\"], label=\"Validation Loss\")\n",
|
" plt.plot(history_df[\"val_loss\"], label=\"Validation Loss\")\n",
|
||||||
" plt.xlabel(\"Epochs\")\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel(\"Loss\")\n",
|
" plt.ylabel(\"Loss\")\n",
|
||||||
" plt.legend()\n",
|
" plt.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 2)\n",
|
" plt.subplot(1, 2, 2)\n",
|
||||||
" plt.plot(history_df['accuracy'], label='Accuracy')\n",
|
" plt.plot(history_df[\"accuracy\"], label=\"Accuracy\")\n",
|
||||||
" plt.plot(history_df[\"val_accuracy\"], label=\"Validation Accuracy\")\n",
|
" plt.plot(history_df[\"val_accuracy\"], label=\"Validation Accuracy\")\n",
|
||||||
" plt.xlabel('Epochs')\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel('Accuracy')\n",
|
" plt.ylabel(\"Accuracy\")\n",
|
||||||
" plt.legend()"
|
" plt.legend()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -645,7 +647,7 @@
|
|||||||
" \"n_epochs\": n_epochs,\n",
|
" \"n_epochs\": n_epochs,\n",
|
||||||
" \"history\": pd.DataFrame(history.history),\n",
|
" \"history\": pd.DataFrame(history.history),\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
" results.append(result)\n"
|
" results.append(result)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -669,36 +671,27 @@
|
|||||||
" learning_rate = result[\"learning_rate\"]\n",
|
" learning_rate = result[\"learning_rate\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 1)\n",
|
" plt.subplot(1, 2, 1)\n",
|
||||||
|
" plt.plot(history_df[\"val_loss\"], linestyle=\"--\", color=colors[_])\n",
|
||||||
" plt.plot(\n",
|
" plt.plot(\n",
|
||||||
" history_df[\"val_loss\"],\n",
|
" history_df[\"loss\"], label=f\"LR={learning_rate}\", alpha=0.5, color=colors[_]\n",
|
||||||
" linestyle=\"--\",\n",
|
|
||||||
" color=colors[_]\n",
|
|
||||||
" )\n",
|
|
||||||
" plt.plot(\n",
|
|
||||||
" history_df[\"loss\"], label=f\"LR={learning_rate}\", alpha=0.5,\n",
|
|
||||||
" color=colors[_]\n",
|
|
||||||
" )\n",
|
" )\n",
|
||||||
" plt.xlabel(\"Epochs\")\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel(\"Loss\")\n",
|
" plt.ylabel(\"Loss\")\n",
|
||||||
" plt.legend()\n",
|
" plt.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.subplot(1, 2, 2)\n",
|
" plt.subplot(1, 2, 2)\n",
|
||||||
" plt.plot(\n",
|
" plt.plot(history_df[\"val_accuracy\"], linestyle=\"--\", color=colors[_])\n",
|
||||||
" history_df[\"val_accuracy\"],\n",
|
|
||||||
" linestyle=\"--\",\n",
|
|
||||||
" color=colors[_]\n",
|
|
||||||
" )\n",
|
|
||||||
" plt.plot(\n",
|
" plt.plot(\n",
|
||||||
" history_df[\"accuracy\"],\n",
|
" history_df[\"accuracy\"],\n",
|
||||||
" label=f\"LR={learning_rate}\",\n",
|
" label=f\"LR={learning_rate}\",\n",
|
||||||
" alpha=0.5,\n",
|
" alpha=0.5,\n",
|
||||||
" color=colors[_]\n",
|
" color=colors[_],\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" plt.xlabel(\"Epochs\")\n",
|
" plt.xlabel(\"Epochs\")\n",
|
||||||
" plt.ylabel(\"Accuracy\")\n",
|
" plt.ylabel(\"Accuracy\")\n",
|
||||||
" plt.legend()\n",
|
" plt.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
" plt.show()\n"
|
" plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -767,7 +760,7 @@
|
|||||||
" \"n_epochs\": n_epochs,\n",
|
" \"n_epochs\": n_epochs,\n",
|
||||||
" \"history\": pd.DataFrame(history.history),\n",
|
" \"history\": pd.DataFrame(history.history),\n",
|
||||||
" }\n",
|
" }\n",
|
||||||
" results.append(result)\n"
|
" results.append(result)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -24,20 +24,30 @@
|
|||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"sns.set(style='whitegrid')\n",
|
"sns.set(style=\"whitegrid\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"import tensorflow as tf\n",
|
"import tensorflow as tf\n",
|
||||||
"from sklearn.model_selection import train_test_split\n",
|
"from sklearn.model_selection import train_test_split\n",
|
||||||
"from sklearn.preprocessing import StandardScaler\n",
|
"from sklearn.preprocessing import StandardScaler\n",
|
||||||
"from tensorflow import keras\n",
|
"from tensorflow import keras\n",
|
||||||
"\n",
|
"\n",
|
||||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.fashion_mnist.load_data())\n",
|
"(X_train_full, y_train_full), (X_test, y_test) = (\n",
|
||||||
"X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, train_size=0.8)\n",
|
" keras.datasets.fashion_mnist.load_data()\n",
|
||||||
|
")\n",
|
||||||
|
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
|
||||||
|
" X_train_full, y_train_full, train_size=0.8\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"scaler = StandardScaler()\n",
|
"scaler = StandardScaler()\n",
|
||||||
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)\n",
|
"X_train = scaler.fit_transform(X_train.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)\n",
|
" -1, 28, 28, 1\n",
|
||||||
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(-1, 28, 28, 1)"
|
")\n",
|
||||||
|
"X_valid = scaler.transform(X_valid.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
|
" -1, 28, 28, 1\n",
|
||||||
|
")\n",
|
||||||
|
"X_test = scaler.transform(X_test.astype(np.float32).reshape(-1, 28 * 28)).reshape(\n",
|
||||||
|
" -1, 28, 28, 1\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -26,11 +26,13 @@
|
|||||||
"import matplotlib.pyplot as plt\n",
|
"import matplotlib.pyplot as plt\n",
|
||||||
"import seaborn as sns\n",
|
"import seaborn as sns\n",
|
||||||
"\n",
|
"\n",
|
||||||
"sns.set(style='whitegrid')\n",
|
"sns.set(style=\"whitegrid\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"from tensorflow import keras\n",
|
"from tensorflow import keras\n",
|
||||||
"\n",
|
"\n",
|
||||||
"(X_train_full, y_train_full), (X_test, y_test) = (keras.datasets.fashion_mnist.load_data())"
|
"(X_train_full, y_train_full), (X_test, y_test) = (\n",
|
||||||
|
" keras.datasets.fashion_mnist.load_data()\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -186,7 +188,7 @@
|
|||||||
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
|
" keras.layers.Dense(units=64, activation=\"relu\"),\n",
|
||||||
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
|
" keras.layers.Dense(units=10, activation=\"softmax\"),\n",
|
||||||
" ]\n",
|
" ]\n",
|
||||||
")\n"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -627,10 +629,7 @@
|
|||||||
" batch_size=batch_size,\n",
|
" batch_size=batch_size,\n",
|
||||||
" validation_data=(X_valid, y_valid),\n",
|
" validation_data=(X_valid, y_valid),\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" training_curves.append({\n",
|
" training_curves.append({\"history\": history, \"normalization\": normalized})"
|
||||||
" 'history': history,\n",
|
|
||||||
" 'normalization': normalized\n",
|
|
||||||
" })"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -653,7 +652,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def agregate_result(results: list, normalized: bool, metric_name: str = 'accuracy') -> pd.DataFrame:\n",
|
"def agregate_result(\n",
|
||||||
|
" results: list, normalized: bool, metric_name: str = \"accuracy\"\n",
|
||||||
|
") -> pd.DataFrame:\n",
|
||||||
" train_curves = []\n",
|
" train_curves = []\n",
|
||||||
" val_curves = []\n",
|
" val_curves = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -663,7 +664,7 @@
|
|||||||
" train_curves.append(hist_obj.history[metric_name])\n",
|
" train_curves.append(hist_obj.history[metric_name])\n",
|
||||||
" val_curves.append(hist_obj.history[f\"val_{metric_name}\"])\n",
|
" val_curves.append(hist_obj.history[f\"val_{metric_name}\"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
" return np.array(train_curves).flatten(), np.array(val_curves).flatten()\n"
|
" return np.array(train_curves).flatten(), np.array(val_curves).flatten()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -697,7 +698,9 @@
|
|||||||
"for idx, metric in enumerate(metrics):\n",
|
"for idx, metric in enumerate(metrics):\n",
|
||||||
" ax = axs[idx]\n",
|
" ax = axs[idx]\n",
|
||||||
" for normalized in [True, False]:\n",
|
" for normalized in [True, False]:\n",
|
||||||
" train, val = agregate_result(training_curves, normalized=normalized, metric_name=metric)\n",
|
" train, val = agregate_result(\n",
|
||||||
|
" training_curves, normalized=normalized, metric_name=metric\n",
|
||||||
|
" )\n",
|
||||||
" train_runs = train.reshape(-1, epochs)\n",
|
" train_runs = train.reshape(-1, epochs)\n",
|
||||||
" val_runs = val.reshape(-1, epochs)\n",
|
" val_runs = val.reshape(-1, epochs)\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -710,10 +713,22 @@
|
|||||||
" label_prefix = \"With BN\" if normalized else \"Without BN\"\n",
|
" label_prefix = \"With BN\" if normalized else \"Without BN\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ax.plot(mean_train, label=label_prefix, color=color, linestyle=\"-\")\n",
|
" ax.plot(mean_train, label=label_prefix, color=color, linestyle=\"-\")\n",
|
||||||
" ax.fill_between(range(epochs), mean_train - std_train, mean_train + std_train, color=color, alpha=0.2)\n",
|
" ax.fill_between(\n",
|
||||||
|
" range(epochs),\n",
|
||||||
|
" mean_train - std_train,\n",
|
||||||
|
" mean_train + std_train,\n",
|
||||||
|
" color=color,\n",
|
||||||
|
" alpha=0.2,\n",
|
||||||
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ax.plot(mean_val, color=color, linestyle=\"--\")\n",
|
" ax.plot(mean_val, color=color, linestyle=\"--\")\n",
|
||||||
" ax.fill_between(range(epochs), mean_val - std_val, mean_val + std_val, color=color, alpha=0.2)\n",
|
" ax.fill_between(\n",
|
||||||
|
" range(epochs),\n",
|
||||||
|
" mean_val - std_val,\n",
|
||||||
|
" mean_val + std_val,\n",
|
||||||
|
" color=color,\n",
|
||||||
|
" alpha=0.2,\n",
|
||||||
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" ax.set_title(f\"Training and Validation {metric.capitalize()}\")\n",
|
" ax.set_title(f\"Training and Validation {metric.capitalize()}\")\n",
|
||||||
" ax.set_xlabel(\"Epochs\")\n",
|
" ax.set_xlabel(\"Epochs\")\n",
|
||||||
@@ -721,7 +736,7 @@
|
|||||||
" ax.legend()\n",
|
" ax.legend()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"plt.tight_layout()\n",
|
"plt.tight_layout()\n",
|
||||||
"plt.show()\n"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -43,7 +43,7 @@
|
|||||||
" 7: \"horse\",\n",
|
" 7: \"horse\",\n",
|
||||||
" 8: \"ship\",\n",
|
" 8: \"ship\",\n",
|
||||||
" 9: \"truck \",\n",
|
" 9: \"truck \",\n",
|
||||||
"}\n"
|
"}"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -299,13 +299,21 @@
|
|||||||
" model = keras.Sequential(\n",
|
" model = keras.Sequential(\n",
|
||||||
" [\n",
|
" [\n",
|
||||||
" keras.layers.InputLayer(shape=(32, 32, 3)),\n",
|
" keras.layers.InputLayer(shape=(32, 32, 3)),\n",
|
||||||
" keras.layers.Conv2D(filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
" keras.layers.Conv2D(\n",
|
||||||
|
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.Dropout(0.2),\n",
|
" keras.layers.Dropout(0.2),\n",
|
||||||
" keras.layers.Conv2D(filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
" keras.layers.Conv2D(\n",
|
||||||
|
" filters=32, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.MaxPooling2D(pool_size=2),\n",
|
" keras.layers.MaxPooling2D(pool_size=2),\n",
|
||||||
" keras.layers.Conv2D(filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
" keras.layers.Conv2D(\n",
|
||||||
|
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.Dropout(0.2),\n",
|
" keras.layers.Dropout(0.2),\n",
|
||||||
" keras.layers.Conv2D(filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"),\n",
|
" keras.layers.Conv2D(\n",
|
||||||
|
" filters=16, kernel_size=3, activation=\"relu\", padding=\"same\"\n",
|
||||||
|
" ),\n",
|
||||||
" keras.layers.Flatten(),\n",
|
" keras.layers.Flatten(),\n",
|
||||||
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
" keras.layers.Dense(10, activation=\"softmax\"),\n",
|
||||||
" ]\n",
|
" ]\n",
|
||||||
@@ -316,7 +324,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"model = get_model()\n",
|
"model = get_model()\n",
|
||||||
"model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n",
|
"model.compile(optimizer=\"adam\", loss=\"categorical_crossentropy\", metrics=[\"accuracy\"])\n",
|
||||||
"model.summary()\n"
|
"model.summary()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -339,7 +347,9 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def compile_train(optimizer_function: str, learning_rate: float, **kwargs) -> keras.callbacks.History:\n",
|
"def compile_train(\n",
|
||||||
|
" optimizer_function: str, learning_rate: float, **kwargs\n",
|
||||||
|
") -> keras.callbacks.History:\n",
|
||||||
" model = get_model()\n",
|
" model = get_model()\n",
|
||||||
" optimizer = optimizer_function(learning_rate=learning_rate)\n",
|
" optimizer = optimizer_function(learning_rate=learning_rate)\n",
|
||||||
" model.compile(\n",
|
" model.compile(\n",
|
||||||
@@ -388,9 +398,11 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"epochs=5\n",
|
"epochs = 5\n",
|
||||||
"batch_size=64\n",
|
"batch_size = 64\n",
|
||||||
"history_adam = compile_train(keras.optimizers.Adam, learning_rate=0.001, epochs=epochs, batch_size=batch_size)"
|
"history_adam = compile_train(\n",
|
||||||
|
" keras.optimizers.Adam, learning_rate=0.001, epochs=epochs, batch_size=batch_size\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -603,7 +615,7 @@
|
|||||||
"plt.xlabel(\"Epochs\")\n",
|
"plt.xlabel(\"Epochs\")\n",
|
||||||
"plt.ylabel(\"Validation Loss\")\n",
|
"plt.ylabel(\"Validation Loss\")\n",
|
||||||
"plt.legend()\n",
|
"plt.legend()\n",
|
||||||
"plt.show()\n"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -60,11 +60,11 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print(3+4)\n",
|
"print(3 + 4)\n",
|
||||||
"print(5*2.5)\n",
|
"print(5 * 2.5)\n",
|
||||||
"print(int)\n",
|
"print(int)\n",
|
||||||
"print(float)\n",
|
"print(float)\n",
|
||||||
"a=2**90\n",
|
"a = 2**90\n",
|
||||||
"print(a)\n",
|
"print(a)\n",
|
||||||
"print(type(a))"
|
"print(type(a))"
|
||||||
]
|
]
|
||||||
@@ -90,11 +90,11 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#from math import *\n",
|
"# from math import *\n",
|
||||||
"#help(math)\n",
|
"# help(math)\n",
|
||||||
"print (math.sqrt(3))\n",
|
"print(math.sqrt(3))\n",
|
||||||
"print (math.floor(3.2))\n",
|
"print(math.floor(3.2))\n",
|
||||||
"print (math.cos(math.pi/3.0))"
|
"print(math.cos(math.pi / 3.0))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -121,11 +121,11 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print('33' + \"42\")\n",
|
"print(\"33\" + \"42\")\n",
|
||||||
"a=\"toto est toto\"\n",
|
"a = \"toto est toto\"\n",
|
||||||
"a[4:]\n",
|
"a[4:]\n",
|
||||||
"#Tout est objet\n",
|
"# Tout est objet\n",
|
||||||
"print(a.split(' '))"
|
"print(a.split(\" \"))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -143,7 +143,9 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\"XX\", \"Mardi 19 septembre\")\n",
|
"salutation = \"Bonjour, monsieur {}. Comment allez vous en ce {}?\".format(\n",
|
||||||
|
" \"XX\", \"Mardi 19 septembre\"\n",
|
||||||
|
")\n",
|
||||||
"print(salutation)"
|
"print(salutation)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -195,12 +197,12 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"a = [1,'q',2,3,5,8,'TOTO']\n",
|
"a = [1, \"q\", 2, 3, 5, 8, \"TOTO\"]\n",
|
||||||
"print (a[1])\n",
|
"print(a[1])\n",
|
||||||
"print (a[-1])\n",
|
"print(a[-1])\n",
|
||||||
"print (a[1:3])\n",
|
"print(a[1:3])\n",
|
||||||
"print (a[-4:-2])\n",
|
"print(a[-4:-2])\n",
|
||||||
"print([5]*4)"
|
"print([5] * 4)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -221,10 +223,10 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"print (range(4, 10))\n",
|
"print(range(4, 10))\n",
|
||||||
"print (range(5, 50, 3))\n",
|
"print(range(5, 50, 3))\n",
|
||||||
"print ([3,1,4] + [1,5,9])\n",
|
"print([3, 1, 4] + [1, 5, 9])\n",
|
||||||
"print (len(range(4, 10)))"
|
"print(len(range(4, 10)))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -258,28 +260,28 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"a = 2\n",
|
"a = 2\n",
|
||||||
"if 5 > a:\n",
|
"if 5 > a:\n",
|
||||||
" print (\"Cinq!\")\n",
|
" print(\"Cinq!\")\n",
|
||||||
"else:\n",
|
"else:\n",
|
||||||
" print (\"a!\")\n",
|
" print(\"a!\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Zoom sur indentation\n",
|
"# Zoom sur indentation\n",
|
||||||
"num=3\n",
|
"num = 3\n",
|
||||||
"if num == 1:\n",
|
"if num == 1:\n",
|
||||||
" print (\"C'est 1\")\n",
|
" print(\"C'est 1\")\n",
|
||||||
"elif num == 2:\n",
|
"elif num == 2:\n",
|
||||||
" print (\"C'est 2\")\n",
|
" print(\"C'est 2\")\n",
|
||||||
"elif num == 3:\n",
|
"elif num == 3:\n",
|
||||||
" print (\"C'est 3\")\n",
|
" print(\"C'est 3\")\n",
|
||||||
"else :\n",
|
"else:\n",
|
||||||
" print (\"Autre que 1, 2 et 3\")\n",
|
" print(\"Autre que 1, 2 et 3\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(num in [1,2,3])\n",
|
"print(num in [1, 2, 3])\n",
|
||||||
"print(num not in [1,2,3])\n",
|
"print(num not in [1, 2, 3])\n",
|
||||||
"print(num != 5)\n",
|
"print(num != 5)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(num in [1,2,3] and num >0)\n",
|
"print(num in [1, 2, 3] and num > 0)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(not 5==3)"
|
"print(not 5 == 3)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -313,21 +315,21 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"a=[3,4,5]\n",
|
"a = [3, 4, 5]\n",
|
||||||
"#Boucle for\n",
|
"# Boucle for\n",
|
||||||
"for i in range(len(a)):\n",
|
"for i in range(len(a)):\n",
|
||||||
" a[i] += 3\n",
|
" a[i] += 3\n",
|
||||||
"print (a)\n",
|
"print(a)\n",
|
||||||
"%timeit for i in range(len(a)):a[i] += 3\n",
|
"%timeit for i in range(len(a)):a[i] += 3\n",
|
||||||
"print (a)\n",
|
"print(a)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"b=[3,4,5]\n",
|
"b = [3, 4, 5]\n",
|
||||||
"print([i+3 for i in b])\n",
|
"print([i + 3 for i in b])\n",
|
||||||
"%timeit [i+3 for i in b]\n",
|
"%timeit [i+3 for i in b]\n",
|
||||||
"print (b)\n",
|
"print(b)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for i,item in enumerate(b):\n",
|
"for i, item in enumerate(b):\n",
|
||||||
" print(i,item)"
|
" print(i, item)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -369,18 +371,18 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Boucle While , break et continue\n",
|
"# Boucle While , break et continue\n",
|
||||||
"print(\"démarrage boucle\")\n",
|
"print(\"démarrage boucle\")\n",
|
||||||
"i=0\n",
|
"i = 0\n",
|
||||||
"while i<100:\n",
|
"while i < 100:\n",
|
||||||
" print(i)\n",
|
" print(i)\n",
|
||||||
" i+=1\n",
|
" i += 1\n",
|
||||||
" if i==52:\n",
|
" if i == 52:\n",
|
||||||
" i+=10\n",
|
" i += 10\n",
|
||||||
" print(\"increment de 10\")\n",
|
" print(\"increment de 10\")\n",
|
||||||
" continue\n",
|
" continue\n",
|
||||||
" i+=2\n",
|
" i += 2\n",
|
||||||
" if i ==77:\n",
|
" if i == 77:\n",
|
||||||
" break"
|
" break"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -408,8 +410,10 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"def square(x):\n",
|
"def square(x):\n",
|
||||||
" return x*x\n",
|
" return x * x\n",
|
||||||
"print (square(3))"
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"print(square(3))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -486,14 +490,14 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"\n",
|
"\n",
|
||||||
"serie = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5})\n",
|
"serie = pd.Series({\"a\": 1, \"b\": 2, \"c\": 3, \"d\": 4, \"e\": 5})\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(serie)\n",
|
"print(serie)\n",
|
||||||
"print(serie.index)\n",
|
"print(serie.index)\n",
|
||||||
"print(serie.mean())\n",
|
"print(serie.mean())\n",
|
||||||
"print(serie['b'])\n",
|
"print(serie[\"b\"])\n",
|
||||||
"print(serie.b)"
|
"print(serie.b)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -594,11 +598,8 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"df = pd.DataFrame({\n",
|
"df = pd.DataFrame({\"s\": s, \"t\": t})\n",
|
||||||
" 's': s,\n",
|
|
||||||
" 't': t\n",
|
|
||||||
"})\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"print(df)"
|
"print(df)"
|
||||||
]
|
]
|
||||||
@@ -633,8 +634,8 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"df['SUM'] = df['s'] + df['t']\n",
|
"df[\"SUM\"] = df[\"s\"] + df[\"t\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(df)"
|
"print(df)"
|
||||||
]
|
]
|
||||||
@@ -662,8 +663,8 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"print(df['SUM'].mean())"
|
"print(df[\"SUM\"].mean())"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -774,7 +775,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"import numpy as np\n",
|
"import numpy as np\n",
|
||||||
"\n",
|
"\n",
|
||||||
"X = np.random.standard_normal(size=250)\n",
|
"X = np.random.standard_normal(size=250)\n",
|
||||||
@@ -809,7 +810,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"index = pd.date_range('2012-01-01', periods=250, freq='D')\n",
|
"index = pd.date_range(\"2012-01-01\", periods=250, freq=\"D\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"serie = pd.Series(X, index=index)\n",
|
"serie = pd.Series(X, index=index)\n",
|
||||||
"print(serie)"
|
"print(serie)"
|
||||||
@@ -872,7 +873,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Data frame\n",
|
"# Data frame\n",
|
||||||
"import pandas as pd\n",
|
"import pandas as pd\n",
|
||||||
"import plotly.express as px\n",
|
"import plotly.express as px\n",
|
||||||
"import plotly.graph_objects as gp\n",
|
"import plotly.graph_objects as gp\n",
|
||||||
@@ -923,8 +924,8 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"path = input_path + '/base_modelisation.csv'\n",
|
"path = input_path + \"/base_modelisation.csv\"\n",
|
||||||
"data_set = pd.read_csv(path,sep=\";\",decimal=\",\")"
|
"data_set = pd.read_csv(path, sep=\";\", decimal=\",\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -2413,7 +2414,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Dimensions\n",
|
"# Dimensions\n",
|
||||||
"data_set.shape"
|
"data_set.shape"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -2568,7 +2569,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Liste des colonnes selon leur type\n",
|
"# Liste des colonnes selon leur type\n",
|
||||||
"data_set.dtypes"
|
"data_set.dtypes"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -2600,13 +2601,13 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"quantitatives = []\n",
|
"quantitatives = []\n",
|
||||||
"categorielles = []\n",
|
"categorielles = []\n",
|
||||||
"binaires = []\n",
|
"binaires = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for col in data_set.columns:\n",
|
"for col in data_set.columns:\n",
|
||||||
" if data_set[col].dtype in ['int64', 'float64']:\n",
|
" if data_set[col].dtype in [\"int64\", \"float64\"]:\n",
|
||||||
" if len(data_set[col].dropna().unique()) == 2:\n",
|
" if len(data_set[col].dropna().unique()) == 2:\n",
|
||||||
" binaires.append(col)\n",
|
" binaires.append(col)\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
@@ -2619,7 +2620,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"print(\"Variables quantitatives :\", quantitatives)\n",
|
"print(\"Variables quantitatives :\", quantitatives)\n",
|
||||||
"print(\"\\nVariables catégorielles :\", categorielles)\n",
|
"print(\"\\nVariables catégorielles :\", categorielles)\n",
|
||||||
"print(\"\\nVariables binaires :\", binaires)\n"
|
"print(\"\\nVariables binaires :\", binaires)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -2638,7 +2639,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"variables_avec_na = []\n",
|
"variables_avec_na = []\n",
|
||||||
"\n",
|
"\n",
|
||||||
"for col in data_set.columns:\n",
|
"for col in data_set.columns:\n",
|
||||||
@@ -3524,8 +3525,10 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"fig = px.histogram(data_set.sort_values('ANNEE_CTR'), x=\"ANNEE_CTR\")\n",
|
"fig = px.histogram(data_set.sort_values(\"ANNEE_CTR\"), x=\"ANNEE_CTR\")\n",
|
||||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
"fig.update_xaxes(\n",
|
||||||
|
" type=\"category\"\n",
|
||||||
|
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -18655,8 +18658,13 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"fig = px.histogram(data_set, x=\"CONTRAT_ANCIENNETE\",\n",
|
"fig = px.histogram(\n",
|
||||||
" category_orders={'CONTRAT_ANCIENNETE': ['(-1,0]','(0,1]',\"(1,2]\",\"(2,5]\",\"(5,10]\"]})\n",
|
" data_set,\n",
|
||||||
|
" x=\"CONTRAT_ANCIENNETE\",\n",
|
||||||
|
" category_orders={\n",
|
||||||
|
" \"CONTRAT_ANCIENNETE\": [\"(-1,0]\", \"(0,1]\", \"(1,2]\", \"(2,5]\", \"(5,10]\"]\n",
|
||||||
|
" },\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -48890,8 +48898,13 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"fig = px.histogram(data_set, x=\"GROUPE_KM\",\n",
|
"fig = px.histogram(\n",
|
||||||
" category_orders={'GROUPE_KM': [\"[0;20000[\",\"[20000;40000[\",\"[40000;60000[\",\"[60000;99999[\"]})\n",
|
" data_set,\n",
|
||||||
|
" x=\"GROUPE_KM\",\n",
|
||||||
|
" category_orders={\n",
|
||||||
|
" \"GROUPE_KM\": [\"[0;20000[\", \"[20000;40000[\", \"[40000;60000[\", \"[60000;99999[\"]\n",
|
||||||
|
" },\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -64005,9 +64018,11 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"fig = px.histogram(data_set.sort_values('ZONE_RISQUE'), x=\"ZONE_RISQUE\")\n",
|
"fig = px.histogram(data_set.sort_values(\"ZONE_RISQUE\"), x=\"ZONE_RISQUE\")\n",
|
||||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
"fig.update_xaxes(\n",
|
||||||
|
" type=\"category\"\n",
|
||||||
|
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -64860,9 +64875,13 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"fig = px.histogram(data_set.sort_values('AGE_ASSURE_PRINCIPAL'), x=\"AGE_ASSURE_PRINCIPAL\")\n",
|
"fig = px.histogram(\n",
|
||||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
" data_set.sort_values(\"AGE_ASSURE_PRINCIPAL\"), x=\"AGE_ASSURE_PRINCIPAL\"\n",
|
||||||
|
")\n",
|
||||||
|
"fig.update_xaxes(\n",
|
||||||
|
" type=\"category\"\n",
|
||||||
|
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -79977,9 +79996,11 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Ecrivez votre code ici\n",
|
"# Ecrivez votre code ici\n",
|
||||||
"fig = px.histogram(data_set.sort_values('GENRE'), x=\"GENRE\")\n",
|
"fig = px.histogram(data_set.sort_values(\"GENRE\"), x=\"GENRE\")\n",
|
||||||
"fig.update_xaxes(type='category') #Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
"fig.update_xaxes(\n",
|
||||||
|
" type=\"category\"\n",
|
||||||
|
") # Cette ligne permet de forcer la variable comme variable catégorielle et non numérique\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -80010,14 +80031,19 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Préparation des données : compter le nombre de femmes et hommes par âge\n",
|
"# Préparation des données : compter le nombre de femmes et hommes par âge\n",
|
||||||
"tmp = data_set[[\"AGE_ASSURE_PRINCIPAL\",\"GENRE\"]].value_counts().to_frame('counts').reset_index()\n",
|
"tmp = (\n",
|
||||||
|
" data_set[[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\"]]\n",
|
||||||
|
" .value_counts()\n",
|
||||||
|
" .to_frame(\"counts\")\n",
|
||||||
|
" .reset_index()\n",
|
||||||
|
")\n",
|
||||||
"data_f = tmp[tmp[\"GENRE\"] == \"F\"]\n",
|
"data_f = tmp[tmp[\"GENRE\"] == \"F\"]\n",
|
||||||
"data_h = tmp[tmp[\"GENRE\"] == \"M\"]\n",
|
"data_h = tmp[tmp[\"GENRE\"] == \"M\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Comparaison des âges\n",
|
"# Comparaison des âges\n",
|
||||||
"list_1=sorted(data_f[\"AGE_ASSURE_PRINCIPAL\"].unique())\n",
|
"list_1 = sorted(data_f[\"AGE_ASSURE_PRINCIPAL\"].unique())\n",
|
||||||
"list_2=sorted(data_h[\"AGE_ASSURE_PRINCIPAL\"].unique())\n",
|
"list_2 = sorted(data_h[\"AGE_ASSURE_PRINCIPAL\"].unique())\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"Eléments dans la liste 2 mais pas dans la liste 1 \")\n",
|
"print(\"Eléments dans la liste 2 mais pas dans la liste 1 \")\n",
|
||||||
"print(list(set(list_2) - set(list_1)))\n",
|
"print(list(set(list_2) - set(list_1)))\n",
|
||||||
@@ -80034,11 +80060,19 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Il faut ajouter l'âge 13 dans la liste 2 (data_h)\n",
|
"# Il faut ajouter l'âge 13 dans la liste 2 (data_h)\n",
|
||||||
"data_h = pd.concat([data_h, pd.DataFrame([[13, \"M\",0]],columns=['AGE_ASSURE_PRINCIPAL', 'GENRE','counts'])], ignore_index=True)\n",
|
"data_h = pd.concat(\n",
|
||||||
|
" [\n",
|
||||||
|
" data_h,\n",
|
||||||
|
" pd.DataFrame(\n",
|
||||||
|
" [[13, \"M\", 0]], columns=[\"AGE_ASSURE_PRINCIPAL\", \"GENRE\", \"counts\"]\n",
|
||||||
|
" ),\n",
|
||||||
|
" ],\n",
|
||||||
|
" ignore_index=True,\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#On ordonne les dataframes\n",
|
"# On ordonne les dataframes\n",
|
||||||
"data_h = data_h.sort_values('AGE_ASSURE_PRINCIPAL', ascending = True)\n",
|
"data_h = data_h.sort_values(\"AGE_ASSURE_PRINCIPAL\", ascending=True)\n",
|
||||||
"data_f = data_f.sort_values('AGE_ASSURE_PRINCIPAL', ascending = True)"
|
"data_f = data_f.sort_values(\"AGE_ASSURE_PRINCIPAL\", ascending=True)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -80049,9 +80083,9 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Graphique\n",
|
"# Graphique\n",
|
||||||
"y_age = data_h['AGE_ASSURE_PRINCIPAL']\n",
|
"y_age = data_h[\"AGE_ASSURE_PRINCIPAL\"]\n",
|
||||||
"x_M = data_h['counts']\n",
|
"x_M = data_h[\"counts\"]\n",
|
||||||
"x_F = data_f['counts'] * -1"
|
"x_F = data_f[\"counts\"] * -1"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -80907,23 +80941,21 @@
|
|||||||
"fig = gp.Figure()\n",
|
"fig = gp.Figure()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Ajout des données H\n",
|
"# Ajout des données H\n",
|
||||||
"fig.add_trace(gp.Bar(y= y_age, x = x_M,\n",
|
"fig.add_trace(gp.Bar(y=y_age, x=x_M, name=\"Hommes\", orientation=\"h\"))\n",
|
||||||
" name = 'Hommes',\n",
|
|
||||||
" orientation = 'h'))\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# Ajout des données F\n",
|
"# Ajout des données F\n",
|
||||||
"fig.add_trace(gp.Bar(y = y_age, x = x_F,\n",
|
"fig.add_trace(gp.Bar(y=y_age, x=x_F, name=\"Femmes\", orientation=\"h\"))\n",
|
||||||
" name = 'Femmes', orientation = 'h'))\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"# layout du graphique\n",
|
"# layout du graphique\n",
|
||||||
"fig.update_layout(title = 'Population du portefeuille',\n",
|
"fig.update_layout(\n",
|
||||||
" title_font_size = 22, barmode = 'relative',\n",
|
" title=\"Population du portefeuille\",\n",
|
||||||
" bargap = 0.0, bargroupgap = 0,\n",
|
" title_font_size=22,\n",
|
||||||
" xaxis = {'title': 'Count',\n",
|
" barmode=\"relative\",\n",
|
||||||
" 'title_font_size': 14},\n",
|
" bargap=0.0,\n",
|
||||||
" yaxis = {'title': 'Age',\n",
|
" bargroupgap=0,\n",
|
||||||
" 'title_font_size': 14}\n",
|
" xaxis={\"title\": \"Count\", \"title_font_size\": 14},\n",
|
||||||
" )\n",
|
" yaxis={\"title\": \"Age\", \"title_font_size\": 14},\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -81795,7 +81827,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"fig = px.histogram(data_set[data_set['CHARGE'] >= 0], x=\"CHARGE\", nbins=50)\n",
|
"fig = px.histogram(data_set[data_set[\"CHARGE\"] >= 0], x=\"CHARGE\", nbins=50)\n",
|
||||||
"fig.update_layout(title=\"Distribution des coûts des sinistres\")\n",
|
"fig.update_layout(title=\"Distribution des coûts des sinistres\")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -81859,8 +81891,8 @@
|
|||||||
"# Nombre de NA par variable\n",
|
"# Nombre de NA par variable\n",
|
||||||
"nan_count = pd.DataFrame(data_set.isna().sum(), columns=[\"Nombre_NA\"])\n",
|
"nan_count = pd.DataFrame(data_set.isna().sum(), columns=[\"Nombre_NA\"])\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Ajout du % par rapport à la taille de la BD\n",
|
"# Ajout du % par rapport à la taille de la BD\n",
|
||||||
"nan_count[\"Pourcentage\"]= nan_count.divide(data_set.shape[0])*100\n",
|
"nan_count[\"Pourcentage\"] = nan_count.divide(data_set.shape[0]) * 100\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(nan_count)"
|
"print(nan_count)"
|
||||||
]
|
]
|
||||||
@@ -82271,7 +82303,7 @@
|
|||||||
"data_retraitee = data_set\n",
|
"data_retraitee = data_set\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Option 1 : Suppression des variables avec trop de NA (PUISSANCE_VEHICULE)\n",
|
"# Option 1 : Suppression des variables avec trop de NA (PUISSANCE_VEHICULE)\n",
|
||||||
"data_retraitee = data_retraitee.drop(\"PUISSANCE_VEHICULE\", axis='columns')\n",
|
"data_retraitee = data_retraitee.drop(\"PUISSANCE_VEHICULE\", axis=\"columns\")\n",
|
||||||
"data_retraitee.head()"
|
"data_retraitee.head()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -82293,14 +82325,22 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Option 2 : Remplacer par la classe la plus représentée/valeur moyenne (GROUPE_KM,GENRE,\n",
|
"# Option 2 : Remplacer par la classe la plus représentée/valeur moyenne (GROUPE_KM,GENRE,\n",
|
||||||
"# ANNEE_CONSTRUCTION,VALEUR_DU_BIEN,DEUXIEME_CONDUCTEUR)\n",
|
"# ANNEE_CONSTRUCTION,VALEUR_DU_BIEN,DEUXIEME_CONDUCTEUR)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(data_retraitee[\"GROUPE_KM\"].mode()[0])\n",
|
"data_retraitee[\"GROUPE_KM\"] = data_retraitee[\"GROUPE_KM\"].fillna(\n",
|
||||||
|
" data_retraitee[\"GROUPE_KM\"].mode()[0]\n",
|
||||||
|
")\n",
|
||||||
"data_retraitee[\"GENRE\"] = data_retraitee[\"GENRE\"].fillna(\"M\")\n",
|
"data_retraitee[\"GENRE\"] = data_retraitee[\"GENRE\"].fillna(\"M\")\n",
|
||||||
"data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(data_retraitee[\"ANNEE_CONSTRUCTION\"].median())\n",
|
"data_retraitee[\"ANNEE_CONSTRUCTION\"] = data_retraitee[\"ANNEE_CONSTRUCTION\"].fillna(\n",
|
||||||
"data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0])\n",
|
" data_retraitee[\"ANNEE_CONSTRUCTION\"].median()\n",
|
||||||
"data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(False)"
|
")\n",
|
||||||
|
"data_retraitee[\"VALEUR_DU_BIEN\"] = data_retraitee[\"VALEUR_DU_BIEN\"].fillna(\n",
|
||||||
|
" data_retraitee[\"VALEUR_DU_BIEN\"].mode()[0]\n",
|
||||||
|
")\n",
|
||||||
|
"data_retraitee[\"DEUXIEME_CONDUCTEUR\"] = data_retraitee[\"DEUXIEME_CONDUCTEUR\"].fillna(\n",
|
||||||
|
" False\n",
|
||||||
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -82310,11 +82350,13 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Option 3 : Remplacer par une valeur prudente (ZONE_RISQUE,DEUXIEME_CONDUCTEUR)\n",
|
"# Option 3 : Remplacer par une valeur prudente (ZONE_RISQUE,DEUXIEME_CONDUCTEUR)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Remplacer par la zone avec le plus de sinistres\n",
|
"# Remplacer par la zone avec le plus de sinistres\n",
|
||||||
"zone_plus_sinsitree = data_retraitee[[\"ZONE_RISQUE\", \"NB\"]].groupby([\"ZONE_RISQUE\"]).sum()\n",
|
"zone_plus_sinsitree = (\n",
|
||||||
"zone_plus_sinsitree.sort_values(\"NB\",ascending = False)\n",
|
" data_retraitee[[\"ZONE_RISQUE\", \"NB\"]].groupby([\"ZONE_RISQUE\"]).sum()\n",
|
||||||
|
")\n",
|
||||||
|
"zone_plus_sinsitree.sort_values(\"NB\", ascending=False)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"data_retraitee[\"ZONE_RISQUE\"] = data_retraitee[\"ZONE_RISQUE\"].fillna(\"C\")"
|
"data_retraitee[\"ZONE_RISQUE\"] = data_retraitee[\"ZONE_RISQUE\"].fillna(\"C\")"
|
||||||
]
|
]
|
||||||
@@ -82781,7 +82823,7 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"data_retraitee.to_csv(\"./2_outputs/base_retraitee.csv\", index = False)"
|
"data_retraitee.to_csv(\"./2_outputs/base_retraitee.csv\", index=False)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -82809,11 +82851,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Calcul de la fréquence\n",
|
"# Calcul de la fréquence\n",
|
||||||
"data_retraitee[\"FREQ\"] = data_retraitee[\"NB\"] / data_retraitee[\"EXPO\"]\n",
|
"data_retraitee[\"FREQ\"] = data_retraitee[\"NB\"] / data_retraitee[\"EXPO\"]\n",
|
||||||
"data_retraitee[\"FREQ\"] = data_retraitee[\"FREQ\"].fillna(0)\n",
|
"data_retraitee[\"FREQ\"] = data_retraitee[\"FREQ\"].fillna(0)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul du coût moyen\n",
|
"# Calcul du coût moyen\n",
|
||||||
"data_retraitee[\"CM\"] = data_retraitee[\"CHARGE\"] / data_retraitee[\"NB\"]\n",
|
"data_retraitee[\"CM\"] = data_retraitee[\"CHARGE\"] / data_retraitee[\"NB\"]\n",
|
||||||
"data_retraitee[\"CM\"] = data_retraitee[\"CM\"].fillna(0)"
|
"data_retraitee[\"CM\"] = data_retraitee[\"CM\"].fillna(0)"
|
||||||
]
|
]
|
||||||
@@ -82846,11 +82888,11 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"AGE_ASSURE_PRINCIPAL\", \"NB\",\"EXPO\"]]\n",
|
"plot_data = data_retraitee[[\"AGE_ASSURE_PRINCIPAL\", \"NB\", \"EXPO\"]]\n",
|
||||||
"plot_data = plot_data.groupby([\"AGE_ASSURE_PRINCIPAL\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"AGE_ASSURE_PRINCIPAL\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul de la fréquence\n",
|
"# Calcul de la fréquence\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)"
|
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)"
|
||||||
]
|
]
|
||||||
@@ -83706,8 +83748,10 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.line(plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\")\n",
|
"fig = px.line(\n",
|
||||||
|
" plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"FREQ\", title=\"Sinistralité selon l'âge\"\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -84567,16 +84611,16 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"GENRE\", \"NB\",\"EXPO\"]]\n",
|
"plot_data = data_retraitee[[\"GENRE\", \"NB\", \"EXPO\"]]\n",
|
||||||
"plot_data = plot_data.groupby([\"GENRE\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"GENRE\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul de la fréquence\n",
|
"# Calcul de la fréquence\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"GENRE\", y=\"FREQ\", title=\"Sinistralité selon le genre\")\n",
|
"fig = px.scatter(plot_data, x=\"GENRE\", y=\"FREQ\", title=\"Sinistralité selon le genre\")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
@@ -84608,17 +84652,22 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"ZONE_RISQUE\", \"NB\",\"EXPO\"]]\n",
|
"plot_data = data_retraitee[[\"ZONE_RISQUE\", \"NB\", \"EXPO\"]]\n",
|
||||||
"plot_data = plot_data.groupby([\"ZONE_RISQUE\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"ZONE_RISQUE\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul de la fréquence\n",
|
"# Calcul de la fréquence\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"ZONE_RISQUE\", y=\"FREQ\", title=\"Sinistralité selon la zone géographique\")\n",
|
"fig = px.scatter(\n",
|
||||||
|
" plot_data,\n",
|
||||||
|
" x=\"ZONE_RISQUE\",\n",
|
||||||
|
" y=\"FREQ\",\n",
|
||||||
|
" title=\"Sinistralité selon la zone géographique\",\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -85479,17 +85528,19 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"ENERGIE\", \"NB\",\"EXPO\"]]\n",
|
"plot_data = data_retraitee[[\"ENERGIE\", \"NB\", \"EXPO\"]]\n",
|
||||||
"plot_data = plot_data.groupby([\"ENERGIE\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"ENERGIE\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul de la fréquence\n",
|
"# Calcul de la fréquence\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
"plot_data[\"FREQ\"] = plot_data[\"NB\"] / plot_data[\"EXPO\"]\n",
|
||||||
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
"plot_data[\"FREQ\"] = plot_data[\"FREQ\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\")\n",
|
"fig = px.scatter(\n",
|
||||||
|
" plot_data, x=\"ENERGIE\", y=\"FREQ\", title=\"Sinistralité selon le carburant\"\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -86353,18 +86404,20 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"VALEUR_DU_BIEN\", \"CHARGE\",\"NB\"]]\n",
|
"plot_data = data_retraitee[[\"VALEUR_DU_BIEN\", \"CHARGE\", \"NB\"]]\n",
|
||||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||||
"plot_data = plot_data.groupby([\"VALEUR_DU_BIEN\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"VALEUR_DU_BIEN\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul du CM\n",
|
"# Calcul du CM\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\")\n",
|
"fig = px.scatter(\n",
|
||||||
|
" plot_data, x=\"VALEUR_DU_BIEN\", y=\"CM\", title=\"Coût moyen selon le prix\"\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -87227,18 +87280,23 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"ANNEE_CONSTRUCTION\", \"CHARGE\",\"NB\"]]\n",
|
"plot_data = data_retraitee[[\"ANNEE_CONSTRUCTION\", \"CHARGE\", \"NB\"]]\n",
|
||||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||||
"plot_data = plot_data.groupby([\"ANNEE_CONSTRUCTION\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"ANNEE_CONSTRUCTION\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul du CM\n",
|
"# Calcul du CM\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.line(plot_data, x=\"ANNEE_CONSTRUCTION\", y=\"CM\", title=\"Coût moyen selon l'ancienneté du bien\")\n",
|
"fig = px.line(\n",
|
||||||
|
" plot_data,\n",
|
||||||
|
" x=\"ANNEE_CONSTRUCTION\",\n",
|
||||||
|
" y=\"CM\",\n",
|
||||||
|
" title=\"Coût moyen selon l'ancienneté du bien\",\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -88098,18 +88156,23 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"AGE_ASSURE_PRINCIPAL\", \"CHARGE\",\"NB\"]]\n",
|
"plot_data = data_retraitee[[\"AGE_ASSURE_PRINCIPAL\", \"CHARGE\", \"NB\"]]\n",
|
||||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||||
"plot_data = plot_data.groupby([\"AGE_ASSURE_PRINCIPAL\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"AGE_ASSURE_PRINCIPAL\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul du CM\n",
|
"# Calcul du CM\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"AGE_ASSURE_PRINCIPAL\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\")\n",
|
"fig = px.scatter(\n",
|
||||||
|
" plot_data,\n",
|
||||||
|
" x=\"AGE_ASSURE_PRINCIPAL\",\n",
|
||||||
|
" y=\"CM\",\n",
|
||||||
|
" title=\"Coût moyen selon l'âge de l'assuré\",\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -88969,18 +89032,20 @@
|
|||||||
}
|
}
|
||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"#Agrégation selon la variable d'intérêt\n",
|
"# Agrégation selon la variable d'intérêt\n",
|
||||||
"plot_data = data_retraitee[[\"GENRE\", \"CHARGE\",\"NB\"]]\n",
|
"plot_data = data_retraitee[[\"GENRE\", \"CHARGE\", \"NB\"]]\n",
|
||||||
"plot_data= plot_data[plot_data['CHARGE'] > 0]\n",
|
"plot_data = plot_data[plot_data[\"CHARGE\"] > 0]\n",
|
||||||
"plot_data = plot_data.groupby([\"GENRE\"], as_index=False).sum()\n",
|
"plot_data = plot_data.groupby([\"GENRE\"], as_index=False).sum()\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Calcul du CM\n",
|
"# Calcul du CM\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
"plot_data[\"CM\"] = plot_data[\"CHARGE\"] / plot_data[\"NB\"]\n",
|
||||||
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
"plot_data[\"CM\"] = plot_data[\"CM\"].fillna(0)\n",
|
||||||
"print(plot_data)\n",
|
"print(plot_data)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"#Représentation graphique\n",
|
"# Représentation graphique\n",
|
||||||
"fig = px.scatter(plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\")\n",
|
"fig = px.scatter(\n",
|
||||||
|
" plot_data, x=\"GENRE\", y=\"CM\", title=\"Coût moyen selon l'âge de l'assuré\"\n",
|
||||||
|
")\n",
|
||||||
"fig.show()"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -69,11 +69,11 @@
|
|||||||
"from sklearn import metrics\n",
|
"from sklearn import metrics\n",
|
||||||
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
"from sklearn.ensemble import GradientBoostingClassifier\n",
|
||||||
"from sklearn.model_selection import (\n",
|
"from sklearn.model_selection import (\n",
|
||||||
" GridSearchCV,\n",
|
" GridSearchCV,\n",
|
||||||
" StratifiedKFold,\n",
|
" StratifiedKFold,\n",
|
||||||
" cross_val_score,\n",
|
" cross_val_score,\n",
|
||||||
" train_test_split,\n",
|
" train_test_split,\n",
|
||||||
")\n"
|
")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -91,12 +91,18 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def cramers_V(var1,var2) :\n",
|
"def cramers_V(var1, var2):\n",
|
||||||
" crosstab = np.array(pd.crosstab(var1,var2, rownames=None, colnames=None)) # Cross table building\n",
|
" crosstab = np.array(\n",
|
||||||
" stat = chi2_contingency(crosstab)[0] # Keeping of the test statistic of the Chi2 test\n",
|
" pd.crosstab(var1, var2, rownames=None, colnames=None)\n",
|
||||||
" obs = np.sum(crosstab) # Number of observations\n",
|
" ) # Cross table building\n",
|
||||||
" mini = min(crosstab.shape)-1 # Take the minimum value between the colmns and the rows of the cross table\n",
|
" stat = chi2_contingency(crosstab)[\n",
|
||||||
" return (stat/(obs*mini))"
|
" 0\n",
|
||||||
|
" ] # Keeping of the test statistic of the Chi2 test\n",
|
||||||
|
" obs = np.sum(crosstab) # Number of observations\n",
|
||||||
|
" mini = (\n",
|
||||||
|
" min(crosstab.shape) - 1\n",
|
||||||
|
" ) # Take the minimum value between the colmns and the rows of the cross table\n",
|
||||||
|
" return stat / (obs * mini)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -133,7 +139,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"path = input_path + '/base_retraitee.csv'\n",
|
"path = input_path + \"/base_retraitee.csv\"\n",
|
||||||
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")"
|
"data_retraitee = pd.read_csv(path, sep=\",\", decimal=\".\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
@@ -16225,7 +16231,7 @@
|
|||||||
" if len(data_model[col].unique()) == 2:\n",
|
" if len(data_model[col].unique()) == 2:\n",
|
||||||
" variables_categorielles.append(data_model[col])\n",
|
" variables_categorielles.append(data_model[col])\n",
|
||||||
" else:\n",
|
" else:\n",
|
||||||
" variables_categorielles.append(data_model[col])\n"
|
" variables_categorielles.append(data_model[col])"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -16653,7 +16659,7 @@
|
|||||||
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
|
" if v_cramer_resultats.iloc[i, j] > 0.7:\n",
|
||||||
" print(\n",
|
" print(\n",
|
||||||
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
|
" f\"{v_cramer_resultats.index.to_numpy()[i]} et {v_cramer_resultats.colmns[j]} sont trop dépendantes, V-CRAMER = {v_cramer_resultats.iloc[i, j]}\"\n",
|
||||||
" )\n"
|
" )"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -16851,7 +16857,7 @@
|
|||||||
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
|
" if abs(correlations_num.iloc[i, j]) > 0.7:\n",
|
||||||
" print(\n",
|
" print(\n",
|
||||||
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
|
" f\"{correlations_num.index.to_numpy()[i]} et {correlations_num.columns[j]} sont trop dépendantes, corr = {correlations_num.iloc[i, j]}\"\n",
|
||||||
" )\n"
|
" )"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -17820,7 +17826,7 @@
|
|||||||
" cv=StratifiedKFold(\n",
|
" cv=StratifiedKFold(\n",
|
||||||
" n_splits=num_folds, shuffle=True, random_state=42\n",
|
" n_splits=num_folds, shuffle=True, random_state=42\n",
|
||||||
" ), # Validation croisée avec 5 folds\n",
|
" ), # Validation croisée avec 5 folds\n",
|
||||||
" scoring='recall', # Métrique d'évaluation (moins c'est mieux)\n",
|
" scoring=\"recall\", # Métrique d'évaluation (moins c'est mieux)\n",
|
||||||
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
|
" n_jobs=-1, # Utiliser tous les cœurs du processeur\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
@@ -17877,14 +17883,18 @@
|
|||||||
],
|
],
|
||||||
"source": [
|
"source": [
|
||||||
"# Recall de chaque fold\n",
|
"# Recall de chaque fold\n",
|
||||||
"recall_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='recall')\n",
|
"recall_scores = cross_val_score(\n",
|
||||||
|
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"recall\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Afficher les scores pour chaque fold\n",
|
"# Afficher les scores pour chaque fold\n",
|
||||||
"for i, score in enumerate(recall_scores):\n",
|
"for i, score in enumerate(recall_scores):\n",
|
||||||
" print(f\"Recall pour le fold {i + 1}: {score}\")\n",
|
" print(f\"Recall pour le fold {i + 1}: {score}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Accuracy de chaque fold\n",
|
"# Accuracy de chaque fold\n",
|
||||||
"accuracy_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='accuracy')\n",
|
"accuracy_scores = cross_val_score(\n",
|
||||||
|
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"accuracy\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Afficher les scores pour chaque fold\n",
|
"# Afficher les scores pour chaque fold\n",
|
||||||
"print(\"\\n\")\n",
|
"print(\"\\n\")\n",
|
||||||
@@ -17892,12 +17902,14 @@
|
|||||||
" print(f\"Accuracy pour le fold {i + 1}: {score}\")\n",
|
" print(f\"Accuracy pour le fold {i + 1}: {score}\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Precision de chaque fold\n",
|
"# Precision de chaque fold\n",
|
||||||
"precision_scores = cross_val_score(best_gbc, X_train, y_train, cv=num_folds, scoring='precision')\n",
|
"precision_scores = cross_val_score(\n",
|
||||||
|
" best_gbc, X_train, y_train, cv=num_folds, scoring=\"precision\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Afficher les scores pour chaque fold\n",
|
"# Afficher les scores pour chaque fold\n",
|
||||||
"print(\"\\n\")\n",
|
"print(\"\\n\")\n",
|
||||||
"for i, score in enumerate(precision_scores):\n",
|
"for i, score in enumerate(precision_scores):\n",
|
||||||
" print(f\"Precision pour le fold {i + 1}: {score}\")\n"
|
" print(f\"Precision pour le fold {i + 1}: {score}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -30178,7 +30190,7 @@
|
|||||||
"# Observation de la distribution sur Y_train\n",
|
"# Observation de la distribution sur Y_train\n",
|
||||||
"df = pd.DataFrame(y_train, columns=[\"SINISTRE\"])\n",
|
"df = pd.DataFrame(y_train, columns=[\"SINISTRE\"])\n",
|
||||||
"fig = px.histogram(df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train\")\n",
|
"fig = px.histogram(df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train\")\n",
|
||||||
"fig.show()\n"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -52502,7 +52514,7 @@
|
|||||||
"fig = px.histogram(\n",
|
"fig = px.histogram(\n",
|
||||||
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
|
" df, x=\"SINISTRE\", title=\"Distribution de la variable Y_train_resampled\"\n",
|
||||||
")\n",
|
")\n",
|
||||||
"fig.show()\n"
|
"fig.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -52530,7 +52542,7 @@
|
|||||||
"num_folds = 5\n",
|
"num_folds = 5\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Initialisation du modèle GradientBoostingClassifier\n",
|
"# Initialisation du modèle GradientBoostingClassifier\n",
|
||||||
"gb = GradientBoostingClassifier(random_state=42)\n"
|
"gb = GradientBoostingClassifier(random_state=42)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -52567,7 +52579,7 @@
|
|||||||
"print(\"Meilleurs hyperparamètres : \", best_params)\n",
|
"print(\"Meilleurs hyperparamètres : \", best_params)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Initialiser un modèle avec les meilleurs hyperparamètres\n",
|
"# Initialiser un modèle avec les meilleurs hyperparamètres\n",
|
||||||
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)\n"
|
"best_gbc = GradientBoostingClassifier(random_state=42, **best_params)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -128,7 +128,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"# Configuration des graphiques\n",
|
"# Configuration des graphiques\n",
|
||||||
"plt.style.use(\"seaborn-v0_8-darkgrid\")\n",
|
"plt.style.use(\"seaborn-v0_8-darkgrid\")\n",
|
||||||
"sns.set_palette(\"husl\")\n"
|
"sns.set_palette(\"husl\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -162,20 +162,38 @@
|
|||||||
"## Chargement du dataset Adult Income (Census)\n",
|
"## Chargement du dataset Adult Income (Census)\n",
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
|
"print(\"=== Chargement du dataset Adult Income ===\\n\")\n",
|
||||||
"print(\"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\")\n",
|
"print(\n",
|
||||||
"print(\"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\")\n",
|
" \"Dataset classique de Kaggle/UCI qui illustre parfaitement les forces de CatBoost\"\n",
|
||||||
|
")\n",
|
||||||
|
"print(\n",
|
||||||
|
" \"Objectif : Prédire si le revenu annuel > 50K$ basé sur des caractéristiques socio-démographiques\\n\"\n",
|
||||||
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"# Chargement depuis UCI\n",
|
"# Chargement depuis UCI\n",
|
||||||
"url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
|
"url = \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\"\n",
|
||||||
"\n",
|
"\n",
|
||||||
"column_names = [\n",
|
"column_names = [\n",
|
||||||
" 'age', 'workclass', 'fnlwgt', 'education', 'education_num',\n",
|
" \"age\",\n",
|
||||||
" 'marital_status', 'occupation', 'relationship', 'race', 'sex',\n",
|
" \"workclass\",\n",
|
||||||
" 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income'\n",
|
" \"fnlwgt\",\n",
|
||||||
|
" \"education\",\n",
|
||||||
|
" \"education_num\",\n",
|
||||||
|
" \"marital_status\",\n",
|
||||||
|
" \"occupation\",\n",
|
||||||
|
" \"relationship\",\n",
|
||||||
|
" \"race\",\n",
|
||||||
|
" \"sex\",\n",
|
||||||
|
" \"capital_gain\",\n",
|
||||||
|
" \"capital_loss\",\n",
|
||||||
|
" \"hours_per_week\",\n",
|
||||||
|
" \"native_country\",\n",
|
||||||
|
" \"income\",\n",
|
||||||
"]\n",
|
"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"try:\n",
|
"try:\n",
|
||||||
" df = pd.read_csv(url, names=column_names, sep=r',\\s*', engine='python', na_values='?')\n",
|
" df = pd.read_csv(\n",
|
||||||
|
" url, names=column_names, sep=r\",\\s*\", engine=\"python\", na_values=\"?\"\n",
|
||||||
|
" )\n",
|
||||||
" print(\"Dataset chargé depuis UCI repository\")\n",
|
" print(\"Dataset chargé depuis UCI repository\")\n",
|
||||||
"except: # noqa: E722\n",
|
"except: # noqa: E722\n",
|
||||||
" print(\"Impossible de charger depuis UCI, création d'un dataset simulé similaire...\")\n",
|
" print(\"Impossible de charger depuis UCI, création d'un dataset simulé similaire...\")\n",
|
||||||
@@ -183,44 +201,121 @@
|
|||||||
" np.random.seed(42)\n",
|
" np.random.seed(42)\n",
|
||||||
" n_samples = 32561\n",
|
" n_samples = 32561\n",
|
||||||
"\n",
|
"\n",
|
||||||
" df = pd.DataFrame({\n",
|
" df = pd.DataFrame(\n",
|
||||||
" 'age': np.random.randint(17, 90, n_samples),\n",
|
" {\n",
|
||||||
" 'workclass': np.random.choice(['Private', 'Self-emp-not-inc', 'Local-gov', 'State-gov', 'Self-emp-inc',\n",
|
" \"age\": np.random.randint(17, 90, n_samples),\n",
|
||||||
" 'Federal-gov', 'Without-pay'], n_samples, p=[0.73, 0.08, 0.06, 0.04, 0.03, 0.03, 0.03]),\n",
|
" \"workclass\": np.random.choice(\n",
|
||||||
" 'fnlwgt': np.random.randint(12285, 1484705, n_samples),\n",
|
" [\n",
|
||||||
" 'education': np.random.choice(\n",
|
" \"Private\",\n",
|
||||||
" ['HS-grad', 'Some-college', 'Bachelors', 'Masters', 'Assoc-voc',\n",
|
" \"Self-emp-not-inc\",\n",
|
||||||
" 'Doctorate', '11th', '9th', '7th-8th'], n_samples,\n",
|
" \"Local-gov\",\n",
|
||||||
" p=[0.32, 0.22, 0.16, 0.05, 0.04, 0.01, 0.04, 0.03, 0.13]),\n",
|
" \"State-gov\",\n",
|
||||||
" 'education_num': np.random.randint(1, 16, n_samples),\n",
|
" \"Self-emp-inc\",\n",
|
||||||
" 'marital_status': np.random.choice(['Married-civ-spouse', 'Never-married', 'Divorced', 'Separated',\n",
|
" \"Federal-gov\",\n",
|
||||||
" 'Widowed'], n_samples, p=[0.46, 0.33, 0.14, 0.03, 0.04]),\n",
|
" \"Without-pay\",\n",
|
||||||
" 'occupation': np.random.choice(['Prof-specialty', 'Craft-repair', 'Exec-managerial', 'Adm-clerical',\n",
|
" ],\n",
|
||||||
" 'Sales', 'Other-service', 'Machine-op-inspct', 'Tech-support'], n_samples,\n",
|
" n_samples,\n",
|
||||||
" p=[0.13, 0.13, 0.13, 0.12, 0.11, 0.10, 0.06, 0.22]),\n",
|
" p=[0.73, 0.08, 0.06, 0.04, 0.03, 0.03, 0.03],\n",
|
||||||
" 'relationship': np.random.choice(['Husband', 'Not-in-family', 'Own-child', 'Unmarried', 'Wife', 'Other-relative'],\n",
|
" ),\n",
|
||||||
" n_samples, p=[0.40, 0.26, 0.16, 0.10, 0.05, 0.03]),\n",
|
" \"fnlwgt\": np.random.randint(12285, 1484705, n_samples),\n",
|
||||||
" 'race': np.random.choice(['White', 'Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'],\n",
|
" \"education\": np.random.choice(\n",
|
||||||
" n_samples, p=[0.85, 0.10, 0.03, 0.01, 0.01]),\n",
|
" [\n",
|
||||||
" 'sex': np.random.choice(['Male', 'Female'], n_samples, p=[0.67, 0.33]),\n",
|
" \"HS-grad\",\n",
|
||||||
" 'capital_gain': np.where(np.random.random(n_samples) < 0.92, 0, np.random.randint(1, 99999, n_samples)),\n",
|
" \"Some-college\",\n",
|
||||||
" 'capital_loss': np.where(np.random.random(n_samples) < 0.95, 0, np.random.randint(1, 4356, n_samples)),\n",
|
" \"Bachelors\",\n",
|
||||||
" 'hours_per_week': np.random.randint(1, 99, n_samples),\n",
|
" \"Masters\",\n",
|
||||||
" 'native_country': np.random.choice(['United-States', 'Mexico', 'Philippines', 'Germany', 'Canada',\n",
|
" \"Assoc-voc\",\n",
|
||||||
" 'India', 'Other'], n_samples, p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04])\n",
|
" \"Doctorate\",\n",
|
||||||
" })\n",
|
" \"11th\",\n",
|
||||||
|
" \"9th\",\n",
|
||||||
|
" \"7th-8th\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.32, 0.22, 0.16, 0.05, 0.04, 0.01, 0.04, 0.03, 0.13],\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"education_num\": np.random.randint(1, 16, n_samples),\n",
|
||||||
|
" \"marital_status\": np.random.choice(\n",
|
||||||
|
" [\n",
|
||||||
|
" \"Married-civ-spouse\",\n",
|
||||||
|
" \"Never-married\",\n",
|
||||||
|
" \"Divorced\",\n",
|
||||||
|
" \"Separated\",\n",
|
||||||
|
" \"Widowed\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.46, 0.33, 0.14, 0.03, 0.04],\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"occupation\": np.random.choice(\n",
|
||||||
|
" [\n",
|
||||||
|
" \"Prof-specialty\",\n",
|
||||||
|
" \"Craft-repair\",\n",
|
||||||
|
" \"Exec-managerial\",\n",
|
||||||
|
" \"Adm-clerical\",\n",
|
||||||
|
" \"Sales\",\n",
|
||||||
|
" \"Other-service\",\n",
|
||||||
|
" \"Machine-op-inspct\",\n",
|
||||||
|
" \"Tech-support\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.13, 0.13, 0.13, 0.12, 0.11, 0.10, 0.06, 0.22],\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"relationship\": np.random.choice(\n",
|
||||||
|
" [\n",
|
||||||
|
" \"Husband\",\n",
|
||||||
|
" \"Not-in-family\",\n",
|
||||||
|
" \"Own-child\",\n",
|
||||||
|
" \"Unmarried\",\n",
|
||||||
|
" \"Wife\",\n",
|
||||||
|
" \"Other-relative\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.40, 0.26, 0.16, 0.10, 0.05, 0.03],\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"race\": np.random.choice(\n",
|
||||||
|
" [\"White\", \"Black\", \"Asian-Pac-Islander\", \"Amer-Indian-Eskimo\", \"Other\"],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.85, 0.10, 0.03, 0.01, 0.01],\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"sex\": np.random.choice([\"Male\", \"Female\"], n_samples, p=[0.67, 0.33]),\n",
|
||||||
|
" \"capital_gain\": np.where(\n",
|
||||||
|
" np.random.random(n_samples) < 0.92,\n",
|
||||||
|
" 0,\n",
|
||||||
|
" np.random.randint(1, 99999, n_samples),\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"capital_loss\": np.where(\n",
|
||||||
|
" np.random.random(n_samples) < 0.95,\n",
|
||||||
|
" 0,\n",
|
||||||
|
" np.random.randint(1, 4356, n_samples),\n",
|
||||||
|
" ),\n",
|
||||||
|
" \"hours_per_week\": np.random.randint(1, 99, n_samples),\n",
|
||||||
|
" \"native_country\": np.random.choice(\n",
|
||||||
|
" [\n",
|
||||||
|
" \"United-States\",\n",
|
||||||
|
" \"Mexico\",\n",
|
||||||
|
" \"Philippines\",\n",
|
||||||
|
" \"Germany\",\n",
|
||||||
|
" \"Canada\",\n",
|
||||||
|
" \"India\",\n",
|
||||||
|
" \"Other\",\n",
|
||||||
|
" ],\n",
|
||||||
|
" n_samples,\n",
|
||||||
|
" p=[0.90, 0.02, 0.01, 0.01, 0.01, 0.01, 0.04],\n",
|
||||||
|
" ),\n",
|
||||||
|
" }\n",
|
||||||
|
" )\n",
|
||||||
"\n",
|
"\n",
|
||||||
" # Création de la cible avec logique réaliste\n",
|
" # Création de la cible avec logique réaliste\n",
|
||||||
" income_score = (\n",
|
" income_score = (\n",
|
||||||
" (df['age'] > 35).astype(int) * 20 +\n",
|
" (df[\"age\"] > 35).astype(int) * 20\n",
|
||||||
" (df['education_num'] > 12).astype(int) * 30 +\n",
|
" + (df[\"education_num\"] > 12).astype(int) * 30\n",
|
||||||
" (df['hours_per_week'] > 40).astype(int) * 15 +\n",
|
" + (df[\"hours_per_week\"] > 40).astype(int) * 15\n",
|
||||||
" (df['capital_gain'] > 0).astype(int) * 25 +\n",
|
" + (df[\"capital_gain\"] > 0).astype(int) * 25\n",
|
||||||
" (df['marital_status'] == 'Married-civ-spouse').astype(int) * 20 +\n",
|
" + (df[\"marital_status\"] == \"Married-civ-spouse\").astype(int) * 20\n",
|
||||||
" (df['occupation'].isin(['Exec-managerial', 'Prof-specialty'])).astype(int) * 15 +\n",
|
" + (df[\"occupation\"].isin([\"Exec-managerial\", \"Prof-specialty\"])).astype(int)\n",
|
||||||
" np.random.normal(0, 15, n_samples)\n",
|
" * 15\n",
|
||||||
|
" + np.random.normal(0, 15, n_samples)\n",
|
||||||
" )\n",
|
" )\n",
|
||||||
" df['income'] = (income_score > 60).map({True: '>50K', False: '<=50K'})"
|
" df[\"income\"] = (income_score > 60).map({True: \">50K\", False: \"<=50K\"})"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -358,7 +453,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# Encodage de la cible en 0/1\n",
|
"# Encodage de la cible en 0/1\n",
|
||||||
"df['income'] = (df['income'] == '>50K').astype(int)"
|
"df[\"income\"] = (df[\"income\"] == \">50K\").astype(int)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -468,7 +563,7 @@
|
|||||||
" \"race\",\n",
|
" \"race\",\n",
|
||||||
" \"sex\",\n",
|
" \"sex\",\n",
|
||||||
" \"native_country\",\n",
|
" \"native_country\",\n",
|
||||||
"]\n"
|
"]"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -497,7 +592,7 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"print(\"\\n Cardinalité des variables catégorielles :\")\n",
|
"print(\"\\n Cardinalité des variables catégorielles :\")\n",
|
||||||
"for col in cat_features:\n",
|
"for col in cat_features:\n",
|
||||||
" print(f\" {col}: {df[col].nunique()} catégories uniques\")\n"
|
" print(f\" {col}: {df[col].nunique()} catégories uniques\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -526,7 +621,7 @@
|
|||||||
"# Corrélation avec la cible\n",
|
"# Corrélation avec la cible\n",
|
||||||
"print(\"\\n Corrélations avec le revenu >50K :\")\n",
|
"print(\"\\n Corrélations avec le revenu >50K :\")\n",
|
||||||
"correlations = df[numeric_features].corrwith(df[\"income\"]).sort_values(ascending=False)\n",
|
"correlations = df[numeric_features].corrwith(df[\"income\"]).sort_values(ascending=False)\n",
|
||||||
"print(correlations)\n"
|
"print(correlations)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -583,7 +678,7 @@
|
|||||||
"axes[1, 2].set_xlabel(\"Revenu >50K\")\n",
|
"axes[1, 2].set_xlabel(\"Revenu >50K\")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"plt.tight_layout()\n",
|
"plt.tight_layout()\n",
|
||||||
"plt.show()\n"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -647,7 +742,7 @@
|
|||||||
"\n",
|
"\n",
|
||||||
"print(\"\\n=== Préparation pour CatBoost ===\\n\")\n",
|
"print(\"\\n=== Préparation pour CatBoost ===\\n\")\n",
|
||||||
"print(f\"Variables catégorielles : {cat_features}\")\n",
|
"print(f\"Variables catégorielles : {cat_features}\")\n",
|
||||||
"print(f\"Variables numériques : {numeric_features}\")\n"
|
"print(f\"Variables numériques : {numeric_features}\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -861,11 +956,15 @@
|
|||||||
"# Courbe ROC\n",
|
"# Courbe ROC\n",
|
||||||
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_baseline)\n",
|
"fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba_baseline)\n",
|
||||||
"plt.figure(figsize=(8, 6))\n",
|
"plt.figure(figsize=(8, 6))\n",
|
||||||
"plt.plot(fpr, tpr, label=f'CatBoost (AUC = {roc_auc_score(y_test, y_pred_proba_baseline):.3f})')\n",
|
"plt.plot(\n",
|
||||||
"plt.plot([0, 1], [0, 1], 'k--', label='Hasard')\n",
|
" fpr,\n",
|
||||||
"plt.xlabel('Taux de faux positifs')\n",
|
" tpr,\n",
|
||||||
"plt.ylabel('Taux de vrais positifs')\n",
|
" label=f\"CatBoost (AUC = {roc_auc_score(y_test, y_pred_proba_baseline):.3f})\",\n",
|
||||||
"plt.title('Courbe ROC')\n",
|
")\n",
|
||||||
|
"plt.plot([0, 1], [0, 1], \"k--\", label=\"Hasard\")\n",
|
||||||
|
"plt.xlabel(\"Taux de faux positifs\")\n",
|
||||||
|
"plt.ylabel(\"Taux de vrais positifs\")\n",
|
||||||
|
"plt.title(\"Courbe ROC\")\n",
|
||||||
"plt.legend()\n",
|
"plt.legend()\n",
|
||||||
"plt.grid(True)\n",
|
"plt.grid(True)\n",
|
||||||
"plt.show()"
|
"plt.show()"
|
||||||
@@ -1133,7 +1232,7 @@
|
|||||||
"df[\"montant_credit\"] = np.random.uniform(3000, 450000)\n",
|
"df[\"montant_credit\"] = np.random.uniform(3000, 450000)\n",
|
||||||
"df[\"defaut\"] = df[\"capital_gain\"] < df[\"capital_loss\"]\n",
|
"df[\"defaut\"] = df[\"capital_gain\"] < df[\"capital_loss\"]\n",
|
||||||
"\n",
|
"\n",
|
||||||
"df = df.drop([\"capital_gain\", \"capital_loss\"], axis=1)\n"
|
"df = df.drop([\"capital_gain\", \"capital_loss\"], axis=1)"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -1211,7 +1310,7 @@
|
|||||||
"perte_totale_predite = y_pred_reg.sum()\n",
|
"perte_totale_predite = y_pred_reg.sum()\n",
|
||||||
"print(f\"\\\\nPerte totale réelle : {perte_totale_reelle:,.2f}€\")\n",
|
"print(f\"\\\\nPerte totale réelle : {perte_totale_reelle:,.2f}€\")\n",
|
||||||
"print(f\"Perte totale prédite : {perte_totale_predite:,.2f}€\")\n",
|
"print(f\"Perte totale prédite : {perte_totale_predite:,.2f}€\")\n",
|
||||||
"print(f\"Erreur : {abs(perte_totale_reelle - perte_totale_predite):,.2f}€\")\n"
|
"print(f\"Erreur : {abs(perte_totale_reelle - perte_totale_predite):,.2f}€\")"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -1276,7 +1375,7 @@
|
|||||||
"plt.title(\"Top 15 variables les plus importantes\")\n",
|
"plt.title(\"Top 15 variables les plus importantes\")\n",
|
||||||
"plt.gca().invert_yaxis()\n",
|
"plt.gca().invert_yaxis()\n",
|
||||||
"plt.tight_layout()\n",
|
"plt.tight_layout()\n",
|
||||||
"plt.show()\n"
|
"plt.show()"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -1320,8 +1419,8 @@
|
|||||||
" depth=6,\n",
|
" depth=6,\n",
|
||||||
" random_seed=42,\n",
|
" random_seed=42,\n",
|
||||||
" verbose=0,\n",
|
" verbose=0,\n",
|
||||||
" auto_class_weights='Balanced',\n",
|
" auto_class_weights=\"Balanced\",\n",
|
||||||
" eval_metric='AUC'\n",
|
" eval_metric=\"AUC\",\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"model_weighted.fit(train_pool, eval_set=test_pool)"
|
"model_weighted.fit(train_pool, eval_set=test_pool)"
|
||||||
@@ -1405,7 +1504,7 @@
|
|||||||
" random_seed=42,\n",
|
" random_seed=42,\n",
|
||||||
" verbose=0,\n",
|
" verbose=0,\n",
|
||||||
" scale_pos_weight=scale,\n",
|
" scale_pos_weight=scale,\n",
|
||||||
" eval_metric='AUC'\n",
|
" eval_metric=\"AUC\",\n",
|
||||||
")\n",
|
")\n",
|
||||||
"\n",
|
"\n",
|
||||||
"model_scaled.fit(train_pool, eval_set=test_pool)"
|
"model_scaled.fit(train_pool, eval_set=test_pool)"
|
||||||
@@ -1427,10 +1526,7 @@
|
|||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"# CatBoost a un support natif pour SHAP\n",
|
"# CatBoost a un support natif pour SHAP\n",
|
||||||
"shap_values = model_baseline.get_feature_importance(\n",
|
"shap_values = model_baseline.get_feature_importance(train_pool, type=\"ShapValues\")"
|
||||||
" train_pool,\n",
|
|
||||||
" type='ShapValues'\n",
|
|
||||||
")"
|
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -1483,13 +1579,12 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Importance SHAP moyenne\n",
|
"# Importance SHAP moyenne\n",
|
||||||
"shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
|
"shap_importance = np.abs(shap_values[:, :-1]).mean(axis=0)\n",
|
||||||
"shap_df = pd.DataFrame({\n",
|
"shap_df = pd.DataFrame(\n",
|
||||||
" 'feature': X_train.columns,\n",
|
" {\"feature\": X_train.columns, \"shap_importance\": shap_importance}\n",
|
||||||
" 'shap_importance': shap_importance\n",
|
").sort_values(\"shap_importance\", ascending=False)\n",
|
||||||
"}).sort_values('shap_importance', ascending=False)\n",
|
|
||||||
"\n",
|
"\n",
|
||||||
"print(\"\\nImportance SHAP moyenne :\")\n",
|
"print(\"\\nImportance SHAP moyenne :\")\n",
|
||||||
"print(shap_df.head(10))\n"
|
"print(shap_df.head(10))"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -1512,9 +1607,9 @@
|
|||||||
"source": [
|
"source": [
|
||||||
"# Visualisation\n",
|
"# Visualisation\n",
|
||||||
"plt.figure(figsize=(10, 8))\n",
|
"plt.figure(figsize=(10, 8))\n",
|
||||||
"plt.barh(shap_df['feature'][:15], shap_df['shap_importance'][:15])\n",
|
"plt.barh(shap_df[\"feature\"][:15], shap_df[\"shap_importance\"][:15])\n",
|
||||||
"plt.xlabel('|SHAP value| moyen')\n",
|
"plt.xlabel(\"|SHAP value| moyen\")\n",
|
||||||
"plt.title('Importance des features (SHAP)')\n",
|
"plt.title(\"Importance des features (SHAP)\")\n",
|
||||||
"plt.gca().invert_yaxis()\n",
|
"plt.gca().invert_yaxis()\n",
|
||||||
"plt.tight_layout()\n",
|
"plt.tight_layout()\n",
|
||||||
"plt.show()"
|
"plt.show()"
|
||||||
|
|||||||
Reference in New Issue
Block a user