diff --git a/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb b/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb index d298183..a3d72a0 100644 --- a/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb +++ b/M2/Machine Learning/TP_3/2025_TP_3_M2_ISF.ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 191, "id": "97d58527", "metadata": {}, "outputs": [], @@ -60,7 +60,6 @@ "\n", "sns.set()\n", "import plotly.express as px\n", - "import plotly.graph_objects as gp\n", "import sklearn.metrics as metrics\n", "import sklearn.preprocessing as preproc\n", "\n", @@ -68,10 +67,9 @@ "from scipy.stats import chi2_contingency\n", "\n", "# Machine Learning\n", - "from sklearn.cluster import KMeans\n", "from sklearn.ensemble import RandomForestRegressor\n", - "from sklearn.model_selection import KFold, train_test_split\n", - "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n" + "from sklearn.model_selection import KFold, cross_val_score, train_test_split\n", + "from sklearn.tree import DecisionTreeRegressor\n" ] }, { @@ -84,11 +82,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 192, "id": "c67db932", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def cramers_V(var1, var2):\n", + " crosstab = np.array(\n", + " pd.crosstab(var1, var2, rownames=None, colnames=None)\n", + " ) # Cross table building\n", + " stat = chi2_contingency(crosstab)[\n", + " 0\n", + " ] # Keeping of the test statistic of the Chi2 test\n", + " obs = np.sum(crosstab) # Number of observations\n", + " mini = (\n", + " min(crosstab.shape) - 1\n", + " ) # Take the minimum value between the columns and the rows of the cross table\n", + " return stat / (obs * mini)\n" + ] }, { "cell_type": "markdown", @@ -100,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 193, "id": "c9597b48", "metadata": {}, "outputs": [], @@ -119,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 194, "id": "8051b5f4", "metadata": {}, "outputs": [], @@ -163,31 +174,367 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 195, "id": "c427a4b8", "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/tp/_ld5_pzs6nx6mv1pbjhq1l740000gn/T/ipykernel_41302/358057511.py:7: SettingWithCopyWarning:\n", + "\n", + "\n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + "\n" + ] + }, { "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "ANNEE_CTR", + "rawType": "int64", + "type": "integer" + }, + { + "name": "CONTRAT_ANCIENNETE", + "rawType": "object", + "type": "string" + }, + { + "name": "FREQUENCE_PAIEMENT_COTISATION", + "rawType": "object", + "type": "string" + }, + { + "name": "GROUPE_KM", + "rawType": "object", + "type": "string" + }, + { + "name": "ZONE_RISQUE", + "rawType": "object", + "type": "string" + }, + { + "name": "AGE_ASSURE_PRINCIPAL", + "rawType": "int64", + "type": "integer" + }, + { + "name": "GENRE", + "rawType": "object", + "type": "string" + }, + { + "name": "DEUXIEME_CONDUCTEUR", + "rawType": "bool", + "type": "boolean" + }, + { + "name": "ANCIENNETE_PERMIS", + "rawType": "int64", + "type": "integer" + }, + { + "name": "ANNEE_CONSTRUCTION", + "rawType": "float64", + "type": "float" + }, + { + "name": "ENERGIE", + "rawType": "object", + "type": "string" + }, + { + "name": "EQUIPEMENT_SECURITE", + "rawType": "object", + "type": "string" + }, + { + "name": "VALEUR_DU_BIEN", + "rawType": "object", + "type": "string" + }, + { + "name": "CM", + "rawType": "float64", + "type": "float" + } + ], + "ref": "9e024176-1fe1-4a76-bb33-627401a1ea24", + "rows": [ + [ + "10", + "2019", + "(0,1]", + "MENSUEL", + "[0;20000[", + "C", + "40", + "M", + "False", + "37", + "2017.0", + "ESSENCE", + "VRAI", + "[15000;20000[", + "1072.98" + ], + [ + "34", + "2020", + "(-1,0]", + "MENSUEL", + "[20000;40000[", + "C", + "27", + "M", + "True", + "13", + "2018.0", + "AUTRE", + "FAUX", + "[35000;99999[", + "3750.0" + ], + [ + "36", + "2019", + "(-1,0]", + "MENSUEL", + "[20000;40000[", + "L", + "19", + "M", + "False", + "2", + "2017.0", + "ESSENCE", + "VRAI", + "[0;10000[", + "1838.49" + ], + [ + "78", + "2019", + "(-1,0]", + "MENSUEL", + "[20000;40000[", + "B", + "40", + "M", + "False", + "45", + "2018.0", + "DIESEL", + "FAUX", + "[15000;20000[", + "4892.74" + ], + [ + "89", + "2018", + "(1,2]", + "MENSUEL", + "[20000;40000[", + "C", + "20", + "M", + "False", + "11", + "2014.0", + "ESSENCE", + "FAUX", + "[25000;35000[", + "166.73" + ] + ], + "shape": { + "columns": 14, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ANNEE_CTRCONTRAT_ANCIENNETEFREQUENCE_PAIEMENT_COTISATIONGROUPE_KMZONE_RISQUEAGE_ASSURE_PRINCIPALGENREDEUXIEME_CONDUCTEURANCIENNETE_PERMISANNEE_CONSTRUCTIONENERGIEEQUIPEMENT_SECURITEVALEUR_DU_BIENCM
102019(0,1]MENSUEL[0;20000[C40MFalse372017.0ESSENCEVRAI[15000;20000[1072.98
342020(-1,0]MENSUEL[20000;40000[C27MTrue132018.0AUTREFAUX[35000;99999[3750.00
362019(-1,0]MENSUEL[20000;40000[L19MFalse22017.0ESSENCEVRAI[0;10000[1838.49
782019(-1,0]MENSUEL[20000;40000[B40MFalse452018.0DIESELFAUX[15000;20000[4892.74
892018(1,2]MENSUEL[20000;40000[C20MFalse112014.0ESSENCEFAUX[25000;35000[166.73
\n", + "
" + ], "text/plain": [ - "(824, 14)" + " ANNEE_CTR CONTRAT_ANCIENNETE FREQUENCE_PAIEMENT_COTISATION GROUPE_KM \\\n", + "10 2019 (0,1] MENSUEL [0;20000[ \n", + "34 2020 (-1,0] MENSUEL [20000;40000[ \n", + "36 2019 (-1,0] MENSUEL [20000;40000[ \n", + "78 2019 (-1,0] MENSUEL [20000;40000[ \n", + "89 2018 (1,2] MENSUEL [20000;40000[ \n", + "\n", + " ZONE_RISQUE AGE_ASSURE_PRINCIPAL GENRE DEUXIEME_CONDUCTEUR \\\n", + "10 C 40 M False \n", + "34 C 27 M True \n", + "36 L 19 M False \n", + "78 B 40 M False \n", + "89 C 20 M False \n", + "\n", + " ANCIENNETE_PERMIS ANNEE_CONSTRUCTION ENERGIE EQUIPEMENT_SECURITE \\\n", + "10 37 2017.0 ESSENCE VRAI \n", + "34 13 2018.0 AUTRE FAUX \n", + "36 2 2017.0 ESSENCE VRAI \n", + "78 45 2018.0 DIESEL FAUX \n", + "89 11 2014.0 ESSENCE FAUX \n", + "\n", + " VALEUR_DU_BIEN CM \n", + "10 [15000;20000[ 1072.98 \n", + "34 [35000;99999[ 3750.00 \n", + "36 [0;10000[ 1838.49 \n", + "78 [15000;20000[ 4892.74 \n", + "89 [25000;35000[ 166.73 " ] }, - "execution_count": 4, + "execution_count": 195, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "data_model = data_retraitee.copy()\n", + "data_model = data_retraitee\n", "\n", "# Filtre pour ne garder que les lignes qui ont un sinistre (NB > 0)\n", - "data_model = data_model[data_model['NB'] > 0]\n", + "data_model = data_model[data_model[\"NB\"] > 0]\n", "\n", "# Calcul du cout moyen \"théorique\" des sinistres\n", - "data_model[\"CM\"] = (data_model[\"CHARGE\"] / data_model[\"NB\"])\n", - "data_model = data_model.drop(['CHARGE', 'NB', \"EXPO\"], axis=1)\n", - "data_model.shape" + "data_model[\"CM\"] = data_model[\"CHARGE\"] / data_model[\"NB\"]\n", + "data_model = data_model.drop([\"CHARGE\", \"NB\", \"EXPO\"], axis=1)\n", + "data_model.head()" ] }, { @@ -200,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 196, "id": "c8fd3ee1", "metadata": {}, "outputs": [ @@ -284,7 +631,7 @@ "type": "float" } ], - "ref": "e29190e7-d62c-4ab7-ab0a-43dd875c8b98", + "ref": "59cf3e53-3de4-4283-9dac-6a29f574b6fe", "rows": [ [ "count", @@ -760,7 +1107,7 @@ "max NaN 83421.850000 " ] }, - "execution_count": 5, + "execution_count": 196, "metadata": {}, "output_type": "execute_result" } @@ -769,6 +1116,858 @@ "data_model.describe(include='all')" ] }, + { + "cell_type": "code", + "execution_count": 197, + "id": "2d32ae2b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "plotlyServerURL": "https://plot.ly" + }, + "data": [ + { + "bingroup": "x", + "hovertemplate": "CM=%{x}
count=%{y}", + "legendgroup": "", + "marker": { + "color": "#636efa", + "pattern": { + "shape": "" + } + }, + "name": "", + "orientation": "v", + "showlegend": false, + "type": "histogram", + "x": { + "bdata": "UrgehevDkEAAAAAAAEytQClcj8L1uZxACtejcL0cs0CPwvUoXNdkQK5H4XqU+7JAKVyPwvXhoECuR+F6VBiiQClcj8L1D5lApHA9CtfUmUBxPQrXo6KrQGZmZmZm7ZdAAAAAACCV2UCkcD0KV8K/QPYoXI9CcbxAj8L1KFwZikAzMzMzs669QK5H4XpUdrlAXI/C9SiMi0BmZmZmpueuQOF6FK5H2qlAZmZmZma+Y0AK16NwPQOoQArXo3D9b7JAmpmZmZmjwEAAAAAAAEytQAAAAAAAmIlAXI/C9agVtECamZmZGZe/QClcj8L1BJ9Aj8L1KFzXZEDsUbgehUKVQEjhehSuV5BAPQrXo/DAqEBI4XoULvGnQOxRuB6FFJxAzczMzAwtuUDhehSuR/a3QKRwPQq3FNBAAAAAAABMrUB7FK5H4SbAQKRwPQrXmrNAAAAAAABMrUCPwvUoXF2SQPYoXI/CuYdAzczMzMypwUAAAAAAAEytQK5H4XoUPJpAUrgehauDuEAAAAAAAAAeQGZmZmZm+KhAAAAAAABMrUB7FK5H4TnDQEjhehSuX25A16NwPcqW1EA9CtejcMqRQAAAAAAATK1A16NwPQqfZEAzMzMz86O1QB+F61E4pp9ACtejcL1+pEBI4XoUrtOFQAAAAAAAoGRAAAAAAABMrUDNzMzMzK6KQHsUrkfhQJdAzczMzMyEmkDXo3A9Cp9kQFyPwvUoRHNAAAAAAADSqkBmZmZmZr5jQLgehetRCGdAMzMzMzP4kECamZmZmWmNQIXrUbieDbBAFK5H4XrSg0CkcD0KN4/IQOF6FK5H6JZAcT0K16Nse0D2KFyPwu2MQFK4HoXr66tACtejcL0OskCPwvUoXNdkQI/C9Shc12RAH4XrUbjLlkB7FK5HodrMQNejcD0KxqhAw/UoXI/qcEAAAAAAAPS6QPYoXI8iXNNAH4XrUbhjmECuR+F69L3IQAAAAAAATK1AFK5H4TposEBmZmZmZr5jQOF6FK7Hs6ZAZmZmZma+Y0AAAAAAAAWiQI/C9SjcY7RAAAAAAACQlkDsUbgehUacQOxRuB4FeKlAZmZmZma+Y0CkcD0KV+iYQMP1KFyPhZ9ASOF6FO5gsEDNzMzMTJilQM3MzMzMAptAAAAAAAAqlEC4HoXrURiLQGZmZmZmbpRAmpmZmZnKmkCPwvUoXNdkQI/C9Shc12RAAAAAAADwZUB7FK5H4WCVQBSuR+F6joRAAAAAAABMrUC4HoXrURKGQGZmZmZmZLBAKVyPwvXIm0A9CtejcKWWQBSuR+F6u5ZAexSuR+GMq0AAAAAAkMvZQM3MzMxMy6tACtejcD0xnUBxPQrXox+tQDMzMzOz2bVAcT0K16PUrEC4HoXr0QWoQOxRuB6FkYhA9ihcj2K7z0CPwvUoXNvIQPYoXI/ChWRAuB6F69Fzo0AAAAAAAEytQPYoXI/CU6dAj8L1KFzXZEDsUbgetd/XQIXrUbgeb5hAXI/C9RjN0EAzMzMzMziiQPYoXI9C7a9AhetRuB5xf0AAAAAAAPBlQIXrUbh+xMNAAAAAADCG1kDXo3A9CpdQQGZmZmZmvmNAmpmZmZmroECPwvUoXHePQM3MzMxMlqFASOF6FH46wEA9CtejcJahQAAAAACAXr5AKVyPwvX3l0DD9ShcjzycQJqZmZmZGqpA4XoUruffwEBcj8L1KDyhQAAAAAAATK1AAAAAAACrlUBmZmZmZgSlQAAAAAAA8GVAAAAAAADwZUC4HoXrUbg/QPYoXI/CxKRAPQrXo3CsqEBcj8L1KE+wQAAAAACAX6hAmpmZmZlxpEBxPQrX43W3QAAAAAAATK1ArkfhetSUskAfhetROIGuQOxRuB6FcahAMzMzMzOffEC4HoXrUfuqQMP1KFyPV7ZAXI/C9ajxs0CamZmZGRqSQMP1KFyPxMFA16NwPQpUs0CuR+F6FPu1QAAAAAAATK1AAAAAAADanEBxPQrXozSMQDMzMzMzl5xAcT0K12MNskAAAAAAgKGkQFK4HoWL5uBAAAAAAADwZUBcj8L1KLB7QIXrUbgWz/BA16NwPQqfZEAzMzMzs8yiQD0K16NwU7JACtejcL2frEDXo3A9irK2QK5H4Xr0qdRA16NwPQpInkDhehSuR6WVQBSuR+F6q7xAAAAAAADwZUA9CtejcPC2QClcj8L1dalA9ihcj8KFZEAK16Nw/SHIQHsUrkfh37ZAmZmZmdlrtkA9CtejcLOKQKRwPQrXK7NA9ihcj4ITsUDhehSuR8V1QHE9Ctejo6JAmpmZmZmZnEDD9Shcj6yTQDMzMzOz16lAj8L1KNyXq0DD9Shcj8bQQOF6FK6nb8FAAAAAAADUhEDhehSuRxSkQJqZmZkZ5KBAUrgehevMn0CamZmZ2Yu5QOxRuB6Fms9AzczMzMy9nUAUrkfheiulQAAAAAAAEolAUrgehesFgkDD9Shcj3CSQPYoXI9CKKRA16NwPYruq0BI4XoU7vy8QMP1KFyPinlA4XoUrkdkm0Bcj8L1KOuuQKRwPQpXGaZAXI/C9ShwlUBSuB6F61WYQDMzMzOzZaJAAAAAAACsgEBxPQrXo7yYQI/C9SjcdbNAXI/C9Sjkq0AUrkfhepOSQFK4HoXre6dA4XoUrkeroUCamZmZGRapQAAAAAAATK1Aj8L1KFzXZECPwvUoXEdmQM3MzMzMVptAZmZmZma/pkAAAAAAAEytQJqZmZmZHpZAH4XrUTh7rUBSuB6F62GEQNejcD0KtaJA16NwPYpuqkDsUbgedZfUQB+F61G4wJ9A16NwPQqfZEAUrkfhepagQGZmZmZmybRAAAAAAADwZUBI4XoUro2MQHE9CtejMpZA4XoUrke2m0AAAAAAANuTQAAAAAAATK1AexSuR+GirkB7FK5HYX+kQHE9CtfjGMNAAAAAAABMrUApXI/C1RbAQHsUrkfhm5NAUrgehetzoEAAAAAAAABoQAAAAAAATK1AmpmZmRmpoEAAAAAAAPWwQOF6FK5H2YZAhetRuJ4hsEAzMzMzMxOvQEjhehSus4dAFK5H4Xq5k0CPwvUo3I20QDMzMzOzpbZAAAAAAACgZEAAAAAAAPBlQArXo3B9uLZAMzMzMzMTskD2KFyPMijaQI/C9Shc12RAKVyPwnVep0BmZmZmZr5jQAAAAAAAIIxASOF6FK7poUDhehSux6GmQOF6FK4n4sJA7FG4HgWUoUC4HoXr0QaxQIXrUbgesYBAAAAAAABMrUDD9Shcj4SUQKRwPQrXN6hA7FG4HoVhlkBSuB6Fq2DMQNejcD0Kd51A16NwPQqYpEDsUbgehcqdQB+F61F4B7JA4XoUrkfZqkCPwvUoXNdkQFyPwvWovahAPQrXo3CKsEC4HoXrUb+mQLgehetRCq5A16NwPQodrkCPwvUonEy5QNejcD0Kh7VAj8L1KFzXZECPwvUo3JKkQPYoXI/CDYZAAAAAAABMrUAAAAAAAABoQI/C9ShcHZxAAAAAAAAAaEAAAAAAAABoQEjhehSuYKRASOF6FK6ymECPwvUo3IqgQNejcD0qfc9ACtejcD0In0Bcj8L1uCrQQAAAAAAA8GVAzczMzNRy10BmZmZmZr+dQLgehetR869ASOF6FO57tkD2KFyPwp6QQMP1KFyPbqRAMzMzMzOXgEBcj8L1KGeQQAAAAAAATK1AcT0K16OMvUCkcD0K1ymgQAAAAAAAv7VA16NwPQp3UkC4HoXrEVa0QD0K16NweJ5AuB6F61Gju0AAAAAAAEytQI/C9Shcb3pAZmZmZma8jkAAAAAAAPGkQK5H4XoUwbhACtejcD1TlkCPwvUoXM6cQAAAAAAA8GVA9ihcj0L7rUAAAAAAAPBlQMP1KFyPm8lAj8L1KFzXZEA9Ctej8KiiQNejcD0Kn2RAAAAAAADwZUD2KFyPwqViQAAAAAAA8GVAH4XrUbg5nkAK16NwPZmhQAAAAAAATK1AmpmZmZnYokAAAAAAAImgQGZmZma6K/BAZmZmZmaeokCuR+F6NDrNQD0K16OAdddAj8L1KFzXZECPwvUoXLCfQI/C9Shc8qVAPQrXo3AMlkA9CtejcGWOQM3MzMwc1tpAhetRuF6xtEDhehSuR1S4QBSuR+F6hoJAAAAAAAAYpUDXo3A9Cp9kQPYoXI/Cm4pASOF6FK4rjkCPwvUoHCLDQHE9CtcjH6VA16NwPQqnnkDD9Shcj+6bQArXo3A94JVAAAAAAADwZUDNzMzMzBSnQJqZmZnJFNNAPQrXo/AqwkCF61G4HgmBQD0K16PwYqBAMzMzMzMakUC4HoXrUWCWQFK4HoXrUZ9AXI/C9Sg4m0BmZmZm5l+qQBSuR+HaGdNAAAAAAADwZUCF61G4HlODQKRwPQrXALFAhetRuF6DskBmZmZmZr5jQAAAAAAATK1ArkfhepQ8oUAK16NwPS6iQPYoXI/CyqBAzczMzBw110ApXI/C9eCYQB+F61G4H5ZAj8L1KFzXZECF61G4HpK3QM3MzMzMwINAKVyPwvUVl0AzMzMzM4qZQFyPwvUoJItASOF6FC78rkAAAAAAAPBlQGZmZmZmba5AmpmZmd1d9EAzMzMzMySnQBSuR+F6RZFA4XoUrkdnn0AUrkfheqGnQOxRuB7FmbNAZmZmZma+Y0DsUbgeRZqxQD0K16PwlaJA16NwPQqfZECPwvUoXOahQNejcD0KJqFAXI/C9Sh8qECamZmZWYDaQAAAAAAAAGhAzczMzEwBoEB7FK5H4Y2iQPYoXI/CK6dAUrgehettgkCamZmZmdOEQKRwPQrXMKhA4XoUrscpsECkcD0K16CaQK5H4XoUEJ9AMzMzMzMBpkAAAAAAAPBlQArXo3C90KhASOF6FK6bg0C4HoXrUTSbQLgehetRVoFArkfhevzm5EAK16NwPbqkQJqZmZmZyJdAXI/C9ei+oEAK16NwvQOgQEjhehSuJ4FAAAAAAABMrUDNzMzMzMKeQAAAAAAATK1AzczMzMwEpUB7FK5HYW6qQIXrUbieTrdArkfhetTgsUAAAAAAAPBlQI/C9ShckK5AAAAAAACIdkC4HoXrUdeUQLgehetRCIRAj8L1KFwmyUCPwvUo3Pa5QK5H4XqU4rBApHA9CneNtEBcj8L1KFeeQArXo3C97qNAMzMzM7OntkCPwvUoXNmUQHsUrkfhooRAj8L1KFxrikBmZmZmZtqmQOxRuB6FY5hA7FG4HoUHl0B7FK5HoQ3BQKRwPQrXd9JAAAAAAABMrUDNzMzMTPynQBSuR+F6vpZAj8L1KNzuqEDXo3A9ismiQB+F61G45nlA16NwPQowoEAUrkfhOkK8QNejcD0KzJtAAAAAAADwZUBSuB6Fi3vQQEjhehQuralAMzMzM7PwqEAAAAAAAABoQEjhehR+qdBAAAAAAABwh0AK16NwPWymQArXo3A9uJNAH4XrUbi9ukAAAAAAAEytQAAAAAAATK1APQrXo3ARgkCkcD0KVzSwQI/C9ShcqapAPQrXo3A9nkDNzMzMzDqmQBSuR+F6xpVAPQrXo7AOuUDhehSuRzqxQI/C9Shcj5ZApHA9ChdZzEBI4XoUTnC0QD0K16PwEKdAAAAAAABMrUAK16NwPYuqQB+F61G4VatAXI/C9SgStUDhehSuR/WvQHE9CtcjBbBAAAAAAADwZUBmZmZmZr5jQB+F61G4T5hAuB6F6/GrxUB7FK5HQdHBQAAAAAAATK1AzczMzEzHuEBcj8L1KHWaQM3MzMxMkKpAj8L1KDxWskAAAAAAgOLbQI/C9Shc12RACtejcD2EgkAAAAAAAPBlQB+F61G4zapAj8L1KFy3jkAfhetRuH6dQPYoXI/ChWRArkfhehRFlkC4HoXrUdujQOF6FK5H76RAUrgeheuYzUCPwvUoPCDBQJqZmZmZarFA4XoUrseIoUBmZmZmZhqLQGZmZmYGodVAMzMzMzMnoUCkcD0Kl5S8QI/C9SjcL8VAcT0K16Pjw0DD9Shcj9qqQAAAAAAATK1AzczMzEx1skAAAAAAABBXQM3MzMzM7KZAuB6F69E4okDD9Shcjz2iQB+F61G4ZJZAUrgehesTnUAAAAAAAJehQB+F61G4DZRAMzMzMzNzv0DNzMzMTJ6lQIXrUbgeu5ZAUrgehSukskAK16NwPSCJQDMzMzMz44NAmpmZmdmSokAAAAAAAGivQAAAAAAA0FpAmpmZmRnxqUCPwvUoXNdkQAAAAAAATK1AhetRuD5/wkDsUbgexam7QGZmZmZmvmNApHA9Ctdtl0DNzMzMzGGsQOF6FK5H1YtAH4XrUThwskBI4XoULn2vQClcj8L1iJxAXI/C9ejL2EAAAAAAAEytQGZmZmZm7rZA9ihcj6JuwUAzMzMzs+WkQI/C9Shc12RA9ihcj8JckkA9CtejcGCeQMP1KFyPhZZAMzMzMzO6o0AUrkfhOpC1QOxRuB6FFaZAcT0K16Nfk0AAAAAAAEytQNejcD3aMMlAZmZmZma+Y0A9CtejcKl7QEjhehSu0p9ACtejcL3JpEBmZmZmZjykQKRwPQrXw55A9ihcj8KdZEAzMzMzM/GXQI/C9Shc12RAexSuR6HVykDhehSuR9ujQHsUrkdh27FAPQrXo/B9vkAAAAAAAABoQPYoXI/CMKNAXI/C9ShUnUB7FK5H4VuQQNejcD0KuZ1AZmZmZma+Y0AK16Nwvdm3QOF6FK5HHq1AAAAAAAAAZUCPwvUoXCKQQNejcD0KY6pAAAAAAABMrUCPwvUo3GmkQNejcD0Kn2RA9ihcj0Kuo0Bcj8L1qOOjQD0K16MwJ7FA16NwPQqfZEC4HoXrUayeQDMzMzNDHNBA4XoUrkeZpEApXI/CdTasQAAAAAAA8GVAw/UoXI+znUAUrkfhej2oQMP1KFyPYJBAAAAAAADwZUAAAAAAACmwQArXo3A9gINA7FG4HgWxoEBSuB6Fawe2QOF6FK7ng75AcT0K10M6wkAzMzMzM06bQJqZmZmZuKpAPQrXo3CwqECF61G4HqquQArXo3A9uItAKVyPwvVQjkBmZmZmZgGrQAAAAAAATK1A16NwPYq6r0BmZmZmZlaUQAAAAAAATK1AAAAAAADwZUB7FK5H4YyzQClcj8L1qFJAH4XrUbi5nkCkcD0KV9GyQClcj8J1AKVACtejcL1Lr0DD9Shcz66oQI/C9Shc12RA4XoUrscV1UCPwvUoHDe1QClcj8L1WJpAUrgeheupg0DXo3A9Cn98QAAAAAAATK1AAAAAAABMrUDD9ShcD5etQAAAAAAALpRASOF6FK4Ey0CPwvUoXNdkQGZmZmZmvmNAj8L1KFxInUCamZmZmTGsQI/C9Sjcl6hAj8L1KFyBsUA9CtejcF1xQK5H4XoU/l1AcT0K16OtsEDD9Shcj6KZQAAAAAAATK1Aj8L1KJyHz0CPwvUoXKCxQI/C9ShcDqhAAAAAAABMrUAK16NwvUmrQIXrUbgeGYdApHA9CteunkB7FK5HYUShQOxRuB6Fw5tAXI/C9ajQuECPwvUoXOuTQHE9Ctejh55A9ihcj8IhkkCamZmZmTqkQM3MzMwMJNZAw/UoXI+8kUDNzMzMzIxBQJqZmZmZh4JA7FG4HiUHw0BI4XoUriOOQFK4HoXrz5hAPQrXo3Dww0CkcD0K15WZQIXrUbiudd1AZmZmZma+Y0D2KFyPwsWhQKRwPQrXRrZAMzMzMzOzIUAzMzMz8zGyQArXo3B96LlAFK5H4XrMi0BxPQrXYznEQMP1KFyPwJZAZmZmZmbekUBcj8L1aPu8QAAAAAAA8GVASOF6FG4k1UDhehSuR4KaQHsUrkfhI5tAhetRuB55skAUrkfheoulQKRwPQq3VMtAcT0K16OPoEAzMzMzM9t2QAAAAAAATK1AzczMzMyIiEAfhetR+CHJQNejcD0Kn2RAMzMzM+vQ4EDXo3A9Cp9kQHsUrkfhioFAj8L1KFysr0BxPQrXI16hQArXo3A9DoZAXI/C9Sguo0AzMzMzM+evQGZmZmZmSqJASOF6FA4bz0DsUbgexW22QJqZmZmZgJZAXI/C9SiZmkDNzMzMzLSTQD0K16MQm9RAH4XrUXiosEAzMzMzMwmdQNejcD0KqpBAH4XrUXgwrEDD9Shcj1iPQAAAAAAAFa5A16NwPQr5pkDNzMzMzFKzQJqZmZkZ5qVAAAAAAADJmECPwvUoXNdkQArXo3C9E7BA7FG4HoVjrUBmZmZmZkurQAAAAAAA6apAexSuR+FWrEDhehSuR5StQDMzMzOzor5AcT0K16OFmkAAAAAAAPBlQMP1KFw3oetAzczMzMyVrUAAAAAAAPBlQOF6FK7HOKNAFK5H4XoTskDNzMzMzF6cQClcj8L1VphAUrgehetjlEDXo3A9CpCnQGZmZmZmvmNAUrgehevci0AfhetRuF6HQIXrUbh+/MRAFK5H4XpQjEAzMzMzsxKkQOxRuB6FxqJAj8L1KFxFmEBSuB6Fa8a4QLgeheuRaLJAKVyPwvVghUBmZmZmZouzQJqZmZmZF6ZA9ihcj8L5oUAAAAAAAHCHQJqZmZmZeKRAAAAAAADwZUAUrkfhuoy3QFyPwvUooq9APQrXo3D9ZkAfhetR2N7QQB+F61G4FKhAMzMzM7NuqUCF61G4HvykQOF6FK5ndMBArkfhevQFxECPwvUoXNdkQJqZmZkZg6FA7FG4HgUDk0CF61G4Hi+gQHE9Ctej1bNAzczMzIxstECkcD0KV/i2QA==", + "dtype": "f8" + }, + "xaxis": "x", + "yaxis": "y" + } + ], + "layout": { + "barmode": "relative", + "legend": { + "tracegroupgap": 0 + }, + "margin": { + "t": 60 + }, + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + }, + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "histogram": [ + { + "marker": { + "pattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "fillpattern": { + "fillmode": "overlay", + "size": 10, + "solidity": 0.2 + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermap": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermap" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "xaxis": { + "anchor": "y", + "domain": [ + 0, + 1 + ], + "title": { + "text": "CM" + } + }, + "yaxis": { + "anchor": "x", + "domain": [ + 0, + 1 + ], + "title": { + "text": "count" + } + } + } + } + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Observation de la distribution\n", + "fig = px.histogram(data_model, x=\"CM\")\n", + "fig.show()" + ] + }, { "cell_type": "markdown", "id": "92d6156a", @@ -795,7 +1994,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 198, "id": "1b156435", "metadata": {}, "outputs": [ @@ -805,7 +2004,7 @@ "(824, 13)" ] }, - "execution_count": 6, + "execution_count": 198, "metadata": {}, "output_type": "execute_result" } @@ -817,30 +2016,30 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 199, "id": "0ef0fcc0", "metadata": {}, "outputs": [], "source": [ - "#Séparation en variables qualitatives ou catégorielles\n", + "# Séparation en variables qualitatives ou catégorielles\n", "variables_na = []\n", "variables_numeriques = []\n", "variables_01 = []\n", "variables_categorielles = []\n", "for colu in data_set.columns:\n", - " if True in data_set[colu].isna().unique() :\n", + " if True in data_set[colu].isna().unique():\n", " variables_na.append(data_set[colu])\n", - " else :\n", - " if str(data_set[colu].dtypes) in [\"int32\",\"int64\",\"float64\"]:\n", - " if len(data_set[colu].unique())==2 :\n", + " else:\n", + " if str(data_set[colu].dtypes) in [\"int32\", \"int64\", \"float64\"]:\n", + " if len(data_set[colu].unique()) == 2:\n", " variables_categorielles.append(data_set[colu])\n", - " else :\n", + " else:\n", " variables_numeriques.append(data_set[colu])\n", - " else :\n", - " if len(data_set[colu].unique())==2 :\n", + " else:\n", + " if len(data_set[colu].unique()) == 2:\n", " variables_categorielles.append(data_set[colu])\n", - " else :\n", - " variables_categorielles.append(data_set[colu])" + " else:\n", + " variables_categorielles.append(data_set[colu])\n" ] }, { @@ -853,7 +2052,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 200, "id": "e130aae5", "metadata": {}, "outputs": [], @@ -863,942 +2062,416 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 201, "id": "c39e2ad0", "metadata": {}, "outputs": [ { "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ { - "coloraxis": "coloraxis", - "hovertemplate": "x: %{x}
y: %{y}
color: %{z}", - "name": "0", - "texttemplate": "%{z:.2f}", - "type": "heatmap", - "x": [ - "CONTRAT_ANCIENNETE", - "FREQUENCE_PAIEMENT_COTISATION", - "GROUPE_KM", - "ZONE_RISQUE", - "GENRE", - "DEUXIEME_CONDUCTEUR", - "ENERGIE", - "EQUIPEMENT_SECURITE", - "VALEUR_DU_BIEN" - ], - "xaxis": "x", - "y": [ - "CONTRAT_ANCIENNETE", - "FREQUENCE_PAIEMENT_COTISATION", - "GROUPE_KM", - "ZONE_RISQUE", - "GENRE", - "DEUXIEME_CONDUCTEUR", - "ENERGIE", - "EQUIPEMENT_SECURITE", - "VALEUR_DU_BIEN" - ], - "yaxis": "y", - "z": { - "bdata": "AAAAAAAA8D8AAAAAAAAAACoCGzzITrA/jS6+t390sj/aAKYMJa2eP5RMqUS3uZs/ytNpsBVXkz8AAAAAAAAAAJsekiMPM4I/AAAAAAAAAAAAAAAAAADwPwAAAAAAAAAAAAAAAAAAAABgNwyfFOK3Px3tLvtk1qI/VTS7w965nj/DbHQwNU6sP6xOyIjBVMQ/KwIbPMhOsD8AAAAAAAAAAAAAAAAAAPA/JGwWgOwjwz/Y12crRVC2P1AU8aUpk3Y/tZ25v8HgyT9++YWBDBq6PxMKBP1KAMk/ki6+t390sj8AAAAAAAAAACNsFoDsI8M/AAAAAAAA8D8AAAAAAAAAAOzpAHMW1bU/OToUIB5twT+gpoD1ZjrEP/5ATjN+vpg/0gCmDCWtnj9gNwyfFOK3P9jXZytFULY/AAAAAAAAAAAAAAAAAADwPwAAAAAAAAAA2p0N4q1bwz/UsLoqS0u5PxFqf8IHB9E/lEypRLe5mz8d7S77ZNaiP1AU8aUpk3Y/7OkAcxbVtT8AAAAAAAAAAAAAAAAAAPA/AAAAAAAAAAAAAAAAAAAAAOYlMsJ0brs/ytNpsBVXkz9RNLvD3rmeP7edub/B4Mk/OjoUIB5twT/anQ3irVvDPwAAAAAAAAAAAAAAAAAA8D8nEbUEUmnAP+SA2g/TvNE/AAAAAAAAAADDbHQwNU6sP335hYEMGro/oKaA9WY6xD/UsLoqS0u5PwAAAAAAAAAAJxG1BFJpwD8AAAAAAADwP+fmCf6XRco/mx6SIw8zgj+rTsiIwVTEPxIKBP1KAMk//kBOM36+mD8Ran/CBwfRP+YlMsJ0brs/5YDaD9O80T/n5gn+l0XKPwAAAAAAAPA/", - "dtype": "f8", - "shape": "9, 9" - } + "name": "index", + "rawType": "object", + "type": "string" + }, + { + "name": "CONTRAT_ANCIENNETE", + "rawType": "float64", + "type": "float" + }, + { + "name": "FREQUENCE_PAIEMENT_COTISATION", + "rawType": "float64", + "type": "float" + }, + { + "name": "GROUPE_KM", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE", + "rawType": "float64", + "type": "float" + }, + { + "name": "GENRE", + "rawType": "float64", + "type": "float" + }, + { + "name": "DEUXIEME_CONDUCTEUR", + "rawType": "float64", + "type": "float" + }, + { + "name": "ENERGIE", + "rawType": "float64", + "type": "float" + }, + { + "name": "EQUIPEMENT_SECURITE", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN", + "rawType": "float64", + "type": "float" } ], - "layout": { - "coloraxis": { - "colorscale": [ - [ - 0, - "rgb(5,48,97)" - ], - [ - 0.1, - "rgb(33,102,172)" - ], - [ - 0.2, - "rgb(67,147,195)" - ], - [ - 0.3, - "rgb(146,197,222)" - ], - [ - 0.4, - "rgb(209,229,240)" - ], - [ - 0.5, - "rgb(247,247,247)" - ], - [ - 0.6, - "rgb(253,219,199)" - ], - [ - 0.7, - "rgb(244,165,130)" - ], - [ - 0.8, - "rgb(214,96,77)" - ], - [ - 0.9, - "rgb(178,24,43)" - ], - [ - 1, - "rgb(103,0,31)" - ] - ] - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermap": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermap" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Matrice de corrélation des variables catégorielles (V de Cramér)" - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ] - }, - "yaxis": { - "anchor": "x", - "autorange": "reversed", - "domain": [ - 0, - 1 - ] - } + "ref": "2f5478f2-7cdc-47d7-aeff-3055a9f87820", + "rows": [ + [ + "CONTRAT_ANCIENNETE", + "1.0", + "0.0", + "0.01", + "0.02", + "0.01", + "0.01", + "0.01", + "0.0", + "0.01" + ], + [ + "FREQUENCE_PAIEMENT_COTISATION", + "0.0", + "1.0", + "0.0", + "0.01", + "0.01", + "0.0", + "0.0", + "0.01", + "0.03" + ], + [ + "GROUPE_KM", + "0.01", + "0.0", + "1.0", + "0.04", + "0.01", + "0.0", + "0.04", + "0.01", + "0.04" + ], + [ + "ZONE_RISQUE", + "0.02", + "0.01", + "0.04", + "1.0", + "0.01", + "0.02", + "0.03", + "0.04", + "0.02" + ], + [ + "GENRE", + "0.01", + "0.01", + "0.01", + "0.01", + "1.0", + "0.0", + "0.03", + "0.01", + "0.08" + ], + [ + "DEUXIEME_CONDUCTEUR", + "0.01", + "0.0", + "0.0", + "0.02", + "0.0", + "0.99", + "0.0", + "0.0", + "0.02" + ], + [ + "ENERGIE", + "0.01", + "0.0", + "0.04", + "0.03", + "0.03", + "0.0", + "1.0", + "0.02", + "0.08" + ], + [ + "EQUIPEMENT_SECURITE", + "0.0", + "0.01", + "0.01", + "0.04", + "0.01", + "0.0", + "0.02", + "0.99", + "0.05" + ], + [ + "VALEUR_DU_BIEN", + "0.01", + "0.03", + "0.04", + "0.02", + "0.08", + "0.02", + "0.08", + "0.05", + "1.0" + ] + ], + "shape": { + "columns": 9, + "rows": 9 } - } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CONTRAT_ANCIENNETEFREQUENCE_PAIEMENT_COTISATIONGROUPE_KMZONE_RISQUEGENREDEUXIEME_CONDUCTEURENERGIEEQUIPEMENT_SECURITEVALEUR_DU_BIEN
CONTRAT_ANCIENNETE1.000.000.010.020.010.010.010.000.01
FREQUENCE_PAIEMENT_COTISATION0.001.000.000.010.010.000.000.010.03
GROUPE_KM0.010.001.000.040.010.000.040.010.04
ZONE_RISQUE0.020.010.041.000.010.020.030.040.02
GENRE0.010.010.010.011.000.000.030.010.08
DEUXIEME_CONDUCTEUR0.010.000.000.020.000.990.000.000.02
ENERGIE0.010.000.040.030.030.001.000.020.08
EQUIPEMENT_SECURITE0.000.010.010.040.010.000.020.990.05
VALEUR_DU_BIEN0.010.030.040.020.080.020.080.051.00
\n", + "
" + ], + "text/plain": [ + " CONTRAT_ANCIENNETE \\\n", + "CONTRAT_ANCIENNETE 1.00 \n", + "FREQUENCE_PAIEMENT_COTISATION 0.00 \n", + "GROUPE_KM 0.01 \n", + "ZONE_RISQUE 0.02 \n", + "GENRE 0.01 \n", + "DEUXIEME_CONDUCTEUR 0.01 \n", + "ENERGIE 0.01 \n", + "EQUIPEMENT_SECURITE 0.00 \n", + "VALEUR_DU_BIEN 0.01 \n", + "\n", + " FREQUENCE_PAIEMENT_COTISATION GROUPE_KM \\\n", + "CONTRAT_ANCIENNETE 0.00 0.01 \n", + "FREQUENCE_PAIEMENT_COTISATION 1.00 0.00 \n", + "GROUPE_KM 0.00 1.00 \n", + "ZONE_RISQUE 0.01 0.04 \n", + "GENRE 0.01 0.01 \n", + "DEUXIEME_CONDUCTEUR 0.00 0.00 \n", + "ENERGIE 0.00 0.04 \n", + "EQUIPEMENT_SECURITE 0.01 0.01 \n", + "VALEUR_DU_BIEN 0.03 0.04 \n", + "\n", + " ZONE_RISQUE GENRE DEUXIEME_CONDUCTEUR \\\n", + "CONTRAT_ANCIENNETE 0.02 0.01 0.01 \n", + "FREQUENCE_PAIEMENT_COTISATION 0.01 0.01 0.00 \n", + "GROUPE_KM 0.04 0.01 0.00 \n", + "ZONE_RISQUE 1.00 0.01 0.02 \n", + "GENRE 0.01 1.00 0.00 \n", + "DEUXIEME_CONDUCTEUR 0.02 0.00 0.99 \n", + "ENERGIE 0.03 0.03 0.00 \n", + "EQUIPEMENT_SECURITE 0.04 0.01 0.00 \n", + "VALEUR_DU_BIEN 0.02 0.08 0.02 \n", + "\n", + " ENERGIE EQUIPEMENT_SECURITE VALEUR_DU_BIEN \n", + "CONTRAT_ANCIENNETE 0.01 0.00 0.01 \n", + "FREQUENCE_PAIEMENT_COTISATION 0.00 0.01 0.03 \n", + "GROUPE_KM 0.04 0.01 0.04 \n", + "ZONE_RISQUE 0.03 0.04 0.02 \n", + "GENRE 0.03 0.01 0.08 \n", + "DEUXIEME_CONDUCTEUR 0.00 0.00 0.02 \n", + "ENERGIE 1.00 0.02 0.08 \n", + "EQUIPEMENT_SECURITE 0.02 0.99 0.05 \n", + "VALEUR_DU_BIEN 0.08 0.05 1.00 " + ] }, + "execution_count": 201, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "# Matrice de corrélation pour les variables catégorielles (V de Cramér)\n", - "def cramers_v(confusion_matrix):\n", - " \"\"\"Calcule le V de Cramér à partir d'une matrice de contingence\"\"\"\n", - " chi2 = chi2_contingency(confusion_matrix)[0]\n", - " n = confusion_matrix.sum().sum()\n", - " phi2 = chi2 / n\n", - " r, k = confusion_matrix.shape\n", - " phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))\n", - " rcorr = r - ((r-1)**2)/(n-1)\n", - " kcorr = k - ((k-1)**2)/(n-1)\n", - " return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))\n", + "# Test du V de Cramer\n", + "rows = []\n", "\n", - "# Créer la matrice de corrélation\n", - "categorical_cols = vars_categorielles.columns\n", - "n_vars = len(categorical_cols)\n", - "cramers_matrix = np.zeros((n_vars, n_vars))\n", + "for var1 in vars_categorielles:\n", + " col = []\n", + " for var2 in vars_categorielles:\n", + " cramers = cramers_V(\n", + " vars_categorielles[var1], vars_categorielles[var2]\n", + " ) # V de Cramer\n", + " col.append(round(cramers, 2)) # arrondi du résultat\n", + " rows.append(col)\n", "\n", - "for i, col1 in enumerate(categorical_cols):\n", - " for j, col2 in enumerate(categorical_cols):\n", - " if i == j:\n", - " cramers_matrix[i, j] = 1.0\n", - " else:\n", - " confusion_matrix = pd.crosstab(vars_categorielles[col1], vars_categorielles[col2])\n", - " cramers_matrix[i, j] = cramers_v(confusion_matrix)\n", + "cramers_results = np.array(rows)\n", + "v_cramer_resultats = pd.DataFrame(\n", + " cramers_results,\n", + " columns=vars_categorielles.columns,\n", + " index=vars_categorielles.columns,\n", + ")\n", "\n", - "# Créer le DataFrame de corrélation\n", - "correlation_cat = pd.DataFrame(cramers_matrix,\n", - " index=categorical_cols,\n", - " columns=categorical_cols)\n", - "\n", - "# Visualiser avec Plotly\n", - "fig = px.imshow(correlation_cat,\n", - " text_auto='.2f',\n", - " aspect=\"auto\",\n", - " color_continuous_scale='RdBu_r',\n", - " title='Matrice de corrélation des variables catégorielles (V de Cramér)')\n", - "fig.show()" + "v_cramer_resultats" + ] + }, + { + "cell_type": "code", + "execution_count": 202, + "id": "1755a2a4", + "metadata": {}, + "outputs": [], + "source": [ + "# On repère les variables trop corrélées\n", + "for i in range(v_cramer_resultats.shape[0]):\n", + " for j in range(i + 1, v_cramer_resultats.shape[0]):\n", + " if v_cramer_resultats.iloc[i, j] > 0.7:\n", + " print(\n", + " v_cramer_resultats.index.to_numpy()[i]\n", + " + \" et \"\n", + " + v_cramer_resultats.columns[j]\n", + " + \" sont trop dépendantes, V-CRAMER = \"\n", + " + str(v_cramer_resultats.iloc[i, j])\n", + " )\n" ] }, { @@ -1811,912 +2484,188 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 203, "id": "a16215ab", "metadata": {}, "outputs": [], "source": [ - "vars_numeriques = pd.DataFrame(variables_numeriques).transpose()" + "vars_numeriques = pd.DataFrame(variables_numeriques).transpose()\n" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 204, "id": "532ca6c4", "metadata": {}, "outputs": [ { "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ { - "coloraxis": "coloraxis", - "hovertemplate": "x: %{x}
y: %{y}
color: %{z}", - "name": "0", - "texttemplate": "%{z}", - "type": "heatmap", - "x": [ - "ANNEE_CTR", - "AGE_ASSURE_PRINCIPAL", - "ANCIENNETE_PERMIS", - "ANNEE_CONSTRUCTION" - ], - "xaxis": "x", - "y": [ - "ANNEE_CTR", - "AGE_ASSURE_PRINCIPAL", - "ANCIENNETE_PERMIS", - "ANNEE_CONSTRUCTION" - ], - "yaxis": "y", - "z": { - "bdata": "AAAAAAAA8D+ybZcEUUCbP/CBLCtO46Q/qr2Q49LN2D+ybZcEUUCbPwAAAAAAAPA/slV7SAtP4T84L73yETWgv/CBLCtO46Q/slV7SAtP4T8AAAAAAADwP0I6y25dD6E/qr2Q49LN2D84L73yETWgv0I6y25dD6E/AAAAAAAA8D8=", - "dtype": "f8", - "shape": "4, 4" - } + "name": "index", + "rawType": "object", + "type": "string" + }, + { + "name": "ANNEE_CTR", + "rawType": "float64", + "type": "float" + }, + { + "name": "AGE_ASSURE_PRINCIPAL", + "rawType": "float64", + "type": "float" + }, + { + "name": "ANCIENNETE_PERMIS", + "rawType": "float64", + "type": "float" + }, + { + "name": "ANNEE_CONSTRUCTION", + "rawType": "float64", + "type": "float" } ], - "layout": { - "coloraxis": { - "colorscale": [ - [ - 0, - "rgb(5,48,97)" - ], - [ - 0.1, - "rgb(33,102,172)" - ], - [ - 0.2, - "rgb(67,147,195)" - ], - [ - 0.3, - "rgb(146,197,222)" - ], - [ - 0.4, - "rgb(209,229,240)" - ], - [ - 0.5, - "rgb(247,247,247)" - ], - [ - 0.6, - "rgb(253,219,199)" - ], - [ - 0.7, - "rgb(244,165,130)" - ], - [ - 0.8, - "rgb(214,96,77)" - ], - [ - 0.9, - "rgb(178,24,43)" - ], - [ - 1, - "rgb(103,0,31)" - ] - ] - }, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermap": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermap" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Matrice de corrélation des variables numériques" - }, - "xaxis": { - "anchor": "y", - "domain": [ - 0, - 1 - ] - }, - "yaxis": { - "anchor": "x", - "autorange": "reversed", - "domain": [ - 0, - 1 - ] - } + "ref": "cbf56bf5-5e8b-495e-a03c-dc55b1f8dfd7", + "rows": [ + [ + "ANNEE_CTR", + "1.0", + "0.0266125353863182", + "0.04079670216583853", + "0.38756248686965" + ], + [ + "AGE_ASSURE_PRINCIPAL", + "0.0266125353863182", + "1.0", + "0.5408989349040694", + "-0.03165489280817585" + ], + [ + "ANCIENNETE_PERMIS", + "0.04079670216583853", + "0.5408989349040694", + "1.0", + "0.033320350432053406" + ], + [ + "ANNEE_CONSTRUCTION", + "0.38756248686965", + "-0.03165489280817585", + "0.033320350432053406", + "1.0" + ] + ], + "shape": { + "columns": 4, + "rows": 4 } - } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ANNEE_CTRAGE_ASSURE_PRINCIPALANCIENNETE_PERMISANNEE_CONSTRUCTION
ANNEE_CTR1.0000000.0266130.0407970.387562
AGE_ASSURE_PRINCIPAL0.0266131.0000000.540899-0.031655
ANCIENNETE_PERMIS0.0407970.5408991.0000000.033320
ANNEE_CONSTRUCTION0.387562-0.0316550.0333201.000000
\n", + "
" + ], + "text/plain": [ + " ANNEE_CTR AGE_ASSURE_PRINCIPAL ANCIENNETE_PERMIS \\\n", + "ANNEE_CTR 1.000000 0.026613 0.040797 \n", + "AGE_ASSURE_PRINCIPAL 0.026613 1.000000 0.540899 \n", + "ANCIENNETE_PERMIS 0.040797 0.540899 1.000000 \n", + "ANNEE_CONSTRUCTION 0.387562 -0.031655 0.033320 \n", + "\n", + " ANNEE_CONSTRUCTION \n", + "ANNEE_CTR 0.387562 \n", + "AGE_ASSURE_PRINCIPAL -0.031655 \n", + "ANCIENNETE_PERMIS 0.033320 \n", + "ANNEE_CONSTRUCTION 1.000000 " + ] }, + "execution_count": 204, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "vars_numeriques.corr()\n", - "fig = px.imshow(vars_numeriques.corr(),\n", - " text_auto=True,\n", - " aspect=\"auto\",\n", - " color_continuous_scale='RdBu_r',\n", - " title='Matrice de corrélation des variables numériques')\n", - "fig.show()" + "# Corrélation de Pearson\n", + "correlations_num = vars_numeriques.corr(method=\"pearson\")\n", + "correlations_num" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "id": "6c3bd9b2", + "metadata": {}, + "outputs": [], + "source": [ + "# On repère les variables trop corrélées\n", + "nb_variables = correlations_num.shape[0]\n", + "for i in range(nb_variables):\n", + " for j in range(i + 1, nb_variables):\n", + " if abs(correlations_num.iloc[i, j]) > 0.7:\n", + " print(\n", + " correlations_num.index.to_numpy()[i]\n", + " + \" et \"\n", + " + correlations_num.columns[j]\n", + " + \" sont trop dépendantes, corr = \"\n", + " + str(correlations_num.iloc[i, j])\n", + " )\n" ] }, { @@ -2764,15 +2713,616 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 206, "id": "b8530717", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "CONTRAT_ANCIENNETE_(0,1]", + "rawType": "float64", + "type": "float" + }, + { + "name": "CONTRAT_ANCIENNETE_(1,2]", + "rawType": "float64", + "type": "float" + }, + { + "name": "CONTRAT_ANCIENNETE_(2,5]", + "rawType": "float64", + "type": "float" + }, + { + "name": "CONTRAT_ANCIENNETE_(5,10]", + "rawType": "float64", + "type": "float" + }, + { + "name": "FREQUENCE_PAIEMENT_COTISATION_MENSUEL", + "rawType": "float64", + "type": "float" + }, + { + "name": "FREQUENCE_PAIEMENT_COTISATION_TRIMESTRIEL", + "rawType": "float64", + "type": "float" + }, + { + "name": "GROUPE_KM_[20000;40000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "GROUPE_KM_[40000;60000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "GROUPE_KM_[60000;99999[", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_B", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_C", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_D", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_E", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_F", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_G", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_H", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_I", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_J", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_K", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_L", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_M", + "rawType": "float64", + "type": "float" + }, + { + "name": "ZONE_RISQUE_T", + "rawType": "float64", + "type": "float" + }, + { + "name": "GENRE_M", + "rawType": "float64", + "type": "float" + }, + { + "name": "DEUXIEME_CONDUCTEUR_True", + "rawType": "float64", + "type": "float" + }, + { + "name": "ENERGIE_DIESEL", + "rawType": "float64", + "type": "float" + }, + { + "name": "ENERGIE_ESSENCE", + "rawType": "float64", + "type": "float" + }, + { + "name": "EQUIPEMENT_SECURITE_VRAI", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN_[10000;15000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN_[15000;20000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN_[20000;25000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN_[25000;35000[", + "rawType": "float64", + "type": "float" + }, + { + "name": "VALEUR_DU_BIEN_[35000;99999[", + "rawType": "float64", + "type": "float" + } + ], + "ref": "a62943ce-0b7b-4ed1-9ec2-fe8c4868e843", + "rows": [ + [ + "0", + "1.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0" + ], + [ + "1", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0" + ], + [ + "2", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "3", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0" + ], + [ + "4", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "1.0", + "0.0", + "0.0", + "0.0", + "0.0", + "1.0", + "0.0" + ] + ], + "shape": { + "columns": 32, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CONTRAT_ANCIENNETE_(0,1]CONTRAT_ANCIENNETE_(1,2]CONTRAT_ANCIENNETE_(2,5]CONTRAT_ANCIENNETE_(5,10]FREQUENCE_PAIEMENT_COTISATION_MENSUELFREQUENCE_PAIEMENT_COTISATION_TRIMESTRIELGROUPE_KM_[20000;40000[GROUPE_KM_[40000;60000[GROUPE_KM_[60000;99999[ZONE_RISQUE_B...GENRE_MDEUXIEME_CONDUCTEUR_TrueENERGIE_DIESELENERGIE_ESSENCEEQUIPEMENT_SECURITE_VRAIVALEUR_DU_BIEN_[10000;15000[VALEUR_DU_BIEN_[15000;20000[VALEUR_DU_BIEN_[20000;25000[VALEUR_DU_BIEN_[25000;35000[VALEUR_DU_BIEN_[35000;99999[
01.00.00.00.01.00.00.00.00.00.0...1.00.00.01.01.00.01.00.00.00.0
10.00.00.00.01.00.01.00.00.00.0...1.01.00.00.00.00.00.00.00.01.0
20.00.00.00.01.00.01.00.00.00.0...1.00.00.01.01.00.00.00.00.00.0
30.00.00.00.01.00.01.00.00.01.0...1.00.01.00.00.00.01.00.00.00.0
40.01.00.00.01.00.01.00.00.00.0...1.00.00.01.00.00.00.00.01.00.0
\n", + "

5 rows × 32 columns

\n", + "
" + ], + "text/plain": [ + " CONTRAT_ANCIENNETE_(0,1] CONTRAT_ANCIENNETE_(1,2] \\\n", + "0 1.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 1.0 \n", + "\n", + " CONTRAT_ANCIENNETE_(2,5] CONTRAT_ANCIENNETE_(5,10] \\\n", + "0 0.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " FREQUENCE_PAIEMENT_COTISATION_MENSUEL \\\n", + "0 1.0 \n", + "1 1.0 \n", + "2 1.0 \n", + "3 1.0 \n", + "4 1.0 \n", + "\n", + " FREQUENCE_PAIEMENT_COTISATION_TRIMESTRIEL GROUPE_KM_[20000;40000[ \\\n", + "0 0.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 1.0 \n", + "3 0.0 1.0 \n", + "4 0.0 1.0 \n", + "\n", + " GROUPE_KM_[40000;60000[ GROUPE_KM_[60000;99999[ ZONE_RISQUE_B ... \\\n", + "0 0.0 0.0 0.0 ... \n", + "1 0.0 0.0 0.0 ... \n", + "2 0.0 0.0 0.0 ... \n", + "3 0.0 0.0 1.0 ... \n", + "4 0.0 0.0 0.0 ... \n", + "\n", + " GENRE_M DEUXIEME_CONDUCTEUR_True ENERGIE_DIESEL ENERGIE_ESSENCE \\\n", + "0 1.0 0.0 0.0 1.0 \n", + "1 1.0 1.0 0.0 0.0 \n", + "2 1.0 0.0 0.0 1.0 \n", + "3 1.0 0.0 1.0 0.0 \n", + "4 1.0 0.0 0.0 1.0 \n", + "\n", + " EQUIPEMENT_SECURITE_VRAI VALEUR_DU_BIEN_[10000;15000[ \\\n", + "0 1.0 0.0 \n", + "1 0.0 0.0 \n", + "2 1.0 0.0 \n", + "3 0.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " VALEUR_DU_BIEN_[15000;20000[ VALEUR_DU_BIEN_[20000;25000[ \\\n", + "0 1.0 0.0 \n", + "1 0.0 0.0 \n", + "2 0.0 0.0 \n", + "3 1.0 0.0 \n", + "4 0.0 0.0 \n", + "\n", + " VALEUR_DU_BIEN_[25000;35000[ VALEUR_DU_BIEN_[35000;99999[ \n", + "0 0.0 0.0 \n", + "1 0.0 1.0 \n", + "2 0.0 0.0 \n", + "3 0.0 0.0 \n", + "4 1.0 0.0 \n", + "\n", + "[5 rows x 32 columns]" + ] + }, + "execution_count": 206, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "encoder = preproc.OneHotEncoder()\n", - "encoder.fit(vars_categorielles)\n", - "vars_categorielles_enc = encoder.transform(vars_categorielles)\n", - "vars_categorielles_enc = pd.DataFrame(vars_categorielles_enc.toarray(), columns=encoder.get_feature_names_out(vars_categorielles.columns))" + "# One hot encoding des variables catégorielles\n", + "preproc_ohe = preproc.OneHotEncoder(handle_unknown=\"ignore\")\n", + "preproc_ohe = preproc.OneHotEncoder(drop=\"first\", sparse_output=False).fit(\n", + " vars_categorielles\n", + ")\n", + "\n", + "variables_categorielles_ohe = preproc_ohe.transform(vars_categorielles)\n", + "variables_categorielles_ohe = pd.DataFrame(\n", + " variables_categorielles_ohe,\n", + " columns=preproc_ohe.get_feature_names_out(vars_categorielles.columns),\n", + ")\n", + "variables_categorielles_ohe.head()" ] }, { @@ -2785,15 +3335,172 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 207, "id": "4ff3847d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "ANNEE_CTR", + "rawType": "float64", + "type": "float" + }, + { + "name": "AGE_ASSURE_PRINCIPAL", + "rawType": "float64", + "type": "float" + }, + { + "name": "ANCIENNETE_PERMIS", + "rawType": "float64", + "type": "float" + }, + { + "name": "ANNEE_CONSTRUCTION", + "rawType": "float64", + "type": "float" + } + ], + "ref": "012814d2-2bb4-463c-b907-53ba71631da2", + "rows": [ + [ + "0", + "0.40615626262983295", + "-0.31764836563527515", + "0.067767057718506", + "0.5653698304986595" + ], + [ + "1", + "1.06626032654885", + "-1.2596885906311412", + "-1.1719751563806404", + "0.8816391722032739" + ], + [ + "2", + "0.40615626262983295", + "-1.839405652167059", + "-1.740190337842749", + "0.5653698304986595" + ], + [ + "3", + "0.40615626262983295", + "-0.31764836563527515", + "0.48101446241822143", + "0.8816391722032739" + ], + [ + "4", + "-0.25394780128918387", + "-1.7669410194750692", + "-1.2752870075555691", + "-0.38343819461518397" + ] + ], + "shape": { + "columns": 4, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ANNEE_CTRAGE_ASSURE_PRINCIPALANCIENNETE_PERMISANNEE_CONSTRUCTION
00.406156-0.3176480.0677670.565370
11.066260-1.259689-1.1719750.881639
20.406156-1.839406-1.7401900.565370
30.406156-0.3176480.4810140.881639
4-0.253948-1.766941-1.275287-0.383438
\n", + "
" + ], + "text/plain": [ + " ANNEE_CTR AGE_ASSURE_PRINCIPAL ANCIENNETE_PERMIS ANNEE_CONSTRUCTION\n", + "0 0.406156 -0.317648 0.067767 0.565370\n", + "1 1.066260 -1.259689 -1.171975 0.881639\n", + "2 0.406156 -1.839406 -1.740190 0.565370\n", + "3 0.406156 -0.317648 0.481014 0.881639\n", + "4 -0.253948 -1.766941 -1.275287 -0.383438" + ] + }, + "execution_count": 207, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "scaler = preproc.StandardScaler()\n", - "scaler.fit(vars_numeriques)\n", - "vars_numeriques_scaled = scaler.transform(vars_numeriques)\n", - "vars_numeriques_scaled = pd.DataFrame(vars_numeriques_scaled, columns=vars_numeriques.columns)" + "# Normalisation des varibales numériques\n", + "preproc_scale = preproc.StandardScaler(with_mean=True, with_std=True)\n", + "preproc_scale.fit(vars_numeriques)\n", + "\n", + "vars_numeriques_scaled = preproc_scale.transform(vars_numeriques)\n", + "vars_numeriques_scaled = pd.DataFrame(\n", + " vars_numeriques_scaled, columns=vars_numeriques.columns\n", + ")\n", + "vars_numeriques_scaled.head()\n" ] }, { @@ -2814,14 +3521,28 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 208, "id": "6a1c7907", "metadata": {}, "outputs": [], "source": [ - "X = data_model_preprocessed = vars_numeriques_scaled.merge(vars_categorielles_enc, left_index=True, right_index=True)\n", + "X_global = vars_numeriques_scaled.merge(\n", + " variables_categorielles_ohe, left_index=True, right_index=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "58a14153", + "metadata": {}, + "outputs": [], + "source": [ + "# Réorganisation des données\n", + "X = X_global.to_numpy()\n", "Y = data_model[\"CM\"]\n", "\n", + "# Sampling en 80% train et 20% test\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X, Y, test_size=0.2, random_state=42\n", ")" @@ -2845,442 +3566,16 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 210, "id": "053e013c", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "DecisionTreeRegressor()" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "tree = DecisionTreeRegressor()\n", - "tree.fit(X_train, y_train)" + "# Initialisation de l'objet\n", + "model_CART = DecisionTreeRegressor()\n", + "\n", + "# Train Decision Tree Classifer\n", + "model_CART = model_CART.fit(X_train, y_train)" ] }, { @@ -3293,7 +3588,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 211, "id": "c4ca2cf9", "metadata": {}, "outputs": [ @@ -3309,7 +3604,7 @@ ], "source": [ "# Prédictions sur l'ensemble d'entraînement\n", - "y_pred_train = tree.predict(X_train)\n", + "y_pred_train = model_CART.predict(X_train)\n", "\n", "mae = metrics.mean_absolute_error(y_train, y_pred_train)\n", "mse = metrics.mean_squared_error(y_train, y_pred_train)\n", @@ -3322,7 +3617,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 212, "id": "4b739d5b", "metadata": {}, "outputs": [ @@ -3330,14 +3625,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "MAE: 5969.32\n", - "MSE: 161922043.77\n", - "RMSE: 12724.86\n" + "MAE: 5186.37\n", + "MSE: 94029342.74\n", + "RMSE: 9696.87\n" ] } ], "source": [ - "y_pred_test = tree.predict(X_test)\n", + "y_pred_test = model_CART.predict(X_test)\n", "\n", "mae = metrics.mean_absolute_error(y_test, y_pred_test)\n", "mse = metrics.mean_squared_error(y_test, y_pred_test)\n", @@ -3408,17 +3703,18 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 213, "id": "ab1e1367", "metadata": {}, "outputs": [], "source": [ - "X = data_model_preprocessed\n", - "Y = data_model[\"CM\"]\n", + "X_global = vars_numeriques_scaled.merge(\n", + " variables_categorielles_ohe, left_index=True, right_index=True\n", + ")\n", "\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, Y, test_size=0.2, random_state=42\n", - ")" + "# Réorganisation des données\n", + "X = X_global.to_numpy()\n", + "Y = np.array(data_model[\"CM\"])" ] }, { @@ -3439,7 +3735,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 214, "id": "b515460e", "metadata": {}, "outputs": [], @@ -3462,41 +3758,35 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 215, "id": "eebb394f", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Validation croisée terminée avec 5 folds\n" - ] - } - ], + "outputs": [], "source": [ "# Entrainement avec cross-validation\n", - "for train_index, val_index in kf.split(X_train):\n", - " X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]\n", - " y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]\n", + "for train_index, test_index in kf.split(X):\n", + " X_train, X_test = X[train_index], X[test_index]\n", + " y_train, y_test = Y[train_index], Y[test_index]\n", "\n", - " rf_regressor.fit(X_train_fold, y_train_fold)\n", - " y_pred_fold = rf_regressor.predict(X_val_fold)\n", + " # Fitting\n", + " rf_regressor.fit(X_train, y_train)\n", "\n", - " mae = metrics.mean_absolute_error(y_val_fold, y_pred_fold)\n", - " mse = metrics.mean_squared_error(y_val_fold, y_pred_fold)\n", - " rmse = metrics.root_mean_squared_error(y_val_fold, y_pred_fold)\n", + " # Evaluation du modèle\n", + " y_pred_test = rf_regressor.predict(X_test)\n", "\n", - " MAE_scores.append(mae)\n", - " MSE_scores.append(mse)\n", - " RMSE_scores.append(rmse)\n", + " MAE = metrics.mean_absolute_error(y_test, y_pred_test)\n", + " MSE = metrics.mean_squared_error(y_test, y_pred_test)\n", + " RMSE = metrics.root_mean_squared_error(y_test, y_pred_test)\n", "\n", - "print(f\"Validation croisée terminée avec {len(MAE_scores)} folds\")" + " # Concaténation des résultats\n", + " MAE_scores.append(MAE)\n", + " MSE_scores.append(MSE)\n", + " RMSE_scores.append(RMSE)\n" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 216, "id": "b067126c", "metadata": {}, "outputs": [ @@ -3504,11 +3794,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fold 1 MAE: 4472.5486946969695\n", - "Fold 2 MAE: 3859.4743234848484\n", - "Fold 3 MAE: 3633.0231541666662\n", - "Fold 4 MAE: 3888.3910715909087\n", - "Fold 5 MAE: 4808.59621832061\n" + "Fold 1 MAE: 4007.8326951515155\n", + "Fold 2 MAE: 3651.8632978787878\n", + "Fold 3 MAE: 4718.226707878788\n", + "Fold 4 MAE: 4031.310562727273\n", + "Fold 5 MAE: 4410.05992957317\n" ] } ], @@ -3522,7 +3812,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 217, "id": "6597152c", "metadata": {}, "outputs": [ @@ -3530,11 +3820,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fold 1 MSE: 85464414.44080053\n", - "Fold 2 MSE: 34396997.21755034\n", - "Fold 3 MSE: 55184512.50786593\n", - "Fold 4 MSE: 33191300.80751679\n", - "Fold 5 MSE: 68739370.63588645\n" + "Fold 1 MSE: 32761893.668576293\n", + "Fold 2 MSE: 50894497.0512714\n", + "Fold 3 MSE: 106861487.03512044\n", + "Fold 4 MSE: 35487273.569623545\n", + "Fold 5 MSE: 54729524.04672807\n" ] } ], @@ -3546,7 +3836,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 218, "id": "63ff1c9d", "metadata": {}, "outputs": [ @@ -3554,11 +3844,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "Fold 1 RMSE: 9244.696557529649\n", - "Fold 2 RMSE: 5864.895328780415\n", - "Fold 3 RMSE: 7428.62790210049\n", - "Fold 4 RMSE: 5761.189183451346\n", - "Fold 5 RMSE: 8290.9209763383\n" + "Fold 1 RMSE: 5723.8006314490285\n", + "Fold 2 RMSE: 7134.037920509772\n", + "Fold 3 RMSE: 10337.38298773536\n", + "Fold 4 RMSE: 5957.119569861222\n", + "Fold 5 RMSE: 7397.940527385177\n" ] } ], @@ -3594,17 +3884,17 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 219, "id": "d9342ad6", "metadata": {}, "outputs": [], "source": [ - "X = data_model_preprocessed\n", - "Y = data_model[\"CM\"]\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(\n", - " X, Y, test_size=0.2, random_state=42\n", - ")" + "X_global = vars_numeriques_scaled.merge(\n", + " variables_categorielles_ohe, left_index=True, right_index=True\n", + ")\n", + "# Réorganisation des données\n", + "X = X_global.to_numpy()\n", + "Y = np.array(data_model[\"CM\"])" ] }, { @@ -3625,11 +3915,12 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 220, "id": "6d58dbc2", "metadata": {}, "outputs": [], "source": [ + "# Initialisation\n", "# Nombre de sous-échantillons pour la cross-validation\n", "num_splits = 5\n", "\n", @@ -3642,9 +3933,9 @@ "RMSE_scores = []\n", "\n", "# Hyperparamètres à tester\n", - "n_estimators_values = [50, 100, 200, 300]\n", - "max_depth_values = [2, 5, 10]\n", - "min_samples_split_values = [2, 5, 10]\n", + "n_estimators_values = [60, 65, 70, 75]\n", + "max_depth_values = [None, 1, 2, 3]\n", + "min_samples_split_values = [5, 8, 10, 11, 13, 14, 15]\n", "\n", "# Liste pour sauveagrder les meilleurs résultats\n", "best_score = np.inf\n", @@ -3657,58 +3948,71 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 221, "id": "47da5172", "metadata": {}, "outputs": [], "source": [ + "# grid search à la main\n", "for n_estimators in n_estimators_values:\n", " for max_depth in max_depth_values:\n", " for min_samples_split in min_samples_split_values:\n", - " rf_regressor = RandomForestRegressor(\n", - " n_estimators=n_estimators,\n", - " max_depth=max_depth,\n", - " min_samples_split=min_samples_split,\n", - " random_state=42\n", - " )\n", + " # Réinitialisation des résultats\n", " MAE_scores = []\n", " MSE_scores = []\n", " RMSE_scores = []\n", "\n", - " for train_index, val_index in kf.split(X_train):\n", - " X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]\n", - " y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]\n", + " # Boucle de Cross-Validation\n", + " for train_index, test_index in kf.split(X):\n", + " X_train, X_test = X[train_index], X[test_index]\n", + " y_train, y_test = Y[train_index], Y[test_index]\n", "\n", - " rf_regressor.fit(X_train_fold, y_train_fold)\n", - " y_pred_fold = rf_regressor.predict(X_val_fold)\n", + " # Modèle avec hyperparamètres actuels\n", + " rf_regressor = RandomForestRegressor(\n", + " n_estimators = n_estimators,\n", + " max_depth = max_depth,\n", + " min_samples_split = min_samples_split,\n", + " random_state = 42,\n", + " )\n", "\n", - " mae = metrics.mean_absolute_error(y_val_fold, y_pred_fold)\n", - " mse = metrics.mean_squared_error(y_val_fold, y_pred_fold)\n", - " rmse = metrics.root_mean_squared_error(y_val_fold, y_pred_fold)\n", + " rf_regressor.fit(X_train, y_train)\n", "\n", - " MAE_scores.append(mae)\n", - " MSE_scores.append(mse)\n", - " RMSE_scores.append(rmse)\n", + " # Evaluation du modèle\n", + " y_pred_test = rf_regressor.predict(X_test)\n", "\n", - " avg_mae = np.mean(MAE_scores)\n", - " avg_mse = np.mean(MSE_scores)\n", - " avg_rmse = np.mean(RMSE_scores)\n", + " MAE = metrics.mean_absolute_error(y_test, y_pred_test)\n", + " MSE = metrics.mean_squared_error(y_test, y_pred_test)\n", + " RMSE = metrics.root_mean_squared_error(y_test, y_pred_test)\n", "\n", - " if avg_rmse < best_score:\n", - " best_score = avg_rmse\n", + " # Concaténation des résultats\n", + " MAE_scores.append(MAE)\n", + " MSE_scores.append(MSE)\n", + " RMSE_scores.append(RMSE)\n", + "\n", + " # Calcul du meilleur score pour le jeu de paramètres\n", + " min_rmse = np.min(RMSE_scores)\n", + "\n", + " # Mise à jour du meilleur score si besoin\n", + " if min_rmse < best_score:\n", + " best_score = min_rmse\n", " best_params = {\n", - " 'n_estimators': n_estimators,\n", - " 'max_depth': max_depth,\n", - " 'min_samples_split': min_samples_split\n", + " \"n_estimators\": n_estimators,\n", + " \"max_depth\": max_depth,\n", + " \"min_samples_split\": min_samples_split,\n", " }\n", + "\n", + " # Sauvegarde des scores pour analyse\n", " MAE_best_score = MAE_scores\n", " MSE_best_score = MSE_scores\n", - " RMSE_best_score = RMSE_scores" + " RMSE_best_score = RMSE_scores\n", + "\n", + " # Sauvegarde du modèle pour l'utiliser directement\n", + " best_model_regressor = rf_regressor\n" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 222, "id": "d4936c46", "metadata": {}, "outputs": [ @@ -3716,8 +4020,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "Meilleurs paramètres: {}\n", - "Meilleure RMSE : inf\n" + "Meilleurs paramètres: {'n_estimators': 65, 'max_depth': 1, 'min_samples_split': 5}\n", + "Meilleure RMSE : 4548.156488811854\n" ] } ], @@ -3729,10 +4033,22 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 223, "id": "3215c463", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 RMSE: 5168.96443207593\n", + "Fold 2 RMSE: 6779.919772901815\n", + "Fold 3 RMSE: 10081.628056733409\n", + "Fold 4 RMSE: 4548.156488811854\n", + "Fold 5 RMSE: 6713.822743503048\n" + ] + } + ], "source": [ "# Métriques sur tous les folds\n", "\n", @@ -3743,10 +4059,22 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 224, "id": "bb9a5c9b", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 MSE: 26718193.300066035\n", + "Fold 2 MSE: 45967312.126985006\n", + "Fold 3 MSE: 101639224.27431424\n", + "Fold 4 MSE: 20685727.446721368\n", + "Fold 5 MSE: 45075415.831178784\n" + ] + } + ], "source": [ "#MAE\n", "for fold, mse in enumerate(MSE_best_score, start=1):\n", @@ -3755,10 +4083,22 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 225, "id": "0f0768ad", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fold 1 MAE: 3516.8014139306597\n", + "Fold 2 MAE: 3209.253810522964\n", + "Fold 3 MAE: 4545.1440942571835\n", + "Fold 4 MAE: 3088.226098509521\n", + "Fold 5 MAE: 3576.4647056529234\n" + ] + } + ], "source": [ "#MSE\n", "for fold, mae in enumerate(MAE_best_score, start=1):\n", @@ -3772,6 +4112,220 @@ "source": [ "**Question :** Commentez les résultats" ] + }, + { + "cell_type": "markdown", + "id": "bd1e91ee", + "metadata": {}, + "source": [ + "### Implémentation avec les librairies existantes" + ] + }, + { + "cell_type": "code", + "execution_count": 226, + "id": "4b8cc48d", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import GridSearchCV, KFold" + ] + }, + { + "cell_type": "code", + "execution_count": 227, + "id": "f0e5d591", + "metadata": {}, + "outputs": [], + "source": [ + "#Sampling en 80% train et 20% test\n", + "X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "id": "71177a63", + "metadata": {}, + "outputs": [], + "source": [ + "# Supposons que vous ayez des données d'entraînement X_train et y_train\n", + "\n", + "# Définir la grille d'hyperparamètres à rechercher\n", + "param_grid = {\n", + " \"n_estimators\": [60, 65, 70, 75],\n", + " \"max_depth\": [None, 1, 2, 3],\n", + " \"min_samples_split\": [5, 8, 10, 11, 13, 14, 15],\n", + "}\n", + "# Nombre de folds pour la validation croisée\n", + "num_folds = 5" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "id": "e463b9d7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Meilleurs hyperparamètres : {'max_depth': 1, 'min_samples_split': 5, 'n_estimators': 60}\n" + ] + } + ], + "source": [ + "# Initialisation du modèle RandomForestRegressor\n", + "rf = RandomForestRegressor(random_state=42)\n", + "\n", + "# Création de l'objet GridSearchCV pour la recherche sur grille avec validation croisée\n", + "grid_search = GridSearchCV(\n", + " estimator = rf,\n", + " param_grid = param_grid,\n", + " cv = KFold(\n", + " n_splits = num_folds, shuffle = True, random_state = 42\n", + " ), # Validation croisée avec 5 folds\n", + " scoring = \"neg_mean_squared_error\", # Métrique d'évaluation (moins c'est mieux)\n", + " n_jobs = -1, # Utiliser tous les cœurs du processeur\n", + ")\n", + "\n", + "# Exécution de la recherche sur grille\n", + "grid_search.fit(X_train, y_train)\n", + "\n", + "# Afficher les meilleurs hyperparamètres\n", + "best_params = grid_search.best_params_\n", + "print(\"Meilleurs hyperparamètres : \", best_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 230, + "id": "d1b84e91", + "metadata": {}, + "outputs": [], + "source": [ + "# Initialiser le modèle final avec les meilleurs hyperparamètres\n", + "best_rf = RandomForestRegressor(random_state = 42, **best_params)" + ] + }, + { + "cell_type": "code", + "execution_count": 231, + "id": "c46d32a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE pour le fold 1: -8836.353449486982\n", + "RMSE pour le fold 2: -5242.128416843558\n", + "RMSE pour le fold 3: -7205.432382938018\n", + "RMSE pour le fold 4: -4902.177844748944\n", + "RMSE pour le fold 5: -7707.687751500834\n", + "\n", + "\n", + "MSE pour le fold 1: -78081142.28426048\n", + "MSE pour le fold 2: -27479910.338678744\n", + "MSE pour le fold 3: -51918255.825091854\n", + "MSE pour le fold 4: -24031347.6215474\n", + "MSE pour le fold 5: -59408450.47463598\n", + "\n", + "\n", + "MAE pour le fold 1: -4047.520107345083\n", + "MAE pour le fold 2: -3389.6166968886077\n", + "MAE pour le fold 3: -3373.620497619359\n", + "MAE pour le fold 4: -3186.2100657449696\n", + "MAE pour le fold 5: -4145.078817961569\n" + ] + } + ], + "source": [ + "# Cross validation\n", + "# RMSE de chaque fold\n", + "rmse_scores = cross_val_score(\n", + " best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_root_mean_squared_error\"\n", + ")\n", + "\n", + "# Afficher les scores pour chaque fold\n", + "for i, score in enumerate(rmse_scores):\n", + " print(f\"RMSE pour le fold {i + 1}: {score}\")\n", + "\n", + "# MSE de chaque fold\n", + "mse_scores = cross_val_score(\n", + " best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_squared_error\"\n", + ")\n", + "\n", + "# Afficher les scores pour chaque fold\n", + "print(\"\\n\")\n", + "for i, score in enumerate(mse_scores):\n", + " print(f\"MSE pour le fold {i + 1}: {score}\")\n", + "\n", + "# MAE de chaque fold\n", + "mae_scores = cross_val_score(\n", + " best_rf, X_train, y_train, cv=num_folds, scoring=\"neg_mean_absolute_error\"\n", + ")\n", + "\n", + "# Afficher les scores pour chaque fold\n", + "print(\"\\n\")\n", + "for i, score in enumerate(mae_scores):\n", + " print(f\"MAE pour le fold {i + 1}: {score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "id": "3ba2274c", + "metadata": {}, + "outputs": [], + "source": [ + "# Entraîner le modèle final sur toute la base\n", + "best_rf.fit(X_train, y_train)\n", + "\n", + "# Faire des prédictions sur l'ensemble de test\n", + "y_pred = best_rf.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "id": "ec717a0c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "RMSE : 6792.775060864194\n", + "MSE : 46141793.02749855\n", + "MAE : 3387.6746891178996\n" + ] + } + ], + "source": [ + "# Calculer la métrique de performance (dans ce cas, RMSE)\n", + "rmse = metrics.root_mean_squared_error(y_test, y_pred)\n", + "print(f\"RMSE : {rmse}\")\n", + "\n", + "# Calculer la métrique de performance (dans ce cas, MSE)\n", + "mse = metrics.mean_squared_error(y_test, y_pred)\n", + "print(f\"MSE : {mse}\")\n", + "\n", + "# Calculer la métrique de performance (dans ce cas, MAE)\n", + "mae = metrics.mean_absolute_error(y_test, y_pred)\n", + "print(f\"MAE : {mae}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "001baf7d", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {