diff --git a/naive_bayes.ipynb b/naive_bayes.ipynb index 6f5b26c..60c1a44 100644 --- a/naive_bayes.ipynb +++ b/naive_bayes.ipynb @@ -532,95 +532,17 @@ "The data of the features is continuous and non-binary. As such, we use a GaussianNB, the performance will nevertheless suffer as the features aren't all Normal distributed and the dimension is rather small, we cannot suppose normal distribution through size." ] }, - { - "cell_type": "markdown", - "id": "71cc45b2", - "metadata": {}, - "source": [ - "## Cross-Validation" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "cff7e03a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Cross-validation (10 folds)\n", - "Average F1-score : 0.620 ± 0.217\n", - "Average accuracy : 0.655 ± 0.167\n", - "Average recall : 0.571 ± 0.262\n" - ] - }, - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from sklearn.model_selection import cross_val_score\n", - "from sklearn.naive_bayes import GaussianNB\n", - "\n", - "general_classifier = GaussianNB()\n", - "general_classifier.fit(X_scaled, y)\n", - "# 10 fold corss-validation on the entire (transformed dataset)\n", - "f1_scores = cross_val_score(general_classifier, X_scaled, y, cv=10, scoring='f1')\n", - "acc_scores = cross_val_score(general_classifier, X_scaled, y, cv=10, scoring='accuracy')\n", - "recall_scores = cross_val_score(general_classifier, X_scaled, y, cv=10, scoring=\"recall\")\n", - "\n", - "# Report\n", - "print(\"Cross-validation (10 folds)\")\n", - "print(f\"Average F1-score : {f1_scores.mean():.3f} ± {f1_scores.std():.3f}\")\n", - "print(f\"Average accuracy : {acc_scores.mean():.3f} ± {acc_scores.std():.3f}\")\n", - "print(f\"Average recall : {recall_scores.mean():.3f} ± {recall_scores.std():.3f}\")\n", - "\n", - "# Visualisation des scores par fold\n", - "folds = range(1, len(f1_scores) + 1)\n", - "plt.figure(figsize=(8, 5))\n", - "plt.plot(folds, f1_scores, marker='o', label='F1-score')\n", - "plt.plot(folds, acc_scores, marker='s', label='Accuracy')\n", - "plt.plot(folds, recall_scores, marker=\"v\", label=\"Recall\")\n", - "plt.title(\"Scores by fold (cross-validation)\")\n", - "plt.xlabel(\"Fold\")\n", - "plt.ylabel(\"Score\")\n", - "plt.ylim(0, 1)\n", - "plt.grid(True)\n", - "plt.legend()\n", - "plt.tight_layout()\n", - "plt.show()\n" - ] - }, - { - "cell_type": "markdown", - "id": "94d0dd72", - "metadata": {}, - "source": [ - "### Analysis\n", - "\n", - "The high standard deviations shows a certain sensibilty to the folds, that is probably due to the small sample size. This high disparity between the folds also shows on the graph." 
- ] - }, { "cell_type": "markdown", "id": "5fd9b3f0", "metadata": {}, "source": [ - "## Final evaluation on the dataset" + "## Train/Test split for the evaluation " ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "d58417ad", "metadata": {}, "outputs": [ @@ -639,7 +561,7 @@ "# Stratified split to conserve the distribution of the variables\n", "X_train, X_test, y_train, y_test = train_test_split(\n", " X_scaled, y,\n", - " test_size=0.3,\n", + " test_size=0.2,\n", " random_state=42,\n", " stratify=y)\n", "\n", @@ -657,12 +579,77 @@ "- **Training set** : 81 observations\n", "- **Test set** : 24 observations\n", "\n", - "This is a standard 70/30 split" + "This is a standard 80/20 split" + ] + }, + { + "cell_type": "markdown", + "id": "13a5621f", + "metadata": {}, + "source": [ + "## Cross-Validation" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, + "id": "3d50659c", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "from sklearn.naive_bayes import GaussianNB\n", + "\n", + "general_classifier = GaussianNB()\n", + "general_classifier.fit(X_train, y_train)\n", + "# 5-fold cross-validation on the training set only (the test set stays held out)\n", + "f1_scores = cross_val_score(general_classifier, X_train, y_train, cv=5, scoring='f1')\n", + "acc_scores = cross_val_score(general_classifier, X_train, y_train, cv=5, scoring='accuracy')\n", + "recall_scores = cross_val_score(general_classifier, X_train, y_train, cv=5, scoring=\"recall\")\n", + "\n", + "# Report\n", + "print(\"Cross-validation (5 folds)\")\n", + "print(f\"Average F1-score : {f1_scores.mean():.3f} ± {f1_scores.std():.3f}\")\n", + "print(f\"Average accuracy : {acc_scores.mean():.3f} ± {acc_scores.std():.3f}\")\n", + "print(f\"Average recall : {recall_scores.mean():.3f} ± {recall_scores.std():.3f}\")\n", + "\n", + "# Plot the scores obtained on each fold\n", + "folds = range(1, len(f1_scores) + 1)\n", + 
"plt.figure(figsize=(8, 5))\n", + "plt.plot(folds, f1_scores, marker='o', label='F1-score')\n", + "plt.plot(folds, acc_scores, marker='s', label='Accuracy')\n", + "plt.plot(folds, recall_scores, marker=\"v\", label=\"Recall\")\n", + "plt.title(\"Scores by fold (cross-validation)\")\n", + "plt.xlabel(\"Fold\")\n", + "plt.ylabel(\"Score\")\n", + "plt.ylim(0, 1)\n", + "plt.grid(True)\n", + "plt.legend()\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "c35db965", + "metadata": {}, + "source": [ + "### Analysis\n", + "\n", + "The high standard deviations show a certain sensitivity to the choice of folds, which is probably due to the small sample size. This large disparity between the folds is also visible on the graph." + ] + }, + { + "cell_type": "markdown", + "id": "b8891271", + "metadata": {}, + "source": [ + "## Final evaluation on the dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "d26b8326", "metadata": {}, "outputs": [ @@ -708,7 +695,6 @@ "# Make predictions on the test set\n", "y_pred = nb_classifier.predict(X_test)\n", "\n", - "\n", "# Accuracy\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"Accuracy : {accuracy:.3f}\")\n", diff --git a/neural_network.ipynb b/neural_network.ipynb index e4cd942..b57bdd1 100644 --- a/neural_network.ipynb +++ b/neural_network.ipynb @@ -191,9 +191,45 @@ " return model" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train/test split and scaling of the features" + ] + }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import f1_score, classification_report\n", + "import tensorflow as tf\n", + "import numpy as np\n", + "\n", + "# Splitting the dataset into training and testing sets\n", + "\n", + "X_train, X_test, y_train, 
y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", + "\n", + "# Scaling the features\n", + "scaler = StandardScaler()\n", + "X_train_scaled = scaler.fit_transform(X_train)\n", + "X_test_scaled = scaler.transform(X_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Cross validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, "outputs": [ { @@ -297,14 +333,9 @@ " verbose=1\n", ")\n", "\n", - "for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):\n", - " X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]\n", - " y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]\n", - "\n", - " # Standardisation\n", - " scaler = StandardScaler()\n", - " X_train_scaled = scaler.fit_transform(X_train)\n", - " X_val_scaled = scaler.transform(X_val)\n", + "for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), 1):\n", + " X_cv_train, X_cv_val = X_train_scaled[train_idx], X_train_scaled[val_idx]\n", + " y_cv_train, y_cv_val = y_train.iloc[train_idx], y_train.iloc[val_idx]\n", " \n", " model = build_model()\n", "\n", @@ -319,10 +350,10 @@ "\n", " # Entraînement\n", " history = model.fit(\n", - " X_train_scaled, y_train,\n", + " X_cv_train, y_cv_train,\n", " epochs=50,\n", " batch_size=8,\n", - " validation_data=(X_val_scaled, y_val),\n", + " validation_data=(X_cv_val, y_cv_val),\n", " callbacks=[callback],\n", " verbose=0,\n", " class_weight={0: 1.0, 1: 2.0}\n", @@ -331,8 +362,8 @@ " histories.append(history.history)\n", "\n", " # Prédiction & F1\n", - " y_pred_val = (model.predict(X_val_scaled) > 0.5).astype(int)\n", - " score = f1_score(y_val, y_pred_val)\n", + " y_pred_val = (model.predict(X_cv_val) > 0.5).astype(int)\n", + " score = f1_score(y_cv_val, y_pred_val)\n", " f1_scores.append(score)\n", " print(f\"Fold {fold} - F1-score : {score:.4f}\")\n", "\n", @@ -400,7 +431,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, 
"metadata": {}, "outputs": [ { @@ -430,18 +461,6 @@ } ], "source": [ - "from sklearn.model_selection import train_test_split\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.metrics import f1_score, classification_report\n", - "import tensorflow as tf\n", - "import numpy as np\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)\n", - "\n", - "scaler = StandardScaler()\n", - "X_train_scaled = scaler.fit_transform(X_train)\n", - "X_test_scaled = scaler.transform(X_test)\n", - "\n", "model = build_model()\n", "\n", "model.compile(\n",