mirror of
https://github.com/ArthurDanjou/breast-cancer-detection.git
synced 2026-01-14 13:54:06 +01:00
330 lines
87 KiB
Plaintext
330 lines
87 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "8797cb42",
|
|
"metadata": {},
|
|
"source": [
|
|
"# Étape 1 — Chargement et exploration initiale"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "391c54d2",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"(116, 10)\n",
|
|
" Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin \\\n",
|
|
"0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 \n",
|
|
"1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 \n",
|
|
"2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 \n",
|
|
"3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 \n",
|
|
"4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 \n",
|
|
"\n",
|
|
" MCP.1 Classification \n",
|
|
"0 417.114 1 \n",
|
|
"1 468.786 1 \n",
|
|
"2 554.697 1 \n",
|
|
"3 928.220 1 \n",
|
|
"4 773.920 1 \n",
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
"RangeIndex: 116 entries, 0 to 115\n",
|
|
"Data columns (total 10 columns):\n",
|
|
" # Column Non-Null Count Dtype \n",
|
|
"--- ------ -------------- ----- \n",
|
|
" 0 Age 116 non-null int64 \n",
|
|
" 1 BMI 116 non-null float64\n",
|
|
" 2 Glucose 116 non-null int64 \n",
|
|
" 3 Insulin 116 non-null float64\n",
|
|
" 4 HOMA 116 non-null float64\n",
|
|
" 5 Leptin 116 non-null float64\n",
|
|
" 6 Adiponectin 116 non-null float64\n",
|
|
" 7 Resistin 116 non-null float64\n",
|
|
" 8 MCP.1 116 non-null float64\n",
|
|
" 9 Classification 116 non-null int64 \n",
|
|
"dtypes: float64(7), int64(3)\n",
|
|
"memory usage: 9.2 KB\n"
|
|
]
|
|
},
|
|
{
|
|
"data": {
|
|
"image/png": "",
|
|
"text/plain": [
|
|
"<Figure size 1200x800 with 12 Axes>"
|
|
]
|
|
},
|
|
"metadata": {},
|
|
"output_type": "display_data"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Import des bibliothèques\n",
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"import seaborn as sns\n",
|
|
"\n",
|
|
"# Chargement des données\n",
|
|
"data = pd.read_csv('dataR2.csv')\n",
|
|
"\n",
|
|
"# Aperçu des données\n",
|
|
"print(data.shape)\n",
|
|
"print(data.head())\n",
|
|
"data.info()\n",
|
|
"\n",
|
|
"# Liste des variables explicatives\n",
|
|
"features = data.columns[:-1]\n",
|
|
"target = 'Classification'\n",
|
|
"\n",
|
|
"# Histogrammes des variables\n",
|
|
"data.hist(bins=50, figsize=(12, 8))\n",
|
|
"plt.suptitle(\"Histogrammes des variables\", fontsize=16)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "a862efa5",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Definition of X and y\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "c4cb9164",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"X = data.drop(columns='Classification')\n",
|
|
"y = data['Classification']\n",
|
|
"Y_binarized = (y == 2).astype(int)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "446a3793",
|
|
"metadata": {},
|
|
"source": [
|
|
"## NAIVE BAYES"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "1af36855",
|
|
"metadata": {},
|
|
"source": [
|
|
"### Why do we use a Gaussian NB?"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "d0cd967c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"We cannot reject H0: The column Age follows a normal distribution as p= 0.008932850613695056 > 0.05\n",
|
|
"We cannot reject H0: The column BMI follows a normal distribution as p= 0.00765341311315994 > 0.05\n",
|
|
"We cannot reject H0: The column Glucose follows a normal distribution as p= 1.2014716303695527e-12 > 0.05\n",
|
|
"We cannot reject H0: The column Insulin follows a normal distribution as p= 1.5076048136113956e-14 > 0.05\n",
|
|
"We cannot reject H0: The column HOMA follows a normal distribution as p= 4.300187278780875e-17 > 0.05\n",
|
|
"We cannot reject H0: The column Leptin follows a normal distribution as p= 1.1868019935827926e-08 > 0.05\n",
|
|
"We cannot reject H0: The column Adiponectin follows a normal distribution as p= 2.764933584842159e-10 > 0.05\n",
|
|
"We cannot reject H0: The column Resistin follows a normal distribution as p= 6.705226955771421e-13 > 0.05\n",
|
|
"We cannot reject H0: The column MCP.1 follows a normal distribution as p= 5.076835730086118e-08 > 0.05\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from scipy.stats import shapiro\n",
|
|
"\n",
|
|
"for col in X.columns:\n",
|
|
" stat, p = shapiro(X[col])\n",
|
|
" if p>0.05:\n",
|
|
" print(f\"We cannot reject H0: The column {col} follows a normal distribution as p= {p} > 0.05\")\n",
|
|
" else:\n",
|
|
" print(f\"We reject H0: The column {col} does not follow a normal distribution as p= {p} <= 0.05\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "74cef382",
|
|
"metadata": {},
|
|
"source": [
|
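|
"Every p-value above is below 0.05, so the Shapiro-Wilk test rejects normality for all features. We nevertheless use a Gaussian Naive Bayes method, keeping in mind that its normality assumption is not strictly met."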
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "a2e1af5c",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Naive Bayes Classification Accuracy: 0.7586\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.naive_bayes import GaussianNB\n",
|
|
"from sklearn.metrics import accuracy_score\n",
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
"\n",
|
|
"# Binarize the dataset: Convert classification values to {0, 1}\n",
|
|
"Y_binarized = (y == 2).astype(int)\n",
|
|
"\n",
|
|
"# Split into training and test sets\n",
|
|
"X_train, X_test, y_train, y_test = train_test_split(X, Y_binarized, test_size=0.25, random_state=42)\n",
|
|
"\n",
|
|
"# Initialize and train a Naive Bayes classifier\n",
|
|
"nb_classifier = GaussianNB()\n",
|
|
"nb_classifier.fit(X_train, y_train)\n",
|
|
"\n",
|
|
"# Make predictions on the test set\n",
|
|
"y_pred = nb_classifier.predict(X_test)\n",
|
|
"\n",
|
|
"# Compute accuracy\n",
|
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|
"print(f\"Naive Bayes Classification Accuracy: {accuracy:.4f}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "96408f32",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Évaluation du modèle et Cross-validation"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"id": "d0267905",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Accuracy : 0.759\n",
|
|
"F1-score : 0.696\n",
|
|
"\n",
|
|
"Classification Report :\n",
|
|
" precision recall f1-score support\n",
|
|
"\n",
|
|
" 0 0.70 0.93 0.80 15\n",
|
|
" 1 0.89 0.57 0.70 14\n",
|
|
"\n",
|
|
" accuracy 0.76 29\n",
|
|
" macro avg 0.79 0.75 0.75 29\n",
|
|
"weighted avg 0.79 0.76 0.75 29\n",
|
|
"\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.metrics import f1_score, classification_report\n",
|
|
"\n",
|
|
"# Accuracy\n",
|
|
"accuracy = accuracy_score(y_test, y_pred)\n",
|
|
"print(f\"Accuracy : {accuracy:.3f}\")\n",
|
|
"\n",
|
|
"# F1-score (binaire par défaut)\n",
|
|
"f1 = f1_score(y_test, y_pred)\n",
|
|
"print(f\"F1-score : {f1:.3f}\")\n",
|
|
"\n",
|
|
"# Rapport complet\n",
|
|
"print(\"\\nClassification Report :\")\n",
|
|
"print(classification_report(y_test, y_pred))\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"id": "7afddd86",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"Validation croisée (5 folds)\n",
|
|
"F1-score moyen : 0.496 ± 0.228\n",
|
|
"Accuracy moyen : 0.589 ± 0.153\n"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"from sklearn.model_selection import cross_val_score\n",
|
|
"\n",
|
|
"# Initialisation du modèle\n",
|
|
"model = GaussianNB()\n",
|
|
"\n",
|
|
"# Validation croisée avec scoring F1\n",
|
|
"f1_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')\n",
|
|
"acc_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')\n",
|
|
"\n",
|
|
"# Résumé\n",
|
|
"print(\"Validation croisée (5 folds)\")\n",
|
|
"print(f\"F1-score moyen : {f1_scores.mean():.3f} ± {f1_scores.std():.3f}\")\n",
|
|
"print(f\"Accuracy moyen : {acc_scores.mean():.3f} ± {acc_scores.std():.3f}\")\n"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "5e3cf2bb",
|
|
"metadata": {},
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "ff150ac7",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "3d724867",
|
|
"metadata": {},
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.9.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|