breast-cancer-detection/knn.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "4e6f6cb1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from ucimlrepo import fetch_ucirepo \n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.neighbors import KNeighborsClassifier\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4dd5223b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c1ab7ec9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'uci_id': 451, 'name': 'Breast Cancer Coimbra', 'repository_url': 'https://archive.ics.uci.edu/dataset/451/breast+cancer+coimbra', 'data_url': 'https://archive.ics.uci.edu/static/public/451/data.csv', 'abstract': 'Clinical features were observed or measured for 64 patients with breast cancer and 52 healthy controls. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 116, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Classification'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C52P59', 'creators': ['Miguel Patrcio', 'Jos Pereira', 'Joana Crisstomo', 'Paulo Matafome', 'Raquel Seia', 'Francisco Caramelo'], 'intro_paper': {'ID': 431, 'type': 'NATIVE', 'title': 'Using Resistin, glucose, age and BMI to predict the presence of breast cancer', 'authors': 'M. Patrício, J. Pereira, J. Crisóstomo, P. Matafome, M. Gomes, Raquel Seiça, F. Caramelo', 'venue': 'BMC Cancer', 'year': 2018, 'journal': None, 'DOI': '10.1186/s12885-017-3877-1', 'URL': 'https://www.semanticscholar.org/paper/0861bd9c3acf6504ce142517c4e05bf8f564f32a', 'sha': None, 'corpus': None, 'arxiv': None, 'mag': None, 'acl': None, 'pmid': None, 'pmcid': None}, 'additional_info': {'summary': 'There are 10 predictors, all quantitative, and a binary dependent variable, indicating the presence or absence of breast cancer. \\r\\nThe predictors are anthropometric data and parameters which can be gathered in routine blood analysis. \\r\\nPrediction models based on these predictors, if accurate, can potentially be used as a biomarker of breast cancer.', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Quantitative Attributes: \\r\\nAge (years)\\r\\nBMI (kg/m2)\\r\\nGlucose (mg/dL)\\r\\nInsulin (µU/mL)\\r\\nHOMA\\r\\nLeptin (ng/mL)\\r\\nAdiponectin (µg/mL)\\r\\nResistin (ng/mL)\\r\\nMCP-1(pg/dL)\\r\\n\\r\\nLabels:\\r\\n1=Healthy controls\\r\\n2=Patients', 'citation': None}}\n",
      "             name     role        type demographic  \\\n",
      "0             Age  Feature     Integer         Age   \n",
      "1             BMI  Feature  Continuous        None   \n",
      "2         Glucose  Feature     Integer        None   \n",
      "3         Insulin  Feature  Continuous        None   \n",
      "4            HOMA  Feature  Continuous        None   \n",
      "5          Leptin  Feature  Continuous        None   \n",
      "6     Adiponectin  Feature  Continuous        None   \n",
      "7        Resistin  Feature  Continuous        None   \n",
      "8           MCP.1  Feature  Continuous        None   \n",
      "9  Classification   Target     Integer        None   \n",
      "\n",
      "                      description  units missing_values  \n",
      "0                            None   year             no  \n",
      "1                            None  kg/m2             no  \n",
      "2                            None  mg/dL             no  \n",
      "3                            None  µU/mL             no  \n",
      "4                            None   None             no  \n",
      "5                            None  ng/mL             no  \n",
      "6                            None  µU/mL             no  \n",
      "7                            None  ng/mL             no  \n",
      "8                            None  pg/dL             no  \n",
      "9  1=Healthy controls, 2=Patients   None             no  \n"
     ]
    }
   ],
   "source": [
    "# fetch dataset \n",
    "breast_cancer_coimbra = fetch_ucirepo(id=451) \n",
    "  \n",
    "# data (as pandas dataframes) \n",
    "X = breast_cancer_coimbra.data.features \n",
    "y = breast_cancer_coimbra.data.targets \n",
    "  \n",
    "# metadata \n",
    "print(breast_cancer_coimbra.metadata) \n",
    "  \n",
    "# variable information \n",
    "print(breast_cancer_coimbra.variables) "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1004c28",
   "metadata": {},
   "source": [
    "# K-NN classifier "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "082c143b",
   "metadata": {},
   "source": [
    "### Cross-validation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "754dce9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of samples: 116\n",
      "Number of features: 9\n"
     ]
    }
   ],
   "source": [
    "print(\"Number of samples:\", X.shape[0])\n",
    "print(\"Number of features:\", X.shape[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "01bb817a",
   "metadata": {},
   "source": [
    "Then d is small enough to insure that we are not in the curse of dimention"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "feb42adf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "The best k for k-NN is k = 26\n"
     ]
    }
   ],
   "source": [
    "k_scores = []\n",
    "K_list = np.arange(1, X.shape[0] // 4)  # concidering 1/4 of the samples as neaighbors is large enough for k-NN to don't overfit\n",
    "\n",
    "for k in K_list:\n",
    "    knn = KNeighborsClassifier(n_neighbors=k)\n",
    "    score_acc = cross_val_score(knn, X, y, cv=5, scoring='accuracy')\n",
    "    score_f1 = cross_val_score(knn, X, y, cv=5, scoring='f1')\n",
    "    score = (score_acc + score_f1) / 2 \n",
    "    k_scores.append(score.mean())\n",
    "\n",
    "k_scores = np.array(k_scores)\n",
    "\n",
    "print(\"The best k for k-NN is k =\", np.argmin(k_scores))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fa8a7166",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6\n",
      "F1 Score: 0.5857142857142857\n",
      "Classification Report:\n",
      "               precision    recall  f1-score   support\n",
      "\n",
      "           1       0.64      0.41      0.50        17\n",
      "           2       0.58      0.78      0.67        18\n",
      "\n",
      "    accuracy                           0.60        35\n",
      "   macro avg       0.61      0.59      0.58        35\n",
      "weighted avg       0.61      0.60      0.59        35\n",
      "\n"
     ]
    }
   ],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
    "\n",
    "knn = KNeighborsClassifier(n_neighbors=6) # using the best k found\n",
    "knn.fit(X_train, y_train)\n",
    "\n",
    "y_pred = knn.predict(X_test)\n",
    "\n",
    "acc = accuracy_score(y_test, y_pred)\n",
    "f1 = f1_score(y_test, y_pred, average='weighted')\n",
    "class_report = classification_report(y_test, y_pred)\n",
    "\n",
    "print(\"Accuracy:\", acc)\n",
    "print(\"F1 Score:\", f1)\n",
    "print(\"Classification Report:\\n\", class_report)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}