Files
breast-cancer-detection/knn.ipynb

222 lines
8.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 10,
"id": "4e6f6cb1",
"metadata": {},
"outputs": [],
"source": [
"from ucimlrepo import fetch_ucirepo \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.neighbors import KNeighborsClassifier\n",
"from sklearn.model_selection import cross_val_score\n",
"from sklearn.metrics import accuracy_score, f1_score, classification_report\n",
"\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4dd5223b",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c1ab7ec9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'uci_id': 451, 'name': 'Breast Cancer Coimbra', 'repository_url': 'https://archive.ics.uci.edu/dataset/451/breast+cancer+coimbra', 'data_url': 'https://archive.ics.uci.edu/static/public/451/data.csv', 'abstract': 'Clinical features were observed or measured for 64 patients with breast cancer and 52 healthy controls. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 116, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Classification'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C52P59', 'creators': ['Miguel Patrcio', 'Jos Pereira', 'Joana Crisstomo', 'Paulo Matafome', 'Raquel Seia', 'Francisco Caramelo'], 'intro_paper': {'ID': 431, 'type': 'NATIVE', 'title': 'Using Resistin, glucose, age and BMI to predict the presence of breast cancer', 'authors': 'M. Patrício, J. Pereira, J. Crisóstomo, P. Matafome, M. Gomes, Raquel Seiça, F. Caramelo', 'venue': 'BMC Cancer', 'year': 2018, 'journal': None, 'DOI': '10.1186/s12885-017-3877-1', 'URL': 'https://www.semanticscholar.org/paper/0861bd9c3acf6504ce142517c4e05bf8f564f32a', 'sha': None, 'corpus': None, 'arxiv': None, 'mag': None, 'acl': None, 'pmid': None, 'pmcid': None}, 'additional_info': {'summary': 'There are 10 predictors, all quantitative, and a binary dependent variable, indicating the presence or absence of breast cancer. \\r\\nThe predictors are anthropometric data and parameters which can be gathered in routine blood analysis. \\r\\nPrediction models based on these predictors, if accurate, can potentially be used as a biomarker of breast cancer.', 'purpose': None, 'funded_by': None, 'instances_represent': None, 'recommended_data_splits': None, 'sensitive_data': None, 'preprocessing_description': None, 'variable_info': 'Quantitative Attributes: \\r\\nAge (years)\\r\\nBMI (kg/m2)\\r\\nGlucose (mg/dL)\\r\\nInsulin (µU/mL)\\r\\nHOMA\\r\\nLeptin (ng/mL)\\r\\nAdiponectin (µg/mL)\\r\\nResistin (ng/mL)\\r\\nMCP-1(pg/dL)\\r\\n\\r\\nLabels:\\r\\n1=Healthy controls\\r\\n2=Patients', 'citation': None}}\n",
" name role type demographic \\\n",
"0 Age Feature Integer Age \n",
"1 BMI Feature Continuous None \n",
"2 Glucose Feature Integer None \n",
"3 Insulin Feature Continuous None \n",
"4 HOMA Feature Continuous None \n",
"5 Leptin Feature Continuous None \n",
"6 Adiponectin Feature Continuous None \n",
"7 Resistin Feature Continuous None \n",
"8 MCP.1 Feature Continuous None \n",
"9 Classification Target Integer None \n",
"\n",
" description units missing_values \n",
"0 None year no \n",
"1 None kg/m2 no \n",
"2 None mg/dL no \n",
"3 None µU/mL no \n",
"4 None None no \n",
"5 None ng/mL no \n",
"6 None µU/mL no \n",
"7 None ng/mL no \n",
"8 None pg/dL no \n",
"9 1=Healthy controls, 2=Patients None no \n"
]
}
],
"source": [
"# fetch dataset \n",
"breast_cancer_coimbra = fetch_ucirepo(id=451) \n",
" \n",
"# data (as pandas dataframes) \n",
"X = breast_cancer_coimbra.data.features \n",
"y = breast_cancer_coimbra.data.targets \n",
" \n",
"# metadata \n",
"print(breast_cancer_coimbra.metadata) \n",
" \n",
"# variable information \n",
"print(breast_cancer_coimbra.variables) "
]
},
{
"cell_type": "markdown",
"id": "a1004c28",
"metadata": {},
"source": [
"# K-NN classifier "
]
},
{
"cell_type": "markdown",
"id": "082c143b",
"metadata": {},
"source": [
"### Cross-validation"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "754dce9b",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of samples: 116\n",
"Number of features: 9\n"
]
}
],
"source": [
"print(\"Number of samples:\", X.shape[0])\n",
"print(\"Number of features:\", X.shape[1])"
]
},
{
"cell_type": "markdown",
"id": "01bb817a",
"metadata": {},
"source": [
"Then d is small enough to insure that we are not in the curse of dimention"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "feb42adf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The best k for k-NN is k = 26\n"
]
}
],
"source": [
"k_scores = []\n",
"K_list = np.arange(1, X.shape[0] // 4) # concidering 1/4 of the samples as neaighbors is large enough for k-NN to don't overfit\n",
"\n",
"for k in K_list:\n",
" knn = KNeighborsClassifier(n_neighbors=k)\n",
" score_acc = cross_val_score(knn, X, y, cv=5, scoring='accuracy')\n",
" score_f1 = cross_val_score(knn, X, y, cv=5, scoring='f1')\n",
" score = (score_acc + score_f1) / 2 \n",
" k_scores.append(score.mean())\n",
"\n",
"k_scores = np.array(k_scores)\n",
"\n",
"print(\"The best k for k-NN is k =\", np.argmin(k_scores))"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "fa8a7166",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.6\n",
"F1 Score: 0.5857142857142857\n",
"Classification Report:\n",
" precision recall f1-score support\n",
"\n",
" 1 0.64 0.41 0.50 17\n",
" 2 0.58 0.78 0.67 18\n",
"\n",
" accuracy 0.60 35\n",
" macro avg 0.61 0.59 0.58 35\n",
"weighted avg 0.61 0.60 0.59 35\n",
"\n"
]
}
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)\n",
"\n",
"knn = KNeighborsClassifier(n_neighbors=6) # using the best k found\n",
"knn.fit(X_train, y_train)\n",
"\n",
"y_pred = knn.predict(X_test)\n",
"\n",
"acc = accuracy_score(y_test, y_pred)\n",
"f1 = f1_score(y_test, y_pred, average='weighted')\n",
"class_report = classification_report(y_test, y_pred)\n",
"\n",
"print(\"Accuracy:\", acc)\n",
"print(\"F1 Score:\", f1)\n",
"print(\"Classification Report:\\n\", class_report)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}