In [10]:
from ucimlrepo import fetch_ucirepo 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, classification_report

import numpy as np
import matplotlib.pyplot as plt

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# fetch dataset 
breast_cancer_coimbra = fetch_ucirepo(id=451) 
  
# data (as pandas dataframes) 
X = breast_cancer_coimbra.data.features 
y = breast_cancer_coimbra.data.targets 
  
# metadata 
print(breast_cancer_coimbra.metadata) 
  
# variable information 
print(breast_cancer_coimbra.variables) 

{'uci_id': 451, 'name': 'Breast Cancer Coimbra', 'repository_url': 'https://archive.ics.uci.edu/dataset/451/breast+cancer+coimbra', 'data_url': 'https://archive.ics.uci.edu/static/public/451/data.csv', 'abstract': 'Clinical features were observed or measured for 64 patients with breast cancer and 52 healthy controls. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 116, 'num_features': 9, 'feature_types': ['Integer'], 'demographics': ['Age'], 'target_col': ['Classification'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C52P59', 'creators': ['Miguel Patrcio', 'Jos Pereira', 'Joana Crisstomo', 'Paulo Matafome', 'Raquel Seia', 'Francisco Caramelo'], 'intro_paper': {'ID': 431, 'type': 'NATIVE', 'title': 'Using Resistin, glucose, age and BMI to predict the presence of breast cancer', 'authors': 'M.

# K-NN classifier 

### Cross-validation

In [5]:
print("Number of samples:", X.shape[0])
print("Number of features:", X.shape[1])

Number of samples: 116
Number of features: 9


Then d is small enough to insure that we are not in the curse of dimention

In [6]:
k_scores = []
K_list = np.arange(1, X.shape[0] // 4)  # concidering 1/4 of the samples as neaighbors is large enough for k-NN to don't overfit

for k in K_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    score_acc = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    score_f1 = cross_val_score(knn, X, y, cv=5, scoring='f1')
    score = (score_acc + score_f1) / 2 
    k_scores.append(score.mean())

k_scores = np.array(k_scores)

print("The best k for k-NN is k =", np.argmin(k_scores))

The best k for k-NN is k = 26


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

knn = KNeighborsClassifier(n_neighbors=6) # using the best k found
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
class_report = classification_report(y_test, y_pred)

print("Accuracy:", acc)
print("F1 Score:", f1)
print("Classification Report:\n", class_report)

Accuracy: 0.6
F1 Score: 0.5857142857142857
Classification Report:
               precision    recall  f1-score   support

           1       0.64      0.41      0.50        17
           2       0.58      0.78      0.67        18

    accuracy                           0.60        35
   macro avg       0.61      0.59      0.58        35
weighted avg       0.61      0.60      0.59        35

