ml_exercises/notebooks/3_supervised_comparison.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Compare Supervised Learning Models\n",
    "\n",
    "In this notebook we use 6 toy datasets (3 for regression and 3 for classification) to compare the different algorithms and their hyperparameter settings.\n",
    "\n",
    "Execute the following cells until you see the different datasets and then, after each chapter describing a type of model, come back to this notebook to test the respective model on the datasets and experiment with the model's hyperparameter settings."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.colors import ListedColormap\n",
    "from sklearn.datasets import make_moons\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.inspection import DecisionBoundaryDisplay\n",
    "# suppress unnecessary warnings\n",
    "import warnings\n",
    "warnings.simplefilter(action='ignore', category=FutureWarning)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# You do not need to understand what happens in these functions,\n",
    "# just execute the cell so you can use the functions below\n",
    "\n",
    "n_train_reg = 100\n",
    "n_train_clf = 300\n",
    "\n",
    "def plot_regression(X, y, model=None):\n",
    "    \"\"\"Scatter-plot a regression dataset and, optionally, a model's prediction curve.\n",
    "\n",
    "    The first column of X is used for the x-axis and y for the y-axis.\n",
    "    If a model is passed, its predict() output is drawn as a line over 1000\n",
    "    evenly spaced inputs between the smallest and largest value in X.\n",
    "    \"\"\"\n",
    "    plt.figure()\n",
    "    plt.scatter(X[:, 0], y, s=10, c='#3090C7', alpha=0.7, label='data samples')\n",
    "    plt.xlabel('x (feature)')\n",
    "    plt.ylabel('y (target)')\n",
    "    plt.title('Regression Problem')\n",
    "    if model is None:\n",
    "        return\n",
    "    # evaluate the model on a dense grid covering the observed input range\n",
    "    grid = np.linspace(X.min(), X.max(), 1000)\n",
    "    curve = model.predict(grid.reshape(-1, 1))\n",
    "    plt.plot(grid, curve, '#15317E', linewidth=1., alpha=0.9, label='prediction')\n",
    "    plt.legend()\n",
    "    \n",
    "def plot_classification(X, Y, model=None):\n",
    "    \"\"\"Scatter-plot a 2-feature classification dataset, optionally over a model's decision surface.\n",
    "\n",
    "    Points are placed at (X[:, 0], X[:, 1]) and colored by the labels in Y using a\n",
    "    two-color (red/blue) colormap. If a model is passed, DecisionBoundaryDisplay draws\n",
    "    its response as a shaded background on the same axes.\n",
    "    \"\"\"\n",
    "    plt.figure()\n",
    "    if model is not None:\n",
    "        background_style = dict(cmap=plt.cm.RdBu, alpha=0.8, eps=0.5, xlabel=\"feature 1\", ylabel=\"feature 2\")\n",
    "        DecisionBoundaryDisplay.from_estimator(model, X, ax=plt.gca(), **background_style)\n",
    "    # the training points are drawn last so they sit on top of the background\n",
    "    red_blue = ListedColormap(['#FF0000', '#0000FF'])\n",
    "    plt.scatter(X[:, 0], X[:, 1], s=20, c=Y, cmap=red_blue, label=\"data samples\")\n",
    "    plt.title(\"Classification Problem\")\n",
    "    plt.colorbar()\n",
    "    \n",
    "def get_linear_regression():\n",
    "    \"\"\"Return a noisy linear regression dataset as (centered features, flat targets).\n",
    "\n",
    "    Targets follow y = -2.5 + 5*x plus Gaussian noise scaled by 0.4, with\n",
    "    n_train_reg samples. Note: this reseeds NumPy's global random generator with 15.\n",
    "    \"\"\"\n",
    "    np.random.seed(15)\n",
    "    features = np.random.rand(n_train_reg, 1)\n",
    "    noise = np.random.randn(n_train_reg, 1) * 0.4\n",
    "    targets = -2.5 + 5*features + noise\n",
    "    # with_std=False: only the mean is removed, the scale of x is kept\n",
    "    features_centered = StandardScaler(with_std=False).fit_transform(features)\n",
    "    return features_centered, targets.flatten()\n",
    "\n",
    "def get_linear_outlier():\n",
    "    \"\"\"Return a nearly noise-free linear regression dataset that contains outliers.\n",
    "\n",
    "    Targets follow y = -2.5 + 5*x plus Gaussian noise scaled by 0.05; every sample\n",
    "    whose raw x lies strictly between 0.7 and 0.73 gets its target overwritten with 10.\n",
    "    Note: this reseeds NumPy's global random generator with 15.\n",
    "    \"\"\"\n",
    "    np.random.seed(15)\n",
    "    features = np.random.rand(n_train_reg, 1)\n",
    "    noise = np.random.randn(n_train_reg, 1) * 0.05\n",
    "    targets = -2.5 + 5*features + noise\n",
    "    # a narrow band of x values is turned into outliers\n",
    "    outlier_mask = (features > 0.7) & (features < 0.73)\n",
    "    targets[outlier_mask] = 10\n",
    "    # with_std=False: only the mean is removed, the scale of x is kept\n",
    "    features_centered = StandardScaler(with_std=False).fit_transform(features)\n",
    "    return features_centered, targets.flatten()\n",
    "\n",
    "def get_nonlinear_regression():\n",
    "    \"\"\"Return a noisy sine-wave regression dataset as (scaled features, flat targets).\n",
    "\n",
    "    Raw inputs are drawn uniformly and stretched by 2*pi; targets are sin(x) plus\n",
    "    Gaussian noise scaled by 0.2. The features are passed through StandardScaler.\n",
    "    Note: this reseeds NumPy's global random generator with 15.\n",
    "    \"\"\"\n",
    "    np.random.seed(15)\n",
    "    angles = np.random.rand(n_train_reg, 1) * np.pi * 2.\n",
    "    noise = np.random.randn(n_train_reg, 1) * 0.2\n",
    "    targets = np.sin(angles) + noise\n",
    "    angles_scaled = StandardScaler().fit_transform(angles)\n",
    "    return angles_scaled, targets.flatten()\n",
    "\n",
    "def get_linear_classification_1f():\n",
    "    \"\"\"Return a 2-feature classification dataset in which only feature 1 is informative.\n",
    "\n",
    "    Both classes are drawn from Gaussians with the same covariance; their means\n",
    "    ([0, 0] vs. [5, 0]) differ only in the first coordinate. The samples are shuffled\n",
    "    and the features passed through StandardScaler before being returned as\n",
    "    (features, labels). Note: this reseeds NumPy's global random generator with 15.\n",
    "    \"\"\"\n",
    "    np.random.seed(15)\n",
    "    n_class_0 = n_train_clf // 2\n",
    "    # class 1 takes the remaining samples, so an odd n_train_clf also works\n",
    "    n_class_1 = n_train_clf - n_class_0\n",
    "    cov = [[1, 0], [0, 10]]\n",
    "    X = np.zeros((n_train_clf, 2))\n",
    "    X[:n_class_0] = np.random.multivariate_normal([0, 0], cov, n_class_0)\n",
    "    X[n_class_0:] = np.random.multivariate_normal([5, 0], cov, n_class_1)\n",
    "    y = np.zeros(n_train_clf, dtype=int)\n",
    "    y[n_class_0:] = 1\n",
    "    # shuffle so the two classes are not stored as two consecutive runs\n",
    "    rndidx = np.random.permutation(len(y))\n",
    "    return StandardScaler().fit_transform(X[rndidx]), y[rndidx]\n",
    "\n",
    "def get_linear_classification_2f():\n",
    "    \"\"\"Return a 2-feature classification dataset in which both features are informative.\n",
    "\n",
    "    Both classes are drawn from Gaussians with the same (correlated) covariance; their\n",
    "    means ([0, 4] vs. [4, 0]) differ in both coordinates. The samples are shuffled\n",
    "    and the features passed through StandardScaler before being returned as\n",
    "    (features, labels). Note: this reseeds NumPy's global random generator with 15.\n",
    "    \"\"\"\n",
    "    np.random.seed(15)\n",
    "    n_class_0 = n_train_clf // 2\n",
    "    # class 1 takes the remaining samples, so an odd n_train_clf also works\n",
    "    n_class_1 = n_train_clf - n_class_0\n",
    "    cov = np.array([[1, 8], [8, 10]])\n",
    "    # multiplying the matrix with its transpose yields a valid covariance matrix\n",
    "    cov = np.dot(cov, cov.T)/10\n",
    "    X = np.zeros((n_train_clf, 2))\n",
    "    X[:n_class_0] = np.random.multivariate_normal([0, 4], cov, n_class_0)\n",
    "    X[n_class_0:] = np.random.multivariate_normal([4, 0], cov, n_class_1)\n",
    "    y = np.zeros(n_train_clf, dtype=int)\n",
    "    y[n_class_0:] = 1\n",
    "    # shuffle so the two classes are not stored as two consecutive runs\n",
    "    rndidx = np.random.permutation(len(y))\n",
    "    return StandardScaler().fit_transform(X[rndidx]), y[rndidx]\n",
    "\n",
    "def get_nonlinear_classification():\n",
    "    \"\"\"Return a non-linear (two moons) classification dataset as (features, labels).\n",
    "\n",
    "    Draws n_train_clf samples with make_moons (noise=0.3, random_state=1) and\n",
    "    passes the features through StandardScaler.\n",
    "    \"\"\"\n",
    "    moons_X, moons_y = make_moons(n_samples=n_train_clf, noise=0.3, random_state=1)\n",
    "    moons_X_scaled = StandardScaler().fit_transform(moons_X)\n",
    "    return moons_X_scaled, moons_y"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Datasets\n",
    "\n",
    "Here you can have a look at the 3 regression and 3 classification datasets on which we'll compare the different models. The regression datasets only have one input feature, while the classification datasets have two and the target (i.e., class label) is indicated by the color of the dots.\n",
    "\n",
    "**Questions:**\n",
    "- Why are the first two regression and classification datasets linear and the last ones non-linear?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate & plot regression datasets\n",
    "X_reg_1, y_reg_1 = get_linear_regression()\n",
    "X_reg_2, y_reg_2 = get_linear_outlier()\n",
    "X_reg_3, y_reg_3 = get_nonlinear_regression()\n",
    "plot_regression(X_reg_1, y_reg_1)\n",
    "plot_regression(X_reg_2, y_reg_2)\n",
    "plot_regression(X_reg_3, y_reg_3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# generate & plot classification datasets\n",
    "X_clf_1, y_clf_1 = get_linear_classification_1f()\n",
    "X_clf_2, y_clf_2 = get_linear_classification_2f()\n",
    "X_clf_3, y_clf_3 = get_nonlinear_classification()\n",
    "plot_classification(X_clf_1, y_clf_1)\n",
    "plot_classification(X_clf_2, y_clf_2)\n",
    "plot_classification(X_clf_3, y_clf_3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Linear Models\n",
    "\n",
    "After reading the chapter on linear models, test them here on different datasets (by changing the number at the end of the dataset variable, e.g., `X_reg_1` -> `X_reg_2`) and experiment with their hyperparameter settings (in the comments you'll find a description of the different hyperparameters and which values you can test for them).\n",
    "\n",
    "**Questions:**\n",
    "- Compare the linear regression and ridge regression models on the regression dataset with outliers (i.e., `X_reg_2, y_reg_2`): what do you observe?\n",
    "- What happens when you increase the value for `alpha` for the ridge regression model? (first think about it, then confirm your guess by actually changing the parameter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Linear Regression\n",
    "X, y = X_reg_1, y_reg_1  # change the numbers here to test the model on a different dataset\n",
    "model = LinearRegression()\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)\n",
    "print(f\"f(x) = {model.intercept_:.3f} + {model.coef_[0]:.3f} * x\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ridge Regression:\n",
    "# alpha (> 0): regularization (higher values = more regularization)\n",
    "X, y = X_reg_1, y_reg_1\n",
    "model = Ridge(alpha=1.)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)\n",
    "print(f\"f(x) = {model.intercept_:.3f} + {model.coef_[0]:.3f} * x\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Logistic Regression (for classification problems!):\n",
    "# C (> 0): regularization (smaller values = more regularization)\n",
    "# penalty: change to \"l1\" to get sparse weights (only if you have many features; needs a different solver)\n",
    "X, y = X_clf_2, y_clf_2\n",
    "model = LogisticRegression(penalty=\"l2\", C=100.)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)  # the shaded area indicates the value of the model's decision function (i.e., how strongly it favors each class)\n",
    "print(f\"f(x) = sigmoid({model.intercept_[0]:.3f} + {model.coef_[0, 0]:.3f} * x_1 + {model.coef_[0, 1]:.3f} * x_2)\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feed-Forward Neural Network (Multi-Layer Perceptron)\n",
    "\n",
    "After reading the chapter on neural networks, test them here on different datasets and experiment with their hyperparameter settings.\n",
    "\n",
    "**Questions:**\n",
    "- What do you observe when you change the activation function to `'relu'` for the 3rd regression dataset?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neural_network import MLPRegressor, MLPClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Neural network for regression:\n",
    "# alpha (> 0): regularization (higher values = more regularization)\n",
    "# hidden_layer_sizes (tuple of ints): number of units in the hidden layers\n",
    "# activation (one of 'identity', 'logistic', 'tanh', 'relu'): non-linear activation function between layers\n",
    "X, y = X_reg_3, y_reg_3\n",
    "model = MLPRegressor(alpha=1e-05, hidden_layer_sizes=(15,), activation=\"tanh\", solver=\"lbfgs\", random_state=1)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Neural network for classification:\n",
    "# alpha (> 0): regularization (higher values = more regularization)\n",
    "# hidden_layer_sizes (tuple of ints): number of units in the hidden layers\n",
    "# activation (one of 'identity', 'logistic', 'tanh', 'relu'): non-linear activation function between layers\n",
    "X, y = X_clf_3, y_clf_3\n",
    "model = MLPClassifier(alpha=1e-05, hidden_layer_sizes=(25, 5), activation=\"relu\", solver=\"adam\", max_iter=1000, random_state=1)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Decision Trees\n",
    "\n",
    "After reading the chapter on decision trees, test them here on different datasets and experiment with their hyperparameter settings.\n",
    "\n",
    "**Questions:**\n",
    "- On the 3rd regression dataset with `max_depth=2`, why do you get exactly 4 plateaus in the prediction?\n",
    "- On the 3rd regression dataset, what happens if you leave `min_samples_leaf` at 10 and then increase `max_depth` step by step from 2 to 10 or even higher values? How do you explain this behavior and what would you need to do to get a tree that fits the data in a more fine-grained way?\n",
    "- Compare the prediction of the decision tree classifier on the 2nd dataset (which is basically a rotation of the 1st dataset, i.e., still a simple linear classification problem!) to the prediction made by the logistic regression model on this dataset: What do you observe and why?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decision Tree for regression:\n",
    "# max_depth (>= 1): depth of the tree (i.e. how many decisions are made before the final prediction)\n",
    "# min_samples_leaf (>= 1): minimum number of training points that must end up in one prediction bucket (leaf)\n",
    "X, y = X_reg_3, y_reg_3\n",
    "model = DecisionTreeRegressor(max_depth=2, min_samples_leaf=10)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decision Tree for classification:\n",
    "# max_depth (>= 1): depth of the tree (i.e. how many decisions are made before the final prediction)\n",
    "# min_samples_leaf (>= 1): minimum number of training points that must end up in one prediction bucket (leaf)\n",
    "X, y = X_clf_1, y_clf_1\n",
    "model = DecisionTreeClassifier(max_depth=2, min_samples_leaf=10)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Ensemble Methods (Random Forest)\n",
    "\n",
    "After reading the chapter on ensemble methods, test the random forest here on different datasets and experiment with the hyperparameter settings (same hyperparameters as the decision tree and the additional parameter `n_estimators` for the number of trees in the forest).\n",
    "\n",
    "**Questions:**\n",
    "- What do you observe when you compare a random forest with multiple estimators to a single decision tree with the same hyperparameter settings (especially for more specific trees, i.e., large `max_depth` and small `min_samples_leaf`)?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random Forest for regression:\n",
    "# n_estimators (>= 1): how many decision trees to train (don't set this too high, gets computationally expensive)\n",
    "# max_depth (>= 1): depth of the tree (i.e. how many decisions are made before the final prediction)\n",
    "# min_samples_leaf (>= 1): minimum number of training points that must end up in one prediction bucket (leaf)\n",
    "# random_state: fixed seed so that the randomly built forest is the same every time you run the cell\n",
    "X, y = X_reg_3, y_reg_3\n",
    "model = RandomForestRegressor(n_estimators=100, max_depth=2, min_samples_leaf=10, random_state=1)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Random Forest for classification:\n",
    "# n_estimators (>= 1): how many decision trees to train\n",
    "# max_depth (>= 1): depth of the tree (i.e. how many decisions are made before the final prediction)\n",
    "# min_samples_leaf (>= 1): minimum number of training points that must end up in one prediction bucket (leaf)\n",
    "# random_state: fixed seed so that the randomly built forest is the same every time you run the cell\n",
    "X, y = X_clf_2, y_clf_2\n",
    "model = RandomForestClassifier(n_estimators=100, max_depth=2, min_samples_leaf=10, random_state=1)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Similarity-based Models (kNN)\n",
    "\n",
    "After reading the chapter on k-nearest neighbors, test the method here on different datasets and experiment with the hyperparameter settings.\n",
    "\n",
    "**Questions:**\n",
    "- On the 3rd regression dataset for a larger number of nearest neighbors (e.g., 20), what do you observe for the prediction at the edges of the input domain and why?\n",
    "- Especially for binary classification problems, why does it make sense to always use an odd number of nearest neighbors?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k-Nearest Neighbors for regression:\n",
    "# n_neighbors (>= 1): how many nearest neighbors are used for the prediction\n",
    "X, y = X_reg_3, y_reg_3\n",
    "model = KNeighborsRegressor(n_neighbors=10)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# k-Nearest Neighbors for classification:\n",
    "# n_neighbors (>= 1): how many nearest neighbors are used for the prediction\n",
    "X, y = X_clf_3, y_clf_3\n",
    "model = KNeighborsClassifier(n_neighbors=11)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Kernel Methods\n",
    "\n",
    "After reading the chapter on kernel methods, test an SVM here on different datasets and experiment with the hyperparameter settings.\n",
    "\n",
    "**Questions:**\n",
    "- How do the values of the hyperparameters `gamma` and `C` interact? \n",
    "- What do you observe when you leave `gamma` at its default value `'scale'`?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVR, SVC"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Support Vector Regression:\n",
    "# kernel: kernel function to compute similarities (default: \"rbf\")\n",
    "# gamma (> 0): inverse width of rbf kernel (larger values --> narrower kernel, more focused on individual points)\n",
    "# C (> 0): regularization (smaller values = more regularization)\n",
    "X, y = X_reg_3, y_reg_3\n",
    "model = SVR(kernel='rbf', gamma=100., C=1.)\n",
    "model.fit(X, y)\n",
    "plot_regression(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Support Vector Classification:\n",
    "# kernel: kernel function to compute similarities (default: \"rbf\")\n",
    "# gamma (> 0): inverse width of rbf kernel (larger values --> narrower kernel, more focused on individual points)\n",
    "# C (> 0): regularization (smaller values = more regularization)\n",
    "X, y = X_clf_3, y_clf_3\n",
    "model = SVC(kernel='rbf', gamma=.005, C=1.)\n",
    "model.fit(X, y)\n",
    "plot_classification(X, y, model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}