
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analyze (F)MNIST with `torch`\n",
"\n",
"Careful: do **not** hit 'Kernel' > 'Restart & Run All', since some of the cells below take a long time to execute if you are not running the code on a GPU, so we already executed them for you. Only run the first few cells that are not yet executed.\n",
"\n",
"In this notebook we compare different types of neural network architectures on the MNIST and Fashion MNIST datasets, to see how the performance improves when using a more complicated architecture. Additionally, we compare the networks to a simple logistic regression classifier from `sklearn`, which should have approximately the same accuracy as a linear FFNN (= a FFNN with only one layer mapping from the input directly to the output and no hidden layers, i.e., that has the same number of trainable parameters as the logistic regression model)."
]
},
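{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check of that claim (this cell is a small addition, not part of the original exercise), we can count the trainable parameters of a single linear layer mapping the 784 pixel values to the 10 class scores; a 10-class logistic regression on 784 features has exactly the same number of weights and biases."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# back-of-the-envelope check: a single linear layer (and likewise a 10-class\n",
"# logistic regression) has one weight per input-output pair plus one bias per output\n",
"n_in, n_out = 28 * 28, 10\n",
"print(\"trainable parameters:\", n_in * n_out + n_out)  # -> 7850"
]
},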
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:06:30.088245Z",
"start_time": "2020-11-22T19:06:29.139733Z"
}
},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.metrics import accuracy_score\n",
"# torch neural network stuff\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"# torchvision includes the (F)MNIST datasets\n",
"from torchvision import datasets, transforms\n",
"# skorch provides a wrapper for torch networks so we can use them like sklearn models\n",
"from skorch import NeuralNetClassifier\n",
"from skorch.callbacks import EpochScoring\n",
"# set random seeds to get (at least more or less) reproducable results\n",
"np.random.seed(28)\n",
"torch.manual_seed(28);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and look at the data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:06:30.114557Z",
"start_time": "2020-11-22T19:06:30.090761Z"
}
},
"outputs": [],
"source": [
"# you do not need to understand what these functions do in detail\n",
"\n",
"def torch_to_X_y(dataset):\n",
" # transform input tensor to numpy array\n",
" X = dataset.data.numpy()\n",
" # reshape (28 x 28) pixel images to vector\n",
" X = X.reshape(X.shape[0], -1).astype('float32')\n",
" # the ToTensor transform was not applied to the raw data, so we need to scale ourselves\n",
" X /= X.max()\n",
" # extract numpy array with targets\n",
" y = dataset.targets.numpy()\n",
" return X, y\n",
"\n",
"def load_data(use_fashion=False):\n",
" if use_fashion:\n",
" data_train = datasets.FashionMNIST(\"../data\", train=True, download=True, transform=transforms.ToTensor())\n",
" data_test = datasets.FashionMNIST(\"../data\", train=False, transform=transforms.ToTensor())\n",
" else:\n",
" data_train = datasets.MNIST(\"../data\", train=True, download=True, transform=transforms.ToTensor())\n",
" data_test = datasets.MNIST(\"../data\", train=False, transform=transforms.ToTensor())\n",
" # extract (n_samples x n_features) and (n_samples,) X and y numpy arrays from torch dataset\n",
" X_train, y_train = torch_to_X_y(data_train)\n",
" X_test, y_test = torch_to_X_y(data_train)\n",
" return X_train, X_test, y_train, y_test\n",
" \n",
"def plot_images(x):\n",
" n = 10\n",
" plt.figure(figsize=(20, 4))\n",
" for i in range(1, n+1):\n",
" # display original\n",
" ax = plt.subplot(2, n, i)\n",
" plt.imshow(x[i].reshape(28, 28))\n",
" plt.gray()\n",
" ax.get_xaxis().set_visible(False)\n",
" ax.get_yaxis().set_visible(False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:06:31.076712Z",
"start_time": "2020-11-22T19:06:30.115973Z"
}
},
"outputs": [],
"source": [
"# load and display the data -> see how the images have the same format\n",
"# MNIST\n",
"X_train, X_test, y_train, y_test = load_data()\n",
"plot_images(X_train)\n",
"# Fashion MNIST\n",
"X_train_F, X_test_F, y_train_F, y_test_F = load_data(use_fashion=True)\n",
"plot_images(X_train_F)"
]
},
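{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick look at the array shapes (a small addition for orientation): both datasets contain 60000 training and 10000 test images, each flattened into a 784-dimensional vector."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# (n_samples x n_features) feature matrices and (n_samples,) target vectors\n",
"print(X_train.shape, y_train.shape)\n",
"print(X_test.shape, y_test.shape)"
]
},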
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## See how a `torch` network works"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:06:31.158130Z",
"start_time": "2020-11-22T19:06:31.078469Z"
}
},
"outputs": [],
"source": [
"# FFNN with hidden layers (like the one you saw in the book)\n",
"class MyNeuralNet(nn.Module):\n",
" \n",
" def __init__(self, n_in=784, n_hl1=512, n_hl2=256, n_out=10, verbose=False):\n",
" # input size is 28x28 pixel images flattened into a 784-dimensional vector\n",
" # output size is 10 classes\n",
" # hidden layer sizes can be set however you want\n",
" super(MyNeuralNet, self).__init__()\n",
" self.verbose = verbose\n",
" # initialize layers\n",
" self.l1 = nn.Linear(n_in, n_hl1)\n",
" self.l2 = nn.Linear(n_hl1, n_hl2)\n",
" self.lout = nn.Linear(n_hl2, n_out)\n",
" \n",
" def forward(self, x):\n",
" # apply layers in correct order\n",
" if self.verbose: print(\"[MyNeuralNet] input:\", x.shape)\n",
" h = F.relu(self.l1(x)) # 784 -> 512 [relu]\n",
" if self.verbose: print(\"[MyNeuralNet] 1st hl:\", h.shape)\n",
" h = F.relu(self.l2(h)) # 512 -> 256 [relu]\n",
" if self.verbose: print(\"[MyNeuralNet] 2nd hl:\", h.shape)\n",
" y = F.softmax(self.lout(h), dim=1) # 256 -> 10 [softmax]\n",
" if self.verbose: print(\"[MyNeuralNet] output:\", y.shape)\n",
" return y"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# initialize the network\n",
"ffnn = MyNeuralNet(verbose=True)\n",
"# get an input data batch and convert the numpy array \n",
"# to a torch tensor to use it with the network directly\n",
"# (skorch later works with the numpy arrays)\n",
"x = torch.Tensor(X_train[:16])\n",
"print(x.shape) # batch size x features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# apply network to input, i.e., call forward() to generate the prediction\n",
"y = ffnn(x)\n",
"print(y.shape) # batch size x classes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check the image of the first training sample\n",
"plt.imshow(X_train[0].reshape(28, 28));"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# look at the network's output for this first data point\n",
"# -> since the network wasn't trained yet, the predicted probabilities for all 10 classes are ~0.1\n",
"# (notice the grad parameter, which indicates that the network kept track of the gradients,\n",
"# which are needed for later tuning the weights during training)\n",
"y[0]"
]
},
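{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an additional sanity check (not part of the original exercise): since the last layer applies a softmax, every row of the output should be a valid probability distribution, i.e., sum to 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# each of the 16 rows should sum to ~1 (up to floating point error)\n",
"print(y.sum(dim=1))"
]
},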
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# wrap torch NN in skorch Classifier and initialize\n",
"net = NeuralNetClassifier(\n",
" MyNeuralNet, # usually the class itself, not an instantiated object\n",
" batch_size=32, # how many samples are used in each training iteration\n",
" optimizer=torch.optim.Adadelta, # the optimizer (i.e. \"what type\" of gradient descent)\n",
" lr=1., # learning rate of the optimizer\n",
" device=\"cuda\" if torch.cuda.is_available() else \"cpu\", # train the network on a GPU if available\n",
" max_epochs=1, # for how many epochs to train the network\n",
" callbacks=[ # additional stuff that should happen after each epoch, e.g., learning rate scheduler\n",
" ('tr_acc', EpochScoring( # or in this case print the accuracy after every epoch\n",
" 'accuracy',\n",
" lower_is_better=False,\n",
" on_train=True,\n",
" name='train_acc',\n",
" )),\n",
" ],\n",
")\n",
"\n",
"# use simple sklearn-like interface to train the network (for 1 epoch)\n",
"net.fit(X_train, y_train)"
]
},
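{
"cell_type": "markdown",
"metadata": {},
"source": [
"Aside (not part of the original exercise): the `callbacks` list can hold other hooks besides `EpochScoring`, e.g., the learning rate scheduler mentioned in the comment above. Below is a minimal sketch using skorch's `LRScheduler` with torch's `StepLR` policy; the step size and decay factor are arbitrary example values. To use it, pass the object in the `callbacks` list when constructing the `NeuralNetClassifier`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from skorch.callbacks import LRScheduler\n",
"# halve the learning rate every 2 epochs (example values, not tuned)\n",
"lr_sched = LRScheduler(policy=torch.optim.lr_scheduler.StepLR, step_size=2, gamma=0.5)"
]
},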
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# generate predictions for the same samples as above\n",
"# -> this gives class labels directly like sklearn\n",
"y = net.predict(X_train[:16])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# check if the prediction (after training) is correct\n",
"print(\"true class:\", y_train[0])\n",
"y[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# we can also get the original probabilities (notice the higher value at the index of the true class)\n",
"y = net.predict_proba(X_train[:16])\n",
"y[0]"
]
},
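{
"cell_type": "markdown",
"metadata": {},
"source": [
"(Small addition:) `predict` is simply the argmax over the probabilities returned by `predict_proba`, which we can verify directly:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# the predicted label is the class with the highest predicted probability\n",
"print((y.argmax(axis=1) == net.predict(X_train[:16])).all())"
]
},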
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Define NNs for the classification task\n",
"\n",
"In the code below, we define 3 different neural network architectures: a linear FFNN, a FFNN with multiple hidden layers, and a CNN, which is an architecture particularly well suited for image classification tasks.\n",
"\n",
"You will see that the more complex architectures use an additional operation between layers called `Dropout`. This is a regularization technique used for training neural networks, where a certain percentage of the values in the hidden layer representation of a data point are randomly set to zero. You can think of this as the network suffering from a temporary stroke, which forces the neurons learn redundant representations (i.e., such that one neuron can take over for another neuron that was knocked out), which improves generalization."
]
},
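{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the `Dropout` behavior concrete, here is a small demo (not part of the original exercise): in training mode, roughly a fraction `p` of the values is set to zero and the remaining values are rescaled by `1/(1-p)` so that the expected sum stays the same; in evaluation mode, dropout is a no-op."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"drop = nn.Dropout(0.2)\n",
"t = torch.ones(10)\n",
"drop.train()  # training mode: ~20% of the values are dropped, the rest is scaled by 1.25\n",
"print(drop(t))\n",
"drop.eval()  # evaluation mode: the input passes through unchanged\n",
"print(drop(t))"
]
},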
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:06:31.188855Z",
"start_time": "2020-11-22T19:06:31.159480Z"
}
},
"outputs": [],
"source": [
"# linear FFNN (--> same number of parameters as LogReg model)\n",
"class LinNN(nn.Module):\n",
" \n",
" def __init__(self, n_in=784, n_out=10):\n",
" super(LinNN, self).__init__()\n",
" self.l = nn.Linear(n_in, n_out)\n",
" \n",
" def forward(self, x):\n",
" y = F.softmax(self.l(x), dim=1) # 784 -> 10 [softmax]\n",
" return y\n",
" \n",
"# FFNN with hidden layers \n",
"class FFNN(nn.Module):\n",
" \n",
" def __init__(self, n_in=784, n_hl1=512, n_hl2=256, n_out=10, dropout=0.2):\n",
" super(FFNN, self).__init__()\n",
" # initialize layers\n",
" self.dropout = nn.Dropout(dropout)\n",
" self.l1 = nn.Linear(n_in, n_hl1)\n",
" self.l2 = nn.Linear(n_hl1, n_hl2)\n",
" self.lout = nn.Linear(n_hl2, n_out)\n",
" \n",
" def forward(self, x):\n",
" # apply layers in correct order\n",
" h = F.relu(self.l1(x)) # 784 -> 512 [relu]\n",
" h = self.dropout(h)\n",
" h = F.relu(self.l2(h)) # 512 -> 256 [relu]\n",
" h = self.dropout(h)\n",
" y = F.softmax(self.lout(h), dim=1) # 256 -> 10 [softmax]\n",
" return y\n",
" \n",
"# Convolutional Neural Net \n",
"# based on https://github.com/pytorch/examples/blob/master/mnist/main.py\n",
"class CNN(nn.Module):\n",
" \n",
" def __init__(self):\n",
" super(CNN, self).__init__()\n",
" self.conv1 = nn.Conv2d(1, 32, 3, 1)\n",
" self.conv2 = nn.Conv2d(32, 64, 3, 1)\n",
" self.dropout1 = nn.Dropout(0.25)\n",
" self.dropout2 = nn.Dropout(0.5)\n",
" self.fc1 = nn.Linear(9216, 128)\n",
" self.fc2 = nn.Linear(128, 10)\n",
" \n",
" def forward(self, x):\n",
" # convolutional and pooling layers\n",
" h = self.conv1(x)\n",
" h = F.relu(h)\n",
" h = self.conv2(h)\n",
" h = F.relu(h)\n",
" h = F.max_pool2d(h, 2)\n",
" h = self.dropout1(h)\n",
" # flatten the representation and apply FFNN part for the classification\n",
" h = torch.flatten(h, 1)\n",
" h = self.fc1(h)\n",
" h = F.relu(h)\n",
" h = self.dropout2(h)\n",
" h = self.fc2(h)\n",
" y = F.softmax(h, dim=1)\n",
" return y\n",
"\n",
"# skorch wrapper with fit/predict methods\n",
"def eval_net(net_module, X_train, y_train, X_test, y_test, max_epochs=1):\n",
" print(\"###\", net_module.__name__)\n",
" net = NeuralNetClassifier(\n",
" net_module,\n",
" batch_size=32,\n",
" optimizer=torch.optim.Adadelta,\n",
" lr=1.,\n",
" device=\"cuda\" if torch.cuda.is_available() else \"cpu\",\n",
" max_epochs=max_epochs,\n",
" callbacks=[\n",
" ('tr_acc', EpochScoring(\n",
" 'accuracy',\n",
" lower_is_better=False,\n",
" on_train=True,\n",
" name='train_acc',\n",
" )),\n",
" ],\n",
" )\n",
" net.fit(X_train, y_train)\n",
" # evaluate on test set\n",
" y_pred = net.predict(X_test)\n",
" print('Test accuracy:', accuracy_score(y_test, y_pred), \"\\n\")\n",
" return net"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test on MNIST dataset\n",
"\n",
"As you see below, the simple logistic regression classifier is already very good on this easy task, with a test accuracy of over 93.5%.\n",
"\n",
"The linear FFNN has almost the same accuracy (90.5%) as the LogReg model (please note: the NNs were only trained for a single epoch!) and the multi-layer FFNN is already better than the LogReg model (96.4%), while the CNN beats them all (98.2%), which is expected since this architecture is designed for the image classification task."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:07:16.662634Z",
"start_time": "2020-11-22T19:06:31.190048Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### LogReg\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/franzi/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test accuracy: 0.93535\n",
"### LinNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.8914\u001b[0m \u001b[32m0.4044\u001b[0m \u001b[35m0.9052\u001b[0m \u001b[31m0.3264\u001b[0m 2.9920\n",
"Test accuracy: 0.9051833333333333\n",
"### FFNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.9205\u001b[0m \u001b[32m0.2589\u001b[0m \u001b[35m0.9600\u001b[0m \u001b[31m0.1438\u001b[0m 5.8516\n",
"Test accuracy: 0.9642166666666667\n",
"### CNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.9311\u001b[0m \u001b[32m0.2313\u001b[0m \u001b[35m0.9797\u001b[0m \u001b[31m0.0744\u001b[0m 8.4549\n",
"Test accuracy: 0.9821833333333333\n"
]
}
],
"source": [
"# get regular MNIST dataset\n",
"X_train, X_test, y_train, y_test = load_data()\n",
"# compare sklearn LogReg classifier\n",
"print(\"### LogReg\")\n",
"clf = LogisticRegression(class_weight='balanced', random_state=1, fit_intercept=True)\n",
"clf.fit(X_train, y_train)\n",
"print('Test accuracy:', clf.score(X_test, y_test), \"\\n\")\n",
"# and our different NN architectures\n",
"for net_module in [LinNN, FFNN, CNN]:\n",
" if net_module == CNN:\n",
" # the CNN operates on the 28x28 pixel images directly\n",
" net = eval_net(net_module, X_train.reshape(-1, 1, 28, 28), y_train, X_test.reshape(-1, 1, 28, 28), y_test)\n",
" else:\n",
" # the FFNNs get the flattened vectors\n",
" net = eval_net(net_module, X_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Test on FashionMNIST\n",
"\n",
"On the more difficult FMNIST task, the LogReg model has a much lower accuracy of 86.6% compared to the 93.5% achieved on the original MNIST dataset. When trained for only a single epoch, both the linear and multi-layer FFNNs have a lower accuracy than the LogReg model (82.7 and 83.7% respectively) and only the CNN does a bit better (88.6%). "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:07:58.876986Z",
"start_time": "2020-11-22T19:07:16.665145Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### LogReg\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/franzi/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/_logistic.py:940: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
"\n",
"Increase the number of iterations (max_iter) or scale the data as shown in:\n",
" https://scikit-learn.org/stable/modules/preprocessing.html\n",
"Please also refer to the documentation for alternative solver options:\n",
" https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
" extra_warning_msg=_LOGISTIC_SOLVER_CONVERGENCE_MSG)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Test accuracy: 0.8659833333333333\n",
"### LinNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.8022\u001b[0m \u001b[32m0.5794\u001b[0m \u001b[35m0.8257\u001b[0m \u001b[31m0.5023\u001b[0m 2.7844\n",
"Test accuracy: 0.8270166666666666\n",
"### FFNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.7816\u001b[0m \u001b[32m0.5942\u001b[0m \u001b[35m0.8366\u001b[0m \u001b[31m0.4465\u001b[0m 5.6541\n",
"Test accuracy: 0.8375666666666667\n",
"### CNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.8063\u001b[0m \u001b[32m0.5415\u001b[0m \u001b[35m0.8842\u001b[0m \u001b[31m0.3228\u001b[0m 8.7526\n",
"Test accuracy: 0.8861166666666667\n"
]
}
],
"source": [
"X_train, X_test, y_train, y_test = load_data(True)\n",
"# regular sklearn LogReg classifier\n",
"print(\"### LogReg\")\n",
"clf = LogisticRegression(class_weight='balanced', random_state=1, fit_intercept=True)\n",
"clf.fit(X_train, y_train)\n",
"print('Test accuracy:', clf.score(X_test, y_test), \"\\n\")\n",
"# our different NN\n",
"for net_module in [LinNN, FFNN, CNN]:\n",
" if net_module == CNN:\n",
" net = eval_net(net_module, X_train.reshape(-1, 1, 28, 28), y_train, X_test.reshape(-1, 1, 28, 28), y_test)\n",
" else:\n",
" net = eval_net(net_module, X_train, y_train, X_test, y_test)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"However, when trained for more epochs, the performance of all models improves, with the accuracy of the linear FFNN now being very close to that of the LogReg model (85.8%), while the multi-layer FFNN is better (89.3%) and the CNN can now solve the task quite well with an accuracy of 94.6%.\n",
"\n",
"(See how the training and validation loss decrease over time - observing how these metrics develop can help you judge whether you've set your learning rate correctly and for how many epochs you should train the network.)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"ExecuteTime": {
"end_time": "2020-11-22T19:12:29.545463Z",
"start_time": "2020-11-22T19:07:58.880264Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"### LinNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.8015\u001b[0m \u001b[32m0.5794\u001b[0m \u001b[35m0.8258\u001b[0m \u001b[31m0.5047\u001b[0m 2.9007\n",
" 2 \u001b[36m0.8383\u001b[0m \u001b[32m0.4715\u001b[0m \u001b[35m0.8363\u001b[0m \u001b[31m0.4753\u001b[0m 3.1765\n",
" 3 \u001b[36m0.8461\u001b[0m \u001b[32m0.4520\u001b[0m \u001b[35m0.8401\u001b[0m \u001b[31m0.4635\u001b[0m 3.3053\n",
" 4 \u001b[36m0.8497\u001b[0m \u001b[32m0.4415\u001b[0m \u001b[35m0.8426\u001b[0m \u001b[31m0.4576\u001b[0m 2.8845\n",
" 5 \u001b[36m0.8523\u001b[0m \u001b[32m0.4347\u001b[0m \u001b[35m0.8449\u001b[0m \u001b[31m0.4544\u001b[0m 3.1842\n",
" 6 \u001b[36m0.8542\u001b[0m \u001b[32m0.4297\u001b[0m \u001b[35m0.8460\u001b[0m \u001b[31m0.4527\u001b[0m 2.9838\n",
" 7 \u001b[36m0.8555\u001b[0m \u001b[32m0.4258\u001b[0m 0.8456 \u001b[31m0.4518\u001b[0m 3.0838\n",
" 8 \u001b[36m0.8569\u001b[0m \u001b[32m0.4227\u001b[0m 0.8456 \u001b[31m0.4515\u001b[0m 3.0435\n",
" 9 \u001b[36m0.8580\u001b[0m \u001b[32m0.4201\u001b[0m 0.8459 0.4515 3.2644\n",
" 10 \u001b[36m0.8592\u001b[0m \u001b[32m0.4179\u001b[0m \u001b[35m0.8462\u001b[0m 0.4517 3.0675\n",
" 11 \u001b[36m0.8602\u001b[0m \u001b[32m0.4159\u001b[0m \u001b[35m0.8464\u001b[0m 0.4521 3.1939\n",
" 12 \u001b[36m0.8610\u001b[0m \u001b[32m0.4143\u001b[0m \u001b[35m0.8468\u001b[0m 0.4526 3.0209\n",
" 13 \u001b[36m0.8619\u001b[0m \u001b[32m0.4128\u001b[0m 0.8462 0.4532 3.0769\n",
" 14 \u001b[36m0.8627\u001b[0m \u001b[32m0.4115\u001b[0m 0.8462 0.4538 3.1085\n",
" 15 \u001b[36m0.8635\u001b[0m \u001b[32m0.4103\u001b[0m 0.8465 0.4544 2.8896\n",
"Test accuracy: 0.8585333333333334\n",
"### FFNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.7825\u001b[0m \u001b[32m0.5919\u001b[0m \u001b[35m0.8488\u001b[0m \u001b[31m0.4411\u001b[0m 5.6527\n",
" 2 \u001b[36m0.8400\u001b[0m \u001b[32m0.4556\u001b[0m \u001b[35m0.8496\u001b[0m \u001b[31m0.4000\u001b[0m 6.2455\n",
" 3 \u001b[36m0.8499\u001b[0m \u001b[32m0.4279\u001b[0m \u001b[35m0.8550\u001b[0m 0.4068 5.3284\n",
" 4 \u001b[36m0.8554\u001b[0m \u001b[32m0.4144\u001b[0m \u001b[35m0.8609\u001b[0m 0.4078 5.9324\n",
" 5 \u001b[36m0.8595\u001b[0m \u001b[32m0.4088\u001b[0m \u001b[35m0.8679\u001b[0m \u001b[31m0.3969\u001b[0m 5.5903\n",
" 6 \u001b[36m0.8605\u001b[0m \u001b[32m0.4040\u001b[0m \u001b[35m0.8687\u001b[0m 0.4187 5.7188\n",
" 7 \u001b[36m0.8645\u001b[0m \u001b[32m0.4004\u001b[0m 0.8608 0.4450 5.9384\n",
" 8 \u001b[36m0.8657\u001b[0m \u001b[32m0.3949\u001b[0m 0.8673 \u001b[31m0.3921\u001b[0m 5.8410\n",
" 9 0.8649 \u001b[32m0.3934\u001b[0m \u001b[35m0.8748\u001b[0m 0.3986 6.0358\n",
" 10 \u001b[36m0.8702\u001b[0m \u001b[32m0.3902\u001b[0m 0.8698 0.4123 5.6180\n",
" 11 \u001b[36m0.8709\u001b[0m \u001b[32m0.3887\u001b[0m \u001b[35m0.8762\u001b[0m 0.3928 5.8379\n",
" 12 \u001b[36m0.8721\u001b[0m \u001b[32m0.3871\u001b[0m 0.8751 0.3933 5.9377\n",
" 13 \u001b[36m0.8734\u001b[0m \u001b[32m0.3826\u001b[0m \u001b[35m0.8778\u001b[0m 0.4058 5.4589\n",
" 14 0.8734 \u001b[32m0.3775\u001b[0m \u001b[35m0.8798\u001b[0m 0.3961 5.5648\n",
" 15 \u001b[36m0.8745\u001b[0m 0.3825 0.8788 0.3984 5.7210\n",
"Test accuracy: 0.8931333333333333\n",
"### CNN\n",
" epoch train_acc train_loss valid_acc valid_loss dur\n",
"------- ----------- ------------ ----------- ------------ ------\n",
" 1 \u001b[36m0.8104\u001b[0m \u001b[32m0.5317\u001b[0m \u001b[35m0.8889\u001b[0m \u001b[31m0.3057\u001b[0m 7.9180\n",
" 2 \u001b[36m0.8770\u001b[0m \u001b[32m0.3548\u001b[0m \u001b[35m0.8979\u001b[0m \u001b[31m0.2899\u001b[0m 8.4600\n",
" 3 \u001b[36m0.8930\u001b[0m \u001b[32m0.3103\u001b[0m \u001b[35m0.9120\u001b[0m \u001b[31m0.2476\u001b[0m 8.7328\n",
" 4 \u001b[36m0.9010\u001b[0m \u001b[32m0.2884\u001b[0m \u001b[35m0.9121\u001b[0m 0.2549 8.7084\n",
" 5 \u001b[36m0.9061\u001b[0m \u001b[32m0.2731\u001b[0m \u001b[35m0.9153\u001b[0m \u001b[31m0.2409\u001b[0m 8.2132\n",
" 6 \u001b[36m0.9121\u001b[0m \u001b[32m0.2610\u001b[0m 0.9117 0.2664 8.4597\n",
" 7 \u001b[36m0.9154\u001b[0m \u001b[32m0.2504\u001b[0m \u001b[35m0.9179\u001b[0m \u001b[31m0.2391\u001b[0m 8.5581\n",
" 8 \u001b[36m0.9185\u001b[0m \u001b[32m0.2428\u001b[0m 0.9143 0.2494 8.1528\n",
" 9 \u001b[36m0.9208\u001b[0m \u001b[32m0.2348\u001b[0m \u001b[35m0.9184\u001b[0m 0.2415 8.8257\n",
" 10 \u001b[36m0.9234\u001b[0m \u001b[32m0.2317\u001b[0m 0.9182 0.2498 8.1160\n",
" 11 \u001b[36m0.9243\u001b[0m \u001b[32m0.2288\u001b[0m 0.9172 0.2472 7.9041\n",
" 12 0.9232 0.2306 0.9183 0.2617 8.7895\n",
" 13 \u001b[36m0.9260\u001b[0m \u001b[32m0.2252\u001b[0m 0.9136 0.2523 8.5135\n",
" 14 \u001b[36m0.9290\u001b[0m \u001b[32m0.2194\u001b[0m 0.9146 0.2508 8.8612\n",
" 15 0.9282 \u001b[32m0.2182\u001b[0m 0.9184 0.2503 7.9617\n",
"Test accuracy: 0.9464833333333333\n"
]
}
],
"source": [
"# train with more epochs\n",
"for net_module in [LinNN, FFNN, CNN]:\n",
" if net_module == CNN:\n",
" net = eval_net(net_module, X_train.reshape(-1, 1, 28, 28), y_train, X_test.reshape(-1, 1, 28, 28), y_test, max_epochs=15)\n",
" else:\n",
" net = eval_net(net_module, X_train, y_train, X_test, y_test, max_epochs=15)"
]
},
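{
"cell_type": "markdown",
"metadata": {},
"source": [
"As promised above, we can plot the loss curves: skorch records the metrics printed during training in `net.history`. The sketch below (not part of the original exercise) plots the curves of the last trained network, i.e., the CNN, assuming the training cell above was executed."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# extract the per-epoch losses recorded by skorch and plot them\n",
"plt.figure()\n",
"plt.plot(net.history[:, 'train_loss'], label='training loss')\n",
"plt.plot(net.history[:, 'valid_loss'], label='validation loss')\n",
"plt.xlabel('epoch')\n",
"plt.ylabel('loss')\n",
"plt.legend();"
]
},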
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}