diff --git a/11_training_deep_neural_networks.ipynb b/11_training_deep_neural_networks.ipynb
index d27daff..267947a 100644
--- a/11_training_deep_neural_networks.ipynb
+++ b/11_training_deep_neural_networks.ipynb
@@ -4,14 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Chapter 10 – Training Deep Neural Networks**"
+ "**Chapter 11 – Training Deep Neural Networks**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "_This notebook contains all the sample code and solutions to the exercises in chapter 10._"
+ "_This notebook contains all the sample code and solutions to the exercises in chapter 11._"
]
},
{
@@ -20,17 +20,19 @@
"source": [
"
\n",
" \n",
- " \n",
+ " \n",
" | \n",
" \n",
- " \n",
+ " \n",
" | \n",
"
"
]
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"source": [
"# Setup"
]
@@ -39,7 +41,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
+ "This project requires Python 3.8 or above:"
]
},
{
@@ -48,38 +50,84 @@
"metadata": {},
"outputs": [],
"source": [
- "# Python ≥3.8 is required\n",
"import sys\n",
- "assert sys.version_info >= (3, 8)\n",
"\n",
- "# Common imports\n",
- "import numpy as np\n",
+ "assert sys.version_info >= (3, 8)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "It also requires Scikit-Learn ≥ 1.0.1:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import sklearn\n",
+ "\n",
+ "assert sklearn.__version__ >= \"1.0.1\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And TensorFlow ≥ 2.6:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf\n",
+ "\n",
+ "assert tf.__version__ >= \"2.6.0\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As we did in previous chapters, let's define the default font sizes to make the figures prettier:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "plt.rc('font', size=14)\n",
+ "plt.rc('axes', labelsize=14, titlesize=14)\n",
+ "plt.rc('legend', fontsize=14)\n",
+ "plt.rc('xtick', labelsize=10)\n",
+ "plt.rc('ytick', labelsize=10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And let's create the `images/deep` folder (if it doesn't already exist), and define the `save_fig()` function which is used through this notebook to save the figures in high-res for the book:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"from pathlib import Path\n",
"\n",
- "# Scikit-Learn ≥1.0 is required\n",
- "import sklearn\n",
- "assert sklearn.__version__ >= \"1.0\"\n",
- "\n",
- "# TensorFlow ≥2.6 is required\n",
- "import tensorflow as tf\n",
- "assert tf.__version__ >= \"2.6\"\n",
- "\n",
- "# Load the Jupyter extension for TensorBoard\n",
- "%load_ext tensorboard\n",
- "\n",
- "# to make this notebook's output stable across runs\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
- "\n",
- "# To plot pretty figures\n",
- "%matplotlib inline\n",
- "import matplotlib as mpl\n",
- "import matplotlib.pyplot as plt\n",
- "mpl.rc('axes', labelsize=14)\n",
- "mpl.rc('xtick', labelsize=12)\n",
- "mpl.rc('ytick', labelsize=12)\n",
- "\n",
- "# Where to save the figures\n",
"IMAGES_PATH = Path() / \"images\" / \"deep\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
@@ -99,34 +147,36 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "def logit(z):\n",
- " return 1 / (1 + np.exp(-z))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell generates and saves Figure 11–1\n",
+ "\n",
+ "import numpy as np\n",
+ "\n",
+ "def sigmoid(z):\n",
+ " return 1 / (1 + np.exp(-z))\n",
+ "\n",
"z = np.linspace(-5, 5, 200)\n",
"\n",
"plt.plot([-5, 5], [0, 0], 'k-')\n",
"plt.plot([-5, 5], [1, 1], 'k--')\n",
"plt.plot([0, 0], [-0.2, 1.2], 'k-')\n",
"plt.plot([-5, 5], [-3/4, 7/4], 'g--')\n",
- "plt.plot(z, logit(z), \"b-\", linewidth=2)\n",
+ "plt.plot(z, sigmoid(z), \"b-\", linewidth=2,\n",
+ " label=r\"$\\sigma(z) = \\dfrac{1}{1+e^{-z}}$\")\n",
"props = dict(facecolor='black', shrink=0.1)\n",
- "plt.annotate('Saturating', xytext=(3.5, 0.7), xy=(5, 1), arrowprops=props, fontsize=14, ha=\"center\")\n",
- "plt.annotate('Saturating', xytext=(-3.5, 0.3), xy=(-5, 0), arrowprops=props, fontsize=14, ha=\"center\")\n",
- "plt.annotate('Linear', xytext=(2, 0.2), xy=(0, 0.5), arrowprops=props, fontsize=14, ha=\"center\")\n",
+ "plt.annotate('Saturating', xytext=(3.5, 0.7), xy=(5, 1), arrowprops=props,\n",
+ " fontsize=14, ha=\"center\")\n",
+ "plt.annotate('Saturating', xytext=(-3.5, 0.3), xy=(-5, 0), arrowprops=props,\n",
+ " fontsize=14, ha=\"center\")\n",
+ "plt.annotate('Linear', xytext=(2, 0.2), xy=(0, 0.5), arrowprops=props,\n",
+ " fontsize=14, ha=\"center\")\n",
"plt.grid(True)\n",
- "plt.title(\"Sigmoid activation function\", fontsize=14)\n",
"plt.axis([-5, 5, -0.2, 1.2])\n",
+ "plt.xlabel(\"$z$\")\n",
+ "plt.legend(loc=\"upper left\", fontsize=16)\n",
"\n",
"save_fig(\"sigmoid_saturation_plot\")\n",
"plt.show()"
@@ -141,31 +191,24 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
- "[name for name in dir(tf.keras.initializers) if not name.startswith(\"_\")]"
+ "dense = tf.keras.layers.Dense(50, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\")"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.layers.Dense(10, activation=\"relu\", kernel_initializer=\"he_normal\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [],
- "source": [
- "init = tf.keras.initializers.VarianceScaling(scale=2., mode='fan_avg',\n",
- " distribution='uniform')\n",
- "tf.keras.layers.Dense(10, activation=\"relu\", kernel_initializer=init)"
+ "he_avg_init = tf.keras.initializers.VarianceScaling(scale=2., mode=\"fan_avg\",\n",
+ " distribution=\"uniform\")\n",
+ "dense = tf.keras.layers.Dense(50, activation=\"sigmoid\",\n",
+ " kernel_initializer=he_avg_init)"
]
},
{
@@ -182,42 +225,32 @@
"### Leaky ReLU"
]
},
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "def leaky_relu(z, alpha=0.01):\n",
- " return np.maximum(alpha*z, z)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.plot(z, leaky_relu(z, 0.05), \"b-\", linewidth=2)\n",
- "plt.plot([-5, 5], [0, 0], 'k-')\n",
- "plt.plot([0, 0], [-0.5, 4.2], 'k-')\n",
- "plt.grid(True)\n",
- "props = dict(facecolor='black', shrink=0.1)\n",
- "plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.2), arrowprops=props, fontsize=14, ha=\"center\")\n",
- "plt.title(\"Leaky ReLU activation function\", fontsize=14)\n",
- "plt.axis([-5, 5, -0.5, 4.2])\n",
- "\n",
- "save_fig(\"leaky_relu_plot\")\n",
- "plt.show()"
- ]
- },
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
- "[m for m in dir(tf.keras.activations) if not m.startswith(\"_\")]"
+ "# extra code – this cell generates and saves Figure 11–2\n",
+ "\n",
+ "def leaky_relu(z, alpha):\n",
+ " return np.maximum(alpha * z, z)\n",
+ "\n",
+ "z = np.linspace(-5, 5, 200)\n",
+ "plt.plot(z, leaky_relu(z, 0.1), \"b-\", linewidth=2, label=r\"$LeakyReLU(z) = max(\\alpha z, z)$\")\n",
+ "plt.plot([-5, 5], [0, 0], 'k-')\n",
+ "plt.plot([0, 0], [-1, 3.7], 'k-')\n",
+ "plt.grid(True)\n",
+ "props = dict(facecolor='black', shrink=0.1)\n",
+ "plt.annotate('Leak', xytext=(-3.5, 0.5), xy=(-5, -0.3), arrowprops=props,\n",
+ " fontsize=14, ha=\"center\")\n",
+ "plt.xlabel(\"$z$\")\n",
+ "plt.axis([-5, 5, -1, 3.7])\n",
+ "plt.gca().set_aspect(\"equal\")\n",
+ "plt.legend()\n",
+ "\n",
+ "save_fig(\"leaky_relu_plot\")\n",
+ "plt.show()"
]
},
{
@@ -226,14 +259,9 @@
"metadata": {},
"outputs": [],
"source": [
- "[m for m in dir(tf.keras.layers) if \"relu\" in m.lower()]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's train a neural network on Fashion MNIST using the Leaky ReLU:"
+ "leaky_relu = tf.keras.layers.LeakyReLU(alpha=0.2) # defaults to alpha=0.3\n",
+ "dense = tf.keras.layers.Dense(50, activation=leaky_relu,\n",
+ " kernel_initializer=\"he_normal\")"
]
},
{
@@ -242,102 +270,14 @@
"metadata": {},
"outputs": [],
"source": [
- "(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\n",
- "X_train_full = X_train_full / 255.0\n",
- "X_test = X_test / 255.0\n",
- "X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n",
- "y_valid, y_train = y_train_full[:5000], y_train_full[5000:]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, kernel_initializer=\"he_normal\"),\n",
- " tf.keras.layers.LeakyReLU(),\n",
- " tf.keras.layers.Dense(100, kernel_initializer=\"he_normal\"),\n",
- " tf.keras.layers.LeakyReLU(),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
+ "model = tf.keras.models.Sequential([\n",
+ " # [...] # more layers\n",
+ " tf.keras.layers.Dense(50, kernel_initializer=\"he_normal\"), # no activation\n",
+ " tf.keras.layers.LeakyReLU(alpha=0.2), # activation as a separate layer\n",
+ " # [...] # more layers\n",
"])"
]
},
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, epochs=10,\n",
- " validation_data=(X_valid, y_valid))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now let's try PReLU:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, kernel_initializer=\"he_normal\"),\n",
- " tf.keras.layers.PReLU(),\n",
- " tf.keras.layers.Dense(100, kernel_initializer=\"he_normal\"),\n",
- " tf.keras.layers.PReLU(),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, epochs=10,\n",
- " validation_data=(X_valid, y_valid))"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -346,52 +286,27 @@
]
},
{
- "cell_type": "code",
- "execution_count": 18,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "def elu(z, alpha=1):\n",
- " return np.where(z < 0, alpha * (np.exp(z) - 1), z)"
+ "Implementing ELU in TensorFlow is trivial, just specify the activation function when building each layer, and use He initialization:"
]
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
- "plt.plot(z, elu(z), \"b-\", linewidth=2)\n",
- "plt.plot([-5, 5], [0, 0], 'k-')\n",
- "plt.plot([-5, 5], [-1, -1], 'k--')\n",
- "plt.plot([0, 0], [-2.2, 3.2], 'k-')\n",
- "plt.grid(True)\n",
- "plt.title(r\"ELU activation function ($\\alpha=1$)\", fontsize=14)\n",
- "plt.axis([-5, 5, -2.2, 3.2])\n",
- "\n",
- "save_fig(\"elu_plot\")\n",
- "plt.show()"
+ "dense = tf.keras.layers.Dense(50, activation=\"elu\",\n",
+ " kernel_initializer=\"he_normal\")"
]
},
{
"cell_type": "markdown",
- "metadata": {},
- "source": [
- "Implementing ELU in TensorFlow is trivial, just specify the activation function when building each layer:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.layers.Dense(10, activation=\"elu\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"source": [
"### SELU"
]
@@ -400,48 +315,54 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "This activation function was proposed in this [great paper](https://arxiv.org/pdf/1706.02515.pdf) by Günter Klambauer, Thomas Unterthiner and Andreas Mayr, published in June 2017. During training, a neural network composed exclusively of a stack of dense layers using the SELU activation function and LeCun initialization will self-normalize: the output of each layer will tend to preserve the same mean and variance during training, which solves the vanishing/exploding gradients problem. As a result, this activation function outperforms the other activation functions very significantly for such neural nets, so you should really try it out. Unfortunately, the self-normalizing property of the SELU activation function is easily broken: you cannot use ℓ1 or ℓ2 regularization, regular dropout, max-norm, skip connections or other non-sequential topologies (so recurrent neural networks won't self-normalize). However, in practice it works quite well with sequential CNNs. If you break self-normalization, SELU will not necessarily outperform other activation functions."
+ "By default, the SELU hyperparameters (`scale` and `alpha`) are tuned in such a way that the mean output of each neuron remains close to 0, and the standard deviation remains close to 1 (assuming the inputs are standardized with mean 0 and standard deviation 1 too, and other constraints are respected, as explained in the book). Using this activation function, even a 1,000 layer deep neural network preserves roughly mean 0 and standard deviation 1 across all layers, avoiding the exploding/vanishing gradients problem:"
]
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell generates and saves Figure 11–3\n",
+ "\n",
"from scipy.special import erfc\n",
"\n",
"# alpha and scale to self normalize with mean 0 and standard deviation 1\n",
"# (see equation 14 in the paper):\n",
- "alpha_0_1 = -np.sqrt(2 / np.pi) / (erfc(1/np.sqrt(2)) * np.exp(1/2) - 1)\n",
- "scale_0_1 = (1 - erfc(1 / np.sqrt(2)) * np.sqrt(np.e)) * np.sqrt(2 * np.pi) * (2 * erfc(np.sqrt(2))*np.e**2 + np.pi*erfc(1/np.sqrt(2))**2*np.e - 2*(2+np.pi)*erfc(1/np.sqrt(2))*np.sqrt(np.e)+np.pi+2)**(-1/2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
+ "alpha_0_1 = -np.sqrt(2 / np.pi) / (erfc(1 / np.sqrt(2)) * np.exp(1 / 2) - 1)\n",
+ "scale_0_1 = (\n",
+ " (1 - erfc(1 / np.sqrt(2)) * np.sqrt(np.e))\n",
+ " * np.sqrt(2 * np.pi)\n",
+ " * (\n",
+ " 2 * erfc(np.sqrt(2)) * np.e ** 2\n",
+ " + np.pi * erfc(1 / np.sqrt(2)) ** 2 * np.e\n",
+ " - 2 * (2 + np.pi) * erfc(1 / np.sqrt(2)) * np.sqrt(np.e)\n",
+ " + np.pi\n",
+ " + 2\n",
+ " ) ** (-1 / 2)\n",
+ ")\n",
+ "\n",
+ "def elu(z, alpha=1):\n",
+ " return np.where(z < 0, alpha * (np.exp(z) - 1), z)\n",
+ "\n",
"def selu(z, scale=scale_0_1, alpha=alpha_0_1):\n",
- " return scale * elu(z, alpha)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.plot(z, selu(z), \"b-\", linewidth=2)\n",
+ " return scale * elu(z, alpha)\n",
+ "\n",
+ "z = np.linspace(-5, 5, 200)\n",
+ "plt.plot(z, elu(z), \"b-\", linewidth=2, label=r\"ELU$_\\alpha(z) = \\alpha (e^z - 1)$ if $z < 0$, else $z$\")\n",
+ "plt.plot(z, selu(z), \"r--\", linewidth=2, label=r\"SELU$(z) = 1.05 \\, $ELU$_{1.67}(z)$\")\n",
"plt.plot([-5, 5], [0, 0], 'k-')\n",
- "plt.plot([-5, 5], [-1.758, -1.758], 'k--')\n",
+ "plt.plot([-5, 5], [-1, -1], 'k:', linewidth=2)\n",
+ "plt.plot([-5, 5], [-1.758, -1.758], 'k:', linewidth=2)\n",
"plt.plot([0, 0], [-2.2, 3.2], 'k-')\n",
"plt.grid(True)\n",
- "plt.title(\"SELU activation function\", fontsize=14)\n",
"plt.axis([-5, 5, -2.2, 3.2])\n",
+ "plt.xlabel(\"$z$\")\n",
+ "plt.gca().set_aspect(\"equal\")\n",
+ "plt.legend()\n",
"\n",
- "save_fig(\"selu_plot\")\n",
+ "save_fig(\"elu_selu_plot\")\n",
"plt.show()"
]
},
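+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: a rough numerical check of the claim above.\n",
+ "# With standardized inputs and LeCun-initialized weights, a stack of SELU\n",
+ "# layers keeps the activations' mean near 0 and standard deviation near 1,\n",
+ "# even after 1,000 layers (this reuses selu() from the previous cell).\n",
+ "np.random.seed(42)\n",
+ "Z = np.random.normal(size=(500, 100))  # standardized inputs\n",
+ "for layer in range(1000):\n",
+ "    W = np.random.normal(size=(100, 100), scale=np.sqrt(1 / 100))  # LeCun init\n",
+ "    Z = selu(Z @ W)\n",
+ "    if layer % 100 == 0:\n",
+ "        print(f\"Layer {layer}: mean {Z.mean():.2f}, std {Z.std():.2f}\")"
+ ]
+ },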
@@ -449,84 +370,51 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "By default, the SELU hyperparameters (`scale` and `alpha`) are tuned in such a way that the mean output of each neuron remains close to 0, and the standard deviation remains close to 1 (assuming the inputs are standardized with mean 0 and standard deviation 1 too). Using this activation function, even a 1,000 layer deep neural network preserves roughly mean 0 and standard deviation 1 across all layers, avoiding the exploding/vanishing gradients problem:"
+ "Using SELU is straightforward:"
]
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
- "np.random.seed(42)\n",
- "Z = np.random.normal(size=(500, 100)) # standardized inputs\n",
- "for layer in range(1000):\n",
- " W = np.random.normal(size=(100, 100), scale=np.sqrt(1 / 100)) # LeCun initialization\n",
- " Z = selu(np.dot(Z, W))\n",
- " means = np.mean(Z, axis=0).mean()\n",
- " stds = np.std(Z, axis=0).mean()\n",
- " if layer % 100 == 0:\n",
- " print(\"Layer {}: mean {:.2f}, std deviation {:.2f}\".format(layer, means, stds))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Using SELU is easy:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.layers.Dense(10, activation=\"selu\",\n",
- " kernel_initializer=\"lecun_normal\")"
+ "dense = tf.keras.layers.Dense(50, activation=\"selu\",\n",
+ " kernel_initializer=\"lecun_normal\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
+ "**Extra material – an example of a self-regularized network using SELU**\n",
+ "\n",
"Let's create a neural net for Fashion MNIST with 100 hidden layers, using the SELU activation function:"
]
},
{
"cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
+ "execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))\n",
- "model.add(tf.keras.layers.Dense(300, activation=\"selu\",\n",
- " kernel_initializer=\"lecun_normal\"))\n",
- "for layer in range(99):\n",
+ "for layer in range(100):\n",
" model.add(tf.keras.layers.Dense(100, activation=\"selu\",\n",
- " kernel_initializer=\"lecun_normal\"))\n",
+ " kernel_initializer=\"lecun_normal\"))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))"
]
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),\n",
" metrics=[\"accuracy\"])"
]
},
@@ -539,7 +427,30 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fashion_mnist = tf.keras.datasets.fashion_mnist.load_data()\n",
+ "(X_train_full, y_train_full), (X_test, y_test) = fashion_mnist\n",
+ "X_train, y_train = X_train_full[:-5000], y_train_full[:-5000]\n",
+ "X_valid, y_valid = X_train_full[-5000:], y_train_full[-5000:]\n",
+ "X_train, X_valid, X_test = X_train / 255, X_valid / 255, X_test / 255"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class_names = [\"T-shirt/top\", \"Trouser\", \"Pullover\", \"Dress\", \"Coat\",\n",
+ " \"Sandal\", \"Shirt\", \"Sneaker\", \"Bag\", \"Ankle boot\"]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -552,7 +463,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -564,47 +475,46 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now look at what happens if we try to use the ReLU activation function instead:"
+ "The network managed to learn, despite how deep it is. Now look at what happens if we try to use the ReLU activation function instead:"
]
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[28, 28]))\n",
- "model.add(tf.keras.layers.Dense(300, activation=\"relu\", kernel_initializer=\"he_normal\"))\n",
- "for layer in range(99):\n",
- " model.add(tf.keras.layers.Dense(100, activation=\"relu\", kernel_initializer=\"he_normal\"))\n",
+ "for layer in range(100):\n",
+ " model.add(tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))"
]
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),\n",
" metrics=[\"accuracy\"])"
]
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -619,6 +529,56 @@
"Not great at all, we suffered from the vanishing/exploding gradients problem."
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### GELU, Swish and Mish"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – this cell generates and saves Figure 11–4\n",
+ "\n",
+ "def swish(z, beta=1):\n",
+ " return z * sigmoid(beta * z)\n",
+ "\n",
+ "def approx_gelu(z):\n",
+ " return swish(z, beta=1.702)\n",
+ "\n",
+ "def softplus(z):\n",
+ " return np.log(1 + np.exp(z))\n",
+ "\n",
+ "def mish(z):\n",
+ " return z * np.tanh(softplus(z))\n",
+ "\n",
+ "z = np.linspace(-4, 2, 200)\n",
+ "\n",
+ "beta = 0.6\n",
+ "plt.plot(z, approx_gelu(z), \"b-\", linewidth=2,\n",
+ " label=r\"GELU$(z) = z\\,\\Phi(z)$\")\n",
+ "plt.plot(z, swish(z), \"r--\", linewidth=2,\n",
+ " label=r\"Swish$(z) = z\\,\\sigma(z)$\")\n",
+ "plt.plot(z, swish(z, beta), \"r:\", linewidth=2,\n",
+ " label=fr\"Swish$_{{\\beta={beta}}}(z)=z\\,\\sigma({beta}\\,z)$\")\n",
+ "plt.plot(z, mish(z), \"g:\", linewidth=3,\n",
+ " label=fr\"Mish$(z) = z\\,\\tanh($softplus$(z))$\")\n",
+ "plt.plot([-4, 2], [0, 0], 'k-')\n",
+ "plt.plot([0, 0], [-2.2, 3.2], 'k-')\n",
+ "plt.grid(True)\n",
+ "plt.axis([-4, 2, -1, 2])\n",
+ "plt.gca().set_aspect(\"equal\")\n",
+ "plt.xlabel(\"$z$\")\n",
+ "plt.legend(loc=\"upper left\")\n",
+ "\n",
+ "save_fig(\"gelu_swish_mish_plot\")\n",
+ "plt.show()"
+ ]
+ },
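+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: in recent versions of Keras (TF ≥ 2.4), GELU\n",
+ "# and Swish are available as built-in activation strings, so they can be used\n",
+ "# just like \"relu\". Mish is not built into Keras; tensorflow_addons provides\n",
+ "# it as tfa.activations.mish.\n",
+ "dense_gelu = tf.keras.layers.Dense(50, activation=\"gelu\",\n",
+ "                                   kernel_initializer=\"he_normal\")\n",
+ "dense_swish = tf.keras.layers.Dense(50, activation=\"swish\",\n",
+ "                                    kernel_initializer=\"he_normal\")"
+ ]
+ },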
{
"cell_type": "markdown",
"metadata": {},
@@ -628,16 +588,29 @@
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code - clear the name counters and set the random seed\n",
+ "tf.keras.backend.clear_session()\n",
+ "tf.random.set_seed(42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
" tf.keras.layers.BatchNormalization(),\n",
- " tf.keras.layers.Dense(300, activation=\"relu\"),\n",
+ " tf.keras.layers.Dense(300, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" tf.keras.layers.BatchNormalization(),\n",
- " tf.keras.layers.Dense(100, activation=\"relu\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" tf.keras.layers.BatchNormalization(),\n",
" tf.keras.layers.Dense(10, activation=\"softmax\")\n",
"])"
@@ -645,7 +618,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
@@ -654,42 +627,23 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
- "bn1 = model.layers[1]\n",
- "[(var.name, var.trainable) for var in bn1.variables]"
+ "[(var.name, var.trainable) for var in model.layers[1].variables]"
]
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
- "#bn1.updates #deprecated"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, epochs=10,\n",
- " validation_data=(X_valid, y_valid))"
+ "# extra code – just show that the model works! 😊\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"sgd\",\n",
+ " metrics=\"accuracy\")\n",
+ "model.fit(X_train, y_train, epochs=2, validation_data=(X_valid, y_valid))"
]
},
{
@@ -701,17 +655,27 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code - clear the name counters and set the random seed\n",
+ "tf.keras.backend.clear_session()\n",
+ "tf.random.set_seed(42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.BatchNormalization(),\n",
- " tf.keras.layers.Dense(300, use_bias=False),\n",
+ " tf.keras.layers.Dense(300, kernel_initializer=\"he_normal\", use_bias=False),\n",
" tf.keras.layers.BatchNormalization(),\n",
" tf.keras.layers.Activation(\"relu\"),\n",
- " tf.keras.layers.Dense(100, use_bias=False),\n",
+ " tf.keras.layers.Dense(100, kernel_initializer=\"he_normal\", use_bias=False),\n",
" tf.keras.layers.BatchNormalization(),\n",
" tf.keras.layers.Activation(\"relu\"),\n",
" tf.keras.layers.Dense(10, activation=\"softmax\")\n",
@@ -720,23 +684,14 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
- "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train, y_train, epochs=10,\n",
- " validation_data=(X_valid, y_valid))"
+ "# extra code – just show that the model works! 😊\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"sgd\",\n",
+ " metrics=\"accuracy\")\n",
+ "model.fit(X_train, y_train, epochs=2, validation_data=(X_valid, y_valid))"
]
},
{
@@ -750,25 +705,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "All tf.keras.optimizers accept `clipnorm` or `clipvalue` arguments:"
+ "All `tf.keras.optimizers` accept `clipnorm` or `clipvalue` arguments:"
]
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)"
+ "optimizer = tf.keras.optimizers.SGD(clipvalue=1.0)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer)"
]
},
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)"
+ "optimizer = tf.keras.optimizers.SGD(clipnorm=1.0)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer)"
]
},
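+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: a quick illustration of the difference.\n",
+ "# clipvalue clips each gradient component independently, which may change the\n",
+ "# gradient's direction, whereas clipnorm rescales the whole gradient only if\n",
+ "# its L2 norm exceeds the threshold, preserving its direction.\n",
+ "grad = tf.constant([0.9, 100.0])\n",
+ "print(tf.clip_by_value(grad, -1.0, 1.0).numpy())  # [0.9, 1.0]\n",
+ "print(tf.clip_by_norm(grad, 1.0).numpy())  # ≈ [0.009, 1.0]"
+ ]
+ },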
{
@@ -790,174 +747,104 @@
"metadata": {},
"source": [
"Let's split the fashion MNIST training set in two:\n",
- "* `X_train_A`: all images of all items except for sandals and shirts (classes 5 and 6).\n",
- "* `X_train_B`: a much smaller training set of just the first 200 images of sandals or shirts.\n",
+ "* `X_train_A`: all images of all items except for T-shirts/tops and pullovers (classes 0 and 2).\n",
+ "* `X_train_B`: a much smaller training set of just the first 200 images of T-shirts/tops and pullovers.\n",
"\n",
"The validation set and the test set are also split this way, but without restricting the number of images.\n",
"\n",
- "We will train a model on set A (classification task with 8 classes), and try to reuse it to tackle set B (binary classification). We hope to transfer a little bit of knowledge from task A to task B, since classes in set A (sneakers, ankle boots, coats, t-shirts, etc.) are somewhat similar to classes in set B (sandals and shirts). However, since we are using `Dense` layers, only patterns that occur at the same location can be reused (in contrast, convolutional layers will transfer much better, since learned patterns can be detected anywhere on the image, as we will see in the chapter 13)."
+ "We will train a model on set A (classification task with 8 classes), and try to reuse it to tackle set B (binary classification). We hope to transfer a little bit of knowledge from task A to task B, since classes in set A (trousers, dresses, coats, sandals, shirts, sneakers, bags, and ankle boots) are somewhat similar to classes in set B (T-shirts/tops and pullovers). However, since we are using `Dense` layers, only patterns that occur at the same location can be reused (in contrast, convolutional layers will transfer much better, since learned patterns can be detected anywhere on the image, as we will see in the chapter 14)."
]
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – split Fashion MNIST into tasks A and B, then train and save\n",
+ "# model A to \"my_model_A\".\n",
+ "\n",
+ "pos_class_id = class_names.index(\"Pullover\")\n",
+ "neg_class_id = class_names.index(\"T-shirt/top\")\n",
+ "\n",
"def split_dataset(X, y):\n",
- " y_5_or_6 = (y == 5) | (y == 6) # sandals or shirts\n",
- " y_A = y[~y_5_or_6]\n",
- " y_A[y_A > 6] -= 2 # class indices 7, 8, 9 should be moved to 5, 6, 7\n",
- " y_B = (y[y_5_or_6] == 6).astype(np.float32) # binary classification task: is it a shirt (class 6)?\n",
- " return ((X[~y_5_or_6], y_A),\n",
- " (X[y_5_or_6], y_B))\n",
+ " y_for_B = (y == pos_class_id) | (y == neg_class_id)\n",
+ " y_A = y[~y_for_B]\n",
+ " y_B = (y[y_for_B] == pos_class_id).astype(np.float32)\n",
+ " old_class_ids = list(set(range(10)) - set([neg_class_id, pos_class_id]))\n",
+ " for old_class_id, new_class_id in zip(old_class_ids, range(8)):\n",
+ " y_A[y_A == old_class_id] = new_class_id # reorder class ids for A\n",
+ " return ((X[~y_for_B], y_A), (X[y_for_B], y_B))\n",
"\n",
"(X_train_A, y_train_A), (X_train_B, y_train_B) = split_dataset(X_train, y_train)\n",
"(X_valid_A, y_valid_A), (X_valid_B, y_valid_B) = split_dataset(X_valid, y_valid)\n",
"(X_test_A, y_test_A), (X_test_B, y_test_B) = split_dataset(X_test, y_test)\n",
"X_train_B = X_train_B[:200]\n",
- "y_train_B = y_train_B[:200]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train_A.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_train_B.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_train_A[:30]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_train_B[:30]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
+ "y_train_B = y_train_B[:200]\n",
+ "\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_A = tf.keras.Sequential()\n",
- "model_A.add(tf.keras.layers.Flatten(input_shape=[28, 28]))\n",
- "for n_hidden in (300, 100, 50, 50, 50):\n",
- " model_A.add(tf.keras.layers.Dense(n_hidden, activation=\"selu\"))\n",
- "model_A.add(tf.keras.layers.Dense(8, activation=\"softmax\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 53,
- "metadata": {},
- "outputs": [],
- "source": [
+ "\n",
+ "model_A = tf.keras.Sequential([\n",
+ " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(8, activation=\"softmax\")\n",
+ "])\n",
+ "\n",
"model_A.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 54,
- "metadata": {},
- "outputs": [],
- "source": [
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),\n",
+ " metrics=[\"accuracy\"])\n",
"history = model_A.fit(X_train_A, y_train_A, epochs=20,\n",
- " validation_data=(X_valid_A, y_valid_A))"
+ " validation_data=(X_valid_A, y_valid_A))\n",
+ "model_A.save(\"my_model_A\")"
]
},
{
"cell_type": "code",
- "execution_count": 55,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_A.save(\"my_model_A.h5\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 56,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_B = tf.keras.Sequential()\n",
- "model_B.add(tf.keras.layers.Flatten(input_shape=[28, 28]))\n",
- "for n_hidden in (300, 100, 50, 50, 50):\n",
- " model_B.add(tf.keras.layers.Dense(n_hidden, activation=\"selu\"))\n",
- "model_B.add(tf.keras.layers.Dense(1, activation=\"sigmoid\"))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 57,
+ "execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – train and evaluate model B, without reusing model A\n",
+ "\n",
+ "tf.random.set_seed(42)\n",
+ "model_B = tf.keras.Sequential([\n",
+ " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(1, activation=\"sigmoid\")\n",
+ "])\n",
+ "\n",
"model_B.compile(loss=\"binary_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 58,
- "metadata": {},
- "outputs": [],
- "source": [
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),\n",
+ " metrics=[\"accuracy\"])\n",
"history = model_B.fit(X_train_B, y_train_B, epochs=20,\n",
- " validation_data=(X_valid_B, y_valid_B))"
+ " validation_data=(X_valid_B, y_valid_B))\n",
+ "model_B.evaluate(X_test_B, y_test_B)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Model B reaches 91.85% accuracy on the test set. Now let's try reusing the pretrained model A."
]
},
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
- "model_B.summary()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_A = tf.keras.models.load_model(\"my_model_A.h5\")\n",
+ "model_A = tf.keras.models.load_model(\"my_model_A\")\n",
"model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])\n",
"model_B_on_A.add(tf.keras.layers.Dense(1, activation=\"sigmoid\"))"
]
@@ -971,33 +858,51 @@
},
{
"cell_type": "code",
- "execution_count": 61,
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensure reproducibility"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"model_A_clone = tf.keras.models.clone_model(model_A)\n",
- "model_A_clone.set_weights(model_A.get_weights())\n",
+ "model_A_clone.set_weights(model_A.get_weights())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – creating model_B_on_A just like in the previous cell\n",
"model_B_on_A = tf.keras.Sequential(model_A_clone.layers[:-1])\n",
"model_B_on_A.add(tf.keras.layers.Dense(1, activation=\"sigmoid\"))"
]
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"for layer in model_B_on_A.layers[:-1]:\n",
" layer.trainable = False\n",
"\n",
- "model_B_on_A.compile(loss=\"binary_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)\n",
+ "model_B_on_A.compile(loss=\"binary_crossentropy\", optimizer=optimizer,\n",
" metrics=[\"accuracy\"])"
]
},
{
"cell_type": "code",
- "execution_count": 63,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@@ -1007,8 +912,8 @@
"for layer in model_B_on_A.layers[:-1]:\n",
" layer.trainable = True\n",
"\n",
- "model_B_on_A.compile(loss=\"binary_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)\n",
+ "model_B_on_A.compile(loss=\"binary_crossentropy\", optimizer=optimizer,\n",
" metrics=[\"accuracy\"])\n",
"history = model_B_on_A.fit(X_train_B, y_train_B, epochs=16,\n",
" validation_data=(X_valid_B, y_valid_B))"
@@ -1023,16 +928,7 @@
},
{
"cell_type": "code",
- "execution_count": 64,
- "metadata": {},
- "outputs": [],
- "source": [
- "model_B.evaluate(X_test_B, y_test_B)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 65,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@@ -1043,16 +939,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Great! We got quite a bit of transfer: the error rate dropped by a factor of 4.9!"
+ "Great! We got a bit of transfer: the model's accuracy went up 2 percentage points, from 91.85% to 93.85%. This means the error rate dropped by almost 25%:"
]
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
- "(100 - 97.05) / (100 - 99.40)"
+ "1 - (100 - 93.85) / (100 - 91.85)"
]
},
{
@@ -1062,6 +958,53 @@
"# Faster Optimizers"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – a little function to test an optimizer on Fashion MNIST\n",
+ "\n",
+ "def build_model(seed=42):\n",
+ " tf.random.set_seed(seed)\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
+ " ])\n",
+ "\n",
+ "def build_and_train_model(optimizer):\n",
+ " model = build_model()\n",
+ " model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])\n",
+ " return model.fit(X_train, y_train, epochs=10,\n",
+ " validation_data=(X_valid, y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_sgd = build_and_train_model(optimizer) # extra code"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1071,13 +1014,22 @@
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_momentum = build_and_train_model(optimizer) # extra code"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1087,11 +1039,21 @@
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, nesterov=True)"
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9,\n",
+ " nesterov=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_nesterov = build_and_train_model(optimizer) # extra code"
]
},
{
@@ -1103,13 +1065,22 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.001)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_adagrad = build_and_train_model(optimizer) # extra code"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1119,13 +1090,22 @@
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_rmsprop = build_and_train_model(optimizer) # extra code"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -1135,43 +1115,132 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)"
+ "optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9,\n",
+ " beta_2=0.999)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_adam = build_and_train_model(optimizer) # extra code"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Adamax Optimization"
+ "**Adamax Optimization**"
]
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9, beta_2=0.999)"
+ "optimizer = tf.keras.optimizers.Adamax(learning_rate=0.001, beta_1=0.9,\n",
+ " beta_2=0.999)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_adamax = build_and_train_model(optimizer) # extra code"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "**Nadam Optimization**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9,\n",
+ " beta_2=0.999)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "history_nadam = build_and_train_model(optimizer) # extra code"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Nadam Optimization"
+ "**AdamW Optimization**"
]
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 63,
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import tensorflow_addons as tfa\n",
+ "\n",
+ "optimizer = tfa.optimizers.AdamW(weight_decay=1e-5, learning_rate=0.001,\n",
+ " beta_1=0.9, beta_2=0.999)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)"
+ "history_adamw = build_and_train_model(optimizer) # extra code"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – visualize the learning curves of all the optimizers\n",
+ "\n",
+ "for loss in (\"loss\", \"val_loss\"):\n",
+ " plt.figure(figsize=(12, 8))\n",
+ " opt_names = \"SGD Momentum Nesterov AdaGrad RMSProp Adam Adamax Nadam AdamW\"\n",
+ " for history, opt_name in zip((history_sgd, history_momentum, history_nesterov,\n",
+ " history_adagrad, history_rmsprop, history_adam,\n",
+ " history_adamax, history_nadam, history_adamw),\n",
+ " opt_names.split()):\n",
+ " plt.plot(history.history[loss], label=f\"{opt_name}\", linewidth=3)\n",
+ "\n",
+ " plt.grid()\n",
+ " plt.xlabel(\"Epochs\")\n",
+ " plt.ylabel({\"loss\": \"Training loss\", \"val_loss\": \"Validation loss\"}[loss])\n",
+ " plt.legend(loc=\"upper left\")\n",
+ " plt.axis([0, 9, 0.1, 0.7])\n",
+ " plt.show()"
]
},
{
@@ -1198,7 +1267,7 @@
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@@ -1207,42 +1276,29 @@
},
{
"cell_type": "code",
- "execution_count": 75,
+ "execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])"
+ "history_power_scheduling = build_and_train_model(optimizer) # extra code"
]
},
{
"cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "n_epochs = 25\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
+ "execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell plots power scheduling\n",
+ "\n",
"import math\n",
"\n",
"learning_rate = 0.01\n",
"decay = 1e-4\n",
"batch_size = 32\n",
"n_steps_per_epoch = math.ceil(len(X_train) / batch_size)\n",
+ "n_epochs = 25\n",
+ "\n",
"epochs = np.arange(n_epochs)\n",
"lrs = learning_rate / (1 + decay * epochs * n_steps_per_epoch)\n",
"\n",
@@ -1266,28 +1322,28 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "```lr = lr0 * 0.1**(epoch / s)```"
+ "```lr = lr0 * 0.1 ** (epoch / s)```"
]
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"def exponential_decay_fn(epoch):\n",
- " return 0.01 * 0.1**(epoch / 20)"
+ " return 0.01 * 0.1 ** (epoch / 20)"
]
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"def exponential_decay(lr0, s):\n",
" def exponential_decay_fn(epoch):\n",
- " return lr0 * 0.1**(epoch / s)\n",
+ " return lr0 * 0.1 ** (epoch / s)\n",
" return exponential_decay_fn\n",
"\n",
"exponential_decay_fn = exponential_decay(lr0=0.01, s=20)"
@@ -1295,38 +1351,39 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 25"
+ "# extra code – build and compile a model for Fashion MNIST\n",
+ "\n",
+ "tf.random.set_seed(42)\n",
+ "model = build_model()\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])"
]
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
"lr_scheduler = tf.keras.callbacks.LearningRateScheduler(exponential_decay_fn)\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid),\n",
+ "history = model.fit(X_train, y_train, epochs=n_epochs,\n",
+ " validation_data=(X_valid, y_valid),\n",
" callbacks=[lr_scheduler])"
]
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell plots exponential scheduling\n",
+ "\n",
"plt.plot(history.epoch, history.history[\"lr\"], \"o-\")\n",
"plt.axis([0, n_epochs - 1, 0, 0.011])\n",
"plt.xlabel(\"Epoch\")\n",
@@ -1345,80 +1402,86 @@
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"def exponential_decay_fn(epoch, lr):\n",
- " return lr * 0.1**(1 / 20)"
+ " return lr * 0.1 ** (1 / 20)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "If you want to update the learning rate at each iteration rather than at each epoch, you must write your own callback class:"
+ "**Extra material**: if you want to update the learning rate at each iteration rather than at each epoch, you can write your own callback class:"
]
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"K = tf.keras.backend\n",
"\n",
"class ExponentialDecay(tf.keras.callbacks.Callback):\n",
- " def __init__(self, s=40000):\n",
+ " def __init__(self, n_steps=40_000):\n",
" super().__init__()\n",
- " self.s = s\n",
+ " self.n_steps = n_steps\n",
"\n",
" def on_batch_begin(self, batch, logs=None):\n",
" # Note: the `batch` argument is reset at each epoch\n",
" lr = K.get_value(self.model.optimizer.learning_rate)\n",
- " K.set_value(self.model.optimizer.learning_rate, lr * 0.1**(1 / s))\n",
+ " new_learning_rate = lr * 0.1 ** (1 / self.n_steps)\n",
+ " K.set_value(self.model.optimizer.learning_rate, new_learning_rate)\n",
"\n",
" def on_epoch_end(self, epoch, logs=None):\n",
" logs = logs or {}\n",
- " logs['lr'] = K.get_value(self.model.optimizer.learning_rate)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
+ " logs['lr'] = K.get_value(self.model.optimizer.learning_rate)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"lr0 = 0.01\n",
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=lr0)\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])\n",
+ "model = build_model()\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"n_epochs = 25\n",
- "\n",
- "s = 20 * len(X_train) // 32 # number of steps in 20 epochs (batch size = 32)\n",
- "exp_decay = ExponentialDecay(s)\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid),\n",
+ "batch_size = 32\n",
+ "n_steps = n_epochs * math.ceil(len(X_train) / batch_size)\n",
+ "exp_decay = ExponentialDecay(n_steps)\n",
+ "history = model.fit(X_train, y_train, epochs=n_epochs,\n",
+ " validation_data=(X_valid, y_valid),\n",
" callbacks=[exp_decay])"
]
},
{
"cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [],
- "source": [
- "n_steps = n_epochs * len(X_train) // 32\n",
- "steps = np.arange(n_steps)\n",
- "lrs = lr0 * 0.1**(steps / s)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
+ "execution_count": 78,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
+ "n_steps = n_epochs * math.ceil(len(X_train) / batch_size)\n",
+ "steps = np.arange(n_steps)\n",
+ "decay_rate = 0.1\n",
+ "lrs = lr0 * decay_rate ** (steps / n_steps)\n",
+ "\n",
"plt.plot(steps, lrs, \"-\", linewidth=2)\n",
"plt.axis([0, n_steps - 1, 0, lr0 * 1.1])\n",
"plt.xlabel(\"Batch\")\n",
@@ -1437,7 +1500,7 @@
},
{
"cell_type": "code",
- "execution_count": 87,
+ "execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@@ -1452,15 +1515,18 @@
},
{
"cell_type": "code",
- "execution_count": 88,
+ "execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell demonstrates a more general way to define\n",
+ "# piecewise constant scheduling.\n",
+ "\n",
"def piecewise_constant(boundaries, values):\n",
" boundaries = np.array([0] + boundaries)\n",
" values = np.array(values)\n",
" def piecewise_constant_fn(epoch):\n",
- " return values[np.argmax(boundaries > epoch) - 1]\n",
+ " return values[(boundaries > epoch).argmax() - 1]\n",
" return piecewise_constant_fn\n",
"\n",
"piecewise_constant_fn = piecewise_constant([5, 15], [0.01, 0.005, 0.001])"
@@ -1468,32 +1534,34 @@
},
{
"cell_type": "code",
- "execution_count": 89,
+ "execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – use a tf.keras.callbacks.LearningRateScheduler like earlier\n",
+ "\n",
+ "n_epochs = 25\n",
+ "\n",
"lr_scheduler = tf.keras.callbacks.LearningRateScheduler(piecewise_constant_fn)\n",
"\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 25\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid),\n",
+ "model = build_model()\n",
+ "optimizer = tf.keras.optimizers.Nadam(learning_rate=lr0)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(X_train, y_train, epochs=n_epochs,\n",
+ " validation_data=(X_valid, y_valid),\n",
" callbacks=[lr_scheduler])"
]
},
{
"cell_type": "code",
- "execution_count": 90,
+ "execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
- "plt.plot(history.epoch, [piecewise_constant_fn(epoch) for epoch in history.epoch], \"o-\")\n",
+ "# extra code – this cell plots piecewise constant scheduling\n",
+ "\n",
+ "plt.plot(history.epoch, history.history[\"lr\"], \"o-\")\n",
"plt.axis([0, n_epochs - 1, 0, 0.011])\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Learning Rate\")\n",
@@ -1511,42 +1579,38 @@
},
{
"cell_type": "code",
- "execution_count": 91,
+ "execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)"
+ "# extra code – build and compile the model\n",
+ "\n",
+ "model = build_model()\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=lr0)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])"
]
},
{
"cell_type": "code",
- "execution_count": 92,
+ "execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "optimizer = tf.keras.optimizers.SGD(learning_rate=0.02, momentum=0.9)\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])\n",
- "n_epochs = 25\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid),\n",
+ "history = model.fit(X_train, y_train, epochs=n_epochs,\n",
+ " validation_data=(X_valid, y_valid),\n",
" callbacks=[lr_scheduler])"
]
},
{
"cell_type": "code",
- "execution_count": 93,
+ "execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – this cell plots performance scheduling\n",
+ "\n",
"plt.plot(history.epoch, history.history[\"lr\"], \"bo-\")\n",
"plt.xlabel(\"Epoch\")\n",
"plt.ylabel(\"Learning Rate\", color='b')\n",
@@ -1572,23 +1636,28 @@
},
{
"cell_type": "code",
- "execution_count": 94,
+ "execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "s = 20 * len(X_train) // 32 # number of steps in 20 epochs (batch size = 32)\n",
- "learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(0.01, s, 0.1)\n",
- "optimizer = tf.keras.optimizers.SGD(learning_rate)\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])\n",
+ "import math\n",
+ "\n",
+ "batch_size = 32\n",
"n_epochs = 25\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "n_steps = n_epochs * math.ceil(len(X_train) / batch_size)\n",
+ "scheduled_learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(\n",
+ " initial_learning_rate=0.01, decay_steps=n_steps, decay_rate=0.1)\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=scheduled_learning_rate)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – build and train the model\n",
+ "model = build_and_train_model(optimizer)"
]
},
{
@@ -1600,11 +1669,12 @@
},
{
"cell_type": "code",
- "execution_count": 95,
+ "execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
- "learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(\n",
+ "# extra code – shows how to use PiecewiseConstantDecay\n",
+ "scheduled_learning_rate = tf.keras.optimizers.schedules.PiecewiseConstantDecay(\n",
" boundaries=[5. * n_steps_per_epoch, 15. * n_steps_per_epoch],\n",
" values=[0.01, 0.005, 0.001])"
]
@@ -1616,9 +1686,16 @@
"### 1Cycle scheduling"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `ExponentialLearningRate` custom callback updates the learning rate during training, at the end of each batch. It multiplies it by a constant `factor`. It also saves the learning rate and loss at each batch. Since `logs[\"loss\"]` is actually the mean loss since the start of the epoch, and we want to save the batch loss instead, we must compute the mean times the number of batches since the beginning of the epoch to get the total loss so far, then we subtract the total loss at the previous batch to get the current batch's loss."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 96,
+ "execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
@@ -1629,15 +1706,39 @@
" self.factor = factor\n",
" self.rates = []\n",
" self.losses = []\n",
- " def on_batch_end(self, batch, logs):\n",
- " self.rates.append(K.get_value(self.model.optimizer.learning_rate))\n",
- " self.losses.append(logs[\"loss\"])\n",
- " K.set_value(self.model.optimizer.learning_rate, self.model.optimizer.learning_rate * self.factor)\n",
"\n",
- "def find_learning_rate(model, X, y, epochs=1, batch_size=32, min_rate=10**-5, max_rate=10):\n",
+ " def on_epoch_begin(self, epoch, logs=None):\n",
+ " self.sum_of_epoch_losses = 0\n",
+ "\n",
+ " def on_batch_end(self, batch, logs=None):\n",
+ " mean_epoch_loss = logs[\"loss\"] # the epoch's mean loss so far \n",
+ " new_sum_of_epoch_losses = mean_epoch_loss * (batch + 1)\n",
+ " batch_loss = new_sum_of_epoch_losses - self.sum_of_epoch_losses\n",
+ " self.sum_of_epoch_losses = new_sum_of_epoch_losses\n",
+ " self.rates.append(K.get_value(self.model.optimizer.learning_rate))\n",
+ " self.losses.append(batch_loss)\n",
+ " K.set_value(self.model.optimizer.learning_rate,\n",
+ " self.model.optimizer.learning_rate * self.factor)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `find_learning_rate()` function trains the model using the `ExponentialLearningRate` callback, and it returns the learning rates and corresponding batch losses. At the end, it restores the model and its optimizer to their initial state."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def find_learning_rate(model, X, y, epochs=1, batch_size=32, min_rate=1e-4,\n",
+ " max_rate=1):\n",
" init_weights = model.get_weights()\n",
" iterations = math.ceil(len(X) / batch_size) * epochs\n",
- " factor = np.exp(np.log(max_rate / min_rate) / iterations)\n",
+ " factor = (max_rate / min_rate) ** (1 / iterations)\n",
" init_lr = K.get_value(model.optimizer.learning_rate)\n",
" K.set_value(model.optimizer.learning_rate, min_rate)\n",
" exp_lr = ExponentialLearningRate(factor)\n",
@@ -1645,113 +1746,141 @@
" callbacks=[exp_lr])\n",
" K.set_value(model.optimizer.learning_rate, init_lr)\n",
" model.set_weights(init_weights)\n",
- " return exp_lr.rates, exp_lr.losses\n",
- "\n",
- "def plot_lr_vs_loss(rates, losses):\n",
- " plt.plot(rates, losses)\n",
- " plt.gca().set_xscale('log')\n",
- " plt.hlines(min(losses), min(rates), max(rates))\n",
- " plt.axis([min(rates), max(rates), min(losses), (losses[0] + min(losses)) / 2])\n",
- " plt.xlabel(\"Learning rate\")\n",
- " plt.ylabel(\"Loss\")"
+ " return exp_lr.rates, exp_lr.losses"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Warning**: In the `on_batch_end()` method, `logs[\"loss\"]` used to contain the batch loss, but in TensorFlow 2.2.0 it was replaced with the mean loss (since the start of the epoch). This explains why the graph below is much smoother than in the book (if you are using TF 2.2 or above). It also means that there is a lag between the moment the batch loss starts exploding and the moment the explosion becomes clear in the graph. So you should choose a slightly smaller learning rate than you would have chosen with the \"noisy\" graph. Alternatively, you can tweak the `ExponentialLearningRate` callback above so it computes the batch loss (based on the current mean loss and the previous mean loss):\n",
- "\n",
- "```python\n",
- "class ExponentialLearningRate(tf.keras.callbacks.Callback):\n",
- " def __init__(self, factor):\n",
- " self.factor = factor\n",
- " self.rates = []\n",
- " self.losses = []\n",
- " def on_epoch_begin(self, epoch, logs=None):\n",
- " self.prev_loss = 0\n",
- " def on_batch_end(self, batch, logs=None):\n",
- " batch_loss = logs[\"loss\"] * (batch + 1) - self.prev_loss * batch\n",
- " self.prev_loss = logs[\"loss\"]\n",
- " self.rates.append(K.get_value(self.model.optimizer.learning_rate))\n",
- " self.losses.append(batch_loss)\n",
- " K.set_value(self.model.optimizer.learning_rate, self.model.optimizer.learning_rate * self.factor)\n",
- "```"
+ "The `plot_lr_vs_loss()` function plots the learning rates vs the losses. The optimal learning rate to use as the maximum learning rate in 1cycle is near the bottom of the curve."
]
},
{
"cell_type": "code",
- "execution_count": 97,
+ "execution_count": 91,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
+ "def plot_lr_vs_loss(rates, losses):\n",
+ " plt.plot(rates, losses, \"b\")\n",
+ " plt.gca().set_xscale('log')\n",
+ " max_loss = losses[0] + min(losses)\n",
+ " plt.hlines(min(losses), min(rates), max(rates), color=\"k\")\n",
+ " plt.axis([min(rates), max(rates), 0, max_loss])\n",
+ " plt.xlabel(\"Learning rate\")\n",
+ " plt.ylabel(\"Loss\")\n",
+ " plt.grid()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's build a simple Fashion MNIST model and compile it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = build_model()\n",
"model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ " optimizer=tf.keras.optimizers.SGD(learning_rate=0.001),\n",
" metrics=[\"accuracy\"])"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's find the optimal max learning rate for 1cycle:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 98,
+ "execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
"batch_size = 128\n",
- "rates, losses = find_learning_rate(model, X_train_scaled, y_train, epochs=1, batch_size=batch_size)\n",
+ "rates, losses = find_learning_rate(model, X_train, y_train, epochs=1,\n",
+ " batch_size=batch_size)\n",
"plot_lr_vs_loss(rates, losses)"
]
},
{
- "cell_type": "code",
- "execution_count": 99,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "class OneCycleScheduler(tf.keras.callbacks.Callback):\n",
- " def __init__(self, iterations, max_rate, start_rate=None,\n",
- " last_iterations=None, last_rate=None):\n",
- " self.iterations = iterations\n",
- " self.max_rate = max_rate\n",
- " self.start_rate = start_rate or max_rate / 10\n",
- " self.last_iterations = last_iterations or iterations // 10 + 1\n",
- " self.half_iteration = (iterations - self.last_iterations) // 2\n",
- " self.last_rate = last_rate or self.start_rate / 1000\n",
- " self.iteration = 0\n",
- " def _interpolate(self, iter1, iter2, rate1, rate2):\n",
- " return ((rate2 - rate1) * (self.iteration - iter1)\n",
- " / (iter2 - iter1) + rate1)\n",
- " def on_batch_begin(self, batch, logs):\n",
- " if self.iteration < self.half_iteration:\n",
- " rate = self._interpolate(0, self.half_iteration, self.start_rate, self.max_rate)\n",
- " elif self.iteration < 2 * self.half_iteration:\n",
- " rate = self._interpolate(self.half_iteration, 2 * self.half_iteration,\n",
- " self.max_rate, self.start_rate)\n",
- " else:\n",
- " rate = self._interpolate(2 * self.half_iteration, self.iterations,\n",
- " self.start_rate, self.last_rate)\n",
- " self.iteration += 1\n",
- " K.set_value(self.model.optimizer.learning_rate, rate)"
+ "Looks like the max learning rate to use for 1cycle is around 10–1."
+ ]
+ },
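+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a rough sanity check (extra code, not from the book), we can also look up the learning rate at which the batch loss was lowest; the max learning rate for 1cycle is usually chosen a bit below the point where the loss starts climbing sharply:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: the rate at which the batch loss was lowest\n",
+ "rates[np.argmin(losses)]"
+ ]
+ },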
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The `OneCycleScheduler` custom callback updates the learning rate at the beginning of each batch. It applies the logic described in the book: increase the learning rate linearly during about half of training, then reduce it linearly back to the initial learning rate, and lastly reduce it down to close to zero linearly for the very last part of training."
]
},
{
"cell_type": "code",
- "execution_count": 100,
+ "execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
+ "class OneCycleScheduler(tf.keras.callbacks.Callback):\n",
+ " def __init__(self, iterations, max_lr=1e-3, start_lr=None,\n",
+ " last_iterations=None, last_lr=None):\n",
+ " self.iterations = iterations\n",
+ " self.max_lr = max_lr\n",
+ " self.start_lr = start_lr or max_lr / 10\n",
+ " self.last_iterations = last_iterations or iterations // 10 + 1\n",
+ " self.half_iteration = (iterations - self.last_iterations) // 2\n",
+ " self.last_lr = last_lr or self.start_lr / 1000\n",
+ " self.iteration = 0\n",
+ "\n",
+ " def _interpolate(self, iter1, iter2, lr1, lr2):\n",
+ " return (lr2 - lr1) * (self.iteration - iter1) / (iter2 - iter1) + lr1\n",
+ "\n",
+ " def on_batch_begin(self, batch, logs):\n",
+ " if self.iteration < self.half_iteration:\n",
+ " lr = self._interpolate(0, self.half_iteration, self.start_lr,\n",
+ " self.max_lr)\n",
+ " elif self.iteration < 2 * self.half_iteration:\n",
+ " lr = self._interpolate(self.half_iteration, 2 * self.half_iteration,\n",
+ " self.max_lr, self.start_lr)\n",
+ " else:\n",
+ " lr = self._interpolate(2 * self.half_iteration, self.iterations,\n",
+ " self.start_lr, self.last_lr)\n",
+ " self.iteration += 1\n",
+ " K.set_value(self.model.optimizer.learning_rate, lr)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's build and compile a simple Fashion MNIST model, then train it using the `OneCycleScheduler` callback:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = build_model()\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
+ " optimizer=tf.keras.optimizers.SGD(),\n",
+ " metrics=[\"accuracy\"])\n",
"n_epochs = 25\n",
- "onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs, max_rate=0.05)\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs, batch_size=batch_size,\n",
- " validation_data=(X_valid_scaled, y_valid),\n",
+ "onecycle = OneCycleScheduler(math.ceil(len(X_train) / batch_size) * n_epochs,\n",
+ " max_lr=0.1)\n",
+ "history = model.fit(X_train, y_train, epochs=n_epochs, batch_size=batch_size,\n",
+ " validation_data=(X_valid, y_valid),\n",
" callbacks=[onecycle])"
]
},
@@ -1771,63 +1900,64 @@
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
- "layer = tf.keras.layers.Dense(100, activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\",\n",
- " kernel_regularizer=tf.keras.regularizers.l2(0.01))\n",
- "# or l1(0.1) for ℓ1 regularization with a factor of 0.1\n",
- "# or l1_l2(0.1, 0.01) for both ℓ1 and ℓ2 regularization, with factors 0.1 and 0.01 respectively"
+ "layer = tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\",\n",
+ " kernel_regularizer=tf.keras.regularizers.l2(0.01))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Or use `l1(0.1)` for ℓ1 regularization with a factor of 0.1, or `l1_l2(0.1, 0.01)` for both ℓ1 and ℓ2 regularization, with factors 0.1 and 0.01 respectively."
]
},
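+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For instance, here is a quick sketch (extra code, not from the book) of those two regularizers attached to `Dense` layers, using the factors mentioned above:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: l1 and l1_l2 regularized layers\n",
+ "layer_l1 = tf.keras.layers.Dense(\n",
+ "    100, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ "    kernel_regularizer=tf.keras.regularizers.l1(0.1))\n",
+ "layer_l1_l2 = tf.keras.layers.Dense(\n",
+ "    100, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ "    kernel_regularizer=tf.keras.regularizers.l1_l2(0.1, 0.01))"
+ ]
+ },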
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.Dense(300, activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\",\n",
- " kernel_regularizer=tf.keras.regularizers.l2(0.01)),\n",
- " tf.keras.layers.Dense(100, activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\",\n",
- " kernel_regularizer=tf.keras.regularizers.l2(0.01)),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\",\n",
- " kernel_regularizer=tf.keras.regularizers.l2(0.01))\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 2\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "tf.random.set_seed(42) # extra code – for reproducibility"
]
},
{
"cell_type": "code",
- "execution_count": 103,
+ "execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
"from functools import partial\n",
"\n",
"RegularizedDense = partial(tf.keras.layers.Dense,\n",
- " activation=\"elu\",\n",
+ " activation=\"relu\",\n",
" kernel_initializer=\"he_normal\",\n",
" kernel_regularizer=tf.keras.regularizers.l2(0.01))\n",
"\n",
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " RegularizedDense(300),\n",
+ " RegularizedDense(100),\n",
" RegularizedDense(100),\n",
" RegularizedDense(10, activation=\"softmax\")\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 2\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – compile and train the model\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.02)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(X_train, y_train, epochs=2,\n",
+ " validation_data=(X_valid, y_valid))"
]
},
{
@@ -1839,89 +1969,76 @@
},
{
"cell_type": "code",
- "execution_count": 104,
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – for reproducibility"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
" tf.keras.layers.Dropout(rate=0.2),\n",
- " tf.keras.layers.Dense(300, activation=\"elu\", kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" tf.keras.layers.Dropout(rate=0.2),\n",
- " tf.keras.layers.Dense(100, activation=\"elu\", kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(100, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" tf.keras.layers.Dropout(rate=0.2),\n",
" tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 2\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 102,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – compile and train the model\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(X_train, y_train, epochs=10,\n",
+ " validation_data=(X_valid, y_valid))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Alpha Dropout"
+ "The training accuracy looks like it's lower than the validation accuracy, but that's just because dropout is only active during training. If we evaluate the model on the training set after training (i.e., with dropout turned off), we get the \"real\" training accuracy, which is very slightly higher than the validation accuracy and the test accuracy:"
]
},
{
"cell_type": "code",
- "execution_count": 105,
+ "execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)"
+ "model.evaluate(X_train, y_train)"
]
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " tf.keras.layers.AlphaDropout(rate=0.2),\n",
- " tf.keras.layers.Dense(300, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.AlphaDropout(rate=0.2),\n",
- " tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\"),\n",
- " tf.keras.layers.AlphaDropout(rate=0.2),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
- "])\n",
- "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])\n",
- "n_epochs = 20\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "model.evaluate(X_test, y_test)"
]
},
{
- "cell_type": "code",
- "execution_count": 107,
+ "cell_type": "markdown",
"metadata": {},
- "outputs": [],
"source": [
- "model.evaluate(X_test_scaled, y_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 108,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.evaluate(X_train_scaled, y_train)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 109,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train_scaled, y_train)"
+ "**Note**: make sure to use `AlphaDropout` instead of `Dropout` if you want to build a self-normalizing neural net using SELU."
]
},
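+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is a minimal sketch (extra code, not from the book) of what such a self-normalizing net could look like; the layer sizes and dropout rate are only illustrative:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: SELU + LeCun init + AlphaDropout sketch\n",
+ "selu_model = tf.keras.Sequential([\n",
+ "    tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
+ "    tf.keras.layers.AlphaDropout(rate=0.2),\n",
+ "    tf.keras.layers.Dense(100, activation=\"selu\",\n",
+ "                          kernel_initializer=\"lecun_normal\"),\n",
+ "    tf.keras.layers.AlphaDropout(rate=0.2),\n",
+ "    tf.keras.layers.Dense(10, activation=\"softmax\")\n",
+ "])"
+ ]
+ },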
{
@@ -1931,14 +2048,63 @@
"## MC Dropout"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 105,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – for reproducibility"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 106,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_probas = np.stack([model(X_test, training=True)\n",
+ " for sample in range(100)])\n",
+ "y_proba = y_probas.mean(axis=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 107,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.predict(X_test[:1]).round(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_proba[0].round(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_std = y_probas.std(axis=0)\n",
+ "y_std[0].round(3)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)"
+ "y_pred = y_proba.argmax(axis=1)\n",
+ "accuracy = (y_pred == y_test).sum() / len(y_test)\n",
+ "accuracy"
]
},
{
@@ -1947,10 +2113,9 @@
"metadata": {},
"outputs": [],
"source": [
- "y_probas = np.stack([model(X_test_scaled, training=True)\n",
- " for sample in range(100)])\n",
- "y_proba = y_probas.mean(axis=0)\n",
- "y_std = y_probas.std(axis=0)"
+ "class MCDropout(tf.keras.layers.Dropout):\n",
+ " def call(self, inputs, training=None):\n",
+ " return super().call(inputs, training=True)"
]
},
{
@@ -1959,7 +2124,13 @@
"metadata": {},
"outputs": [],
"source": [
- "np.round(model.predict(X_test_scaled[:1]), 2)"
+ "# extra code – shows how to convert Dropout to MCDropout in a Sequential model\n",
+ "Dropout = tf.keras.layers.Dropout\n",
+ "mc_model = tf.keras.Sequential([\n",
+ " MCDropout(layer.rate) if isinstance(layer, Dropout) else layer\n",
+ " for layer in model.layers\n",
+ "])\n",
+ "mc_model.set_weights(model.get_weights())"
]
},
{
@@ -1967,113 +2138,10 @@
"execution_count": 113,
"metadata": {},
"outputs": [],
- "source": [
- "np.round(y_probas[:, :1], 2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 114,
- "metadata": {},
- "outputs": [],
- "source": [
- "np.round(y_proba[:1], 2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 115,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_std = y_probas.std(axis=0)\n",
- "np.round(y_std[:1], 2)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "metadata": {},
- "outputs": [],
- "source": [
- "y_pred = np.argmax(y_proba, axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "metadata": {},
- "outputs": [],
- "source": [
- "accuracy = np.sum(y_pred == y_test) / len(y_test)\n",
- "accuracy"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 118,
- "metadata": {},
- "outputs": [],
- "source": [
- "class MCDropout(tf.keras.layers.Dropout):\n",
- " def call(self, inputs):\n",
- " return super().call(inputs, training=True)\n",
- "\n",
- "class MCAlphaDropout(tf.keras.layers.AlphaDropout):\n",
- " def call(self, inputs):\n",
- " return super().call(inputs, training=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 119,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.random.set_seed(42)\n",
- "np.random.seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 120,
- "metadata": {},
- "outputs": [],
- "source": [
- "mc_model = tf.keras.Sequential([\n",
- " MCAlphaDropout(layer.rate) if isinstance(layer, tf.keras.layers.AlphaDropout) else layer\n",
- " for layer in model.layers\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 121,
- "metadata": {},
- "outputs": [],
"source": [
"mc_model.summary()"
]
},
- {
- "cell_type": "code",
- "execution_count": 122,
- "metadata": {},
- "outputs": [],
- "source": [
- "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9, nesterov=True)\n",
- "mc_model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer, metrics=[\"accuracy\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 123,
- "metadata": {},
- "outputs": [],
- "source": [
- "mc_model.set_weights(model.get_weights())"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -2083,11 +2151,14 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
- "np.round(np.mean([mc_model.predict(X_test_scaled[:1]) for sample in range(100)], axis=0), 2)"
+ "# extra code – shows that the model works without retraining\n",
+ "tf.random.set_seed(42)\n",
+ "np.mean([mc_model.predict(X_test[:1])\n",
+ " for sample in range(100)], axis=0).round(2)"
]
},
{
@@ -2099,34 +2170,39 @@
},
{
"cell_type": "code",
- "execution_count": 125,
+ "execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
- "layer = tf.keras.layers.Dense(100, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " kernel_constraint=tf.keras.constraints.max_norm(1.))"
+ "dense = tf.keras.layers.Dense(\n",
+ " 100, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " kernel_constraint=tf.keras.constraints.max_norm(1.))"
]
},
{
"cell_type": "code",
- "execution_count": 126,
+ "execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to apply max norm to every hidden layer in a model\n",
+ "\n",
"MaxNormDense = partial(tf.keras.layers.Dense,\n",
- " activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
+ " activation=\"relu\", kernel_initializer=\"he_normal\",\n",
" kernel_constraint=tf.keras.constraints.max_norm(1.))\n",
"\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Flatten(input_shape=[28, 28]),\n",
- " MaxNormDense(300),\n",
+ " MaxNormDense(100),\n",
" MaxNormDense(100),\n",
" tf.keras.layers.Dense(10, activation=\"softmax\")\n",
"])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n",
- "n_epochs = 2\n",
- "history = model.fit(X_train_scaled, y_train, epochs=n_epochs,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=optimizer,\n",
+ " metrics=[\"accuracy\"])\n",
+ "history = model.fit(X_train, y_train, epochs=10,\n",
+ " validation_data=(X_valid, y_valid))"
]
},
{
@@ -2147,7 +2223,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "See appendix A."
+ "1. Glorot initialization and He initialization were designed to make the output standard deviation as close as possible to the input standard deviation, at least at the beginning of training. This reduces the vanishing/exploding gradients problem.\n",
+ "2. No, all weights should be sampled independently; they should not all have the same initial value. One important goal of sampling weights randomly is to break symmetry: if all the weights have the same initial value, even if that value is not zero, then symmetry is not broken (i.e., all neurons in a given layer are equivalent), and backpropagation will be unable to break it. Concretely, this means that all the neurons in any given layer will always have the same weights. It's like having just one neuron per layer, and much slower. It is virtually impossible for such a configuration to converge to a good solution.\n",
+ "3. It is perfectly fine to initialize the bias terms to zero. Some people like to initialize them just like weights, and that's OK too; it does not make much difference.\n",
+ "4. ReLU is usually a good default for the hidden layers, as it is fast and yields good results. Its ability to output precisely zero can also be useful in some cases (e.g., see Chapter 17). Moreover, it can sometimes benefit from optimized implementations as well as from hardware acceleration. The leaky ReLU variants of ReLU can improve the model's quality without hindering its speed too much compared to ReLU. For large neural nets and more complex problems, GLU, Swish and Mish can give you a slightly higher quality model, but they have a computational cost. The hyperbolic tangent (tanh) can be useful in the output layer if you need to output a number in a fixed range (by default between –1 and 1), but nowadays it is not used much in hidden layers, except in recurrent nets. The sigmoid activation function is also useful in the output layer when you need to estimate a probability (e.g., for binary classification), but it is rarely used in hidden layers (there are exceptions—for example, for the coding layer of variational autoencoders; see Chapter 17). The softplus activation function is useful in the output layer when you need to ensure that the output will always be positive. The softmax activation function is useful in the output layer to estimate probabilities for mutually exclusive classes, but it is rarely (if ever) used in hidden layers.\n",
+ "5. If you set the `momentum` hyperparameter too close to 1 (e.g., 0.99999) when using an `SGD` optimizer, then the algorithm will likely pick up a lot of speed, hopefully moving roughly toward the global minimum, but its momentum will carry it right past the minimum. Then it will slow down and come back, accelerate again, overshoot again, and so on. It may oscillate this way many times before converging, so overall it will take much longer to converge than with a smaller `momentum` value.\n",
+ "6. One way to produce a sparse model (i.e., with most weights equal to zero) is to train the model normally, then zero out tiny weights. For more sparsity, you can apply ℓ1 regularization during training, which pushes the optimizer toward sparsity. A third option is to use the TensorFlow Model Optimization Toolkit.\n",
+ "7. Yes, dropout does slow down training, in general roughly by a factor of two. However, it has no impact on inference speed since it is only turned on during training. MC Dropout is exactly like dropout during training, but it is still active during inference, so each inference is slowed down slightly. More importantly, when using MC Dropout you generally want to run inference 10 times or more to get better predictions. This means that making predictions is slowed down by a factor of 10 or more."
]
},
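+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following extra cell (not from the book) roughly sketches the first two approaches from answer 6: an ℓ1-regularized `Dense` layer, plus a small helper that zeroes out tiny weights after training. The regularization factor and the threshold are arbitrary."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – not in the book: two ways to push a model toward sparsity\n",
+ "import numpy as np\n",
+ "import tensorflow as tf\n",
+ "\n",
+ "# 1. ℓ1 regularization nudges weights toward exactly zero during training\n",
+ "l1_dense = tf.keras.layers.Dense(\n",
+ "    100, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ "    kernel_regularizer=tf.keras.regularizers.l1(1e-4))\n",
+ "\n",
+ "# 2. after training, zero out every weight whose magnitude is below a threshold\n",
+ "def zero_out_tiny_weights(model, threshold=1e-2):\n",
+ "    for layer in model.layers:\n",
+ "        weights = layer.get_weights()\n",
+ "        layer.set_weights([np.where(np.abs(w) < threshold, 0.0, w)\n",
+ "                           for w in weights])"
+ ]
+ },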
{
@@ -2162,25 +2244,23 @@
"metadata": {},
"source": [
"### a.\n",
- "*Exercise: Build a DNN with 20 hidden layers of 100 neurons each (that's too many, but it's the point of this exercise). Use He initialization and the ELU activation function.*"
+ "*Exercise: Build a DNN with 20 hidden layers of 100 neurons each (that's too many, but it's the point of this exercise). Use He initialization and the Swish activation function.*"
]
},
{
"cell_type": "code",
- "execution_count": 127,
+ "execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
"for _ in range(20):\n",
" model.add(tf.keras.layers.Dense(100,\n",
- " activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\"))"
+ " activation=\"swish\",\n",
+ " kernel_initializer=\"he_normal\"))"
]
},
{
@@ -2200,7 +2280,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@@ -2216,7 +2296,7 @@
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
@@ -2235,7 +2315,7 @@
},
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
@@ -2256,30 +2336,32 @@
},
{
"cell_type": "code",
- "execution_count": 131,
+ "execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
- "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20)\n",
- "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_model.h5\", save_best_only=True)\n",
+ "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20,\n",
+ " restore_best_weights=True)\n",
+ "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_model\",\n",
+ " save_best_only=True)\n",
"run_index = 1 # increment every time you train the model\n",
- "run_logdir = Path() / \"my_cifar10_logs\" / \"run_{:03d}\".format(run_index)\n",
+ "run_logdir = Path() / \"my_cifar10_logs\" / f\"run_{run_index:03d}\"\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)\n",
"callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]"
]
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
- "%tensorboard --logdir=./my_cifar10_logs --port=6006"
+ "%tensorboard --logdir=./my_cifar10_logs"
]
},
{
"cell_type": "code",
- "execution_count": 133,
+ "execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
@@ -2290,11 +2372,10 @@
},
{
"cell_type": "code",
- "execution_count": 134,
+ "execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.models.load_model(\"my_cifar10_model.h5\")\n",
"model.evaluate(X_valid, y_valid)"
]
},
@@ -2302,7 +2383,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The model with the lowest validation loss gets about 47.6% accuracy on the validation set. It took 27 epochs to reach the lowest validation loss, with roughly 8 seconds per epoch on my laptop (without a GPU). Let's see if we can improve performance using Batch Normalization."
+ "The model with the lowest validation loss gets about 46.7% accuracy on the validation set. It took 29 epochs to reach the lowest validation loss, with roughly 10 seconds per epoch on my laptop (without a GPU). Let's see if we can improve the model using Batch Normalization."
]
},
{
@@ -2319,28 +2400,25 @@
"source": [
"The code below is very similar to the code above, with a few changes:\n",
"\n",
- "* I added a BN layer after every Dense layer (before the activation function), except for the output layer. I also added a BN layer before the first hidden layer.\n",
+ "* I added a BN layer after every Dense layer (before the activation function), except for the output layer.\n",
"* I changed the learning rate to 5e-4. I experimented with 1e-5, 3e-5, 5e-5, 1e-4, 3e-4, 5e-4, 1e-3 and 3e-3, and I chose the one with the best validation performance after 20 epochs.\n",
- "* I renamed the run directories to run_bn_* and the model file name to my_cifar10_bn_model.h5."
+ "* I renamed the run directories to run_bn_* and the model file name to `my_cifar10_bn_model`."
]
},
{
"cell_type": "code",
- "execution_count": 135,
+ "execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
- "model.add(tf.keras.layers.BatchNormalization())\n",
"for _ in range(20):\n",
" model.add(tf.keras.layers.Dense(100, kernel_initializer=\"he_normal\"))\n",
" model.add(tf.keras.layers.BatchNormalization())\n",
- " model.add(tf.keras.layers.Activation(\"elu\"))\n",
+ " model.add(tf.keras.layers.Activation(\"swish\"))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
"\n",
"optimizer = tf.keras.optimizers.Nadam(learning_rate=5e-4)\n",
@@ -2348,10 +2426,12 @@
" optimizer=optimizer,\n",
" metrics=[\"accuracy\"])\n",
"\n",
- "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20)\n",
- "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_bn_model.h5\", save_best_only=True)\n",
+ "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20,\n",
+ " restore_best_weights=True)\n",
+ "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_bn_model\",\n",
+ " save_best_only=True)\n",
"run_index = 1 # increment every time you train the model\n",
- "run_logdir = Path() / \"my_cifar10_logs\" / \"run_bn_{:03d}\".format(run_index)\n",
+ "run_logdir = Path() / \"my_cifar10_logs\" / f\"run_bn_{run_index:03d}\"\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)\n",
"callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]\n",
"\n",
@@ -2359,7 +2439,6 @@
" validation_data=(X_valid, y_valid),\n",
" callbacks=callbacks)\n",
"\n",
- "model = tf.keras.models.load_model(\"my_cifar10_bn_model.h5\")\n",
"model.evaluate(X_valid, y_valid)"
]
},
@@ -2367,9 +2446,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "* *Is the model converging faster than before?* Much faster! The previous model took 27 epochs to reach the lowest validation loss, while the new model achieved that same loss in just 5 epochs and continued to make progress until the 16th epoch. The BN layers stabilized training and allowed us to use a much larger learning rate, so convergence was faster.\n",
- "* *Does BN produce a better model?* Yes! The final model is also much better, with 54.0% accuracy instead of 47.6%. It's still not a very good model, but at least it's much better than before (a Convolutional Neural Network would do much better, but that's a different topic, see chapter 13).\n",
- "* *How does BN affect training speed?* Although the model converged much faster, each epoch took about 12s instead of 8s, because of the extra computations required by the BN layers. But overall the training time (wall time) was shortened significantly!"
+ "* *Is the model converging faster than before?* Much faster! The previous model took 29 epochs to reach the lowest validation loss, while the new model achieved that same loss in just 12 epochs and continued to make progress until the 17th epoch. The BN layers stabilized training and allowed us to use a much larger learning rate, so convergence was faster.\n",
+ "* *Does BN produce a better model?* Yes! The final model is also much better, with 50.7% validation accuracy instead of 46.7%. It's still not a very good model, but at least it's much better than before (a Convolutional Neural Network would do much better, but that's a different topic, see chapter 14).\n",
+ "* *How does BN affect training speed?* Although the model converged much faster, each epoch took about 15s instead of 10s, because of the extra computations required by the BN layers. But overall the training time (wall time) to reach the best model was shortened by about 10%."
]
},
{
@@ -2382,22 +2461,20 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 126,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
"for _ in range(20):\n",
" model.add(tf.keras.layers.Dense(100,\n",
- " kernel_initializer=\"lecun_normal\",\n",
- " activation=\"selu\"))\n",
+ " kernel_initializer=\"lecun_normal\",\n",
+ " activation=\"selu\"))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
"\n",
"optimizer = tf.keras.optimizers.Nadam(learning_rate=7e-4)\n",
@@ -2405,10 +2482,12 @@
" optimizer=optimizer,\n",
" metrics=[\"accuracy\"])\n",
"\n",
- "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20)\n",
- "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_selu_model.h5\", save_best_only=True)\n",
+ "early_stopping_cb = tf.keras.callbacks.EarlyStopping(\n",
+ " patience=20, restore_best_weights=True)\n",
+ "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+ " \"my_cifar10_selu_model\", save_best_only=True)\n",
"run_index = 1 # increment every time you train the model\n",
- "run_logdir = Path() / \"my_cifar10_logs\" / \"run_selu_{:03d}\".format(run_index)\n",
+ "run_logdir = Path() / \"my_cifar10_logs\" / f\"run_selu_{run_index:03d}\"\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)\n",
"callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]\n",
"\n",
@@ -2422,17 +2501,6 @@
" validation_data=(X_valid_scaled, y_valid),\n",
" callbacks=callbacks)\n",
"\n",
- "model = tf.keras.models.load_model(\"my_cifar10_selu_model.h5\")\n",
- "model.evaluate(X_valid_scaled, y_valid)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 137,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\"my_cifar10_selu_model.h5\")\n",
"model.evaluate(X_valid_scaled, y_valid)"
]
},
@@ -2440,7 +2508,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We get 47.9% accuracy, which is not much better than the original model (47.6%), and not as good as the model using batch normalization (54.0%). However, convergence was almost as fast as with the BN model, plus each epoch took only 7 seconds. So it's by far the fastest model to train so far."
+ "This model reached the first model's validation loss in just 8 epochs. After 14 epochs, it reached its lowest validation loss, with about 50.3% accuracy, which is better than the original model (46.7%), but not quite as good as the model using batch normalization (50.7%). Each epoch took only 9 seconds. So it's the fastest model to train so far."
]
},
{
@@ -2453,20 +2521,18 @@
},
{
"cell_type": "code",
- "execution_count": 138,
+ "execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
"for _ in range(20):\n",
" model.add(tf.keras.layers.Dense(100,\n",
- " kernel_initializer=\"lecun_normal\",\n",
- " activation=\"selu\"))\n",
+ " kernel_initializer=\"lecun_normal\",\n",
+ " activation=\"selu\"))\n",
"\n",
"model.add(tf.keras.layers.AlphaDropout(rate=0.1))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
@@ -2476,10 +2542,12 @@
" optimizer=optimizer,\n",
" metrics=[\"accuracy\"])\n",
"\n",
- "early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=20)\n",
- "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\"my_cifar10_alpha_dropout_model.h5\", save_best_only=True)\n",
+ "early_stopping_cb = tf.keras.callbacks.EarlyStopping(\n",
+ " patience=20, restore_best_weights=True)\n",
+ "model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(\n",
+ " \"my_cifar10_alpha_dropout_model\", save_best_only=True)\n",
"run_index = 1 # increment every time you train the model\n",
- "run_logdir = Path() / \"my_cifar10_logs\" / \"run_alpha_dropout_{:03d}\".format(run_index)\n",
+ "run_logdir = Path() / \"my_cifar10_logs\" / f\"run_alpha_dropout_{run_index:03d}\"\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir)\n",
"callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]\n",
"\n",
@@ -2493,7 +2561,6 @@
" validation_data=(X_valid_scaled, y_valid),\n",
" callbacks=callbacks)\n",
"\n",
- "model = tf.keras.models.load_model(\"my_cifar10_alpha_dropout_model.h5\")\n",
"model.evaluate(X_valid_scaled, y_valid)"
]
},
@@ -2501,7 +2568,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The model reaches 48.9% accuracy on the validation set. That's very slightly better than without dropout (47.6%). With an extensive hyperparameter search, it might be possible to do better (I tried dropout rates of 5%, 10%, 20% and 40%, and learning rates 1e-4, 3e-4, 5e-4, and 1e-3), but probably not much better in this case."
+ "The model reaches 48.1% accuracy on the validation set. That's worse than without dropout (50.3%). With an extensive hyperparameter search, it might be possible to do better (I tried dropout rates of 5%, 10%, 20% and 40%, and learning rates 1e-4, 3e-4, 5e-4, and 1e-3), but probably not much better in this case."
]
},
{
@@ -2513,7 +2580,7 @@
},
{
"cell_type": "code",
- "execution_count": 139,
+ "execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
@@ -2531,12 +2598,16 @@
},
{
"cell_type": "code",
- "execution_count": 140,
+ "execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"mc_model = tf.keras.Sequential([\n",
- " MCAlphaDropout(layer.rate) if isinstance(layer, tf.keras.layers.AlphaDropout) else layer\n",
+ " (\n",
+ " MCAlphaDropout(layer.rate)\n",
+ " if isinstance(layer, tf.keras.layers.AlphaDropout)\n",
+ " else layer\n",
+ " )\n",
" for layer in model.layers\n",
"])"
]
@@ -2550,7 +2621,7 @@
},
{
"cell_type": "code",
- "execution_count": 141,
+ "execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
@@ -2560,7 +2631,7 @@
"\n",
"def mc_dropout_predict_classes(mc_model, X, n_samples=10):\n",
" Y_probas = mc_dropout_predict_probas(mc_model, X, n_samples)\n",
- " return np.argmax(Y_probas, axis=1)"
+ " return Y_probas.argmax(axis=1)"
]
},
{
@@ -2572,16 +2643,14 @@
},
{
"cell_type": "code",
- "execution_count": 142,
+ "execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"y_pred = mc_dropout_predict_classes(mc_model, X_valid_scaled)\n",
- "accuracy = np.mean(y_pred == y_valid[:, 0])\n",
+ "accuracy = (y_pred == y_valid[:, 0]).mean()\n",
"accuracy"
]
},
@@ -2589,7 +2658,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We get no accuracy improvement in this case (we're still at 48.9% accuracy).\n",
+ "We get back to the accuracy of the model without dropout in this case (about 50.3% accuracy).\n",
"\n",
"So the best model we got in this exercise is the Batch Normalization model."
]
@@ -2604,25 +2673,23 @@
},
{
"cell_type": "code",
- "execution_count": 143,
+ "execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
"for _ in range(20):\n",
" model.add(tf.keras.layers.Dense(100,\n",
- " kernel_initializer=\"lecun_normal\",\n",
- " activation=\"selu\"))\n",
+ " kernel_initializer=\"lecun_normal\",\n",
+ " activation=\"selu\"))\n",
"\n",
"model.add(tf.keras.layers.AlphaDropout(rate=0.1))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
"\n",
- "optimizer = tf.keras.optimizers.SGD(learning_rate=1e-3)\n",
+ "optimizer = tf.keras.optimizers.SGD()\n",
"model.compile(loss=\"sparse_categorical_crossentropy\",\n",
" optimizer=optimizer,\n",
" metrics=[\"accuracy\"])"
@@ -2630,25 +2697,23 @@
},
{
"cell_type": "code",
- "execution_count": 144,
+ "execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"batch_size = 128\n",
- "rates, losses = find_learning_rate(model, X_train_scaled, y_train, epochs=1, batch_size=batch_size)\n",
- "plot_lr_vs_loss(rates, losses)\n",
- "plt.axis([min(rates), max(rates), min(losses), (losses[0] + min(losses)) / 1.4])"
+ "rates, losses = find_learning_rate(model, X_train_scaled, y_train, epochs=1,\n",
+ " batch_size=batch_size)\n",
+ "plot_lr_vs_loss(rates, losses)"
]
},
{
"cell_type": "code",
- "execution_count": 145,
+ "execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
"tf.random.set_seed(42)\n",
- "np.random.seed(42)\n",
"\n",
"model = tf.keras.Sequential()\n",
"model.add(tf.keras.layers.Flatten(input_shape=[32, 32, 3]))\n",
@@ -2660,7 +2725,7 @@
"model.add(tf.keras.layers.AlphaDropout(rate=0.1))\n",
"model.add(tf.keras.layers.Dense(10, activation=\"softmax\"))\n",
"\n",
- "optimizer = tf.keras.optimizers.SGD(learning_rate=1e-2)\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=2e-2)\n",
"model.compile(loss=\"sparse_categorical_crossentropy\",\n",
" optimizer=optimizer,\n",
" metrics=[\"accuracy\"])"
@@ -2668,7 +2733,7 @@
},
{
"cell_type": "code",
- "execution_count": 146,
+ "execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
@@ -2683,7 +2748,26 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "One cycle allowed us to train the model in just 15 epochs, each taking only 2 seconds (thanks to the larger batch size). This is several times faster than the fastest model we trained so far. Moreover, we improved the model's performance (from 47.6% to 52.0%). The batch normalized model reaches a slightly better performance (54%), but it's much slower to train."
+ "One cycle allowed us to train the model in just 15 epochs, each taking only 2 seconds (thanks to the larger batch size). This is several times faster than the fastest model we trained so far. Moreover, we improved the model's performance (from 50.7% to 52.0%)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import time\n",
+ "time.time()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!date"
]
},
{
@@ -2696,7 +2780,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
diff --git a/12_custom_models_and_training_with_tensorflow.ipynb b/12_custom_models_and_training_with_tensorflow.ipynb
index 77e33c4..822bc55 100644
--- a/12_custom_models_and_training_with_tensorflow.ipynb
+++ b/12_custom_models_and_training_with_tensorflow.ipynb
@@ -4,14 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Chapter 11 – Custom Models and Training with TensorFlow**"
+ "**Chapter 12 – Custom Models and Training with TensorFlow**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "_This notebook contains all the sample code and solutions to the exercises in chapter 11._"
+ "_This notebook contains all the sample code and solutions to the exercises in chapter 12, as well as code examples from Appendix C_"
]
},
{
@@ -20,17 +20,19 @@
"source": [
"\n",
" \n",
- " \n",
+ " \n",
" | \n",
" \n",
- " \n",
+ " \n",
" | \n",
"
"
]
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"source": [
"# Setup"
]
@@ -39,7 +41,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
+ "This project requires Python 3.8 or above:"
]
},
{
@@ -48,47 +50,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# Python ≥3.8 is required\n",
"import sys\n",
- "assert sys.version_info >= (3, 8)\n",
"\n",
- "# Common imports\n",
- "import numpy as np\n",
- "from pathlib import Path\n",
- "\n",
- "# Scikit-Learn ≥1.0 is required\n",
- "import sklearn\n",
- "assert sklearn.__version__ >= \"1.0\"\n",
- "\n",
- "# TensorFlow ≥2.6 is required\n",
- "import tensorflow as tf\n",
- "assert tf.__version__ >= \"2.6\"\n",
- "\n",
- "# to make this notebook's output stable across runs\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
- "\n",
- "# To plot pretty figures\n",
- "%matplotlib inline\n",
- "import matplotlib as mpl\n",
- "import matplotlib.pyplot as plt\n",
- "mpl.rc('axes', labelsize=14)\n",
- "mpl.rc('xtick', labelsize=12)\n",
- "mpl.rc('ytick', labelsize=12)"
+ "assert sys.version_info >= (3, 8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Tensors and operations"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Tensors"
+ "And TensorFlow ≥ 2.6:"
]
},
{
@@ -97,7 +68,30 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.constant([[1., 2., 3.], [4., 5., 6.]]) # matrix"
+ "import tensorflow as tf\n",
+ "\n",
+ "assert tf.__version__ >= \"2.6.0\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Using TensorFlow like NumPy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tensors and Operations"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Tensors"
]
},
{
@@ -106,7 +100,8 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.constant(42) # scalar"
+ "t = tf.constant([[1., 2., 3.], [4., 5., 6.]]) # matrix\n",
+ "t"
]
},
{
@@ -114,23 +109,13 @@
"execution_count": 4,
"metadata": {},
"outputs": [],
- "source": [
- "t = tf.constant([[1., 2., 3.], [4., 5., 6.]])\n",
- "t"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
"source": [
"t.shape"
]
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -141,12 +126,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Indexing"
+ "#### Indexing"
]
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -155,7 +140,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
@@ -166,12 +151,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Ops"
+ "#### Ops"
]
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -180,7 +165,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -189,7 +174,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -200,7 +185,30 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Using `tf.keras.backend`"
+ "#### Scalars"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.constant(42)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Keras's low-level API"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You may still run across code that uses Keras's low-level API:"
]
},
{
@@ -217,7 +225,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### From/To NumPy"
+ "But since Keras does not support multiple backends anymore, you should instead use TF's low-level API directly:"
]
},
{
@@ -226,8 +234,14 @@
"metadata": {},
"outputs": [],
"source": [
- "a = np.array([2., 4., 5.])\n",
- "tf.constant(a)"
+ "tf.square(tf.transpose(t)) + 10"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Tensors and NumPy"
]
},
{
@@ -236,7 +250,10 @@
"metadata": {},
"outputs": [],
"source": [
- "t.numpy()"
+ "import numpy as np\n",
+ "\n",
+ "a = np.array([2., 4., 5.])\n",
+ "tf.constant(a)"
]
},
{
@@ -245,7 +262,7 @@
"metadata": {},
"outputs": [],
"source": [
- "np.array(t)"
+ "t.numpy()"
]
},
{
@@ -254,7 +271,7 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.square(a)"
+ "np.array(t)"
]
},
{
@@ -262,6 +279,15 @@
"execution_count": 17,
"metadata": {},
"outputs": [],
+ "source": [
+ "tf.square(a)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
"source": [
"np.square(t)"
]
@@ -270,12 +296,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Conflicting Types"
+ "### Type Conversions"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
@@ -287,7 +313,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
@@ -299,7 +325,7 @@
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
@@ -311,16 +337,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Strings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.constant(b\"hello world\")"
+ "### Variables"
]
},
{
@@ -329,7 +346,8 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.constant(\"café\")"
+ "v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])\n",
+ "v"
]
},
{
@@ -338,8 +356,7 @@
"metadata": {},
"outputs": [],
"source": [
- "u = tf.constant([ord(c) for c in \"café\"])\n",
- "u"
+ "v.assign(2 * v)"
]
},
{
@@ -348,24 +365,18 @@
"metadata": {},
"outputs": [],
"source": [
- "b = tf.strings.unicode_encode(u, \"UTF-8\")\n",
- "tf.strings.length(b, unit=\"UTF8_CHAR\")"
+ "v[0, 1].assign(42)"
]
},
{
"cell_type": "code",
"execution_count": 25,
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
- "tf.strings.unicode_decode(b, \"UTF-8\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### String arrays"
+ "v[:, 2].assign([0., 1.])"
]
},
{
@@ -374,7 +385,8 @@
"metadata": {},
"outputs": [],
"source": [
- "p = tf.constant([\"Café\", \"Coffee\", \"caffè\", \"咖啡\"])"
+ "v.scatter_nd_update(\n",
+ " indices=[[0, 0], [1, 2]], updates=[100., 200.])"
]
},
{
@@ -383,7 +395,10 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.strings.length(p, unit=\"UTF8_CHAR\")"
+ "# extra code – shows how to use scatter_update()\n",
+ "sparse_delta = tf.IndexedSlices(values=[[1., 2., 3.], [4., 5., 6.]],\n",
+ " indices=[1, 0])\n",
+ "v.scatter_update(sparse_delta)"
]
},
{
@@ -391,268 +406,6 @@
"execution_count": 28,
"metadata": {},
"outputs": [],
- "source": [
- "r = tf.strings.unicode_decode(p, \"UTF8\")\n",
- "r"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(r)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Ragged tensors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(r[1])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(r[1:3])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "r2 = tf.ragged.constant([[65, 66], [], [67]])\n",
- "print(tf.concat([r, r2], axis=0))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "r3 = tf.ragged.constant([[68, 69, 70], [71], [], [72, 73]])\n",
- "print(tf.concat([r, r3], axis=1))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.strings.unicode_encode(r3, \"UTF-8\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [],
- "source": [
- "r.to_tensor()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Sparse tensors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "s = tf.SparseTensor(indices=[[0, 1], [1, 0], [2, 3]],\n",
- " values=[1., 2., 3.],\n",
- " dense_shape=[3, 4])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(s)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.sparse.to_dense(s)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "s2 = s * 2.0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "try:\n",
- " s3 = s + 1.\n",
- "except TypeError as ex:\n",
- " print(ex)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 41,
- "metadata": {},
- "outputs": [],
- "source": [
- "s4 = tf.constant([[10., 20.], [30., 40.], [50., 60.], [70., 80.]])\n",
- "tf.sparse.sparse_dense_matmul(s, s4)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 42,
- "metadata": {},
- "outputs": [],
- "source": [
- "s5 = tf.SparseTensor(indices=[[0, 2], [0, 1]],\n",
- " values=[1., 2.],\n",
- " dense_shape=[3, 4])\n",
- "print(s5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 43,
- "metadata": {},
- "outputs": [],
- "source": [
- "try:\n",
- " tf.sparse.to_dense(s5)\n",
- "except tf.errors.InvalidArgumentError as ex:\n",
- " print(ex)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 44,
- "metadata": {},
- "outputs": [],
- "source": [
- "s6 = tf.sparse.reorder(s5)\n",
- "tf.sparse.to_dense(s6)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Sets"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 45,
- "metadata": {},
- "outputs": [],
- "source": [
- "set1 = tf.constant([[2, 3, 5, 7], [7, 9, 0, 0]])\n",
- "set2 = tf.constant([[4, 5, 6], [9, 10, 0]])\n",
- "tf.sparse.to_dense(tf.sets.union(set1, set2))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 46,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.sparse.to_dense(tf.sets.difference(set1, set2))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 47,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.sparse.to_dense(tf.sets.intersection(set1, set2))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Variables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 48,
- "metadata": {},
- "outputs": [],
- "source": [
- "v = tf.Variable([[1., 2., 3.], [4., 5., 6.]])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 49,
- "metadata": {},
- "outputs": [],
- "source": [
- "v.assign(2 * v)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "metadata": {},
- "outputs": [],
- "source": [
- "v[0, 1].assign(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 51,
- "metadata": {},
- "outputs": [],
- "source": [
- "v[:, 2].assign([0., 1.])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 52,
- "metadata": {},
- "outputs": [],
"source": [
"try:\n",
" v[1] = [7., 8., 9.]\n",
@@ -660,14 +413,300 @@
" print(ex)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Strings"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The code in this section and all the following sections in appendix C"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.constant(b\"hello world\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.constant(\"café\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "u = tf.constant([ord(c) for c in \"café\"])\n",
+ "u"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "b = tf.strings.unicode_encode(u, \"UTF-8\")\n",
+ "tf.strings.length(b, unit=\"UTF8_CHAR\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.strings.unicode_decode(b, \"UTF-8\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Other Data Structures"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The code in this section is in Appendix C."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### String arrays"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.constant(b\"hello world\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.constant(\"café\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 36,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "u = tf.constant([ord(c) for c in \"café\"])\n",
+ "u"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "b = tf.strings.unicode_encode(u, \"UTF-8\")\n",
+ "b"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.strings.length(b, unit=\"UTF8_CHAR\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 39,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.strings.unicode_decode(b, \"UTF-8\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "p = tf.constant([\"Café\", \"Coffee\", \"caffè\", \"咖啡\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.strings.length(p, unit=\"UTF8_CHAR\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r = tf.strings.unicode_decode(p, \"UTF8\")\n",
+ "r"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Ragged tensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r[1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r[1:3] # extra code – a slice of a ragged tensor is a ragged tensor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r2 = tf.ragged.constant([[65, 66], [], [67]])\n",
+ "tf.concat([r, r2], axis=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r3 = tf.ragged.constant([[68, 69, 70], [71], [], [72, 73]])\n",
+ "print(tf.concat([r, r3], axis=1))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "r.to_tensor()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Sparse tensors"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s = tf.SparseTensor(indices=[[0, 1], [1, 0], [2, 3]],\n",
+ " values=[1., 2., 3.],\n",
+ " dense_shape=[3, 4])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.sparse.to_dense(s)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "s * 42.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " s + 42.0\n",
+ "except TypeError as ex:\n",
+ " print(ex)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows how to multiply a sparse tensor and a dense tensor\n",
+ "s4 = tf.constant([[10., 20.], [30., 40.], [50., 60.], [70., 80.]])\n",
+ "tf.sparse.sparse_dense_matmul(s, s4)"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
- "v.scatter_nd_update(indices=[[0, 0], [1, 2]],\n",
- " updates=[100., 200.])"
+ "# extra code – when creating a sparse tensor, values must be given in \"reading\n",
+ "# order\", or else `to_dense()` will fail.\n",
+ "s5 = tf.SparseTensor(indices=[[0, 2], [0, 1]], # WRONG ORDER!\n",
+ " values=[1., 2.],\n",
+ " dense_shape=[3, 4])\n",
+ "try:\n",
+ " tf.sparse.to_dense(s5)\n",
+ "except tf.errors.InvalidArgumentError as ex:\n",
+ " print(ex)"
]
},
{
@@ -676,16 +715,16 @@
"metadata": {},
"outputs": [],
"source": [
- "sparse_delta = tf.IndexedSlices(values=[[1., 2., 3.], [4., 5., 6.]],\n",
- " indices=[1, 0])\n",
- "v.scatter_update(sparse_delta)"
+ "# extra code – shows how to fix the sparse tensor s5 by reordering its values\n",
+ "s6 = tf.sparse.reorder(s5)\n",
+ "tf.sparse.to_dense(s6)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Tensor Arrays"
+ "#### Tensor Arrays"
]
},
{
@@ -697,7 +736,8 @@
"array = tf.TensorArray(dtype=tf.float32, size=3)\n",
"array = array.write(0, tf.constant([1., 2.]))\n",
"array = array.write(1, tf.constant([3., 10.]))\n",
- "array = array.write(2, tf.constant([5., 7.]))"
+ "array = array.write(2, tf.constant([5., 7.]))\n",
+ "tensor1 = array.read(1) # returns (and zeros out!) tf.constant([3., 10.])"
]
},
{
@@ -706,7 +746,7 @@
"metadata": {},
"outputs": [],
"source": [
- "array.read(1)"
+ "array.stack()"
]
},
{
@@ -715,7 +755,13 @@
"metadata": {},
"outputs": [],
"source": [
- "array.stack()"
+ "# extra code – shows how to disable clear_after_read\n",
+ "array2 = tf.TensorArray(dtype=tf.float32, size=3, clear_after_read=False)\n",
+ "array2 = array2.write(0, tf.constant([1., 2.]))\n",
+ "array2 = array2.write(1, tf.constant([3., 10.]))\n",
+ "array2 = array2.write(2, tf.constant([5., 7.]))\n",
+ "tensor2 = array2.read(1) # returns tf.constant([3., 10.])\n",
+ "array2.stack()"
]
},
{
@@ -724,8 +770,20 @@
"metadata": {},
"outputs": [],
"source": [
- "mean, variance = tf.nn.moments(array.stack(), axes=0)\n",
- "mean"
+ "# extra code – shows how to create and use a tensor array with a dynamic size\n",
+ "array3 = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)\n",
+ "array3 = array3.write(0, tf.constant([1., 2.]))\n",
+ "array3 = array3.write(1, tf.constant([3., 10.]))\n",
+ "array3 = array3.write(2, tf.constant([5., 7.]))\n",
+ "tensor3 = array3.read(1)\n",
+ "array3.stack()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Sets"
]
},
{
@@ -734,7 +792,122 @@
"metadata": {},
"outputs": [],
"source": [
- "variance"
+ "a = tf.constant([[1, 5, 9]])\n",
+ "b = tf.constant([[5, 6, 9, 11]])\n",
+ "u = tf.sets.union(a, b)\n",
+ "u"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.sparse.to_dense(u)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "a = tf.constant([[1, 5, 9], [10, 0, 0]])\n",
+ "b = tf.constant([[5, 6, 9, 11], [13, 0, 0, 0]])\n",
+ "u = tf.sets.union(a, b)\n",
+ "tf.sparse.to_dense(u)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows how to use a different default value: -1 in this case\n",
+ "a = tf.constant([[1, 5, 9], [10, -1, -1]])\n",
+ "b = tf.constant([[5, 6, 9, 11], [13, -1, -1, -1]])\n",
+ "u = tf.sets.union(a, b)\n",
+ "tf.sparse.to_dense(u, default_value=-1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows how to use `tf.sets.difference()`\n",
+ "set1 = tf.constant([[2, 3, 5, 7], [7, 9, 0, 0]])\n",
+ "set2 = tf.constant([[4, 5, 6], [9, 10, 0]])\n",
+ "tf.sparse.to_dense(tf.sets.difference(set1, set2))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows how to use `tf.sets.difference()`\n",
+ "tf.sparse.to_dense(tf.sets.intersection(set1, set2))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – check whether set1[0] contains 5\n",
+ "tf.sets.size(tf.sets.intersection(set1[:1], tf.constant([[5, 0, 0, 0]]))) > 0"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Queues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "q = tf.queue.FIFOQueue(3, [tf.int32, tf.string], shapes=[(), ()])\n",
+ "q.enqueue([10, b\"windy\"])\n",
+ "q.enqueue([15, b\"sunny\"])\n",
+ "q.size()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "q.dequeue()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "q.enqueue_many([[13, 16], [b'cloudy', b'rainy']])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "q.dequeue_many(3)"
]
},
{
@@ -744,38 +917,9 @@
"## Custom loss function"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's start by loading and preparing the California housing dataset. We first load it, then split it into a training set, a validation set and a test set, and finally we scale it:"
- ]
- },
{
"cell_type": "code",
- "execution_count": 60,
- "metadata": {},
- "outputs": [],
- "source": [
- "from sklearn.datasets import fetch_california_housing\n",
- "from sklearn.model_selection import train_test_split\n",
- "from sklearn.preprocessing import StandardScaler\n",
- "\n",
- "housing = fetch_california_housing()\n",
- "X_train_full, X_test, y_train_full, y_test = train_test_split(\n",
- " housing.data, housing.target.reshape(-1, 1), random_state=42)\n",
- "X_train, X_valid, y_train, y_valid = train_test_split(\n",
- " X_train_full, y_train_full, random_state=42)\n",
- "\n",
- "scaler = StandardScaler()\n",
- "X_train_scaled = scaler.fit_transform(X_train)\n",
- "X_valid_scaled = scaler.transform(X_valid)\n",
- "X_test_scaled = scaler.transform(X_test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 61,
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -789,18 +933,26 @@
},
{
"cell_type": "code",
- "execution_count": 62,
+ "execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows what the Huber loss looks like\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "\n",
"plt.figure(figsize=(8, 3.5))\n",
"z = np.linspace(-4, 4, 200)\n",
+ "z_center = np.linspace(-1, 1, 200)\n",
"plt.plot(z, huber_fn(0, z), \"b-\", linewidth=2, label=\"huber($z$)\")\n",
- "plt.plot(z, z**2 / 2, \"b:\", linewidth=1, label=r\"$\\frac{1}{2}z^2$\")\n",
- "plt.plot([-1, -1], [0, huber_fn(0., -1.)], \"r--\")\n",
- "plt.plot([1, 1], [0, huber_fn(0., 1.)], \"r--\")\n",
+ "plt.plot(z, z ** 2 / 2, \"r:\", linewidth=1)\n",
+ "plt.plot(z_center, z_center ** 2 / 2, \"r\", linewidth=2)\n",
+ "plt.plot([-1, -1], [0, huber_fn(0., -1.)], \"k--\")\n",
+ "plt.plot([1, 1], [0, huber_fn(0., 1.)], \"k--\")\n",
"plt.gca().axhline(y=0, color='k')\n",
"plt.gca().axvline(x=0, color='k')\n",
+ "plt.text(2.1, 3.5, r\"$\\frac{1}{2}z^2$\", color=\"r\", fontsize=15)\n",
+ "plt.text(3.0, 2.2, r\"$|z| - \\frac{1}{2}$\", color=\"b\", fontsize=15)\n",
"plt.axis([-4, 4, 0, 4])\n",
"plt.grid(True)\n",
"plt.xlabel(\"$z$\")\n",
@@ -810,23 +962,51 @@
]
},
{
- "cell_type": "code",
- "execution_count": 63,
+ "cell_type": "markdown",
"metadata": {},
+ "source": [
+ "To test our custom loss function, let's create a basic Keras model and train it on the California housing dataset:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {
+ "tags": []
+ },
"outputs": [],
"source": [
+ "# extra code – loads, splits and scales the California housing dataset, then\n",
+ "# creates a simple Keras model\n",
+ "\n",
+ "from sklearn.datasets import fetch_california_housing\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "housing = fetch_california_housing()\n",
+ "X_train_full, X_test, y_train_full, y_test = train_test_split(\n",
+ " housing.data, housing.target.reshape(-1, 1), random_state=42)\n",
+ "X_train, X_valid, y_train, y_valid = train_test_split(\n",
+ " X_train_full, y_train_full, random_state=42)\n",
+ "\n",
+ "scaler = StandardScaler()\n",
+ "X_train_scaled = scaler.fit_transform(X_train)\n",
+ "X_valid_scaled = scaler.transform(X_valid)\n",
+ "X_test_scaled = scaler.transform(X_test)\n",
+ "\n",
"input_shape = X_train.shape[1:]\n",
"\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
" tf.keras.layers.Dense(1),\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": 64,
+ "execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -835,7 +1015,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 74,
"metadata": {
"scrolled": true
},
@@ -854,26 +1034,26 @@
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
- "model.save(\"my_model_with_a_custom_loss.h5\")"
+ "model.save(\"my_model_with_a_custom_loss\") # extra code – saving works fine"
]
},
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss.h5\",\n",
- " custom_objects={\"huber_fn\": huber_fn})"
+ "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss\",\n",
+ " custom_objects={\"huber_fn\": huber_fn})"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@@ -883,7 +1063,7 @@
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@@ -892,14 +1072,14 @@
" error = y_true - y_pred\n",
" is_small_error = tf.abs(error) < threshold\n",
" squared_loss = tf.square(error) / 2\n",
- " linear_loss = threshold * tf.abs(error) - threshold**2 / 2\n",
+ " linear_loss = threshold * tf.abs(error) - threshold ** 2 / 2\n",
" return tf.where(is_small_error, squared_loss, linear_loss)\n",
" return huber_fn"
]
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@@ -908,7 +1088,7 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 80,
"metadata": {
"scrolled": true
},
@@ -918,115 +1098,13 @@
" validation_data=(X_valid_scaled, y_valid))"
]
},
- {
- "cell_type": "code",
- "execution_count": 72,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.save(\"my_model_with_a_custom_loss_threshold_2.h5\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 73,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss_threshold_2.h5\",\n",
- " custom_objects={\"huber_fn\": create_huber(2.0)})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 74,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.fit(X_train_scaled, y_train, epochs=2,\n",
- " validation_data=(X_valid_scaled, y_valid))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [],
- "source": [
- "class HuberLoss(tf.keras.losses.Loss):\n",
- " def __init__(self, threshold=1.0, **kwargs):\n",
- " self.threshold = threshold\n",
- " super().__init__(**kwargs)\n",
- " def call(self, y_true, y_pred):\n",
- " error = y_true - y_pred\n",
- " is_small_error = tf.abs(error) < self.threshold\n",
- " squared_loss = tf.square(error) / 2\n",
- " linear_loss = self.threshold * tf.abs(error) - self.threshold**2 / 2\n",
- " return tf.where(is_small_error, squared_loss, linear_loss)\n",
- " def get_config(self):\n",
- " base_config = super().get_config()\n",
- " return {**base_config, \"threshold\": self.threshold}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
- " tf.keras.layers.Dense(1),\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=HuberLoss(2.), optimizer=\"nadam\", metrics=[\"mae\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 78,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.fit(X_train_scaled, y_train, epochs=2,\n",
- " validation_data=(X_valid_scaled, y_valid))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 79,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.save(\"my_model_with_a_custom_loss_class.h5\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 80,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss_class.h5\",\n",
- " custom_objects={\"HuberLoss\": HuberLoss})"
- ]
- },
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
- "model.fit(X_train_scaled, y_train, epochs=2,\n",
- " validation_data=(X_valid_scaled, y_valid))"
+ "model.save(\"my_model_with_a_custom_loss_threshold_2\")"
]
},
{
@@ -1035,7 +1113,114 @@
"metadata": {},
"outputs": [],
"source": [
- "model.loss.threshold"
+ "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss_threshold_2\",\n",
+ " custom_objects={\"huber_fn\": create_huber(2.0)})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class HuberLoss(tf.keras.losses.Loss):\n",
+ " def __init__(self, threshold=1.0, **kwargs):\n",
+ " self.threshold = threshold\n",
+ " super().__init__(**kwargs)\n",
+ "\n",
+ " def call(self, y_true, y_pred):\n",
+ " error = y_true - y_pred\n",
+ " is_small_error = tf.abs(error) < self.threshold\n",
+ " squared_loss = tf.square(error) / 2\n",
+ " linear_loss = self.threshold * tf.abs(error) - self.threshold**2 / 2\n",
+ " return tf.where(is_small_error, squared_loss, linear_loss)\n",
+ "\n",
+ " def get_config(self):\n",
+ " base_config = super().get_config()\n",
+ " return {**base_config, \"threshold\": self.threshold}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 85,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – creates another basic Keras model\n",
+ "tf.random.set_seed(42)\n",
+ "model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(1),\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 86,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.compile(loss=HuberLoss(2.), optimizer=\"nadam\", metrics=[\"mae\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(\"my_model_with_a_custom_loss_class\") # extra code – saving works"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = tf.keras.models.load_model(\"my_model_with_a_custom_loss_class\",\n",
+ " custom_objects={\"HuberLoss\": HuberLoss})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows that loading worked fine, the model can be used normally\n",
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.loss.threshold # extra code – the treshold was loaded correctly"
]
},
{
@@ -1047,23 +1232,12 @@
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 92,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [],
- "source": [
- "def my_softplus(z): # return value is just tf.nn.softplus(z)\n",
- " return tf.math.log(tf.exp(z) + 1.0)\n",
+ "def my_softplus(z):\n",
+ " return tf.math.log(1.0 + tf.exp(z))\n",
"\n",
"def my_glorot_initializer(shape, dtype=tf.float32):\n",
" stddev = tf.sqrt(2. / (shape[0] + shape[1]))\n",
@@ -1072,117 +1246,20 @@
"def my_l1_regularizer(weights):\n",
" return tf.reduce_sum(tf.abs(0.01 * weights))\n",
"\n",
- "def my_positive_weights(weights): # return value is just tf.nn.relu(weights)\n",
+ "def my_positive_weights(weights): # return value is just tf.nn.relu(weights)\n",
" return tf.where(weights < 0., tf.zeros_like(weights), weights)"
]
},
- {
- "cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [],
- "source": [
- "layer = tf.keras.layers.Dense(1, activation=my_softplus,\n",
- " kernel_initializer=my_glorot_initializer,\n",
- " kernel_regularizer=my_l1_regularizer,\n",
- " kernel_constraint=my_positive_weights)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 87,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
- " tf.keras.layers.Dense(1, activation=my_softplus,\n",
- " kernel_regularizer=my_l1_regularizer,\n",
- " kernel_constraint=my_positive_weights,\n",
- " kernel_initializer=my_glorot_initializer),\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"mse\", optimizer=\"nadam\", metrics=[\"mae\"])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.fit(X_train_scaled, y_train, epochs=2,\n",
- " validation_data=(X_valid_scaled, y_valid))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 90,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.save(\"my_model_with_many_custom_parts.h5\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 91,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\n",
- " \"my_model_with_many_custom_parts.h5\",\n",
- " custom_objects={\n",
- " \"my_l1_regularizer\": my_l1_regularizer,\n",
- " \"my_positive_weights\": my_positive_weights,\n",
- " \"my_glorot_initializer\": my_glorot_initializer,\n",
- " \"my_softplus\": my_softplus,\n",
- " })"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 92,
- "metadata": {},
- "outputs": [],
- "source": [
- "class MyL1Regularizer(tf.keras.regularizers.Regularizer):\n",
- " def __init__(self, factor):\n",
- " self.factor = factor\n",
- " def __call__(self, weights):\n",
- " return tf.reduce_sum(tf.abs(self.factor * weights))\n",
- " def get_config(self):\n",
- " return {\"factor\": self.factor}"
- ]
- },
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
+ "layer = tf.keras.layers.Dense(1, activation=my_softplus,\n",
+ " kernel_initializer=my_glorot_initializer,\n",
+ " kernel_regularizer=my_l1_regularizer,\n",
+ " kernel_constraint=my_positive_weights)"
]
},
{
@@ -1191,14 +1268,33 @@
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – show that building, training, saving, loading, and training again\n",
+ "# works fine with a model containing many custom parts\n",
+ "\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
" tf.keras.layers.Dense(1, activation=my_softplus,\n",
- " kernel_regularizer=MyL1Regularizer(0.01),\n",
- " kernel_constraint=my_positive_weights,\n",
- " kernel_initializer=my_glorot_initializer),\n",
- "])"
+ " kernel_initializer=my_glorot_initializer,\n",
+ " kernel_regularizer=my_l1_regularizer,\n",
+ " kernel_constraint=my_positive_weights)\n",
+ "])\n",
+ "model.compile(loss=\"mse\", optimizer=\"nadam\", metrics=[\"mae\"])\n",
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))\n",
+ "model.save(\"my_model_with_many_custom_parts\")\n",
+ "model = tf.keras.models.load_model(\n",
+ " \"my_model_with_many_custom_parts\",\n",
+ " custom_objects={\n",
+ " \"my_l1_regularizer\": my_l1_regularizer,\n",
+ " \"my_positive_weights\": my_positive_weights,\n",
+ " \"my_glorot_initializer\": my_glorot_initializer,\n",
+ " \"my_softplus\": my_softplus,\n",
+ " }\n",
+ ")\n",
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
]
},
{
@@ -1207,7 +1303,15 @@
"metadata": {},
"outputs": [],
"source": [
- "model.compile(loss=\"mse\", optimizer=\"nadam\", metrics=[\"mae\"])"
+ "class MyL1Regularizer(tf.keras.regularizers.Regularizer):\n",
+ " def __init__(self, factor):\n",
+ " self.factor = factor\n",
+ "\n",
+ " def __call__(self, weights):\n",
+ " return tf.reduce_sum(tf.abs(self.factor * weights))\n",
+ "\n",
+ " def get_config(self):\n",
+ " return {\"factor\": self.factor}"
]
},
{
@@ -1216,33 +1320,33 @@
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – again, show that everything works fine, this time using our\n",
+ "# custom regularizer class\n",
+ "\n",
+ "tf.random.set_seed(42)\n",
+ "model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(1, activation=my_softplus,\n",
+ " kernel_regularizer=MyL1Regularizer(0.01),\n",
+ " kernel_constraint=my_positive_weights,\n",
+ " kernel_initializer=my_glorot_initializer),\n",
+ "])\n",
+ "model.compile(loss=\"mse\", optimizer=\"nadam\", metrics=[\"mae\"])\n",
"model.fit(X_train_scaled, y_train, epochs=2,\n",
- " validation_data=(X_valid_scaled, y_valid))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 97,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.save(\"my_model_with_many_custom_parts.h5\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 98,
- "metadata": {},
- "outputs": [],
- "source": [
+ " validation_data=(X_valid_scaled, y_valid))\n",
+ "model.save(\"my_model_with_many_custom_parts\")\n",
"model = tf.keras.models.load_model(\n",
- " \"my_model_with_many_custom_parts.h5\",\n",
+ " \"my_model_with_many_custom_parts\",\n",
" custom_objects={\n",
" \"MyL1Regularizer\": MyL1Regularizer,\n",
" \"my_positive_weights\": my_positive_weights,\n",
" \"my_glorot_initializer\": my_glorot_initializer,\n",
" \"my_softplus\": my_softplus,\n",
- " })"
+ " }\n",
+ ")\n",
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
]
},
{
@@ -1254,31 +1358,22 @@
},
{
"cell_type": "code",
- "execution_count": 99,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 100,
+ "execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – once again, lets' create a basic Keras model\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
" tf.keras.layers.Dense(1),\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": 101,
+ "execution_count": 98,
"metadata": {},
"outputs": [],
"source": [
@@ -1287,10 +1382,11 @@
},
{
"cell_type": "code",
- "execution_count": 102,
+ "execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – train the model with our custom metric\n",
"model.fit(X_train_scaled, y_train, epochs=2)"
]
},
@@ -1298,39 +1394,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Note**: if you use the same function as the loss and a metric, you may be surprised to see different results. This is generally just due to floating point precision errors: even though the mathematical equations are equivalent, the operations are not run in the same order, which can lead to small differences. Moreover, when using sample weights, there's more than just precision errors:\n",
- "* the loss since the start of the epoch is the mean of all batch losses seen so far. Each batch loss is the sum of the weighted instance losses divided by the _batch size_ (not the sum of weights, so the batch loss is _not_ the weighted mean of the losses).\n",
- "* the metric since the start of the epoch is equal to the sum of weighted instance losses divided by sum of all weights seen so far. In other words, it is the weighted mean of all the instance losses. Not the same thing.\n",
- "\n",
- "If you do the math, you will find that loss = metric * mean of sample weights (plus some floating point precision error)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 103,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=create_huber(2.0), optimizer=\"nadam\", metrics=[create_huber(2.0)])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 104,
- "metadata": {},
- "outputs": [],
- "source": [
- "sample_weight = np.random.rand(len(y_train))\n",
- "history = model.fit(X_train_scaled, y_train, epochs=2, sample_weight=sample_weight)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 105,
- "metadata": {},
- "outputs": [],
- "source": [
- "history.history[\"loss\"][0], history.history[\"huber_fn\"][0] * sample_weight.mean()"
+ "**Note**: if you use the same function as the loss and a metric, you may be surprised to see slightly different results. This is in part because the operations are not computed exactly in the same order, so there might be tiny floating point errors. More importantly, if you use sample weights or class weights, then the equations are a bit different:\n",
+ "* the `fit()` method keeps track of the mean of all batch losses seen so far since the start of the epoch. Each batch loss is the sum of the weighted instance losses divided by the _batch size_ (not the sum of weights, so the batch loss is _not_ the weighted mean of the losses).\n",
+ "* the metric since the start of the epoch is equal to the sum of weighted instance losses divided by sum of all weights seen so far. In other words, it is the weighted mean of all the instance losses. Not the same thing."
]
},
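+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check (this is just a sketch, not from the book), you can compile the model with the same Huber function as the loss and as a _weighted_ metric, train with random sample weights, and compare the reported loss to the weighted metric times the mean of the sample weights: the two values should be roughly equal, up to floating point precision (a similar check is done below with the `HuberMetric` class):"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – a rough check of the relationship described above; the exact\n",
+ "# values will vary from run to run\n",
+ "model.compile(loss=huber_fn, optimizer=\"nadam\", weighted_metrics=[huber_fn])\n",
+ "np.random.seed(42)\n",
+ "sample_weight = np.random.rand(len(y_train))\n",
+ "history = model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " sample_weight=sample_weight)\n",
+ "(history.history[\"loss\"][0],\n",
+ " history.history[\"huber_fn\"][0] * sample_weight.mean())"
+ ]
+ },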
{
@@ -1342,7 +1408,7 @@
},
{
"cell_type": "code",
- "execution_count": 106,
+ "execution_count": 100,
"metadata": {},
"outputs": [],
"source": [
@@ -1352,7 +1418,7 @@
},
{
"cell_type": "code",
- "execution_count": 107,
+ "execution_count": 101,
"metadata": {},
"outputs": [],
"source": [
@@ -1361,7 +1427,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
@@ -1370,7 +1436,7 @@
},
{
"cell_type": "code",
- "execution_count": 109,
+ "execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
@@ -1379,7 +1445,7 @@
},
{
"cell_type": "code",
- "execution_count": 110,
+ "execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
@@ -1395,31 +1461,41 @@
},
{
"cell_type": "code",
- "execution_count": 111,
+ "execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"class HuberMetric(tf.keras.metrics.Metric):\n",
" def __init__(self, threshold=1.0, **kwargs):\n",
- " super().__init__(**kwargs) # handles base args (e.g., dtype)\n",
+ " super().__init__(**kwargs) # handles base args (e.g., dtype)\n",
" self.threshold = threshold\n",
" self.huber_fn = create_huber(threshold)\n",
" self.total = self.add_weight(\"total\", initializer=\"zeros\")\n",
" self.count = self.add_weight(\"count\", initializer=\"zeros\")\n",
+ "\n",
" def update_state(self, y_true, y_pred, sample_weight=None):\n",
- " metric = self.huber_fn(y_true, y_pred)\n",
- " self.total.assign_add(tf.reduce_sum(metric))\n",
+ " sample_metrics = self.huber_fn(y_true, y_pred)\n",
+ " self.total.assign_add(tf.reduce_sum(sample_metrics))\n",
" self.count.assign_add(tf.cast(tf.size(y_true), tf.float32))\n",
+ "\n",
" def result(self):\n",
" return self.total / self.count\n",
+ "\n",
" def get_config(self):\n",
" base_config = super().get_config()\n",
" return {**base_config, \"threshold\": self.threshold}"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Extra material** – the rest of this section tests the `HuberMetric` class and shows another implementation subclassing `tf.keras.metrics.Mean`."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 112,
+ "execution_count": 106,
"metadata": {},
"outputs": [],
"source": [
@@ -1428,26 +1504,33 @@
"# total = 2 * |10 - 2| - 2²/2 = 14\n",
"# count = 1\n",
"# result = 14 / 1 = 14\n",
- "m(tf.constant([[2.]]), tf.constant([[10.]])) "
+ "m(tf.constant([[2.]]), tf.constant([[10.]]))"
]
},
{
"cell_type": "code",
- "execution_count": 113,
+ "execution_count": 107,
"metadata": {},
"outputs": [],
"source": [
"# total = total + (|1 - 0|² / 2) + (2 * |9.25 - 5| - 2² / 2) = 14 + 7 = 21\n",
"# count = count + 2 = 3\n",
"# result = total / count = 21 / 3 = 7\n",
- "m(tf.constant([[0.], [5.]]), tf.constant([[1.], [9.25]]))\n",
- "\n",
+ "m(tf.constant([[0.], [5.]]), tf.constant([[1.], [9.25]]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [],
+ "source": [
"m.result()"
]
},
{
"cell_type": "code",
- "execution_count": 114,
+ "execution_count": 109,
"metadata": {},
"outputs": [],
"source": [
@@ -1456,7 +1539,7 @@
},
{
"cell_type": "code",
- "execution_count": 115,
+ "execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
@@ -1473,85 +1556,80 @@
},
{
"cell_type": "code",
- "execution_count": 116,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
+ "execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
" tf.keras.layers.Dense(1),\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": 118,
+ "execution_count": 112,
"metadata": {},
"outputs": [],
"source": [
- "model.compile(loss=create_huber(2.0), optimizer=\"nadam\", metrics=[HuberMetric(2.0)])"
+ "model.compile(loss=create_huber(2.0), optimizer=\"nadam\",\n",
+ " metrics=[HuberMetric(2.0)])"
]
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 113,
"metadata": {},
"outputs": [],
"source": [
- "model.fit(X_train_scaled.astype(np.float32), y_train.astype(np.float32), epochs=2)"
+ "model.fit(X_train_scaled, y_train, epochs=2)"
]
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 114,
"metadata": {},
"outputs": [],
"source": [
- "model.save(\"my_model_with_a_custom_metric.h5\")"
+ "model.save(\"my_model_with_a_custom_metric\")"
]
},
{
"cell_type": "code",
- "execution_count": 121,
+ "execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_metric.h5\",\n",
- " custom_objects={\"huber_fn\": create_huber(2.0),\n",
- " \"HuberMetric\": HuberMetric})"
+ "model = tf.keras.models.load_model(\n",
+ " \"my_model_with_a_custom_metric\",\n",
+ " custom_objects={\n",
+ " \"huber_fn\": create_huber(2.0),\n",
+ " \"HuberMetric\": HuberMetric\n",
+ " }\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
- "model.fit(X_train_scaled.astype(np.float32), y_train.astype(np.float32), epochs=2)"
+ "model.fit(X_train_scaled, y_train, epochs=2)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Warning**: In TF 2.2, tf.keras adds an extra first metric in `model.metrics` at position 0 (see [TF issue #38150](https://github.com/tensorflow/tensorflow/issues/38150)). This forces us to use `model.metrics[-1]` rather than `model.metrics[0]` to access the `HuberMetric`."
+ "`model.metrics` contains the model's loss followed by the model's metric(s), so the `HuberMetric` is `model.metrics[-1]`:"
]
},
{
"cell_type": "code",
- "execution_count": 123,
+ "execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
@@ -1567,7 +1645,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@@ -1576,9 +1654,11 @@
" self.threshold = threshold\n",
" self.huber_fn = create_huber(threshold)\n",
" super().__init__(name=name, dtype=dtype)\n",
+ "\n",
" def update_state(self, y_true, y_pred, sample_weight=None):\n",
" metric = self.huber_fn(y_true, y_pred)\n",
" super(HuberMetric, self).update_state(metric, sample_weight)\n",
+ "\n",
" def get_config(self):\n",
" base_config = super().get_config()\n",
" return {**base_config, \"threshold\": self.threshold} "
@@ -1593,90 +1673,83 @@
},
{
"cell_type": "code",
- "execution_count": 125,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 126,
+ "execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"selu\", kernel_initializer=\"lecun_normal\",\n",
- " input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=input_shape),\n",
" tf.keras.layers.Dense(1),\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": 127,
+ "execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
- "model.compile(loss=tf.keras.losses.Huber(2.0), optimizer=\"nadam\", weighted_metrics=[HuberMetric(2.0)])"
+ "model.compile(loss=tf.keras.losses.Huber(2.0), optimizer=\"nadam\",\n",
+ " weighted_metrics=[HuberMetric(2.0)])"
]
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 121,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
+ "np.random.seed(42)\n",
"sample_weight = np.random.rand(len(y_train))\n",
- "history = model.fit(X_train_scaled.astype(np.float32), y_train.astype(np.float32),\n",
- " epochs=2, sample_weight=sample_weight)"
+ "history = model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " sample_weight=sample_weight)"
]
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
- "history.history[\"loss\"][0], history.history[\"HuberMetric\"][0] * sample_weight.mean()"
+ "(history.history[\"loss\"][0],\n",
+ " history.history[\"HuberMetric\"][0] * sample_weight.mean())"
]
},
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
- "model.save(\"my_model_with_a_custom_metric_v2.h5\")"
+ "model.save(\"my_model_with_a_custom_metric_v2\")"
]
},
{
"cell_type": "code",
- "execution_count": 131,
+ "execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_metric_v2.h5\",\n",
- " custom_objects={\"HuberMetric\": HuberMetric})"
+ "model = tf.keras.models.load_model(\"my_model_with_a_custom_metric_v2\",\n",
+ " custom_objects={\"HuberMetric\": HuberMetric})"
]
},
{
"cell_type": "code",
- "execution_count": 132,
+ "execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
- "model.fit(X_train_scaled.astype(np.float32), y_train.astype(np.float32), epochs=2)"
+ "model.fit(X_train_scaled, y_train, epochs=2)"
]
},
{
"cell_type": "code",
- "execution_count": 133,
+ "execution_count": 126,
"metadata": {
"scrolled": true
},
@@ -1694,7 +1767,7 @@
},
{
"cell_type": "code",
- "execution_count": 134,
+ "execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
@@ -1703,10 +1776,11 @@
},
{
"cell_type": "code",
- "execution_count": 135,
+ "execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – like all layers, it can be used as a function:\n",
"exponential_layer([-1., 0., 1.])"
]
},
@@ -1714,26 +1788,16 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Adding an exponential layer at the output of a regression model can be useful if the values to predict are positive and with very different scales (e.g., 0.001, 10., 10000):"
+ "Adding an exponential layer at the output of a regression model can be useful if the values to predict are positive and with very different scales (e.g., 0.001, 10., 10000)."
]
},
{
"cell_type": "code",
- "execution_count": 136,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 137,
+ "execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
" tf.keras.layers.Dense(30, activation=\"relu\", input_shape=input_shape),\n",
" tf.keras.layers.Dense(1),\n",
@@ -1745,9 +1809,16 @@
"model.evaluate(X_test_scaled, y_test)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Alternatively, it's often preferable to replace the targets with the logarithm of the targets (and use no activation function in the output layer)."
+ ]
+ },
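+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is a minimal sketch of that alternative (extra code, not from the book). It assumes the targets are strictly positive, which is the case for this dataset; the `log_model` name is just for illustration. The model is trained on the log of the targets, and the predictions are exponentiated to get back to the original scale:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – a sketch: train on the log of the targets (they must be\n",
+ "# strictly positive), then exponentiate the predictions\n",
+ "tf.random.set_seed(42)\n",
+ "log_model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(1) # no activation function in the output layer\n",
+ "])\n",
+ "log_model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
+ "log_model.fit(X_train_scaled, np.log(y_train), epochs=2,\n",
+ " validation_data=(X_valid_scaled, np.log(y_valid)))\n",
+ "y_pred = np.exp(log_model.predict(X_test_scaled)) # back to the original scale"
+ ]
+ },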
{
"cell_type": "code",
- "execution_count": 138,
+ "execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
@@ -1760,10 +1831,10 @@
" def build(self, batch_input_shape):\n",
" self.kernel = self.add_weight(\n",
" name=\"kernel\", shape=[batch_input_shape[-1], self.units],\n",
- " initializer=\"glorot_normal\")\n",
+ " initializer=\"he_normal\")\n",
" self.bias = self.add_weight(\n",
" name=\"bias\", shape=[self.units], initializer=\"zeros\")\n",
- " super().build(batch_input_shape) # must be at the end\n",
+ " super().build(batch_input_shape) # must be at the end\n",
"\n",
" def call(self, X):\n",
" return self.activation(X @ self.kernel + self.bias)\n",
@@ -1779,73 +1850,51 @@
},
{
"cell_type": "code",
- "execution_count": 139,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 140,
+ "execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows that a custom layer can be used normally\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
" MyDense(30, activation=\"relu\", input_shape=input_shape),\n",
" MyDense(1)\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 141,
- "metadata": {},
- "outputs": [],
- "source": [
+ "])\n",
"model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
"model.fit(X_train_scaled, y_train, epochs=2,\n",
" validation_data=(X_valid_scaled, y_valid))\n",
- "model.evaluate(X_test_scaled, y_test)"
+ "model.evaluate(X_test_scaled, y_test)\n",
+ "model.save(\"my_model_with_a_custom_layer\")"
]
},
{
"cell_type": "code",
- "execution_count": 142,
+ "execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
- "model.save(\"my_model_with_a_custom_layer.h5\")"
+ "# extra code – shows how to load a model with a custom layer\n",
+ "model = tf.keras.models.load_model(\"my_model_with_a_custom_layer\",\n",
+ " custom_objects={\"MyDense\": MyDense})\n",
+ "model.fit(X_train_scaled, y_train, epochs=2,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
]
},
{
"cell_type": "code",
- "execution_count": 143,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\"my_model_with_a_custom_layer.h5\",\n",
- " custom_objects={\"MyDense\": MyDense})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 144,
+ "execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
"class MyMultiLayer(tf.keras.layers.Layer):\n",
" def call(self, X):\n",
" X1, X2 = X\n",
- " print(\"X1.shape: \", X1.shape ,\" X2.shape: \", X2.shape) # Debugging of custom layer\n",
- " return X1 + X2, X1 * X2\n",
+ " print(\"X1.shape: \", X1.shape ,\" X2.shape: \", X2.shape) # extra code\n",
+ " return X1 + X2, X1 * X2, X1 / X2\n",
"\n",
" def compute_output_shape(self, batch_input_shape):\n",
" batch_input_shape1, batch_input_shape2 = batch_input_shape\n",
- " return [batch_input_shape1, batch_input_shape2]"
+ " return [batch_input_shape1, batch_input_shape1, batch_input_shape1]"
]
},
{
@@ -1857,103 +1906,34 @@
},
{
"cell_type": "code",
- "execution_count": 145,
+ "execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – tests MyMultiLayer with symbolic inputs\n",
"inputs1 = tf.keras.layers.Input(shape=[2])\n",
"inputs2 = tf.keras.layers.Input(shape=[2])\n",
- "outputs1, outputs2 = MyMultiLayer()((inputs1, inputs2))"
+ "MyMultiLayer()((inputs1, inputs2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Note that the `call()` method receives symbolic inputs, whose shape is only partially specified (at this stage, we don't know the batch size, which is why the first dimension is `None`):\n",
+ "Note that the `call()` method receives symbolic inputs, and it returns symbolic outputs. The shapes are only partially specified at this stage: we don't know the batch size, which is why the first dimension is `None`.\n",
"\n",
- "We can also pass actual data to the custom layer. To test this, let's split each dataset's inputs into two parts, with four features each:"
+ "We can also pass actual data to the custom layer:"
]
},
{
"cell_type": "code",
- "execution_count": 146,
+ "execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
- "def split_data(data):\n",
- " columns_count = data.shape[-1]\n",
- " half = columns_count // 2\n",
- " return data[:, :half], data[:, half:]\n",
- "\n",
- "X_train_scaled_A, X_train_scaled_B = split_data(X_train_scaled)\n",
- "X_valid_scaled_A, X_valid_scaled_B = split_data(X_valid_scaled)\n",
- "X_test_scaled_A, X_test_scaled_B = split_data(X_test_scaled)\n",
- "\n",
- "# Printing the splitted data shapes\n",
- "X_train_scaled_A.shape, X_train_scaled_B.shape"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now notice that the shapes are fully specified:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 147,
- "metadata": {},
- "outputs": [],
- "source": [
- "outputs1, outputs2 = MyMultiLayer()((X_train_scaled_A, X_train_scaled_B))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's build a more complete model using the functional API (this is just a toy example, don't expect awesome performance):"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 148,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
- "\n",
- "input_A = tf.keras.layers.Input(shape=X_train_scaled_A.shape[-1])\n",
- "input_B = tf.keras.layers.Input(shape=X_train_scaled_B.shape[-1])\n",
- "hidden_A, hidden_B = MyMultiLayer()((input_A, input_B))\n",
- "hidden_A = tf.keras.layers.Dense(30, activation='selu')(hidden_A)\n",
- "hidden_B = tf.keras.layers.Dense(30, activation='selu')(hidden_B)\n",
- "concat = tf.keras.layers.Concatenate()((hidden_A, hidden_B))\n",
- "output = tf.keras.layers.Dense(1)(concat)\n",
- "model = tf.keras.Model(inputs=[input_A, input_B], outputs=[output])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 149,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss='mse', optimizer='nadam')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 150,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.fit((X_train_scaled_A, X_train_scaled_B), y_train, epochs=2,\n",
- " validation_data=((X_valid_scaled_A, X_valid_scaled_B), y_valid))"
+ "# extra code – tests MyMultiLayer with actual data \n",
+ "X1, X2 = np.array([[3., 6.], [2., 7.]]), np.array([[6., 12.], [4., 3.]]) \n",
+ "MyMultiLayer()((X1, X2))"
]
},
{
@@ -1965,11 +1945,11 @@
},
{
"cell_type": "code",
- "execution_count": 151,
+ "execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
- "class AddGaussianNoise(tf.keras.layers.Layer):\n",
+ "class MyGaussianNoise(tf.keras.layers.Layer):\n",
" def __init__(self, stddev, **kwargs):\n",
" super().__init__(**kwargs)\n",
" self.stddev = stddev\n",
@@ -1994,27 +1974,18 @@
},
{
"cell_type": "code",
- "execution_count": 152,
+ "execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
+ "# extra code – tests MyGaussianNoise\n",
"tf.random.set_seed(42)\n",
- "\n",
"model = tf.keras.Sequential([\n",
- " AddGaussianNoise(stddev=1.0),\n",
- " tf.keras.layers.Dense(30, activation=\"selu\"),\n",
+ " MyGaussianNoise(stddev=1.0, input_shape=input_shape),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" tf.keras.layers.Dense(1)\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 153,
- "metadata": {},
- "outputs": [],
- "source": [
+ "])\n",
"model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
"model.fit(X_train_scaled, y_train, epochs=2,\n",
" validation_data=(X_valid_scaled, y_valid))\n",
@@ -2030,24 +2001,15 @@
},
{
"cell_type": "code",
- "execution_count": 154,
- "metadata": {},
- "outputs": [],
- "source": [
- "X_new_scaled = X_test_scaled"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 155,
+ "execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
"class ResidualBlock(tf.keras.layers.Layer):\n",
" def __init__(self, n_layers, n_neurons, **kwargs):\n",
" super().__init__(**kwargs)\n",
- " self.hidden = [tf.keras.layers.Dense(n_neurons, activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\")\n",
+ " self.hidden = [tf.keras.layers.Dense(n_neurons, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\")\n",
" for _ in range(n_layers)]\n",
"\n",
" def call(self, inputs):\n",
@@ -2059,15 +2021,15 @@
},
{
"cell_type": "code",
- "execution_count": 156,
+ "execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
"class ResidualRegressor(tf.keras.Model):\n",
" def __init__(self, output_dim, **kwargs):\n",
" super().__init__(**kwargs)\n",
- " self.hidden1 = tf.keras.layers.Dense(30, activation=\"elu\",\n",
- " kernel_initializer=\"he_normal\")\n",
+ " self.hidden1 = tf.keras.layers.Dense(30, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\")\n",
" self.block1 = ResidualBlock(2, 30)\n",
" self.block2 = ResidualBlock(2, 30)\n",
" self.out = tf.keras.layers.Dense(output_dim)\n",
@@ -2082,53 +2044,30 @@
},
{
"cell_type": "code",
- "execution_count": 157,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 158,
+ "execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows that the model can be used normally\n",
+ "tf.random.set_seed(42)\n",
"model = ResidualRegressor(1)\n",
"model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
- "history = model.fit(X_train_scaled, y_train, epochs=5)\n",
+ "history = model.fit(X_train_scaled, y_train, epochs=2)\n",
"score = model.evaluate(X_test_scaled, y_test)\n",
- "y_pred = model.predict(X_new_scaled)"
+ "model.save(\"my_custom_model\")"
]
},
{
"cell_type": "code",
- "execution_count": 159,
+ "execution_count": 141,
"metadata": {},
"outputs": [],
"source": [
- "model.save(\"my_custom_model.ckpt\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 160,
- "metadata": {},
- "outputs": [],
- "source": [
- "model = tf.keras.models.load_model(\"my_custom_model.ckpt\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 161,
- "metadata": {},
- "outputs": [],
- "source": [
- "history = model.fit(X_train_scaled, y_train, epochs=5)"
+ "# extra code – the model can be loaded and you can continue training or use it\n",
+ "# to make predictions\n",
+ "model = tf.keras.models.load_model(\"my_custom_model\")\n",
+ "history = model.fit(X_train_scaled, y_train, epochs=2)\n",
+ "model.predict(X_test_scaled[:3])"
]
},
{
@@ -2140,42 +2079,21 @@
},
{
"cell_type": "code",
- "execution_count": 162,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 163,
+ "execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42)\n",
"block1 = ResidualBlock(2, 30)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"elu\", kernel_initializer=\"he_normal\"),\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\"),\n",
" block1, block1, block1, block1,\n",
" ResidualBlock(2, 30),\n",
" tf.keras.layers.Dense(1)\n",
"])"
]
},
- {
- "cell_type": "code",
- "execution_count": 164,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
- "history = model.fit(X_train_scaled, y_train, epochs=5)\n",
- "score = model.evaluate(X_test_scaled, y_test)\n",
- "y_pred = model.predict(X_new_scaled)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -2187,30 +2105,29 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Note**: the following code has two differences with the code in the book:\n",
- "1. It creates a `tf.keras.metrics.Mean()` metric in the constructor and uses it in the `call()` method to track the mean reconstruction loss. Since we only want to do this during training, we add a `training` argument to the `call()` method, and if `training` is `True`, then we update `reconstruction_mean` and we call `self.add_metric()` to ensure it's displayed properly.\n",
- "2. Due to an issue introduced in TF 2.2 ([#46858](https://github.com/tensorflow/tensorflow/issues/46858)), we must not call `super().build()` inside the `build()` method."
+ "**Warning**: due to an issue introduced in TF 2.2 ([#46858](https://github.com/tensorflow/tensorflow/issues/46858)), `super().build()` fails. We can work around this issue by setting `self.built = True` instead."
]
},
{
"cell_type": "code",
- "execution_count": 165,
+ "execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
"class ReconstructingRegressor(tf.keras.Model):\n",
" def __init__(self, output_dim, **kwargs):\n",
" super().__init__(**kwargs)\n",
- " self.hidden = [tf.keras.layers.Dense(30, activation=\"selu\",\n",
- " kernel_initializer=\"lecun_normal\")\n",
+ " self.hidden = [tf.keras.layers.Dense(30, activation=\"relu\",\n",
+ " kernel_initializer=\"he_normal\")\n",
" for _ in range(5)]\n",
" self.out = tf.keras.layers.Dense(output_dim)\n",
- " self.reconstruction_mean = tf.keras.metrics.Mean(name=\"reconstruction_error\")\n",
+ " self.reconstruction_mean = tf.keras.metrics.Mean(\n",
+ " name=\"reconstruction_error\")\n",
"\n",
" def build(self, batch_input_shape):\n",
" n_inputs = batch_input_shape[-1]\n",
" self.reconstruct = tf.keras.layers.Dense(n_inputs)\n",
- " #super().build(batch_input_shape)\n",
+ " self.built = True # WORKAROUND for super().build(batch_input_shape)\n",
"\n",
" def call(self, inputs, training=None):\n",
" Z = inputs\n",
@@ -2227,24 +2144,15 @@
},
{
"cell_type": "code",
- "execution_count": 166,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 167,
+ "execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code\n",
+ "tf.random.set_seed(42)\n",
"model = ReconstructingRegressor(1)\n",
"model.compile(loss=\"mse\", optimizer=\"nadam\")\n",
- "history = model.fit(X_train_scaled, y_train, epochs=2)\n",
+ "history = model.fit(X_train_scaled, y_train, epochs=5)\n",
"y_pred = model.predict(X_test_scaled)"
]
},
@@ -2252,12 +2160,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Computing Gradients with Autodiff"
+ "## Computing Gradients Using Autodiff"
]
},
{
"cell_type": "code",
- "execution_count": 168,
+ "execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
@@ -2267,7 +2175,7 @@
},
{
"cell_type": "code",
- "execution_count": 169,
+ "execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
@@ -2278,7 +2186,7 @@
},
{
"cell_type": "code",
- "execution_count": 170,
+ "execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
@@ -2287,7 +2195,7 @@
},
{
"cell_type": "code",
- "execution_count": 171,
+ "execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
@@ -2300,7 +2208,7 @@
},
{
"cell_type": "code",
- "execution_count": 172,
+ "execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
@@ -2309,37 +2217,37 @@
},
{
"cell_type": "code",
- "execution_count": 173,
+ "execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
"with tf.GradientTape() as tape:\n",
" z = f(w1, w2)\n",
"\n",
- "dz_dw1 = tape.gradient(z, w1)\n",
+ "dz_dw1 = tape.gradient(z, w1) # returns tensor 36.0\n",
"try:\n",
- " dz_dw2 = tape.gradient(z, w2)\n",
+ " dz_dw2 = tape.gradient(z, w2) # raises a RuntimeError!\n",
"except RuntimeError as ex:\n",
" print(ex)"
]
},
{
"cell_type": "code",
- "execution_count": 174,
+ "execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
"with tf.GradientTape(persistent=True) as tape:\n",
" z = f(w1, w2)\n",
"\n",
- "dz_dw1 = tape.gradient(z, w1)\n",
- "dz_dw2 = tape.gradient(z, w2) # works now!\n",
+ "dz_dw1 = tape.gradient(z, w1) # returns tensor 36.0\n",
+ "dz_dw2 = tape.gradient(z, w2) # returns tensor 10.0, works fine now!\n",
"del tape"
]
},
{
"cell_type": "code",
- "execution_count": 175,
+ "execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
@@ -2348,7 +2256,7 @@
},
{
"cell_type": "code",
- "execution_count": 176,
+ "execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
@@ -2361,7 +2269,7 @@
},
{
"cell_type": "code",
- "execution_count": 177,
+ "execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
@@ -2370,7 +2278,7 @@
},
{
"cell_type": "code",
- "execution_count": 178,
+ "execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
@@ -2384,7 +2292,7 @@
},
{
"cell_type": "code",
- "execution_count": 179,
+ "execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
@@ -2393,10 +2301,12 @@
},
{
"cell_type": "code",
- "execution_count": 180,
+ "execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – if given a vector, tape.gradient() will compute the gradient of\n",
+ "# the vector's sum.\n",
"with tf.GradientTape() as tape:\n",
" z1 = f(w1, w2 + 2.)\n",
" z2 = f(w1, w2 + 5.)\n",
@@ -2407,25 +2317,27 @@
},
{
"cell_type": "code",
- "execution_count": 181,
+ "execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
- "with tf.GradientTape(persistent=True) as tape:\n",
+ "# extra code – shows that we get the same result as the previous cell\n",
+ "with tf.GradientTape() as tape:\n",
" z1 = f(w1, w2 + 2.)\n",
" z2 = f(w1, w2 + 5.)\n",
" z3 = f(w1, w2 + 7.)\n",
+ " z = z1 + z2 + z3\n",
"\n",
- "tf.reduce_sum(tf.stack([tape.gradient(z, [w1, w2]) for z in (z1, z2, z3)]), axis=0)\n",
- "del tape"
+ "tape.gradient(z, [w1, w2])"
]
},
{
"cell_type": "code",
- "execution_count": 182,
+ "execution_count": 159,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to compute the jacobians and the hessians\n",
"with tf.GradientTape(persistent=True) as hessian_tape:\n",
" with tf.GradientTape() as jacobian_tape:\n",
" z = f(w1, w2)\n",
@@ -2437,7 +2349,7 @@
},
{
"cell_type": "code",
- "execution_count": 183,
+ "execution_count": 160,
"metadata": {},
"outputs": [],
"source": [
@@ -2446,7 +2358,7 @@
},
{
"cell_type": "code",
- "execution_count": 184,
+ "execution_count": 161,
"metadata": {},
"outputs": [],
"source": [
@@ -2455,7 +2367,7 @@
},
{
"cell_type": "code",
- "execution_count": 185,
+ "execution_count": 162,
"metadata": {},
"outputs": [],
"source": [
@@ -2463,27 +2375,36 @@
" return 3 * w1 ** 2 + tf.stop_gradient(2 * w1 * w2)\n",
"\n",
"with tf.GradientTape() as tape:\n",
- " z = f(w1, w2)\n",
+ " z = f(w1, w2) # same result as without stop_gradient()\n",
"\n",
- "tape.gradient(z, [w1, w2])"
+ "gradients = tape.gradient(z, [w1, w2])"
]
},
{
"cell_type": "code",
- "execution_count": 186,
+ "execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
- "x = tf.Variable(100.)\n",
+ "gradients"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 164,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = tf.Variable(1e-50)\n",
"with tf.GradientTape() as tape:\n",
- " z = my_softplus(x)\n",
+ " z = tf.sqrt(x)\n",
"\n",
"tape.gradient(z, [x])"
]
},
{
"cell_type": "code",
- "execution_count": 187,
+ "execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
@@ -2492,11 +2413,11 @@
},
{
"cell_type": "code",
- "execution_count": 188,
+ "execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
- "x = tf.Variable([100.])\n",
+ "x = tf.Variable([1.0e30])\n",
"with tf.GradientTape() as tape:\n",
" z = my_softplus(x)\n",
"\n",
@@ -2505,37 +2426,54 @@
},
{
"cell_type": "code",
- "execution_count": 189,
+ "execution_count": 167,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def my_softplus(z):\n",
+ " return tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0., z)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is the proof that this equation is equal to log(1 + exp(_z_)):\n",
+ "* softplus(_z_) = log(1 + exp(_z_))\n",
+ "* softplus(_z_) = log(1 + exp(_z_)) - log(exp(_z_)) + log(exp(_z_)) ; **just adding and subtracting the same value**\n",
+ "* softplus(_z_) = log\\[(1 + exp(_z_)) / exp(_z_)\\] + log(exp(_z_)) ; **since log(_a_) - log(_b_) = log(_a_ / _b_)**\n",
+ "* softplus(_z_) = log\\[(1 + exp(_z_)) / exp(_z_)\\] + _z_ ; **since log(exp(_z_)) = _z_**\n",
+ "* softplus(_z_) = log\\[1 / exp(_z_) + exp(_z_) / exp(_z_)\\] + _z_ ; **since (1 + _a_) / _b_ = 1 / _b_ + _a_ / _b_**\n",
+ "* softplus(_z_) = log\\[exp(–_z_) + 1\\] + _z_ ; **since 1 / exp(_z_) = exp(–z), and exp(_z_) / exp(_z_) = 1**\n",
+ "* softplus(_z_) = softplus(–_z_) + _z_ ; **we recognize the definition at the top, but with –_z_**\n",
+ "* softplus(_z_) = softplus(–|_z_|) + max(0, _z_) ; **if you consider both cases, _z_ < 0 or _z_ ≥ 0, you will see that this works**"
+ ]
+ },
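+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A quick numerical check of this identity (not in the book, just a sanity check): for moderate inputs the rewritten formula matches the naive log(1 + exp(_z_)), and for large inputs it stays finite where the naive formula overflows:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – sanity check of the numerically stable softplus formula\n",
+ "z = tf.constant([-10., -1., 0., 1., 10.])\n",
+ "naive_softplus = tf.math.log(1. + tf.exp(z))  # fine for moderate z\n",
+ "print(tf.reduce_max(tf.abs(naive_softplus - my_softplus(z))))  # ~0, formulas agree\n",
+ "print(tf.math.log(1. + tf.exp(tf.constant(1000.))))  # inf: the naive formula overflows\n",
+ "print(my_softplus(tf.constant(1000.)))  # 1000.0: the stable formula does not"
+ ]
+ },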
+ {
+ "cell_type": "code",
+ "execution_count": 168,
"metadata": {},
"outputs": [],
"source": [
"@tf.custom_gradient\n",
- "def my_better_softplus(z):\n",
- " exp = tf.exp(z)\n",
- " def my_softplus_gradients(grad):\n",
- " return grad / (1 + 1 / exp)\n",
- " return tf.math.log(exp + 1), my_softplus_gradients"
+ "def my_softplus(z):\n",
+ " def my_softplus_gradients(grads): # grads = backprop'ed from upper layers\n",
+ " return grads * (1 - 1 / (1 + tf.exp(z))) # stable grads of softplus\n",
+ "\n",
+ " result = tf.math.log(1 + tf.exp(-tf.abs(z))) + tf.maximum(0., z)\n",
+ " return result, my_softplus_gradients"
]
},
{
"cell_type": "code",
- "execution_count": 190,
- "metadata": {},
- "outputs": [],
- "source": [
- "def my_better_softplus(z):\n",
- " return tf.where(z > 30., z, tf.math.log(tf.exp(z) + 1.))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 191,
+ "execution_count": 169,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows that the function is now stable, as well as its gradients\n",
"x = tf.Variable([1000.])\n",
"with tf.GradientTape() as tape:\n",
- " z = my_better_softplus(x)\n",
+ " z = my_softplus(x)\n",
"\n",
"z, tape.gradient(z, [x])"
]
@@ -2544,37 +2482,27 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Computing Gradients Using Autodiff"
+ "## Custom Training Loops"
]
},
{
"cell_type": "code",
- "execution_count": 192,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 193,
+ "execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
+ "tf.random.set_seed(42) # extra code – to ensure reproducibility\n",
"l2_reg = tf.keras.regularizers.l2(0.05)\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"elu\", kernel_initializer=\"he_normal\",\n",
- " kernel_regularizer=l2_reg),\n",
+ "model = tf.keras.models.Sequential([\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " kernel_regularizer=l2_reg),\n",
" tf.keras.layers.Dense(1, kernel_regularizer=l2_reg)\n",
"])"
]
},
{
"cell_type": "code",
- "execution_count": 194,
+ "execution_count": 171,
"metadata": {},
"outputs": [],
"source": [
@@ -2585,117 +2513,37 @@
},
{
"cell_type": "code",
- "execution_count": 195,
+ "execution_count": 172,
"metadata": {},
"outputs": [],
"source": [
- "def print_status_bar(iteration, total, loss, metrics=None):\n",
- " metrics = \" - \".join([\"{}: {:.4f}\".format(m.name, m.result())\n",
- " for m in [loss] + (metrics or [])])\n",
- " end = \"\" if iteration < total else \"\\n\"\n",
- " print(\"\\r{}/{} - \".format(iteration, total) + metrics,\n",
- " end=end)"
+ "def print_status_bar(step, total, loss, metrics=None):\n",
+ " metrics = \" - \".join([f\"{m.name}: {m.result():.4f}\"\n",
+ " for m in [loss] + (metrics or [])])\n",
+ " end = \"\" if step < total else \"\\n\"\n",
+ " print(f\"\\r{step}/{total} - \" + metrics, end=end)"
]
},
{
"cell_type": "code",
- "execution_count": 196,
+ "execution_count": 173,
"metadata": {},
"outputs": [],
"source": [
- "import time\n",
- "\n",
- "mean_loss = tf.keras.metrics.Mean(name=\"loss\")\n",
- "mean_square = tf.keras.metrics.Mean(name=\"mean_square\")\n",
- "for i in range(1, 50 + 1):\n",
- " loss = 1 / i\n",
- " mean_loss(loss)\n",
- " mean_square(i ** 2)\n",
- " print_status_bar(i, 50, mean_loss, [mean_square])\n",
- " time.sleep(0.05)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "A fancier version with a progress bar:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 197,
- "metadata": {},
- "outputs": [],
- "source": [
- "def progress_bar(iteration, total, size=30):\n",
- " running = iteration < total\n",
- " c = \">\" if running else \"=\"\n",
- " p = (size - 1) * iteration // total\n",
- " fmt = \"{{:-{}d}}/{{}} [{{}}]\".format(len(str(total)))\n",
- " params = [iteration, total, \"=\" * p + c + \".\" * (size - p - 1)]\n",
- " return fmt.format(*params)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 198,
- "metadata": {},
- "outputs": [],
- "source": [
- "progress_bar(3500, 10000, size=6)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 199,
- "metadata": {},
- "outputs": [],
- "source": [
- "def print_status_bar(iteration, total, loss, metrics=None, size=30):\n",
- " metrics = \" - \".join([\"{}: {:.4f}\".format(m.name, m.result())\n",
- " for m in [loss] + (metrics or [])])\n",
- " end = \"\" if iteration < total else \"\\n\"\n",
- " print(\"\\r{} - {}\".format(progress_bar(iteration, total), metrics), end=end)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 200,
- "metadata": {},
- "outputs": [],
- "source": [
- "mean_loss = tf.keras.metrics.Mean(name=\"loss\")\n",
- "mean_square = tf.keras.metrics.Mean(name=\"mean_square\")\n",
- "for i in range(1, 50 + 1):\n",
- " loss = 1 / i\n",
- " mean_loss(loss)\n",
- " mean_square(i ** 2)\n",
- " print_status_bar(i, 50, mean_loss, [mean_square])\n",
- " time.sleep(0.05)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 201,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
"np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 202,
+ "execution_count": 174,
"metadata": {},
"outputs": [],
"source": [
"n_epochs = 5\n",
"batch_size = 32\n",
"n_steps = len(X_train) // batch_size\n",
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)\n",
"loss_fn = tf.keras.losses.mean_squared_error\n",
"mean_loss = tf.keras.metrics.Mean()\n",
"metrics = [tf.keras.metrics.MeanAbsoluteError()]"
@@ -2703,68 +2551,77 @@
},
{
"cell_type": "code",
- "execution_count": 203,
+ "execution_count": 175,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"for epoch in range(1, n_epochs + 1):\n",
- " print(\"Epoch {}/{}\".format(epoch, n_epochs))\n",
+ " print(f\"Epoch {epoch}/{n_epochs}\")\n",
" for step in range(1, n_steps + 1):\n",
" X_batch, y_batch = random_batch(X_train_scaled, y_train)\n",
" with tf.GradientTape() as tape:\n",
- " y_pred = model(X_batch)\n",
+ " y_pred = model(X_batch, training=True)\n",
" main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
" loss = tf.add_n([main_loss] + model.losses)\n",
+ "\n",
" gradients = tape.gradient(loss, model.trainable_variables)\n",
" optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
+ "\n",
+ " # extra code – if your model has variable constraints\n",
" for variable in model.variables:\n",
" if variable.constraint is not None:\n",
" variable.assign(variable.constraint(variable))\n",
+ "\n",
" mean_loss(loss)\n",
" for metric in metrics:\n",
" metric(y_batch, y_pred)\n",
- " print_status_bar(step * batch_size, len(y_train), mean_loss, metrics)\n",
- " print_status_bar(len(y_train), len(y_train), mean_loss, metrics)\n",
+ "\n",
+ " print_status_bar(step, n_steps, mean_loss, metrics)\n",
+ "\n",
" for metric in [mean_loss] + metrics:\n",
" metric.reset_states()"
]
},
{
"cell_type": "code",
- "execution_count": 204,
+ "execution_count": 176,
"metadata": {},
"outputs": [],
"source": [
- "try:\n",
- " from tqdm.notebook import trange\n",
- " from collections import OrderedDict\n",
- " with trange(1, n_epochs + 1, desc=\"All epochs\") as epochs:\n",
- " for epoch in epochs:\n",
- " with trange(1, n_steps + 1, desc=\"Epoch {}/{}\".format(epoch, n_epochs)) as steps:\n",
- " for step in steps:\n",
- " X_batch, y_batch = random_batch(X_train_scaled, y_train)\n",
- " with tf.GradientTape() as tape:\n",
- " y_pred = model(X_batch)\n",
- " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
- " loss = tf.add_n([main_loss] + model.losses)\n",
- " gradients = tape.gradient(loss, model.trainable_variables)\n",
- " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
- " for variable in model.variables:\n",
- " if variable.constraint is not None:\n",
- " variable.assign(variable.constraint(variable)) \n",
- " status = OrderedDict()\n",
- " mean_loss(loss)\n",
- " status[\"loss\"] = mean_loss.result().numpy()\n",
- " for metric in metrics:\n",
- " metric(y_batch, y_pred)\n",
- " status[metric.name] = metric.result().numpy()\n",
- " steps.set_postfix(status)\n",
- " for metric in [mean_loss] + metrics:\n",
- " metric.reset_states()\n",
- "except ImportError as ex:\n",
- " print(\"To run this cell, please install tqdm, ipywidgets and restart Jupyter\")"
+ "# extra code – shows how to use the tqdm package to display nice progress bars\n",
+ "\n",
+ "from tqdm.notebook import trange\n",
+ "from collections import OrderedDict\n",
+ "with trange(1, n_epochs + 1, desc=\"All epochs\") as epochs:\n",
+ " for epoch in epochs:\n",
+ " with trange(1, n_steps + 1, desc=f\"Epoch {epoch}/{n_epochs}\") as steps:\n",
+ " for step in steps:\n",
+ " X_batch, y_batch = random_batch(X_train_scaled, y_train)\n",
+ " with tf.GradientTape() as tape:\n",
+ " y_pred = model(X_batch)\n",
+ " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
+ " loss = tf.add_n([main_loss] + model.losses)\n",
+ "\n",
+ " gradients = tape.gradient(loss, model.trainable_variables)\n",
+ " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
+ "\n",
+ " for variable in model.variables:\n",
+ " if variable.constraint is not None:\n",
+ " variable.assign(variable.constraint(variable))\n",
+ "\n",
+ " status = OrderedDict()\n",
+ " mean_loss(loss)\n",
+ " status[\"loss\"] = mean_loss.result().numpy()\n",
+ " for metric in metrics:\n",
+ " metric(y_batch, y_pred)\n",
+ " status[metric.name] = metric.result().numpy()\n",
+ "\n",
+ " steps.set_postfix(status)\n",
+ "\n",
+ " for metric in [mean_loss] + metrics:\n",
+ " metric.reset_states()"
]
},
{
@@ -2776,7 +2633,7 @@
},
{
"cell_type": "code",
- "execution_count": 205,
+ "execution_count": 177,
"metadata": {},
"outputs": [],
"source": [
@@ -2786,7 +2643,7 @@
},
{
"cell_type": "code",
- "execution_count": 206,
+ "execution_count": 178,
"metadata": {},
"outputs": [],
"source": [
@@ -2795,7 +2652,7 @@
},
{
"cell_type": "code",
- "execution_count": 207,
+ "execution_count": 179,
"metadata": {},
"outputs": [],
"source": [
@@ -2804,7 +2661,7 @@
},
{
"cell_type": "code",
- "execution_count": 208,
+ "execution_count": 180,
"metadata": {},
"outputs": [],
"source": [
@@ -2814,7 +2671,7 @@
},
{
"cell_type": "code",
- "execution_count": 209,
+ "execution_count": 181,
"metadata": {},
"outputs": [],
"source": [
@@ -2823,13 +2680,31 @@
},
{
"cell_type": "code",
- "execution_count": 210,
+ "execution_count": 182,
"metadata": {},
"outputs": [],
"source": [
"tf_cube(tf.constant(2.0))"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 183,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@tf.function\n",
+ "def tf_cube(x):\n",
+ " return x ** 3"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Note:** the rest of the code in this section is in appendix D."
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -2839,17 +2714,17 @@
},
{
"cell_type": "code",
- "execution_count": 211,
+ "execution_count": 184,
"metadata": {},
"outputs": [],
"source": [
"concrete_function = tf_cube.get_concrete_function(tf.constant(2.0))\n",
- "concrete_function.graph"
+ "concrete_function"
]
},
{
"cell_type": "code",
- "execution_count": 212,
+ "execution_count": 185,
"metadata": {},
"outputs": [],
"source": [
@@ -2858,7 +2733,7 @@
},
{
"cell_type": "code",
- "execution_count": 213,
+ "execution_count": 186,
"metadata": {},
"outputs": [],
"source": [
@@ -2874,7 +2749,7 @@
},
{
"cell_type": "code",
- "execution_count": 214,
+ "execution_count": 187,
"metadata": {},
"outputs": [],
"source": [
@@ -2883,7 +2758,7 @@
},
{
"cell_type": "code",
- "execution_count": 215,
+ "execution_count": 188,
"metadata": {},
"outputs": [],
"source": [
@@ -2893,7 +2768,7 @@
},
{
"cell_type": "code",
- "execution_count": 216,
+ "execution_count": 189,
"metadata": {},
"outputs": [],
"source": [
@@ -2903,7 +2778,7 @@
},
{
"cell_type": "code",
- "execution_count": 217,
+ "execution_count": 190,
"metadata": {},
"outputs": [],
"source": [
@@ -2912,7 +2787,7 @@
},
{
"cell_type": "code",
- "execution_count": 218,
+ "execution_count": 191,
"metadata": {},
"outputs": [],
"source": [
@@ -2921,7 +2796,7 @@
},
{
"cell_type": "code",
- "execution_count": 219,
+ "execution_count": 192,
"metadata": {},
"outputs": [],
"source": [
@@ -2930,7 +2805,7 @@
},
{
"cell_type": "code",
- "execution_count": 220,
+ "execution_count": 193,
"metadata": {},
"outputs": [],
"source": [
@@ -2946,19 +2821,19 @@
},
{
"cell_type": "code",
- "execution_count": 221,
+ "execution_count": 194,
"metadata": {},
"outputs": [],
"source": [
"@tf.function\n",
"def tf_cube(x):\n",
- " print(\"print:\", x)\n",
+ " print(f\"x = {x}\")\n",
" return x ** 3"
]
},
{
"cell_type": "code",
- "execution_count": 222,
+ "execution_count": 195,
"metadata": {},
"outputs": [],
"source": [
@@ -2967,7 +2842,7 @@
},
{
"cell_type": "code",
- "execution_count": 223,
+ "execution_count": 196,
"metadata": {},
"outputs": [],
"source": [
@@ -2976,15 +2851,47 @@
},
{
"cell_type": "code",
- "execution_count": 224,
+ "execution_count": 197,
"metadata": {},
"outputs": [],
"source": [
- "result = tf_cube(2)\n",
- "result = tf_cube(3)\n",
- "result = tf_cube(tf.constant([[1., 2.]])) # New shape: trace!\n",
- "result = tf_cube(tf.constant([[3., 4.], [5., 6.]])) # New shape: trace!\n",
- "result = tf_cube(tf.constant([[7., 8.], [9., 10.], [11., 12.]])) # New shape: trace!\n"
+ "result = tf_cube(2)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 198,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = tf_cube(3)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 199,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = tf_cube(tf.constant([[1., 2.]])) # New shape: trace!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 200,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = tf_cube(tf.constant([[3., 4.], [5., 6.]])) # New shape: trace!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "result = tf_cube(tf.constant([[7., 8.], [9., 10.]])) # Same shape: no trace"
]
},
{
@@ -2996,48 +2903,46 @@
},
{
"cell_type": "code",
- "execution_count": 225,
+ "execution_count": 202,
"metadata": {},
"outputs": [],
"source": [
"@tf.function(input_signature=[tf.TensorSpec([None, 28, 28], tf.float32)])\n",
"def shrink(images):\n",
- " print(\"Tracing\", images)\n",
+ " print(\"Tracing\", images) # extra code to show when tracing happens\n",
" return images[:, ::2, ::2] # drop half the rows and columns"
]
},
{
"cell_type": "code",
- "execution_count": 226,
+ "execution_count": 203,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 227,
+ "execution_count": 204,
"metadata": {},
"outputs": [],
"source": [
"img_batch_1 = tf.random.uniform(shape=[100, 28, 28])\n",
"img_batch_2 = tf.random.uniform(shape=[50, 28, 28])\n",
- "preprocessed_images = shrink(img_batch_1) # Traces the function.\n",
- "preprocessed_images = shrink(img_batch_2) # Reuses the same concrete function."
+ "preprocessed_images = shrink(img_batch_1) # Works fine, traces the function\n",
+ "preprocessed_images = shrink(img_batch_2) # Works fine, same concrete function"
]
},
{
"cell_type": "code",
- "execution_count": 228,
+ "execution_count": 205,
"metadata": {},
"outputs": [],
"source": [
"img_batch_3 = tf.random.uniform(shape=[2, 2, 2])\n",
"try:\n",
- " preprocessed_images = shrink(img_batch_3) # rejects unexpected types or shapes\n",
+ " preprocessed_images = shrink(img_batch_3) # ValueError! Incompatible inputs\n",
"except ValueError as ex:\n",
" print(ex)"
]
@@ -3058,7 +2963,7 @@
},
{
"cell_type": "code",
- "execution_count": 229,
+ "execution_count": 206,
"metadata": {},
"outputs": [],
"source": [
@@ -3071,7 +2976,7 @@
},
{
"cell_type": "code",
- "execution_count": 230,
+ "execution_count": 207,
"metadata": {},
"outputs": [],
"source": [
@@ -3080,7 +2985,7 @@
},
{
"cell_type": "code",
- "execution_count": 231,
+ "execution_count": 208,
"metadata": {},
"outputs": [],
"source": [
@@ -3096,10 +3001,11 @@
},
{
"cell_type": "code",
- "execution_count": 232,
+ "execution_count": 209,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to use tf.while_loop (usually @tf.function is simpler)\n",
"@tf.function\n",
"def add_10(x):\n",
" condition = lambda i, x: tf.less(i, 10)\n",
@@ -3110,7 +3016,7 @@
},
{
"cell_type": "code",
- "execution_count": 233,
+ "execution_count": 210,
"metadata": {},
"outputs": [],
"source": [
@@ -3119,7 +3025,7 @@
},
{
"cell_type": "code",
- "execution_count": 234,
+ "execution_count": 211,
"metadata": {},
"outputs": [],
"source": [
@@ -3135,7 +3041,7 @@
},
{
"cell_type": "code",
- "execution_count": 235,
+ "execution_count": 212,
"metadata": {},
"outputs": [],
"source": [
@@ -3148,7 +3054,7 @@
},
{
"cell_type": "code",
- "execution_count": 236,
+ "execution_count": 213,
"metadata": {},
"outputs": [],
"source": [
@@ -3164,7 +3070,7 @@
},
{
"cell_type": "code",
- "execution_count": 237,
+ "execution_count": 214,
"metadata": {},
"outputs": [],
"source": [
@@ -3172,22 +3078,15 @@
"\n",
"@tf.function\n",
"def increment(counter, c=1):\n",
- " return counter.assign_add(c)"
+ " return counter.assign_add(c)\n",
+ "\n",
+ "increment(counter) # counter is now equal to 1\n",
+ "increment(counter) # counter is now equal to 2"
]
},
{
"cell_type": "code",
- "execution_count": 238,
- "metadata": {},
- "outputs": [],
- "source": [
- "increment(counter)\n",
- "increment(counter)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 239,
+ "execution_count": 215,
"metadata": {},
"outputs": [],
"source": [
@@ -3197,7 +3096,7 @@
},
{
"cell_type": "code",
- "execution_count": 240,
+ "execution_count": 216,
"metadata": {},
"outputs": [],
"source": [
@@ -3210,7 +3109,7 @@
},
{
"cell_type": "code",
- "execution_count": 241,
+ "execution_count": 217,
"metadata": {},
"outputs": [],
"source": [
@@ -3220,7 +3119,7 @@
},
{
"cell_type": "code",
- "execution_count": 242,
+ "execution_count": 218,
"metadata": {},
"outputs": [],
"source": [
@@ -3230,7 +3129,7 @@
},
{
"cell_type": "code",
- "execution_count": 243,
+ "execution_count": 219,
"metadata": {},
"outputs": [],
"source": [
@@ -3245,7 +3144,7 @@
},
{
"cell_type": "code",
- "execution_count": 244,
+ "execution_count": 220,
"metadata": {},
"outputs": [],
"source": [
@@ -3256,7 +3155,7 @@
},
{
"cell_type": "code",
- "execution_count": 245,
+ "execution_count": 221,
"metadata": {
"scrolled": true
},
@@ -3273,21 +3172,22 @@
},
{
"cell_type": "code",
- "execution_count": 246,
+ "execution_count": 222,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to display the autograph code with syntax highlighting\n",
"def display_tf_code(func):\n",
" from IPython.display import display, Markdown\n",
" if hasattr(func, \"python_function\"):\n",
" func = func.python_function\n",
" code = tf.autograph.to_code(func)\n",
- " display(Markdown('```python\\n{}\\n```'.format(code)))"
+ " display(Markdown(f'```python\\n{code}\\n```'))"
]
},
{
"cell_type": "code",
- "execution_count": 247,
+ "execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
@@ -3311,7 +3211,7 @@
},
{
"cell_type": "code",
- "execution_count": 248,
+ "execution_count": 224,
"metadata": {},
"outputs": [],
"source": [
@@ -3323,7 +3223,7 @@
},
{
"cell_type": "code",
- "execution_count": 249,
+ "execution_count": 225,
"metadata": {},
"outputs": [],
"source": [
@@ -3335,7 +3235,7 @@
},
{
"cell_type": "code",
- "execution_count": 250,
+ "execution_count": 226,
"metadata": {},
"outputs": [],
"source": [
@@ -3364,18 +3264,16 @@
},
{
"cell_type": "code",
- "execution_count": 251,
+ "execution_count": 227,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 252,
+ "execution_count": 228,
"metadata": {},
"outputs": [],
"source": [
@@ -3400,7 +3298,7 @@
},
{
"cell_type": "code",
- "execution_count": 253,
+ "execution_count": 229,
"metadata": {},
"outputs": [],
"source": [
@@ -3409,7 +3307,7 @@
},
{
"cell_type": "code",
- "execution_count": 254,
+ "execution_count": 230,
"metadata": {},
"outputs": [],
"source": [
@@ -3427,18 +3325,16 @@
},
{
"cell_type": "code",
- "execution_count": 255,
+ "execution_count": 231,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 256,
+ "execution_count": 232,
"metadata": {},
"outputs": [],
"source": [
@@ -3447,7 +3343,7 @@
},
{
"cell_type": "code",
- "execution_count": 257,
+ "execution_count": 233,
"metadata": {},
"outputs": [],
"source": [
@@ -3458,12 +3354,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Not the custom code will be called at each iteration. Let's fit, validate and evaluate with tiny datasets to avoid getting too much output:"
+ "Now the custom code will be called at each iteration. Let's fit, validate and evaluate with tiny datasets to avoid getting too much output:"
]
},
{
"cell_type": "code",
- "execution_count": 258,
+ "execution_count": 234,
"metadata": {},
"outputs": [],
"source": [
@@ -3481,18 +3377,16 @@
},
{
"cell_type": "code",
- "execution_count": 259,
+ "execution_count": 235,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 260,
+ "execution_count": 236,
"metadata": {},
"outputs": [],
"source": [
@@ -3501,7 +3395,7 @@
},
{
"cell_type": "code",
- "execution_count": 261,
+ "execution_count": 237,
"metadata": {},
"outputs": [],
"source": [
@@ -3510,7 +3404,7 @@
},
{
"cell_type": "code",
- "execution_count": 262,
+ "execution_count": 238,
"metadata": {},
"outputs": [],
"source": [
@@ -3523,7 +3417,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Custom Optimizers"
+ "## Extra Material – Custom Optimizers"
]
},
{
@@ -3535,7 +3429,7 @@
},
{
"cell_type": "code",
- "execution_count": 263,
+ "execution_count": 239,
"metadata": {},
"outputs": [],
"source": [
@@ -3581,18 +3475,16 @@
},
{
"cell_type": "code",
- "execution_count": 264,
+ "execution_count": 240,
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
"tf.random.set_seed(42)"
]
},
{
"cell_type": "code",
- "execution_count": 265,
+ "execution_count": 241,
"metadata": {},
"outputs": [],
"source": [
@@ -3612,8 +3504,24 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 1. to 11.\n",
- "See Appendix A."
+ "## 1. to 11."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "1. TensorFlow is an open-source library for numerical computation, particularly well suited and fine-tuned for large-scale Machine Learning. Its core is similar to NumPy, but it also features GPU support, support for distributed computing, computation graph analysis and optimization capabilities (with a portable graph format that allows you to train a TensorFlow model in one environment and run it in another), an optimization API based on reverse-mode autodiff, and several powerful APIs such as tf.keras, tf.data, tf.image, tf.signal, and more. Other popular Deep Learning libraries include PyTorch, MXNet, Microsoft Cognitive Toolkit, Theano, Caffe2, and Chainer.\n",
+ "2. Although TensorFlow offers most of the functionalities provided by NumPy, it is not a drop-in replacement, for a few reasons. First, the names of the functions are not always the same (for example, `tf.reduce_sum()` versus `np.sum()`). Second, some functions do not behave in exactly the same way (for example, `tf.transpose()` creates a transposed copy of a tensor, while NumPy's `T` attribute creates a transposed view, without actually copying any data). Lastly, NumPy arrays are mutable, while TensorFlow tensors are not (but you can use a `tf.Variable` if you need a mutable object).\n",
+ "3. Both `tf.range(10)` and `tf.constant(np.arange(10))` return a one-dimensional tensor containing the integers 0 to 9. However, the former uses 32-bit integers while the latter uses 64-bit integers. Indeed, TensorFlow defaults to 32 bits, while NumPy defaults to 64 bits.\n",
+ "4. Beyond regular tensors, TensorFlow offers several other data structures, including sparse tensors, tensor arrays, ragged tensors, queues, string tensors, and sets. The last two are actually represented as regular tensors, but TensorFlow provides special functions to manipulate them (in `tf.strings` and `tf.sets`).\n",
+ "5. When you want to define a custom loss function, in general you can just implement it as a regular Python function. However, if your custom loss function must support some hyperparameters (or any other state), then you should subclass the `keras.losses.Loss` class and implement the `__init__()` and `call()` methods. If you want the loss function's hyperparameters to be saved along with the model, then you must also implement the `get_config()` method.\n",
+ "6. Much like custom loss functions, most metrics can be defined as regular Python functions. But if you want your custom metric to support some hyperparameters (or any other state), then you should subclass the `keras.metrics.Metric` class. Moreover, if computing the metric over a whole epoch is not equivalent to computing the mean metric over all batches in that epoch (e.g., as for the precision and recall metrics), then you should subclass the `keras.metrics.Metric` class and implement the `__init__()`, `update_state()`, and `result()` methods to keep track of a running metric during each epoch. You should also implement the `reset_states()` method unless all it needs to do is reset all variables to 0.0. If you want the state to be saved along with the model, then you should implement the `get_config()` method as well.\n",
+ "7. You should distinguish the internal components of your model (i.e., layers or reusable blocks of layers) from the model itself (i.e., the object you will train). The former should subclass the `keras.layers.Layer` class, while the latter should subclass the `keras.models.Model` class.\n",
+ "8. Writing your own custom training loop is fairly advanced, so you should only do it if you really need to. Keras provides several tools to customize training without having to write a custom training loop: callbacks, custom regularizers, custom constraints, custom losses, and so on. You should use these instead of writing a custom training loop whenever possible: writing a custom training loop is more error-prone, and it will be harder to reuse the custom code you write. However, in some cases writing a custom training loop is necessary—for example, if you want to use different optimizers for different parts of your neural network, like in the [Wide & Deep paper](https://homl.info/widedeep). A custom training loop can also be useful when debugging, or when trying to understand exactly how training works.\n",
+ "9. Custom Keras components should be convertible to TF Functions, which means they should stick to TF operations as much as possible and respect all the rules listed in Chapter 12 (in the _TF Function Rules_ section). If you absolutely need to include arbitrary Python code in a custom component, you can either wrap it in a `tf.py_function()` operation (but this will reduce performance and limit your model's portability) or set `dynamic=True` when creating the custom layer or model (or set `run_eagerly=True` when calling the model's `compile()` method).\n",
+ "10. Please refer to Chapter 12 for the list of rules to respect when creating a TF Function (in the _TF Function Rules_ section).\n",
+ "11. Creating a dynamic Keras model can be useful for debugging, as it will not compile any custom component to a TF Function, and you can use any Python debugger to debug your code. It can also be useful if you want to include arbitrary Python code in your model (or in your training code), including calls to external libraries. To make a model dynamic, you must set `dynamic=True` when creating it. Alternatively, you can set `run_eagerly=True` when calling the model's `compile()` method. Making a model dynamic prevents Keras from using any of TensorFlow's graph features, so it will slow down training and inference, and you will not have the possibility to export the computation graph, which will limit your model's portability."
]
},
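+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next two cells are not in the book: they are small, self-contained illustrations of answers 3 and 5 above. First, the default integer dtypes of TensorFlow and NumPy:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – illustrates answer 3: TensorFlow defaults to 32-bit integers,\n",
+ "# while NumPy defaults to 64-bit integers\n",
+ "tf.range(10).dtype, tf.constant(np.arange(10)).dtype"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And here is a minimal sketch of a custom loss class with a hyperparameter, along the lines of answer 5. The class name and the `threshold` hyperparameter are just illustrative, not from the book:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – minimal sketch for answer 5: a loss with a hyperparameter that\n",
+ "# survives saving/loading thanks to get_config() (illustrative, not from the book)\n",
+ "class MyThresholdedMSE(tf.keras.losses.Loss):\n",
+ "    def __init__(self, threshold=1.0, **kwargs):\n",
+ "        super().__init__(**kwargs)\n",
+ "        self.threshold = threshold\n",
+ "\n",
+ "    def call(self, y_true, y_pred):\n",
+ "        clipped_error = tf.clip_by_value(y_true - y_pred,\n",
+ "                                         -self.threshold, self.threshold)\n",
+ "        return tf.square(clipped_error)\n",
+ "\n",
+ "    def get_config(self):\n",
+ "        base_config = super().get_config()\n",
+ "        return {**base_config, \"threshold\": self.threshold}\n",
+ "\n",
+ "MyThresholdedMSE(threshold=2.0)(tf.constant([[1.]]), tf.constant([[4.]]))"
+ ]
+ },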
{
@@ -3621,7 +3529,7 @@
"metadata": {},
"source": [
"## 12. Implement a custom layer that performs _Layer Normalization_\n",
- "_We will use this type of layer in Chapter 14 when using Recurrent Neural Networks._"
+ "_We will use this type of layer in Chapter 15 when using Recurrent Neural Networks._"
]
},
{
@@ -3649,7 +3557,7 @@
},
{
"cell_type": "code",
- "execution_count": 266,
+ "execution_count": 242,
"metadata": {},
"outputs": [],
"source": [
@@ -3703,7 +3611,7 @@
},
{
"cell_type": "code",
- "execution_count": 267,
+ "execution_count": 243,
"metadata": {},
"outputs": [],
"source": [
@@ -3725,7 +3633,7 @@
},
{
"cell_type": "code",
- "execution_count": 268,
+ "execution_count": 244,
"metadata": {},
"outputs": [],
"source": [
@@ -3751,7 +3659,7 @@
"metadata": {},
"source": [
"## 13. Train a model using a custom training loop to tackle the Fashion MNIST dataset\n",
- "_The Fashion MNIST dataset was introduced in Chapter 9._"
+ "_The Fashion MNIST dataset was introduced in Chapter 10._"
]
},
{
@@ -3764,7 +3672,7 @@
},
{
"cell_type": "code",
- "execution_count": 269,
+ "execution_count": 245,
"metadata": {},
"outputs": [],
"source": [
@@ -3777,7 +3685,7 @@
},
{
"cell_type": "code",
- "execution_count": 270,
+ "execution_count": 246,
"metadata": {},
"outputs": [],
"source": [
@@ -3788,7 +3696,7 @@
},
{
"cell_type": "code",
- "execution_count": 271,
+ "execution_count": 247,
"metadata": {},
"outputs": [],
"source": [
@@ -3801,7 +3709,7 @@
},
{
"cell_type": "code",
- "execution_count": 272,
+ "execution_count": 248,
"metadata": {},
"outputs": [],
"source": [
@@ -3816,13 +3724,13 @@
},
{
"cell_type": "code",
- "execution_count": 273,
+ "execution_count": 249,
"metadata": {},
"outputs": [],
"source": [
"with trange(1, n_epochs + 1, desc=\"All epochs\") as epochs:\n",
" for epoch in epochs:\n",
- " with trange(1, n_steps + 1, desc=\"Epoch {}/{}\".format(epoch, n_epochs)) as steps:\n",
+ " with trange(1, n_steps + 1, desc=f\"Epoch {epoch}/{n_epochs}\") as steps:\n",
" for step in steps:\n",
" X_batch, y_batch = random_batch(X_train, y_train)\n",
" with tf.GradientTape() as tape:\n",
@@ -3860,7 +3768,7 @@
},
{
"cell_type": "code",
- "execution_count": 274,
+ "execution_count": 250,
"metadata": {},
"outputs": [],
"source": [
@@ -3871,7 +3779,7 @@
},
{
"cell_type": "code",
- "execution_count": 275,
+ "execution_count": 251,
"metadata": {},
"outputs": [],
"source": [
@@ -3889,7 +3797,7 @@
},
{
"cell_type": "code",
- "execution_count": 276,
+ "execution_count": 252,
"metadata": {},
"outputs": [],
"source": [
@@ -3899,7 +3807,7 @@
},
{
"cell_type": "code",
- "execution_count": 277,
+ "execution_count": 253,
"metadata": {},
"outputs": [],
"source": [
@@ -3913,13 +3821,13 @@
},
{
"cell_type": "code",
- "execution_count": 278,
+ "execution_count": 254,
"metadata": {},
"outputs": [],
"source": [
"with trange(1, n_epochs + 1, desc=\"All epochs\") as epochs:\n",
" for epoch in epochs:\n",
- " with trange(1, n_steps + 1, desc=\"Epoch {}/{}\".format(epoch, n_epochs)) as steps:\n",
+ " with trange(1, n_steps + 1, desc=f\"Epoch {epoch}/{n_epochs}\") as steps:\n",
" for step in steps:\n",
" X_batch, y_batch = random_batch(X_train, y_train)\n",
" with tf.GradientTape(persistent=True) as tape:\n",
@@ -3960,7 +3868,7 @@
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},
diff --git a/13_loading_and_preprocessing_data.ipynb b/13_loading_and_preprocessing_data.ipynb
index b14c34f..d5234b7 100644
--- a/13_loading_and_preprocessing_data.ipynb
+++ b/13_loading_and_preprocessing_data.ipynb
@@ -4,14 +4,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "**Chapter 12 – Loading and Preprocessing Data with TensorFlow**"
+ "**Chapter 13 – Loading and Preprocessing Data with TensorFlow**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "_This notebook contains all the sample code and solutions to the exercises in chapter 12._"
+ "_This notebook contains all the sample code and solutions to the exercises in chapter 13._"
]
},
{
@@ -20,17 +20,19 @@
"source": [
"\n",
" \n",
- " \n",
+ " \n",
" | \n",
" \n",
- " \n",
+ " \n",
" | \n",
"
"
]
},
{
"cell_type": "markdown",
- "metadata": {},
+ "metadata": {
+ "tags": []
+ },
"source": [
"# Setup"
]
@@ -39,7 +41,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
+ "This project requires Python 3.8 or above:"
]
},
{
@@ -48,61 +50,16 @@
"metadata": {},
"outputs": [],
"source": [
- "# Python ≥3.8 is required\n",
"import sys\n",
- "assert sys.version_info >= (3, 8)\n",
"\n",
- "# Is this notebook running on Colab or Kaggle?\n",
- "IS_COLAB = \"google.colab\" in sys.modules\n",
- "IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n",
- "\n",
- "if IS_COLAB or IS_KAGGLE:\n",
- " %pip install -q -U tfx\n",
- " print(\"You can safely ignore the package incompatibility errors.\")\n",
- "\n",
- "# Common imports\n",
- "import numpy as np\n",
- "from pathlib import Path\n",
- "\n",
- "# Scikit-Learn ≥1.0 is required\n",
- "import sklearn\n",
- "assert sklearn.__version__ >= \"1.0\"\n",
- "\n",
- "# TensorFlow ≥2.6 is required\n",
- "import tensorflow as tf\n",
- "assert tf.__version__ >= \"2.6\"\n",
- "\n",
- "# Load the Jupyter extension for TensorBoard\n",
- "%load_ext tensorboard\n",
- "\n",
- "# to make this notebook's output stable across runs\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
- "\n",
- "# To plot pretty figures\n",
- "%matplotlib inline\n",
- "import matplotlib as mpl\n",
- "import matplotlib.pyplot as plt\n",
- "mpl.rc('axes', labelsize=14)\n",
- "mpl.rc('xtick', labelsize=12)\n",
- "mpl.rc('ytick', labelsize=12)\n",
- "\n",
- "# Where to save the figures\n",
- "IMAGES_PATH = Path() / \"images\" / \"data\"\n",
- "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
- "\n",
- "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
- " path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
- " if tight_layout:\n",
- " plt.tight_layout()\n",
- " plt.savefig(path, format=fig_extension, dpi=resolution)"
+ "assert sys.version_info >= (3, 8)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Datasets"
+ "It also requires Scikit-Learn ≥ 1.0.1:"
]
},
{
@@ -111,16 +68,16 @@
"metadata": {},
"outputs": [],
"source": [
- "X = tf.range(10)\n",
- "dataset = tf.data.Dataset.from_tensor_slices(X)\n",
- "dataset"
+ "import sklearn\n",
+ "\n",
+ "assert sklearn.__version__ >= \"1.0.1\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Equivalently:"
+ "And TensorFlow ≥ 2.6:"
]
},
{
@@ -129,7 +86,16 @@
"metadata": {},
"outputs": [],
"source": [
- "dataset = tf.data.Dataset.range(10)"
+ "import tensorflow as tf\n",
+ "\n",
+ "assert tf.__version__ >= \"2.7.0\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# The tf.data API"
]
},
{
@@ -138,21 +104,19 @@
"metadata": {},
"outputs": [],
"source": [
- "for item in dataset:\n",
- " print(item)"
+ "import tensorflow as tf\n",
+ "\n",
+ "X = tf.range(10) # any data tensor\n",
+ "dataset = tf.data.Dataset.from_tensor_slices(X)\n",
+ "dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
- "metadata": {
- "tags": [
- "raises-exception"
- ]
- },
+ "metadata": {},
"outputs": [],
"source": [
- "dataset = dataset.repeat(3).batch(7)\n",
"for item in dataset:\n",
" print(item)"
]
@@ -163,15 +127,31 @@
"metadata": {},
"outputs": [],
"source": [
- "dataset = dataset.map(lambda x: x * 2)"
+ "X_nested = {\"a\": ([1, 2, 3], [4, 5, 6]), \"b\": [7, 8, 9]}\n",
+ "dataset = tf.data.Dataset.from_tensor_slices(X_nested)\n",
+ "for item in dataset:\n",
+ " print(item)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Chaining Transformations"
]
},
{
"cell_type": "code",
"execution_count": 7,
- "metadata": {},
+ "metadata": {
+ "tags": [
+ "raises-exception"
+ ]
+ },
"outputs": [],
"source": [
+ "dataset = tf.data.Dataset.from_tensor_slices(tf.range(10))\n",
+ "dataset = dataset.repeat(3).batch(7)\n",
"for item in dataset:\n",
" print(item)"
]
@@ -182,8 +162,9 @@
"metadata": {},
"outputs": [],
"source": [
- "#dataset = dataset.apply(tf.data.experimental.unbatch()) # Now deprecated\n",
- "dataset = dataset.unbatch()"
+ "dataset = dataset.map(lambda x: x * 2) # x is a batch\n",
+ "for item in dataset:\n",
+ " print(item)"
]
},
{
@@ -192,7 +173,9 @@
"metadata": {},
"outputs": [],
"source": [
- "dataset = dataset.filter(lambda x: x < 10) # keep only items < 10"
+ "dataset = dataset.filter(lambda x: tf.reduce_sum(x) > 50)\n",
+ "for item in dataset:\n",
+ " print(item)"
]
},
{
@@ -201,20 +184,25 @@
"metadata": {},
"outputs": [],
"source": [
- "for item in dataset.take(3):\n",
+ "for item in dataset.take(2):\n",
" print(item)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Shuffling the Data"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
- "tf.random.set_seed(42)\n",
- "\n",
- "dataset = tf.data.Dataset.range(10).repeat(3)\n",
- "dataset = dataset.shuffle(buffer_size=3, seed=42).batch(7)\n",
+ "dataset = tf.data.Dataset.range(10).repeat(2)\n",
+ "dataset = dataset.shuffle(buffer_size=4, seed=42).batch(7)\n",
"for item in dataset:\n",
" print(item)"
]
@@ -223,14 +211,14 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Split the California dataset to multiple CSV files"
+ "### Interleaving lines from multiple files"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's start by loading and preparing the California housing dataset. We first load it, then split it into a training set, a validation set and a test set, and finally we scale it:"
+ "Let's start by loading and preparing the California housing dataset. We first load it, then split it into a training set, a validation set and a test set:"
]
},
{
@@ -239,27 +227,23 @@
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – fetches, splits and normalizes the California housing dataset\n",
+ "\n",
"from sklearn.datasets import fetch_california_housing\n",
"from sklearn.model_selection import train_test_split\n",
- "from sklearn.preprocessing import StandardScaler\n",
"\n",
"housing = fetch_california_housing()\n",
"X_train_full, X_test, y_train_full, y_test = train_test_split(\n",
" housing.data, housing.target.reshape(-1, 1), random_state=42)\n",
"X_train, X_valid, y_train, y_valid = train_test_split(\n",
- " X_train_full, y_train_full, random_state=42)\n",
- "\n",
- "scaler = StandardScaler()\n",
- "scaler.fit(X_train)\n",
- "X_mean = scaler.mean_\n",
- "X_std = scaler.scale_"
+ " X_train_full, y_train_full, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "For a very large dataset that does not fit in memory, you will typically want to split it into many files first, then have TensorFlow read these files in parallel. To demonstrate this, let's start by splitting the housing dataset and save it to 20 CSV files:"
+ "For a very large dataset that does not fit in memory, you will typically want to split it into many files first, then have TensorFlow read these files in parallel. To demonstrate this, let's start by splitting the housing dataset and saving it to 20 CSV files:"
]
},
{
@@ -268,41 +252,40 @@
"metadata": {},
"outputs": [],
"source": [
- "def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
+ "# extra code – split the dataset into 20 parts and save it to CSV files\n",
+ "\n",
+ "import numpy as np\n",
+ "from pathlib import Path\n",
+ "\n",
+ "def save_to_csv_files(data, name_prefix, header=None, n_parts=10):\n",
" housing_dir = Path() / \"datasets\" / \"housing\"\n",
" housing_dir.mkdir(parents=True, exist_ok=True)\n",
- " path_format = housing_dir / \"my_{}_{:02d}.csv\"\n",
+ " filename_format = \"my_{}_{:02d}.csv\"\n",
"\n",
" filepaths = []\n",
" m = len(data)\n",
- " for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):\n",
- " part_csv = path_format.format(name_prefix, file_idx)\n",
- " filepaths.append(part_csv)\n",
- " with open(part_csv, \"wt\", encoding=\"utf-8\") as f:\n",
+ " chunks = np.array_split(np.arange(m), n_parts)\n",
+ " for file_idx, row_indices in enumerate(chunks):\n",
+ " part_csv = housing_dir / filename_format.format(name_prefix, file_idx)\n",
+ " filepaths.append(str(part_csv))\n",
+ " with open(part_csv, \"w\") as f:\n",
" if header is not None:\n",
" f.write(header)\n",
" f.write(\"\\n\")\n",
" for row_idx in row_indices:\n",
" f.write(\",\".join([repr(col) for col in data[row_idx]]))\n",
" f.write(\"\\n\")\n",
- " return filepaths"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
+ " return filepaths\n",
+ "\n",
"train_data = np.c_[X_train, y_train]\n",
"valid_data = np.c_[X_valid, y_valid]\n",
"test_data = np.c_[X_test, y_test]\n",
"header_cols = housing.feature_names + [\"MedianHouseValue\"]\n",
"header = \",\".join(header_cols)\n",
"\n",
- "train_filepaths = save_to_multiple_csv_files(train_data, \"train\", header, n_parts=20)\n",
- "valid_filepaths = save_to_multiple_csv_files(valid_data, \"valid\", header, n_parts=10)\n",
- "test_filepaths = save_to_multiple_csv_files(test_data, \"test\", header, n_parts=10)"
+ "train_filepaths = save_to_csv_files(train_data, \"train\", header, n_parts=20)\n",
+ "valid_filepaths = save_to_csv_files(valid_data, \"valid\", header, n_parts=10)\n",
+ "test_filepaths = save_to_csv_files(test_data, \"test\", header, n_parts=10)"
]
},
{
@@ -312,40 +295,20 @@
"Okay, now let's take a peek at the first few lines of one of these CSV files:"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"\".join(open(train_filepaths[0]).readlines()[:4]))"
+ ]
+ },
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
- "source": [
- "import pandas as pd\n",
- "\n",
- "pd.read_csv(train_filepaths[0]).head()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Or in text mode:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open(train_filepaths[0]) as f:\n",
- " for i in range(5):\n",
- " print(f.readline(), end=\"\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [],
"source": [
"train_filepaths"
]
@@ -354,12 +317,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Building an Input Pipeline"
+ "**Building an Input Pipeline**"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
@@ -368,17 +331,18 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows that the file paths are shuffled\n",
"for filepath in filepath_dataset:\n",
" print(filepath)"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -390,19 +354,52 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"for line in dataset.take(5):\n",
- " print(line.numpy())"
+ " print(line)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Notice that field 4 is interpreted as a string."
+ "## Preprocessing the Data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – compute the mean and standard deviation of each feature\n",
+ "\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "scaler = StandardScaler()\n",
+ "scaler.fit(X_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_mean, X_std = scaler.mean_, scaler.scale_ # extra code\n",
+ "n_inputs = 8\n",
+ "\n",
+ "def parse_csv_line(line):\n",
+ " defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]\n",
+ " fields = tf.io.decode_csv(line, record_defaults=defs)\n",
+ " return tf.stack(fields[:-1]), tf.stack(fields[-1:])\n",
+ "\n",
+ "def preprocess(line):\n",
+ " x, y = parse_csv_line(line)\n",
+ " return (x - X_mean) / X_std, y"
]
},
{
@@ -411,16 +408,14 @@
"metadata": {},
"outputs": [],
"source": [
- "record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), \"Hello\", tf.constant([])]\n",
- "parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)\n",
- "parsed_fields"
+ "preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Notice that all missing fields are replaced with their default value, when provided:"
+ "## Putting Everything Together + Prefetching"
]
},
{
@@ -429,15 +424,16 @@
"metadata": {},
"outputs": [],
"source": [
- "parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)\n",
- "parsed_fields"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The 5th field is compulsory (since we provided `tf.constant([])` as the \"default value\"), so we get an exception if we do not provide it:"
+ "def csv_reader_dataset(filepaths, n_readers=5, n_read_threads=None,\n",
+ " n_parse_threads=5, shuffle_buffer_size=10_000, seed=42,\n",
+ " batch_size=32):\n",
+ " dataset = tf.data.Dataset.list_files(filepaths, seed=seed)\n",
+ " dataset = dataset.interleave(\n",
+ " lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n",
+ " cycle_length=n_readers, num_parallel_calls=n_read_threads)\n",
+ " dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)\n",
+ " dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)\n",
+ " return dataset.batch(batch_size).prefetch(1)"
]
},
{
@@ -446,265 +442,15 @@
"metadata": {},
"outputs": [],
"source": [
- "try:\n",
- " parsed_fields = tf.io.decode_csv(',,,,', record_defaults)\n",
- "except tf.errors.InvalidArgumentError as ex:\n",
- " print(ex)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The number of fields should match exactly the number of fields in the `record_defaults`:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "try:\n",
- " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)\n",
- "except tf.errors.InvalidArgumentError as ex:\n",
- " print(ex)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "n_inputs = 8 # X_train.shape[-1]\n",
+ "# extra code – show the first couple of batches produced by the dataset\n",
"\n",
- "@tf.function\n",
- "def preprocess(line):\n",
- " defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]\n",
- " fields = tf.io.decode_csv(line, record_defaults=defs)\n",
- " x = tf.stack(fields[:-1])\n",
- " y = tf.stack(fields[-1:])\n",
- " return (x - X_mean) / X_std, y"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "def csv_reader_dataset(filepaths, repeat=1, n_readers=5,\n",
- " n_read_threads=None, shuffle_buffer_size=10000,\n",
- " n_parse_threads=5, batch_size=32):\n",
- " dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)\n",
- " dataset = dataset.interleave(\n",
- " lambda filepath: tf.data.TextLineDataset(filepath).skip(1),\n",
- " cycle_length=n_readers, num_parallel_calls=n_read_threads)\n",
- " dataset = dataset.shuffle(shuffle_buffer_size)\n",
- " dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)\n",
- " dataset = dataset.batch(batch_size)\n",
- " return dataset.prefetch(1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.random.set_seed(42)\n",
- "\n",
- "train_set = csv_reader_dataset(train_filepaths, batch_size=3)\n",
- "for X_batch, y_batch in train_set.take(2):\n",
+ "example_set = csv_reader_dataset(train_filepaths, batch_size=3)\n",
+ "for X_batch, y_batch in example_set.take(2):\n",
" print(\"X =\", X_batch)\n",
" print(\"y =\", y_batch)\n",
" print()"
]
},
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "train_set = csv_reader_dataset(train_filepaths, repeat=None)\n",
- "valid_set = csv_reader_dataset(valid_filepaths)\n",
- "test_set = csv_reader_dataset(test_filepaths)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)\n",
- "\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(30, activation=\"relu\", input_shape=X_train.shape[1:]),\n",
- " tf.keras.layers.Dense(1),\n",
- "])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.compile(loss=\"mse\", optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "metadata": {},
- "outputs": [],
- "source": [
- "batch_size = 32\n",
- "model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,\n",
- " validation_data=valid_set)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 34,
- "metadata": {},
- "outputs": [],
- "source": [
- "model.evaluate(test_set, steps=len(X_test) // batch_size)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "new_set = test_set.map(lambda X, y: X) # we could instead just pass test_set, Keras would ignore the labels\n",
- "X_new = X_test\n",
- "model.predict(new_set, steps=len(X_new) // batch_size)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)\n",
- "loss_fn = tf.keras.losses.mean_squared_error\n",
- "\n",
- "n_epochs = 5\n",
- "batch_size = 32\n",
- "n_steps_per_epoch = len(X_train) // batch_size\n",
- "total_steps = n_epochs * n_steps_per_epoch\n",
- "global_step = 0\n",
- "for X_batch, y_batch in train_set.take(total_steps):\n",
- " global_step += 1\n",
- " print(\"\\rGlobal step {}/{}\".format(global_step, total_steps), end=\"\")\n",
- " with tf.GradientTape() as tape:\n",
- " y_pred = model(X_batch)\n",
- " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
- " loss = tf.add_n([main_loss] + model.losses)\n",
- " gradients = tape.gradient(loss, model.trainable_variables)\n",
- " optimizer.apply_gradients(zip(gradients, model.trainable_variables))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [],
- "source": [
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)\n",
- "loss_fn = tf.keras.losses.mean_squared_error\n",
- "\n",
- "@tf.function\n",
- "def train(model, n_epochs, batch_size=32,\n",
- " n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n",
- " train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n",
- " n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n",
- " n_parse_threads=n_parse_threads, batch_size=batch_size)\n",
- " for X_batch, y_batch in train_set:\n",
- " with tf.GradientTape() as tape:\n",
- " y_pred = model(X_batch)\n",
- " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
- " loss = tf.add_n([main_loss] + model.losses)\n",
- " gradients = tape.gradient(loss, model.trainable_variables)\n",
- " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
- "\n",
- "train(model, 5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [],
- "source": [
- "optimizer = tf.keras.optimizers.Nadam(learning_rate=0.01)\n",
- "loss_fn = tf.keras.losses.mean_squared_error\n",
- "\n",
- "@tf.function\n",
- "def train(model, n_epochs, batch_size=32,\n",
- " n_readers=5, n_read_threads=5, shuffle_buffer_size=10000, n_parse_threads=5):\n",
- " train_set = csv_reader_dataset(train_filepaths, repeat=n_epochs, n_readers=n_readers,\n",
- " n_read_threads=n_read_threads, shuffle_buffer_size=shuffle_buffer_size,\n",
- " n_parse_threads=n_parse_threads, batch_size=batch_size)\n",
- " n_steps_per_epoch = len(X_train) // batch_size\n",
- " total_steps = n_epochs * n_steps_per_epoch\n",
- " global_step = 0\n",
- " for X_batch, y_batch in train_set.take(total_steps):\n",
- " global_step += 1\n",
- " if tf.equal(global_step % 100, 0):\n",
- " tf.print(\"\\rGlobal step\", global_step, \"/\", total_steps)\n",
- " with tf.GradientTape() as tape:\n",
- " y_pred = model(X_batch)\n",
- " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
- " loss = tf.add_n([main_loss] + model.losses)\n",
- " gradients = tape.gradient(loss, model.trainable_variables)\n",
- " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
- "\n",
- "train(model, 5)"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -714,12 +460,13 @@
},
{
"cell_type": "code",
- "execution_count": 41,
+ "execution_count": 25,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
+ "# extra code – list all methods of the tf.data.Dataset class\n",
"for m in dir(tf.data.Dataset):\n",
" if not (m.startswith(\"_\") or m.endswith(\"_\")):\n",
" func = getattr(tf.data.Dataset, m)\n",
@@ -731,7 +478,109 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## The `TFRecord` binary format"
+ "## Using the Dataset with Keras"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train_set = csv_reader_dataset(train_filepaths)\n",
+ "valid_set = csv_reader_dataset(valid_filepaths)\n",
+ "test_set = csv_reader_dataset(test_filepaths)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – for reproducibility\n",
+ "tf.keras.backend.clear_session()\n",
+ "tf.random.set_seed(42)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(30, activation=\"relu\", kernel_initializer=\"he_normal\",\n",
+ " input_shape=X_train.shape[1:]),\n",
+ " tf.keras.layers.Dense(1),\n",
+ "])\n",
+ "model.compile(loss=\"mse\", optimizer=\"sgd\")\n",
+ "model.fit(train_set, validation_data=valid_set, epochs=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "test_mse = model.evaluate(test_set)\n",
+ "new_set = test_set.take(3) # pretend we have 3 new samples\n",
+ "y_pred = model.predict(new_set) # or you could just pass a NumPy array"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – defines the optimizer and loss function for training\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)\n",
+ "loss_fn = tf.keras.losses.mean_squared_error\n",
+ "\n",
+ "n_epochs = 5\n",
+ "for epoch in range(n_epochs):\n",
+ " for X_batch, y_batch in train_set:\n",
+ " # extra code – perform one Gradient Descent step\n",
+ " # as explained in Chapter 12\n",
+ " print(\"\\rEpoch {}/{}\".format(epoch + 1, n_epochs), end=\"\")\n",
+ " with tf.GradientTape() as tape:\n",
+ " y_pred = model(X_batch)\n",
+ " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
+ " loss = tf.add_n([main_loss] + model.losses)\n",
+ " gradients = tape.gradient(loss, model.trainable_variables)\n",
+ " optimizer.apply_gradients(zip(gradients, model.trainable_variables))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "@tf.function\n",
+ "def train_one_epoch(model, optimizer, loss_fn, train_set):\n",
+ " for X_batch, y_batch in train_set:\n",
+ " with tf.GradientTape() as tape:\n",
+ " y_pred = model(X_batch)\n",
+ " main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))\n",
+ " loss = tf.add_n([main_loss] + model.losses)\n",
+ " gradients = tape.gradient(loss, model.trainable_variables)\n",
+ " optimizer.apply_gradients(zip(gradients, model.trainable_variables))\n",
+ "\n",
+ "optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)\n",
+ "loss_fn = tf.keras.losses.mean_squared_error\n",
+ "for epoch in range(n_epochs):\n",
+ " print(\"\\rEpoch {}/{}\".format(epoch + 1, n_epochs), end=\"\")\n",
+ " train_one_epoch(model, optimizer, loss_fn, train_set)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# The TFRecord Format"
]
},
{
@@ -743,7 +592,7 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -761,7 +610,7 @@
},
{
"cell_type": "code",
- "execution_count": 43,
+ "execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -780,10 +629,12 @@
},
{
"cell_type": "code",
- "execution_count": 44,
+ "execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to read multiple files in parallel and interleave them\n",
+ "\n",
"filepaths = [\"my_test_{}.tfrecord\".format(i) for i in range(5)]\n",
"for i, filepath in enumerate(filepaths):\n",
" with tf.io.TFRecordWriter(filepath) as f:\n",
@@ -795,26 +646,41 @@
" print(item)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Compressed TFRecord Files"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 45,
+ "execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"options = tf.io.TFRecordOptions(compression_type=\"GZIP\")\n",
"with tf.io.TFRecordWriter(\"my_compressed.tfrecord\", options) as f:\n",
- " f.write(b\"This is the first record\")\n",
- " f.write(b\"And this is the second record\")"
+ " f.write(b\"Compress, compress, compress!\")"
]
},
{
"cell_type": "code",
- "execution_count": 46,
+ "execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"dataset = tf.data.TFRecordDataset([\"my_compressed.tfrecord\"],\n",
- " compression_type=\"GZIP\")\n",
+ " compression_type=\"GZIP\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 37,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – shows that the data is decompressed correctly\n",
"for item in dataset:\n",
" print(item)"
]
@@ -823,7 +689,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### A Brief Intro to Protocol Buffers"
+ "## A Brief Introduction to Protocol Buffers"
]
},
{
@@ -842,16 +708,16 @@
},
{
"cell_type": "code",
- "execution_count": 47,
+ "execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"%%writefile person.proto\n",
"syntax = \"proto3\";\n",
"message Person {\n",
- " string name = 1;\n",
- " int32 id = 2;\n",
- " repeated string email = 3;\n",
+ " string name = 1;\n",
+ " int32 id = 2;\n",
+ " repeated string email = 3;\n",
"}"
]
},
@@ -864,7 +730,7 @@
},
{
"cell_type": "code",
- "execution_count": 48,
+ "execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@@ -873,20 +739,20 @@
},
{
"cell_type": "code",
- "execution_count": 49,
+ "execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
- "!ls person*"
+ "%ls person*"
]
},
{
"cell_type": "code",
- "execution_count": 50,
+ "execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
- "from person_pb2 import Person\n",
+ "from person_pb2 import Person # import the generated access class\n",
"\n",
"person = Person(name=\"Al\", id=123, email=[\"a@b.com\"]) # create a Person\n",
"print(person) # display the Person"
@@ -894,7 +760,7 @@
},
{
"cell_type": "code",
- "execution_count": 51,
+ "execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@@ -903,7 +769,7 @@
},
{
"cell_type": "code",
- "execution_count": 52,
+ "execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@@ -912,7 +778,7 @@
},
{
"cell_type": "code",
- "execution_count": 53,
+ "execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@@ -921,7 +787,7 @@
},
{
"cell_type": "code",
- "execution_count": 54,
+ "execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@@ -930,27 +796,27 @@
},
{
"cell_type": "code",
- "execution_count": 55,
+ "execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
- "s = person.SerializeToString() # serialize to a byte string\n",
- "s"
+ "serialized = person.SerializeToString() # serialize person to a byte string\n",
+ "serialized"
]
},
{
"cell_type": "code",
- "execution_count": 56,
+ "execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"person2 = Person() # create a new Person\n",
- "person2.ParseFromString(s) # parse the byte string (27 bytes)"
+ "person2.ParseFromString(serialized) # parse the byte string (27 bytes long)"
]
},
{
"cell_type": "code",
- "execution_count": 57,
+ "execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
@@ -961,7 +827,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "#### Custom protobuf"
+ "### Custom protobuf"
]
},
{
@@ -973,12 +839,14 @@
},
{
"cell_type": "code",
- "execution_count": 58,
+ "execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
+ "# extra code – shows how to use the tf.io.decode_proto() function\n",
+ "\n",
"person_tf = tf.io.decode_proto(\n",
- " bytes=s,\n",
+ " bytes=serialized,\n",
" message_type=\"Person\",\n",
" field_names=[\"name\", \"id\", \"email\"],\n",
" output_types=[tf.string, tf.int32, tf.string],\n",
@@ -998,7 +866,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "### TensorFlow Protobufs"
+ "## TensorFlow Protobufs"
]
},
{
@@ -1030,43 +898,46 @@
"```"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning**: in TensorFlow 2.0 and 2.1, there was a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details."
- ]
- },
{
"cell_type": "code",
- "execution_count": 59,
+ "execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
- "#from tensorflow.train import BytesList, FloatList, Int64List\n",
- "#from tensorflow.train import Feature, Features, Example\n",
- "BytesList = tf.train.BytesList\n",
- "FloatList = tf.train.FloatList\n",
- "Int64List = tf.train.Int64List\n",
- "Feature = tf.train.Feature\n",
- "Features = tf.train.Features\n",
- "Example = tf.train.Example\n",
+ "from tensorflow.train import BytesList, FloatList, Int64List\n",
+ "from tensorflow.train import Feature, Features, Example\n",
"\n",
"person_example = Example(\n",
" features=Features(\n",
" feature={\n",
" \"name\": Feature(bytes_list=BytesList(value=[b\"Alice\"])),\n",
" \"id\": Feature(int64_list=Int64List(value=[123])),\n",
- " \"emails\": Feature(bytes_list=BytesList(value=[b\"a@b.com\", b\"c@d.com\"]))\n",
- " }))\n",
- "\n",
- "with tf.io.TFRecordWriter(\"my_contacts.tfrecord\") as f:\n",
- " f.write(person_example.SerializeToString())"
+ " \"emails\": Feature(bytes_list=BytesList(value=[b\"a@b.com\",\n",
+ " b\"c@d.com\"]))\n",
+ " }))"
]
},
{
"cell_type": "code",
- "execution_count": 60,
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with tf.io.TFRecordWriter(\"my_contacts.tfrecord\") as f:\n",
+ " for _ in range(5):\n",
+ " f.write(person_example.SerializeToString())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading and Parsing Examples"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@@ -1075,43 +946,18 @@
" \"id\": tf.io.FixedLenFeature([], tf.int64, default_value=0),\n",
" \"emails\": tf.io.VarLenFeature(tf.string),\n",
"}\n",
- "for serialized_example in tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]):\n",
- " parsed_example = tf.io.parse_single_example(serialized_example,\n",
- " feature_description)"
+ "\n",
+ "def parse(serialized_example):\n",
+ " return tf.io.parse_single_example(serialized_example, feature_description)\n",
+ "\n",
+ "dataset = tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]).map(parse)\n",
+ "for parsed_example in dataset:\n",
+ " print(parsed_example)"
]
},
{
"cell_type": "code",
- "execution_count": 61,
- "metadata": {},
- "outputs": [],
- "source": [
- "parsed_example"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "metadata": {
- "scrolled": true
- },
- "outputs": [],
- "source": [
- "parsed_example"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "metadata": {},
- "outputs": [],
- "source": [
- "parsed_example[\"emails\"].values[0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
+ "execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
@@ -1120,7 +966,7 @@
},
{
"cell_type": "code",
- "execution_count": 65,
+ "execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
@@ -1128,18 +974,49 @@
]
},
{
- "cell_type": "markdown",
+ "cell_type": "code",
+ "execution_count": 55,
"metadata": {},
+ "outputs": [],
"source": [
- "### Putting Images in TFRecords"
+ "def parse(serialized_examples):\n",
+ " return tf.io.parse_example(serialized_examples, feature_description)\n",
+ "\n",
+ "dataset = tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]).batch(2).map(parse)\n",
+ "for parsed_examples in dataset:\n",
+ " print(parsed_examples) # two examples at a time"
]
},
{
"cell_type": "code",
- "execution_count": 66,
+ "execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
+ "parsed_examples"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Extra Material – Storing Images and Tensors in TFRecords"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's load and display an example image:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
"from sklearn.datasets import load_sample_images\n",
"\n",
"img = load_sample_images()[\"images\"][0]\n",
@@ -1149,9 +1026,16 @@
"plt.show()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's create an `Example` protobuf containing the image encoded as JPEG:"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 67,
+ "execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -1159,18 +1043,36 @@
"example_with_image = Example(features=Features(feature={\n",
" \"image\": Feature(bytes_list=BytesList(value=[data.numpy()]))}))\n",
"serialized_example = example_with_image.SerializeToString()\n",
- "# then save to TFRecord"
+ "with tf.io.TFRecordWriter(\"my_image.tfrecord\") as f:\n",
+ " f.write(serialized_example)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Finally, let's create a tf.data pipeline that will read this TFRecord file, parse each `Example` protobuf (in this case just one), and parse and display the image that the example contains:"
]
},
{
"cell_type": "code",
- "execution_count": 68,
+ "execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"feature_description = { \"image\": tf.io.VarLenFeature(tf.string) }\n",
- "example_with_image = tf.io.parse_single_example(serialized_example, feature_description)\n",
- "decoded_img = tf.io.decode_jpeg(example_with_image[\"image\"].values[0])"
+ "\n",
+ "def parse(serialized_example):\n",
+ " example_with_image = tf.io.parse_single_example(serialized_example,\n",
+ " feature_description)\n",
+ " return tf.io.decode_jpeg(example_with_image[\"image\"].values[0])\n",
+ " # or you can use tf.io.decode_image() instead\n",
+ "\n",
+ "dataset = tf.data.TFRecordDataset(\"my_image.tfrecord\").map(parse)\n",
+ "for image in dataset:\n",
+ " plt.imshow(image)\n",
+ " plt.axis(\"off\")\n",
+ " plt.show()"
]
},
{
@@ -1180,34 +1082,6 @@
"Or use `decode_image()` which supports BMP, GIF, JPEG and PNG formats:"
]
},
- {
- "cell_type": "code",
- "execution_count": 69,
- "metadata": {},
- "outputs": [],
- "source": [
- "decoded_img = tf.io.decode_image(example_with_image[\"image\"].values[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.imshow(decoded_img)\n",
- "plt.title(\"Decoded Image\")\n",
- "plt.axis(\"off\")\n",
- "plt.show()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Putting Tensors and Sparse Tensors in TFRecords"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
@@ -1217,69 +1091,49 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
- "t = tf.constant([[0., 1.], [2., 3.], [4., 5.]])\n",
- "s = tf.io.serialize_tensor(t)\n",
- "s"
+ "tensor = tf.constant([[0., 1.], [2., 3.], [4., 5.]])\n",
+ "serialized = tf.io.serialize_tensor(tensor)\n",
+ "serialized"
]
},
{
"cell_type": "code",
- "execution_count": 72,
+ "execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
- "tf.io.parse_tensor(s, out_type=tf.float32)"
+ "tf.io.parse_tensor(serialized, out_type=tf.float32)"
]
},
{
"cell_type": "code",
- "execution_count": 73,
+ "execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
- "serialized_sparse = tf.io.serialize_sparse(parsed_example[\"emails\"])\n",
+ "sparse_tensor = parsed_example[\"emails\"]\n",
+ "serialized_sparse = tf.io.serialize_sparse(sparse_tensor)\n",
"serialized_sparse"
]
},
{
"cell_type": "code",
- "execution_count": 74,
+ "execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"BytesList(value=serialized_sparse.numpy())"
]
},
- {
- "cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [],
- "source": [
- "dataset = tf.data.TFRecordDataset([\"my_contacts.tfrecord\"]).batch(10)\n",
- "for serialized_examples in dataset:\n",
- " parsed_examples = tf.io.parse_example(serialized_examples,\n",
- " feature_description)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
- "metadata": {},
- "outputs": [],
- "source": [
- "parsed_examples"
- ]
- },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## Handling Sequential Data Using `SequenceExample`"
+ "## Handling Lists of Lists Using the `SequenceExample` Protobuf"
]
},
{
@@ -1292,29 +1146,19 @@
"message FeatureList { repeated Feature feature = 1; };\n",
"message FeatureLists { map feature_list = 1; };\n",
"message SequenceExample {\n",
- " Features context = 1;\n",
- " FeatureLists feature_lists = 2;\n",
+ " Features context = 1;\n",
+ " FeatureLists feature_lists = 2;\n",
"};\n",
"```"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning**: in TensorFlow 2.0 and 2.1, there was a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details."
- ]
- },
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
- "#from tensorflow.train import FeatureList, FeatureLists, SequenceExample\n",
- "FeatureList = tf.train.FeatureList\n",
- "FeatureLists = tf.train.FeatureLists\n",
- "SequenceExample = tf.train.SequenceExample\n",
+ "from tensorflow.train import FeatureList, FeatureLists, SequenceExample\n",
"\n",
"context = Features(feature={\n",
" \"author_id\": Feature(int64_list=Int64List(value=[123])),\n",
@@ -1344,7 +1188,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@@ -1353,7 +1197,7 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@@ -1362,7 +1206,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@@ -1374,15 +1218,24 @@
"sequence_feature_descriptions = {\n",
" \"content\": tf.io.VarLenFeature(tf.string),\n",
" \"comments\": tf.io.VarLenFeature(tf.string),\n",
- "}\n",
- "parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(\n",
- " serialized_sequence_example, context_feature_descriptions,\n",
- " sequence_feature_descriptions)"
+ "}"
]
},
{
"cell_type": "code",
- "execution_count": 81,
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parsed_context, parsed_feature_lists = tf.io.parse_single_sequence_example(\n",
+ " serialized_sequence_example, context_feature_descriptions,\n",
+ " sequence_feature_descriptions)\n",
+ "parsed_content = tf.RaggedTensor.from_sparse(parsed_feature_lists[\"content\"])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@@ -1391,7 +1244,7 @@
},
{
"cell_type": "code",
- "execution_count": 82,
+ "execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -1400,7 +1253,7 @@
},
{
"cell_type": "code",
- "execution_count": 83,
+ "execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -1409,7 +1262,7 @@
},
{
"cell_type": "code",
- "execution_count": 84,
+ "execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@@ -1420,14 +1273,169 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# The Features API"
+ "# Keras Preprocessing Layers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Let's use the variant of the California housing dataset that we used in Chapter 2, since it contains categorical features and missing values:"
+ "## The `Normalization` Layer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility\n",
+ "norm_layer = tf.keras.layers.Normalization()\n",
+ "model = tf.keras.models.Sequential([\n",
+ " norm_layer,\n",
+ " tf.keras.layers.Dense(1)\n",
+ "])\n",
+ "model.compile(loss=\"mse\", optimizer=tf.keras.optimizers.SGD(learning_rate=2e-3))\n",
+ "norm_layer.adapt(X_train) # computes the mean and variance of every feature\n",
+ "model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "norm_layer = tf.keras.layers.Normalization()\n",
+ "norm_layer.adapt(X_train)\n",
+ "X_train_scaled = norm_layer(X_train)\n",
+ "X_valid_scaled = norm_layer(X_valid)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tf.random.set_seed(42) # extra code – ensures reproducibility\n",
+ "model = tf.keras.models.Sequential([tf.keras.layers.Dense(1)])\n",
+ "model.compile(loss=\"mse\", optimizer=tf.keras.optimizers.SGD(learning_rate=2e-3))\n",
+ "model.fit(X_train_scaled, y_train, epochs=5,\n",
+ " validation_data=(X_valid_scaled, y_valid))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "final_model = tf.keras.Sequential([norm_layer, model])\n",
+ "X_new = X_test[:3] # pretend we have a few new instances (unscaled)\n",
+ "y_pred = final_model(X_new) # preprocesses the data and makes predictions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "y_pred"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – creates a dataset to demo applying the norm_layer using map()\n",
+ "dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(5)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dataset = dataset.map(lambda X, y: (norm_layer(X), y))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "list(dataset.take(1)) # extra code – shows the first batch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class MyNormalization(tf.keras.layers.Layer):\n",
+ " def adapt(self, X):\n",
+ " self.mean_ = np.mean(X, axis=0, keepdims=True)\n",
+ " self.std_ = np.std(X, axis=0, keepdims=True)\n",
+ "\n",
+ " def call(self, inputs):\n",
+ " eps = tf.keras.backend.epsilon() # a small smoothing term\n",
+ " return (inputs - self.mean_) / (self.std_ + eps)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "my_norm_layer = MyNormalization()\n",
+ "my_norm_layer.adapt(X_train)\n",
+ "X_train_scaled = my_norm_layer(X_train)"
+ ]
+ },
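+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As a quick sanity check (not in the book), we can verify that this custom layer produces essentially the same output as the built-in `Normalization` layer we adapted earlier – the only difference is the tiny `eps` smoothing term:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – compares MyNormalization to the built-in Normalization layer\n",
+ "np.allclose(X_train_scaled, norm_layer(X_train), atol=1e-3)"
+ ]
+ },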
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `Discretization` Layer"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "age = tf.constant([[10.], [93.], [57.], [18.], [37.], [5.]])\n",
+ "discretize_layer = tf.keras.layers.Discretization(bin_boundaries=[18., 50.])\n",
+ "age_categories = discretize_layer(age)\n",
+ "age_categories"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 84,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "discretize_layer = tf.keras.layers.Discretization(num_bins=3)\n",
+ "discretize_layer.adapt(age)\n",
+ "age_categories = discretize_layer(age)\n",
+ "age_categories"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `CategoryEncoding` Layer"
]
},
{
@@ -1436,23 +1444,8 @@
"metadata": {},
"outputs": [],
"source": [
- "from pathlib import Path\n",
- "import tarfile\n",
- "import urllib.request\n",
- "import pandas as pd\n",
- "\n",
- "def load_housing_data():\n",
- " housing_path = Path() / \"datasets\" / \"housing\"\n",
- " if not (housing_path / \"housing.csv\").is_file():\n",
- " housing_path.mkdir(parents=True, exist_ok=True)\n",
- " root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
- " url = root + \"datasets/housing/housing.tgz\"\n",
- " tgz_path = housing_path / \"housing.tgz\"\n",
- " urllib.request.urlretrieve(url, tgz_path)\n",
- " housing_tgz = tarfile.open(tgz_path)\n",
- " housing_tgz.extractall(path=housing_path)\n",
- " housing_tgz.close()\n",
- " return pd.read_csv(housing_path / \"housing.csv\")"
+ "onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3)\n",
+ "onehot_layer(age_categories)"
]
},
{
@@ -1461,8 +1454,28 @@
"metadata": {},
"outputs": [],
"source": [
- "housing = load_housing_data()\n",
- "head()"
+ "two_age_categories = np.array([[1, 0], [2, 2], [2, 0]])\n",
+ "onehot_layer(two_age_categories)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3, output_mode=\"count\")\n",
+ "onehot_layer(two_age_categories)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3 + 3)\n",
+ "onehot_layer(two_age_categories + [0, 3]) # adds 3 to the second feature"
]
},
{
@@ -1471,7 +1484,11 @@
"metadata": {},
"outputs": [],
"source": [
- "housing_median_age = tf.feature_column.numeric_column(\"housing_median_age\")"
+ "# extra code – shows another way to one-hot encode each feature separately\n",
+ "onehot_layer = tf.keras.layers.CategoryEncoding(num_tokens=3,\n",
+ " output_mode=\"one_hot\")\n",
+ "tf.keras.layers.concatenate([onehot_layer(cat)\n",
+ " for cat in tf.transpose(two_age_categories)])"
]
},
{
@@ -1480,9 +1497,15 @@
"metadata": {},
"outputs": [],
"source": [
- "age_mean, age_std = X_mean[1], X_std[1] # The median age is column in 1\n",
- "housing_median_age = tf.feature_column.numeric_column(\n",
- " \"housing_median_age\", normalizer_fn=lambda x: (x - age_mean) / age_std)"
+ "# extra code – shows another way to do this, using tf.one_hot() and Flatten\n",
+ "tf.keras.layers.Flatten()(tf.one_hot(two_age_categories, depth=3))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `StringLookup` Layer"
]
},
{
@@ -1491,9 +1514,10 @@
"metadata": {},
"outputs": [],
"source": [
- "median_income = tf.feature_column.numeric_column(\"median_income\")\n",
- "bucketized_income = tf.feature_column.bucketized_column(\n",
- " median_income, boundaries=[1.5, 3., 4.5, 6.])"
+ "cities = [\"Auckland\", \"Paris\", \"Paris\", \"San Francisco\"]\n",
+ "str_lookup_layer = tf.keras.layers.StringLookup()\n",
+ "str_lookup_layer.adapt(cities)\n",
+ "str_lookup_layer([[\"Paris\"], [\"Auckland\"], [\"Auckland\"], [\"Montreal\"]])"
]
},
{
@@ -1502,7 +1526,9 @@
"metadata": {},
"outputs": [],
"source": [
- "bucketized_income"
+ "str_lookup_layer = tf.keras.layers.StringLookup(num_oov_indices=5)\n",
+ "str_lookup_layer.adapt(cities)\n",
+ "str_lookup_layer([[\"Paris\"], [\"Auckland\"], [\"Foo\"], [\"Bar\"], [\"Baz\"]])"
]
},
{
@@ -1511,9 +1537,9 @@
"metadata": {},
"outputs": [],
"source": [
- "ocean_prox_vocab = ['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']\n",
- "ocean_proximity = tf.feature_column.categorical_column_with_vocabulary_list(\n",
- " \"ocean_proximity\", ocean_prox_vocab)"
+ "str_lookup_layer = tf.keras.layers.StringLookup(output_mode=\"one_hot\")\n",
+ "str_lookup_layer.adapt(cities)\n",
+ "str_lookup_layer([[\"Paris\"], [\"Auckland\"], [\"Auckland\"], [\"Montreal\"]])"
]
},
{
@@ -1522,7 +1548,18 @@
"metadata": {},
"outputs": [],
"source": [
- "ocean_proximity"
+ "# extra code – an example using the IntegerLookup layer\n",
+ "ids = [123, 456, 789]\n",
+ "int_lookup_layer = tf.keras.layers.IntegerLookup()\n",
+ "int_lookup_layer.adapt(ids)\n",
+ "int_lookup_layer([[123], [456], [123], [111]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## The `Hashing` Layer"
]
},
{
@@ -1531,10 +1568,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# Just an example, it's not used later on\n",
- "city_hash = tf.feature_column.categorical_column_with_hash_bucket(\n",
- " \"city\", hash_bucket_size=1000)\n",
- "city_hash"
+ "hashing_layer = tf.keras.layers.Hashing(num_bins=10)\n",
+ "hashing_layer([[\"Paris\"], [\"Tokyo\"], [\"Auckland\"], [\"Montreal\"]])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Encoding Categorical Features Using Embeddings"
]
},
{
@@ -1543,10 +1585,9 @@
"metadata": {},
"outputs": [],
"source": [
- "bucketized_age = tf.feature_column.bucketized_column(\n",
- " housing_median_age, boundaries=[-1., -0.5, 0., 0.5, 1.]) # age was scaled\n",
- "age_and_ocean_proximity = tf.feature_column.crossed_column(\n",
- " [bucketized_age, ocean_proximity], hash_bucket_size=100)"
+ "tf.random.set_seed(42)\n",
+ "embedding_layer = tf.keras.layers.Embedding(input_dim=5, output_dim=2)\n",
+ "embedding_layer(np.array([2, 4, 2]))"
]
},
{
@@ -1555,14 +1596,16 @@
"metadata": {},
"outputs": [],
"source": [
- "latitude = tf.feature_column.numeric_column(\"latitude\")\n",
- "longitude = tf.feature_column.numeric_column(\"longitude\")\n",
- "bucketized_latitude = tf.feature_column.bucketized_column(\n",
- " latitude, boundaries=list(np.linspace(32., 42., 20 - 1)))\n",
- "bucketized_longitude = tf.feature_column.bucketized_column(\n",
- " longitude, boundaries=list(np.linspace(-125., -114., 20 - 1)))\n",
- "location = tf.feature_column.crossed_column(\n",
- " [bucketized_latitude, bucketized_longitude], hash_bucket_size=1000)"
+ "tf.random.set_seed(42)\n",
+ "ocean_prox = [\"<1H OCEAN\", \"INLAND\", \"NEAR OCEAN\", \"NEAR BAY\", \"ISLAND\"]\n",
+ "str_lookup_layer = tf.keras.layers.StringLookup()\n",
+ "str_lookup_layer.adapt(ocean_prox)\n",
+ "lookup_and_embed = tf.keras.Sequential([\n",
+ " str_lookup_layer,\n",
+ " tf.keras.layers.Embedding(input_dim=str_lookup_layer.vocabulary_size(),\n",
+ " output_dim=2)\n",
+ "])\n",
+ "lookup_and_embed(np.array([[\"<1H OCEAN\"], [\"ISLAND\"], [\"<1H OCEAN\"]]))"
]
},
{
@@ -1571,7 +1614,26 @@
"metadata": {},
"outputs": [],
"source": [
- "ocean_proximity_one_hot = tf.feature_column.indicator_column(ocean_proximity)"
+ "# extra code – set seeds and generates fake random data\n",
+ "# (feel free to load the real dataset if you prefer)\n",
+ "tf.random.set_seed(42)\n",
+ "np.random.seed(42)\n",
+ "X_train_num = np.random.rand(10_000, 8)\n",
+ "X_train_cat = np.random.choice(ocean_prox, size=10_000)\n",
+ "y_train = np.random.rand(10_000, 1)\n",
+ "X_valid_num = np.random.rand(2_000, 8)\n",
+ "X_valid_cat = np.random.choice(ocean_prox, size=2_000)\n",
+ "y_valid = np.random.rand(2_000, 1)\n",
+ "\n",
+ "num_input = tf.keras.layers.Input(shape=[8], name=\"num\")\n",
+ "cat_input = tf.keras.layers.Input(shape=[], dtype=tf.string, name=\"cat\")\n",
+ "cat_embeddings = lookup_and_embed(cat_input) \n",
+ "encoded_inputs = tf.keras.layers.concatenate([num_input, cat_embeddings])\n",
+ "outputs = tf.keras.layers.Dense(1)(encoded_inputs)\n",
+ "model = tf.keras.models.Model(inputs=[num_input, cat_input], outputs=[outputs])\n",
+ "model.compile(loss=\"mse\", optimizer=\"sgd\")\n",
+ "history = model.fit((X_train_num, X_train_cat), y_train, epochs=5,\n",
+ " validation_data=((X_valid_num, X_valid_cat), y_valid))"
]
},
{
@@ -1580,15 +1642,13 @@
"metadata": {},
"outputs": [],
"source": [
- "ocean_proximity_embed = tf.feature_column.embedding_column(ocean_proximity,\n",
- " dimension=2)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Using Feature Columns for Parsing"
+ "# extra code – shows that the model can also be trained using a tf.data.Dataset\n",
+ "train_set = tf.data.Dataset.from_tensor_slices(\n",
+ " ((X_train_num, X_train_cat), y_train)).batch(32)\n",
+ "valid_set = tf.data.Dataset.from_tensor_slices(\n",
+ " ((X_valid_num, X_valid_cat), y_valid)).batch(32)\n",
+ "history = model.fit(train_set, epochs=5,\n",
+ " validation_data=valid_set)"
]
},
{
@@ -1597,7 +1657,19 @@
"metadata": {},
"outputs": [],
"source": [
- "median_house_value = tf.feature_column.numeric_column(\"median_house_value\")"
+ "# extra code – shows that the dataset can contain dictionaries\n",
+ "train_set = tf.data.Dataset.from_tensor_slices(\n",
+ " ({\"num\": X_train_num, \"cat\": X_train_cat}, y_train)).batch(32)\n",
+ "valid_set = tf.data.Dataset.from_tensor_slices(\n",
+ " ({\"num\": X_valid_num, \"cat\": X_valid_cat}, y_valid)).batch(32)\n",
+ "history = model.fit(train_set, epochs=5, validation_data=valid_set)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Text Preprocessing"
]
},
{
@@ -1606,9 +1678,10 @@
"metadata": {},
"outputs": [],
"source": [
- "columns = [housing_median_age, median_house_value]\n",
- "feature_descriptions = tf.feature_column.make_parse_example_spec(columns)\n",
- "feature_descriptions"
+ "train_data = [\"To be\", \"!(to be)\", \"That's the question\", \"Be, be, be.\"]\n",
+ "text_vec_layer = tf.keras.layers.TextVectorization()\n",
+ "text_vec_layer.adapt(train_data)\n",
+ "text_vec_layer([\"Be good!\", \"Question: be or be?\"])"
]
},
{
@@ -1617,13 +1690,9 @@
"metadata": {},
"outputs": [],
"source": [
- "with tf.io.TFRecordWriter(\"my_data_with_features.tfrecords\") as f:\n",
- " for x, y in zip(X_train[:, 1:2], y_train):\n",
- " example = Example(features=Features(feature={\n",
- " \"housing_median_age\": Feature(float_list=FloatList(value=[x])),\n",
- " \"median_house_value\": Feature(float_list=FloatList(value=[y]))\n",
- " }))\n",
- " f.write(example.SerializeToString())"
+ "text_vec_layer = tf.keras.layers.TextVectorization(ragged=True)\n",
+ "text_vec_layer.adapt(train_data)\n",
+ "text_vec_layer([\"Be good!\", \"Question: be or be?\"])"
]
},
{
@@ -1632,9 +1701,9 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
+ "text_vec_layer = tf.keras.layers.TextVectorization(output_mode=\"tf_idf\")\n",
+ "text_vec_layer.adapt(train_data)\n",
+ "text_vec_layer([\"Be good!\", \"Question: be or be?\"])"
]
},
{
@@ -1643,21 +1712,7 @@
"metadata": {},
"outputs": [],
"source": [
- "def parse_examples(serialized_examples):\n",
- " examples = tf.io.parse_example(serialized_examples, feature_descriptions)\n",
- " targets = examples.pop(\"median_house_value\") # separate the targets\n",
- " return examples, targets\n",
- "\n",
- "batch_size = 32\n",
- "dataset = tf.data.TFRecordDataset([\"my_data_with_features.tfrecords\"])\n",
- "dataset = dataset.repeat().shuffle(10000).batch(batch_size).map(parse_examples)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning**: the `DenseFeatures` layer currently does not work with the Functional API, see [TF issue #27416](https://github.com/tensorflow/tensorflow/issues/27416). Hopefully this will be resolved before the final release of TF 2.0."
+ "2 * np.log(1 + 4 / (1 + 3))"
]
},
{
@@ -1666,15 +1721,16 @@
"metadata": {},
"outputs": [],
"source": [
- "columns_without_target = columns[:-1]\n",
- "model = tf.keras.Sequential([\n",
- " tf.keras.layers.DenseFeatures(feature_columns=columns_without_target),\n",
- " tf.keras.layers.Dense(1)\n",
- "])\n",
- "model.compile(loss=\"mse\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
- " metrics=[\"accuracy\"])\n",
- "model.fit(dataset, steps_per_epoch=len(X_train) // batch_size, epochs=5)"
+ "1 * np.log(1 + 4 / (1 + 1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Using Pretrained Language Model Components"
]
},
{
@@ -1683,19 +1739,18 @@
"metadata": {},
"outputs": [],
"source": [
- "some_columns = [ocean_proximity_embed, bucketized_income]\n",
- "dense_features = tf.keras.layers.DenseFeatures(some_columns)\n",
- "dense_features({\n",
- " \"ocean_proximity\": [[\"NEAR OCEAN\"], [\"INLAND\"], [\"INLAND\"]],\n",
- " \"median_income\": [[3.], [7.2], [1.]]\n",
- "})"
+ "import tensorflow_hub as hub\n",
+ "\n",
+ "hub_layer = hub.KerasLayer(\"https://tfhub.dev/google/nnlm-en-dim50/2\")\n",
+ "sentence_embeddings = hub_layer(tf.constant([\"To be\", \"Not to be\"]))\n",
+ "sentence_embeddings.numpy().round(2)"
]
},
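+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The hub layer can be used like any other Keras layer. Here is a minimal sketch (not from the book) of a small text classifier built on top of these sentence embeddings – the layer sizes and the sigmoid output are just illustrative assumptions:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – a hypothetical classifier using the hub layer as its first layer\n",
+ "text_model = tf.keras.Sequential([\n",
+ "    hub_layer,  # maps each input string to a 50-dimensional embedding\n",
+ "    tf.keras.layers.Dense(16, activation=\"relu\"),\n",
+ "    tf.keras.layers.Dense(1, activation=\"sigmoid\")  # e.g., P(positive)\n",
+ "])\n",
+ "text_model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n",
+ "                   metrics=[\"accuracy\"])\n",
+ "text_model(tf.constant([\"To be\", \"Not to be\"]))  # one probability per sentence"
+ ]
+ },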
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# TF Transform"
+ "## Image Preprocessing Layers"
]
},
{
@@ -1704,20 +1759,33 @@
"metadata": {},
"outputs": [],
"source": [
- "try:\n",
- " import tensorflow_transform as tft\n",
+ "from sklearn.datasets import load_sample_images\n",
"\n",
- " def preprocess(inputs): # inputs is a batch of input features\n",
- " median_age = inputs[\"housing_median_age\"]\n",
- " ocean_proximity = inputs[\"ocean_proximity\"]\n",
- " standardized_age = tft.scale_to_z_score(median_age - tft.mean(median_age))\n",
- " ocean_proximity_id = tft.compute_and_apply_vocabulary(ocean_proximity)\n",
- " return {\n",
- " \"standardized_median_age\": standardized_age,\n",
- " \"ocean_proximity_id\": ocean_proximity_id\n",
- " }\n",
- "except ImportError:\n",
- " print(\"TF Transform is not installed. Try running: pip3 install -U tensorflow-transform\")"
+ "images = load_sample_images()[\"images\"]\n",
+ "crop_image_layer = tf.keras.layers.CenterCrop(height=100, width=100)\n",
+ "cropped_images = crop_image_layer(images)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 108,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(images[0])\n",
+ "plt.axis(\"off\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "plt.imshow(cropped_images[0])\n",
+ "plt.axis(\"off\")\n",
+ "plt.show()"
]
},
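+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Keras also provides random image augmentation layers, such as `RandomFlip`, `RandomRotation`, and `RandomContrast`. Here is a short sketch (not from the book) applying a few of them to the cropped images – the augmentation factors below are arbitrary:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# extra code – illustrative augmentation pipeline (factors are arbitrary)\n",
+ "data_augmentation = tf.keras.Sequential([\n",
+ "    tf.keras.layers.RandomFlip(\"horizontal\", seed=42),\n",
+ "    tf.keras.layers.RandomRotation(factor=0.05, seed=42),\n",
+ "    tf.keras.layers.RandomContrast(factor=0.2, seed=42)\n",
+ "])\n",
+ "augmented_images = data_augmentation(cropped_images, training=True)\n",
+ "plt.imshow(augmented_images[0] / 255)  # rescale to [0, 1] for imshow\n",
+ "plt.axis(\"off\")\n",
+ "plt.show()"
+ ]
+ },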
{
@@ -1729,7 +1797,7 @@
},
{
"cell_type": "code",
- "execution_count": 108,
+ "execution_count": 110,
"metadata": {},
"outputs": [],
"source": [
@@ -1739,50 +1807,16 @@
"mnist_train, mnist_test = datasets[\"train\"], datasets[\"test\"]"
]
},
- {
- "cell_type": "code",
- "execution_count": 109,
- "metadata": {},
- "outputs": [],
- "source": [
- "print(tfds.list_builders())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 110,
- "metadata": {},
- "outputs": [],
- "source": [
- "plt.figure(figsize=(6,3))\n",
- "mnist_train = mnist_train.repeat(5).batch(32).prefetch(1)\n",
- "for item in mnist_train:\n",
- " images = item[\"image\"]\n",
- " labels = item[\"label\"]\n",
- " for index in range(5):\n",
- " plt.subplot(1, 5, index + 1)\n",
- " image = images[index, ..., 0]\n",
- " label = labels[index].numpy()\n",
- " plt.imshow(image, cmap=\"binary\")\n",
- " plt.title(label)\n",
- " plt.axis(\"off\")\n",
- " break # just showing part of the first batch"
- ]
- },
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
- "datasets = tfds.load(name=\"mnist\")\n",
- "mnist_train, mnist_test = datasets[\"train\"], datasets[\"test\"]\n",
- "mnist_train = mnist_train.repeat(5).batch(32)\n",
- "mnist_train = mnist_train.map(lambda items: (items[\"image\"], items[\"label\"]))\n",
- "mnist_train = mnist_train.prefetch(1)\n",
- "for images, labels in mnist_train.take(1):\n",
- " print(images.shape)\n",
- " print(labels.numpy())"
+ "for batch in mnist_train.shuffle(10_000, seed=42).batch(32).prefetch(1):\n",
+ " images = batch[\"image\"]\n",
+ " labels = batch[\"label\"]\n",
+ " # [...] do something with the images and labels"
]
},
{
@@ -1791,9 +1825,9 @@
"metadata": {},
"outputs": [],
"source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
+ "mnist_train = mnist_train.shuffle(10_000, seed=42).batch(32)\n",
+ "mnist_train = mnist_train.map(lambda items: (items[\"image\"], items[\"label\"]))\n",
+ "mnist_train = mnist_train.prefetch(1)"
]
},
{
@@ -1802,23 +1836,56 @@
"metadata": {},
"outputs": [],
"source": [
- "datasets = tfds.load(name=\"mnist\", batch_size=32, as_supervised=True)\n",
- "mnist_train = datasets[\"train\"].repeat().prefetch(1)\n",
+ "train_set, valid_set, test_set = tfds.load(\n",
+ " name=\"mnist\",\n",
+ " split=[\"train[:90%]\", \"train[90%:]\", \"test\"],\n",
+ " as_supervised=True\n",
+ ")\n",
+ "train_set = train_set.shuffle(10_000, seed=42).batch(32).prefetch(1)\n",
+ "valid_set = valid_set.batch(32).cache()\n",
+ "test_set = test_set.batch(32).cache()\n",
+ "tf.random.set_seed(42)\n",
"model = tf.keras.Sequential([\n",
- " tf.keras.layers.Flatten(input_shape=[28, 28, 1]),\n",
- " tf.keras.layers.Lambda(lambda images: tf.cast(images, tf.float32)),\n",
- " tf.keras.layers.Dense(10, activation=\"softmax\")])\n",
- "model.compile(loss=\"sparse_categorical_crossentropy\",\n",
- " optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3),\n",
+ " tf.keras.layers.Flatten(input_shape=(28, 28)),\n",
+ " tf.keras.layers.Dense(10, activation=\"softmax\")\n",
+ "])\n",
+ "model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"nadam\",\n",
" metrics=[\"accuracy\"])\n",
- "model.fit(mnist_train, steps_per_epoch=60000 // 32, epochs=5)"
+ "history = model.fit(train_set, validation_data=valid_set, epochs=5)\n",
+ "test_loss, test_accuracy = model.evaluate(test_set)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "# Exercises\n",
+ "\n",
+ "## 1. to 8.\n",
+ "1. Ingesting a large dataset and preprocessing it efficiently can be a complex engineering challenge. The Data API makes it fairly simple. It offers many features, including loading data from various sources (such as text or binary files), reading data in parallel from multiple sources, transforming it, interleaving the records, shuffling the data, batching it, and prefetching it.\n",
+ "2. Splitting a large dataset into multiple files makes it possible to shuffle it at a coarse level before shuffling it at a finer level using a shuffling buffer. It also makes it possible to handle huge datasets that do not fit on a single machine. It's also simpler to manipulate thousands of small files rather than one huge file; for example, it's easier to split the data into multiple subsets. Lastly, if the data is split across multiple files spread across multiple servers, it is possible to download several files from different servers simultaneously, which improves the bandwidth usage.\n",
+ "3. You can use TensorBoard to visualize profiling data: if the GPU is not fully utilized then your input pipeline is likely to be the bottleneck. You can fix it by making sure it reads and preprocesses the data in multiple threads in parallel, and ensuring it prefetches a few batches. If this is insufficient to get your GPU to 100% usage during training, make sure your preprocessing code is optimized. You can also try saving the dataset into multiple TFRecord files, and if necessary perform some of the preprocessing ahead of time so that it does not need to be done on the fly during training (TF Transform can help with this). If necessary, use a machine with more CPU and RAM, and ensure that the GPU bandwidth is large enough.\n",
+ "4. A TFRecord file is composed of a sequence of arbitrary binary records: you can store absolutely any binary data you want in each record. However, in practice most TFRecord files contain sequences of serialized protocol buffers. This makes it possible to benefit from the advantages of protocol buffers, such as the fact that they can be read easily across multiple platforms and languages and their definition can be updated later in a backward-compatible way.\n",
+ "5. The `Example` protobuf format has the advantage that TensorFlow provides some operations to parse it (the `tf.io.parse`*`example()` functions) without you having to define your own format. It is sufficiently flexible to represent instances in most datasets. However, if it does not cover your use case, you can define your own protocol buffer, compile it using `protoc` (setting the `--descriptor_set_out` and `--include_imports` arguments to export the protobuf descriptor), and use the `tf.io.decode_proto()` function to parse the serialized protobufs (see the \"Custom protobuf\" section of the notebook for an example). It's more complicated, and it requires deploying the descriptor along with the model, but it can be done.\n",
+ "6. When using TFRecords, you will generally want to activate compression if the TFRecord files will need to be downloaded by the training script, as compression will make files smaller and thus reduce download time. But if the files are located on the same machine as the training script, it's usually preferable to leave compression off, to avoid wasting CPU for decompression.\n",
+ "7. Let's look at the pros and cons of each preprocessing option:\n",
+ " * If you preprocess the data when creating the data files, the training script will run faster, since it will not have to perform preprocessing on the fly. In some cases, the preprocessed data will also be much smaller than the original data, so you can save some space and speed up downloads. It may also be helpful to materialize the preprocessed data, for example to inspect it or archive it. However, this approach has a few cons. First, it's not easy to experiment with various preprocessing logics if you need to generate a preprocessed dataset for each variant. Second, if you want to perform data augmentation, you have to materialize many variants of your dataset, which will use a large amount of disk space and take a lot of time to generate. Lastly, the trained model will expect preprocessed data, so you will have to add preprocessing code in your application before it calls the model. There's a risk of code duplication and preprocessing mismatch in this case.\n",
+ " * If the data is preprocessed with the tf.data pipeline, it's much easier to tweak the preprocessing logic and apply data augmentation. Also, tf.data makes it easy to build highly efficient preprocessing pipelines (e.g., with multithreading and prefetching). However, preprocessing the data this way will slow down training. Moreover, each training instance will be preprocessed once per epoch rather than just once if the data was preprocessed when creating the data files. Well, unless the dataset fits in RAM and you can cache it using the dataset's `cache()` method. Lastly, the trained model will still expect preprocessed data. But if you use preprocessing layers in your tf.data pipeline to handle the preprocessing step, then you can just reuse these layers in your final model (adding them after training), to avoid code duplication and preprocessing mismatch.\n",
+ " * If you add preprocessing layers to your model, you will only have to write the preprocessing code once for both training and inference. If your model needs to be deployed to many different platforms, you will not need to write the preprocessing code multiple times. Plus, you will not run the risk of using the wrong preprocessing logic for your model, since it will be part of the model. On the downside, preprocessing the data on the fly during training will slow things down, and each instance will be preprocessed once per epoch.\n",
+ "8. Let's look at how to encode categorical text features and text:\n",
+ " * To encode a categorical feature that has a natural order, such as a movie rating (e.g., \"bad,\" \"average,\" \"good\"), the simplest option is to use ordinal encoding: sort the categories in their natural order and map each category to its rank (e.g., \"bad\" maps to 0, \"average\" maps to 1, and \"good\" maps to 2). However, most categorical features don't have such a natural order. For example, there's no natural order for professions or countries. In this case, you can use one-hot encoding, or embeddings if there are many categories. With Keras, the `StringLookup` layer can be used for ordinal encoding (using the default `output_mode=\"int\"`), or one-hot encoding (using `output_mode=\"one_hot\"`). It can also perform multi-hot encoding (using `output_mode=\"multi_hot\"`) if you want to encode multiple categorical text features together, assuming they share the same categories and it doesn't matter which feature contributed which category. For trainable embeddings, you must first use the `StringLookup` layer to produce an ordinal encoding, then use the `Embedding` layer.\n",
+ " * For text, the `TextVectorization` layer is easy to use and it can work well for simple tasks, or you can use TF Text for more advanced features. However, you'll often want to use pretrained language models, which you can obtain using tools like TF Hub or Hugging Face's Transformers library. These last two options are discussed in Chapter 16."
]
},
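+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The following cell is a minimal sketch illustrating answers 1 to 3: it assumes some TFRecord files matching the hypothetical pattern `my_train_*.tfrecord` exist, interleaves reads from several of them in parallel, then shuffles, batches, and prefetches the records:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sketch only: assumes TFRecord files matching this hypothetical pattern exist\n",
+ "filepaths = tf.data.Dataset.list_files(\"my_train_*.tfrecord\", seed=42)\n",
+ "dataset = filepaths.interleave(  # read from several files in parallel\n",
+ "    tf.data.TFRecordDataset,\n",
+ "    cycle_length=3, num_parallel_calls=tf.data.AUTOTUNE)\n",
+ "dataset = dataset.shuffle(buffer_size=10_000)  # fine-grained shuffling\n",
+ "dataset = dataset.batch(32).prefetch(1)  # prepare the next batch ahead of time"
+ ]
+ },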
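+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Here is a small sketch illustrating answers 4 to 6: it writes a single serialized `Example` protobuf (with made-up features) to a GZIP-compressed TFRecord file with a hypothetical name, then reads it back and parses it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write one Example protobuf to a compressed TFRecord file (hypothetical filename)\n",
+ "options = tf.io.TFRecordOptions(compression_type=\"GZIP\")\n",
+ "with tf.io.TFRecordWriter(\"my_demo.tfrecord.gz\", options) as writer:\n",
+ "    example = tf.train.Example(features=tf.train.Features(feature={\n",
+ "        \"title\": tf.train.Feature(bytes_list=tf.train.BytesList(value=[b\"A great movie\"])),\n",
+ "        \"rating\": tf.train.Feature(int64_list=tf.train.Int64List(value=[5])),\n",
+ "    }))\n",
+ "    writer.write(example.SerializeToString())\n",
+ "\n",
+ "# Read the file back and parse each record using a feature description\n",
+ "feature_description = {\n",
+ "    \"title\": tf.io.FixedLenFeature([], tf.string, default_value=b\"\"),\n",
+ "    \"rating\": tf.io.FixedLenFeature([], tf.int64, default_value=0),\n",
+ "}\n",
+ "dataset = tf.data.TFRecordDataset([\"my_demo.tfrecord.gz\"], compression_type=\"GZIP\")\n",
+ "for serialized in dataset:\n",
+ "    parsed = tf.io.parse_single_example(serialized, feature_description)\n",
+ "    print(parsed[\"title\"].numpy(), parsed[\"rating\"].numpy())"
+ ]
+ },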
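+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The next cell is a tiny sketch of the third option in answer 7, adding a preprocessing layer directly to the model; `X_sample` is just some made-up numerical data standing in for a training set:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "X_sample = tf.random.uniform([100, 3])  # made-up numerical training data\n",
+ "\n",
+ "norm_layer = tf.keras.layers.Normalization()\n",
+ "norm_layer.adapt(X_sample)  # learn the feature means and variances\n",
+ "\n",
+ "model = tf.keras.Sequential([\n",
+ "    norm_layer,  # preprocessing is now part of the model itself\n",
+ "    tf.keras.layers.Dense(1)\n",
+ "])\n",
+ "model.predict(X_sample[:2])  # inputs are standardized on the fly"
+ ]
+ },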
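+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Lastly, a short sketch illustrating answer 8, using a tiny made-up list of categories: ordinal encoding and one-hot encoding with `StringLookup`, then trainable embeddings by chaining `StringLookup` and `Embedding`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "categories = tf.constant([\"bad\", \"average\", \"good\", \"good\"])  # made-up data\n",
+ "\n",
+ "ordinal_layer = tf.keras.layers.StringLookup()  # output_mode=\"int\" by default\n",
+ "ordinal_layer.adapt(categories)\n",
+ "print(ordinal_layer(categories))  # integer ids (index 0 is reserved for unknown categories)\n",
+ "\n",
+ "onehot_layer = tf.keras.layers.StringLookup(output_mode=\"one_hot\")\n",
+ "onehot_layer.adapt(categories)\n",
+ "print(onehot_layer(categories))\n",
+ "\n",
+ "embedding_layer = tf.keras.layers.Embedding(\n",
+ "    input_dim=ordinal_layer.vocabulary_size(), output_dim=2)\n",
+ "print(embedding_layer(ordinal_layer(categories)))  # one trainable 2D vector per category"
+ ]
+ },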
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# TensorFlow Hub"
+ "## 9.\n",
+ "### a.\n",
+ "_Exercise: Load the Fashion MNIST dataset (introduced in Chapter 10); split it into a training set, a validation set, and a test set; shuffle the training set; and save each dataset to multiple TFRecord files. Each record should be a serialized `Example` protobuf with two features: the serialized image (use `tf.io.serialize_tensor()` to serialize each image), and the label. Note: for large images, you could use `tf.io.encode_jpeg()` instead. This would save a lot of space, but it would lose a bit of image quality._"
]
},
{
@@ -1826,69 +1893,6 @@
"execution_count": 114,
"metadata": {},
"outputs": [],
- "source": [
- "tf.keras.backend.clear_session()\n",
- "np.random.seed(42)\n",
- "tf.random.set_seed(42)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 115,
- "metadata": {},
- "outputs": [],
- "source": [
- "import tensorflow_hub as hub\n",
- "\n",
- "hub_layer = hub.KerasLayer(\"https://tfhub.dev/google/nnlm-en-dim50/2\",\n",
- " output_shape=[50], input_shape=[], dtype=tf.string)\n",
- "\n",
- "model = tf.keras.Sequential()\n",
- "model.add(hub_layer)\n",
- "model.add(tf.keras.layers.Dense(16, activation='relu'))\n",
- "model.add(tf.keras.layers.Dense(1, activation='sigmoid'))\n",
- "\n",
- "model.summary()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 116,
- "metadata": {},
- "outputs": [],
- "source": [
- "sentences = tf.constant([\"It was a great movie\", \"The actors were amazing\"])\n",
- "embeddings = hub_layer(sentences)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 117,
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Exercises\n",
- "\n",
- "## 1. to 8.\n",
- "See Appendix A\n",
- "\n",
- "## 9.\n",
- "### a.\n",
- "_Exercise: Load the Fashion MNIST dataset (introduced in Chapter 9); split it into a training set, a validation set, and a test set; shuffle the training set; and save each dataset to multiple TFRecord files. Each record should be a serialized `Example` protobuf with two features: the serialized image (use `tf.io.serialize_tensor()` to serialize each image), and the label. Note: for large images, you could use `tf.io.encode_jpeg()` instead. This would save a lot of space, but it would lose a bit of image quality._"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 118,
- "metadata": {},
- "outputs": [],
"source": [
"(X_train_full, y_train_full), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()\n",
"X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n",
@@ -1897,7 +1901,7 @@
},
{
"cell_type": "code",
- "execution_count": 119,
+ "execution_count": 115,
"metadata": {},
"outputs": [],
"source": [
@@ -1908,7 +1912,7 @@
},
{
"cell_type": "code",
- "execution_count": 120,
+ "execution_count": 116,
"metadata": {},
"outputs": [],
"source": [
@@ -1919,7 +1923,7 @@
},
{
"cell_type": "code",
- "execution_count": 121,
+ "execution_count": 117,
"metadata": {},
"outputs": [],
"source": [
@@ -1936,7 +1940,7 @@
},
{
"cell_type": "code",
- "execution_count": 122,
+ "execution_count": 118,
"metadata": {},
"outputs": [],
"source": [
@@ -1953,7 +1957,7 @@
},
{
"cell_type": "code",
- "execution_count": 123,
+ "execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
@@ -1974,7 +1978,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
@@ -1993,7 +1997,7 @@
},
{
"cell_type": "code",
- "execution_count": 125,
+ "execution_count": 121,
"metadata": {},
"outputs": [],
"source": [
@@ -2023,7 +2027,7 @@
},
{
"cell_type": "code",
- "execution_count": 126,
+ "execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
@@ -2034,7 +2038,7 @@
},
{
"cell_type": "code",
- "execution_count": 127,
+ "execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
@@ -2048,7 +2052,7 @@
},
{
"cell_type": "code",
- "execution_count": 128,
+ "execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
@@ -2084,7 +2088,7 @@
},
{
"cell_type": "code",
- "execution_count": 129,
+ "execution_count": 125,
"metadata": {},
"outputs": [],
"source": [
@@ -2099,16 +2103,9 @@
" callbacks=[tensorboard_cb])"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Warning:** The profiling tab in TensorBoard works if you use TensorFlow 2.2+. You also need to make sure `tensorboard_plugin_profile` is installed (and restart Jupyter if necessary)."
- ]
- },
{
"cell_type": "code",
- "execution_count": 130,
+ "execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
@@ -2129,7 +2126,7 @@
},
{
"cell_type": "code",
- "execution_count": 71,
+ "execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
@@ -2151,7 +2148,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 128,
"metadata": {},
"outputs": [],
"source": [
@@ -2174,7 +2171,7 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
@@ -2183,7 +2180,7 @@
},
{
"cell_type": "code",
- "execution_count": 133,
+ "execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
@@ -2208,7 +2205,7 @@
},
{
"cell_type": "code",
- "execution_count": 134,
+ "execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
@@ -2237,7 +2234,7 @@
},
{
"cell_type": "code",
- "execution_count": 135,
+ "execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
@@ -2255,7 +2252,7 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 133,
"metadata": {},
"outputs": [],
"source": [
@@ -2267,7 +2264,7 @@
},
{
"cell_type": "code",
- "execution_count": 137,
+ "execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
@@ -2290,7 +2287,7 @@
},
{
"cell_type": "code",
- "execution_count": 138,
+ "execution_count": 135,
"metadata": {},
"outputs": [],
"source": [
@@ -2306,7 +2303,7 @@
},
{
"cell_type": "code",
- "execution_count": 139,
+ "execution_count": 136,
"metadata": {},
"outputs": [],
"source": [
@@ -2322,7 +2319,7 @@
},
{
"cell_type": "code",
- "execution_count": 140,
+ "execution_count": 137,
"metadata": {},
"outputs": [],
"source": [
@@ -2331,7 +2328,7 @@
},
{
"cell_type": "code",
- "execution_count": 141,
+ "execution_count": 138,
"metadata": {},
"outputs": [],
"source": [
@@ -2359,7 +2356,7 @@
},
{
"cell_type": "code",
- "execution_count": 142,
+ "execution_count": 139,
"metadata": {},
"outputs": [],
"source": [
@@ -2385,7 +2382,7 @@
},
{
"cell_type": "code",
- "execution_count": 143,
+ "execution_count": 140,
"metadata": {},
"outputs": [],
"source": [
@@ -2407,12 +2404,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now we are ready to create the `TextVectorization` layer. Its constructor just saves the hyperparameters (`max_vocabulary_size` and `n_oov_buckets`). The `adapt()` method computes the vocabulary using the `get_vocabulary()` function, then it builds a `StaticVocabularyTable` (see Chapter 15 for more details). The `call()` method preprocesses the reviews to get a padded list of words for each review, then it uses the `StaticVocabularyTable` to lookup the index of each word in the vocabulary:"
+ "Now we are ready to create the `TextVectorization` layer. Its constructor just saves the hyperparameters (`max_vocabulary_size` and `n_oov_buckets`). The `adapt()` method computes the vocabulary using the `get_vocabulary()` function, then it builds a `StaticVocabularyTable` (see Chapter 16 for more details). The `call()` method preprocesses the reviews to get a padded list of words for each review, then it uses the `StaticVocabularyTable` to lookup the index of each word in the vocabulary:"
]
},
{
"cell_type": "code",
- "execution_count": 144,
+ "execution_count": 141,
"metadata": {},
"outputs": [],
"source": [
@@ -2443,7 +2440,7 @@
},
{
"cell_type": "code",
- "execution_count": 145,
+ "execution_count": 142,
"metadata": {},
"outputs": [],
"source": [
@@ -2464,7 +2461,7 @@
},
{
"cell_type": "code",
- "execution_count": 146,
+ "execution_count": 143,
"metadata": {},
"outputs": [],
"source": [
@@ -2489,7 +2486,7 @@
},
{
"cell_type": "code",
- "execution_count": 147,
+ "execution_count": 144,
"metadata": {},
"outputs": [],
"source": [
@@ -2505,7 +2502,7 @@
},
{
"cell_type": "code",
- "execution_count": 148,
+ "execution_count": 145,
"metadata": {},
"outputs": [],
"source": [
@@ -2528,7 +2525,7 @@
},
{
"cell_type": "code",
- "execution_count": 149,
+ "execution_count": 146,
"metadata": {},
"outputs": [],
"source": [
@@ -2545,7 +2542,7 @@
},
{
"cell_type": "code",
- "execution_count": 150,
+ "execution_count": 147,
"metadata": {},
"outputs": [],
"source": [
@@ -2567,7 +2564,7 @@
},
{
"cell_type": "code",
- "execution_count": 151,
+ "execution_count": 148,
"metadata": {},
"outputs": [],
"source": [
@@ -2584,7 +2581,7 @@
},
{
"cell_type": "code",
- "execution_count": 152,
+ "execution_count": 149,
"metadata": {},
"outputs": [],
"source": [
@@ -2601,7 +2598,7 @@
},
{
"cell_type": "code",
- "execution_count": 153,
+ "execution_count": 150,
"metadata": {},
"outputs": [],
"source": [
@@ -2620,7 +2617,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We get about 73.5% accuracy on the validation set after just the first epoch, but after that the model makes no significant progress. We will do better in Chapter 15. For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers."
+ "We get about 73.5% accuracy on the validation set after just the first epoch, but after that the model makes no significant progress. We will do better in Chapter 16. For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers."
]
},
{
@@ -2628,7 +2625,7 @@
"metadata": {},
"source": [
"### e.\n",
- "_Exercise: Add an `Embedding` layer and compute the mean embedding for each review, multiplied by the square root of the number of words (see Chapter 15). This rescaled mean embedding can then be passed to the rest of your model._"
+ "_Exercise: Add an `Embedding` layer and compute the mean embedding for each review, multiplied by the square root of the number of words (see Chapter 16). This rescaled mean embedding can then be passed to the rest of your model._"
]
},
{
@@ -2640,7 +2637,7 @@
},
{
"cell_type": "code",
- "execution_count": 154,
+ "execution_count": 151,
"metadata": {},
"outputs": [],
"source": [
@@ -2664,7 +2661,7 @@
},
{
"cell_type": "code",
- "execution_count": 155,
+ "execution_count": 152,
"metadata": {},
"outputs": [],
"source": [
@@ -2680,7 +2677,7 @@
},
{
"cell_type": "code",
- "execution_count": 156,
+ "execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
@@ -2696,7 +2693,7 @@
},
{
"cell_type": "code",
- "execution_count": 157,
+ "execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
@@ -2723,7 +2720,7 @@
},
{
"cell_type": "code",
- "execution_count": 158,
+ "execution_count": 155,
"metadata": {},
"outputs": [],
"source": [
@@ -2735,7 +2732,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The model is not better using embeddings (but we will do better in Chapter 15). The pipeline looks fast enough (we optimized it earlier)."
+ "The model is not better using embeddings (but we will do better in Chapter 16). The pipeline looks fast enough (we optimized it earlier)."
]
},
{
@@ -2748,7 +2745,7 @@
},
{
"cell_type": "code",
- "execution_count": 159,
+ "execution_count": 156,
"metadata": {},
"outputs": [],
"source": [
@@ -2760,7 +2757,7 @@
},
{
"cell_type": "code",
- "execution_count": 160,
+ "execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
@@ -2775,11 +2772,91 @@
"metadata": {},
"outputs": [],
"source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# TODO: remove?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice that field 4 is interpreted as a string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 158,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "record_defaults=[0, np.nan, tf.constant(np.nan, dtype=tf.float64), \"Hello\", tf.constant([])]\n",
+ "parsed_fields = tf.io.decode_csv('1,2,3,4,5', record_defaults)\n",
+ "parsed_fields"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Notice that all missing fields are replaced with their default value, when provided:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 159,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "parsed_fields = tf.io.decode_csv(',,,,5', record_defaults)\n",
+ "parsed_fields"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The 5th field is compulsory (since we provided `tf.constant([])` as the \"default value\"), so we get an exception if we do not provide it:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 160,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " parsed_fields = tf.io.decode_csv(',,,,', record_defaults)\n",
+ "except tf.errors.InvalidArgumentError as ex:\n",
+ " print(ex)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The number of fields should match exactly the number of fields in the `record_defaults`:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 161,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)\n",
+ "except tf.errors.InvalidArgumentError as ex:\n",
+ " print(ex)"
+ ]
}
],
"metadata": {
"kernelspec": {
- "display_name": "Python 3 (ipykernel)",
+ "display_name": "Python 3",
"language": "python",
"name": "python3"
},