Move TF notebooks to /work_in_progress and delete 09 and 12 which need a complete rewrite

This commit is contained in:
Aurélien Geron
2019-01-08 12:00:32 +08:00
parent 40ac161744
commit 02763e43b9
10 changed files with 0 additions and 0 deletions

View File

@@ -0,0 +1,993 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Chapter 10 Introduction to Artificial Neural Networks**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_This notebook contains all the sample code and solutions to the exercises in chapter 10._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"\n",
"# to make this notebook's output stable across runs\n",
"def reset_graph(seed=42):\n",
" tf.reset_default_graph()\n",
" tf.set_random_seed(seed)\n",
" np.random.seed(seed)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"plt.rcParams['axes.labelsize'] = 14\n",
"plt.rcParams['xtick.labelsize'] = 12\n",
"plt.rcParams['ytick.labelsize'] = 12\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"ann\"\n",
"\n",
"def save_fig(fig_id, tight_layout=True):\n",
" path = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID, fig_id + \".png\")\n",
" print(\"Saving figure\", fig_id)\n",
" if tight_layout:\n",
" plt.tight_layout()\n",
" plt.savefig(path, format='png', dpi=300)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Perceptrons"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note**: we set `max_iter` and `tol` explicitly to avoid warnings about the fact that their default value will change in future versions of Scikit-Learn."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.datasets import load_iris\n",
"from sklearn.linear_model import Perceptron\n",
"\n",
"iris = load_iris()\n",
"X = iris.data[:, (2, 3)] # petal length, petal width\n",
"y = (iris.target == 0).astype(np.int)\n",
"\n",
"per_clf = Perceptron(max_iter=100, tol=-np.infty, random_state=42)\n",
"per_clf.fit(X, y)\n",
"\n",
"y_pred = per_clf.predict([[2, 0.5]])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"y_pred"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"a = -per_clf.coef_[0][0] / per_clf.coef_[0][1]\n",
"b = -per_clf.intercept_ / per_clf.coef_[0][1]\n",
"\n",
"axes = [0, 5, 0, 2]\n",
"\n",
"x0, x1 = np.meshgrid(\n",
" np.linspace(axes[0], axes[1], 500).reshape(-1, 1),\n",
" np.linspace(axes[2], axes[3], 200).reshape(-1, 1),\n",
" )\n",
"X_new = np.c_[x0.ravel(), x1.ravel()]\n",
"y_predict = per_clf.predict(X_new)\n",
"zz = y_predict.reshape(x0.shape)\n",
"\n",
"plt.figure(figsize=(10, 4))\n",
"plt.plot(X[y==0, 0], X[y==0, 1], \"bs\", label=\"Not Iris-Setosa\")\n",
"plt.plot(X[y==1, 0], X[y==1, 1], \"yo\", label=\"Iris-Setosa\")\n",
"\n",
"plt.plot([axes[0], axes[1]], [a * axes[0] + b, a * axes[1] + b], \"k-\", linewidth=3)\n",
"from matplotlib.colors import ListedColormap\n",
"custom_cmap = ListedColormap(['#9898ff', '#fafab0'])\n",
"\n",
"plt.contourf(x0, x1, zz, cmap=custom_cmap)\n",
"plt.xlabel(\"Petal length\", fontsize=14)\n",
"plt.ylabel(\"Petal width\", fontsize=14)\n",
"plt.legend(loc=\"lower right\", fontsize=14)\n",
"plt.axis(axes)\n",
"\n",
"save_fig(\"perceptron_iris_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Activation functions"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def sigmoid(z):\n",
" return 1 / (1 + np.exp(-z))\n",
"\n",
"def relu(z):\n",
" return np.maximum(0, z)\n",
"\n",
"def derivative(f, z, eps=0.000001):\n",
" return (f(z + eps) - f(z - eps))/(2 * eps)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"z = np.linspace(-5, 5, 200)\n",
"\n",
"plt.figure(figsize=(11,4))\n",
"\n",
"plt.subplot(121)\n",
"plt.plot(z, np.sign(z), \"r-\", linewidth=1, label=\"Step\")\n",
"plt.plot(z, sigmoid(z), \"g--\", linewidth=2, label=\"Sigmoid\")\n",
"plt.plot(z, np.tanh(z), \"b-\", linewidth=2, label=\"Tanh\")\n",
"plt.plot(z, relu(z), \"m-.\", linewidth=2, label=\"ReLU\")\n",
"plt.grid(True)\n",
"plt.legend(loc=\"center right\", fontsize=14)\n",
"plt.title(\"Activation functions\", fontsize=14)\n",
"plt.axis([-5, 5, -1.2, 1.2])\n",
"\n",
"plt.subplot(122)\n",
"plt.plot(z, derivative(np.sign, z), \"r-\", linewidth=1, label=\"Step\")\n",
"plt.plot(0, 0, \"ro\", markersize=5)\n",
"plt.plot(0, 0, \"rx\", markersize=10)\n",
"plt.plot(z, derivative(sigmoid, z), \"g--\", linewidth=2, label=\"Sigmoid\")\n",
"plt.plot(z, derivative(np.tanh, z), \"b-\", linewidth=2, label=\"Tanh\")\n",
"plt.plot(z, derivative(relu, z), \"m-.\", linewidth=2, label=\"ReLU\")\n",
"plt.grid(True)\n",
"#plt.legend(loc=\"center right\", fontsize=14)\n",
"plt.title(\"Derivatives\", fontsize=14)\n",
"plt.axis([-5, 5, -0.2, 1.2])\n",
"\n",
"save_fig(\"activation_functions_plot\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def heaviside(z):\n",
" return (z >= 0).astype(z.dtype)\n",
"\n",
"def mlp_xor(x1, x2, activation=heaviside):\n",
" return activation(-activation(x1 + x2 - 1.5) + activation(x1 + x2 - 0.5) - 0.5)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"x1s = np.linspace(-0.2, 1.2, 100)\n",
"x2s = np.linspace(-0.2, 1.2, 100)\n",
"x1, x2 = np.meshgrid(x1s, x2s)\n",
"\n",
"z1 = mlp_xor(x1, x2, activation=heaviside)\n",
"z2 = mlp_xor(x1, x2, activation=sigmoid)\n",
"\n",
"plt.figure(figsize=(10,4))\n",
"\n",
"plt.subplot(121)\n",
"plt.contourf(x1, x2, z1)\n",
"plt.plot([0, 1], [0, 1], \"gs\", markersize=20)\n",
"plt.plot([0, 1], [1, 0], \"y^\", markersize=20)\n",
"plt.title(\"Activation function: heaviside\", fontsize=14)\n",
"plt.grid(True)\n",
"\n",
"plt.subplot(122)\n",
"plt.contourf(x1, x2, z2)\n",
"plt.plot([0, 1], [0, 1], \"gs\", markersize=20)\n",
"plt.plot([0, 1], [1, 0], \"y^\", markersize=20)\n",
"plt.title(\"Activation function: sigmoid\", fontsize=14)\n",
"plt.grid(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# FNN for MNIST"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using the Estimator API (formerly `tf.contrib.learn`)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: `tf.examples.tutorials.mnist` is deprecated. We will use `tf.keras.datasets.mnist` instead. Moreover, the `tf.contrib.learn` API was promoted to `tf.estimators` and `tf.feature_columns`, and it has changed considerably. In particular, there is no `infer_real_valued_columns_from_input()` function or `SKCompat` class."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
"X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0\n",
"X_test = X_test.astype(np.float32).reshape(-1, 28*28) / 255.0\n",
"y_train = y_train.astype(np.int32)\n",
"y_test = y_test.astype(np.int32)\n",
"X_valid, X_train = X_train[:5000], X_train[5000:]\n",
"y_valid, y_train = y_train[:5000], y_train[5000:]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"feature_cols = [tf.feature_column.numeric_column(\"X\", shape=[28 * 28])]\n",
"dnn_clf = tf.estimator.DNNClassifier(hidden_units=[300,100], n_classes=10,\n",
" feature_columns=feature_cols)\n",
"\n",
"input_fn = tf.estimator.inputs.numpy_input_fn(\n",
" x={\"X\": X_train}, y=y_train, num_epochs=40, batch_size=50, shuffle=True)\n",
"dnn_clf.train(input_fn=input_fn)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"test_input_fn = tf.estimator.inputs.numpy_input_fn(\n",
" x={\"X\": X_test}, y=y_test, shuffle=False)\n",
"eval_results = dnn_clf.evaluate(input_fn=test_input_fn)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"eval_results"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"y_pred_iter = dnn_clf.predict(input_fn=test_input_fn)\n",
"y_pred = list(y_pred_iter)\n",
"y_pred[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Using plain TensorFlow"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"n_inputs = 28*28 # MNIST\n",
"n_hidden1 = 300\n",
"n_hidden2 = 100\n",
"n_outputs = 10"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"reset_graph()\n",
"\n",
"X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n",
"y = tf.placeholder(tf.int32, shape=(None), name=\"y\")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"def neuron_layer(X, n_neurons, name, activation=None):\n",
" with tf.name_scope(name):\n",
" n_inputs = int(X.get_shape()[1])\n",
" stddev = 2 / np.sqrt(n_inputs)\n",
" init = tf.truncated_normal((n_inputs, n_neurons), stddev=stddev)\n",
" W = tf.Variable(init, name=\"kernel\")\n",
" b = tf.Variable(tf.zeros([n_neurons]), name=\"bias\")\n",
" Z = tf.matmul(X, W) + b\n",
" if activation is not None:\n",
" return activation(Z)\n",
" else:\n",
" return Z"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"dnn\"):\n",
" hidden1 = neuron_layer(X, n_hidden1, name=\"hidden1\",\n",
" activation=tf.nn.relu)\n",
" hidden2 = neuron_layer(hidden1, n_hidden2, name=\"hidden2\",\n",
" activation=tf.nn.relu)\n",
" logits = neuron_layer(hidden2, n_outputs, name=\"outputs\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"loss\"):\n",
" xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,\n",
" logits=logits)\n",
" loss = tf.reduce_mean(xentropy, name=\"loss\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"learning_rate = 0.01\n",
"\n",
"with tf.name_scope(\"train\"):\n",
" optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
" training_op = optimizer.minimize(loss)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"eval\"):\n",
" correct = tf.nn.in_top_k(logits, y, 1)\n",
" accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"init = tf.global_variables_initializer()\n",
"saver = tf.train.Saver()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"n_epochs = 40\n",
"batch_size = 50"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"def shuffle_batch(X, y, batch_size):\n",
" rnd_idx = np.random.permutation(len(X))\n",
" n_batches = len(X) // batch_size\n",
" for batch_idx in np.array_split(rnd_idx, n_batches):\n",
" X_batch, y_batch = X[batch_idx], y[batch_idx]\n",
" yield X_batch, y_batch"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"with tf.Session() as sess:\n",
" init.run()\n",
" for epoch in range(n_epochs):\n",
" for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):\n",
" sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n",
" acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})\n",
" acc_val = accuracy.eval(feed_dict={X: X_valid, y: y_valid})\n",
" print(epoch, \"Batch accuracy:\", acc_batch, \"Val accuracy:\", acc_val)\n",
"\n",
" save_path = saver.save(sess, \"./my_model_final.ckpt\")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"with tf.Session() as sess:\n",
" saver.restore(sess, \"./my_model_final.ckpt\") # or better, use save_path\n",
" X_new_scaled = X_test[:20]\n",
" Z = logits.eval(feed_dict={X: X_new_scaled})\n",
" y_pred = np.argmax(Z, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"print(\"Predicted classes:\", y_pred)\n",
"print(\"Actual classes: \", y_test[:20])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from tensorflow_graph_in_jupyter import show_graph"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"show_graph(tf.get_default_graph())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Using `dense()` instead of `neuron_layer()`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: previous releases of the book used `tensorflow.contrib.layers.fully_connected()` rather than `tf.layers.dense()` (which did not exist when this chapter was written). It is now preferable to use `tf.layers.dense()`, because anything in the contrib module may change or be deleted without notice. The `dense()` function is almost identical to the `fully_connected()` function, except for a few minor differences:\n",
"* several parameters are renamed: `scope` becomes `name`, `activation_fn` becomes `activation` (and similarly the `_fn` suffix is removed from other parameters such as `normalizer_fn`), `weights_initializer` becomes `kernel_initializer`, etc.\n",
"* the default `activation` is now `None` rather than `tf.nn.relu`.\n",
"* a few more differences are presented in chapter 11."
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"n_inputs = 28*28 # MNIST\n",
"n_hidden1 = 300\n",
"n_hidden2 = 100\n",
"n_outputs = 10"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"reset_graph()\n",
"\n",
"X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n",
"y = tf.placeholder(tf.int32, shape=(None), name=\"y\") "
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"dnn\"):\n",
" hidden1 = tf.layers.dense(X, n_hidden1, name=\"hidden1\",\n",
" activation=tf.nn.relu)\n",
" hidden2 = tf.layers.dense(hidden1, n_hidden2, name=\"hidden2\",\n",
" activation=tf.nn.relu)\n",
" logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")\n",
" y_proba = tf.nn.softmax(logits)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"loss\"):\n",
" xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n",
" loss = tf.reduce_mean(xentropy, name=\"loss\")"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"learning_rate = 0.01\n",
"\n",
"with tf.name_scope(\"train\"):\n",
" optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
" training_op = optimizer.minimize(loss)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"eval\"):\n",
" correct = tf.nn.in_top_k(logits, y, 1)\n",
" accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"init = tf.global_variables_initializer()\n",
"saver = tf.train.Saver()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"n_epochs = 20\n",
"n_batches = 50\n",
"\n",
"with tf.Session() as sess:\n",
" init.run()\n",
" for epoch in range(n_epochs):\n",
" for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):\n",
" sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n",
" acc_batch = accuracy.eval(feed_dict={X: X_batch, y: y_batch})\n",
" acc_valid = accuracy.eval(feed_dict={X: X_valid, y: y_valid})\n",
" print(epoch, \"Batch accuracy:\", acc_batch, \"Validation accuracy:\", acc_valid)\n",
"\n",
" save_path = saver.save(sess, \"./my_model_final.ckpt\")"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"show_graph(tf.get_default_graph())"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"# Exercise solutions"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. to 8."
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"See appendix A."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 9."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_Train a deep MLP on the MNIST dataset and see if you can get over 98% precision. Just like in the last exercise of chapter 9, try adding all the bells and whistles (i.e., save checkpoints, restore the last checkpoint in case of an interruption, add summaries, plot learning curves using TensorBoard, and so on)._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First let's create the deep net. It's exactly the same as earlier, with just one addition: we add a `tf.summary.scalar()` to track the loss and the accuracy during training, so we can view nice learning curves using TensorBoard."
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"n_inputs = 28*28 # MNIST\n",
"n_hidden1 = 300\n",
"n_hidden2 = 100\n",
"n_outputs = 10"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"reset_graph()\n",
"\n",
"X = tf.placeholder(tf.float32, shape=(None, n_inputs), name=\"X\")\n",
"y = tf.placeholder(tf.int32, shape=(None), name=\"y\") "
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"dnn\"):\n",
" hidden1 = tf.layers.dense(X, n_hidden1, name=\"hidden1\",\n",
" activation=tf.nn.relu)\n",
" hidden2 = tf.layers.dense(hidden1, n_hidden2, name=\"hidden2\",\n",
" activation=tf.nn.relu)\n",
" logits = tf.layers.dense(hidden2, n_outputs, name=\"outputs\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"loss\"):\n",
" xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)\n",
" loss = tf.reduce_mean(xentropy, name=\"loss\")\n",
" loss_summary = tf.summary.scalar('log_loss', loss)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
"learning_rate = 0.01\n",
"\n",
"with tf.name_scope(\"train\"):\n",
" optimizer = tf.train.GradientDescentOptimizer(learning_rate)\n",
" training_op = optimizer.minimize(loss)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"with tf.name_scope(\"eval\"):\n",
" correct = tf.nn.in_top_k(logits, y, 1)\n",
" accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))\n",
" accuracy_summary = tf.summary.scalar('accuracy', accuracy)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
"init = tf.global_variables_initializer()\n",
"saver = tf.train.Saver()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we need to define the directory to write the TensorBoard logs to:"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n",
"\n",
"def log_dir(prefix=\"\"):\n",
" now = datetime.utcnow().strftime(\"%Y%m%d%H%M%S\")\n",
" root_logdir = \"tf_logs\"\n",
" if prefix:\n",
" prefix += \"-\"\n",
" name = prefix + \"run-\" + now\n",
" return \"{}/{}/\".format(root_logdir, name)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"logdir = log_dir(\"mnist_dnn\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can create the `FileWriter` that we will use to write the TensorBoard logs:"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Hey! Why don't we implement early stopping? For this, we are going to need to use the validation set."
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"m, n = X_train.shape"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"n_epochs = 10001\n",
"batch_size = 50\n",
"n_batches = int(np.ceil(m / batch_size))\n",
"\n",
"checkpoint_path = \"/tmp/my_deep_mnist_model.ckpt\"\n",
"checkpoint_epoch_path = checkpoint_path + \".epoch\"\n",
"final_model_path = \"./my_deep_mnist_model\"\n",
"\n",
"best_loss = np.infty\n",
"epochs_without_progress = 0\n",
"max_epochs_without_progress = 50\n",
"\n",
"with tf.Session() as sess:\n",
" if os.path.isfile(checkpoint_epoch_path):\n",
" # if the checkpoint file exists, restore the model and load the epoch number\n",
" with open(checkpoint_epoch_path, \"rb\") as f:\n",
" start_epoch = int(f.read())\n",
" print(\"Training was interrupted. Continuing at epoch\", start_epoch)\n",
" saver.restore(sess, checkpoint_path)\n",
" else:\n",
" start_epoch = 0\n",
" sess.run(init)\n",
"\n",
" for epoch in range(start_epoch, n_epochs):\n",
" for X_batch, y_batch in shuffle_batch(X_train, y_train, batch_size):\n",
" sess.run(training_op, feed_dict={X: X_batch, y: y_batch})\n",
" accuracy_val, loss_val, accuracy_summary_str, loss_summary_str = sess.run([accuracy, loss, accuracy_summary, loss_summary], feed_dict={X: X_valid, y: y_valid})\n",
" file_writer.add_summary(accuracy_summary_str, epoch)\n",
" file_writer.add_summary(loss_summary_str, epoch)\n",
" if epoch % 5 == 0:\n",
" print(\"Epoch:\", epoch,\n",
" \"\\tValidation accuracy: {:.3f}%\".format(accuracy_val * 100),\n",
" \"\\tLoss: {:.5f}\".format(loss_val))\n",
" saver.save(sess, checkpoint_path)\n",
" with open(checkpoint_epoch_path, \"wb\") as f:\n",
" f.write(b\"%d\" % (epoch + 1))\n",
" if loss_val < best_loss:\n",
" saver.save(sess, final_model_path)\n",
" best_loss = loss_val\n",
" else:\n",
" epochs_without_progress += 5\n",
" if epochs_without_progress > max_epochs_without_progress:\n",
" print(\"Early stopping\")\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"os.remove(checkpoint_epoch_path)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"with tf.Session() as sess:\n",
" saver.restore(sess, final_model_path)\n",
" accuracy_val = accuracy.eval(feed_dict={X: X_test, y: y_test})"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"accuracy_val"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
},
"nav_menu": {
"height": "264px",
"width": "369px"
},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": false
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,947 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Appendix D Autodiff**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_This notebook contains toy implementations of various autodiff techniques, to explain how they works._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's make sure this notebook works well in both python 2 and 3:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# To support both python 2 and python 3\n",
"from __future__ import absolute_import, division, print_function, unicode_literals"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Introduction"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Suppose we want to compute the gradients of the function $f(x,y)=x^2y + y + 2$ with regards to the parameters x and y:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def f(x,y):\n",
" return x*x*y + y + 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"One approach is to solve this analytically:\n",
"\n",
"$\\dfrac{\\partial f}{\\partial x} = 2xy$\n",
"\n",
"$\\dfrac{\\partial f}{\\partial y} = x^2 + 1$"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def df(x,y):\n",
" return 2*x*y, x*x + 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So for example $\\dfrac{\\partial f}{\\partial x}(3,4) = 24$ and $\\dfrac{\\partial f}{\\partial y}(3,4) = 10$."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df(3, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perfect! We can also find the equations for the second order derivatives (also called Hessians):\n",
"\n",
"$\\dfrac{\\partial^2 f}{\\partial x \\partial x} = \\dfrac{\\partial (2xy)}{\\partial x} = 2y$\n",
"\n",
"$\\dfrac{\\partial^2 f}{\\partial x \\partial y} = \\dfrac{\\partial (2xy)}{\\partial y} = 2x$\n",
"\n",
"$\\dfrac{\\partial^2 f}{\\partial y \\partial x} = \\dfrac{\\partial (x^2 + 1)}{\\partial x} = 2x$\n",
"\n",
"$\\dfrac{\\partial^2 f}{\\partial y \\partial y} = \\dfrac{\\partial (x^2 + 1)}{\\partial y} = 0$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"At x=3 and y=4, these Hessians are respectively 8, 6, 6, 0. Let's use the equations above to compute them:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"def d2f(x, y):\n",
" return [2*y, 2*x], [2*x, 0]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"d2f(3, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perfect, but this requires some mathematical work. It is not too hard in this case, but for a deep neural network, it is pratically impossible to compute the derivatives this way. So let's look at various ways to automate this!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Numeric differentiation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we compute an approxiation of the gradients using the equation: $\\dfrac{\\partial f}{\\partial x} = \\displaystyle{\\lim_{\\epsilon \\to 0}}\\dfrac{f(x+\\epsilon, y) - f(x, y)}{\\epsilon}$ (and there is a similar definition for $\\dfrac{\\partial f}{\\partial y}$)."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"def gradients(func, vars_list, eps=0.0001):\n",
" partial_derivatives = []\n",
" base_func_eval = func(*vars_list)\n",
" for idx in range(len(vars_list)):\n",
" tweaked_vars = vars_list[:]\n",
" tweaked_vars[idx] += eps\n",
" tweaked_func_eval = func(*tweaked_vars)\n",
" derivative = (tweaked_func_eval - base_func_eval) / eps\n",
" partial_derivatives.append(derivative)\n",
" return partial_derivatives"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"def df(x, y):\n",
" return gradients(f, [x, y])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"df(3, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It works well!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The good news is that it is pretty easy to compute the Hessians. First let's create functions that compute the first order derivatives (also called Jacobians):"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"def dfdx(x, y):\n",
" return gradients(f, [x,y])[0]\n",
"\n",
"def dfdy(x, y):\n",
" return gradients(f, [x,y])[1]\n",
"\n",
"dfdx(3., 4.), dfdy(3., 4.)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can simply apply the `gradients()` function to these functions:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"def d2f(x, y):\n",
" return [gradients(dfdx, [3., 4.]), gradients(dfdy, [3., 4.])]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"d2f(3, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So everything works well, but the result is approximate, and computing the gradients of a function with regards to $n$ variables requires calling that function $n$ times. In deep neural nets, there are often thousands of parameters to tweak using gradient descent (which requires computing the gradients of the loss function with regards to each of these parameters), so this approach would be much too slow."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Implementing a Toy Computation Graph"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rather than this numerical approach, let's implement some symbolic autodiff techniques. For this, we will need to define classes to represent constants, variables and operations."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"class Const(object):\n",
" def __init__(self, value):\n",
" self.value = value\n",
" def evaluate(self):\n",
" return self.value\n",
" def __str__(self):\n",
" return str(self.value)\n",
"\n",
"class Var(object):\n",
" def __init__(self, name, init_value=0):\n",
" self.value = init_value\n",
" self.name = name\n",
" def evaluate(self):\n",
" return self.value\n",
" def __str__(self):\n",
" return self.name\n",
"\n",
"class BinaryOperator(object):\n",
" def __init__(self, a, b):\n",
" self.a = a\n",
" self.b = b\n",
"\n",
"class Add(BinaryOperator):\n",
" def evaluate(self):\n",
" return self.a.evaluate() + self.b.evaluate()\n",
" def __str__(self):\n",
" return \"{} + {}\".format(self.a, self.b)\n",
"\n",
"class Mul(BinaryOperator):\n",
" def evaluate(self):\n",
" return self.a.evaluate() * self.b.evaluate()\n",
" def __str__(self):\n",
" return \"({}) * ({})\".format(self.a, self.b)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Good, now we can build a computation graph to represent the function $f$:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"x = Var(\"x\")\n",
"y = Var(\"y\")\n",
"f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And we can run this graph to compute $f$ at any point, for example $f(3, 4)$."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"x.value = 3\n",
"y.value = 4\n",
"f.evaluate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perfect, it found the ultimate answer."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Computing gradients"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The autodiff methods we will present below are all based on the *chain rule*."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Suppose we have two functions $u$ and $v$, and we apply them sequentially to some input $x$, and we get the result $z$. So we have $z = v(u(x))$, which we can rewrite as $z = v(s)$ and $s = u(x)$. Now we can apply the chain rule to get the partial derivative of the output $z$ with regards to the input $x$:\n",
"\n",
"$ \\dfrac{\\partial z}{\\partial x} = \\dfrac{\\partial s}{\\partial x} \\cdot \\dfrac{\\partial z}{\\partial s}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now if $z$ is the output of a sequence of functions which have intermediate outputs $s_1, s_2, ..., s_n$, the chain rule still applies:\n",
"\n",
"$ \\dfrac{\\partial z}{\\partial x} = \\dfrac{\\partial s_1}{\\partial x} \\cdot \\dfrac{\\partial s_2}{\\partial s_1} \\cdot \\dfrac{\\partial s_3}{\\partial s_2} \\cdot \\dots \\cdot \\dfrac{\\partial s_{n-1}}{\\partial s_{n-2}} \\cdot \\dfrac{\\partial s_n}{\\partial s_{n-1}} \\cdot \\dfrac{\\partial z}{\\partial s_n}$"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In forward mode autodiff, the algorithm computes these terms \"forward\" (i.e., in the same order as the computations required to compute the output $z$), that is from left to right: first $\\dfrac{\\partial s_1}{\\partial x}$, then $\\dfrac{\\partial s_2}{\\partial s_1}$, and so on. In reverse mode autodiff, the algorithm computes these terms \"backwards\", from right to left: first $\\dfrac{\\partial z}{\\partial s_n}$, then $\\dfrac{\\partial s_n}{\\partial s_{n-1}}$, and so on.\n",
"\n",
"For example, suppose you want to compute the derivative of the function $z(x)=\\sin(x^2)$ at x=3, using forward mode autodiff. The algorithm would first compute the partial derivative $\\dfrac{\\partial s_1}{\\partial x}=\\dfrac{\\partial x^2}{\\partial x}=2x=6$. Next, it would compute $\\dfrac{\\partial z}{\\partial x}=\\dfrac{\\partial s_1}{\\partial x}\\cdot\\dfrac{\\partial z}{\\partial s_1}= 6 \\cdot \\dfrac{\\partial \\sin(s_1)}{\\partial s_1}=6 \\cdot \\cos(s_1)=6 \\cdot \\cos(3^2)\\approx-5.46$."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's verify this result using the `gradients()` function defined earlier:"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"from math import sin\n",
"\n",
"def z(x):\n",
" return sin(x**2)\n",
"\n",
"gradients(z, [3])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Look good. Now let's do the same thing using reverse mode autodiff. This time the algorithm would start from the right hand side so it would compute $\\dfrac{\\partial z}{\\partial s_1} = \\dfrac{\\partial \\sin(s_1)}{\\partial s_1}=\\cos(s_1)=\\cos(3^2)\\approx -0.91$. Next it would compute $\\dfrac{\\partial z}{\\partial x}=\\dfrac{\\partial s_1}{\\partial x}\\cdot\\dfrac{\\partial z}{\\partial s_1} \\approx \\dfrac{\\partial s_1}{\\partial x} \\cdot -0.91 = \\dfrac{\\partial x^2}{\\partial x} \\cdot -0.91=2x \\cdot -0.91 = 6\\cdot-0.91=-5.46$."
]
},
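{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick sanity check, we can reproduce these reverse mode steps numerically with plain Python (no autodiff involved):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from math import cos\n",
"\n",
"s1 = 3 ** 2            # forward pass: s1 = x² at x = 3\n",
"dz_ds1 = cos(s1)       # dz/ds1 = cos(s1) ≈ -0.91\n",
"ds1_dx = 2 * 3         # ds1/dx = 2x = 6\n",
"ds1_dx * dz_ds1        # dz/dx ≈ -5.46"
]
},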
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Of course both approaches give the same result (except for rounding errors), and with a single input and output they involve the same number of computations. But when there are several inputs or outputs, they can have very different performance. Indeed, if there are many inputs, the right-most terms will be needed to compute the partial derivatives with regards to each input, so it is a good idea to compute these right-most terms first. That means using reverse-mode autodiff. This way, the right-most terms can be computed just once and used to compute all the partial derivatives. Conversely, if there are many outputs, forward-mode is generally preferable because the left-most terms can be computed just once to compute the partial derivatives of the different outputs. In Deep Learning, there are typically thousands of model parameters, meaning there are lots of inputs, but few outputs. In fact, there is generally just one output during training: the loss. This is why reverse mode autodiff is used in TensorFlow and all major Deep Learning libraries."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"There's one additional complexity in reverse mode autodiff: the value of $s_i$ is generally required when computing $\\dfrac{\\partial s_{i+1}}{\\partial s_i}$, and computing $s_i$ requires first computing $s_{i-1}$, which requires computing $s_{i-2}$, and so on. So basically, a first pass forward through the network is required to compute $s_1$, $s_2$, $s_3$, $\\dots$, $s_{n-1}$ and $s_n$, and then the algorithm can compute the partial derivatives from right to left. Storing all the intermediate values $s_i$ in RAM is sometimes a problem, especially when handling images, and when using GPUs which often have limited RAM: to limit this problem, one can reduce the number of layers in the neural network, or configure TensorFlow to make it swap these values from GPU RAM to CPU RAM. Another approach is to only cache every other intermediate value, $s_1$, $s_3$, $s_5$, $\\dots$, $s_{n-4}$, $s_{n-2}$ and $s_n$. This means that when the algorithm computes the partial derivatives, if an intermediate value $s_i$ is missing, it will need to recompute it based on the previous intermediate value $s_{i-1}$. This trades off CPU for RAM (if you are interested, check out [this paper](https://pdfs.semanticscholar.org/f61e/9fd5a4878e1493f7a6b03774a61c17b7e9a4.pdf))."
]
},
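{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make that last idea more concrete, here is a tiny toy sketch (using a made-up chain of plain Python functions, nothing TensorFlow-specific) of caching only every other intermediate value and recomputing the missing ones on demand:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy chain: z = f4(f3(f2(f1(x)))), with made-up functions\n",
"funcs = [lambda s: s + 1, lambda s: s * 2, lambda s: s - 3, lambda s: s ** 2]\n",
"\n",
"def forward_partial_cache(x):\n",
"    cache = {0: x}                # s_0 = x\n",
"    s = x\n",
"    for i, f in enumerate(funcs, start=1):\n",
"        s = f(s)\n",
"        if i % 2 == 1:            # only keep s_1, s_3, ...\n",
"            cache[i] = s\n",
"    return s, cache\n",
"\n",
"def get_s(i, cache):\n",
"    if i not in cache:            # recompute s_i from s_{i-1} when missing\n",
"        cache[i] = funcs[i - 1](get_s(i - 1, cache))\n",
"    return cache[i]\n",
"\n",
"z, cache = forward_partial_cache(3.0)\n",
"z, get_s(2, cache), get_s(4, cache)"
]
},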
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Forward mode autodiff"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"Const.gradient = lambda self, var: Const(0)\n",
"Var.gradient = lambda self, var: Const(1) if self is var else Const(0)\n",
"Add.gradient = lambda self, var: Add(self.a.gradient(var), self.b.gradient(var))\n",
"Mul.gradient = lambda self, var: Add(Mul(self.a, self.b.gradient(var)), Mul(self.a.gradient(var), self.b))\n",
"\n",
"x = Var(name=\"x\", init_value=3.)\n",
"y = Var(name=\"y\", init_value=4.)\n",
"f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2\n",
"\n",
"dfdx = f.gradient(x) # 2xy\n",
"dfdy = f.gradient(y) # x² + 1"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"dfdx.evaluate(), dfdy.evaluate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since the output of the `gradient()` method is fully symbolic, we are not limited to the first order derivatives, we can also compute second order derivatives, and so on:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"d2fdxdx = dfdx.gradient(x) # 2y\n",
"d2fdxdy = dfdx.gradient(y) # 2x\n",
"d2fdydx = dfdy.gradient(x) # 2x\n",
"d2fdydy = dfdy.gradient(y) # 0"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"[[d2fdxdx.evaluate(), d2fdxdy.evaluate()],\n",
" [d2fdydx.evaluate(), d2fdydy.evaluate()]]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that the result is now exact, not an approximation (up to the limit of the machine's float precision, of course)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Forward mode autodiff using dual numbers"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A nice way to apply forward mode autodiff is to use [dual numbers](https://en.wikipedia.org/wiki/Dual_number). In short, a dual number $z$ has the form $z = a + b\\epsilon$, where $a$ and $b$ are real numbers, and $\\epsilon$ is an infinitesimal number, positive but smaller than all real numbers, and such that $\\epsilon^2=0$.\n",
"It can be shown that $f(x + \\epsilon) = f(x) + \\dfrac{\\partial f}{\\partial x}\\epsilon$, so simply by computing $f(x + \\epsilon)$ we get both the value of $f(x)$ and the partial derivative of $f$ with regards to $x$. "
]
},
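{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, taking $f(x) = x^2$, we get $f(x+\epsilon) = (x+\epsilon)^2 = x^2 + 2x\epsilon + \epsilon^2 = x^2 + 2x\epsilon$, which is indeed $f(x) + \dfrac{\partial f}{\partial x}\epsilon$."
]
},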
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dual numbers have their own arithmetic rules, which are generally quite natural. For example:\n",
"\n",
"**Addition**\n",
"\n",
"$(a_1 + b_1\\epsilon) + (a_2 + b_2\\epsilon) = (a_1 + a_2) + (b_1 + b_2)\\epsilon$\n",
"\n",
"**Subtraction**\n",
"\n",
"$(a_1 + b_1\\epsilon) - (a_2 + b_2\\epsilon) = (a_1 - a_2) + (b_1 - b_2)\\epsilon$\n",
"\n",
"**Multiplication**\n",
"\n",
"$(a_1 + b_1\\epsilon) \\times (a_2 + b_2\\epsilon) = (a_1 a_2) + (a_1 b_2 + a_2 b_1)\\epsilon + b_1 b_2\\epsilon^2 = (a_1 a_2) + (a_1b_2 + a_2b_1)\\epsilon$\n",
"\n",
"**Division**\n",
"\n",
"$\\dfrac{a_1 + b_1\\epsilon}{a_2 + b_2\\epsilon} = \\dfrac{a_1 + b_1\\epsilon}{a_2 + b_2\\epsilon} \\cdot \\dfrac{a_2 - b_2\\epsilon}{a_2 - b_2\\epsilon} = \\dfrac{a_1 a_2 + (b_1 a_2 - a_1 b_2)\\epsilon - b_1 b_2\\epsilon^2}{{a_2}^2 + (a_2 b_2 - a_2 b_2)\\epsilon - {b_2}^2\\epsilon} = \\dfrac{a_1}{a_2} + \\dfrac{a_1 b_2 - b_1 a_2}{{a_2}^2}\\epsilon$\n",
"\n",
"**Power**\n",
"\n",
"$(a + b\\epsilon)^n = a^n + (n a^{n-1}b)\\epsilon$\n",
"\n",
"etc."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create a class to represent dual numbers, and implement a few operations (addition and multiplication). You can try adding some more if you want."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"class DualNumber(object):\n",
" def __init__(self, value=0.0, eps=0.0):\n",
" self.value = value\n",
" self.eps = eps\n",
" def __add__(self, b):\n",
" return DualNumber(self.value + self.to_dual(b).value,\n",
" self.eps + self.to_dual(b).eps)\n",
" def __radd__(self, a):\n",
" return self.to_dual(a).__add__(self)\n",
" def __mul__(self, b):\n",
" return DualNumber(self.value * self.to_dual(b).value,\n",
" self.eps * self.to_dual(b).value + self.value * self.to_dual(b).eps)\n",
" def __rmul__(self, a):\n",
" return self.to_dual(a).__mul__(self)\n",
" def __str__(self):\n",
" if self.eps:\n",
" return \"{:.1f} + {:.1f}ε\".format(self.value, self.eps)\n",
" else:\n",
" return \"{:.1f}\".format(self.value)\n",
" def __repr__(self):\n",
" return str(self)\n",
" @classmethod\n",
" def to_dual(cls, n):\n",
" if hasattr(n, \"value\"):\n",
" return n\n",
" else:\n",
" return cls(n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$3 + (3 + 4 \\epsilon) = 6 + 4\\epsilon$"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"3 + DualNumber(3, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"$(3 + 4ε)\\times(5 + 7ε)$ = $3 \\times 5 + 3 \\times 7ε + 4ε \\times 5 + 4ε \\times 7ε$ = $15 + 21ε + 20ε + 28ε^2$ = $15 + 41ε + 28 \\times 0$ = $15 + 41ε$"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"DualNumber(3, 4) * DualNumber(5, 7)"
]
},
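{
"cell_type": "markdown",
"metadata": {},
"source": [
"The class above only implements addition and multiplication, but following the subtraction rule listed earlier, we could extend it in the same way (one possible sketch):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One way to add subtraction, following the rule:\n",
"# (a1 + b1ε) - (a2 + b2ε) = (a1 - a2) + (b1 - b2)ε\n",
"DualNumber.__sub__ = lambda self, b: DualNumber(\n",
"    self.value - DualNumber.to_dual(b).value,\n",
"    self.eps - DualNumber.to_dual(b).eps)\n",
"DualNumber.__rsub__ = lambda self, a: DualNumber.to_dual(a) - self\n",
"\n",
"DualNumber(3, 4) - DualNumber(5, 7)   # expected: -2.0 + -3.0ε"
]
},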
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's see if the dual numbers work with our toy computation framework:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"x.value = DualNumber(3.0)\n",
"y.value = DualNumber(4.0)\n",
"\n",
"f.evaluate()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Yep, sure works. Now let's use this to compute the partial derivatives of $f$ with regards to $x$ and $y$ at x=3 and y=4:"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"x.value = DualNumber(3.0, 1.0) # 3 + ε\n",
"y.value = DualNumber(4.0) # 4\n",
"\n",
"dfdx = f.evaluate().eps\n",
"\n",
"x.value = DualNumber(3.0) # 3\n",
"y.value = DualNumber(4.0, 1.0) # 4 + ε\n",
"\n",
"dfdy = f.evaluate().eps"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"dfdx"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"dfdy"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Great! However, in this implementation we are limited to first order derivatives.\n",
"Now let's look at reverse mode."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reverse mode autodiff"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's rewrite our toy framework to add reverse mode autodiff:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"class Const(object):\n",
" def __init__(self, value):\n",
" self.value = value\n",
" def evaluate(self):\n",
" return self.value\n",
" def backpropagate(self, gradient):\n",
" pass\n",
" def __str__(self):\n",
" return str(self.value)\n",
"\n",
"class Var(object):\n",
" def __init__(self, name, init_value=0):\n",
" self.value = init_value\n",
" self.name = name\n",
" self.gradient = 0\n",
" def evaluate(self):\n",
" return self.value\n",
" def backpropagate(self, gradient):\n",
" self.gradient += gradient\n",
" def __str__(self):\n",
" return self.name\n",
"\n",
"class BinaryOperator(object):\n",
" def __init__(self, a, b):\n",
" self.a = a\n",
" self.b = b\n",
"\n",
"class Add(BinaryOperator):\n",
" def evaluate(self):\n",
" self.value = self.a.evaluate() + self.b.evaluate()\n",
" return self.value\n",
" def backpropagate(self, gradient):\n",
" self.a.backpropagate(gradient)\n",
" self.b.backpropagate(gradient)\n",
" def __str__(self):\n",
" return \"{} + {}\".format(self.a, self.b)\n",
"\n",
"class Mul(BinaryOperator):\n",
" def evaluate(self):\n",
" self.value = self.a.evaluate() * self.b.evaluate()\n",
" return self.value\n",
" def backpropagate(self, gradient):\n",
" self.a.backpropagate(gradient * self.b.value)\n",
" self.b.backpropagate(gradient * self.a.value)\n",
" def __str__(self):\n",
" return \"({}) * ({})\".format(self.a, self.b)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"x = Var(\"x\", init_value=3)\n",
"y = Var(\"y\", init_value=4)\n",
"f = Add(Mul(Mul(x, x), y), Add(y, Const(2))) # f(x,y) = x²y + y + 2\n",
"\n",
"result = f.evaluate()\n",
"f.backpropagate(1.0)"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"print(f)"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"result"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
"x.gradient"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
"y.gradient"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Again, in this implementation the outputs are just numbers, not symbolic expressions, so we are limited to first order derivatives. However, we could have made the `backpropagate()` methods return symbolic expressions rather than values (e.g., return `Add(2,3)` rather than 5). This would make it possible to compute second order gradients (and beyond). This is what TensorFlow does, as do all the major libraries that implement autodiff."
]
},
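{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here is a toy sketch of that idea (just an illustration, not how TensorFlow actually does it): we monkey-patch `backpropagate()` so that it accumulates gradient *expressions* built from the same `Const`, `Add` and `Mul` classes, instead of plain numbers. The resulting gradient expression can then be backpropagated again to get second order derivatives."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"Const.backpropagate = lambda self, gradient: None\n",
"\n",
"def var_backpropagate(self, gradient):\n",
"    self.gradient = Add(self.gradient, gradient)   # accumulate symbolically\n",
"Var.backpropagate = var_backpropagate\n",
"\n",
"def add_backpropagate(self, gradient):\n",
"    self.a.backpropagate(gradient)\n",
"    self.b.backpropagate(gradient)\n",
"Add.backpropagate = add_backpropagate\n",
"\n",
"def mul_backpropagate(self, gradient):\n",
"    self.a.backpropagate(Mul(gradient, self.b))    # pass nodes, not values\n",
"    self.b.backpropagate(Mul(gradient, self.a))\n",
"Mul.backpropagate = mul_backpropagate\n",
"\n",
"x = Var(\"x\", init_value=3.)\n",
"y = Var(\"y\", init_value=4.)\n",
"x.gradient = Const(0)   # gradients are now expressions, so start from Const(0)\n",
"y.gradient = Const(0)\n",
"f = Add(Mul(Mul(x, x), y), Add(y, Const(2)))       # f(x,y) = x²y + y + 2\n",
"\n",
"f.backpropagate(Const(1))\n",
"dfdx = x.gradient                                  # symbolic expression for 2xy\n",
"print(dfdx.evaluate())                             # 24.0\n",
"\n",
"x.gradient = Const(0)                              # reset before the second pass\n",
"y.gradient = Const(0)\n",
"dfdx.backpropagate(Const(1))                       # differentiate the gradient itself\n",
"print(x.gradient.evaluate(), y.gradient.evaluate())  # d²f/dx² = 8.0, d²f/dydx = 6.0"
]
},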
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Reverse mode autodiff using TensorFlow"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"\n",
"x = tf.Variable(3., name=\"x\")\n",
"y = tf.Variable(4., name=\"y\")\n",
"f = x*x*y + y + 2\n",
"\n",
"jacobians = tf.gradients(f, [x, y])\n",
"\n",
"init = tf.global_variables_initializer()\n",
"\n",
"with tf.Session() as sess:\n",
" init.run()\n",
" f_val, jacobians_val = sess.run([f, jacobians])\n",
"\n",
"f_val, jacobians_val"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since everything is symbolic, we can compute second order derivatives, and beyond. However, when we compute the derivative of a tensor with regards to a variable that it does not depend on, instead of returning 0.0, the `gradients()` function returns None, which cannot be evaluated by `sess.run()`. So beware of `None` values. Here we just replace them with zero tensors."
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"hessians_x = tf.gradients(jacobians[0], [x, y])\n",
"hessians_y = tf.gradients(jacobians[1], [x, y])\n",
"\n",
"def replace_none_with_zero(tensors):\n",
" return [tensor if tensor is not None else tf.constant(0.)\n",
" for tensor in tensors]\n",
"\n",
"hessians_x = replace_none_with_zero(hessians_x)\n",
"hessians_y = replace_none_with_zero(hessians_y)\n",
"\n",
"init = tf.global_variables_initializer()\n",
"\n",
"with tf.Session() as sess:\n",
" init.run()\n",
" hessians_x_val, hessians_y_val = sess.run([hessians_x, hessians_y])\n",
"\n",
"hessians_x_val, hessians_y_val"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And that's all folks! Hope you enjoyed this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
},
"nav_menu": {
"height": "603px",
"width": "616px"
},
"toc": {
"navigate_menu": true,
"number_sections": true,
"sideBar": true,
"threshold": 6,
"toc_cell": false,
"toc_section_display": "block",
"toc_window_display": true
}
},
"nbformat": 4,
"nbformat_minor": 1
}

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -0,0 +1,844 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TensorFlow Reproducibility"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from __future__ import division, print_function, unicode_literals\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Checklist"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Do not run TensorFlow on the GPU.\n",
"2. Beware of multithreading, and make TensorFlow single-threaded.\n",
"3. Set all the random seeds.\n",
"4. Eliminate any other source of variability."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Do Not Run TensorFlow on the GPU"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Some operations (like `tf.reduce_sum()`) have favor performance over precision, and their outputs may vary slightly across runs. To get reproducible results, make sure TensorFlow runs on the CPU:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"os.environ[\"CUDA_VISIBLE_DEVICES\"]=\"\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Beware of Multithreading"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because floats have limited precision, the order of execution matters:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"2. * 5. / 7."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"2. / 7. * 5."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should make sure TensorFlow runs your ops on a single thread:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"config = tf.ConfigProto(intra_op_parallelism_threads=1,\n",
" inter_op_parallelism_threads=1)\n",
"\n",
"with tf.Session(config=config) as sess:\n",
" #... this will run single threaded\n",
" pass"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The thread pools for all sessions are created when you create the first session, so all sessions in the rest of this notebook will be single-threaded:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"with tf.Session() as sess:\n",
" #... also single-threaded!\n",
" pass"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set all the random seeds!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python's built-in `hash()` function"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"print(set(\"Try restarting the kernel and running this again\"))\n",
"print(set(\"Try restarting the kernel and running this again\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since Python 3.3, the result will be different every time, unless you start Python with the `PYTHONHASHSEED` environment variable set to `0`:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```shell\n",
"PYTHONHASHSEED=0 python\n",
"```\n",
"\n",
"```pycon\n",
">>> print(set(\"Now the output is stable across runs\"))\n",
"{'n', 'b', 'h', 'o', 'i', 'a', 'r', 't', 'p', 'N', 's', 'c', ' ', 'l', 'e', 'w', 'u'}\n",
">>> exit()\n",
"```\n",
"\n",
"```shell\n",
"PYTHONHASHSEED=0 python\n",
"```\n",
"```pycon\n",
">>> print(set(\"Now the output is stable across runs\"))\n",
"{'n', 'b', 'h', 'o', 'i', 'a', 'r', 't', 'p', 'N', 's', 'c', ' ', 'l', 'e', 'w', 'u'}\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, you could set this environment variable system-wide, but that's probably not a good idea, because this automatic randomization was [introduced for security reasons](http://ocert.org/advisories/ocert-2011-003.html)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unfortunately, setting the environment variable from within Python (e.g., using `os.environ[\"PYTHONHASHSEED\"]=\"0\"`) will not work, because Python reads it upon startup. For Jupyter notebooks, you have to start the Jupyter server like this:\n",
"\n",
"```shell\n",
"PYTHONHASHSEED=0 jupyter notebook\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"if os.environ.get(\"PYTHONHASHSEED\") != \"0\":\n",
" raise Exception(\"You must set PYTHONHASHSEED=0 when starting the Jupyter server to get reproducible results.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Python Random Number Generators (RNGs)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import random\n",
"\n",
"random.seed(42)\n",
"print(random.random())\n",
"print(random.random())\n",
"\n",
"print()\n",
"\n",
"random.seed(42)\n",
"print(random.random())\n",
"print(random.random())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### NumPy RNGs"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"np.random.seed(42)\n",
"print(np.random.rand())\n",
"print(np.random.rand())\n",
"\n",
"print()\n",
"\n",
"np.random.seed(42)\n",
"print(np.random.rand())\n",
"print(np.random.rand())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### TensorFlow RNGs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TensorFlow's behavior is more complex because of two things:\n",
"* you create a graph, and then you execute it. The random seed must be set before you create the random operations.\n",
"* there are two seeds: one at the graph level, and one at the individual random operation level."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"tf.set_random_seed(42)\n",
"rnd = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Every time you reset the graph, you need to set the seed again:"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"\n",
"tf.set_random_seed(42)\n",
"rnd = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you create your own graph, it will ignore the default graph's seed:"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"tf.set_random_seed(42)\n",
"\n",
"graph = tf.Graph()\n",
"with graph.as_default():\n",
" rnd = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session(graph=graph):\n",
" print(rnd.eval())\n",
" print(rnd.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session(graph=graph):\n",
" print(rnd.eval())\n",
" print(rnd.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You must set its own seed:"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"graph = tf.Graph()\n",
"with graph.as_default():\n",
" tf.set_random_seed(42)\n",
" rnd = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session(graph=graph):\n",
" print(rnd.eval())\n",
" print(rnd.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session(graph=graph):\n",
" print(rnd.eval())\n",
" print(rnd.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you set the seed after the random operation is created, the seed has no effet:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"\n",
"rnd = tf.random_uniform(shape=[])\n",
"\n",
"tf.set_random_seed(42) # BAD, NO EFFECT!\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())\n",
"\n",
"print()\n",
"\n",
"tf.set_random_seed(42) # BAD, NO EFFECT!\n",
"with tf.Session() as sess:\n",
" print(rnd.eval())\n",
" print(rnd.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### A note about operation seeds"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also set a seed for each individual random operation. When you do, it is combined with the graph seed into the final seed used by that op. The following table summarizes how this works:\n",
"\n",
"| Graph seed | Op seed | Resulting seed |\n",
"|------------|---------|--------------------------------|\n",
"| None | None | Random |\n",
"| graph_seed | None | f(graph_seed, op_index) |\n",
"| None | op_seed | f(default_graph_seed, op_seed) |\n",
"| graph_seed | op_seed | f(graph_seed, op_seed) |\n",
"\n",
"* `f()` is a deterministic function.\n",
"* `op_index = graph._last_id` when there is a graph seed, different random ops without op seeds will have different outputs. However, each of them will have the same sequence of outputs at every run.\n",
"\n",
"In eager mode, there is a global seed instead of graph seed (since there is no graph in eager mode)."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"\n",
"rnd1 = tf.random_uniform(shape=[], seed=42)\n",
"rnd2 = tf.random_uniform(shape=[], seed=42)\n",
"rnd3 = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the following example, you may think that all random ops will have the same random seed, but `rnd3` will actually have a different seed:"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"tf.reset_default_graph()\n",
"\n",
"tf.set_random_seed(42)\n",
"\n",
"rnd1 = tf.random_uniform(shape=[], seed=42)\n",
"rnd2 = tf.random_uniform(shape=[], seed=42)\n",
"rnd3 = tf.random_uniform(shape=[])\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
"\n",
"print()\n",
"\n",
"with tf.Session() as sess:\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())\n",
" print(rnd1.eval())\n",
" print(rnd2.eval())\n",
" print(rnd3.eval())"
]
},
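  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As noted above, eager mode has a single global seed rather than a graph seed. Here is a minimal sketch (it assumes TensorFlow 1.x with eager execution available, and it must run in a fresh Python process, since eager execution cannot be enabled once a graph has been used):\n",
    "\n",
    "```python\n",
    "import tensorflow as tf\n",
    "\n",
    "tf.enable_eager_execution()\n",
    "tf.set_random_seed(42)  # global seed (there is no graph in eager mode)\n",
    "\n",
    "print(tf.random_uniform(shape=[]).numpy())\n",
    "print(tf.random_uniform(shape=[]).numpy())\n",
    "```\n",
    "\n",
    "You should get the same two numbers every time you run this script."
   ]
  },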
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Estimators API"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Tip**: in a Jupyter notebook, you probably want to set the random seeds regularly so that you can come back and run the notebook from there (instead of from the beginning) and still get reproducible outputs."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"np.random.seed(42)\n",
"tf.set_random_seed(42)"
]
},
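  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To avoid repeating these three calls, you could wrap them in a small helper and call it at the top of every section you want to be able to rerun on its own (the name `reset_seeds()` is just for illustration, it is not part of any library):\n",
    "\n",
    "```python\n",
    "import random\n",
    "import numpy as np\n",
    "import tensorflow as tf\n",
    "\n",
    "def reset_seeds(seed=42):\n",
    "    # seed Python's, NumPy's and TensorFlow's global RNGs in one call\n",
    "    random.seed(seed)\n",
    "    np.random.seed(seed)\n",
    "    tf.set_random_seed(seed)\n",
    "\n",
    "reset_seeds(42)\n",
    "```"
   ]
  },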
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you use the Estimators API, make sure to create a `RunConfig` and set its `tf_random_seed`, then pass it to the constructor of your estimator:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"my_config = tf.estimator.RunConfig(tf_random_seed=42)\n",
"\n",
"feature_cols = [tf.feature_column.numeric_column(\"X\", shape=[28 * 28])]\n",
"dnn_clf = tf.estimator.DNNClassifier(hidden_units=[300, 100], n_classes=10,\n",
" feature_columns=feature_cols,\n",
" config=my_config)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try it on MNIST:"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()\n",
"X_train = X_train.astype(np.float32).reshape(-1, 28*28) / 255.0\n",
"y_train = y_train.astype(np.int32)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Unfortunately, the `numpy_input_fn` does not allow us to set the seed when `shuffle=True`, so we must shuffle the data ourself and set `shuffle=False`."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"indices = np.random.permutation(len(X_train))\n",
"X_train_shuffled = X_train[indices]\n",
"y_train_shuffled = y_train[indices]\n",
"\n",
"input_fn = tf.estimator.inputs.numpy_input_fn(\n",
" x={\"X\": X_train_shuffled}, y=y_train_shuffled, num_epochs=10, batch_size=32, shuffle=False)\n",
"dnn_clf.train(input_fn=input_fn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The final loss should be exactly 0.46282205."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Instead of using the `numpy_input_fn()` function (which cannot reproducibly shuffle the dataset at each epoch), you can create your own input function using the Data API and set its shuffling seed:"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def create_dataset(X, y=None, n_epochs=1, batch_size=32,\n",
" buffer_size=1000, seed=None):\n",
" dataset = tf.data.Dataset.from_tensor_slices(({\"X\": X}, y))\n",
" dataset = dataset.repeat(n_epochs)\n",
" dataset = dataset.shuffle(buffer_size, seed=seed)\n",
" return dataset.batch(batch_size)\n",
"\n",
"input_fn=lambda: create_dataset(X_train, y_train, seed=42)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"random.seed(42)\n",
"np.random.seed(42)\n",
"tf.set_random_seed(42)\n",
"\n",
"my_config = tf.estimator.RunConfig(tf_random_seed=42)\n",
"\n",
"feature_cols = [tf.feature_column.numeric_column(\"X\", shape=[28 * 28])]\n",
"dnn_clf = tf.estimator.DNNClassifier(hidden_units=[300, 100], n_classes=10,\n",
" feature_columns=feature_cols,\n",
" config=my_config)\n",
"dnn_clf.train(input_fn=input_fn)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The final loss should be exactly 1.0556093."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Keras API"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you use the Keras API, all you need to do is set the random seed any time you clear the session:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"keras.backend.clear_session()\n",
"\n",
"random.seed(42)\n",
"np.random.seed(42)\n",
"tf.set_random_seed(42)\n",
"\n",
"model = keras.models.Sequential([\n",
" keras.layers.Dense(300, activation=\"relu\"),\n",
" keras.layers.Dense(100, activation=\"relu\"),\n",
" keras.layers.Dense(10, activation=\"softmax\"),\n",
"])\n",
"model.compile(loss=\"sparse_categorical_crossentropy\", optimizer=\"sgd\",\n",
" metrics=[\"accuracy\"])\n",
"model.fit(X_train, y_train, epochs=10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should get exactly 97.16% accuracy on the training set at the end of training."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Eliminate other sources of variability"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"For example, `os.listdir()` returns file names in an order that depends on how the files were indexed by the file system:"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"for i in range(10):\n",
" with open(\"my_test_foo_{}\".format(i), \"w\"):\n",
" pass\n",
"\n",
"[f for f in os.listdir() if f.startswith(\"my_test_foo_\")]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"for i in range(10):\n",
" with open(\"my_test_bar_{}\".format(i), \"w\"):\n",
" pass\n",
"\n",
"[f for f in os.listdir() if f.startswith(\"my_test_bar_\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You should sort the file names before you use them:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"filenames = os.listdir()\n",
"filenames.sort()"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"[f for f in filenames if f.startswith(\"my_test_foo_\")]"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"for f in os.listdir():\n",
" if f.startswith(\"my_test_foo_\") or f.startswith(\"my_test_bar_\"):\n",
" os.remove(f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I hope you enjoyed this notebook. If you do not get reproducible results, or if they are different than mine, then please [file an issue](https://github.com/ageron/handson-ml/issues) on github, specifying what version of Python, TensorFlow, and NumPy you are using, as well as your O.S. version. Thank you!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"If you want to learn more about Deep Learning and TensorFlow, check out my book [Hands-On Machine Learning with Scitkit-Learn and TensorFlow](http://homl.info/amazon), O'Reilly. You can also follow me on twitter [@aureliengeron](https://twitter.com/aureliengeron) or watch my videos on YouTube at [youtube.com/c/AurelienGeron](https://www.youtube.com/c/AurelienGeron)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}