From 9dafb0189353a2013b071befcccecf0aca1494f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Fri, 20 Mar 2020 23:22:47 +1300 Subject: [PATCH] Add the coding exercise solutions for chapter 13 --- 13_loading_and_preprocessing_data.ipynb | 1098 +++++++++++++++++++++-- 1 file changed, 1016 insertions(+), 82 deletions(-) diff --git a/13_loading_and_preprocessing_data.ipynb b/13_loading_and_preprocessing_data.ipynb index 12257f1..7f252ce 100644 --- a/13_loading_and_preprocessing_data.ipynb +++ b/13_loading_and_preprocessing_data.ipynb @@ -51,7 +51,7 @@ "try:\n", " # %tensorflow_version only exists in Colab.\n", " %tensorflow_version 2.x\n", - " !pip install -q -U tfx==0.15.0rc0\n", + " !pip install -q -U tfx==0.21.2\n", " print(\"You can safely ignore the package incompatibility errors.\")\n", "except Exception:\n", " pass\n", @@ -174,7 +174,8 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = dataset.apply(tf.data.experimental.unbatch())" + "#dataset = dataset.apply(tf.data.experimental.unbatch()) # Now deprecated\n", + "dataset = dataset.unbatch()" ] }, { @@ -202,6 +203,8 @@ "metadata": {}, "outputs": [], "source": [ + "tf.random.set_seed(42)\n", + "\n", "dataset = tf.data.Dataset.range(10).repeat(3)\n", "dataset = dataset.shuffle(buffer_size=3, seed=42).batch(7)\n", "for item in dataset:\n", @@ -513,6 +516,8 @@ "metadata": {}, "outputs": [], "source": [ + "tf.random.set_seed(42)\n", + "\n", "train_set = csv_reader_dataset(train_filepaths, batch_size=3)\n", "for X_batch, y_batch in train_set.take(2):\n", " print(\"X =\", X_batch)\n", @@ -537,6 +542,10 @@ "metadata": {}, "outputs": [], "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)\n", + "\n", "model = keras.models.Sequential([\n", " keras.layers.Dense(30, activation=\"relu\", input_shape=X_train.shape[1:]),\n", " keras.layers.Dense(1),\n", @@ -615,6 +624,17 @@ "execution_count": 37, "metadata": {}, "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], "source": [ "optimizer = keras.optimizers.Nadam(lr=0.01)\n", "loss_fn = keras.losses.mean_squared_error\n", @@ -638,7 +658,18 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -677,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 41, "metadata": { "scrolled": true }, @@ -706,7 +737,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ @@ -724,7 +755,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -743,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -760,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -772,7 +803,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -805,7 +836,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 47, 
"metadata": {}, "outputs": [], "source": [ @@ -827,7 +858,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -836,7 +867,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 49, "metadata": {}, "outputs": [], "source": [ @@ -845,7 +876,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -857,7 +888,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ @@ -866,7 +897,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -875,7 +906,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -884,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -893,7 +924,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -903,7 +934,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -913,7 +944,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -936,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 58, "metadata": {}, "outputs": [], "source": [ @@ -993,14 +1024,19 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: there's currently a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details." 
+ ] + }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 59, "metadata": {}, "outputs": [], "source": [ - "# WARNING: there's currently a bug preventing \"from tensorflow.train import X\"\n", - "# so we work around it by writing \"X = tf.train.X\"\n", "#from tensorflow.train import BytesList, FloatList, Int64List\n", "#from tensorflow.train import Feature, Features, Example\n", "BytesList = tf.train.BytesList\n", @@ -1024,7 +1060,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 60, "metadata": {}, "outputs": [], "source": [ @@ -1040,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 61, "metadata": {}, "outputs": [], "source": [ @@ -1049,7 +1085,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 62, "metadata": { "scrolled": true }, @@ -1060,7 +1096,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 63, "metadata": {}, "outputs": [], "source": [ @@ -1069,7 +1105,7 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -1078,7 +1114,7 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ @@ -1094,7 +1130,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ @@ -1109,7 +1145,7 @@ }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ @@ -1122,7 +1158,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 68, "metadata": {}, "outputs": [], "source": [ @@ -1140,7 +1176,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -1149,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ @@ -1175,7 +1211,7 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 71, "metadata": {}, "outputs": [], "source": [ @@ -1186,7 +1222,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -1195,7 +1231,7 @@ }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 73, "metadata": {}, "outputs": [], "source": [ @@ -1205,7 +1241,7 @@ }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -1214,7 +1250,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 75, "metadata": {}, "outputs": [], "source": [ @@ -1226,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -1256,14 +1292,19 @@ "```" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning**: there's currently a bug preventing `from tensorflow.train import X` so we work around it by writing `X = tf.train.X`. See https://github.com/tensorflow/tensorflow/issues/33289 for more details." 
+ ] + }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ - "# WARNING: there's currently a bug preventing \"from tensorflow.train import X\"\n", - "# so we work around it by writing \"X = tf.train.X\"\n", "#from tensorflow.train import FeatureList, FeatureLists, SequenceExample\n", "FeatureList = tf.train.FeatureList\n", "FeatureLists = tf.train.FeatureLists\n", @@ -1297,7 +1338,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ @@ -1306,7 +1347,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -1315,7 +1356,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 80, "metadata": {}, "outputs": [], "source": [ @@ -1335,7 +1376,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 81, "metadata": {}, "outputs": [], "source": [ @@ -1344,7 +1385,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ @@ -1353,7 +1394,7 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 83, "metadata": {}, "outputs": [], "source": [ @@ -1362,7 +1403,7 @@ }, { "cell_type": "code", - "execution_count": 82, + "execution_count": 84, "metadata": {}, "outputs": [], "source": [ @@ -1385,7 +1426,7 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 85, "metadata": {}, "outputs": [], "source": [ @@ -1408,7 +1449,7 @@ }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 86, "metadata": {}, "outputs": [], "source": [ @@ -1417,7 +1458,7 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 87, "metadata": {}, "outputs": [], "source": [ @@ -1430,7 +1471,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 88, "metadata": {}, "outputs": [], "source": [ @@ -1440,7 +1481,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 89, "metadata": {}, "outputs": [], "source": [ @@ -1449,7 +1490,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 90, "metadata": {}, "outputs": [], "source": [ @@ -1460,7 +1501,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 91, "metadata": {}, "outputs": [], "source": [ @@ -1471,7 +1512,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 92, "metadata": {}, "outputs": [], "source": [ @@ -1480,7 +1521,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 93, "metadata": {}, "outputs": [], "source": [ @@ -1491,7 +1532,7 @@ }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 94, "metadata": {}, "outputs": [], "source": [ @@ -1500,7 +1541,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ @@ -1512,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 96, "metadata": {}, "outputs": [], "source": [ @@ -1524,7 +1565,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 97, "metadata": {}, "outputs": [], "source": [ @@ -1540,7 +1581,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ @@ -1549,7 +1590,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -1566,7 +1607,7 
@@ }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 100, "metadata": {}, "outputs": [], "source": [ @@ -1575,7 +1616,7 @@ }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ @@ -1586,7 +1627,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ @@ -1601,7 +1642,18 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, "metadata": {}, "outputs": [], "source": [ @@ -1624,7 +1676,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ @@ -1641,7 +1693,7 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 106, "metadata": {}, "outputs": [], "source": [ @@ -1662,7 +1714,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 107, "metadata": {}, "outputs": [], "source": [ @@ -1691,7 +1743,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 108, "metadata": {}, "outputs": [], "source": [ @@ -1703,7 +1755,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 109, "metadata": {}, "outputs": [], "source": [ @@ -1712,7 +1764,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 110, "metadata": {}, "outputs": [], "source": [ @@ -1733,7 +1785,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 111, "metadata": {}, "outputs": [], "source": [ @@ -1749,7 +1801,18 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 112, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ @@ -1774,7 +1837,18 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 114, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, "metadata": {}, "outputs": [], "source": [ @@ -1793,7 +1867,7 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 116, "metadata": {}, "outputs": [], "source": [ @@ -1803,13 +1877,873 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 117, "metadata": {}, "outputs": [], "source": [ "embeddings" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Exercises\n", + "\n", + "## 1. to 8.\n", + "See Appendix A\n", + "\n", + "## 9.\n", + "### a.\n", + "_Exercise: Load the Fashion MNIST dataset (introduced in Chapter 10); split it into a training set, a validation set, and a test set; shuffle the training set; and save each dataset to multiple TFRecord files. Each record should be a serialized `Example` protobuf with two features: the serialized image (use `tf.io.serialize_tensor()` to serialize each image), and the label. Note: for large images, you could use `tf.io.encode_jpeg()` instead. 
This would save a lot of space, but it would lose a bit of image quality._" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()\n", + "X_valid, X_train = X_train_full[:5000], X_train_full[5000:]\n", + "y_valid, y_train = y_train_full[:5000], y_train_full[5000:]" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "np.random.seed(42)\n", + "tf.random.set_seed(42)" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train))\n", + "valid_set = tf.data.Dataset.from_tensor_slices((X_valid, y_valid))\n", + "test_set = tf.data.Dataset.from_tensor_slices((X_test, y_test))" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [], + "source": [ + "def create_example(image, label):\n", + " image_data = tf.io.serialize_tensor(image)\n", + " #image_data = tf.io.encode_jpeg(image[..., np.newaxis])\n", + " return Example(\n", + " features=Features(\n", + " feature={\n", + " \"image\": Feature(bytes_list=BytesList(value=[image_data.numpy()])),\n", + " \"label\": Feature(int64_list=Int64List(value=[label])),\n", + " }))" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [], + "source": [ + "for image, label in valid_set.take(1):\n", + " print(create_example(image, label))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following function saves a given dataset to a set of TFRecord files. The examples are written to the files in a round-robin fashion. To do this, we enumerate all the examples using the `dataset.enumerate()` method, and we compute `index % n_shards` to decide which file to write to. We use the standard `contextlib.ExitStack` class to make sure that all writers are properly closed whether or not an I/O error occurs while writing." + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [], + "source": [ + "from contextlib import ExitStack\n", + "\n", + "def write_tfrecords(name, dataset, n_shards=10):\n", + " paths = [\"{}.tfrecord-{:05d}-of-{:05d}\".format(name, index, n_shards)\n", + " for index in range(n_shards)]\n", + " with ExitStack() as stack:\n", + " writers = [stack.enter_context(tf.io.TFRecordWriter(path))\n", + " for path in paths]\n", + " for index, (image, label) in dataset.enumerate():\n", + " shard = index % n_shards\n", + " example = create_example(image, label)\n", + " writers[shard].write(example.SerializeToString())\n", + " return paths" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "train_filepaths = write_tfrecords(\"my_fashion_mnist.train\", train_set)\n", + "valid_filepaths = write_tfrecords(\"my_fashion_mnist.valid\", valid_set)\n", + "test_filepaths = write_tfrecords(\"my_fashion_mnist.test\", test_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b.\n", + "_Exercise: Then use tf.data to create an efficient dataset for each set. Finally, use a Keras model to train these datasets, including a preprocessing layer to standardize each input feature. 
Try to make the input pipeline as efficient as possible, using TensorBoard to visualize profiling data._" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(tfrecord):\n", + " feature_descriptions = {\n", + " \"image\": tf.io.FixedLenFeature([], tf.string, default_value=\"\"),\n", + " \"label\": tf.io.FixedLenFeature([], tf.int64, default_value=-1)\n", + " }\n", + " example = tf.io.parse_single_example(tfrecord, feature_descriptions)\n", + " image = tf.io.parse_tensor(example[\"image\"], out_type=tf.uint8)\n", + " #image = tf.io.decode_jpeg(example[\"image\"])\n", + " image = tf.reshape(image, shape=[28, 28])\n", + " return image, example[\"label\"]\n", + "\n", + "def mnist_dataset(filepaths, n_read_threads=5, shuffle_buffer_size=None,\n", + " n_parse_threads=5, batch_size=32, cache=True):\n", + " dataset = tf.data.TFRecordDataset(filepaths,\n", + " num_parallel_reads=n_read_threads)\n", + " if cache:\n", + " dataset = dataset.cache()\n", + " if shuffle_buffer_size:\n", + " dataset = dataset.shuffle(shuffle_buffer_size)\n", + " dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)\n", + " dataset = dataset.batch(batch_size)\n", + " return dataset.prefetch(1)" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "train_set = mnist_dataset(train_filepaths, shuffle_buffer_size=60000)\n", + "valid_set = mnist_dataset(valid_filepaths)\n", + "test_set = mnist_dataset(test_filepaths)" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [], + "source": [ + "for X, y in train_set.take(1):\n", + " for i in range(5):\n", + " plt.subplot(1, 5, i + 1)\n", + " plt.imshow(X[i].numpy(), cmap=\"binary\")\n", + " plt.axis(\"off\")\n", + " plt.title(str(y[i].numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "keras.backend.clear_session()\n", + "tf.random.set_seed(42)\n", + "np.random.seed(42)\n", + "\n", + "class Standardization(keras.layers.Layer):\n", + " def adapt(self, data_sample):\n", + " self.means_ = np.mean(data_sample, axis=0, keepdims=True)\n", + " self.stds_ = np.std(data_sample, axis=0, keepdims=True)\n", + " def call(self, inputs):\n", + " return (inputs - self.means_) / (self.stds_ + keras.backend.epsilon())\n", + "\n", + "standardization = Standardization(input_shape=[28, 28])\n", + "# or perhaps soon:\n", + "#standardization = keras.layers.Normalization()\n", + "\n", + "sample_image_batches = train_set.take(100).map(lambda image, label: image)\n", + "sample_images = np.concatenate(list(sample_image_batches.as_numpy_iterator()),\n", + " axis=0).astype(np.float32)\n", + "standardization.adapt(sample_images)\n", + "\n", + "model = keras.models.Sequential([\n", + " standardization,\n", + " keras.layers.Flatten(),\n", + " keras.layers.Dense(100, activation=\"relu\"),\n", + " keras.layers.Dense(10, activation=\"softmax\")\n", + "])\n", + "model.compile(loss=\"sparse_categorical_crossentropy\",\n", + " optimizer=\"nadam\", metrics=[\"accuracy\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "logs = os.path.join(os.curdir, \"my_logs\",\n", + " \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n", + "\n", + "tensorboard_cb = tf.keras.callbacks.TensorBoard(\n", + " log_dir=logs, histogram_freq=1, profile_batch=10)\n", +
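"\n", + "# Note: profile_batch=10 makes the callback profile the 10th training batch; the profiling tab\n", + "# needs TF 2.2+ and the tensorboard_plugin_profile package (see the warning below).\n", +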
"model.fit(train_set, epochs=5, validation_data=valid_set,\n", + " callbacks=[tensorboard_cb])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning:** The profiling tab in TensorBoard works if you use TensorFlow 2.2+. You also need to make sure `tensorboard_plugin_profile` is installed (and restart Jupyter if necessary)." + ] + }, + { + "cell_type": "code", + "execution_count": 162, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext tensorboard\n", + "%tensorboard --logdir=./my_logs --port=6006" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 10.\n", + "_Exercise: In this exercise you will download a dataset, split it, create a `tf.data.Dataset` to load it and preprocess it efficiently, then build and train a binary classification model containing an `Embedding` layer._\n", + "\n", + "### a.\n", + "_Exercise: Download the [Large Movie Review Dataset](https://homl.info/imdb), which contains 50,000 movies reviews from the [Internet Movie Database](https://imdb.com/). The data is organized in two directories, `train` and `test`, each containing a `pos` subdirectory with 12,500 positive reviews and a `neg` subdirectory with 12,500 negative reviews. Each review is stored in a separate text file. There are other files and folders (including preprocessed bag-of-words), but we will ignore them in this exercise._" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "\n", + "DOWNLOAD_ROOT = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n", + "FILENAME = \"aclImdb_v1.tar.gz\"\n", + "filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)\n", + "path = Path(filepath).parent / \"aclImdb\"\n", + "path" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "metadata": {}, + "outputs": [], + "source": [ + "for name, subdirs, files in os.walk(path):\n", + " indent = len(Path(name).parts) - len(path.parts)\n", + " print(\" \" * indent + Path(name).parts[-1] + os.sep)\n", + " for index, filename in enumerate(sorted(files)):\n", + " if index == 3:\n", + " print(\" \" * (indent + 1) + \"...\")\n", + " break\n", + " print(\" \" * (indent + 1) + filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "def review_paths(dirpath):\n", + " return [str(path) for path in dirpath.glob(\"*.txt\")]\n", + "\n", + "train_pos = review_paths(path / \"train\" / \"pos\")\n", + "train_neg = review_paths(path / \"train\" / \"neg\")\n", + "test_valid_pos = review_paths(path / \"test\" / \"pos\")\n", + "test_valid_neg = review_paths(path / \"test\" / \"neg\")\n", + "\n", + "len(train_pos), len(train_neg), len(test_valid_pos), len(test_valid_neg)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### b.\n", + "_Exercise: Split the test set into a validation set (15,000) and a test set (10,000)._" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "metadata": {}, + "outputs": [], + "source": [ + "np.random.shuffle(test_valid_pos)\n", + "\n", + "test_pos = test_valid_pos[:5000]\n", + "test_neg = test_valid_neg[:5000]\n", + "valid_pos = test_valid_pos[5000:]\n", + "valid_neg = test_valid_neg[5000:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### c.\n", + "_Exercise: Use tf.data to create an efficient dataset for each set._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"Since the dataset fits in memory, we can just load all the data using pure Python code and use `tf.data.Dataset.from_tensor_slices()`:" + ] + }, + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [], + "source": [ + "def imdb_dataset(filepaths_positive, filepaths_negative):\n", + " reviews = []\n", + " labels = []\n", + " for filepaths, label in ((filepaths_negative, 0), (filepaths_positive, 1)):\n", + " for filepath in filepaths:\n", + " with open(filepath) as review_file:\n", + " reviews.append(review_file.read())\n", + " labels.append(label)\n", + " return tf.data.Dataset.from_tensor_slices(\n", + " (tf.constant(reviews), tf.constant(labels)))" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "for X, y in imdb_dataset(train_pos, train_neg).take(3):\n", + " print(X)\n", + " print(y)\n", + " print()" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It takes about 20 seconds to load the dataset and go through it 10 times." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But let's pretend the dataset does not fit in memory, just to make things more interesting. Luckily, each review fits on just one line (they use `
<br />` to indicate line breaks), so we can read the reviews using a `TextLineDataset`. If they didn't, we would have to preprocess the input files (e.g., converting them to TFRecords). For very large datasets, it would make sense to use a tool like Apache Beam for that." + ] + }, + { + "cell_type": "code", + "execution_count": 138, + "metadata": {}, + "outputs": [], + "source": [ + "def imdb_dataset(filepaths_positive, filepaths_negative, n_read_threads=5):\n", + " dataset_neg = tf.data.TextLineDataset(filepaths_negative,\n", + " num_parallel_reads=n_read_threads)\n", + " dataset_neg = dataset_neg.map(lambda review: (review, 0))\n", + " dataset_pos = tf.data.TextLineDataset(filepaths_positive,\n", + " num_parallel_reads=n_read_threads)\n", + " dataset_pos = dataset_pos.map(lambda review: (review, 1))\n", + " return tf.data.Dataset.concatenate(dataset_pos, dataset_neg)" + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).repeat(10): pass" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now it takes about 34 seconds to go through the dataset 10 times. That's much slower, essentially because the dataset is not cached in RAM, so it must be reloaded at each epoch. If you add `.cache()` just before `.repeat(10)`, you will see that this implementation will be about as fast as the previous one." + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": {}, + "outputs": [], + "source": [ + "%timeit -r1 for X, y in imdb_dataset(train_pos, train_neg).cache().repeat(10): pass" + ] + }, + { + "cell_type": "code", + "execution_count": 141, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 32\n", + "\n", + "train_set = imdb_dataset(train_pos, train_neg).shuffle(25000).batch(batch_size).prefetch(1)\n", + "valid_set = imdb_dataset(valid_pos, valid_neg).batch(batch_size).prefetch(1)\n", + "test_set = imdb_dataset(test_pos, test_neg).batch(batch_size).prefetch(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### d.\n", + "_Exercise: Create a binary classification model, using a `TextVectorization` layer to preprocess each review. If the `TextVectorization` layer is not yet available (or if you like a challenge), try to create your own custom preprocessing layer: you can use the functions in the `tf.strings` package, for example `lower()` to make everything lowercase, `regex_replace()` to replace punctuation with spaces, and `split()` to split words on spaces. You should use a lookup table to output word indices, which must be prepared in the `adapt()` method._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's first write a function to preprocess the reviews, cropping them to 300 characters, converting them to lower case, then replacing `
<br />` and all non-letter characters with spaces, splitting the reviews into words, and finally padding or cropping each review so it ends up with exactly `n_words` tokens:" + ] + }, + { + "cell_type": "code", + "execution_count": 142, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess(X_batch, n_words=50):\n", + " shape = tf.shape(X_batch) * tf.constant([1, 0]) + tf.constant([0, n_words])\n", + " Z = tf.strings.substr(X_batch, 0, 300)\n", + " Z = tf.strings.lower(Z)\n", + " Z = tf.strings.regex_replace(Z, b\"<br\\\\s*/?>\", b\" \")\n", + " Z = tf.strings.regex_replace(Z, b\"[^a-z]\", b\" \")\n", + " Z = tf.strings.split(Z)\n", + " return Z.to_tensor(shape=shape, default_value=b\"<pad>\")\n", + "\n", + "X_example = tf.constant([\"It's a great, great movie! I loved it.\", \"It was terrible, run away!!!\"])\n", + "preprocess(X_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's write a second utility function that will take a data sample with the same format as the output of the `preprocess()` function, and will output the list of the top `max_size` most frequent words, ensuring that the padding token is first:" + ] + }, + { + "cell_type": "code", + "execution_count": 143, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import Counter\n", + "\n", + "def get_vocabulary(data_sample, max_size=1000):\n", + " preprocessed_reviews = preprocess(data_sample).numpy()\n", + " counter = Counter()\n", + " for words in preprocessed_reviews:\n", + " for word in words:\n", + " if word != b\"<pad>\":\n", + " counter[word] += 1\n", + " return [b\"<pad>\"] + [word for word, count in counter.most_common(max_size)]\n", + "\n", + "get_vocabulary(X_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we are ready to create the `TextVectorization` layer. Its constructor just saves the hyperparameters (`max_vocabulary_size` and `n_oov_buckets`). The `adapt()` method computes the vocabulary using the `get_vocabulary()` function, then it builds a `StaticVocabularyTable` (see Chapter 16 for more details).
The `call()` method preprocesses the reviews to get a padded list of words for each review, then it uses the `StaticVocabularyTable` to look up the index of each word in the vocabulary:" + ] + }, + { + "cell_type": "code", + "execution_count": 144, + "metadata": {}, + "outputs": [], + "source": [ + "class TextVectorization(keras.layers.Layer):\n", + " def __init__(self, max_vocabulary_size=1000, n_oov_buckets=100, dtype=tf.string, **kwargs):\n", + " super().__init__(dtype=dtype, **kwargs)\n", + " self.max_vocabulary_size = max_vocabulary_size\n", + " self.n_oov_buckets = n_oov_buckets\n", + "\n", + " def adapt(self, data_sample):\n", + " self.vocab = get_vocabulary(data_sample, self.max_vocabulary_size)\n", + " words = tf.constant(self.vocab)\n", + " word_ids = tf.range(len(self.vocab), dtype=tf.int64)\n", + " vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)\n", + " self.table = tf.lookup.StaticVocabularyTable(vocab_init, self.n_oov_buckets)\n", + " \n", + " def call(self, inputs):\n", + " preprocessed_inputs = preprocess(inputs)\n", + " return self.table.lookup(preprocessed_inputs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it on our small `X_example` we defined earlier:" + ] + }, + { + "cell_type": "code", + "execution_count": 145, + "metadata": {}, + "outputs": [], + "source": [ + "text_vectorization = TextVectorization()\n", + "\n", + "text_vectorization.adapt(X_example)\n", + "text_vectorization(X_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looks good! As you can see, each review was cleaned up and tokenized, then each word was encoded as its index in the vocabulary (all the 0s correspond to the `<pad>` tokens).\n", + "\n", + "Now let's create another `TextVectorization` layer and let's adapt it to the full IMDB training set (if the training set did not fit in RAM, we could just use a smaller sample of the training set by calling `train_set.take(500)`):" + ] + }, + { + "cell_type": "code", + "execution_count": 146, + "metadata": {}, + "outputs": [], + "source": [ + "max_vocabulary_size = 1000\n", + "n_oov_buckets = 100\n", + "\n", + "sample_review_batches = train_set.map(lambda review, label: review)\n", + "sample_reviews = np.concatenate(list(sample_review_batches.as_numpy_iterator()),\n", + " axis=0)\n", + "\n", + "text_vectorization = TextVectorization(max_vocabulary_size, n_oov_buckets,\n", + " input_shape=[])\n", + "text_vectorization.adapt(sample_reviews)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run it on the same `X_example`, just to make sure the word IDs are larger now, since the vocabulary is bigger:" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": {}, + "outputs": [], + "source": [ + "text_vectorization(X_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Good! Now let's take a look at the first 10 words in the vocabulary:" + ] + }, + { + "cell_type": "code", + "execution_count": 148, + "metadata": {}, + "outputs": [], + "source": [ + "text_vectorization.vocab[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These are the most common words in the reviews." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now to build our model we will need to encode all these word IDs somehow.
One approach is to create bags of words: for each review, and for each word in the vocabulary, we count the number of occurrences of that word in the review. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "metadata": {}, + "outputs": [], + "source": [ + "simple_example = tf.constant([[1, 3, 1, 0, 0], [2, 2, 0, 0, 0]])\n", + "tf.reduce_sum(tf.one_hot(simple_example, 4), axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The first review has 2 times the word 0, 2 times the word 1, 0 times the word 2, and 1 time the word 3, so its bag-of-words representation is `[2, 2, 0, 1]`. Similarly, the second review has 3 times the word 0, 0 times the word 1, and so on. Let's wrap this logic in a small custom layer, and let's test it. We'll drop the counts for the word 0, since this corresponds to the `<pad>` token, which we don't care about." + ] + }, + { + "cell_type": "code", + "execution_count": 150, + "metadata": {}, + "outputs": [], + "source": [ + "class BagOfWords(keras.layers.Layer):\n", + " def __init__(self, n_tokens, dtype=tf.int32, **kwargs):\n", + " super().__init__(dtype=tf.int32, **kwargs)\n", + " self.n_tokens = n_tokens\n", + " def call(self, inputs):\n", + " one_hot = tf.one_hot(inputs, self.n_tokens)\n", + " return tf.reduce_sum(one_hot, axis=1)[:, 1:]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test it:" + ] + }, + { + "cell_type": "code", + "execution_count": 151, + "metadata": {}, + "outputs": [], + "source": [ + "bag_of_words = BagOfWords(n_tokens=4)\n", + "bag_of_words(simple_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It works fine! Now let's create another `BagOfWords` with the right vocabulary size for our training set:" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "metadata": {}, + "outputs": [], + "source": [ + "n_tokens = max_vocabulary_size + n_oov_buckets + 1 # add 1 for <pad>\n", + "bag_of_words = BagOfWords(n_tokens)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We're ready to train the model!" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "metadata": {}, + "outputs": [], + "source": [ + "model = keras.models.Sequential([\n", + " text_vectorization,\n", + " bag_of_words,\n", + " keras.layers.Dense(100, activation=\"relu\"),\n", + " keras.layers.Dense(1, activation=\"sigmoid\"),\n", + "])\n", + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\",\n", + " metrics=[\"accuracy\"])\n", + "model.fit(train_set, epochs=5, validation_data=valid_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We get about 75% accuracy on the validation set after just the first epoch, but after that the model makes no progress. We will do better in Chapter 16. For now the point is just to perform efficient preprocessing using `tf.data` and Keras preprocessing layers." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### e.\n", + "_Exercise: Add an `Embedding` layer and compute the mean embedding for each review, multiplied by the square root of the number of words (see Chapter 16).
This rescaled mean embedding can then be passed to the rest of your model._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To compute the mean embedding for each review, and multiply it by the square root of the number of words in that review, we will need a little function:" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "metadata": {}, + "outputs": [], + "source": [ + "def compute_mean_embedding(inputs):\n", + " not_pad = tf.math.count_nonzero(inputs, axis=-1)\n", + " n_words = tf.math.count_nonzero(not_pad, axis=-1, keepdims=True) \n", + " sqrt_n_words = tf.math.sqrt(tf.cast(n_words, tf.float32))\n", + " return tf.reduce_mean(inputs, axis=1) * sqrt_n_words\n", + "\n", + "another_example = tf.constant([[[1., 2., 3.], [4., 5., 0.], [0., 0., 0.]],\n", + " [[6., 0., 0.], [0., 0., 0.], [0., 0., 0.]]])\n", + "compute_mean_embedding(another_example)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that this is correct. The first review contains 2 words (the last token is a zero vector, which represents the `<pad>` token). The second review contains 1 word. So we need to compute the mean embedding for each review, and multiply the first one by the square root of 2, and the second one by the square root of 1:" + ] + }, + { + "cell_type": "code", + "execution_count": 155, + "metadata": {}, + "outputs": [], + "source": [ + "tf.reduce_mean(another_example, axis=1) * tf.sqrt([[2.], [1.]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perfect. Now we're ready to train our final model. It's the same as before, except we replaced the `BagOfWords` layer with an `Embedding` layer followed by a `Lambda` layer that calls the `compute_mean_embedding` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 156, + "metadata": {}, + "outputs": [], + "source": [ + "embedding_size = 20\n", + "\n", + "model = keras.models.Sequential([\n", + " text_vectorization,\n", + " keras.layers.Embedding(input_dim=n_tokens,\n", + " output_dim=embedding_size,\n", + " mask_zero=True), # <pad> tokens => zero vectors\n", + " keras.layers.Lambda(compute_mean_embedding),\n", + " keras.layers.Dense(100, activation=\"relu\"),\n", + " keras.layers.Dense(1, activation=\"sigmoid\"),\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### f.\n", + "_Exercise: Train the model and see what accuracy you get. Try to optimize your pipelines to make training as fast as possible._" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "metadata": {}, + "outputs": [], + "source": [ + "model.compile(loss=\"binary_crossentropy\", optimizer=\"nadam\", metrics=[\"accuracy\"])\n", + "model.fit(train_set, epochs=5, validation_data=valid_set)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model is no better with embeddings (but we will do better in Chapter 16). The pipeline looks fast enough (we optimized it earlier)."
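, + "\n", + "\n", + "One quick way to sanity-check the input pipeline on its own (a rough sketch, not part of the original solution) is to time a full pass over `train_set` without any training:\n", + "\n", + "```python\n", + "import time\n", + "\n", + "start = time.time()\n", + "for X_batch, y_batch in train_set:  # runs the full tf.data pipeline, no model\n", + "    pass\n", + "print(\"One pass over the training pipeline took {:.1f}s\".format(time.time() - start))\n", + "```"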
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### g.\n", + "_Exercise: Use TFDS to load the same dataset more easily: `tfds.load(\"imdb_reviews\")`._" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow_datasets as tfds\n", + "\n", + "datasets = tfds.load(name=\"imdb_reviews\")\n", + "train_set, test_set = datasets[\"train\"], datasets[\"test\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "metadata": {}, + "outputs": [], + "source": [ + "for example in train_set.take(1):\n", + " print(example[\"text\"])\n", + " print(example[\"label\"])" + ] + }, { "cell_type": "code", "execution_count": null, @@ -1834,7 +2768,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.3" + "version": "3.7.6" }, "nav_menu": { "height": "264px",