Mirror of https://github.com/ArthurDanjou/handson-ml3.git (synced 2026-01-14 12:14:36 +01:00)
Large change: replace os.path with pathlib, move to Python 3.7
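The change is mechanical throughout the diff: `os.path.join` becomes the `/` operator on `pathlib.Path` objects, `os.makedirs(..., exist_ok=True)` becomes `Path.mkdir(parents=True, exist_ok=True)`, and string concatenation of filenames becomes f-strings. A minimal standalone sketch of the correspondence (illustrative names, not code from the commit):

import os
from pathlib import Path

# Before: compose and create a directory with os.path
images_old = os.path.join(".", "images", "data")
os.makedirs(images_old, exist_ok=True)
file_old = os.path.join(images_old, "plot" + "." + "png")

# After: the same with pathlib; "/" joins path segments
images_new = Path() / "images" / "data"
images_new.mkdir(parents=True, exist_ok=True)  # parents=True creates "images" too
file_new = images_new / "plot.png"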
@@ -4,8 +4,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"**Chapter 13 – Loading and Preprocessing Data with TensorFlow**\n",
-"\n",
+"**Chapter 13 – Loading and Preprocessing Data with TensorFlow**"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
 "_This notebook contains all the sample code and solutions to the exercises in chapter 13._"
 ]
 },
@@ -34,7 +39,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0."
+"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
 ]
 },
 {
@@ -43,33 +48,36 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Python ≥3.5 is required\n",
+"# Python ≥3.7 is required\n",
 "import sys\n",
-"assert sys.version_info >= (3, 5)\n",
+"assert sys.version_info >= (3, 7)\n",
 "\n",
 "# Is this notebook running on Colab or Kaggle?\n",
 "IS_COLAB = \"google.colab\" in sys.modules\n",
 "IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n",
 "\n",
 "if IS_COLAB or IS_KAGGLE:\n",
-"    %pip install -q -U tfx==0.21.2\n",
+"    %pip install -q -U tfx\n",
 "    print(\"You can safely ignore the package incompatibility errors.\")\n",
 "\n",
-"# Scikit-Learn ≥0.20 is required\n",
-"import sklearn\n",
-"assert sklearn.__version__ >= \"0.20\"\n",
-"\n",
-"# TensorFlow ≥2.0 is required\n",
-"import tensorflow as tf\n",
-"from tensorflow import keras\n",
-"assert tf.__version__ >= \"2.0\"\n",
-"\n",
 "# Common imports\n",
 "import numpy as np\n",
-"import os\n",
+"from pathlib import Path\n",
+"\n",
+"# Scikit-Learn ≥1.0 is required\n",
+"import sklearn\n",
+"assert sklearn.__version__ >= \"1.0\"\n",
+"\n",
+"# TensorFlow ≥2.6 is required\n",
+"import tensorflow as tf\n",
+"assert tf.__version__ >= \"2.6\"\n",
+"\n",
+"# Load the Jupyter extension for TensorBoard\n",
+"%load_ext tensorboard\n",
+"\n",
 "# to make this notebook's output stable across runs\n",
 "np.random.seed(42)\n",
 "tf.random.set_seed(42)\n",
 "\n",
 "# To plot pretty figures\n",
 "%matplotlib inline\n",
@@ -80,14 +88,11 @@
 "mpl.rc('ytick', labelsize=12)\n",
 "\n",
 "# Where to save the figures\n",
-"PROJECT_ROOT_DIR = \".\"\n",
-"CHAPTER_ID = \"data\"\n",
-"IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
-"os.makedirs(IMAGES_PATH, exist_ok=True)\n",
+"IMAGES_PATH = Path() / \"images\" / \"data\"\n",
+"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
 "\n",
 "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
-"    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
-"    print(\"Saving figure\", fig_id)\n",
+"    path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
 "    if tight_layout:\n",
 "        plt.tight_layout()\n",
 "    plt.savefig(path, format=fig_extension, dpi=resolution)"
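For readers new to pathlib: `Path()` is the current directory, dividing a `Path` by a string appends a segment, and `mkdir` with `exist_ok=True` is a no-op when the directory already exists, so the new `IMAGES_PATH` plays exactly the role of the old `os.path.join` version. A quick sketch (illustrative values, not notebook code):

from pathlib import Path

IMAGES_PATH = Path() / "images" / "data"        # Path() is "."
IMAGES_PATH.mkdir(parents=True, exist_ok=True)  # creates both levels, idempotent
fig_id, fig_extension = "my_plot", "png"        # example arguments to save_fig()
print(IMAGES_PATH / f"{fig_id}.{fig_extension}")  # images/data/my_plot.png on POSIX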
@@ -264,9 +269,9 @@
 "outputs": [],
 "source": [
 "def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
-"    housing_dir = os.path.join(\"datasets\", \"housing\")\n",
-"    os.makedirs(housing_dir, exist_ok=True)\n",
-"    path_format = os.path.join(housing_dir, \"my_{}_{:02d}.csv\")\n",
+"    housing_dir = Path() / \"datasets\" / \"housing\"\n",
+"    housing_dir.mkdir(parents=True, exist_ok=True)\n",
+"    path_format = housing_dir / \"my_{}_{:02d}.csv\"\n",
 "\n",
 "    filepaths = []\n",
 "    m = len(data)\n",
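One subtlety in this hunk: `path_format` is now a `Path`, and `Path` objects have no `.format()` method, so the code that fills in the `{}` placeholders (below the context shown here) presumably has to go through `str()` first. A hedged sketch of that pattern, with `name_prefix` and `file_idx` as illustrative values:

from pathlib import Path

housing_dir = Path() / "datasets" / "housing"
path_format = housing_dir / "my_{}_{:02d}.csv"

# Path has no .format(), so render the template as a string first
name_prefix, file_idx = "train", 0
part_csv = str(path_format).format(name_prefix, file_idx)
print(part_csv)  # datasets/housing/my_train_00.csv on POSIX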
@@ -1431,21 +1436,23 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import os\n",
+"from pathlib import Path\n",
 "import tarfile\n",
 "import urllib.request\n",
+"import pandas as pd\n",
 "\n",
-"DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
-"HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n",
-"HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n",
-"\n",
-"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
-"    os.makedirs(housing_path, exist_ok=True)\n",
-"    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
-"    urllib.request.urlretrieve(housing_url, tgz_path)\n",
-"    housing_tgz = tarfile.open(tgz_path)\n",
-"    housing_tgz.extractall(path=housing_path)\n",
-"    housing_tgz.close()"
+"def load_housing_data():\n",
+"    housing_path = Path() / \"datasets\" / \"housing\"\n",
+"    if not (housing_path / \"housing.csv\").is_file():\n",
+"        housing_path.mkdir(parents=True, exist_ok=True)\n",
+"        root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
+"        url = root + \"datasets/housing/housing.tgz\"\n",
+"        tgz_path = housing_path / \"housing.tgz\"\n",
+"        urllib.request.urlretrieve(url, tgz_path)\n",
+"        housing_tgz = tarfile.open(tgz_path)\n",
+"        housing_tgz.extractall(path=housing_path)\n",
+"        housing_tgz.close()\n",
+"    return pd.read_csv(housing_path / \"housing.csv\")"
 ]
 },
 {
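The new `load_housing_data` folds the old fetch/load pair into one lazy helper: it downloads and extracts the tarball only when `housing.csv` is missing, then always returns the DataFrame. A functionally equivalent variant (not the committed code) that uses a `with` block so the archive is closed even if extraction raises:

import tarfile
import urllib.request
from pathlib import Path

import pandas as pd

def load_housing_data():
    housing_path = Path() / "datasets" / "housing"
    if not (housing_path / "housing.csv").is_file():
        housing_path.mkdir(parents=True, exist_ok=True)
        root = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
        tgz_path = housing_path / "housing.tgz"
        urllib.request.urlretrieve(root + "datasets/housing/housing.tgz", tgz_path)
        with tarfile.open(tgz_path) as housing_tgz:  # closed even on error
            housing_tgz.extractall(path=housing_path)
    return pd.read_csv(housing_path / "housing.csv")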
@@ -1453,31 +1460,9 @@
 "execution_count": 86,
 "metadata": {},
 "outputs": [],
 "source": [
-"fetch_housing_data()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 87,
-"metadata": {},
-"outputs": [],
-"source": [
-"import pandas as pd\n",
-"\n",
-"def load_housing_data(housing_path=HOUSING_PATH):\n",
-"    csv_path = os.path.join(housing_path, \"housing.csv\")\n",
-"    return pd.read_csv(csv_path)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 88,
-"metadata": {},
-"outputs": [],
-"source": [
 "housing = load_housing_data()\n",
 "housing.head()"
 ]
 },
 {
@@ -2104,8 +2089,8 @@
 "outputs": [],
 "source": [
 "from datetime import datetime\n",
-"logs = os.path.join(os.curdir, \"my_logs\",\n",
-"                    \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
 "\n",
+"logs = Path() / \"my_logs\" / (\"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
+"\n",
 "tensorboard_cb = tf.keras.callbacks.TensorBoard(\n",
 "    log_dir=logs, histogram_freq=1, profile_batch=10)\n",
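Note on the new `logs` line: `/` binds tighter than `+` in Python, so the parentheses matter — without them the expression would evaluate `Path() / "my_logs" / "run_"` first and then attempt `Path + str`, which raises a `TypeError`. The two-step equivalent makes the order explicit:

from datetime import datetime
from pathlib import Path

run_id = "run_" + datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g. run_20211231_235959
logs = Path() / "my_logs" / run_id                          # my_logs/run_20211231_235959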
@@ -2144,33 +2129,56 @@
 },
 {
 "cell_type": "code",
-"execution_count": 131,
+"execution_count": 71,
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
-"\n",
-"DOWNLOAD_ROOT = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
-"FILENAME = \"aclImdb_v1.tar.gz\"\n",
-"filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)\n",
-"path = Path(filepath).parent / \"aclImdb\"\n",
+"root = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
+"filename = \"aclImdb_v1.tar.gz\"\n",
+"filepath = keras.utils.get_file(filename, root + filename, extract=True)\n",
+"path = Path(filepath).with_name(\"aclImdb\")\n",
 "path"
 ]
 },
 {
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Let's define a `tree()` function to view the structure of the `aclImdb` directory:"
+]
+},
+{
 "cell_type": "code",
-"execution_count": 132,
+"execution_count": 76,
 "metadata": {},
 "outputs": [],
 "source": [
-"for name, subdirs, files in os.walk(path):\n",
-"    indent = len(Path(name).parts) - len(path.parts)\n",
-"    print(\"    \" * indent + Path(name).parts[-1] + os.sep)\n",
-"    for index, filename in enumerate(sorted(files)):\n",
-"        if index == 3:\n",
-"            print(\"    \" * (indent + 1) + \"...\")\n",
-"            break\n",
-"        print(\"    \" * (indent + 1) + filename)"
+"def tree(path, level=0, indent=4, max_files=3):\n",
+"    if level == 0:\n",
+"        print(f\"{path}/\")\n",
+"        level += 1\n",
+"    sub_paths = sorted(path.iterdir())\n",
+"    sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]\n",
+"    filepaths = [sub_path for sub_path in sub_paths if not sub_path in sub_dirs]\n",
+"    indent_str = \" \" * indent * level\n",
+"    for sub_dir in sub_dirs:\n",
+"        print(f\"{indent_str}{sub_dir.name}/\")\n",
+"        tree(sub_dir, level + 1, indent)\n",
+"    for filepath in filepaths[:max_files]:\n",
+"        print(f\"{indent_str}{filepath.name}\")\n",
+"    if len(filepaths) > max_files:\n",
+"        print(f\"{indent_str}...\")"
 ]
 },
 {
+"cell_type": "code",
+"execution_count": 77,
+"metadata": {},
+"outputs": [],
+"source": [
+"tree(path)"
+]
+},
+{
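To see the shape of `tree()`'s output without downloading the IMDb dataset: it prints one entry per line, directories first with a trailing `/`, recursing into each, then at most `max_files` files per directory before an ellipsis. A throwaway demo on a hypothetical layout (assumes the `tree()` defined above is in scope):

from pathlib import Path
import tempfile

base = Path(tempfile.mkdtemp()) / "data"
(base / "sub").mkdir(parents=True)
(base / "sub" / "a.txt").touch()
(base / "b.txt").touch()
(base / "c.txt").touch()

tree(base)
# Prints:
#   <tmpdir>/data/
#       sub/
#           a.txt
#       b.txt
#       c.txt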
@@ -2771,7 +2779,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },