Large change: replace os.path with pathlib, move to Python 3.7

This commit is contained in:
Aurélien Geron
2021-10-15 21:46:27 +13:00
parent 1b16a81fe5
commit fa1ae51184
19 changed files with 969 additions and 1066 deletions

View File

@@ -4,8 +4,13 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Chapter 13 Loading and Preprocessing Data with TensorFlow**\n",
"\n",
"**Chapter 13 Loading and Preprocessing Data with TensorFlow**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"_This notebook contains all the sample code and solutions to the exercises in chapter 13._"
]
},
@@ -34,7 +39,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0."
"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
]
},
{
@@ -43,33 +48,36 @@
"metadata": {},
"outputs": [],
"source": [
"# Python ≥3.5 is required\n",
"# Python ≥3.7 is required\n",
"import sys\n",
"assert sys.version_info >= (3, 5)\n",
"assert sys.version_info >= (3, 7)\n",
"\n",
"# Is this notebook running on Colab or Kaggle?\n",
"IS_COLAB = \"google.colab\" in sys.modules\n",
"IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n",
"\n",
"if IS_COLAB or IS_KAGGLE:\n",
" %pip install -q -U tfx==0.21.2\n",
" %pip install -q -U tfx\n",
" print(\"You can safely ignore the package incompatibility errors.\")\n",
"\n",
"# Scikit-Learn ≥0.20 is required\n",
"import sklearn\n",
"assert sklearn.__version__ >= \"0.20\"\n",
"\n",
"# TensorFlow ≥2.0 is required\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"assert tf.__version__ >= \"2.0\"\n",
"\n",
"# Common imports\n",
"import numpy as np\n",
"import os\n",
"from pathlib import Path\n",
"\n",
"# Scikit-Learn ≥1.0 is required\n",
"import sklearn\n",
"assert sklearn.__version__ >= \"1.0\"\n",
"\n",
"# TensorFlow ≥2.6 is required\n",
"import tensorflow as tf\n",
"assert tf.__version__ >= \"2.6\"\n",
"\n",
"# Load the Jupyter extension for TensorBoard\n",
"%load_ext tensorboard\n",
"\n",
"# to make this notebook's output stable across runs\n",
"np.random.seed(42)\n",
"tf.random.set_seed(42)\n",
"\n",
"# To plot pretty figures\n",
"%matplotlib inline\n",
@@ -80,14 +88,11 @@
"mpl.rc('ytick', labelsize=12)\n",
"\n",
"# Where to save the figures\n",
"PROJECT_ROOT_DIR = \".\"\n",
"CHAPTER_ID = \"data\"\n",
"IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
"os.makedirs(IMAGES_PATH, exist_ok=True)\n",
"IMAGES_PATH = Path() / \"images\" / \"data\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
"\n",
"def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
" path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
" print(\"Saving figure\", fig_id)\n",
" path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
" if tight_layout:\n",
" plt.tight_layout()\n",
" plt.savefig(path, format=fig_extension, dpi=resolution)"
@@ -264,9 +269,9 @@
"outputs": [],
"source": [
"def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
" housing_dir = os.path.join(\"datasets\", \"housing\")\n",
" os.makedirs(housing_dir, exist_ok=True)\n",
" path_format = os.path.join(housing_dir, \"my_{}_{:02d}.csv\")\n",
" housing_dir = Path() / \"datasets\" / \"housing\"\n",
" housing_dir.mkdir(parents=True, exist_ok=True)\n",
" path_format = str(housing_dir / \"my_{}_{:02d}.csv\")\n",
"\n",
" filepaths = []\n",
" m = len(data)\n",
@@ -1431,21 +1436,23 @@
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pathlib import Path\n",
"import tarfile\n",
"import urllib.request\n",
"import pandas as pd\n",
"\n",
"DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
"HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n",
"HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n",
"\n",
"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
" os.makedirs(housing_path, exist_ok=True)\n",
" tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
" urllib.request.urlretrieve(housing_url, tgz_path)\n",
" housing_tgz = tarfile.open(tgz_path)\n",
" housing_tgz.extractall(path=housing_path)\n",
" housing_tgz.close()"
"def load_housing_data():\n",
" housing_path = Path() / \"datasets\" / \"housing\"\n",
" if not (housing_path / \"housing.csv\").is_file():\n",
" housing_path.mkdir(parents=True, exist_ok=True)\n",
" root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
" url = root + \"datasets/housing/housing.tgz\"\n",
" tgz_path = housing_path / \"housing.tgz\"\n",
" urllib.request.urlretrieve(url, tgz_path)\n",
" housing_tgz = tarfile.open(tgz_path)\n",
" housing_tgz.extractall(path=housing_path)\n",
" housing_tgz.close()\n",
" return pd.read_csv(housing_path / \"housing.csv\")"
]
},
{
@@ -1453,31 +1460,9 @@
"execution_count": 86,
"metadata": {},
"outputs": [],
"source": [
"fetch_housing_data()"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"def load_housing_data(housing_path=HOUSING_PATH):\n",
" csv_path = os.path.join(housing_path, \"housing.csv\")\n",
" return pd.read_csv(csv_path)"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [],
"source": [
"housing = load_housing_data()\n",
"housing.head()"
"housing = load_housing_data()\n",
"housing.head()"
]
},
{
@@ -2104,8 +2089,8 @@
"outputs": [],
"source": [
"from datetime import datetime\n",
"logs = os.path.join(os.curdir, \"my_logs\",\n",
" \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
"\n",
"logs = Path() / \"my_logs\" / (\"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
"\n",
"tensorboard_cb = tf.keras.callbacks.TensorBoard(\n",
" log_dir=logs, histogram_freq=1, profile_batch=10)\n",
@@ -2144,33 +2129,56 @@
},
{
"cell_type": "code",
"execution_count": 131,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"DOWNLOAD_ROOT = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
"FILENAME = \"aclImdb_v1.tar.gz\"\n",
"filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)\n",
"path = Path(filepath).parent / \"aclImdb\"\n",
"root = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
"filename = \"aclImdb_v1.tar.gz\"\n",
"filepath = keras.utils.get_file(filename, root + filename, extract=True)\n",
"path = Path(filepath).with_name(\"aclImdb\")\n",
"path"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's define a `tree()` function to view the structure of the `aclImdb` directory:"
]
},
{
"cell_type": "code",
"execution_count": 132,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"for name, subdirs, files in os.walk(path):\n",
" indent = len(Path(name).parts) - len(path.parts)\n",
" print(\" \" * indent + Path(name).parts[-1] + os.sep)\n",
" for index, filename in enumerate(sorted(files)):\n",
" if index == 3:\n",
" print(\" \" * (indent + 1) + \"...\")\n",
" break\n",
" print(\" \" * (indent + 1) + filename)"
"def tree(path, level=0, indent=4, max_files=3):\n",
" if level == 0:\n",
" print(f\"{path}/\")\n",
" level += 1\n",
" sub_paths = sorted(path.iterdir())\n",
" sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]\n",
" filepaths = [sub_path for sub_path in sub_paths if not sub_path in sub_dirs]\n",
" indent_str = \" \" * indent * level\n",
" for sub_dir in sub_dirs:\n",
" print(f\"{indent_str}{sub_dir.name}/\")\n",
" tree(sub_dir, level + 1, indent)\n",
" for filepath in filepaths[:max_files]:\n",
" print(f\"{indent_str}{filepath.name}\")\n",
" if len(filepaths) > max_files:\n",
" print(f\"{indent_str}...\")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"tree(path)"
]
},
{
@@ -2771,7 +2779,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},