Mirror of https://github.com/ArthurDanjou/handson-ml3.git (synced 2026-01-14 12:14:36 +01:00)
Large change: replace os.path with pathlib, move to Python 3.7
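The change is mechanical throughout the diff: `os.path.join` becomes the `/` operator on `pathlib.Path` objects, `os.makedirs(..., exist_ok=True)` becomes `Path.mkdir(parents=True, exist_ok=True)`, and string concatenation of filenames becomes f-strings. A minimal standalone sketch of the correspondence (illustrative names, not code from the commit):

import os
from pathlib import Path

# Before: compose and create a directory with os.path
images_old = os.path.join(".", "images", "data")
os.makedirs(images_old, exist_ok=True)
file_old = os.path.join(images_old, "plot" + "." + "png")

# After: the same with pathlib; "/" joins path segments
images_new = Path() / "images" / "data"
images_new.mkdir(parents=True, exist_ok=True)  # parents=True creates "images" too
file_new = images_new / "plot.png"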
@@ -4,8 +4,13 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"**Chapter 13 – Loading and Preprocessing Data with TensorFlow**\n",
-"\n",
+"**Chapter 13 – Loading and Preprocessing Data with TensorFlow**"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
 "_This notebook contains all the sample code and solutions to the exercises in chapter 13._"
 ]
 },
@@ -34,7 +39,7 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures. We also check that Python 3.5 or later is installed (although Python 2.x may work, it is deprecated so we strongly recommend you use Python 3 instead), as well as Scikit-Learn ≥0.20 and TensorFlow ≥2.0."
+"First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures."
 ]
 },
 {
@@ -43,33 +48,36 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Python ≥3.5 is required\n",
+"# Python ≥3.7 is required\n",
 "import sys\n",
-"assert sys.version_info >= (3, 5)\n",
+"assert sys.version_info >= (3, 7)\n",
 "\n",
 "# Is this notebook running on Colab or Kaggle?\n",
 "IS_COLAB = \"google.colab\" in sys.modules\n",
 "IS_KAGGLE = \"kaggle_secrets\" in sys.modules\n",
 "\n",
 "if IS_COLAB or IS_KAGGLE:\n",
-"    %pip install -q -U tfx==0.21.2\n",
+"    %pip install -q -U tfx\n",
 "    print(\"You can safely ignore the package incompatibility errors.\")\n",
 "\n",
-"# Scikit-Learn ≥0.20 is required\n",
-"import sklearn\n",
-"assert sklearn.__version__ >= \"0.20\"\n",
-"\n",
-"# TensorFlow ≥2.0 is required\n",
-"import tensorflow as tf\n",
-"from tensorflow import keras\n",
-"assert tf.__version__ >= \"2.0\"\n",
-"\n",
 "# Common imports\n",
 "import numpy as np\n",
-"import os\n",
+"from pathlib import Path\n",
+"\n",
+"# Scikit-Learn ≥1.0 is required\n",
+"import sklearn\n",
+"assert sklearn.__version__ >= \"1.0\"\n",
+"\n",
+"# TensorFlow ≥2.6 is required\n",
+"import tensorflow as tf\n",
+"assert tf.__version__ >= \"2.6\"\n",
+"\n",
+"# Load the Jupyter extension for TensorBoard\n",
+"%load_ext tensorboard\n",
+"\n",
 "# to make this notebook's output stable across runs\n",
 "np.random.seed(42)\n",
 "tf.random.set_seed(42)\n",
 "\n",
 "# To plot pretty figures\n",
 "%matplotlib inline\n",
@@ -80,14 +88,11 @@
 "mpl.rc('ytick', labelsize=12)\n",
 "\n",
 "# Where to save the figures\n",
-"PROJECT_ROOT_DIR = \".\"\n",
-"CHAPTER_ID = \"data\"\n",
-"IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, \"images\", CHAPTER_ID)\n",
-"os.makedirs(IMAGES_PATH, exist_ok=True)\n",
+"IMAGES_PATH = Path() / \"images\" / \"data\"\n",
+"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
 "\n",
 "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n",
-"    path = os.path.join(IMAGES_PATH, fig_id + \".\" + fig_extension)\n",
-"    print(\"Saving figure\", fig_id)\n",
+"    path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n",
 "    if tight_layout:\n",
 "        plt.tight_layout()\n",
 "    plt.savefig(path, format=fig_extension, dpi=resolution)"
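For readers new to pathlib: `Path()` is the current directory, dividing a `Path` by a string appends a segment, and `mkdir` with `exist_ok=True` is a no-op when the directory already exists, so the new `IMAGES_PATH` plays exactly the role of the old `os.path.join` version. A quick sketch (illustrative values, not notebook code):

from pathlib import Path

IMAGES_PATH = Path() / "images" / "data"        # Path() is "."
IMAGES_PATH.mkdir(parents=True, exist_ok=True)  # creates both levels, idempotent
fig_id, fig_extension = "my_plot", "png"        # example arguments to save_fig()
print(IMAGES_PATH / f"{fig_id}.{fig_extension}")  # images/data/my_plot.png on POSIX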
@@ -264,9 +269,9 @@
 "outputs": [],
 "source": [
 "def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):\n",
-"    housing_dir = os.path.join(\"datasets\", \"housing\")\n",
-"    os.makedirs(housing_dir, exist_ok=True)\n",
-"    path_format = os.path.join(housing_dir, \"my_{}_{:02d}.csv\")\n",
+"    housing_dir = Path() / \"datasets\" / \"housing\"\n",
+"    housing_dir.mkdir(parents=True, exist_ok=True)\n",
+"    path_format = housing_dir / \"my_{}_{:02d}.csv\"\n",
 "\n",
 "    filepaths = []\n",
 "    m = len(data)\n",
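One subtlety in this hunk: `path_format` is now a `Path`, and `Path` objects have no `.format()` method, so the code that fills in the `{}` placeholders (below the context shown here) presumably has to go through `str()` first. A hedged sketch of that pattern, with `name_prefix` and `file_idx` as illustrative values:

from pathlib import Path

housing_dir = Path() / "datasets" / "housing"
path_format = housing_dir / "my_{}_{:02d}.csv"

# Path has no .format(), so render the template as a string first
name_prefix, file_idx = "train", 0
part_csv = str(path_format).format(name_prefix, file_idx)
print(part_csv)  # datasets/housing/my_train_00.csv on POSIX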
@@ -1431,21 +1436,23 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"import os\n",
+"from pathlib import Path\n",
 "import tarfile\n",
 "import urllib.request\n",
+"import pandas as pd\n",
 "\n",
-"DOWNLOAD_ROOT = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
-"HOUSING_PATH = os.path.join(\"datasets\", \"housing\")\n",
-"HOUSING_URL = DOWNLOAD_ROOT + \"datasets/housing/housing.tgz\"\n",
-"\n",
-"def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):\n",
-"    os.makedirs(housing_path, exist_ok=True)\n",
-"    tgz_path = os.path.join(housing_path, \"housing.tgz\")\n",
-"    urllib.request.urlretrieve(housing_url, tgz_path)\n",
-"    housing_tgz = tarfile.open(tgz_path)\n",
-"    housing_tgz.extractall(path=housing_path)\n",
-"    housing_tgz.close()"
+"def load_housing_data():\n",
+"    housing_path = Path() / \"datasets\" / \"housing\"\n",
+"    if not (housing_path / \"housing.csv\").is_file():\n",
+"        housing_path.mkdir(parents=True, exist_ok=True)\n",
+"        root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n",
+"        url = root + \"datasets/housing/housing.tgz\"\n",
+"        tgz_path = housing_path / \"housing.tgz\"\n",
+"        urllib.request.urlretrieve(url, tgz_path)\n",
+"        housing_tgz = tarfile.open(tgz_path)\n",
+"        housing_tgz.extractall(path=housing_path)\n",
+"        housing_tgz.close()\n",
+"    return pd.read_csv(housing_path / \"housing.csv\")"
 ]
 },
 {
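The new `load_housing_data` folds the old fetch/load pair into one lazy helper: it downloads and extracts the tarball only when `housing.csv` is missing, then always returns the DataFrame. A functionally equivalent variant (not the committed code) that uses a `with` block so the archive is closed even if extraction raises:

import tarfile
import urllib.request
from pathlib import Path

import pandas as pd

def load_housing_data():
    housing_path = Path() / "datasets" / "housing"
    if not (housing_path / "housing.csv").is_file():
        housing_path.mkdir(parents=True, exist_ok=True)
        root = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
        tgz_path = housing_path / "housing.tgz"
        urllib.request.urlretrieve(root + "datasets/housing/housing.tgz", tgz_path)
        with tarfile.open(tgz_path) as housing_tgz:  # closed even on error
            housing_tgz.extractall(path=housing_path)
    return pd.read_csv(housing_path / "housing.csv")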
@@ -1453,31 +1460,9 @@
 "execution_count": 86,
 "metadata": {},
 "outputs": [],
 "source": [
-"fetch_housing_data()"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 87,
-"metadata": {},
-"outputs": [],
-"source": [
-"import pandas as pd\n",
-"\n",
-"def load_housing_data(housing_path=HOUSING_PATH):\n",
-"    csv_path = os.path.join(housing_path, \"housing.csv\")\n",
-"    return pd.read_csv(csv_path)"
-]
-},
-{
-"cell_type": "code",
-"execution_count": 88,
-"metadata": {},
-"outputs": [],
-"source": [
 "housing = load_housing_data()\n",
 "housing.head()"
 ]
 },
 {
@@ -2104,8 +2089,8 @@
 "outputs": [],
 "source": [
 "from datetime import datetime\n",
-"logs = os.path.join(os.curdir, \"my_logs\",\n",
-"                    \"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
 "\n",
+"logs = Path() / \"my_logs\" / (\"run_\" + datetime.now().strftime(\"%Y%m%d_%H%M%S\"))\n",
+"\n",
 "tensorboard_cb = tf.keras.callbacks.TensorBoard(\n",
 "    log_dir=logs, histogram_freq=1, profile_batch=10)\n",
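Note on the new `logs` line: `/` binds tighter than `+` in Python, so the parentheses matter — without them the expression would evaluate `Path() / "my_logs" / "run_"` first and then attempt `Path + str`, which raises a `TypeError`. The two-step equivalent makes the order explicit:

from datetime import datetime
from pathlib import Path

run_id = "run_" + datetime.now().strftime("%Y%m%d_%H%M%S")  # e.g. run_20211231_235959
logs = Path() / "my_logs" / run_id                          # my_logs/run_20211231_235959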
@@ -2144,33 +2129,56 @@
 },
 {
 "cell_type": "code",
-"execution_count": 131,
+"execution_count": 71,
 "metadata": {},
 "outputs": [],
 "source": [
-"from pathlib import Path\n",
-"\n",
-"DOWNLOAD_ROOT = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
-"FILENAME = \"aclImdb_v1.tar.gz\"\n",
-"filepath = keras.utils.get_file(FILENAME, DOWNLOAD_ROOT + FILENAME, extract=True)\n",
-"path = Path(filepath).parent / \"aclImdb\"\n",
+"root = \"http://ai.stanford.edu/~amaas/data/sentiment/\"\n",
+"filename = \"aclImdb_v1.tar.gz\"\n",
+"filepath = keras.utils.get_file(filename, root + filename, extract=True)\n",
+"path = Path(filepath).with_name(\"aclImdb\")\n",
 "path"
 ]
 },
 {
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Let's define a `tree()` function to view the structure of the `aclImdb` directory:"
+]
+},
+{
 "cell_type": "code",
-"execution_count": 132,
+"execution_count": 76,
 "metadata": {},
 "outputs": [],
 "source": [
-"for name, subdirs, files in os.walk(path):\n",
-"    indent = len(Path(name).parts) - len(path.parts)\n",
-"    print(\"    \" * indent + Path(name).parts[-1] + os.sep)\n",
-"    for index, filename in enumerate(sorted(files)):\n",
-"        if index == 3:\n",
-"            print(\"    \" * (indent + 1) + \"...\")\n",
-"            break\n",
-"        print(\"    \" * (indent + 1) + filename)"
+"def tree(path, level=0, indent=4, max_files=3):\n",
+"    if level == 0:\n",
+"        print(f\"{path}/\")\n",
+"        level += 1\n",
+"    sub_paths = sorted(path.iterdir())\n",
+"    sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]\n",
+"    filepaths = [sub_path for sub_path in sub_paths if not sub_path in sub_dirs]\n",
+"    indent_str = \" \" * indent * level\n",
+"    for sub_dir in sub_dirs:\n",
+"        print(f\"{indent_str}{sub_dir.name}/\")\n",
+"        tree(sub_dir, level + 1, indent)\n",
+"    for filepath in filepaths[:max_files]:\n",
+"        print(f\"{indent_str}{filepath.name}\")\n",
+"    if len(filepaths) > max_files:\n",
+"        print(f\"{indent_str}...\")"
 ]
 },
 {
+"cell_type": "code",
+"execution_count": 77,
+"metadata": {},
+"outputs": [],
+"source": [
+"tree(path)"
+]
+},
+{
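To see the shape of `tree()`'s output without downloading the IMDb dataset: it prints one entry per line, directories first with a trailing `/`, recursing into each, then at most `max_files` files per directory before an ellipsis. A throwaway demo on a hypothetical layout (assumes the `tree()` defined above is in scope):

from pathlib import Path
import tempfile

base = Path(tempfile.mkdtemp()) / "data"
(base / "sub").mkdir(parents=True)
(base / "sub" / "a.txt").touch()
(base / "b.txt").touch()
(base / "c.txt").touch()

tree(base)
# Prints:
#   <tmpdir>/data/
#       sub/
#           a.txt
#       b.txt
#       c.txt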
@@ -2771,7 +2779,7 @@
 ],
 "metadata": {
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },