From b75811f999175c8d9404744385ad0d5c45c61618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= Date: Thu, 28 Oct 2021 14:55:10 +1300 Subject: [PATCH] BIG UPDATE: rewrote in large part for 3rd edition --- 02_end_to_end_machine_learning_project.ipynb | 2602 +++++++++++------- 1 file changed, 1652 insertions(+), 950 deletions(-) diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb index c6e8f17..0e231ee 100644 --- a/02_end_to_end_machine_learning_project.ipynb +++ b/02_end_to_end_machine_learning_project.ipynb @@ -11,8 +11,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "*Welcome to Machine Learning Housing Corp.! Your task is to predict median house values in Californian districts, given a number of features from these districts.*\n", - "\n", "*This notebook contains all the sample code and solutions to the exercices in chapter 2.*" ] }, @@ -30,55 +28,49 @@ "" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Setup" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures." - ] - }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ - "# Python ≥3.8 is required\n", + "print(\"Welcome to Machine Learning!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This project requires Python 3.8 or above:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ "import sys\n", - "assert sys.version_info >= (3, 8)\n", "\n", - "# Scikit-Learn ≥1.0 is required\n", + "assert sys.version_info >= (3, 8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It also requires Scikit-Learn ≥ 1.0:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ "import sklearn\n", - "assert sklearn.__version__ >= \"1.0\"\n", "\n", - "# Common imports\n", - "import numpy as np\n", - "from pathlib import Path\n", - "\n", - "# To plot pretty figures\n", - "%matplotlib inline\n", - "import matplotlib as mpl\n", - "import matplotlib.pyplot as plt\n", - "mpl.rc('axes', labelsize=14)\n", - "mpl.rc('xtick', labelsize=12)\n", - "mpl.rc('ytick', labelsize=12)\n", - "\n", - "# Where to save the figures\n", - "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n", - "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", - "\n", - "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n", - " path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n", - " if tight_layout:\n", - " plt.tight_layout()\n", - " plt.savefig(path, format=fig_extension, dpi=resolution)" + "assert sklearn.__version__ >= \"1.0\"" ] }, { @@ -88,6 +80,13 @@ "# Get the Data" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "*Welcome to Machine Learning Housing Corp.! 
Your task is to predict median house values in Californian districts, given a number of features from these districts.*" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -97,13 +96,14 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from pathlib import Path\n", "import tarfile\n", "import urllib.request\n", + "\n", "import pandas as pd\n", "\n", "def load_housing_data():\n", @@ -114,18 +114,10 @@ " url = root + \"datasets/housing/housing.tgz\"\n", " tgz_path = housing_path / \"housing.tgz\"\n", " urllib.request.urlretrieve(url, tgz_path)\n", - " housing_tgz = tarfile.open(tgz_path)\n", - " housing_tgz.extractall(path=housing_path)\n", - " housing_tgz.close()\n", - " return pd.read_csv(housing_path / \"housing.csv\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ + " with tarfile.open(tgz_path) as housing_tgz:\n", + " housing_tgz.extractall(path=housing_path)\n", + " return pd.read_csv(housing_path / \"housing.csv\")\n", + "\n", "housing = load_housing_data()" ] }, @@ -142,7 +134,6 @@ "metadata": {}, "outputs": [], "source": [ - "housing = load_housing_data()\n", "housing.head()" ] }, @@ -173,16 +164,60 @@ "housing.describe()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell is not shown in the book, as it's just here to make the figures prettier by setting the font sizes:" + ] + }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ - "%matplotlib inline\n", + "import matplotlib as mpl\n", + "\n", + "mpl.rc('font', size=12)\n", + "mpl.rc('axes', labelsize=14, titlesize=14)\n", + "mpl.rc('legend', fontsize=14)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The following cell is not shown either in the book. It creates the `images/end_to_end_project` folder (if it doesn't already exist), and it defines the `save_fig()` function which is used through this notebook to save the figures in high-res for the book." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Where to save the figures\n", + "IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n", + "IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n", + "\n", + "def save_fig(fig_id, tight_layout=True, fig_extension=\"png\", resolution=300):\n", + " path = IMAGES_PATH / f\"{fig_id}.{fig_extension}\"\n", + " if tight_layout:\n", + " plt.tight_layout()\n", + " plt.savefig(path, format=fig_extension, dpi=resolution)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ "import matplotlib.pyplot as plt\n", - "housing.hist(bins=50, figsize=(20,15))\n", - "save_fig(\"attribute_histogram_plots\")\n", + "\n", + "housing.hist(bins=50, figsize=(12, 8))\n", + "save_fig(\"attribute_histogram_plots\") # save_fig() lines are never in the book\n", "plt.show()" ] }, @@ -195,24 +230,13 @@ }, { "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# to make this notebook's output identical at every run\n", - "np.random.seed(42)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "\n", - "# For illustration only. 
Sklearn has train_test_split()\n", - "def split_train_test(data, test_ratio):\n", + "def shuffle_and_split_data(data, test_ratio):\n", " shuffled_indices = np.random.permutation(len(data))\n", " test_set_size = int(len(data) * test_ratio)\n", " test_indices = shuffled_indices[:test_set_size]\n", @@ -220,23 +244,14 @@ " return data.iloc[train_indices], data.iloc[test_indices]" ] }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "train_set, test_set = split_train_test(housing, 0.2)\n", - "len(train_set)" - ] - }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "len(test_set)" + "train_set, test_set = shuffle_and_split_data(housing, 0.2)\n", + "len(train_set)" ] }, { @@ -245,22 +260,14 @@ "metadata": {}, "outputs": [], "source": [ - "from zlib import crc32\n", - "\n", - "def test_set_check(identifier, test_ratio):\n", - " return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32\n", - "\n", - "def split_train_test_by_id(data, test_ratio, id_column):\n", - " ids = data[id_column]\n", - " in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio))\n", - " return data.loc[~in_test_set], data.loc[in_test_set]" + "len(test_set)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The implementation of `test_set_check()` above works fine in both Python 2 and Python 3. In earlier releases, the following implementation was proposed, which supported any hash function, but was much slower and did not support Python 2:" + "To ensure that this notebook's outputs remain the same every time we run it, we need to set the random seed:" ] }, { @@ -269,17 +276,21 @@ "metadata": {}, "outputs": [], "source": [ - "import hashlib\n", - "\n", - "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", - " return hash(np.int64(identifier)).digest()[-1] < 256 * test_ratio" + "np.random.seed(42)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If you want an implementation that supports any hash function and is compatible with both Python 2 and Python 3, here is one:" + "Sadly, this won't guarantee that this notebook will output exactly the same results as in the book, since there are other possible sources of variation. The most important is the fact that algorithms get tweaked over time when libraries evolve. So please tolerate some minor differences: hopefully, most of the outputs should be the same, or at least in the right ballpark." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: another source of randomness is the order of Python sets: it is based on Python's `hash()` function, which is randomly \"salted\" when Python starts up (this started in Python 3.3, to prevent some denial-of-service attacks). To remove this randomness, the solution is to set the `PYTHONHASHSEED` environment variable to `\"0\"` _before_ Python even starts up. Nothing will happen if you do it after that. Luckily, if you're running this notebook on Colab, the variable is already set for you." 
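,
    "\n",
    "\n",
    "Not in the book: a quick way to check whether the variable is set in the current environment (reading it is harmless; setting it from inside this notebook would be too late, as explained above):\n",
    "\n",
    "```python\n",
    "import os\n",
    "\n",
    "print(os.environ.get(\"PYTHONHASHSEED\"))  # e.g. '0' on Colab, or None if unset\n",
    "```"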
] }, { @@ -288,8 +299,15 @@ "metadata": {}, "outputs": [], "source": [ - "def test_set_check(identifier, test_ratio, hash=hashlib.md5):\n", - " return bytearray(hash(np.int64(identifier)).digest())[-1] < 256 * test_ratio" + "from zlib import crc32\n", + "\n", + "def is_id_in_test_set(identifier, test_ratio):\n", + " return crc32(np.int64(identifier)) < test_ratio * 2**32\n", + "\n", + "def split_data_with_id_hash(data, test_ratio, id_column):\n", + " ids = data[id_column]\n", + " in_test_set = ids.apply(lambda id_: is_id_in_test_set(id_, test_ratio))\n", + " return data.loc[~in_test_set], data.loc[in_test_set]" ] }, { @@ -298,8 +316,8 @@ "metadata": {}, "outputs": [], "source": [ - "housing_with_id = housing.reset_index() # adds an `index` column\n", - "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"index\")" + "housing_with_id = housing.reset_index() # adds an `index` column\n", + "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"index\")" ] }, { @@ -309,7 +327,7 @@ "outputs": [], "source": [ "housing_with_id[\"id\"] = housing[\"longitude\"] * 1000 + housing[\"latitude\"]\n", - "train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, \"id\")" + "train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, \"id\")" ] }, { @@ -317,28 +335,50 @@ "execution_count": 19, "metadata": {}, "outputs": [], - "source": [ - "test_set.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "\n", "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)" ] }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "np.sum(test_set[\"total_bedrooms\"].isnull())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To find the probability that a random sample of 1,000 people contains less than 48.5% female or more than 53.5% female when the population's female ratio is 51.1%, we use the [binomial distribution](https://en.wikipedia.org/wiki/Binomial_distribution). The `cdf()` method of the binomial distribution gives us the probability that the number of females will be equal or less than the given value." 
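,
    "\n",
    "\n",
    "As a reminder (not in the book): if each of the $n = 1000$ sampled people is female with probability $p = 0.511$, then the number of females $k$ follows a binomial distribution with $\\Pr(k) = \\binom{n}{k}\\,p^k (1 - p)^{n - k}$, and `cdf(k)` sums these probabilities from $0$ up to $k$. \"Less than 48.5% female\" means at most 484 females out of 1,000, which is why the code below calls `cdf(485 - 1)`."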
+ ] + }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "test_set.head()" + "# Not in the book\n", + "\n", + "from scipy.stats import binom\n", + "\n", + "sample_size = 1000\n", + "ratio_female = 0.511\n", + "proba_too_small = binom(sample_size, ratio_female).cdf(485 - 1)\n", + "proba_too_large = 1 - binom(sample_size, ratio_female).cdf(535)\n", + "print(proba_too_small + proba_too_large)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you prefer simulations over maths, here's how you could get roughly the same result:" ] }, { @@ -347,7 +387,12 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"median_income\"].hist()" + "# Not in the book\n", + "\n", + "np.random.seed(42)\n", + "\n", + "samples = (np.random.rand(100_000, sample_size) < ratio_female).sum(axis=1)\n", + "((samples < 485) | (samples > 535)).mean()" ] }, { @@ -367,7 +412,11 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"income_cat\"].value_counts()" + "housing[\"income_cat\"].value_counts().sort_index().plot.bar(rot=0, grid=True)\n", + "plt.xlabel(\"Income category\")\n", + "plt.ylabel(\"Number of districts\")\n", + "save_fig(\"housing_income_cat_bar_plot\")\n", + "plt.show()" ] }, { @@ -376,7 +425,14 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"income_cat\"].hist()" + "from sklearn.model_selection import StratifiedShuffleSplit\n", + "\n", + "splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)\n", + "strat_splits = []\n", + "for train_index, test_index in splitter.split(housing, housing[\"income_cat\"]):\n", + " strat_train_set_n = housing.loc[train_index]\n", + " strat_test_set_n = housing.loc[test_index]\n", + " strat_splits.append([strat_train_set_n, strat_test_set_n])" ] }, { @@ -385,12 +441,14 @@ "metadata": {}, "outputs": [], "source": [ - "from sklearn.model_selection import StratifiedShuffleSplit\n", - "\n", - "split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)\n", - "for train_index, test_index in split.split(housing, housing[\"income_cat\"]):\n", - " strat_train_set = housing.loc[train_index]\n", - " strat_test_set = housing.loc[test_index]" + "strat_train_set, strat_test_set = strat_splits[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's much shorter to get a single stratified split:" ] }, { @@ -399,7 +457,8 @@ "metadata": {}, "outputs": [], "source": [ - "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)" + "strat_train_set, strat_test_set = train_test_split(\n", + " housing, test_size=0.2, stratify=housing[\"income_cat\"], random_state=42)" ] }, { @@ -408,7 +467,7 @@ "metadata": {}, "outputs": [], "source": [ - "housing[\"income_cat\"].value_counts() / len(housing)" + "strat_test_set[\"income_cat\"].value_counts() / len(strat_test_set)" ] }, { @@ -417,18 +476,7 @@ "metadata": {}, "outputs": [], "source": [ - "def income_cat_proportions(data):\n", - " return data[\"income_cat\"].value_counts() / len(data)\n", - "\n", - "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", - "\n", - "compare_props = pd.DataFrame({\n", - " \"Overall\": income_cat_proportions(housing),\n", - " \"Stratified\": income_cat_proportions(strat_test_set),\n", - " \"Random\": income_cat_proportions(test_set),\n", - "}).sort_index()\n", - "compare_props[\"Rand. %error\"] = 100 * compare_props[\"Random\"] / compare_props[\"Overall\"] - 100\n", - "compare_props[\"Strat. 
%error\"] = 100 * compare_props[\"Stratified\"] / compare_props[\"Overall\"] - 100" + "housing[\"income_cat\"].value_counts() / len(housing) # not in the book" ] }, { @@ -437,7 +485,23 @@ "metadata": {}, "outputs": [], "source": [ - "compare_props" + "# Not in the book\n", + "\n", + "def income_cat_proportions(data):\n", + " return data[\"income_cat\"].value_counts() / len(data)\n", + "\n", + "train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)\n", + "\n", + "compare_props = pd.DataFrame({\n", + " \"Overall %\": income_cat_proportions(housing),\n", + " \"Stratified %\": income_cat_proportions(strat_test_set),\n", + " \"Random %\": income_cat_proportions(test_set),\n", + "}).sort_index()\n", + "compare_props.index.name = \"Income Category\"\n", + "compare_props[\"Strat. Error %\"] = (compare_props[\"Stratified %\"] /\n", + " compare_props[\"Overall %\"] - 1)\n", + "compare_props[\"Rand. Error %\"] = (compare_props[\"Random %\"] /\n", + " compare_props[\"Overall %\"] - 1)" ] }, { @@ -445,6 +509,15 @@ "execution_count": 31, "metadata": {}, "outputs": [], + "source": [ + "(compare_props * 100).round(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], "source": [ "for set_ in (strat_train_set, strat_test_set):\n", " set_.drop(\"income_cat\", axis=1, inplace=True)" @@ -459,7 +532,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ @@ -473,31 +546,15 @@ "## Visualizing Geographical Data" ] }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\")\n", - "save_fig(\"bad_visualization_plot\")" - ] - }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.1)\n", - "save_fig(\"better_visualization_plot\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The argument `sharex=False` fixes a display bug (the x-axis values and legend were not displayed). This is a temporary fix (see: https://github.com/pandas-dev/pandas/issues/10611 ). Thanks to Wilmer Arellano for pointing it out." 
+ "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True)\n", + "save_fig(\"bad_visualization_plot\")\n", + "plt.show()" ] }, { @@ -506,29 +563,37 @@ "metadata": {}, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", alpha=0.4,\n", - " s=housing[\"population\"]/100, label=\"population\", figsize=(10,7),\n", - " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n", - " sharex=False)\n", - "plt.legend()\n", - "save_fig(\"housing_prices_scatterplot\")" + "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True, alpha=0.2)\n", + "save_fig(\"better_visualization_plot\")\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ - "# Download the California image\n", - "images_path = Path() / \"images\" / \"end_to_end_project\"\n", - "filename = \"california.png\"\n", - "if not (images_path / filename).is_file():\n", - " images_path.mkdir(parents=True, exist_ok=True)\n", - " root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", - " url = root + \"images/end_to_end_project/\" + filename\n", - " print(\"Downloading\", filename)\n", - " urllib.request.urlretrieve(url, images_path / filename)" + "housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True,\n", + " s=housing[\"population\"] / 100, label=\"population\",\n", + " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n", + " legend=True, sharex=False, figsize=(10, 7))\n", + "save_fig(\"housing_prices_scatterplot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The argument `sharex=False` fixes a display bug: without it, the x-axis values and label are not displayed (see: https://github.com/pandas-dev/pandas/issues/10611)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next couple of cells generate the first figure in the chapter (this code is not in the book). It's just a beautified version of the previous figure, with an image of California added in the background, nicer label names and no grid." 
] }, { @@ -537,25 +602,38 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.image as mpimg\n", + "# Not in the book\n", "\n", - "california_img=mpimg.imread(images_path / filename)\n", - "ax = housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", figsize=(10,7),\n", - " s=housing['population']/100, label=\"Population\",\n", - " c=\"median_house_value\", cmap=plt.get_cmap(\"jet\"),\n", - " colorbar=False, alpha=0.4)\n", - "plt.imshow(california_img, extent=[-124.55, -113.80, 32.45, 42.05], alpha=0.5,\n", - " cmap=plt.get_cmap(\"jet\"))\n", - "plt.ylabel(\"Latitude\", fontsize=14)\n", - "plt.xlabel(\"Longitude\", fontsize=14)\n", + "# Download the California image\n", + "filename = \"california.png\"\n", + "if not (IMAGES_PATH / filename).is_file():\n", + " root = \"https://raw.githubusercontent.com/ageron/handson-ml2/master/\"\n", + " url = root + \"images/end_to_end_project/\" + filename\n", + " print(\"Downloading\", filename)\n", + " urllib.request.urlretrieve(url, IMAGES_PATH / filename)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "housing_renamed = housing.rename(columns={\n", + " \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n", + " \"population\": \"Population\",\n", + " \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n", + "housing_renamed.plot(\n", + " kind=\"scatter\", x=\"Longitude\", y=\"Latitude\",\n", + " s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n", + " c=\"Median house value (ᴜsᴅ)\", cmap=plt.get_cmap(\"jet\"), colorbar=True,\n", + " legend=True, sharex=False, figsize=(10, 7))\n", "\n", - "prices = housing[\"median_house_value\"]\n", - "tick_values = np.linspace(prices.min(), prices.max(), 11)\n", - "cbar = plt.colorbar(ticks=tick_values/prices.max())\n", - "cbar.ax.set_yticklabels([\"$%dk\"%(round(v/1000)) for v in tick_values], fontsize=14)\n", - "cbar.set_label('Median House Value', fontsize=16)\n", + "california_img = plt.imread(IMAGES_PATH / filename)\n", + "axis = -124.55, -113.95, 32.45, 42.05\n", + "plt.axis(axis)\n", + "plt.imshow(california_img, extent=axis)\n", "\n", - "plt.legend(fontsize=16)\n", "save_fig(\"california_housing_prices_plot\")\n", "plt.show()" ] @@ -569,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 39, "metadata": {}, "outputs": [], "source": [ @@ -578,7 +656,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -587,29 +665,29 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 41, "metadata": {}, "outputs": [], "source": [ - "# from pandas.tools.plotting import scatter_matrix # For older versions of Pandas\n", "from pandas.plotting import scatter_matrix\n", "\n", "attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n", " \"housing_median_age\"]\n", "scatter_matrix(housing[attributes], figsize=(12, 8))\n", - "save_fig(\"scatter_matrix_plot\")" + "save_fig(\"scatter_matrix_plot\")\n", + "plt.show()" ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n", - " alpha=0.1)\n", - "plt.axis([0, 16, 0, 550000])\n", - "save_fig(\"income_vs_house_value_scatterplot\")" + " alpha=0.1, grid=True)\n", + "save_fig(\"income_vs_house_value_scatterplot\")\n", + "plt.show()" ] }, { @@ -619,25 +697,15 @@ "## Experimenting with 
Attribute Combinations" ] }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": {}, - "outputs": [], - "source": [ - "housing[\"rooms_per_household\"] = housing[\"total_rooms\"]/housing[\"households\"]\n", - "housing[\"bedrooms_per_room\"] = housing[\"total_bedrooms\"]/housing[\"total_rooms\"]\n", - "housing[\"population_per_household\"]=housing[\"population\"]/housing[\"households\"]" - ] - }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ - "corr_matrix = housing.corr()\n", - "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" + "housing[\"rooms_per_house\"] = housing[\"total_rooms\"] / housing[\"households\"]\n", + "housing[\"bedrooms_ratio\"] = housing[\"total_bedrooms\"] / housing[\"total_rooms\"]\n", + "housing[\"people_per_house\"] = housing[\"population\"] / housing[\"households\"]" ] }, { @@ -646,19 +714,8 @@ "metadata": {}, "outputs": [], "source": [ - "housing.plot(kind=\"scatter\", x=\"rooms_per_household\", y=\"median_house_value\",\n", - " alpha=0.2)\n", - "plt.axis([0, 5, 0, 520000])\n", - "plt.show()" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "metadata": {}, - "outputs": [], - "source": [ - "housing.describe()" + "corr_matrix = housing.corr()\n", + "corr_matrix[\"median_house_value\"].sort_values(ascending=False)" ] }, { @@ -668,13 +725,20 @@ "# Prepare the Data for Machine Learning Algorithms" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's revert to the original training set and separate the target (note that `strat_train_set.drop()` creates a copy of `strat_train_set` without the column, it doesn't actually modify `strat_train_set` itself, unless you pass `inplace=True`):" + ] + }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ - "housing = strat_train_set.drop(\"median_house_value\", axis=1) # drop labels for training set\n", + "housing = strat_train_set.drop(\"median_house_value\", axis=1)\n", "housing_labels = strat_train_set[\"median_house_value\"].copy()" ] }, @@ -689,16 +753,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In the book 3 options are listed:\n", + "In the book 3 options are listed to handle the NaN values:\n", "\n", "```python\n", - "housing.dropna(subset=[\"total_bedrooms\"]) # option 1\n", + "housing.dropna(subset=[\"total_bedrooms\"], inplace=True) # option 1\n", + "\n", "housing.drop(\"total_bedrooms\", axis=1) # option 2\n", + "\n", "median = housing[\"total_bedrooms\"].median() # option 3\n", "housing[\"total_bedrooms\"].fillna(median, inplace=True)\n", "```\n", "\n", - "To demonstrate each of them, let's create a copy of the housing dataset, but keeping only the rows that contain at least one null. Then it will be easier to visualize exactly what each option does:" + "For each option, we'll create a copy of `housing` and work on that copy to avoid breaking `housing`. We'll also show the output of each option, but filtering on the rows that originally contained a NaN value." 
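,
    "\n",
    "\n",
    "Not in the book: before trying the options, a quick count of how many districts are affected (assuming `housing` is the training set prepared above):\n",
    "\n",
    "```python\n",
    "housing[\"total_bedrooms\"].isnull().sum()  # number of districts with a missing value\n",
    "```"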
+ ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "null_rows_idx = housing.isnull().any(axis=1)\n", + "housing.loc[null_rows_idx].head()" ] }, { @@ -707,8 +783,11 @@ "metadata": {}, "outputs": [], "source": [ - "sample_incomplete_rows = housing[housing.isnull().any(axis=1)].head()\n", - "sample_incomplete_rows" + "housing_option1 = housing.copy()\n", + "\n", + "housing_option1.dropna(subset=[\"total_bedrooms\"], inplace=True) # option 1\n", + "\n", + "housing_option1.loc[null_rows_idx].head()" ] }, { @@ -717,7 +796,11 @@ "metadata": {}, "outputs": [], "source": [ - "sample_incomplete_rows.dropna(subset=[\"total_bedrooms\"]) # option 1" + "housing_option2 = housing.copy()\n", + "\n", + "housing_option2.drop(\"total_bedrooms\", axis=1, inplace=True) # option 2\n", + "\n", + "housing_option2.loc[null_rows_idx].head()" ] }, { @@ -726,7 +809,12 @@ "metadata": {}, "outputs": [], "source": [ - "sample_incomplete_rows.drop(\"total_bedrooms\", axis=1) # option 2" + "housing_option3 = housing.copy()\n", + "\n", + "median = housing[\"total_bedrooms\"].median()\n", + "housing_option3[\"total_bedrooms\"].fillna(median, inplace=True) # option 3\n", + "\n", + "housing_option3.loc[null_rows_idx].head()" ] }, { @@ -735,8 +823,16 @@ "metadata": {}, "outputs": [], "source": [ - "median = housing[\"total_bedrooms\"].median()\n", - "sample_incomplete_rows[\"total_bedrooms\"].fillna(median, inplace=True) # option 3" + "from sklearn.impute import SimpleImputer\n", + "\n", + "imputer = SimpleImputer(strategy=\"median\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Separating out the numerical attributes to use the `\"median\"` strategy (as it cannot be calculated on text attributes like `ocean_proximity`):" ] }, { @@ -745,7 +841,7 @@ "metadata": {}, "outputs": [], "source": [ - "sample_incomplete_rows" + "housing_num = housing.select_dtypes(include=[np.number])" ] }, { @@ -753,40 +849,13 @@ "execution_count": 52, "metadata": {}, "outputs": [], - "source": [ - "from sklearn.impute import SimpleImputer\n", - "imputer = SimpleImputer(strategy=\"median\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Remove the text attribute because median can only be calculated on numerical attributes:" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "housing_num = housing.drop(\"ocean_proximity\", axis=1)\n", - "# alternatively: housing_num = housing.select_dtypes(include=[np.number])" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], "source": [ "imputer.fit(housing_num)" ] }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -802,7 +871,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -818,7 +887,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -827,35 +896,16 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ - "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", - " index=housing.index)" + "imputer.feature_names_in_" ] }, { "cell_type": "code", - "execution_count": 59, - "metadata": {}, - "outputs": [], - "source": [ - "housing_tr.loc[sample_incomplete_rows.index.values]" - ] - }, - { - 
"cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "imputer.strategy" - ] - }, - { - "cell_type": "code", - "execution_count": 61, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -863,13 +913,90 @@ " index=housing_num.index)" ] }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr.loc[null_rows_idx].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "imputer.strategy" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr = pd.DataFrame(X, columns=housing_num.columns,\n", + " index=housing_num.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [], + "source": [ + "housing_tr.loc[null_rows_idx].head() # not shown in the book" + ] + }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "housing_tr.head()" + "#from sklearn import set_config\n", + "#\n", + "# set_config(pandas_in_out=True) # not available yet – see SLEP014" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now let's drop some outliers:" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import IsolationForest\n", + "\n", + "isolation_forest = IsolationForest(random_state=42)\n", + "outlier_pred = isolation_forest.fit_predict(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [], + "source": [ + "outlier_pred" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "#housing = housing.iloc[outlier_pred == 1]\n", + "#housing_labels = housing_labels.iloc[outlier_pred == 1]" ] }, { @@ -888,30 +1015,38 @@ }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "housing_cat = housing[[\"ocean_proximity\"]]\n", - "housing_cat.head(10)" + "housing_cat.head(8)" ] }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OrdinalEncoder\n", "\n", "ordinal_encoder = OrdinalEncoder()\n", - "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)\n", - "housing_cat_encoded[:10]" + "housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)" ] }, { "cell_type": "code", - "execution_count": 65, + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "housing_cat_encoded[:8]" + ] + }, + { + "cell_type": "code", + "execution_count": 69, "metadata": {}, "outputs": [], "source": [ @@ -920,14 +1055,22 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "from sklearn.preprocessing import OneHotEncoder\n", "\n", "cat_encoder = OneHotEncoder()\n", - "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)\n", + "housing_cat_1hot = cat_encoder.fit_transform(housing_cat)" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ "housing_cat_1hot" ] }, @@ -940,7 +1083,7 @@ }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 72, "metadata": {}, "outputs": [], "source": [ @@ -956,7 +1099,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 
73, "metadata": {}, "outputs": [], "source": [ @@ -967,13 +1110,245 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "cat_encoder.categories_" ] }, + { + "cell_type": "code", + "execution_count": 75, + "metadata": {}, + "outputs": [], + "source": [ + "df_test = pd.DataFrame({\"ocean_proximity\": [\"INLAND\", \"NEAR BAY\"]})\n", + "pd.get_dummies(df_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder.transform(df_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 77, + "metadata": {}, + "outputs": [], + "source": [ + "df_test_unknown = pd.DataFrame({\"ocean_proximity\": [\"<2H OCEAN\", \"ISLAND\"]})\n", + "pd.get_dummies(df_test_unknown)" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder.handle_unknown = \"ignore\"\n", + "cat_encoder.transform(df_test_unknown)" + ] + }, + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder.feature_names_in_" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "cat_encoder.get_feature_names_out()" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "output_df = pd.DataFrame(cat_encoder.transform(df_test_unknown),\n", + " columns=cat_encoder.get_feature_names_out(),\n", + " index=df_test_unknown.index)\n", + "output_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Feature Scaling" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "min_max_scaler = MinMaxScaler(feature_range=(-1, 1))\n", + "housing_num_min_max_scaled = min_max_scaler.fit_transform(housing_num)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "std_scaler = StandardScaler()\n", + "housing_num_std_scaled = std_scaler.fit_transform(housing_num)" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "fig, axs = plt.subplots(1, 2, figsize=(8,3), sharey=True)\n", + "housing[\"population\"].hist(ax=axs[0], bins=50)\n", + "housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n", + "axs[0].set_xlabel(\"Population\")\n", + "axs[1].set_xlabel(\"Log of population\")\n", + "axs[0].set_ylabel(\"Number of districts\")\n", + "save_fig(\"long_tail_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "What if we replace each value with its percentile?" 
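,
    "\n",
    "\n",
    "Not in the book: Pandas can also compute percentile ranks directly, which gives roughly the same effect as the manual bucketing shown below:\n",
    "\n",
    "```python\n",
    "income_pct_rank = housing[\"median_income\"].rank(pct=True)  # percentile ranks in (0, 1]\n",
    "```"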
+ ] + }, + { + "cell_type": "code", + "execution_count": 85, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "percentiles = [np.percentile(housing[\"median_income\"], p)\n", + " for p in range(1, 100)]\n", + "flattened_median_income = pd.cut(housing[\"median_income\"],\n", + " bins=[-np.inf] + percentiles + [np.inf],\n", + " labels=range(1, 100 + 1))\n", + "flattened_median_income.hist(bins=50)\n", + "plt.xlabel(\"Median income percentile\")\n", + "plt.ylabel(\"Number of districts\")\n", + "plt.show()\n", + "# Note: incomes below the 1st percentile are labeled 1, and incomes above the\n", + "# 99th percentile are labeled 100. This is why the distribution below ranges\n", + "# from 1 to 100 (not 0 to 100)." + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics.pairwise import rbf_kernel\n", + "\n", + "age_simil_35 = rbf_kernel(housing[[\"housing_median_age\"]], [[35]], gamma=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "ages = np.linspace(housing[\"housing_median_age\"].min(),\n", + " housing[\"housing_median_age\"].max(),\n", + " 500).reshape(-1, 1)\n", + "gamma1 = 0.1\n", + "gamma2 = 0.03\n", + "rbf1 = rbf_kernel(ages, [[35]], gamma=gamma1)\n", + "rbf2 = rbf_kernel(ages, [[35]], gamma=gamma2)\n", + "\n", + "fig, ax1 = plt.subplots()\n", + "\n", + "ax1.set_xlabel(\"Housing median age\")\n", + "ax1.set_ylabel(\"Number of districts\")\n", + "ax1.hist(housing[\"housing_median_age\"], bins=50)\n", + "\n", + "ax2 = ax1.twinx() # create a twin axis that shares the same x-axis\n", + "color = \"blue\"\n", + "ax2.plot(ages, rbf1, color=color, label=\"gamma = 0.10\")\n", + "ax2.plot(ages, rbf2, color=color, label=\"gamma = 0.03\", linestyle=\"--\",)\n", + "ax2.tick_params(axis='y', labelcolor=color)\n", + "ax2.set_ylabel(\"Age similarity\", color=color)\n", + "\n", + "plt.legend(loc=\"upper left\", labelcolor=color)\n", + "save_fig(\"age_similarity_plot\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import LinearRegression\n", + "\n", + "target_scaler = StandardScaler()\n", + "scaled_labels = target_scaler.fit_transform(housing_labels.to_frame())\n", + "\n", + "model = LinearRegression()\n", + "model.fit(housing[[\"median_income\"]], scaled_labels)\n", + "some_new_data = housing[[\"median_income\"]].iloc[:5] # pretend this is new data\n", + "\n", + "scaled_predictions = model.predict(some_new_data)\n", + "predictions = target_scaler.inverse_transform(scaled_predictions)\n", + "predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.compose import TransformedTargetRegressor\n", + "\n", + "model = TransformedTargetRegressor(LinearRegression(),\n", + " transformer=StandardScaler())\n", + "model.fit(housing[[\"median_income\"]], housing_labels)\n", + "predictions = model.predict(some_new_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "predictions" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -985,75 +1360,200 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's create a custom transformer to add extra attributes:" + "To create simple transformers:" ] }, { "cell_type": "code", - "execution_count": 70, + 
"execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import FunctionTransformer\n", + "\n", + "log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)\n", + "log_pop = log_transformer.transform(housing[[\"population\"]])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "rbf_transformer = FunctionTransformer(rbf_kernel,\n", + " kw_args=dict(Y=[[35.]], gamma=0.1))\n", + "age_simil_35 = rbf_transformer.transform(housing[[\"housing_median_age\"]])\n", + "age_simil_35" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "sf_coords = 37.7749, -122.41\n", + "sf_transformer = FunctionTransformer(rbf_kernel,\n", + " kw_args=dict(Y=[sf_coords], gamma=0.1))\n", + "sf_simil = sf_transformer.transform(housing[[\"latitude\", \"longitude\"]])\n", + "sf_simil" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "ratio_transformer = FunctionTransformer(lambda X: X[:, [0]] / X[:, [1]])\n", + "ratio_transformer.transform(np.array([[1., 2.], [3., 4.]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 95, "metadata": {}, "outputs": [], "source": [ "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.utils.validation import check_array, check_is_fitted\n", "\n", - "# column index\n", - "rooms_ix, bedrooms_ix, population_ix, households_ix = 3, 4, 5, 6\n", + "class StandardScalerClone(BaseEstimator, TransformerMixin):\n", + " def __init__(self, with_mean=True): # no *args or **kwargs!\n", + " self.with_mean = with_mean\n", + "\n", + " def fit(self, X, y=None): # y is required even though we don't use it\n", + " X = check_array(X) # checks that X is an array with finite float values\n", + " self.mean_ = np.mean(X, axis=0)\n", + " self.scale_ = np.std(X, axis=0)\n", + " self.n_features_in_ = X.shape[1] # every estimator stores this in fit()\n", + " return self # always return self!\n", "\n", - "class CombinedAttributesAdder(BaseEstimator, TransformerMixin):\n", - " def __init__(self, add_bedrooms_per_room=True): # no *args or **kargs\n", - " self.add_bedrooms_per_room = add_bedrooms_per_room\n", - " def fit(self, X, y=None):\n", - " return self # nothing else to do\n", " def transform(self, X):\n", - " rooms_per_household = X[:, rooms_ix] / X[:, households_ix]\n", - " population_per_household = X[:, population_ix] / X[:, households_ix]\n", - " if self.add_bedrooms_per_room:\n", - " bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]\n", - " return np.c_[X, rooms_per_household, population_per_household,\n", - " bedrooms_per_room]\n", - " else:\n", - " return np.c_[X, rooms_per_household, population_per_household]\n", + " check_is_fitted(self) # looks for learned attributes (with trailing _)\n", + " X = check_array(X)\n", + " assert self.n_features_in_ == X.shape[1]\n", + " if self.with_mean:\n", + " X = X - self.mean_\n", + " return X / self.scale_\n", + " \n", + " # Not in the book (left as an exercise):\n", + " def inverse_transform(self, X):\n", + " check_is_fitted(self)\n", + " X = check_array(X)\n", + " assert self.n_features_in_ == X.shape[1]\n", + " X = X * self.scale_\n", + " return X + self.mean_ if self.with_mean else X\n", + " \n", + " # Not in the book (left as an exercise):\n", + " def get_feature_names_out(self, names=None):\n", + " return names or getattr(self, \"feature_names_in_\",\n", + " [f\"x{i}\" for i in 
range(self.n_features_in_)]) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's test our custom transformer:" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "from sklearn.utils.estimator_checks import check_estimator\n", + " \n", + "check_estimator(StandardScaler())\n", + "X = np.random.rand(1000, 3)\n", + "ss = StandardScaler()\n", + "ssc = StandardScalerClone()\n", + "X_scaled1 = ss.fit_transform(X)\n", + "X_scaled2 = ssc.fit_transform(X)\n", + "X_back1 = ss.inverse_transform(X_scaled1)\n", + "X_back2 = ssc.inverse_transform(X_scaled2)\n", + "assert np.allclose(X_scaled1, X_scaled2)\n", + "assert np.allclose(X_back1, X_back2)\n", + "assert ssc.n_features_in_ == 3\n", + "assert not hasattr(ssc, \"features_names_in_\")\n", + "assert ssc.get_feature_names_out() == [\"x0\", \"x1\", \"x2\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.cluster import KMeans\n", "\n", - "attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)\n", - "housing_extra_attribs = attr_adder.transform(housing.values)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note that I hard coded the indices (3, 4, 5, 6) for concision and clarity in the book, but it would be much cleaner to get them dynamically, like this:" + "class ClusterSimilarity(BaseEstimator, TransformerMixin):\n", + " def __init__(self, n_clusters=10, gamma=1.0, random_state=None):\n", + " self.n_clusters = n_clusters\n", + " self.gamma = gamma\n", + " self.random_state = random_state\n", + "\n", + " def fit(self, X, y=None, sample_weight=None):\n", + " self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state)\n", + " self.kmeans_.fit(X, sample_weight=sample_weight)\n", + " return self # always return self!\n", + "\n", + " def transform(self, X):\n", + " return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)\n", + " \n", + " def get_feature_names_out(self, names=None):\n", + " return [f\"Cluster {i} similarity\" for i in range(self.n_clusters)]" ] }, { "cell_type": "code", - "execution_count": 71, + "execution_count": 98, "metadata": {}, "outputs": [], "source": [ - "col_names = \"total_rooms\", \"total_bedrooms\", \"population\", \"households\"\n", - "rooms_ix, bedrooms_ix, population_ix, households_ix = [\n", - " housing.columns.get_loc(c) for c in col_names] # get the column indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Also, `housing_extra_attribs` is a NumPy array, we've lost the column names (unfortunately, that's a problem with Scikit-Learn). 
To recover a `DataFrame`, you could run this:" + "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n", + "similarities = cluster_simil.fit_transform(housing[[\"latitude\", \"longitude\"]],\n", + " sample_weight=housing_labels)" ] }, { "cell_type": "code", - "execution_count": 72, + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ - "housing_extra_attribs = pd.DataFrame(\n", - " housing_extra_attribs,\n", - " columns=list(housing.columns)+[\"rooms_per_household\", \"population_per_household\"],\n", - " index=housing.index)\n", - "housing_extra_attribs.head()" + "np.round(similarities[:3], 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "housing_renamed = housing.rename(columns={\n", + " \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n", + " \"population\": \"Population\",\n", + " \"median_house_value\": \"Median house value (ᴜsᴅ)\"})\n", + "housing_renamed[\"Max cluster similarity\"] = similarities.max(axis=1)\n", + "\n", + "housing_renamed.plot(kind=\"scatter\", x=\"Longitude\", y=\"Latitude\", grid=True,\n", + " s=housing_renamed[\"Population\"] / 100, label=\"Population\",\n", + " c=\"Max cluster similarity\",\n", + " cmap=plt.get_cmap(\"jet\"), colorbar=True,\n", + " legend=True, sharex=False, figsize=(10, 7))\n", + "plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],\n", + " cluster_simil.kmeans_.cluster_centers_[:, 0],\n", + " linestyle=\"\", color=\"black\", marker=\"X\", markersize=20,\n", + " label=\"Cluster centers\")\n", + "plt.legend(loc=\"upper right\")\n", + "save_fig(\"district_cluster_plot\")\n", + "plt.show()" ] }, { @@ -1067,165 +1567,287 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now let's build a pipeline for preprocessing the numerical attributes:" + "Now let's build a pipeline to preprocess the numerical attributes:" ] }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", - "from sklearn.preprocessing import StandardScaler\n", "\n", "num_pipeline = Pipeline([\n", - " ('imputer', SimpleImputer(strategy=\"median\")),\n", - " ('attribs_adder', CombinedAttributesAdder()),\n", - " ('std_scaler', StandardScaler()),\n", - " ])\n", - "\n", - "housing_num_tr = num_pipeline.fit_transform(housing_num)" + " (\"impute\", SimpleImputer(strategy=\"median\")),\n", + " (\"standardize\", StandardScaler()),\n", + "])" ] }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 102, "metadata": {}, "outputs": [], "source": [ - "housing_num_tr" + "from sklearn.pipeline import make_pipeline\n", + "\n", + "num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"), StandardScaler())" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import set_config\n", + "\n", + "set_config(display='diagram')\n", + "\n", + "num_pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "housing_num_prepared = num_pipeline.fit_transform(housing_num)\n", + "np.round(housing_num_prepared[:2], 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "def monkey_patch_get_signature_names_out():\n", + " \"\"\"Monkey patch some classes which did not handle get_feature_names_out()\n", + " correctly in 
1.0.0.\"\"\"\n", + " from inspect import Signature, signature, Parameter\n", + " import pandas as pd\n", + " from sklearn.impute import SimpleImputer\n", + " from sklearn.pipeline import make_pipeline, Pipeline\n", + " from sklearn.preprocessing import FunctionTransformer, StandardScaler\n", + "\n", + " default_get_feature_names_out = StandardScaler.get_feature_names_out\n", + "\n", + " if not hasattr(SimpleImputer, \"get_feature_names_out\"):\n", + " print(\"Monkey-patching SimpleImputer.get_feature_names_out()\")\n", + " SimpleImputer.get_feature_names_out = default_get_feature_names_out\n", + "\n", + " if not hasattr(FunctionTransformer, \"get_feature_names_out\"):\n", + " print(\"Monkey-patching FunctionTransformer.get_feature_names_out()\")\n", + " orig_init = FunctionTransformer.__init__\n", + " orig_sig = signature(orig_init)\n", + "\n", + " def __init__(*args, feature_names_out=None, **kwargs):\n", + " orig_sig.bind(*args, **kwargs)\n", + " orig_init(*args, **kwargs)\n", + " args[0].feature_names_out = feature_names_out\n", + "\n", + " __init__.__signature__ = Signature(\n", + " list(signature(orig_init).parameters.values()) + [\n", + " Parameter(\"feature_names_out\", Parameter.KEYWORD_ONLY)])\n", + "\n", + " def get_feature_names_out(self, names=None):\n", + " if self.feature_names_out is None:\n", + " return default_get_feature_names_out(self, names)\n", + " elif callable(self.feature_names_out):\n", + " return self.feature_names_out(names)\n", + " else:\n", + " return self.feature_names_out\n", + "\n", + " FunctionTransformer.__init__ = __init__\n", + " FunctionTransformer.get_feature_names_out = get_feature_names_out\n", + "\n", + " p = make_pipeline(SimpleImputer(), SimpleImputer())\n", + " p.fit_transform(pd.DataFrame({\"A\": [1., 2.], \"B\": [3., 4.]}))\n", + " if list(p.get_feature_names_out()) == [\"x0\", \"x1\"]:\n", + " print(\"Monkey-patching Pipeline.get_feature_names_out()\")\n", + " def get_feature_names_out(self, names=None):\n", + " names = default_get_feature_names_out(self, names)\n", + " for transformer in self:\n", + " names = transformer.get_feature_names_out(names)\n", + " return names\n", + "\n", + " Pipeline.get_feature_names_out = get_feature_names_out\n", + "\n", + "monkey_patch_get_signature_names_out()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "housing_num_prepared_df = pd.DataFrame(\n", + " housing_num_prepared, columns=num_pipeline.get_feature_names_out(),\n", + " index=housing_num.index)\n", + "housing_num_prepared_df.head(2) # Not in the book" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline.steps" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline[1]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline[:-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline.named_steps[\"simpleimputer\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "num_pipeline.set_params(simpleimputer__strategy=\"median\")" + ] + }, + { + "cell_type": "code", + "execution_count": 112, "metadata": {}, "outputs": [], "source": [ "from sklearn.compose import ColumnTransformer\n", "\n", - "num_attribs = list(housing_num)\n", + "num_attribs = 
[\"longitude\", \"latitude\", \"housing_median_age\", \"total_rooms\",\n", + " \"total_bedrooms\", \"population\", \"households\", \"median_income\"]\n", "cat_attribs = [\"ocean_proximity\"]\n", "\n", - "full_pipeline = ColumnTransformer([\n", - " (\"num\", num_pipeline, num_attribs),\n", - " (\"cat\", OneHotEncoder(), cat_attribs),\n", - " ])\n", + "cat_pipeline = make_pipeline(\n", + " SimpleImputer(strategy=\"most_frequent\"),\n", + " OneHotEncoder(handle_unknown=\"ignore\"))\n", "\n", - "housing_prepared = full_pipeline.fit_transform(housing)" + "preprocessing = ColumnTransformer([\n", + " (\"num\", num_pipeline, num_attribs),\n", + " (\"cat\", cat_pipeline, cat_attribs),\n", + "])" ] }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 113, "metadata": {}, "outputs": [], "source": [ - "housing_prepared" + "from sklearn.compose import make_column_selector, make_column_transformer\n", + "\n", + "preprocessing = make_column_transformer(\n", + " (num_pipeline, make_column_selector(dtype_include=np.number)),\n", + " (cat_pipeline, make_column_selector(dtype_include=np.object)),\n", + ")" ] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 114, "metadata": {}, "outputs": [], "source": [ + "housing_prepared = preprocessing.fit_transform(housing)" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "housing_prepared_fr = pd.DataFrame(housing_prepared,\n", + " columns=preprocessing.get_feature_names_out(),\n", + " index=housing.index)\n", + "housing_prepared_fr.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "def column_ratio(X):\n", + " return X[:, [0]] / X[:, [1]]\n", + "\n", + "def ratio_pipeline(name=None):\n", + " return make_pipeline(\n", + " SimpleImputer(strategy=\"median\"),\n", + " FunctionTransformer(column_ratio,\n", + " feature_names_out=[name]),\n", + " StandardScaler())\n", + "\n", + "log_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n", + " FunctionTransformer(np.log),\n", + " StandardScaler())\n", + "cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)\n", + "default_num_pipeline = make_pipeline(SimpleImputer(strategy=\"median\"),\n", + " StandardScaler())\n", + "preprocessing = ColumnTransformer([\n", + " (\"bedrooms_ratio\", ratio_pipeline(\"bedrooms_ratio\"),\n", + " [\"total_bedrooms\", \"total_rooms\"]),\n", + " (\"rooms_per_house\", ratio_pipeline(\"rooms_per_house\"),\n", + " [\"total_rooms\", \"households\"]),\n", + " (\"people_per_house\", ratio_pipeline(\"bedrooms_ratio\"),\n", + " [\"population\", \"households\"]),\n", + " (\"log\", log_pipeline, [\"total_bedrooms\", \"total_rooms\",\n", + " \"population\", \"households\", \"median_income\"]),\n", + " (\"geo\", cluster_simil, [\"latitude\", \"longitude\"]),\n", + " (\"cat\", cat_pipeline, make_column_selector(dtype_include=np.object)),\n", + " ],\n", + " remainder=default_num_pipeline) # one column remaining: housing_median_age" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [], + "source": [ + "housing_prepared = preprocessing.fit_transform(housing)\n", "housing_prepared.shape" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For reference, here is the old solution based on a `DataFrameSelector` transformer (to just select a subset of the Pandas `DataFrame` columns), and a `FeatureUnion`:" - ] - }, { "cell_type": 
"code", - "execution_count": 78, + "execution_count": 118, "metadata": {}, "outputs": [], "source": [ - "from sklearn.base import BaseEstimator, TransformerMixin\n", - "\n", - "# Create a class to select numerical or categorical columns \n", - "class OldDataFrameSelector(BaseEstimator, TransformerMixin):\n", - " def __init__(self, attribute_names):\n", - " self.attribute_names = attribute_names\n", - " def fit(self, X, y=None):\n", - " return self\n", - " def transform(self, X):\n", - " return X[self.attribute_names].values" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's join all these components into a big pipeline that will preprocess both the numerical and the categorical features:" - ] - }, - { - "cell_type": "code", - "execution_count": 79, - "metadata": {}, - "outputs": [], - "source": [ - "num_attribs = list(housing_num)\n", - "cat_attribs = [\"ocean_proximity\"]\n", - "\n", - "old_num_pipeline = Pipeline([\n", - " ('selector', OldDataFrameSelector(num_attribs)),\n", - " ('imputer', SimpleImputer(strategy=\"median\")),\n", - " ('attribs_adder', CombinedAttributesAdder()),\n", - " ('std_scaler', StandardScaler()),\n", - " ])\n", - "\n", - "old_cat_pipeline = Pipeline([\n", - " ('selector', OldDataFrameSelector(cat_attribs)),\n", - " ('cat_encoder', OneHotEncoder(sparse=False)),\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 80, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.pipeline import FeatureUnion\n", - "\n", - "old_full_pipeline = FeatureUnion(transformer_list=[\n", - " (\"num_pipeline\", old_num_pipeline),\n", - " (\"cat_pipeline\", old_cat_pipeline),\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [], - "source": [ - "old_housing_prepared = old_full_pipeline.fit_transform(housing)\n", - "old_housing_prepared" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The result is the same as with the `ColumnTransformer`:" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "np.allclose(housing_prepared, old_housing_prepared)" + "preprocessing.get_feature_names_out()" ] }, { @@ -1244,28 +1866,31 @@ }, { "cell_type": "code", - "execution_count": 83, + "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "from sklearn.linear_model import LinearRegression\n", "\n", - "lin_reg = LinearRegression()\n", - "lin_reg.fit(housing_prepared, housing_labels)" + "lin_reg = make_pipeline(preprocessing, LinearRegression())\n", + "lin_reg.fit(housing, housing_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try the full preprocessing pipeline on a few training instances:" ] }, { "cell_type": "code", - "execution_count": 84, + "execution_count": 120, "metadata": {}, "outputs": [], "source": [ - "# let's try the full preprocessing pipeline on a few training instances\n", - "some_data = housing.iloc[:5]\n", - "some_labels = housing_labels.iloc[:5]\n", - "some_data_prepared = full_pipeline.transform(some_data)\n", - "\n", - "print(\"Predictions:\", lin_reg.predict(some_data_prepared))" + "housing_predictions = lin_reg.predict(housing)\n", + "np.round(housing_predictions[:5], -2)" ] }, { @@ -1277,76 +1902,58 @@ }, { "cell_type": "code", - "execution_count": 85, + "execution_count": 121, "metadata": {}, "outputs": [], "source": [ - "print(\"Labels:\", list(some_labels))" + "housing_labels.iloc[:5].values" ] }, { "cell_type": "code", - 
"execution_count": 86, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ - "some_data_prepared" + "# Not in the book\n", + "error_ratios = np.round(housing_predictions[:5], -2) / housing_labels.iloc[:5].values - 1\n", + "print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))" ] }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "from sklearn.metrics import mean_squared_error\n", "\n", - "housing_predictions = lin_reg.predict(housing_prepared)\n", - "lin_mse = mean_squared_error(housing_labels, housing_predictions)\n", - "lin_rmse = np.sqrt(lin_mse)\n", + "lin_rmse = mean_squared_error(housing_labels, housing_predictions,\n", + " squared=False)\n", "lin_rmse" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note**: since Scikit-Learn 0.22, you can get the RMSE directly by calling the `mean_squared_error()` function with `squared=False`." - ] - }, { "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.metrics import mean_absolute_error\n", - "\n", - "lin_mae = mean_absolute_error(housing_labels, housing_predictions)\n", - "lin_mae" - ] - }, - { - "cell_type": "code", - "execution_count": 89, + "execution_count": 124, "metadata": {}, "outputs": [], "source": [ "from sklearn.tree import DecisionTreeRegressor\n", "\n", - "tree_reg = DecisionTreeRegressor(random_state=42)\n", - "tree_reg.fit(housing_prepared, housing_labels)" + "tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))\n", + "tree_reg.fit(housing, housing_labels)" ] }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 125, "metadata": {}, "outputs": [], "source": [ - "housing_predictions = tree_reg.predict(housing_prepared)\n", - "tree_mse = mean_squared_error(housing_labels, housing_predictions)\n", - "tree_rmse = np.sqrt(tree_mse)\n", + "housing_predictions = tree_reg.predict(housing)\n", + "tree_rmse = mean_squared_error(housing_labels, housing_predictions,\n", + " squared=False)\n", "tree_rmse" ] }, @@ -1359,114 +1966,87 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 126, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import cross_val_score\n", "\n", - "scores = cross_val_score(tree_reg, housing_prepared, housing_labels,\n", - " scoring=\"neg_mean_squared_error\", cv=10)\n", - "tree_rmse_scores = np.sqrt(-scores)" + "tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,\n", + " scoring=\"neg_root_mean_squared_error\", cv=10)" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 127, "metadata": {}, "outputs": [], "source": [ - "def display_scores(scores):\n", - " print(\"Scores:\", scores)\n", - " print(\"Mean:\", scores.mean())\n", - " print(\"Standard deviation:\", scores.std())\n", + "pd.Series(tree_rmses).describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 128, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n", + " scoring=\"neg_root_mean_squared_error\", cv=10)\n", + "pd.Series(lin_rmses).describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 129, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.ensemble import RandomForestRegressor\n", "\n", - "display_scores(tree_rmse_scores)" - ] - }, - { - "cell_type": "code", - "execution_count": 93, - "metadata": {}, - "outputs": [], - 
"source": [ - "lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,\n", - " scoring=\"neg_mean_squared_error\", cv=10)\n", - "lin_rmse_scores = np.sqrt(-lin_scores)\n", - "display_scores(lin_rmse_scores)" + "forest_reg = make_pipeline(preprocessing,\n", + " RandomForestRegressor(random_state=42))\n", + "forest_rmses = -cross_val_score(forest_reg, housing, housing_labels,\n", + " scoring=\"neg_root_mean_squared_error\", cv=10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Note**: we specify `n_estimators=100` to be future-proof since the default value is going to change to 100 in Scikit-Learn 0.22 (for simplicity, this is not shown in the book)." + "**Warning:** the following cell may take a few minutes to run:" ] }, { "cell_type": "code", - "execution_count": 94, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ - "from sklearn.ensemble import RandomForestRegressor\n", - "\n", - "forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)\n", - "forest_reg.fit(housing_prepared, housing_labels)" + "pd.Series(forest_rmses).describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Measure the RMSE on the training set:" ] }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 131, "metadata": {}, "outputs": [], "source": [ - "housing_predictions = forest_reg.predict(housing_prepared)\n", - "forest_mse = mean_squared_error(housing_labels, housing_predictions)\n", - "forest_rmse = np.sqrt(forest_mse)\n", + "forest_reg.fit(housing, housing_labels)\n", + "housing_predictions = forest_reg.predict(housing)\n", + "forest_rmse = mean_squared_error(housing_labels, housing_predictions,\n", + " squared=False)\n", "forest_rmse" ] }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.model_selection import cross_val_score\n", - "\n", - "forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels,\n", - " scoring=\"neg_mean_squared_error\", cv=10)\n", - "forest_rmse_scores = np.sqrt(-forest_scores)\n", - "display_scores(forest_rmse_scores)" - ] - }, - { - "cell_type": "code", - "execution_count": 97, - "metadata": {}, - "outputs": [], - "source": [ - "scores = cross_val_score(lin_reg, housing_prepared, housing_labels, scoring=\"neg_mean_squared_error\", cv=10)\n", - "pd.Series(np.sqrt(-scores)).describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [], - "source": [ - "from sklearn.svm import SVR\n", - "\n", - "svm_reg = SVR(kernel=\"linear\")\n", - "svm_reg.fit(housing_prepared, housing_labels)\n", - "housing_predictions = svm_reg.predict(housing_prepared)\n", - "svm_mse = mean_squared_error(housing_labels, housing_predictions)\n", - "svm_rmse = np.sqrt(svm_mse)\n", - "svm_rmse" - ] - }, { "cell_type": "markdown", "metadata": {}, @@ -1481,27 +2061,51 @@ "## Grid Search" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning:** the following cell make take a few minutes to run:" + ] + }, { "cell_type": "code", - "execution_count": 99, + "execution_count": 132, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", "\n", + "full_pipeline = Pipeline([\n", + " (\"preprocessing\", preprocessing),\n", + " (\"random_forest\", RandomForestRegressor(random_state=42)),\n", + "])\n", "param_grid = [\n", - " # try 12 (3×4) combinations of hyperparameters\n", - " {'n_estimators': [3, 10, 30], 'max_features': 
[2, 4, 6, 8]},\n", - " # then try 6 (2×3) combinations with bootstrap set as False\n", - " {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},\n", - " ]\n", - "\n", - "forest_reg = RandomForestRegressor(random_state=42)\n", - "# train across 5 folds, that's a total of (12+6)*5=90 rounds of training \n", - "grid_search = GridSearchCV(forest_reg, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error',\n", - " return_train_score=True)\n", - "grid_search.fit(housing_prepared, housing_labels)" + " {'preprocessing__geo__n_clusters': [5, 8, 10],\n", + " 'random_forest__max_features': [4, 6, 8]},\n", + " {'preprocessing__geo__n_clusters': [10, 15],\n", + " 'random_forest__max_features': [6, 8, 10]},\n", + "]\n", + "grid_search = GridSearchCV(full_pipeline, param_grid, cv=3,\n", + " scoring='neg_root_mean_squared_error')\n", + "grid_search.fit(housing, housing_labels)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can get the full list of hyperparameters available for tuning by looking at `full_pipeline.get_params().keys()`:" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "print(str(full_pipeline.get_params().keys())[:1000] + \"...\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1513,7 +2117,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 134, "metadata": {}, "outputs": [], "source": [ @@ -1522,7 +2126,7 @@ }, { "cell_type": "code", - "execution_count": 101, + "execution_count": 135, "metadata": {}, "outputs": [], "source": [ @@ -1538,22 +2142,19 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 136, "metadata": {}, "outputs": [], "source": [ - "cvres = grid_search.cv_results_\n", - "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", - " print(np.sqrt(-mean_score), params)" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [], - "source": [ - "pd.DataFrame(grid_search.cv_results_)" + "cv_res = pd.DataFrame(grid_search.cv_results_)\n", + "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n", + "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n", + " \"param_random_forest__max_features\", \"split0_test_score\",\n", + " \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n", + "score_cols = [\"split0\", \"split1\", \"split2\", \"mean_test_rmse\"]\n", + "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n", + "cv_res[score_cols] = -np.round(cv_res[score_cols]).astype(np.int64)\n", + "cv_res.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1563,35 +2164,195 @@ "## Randomized Search" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Warning:** the following cell may take a few minutes to run:" + ] + }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 137, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.experimental import enable_halving_search_cv\n", + "from sklearn.model_selection import HalvingRandomSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import RandomizedSearchCV\n", "from scipy.stats import randint\n", "\n", - "param_distribs = {\n", - " 'n_estimators': randint(low=1, high=200),\n", - " 'max_features': randint(low=1, high=8),\n", - " }\n", + "param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),\n", + " 'random_forest__max_features': 
randint(low=2, high=20)}\n", "\n", - "forest_reg = RandomForestRegressor(random_state=42)\n", - "rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,\n", - " n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)\n", - "rnd_search.fit(housing_prepared, housing_labels)" + "# try 10 random combinations of hyperparameters, each evaluated with 3-fold CV\n", + "rnd_search = RandomizedSearchCV(\n", + " full_pipeline, param_distributions=param_distribs, n_iter=10, cv=3,\n", + " scoring='neg_root_mean_squared_error', random_state=42)\n", + "\n", + "rnd_search.fit(housing, housing_labels)" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 139, "metadata": {}, "outputs": [], "source": [ - "cvres = rnd_search.cv_results_\n", - "for mean_score, params in zip(cvres[\"mean_test_score\"], cvres[\"params\"]):\n", - " print(np.sqrt(-mean_score), params)" + "# Not in the book\n", + "cv_res = pd.DataFrame(rnd_search.cv_results_)\n", + "cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n", + "cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n", + " \"param_random_forest__max_features\", \"split0_test_score\",\n", + " \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n", + "cv_res.columns = [\"n_clusters\", \"max_features\"] + score_cols\n", + "cv_res[score_cols] = -np.round(cv_res[score_cols]).astype(np.int64)\n", + "cv_res.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "**Bonus section: how to choose the sampling distribution for a hyperparameter**\n", + "\n", + "* `scipy.stats.randint(a, b+1)`: for hyperparameters with _discrete_ values that range from a to b, and all values in that range seem equally likely.\n", + "* `scipy.stats.uniform(a, b - a)`: this is very similar, but for _continuous_ hyperparameters (note that SciPy's `uniform` takes the lower bound and the width of the range, not the upper bound).\n", + "* `scipy.stats.geom(1 / scale)`: for discrete values, when you want to sample roughly in a given scale. E.g., with scale=1000 most samples will be in this ballpark, but ~10% of all samples will be <100 and ~10% will be >2300.\n", + "* `scipy.stats.expon(scale)`: this is the continuous equivalent of `geom`. Just set `scale` to the most likely value.\n", + "* `scipy.stats.reciprocal(a, b)`: when you have almost no idea what the optimal hyperparameter value's scale is. 
If you set a=0.01 and b=100, then you're just as likely to sample a value between 0.01 and 0.1 as a value between 10 and 100.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are plots of the probability mass functions (for discrete variables), and probability density functions (for continuous variables) for `randint()`, `uniform()`, `geom()` and `expon()`:" + ] + }, + { + "cell_type": "code", + "execution_count": 140, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Not in the book\n", + "from scipy.stats import randint, uniform, geom, expon\n", + "\n", + "xs1 = np.arange(0, 7 + 1)\n", + "randint_distrib = randint(0, 7 + 1).pmf(xs1)\n", + "\n", + "xs2 = np.linspace(0, 7, 500)\n", + "uniform_distrib = uniform(0, 7).pdf(xs2)\n", + "\n", + "xs3 = np.arange(0, 7 + 1)\n", + "geom_distrib = geom(0.5).pmf(xs3)\n", + "\n", + "xs4 = np.linspace(0, 7, 500)\n", + "expon_distrib = expon(scale=1).pdf(xs4)\n", + "\n", + "plt.figure(figsize=(12, 7))\n", + "\n", + "plt.subplot(2, 2, 1)\n", + "plt.bar(xs1, randint_distrib, label=\"scipy.randint(0, 7 + 1)\")\n", + "plt.ylabel(\"Probability\")\n", + "plt.legend()\n", + "plt.axis([-1, 8, 0, 0.2])\n", + "\n", + "plt.subplot(2, 2, 2)\n", + "plt.fill_between(xs2, uniform_distrib, label=\"scipy.uniform(0, 7)\")\n", + "plt.ylabel(\"PDF\")\n", + "plt.legend()\n", + "plt.axis([-1, 8, 0, 0.2])\n", + "\n", + "plt.subplot(2, 2, 3)\n", + "plt.bar(xs3, geom_distrib, label=\"scipy.geom(0.5)\")\n", + "plt.xlabel(\"Hyperparameter value\")\n", + "plt.ylabel(\"Probability\")\n", + "plt.legend()\n", + "plt.axis([0, 7, 0, 1])\n", + "\n", + "plt.subplot(2, 2, 4)\n", + "plt.fill_between(xs4, expon_distrib, label=\"scipy.expon(scale=1)\")\n", + "plt.xlabel(\"Hyperparameter value\")\n", + "plt.ylabel(\"PDF\")\n", + "plt.legend()\n", + "plt.axis([0, 7, 0, 1])\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here are the PDF for `expon()` and `reciprocal()` (left column), as well as the PDF of log(X) (right column). The right column shows the distribution of hyperparameter _scales_. You can see that `expon()` favors hyperparameters with roughly the desired scale, with a longer tail towards the smaller scales. 
But `reciprocal()` does not favor any scale, they are all equally likely:" ] }, { "cell_type": "code", "execution_count": 141, "metadata": { "tags": [] }, "outputs": [], "source": [ "# Not in the book\n", "from scipy.stats import reciprocal\n", "\n", "xs1 = np.linspace(0, 7, 500)\n", "expon_distrib = expon(scale=1).pdf(xs1)\n", "\n", "log_xs2 = np.linspace(-5, 3, 500)\n", "log_expon_distrib = np.exp(log_xs2 - np.exp(log_xs2))\n", "\n", "xs3 = np.linspace(0.001, 1000, 500)\n", "reciprocal_distrib = reciprocal(0.001, 1000).pdf(xs3)\n", "\n", "log_xs4 = np.linspace(np.log(0.001), np.log(1000), 500)\n", "log_reciprocal_distrib = uniform(np.log(0.001), np.log(1000) - np.log(0.001)).pdf(log_xs4)\n", "\n", "plt.figure(figsize=(12, 7))\n", "\n", "plt.subplot(2, 2, 1)\n", "plt.fill_between(xs1, expon_distrib,\n", " label=\"scipy.expon(scale=1)\")\n", "plt.ylabel(\"PDF\")\n", "plt.legend()\n", "plt.axis([0, 7, 0, 1])\n", "\n", "plt.subplot(2, 2, 2)\n", "plt.fill_between(log_xs2, log_expon_distrib,\n", " label=\"log(X) with X ~ expon\")\n", "plt.legend()\n", "plt.axis([-5, 3, 0, 1])\n", "\n", "plt.subplot(2, 2, 3)\n", "plt.fill_between(xs3, reciprocal_distrib,\n", " label=\"scipy.reciprocal(0.001, 1000)\")\n", "plt.xlabel(\"Hyperparameter value\")\n", "plt.ylabel(\"PDF\")\n", "plt.legend()\n", "plt.axis([0.001, 1000, 0, 0.005])\n", "\n", "plt.subplot(2, 2, 4)\n", "plt.fill_between(log_xs4, log_reciprocal_distrib,\n", " label=\"log(X) with X ~ reciprocal\")\n", "plt.xlabel(\"Log of hyperparameter value\")\n", "plt.legend()\n", "plt.axis([-8, 1, 0, 0.2])\n", "\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1603,26 +2364,24 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 142, "metadata": {}, "outputs": [], "source": [ - "feature_importances = grid_search.best_estimator_.feature_importances_\n", - "feature_importances" + "final_model = rnd_search.best_estimator_\n", + "feature_importances = final_model[\"random_forest\"].feature_importances_\n", + "np.round(feature_importances, 2)" ] }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 143, "metadata": {}, "outputs": [], "source": [ - "extra_attribs = [\"rooms_per_hhold\", \"pop_per_hhold\", \"bedrooms_per_room\"]\n", - "#cat_encoder = cat_pipeline.named_steps[\"cat_encoder\"] # old solution\n", - "cat_encoder = full_pipeline.named_transformers_[\"cat\"]\n", - "cat_one_hot_attribs = list(cat_encoder.categories_[0])\n", - "attributes = num_attribs + extra_attribs + cat_one_hot_attribs\n", - "sorted(zip(feature_importances, attributes), reverse=True)" + "sorted(zip(feature_importances,\n", + " final_model[\"preprocessing\"].get_feature_names_out()),\n", + " reverse=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1634,25 +2393,21 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 144, "metadata": {}, "outputs": [], "source": [ - "final_model = grid_search.best_estimator_\n", - "\n", "X_test = strat_test_set.drop(\"median_house_value\", axis=1)\n", "y_test = strat_test_set[\"median_house_value\"].copy()\n", "\n", - "X_test_prepared = full_pipeline.transform(X_test)\n", - "final_predictions = final_model.predict(X_test_prepared)\n", + "final_predictions = final_model.predict(X_test)\n", "\n", - "final_mse = mean_squared_error(y_test, final_predictions)\n", - "final_rmse = np.sqrt(final_mse)" + "final_rmse = mean_squared_error(y_test, final_predictions, squared=False)" ] }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 
145, "metadata": {}, "outputs": [], "source": [ @@ -1668,7 +2423,7 @@ }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 146, "metadata": {}, "outputs": [], "source": [ @@ -1690,10 +2445,11 @@ }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 147, "metadata": {}, "outputs": [], "source": [ + "# Not in the book\n", "m = len(squared_errors)\n", "mean = squared_errors.mean()\n", "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n", "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", "np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Alternatively, we could use a z-scores rather than t-scores:" + "Alternatively, we could use z-scores rather than t-scores—since the test set is not too small, it won't make a big difference:" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 148, "metadata": {}, "outputs": [], "source": [ + "# Not in the book\n", "zscore = stats.norm.ppf((1 + confidence) / 2)\n", "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n", "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extra material" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## A full pipeline with both preparation and prediction" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [], - "source": [ - "full_pipeline_with_predictor = Pipeline([\n", - " (\"preparation\", full_pipeline),\n", - " (\"linear\", LinearRegression())\n", - " ])\n", - "\n", - "full_pipeline_with_predictor.fit(housing, housing_labels)\n", - "full_pipeline_with_predictor.predict(some_data)" - ] - }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -1756,46 +2484,67 @@ ] }, { - "cell_type": "code", - "execution_count": 114, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "my_model = full_pipeline_with_predictor" + "Save the final model:" ] }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 149, "metadata": {}, "outputs": [], "source": [ "import joblib\n", - "joblib.dump(my_model, \"my_model.pkl\") # DIFF\n", - "#...\n", - "my_model_loaded = joblib.load(\"my_model.pkl\") # DIFF" + "\n", + "joblib.dump(final_model, \"my_california_housing_model.pkl\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Example SciPy distributions for `RandomizedSearchCV`" + "Now you can deploy this model to production, load it in your scripts and use it to make predictions:" ] }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 150, "metadata": {}, "outputs": [], "source": [ - "from scipy.stats import geom, expon\n", - "geom_distrib=geom(0.5).rvs(10000, random_state=42)\n", - "expon_distrib=expon(scale=1).rvs(10000, random_state=42)\n", - "plt.hist(geom_distrib, bins=50)\n", - "plt.show()\n", - "plt.hist(expon_distrib, bins=50)\n", - "plt.show()" + "import joblib\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.metrics.pairwise import rbf_kernel\n", + "\n", + "#class ClusterSimilarity(BaseEstimator, TransformerMixin):\n", + "# [...]\n", + "\n", + "def column_ratio(X):\n", + " return X[:, [0]] / X[:, [1]]\n", + "\n", + "final_model_reloaded = joblib.load(\"my_california_housing_model.pkl\")\n", + "\n", + "new_data = housing.iloc[:5] # pretend these are new districts\n", + "predictions = final_model_reloaded.predict(new_data)" ] }, { "cell_type": "code", "execution_count": 151, "metadata": 
{}, + "outputs": [], + "source": [ + "predictions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Also works with pickle, but joblib is more efficient." ] }, { @@ -1816,63 +2565,59 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Question: Try a Support Vector Machine regressor (`sklearn.svm.SVR`), with various hyperparameters such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Don't worry about what these hyperparameters mean for now. How does the best `SVR` predictor perform?" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the following cell may take close to 30 minutes to run, or more depending on your hardware." + "_Try a Support Vector Machine regressor (`sklearn.svm.SVR`) with various hyperparameters, such as `kernel=\"linear\"` (with various values for the `C` hyperparameter) or `kernel=\"rbf\"` (with various values for the `C` and `gamma` hyperparameters). Note that SVMs don't scale well to large datasets, so you should probably train your model on just the first 5,000 instances of the training set and use only 3-fold cross-validation, or else it will take hours. Don't worry about what the hyperparameters mean for now (see the SVM notebook if you're interested). How does the best `SVR` predictor perform?_" ] }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.svm import SVR\n", "\n", "param_grid = [\n", - " {'kernel': ['linear'], 'C': [10., 30., 100., 300., 1000., 3000., 10000., 30000.0]},\n", - " {'kernel': ['rbf'], 'C': [1.0, 3.0, 10., 30., 100., 300., 1000.0],\n", - " 'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n", + " {'svr__kernel': ['linear'], 'svr__C': [10., 30., 100., 300., 1000.,\n", + " 3000., 10000., 30000.0]},\n", + " {'svr__kernel': ['rbf'], 'svr__C': [1.0, 3.0, 10., 30., 100., 300.,\n", + " 1000.0],\n", + " 'svr__gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0]},\n", " ]\n", "\n", - "svm_reg = SVR()\n", - "grid_search = GridSearchCV(svm_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=2)\n", - "grid_search.fit(housing_prepared, housing_labels)" + "svr_pipeline = Pipeline([(\"preprocessing\", preprocessing), (\"svr\", SVR())])\n", + "grid_search = GridSearchCV(svr_pipeline, param_grid, cv=3,\n", + " scoring='neg_root_mean_squared_error')\n", + "grid_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The best model achieves the following score (evaluated using 5-fold cross validation):" + "The best model achieves the following score (evaluated using 3-fold cross validation):" ] }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 153, "metadata": {}, "outputs": [], "source": [ - "negative_mse = grid_search.best_score_\n", - "rmse = np.sqrt(-negative_mse)\n", - "rmse" + "svr_grid_search_rmse = -grid_search.best_score_\n", + "svr_grid_search_rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's much worse than the `RandomForestRegressor`. Let's check the best hyperparameters found:" + "That's much worse than the `RandomForestRegressor` (but to be fair, we trained the model on much less data). 
Let's check the best hyperparameters found:" ] }, { "cell_type": "code", - "execution_count": 119, + "execution_count": 154, "metadata": {}, "outputs": [], "source": [ @@ -1897,19 +2642,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Question: Try replacing `GridSearchCV` with `RandomizedSearchCV`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the following cell may take close to 45 minutes to run, or more depending on your hardware." + "_Try replacing the `GridSearchCV` with a `RandomizedSearchCV`._" ] }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 155, "metadata": {}, "outputs": [], "source": [ @@ -1921,46 +2659,46 @@ "\n", "# Note: gamma is ignored when kernel is \"linear\"\n", "param_distribs = {\n", - " 'kernel': ['linear', 'rbf'],\n", - " 'C': reciprocal(20, 200000),\n", - " 'gamma': expon(scale=1.0),\n", + " 'svr__kernel': ['linear', 'rbf'],\n", + " 'svr__C': reciprocal(20, 200_000),\n", + " 'svr__gamma': expon(scale=1.0),\n", " }\n", "\n", - "svm_reg = SVR()\n", - "rnd_search = RandomizedSearchCV(svm_reg, param_distributions=param_distribs,\n", - " n_iter=50, cv=5, scoring='neg_mean_squared_error',\n", - " verbose=2, random_state=42)\n", - "rnd_search.fit(housing_prepared, housing_labels)" + "rnd_search = RandomizedSearchCV(svr_pipeline,\n", + " param_distributions=param_distribs,\n", + " n_iter=50, cv=3,\n", + " scoring='neg_root_mean_squared_error',\n", + " random_state=42)\n", + "rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The best model achieves the following score (evaluated using 5-fold cross validation):" + "The best model achieves the following score (evaluated using 3-fold cross validation):" ] }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 156, "metadata": {}, "outputs": [], "source": [ - "negative_mse = rnd_search.best_score_\n", - "rmse = np.sqrt(-negative_mse)\n", - "rmse" + "svr_rnd_search_rmse = -rnd_search.best_score_\n", + "svr_rnd_search_rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now this is much closer to the performance of the `RandomForestRegressor` (but not quite there yet). Let's check the best hyperparameters found:" + "Now that's really much better, but still far from the `RandomForestRegressor`'s performance. Let's check the best hyperparameters found:" ] }, { "cell_type": "code", - "execution_count": 122, + "execution_count": 157, "metadata": {}, "outputs": [], "source": [ @@ -1978,57 +2716,26 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Let's look at the exponential distribution we used, with `scale=1.0`. Note that some samples are much larger or smaller than 1.0, but when you look at the log of the distribution, you can see that most values are actually concentrated roughly in the range of exp(-2) to exp(+2), which is about 0.1 to 7.4." 
+ "Note that we used the `expon()` distribution for `gamma`, with a scale of 1, so the random search mostly searched for values roughly of that scale: about 80% of the samples were between 0.1 and 2.3 (roughly 10% were smaller and 10% were larger):" ] }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 158, "metadata": {}, "outputs": [], "source": [ - "expon_distrib = expon(scale=1.)\n", - "samples = expon_distrib.rvs(10000, random_state=42)\n", - "plt.figure(figsize=(10, 4))\n", - "plt.subplot(121)\n", - "plt.title(\"Exponential distribution (scale=1.0)\")\n", - "plt.hist(samples, bins=50)\n", - "plt.subplot(122)\n", - "plt.title(\"Log of this distribution\")\n", - "plt.hist(np.log(samples), bins=50)\n", - "plt.show()" + "np.random.seed(42)\n", + "\n", + "s = expon(scale=1).rvs(100_000) # get 100,000 samples\n", + "np.sum((s > 0.105) & (s < 2.29)) / 100_000" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The distribution we used for `C` looks quite different: the scale of the samples is picked from a uniform distribution within a given range, which is why the right graph, which represents the log of the samples, looks roughly constant. This distribution is useful when you don't have a clue of what the target scale is:" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [], - "source": [ - "reciprocal_distrib = reciprocal(20, 200000)\n", - "samples = reciprocal_distrib.rvs(10000, random_state=42)\n", - "plt.figure(figsize=(10, 4))\n", - "plt.subplot(121)\n", - "plt.title(\"Reciprocal distribution (scale=1.0)\")\n", - "plt.hist(samples, bins=50)\n", - "plt.subplot(122)\n", - "plt.title(\"Log of this distribution\")\n", - "plt.hist(np.log(samples), bins=50)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The reciprocal distribution is useful when you have no idea what the scale of the hyperparameter should be (indeed, as you can see on the figure on the right, all scales are equally likely, within the given range), whereas the exponential distribution is best when you know (more or less) what the scale of the hyperparameter should be." + "We used the `reciprocal()` distribution for `C`, meaning we did not have a clue what the optimal scale of `C` was before running the random search. It explored the range from 20 to 200 just as much as the range from 2,000 to 20,000 or from 20,000 to 200,000." ] },
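 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Not in the book: as a quick sanity check of that claim, each factor-of-10 range should receive roughly the same share of the samples (about 25%, since 20 to 200,000 spans four decades):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "np.random.seed(42)\n", + "\n", + "s = reciprocal(20, 200_000).rvs(100_000) # get 100,000 samples\n", + "(np.sum((s >= 20) & (s < 200)) / 100_000,\n", + " np.sum((s >= 2_000) & (s < 20_000)) / 100_000,\n", + " np.sum((s >= 20_000) & (s <= 200_000)) / 100_000)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -2042,161 +2749,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Question: Try adding a transformer in the preparation pipeline to select only the most important attributes."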
+ "_Try adding a `SelectFromModel` transformer in the preparation pipeline to select only the most important attributes._" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create a new pipeline that runs the previously defined preparation pipeline, and adds a `SelectFromModel` transformer based on a `RandomForestRegressor` before the final regressor:" ] }, { "cell_type": "code", - "execution_count": 125, + "execution_count": 159, "metadata": {}, "outputs": [], "source": [ - "from sklearn.base import BaseEstimator, TransformerMixin\n", + "from sklearn.feature_selection import SelectFromModel\n", "\n", - "def indices_of_top_k(arr, k):\n", - " return np.sort(np.argpartition(np.array(arr), -k)[-k:])\n", - "\n", - "class TopFeatureSelector(BaseEstimator, TransformerMixin):\n", - " def __init__(self, feature_importances, k):\n", - " self.feature_importances = feature_importances\n", - " self.k = k\n", - " def fit(self, X, y=None):\n", - " self.feature_indices_ = indices_of_top_k(self.feature_importances, self.k)\n", - " return self\n", - " def transform(self, X):\n", - " return X[:, self.feature_indices_]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Note: this feature selector assumes that you have already computed the feature importances somehow (for example using a `RandomForestRegressor`). You may be tempted to compute them directly in the `TopFeatureSelector`'s `fit()` method, however this would likely slow down grid/randomized search since the feature importances would have to be computed for every hyperparameter combination (unless you implement some sort of cache)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's define the number of top features we want to keep:" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": {}, - "outputs": [], - "source": [ - "k = 5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's look for the indices of the top k features:" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "metadata": {}, - "outputs": [], - "source": [ - "top_k_feature_indices = indices_of_top_k(feature_importances, k)\n", - "top_k_feature_indices" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "metadata": {}, - "outputs": [], - "source": [ - "np.array(attributes)[top_k_feature_indices]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's double check that these are indeed the top k features:" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "metadata": {}, - "outputs": [], - "source": [ - "sorted(zip(feature_importances, attributes), reverse=True)[:k]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Looking good... 
Now let's create a new pipeline that runs the previously defined preparation pipeline, and adds top k feature selection:" ] }, { "cell_type": "code", - "execution_count": 130, - "metadata": {}, - "outputs": [], - "source": [ - "preparation_and_feature_selection_pipeline = Pipeline([\n", - " ('preparation', full_pipeline),\n", - " ('feature_selection', TopFeatureSelector(feature_importances, k))\n", + "selector_pipeline = Pipeline([\n", + " ('preprocessing', preprocessing),\n", + " ('selector', SelectFromModel(RandomForestRegressor(random_state=42),\n", + " threshold=0.005)), # min feature importance\n", + " ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n", + " gamma=rnd_search.best_params_[\"svr__gamma\"],\n", + " kernel=rnd_search.best_params_[\"svr__kernel\"])),\n", "])" ] }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 160, "metadata": {}, "outputs": [], "source": [ - "housing_prepared_top_k_features = preparation_and_feature_selection_pipeline.fit_transform(housing)" + "selector_rmses = -cross_val_score(selector_pipeline,\n", + " housing.iloc[:5000],\n", + " housing_labels.iloc[:5000],\n", + " scoring=\"neg_root_mean_squared_error\",\n", + " cv=3)\n", + "pd.Series(selector_rmses).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's look at the features of the first 3 instances:" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "metadata": {}, - "outputs": [], - "source": [ - "housing_prepared_top_k_features[0:3]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's double check that these are indeed the top k features:" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "metadata": {}, - "outputs": [], - "source": [ - "housing_prepared[0:3, top_k_feature_indices]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Works great! :)" + "Oh well, feature selection does not seem to help. But maybe that's just because the threshold we used was not optimal. Perhaps try tuning it using random search or grid search?" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -2210,56 +2809,161 @@ }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Question: Try creating a single pipeline that does the full data preparation plus the final prediction." + "_Try creating a custom transformer that trains a k-Nearest Neighbors regressor (`sklearn.neighbors.KNeighborsRegressor`) in its `fit()` method, and outputs the model's predictions in its `transform()` method. Then add this feature to the preprocessing pipeline, using latitude and longitude as the inputs to this transformer. This will add a feature in the model that corresponds to the housing median price of the nearest districts._" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "Rather than restrict ourselves to k-Nearest Neighbors regressors, let's create a transformer that accepts any regressor. For this, we can extend the `MetaEstimatorMixin` and have a required `estimator` argument in the constructor. The `fit()` method must work on a clone of this estimator, and it must also save `feature_names_in_`. The `MetaEstimatorMixin` will ensure that `estimator` is listed as a required parameter, and it will update `get_params()` and `set_params()` to make the estimator's hyperparameters available for tuning. 
Lastly, we create a `get_feature_names_out()` method: the output column name is the estimator's class name in lower case, followed by `_prediction_` and the output's index." ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [], "source": [ "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.base import MetaEstimatorMixin, clone\n", "\n", "class FeatureFromRegressor(MetaEstimatorMixin, BaseEstimator, TransformerMixin):\n", " def __init__(self, estimator):\n", " self.estimator = estimator\n", "\n", " def fit(self, X, y=None):\n", " estimator_ = clone(self.estimator)\n", " estimator_.fit(X, y)\n", " self.estimator_ = estimator_\n", " self.n_features_in_ = self.estimator_.n_features_in_\n", " if hasattr(self.estimator_, \"feature_names_in_\"):\n", " self.feature_names_in_ = self.estimator_.feature_names_in_\n", " return self # always return self!\n", " \n", " def transform(self, X):\n", " check_is_fitted(self)\n", " predictions = self.estimator_.predict(X)\n", " if predictions.ndim == 1:\n", " predictions = predictions.reshape(-1, 1)\n", " return predictions\n", "\n", " def get_feature_names_out(self, names=None):\n", " check_is_fitted(self)\n", " n_outputs = getattr(self.estimator_, \"n_outputs_\", 1)\n", " estimator_class_name = self.estimator_.__class__.__name__\n", " estimator_short_name = estimator_class_name.lower().replace(\"_\", \"\")\n", " return [f\"{estimator_short_name}_prediction_{i}\"\n", " for i in range(n_outputs)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's ensure it complies with Scikit-Learn's API:" ] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "check_estimator(FeatureFromRegressor(KNeighborsRegressor()))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Good! Now let's test it:" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "knn_reg = KNeighborsRegressor(n_neighbors=3, weights=\"distance\")\n", "knn_transformer = FeatureFromRegressor(knn_reg)\n", "geo_features = housing[[\"latitude\", \"longitude\"]]\n", "knn_transformer.fit_transform(geo_features, housing_labels)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "And what does its output feature name look like?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 164, + "metadata": {}, + "outputs": [], + "source": [ + "knn_transformer.get_feature_names_out()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Okay, now let's include this transformer in our preprocessing pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.base import clone\n", + "\n", + "transformers = [(name, clone(transformer), columns)\n", + " for name, transformer, columns in preprocessing.transformers]\n", + "geo_index = [name for name, _, _ in transformers].index(\"geo\")\n", + "transformers[geo_index] = (\"geo\", knn_transformer, [\"latitude\", \"longitude\"])\n", + "\n", + "new_geo_preprocessing = ColumnTransformer(transformers)" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "new_geo_pipeline = Pipeline([\n", + " ('preprocessing', new_geo_preprocessing),\n", + " ('svr', SVR(C=rnd_search.best_params_[\"svr__C\"],\n", + " gamma=rnd_search.best_params_[\"svr__gamma\"],\n", + " kernel=rnd_search.best_params_[\"svr__kernel\"])),\n", "])" ] }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 167, "metadata": {}, "outputs": [], "source": [ - "prepare_select_and_predict_pipeline.fit(housing, housing_labels)" + "new_pipe_rmses = -cross_val_score(new_geo_pipeline,\n", + " housing.iloc[:5000],\n", + " housing_labels.iloc[:5000],\n", + " scoring=\"neg_root_mean_squared_error\",\n", + " cv=3)\n", + "pd.Series(new_pipe_rmses).describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's try the full pipeline on a few instances:" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [], - "source": [ - "some_data = housing.iloc[:4]\n", - "some_labels = housing_labels.iloc[:4]\n", - "\n", - "print(\"Predictions:\\t\", prepare_select_and_predict_pipeline.predict(some_data))\n", - "print(\"Labels:\\t\\t\", list(some_labels))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Well, the full pipeline seems to work fine. Of course, the predictions are not fantastic: they would be better if we used the best `RandomForestRegressor` that we found earlier, rather than the best `SVR`." + "Yikes, that's terrible! Apparently the cluster similarity features were much better. But perhaps we should tune the `KNeighborsRegressor`'s hyperparameters? That's what the next exercise is about." ] }, { @@ -2273,55 +2977,53 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Question: Automatically explore some preparation options using `GridSearchCV`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Warning**: the following cell may take close to 45 minutes to run, or more depending on your hardware." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** In the code below, I've set the `OneHotEncoder`'s `handle_unknown` hyperparameter to `'ignore'`, to avoid warnings during training. Without this, the `OneHotEncoder` would default to `handle_unknown='error'`, meaning that it would raise an error when transforming any data containing a category it didn't see during training. If we kept the default, then the `GridSearchCV` would run into errors during training when evaluating the folds in which not all the categories are in the training set. 
This is likely to happen since there's only one sample in the `'ISLAND'` category, and it may end up in the test set in some of the folds. So some folds would just be dropped by the `GridSearchCV`, and it's best to avoid that." + "_Automatically explore some preparation options using `RandomizedSearchCV`._" ] }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 168, "metadata": {}, "outputs": [], "source": [ - "full_pipeline.named_transformers_[\"cat\"].handle_unknown = 'ignore'\n", + "param_distribs = {\n", + " \"preprocessing__geo__estimator__n_neighbors\": range(1, 30),\n", + " \"preprocessing__geo__estimator__weights\": [\"distance\", \"uniform\"],\n", + " \"svr__C\": reciprocal(20, 200_000),\n", + " \"svr__gamma\": expon(scale=1.0),\n", + "}\n", "\n", - "param_grid = [{\n", - " 'preparation__num__imputer__strategy': ['mean', 'median', 'most_frequent'],\n", - " 'feature_selection__k': list(range(1, len(feature_importances) + 1))\n", - "}]\n", - "\n", - "grid_search_prep = GridSearchCV(prepare_select_and_predict_pipeline, param_grid, cv=5,\n", - " scoring='neg_mean_squared_error', verbose=2)\n", - "grid_search_prep.fit(housing, housing_labels)" + "new_geo_rnd_search = RandomizedSearchCV(new_geo_pipeline,\n", + " param_distributions=param_distribs,\n", + " n_iter=50,\n", + " cv=3,\n", + " scoring='neg_root_mean_squared_error',\n", + " random_state=42)\n", + "new_geo_rnd_search.fit(housing.iloc[:5000], housing_labels.iloc[:5000])" ] }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 169, "metadata": {}, "outputs": [], "source": [ - "grid_search_prep.best_params_" + "new_geo_rnd_search_rmse = -new_geo_rnd_search.best_score_\n", + "new_geo_rnd_search_rmse" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The best imputer strategy is `most_frequent` and apparently almost all features are useful (15 out of 16). The last one (`ISLAND`) seems to just add some noise." + "Oh well... at least we tried! It looks like the cluster similarity features are definitely better than the KNN feature. But perhaps you could try having both?" ] },
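 + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Not in the book: if you want to try having both, one possible sketch is to wrap the cluster similarity transformer and the KNN-based feature in a `FeatureUnion` applied to the geo columns (the hyperparameter values below are simply the ones used earlier in this notebook, not tuned):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Not in the book\n", + "from sklearn.pipeline import FeatureUnion\n", + "\n", + "# apply both geo transformers to latitude & longitude and concatenate their outputs\n", + "both_geo = FeatureUnion([\n", + " (\"cluster_simil\", ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)),\n", + " (\"knn\", FeatureFromRegressor(KNeighborsRegressor(n_neighbors=3,\n", + " weights=\"distance\"))),\n", + "])\n", + "\n", + "transformers = [(name, clone(transformer), columns)\n", + " for name, transformer, columns in preprocessing.transformers]\n", + "geo_index = [name for name, _, _ in transformers].index(\"geo\")\n", + "transformers[geo_index] = (\"geo\", both_geo, [\"latitude\", \"longitude\"])\n", + "\n", + "both_geo_pipeline = Pipeline([\n", + " (\"preprocessing\", ColumnTransformer(transformers)),\n", + " (\"svr\", SVR(C=rnd_search.best_params_[\"svr__C\"],\n", + " gamma=rnd_search.best_params_[\"svr__gamma\"],\n", + " kernel=rnd_search.best_params_[\"svr__kernel\"])),\n", + "])\n", + "both_geo_rmses = -cross_val_score(both_geo_pipeline,\n", + " housing.iloc[:5000],\n", + " housing_labels.iloc[:5000],\n", + " scoring=\"neg_root_mean_squared_error\",\n", + " cv=3)\n", + "pd.Series(both_geo_rmses).describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's all for today! 😀" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ @@ -2334,7 +3036,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" },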