Sync notebook with book's code examples, and better identify extra code

Aurélien Geron
2022-02-19 18:17:36 +13:00
parent 1c2421fc88
commit b63019fd28
9 changed files with 318 additions and 301 deletions

@@ -177,7 +177,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book code to save the figures as high-res PNGs for the book\n",
"# extra code code to save the figures as high-res PNGs for the book\n",
"\n",
"IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
@@ -197,7 +197,7 @@
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# not in the book the next 5 lines define the default font sizes\n",
"# extra code the next 5 lines define the default font sizes\n",
"plt.rc('font', size=14)\n",
"plt.rc('axes', labelsize=14, titlesize=14)\n",
"plt.rc('legend', fontsize=14)\n",
@@ -205,7 +205,7 @@
"plt.rc('ytick', labelsize=10)\n",
"\n",
"housing.hist(bins=50, figsize=(12, 8))\n",
"save_fig(\"attribute_histogram_plots\") # not in the book\n",
"save_fig(\"attribute_histogram_plots\") # extra code\n",
"plt.show()"
]
},
@@ -351,7 +351,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows how to compute the 10.7% proba of getting a bad sample\n",
"# extra code shows how to compute the 10.7% proba of getting a bad sample\n",
"\n",
"from scipy.stats import binom\n",
"\n",
@@ -375,7 +375,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows another way to estimate the probability of bad sample\n",
"# extra code shows another way to estimate the probability of bad sample\n",
"\n",
"np.random.seed(42)\n",
"\n",
@@ -403,7 +403,7 @@
"housing[\"income_cat\"].value_counts().sort_index().plot.bar(rot=0, grid=True)\n",
"plt.xlabel(\"Income category\")\n",
"plt.ylabel(\"Number of districts\")\n",
"save_fig(\"housing_income_cat_bar_plot\") # not in the book\n",
"save_fig(\"housing_income_cat_bar_plot\") # extra code\n",
"plt.show()"
]
},
@@ -464,7 +464,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the data for Figure 210\n",
"# extra code computes the data for Figure 210\n",
"\n",
"def income_cat_proportions(data):\n",
" return data[\"income_cat\"].value_counts() / len(data)\n",
@@ -524,7 +524,7 @@
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True)\n",
"save_fig(\"bad_visualization_plot\") # not in the book\n",
"save_fig(\"bad_visualization_plot\") # extra code\n",
"plt.show()"
]
},
@@ -535,7 +535,7 @@
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True, alpha=0.2)\n",
"save_fig(\"better_visualization_plot\") # not in the book\n",
"save_fig(\"better_visualization_plot\") # extra code\n",
"plt.show()"
]
},
@@ -549,7 +549,7 @@
" s=housing[\"population\"] / 100, label=\"population\",\n",
" c=\"median_house_value\", cmap=\"jet\", colorbar=True,\n",
" legend=True, sharex=False, figsize=(10, 7))\n",
"save_fig(\"housing_prices_scatterplot\") # not in the book\n",
"save_fig(\"housing_prices_scatterplot\") # extra code\n",
"plt.show()"
]
},
@@ -573,7 +573,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates the first figure in the chapter\n",
"# extra code this cell generates the first figure in the chapter\n",
"\n",
"# Download the California image\n",
"filename = \"california.png\"\n",
@@ -638,7 +638,7 @@
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
" \"housing_median_age\"]\n",
"scatter_matrix(housing[attributes], figsize=(12, 8))\n",
"save_fig(\"scatter_matrix_plot\") # not in the book\n",
"save_fig(\"scatter_matrix_plot\") # extra code\n",
"plt.show()"
]
},
@@ -650,7 +650,7 @@
"source": [
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
" alpha=0.1, grid=True)\n",
"save_fig(\"income_vs_house_value_scatterplot\") # not in the book\n",
"save_fig(\"income_vs_house_value_scatterplot\") # extra code\n",
"plt.show()"
]
},
@@ -1195,7 +1195,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 217\n",
"# extra code this cell generates Figure 217\n",
"fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)\n",
"housing[\"population\"].hist(ax=axs[0], bins=50)\n",
"housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n",
@@ -1219,7 +1219,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code just shows that we get a uniform distribution\n",
"# extra code just shows that we get a uniform distribution\n",
"percentiles = [np.percentile(housing[\"median_income\"], p)\n",
" for p in range(1, 100)]\n",
"flattened_median_income = pd.cut(housing[\"median_income\"],\n",
@@ -1251,7 +1251,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 218\n",
"# extra code this cell generates Figure 218\n",
"\n",
"ages = np.linspace(housing[\"housing_median_age\"].min(),\n",
" housing[\"housing_median_age\"].max(),\n",
@@ -1488,7 +1488,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 219\n",
"# extra code this cell generates Figure 219\n",
"\n",
"housing_renamed = housing.rename(columns={\n",
" \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
@@ -1638,7 +1638,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_housing_num_prepared.head(2) # not in the book"
"df_housing_num_prepared.head(2) # extra code"
]
},
{
@@ -1737,7 +1737,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code shows that we can get a DataFrame out if we want\n",
"# extra code shows that we can get a DataFrame out if we want\n",
"housing_prepared_fr = pd.DataFrame(\n",
" housing_prepared,\n",
" columns=preprocessing.get_feature_names_out(),\n",
@@ -1866,7 +1866,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the error ratios discussed in the book\n",
"# extra code computes the error ratios discussed in the book\n",
"error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n",
"print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))"
]
@@ -1942,7 +1942,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the error stats for the linear model\n",
"# extra code computes the error stats for the linear model\n",
"lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n",
" scoring=\"neg_root_mean_squared_error\", cv=10)\n",
"pd.Series(lin_rmses).describe()"
@@ -2062,7 +2062,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code shows part of the output of get_params().keys()\n",
"# extra code shows part of the output of get_params().keys()\n",
"print(str(full_pipeline.get_params().keys())[:1000] + \"...\")"
]
},
@@ -2107,7 +2107,7 @@
"cv_res = pd.DataFrame(grid_search.cv_results_)\n",
"cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
"\n",
"# not in the book these few lines of code just make the DataFrame look nicer\n",
"# extra code these few lines of code just make the DataFrame look nicer\n",
"cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
" \"param_random_forest__max_features\", \"split0_test_score\",\n",
" \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
@@ -2174,7 +2174,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code displays the random search results\n",
"# extra code displays the random search results\n",
"cv_res = pd.DataFrame(rnd_search.cv_results_)\n",
"cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
"cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
@@ -2213,7 +2213,7 @@
},
"outputs": [],
"source": [
"# not in the book plots a few distributions you can use in randomized search\n",
"# extra code plots a few distributions you can use in randomized search\n",
"\n",
"from scipy.stats import randint, uniform, geom, expon\n",
"\n",
@@ -2275,7 +2275,7 @@
},
"outputs": [],
"source": [
"# not in the book shows the difference between expon and reciprocal\n",
"# extra code shows the difference between expon and reciprocal\n",
"\n",
"from scipy.stats import reciprocal\n",
"\n",
@@ -2410,7 +2410,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows how to compute a confidence interval for the RMSE\n",
"# extra code shows how to compute a confidence interval for the RMSE\n",
"m = len(squared_errors)\n",
"mean = squared_errors.mean()\n",
"tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
@@ -2431,7 +2431,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book computes a confidence interval again using z-score\n",
"# extra code computes a confidence interval again using z-score\n",
"zscore = stats.norm.ppf((1 + confidence) / 2)\n",
"zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
"np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
@@ -2477,7 +2477,7 @@
"source": [
"import joblib\n",
"\n",
"# not in the book excluded for conciseness\n",
"# extra code excluded for conciseness\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",