Sync notebook with book's code examples, and better identify extra code

Aurélien Geron
2022-02-19 18:17:36 +13:00
parent 1c2421fc88
commit b63019fd28
9 changed files with 318 additions and 301 deletions

@@ -177,7 +177,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book code to save the figures as high-res PNGs for the book\n",
"# extra code code to save the figures as high-res PNGs for the book\n",
"\n",
"IMAGES_PATH = Path() / \"images\" / \"end_to_end_project\"\n",
"IMAGES_PATH.mkdir(parents=True, exist_ok=True)\n",
@@ -197,7 +197,7 @@
"source": [
"import matplotlib.pyplot as plt\n",
"\n",
"# not in the book the next 5 lines define the default font sizes\n",
"# extra code the next 5 lines define the default font sizes\n",
"plt.rc('font', size=14)\n",
"plt.rc('axes', labelsize=14, titlesize=14)\n",
"plt.rc('legend', fontsize=14)\n",
@@ -205,7 +205,7 @@
"plt.rc('ytick', labelsize=10)\n",
"\n",
"housing.hist(bins=50, figsize=(12, 8))\n",
"save_fig(\"attribute_histogram_plots\") # not in the book\n",
"save_fig(\"attribute_histogram_plots\") # extra code\n",
"plt.show()"
]
},
@@ -351,7 +351,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows how to compute the 10.7% proba of getting a bad sample\n",
"# extra code shows how to compute the 10.7% proba of getting a bad sample\n",
"\n",
"from scipy.stats import binom\n",
"\n",
@@ -375,7 +375,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows another way to estimate the probability of bad sample\n",
"# extra code shows another way to estimate the probability of bad sample\n",
"\n",
"np.random.seed(42)\n",
"\n",
@@ -403,7 +403,7 @@
"housing[\"income_cat\"].value_counts().sort_index().plot.bar(rot=0, grid=True)\n",
"plt.xlabel(\"Income category\")\n",
"plt.ylabel(\"Number of districts\")\n",
"save_fig(\"housing_income_cat_bar_plot\") # not in the book\n",
"save_fig(\"housing_income_cat_bar_plot\") # extra code\n",
"plt.show()"
]
},
@@ -464,7 +464,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the data for Figure 210\n",
"# extra code computes the data for Figure 210\n",
"\n",
"def income_cat_proportions(data):\n",
" return data[\"income_cat\"].value_counts() / len(data)\n",
@@ -524,7 +524,7 @@
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True)\n",
"save_fig(\"bad_visualization_plot\") # not in the book\n",
"save_fig(\"bad_visualization_plot\") # extra code\n",
"plt.show()"
]
},
@@ -535,7 +535,7 @@
"outputs": [],
"source": [
"housing.plot(kind=\"scatter\", x=\"longitude\", y=\"latitude\", grid=True, alpha=0.2)\n",
"save_fig(\"better_visualization_plot\") # not in the book\n",
"save_fig(\"better_visualization_plot\") # extra code\n",
"plt.show()"
]
},
@@ -549,7 +549,7 @@
" s=housing[\"population\"] / 100, label=\"population\",\n",
" c=\"median_house_value\", cmap=\"jet\", colorbar=True,\n",
" legend=True, sharex=False, figsize=(10, 7))\n",
"save_fig(\"housing_prices_scatterplot\") # not in the book\n",
"save_fig(\"housing_prices_scatterplot\") # extra code\n",
"plt.show()"
]
},
@@ -573,7 +573,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates the first figure in the chapter\n",
"# extra code this cell generates the first figure in the chapter\n",
"\n",
"# Download the California image\n",
"filename = \"california.png\"\n",
@@ -638,7 +638,7 @@
"attributes = [\"median_house_value\", \"median_income\", \"total_rooms\",\n",
" \"housing_median_age\"]\n",
"scatter_matrix(housing[attributes], figsize=(12, 8))\n",
"save_fig(\"scatter_matrix_plot\") # not in the book\n",
"save_fig(\"scatter_matrix_plot\") # extra code\n",
"plt.show()"
]
},
@@ -650,7 +650,7 @@
"source": [
"housing.plot(kind=\"scatter\", x=\"median_income\", y=\"median_house_value\",\n",
" alpha=0.1, grid=True)\n",
"save_fig(\"income_vs_house_value_scatterplot\") # not in the book\n",
"save_fig(\"income_vs_house_value_scatterplot\") # extra code\n",
"plt.show()"
]
},
@@ -1195,7 +1195,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 217\n",
"# extra code this cell generates Figure 217\n",
"fig, axs = plt.subplots(1, 2, figsize=(8, 3), sharey=True)\n",
"housing[\"population\"].hist(ax=axs[0], bins=50)\n",
"housing[\"population\"].apply(np.log).hist(ax=axs[1], bins=50)\n",
@@ -1219,7 +1219,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code just shows that we get a uniform distribution\n",
"# extra code just shows that we get a uniform distribution\n",
"percentiles = [np.percentile(housing[\"median_income\"], p)\n",
" for p in range(1, 100)]\n",
"flattened_median_income = pd.cut(housing[\"median_income\"],\n",
@@ -1251,7 +1251,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 218\n",
"# extra code this cell generates Figure 218\n",
"\n",
"ages = np.linspace(housing[\"housing_median_age\"].min(),\n",
" housing[\"housing_median_age\"].max(),\n",
@@ -1488,7 +1488,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code generates Figure 219\n",
"# extra code this cell generates Figure 219\n",
"\n",
"housing_renamed = housing.rename(columns={\n",
" \"latitude\": \"Latitude\", \"longitude\": \"Longitude\",\n",
@@ -1638,7 +1638,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_housing_num_prepared.head(2) # not in the book"
"df_housing_num_prepared.head(2) # extra code"
]
},
{
@@ -1737,7 +1737,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code shows that we can get a DataFrame out if we want\n",
"# extra code shows that we can get a DataFrame out if we want\n",
"housing_prepared_fr = pd.DataFrame(\n",
" housing_prepared,\n",
" columns=preprocessing.get_feature_names_out(),\n",
@@ -1866,7 +1866,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the error ratios discussed in the book\n",
"# extra code computes the error ratios discussed in the book\n",
"error_ratios = housing_predictions[:5].round(-2) / housing_labels.iloc[:5].values - 1\n",
"print(\", \".join([f\"{100 * ratio:.1f}%\" for ratio in error_ratios]))"
]
@@ -1942,7 +1942,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code computes the error stats for the linear model\n",
"# extra code computes the error stats for the linear model\n",
"lin_rmses = -cross_val_score(lin_reg, housing, housing_labels,\n",
" scoring=\"neg_root_mean_squared_error\", cv=10)\n",
"pd.Series(lin_rmses).describe()"
@@ -2062,7 +2062,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code shows part of the output of get_params().keys()\n",
"# extra code shows part of the output of get_params().keys()\n",
"print(str(full_pipeline.get_params().keys())[:1000] + \"...\")"
]
},
@@ -2107,7 +2107,7 @@
"cv_res = pd.DataFrame(grid_search.cv_results_)\n",
"cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
"\n",
"# not in the book these few lines of code just make the DataFrame look nicer\n",
"# extra code these few lines of code just make the DataFrame look nicer\n",
"cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
" \"param_random_forest__max_features\", \"split0_test_score\",\n",
" \"split1_test_score\", \"split2_test_score\", \"mean_test_score\"]]\n",
@@ -2174,7 +2174,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book this code displays the random search results\n",
"# extra code displays the random search results\n",
"cv_res = pd.DataFrame(rnd_search.cv_results_)\n",
"cv_res.sort_values(by=\"mean_test_score\", ascending=False, inplace=True)\n",
"cv_res = cv_res[[\"param_preprocessing__geo__n_clusters\",\n",
@@ -2213,7 +2213,7 @@
},
"outputs": [],
"source": [
"# not in the book plots a few distributions you can use in randomized search\n",
"# extra code plots a few distributions you can use in randomized search\n",
"\n",
"from scipy.stats import randint, uniform, geom, expon\n",
"\n",
@@ -2275,7 +2275,7 @@
},
"outputs": [],
"source": [
"# not in the book shows the difference between expon and reciprocal\n",
"# extra code shows the difference between expon and reciprocal\n",
"\n",
"from scipy.stats import reciprocal\n",
"\n",
@@ -2410,7 +2410,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book shows how to compute a confidence interval for the RMSE\n",
"# extra code shows how to compute a confidence interval for the RMSE\n",
"m = len(squared_errors)\n",
"mean = squared_errors.mean()\n",
"tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
@@ -2431,7 +2431,7 @@
"metadata": {},
"outputs": [],
"source": [
"# not in the book computes a confidence interval again using z-score\n",
"# extra code computes a confidence interval again using z-score\n",
"zscore = stats.norm.ppf((1 + confidence) / 2)\n",
"zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
"np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
@@ -2477,7 +2477,7 @@
"source": [
"import joblib\n",
"\n",
"# not in the book excluded for conciseness\n",
"# extra code excluded for conciseness\n",
"from sklearn.cluster import KMeans\n",
"from sklearn.base import BaseEstimator, TransformerMixin\n",
"from sklearn.metrics.pairwise import rbf_kernel\n",