From 55adea1ff431d7616ca9a13511249d7c757d34a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Aur=C3=A9lien=20Geron?= <ageron@users.noreply.github.com>
Date: Tue, 8 May 2018 19:41:47 +0200
Subject: [PATCH] Add code to compute a confidence interval

---
 02_end_to_end_machine_learning_project.ipynb | 124 ++++++++++++++-----
 1 file changed, 96 insertions(+), 28 deletions(-)

diff --git a/02_end_to_end_machine_learning_project.ipynb b/02_end_to_end_machine_learning_project.ipynb
index b82b519..e3fd493 100644
--- a/02_end_to_end_machine_learning_project.ipynb
+++ b/02_end_to_end_machine_learning_project.ipynb
@@ -1437,6 +1437,74 @@
     "final_rmse"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can compute a 95% confidence interval for the test RMSE:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from scipy import stats"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "confidence = 0.95\n",
+    "squared_errors = (final_predictions - y_test) ** 2\n",
+    "mean = squared_errors.mean()\n",
+    "m = len(squared_errors)\n",
+    "\n",
+    "np.sqrt(stats.t.interval(confidence, m - 1,\n",
+    "                         loc=np.mean(squared_errors),\n",
+    "                         scale=stats.sem(squared_errors)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We could compute the interval manually like this:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tscore = stats.t.ppf((1 + confidence) / 2, df=m - 1)\n",
+    "tmargin = tscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
+    "np.sqrt(mean - tmargin), np.sqrt(mean + tmargin)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Alternatively, we could use z-scores rather than t-scores:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "zscore = stats.norm.ppf((1 + confidence) / 2)\n",
+    "zmargin = zscore * squared_errors.std(ddof=1) / np.sqrt(m)\n",
+    "np.sqrt(mean - zmargin), np.sqrt(mean + zmargin)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -1453,7 +1521,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 103,
+   "execution_count": 107,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1475,7 +1543,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 104,
+   "execution_count": 108,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1484,7 +1552,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 105,
+   "execution_count": 109,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1503,7 +1571,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 106,
+   "execution_count": 110,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1541,7 +1609,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 107,
+   "execution_count": 111,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1567,7 +1635,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 108,
+   "execution_count": 112,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1585,7 +1653,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 109,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1615,14 +1683,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 110,
+   "execution_count": 114,
    "metadata": {},
    "outputs": [],
    "source": [
     "from sklearn.model_selection import RandomizedSearchCV\n",
     "from scipy.stats import expon, reciprocal\n",
     "\n",
-    "# see https://docs.scipy.org/doc/scipy-0.19.0/reference/stats.html\n",
+    "# see https://docs.scipy.org/doc/scipy/reference/stats.html\n",
     "# for `expon()` and `reciprocal()` documentation and more probability distribution functions.\n",
     "\n",
     "# Note: gamma is ignored when kernel is \"linear\"\n",
@@ -1648,7 +1716,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 111,
+   "execution_count": 115,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1666,7 +1734,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 112,
+   "execution_count": 116,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1689,7 +1757,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 113,
+   "execution_count": 117,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1714,7 +1782,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 114,
+   "execution_count": 118,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1753,7 +1821,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 115,
+   "execution_count": 119,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1789,7 +1857,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 116,
+   "execution_count": 120,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1805,7 +1873,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 117,
+   "execution_count": 121,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1815,7 +1883,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 118,
+   "execution_count": 122,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1831,7 +1899,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 119,
+   "execution_count": 123,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1847,7 +1915,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 120,
+   "execution_count": 124,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1859,7 +1927,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 121,
+   "execution_count": 125,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1875,7 +1943,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 122,
+   "execution_count": 126,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1891,7 +1959,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 123,
+   "execution_count": 127,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1921,7 +1989,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 128,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1934,7 +2002,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 125,
+   "execution_count": 129,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1950,7 +2018,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 126,
+   "execution_count": 130,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1984,7 +2052,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 127,
+   "execution_count": 131,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2000,7 +2068,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 128,
+   "execution_count": 132,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2038,7 +2106,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.5"
+   "version": "3.5.2"
   },
   "nav_menu": {
    "height": "279px",