diff --git a/08_dimensionality_reduction.ipynb b/08_dimensionality_reduction.ipynb index c42e357..da3ebc1 100644 --- a/08_dimensionality_reduction.ipynb +++ b/08_dimensionality_reduction.ipynb @@ -55,11 +55,10 @@ "\n", "# Common imports\n", "import numpy as np\n", - "import numpy.random as rnd\n", "import os\n", "\n", "# to make this notebook's output stable across runs\n", - "rnd.seed(42)\n", + "np.random.seed(42)\n", "\n", "# To plot pretty figures\n", "%matplotlib inline\n", @@ -102,16 +101,16 @@ }, "outputs": [], "source": [ - "rnd.seed(4)\n", + "np.random.seed(4)\n", "m = 60\n", "w1, w2 = 0.1, 0.3\n", "noise = 0.1\n", "\n", - "angles = rnd.rand(m) * 3 * np.pi / 2 - 0.5\n", + "angles = np.random.rand(m) * 3 * np.pi / 2 - 0.5\n", "X = np.empty((m, 3))\n", - "X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * rnd.randn(m) / 2\n", - "X[:, 1] = np.sin(angles) * 0.7 + noise * rnd.randn(m) / 2\n", - "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * rnd.randn(m)" + "X[:, 0] = np.cos(angles) + np.sin(angles)/2 + noise * np.random.randn(m) / 2\n", + "X[:, 1] = np.sin(angles) * 0.7 + noise * np.random.randn(m) / 2\n", + "X[:, 2] = X[:, 0] * w1 + X[:, 1] * w2 + noise * np.random.randn(m)" ] }, { @@ -144,7 +143,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -185,7 +186,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -194,14 +197,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## PCA using Scikit-Learn" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "With Scikit-Learn, PCA is really trivial. It even takes care of mean centering for you:" ] @@ -226,7 +235,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -237,7 +248,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -259,7 +272,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -291,7 +306,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Of course, there was some loss of information during the projection step, so the recovered 3D points are not exactly equal to the original 3D points:" ] @@ -300,7 +318,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -309,7 +329,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We can compute the reconstruction error:" ] @@ -318,7 +341,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -327,7 +352,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The inverse transform in the SVD approach looks like this:" ] @@ -347,7 +375,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The reconstructions from both methods are not identical because Scikit-Learn's `PCA` class automatically takes care of reversing the mean centering, but if we subtract the mean, we get the same reconstruction:" ] @@ -367,7 +398,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The `PCA` object gives access to the principal components that it computed:" ] @@ -387,7 +421,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Compare to the first two principal components computed using the SVD method:" ] @@ -407,14 +444,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice how the axes are flipped." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now let's look at the explained variance ratio:" ] @@ -429,19 +472,25 @@ }, "outputs": [], "source": [ - "print(pca.explained_variance_ratio_)" + "pca.explained_variance_ratio_" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The first dimension explains 84.2% of the variance, while the second explains 14.6%." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "By projecting down to 2D, we lost about 1.1% of the variance:" ] @@ -461,7 +510,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Here is how to compute the explained variance ratio using the SVD approach (recall that `s` is the diagonal of the matrix `S`):" ] @@ -470,7 +522,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -479,7 +533,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next, let's generate some nice figures! :)" ] @@ -822,8 +879,8 @@ "stretch = 5\n", "m = 200\n", "\n", - "rnd.seed(3)\n", - "X = rnd.randn(m, 2) / 10\n", + "np.random.seed(3)\n", + "X = np.random.randn(m, 2) / 10\n", "X = X.dot(np.array([[stretch, 0],[0, 1]])) # stretch\n", "X = X.dot([[np.cos(angle), np.sin(angle)], [-np.sin(angle), np.cos(angle)]]) # rotate\n", "\n", @@ -941,7 +998,9 @@ "cell_type": "code", "execution_count": 34, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -966,7 +1025,9 @@ "cell_type": "code", "execution_count": 36, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1052,7 +1113,9 @@ "cell_type": "code", "execution_count": 41, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1061,7 +1124,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Incremental PCA" ] @@ -1122,7 +1188,9 @@ "cell_type": "code", "execution_count": 45, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1131,7 +1199,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's compare the results of transforming MNIST using regular PCA and incremental PCA. First, the means are equal: " ] @@ -1151,7 +1222,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "But the results are not exactly identical. Incremental PCA gives a very good approximate solution, but it's not perfect:" ] @@ -1171,14 +1245,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Using `memmap()`" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's create the `memmap()` structure and copy the MNIST data into it. This would typically be done by a first program:" ] @@ -1202,7 +1282,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now deleting the `memmap()` object will trigger its Python finalizer, which ensures that the data is saved to disk." ] @@ -1211,7 +1294,9 @@ "cell_type": "code", "execution_count": 49, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1220,7 +1305,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Next, another program would load the data and use it for training:" ] @@ -1258,14 +1346,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Time complexity" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's time regular PCA against Incremental PCA and Randomized PCA, for various number of principal components:" ] @@ -1297,7 +1391,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now let's compare PCA and Randomized PCA for datasets of different sizes (number of instances):" ] @@ -1316,7 +1413,7 @@ "times_pca = []\n", "sizes = [1000, 10000, 20000, 30000, 40000, 50000, 70000, 100000, 200000, 500000]\n", "for n_samples in sizes:\n", - " X = rnd.randn(n_samples, 5)\n", + " X = np.random.randn(n_samples, 5)\n", " pca = PCA(n_components = 2, svd_solver=\"randomized\", random_state=42)\n", " t1 = time.time()\n", " pca.fit(X)\n", @@ -1338,7 +1435,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "And now let's compare their performance on datasets of 2,000 instances with various numbers of features:" ] @@ -1358,7 +1458,7 @@ "times_pca = []\n", "sizes = [1000, 2000, 3000, 4000, 5000, 6000]\n", "for n_features in sizes:\n", - " X = rnd.randn(2000, n_features)\n", + " X = np.random.randn(2000, n_features)\n", " pca = PCA(n_components = 2, random_state=42, svd_solver=\"randomized\")\n", " t1 = time.time()\n", " pca.fit(X)\n", @@ -1392,7 +1492,9 @@ "cell_type": "code", "execution_count": 55, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1403,7 +1505,9 @@ "cell_type": "code", "execution_count": 56, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1531,7 +1635,9 @@ "cell_type": "code", "execution_count": 61, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1583,7 +1689,9 @@ "cell_type": "code", "execution_count": 64, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1610,7 +1718,9 @@ "cell_type": "code", "execution_count": 66, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -1679,7 +1789,7 @@ "source": [ "from sklearn.manifold import TSNE\n", "\n", - "tsne = TSNE(n_components=2)\n", + "tsne = TSNE(n_components=2, random_state=42)\n", "X_reduced_tsne = tsne.fit_transform(X)" ] },