Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

This commit is contained in:
Aurélien Geron
2018-12-21 10:18:31 +08:00
parent dc16446c5f
commit b54ee1b608
8 changed files with 694 additions and 586 deletions

View File

@@ -115,6 +115,13 @@
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebook keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. In your own code, you can simply rely on the latest default values instead."
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -126,9 +133,9 @@
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"\n",
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(random_state=42)\n",
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
@@ -164,9 +171,9 @@
"metadata": {},
"outputs": [],
"source": [
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(probability=True, random_state=42)\n",
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", probability=True, random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
@@ -420,8 +427,13 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_mldata\n",
"mnist = fetch_mldata('MNIST original')"
"try:\n",
" from sklearn.datasets import fetch_openml\n",
" mnist = fetch_openml('mnist_784', version=1)\n",
" mnist.target = mnist.target.astype(np.int64)\n",
"except ImportError:\n",
" from sklearn.datasets import fetch_mldata\n",
" mnist = fetch_mldata('MNIST original')"
]
},
{
@@ -430,7 +442,7 @@
"metadata": {},
"outputs": [],
"source": [
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])"
]
},
@@ -505,7 +517,7 @@
" sample_weights = np.ones(m)\n",
" plt.subplot(subplot)\n",
" for i in range(5):\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, random_state=42)\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"auto\", random_state=42)\n",
" svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n",
" y_pred = svm_clf.predict(X_train)\n",
" sample_weights[y_pred != y_train] *= (1 + learning_rate)\n",
@@ -911,36 +923,25 @@
"Exercise: _Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing)._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The MNIST dataset was loaded earlier."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_mldata"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"mnist = fetch_mldata('MNIST original')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -959,7 +960,7 @@
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -970,19 +971,19 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"random_forest_clf = RandomForestClassifier(random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(random_state=42)\n",
"random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = LinearSVC(random_state=42)\n",
"mlp_clf = MLPClassifier(random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@@ -994,7 +995,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -1017,7 +1018,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@@ -1026,7 +1027,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@@ -1040,7 +1041,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@@ -1049,7 +1050,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@@ -1058,7 +1059,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@@ -1067,7 +1068,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@@ -1083,7 +1084,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@@ -1099,16 +1100,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"voting_clf.estimators"
]
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
@@ -1124,7 +1116,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@@ -1140,7 +1132,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -1156,7 +1148,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -1167,12 +1159,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Much better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:"
"A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:"
]
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@@ -1181,7 +1173,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -1204,7 +1196,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@@ -1213,7 +1205,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@@ -1224,7 +1216,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The voting classifier reduced the error rate from about 4.9% for our best model (the `MLPClassifier`) to just 3.5%. That's about 28% less errors, not bad!"
"The voting classifier reduced the error rate from about 4.0% for our best model (the `MLPClassifier`) to just 3.1%. That's about 22.5% fewer errors, not bad!"
]
},
{
@@ -1243,7 +1235,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@@ -1255,7 +1247,7 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@@ -1264,7 +1256,7 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@@ -1274,7 +1266,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@@ -1297,7 +1289,7 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
@@ -1309,7 +1301,7 @@
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@@ -1318,7 +1310,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
@@ -1327,7 +1319,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@@ -1338,15 +1330,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, but it still beats all the individual classifiers."
"This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, but it's just as good as the best individual classifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -1365,7 +1350,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.6"
},
"nav_menu": {
"height": "252px",