Update notebooks 1 to 8 to latest library versions (in particular Scikit-Learn 0.20)

This commit is contained in:
Aurélien Geron
2018-12-21 10:18:31 +08:00
parent dc16446c5f
commit b54ee1b608
8 changed files with 694 additions and 586 deletions

View File

@@ -115,6 +115,13 @@
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebook keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. In your own code, you can simply rely on the latest default values instead."
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -126,9 +133,9 @@
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"\n",
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(random_state=42)\n",
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
@@ -164,9 +171,9 @@
"metadata": {},
"outputs": [],
"source": [
"log_clf = LogisticRegression(random_state=42)\n",
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"svm_clf = SVC(probability=True, random_state=42)\n",
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", probability=True, random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
@@ -420,8 +427,13 @@
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_mldata\n",
"mnist = fetch_mldata('MNIST original')"
"try:\n",
" from sklearn.datasets import fetch_openml\n",
" mnist = fetch_openml('mnist_784', version=1)\n",
" mnist.target = mnist.target.astype(np.int64)\n",
"except ImportError:\n",
" from sklearn.datasets import fetch_mldata\n",
" mnist = fetch_mldata('MNIST original')"
]
},
{
@@ -430,7 +442,7 @@
"metadata": {},
"outputs": [],
"source": [
"rnd_clf = RandomForestClassifier(random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])"
]
},
@@ -505,7 +517,7 @@
" sample_weights = np.ones(m)\n",
" plt.subplot(subplot)\n",
" for i in range(5):\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, random_state=42)\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"auto\", random_state=42)\n",
" svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n",
" y_pred = svm_clf.predict(X_train)\n",
" sample_weights[y_pred != y_train] *= (1 + learning_rate)\n",
@@ -911,36 +923,25 @@
"Exercise: _Load the MNIST data and split it into a training set, a validation set, and a test set (e.g., use 50,000 instances for training, 10,000 for validation, and 10,000 for testing)._"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The MNIST dataset was loaded earlier."
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import fetch_mldata"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"mnist = fetch_mldata('MNIST original')"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -959,7 +960,7 @@
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -970,19 +971,19 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"random_forest_clf = RandomForestClassifier(random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(random_state=42)\n",
"random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = LinearSVC(random_state=42)\n",
"mlp_clf = MLPClassifier(random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
@@ -994,7 +995,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -1017,7 +1018,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@@ -1026,7 +1027,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@@ -1040,7 +1041,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@@ -1049,7 +1050,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@@ -1058,7 +1059,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@@ -1067,7 +1068,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@@ -1083,7 +1084,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@@ -1099,16 +1100,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"voting_clf.estimators"
]
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
@@ -1124,7 +1116,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@@ -1140,7 +1132,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -1156,7 +1148,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -1167,12 +1159,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Much better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:"
"A bit better! The SVM was hurting performance. Now let's try using a soft voting classifier. We do not actually need to retrain the classifier, we can just set `voting` to `\"soft\"`:"
]
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@@ -1181,7 +1173,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -1204,7 +1196,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@@ -1213,7 +1205,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@@ -1224,7 +1216,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"The voting classifier reduced the error rate from about 4.9% for our best model (the `MLPClassifier`) to just 3.5%. That's about 28% less errors, not bad!"
"The voting classifier reduced the error rate from about 4.0% for our best model (the `MLPClassifier`) to just 3.1%. That's about 22.5% fewer errors, not bad!"
]
},
{
@@ -1243,7 +1235,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@@ -1255,7 +1247,7 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@@ -1264,7 +1256,7 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@@ -1274,7 +1266,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@@ -1297,7 +1289,7 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
@@ -1309,7 +1301,7 @@
},
{
"cell_type": "code",
"execution_count": 84,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@@ -1318,7 +1310,7 @@
},
{
"cell_type": "code",
"execution_count": 85,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
@@ -1327,7 +1319,7 @@
},
{
"cell_type": "code",
"execution_count": 86,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@@ -1338,15 +1330,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, but it still beats all the individual classifiers."
"This stacking ensemble does not perform as well as the soft voting classifier we trained earlier, but it's just as good as the best individual classifier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@@ -1365,7 +1350,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.2"
"version": "3.6.6"
},
"nav_menu": {
"height": "252px",