Update all notebooks assuming we are all in the future now: sklearn 0.20+, python 3.5+, TF 2.0 preview

This commit is contained in:
Aurélien Geron
2019-01-18 23:08:37 +08:00
parent 10c432a997
commit 6b8dff91d0
12 changed files with 1186 additions and 2625 deletions

View File

@@ -71,7 +71,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Voting classifiers"
"This notebook assumes you have installed Scikit-Learn ≥0.20."
]
},
{
@@ -79,6 +79,23 @@
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"assert sklearn.__version__ >= \"0.20\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Voting classifiers"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"heads_proba = 0.51\n",
"coin_tosses = (np.random.rand(10000, 10) < heads_proba).astype(np.int32)\n",
@@ -87,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -105,7 +122,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -120,27 +137,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Warning**: In Scikit-Learn 0.20, some hyperparameters (`solver`, `n_estimators`, `gamma`, etc.) start issuing warnings about the fact that their default value will change in Scikit-Learn 0.22. To avoid these warnings and ensure that this notebooks keeps producing the same outputs as in the book, I set the hyperparameters to their old default value. In your own code, you can simply rely on the latest default values instead."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"\n",
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
" voting='hard')"
"**Note**: to be future-proof, we set `solver=\"lbfgs\"`, `n_estimators=100`, and `gamma=\"scale\"` since these will be the default values in upcoming Scikit-Learn versions."
]
},
{
@@ -149,7 +146,18 @@
"metadata": {},
"outputs": [],
"source": [
"voting_clf.fit(X_train, y_train)"
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.ensemble import VotingClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.svm import SVC\n",
"\n",
"log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"svm_clf = SVC(gamma=\"scale\", random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
" voting='hard')"
]
},
{
@@ -157,6 +165,15 @@
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"voting_clf.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n",
"\n",
@@ -166,15 +183,22 @@
" print(clf.__class__.__name__, accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Soft voting:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"log_clf = LogisticRegression(solver=\"liblinear\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"svm_clf = SVC(gamma=\"auto\", probability=True, random_state=42)\n",
"log_clf = LogisticRegression(solver=\"lbfgs\", random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"svm_clf = SVC(gamma=\"scale\", probability=True, random_state=42)\n",
"\n",
"voting_clf = VotingClassifier(\n",
" estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],\n",
@@ -184,7 +208,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@@ -205,7 +229,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@@ -214,14 +238,14 @@
"\n",
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
" max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)\n",
" max_samples=100, bootstrap=True, random_state=42)\n",
"bag_clf.fit(X_train, y_train)\n",
"y_pred = bag_clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
@@ -231,7 +255,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -243,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@@ -269,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@@ -293,18 +317,18 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(splitter=\"random\", max_leaf_nodes=16, random_state=42),\n",
" n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)"
" n_estimators=500, max_samples=1.0, bootstrap=True, random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -314,39 +338,25 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=42)\n",
"rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)\n",
"rnd_clf.fit(X_train, y_train)\n",
"\n",
"y_pred_rf = rnd_clf.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"np.sum(y_pred == y_pred_rf) / len(y_pred) # almost identical predictions"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.datasets import load_iris\n",
"iris = load_iris()\n",
"rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42)\n",
"rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n",
"for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
" print(name, score)"
"np.sum(y_pred == y_pred_rf) / len(y_pred) # almost identical predictions"
]
},
{
@@ -355,7 +365,12 @@
"metadata": {},
"outputs": [],
"source": [
"rnd_clf.feature_importances_"
"from sklearn.datasets import load_iris\n",
"iris = load_iris()\n",
"rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)\n",
"rnd_clf.fit(iris[\"data\"], iris[\"target\"])\n",
"for name, score in zip(iris[\"feature_names\"], rnd_clf.feature_importances_):\n",
" print(name, score)"
]
},
{
@@ -363,6 +378,15 @@
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"rnd_clf.feature_importances_"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"plt.figure(figsize=(6, 4))\n",
"\n",
@@ -384,20 +408,20 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"bag_clf = BaggingClassifier(\n",
" DecisionTreeClassifier(random_state=42), n_estimators=500,\n",
" bootstrap=True, n_jobs=-1, oob_score=True, random_state=40)\n",
" bootstrap=True, oob_score=True, random_state=40)\n",
"bag_clf.fit(X_train, y_train)\n",
"bag_clf.oob_score_"
]
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
@@ -406,7 +430,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
@@ -422,29 +446,16 @@
"## Feature importance"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" from sklearn.datasets import fetch_openml\n",
" mnist = fetch_openml('mnist_784', version=1)\n",
" mnist.target = mnist.target.astype(np.int64)\n",
"except ImportError:\n",
" from sklearn.datasets import fetch_mldata\n",
" mnist = fetch_mldata('MNIST original')"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])"
"from sklearn.datasets import fetch_openml\n",
"\n",
"mnist = fetch_openml('mnist_784', version=1)\n",
"mnist.target = mnist.target.astype(np.uint8)"
]
},
{
@@ -452,6 +463,16 @@
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"rnd_clf.fit(mnist[\"data\"], mnist[\"target\"])"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def plot_digit(data):\n",
" image = data.reshape(28, 28)\n",
@@ -462,7 +483,7 @@
},
{
"cell_type": "code",
"execution_count": 28,
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
@@ -484,7 +505,7 @@
},
{
"cell_type": "code",
"execution_count": 29,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
@@ -498,7 +519,7 @@
},
{
"cell_type": "code",
"execution_count": 30,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
@@ -507,7 +528,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 32,
"metadata": {},
"outputs": [],
"source": [
@@ -518,7 +539,7 @@
" sample_weights = np.ones(m)\n",
" plt.subplot(subplot)\n",
" for i in range(5):\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"auto\", random_state=42)\n",
" svm_clf = SVC(kernel=\"rbf\", C=0.05, gamma=\"scale\", random_state=42)\n",
" svm_clf.fit(X_train, y_train, sample_weight=sample_weights)\n",
" y_pred = svm_clf.predict(X_train)\n",
" sample_weights[y_pred != y_train] *= (1 + learning_rate)\n",
@@ -537,7 +558,7 @@
},
{
"cell_type": "code",
"execution_count": 32,
"execution_count": 33,
"metadata": {},
"outputs": [],
"source": [
@@ -553,7 +574,7 @@
},
{
"cell_type": "code",
"execution_count": 33,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
@@ -564,7 +585,7 @@
},
{
"cell_type": "code",
"execution_count": 34,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
@@ -576,7 +597,7 @@
},
{
"cell_type": "code",
"execution_count": 35,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
@@ -587,7 +608,7 @@
},
{
"cell_type": "code",
"execution_count": 36,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
@@ -598,7 +619,7 @@
},
{
"cell_type": "code",
"execution_count": 37,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
@@ -607,7 +628,7 @@
},
{
"cell_type": "code",
"execution_count": 38,
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
@@ -616,7 +637,7 @@
},
{
"cell_type": "code",
"execution_count": 39,
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@@ -625,7 +646,7 @@
},
{
"cell_type": "code",
"execution_count": 40,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
@@ -674,7 +695,7 @@
},
{
"cell_type": "code",
"execution_count": 41,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
@@ -686,7 +707,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
@@ -696,7 +717,7 @@
},
{
"cell_type": "code",
"execution_count": 43,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
@@ -723,7 +744,7 @@
},
{
"cell_type": "code",
"execution_count": 44,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
@@ -746,7 +767,7 @@
},
{
"cell_type": "code",
"execution_count": 45,
"execution_count": 46,
"metadata": {},
"outputs": [],
"source": [
@@ -755,7 +776,7 @@
},
{
"cell_type": "code",
"execution_count": 46,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
@@ -781,7 +802,7 @@
},
{
"cell_type": "code",
"execution_count": 47,
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
@@ -805,7 +826,7 @@
},
{
"cell_type": "code",
"execution_count": 48,
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
@@ -814,7 +835,7 @@
},
{
"cell_type": "code",
"execution_count": 49,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
@@ -830,7 +851,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
@@ -843,7 +864,7 @@
},
{
"cell_type": "code",
"execution_count": 51,
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
@@ -851,22 +872,8 @@
" xgb_reg = xgboost.XGBRegressor(random_state=42)\n",
" xgb_reg.fit(X_train, y_train)\n",
" y_pred = xgb_reg.predict(X_val)\n",
" val_error = mean_squared_error(y_val, y_pred)\n",
" print(\"Validation MSE:\", val_error)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"if xgboost is not None: # not shown in the book\n",
" xgb_reg.fit(X_train, y_train,\n",
" eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n",
" y_pred = xgb_reg.predict(X_val)\n",
" val_error = mean_squared_error(y_val, y_pred)\n",
" print(\"Validation MSE:\", val_error)"
" val_error = mean_squared_error(y_val, y_pred) # Not shown\n",
" print(\"Validation MSE:\", val_error) # Not shown"
]
},
{
@@ -875,7 +882,12 @@
"metadata": {},
"outputs": [],
"source": [
"%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None"
"if xgboost is not None: # not shown in the book\n",
" xgb_reg.fit(X_train, y_train,\n",
" eval_set=[(X_val, y_val)], early_stopping_rounds=2)\n",
" y_pred = xgb_reg.predict(X_val)\n",
" val_error = mean_squared_error(y_val, y_pred) # Not shown\n",
" print(\"Validation MSE:\", val_error) # Not shown"
]
},
{
@@ -883,6 +895,15 @@
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"%timeit xgboost.XGBRegressor().fit(X_train, y_train) if xgboost is not None else None"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"%timeit GradientBoostingRegressor().fit(X_train, y_train)"
]
@@ -933,7 +954,7 @@
},
{
"cell_type": "code",
"execution_count": 55,
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
@@ -942,7 +963,7 @@
},
{
"cell_type": "code",
"execution_count": 56,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
@@ -961,7 +982,7 @@
},
{
"cell_type": "code",
"execution_count": 57,
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
@@ -972,19 +993,19 @@
},
{
"cell_type": "code",
"execution_count": 58,
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"random_forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)\n",
"random_forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)\n",
"extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)\n",
"svm_clf = LinearSVC(random_state=42)\n",
"mlp_clf = MLPClassifier(random_state=42)"
]
},
{
"cell_type": "code",
"execution_count": 59,
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
@@ -996,7 +1017,7 @@
},
{
"cell_type": "code",
"execution_count": 60,
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
@@ -1019,7 +1040,7 @@
},
{
"cell_type": "code",
"execution_count": 61,
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
@@ -1028,7 +1049,7 @@
},
{
"cell_type": "code",
"execution_count": 62,
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
@@ -1042,7 +1063,7 @@
},
{
"cell_type": "code",
"execution_count": 63,
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@@ -1051,7 +1072,7 @@
},
{
"cell_type": "code",
"execution_count": 64,
"execution_count": 65,
"metadata": {},
"outputs": [],
"source": [
@@ -1060,7 +1081,7 @@
},
{
"cell_type": "code",
"execution_count": 65,
"execution_count": 66,
"metadata": {},
"outputs": [],
"source": [
@@ -1069,7 +1090,7 @@
},
{
"cell_type": "code",
"execution_count": 66,
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
@@ -1085,7 +1106,7 @@
},
{
"cell_type": "code",
"execution_count": 67,
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
@@ -1101,7 +1122,7 @@
},
{
"cell_type": "code",
"execution_count": 68,
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
@@ -1117,7 +1138,7 @@
},
{
"cell_type": "code",
"execution_count": 69,
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
@@ -1133,7 +1154,7 @@
},
{
"cell_type": "code",
"execution_count": 70,
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
@@ -1149,7 +1170,7 @@
},
{
"cell_type": "code",
"execution_count": 71,
"execution_count": 72,
"metadata": {},
"outputs": [],
"source": [
@@ -1165,7 +1186,7 @@
},
{
"cell_type": "code",
"execution_count": 72,
"execution_count": 73,
"metadata": {},
"outputs": [],
"source": [
@@ -1174,7 +1195,7 @@
},
{
"cell_type": "code",
"execution_count": 73,
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@@ -1197,7 +1218,7 @@
},
{
"cell_type": "code",
"execution_count": 74,
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
@@ -1206,7 +1227,7 @@
},
{
"cell_type": "code",
"execution_count": 75,
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@@ -1236,7 +1257,7 @@
},
{
"cell_type": "code",
"execution_count": 76,
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
@@ -1248,7 +1269,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 78,
"metadata": {},
"outputs": [],
"source": [
@@ -1257,7 +1278,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 79,
"metadata": {},
"outputs": [],
"source": [
@@ -1267,7 +1288,7 @@
},
{
"cell_type": "code",
"execution_count": 79,
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
@@ -1290,7 +1311,7 @@
},
{
"cell_type": "code",
"execution_count": 80,
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
@@ -1302,7 +1323,7 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
@@ -1311,7 +1332,7 @@
},
{
"cell_type": "code",
"execution_count": 82,
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
@@ -1320,7 +1341,7 @@
},
{
"cell_type": "code",
"execution_count": 83,
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
@@ -1337,7 +1358,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 - tf2",
"display_name": "Python 3",
"language": "python",
"name": "python3"
},