add model interpretation to toydata nb

franzi
2022-09-08 00:17:44 +02:00
parent 2ba3a610d5
commit be2b83489e
2 changed files with 53 additions and 19 deletions

View File

@@ -25,6 +25,7 @@
 "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
 "from sklearn.linear_model import LogisticRegression\n",
 "from sklearn.model_selection import GridSearchCV, train_test_split\n",
+"from sklearn.inspection import plot_partial_dependence, permutation_importance\n",
 "from sklearn import tree\n",
 "# interactive plotting (parallel coordinate plot)\n",
 "import plotly.express as px\n",
@@ -241,7 +242,7 @@
 "# \"product\" is a categorical variable; for it to be handled correctly,\n",
 "# we have to transform it into a one-hot encoded vector\n",
 "e = OneHotEncoder(sparse=False, categories='auto')\n",
-"ohe = e.fit_transform(df[\"product\"].to_numpy()[:, None])\n",
+"ohe = e.fit_transform(df[[\"product\"]])\n",
 "df = df.join(pd.DataFrame(ohe, columns=[f\"product_{i}\" for i in e.categories_[0]], index=df.index))\n",
 "df.head() # notice the additional columns with zeros and a one"
 ]
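
Note on the change above: passing the one-column DataFrame df[["product"]] keeps the input 2-D (which OneHotEncoder requires) without the manual [:, None] reshape, and it preserves the column name. A minimal standalone sketch of this encoding step (the toy frame is invented for illustration; on scikit-learn >= 1.2 the sparse argument is renamed to sparse_output):

    # sketch: one-hot encoding a single categorical column
    import pandas as pd
    from sklearn.preprocessing import OneHotEncoder

    toy = pd.DataFrame({"product": [1, 5, 17, 5]})
    e = OneHotEncoder(sparse=False, categories="auto")  # sparse_output=False on sklearn >= 1.2
    ohe = e.fit_transform(toy[["product"]])  # 2-D input: a one-column DataFrame
    print(e.categories_[0])  # [ 1  5 17]
    print(ohe)  # one row per sample, one column per category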
@@ -254,9 +255,9 @@
 "source": [
 "# from the dataframe we now extract our features ...\n",
 "feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\"]\n",
-"X = df[feature_cols].to_numpy() # convert df into a numpy array\n",
+"X = df[feature_cols]\n",
 "# ... and the vector with labels\n",
-"y = df[\"faulty\"].to_numpy()\n",
+"y = df[\"faulty\"]\n",
 "# to evaluate our prediction model, we need to split off a test dataset\n",
 "# later we will use the train_test_split function from sklearn to do this, \n",
 "# but this just goes to show that there is no magic behind it\n",
@@ -264,10 +265,10 @@
 "idx = np.random.permutation(len(df)) # shuffled range of values from 0 to len(df)\n",
 "train_idx = idx[:2000] # 2/3 of the samples are in the training set\n",
 "test_idx = idx[2000:]\n",
-"X_train = X[train_idx] # pick out the rows from X corresponding to these indices\n",
-"X_test = X[test_idx]\n",
-"y_train = y[train_idx]\n",
-"y_test = y[test_idx]"
+"X_train = X.loc[train_idx] # pick out the rows from X corresponding to these indices\n",
+"X_test = X.loc[test_idx]\n",
+"y_train = y.loc[train_idx]\n",
+"y_test = y.loc[test_idx]"
 ]
 },
 {
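
One caveat about the switch from NumPy indexing to .loc above: np.random.permutation(len(df)) yields positional indices, while .loc is label-based, so this only works because df still carries its default RangeIndex at this point. A sketch of the positionally robust variant with .iloc (the seed is an arbitrary illustration, not the notebook's):

    # sketch: position-based train/test split that works for any index
    import numpy as np

    rng = np.random.default_rng(13)  # arbitrary seed for reproducibility
    idx = rng.permutation(len(X))
    X_train, X_test = X.iloc[idx[:2000]], X.iloc[idx[2000:]]
    y_train, y_test = y.iloc[idx[:2000]], y.iloc[idx[2000:]]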
@@ -344,8 +345,8 @@
 "# remove the outliers (i.e., keep only samples with a height > 0)\n",
 "df_new = df[df[\"height\"] > 0.]\n",
 "# create a train/test split again, this time using the sklearn function\n",
-"X_train, X_test, y_train, y_test = train_test_split(df_new[feature_cols].to_numpy(), \n",
-"                                                    df_new[\"faulty\"].to_numpy(), \n",
+"X_train, X_test, y_train, y_test = train_test_split(df_new[feature_cols], \n",
+"                                                    df_new[\"faulty\"], \n",
 "                                                    test_size=0.33, random_state=15)\n",
 "# see how imbalanced the label distribution in the training and test sets is\n",
 "print(f\"Fraction of ok items in training set: {1-np.mean(y_train):.3f}\")\n",
@@ -423,11 +424,26 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# when the tree is too large (or you're using a random forest),\n",
+"# to interpret the model when the tree is too large (or you're using a random forest),\n",
 "# check the feature importances instead of plotting the tree\n",
 "dict(zip(feature_cols, clf.feature_importances_))"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# similarly, we can also check the permutation feature importance\n",
+"# to see which features are most important for the model (works for any model)\n",
+"result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42, n_jobs=-1)\n",
+"sorted_idx = result.importances_mean.argsort()\n",
+"plt.figure(figsize=(10, 6))\n",
+"plt.boxplot(result.importances[sorted_idx].T, vert=False, labels=np.array(feature_cols)[sorted_idx])\n",
+"plt.title(\"Permutation Importance\");"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -496,7 +512,9 @@
 "**Questions:** \\\n",
 "Do you notice any patterns? How would you explain to the stakeholders why some of their products are faulty?\n",
 "\n",
-"(In this case, we can derive the relevant insights already from the plot. However, in real problems, the solution is usually not this obvious, so let's try to see how we could also solve this with ML.)"
+"(In this case, we can derive the relevant insights already from the plot. However, in real problems, the solution is usually not this obvious, so let's try to see how we could also solve this with ML.)\n",
+"\n",
+"### Supervised Learning 2.0"
 ]
 },
 {
@@ -520,8 +538,8 @@
 "df_new = df[df[\"height\"] > 0.]\n",
 "# let's try with temp as an additional feature\n",
 "feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\", \"temp\"]\n",
-"X = df_new[feature_cols].to_numpy()\n",
-"y = df_new[\"faulty\"].to_numpy()\n",
+"X = df_new[feature_cols]\n",
+"y = df_new[\"faulty\"]\n",
 "# split into train/test sets again\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=15)\n",
 "# see how imbalanced the label distribution in the training and test sets is\n",
@@ -604,11 +622,27 @@
 "=> Look at this pruned tree and understand which decisions are made (e.g., manually make the same splits on the parallel coordinates plot), i.e., verify that the tree is reaching the same conclusion as we did before."
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# you can also check how each feature influences the prediction\n",
+"# with a partial dependence plot (works for any model)\n",
+"plt.figure(figsize=(10, 5))\n",
+"display = plot_partial_dependence(\n",
+"    clf, X_train, feature_cols, kind=\"both\", subsample=50, line_kw={\"color\": '#15317E', \"label\": None},\n",
+"    n_cols=4, n_jobs=-1, grid_resolution=20, random_state=13, ax=plt.gca()\n",
+")\n",
+"display.figure_.subplots_adjust(wspace=0.1, hspace=0.5)"
+]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"#### Hyperparameter Tuning\n",
+"### Hyperparameter Tuning\n",
 "\n",
 "We started out with some initial hyperparameter settings for the decision tree, which already gave us quite good results. However, let's see if we can do even better by systematically testing different hyperparameter combinations, i.e., use a grid search with cross-validation to find an optimal value for `max_depth` and `min_samples_leaf`."
 ]
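
A version note on the new partial dependence cell: plot_partial_dependence was deprecated in scikit-learn 1.0 and removed in 1.2. On newer versions, the equivalent call (a sketch with the same arguments, reusing clf, X_train, and feature_cols) goes through PartialDependenceDisplay:

    # sketch: the same plot on scikit-learn >= 1.0
    from sklearn.inspection import PartialDependenceDisplay

    display = PartialDependenceDisplay.from_estimator(
        clf, X_train, feature_cols, kind="both", subsample=50,
        line_kw={"color": "#15317E", "label": None},
        n_cols=4, n_jobs=-1, grid_resolution=20, random_state=13,
    )
    display.figure_.subplots_adjust(wspace=0.1, hspace=0.5)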
@@ -694,7 +728,7 @@
 "2. Select the actual parameter values that we want to use for the final model (instead of blindly trusting the values that the grid search had selected for us): notice how with a depth of 5 or greater, all trees with a `min_samples_leaf` setting of 50 or less have the same performance and the grid search simply picked the first model with the best performance. However, as we know, a decision tree with a `min_samples_leaf` setting of 1 could in theory memorize individual points, which is not what we want (although this is unlikely with a depth of only 5 and pruning). Therefore, to ensure that we really get robust results, we should instead choose those parameter settings that result in the most regularized model that still produces good results, i.e., in this case a low value for `max_depth` (5) and a high value for `min_samples_leaf` (50).\n",
 "\n",
 "\n",
-"### Using a Logistic Regression Model\n",
+"### Try a Different Model\n",
 "\n",
 "Now that we've obtained very good results with a decision tree, let's see if we can do equally well on this dataset with a linear model (i.e., a logistic regression model, since we have a classification problem)."
 ]
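
Since point 2 above argues from the full grid of scores rather than just the winner, here is a sketch of how the search and the subsequent inspection might look (the parameter ranges are illustrative, not the notebook's exact grid):

    # sketch: grid search over the two tree hyperparameters discussed above
    import pandas as pd
    from sklearn import tree
    from sklearn.model_selection import GridSearchCV

    param_grid = {"max_depth": [3, 5, 7, 10], "min_samples_leaf": [1, 10, 50, 100]}
    gs = GridSearchCV(tree.DecisionTreeClassifier(random_state=0), param_grid, cv=5)
    gs.fit(X_train, y_train)
    # inspect *all* results, not only gs.best_params_, to pick the most
    # regularized model that still scores close to the best
    results = pd.DataFrame(gs.cv_results_)
    cols = ["param_max_depth", "param_min_samples_leaf", "mean_test_score"]
    print(results[cols].sort_values("mean_test_score", ascending=False))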
@@ -798,8 +832,8 @@
 "# do a manual feature selection based on the coefficients of the L1 regularized model\n",
 "feature_cols = [\"product_1\", \"product_17\", \"temp\"]\n",
 "# construct a new feature matrix and create the train/test split with this new matrix again\n",
-"X = df_new[feature_cols].to_numpy()\n",
-"y = df_new[\"faulty\"].to_numpy()\n",
+"X = df_new[feature_cols]\n",
+"y = df_new[\"faulty\"]\n",
 "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=15)\n",
 "# and don't forget to scale the data again!\n",
 "scaler = StandardScaler()\n",

View File

@@ -228,7 +228,7 @@
 "notebook_metadata_filter": "-all"
 },
 "kernelspec": {
-"display_name": "Python 3",
+"display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
@@ -242,7 +242,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.5"
+"version": "3.10.2"
 }
 },
 "nbformat": 4,