quizzes as pdfs and workbook
@@ -256,7 +256,7 @@
"feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\"]\n",
"X = df[feature_cols].to_numpy() # convert df into a numpy array\n",
"# ... and the vector with labels\n",
"y = df[[\"faulty\"]].to_numpy()\n",
"y = df[\"faulty\"].to_numpy()\n",
"# to evaluate our prediction model, we need to split off a test dataset\n",
"# later we will use the train_test_split function from sklearn to do this, \n",
"# but this just goes to show that there is no magic behind it\n",
@@ -340,46 +340,12 @@
"metadata": {},
"outputs": [],
"source": [
"# maybe we just need to give the tree the freedom to make more splits? (i.e., increase its depth)\n",
"clf = tree.DecisionTreeClassifier(max_depth=100, random_state=1)\n",
"clf = clf.fit(X_train, y_train)\n",
"print(f\"Accuracy on training data: {clf.score(X_train, y_train):.3f}\")\n",
"print(f\"Accuracy on test data: {clf.score(X_test, y_test):.3f}\")\n",
"print(f\"Balanced accuracy on training data: {balanced_accuracy_score(y_train, clf.predict(X_train)):.3f}\")\n",
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
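For reference, balanced accuracy is the unweighted mean of the per-class recall, so the sklearn call above can be reproduced by hand; a rough sketch, assuming clf, X_test, and a 1-D y_test as in the cell above:

    import numpy as np

    y_pred = clf.predict(X_test)
    # recall per class: fraction of that class's samples that were predicted correctly
    recalls = [np.mean(y_pred[y_test == c] == c) for c in np.unique(y_test)]
    print(f"Balanced accuracy (manual): {np.mean(recalls):.3f}")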
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Questions:** \\\n",
"Is this a better model? If anything, is the model over- or underfitting?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# when the tree is too large (or you're using a random forest),\n",
"# check the feature importances instead of plotting the tree\n",
"dict(zip(feature_cols, clf.feature_importances_))"
]
},
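If the raw dict becomes hard to read with many features, the importances can also be printed sorted; a small illustrative snippet (not from the notebook):

    # sort features by importance, most informative first
    ranked = sorted(zip(feature_cols, clf.feature_importances_), key=lambda t: t[1], reverse=True)
    for name, imp in ranked:
        print(f"{name}: {imp:.3f}")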
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# now let's do what we probably should have done in the beginning and \n",
"# let's do what we probably should have done in the beginning and \n",
"# remove the outliers (i.e., keep only samples with a height > 0)\n",
"df_new = df[df[\"height\"] > 0.]\n",
"# create a train/test split again, this time using the sklearn function\n",
"X_train, X_test, y_train, y_test = train_test_split(df_new[feature_cols].to_numpy(), \n",
" df_new[[\"faulty\"]].to_numpy(), \n",
" df_new[\"faulty\"].to_numpy(), \n",
" test_size=0.33, random_state=15)\n",
"# see how imbalanced the label distribution in the training and test sets is\n",
"print(f\"Fraction of ok items in training set: {1-np.mean(y_train):.3f}\")\n",
@@ -408,6 +374,18 @@
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the tree again\n",
"plt.figure(figsize=(15, 10))\n",
"tree.plot_tree(clf, feature_names=feature_cols, filled=True, class_names=np.array(clf.classes_, dtype=str), proportion=True);\n",
"# notice how in the leaf nodes where the tree predicts \"faulty\", there are only very few data points"
]
},
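One way to avoid such sparsely populated leaves (an illustrative variation, not part of this commit) is to require a minimum number of training samples per leaf:

    # the value 20 is arbitrary and only for illustration
    clf = tree.DecisionTreeClassifier(max_depth=100, min_samples_leaf=20, random_state=1)
    clf = clf.fit(X_train, y_train)
    print(f"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}")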
{
"cell_type": "markdown",
"metadata": {},
@@ -422,10 +400,32 @@
"metadata": {},
"outputs": [],
"source": [
"# plot the tree\n",
"plt.figure(figsize=(15, 10))\n",
"tree.plot_tree(clf, feature_names=feature_cols, filled=True, class_names=np.array(clf.classes_, dtype=str), proportion=True);\n",
"# notice how in the leaf nodes where the tree predicts \"faulty\", there are only very few data points"
"# maybe we just need to give the tree the freedom to make more splits? (i.e., increase its depth)\n",
"clf = tree.DecisionTreeClassifier(max_depth=100, random_state=1)\n",
"clf = clf.fit(X_train, y_train)\n",
"print(f\"Accuracy on training data: {clf.score(X_train, y_train):.3f}\")\n",
"print(f\"Accuracy on test data: {clf.score(X_test, y_test):.3f}\")\n",
"print(f\"Balanced accuracy on training data: {balanced_accuracy_score(y_train, clf.predict(X_train)):.3f}\")\n",
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Questions:** \\\n",
"Is this a better model? If anything, is the model over- or underfitting?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# when the tree is too large (or you're using a random forest),\n",
"# check the feature importances instead of plotting the tree\n",
"dict(zip(feature_cols, clf.feature_importances_))"
]
},
{
@@ -521,7 +521,7 @@
"# let's try with temp as an additional feature\n",
"feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\", \"temp\"]\n",
"X = df_new[feature_cols].to_numpy()\n",
"y = df_new[[\"faulty\"]].to_numpy()\n",
"y = df_new[\"faulty\"].to_numpy()\n",
"# split into train/test sets again\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=15)\n",
"# see how imbalanced the label distribution in the training and test sets is\n",
@@ -706,7 +706,6 @@
"outputs": [],
"source": [
"# try a different classifier: logistic regression\n",
"y_train, y_test = y_train.flatten(), y_test.flatten() # otherwise the model will complain about the shapes\n",
"# first, try the model with the default parameter settings\n",
"clf = LogisticRegression()\n",
"clf = clf.fit(X_train, y_train)\n",
@@ -117,7 +117,7 @@
"metadata": {},
"source": [
"-------------------------------------------------------------------------------------\n",
"You're already given this rudimentary prediction pipeline, now your job is to improve it. Below are some things you might want to try, but feel free to get creative! Have a look at the [cheat sheet](https://github.com/cod3licious/ml_exercises/blob/main/cheatsheet.pdf) for more ideas and a concise overview of the relevant steps when developing a machine learning solution in any data science project. \n",
"You're already given this rudimentary prediction pipeline, now your job is to improve it. Below are some things you might want to try, but feel free to get creative! Have a look at the [cheat sheet](https://github.com/cod3licious/ml_exercises/blob/main/other/cheatsheet.pdf) for more ideas and a concise overview of the relevant steps when developing a machine learning solution in any data science project. \n",
"\n",
"The previous notebook, \"analyze toydata\", deals with a very similar problem and can serve as a guideline for this exercise. For an example of how to use the t-SNE algorithm, have a look at the first notebook, \"visualize text\" (but please note that since you don't have sparse data here, there is no need to transform the data with a kernel PCA before using t-SNE).\n",
"\n",