quizzes as pdfs and workbook
@@ -256,7 +256,7 @@
"feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\"]\n",
"X = df[feature_cols].to_numpy() # convert df into a numpy array\n",
"# ... and the vector with labels\n",
"y = df[[\"faulty\"]].to_numpy()\n",
"y = df[\"faulty\"].to_numpy()\n",
"# to evaluate our prediction model, we need to split off a test dataset\n",
"# later we will use the train_test_split function from sklearn to do this, \n",
"# but this just goes to show that there is no magic behind it\n",
@@ -340,46 +340,12 @@
"metadata": {},
"outputs": [],
"source": [
"# maybe we just need to give the tree the freedom to make more splits? (i.e., increase its depth)\n",
"clf = tree.DecisionTreeClassifier(max_depth=100, random_state=1)\n",
"clf = clf.fit(X_train, y_train)\n",
"print(f\"Accuracy on training data: {clf.score(X_train, y_train):.3f}\")\n",
"print(f\"Accuracy on test data: {clf.score(X_test, y_test):.3f}\")\n",
"print(f\"Balanced accuracy on training data: {balanced_accuracy_score(y_train, clf.predict(X_train)):.3f}\")\n",
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
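For reference, balanced accuracy is the unweighted mean of the per-class recall, so the sklearn call above can be reproduced by hand; a rough sketch, assuming clf, X_test, and a 1-D y_test as in the cell above:

    import numpy as np

    y_pred = clf.predict(X_test)
    # recall per class: fraction of that class's samples that were predicted correctly
    recalls = [np.mean(y_pred[y_test == c] == c) for c in np.unique(y_test)]
    print(f"Balanced accuracy (manual): {np.mean(recalls):.3f}")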
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Questions:** \\\n",
"Is this a better model? If anything, is the model over- or underfitting?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# when the tree is too large (or you're using a random forest),\n",
"# check the feature importances instead of plotting the tree\n",
"dict(zip(feature_cols, clf.feature_importances_))"
]
},
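If the raw dict becomes hard to read with many features, the importances can also be printed sorted; a small illustrative snippet (not from the notebook):

    # sort features by importance, most informative first
    ranked = sorted(zip(feature_cols, clf.feature_importances_), key=lambda t: t[1], reverse=True)
    for name, imp in ranked:
        print(f"{name}: {imp:.3f}")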
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# now let's do what we probably should have done in the beginning and \n",
"# let's do what we probably should have done in the beginning and \n",
"# remove the outliers (i.e., keep only samples with a height > 0)\n",
"df_new = df[df[\"height\"] > 0.]\n",
"# create a train/test split again, this time using the sklearn function\n",
"X_train, X_test, y_train, y_test = train_test_split(df_new[feature_cols].to_numpy(), \n",
" df_new[[\"faulty\"]].to_numpy(), \n",
" df_new[\"faulty\"].to_numpy(), \n",
" test_size=0.33, random_state=15)\n",
"# see how imbalanced the label distribution in the training and test sets is\n",
"print(f\"Fraction of ok items in training set: {1-np.mean(y_train):.3f}\")\n",
@@ -408,6 +374,18 @@
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# plot the tree again\n",
"plt.figure(figsize=(15, 10))\n",
"tree.plot_tree(clf, feature_names=feature_cols, filled=True, class_names=np.array(clf.classes_, dtype=str), proportion=True);\n",
"# notice how in the leaf nodes where the tree predicts \"faulty\", there are only very few data points"
]
},
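One way to avoid such sparsely populated leaves (an illustrative variation, not part of this commit) is to require a minimum number of training samples per leaf:

    # the value 20 is arbitrary and only for illustration
    clf = tree.DecisionTreeClassifier(max_depth=100, min_samples_leaf=20, random_state=1)
    clf = clf.fit(X_train, y_train)
    print(f"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}")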
{
"cell_type": "markdown",
"metadata": {},
@@ -422,10 +400,32 @@
"metadata": {},
"outputs": [],
"source": [
"# plot the tree\n",
"plt.figure(figsize=(15, 10))\n",
"tree.plot_tree(clf, feature_names=feature_cols, filled=True, class_names=np.array(clf.classes_, dtype=str), proportion=True);\n",
"# notice how in the leaf nodes where the tree predicts \"faulty\", there are only very few data points"
"# maybe we just need to give the tree the freedom to make more splits? (i.e., increase its depth)\n",
"clf = tree.DecisionTreeClassifier(max_depth=100, random_state=1)\n",
"clf = clf.fit(X_train, y_train)\n",
"print(f\"Accuracy on training data: {clf.score(X_train, y_train):.3f}\")\n",
"print(f\"Accuracy on test data: {clf.score(X_test, y_test):.3f}\")\n",
"print(f\"Balanced accuracy on training data: {balanced_accuracy_score(y_train, clf.predict(X_train)):.3f}\")\n",
"print(f\"Balanced accuracy on test data: {balanced_accuracy_score(y_test, clf.predict(X_test)):.3f}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Questions:** \\\n",
"Is this a better model? If anything, is the model over- or underfitting?"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# when the tree is too large (or you're using a random forest),\n",
"# check the feature importances instead of plotting the tree\n",
"dict(zip(feature_cols, clf.feature_importances_))"
]
},
{
@@ -521,7 +521,7 @@
"# let's try with temp as an additional feature\n",
"feature_cols = [\"product_1\", \"product_5\", \"product_17\", \"height\", \"width\", \"depth\", \"temp\"]\n",
"X = df_new[feature_cols].to_numpy()\n",
"y = df_new[[\"faulty\"]].to_numpy()\n",
"y = df_new[\"faulty\"].to_numpy()\n",
"# split into train/test sets again\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=15)\n",
"# see how imbalanced the label distribution in the training and test sets is\n",
@@ -706,7 +706,6 @@
"outputs": [],
"source": [
"# try a different classifier: logistic regression\n",
"y_train, y_test = y_train.flatten(), y_test.flatten() # otherwise the model will complain about the shapes\n",
"# first, try the model with the default parameter settings\n",
"clf = LogisticRegression()\n",
"clf = clf.fit(X_train, y_train)\n",
@@ -117,7 +117,7 @@
"metadata": {},
"source": [
"-------------------------------------------------------------------------------------\n",
"You're already given this rudimentary prediction pipeline, now your job is to improve it. Below are some things you might want to try, but feel free to get creative! Have a look at the [cheat sheet](https://github.com/cod3licious/ml_exercises/blob/main/cheatsheet.pdf) for more ideas and a concise overview of the relevant steps when developing a machine learning solution in any data science project. \n",
"You're already given this rudimentary prediction pipeline, now your job is to improve it. Below are some things you might want to try, but feel free to get creative! Have a look at the [cheat sheet](https://github.com/cod3licious/ml_exercises/blob/main/other/cheatsheet.pdf) for more ideas and a concise overview of the relevant steps when developing a machine learning solution in any data science project. \n",
"\n",
"The previous notebook, \"analyze toydata\", deals with a very similar problem and can serve as a guideline for this exercise. For an example of how to use the t-SNE algorithm, have a look at the first notebook, \"visualize text\" (but please note that since you don't have sparse data here, there is no need to transform the data with a kernel PCA before using t-SNE).\n",
"\n",