mirror of
https://github.com/ArthurDanjou/ml_exercises.git
synced 2026-01-14 12:14:38 +01:00
update sklearn version and case study with backend
This commit is contained in:
@@ -26,10 +26,7 @@
|
||||
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
||||
"from sklearn.decomposition import KernelPCA\n",
|
||||
"from sklearn.manifold import TSNE\n",
|
||||
"import plotly.express as px\n",
|
||||
"# suppress unnecessary warnings\n",
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(action='ignore', category=FutureWarning)"
|
||||
"import plotly.express as px"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -25,8 +25,8 @@
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"from sklearn.inspection import DecisionBoundaryDisplay\n",
|
||||
"# don't get unnecessary warnings\n",
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(action='ignore', category=FutureWarning)"
|
||||
"# import warnings\n",
|
||||
"# warnings.simplefilter(action='ignore', category=FutureWarning)"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -25,13 +25,13 @@
|
||||
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"from sklearn.model_selection import GridSearchCV, train_test_split\n",
|
||||
"from sklearn.inspection import plot_partial_dependence, permutation_importance\n",
|
||||
"from sklearn.inspection import PartialDependenceDisplay, permutation_importance\n",
|
||||
"from sklearn import tree\n",
|
||||
"# interactive plotting (parallel coordinate plot)\n",
|
||||
"import plotly.express as px\n",
|
||||
"# suppress unnecessary warnings\n",
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(action='ignore', category=FutureWarning)"
|
||||
"# import warnings\n",
|
||||
"# warnings.simplefilter(action='ignore', category=FutureWarning)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -142,8 +142,8 @@
|
||||
"# look at the correlation matrix to see the correlations between all variables\n",
|
||||
"# for more info on what these numbers mean see here: https://en.wikipedia.org/wiki/Correlation_and_dependence\n",
|
||||
"corr_mat = df.corr()\n",
|
||||
"# uncomment the part below to see the table in color\n",
|
||||
"corr_mat #.style.background_gradient(cmap='coolwarm', axis=None).set_precision(2)"
|
||||
"# we add color to the table with .style\n",
|
||||
"corr_mat.style.background_gradient(cmap='coolwarm', axis=None).format(precision=2)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -241,7 +241,7 @@
|
||||
"source": [
|
||||
"# \"product\" is a categorical variable; for it to be handled correctly,\n",
|
||||
"# we have to transform it into a one-hot encoded vector\n",
|
||||
"e = OneHotEncoder(sparse=False, categories='auto')\n",
|
||||
"e = OneHotEncoder(sparse_output=False, categories='auto')\n",
|
||||
"ohe = e.fit_transform(df[[\"product\"]])\n",
|
||||
"df = df.join(pd.DataFrame(ohe, columns=[f\"product_{i}\" for i in e.categories_[0]], index=df.index))\n",
|
||||
"df.head() # notice the additional columns with zeros and a one"
|
||||
@@ -631,7 +631,7 @@
|
||||
"# you can also check how each feature influences the prediction\n",
|
||||
"# with a partial dependence plot (works for any model)\n",
|
||||
"plt.figure(figsize=(10, 5))\n",
|
||||
"display = plot_partial_dependence(\n",
|
||||
"display = PartialDependenceDisplay.from_estimator(\n",
|
||||
" clf, X_train, feature_cols, kind=\"both\", subsample=50, line_kw={\"color\": '#15317E', \"label\": None},\n",
|
||||
" n_cols=4, n_jobs=-1, grid_resolution=20, random_state=13, ax=plt.gca()\n",
|
||||
")\n",
|
||||
|
||||
@@ -10,7 +10,33 @@
|
||||
"\n",
|
||||
"The previous notebook, \"analyze toydata\", deals with a similar problem and can serve as a guideline for this exercise. You may also want to have a look at the [cheat sheet](https://franziskahorn.de/mlws_resources/cheatsheet.pdf) for more ideas and a concise overview of the relevant steps when developing a machine learning solution in any data science project. \n",
|
||||
"\n",
|
||||
"Feel free to get creative! "
|
||||
"Feel free to get creative! \n",
|
||||
"\n",
|
||||
"### The Task\n",
|
||||
"\n",
|
||||
"You are a data scientist at a startup that wants to build an app to help construction workers mix concrete of a certain target compressive strength (= the main quality parameter of concrete). The construction worker can enter the amounts of the different ingredients:\n",
|
||||
"```\n",
|
||||
"Cement: _____ kg/m^3\n",
|
||||
"Slag: _____ kg/m^3\n",
|
||||
"Fly Ash: _____ kg/m^3\n",
|
||||
"Fine Aggregate: _____ kg/m^3\n",
|
||||
"Coarse Aggregate: _____ kg/m^3\n",
|
||||
"Plasticizer: _____ kg/m^3\n",
|
||||
"Water: _____ kg/m^3\n",
|
||||
"```\n",
|
||||
"and the app then tells them what the estimated compressive strength of this mixture will be:\n",
|
||||
"```\n",
|
||||
"Predicted Compressive Strength after 28 days: XX.X MPa\n",
|
||||
"```\n",
|
||||
"**Your job is to build an ML model that predicts this compressive strength.**\n",
|
||||
"\n",
|
||||
"Your colleagues have already prepared the backend for the app (see notebook `b`), which will load your model to make the predictions. Additionally, the backend also contains code to optimize the water content in the concrete recipe so that the mixture will get closer to the specified target strength:\n",
|
||||
"```\n",
|
||||
"Target strength: _____ MPa\n",
|
||||
"\n",
|
||||
"Recommended water amount: XX.X kg/m^3\n",
|
||||
"New predicted strength with optimized water amount: XX.X MPa\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -19,20 +45,14 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# first load some libraries that are needed later\n",
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"from scipy.optimize import minimize\n",
|
||||
"# machine learning stuff\n",
|
||||
"from sklearn.metrics import mean_absolute_error\n",
|
||||
"import requests\n",
|
||||
"import joblib\n",
|
||||
"from sklearn.dummy import DummyRegressor\n",
|
||||
"from sklearn.metrics import mean_absolute_error\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"# interactive plotting (parallel coordinate plot)\n",
|
||||
"import plotly.express as px\n",
|
||||
"# suppress unnecessary warnings\n",
|
||||
"import warnings\n",
|
||||
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
|
||||
"\n",
|
||||
"# these \"magic commands\" are helpful if you plan to import functions from another script\n",
|
||||
"# where you keep changing things, i.e., if you change a function in the script\n",
|
||||
@@ -50,7 +70,7 @@
|
||||
"The original data can be obtained from the [UCI ML data repository](https://archive.ics.uci.edu/ml/datasets/Concrete+Compressive+Strength). (If you're having trouble loading the xls file, you can also open it in Excel and then save it as a CSV file and use `pd.read_csv` instead.)\n",
|
||||
"\n",
|
||||
"One data point in the dataset corresponds to one concrete mixture with the following variables:\n",
|
||||
"- Mixture components (unit: kg in a m3 mixture):\n",
|
||||
"- Mixture components (unit: kg in a m^3 mixture):\n",
|
||||
" - Cement\n",
|
||||
" - Blast Furnace Slag\n",
|
||||
" - Fly Ash\n",
|
||||
@@ -108,7 +128,7 @@
|
||||
"- How are the individual variables distributed?\n",
|
||||
"- Are any variables correlated? \n",
|
||||
"- Do you observe any patterns between the input and target variables? Do these make sense or is anything surprising?\n",
|
||||
"- Anything else you should take into account when preprocessing the data later for the supervised learning part?"
|
||||
"- Anything else you should take into account when preprocessing the data later for the supervised learning task?"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -117,7 +137,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create some plots to better understand the data\n"
|
||||
"# TODO: create some plots to better understand the data\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -128,14 +148,15 @@
|
||||
"\n",
|
||||
"Now that you've become more familiar with the dataset, it's time to tackle the real task, i.e., predict the 28-day compressive strength of a concrete mixture.\n",
|
||||
"\n",
|
||||
"An evaluation pipeline is already set up below using a \"stupid baseline\" (= predicting the mean). Your task is to improve upon the performance by trying... \n",
|
||||
"- different models\n",
|
||||
"- different preprocessing steps (e.g., transformations or feature engineering)\n",
|
||||
"- hyperparameter tuning\n",
|
||||
"An evaluation pipeline is already set up below using a \"stupid baseline\" (= predicting the mean). Your task is to improve upon the performance by trying, for example... \n",
|
||||
"- different [models](https://scikit-learn.org/stable/supervised_learning.html)\n",
|
||||
"- different [preprocessing steps](https://scikit-learn.org/stable/modules/preprocessing.html) (e.g., transformations or feature engineering)\n",
|
||||
"- [hyperparameter tuning](https://scikit-learn.org/stable/modules/grid_search.html)\n",
|
||||
"- [ensemble models](https://scikit-learn.org/stable/modules/ensemble.html) (i.e., combining different models)\n",
|
||||
"\n",
|
||||
"Get creative :-)\n",
|
||||
"\n",
|
||||
"**Tip:** Have a look at the [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) function from sklearn to combine multiple steps (e.g., preprocessing and prediction model) into a single estimator object that can be applied to the original data."
|
||||
"**Tip:** To use your model within the app's backend later (notebook `b`), it's important that your final model, including all necessary preprocessing steps, is combined in a single estimator object. This can be accomplished with sklearn's [`make_pipeline`](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html) function. If necessary, you could even write a [custom transformer](https://towardsdatascience.com/pipelines-custom-transformers-in-scikit-learn-the-step-by-step-guide-with-python-code-4a7d9b068156) to perform more fancy feature engineering steps than what is provided by sklearn.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -167,8 +188,9 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define features (all input variables except age) and target\n",
|
||||
"features = ['cement', 'slag', 'fly_ash', 'water', 'plasticizer', 'coarse_aggregate', 'fine_aggregate']\n",
|
||||
"target = 'strength'"
|
||||
"# CAUTION: do not change this - the backend assumes these will be the inputs for your model\n",
|
||||
"features = [\"cement\", \"slag\", \"fly_ash\", \"water\", \"plasticizer\", \"coarse_aggregate\", \"fine_aggregate\"]\n",
|
||||
"target = \"strength\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -238,7 +260,19 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# now it's up to you: try an actual model and get better predictions!\n"
|
||||
"# TODO: now it's up to you: try an actual model and get better predictions!\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# save your final model so it can be used by the backend\n",
|
||||
"# CAUTION: your model must be a single estimator object that also includes all necessary preprocessing steps \n",
|
||||
"# (e.g., by using the `make_pipeline` function mentioned above) which works on the originally defined features\n",
|
||||
"joblib.dump(model, \"strength_model.pkl\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -260,7 +294,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# permutation feature importance\n"
|
||||
"# TODO: permutation feature importance\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -269,22 +303,63 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# partial dependence plots\n"
|
||||
"# TODO: partial dependence plots\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 4: Optimization & What-If Analysis\n",
|
||||
"## Step 4: Use the backend\n",
|
||||
"\n",
|
||||
"Head over to notebook `b` and run it from top to bottom and leave it open. It runs the server with our backend using your trained and saved model to make predictions.\n",
|
||||
"\n",
|
||||
"In the code below we query the backend with a single data point to see what it returns."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# url where the backend is running\n",
|
||||
"port = 8000\n",
|
||||
"backend_url = f\"http://localhost:{port}/predict\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# get one row from our feature matrix and convert it to a dictionary\n",
|
||||
"x = X_test.iloc[0].to_dict()\n",
|
||||
"# add a target strength (optional)\n",
|
||||
"x[\"target_strength\"] = 42.5\n",
|
||||
"# send this as a json to the backend via a POST request\n",
|
||||
"response = requests.post(backend_url, json=x)\n",
|
||||
"# the result will also be a dictionary with\n",
|
||||
"# - the original water content\n",
|
||||
"# - the predicted strength with the original water content\n",
|
||||
"# - the optimized water content\n",
|
||||
"# - the new predicted strength when using the optimized water content\n",
|
||||
"result = response.json()\n",
|
||||
"result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: Optimization & What-If Analysis\n",
|
||||
"\n",
|
||||
"When mixing concrete, water can usually be dosed rather flexibly.\n",
|
||||
"\n",
|
||||
"Let's say our goal is to achieve a compressive strength after 28 days of 42.5 MPa. \n",
|
||||
"Let's say our goal is to achieve a compressive strength after 28 days of 42.5 MPa.\n",
|
||||
"\n",
|
||||
"Use your prediction model on the test set to see whether the concrete is getting too strong or too weak and then change the water levels accordingly to make sure the production is on target.\n",
|
||||
"\n",
|
||||
"You can run the code below as is, just make sure that `model` is an estimator object that also includes all necessary preprocessing steps (e.g., by using the `make_pipeline` function mentioned above).\n",
|
||||
"Your prediction model is used in the backend to check whether the proposed concrete mixture would get too strong or too weak and then the water amount is adapted accordingly to get closer to the target.\n",
|
||||
"\n",
|
||||
"Does your model help to get the production more on target?"
|
||||
]
|
||||
@@ -295,97 +370,10 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def optimize_water(model, x, value_min, value_max, target_strength=42.5):\n",
|
||||
"def plot_what_if(X, y, target_strength=42.5):\n",
|
||||
" \"\"\"\n",
|
||||
" Optimize the water content for a concrete mixture.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - model: the trained model\n",
|
||||
" - x: pandas dataframe row with one data point\n",
|
||||
" - value_min: minimum bound for water content\n",
|
||||
" - value_max: maximum bound for water content\n",
|
||||
" - target_strength: what we would like the output to be (default: 42.5)\n",
|
||||
" Returns:\n",
|
||||
" - water_org: original water content\n",
|
||||
" - water_new: optimized water content\n",
|
||||
" - pred_org: original strength prediction of the model\n",
|
||||
" - pred_new: strength prediction with optimized water content\n",
|
||||
" \"\"\"\n",
|
||||
" # original situation\n",
|
||||
" water_org = x[\"water\"].values[0]\n",
|
||||
" pred_org = model.predict(x)[0]\n",
|
||||
" print(f\"original prediction with water content {water_org:.1f}: {pred_org:.2f} MPa\")\n",
|
||||
" \n",
|
||||
" def _loss_fun(water_value):\n",
|
||||
" \"\"\"\n",
|
||||
" Nested function (i.e., has access to all variables from the enclosing function)\n",
|
||||
" to compute the squared error between the models strength prediction with the given \n",
|
||||
" water value and our target strength value.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - water_value: np.array with a single value, the proposed water content\n",
|
||||
" Returns:\n",
|
||||
" - loss: the squared error between the predicted and target strength\n",
|
||||
" \"\"\"\n",
|
||||
" # insert the new value into our original data point\n",
|
||||
" new_x = x.copy()\n",
|
||||
" new_x[\"water\"] = water_value[0]\n",
|
||||
" # predict strength with new water content\n",
|
||||
" pred_strength = model.predict(new_x)\n",
|
||||
" # optimization loss = squared difference to target value\n",
|
||||
" loss = (target_strength - pred_strength)**2\n",
|
||||
" return loss\n",
|
||||
" \n",
|
||||
" # use scipy's minimize function to find a value for 'water'\n",
|
||||
" # where the model predicts something close to our target value.\n",
|
||||
" # the start value for the optimization is the original water content.\n",
|
||||
" # to get realistic values, we additionaly specify bounds\n",
|
||||
" # based on the actual min/max values for the water content\n",
|
||||
" res = minimize(_loss_fun, np.array([water_org]), bounds=[(value_min, value_max)], method=\"Powell\")\n",
|
||||
" # the optimized water content is stored in res.x (again a np.array)\n",
|
||||
" water_new = res.x[0]\n",
|
||||
" # check the final strength prediction\n",
|
||||
" new_x = x.copy()\n",
|
||||
" new_x[\"water\"] = water_new\n",
|
||||
" pred_new = model.predict(new_x)[0]\n",
|
||||
" print(f\"new prediction with water content {water_new:.1f}: {pred_new:.2f} MPa\")\n",
|
||||
" return water_org, water_new, pred_org, pred_new\n",
|
||||
" \n",
|
||||
"\n",
|
||||
"def optimize_water_all(model, X, target_strength=42.5):\n",
|
||||
" \"\"\"\n",
|
||||
" Compute the optimized the water content for all data points.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - model: the trained model\n",
|
||||
" - X: pandas dataframe with input features for all data points\n",
|
||||
" - target_strength: what we would like the output to be (default: 42.5)\n",
|
||||
" Returns:\n",
|
||||
" - water_org_s: original water content for all data points\n",
|
||||
" - water_new_s: optimized water content for all data points\n",
|
||||
" - pred_org_s: original strength prediction of the model for all data points\n",
|
||||
" - pred_new_s: strength prediction with optimized water content for all data points\n",
|
||||
" \"\"\"\n",
|
||||
" # bounds for optimization: known, realistic values for water content\n",
|
||||
" value_min, value_max = X[\"water\"].min(), X[\"water\"].max()\n",
|
||||
" # run the optimization for all data points\n",
|
||||
" water_org_s, water_new_s, pred_org_s, pred_new_s = [], [], [], []\n",
|
||||
" for i in range(len(X)):\n",
|
||||
" water_org, water_new, pred_org, pred_new = optimize_water(model, X.iloc[[i]], value_min, value_max, \n",
|
||||
" target_strength)\n",
|
||||
" water_org_s.append(water_org)\n",
|
||||
" water_new_s.append(water_new)\n",
|
||||
" pred_org_s.append(pred_org)\n",
|
||||
" pred_new_s.append(pred_new)\n",
|
||||
" # convert lists to numpy arrays for easier plotting\n",
|
||||
" water_org_s, water_new_s = np.array(water_org_s), np.array(water_new_s)\n",
|
||||
" pred_org_s, pred_new_s = np.array(pred_org_s), np.array(pred_new_s)\n",
|
||||
" return water_org_s, water_new_s, pred_org_s, pred_new_s\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def plot_optimization(water_org_s, water_new_s, pred_org_s, pred_new_s, y, target_strength=42.5):\n",
|
||||
" \"\"\"\n",
|
||||
" Create two plots based on the results from optimize_water_all:\n",
|
||||
" Compute and plot the optimized water content for all data points.\n",
|
||||
" This code creates two plots:\n",
|
||||
" 1. What-if results after optimization: by changing the water content, the strength predictions\n",
|
||||
" should be closer to the target strength, even after correcting for prediction errors.\n",
|
||||
" The legend includes the MATD = mean absolute target deviation, i.e., how far away the\n",
|
||||
@@ -393,13 +381,26 @@
|
||||
" 2. Original and optimized water content and resulting strength increase/decrease.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - water_org_s: original water content for all data points\n",
|
||||
" - water_new_s: optimized water content for all data points\n",
|
||||
" - pred_org_s: original strength prediction of the model for all data points\n",
|
||||
" - pred_new_s: strength prediction with optimized water content for all data points\n",
|
||||
" - X: pandas dataframe with original input features for all data points\n",
|
||||
" - y: pandas dataframe with true compressive strength values for all data points\n",
|
||||
" - target_strength: what we would like the output to be (default: 42.5)\n",
|
||||
" \"\"\"\n",
|
||||
" # run the optimization for all data points\n",
|
||||
" water_org_s, water_new_s, pred_org_s, pred_new_s = [], [], [], []\n",
|
||||
" for i in range(len(X)):\n",
|
||||
" if not i % 10:\n",
|
||||
" print(f\"backend queried for {i:2} / {len(X)} data points\", end=\"\\r\")\n",
|
||||
" x = X.iloc[i].to_dict()\n",
|
||||
" x[\"target_strength\"] = target_strength\n",
|
||||
" result = requests.post(backend_url, json=x).json()\n",
|
||||
" water_org_s.append(result[\"water_org\"])\n",
|
||||
" water_new_s.append(result[\"water_new\"])\n",
|
||||
" pred_org_s.append(result[\"pred_org\"])\n",
|
||||
" pred_new_s.append(result[\"pred_new\"])\n",
|
||||
" print(f\"backend queried for {len(X)} / {len(X)} data points\")\n",
|
||||
" # convert lists to numpy arrays for easier plotting\n",
|
||||
" water_org_s, water_new_s = np.array(water_org_s), np.array(water_new_s)\n",
|
||||
" pred_org_s, pred_new_s = np.array(pred_org_s), np.array(pred_new_s)\n",
|
||||
" # convert y to a numpy array to make sure the indices match up with the other arrays\n",
|
||||
" target_org_s = y.to_numpy()\n",
|
||||
" \n",
|
||||
@@ -433,7 +434,7 @@
|
||||
" plt.plot([water_new_s.min(), water_new_s.max()], [water_new_s.min(), water_new_s.max()], \"k\", alpha=0.5)\n",
|
||||
" # points above the line: more water than before\n",
|
||||
" # points below the line: less water than before\n",
|
||||
" # color of dot shows whether the optimization resulted in a reduction or increase in strength\n",
|
||||
" # color of dot shows whether the optimization resulted in a reduction or increase in predicted strength\n",
|
||||
" plt.scatter(water_org_s, water_new_s, c=pred_new_s-pred_org_s)\n",
|
||||
" plt.colorbar()\n",
|
||||
" plt.xlabel(\"original water content\")\n",
|
||||
@@ -448,25 +449,15 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# run the optimization on the test set\n",
|
||||
"water_org_s, water_new_s, pred_org_s, pred_new_s = optimize_water_all(model, X_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# plot the results\n",
|
||||
"plot_optimization(water_org_s, water_new_s, pred_org_s, pred_new_s, y_test)"
|
||||
"# run the optimization on the test set and plot the results\n",
|
||||
"plot_what_if(X_test, y_test)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Step 5: Presentation of results\n",
|
||||
"## Step 6: Presentation of results\n",
|
||||
"Clean up your code & think about which results you want to present and the story they tell:\n",
|
||||
"- What have you learned about concrete production and how is this reflected in the data?\n",
|
||||
"- What is the best model that you found & its performance?\n",
|
||||
210
notebooks/5b_quality_prediction_backend.ipynb
Normal file
210
notebooks/5b_quality_prediction_backend.ipynb
Normal file
@@ -0,0 +1,210 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Case Study: Backend\n",
|
||||
"\n",
|
||||
"This notebook contains the backend code for the app. We first load the model you've trained (and saved as `\"strength_model.pkl\"`) in the previous notebook `a` and then make the predictions and optimize the water content of the concrete mixture. \n",
|
||||
"\n",
|
||||
"The backend is built using [FastAPI](https://fastapi.tiangolo.com/), a modern, fast (high-performance), web framework for building APIs with Python. Please note that normally this code would be run as a regular script, not in a notebook."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np\n",
|
||||
"import pandas as pd\n",
|
||||
"import joblib\n",
|
||||
"import nest_asyncio\n",
|
||||
"import uvicorn\n",
|
||||
"from fastapi import Body, FastAPI\n",
|
||||
"from pydantic import BaseModel, Field\n",
|
||||
"from scipy.optimize import minimize"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# load your trained model - make sure it's saved under the right name!\n",
|
||||
"model = joblib.load(\"strength_model.pkl\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# configure the host and port where the server will run\n",
|
||||
"# host depends on your setup -> \"127.0.0.1\" if running locally, \"0.0.0.0\" inside a docker container\n",
|
||||
"host = \"127.0.0.1\"\n",
|
||||
"port = 8000"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# helper function to compute the optimized water content\n",
|
||||
"def optimize_water(model, x, value_min=120., value_max=250., target_strength=42.5):\n",
|
||||
" \"\"\"\n",
|
||||
" Optimize the water content for a concrete mixture.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - model: the trained model\n",
|
||||
" - x: pandas dataframe row with one data point\n",
|
||||
" - value_min: minimum bound for water content (default: 120.)\n",
|
||||
" - value_max: maximum bound for water content (default: 250.)\n",
|
||||
" - target_strength: what we would like the output to be (default: 42.5)\n",
|
||||
" Returns:\n",
|
||||
" - water_org: original water content\n",
|
||||
" - water_new: optimized water content\n",
|
||||
" - pred_org: original strength prediction of the model\n",
|
||||
" - pred_new: strength prediction with optimized water content\n",
|
||||
" \"\"\"\n",
|
||||
" # original situation\n",
|
||||
" water_org = x[\"water\"].values[0]\n",
|
||||
" pred_org = model.predict(x)[0]\n",
|
||||
" print(f\"original prediction with water content {water_org:.1f}: {pred_org:.2f} MPa\")\n",
|
||||
" \n",
|
||||
" def _loss_fun(water_value):\n",
|
||||
" \"\"\"\n",
|
||||
" Nested function (i.e., has access to all variables from the enclosing function)\n",
|
||||
" to compute the squared error between the model's strength prediction with the given \n",
|
||||
" water value and our target strength value.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" - water_value: np.array with a single value, the proposed water content\n",
|
||||
" Returns:\n",
|
||||
" - loss: the squared error between the predicted and target strength\n",
|
||||
" \"\"\"\n",
|
||||
" # insert the new value into our original data point\n",
|
||||
" new_x = x.copy()\n",
|
||||
" new_x[\"water\"] = water_value[0]\n",
|
||||
" # predict strength with new water content\n",
|
||||
" pred_strength = model.predict(new_x)\n",
|
||||
" # optimization loss = squared difference to target value\n",
|
||||
" loss = (target_strength - pred_strength)**2\n",
|
||||
" return loss\n",
|
||||
" \n",
|
||||
" # use scipy's minimize function to find a value for 'water'\n",
|
||||
" # where the model predicts something close to our target value.\n",
|
||||
" # the start value for the optimization is the original water content.\n",
|
||||
" # to get realistic values, we additionally specify bounds\n",
|
||||
" # based on the actual min/max values for the water content\n",
|
||||
" res = minimize(_loss_fun, np.array([water_org]), bounds=[(value_min, value_max)], method=\"Powell\")\n",
|
||||
" # the optimized water content is stored in res.x (again a np.array)\n",
|
||||
" water_new = res.x[0]\n",
|
||||
" # check the final strength prediction\n",
|
||||
" new_x = x.copy()\n",
|
||||
" new_x[\"water\"] = water_new\n",
|
||||
" pred_new = model.predict(new_x)[0]\n",
|
||||
" print(f\"new prediction with water content {water_new:.1f}: {pred_new:.2f} MPa\")\n",
|
||||
" return water_org, water_new, pred_org, pred_new"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# FastAPI app instance\n",
|
||||
"app = FastAPI()\n",
|
||||
"\n",
|
||||
"# pydantic model used for data validation\n",
|
||||
"class ConcreteRecipe(BaseModel):\n",
|
||||
" cement: float = Field(ge=50., le=650., description=\"Cement amount in kg/m^3\")\n",
|
||||
" slag: float = Field(ge=0., le=500., description=\"Slag amount in kg/m^3\")\n",
|
||||
" fly_ash: float = Field(ge=0., le=300., description=\"Fly ash amount in kg/m^3\")\n",
|
||||
" fine_aggregate: float = Field(ge=400., le=1100., description=\"Fine aggregate amount in kg/m^3\")\n",
|
||||
" coarse_aggregate: float = Field(ge=500., le=1500., description=\"Coarse aggregate amount in kg/m^3\")\n",
|
||||
" plasticizer: float = Field(ge=0., le=50., description=\"Plasticizer amount in kg/m^3\")\n",
|
||||
" water: float = Field(ge=50., le=500., description=\"Water amount in kg/m^3\")\n",
|
||||
" target_strength: float = Field(default=42.5, ge=0., le=100., description=\"Desired compressive strength in MPa\")\n",
|
||||
"\n",
|
||||
"# endpoint for the base URI, which can be queried with a GET request\n",
|
||||
"@app.get(\"/\")\n",
|
||||
"def home():\n",
|
||||
" return \"Congratulations! Your API is working.\"\n",
|
||||
"\n",
|
||||
"# endpoint for /predict, which can be queried with a POST request to send the concrete recipe data\n",
|
||||
"@app.post(\"/predict\")\n",
|
||||
"def predict_and_optimize(concrete_recipe: ConcreteRecipe):\n",
|
||||
" target_strength = concrete_recipe.target_strength\n",
|
||||
" # transform the given data into a pandas dataframe\n",
|
||||
" concrete_recipe = concrete_recipe.dict()\n",
|
||||
" features = [\"cement\", \"slag\", \"fly_ash\", \"water\", \"plasticizer\", \"coarse_aggregate\", \"fine_aggregate\"]\n",
|
||||
" x = pd.DataFrame({c: [concrete_recipe[c]] for c in features}, columns=features)\n",
|
||||
" # make predictions and get optimized water content\n",
|
||||
" water_org, water_new, pred_org, pred_new = optimize_water(model, x, target_strength=target_strength)\n",
|
||||
" # return the computed values as a JSON\n",
|
||||
" return {\n",
|
||||
" \"water_org\": water_org, \n",
|
||||
" \"water_new\": water_new, \n",
|
||||
" \"pred_org\": pred_org, \n",
|
||||
" \"pred_new\": pred_new,\n",
|
||||
" }"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"By running the following cell you will spin up the server!\n",
|
||||
"\n",
|
||||
"This causes the notebook to block (no cells/code can run) until you manually interrupt the kernel. You can do this by clicking on the Kernel tab and then on Interrupt or by entering Jupyter's command mode by pressing the `ESC` key and tapping the `I` key twice."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# allow the server to be run in this interactive environment\n",
|
||||
"nest_asyncio.apply()\n",
|
||||
"# spin up the server \n",
|
||||
"uvicorn.run(app, host=host, port=port)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
Reference in New Issue
Block a user