Refactor and enhance code in the Reinforcement Learning notebook; add a new R script for the EM algorithm in Unsupervised Learning; update the README with a new Unsupervised Learning section.

2025-11-26 13:20:18 +01:00
parent 5d968fa5e5
commit 08cf8fbeda
8 changed files with 1480 additions and 212 deletions

@@ -97,7 +97,8 @@
"metadata": {},
"outputs": [],
"source": [
"MIN_ARMS = 2 # Minimum number of arms\n",
"MIN_ARMS = 2 # Minimum number of arms\n",
"\n",
"\n",
"class BernoulliBanditK:\n",
" \"\"\"K-armed Bernoulli bandit environment.\n",
@@ -297,10 +298,16 @@
" return np.random.randint(len(Q))\n",
"\n",
" # With probability 1ε: exploit (choose an arm with the highest estimated value).\n",
" max_val = np.max(Q) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(Q, max_val) # (see Hint) Find all positions in Q where the value equals max_val.\n",
" max_val = np.max(\n",
" Q\n",
" ) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(\n",
" Q, max_val\n",
" ) # (see Hint) Find all positions in Q where the value equals max_val.\n",
"\n",
" return np.random.choice(candidates) # pick one of those best arms uniformly at random."
" return np.random.choice(\n",
" candidates\n",
" ) # pick one of those best arms uniformly at random."
]
},
{
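One thing worth noting in the tie-breaking step above: `np.isclose(Q, max_val)` yields a boolean mask, so `np.random.choice(candidates)` samples a True/False value rather than an arm index. A small corrected sketch (the function name and signature here are illustrative, not the notebook's):

```python
import numpy as np


def epsilon_greedy_action(Q: np.ndarray, eps: float, rng=None) -> int:
    """Explore with probability eps, otherwise exploit with uniform tie-breaking."""
    rng = rng if rng is not None else np.random.default_rng()
    if rng.random() < eps:
        return int(rng.integers(len(Q)))  # explore: any arm uniformly at random
    best = np.flatnonzero(np.isclose(Q, Q.max()))  # indices of all near-maximal arms
    return int(rng.choice(best))  # exploit: break ties uniformly at random
```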
@@ -593,7 +600,9 @@
" env = BernoulliBanditK(k=k) # Create a new bandit environment with k arms.\n",
"\n",
" # For evaluation only (not used by the agent),\n",
" opt_arm = env.optimal_arm() # the index of the truly best arm which has the largest p_i\n",
" opt_arm = (\n",
" env.optimal_arm()\n",
" ) # the index of the truly best arm which has the largest p_i\n",
" opt_mean = env.optimal_mean() # the best true success probability p_i\n",
"\n",
" if hasattr(opt_mean, \"item\"):\n",
@@ -604,9 +613,15 @@
" Q = np.zeros(k, dtype=float)\n",
"\n",
" # record what happens at each time step\n",
" rewards = np.zeros(T, dtype=float) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(T, dtype=float) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(T, dtype=float) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
" rewards = np.zeros(\n",
" T, dtype=float\n",
" ) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(\n",
" T, dtype=float\n",
" ) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(\n",
" T, dtype=float\n",
" ) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
"\n",
" # -------------------------------------------------\n",
" # For the naive method,\n",
@@ -720,7 +735,7 @@
" avg_optimal = np.mean([res[\"optimal_selected\"] for res in results], axis=0)\n",
" avg_instant_regret = np.mean([res[\"regret\"] for res in results], axis=0)\n",
"\n",
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)\n"
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)"
]
},
{
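For reference, the curve returned by `np.cumsum(avg_instant_regret)` is the running sum of the per-step regret averaged over runs:

$$
\mathrm{CumRegret}(T) = \sum_{t=1}^{T} \left( p^{*} - \bar{R}_{t} \right),
\qquad
\bar{R}_{t} = \frac{1}{N_{\mathrm{runs}}} \sum_{i=1}^{N_{\mathrm{runs}}} R^{(i)}_{t},
$$

where $R^{(i)}_{t}$ is the reward observed at step $t$ of run $i$ and $p^{*}$ is the best arm's success probability.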
@@ -828,7 +843,9 @@
"\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average reward\")\n",
"plt.title(\"Average reward vs step (Bernoulli bandit, ε-greedy)\")\n",
@@ -838,7 +855,9 @@
"# Plot: Probability of optimal action\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"P(select optimal arm)\")\n",
"plt.title(\"Optimal-action probability vs step (ε-greedy)\")\n",
@@ -861,12 +880,14 @@
"# Plot: Cumulative regret\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average cumulative regret\")\n",
"plt.title(\"Cumulative regret vs step (ε-greedy)\")\n",
"plt.legend()\n",
"plt.show()\n"
"plt.show()"
]
},
{
@@ -907,7 +928,9 @@
"source": [
"# Calculate final performance metrics for each epsilon\n",
"print(\"### Performance Summary for Different ε Values\\n\")\n",
"print(f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\")\n",
"print(\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\"\n",
")\n",
"print(\"-\" * 80)\n",
"\n",
"for eps in eps_list:\n",
@@ -916,7 +939,9 @@
" final_cum_reward = np.cumsum(results[eps][\"avg_reward\"])[-1]\n",
" final_cum_regret = results[eps][\"avg_cumreg\"][-1]\n",
"\n",
" print(f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\")\n",
" print(\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\"\n",
" )\n",
"\n",
"# Find the best epsilon based on multiple criteria\n",
"best_eps_reward = max(eps_list, key=lambda e: results[e][\"avg_reward\"][-1])\n",