Mirror of https://github.com/ArthurDanjou/ArtStudies.git (synced 2026-01-28 14:56:01 +01:00)
Refactor and enhance code in Reinforcement Learning notebook; add new R script for EM algorithm in Unsupervised Learning; update README to include new section for Unsupervised Learning.
@@ -97,7 +97,8 @@
"metadata": {},
"outputs": [],
"source": [
"MIN_ARMS = 2 # Minimum number of arms\n",
"MIN_ARMS = 2 # Minimum number of arms\n",
"\n",
"\n",
"class BernoulliBanditK:\n",
" \"\"\"K-armed Bernoulli bandit environment.\n",
@@ -297,10 +298,16 @@
" return np.random.randint(len(Q))\n",
"\n",
" # With probability 1−ε: exploit (choose an arm with the highest estimated value).\n",
" max_val = np.max(Q) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(Q, max_val) # (see Hint) Find all positions in Q where the value equals max_val.\n",
" max_val = np.max(\n",
" Q\n",
" ) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(\n",
" Q, max_val\n",
" ) # (see Hint) Find all positions in Q where the value equals max_val.\n",
"\n",
" return np.random.choice(candidates) # pick one of those best arms uniformly at random."
" return np.random.choice(\n",
" candidates\n",
" ) # pick one of those best arms uniformly at random."
"\n",
"]
"},
{
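As a reference point, a self-contained sketch of the ε-greedy rule this hunk reformats. Note that np.isclose returns a boolean mask, so the tied best arms are converted to integer indices with np.flatnonzero before sampling; passing the mask itself to np.random.choice would sample True/False values rather than arm positions. The function name and the eps argument are illustrative, not taken from the notebook.

import numpy as np

def epsilon_greedy(Q, eps):
    """Pick an arm from value estimates Q using ε-greedy action selection."""
    # With probability ε: explore, i.e. pick any arm uniformly at random.
    if np.random.rand() < eps:
        return np.random.randint(len(Q))
    # With probability 1−ε: exploit. np.isclose flags every arm whose estimate
    # ties with the maximum; np.flatnonzero turns that mask into arm indices.
    max_val = np.max(Q)
    candidates = np.flatnonzero(np.isclose(Q, max_val))
    return int(np.random.choice(candidates))  # break ties uniformly at random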
@@ -593,7 +600,9 @@
" env = BernoulliBanditK(k=k) # Create a new bandit environment with k arms.\n",
"\n",
" # For evaluation only (not used by the agent),\n",
" opt_arm = env.optimal_arm() # the index of the truly best arm which has the largest p_i\n",
" opt_arm = (\n",
" env.optimal_arm()\n",
" ) # the index of the truly best arm which has the largest p_i\n",
" opt_mean = env.optimal_mean() # the best true success probability p_i\n",
"\n",
" if hasattr(opt_mean, \"item\"):\n",
@@ -604,9 +613,15 @@
" Q = np.zeros(k, dtype=float)\n",
"\n",
" # record what happens at each time step\n",
" rewards = np.zeros(T, dtype=float) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(T, dtype=float) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(T, dtype=float) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
" rewards = np.zeros(\n",
" T, dtype=float\n",
" ) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(\n",
" T, dtype=float\n",
" ) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(\n",
" T, dtype=float\n",
" ) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
"\n",
" # -------------------------------------------------\n",
" # For the naive method,\n",
@@ -720,7 +735,7 @@
" avg_optimal = np.mean([res[\"optimal_selected\"] for res in results], axis=0)\n",
" avg_instant_regret = np.mean([res[\"regret\"] for res in results], axis=0)\n",
"\n",
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)\n"
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)"
"]
"},
{
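The change in this hunk only strips a trailing newline from the last source string; for orientation, a hedged sketch of the averaging step it belongs to: repeat independent runs, average each per-step statistic over runs, and cumulate the averaged instantaneous regret into a cumulative-regret curve. The n_runs name and the use of run_bandit from the sketch above are assumptions.

def average_over_runs(n_runs=200, k=10, T=1000, eps=0.1):
    results = [run_bandit(k=k, T=T, eps=eps) for _ in range(n_runs)]
    avg_rewards = np.mean([r[0] for r in results], axis=0)          # mean reward at each step
    avg_optimal = np.mean([r[1] for r in results], axis=0)          # P(select optimal arm) at each step
    avg_instant_regret = np.mean([r[2] for r in results], axis=0)
    return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)  # cumulative regret curve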
@@ -828,7 +843,9 @@
"\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average reward\")\n",
"plt.title(\"Average reward vs step (Bernoulli bandit, ε-greedy)\")\n",
@@ -838,7 +855,9 @@
"# Plot: Probability of optimal action\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"P(select optimal arm)\")\n",
"plt.title(\"Optimal-action probability vs step (ε-greedy)\")\n",
@@ -861,12 +880,14 @@
"# Plot: Cumulative regret\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average cumulative regret\")\n",
"plt.title(\"Cumulative regret vs step (ε-greedy)\")\n",
"plt.legend()\n",
"plt.show()\n"
"plt.show()"
"]
"},
{
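The three plotting hunks above differ only in which averaged series they draw; a compact, hedged sketch of the same pattern as a single loop over the three metrics, assuming the results dict (keyed by ε with "avg_reward", "avg_opt", "avg_cumreg"), eps_list, and colors defined earlier in the notebook.

import matplotlib.pyplot as plt

panels = [
    ("avg_reward", "Average reward", "Average reward vs step (Bernoulli bandit, ε-greedy)"),
    ("avg_opt", "P(select optimal arm)", "Optimal-action probability vs step (ε-greedy)"),
    ("avg_cumreg", "Average cumulative regret", "Cumulative regret vs step (ε-greedy)"),
]
for key, ylabel, title in panels:
    plt.figure(figsize=(10, 6))
    for eps in eps_list:
        plt.plot(results[eps][key], label=f"ε={eps}", color=colors[eps_list.index(eps)])
    plt.xlabel("Step")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.legend()
    plt.show()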
@@ -907,7 +928,9 @@
"source": [
"# Calculate final performance metrics for each epsilon\n",
"print(\"### Performance Summary for Different ε Values\\n\")\n",
"print(f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\")\n",
"print(\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\"\n",
")\n",
"print(\"-\" * 80)\n",
"\n",
"for eps in eps_list:\n",
@@ -916,7 +939,9 @@
" final_cum_reward = np.cumsum(results[eps][\"avg_reward\"])[-1]\n",
" final_cum_regret = results[eps][\"avg_cumreg\"][-1]\n",
"\n",
" print(f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\")\n",
" print(\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\"\n",
" )\n",
"\n",
"# Find the best epsilon based on multiple criteria\n",
"best_eps_reward = max(eps_list, key=lambda e: results[e][\"avg_reward\"][-1])\n",