Refactor and enhance code in the Reinforcement Learning notebook; add a new R script for the EM algorithm in Unsupervised Learning; update the README with a new Unsupervised Learning section.

2025-11-26 13:20:18 +01:00
parent 5d968fa5e5
commit 08cf8fbeda
8 changed files with 1480 additions and 212 deletions

@@ -97,7 +97,8 @@
"metadata": {},
"outputs": [],
"source": [
"MIN_ARMS = 2 # Minimum number of arms\n",
"MIN_ARMS = 2 # Minimum number of arms\n",
"\n",
"\n",
"class BernoulliBanditK:\n",
" \"\"\"K-armed Bernoulli bandit environment.\n",
@@ -297,10 +298,16 @@
" return np.random.randint(len(Q))\n",
"\n",
" # With probability 1ε: exploit (choose an arm with the highest estimated value).\n",
" max_val = np.max(Q) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(Q, max_val) # (see Hint) Find all positions in Q where the value equals max_val.\n",
" max_val = np.max(\n",
" Q\n",
" ) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(\n",
" Q, max_val\n",
" ) # (see Hint) Find all positions in Q where the value equals max_val.\n",
"\n",
" return np.random.choice(candidates) # pick one of those best arms uniformly at random."
" return np.random.choice(\n",
" candidates\n",
" ) # pick one of those best arms uniformly at random."
]
},
{
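One thing worth noting in the tie-breaking step above: `np.isclose(Q, max_val)` yields a boolean mask, so `np.random.choice(candidates)` samples a True/False value rather than an arm index. A small corrected sketch (the function name and signature here are illustrative, not the notebook's):

```python
import numpy as np


def epsilon_greedy_action(Q: np.ndarray, eps: float, rng=None) -> int:
    """Explore with probability eps, otherwise exploit with uniform tie-breaking."""
    rng = rng if rng is not None else np.random.default_rng()
    if rng.random() < eps:
        return int(rng.integers(len(Q)))  # explore: any arm uniformly at random
    best = np.flatnonzero(np.isclose(Q, Q.max()))  # indices of all near-maximal arms
    return int(rng.choice(best))  # exploit: break ties uniformly at random
```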
@@ -593,7 +600,9 @@
" env = BernoulliBanditK(k=k) # Create a new bandit environment with k arms.\n",
"\n",
" # For evaluation only (not used by the agent),\n",
" opt_arm = env.optimal_arm() # the index of the truly best arm which has the largest p_i\n",
" opt_arm = (\n",
" env.optimal_arm()\n",
" ) # the index of the truly best arm which has the largest p_i\n",
" opt_mean = env.optimal_mean() # the best true success probability p_i\n",
"\n",
" if hasattr(opt_mean, \"item\"):\n",
@@ -604,9 +613,15 @@
" Q = np.zeros(k, dtype=float)\n",
"\n",
" # record what happens at each time step\n",
" rewards = np.zeros(T, dtype=float) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(T, dtype=float) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(T, dtype=float) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
" rewards = np.zeros(\n",
" T, dtype=float\n",
" ) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(\n",
" T, dtype=float\n",
" ) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(\n",
" T, dtype=float\n",
" ) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
"\n",
" # -------------------------------------------------\n",
" # For the naive method,\n",
@@ -720,7 +735,7 @@
" avg_optimal = np.mean([res[\"optimal_selected\"] for res in results], axis=0)\n",
" avg_instant_regret = np.mean([res[\"regret\"] for res in results], axis=0)\n",
"\n",
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)\n"
" return avg_rewards, avg_optimal, np.cumsum(avg_instant_regret)"
]
},
{
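For reference, the curve returned by `np.cumsum(avg_instant_regret)` is the running sum of the per-step regret averaged over runs:

$$
\mathrm{CumRegret}(T) = \sum_{t=1}^{T} \left( p^{*} - \bar{R}_{t} \right),
\qquad
\bar{R}_{t} = \frac{1}{N_{\mathrm{runs}}} \sum_{i=1}^{N_{\mathrm{runs}}} R^{(i)}_{t},
$$

where $R^{(i)}_{t}$ is the reward observed at step $t$ of run $i$ and $p^{*}$ is the best arm's success probability.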
@@ -828,7 +843,9 @@
"\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average reward\")\n",
"plt.title(\"Average reward vs step (Bernoulli bandit, ε-greedy)\")\n",
@@ -838,7 +855,9 @@
"# Plot: Probability of optimal action\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"P(select optimal arm)\")\n",
"plt.title(\"Optimal-action probability vs step (ε-greedy)\")\n",
@@ -861,12 +880,14 @@
"# Plot: Cumulative regret\n",
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)])\n",
" plt.plot(\n",
" results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average cumulative regret\")\n",
"plt.title(\"Cumulative regret vs step (ε-greedy)\")\n",
"plt.legend()\n",
"plt.show()\n"
"plt.show()"
]
},
{
@@ -907,7 +928,9 @@
"source": [
"# Calculate final performance metrics for each epsilon\n",
"print(\"### Performance Summary for Different ε Values\\n\")\n",
"print(f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\")\n",
"print(\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\"\n",
")\n",
"print(\"-\" * 80)\n",
"\n",
"for eps in eps_list:\n",
@@ -916,7 +939,9 @@
" final_cum_reward = np.cumsum(results[eps][\"avg_reward\"])[-1]\n",
" final_cum_regret = results[eps][\"avg_cumreg\"][-1]\n",
"\n",
" print(f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\")\n",
" print(\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\"\n",
" )\n",
"\n",
"# Find the best epsilon based on multiple criteria\n",
"best_eps_reward = max(eps_list, key=lambda e: results[e][\"avg_reward\"][-1])\n",