Refactor code for improved readability and consistency across multiple Jupyter notebooks

- Added trailing commas to multi-line print statements and function calls for consistent formatting (see the sketch after this list).
- Reformatted code to enhance clarity, breaking long call lines so each argument and parameter sits on its own line.
- Updated function signatures to use float type for sigma parameters instead of int for better precision.
- Cleaned up comments and documentation strings for clarity and consistency.
- Ensured consistent formatting in plotting functions and data handling.
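Concretely, the reformatting expands single-line argument lists into one argument per line with a trailing comma, as the np.zeros hunks below show; a minimal before/after sketch:

    # before
    rewards = np.zeros(T, dtype=float)

    # after
    rewards = np.zeros(
        T,
        dtype=float,
    )

The sigma change from the third bullet is of the form sigma: int = 1 becoming sigma: float = 1.0 (a hypothetical signature shown only for illustration; the affected hunks are not part of this excerpt).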
2025-12-13 23:38:17 +01:00
parent f89ff4a016
commit d5a6bfd339
50 changed files with 779 additions and 449 deletions

View File

@@ -299,14 +299,15 @@
"\n",
" # With probability 1ε: exploit (choose an arm with the highest estimated value).\n",
" max_val = np.max(\n",
" Q\n",
" Q,\n",
" ) # Compute the maximum value of the array Q and store it in the variable max_val\n",
" candidates = np.isclose(\n",
" Q, max_val\n",
" Q,\n",
" max_val,\n",
" ) # (see Hint) Find all positions in Q where the value equals max_val.\n",
"\n",
" return np.random.choice(\n",
" candidates\n",
" candidates,\n",
" ) # pick one of those best arms uniformly at random."
]
},
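Note on the exploit step above: np.isclose(Q, max_val) returns a boolean mask rather than index positions, so picking an arm index uniformly among the tied best arms typically goes through np.flatnonzero first. A minimal sketch of that pattern (the example Q values and the rng seed are illustrative, not taken from the notebook):

    import numpy as np

    Q = np.array([0.2, 0.8, 0.8, 0.5])             # example estimated values
    best = np.flatnonzero(np.isclose(Q, Q.max()))  # indices of all tied best arms
    arm = np.random.default_rng(0).choice(best)    # pick one best arm uniformly at random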
@@ -614,13 +615,16 @@
"\n",
" # record what happens at each time step\n",
" rewards = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # rewards[t] = observed reward at step t (0 or 1),\n",
" chose_opt = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # chose_opt[t] = 1 if the chosen arm equals opt_arm, else 0,\n",
" regret = np.zeros(\n",
" T, dtype=float\n",
" T,\n",
" dtype=float,\n",
" ) # regret[t]=p^*-R_t, which means how much we “missed” compared to the best arm\n",
"\n",
" # -------------------------------------------------\n",
@@ -714,7 +718,11 @@
"outputs": [],
"source": [
"def run_many(\n",
" runs: int, T: int, epsilon: float, k: int, update_style: str = \"incremental\"\n",
" runs: int,\n",
" T: int,\n",
" epsilon: float,\n",
" k: int,\n",
" update_style: str = \"incremental\",\n",
"):\n",
" \"\"\"Run multiple independent experiments of T time steps using ε-greedy on a k-armed Bernoulli bandit.\n",
"\n",
@@ -844,7 +852,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_reward\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_reward\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average reward\")\n",
@@ -856,7 +866,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_opt\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_opt\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"P(select optimal arm)\")\n",
@@ -881,7 +893,9 @@
"plt.figure(figsize=(10, 6))\n",
"for eps in eps_list:\n",
" plt.plot(\n",
" results[eps][\"avg_cumreg\"], label=f\"ε={eps}\", color=colors[eps_list.index(eps)]\n",
" results[eps][\"avg_cumreg\"],\n",
" label=f\"ε={eps}\",\n",
" color=colors[eps_list.index(eps)],\n",
" )\n",
"plt.xlabel(\"Step\")\n",
"plt.ylabel(\"Average cumulative regret\")\n",
@@ -929,7 +943,7 @@
"# Calculate final performance metrics for each epsilon\n",
"print(\"### Performance Summary for Different ε Values\\n\")\n",
"print(\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\"\n",
" f\"{'ε':<6} {'Final Avg Reward':<18} {'Final Opt %':<15} {'Final Cum Reward':<18} {'Final Cum Regret':<18}\",\n",
")\n",
"print(\"-\" * 80)\n",
"\n",
@@ -940,7 +954,7 @@
" final_cum_regret = results[eps][\"avg_cumreg\"][-1]\n",
"\n",
" print(\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\"\n",
" f\"{eps:<6.2f} {final_avg_reward:<18.4f} {final_opt_prob:<15.2f} {final_cum_reward:<18.2f} {final_cum_regret:<18.2f}\",\n",
" )\n",
"\n",
"# Find the best epsilon based on multiple criteria\n",
@@ -1029,10 +1043,10 @@
"t_incr = time_runner(\"incremental\")\n",
"\n",
"print(\n",
" f\"Naive sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_naive:.3f} s\"\n",
" f\"Naive sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_naive:.3f} s\",\n",
")\n",
"print(\n",
" f\"Incremental sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_incr:.3f} s\"\n",
" f\"Incremental sample-mean total time over {RUNS_num} runs × {T_time} steps: {t_incr:.3f} s\",\n",
")\n",
"print(f\"Speedup (naive / incremental): {t_naive / t_incr:.2f}×\")"
]
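For context on the two update styles being timed above, a minimal sketch of the incremental vs. naive sample-mean updates (the names Q, N, history, and the helper functions are illustrative, not the notebook's own):

    import numpy as np

    k = 10                            # number of arms
    Q = np.zeros(k)                   # running value estimates
    N = np.zeros(k, dtype=int)        # pull counts per arm
    history = [[] for _ in range(k)]  # only needed for the naive variant

    def update_incremental(a: int, reward: float) -> None:
        # O(1) per step: nudge the old estimate toward the new reward
        N[a] += 1
        Q[a] += (reward - Q[a]) / N[a]

    def update_naive(a: int, reward: float) -> None:
        # O(n) per step: store every reward and recompute the full mean
        history[a].append(reward)
        Q[a] = np.mean(history[a])

Both variants yield the same sample mean; the incremental form avoids storing and re-scanning the reward history, which is where the reported speedup comes from.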

View File

@@ -40,11 +40,12 @@
"import numpy as np\n",
"\n",
"np.set_printoptions(\n",
" precision=3, suppress=True\n",
" precision=3,\n",
" suppress=True,\n",
") # (not mandatory) This line is for limiting floats to 3 decimal places, avoiding scientific notation (like 1.23e-04) for small numbers.\n",
"\n",
"# For reproducibility\n",
"rng = np.random.default_rng(seed=42) # This line creates a random number generator.\n"
"rng = np.random.default_rng(seed=42) # This line creates a random number generator."
]
},
{
@@ -110,7 +111,7 @@
" \"#..#..G\",\n",
" \"#..X..#\",\n",
" \"#######\",\n",
"]\n"
"]"
]
},
{
@@ -142,7 +143,7 @@
"n_rows = len(maze_str)\n",
"print(n_rows)\n",
"n_cols = len(maze_str[0])\n",
"print(n_cols)\n"
"print(n_cols)"
]
},
{
@@ -169,7 +170,7 @@
"source": [
"print(\"Maze:\")\n",
"for row in maze_str:\n",
" print(row)\n"
" print(row)"
]
},
{
@@ -207,7 +208,7 @@
" \"S\",\n",
" \"G\",\n",
" \"X\",\n",
"} # The vector Free represents cells that the agent is allowed to move into.\n"
"} # The vector Free represents cells that the agent is allowed to move into."
]
},
{
@@ -277,7 +278,7 @@
"print(\"Number of states (non-wall cells):\", n_states)\n",
"print(\"Start state:\", start_state, \"at\", state_to_pos[start_state])\n",
"print(\"Goal states:\", goal_states, \"at\", state_to_pos[goal_states[0]])\n",
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])\n"
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])"
]
},
{
@@ -304,7 +305,7 @@
],
"source": [
"my_dict = {\"key1\": \"value1\", \"key2\": \"value2\"}\n",
"print(my_dict[\"key2\"])\n"
"print(my_dict[\"key2\"])"
]
},
{
@@ -383,7 +384,7 @@
"def plot_maze_with_states():\n",
" \"\"\"Plot the maze with state indices.\"\"\"\n",
" grid = np.ones(\n",
" (n_rows, n_cols)\n",
" (n_rows, n_cols),\n",
" ) # Start with a matrix of ones. Here 1 means “free cell”\n",
" for i in range(n_rows):\n",
" for j in range(n_cols):\n",
@@ -571,7 +572,7 @@
" # If the next cell is a wall, the robot stays in place.\n",
" return i, j\n",
"\n",
" return candidate_i, candidate_j # Otherwise, return the new position\n"
" return candidate_i, candidate_j # Otherwise, return the new position"
]
},
{
@@ -601,7 +602,7 @@
"outputs": [],
"source": [
"gamma = 0.95\n",
"p_error = 0.1 # probability of the error to a random other direction\n"
"p_error = 0.1 # probability of the error to a random other direction"
]
},
{
@@ -662,7 +663,7 @@
"# Set rewards for each state\n",
"step_penalty = -0.01\n",
"goal_reward = 1.0\n",
"trap_reward = -1.0\n"
"trap_reward = -1.0"
]
},
{
@@ -710,7 +711,7 @@
" elif s in trap_states:\n",
" R[s] = trap_reward\n",
" else:\n",
" R[s] = step_penalty\n"
" R[s] = step_penalty"
]
},
{
@@ -735,7 +736,7 @@
"\n",
"def is_terminal(s: int) -> bool:\n",
" \"\"\"Check if a state is terminal (goal or trap).\"\"\"\n",
" return s in terminal_states\n"
" return s in terminal_states"
]
},
{
@@ -797,9 +798,9 @@
" error_i, error_j = move_deterministic(i, j, a2)\n",
" s_error = pos_to_state[(error_i, error_j)] # get its state index s_error\n",
" P[a, s, s_error] += p_error / len(\n",
" other_actions\n",
" other_actions,\n",
" ) # add p_error / 3 to P[a, s, s_error]\n",
"# So for each (s,a), probabilities over all s_next sum to 1.\n"
"# So for each (s,a), probabilities over all s_next sum to 1."
]
},
{
@@ -843,7 +844,7 @@
" # If everything is correct, they should be very close to 1.\n",
"\n",
" probs = P[a].sum(axis=1)\n",
" print(f\"Action {action_names[a]}:\", probs)\n"
" print(f\"Action {action_names[a]}:\", probs)"
]
},
{
@@ -995,7 +996,7 @@
"\n",
" for _it in range(max_iter): # Main iterative loop\n",
" V_new = np.zeros_like(\n",
" V\n",
" V,\n",
" ) # Create a new value vector and we will compute an updated value for each state.\n",
"\n",
" # Now we update each state using the Bellman expectation equation\n",
@@ -1004,7 +1005,7 @@
" V_new[s] = R[s] + gamma * np.sum(P[a, s, :] * V)\n",
"\n",
" delta = np.max(\n",
" np.abs(V_new - V)\n",
" np.abs(V_new - V),\n",
" ) # This measures how much the value function changed in this iteration:\n",
" # If delta is small, the values start to converge; otherwise, we need to keep iterating.\n",
" V = V_new # Update V, i.e. Set the new values for the next iteration.\n",
@@ -1012,7 +1013,7 @@
" if delta < theta: # Check convergence: When changes are tiny, we stop.\n",
" break\n",
"\n",
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set.\n"
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set."
]
},
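For reference, the policy_evaluation hunks above reformat the Bellman expectation backup: with a the action the policy assigns to state s, the inner update

    V_new[s] = R[s] + gamma * np.sum(P[a, s, :] * V)

is the array form of V_{k+1}(s) = R(s) + gamma * sum over s' of P(s' | s, a) * V_k(s'), and the loop stops once delta = max |V_new - V| falls below theta.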
{
@@ -1082,7 +1083,7 @@
"source": [
"V_random = policy_evaluation(policy=random_policy, P=P, R=R, gamma=gamma)\n",
"print(\"Value function under random policy:\")\n",
"print(V_random)\n"
"print(V_random)"
]
},
{
@@ -1127,7 +1128,8 @@
"def plot_values(V: np.ndarray, title=\"Value function\") -> None:\n",
" \"\"\"Plot the value function V on the maze as a heatmap.\"\"\"\n",
" grid_values = np.full(\n",
" (n_rows, n_cols), np.nan\n",
" (n_rows, n_cols),\n",
" np.nan,\n",
" ) # Initializes a grid the same size as the maze. Every cell starts as NaN.\n",
" for (\n",
" s,\n",
@@ -1152,7 +1154,13 @@
"\n",
" for s, (i, j) in state_to_pos.items():\n",
" ax.text(\n",
" j, i, f\"{V[s]:.2f}\", ha=\"center\", va=\"center\", color=\"white\", fontsize=9\n",
" j,\n",
" i,\n",
" f\"{V[s]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"center\",\n",
" color=\"white\",\n",
" fontsize=9,\n",
" )\n",
"\n",
" # Remove axis ticks and set title\n",
@@ -1162,7 +1170,7 @@
" plt.show()\n",
"\n",
"\n",
"plot_values(V_random, title=\"Value function: random policy\")\n"
"plot_values(V_random, title=\"Value function: random policy\")"
]
},
{
@@ -1247,7 +1255,7 @@
" ax.set_yticklabels([])\n",
" ax.grid(True)\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -1276,7 +1284,7 @@
}
],
"source": [
"plot_policy(policy=random_policy, title=\"Policy\")\n"
"plot_policy(policy=random_policy, title=\"Policy\")"
]
},
{

View File

@@ -23,11 +23,12 @@
"import numpy as np\n",
"\n",
"np.set_printoptions(\n",
" precision=3, suppress=True\n",
" precision=3,\n",
" suppress=True,\n",
") # (not mandatory) This line is for limiting floats to 3 decimal places, avoiding scientific notation (like 1.23e-04) for small numbers.\n",
"\n",
"# For reproducibility\n",
"rng = np.random.default_rng(seed=42) # This line creates a random number generator.\n"
"rng = np.random.default_rng(seed=42) # This line creates a random number generator."
]
},
{
@@ -102,7 +103,7 @@
" \"S\",\n",
" \"G\",\n",
" \"X\",\n",
"} # The vector Free represents cells that the agent is allowed to move into.\n"
"} # The vector Free represents cells that the agent is allowed to move into."
]
},
{
@@ -164,7 +165,7 @@
"print(\"Number of states (non-wall cells):\", n_states)\n",
"print(\"Start state:\", start_state, \"at\", state_to_pos[start_state])\n",
"print(\"Goal states:\", goal_states, \"at\", state_to_pos[goal_states[0]])\n",
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])\n"
"print(\"Trap states:\", trap_states, \"at\", state_to_pos[trap_states[0]])"
]
},
{
@@ -188,7 +189,7 @@
"def plot_maze_with_states():\n",
" \"\"\"Plot the maze with state indices.\"\"\"\n",
" grid = np.ones(\n",
" (n_rows, n_cols)\n",
" (n_rows, n_cols),\n",
" ) # Start with a matrix of ones. Here 1 means “free cell”\n",
" for i in range(n_rows):\n",
" for j in range(n_cols):\n",
@@ -316,7 +317,7 @@
" # If the next cell is a wall, the robot stays in place.\n",
" return i, j\n",
"\n",
" return candidate_i, candidate_j # Otherwise, return the new position\n"
" return candidate_i, candidate_j # Otherwise, return the new position"
]
},
{
@@ -335,7 +336,7 @@
"outputs": [],
"source": [
"gamma = 0.95\n",
"p_error = 0.1 # probability of the error to a random other direction\n"
"p_error = 0.1 # probability of the error to a random other direction"
]
},
{
@@ -360,7 +361,7 @@
"# Set rewards for each state\n",
"step_penalty = -0.01\n",
"goal_reward = 1.0\n",
"trap_reward = -1.0\n"
"trap_reward = -1.0"
]
},
{
@@ -376,7 +377,7 @@
" elif s in trap_states:\n",
" R[s] = trap_reward\n",
" else:\n",
" R[s] = step_penalty\n"
" R[s] = step_penalty"
]
},
{
@@ -391,7 +392,7 @@
"\n",
"def is_terminal(s: int) -> bool:\n",
" \"\"\"Check if a state is terminal (goal or trap).\"\"\"\n",
" return s in terminal_states\n"
" return s in terminal_states"
]
},
{
@@ -437,9 +438,9 @@
" error_i, error_j = move_deterministic(i, j, a2)\n",
" s_error = pos_to_state[(error_i, error_j)] # get its state index s_error\n",
" P[a, s, s_error] += p_error / len(\n",
" other_actions\n",
" other_actions,\n",
" ) # add p_error / 3 to P[a, s, s_error]\n",
"# So for each (s,a), probabilities over all s_next sum to 1.\n"
"# So for each (s,a), probabilities over all s_next sum to 1."
]
},
{
@@ -476,7 +477,7 @@
" # If everything is correct, they should be very close to 1.\n",
"\n",
" probs = P[a].sum(axis=1)\n",
" print(f\"Action {action_names[a]}:\", probs)\n"
" print(f\"Action {action_names[a]}:\", probs)"
]
},
{
@@ -520,7 +521,7 @@
"\n",
" for _it in range(max_iter): # Main iterative loop\n",
" V_new = np.zeros_like(\n",
" V\n",
" V,\n",
" ) # Create a new value vector and we will compute an updated value for each state.\n",
"\n",
" # Now we update each state using the Bellman expectation equation\n",
@@ -529,7 +530,7 @@
" V_new[s] = R[s] + gamma * np.sum(P[a, s, :] * V)\n",
"\n",
" delta = np.max(\n",
" np.abs(V_new - V)\n",
" np.abs(V_new - V),\n",
" ) # This measures how much the value function changed in this iteration:\n",
" # If delta is small, the values start to converge; otherwise, we need to keep iterating.\n",
" V = V_new # Update V, i.e. Set the new values for the next iteration.\n",
@@ -537,7 +538,7 @@
" if delta < theta: # Check convergence: When changes are tiny, we stop.\n",
" break\n",
"\n",
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set.\n"
" return V # Return the final value function, this is our estimate for V^{pi}(s), s in the state set."
]
},
{
@@ -550,7 +551,8 @@
"def plot_values(V: np.ndarray, title=\"Value function\") -> None:\n",
" \"\"\"Plot the value function V on the maze as a heatmap.\"\"\"\n",
" grid_values = np.full(\n",
" (n_rows, n_cols), np.nan\n",
" (n_rows, n_cols),\n",
" np.nan,\n",
" ) # Initializes a grid the same size as the maze. Every cell starts as NaN.\n",
" for (\n",
" s,\n",
@@ -575,14 +577,20 @@
"\n",
" for s, (i, j) in state_to_pos.items():\n",
" ax.text(\n",
" j, i, f\"{V[s]:.2f}\", ha=\"center\", va=\"center\", color=\"white\", fontsize=9\n",
" j,\n",
" i,\n",
" f\"{V[s]:.2f}\",\n",
" ha=\"center\",\n",
" va=\"center\",\n",
" color=\"white\",\n",
" fontsize=9,\n",
" )\n",
"\n",
" # Remove axis ticks and set title\n",
" ax.set_xticks([])\n",
" ax.set_yticks([])\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -659,7 +667,7 @@
" ax.set_yticklabels([])\n",
" ax.grid(True)\n",
" ax.set_title(title)\n",
" plt.show()\n"
" plt.show()"
]
},
{
@@ -716,7 +724,7 @@
"source": [
"V_random = policy_evaluation(policy=random_policy, P=P, R=R, gamma=gamma)\n",
"print(\"Value function under random policy:\")\n",
"print(V_random)\n"
"print(V_random)"
]
},
{
@@ -748,7 +756,7 @@
],
"source": [
"plot_values(V_random, title=\"Value function: random policy\")\n",
"plot_policy(policy=random_policy, title=\"Random Policy\")\n"
"plot_policy(policy=random_policy, title=\"Random Policy\")"
]
},
{
@@ -847,7 +855,7 @@
"V_my_policy = policy_evaluation(policy=my_policy, P=P, R=R, gamma=gamma)\n",
"\n",
"plot_values(V=V_my_policy, title=\"Value function: my policy\")\n",
"plot_policy(policy=my_policy, title=\"My policy\")\n"
"plot_policy(policy=my_policy, title=\"My policy\")"
]
},
{