diff --git a/16_reinforcement_learning.ipynb b/16_reinforcement_learning.ipynb
index 67ce80d..13593f8 100644
--- a/16_reinforcement_learning.ipynb
+++ b/16_reinforcement_learning.ipynb
@@ -1886,15 +1886,342 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Coming soon..."
+    "## 1. to 7."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "See Appendix A."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 8. BipedalWalker-v2"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Exercise: _Use policy gradients to tackle OpenAI gym's \"BipedalWalker-v2\"._"
+   ]
+  },
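+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: this exercise reuses a few things defined near the top of this notebook: `numpy` (as `np`), `matplotlib.pyplot` (as `plt`), TensorFlow (as `tf`), and the `plot_animation()` helper. If you are running this section on its own, the next cell is a minimal sketch of that setup (skip it if you already ran the notebook from the start):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Minimal setup sketch: these imports and this helper are normally provided\n",
+    "# by the first cells of the notebook; run this cell only if you skipped them.\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import tensorflow as tf\n",
+    "from matplotlib import animation\n",
+    "\n",
+    "def update_scene(num, frames, patch):\n",
+    "    # Display frame number `num` from the recorded frames\n",
+    "    patch.set_data(frames[num])\n",
+    "    return patch,\n",
+    "\n",
+    "def plot_animation(frames, repeat=False, interval=40):\n",
+    "    # Turn a list of rendered RGB frames into a matplotlib animation\n",
+    "    plt.close()\n",
+    "    fig = plt.figure()\n",
+    "    patch = plt.imshow(frames[0])\n",
+    "    plt.axis(\"off\")\n",
+    "    return animation.FuncAnimation(fig, update_scene, fargs=(frames, patch),\n",
+    "                                   frames=len(frames), repeat=repeat, interval=interval)"
+   ]
+  },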
+  {
+   "cell_type": "code",
+   "execution_count": 76,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gym"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 77,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "env = gym.make(\"BipedalWalker-v2\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Note: if you run into [this issue](https://github.com/openai/gym/issues/100) (\"`module 'Box2D._Box2D' has no attribute 'RAND_LIMIT'`\") when making the `BipedalWalker-v2` environment, then try this workaround:\n",
+    "\n",
+    "```\n",
+    "$ pip uninstall Box2D-kengz\n",
+    "$ pip install git+https://github.com/pybox2d/pybox2d\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 78,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obs = env.reset()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "img = env.render(mode=\"rgb_array\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.imshow(img)\n",
+    "plt.axis(\"off\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 81,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can find the meaning of each of these 24 numbers in the [documentation](https://github.com/openai/gym/wiki/BipedalWalker-v2)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 82,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "env.action_space"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 83,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "env.action_space.low"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 84,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "env.action_space.high"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is a 4D continuous action space controlling each leg's hip torque and knee torque (from -1 to 1). To deal with a continuous action space, one method is to discretize it. For example, let's limit the possible torque values to these 3 values: -1.0, 0.0, and 1.0. This means that we are left with $3^4=81$ possible actions."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 85,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from itertools import product"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "possible_torques = np.array([-1.0, 0.0, 1.0])\n",
+    "possible_actions = np.array(list(product(possible_torques, possible_torques, possible_torques, possible_torques)))\n",
+    "possible_actions.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 87,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tf.reset_default_graph()\n",
+    "\n",
+    "# 1. Specify the network architecture\n",
+    "n_inputs = env.observation_space.shape[0] # == 24\n",
+    "n_hidden = 10\n",
+    "n_outputs = len(possible_actions) # == 81\n",
+    "initializer = tf.variance_scaling_initializer()\n",
+    "\n",
+    "# 2. Build the neural network\n",
+    "X = tf.placeholder(tf.float32, shape=[None, n_inputs])\n",
+    "\n",
+    "hidden = tf.layers.dense(X, n_hidden, activation=tf.nn.selu,\n",
+    "                         kernel_initializer=initializer)\n",
+    "logits = tf.layers.dense(hidden, n_outputs,\n",
+    "                         kernel_initializer=initializer)\n",
+    "outputs = tf.nn.softmax(logits)\n",
+    "\n",
+    "# 3. Select a random action based on the estimated probabilities\n",
+    "action_index = tf.squeeze(tf.multinomial(logits, num_samples=1), axis=-1)\n",
+    "\n",
+    "# 4. Training\n",
+    "learning_rate = 0.01\n",
+    "\n",
+    "y = tf.one_hot(action_index, depth=len(possible_actions))\n",
+    "cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=y, logits=logits)\n",
+    "optimizer = tf.train.AdamOptimizer(learning_rate)\n",
+    "grads_and_vars = optimizer.compute_gradients(cross_entropy)\n",
+    "gradients = [grad for grad, variable in grads_and_vars]\n",
+    "gradient_placeholders = []\n",
+    "grads_and_vars_feed = []\n",
+    "for grad, variable in grads_and_vars:\n",
+    "    gradient_placeholder = tf.placeholder(tf.float32, shape=grad.get_shape())\n",
+    "    gradient_placeholders.append(gradient_placeholder)\n",
+    "    grads_and_vars_feed.append((gradient_placeholder, variable))\n",
+    "training_op = optimizer.apply_gradients(grads_and_vars_feed)\n",
+    "\n",
+    "init = tf.global_variables_initializer()\n",
+    "saver = tf.train.Saver()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's try running this policy network, although it is not trained yet."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def run_bipedal_walker(model_path=None, n_max_steps=1000):\n",
+    "    env = gym.make(\"BipedalWalker-v2\")\n",
+    "    frames = []\n",
+    "    with tf.Session() as sess:\n",
+    "        if model_path is None:\n",
+    "            init.run()\n",
+    "        else:\n",
+    "            saver.restore(sess, model_path)\n",
+    "        obs = env.reset()\n",
+    "        for step in range(n_max_steps):\n",
+    "            img = env.render(mode=\"rgb_array\")\n",
+    "            frames.append(img)\n",
+    "            action_index_val = action_index.eval(feed_dict={X: obs.reshape(1, n_inputs)})\n",
+    "            action = possible_actions[action_index_val]\n",
+    "            obs, reward, done, info = env.step(action[0])\n",
+    "            if done:\n",
+    "                break\n",
+    "    env.close()\n",
+    "    return frames"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frames = run_bipedal_walker()\n",
+    "video = plot_animation(frames)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Nope, it really can't walk. So let's train it!"
+   ]
+  },
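+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The training loop below calls `discount_and_normalize_rewards()`, which is defined earlier in this notebook (in the policy gradients section). If you jumped straight to this exercise, the next cell is an equivalent sketch of that helper (skip it if the original definition has already been run):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch of the reward-processing helpers from the policy gradients section;\n",
+    "# run this cell only if you have not run the original definitions.\n",
+    "def discount_rewards(rewards, discount_rate):\n",
+    "    # Replace each reward by the discounted sum of all the rewards that follow it\n",
+    "    discounted_rewards = np.zeros(len(rewards))\n",
+    "    cumulative_rewards = 0\n",
+    "    for step in reversed(range(len(rewards))):\n",
+    "        cumulative_rewards = rewards[step] + cumulative_rewards * discount_rate\n",
+    "        discounted_rewards[step] = cumulative_rewards\n",
+    "    return discounted_rewards\n",
+    "\n",
+    "def discount_and_normalize_rewards(all_rewards, discount_rate):\n",
+    "    # Discount the rewards of every game, then normalize them across all games\n",
+    "    # so that better-than-average actions get positive scores\n",
+    "    all_discounted_rewards = [discount_rewards(rewards, discount_rate)\n",
+    "                              for rewards in all_rewards]\n",
+    "    flat_rewards = np.concatenate(all_discounted_rewards)\n",
+    "    reward_mean = flat_rewards.mean()\n",
+    "    reward_std = flat_rewards.std()\n",
+    "    return [(discounted_rewards - reward_mean) / reward_std\n",
+    "            for discounted_rewards in all_discounted_rewards]"
+   ]
+  },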
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "n_games_per_update = 10\n",
+    "n_max_steps = 1000\n",
+    "n_iterations = 1000\n",
+    "save_iterations = 10\n",
+    "discount_rate = 0.95\n",
+    "\n",
+    "with tf.Session() as sess:\n",
+    "    init.run()\n",
+    "    for iteration in range(n_iterations):\n",
+    "        print(\"\\rIteration: {}/{}\".format(iteration + 1, n_iterations), end=\"\")\n",
+    "        all_rewards = []\n",
+    "        all_gradients = []\n",
+    "        for game in range(n_games_per_update):\n",
+    "            current_rewards = []\n",
+    "            current_gradients = []\n",
+    "            obs = env.reset()\n",
+    "            for step in range(n_max_steps):\n",
+    "                action_index_val, gradients_val = sess.run([action_index, gradients],\n",
+    "                                                           feed_dict={X: obs.reshape(1, n_inputs)})\n",
+    "                action = possible_actions[action_index_val]\n",
+    "                obs, reward, done, info = env.step(action[0])\n",
+    "                current_rewards.append(reward)\n",
+    "                current_gradients.append(gradients_val)\n",
+    "                if done:\n",
+    "                    break\n",
+    "            all_rewards.append(current_rewards)\n",
+    "            all_gradients.append(current_gradients)\n",
+    "\n",
+    "        all_rewards = discount_and_normalize_rewards(all_rewards, discount_rate=discount_rate)\n",
+    "        feed_dict = {}\n",
+    "        for var_index, gradient_placeholder in enumerate(gradient_placeholders):\n",
+    "            mean_gradients = np.mean([reward * all_gradients[game_index][step][var_index]\n",
+    "                                      for game_index, rewards in enumerate(all_rewards)\n",
+    "                                      for step, reward in enumerate(rewards)], axis=0)\n",
+    "            feed_dict[gradient_placeholder] = mean_gradients\n",
+    "        sess.run(training_op, feed_dict=feed_dict)\n",
+    "        if iteration % save_iterations == 0:\n",
+    "            saver.save(sess, \"./my_bipedal_walker_pg.ckpt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 91,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "frames = run_bipedal_walker(\"./my_bipedal_walker_pg.ckpt\")\n",
+    "video = plot_animation(frames)\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Not the best walker, but at least it stays up and makes (slow) progress to the right.\n",
+    "A better solution for this problem is to use an actor-critic algorithm, as it does not require discretizing the action space, and it converges much faster. Check out this nice [blog post](https://towardsdatascience.com/reinforcement-learning-w-keras-openai-actor-critic-models-f084612cfd69) by Yash Patel for more details."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Coming soon**"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": []
   }
@@ -1916,18 +2243,8 @@
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
     "version": "3.5.2"
-  },
-  "nav_menu": {},
-  "toc": {
-   "navigate_menu": true,
-   "number_sections": true,
-   "sideBar": true,
-   "threshold": 6,
-   "toc_cell": false,
-   "toc_section_display": "block",
-   "toc_window_display": false
   }
  },
  "nbformat": 4,
- "nbformat_minor": 1
+ "nbformat_minor": 2
 }