In this tutorial we are going to try to beat the gym game Taxi-v3 using Keras. The game is considered solved whenever the average score of the last 100 games is positive. We are going to follow the same steps as in the CartPole tutorial before this. Can you remember the roadmap?
In short: make a copy of the previous file from the CartPole tutorial and try to update only the required parts. This tutorial will mention the parts that have to be changed and show you a working code example for each of them. Note that a working model is not guaranteed to be a good model.
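As a reference, here is a minimal sketch (not part of the original tutorial code) of that "solved" check, assuming we keep the scores of the games we play in a list:

def is_solved(scores, window=100):
    """ The game counts as solved when the average of the last `window` scores is positive. """
    last_scores = scores[-window:]
    return len(last_scores) == window and sum(last_scores) / window > 0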
import gym
import random
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
Update the original code where necessary.
def random_average(nr_games=100):
    env = gym.make("CartPole-v0")
    collected_scores = []
    for _ in range(nr_games):
        env.reset()
        done = False
        score = 0
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            score += reward
        collected_scores.append(score)
    average = sum(collected_scores) / nr_games
    print(f"\nA random model played: {nr_games} games, with an average score of:"
          f" {average:5.2f}")
    return average
The only change needed is the name of the environment in gym.make, which becomes "Taxi-v3".

def random_average(nr_games=100):
    env = gym.make("Taxi-v3")
    collected_scores = []
    for _ in range(nr_games):
        env.reset()
        done = False
        score = 0
        while not done:
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            score += reward
        collected_scores.append(score)
    average = sum(collected_scores) / nr_games
    print(f"\nA random model played: {nr_games} games, with an average score of:"
          f" {average:5.2f}")
    return average
Update the original code where necessary.
def collect_data(nr_games=50, minimum_score=100):
    """ We are going to collect a fixed number of games having a high score. """
    # Here we are going to collect all the data.
    data_observations = []  # This is the X, or observation
    data_actions = []  # This is the y, or label data
    collected_scores = []
    collected_games = 0

    env = gym.make("CartPole-v0")

    # While loop, since we do not know how many games to play until we have nr_games good ones.
    while collected_games < nr_games:
        # Start the default game loop
        obs = env.reset()
        done = False

        # Temporarily store all observations and actions, in case it is a good run.
        temp_observations = []
        temp_actions = []
        score = 0

        while not done:
            # Pick a random action
            action = env.action_space.sample()

            # Store the action and rewards for when this is a good game
            temp_observations.append(obs)
            temp_actions.append(action)

            # Update the obs, done and score
            obs, reward, done, info = env.step(action)
            score += reward

        # Only store information about good games.
        if score > minimum_score:
            # Use extend to merge a list with a list.
            data_observations.extend(temp_observations)
            data_actions.extend(temp_actions)

            # Update counters and give the user a message.
            collected_scores.append(score)
            collected_games += 1

            # The \r moves the cursor back to the start of the line, so this message gets overwritten by the next print.
            print(f"\rAdded score: {score}, collected games: {collected_games}/{nr_games}", end='')

    env.close()
    print(f"\n\nCollected scores (higher is better):\n{collected_scores}")
    print(f"\nCollected average: {sum(collected_scores) / nr_games}")
    return data_observations, data_actions
Here we have to change the environment name in gym.make, but also the minimum score. With random actions the agent almost never delivers the passenger, and every step costs -1 (with -10 for an illegal pickup or drop-off), so random games end with large negative scores; a cut-off of 100 would never be reached, which is why we call collect_data(minimum_score=-200) instead.
def collect_data(nr_games=50, minimum_score=100):
    """ We are going to collect a fixed number of games having a high score. """
    # Here we are going to collect all the data.
    data_observations = []  # This is the X, or observation
    data_actions = []  # This is the y, or label data
    collected_scores = []
    collected_games = 0

    env = gym.make("Taxi-v3")

    while collected_games < nr_games:
        # Start the default game loop
        obs = env.reset()
        done = False

        # Temporarily store all observations and actions, in case it is a good run.
        temp_observations = []
        temp_actions = []
        score = 0

        while not done:
            # Pick a random action
            action = env.action_space.sample()

            # Store the action and rewards for when this is a good game
            temp_observations.append(obs)
            temp_actions.append(action)

            # Update the obs, done and score
            obs, reward, done, info = env.step(action)
            score += reward

        # Only store information about good games.
        if score > minimum_score:
            data_observations.extend(temp_observations)
            data_actions.extend(temp_actions)
            collected_scores.append(score)
            collected_games += 1
            print(f"\rAdded score: {score}, collected games: {collected_games}/{nr_games}", end='')

    env.close()
    print(f"\n\nCollected scores (higher is better):\n{collected_scores}")
    print(f"\nCollected average: {sum(collected_scores) / nr_games}")
    return data_observations, data_actions
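A possible way to call this for Taxi (a hedged example; the -200 threshold comes from the discussion above, and the names X and y match the variables used in the training step later on):

# Collect training data: X holds the observed states, y the actions that were taken.
X, y = collect_data(nr_games=50, minimum_score=-200)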
The collection of training data can be made faster by terminating early any episode that will not make the cut-off. We can do this by adding the following check at the end of the inner game loop, right after updating the score (the sketch below shows the placement):

if not done and score < minimum_score:
    break
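For context, a sketch of what the inner game loop looks like with this early exit in place (same variables as in collect_data above):

while not done:
    # Pick a random action and remember the state/action pair.
    action = env.action_space.sample()
    temp_observations.append(obs)
    temp_actions.append(action)

    # Take the step and update obs, done and score.
    obs, reward, done, info = env.step(action)
    score += reward

    # Early exit: this game already dropped below the cut-off, so stop and discard it.
    if not done and score < minimum_score:
        break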
Update the original code where necessary.
def create_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(4,), activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(2, activation='softmax'))  # We only have two values.
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    model.summary()
    return model
Here we have to change the input and output shape.
Taxi has only a single number representing the state, hence the input_shape is (1,). The parentheses and the trailing comma are required to make the input a tuple; otherwise Keras will not interpret it correctly.
There are 6 output actions in Taxi (moving south, north, east and west, plus pickup and drop-off), hence the last layer has to be changed from 2 to 6.
input_shape=(1,): the observation input is now a single integer.
Dense(6, activation='softmax'): the output layer now has 6 actions to choose from.

def create_model():
    model = Sequential()
    model.add(Dense(64, input_shape=(1,), activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(6, activation='softmax'))  # We now have six possible actions.
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    model.summary()
    return model
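A quick sanity check (a hedged example, not part of the original tutorial) after building the model:

model = create_model()
print(model.input_shape)   # (None, 1): batches of single-integer observations
print(model.output_shape)  # (None, 6): one probability per action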
Update the original code where necessary.
# Let us show an example prediction of our model
def show_example(model, X):
    # Sample id
    sample_id = 5

    # We convert the observation to a numpy array with shape [1, 4]
    # The first value 1 is required, because the model always expects to get a batch of values.
    observation = np.array([X[sample_id]])

    # Show the shape and values that are the model input.
    print(f"Shape: {observation.shape}, values: {observation}")

    # This is the model output, 2 values which sum up to 1.
    action_probabilities = model.predict(observation)[0]

    # Here we show the action and the model predicted probability.
    print(f"\nAction and probability")
    for action, probability in zip(np.arange(2), action_probabilities):
        print(f"\tAction: {action}, probability: {probability * 100:6.2f}%")

    # The action we have to take is equal to the index of the highest value in the model.
    print(f"\nBest action: {np.argmax(action_probabilities)}")
For this there is only a small change: the np.arange call, since we now have 6 actions instead of 2. Nothing needs to change for action_probabilities, because the built-in zip stops as soon as one of its input iterables is exhausted. So the only update is np.arange(6), as we now have 6 actions to print.

# Let us show an example prediction of our model
def show_example(model, X):
    sample_id = 5
    observation = np.array([X[sample_id]])
    print(f"Shape: {observation.shape}, values: {observation}")

    action_probabilities = model.predict(observation)[0]

    print(f"\nAction and probability")
    for action, probability in zip(np.arange(6), action_probabilities):
        print(f"\tAction: {action}, probability: {probability * 100:6.2f}%")

    print(f"\nBest action: {np.argmax(action_probabilities)}")
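A quick illustration (not part of the tutorial code) of the zip behaviour mentioned above:

probabilities = [0.15, 0.25, 0.10, 0.20, 0.10, 0.20]  # pretend model output for 6 actions
print(list(zip(np.arange(2), probabilities)))  # only 2 pairs: a forgotten np.arange(2) silently hides 4 actions
print(list(zip(np.arange(6), probabilities)))  # all 6 pairs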
Update the original code where necessary.
model_x = np.array(X)
model_y = to_categorical(np.array(y), 2)
history = model.fit(model_x, model_y, epochs=20)
The to_categorical function requires the number of actions as its second argument.

model_x = np.array(X)
model_y = to_categorical(np.array(y), 6)
history = model.fit(model_x, model_y, epochs=20)
You might wonder about the to_categorical function. It is needed because the model outputs one probability per action, so the labels have to have the same shape. This is done by using a one-hot encoding. A beginner-friendly explanation is provided by Michael DelSole. The basic idea is that we want our model to give a high probability to the action that we took, so we set that position to 1 and all other actions to 0. For example, with the 6 Taxi actions, if the taken action is 1 we encode it as [0, 1, 0, 0, 0, 0], so the model will learn to pick action 1.
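A small illustration (not from the tutorial) of what to_categorical produces for the 6 Taxi actions:

example_actions = np.array([1, 4, 0])
print(to_categorical(example_actions, 6))
# [[0. 1. 0. 0. 0. 0.]
#  [0. 0. 0. 0. 1. 0.]
#  [1. 0. 0. 0. 0. 0.]]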
In most gym environments we can actually detect the number of actions automatically. The following code could do it for you.
import gym
env = gym.make('Taxi-v3')
nr_actions = env.action_space.n
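As a sketch of how this could be used (the helper name create_model_for is my own, not from the tutorial), the detected action count can be fed straight into the model definition:

def create_model_for(env):
    # Same network as create_model above, but the output layer size comes from the environment.
    nr_actions = env.action_space.n
    model = Sequential()
    model.add(Dense(64, input_shape=(1,), activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(nr_actions, activation='softmax'))
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    return model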
Update the original code where necessary.
def plot_history(history):
    """ Shows the loss (Mean squared error) and accuracy over time. """
    plt.subplots(1, 2, figsize=(30, 10))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'])
    plt.title("Loss")

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'])
    plt.title("Accuracy")
    plt.show()
We do not need to make any changes. However, take a look at the results and you will notice that they are different from the results we got for CartPole. What kind of effect will this have on our model results?
def plot_history(history):
    """ Shows the loss (Mean squared error) and accuracy over time. """
    plt.subplots(1, 2, figsize=(30, 10))

    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'])
    plt.title("Loss")

    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'])
    plt.title("Accuracy")
    plt.show()
This is one possible training outcome, but most runs should show a similar pattern. We can see that the loss is not decreasing as much as expected, but rather stays around the same value. Another observation is that the accuracy does not increase to a high value.
Do you know why the accuracy is around 0.1667?
Update the original code where necessary.
def evaluate_model(nr_games=100):
    env = gym.make("CartPole-v0")
    collected_scores = []

    for episode in range(1, nr_games + 1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            # Get action from model
            model_x = np.array([obs])
            action = np.argmax(model.predict(model_x)[0])

            # update everything
            obs, reward, done, info = env.step(action)
            score += reward

        print(f"\r\tGame {episode:3d}/{nr_games:3d} score: {score}", end='')
        collected_scores.append(score)

    print(f"\n\nThe model played: {nr_games} games, with an average score of: {sum(collected_scores) / nr_games:5.2f}")
def evaluate_model(nr_games=100):
    env = gym.make("Taxi-v3")
    collected_scores = []

    for episode in range(1, nr_games + 1):
        obs = env.reset()
        done = False
        score = 0

        while not done:
            # Get action from model
            model_x = np.array([obs])
            action = np.argmax(model.predict(model_x)[0])

            # update everything
            obs, reward, done, info = env.step(action)
            score += reward

        print(f"\r\tGame {episode:3d}/{nr_games:3d} score: {score}", end='')
        collected_scores.append(score)

    print(f"\n\nThe model played: {nr_games} games, with an average score of: {sum(collected_scores) / nr_games:5.2f}")
# Now let's compare our model against a random model.
random_average()
print(f"\nOur model performance:")
evaluate_model()
The results are not that promising. This has to do with a few differences compared to CartPole.

In CartPole random actions still lead to positive results, namely staying alive longer; in Taxi it is a disadvantage to stay alive for long. This means that our training data contains a lot of random actions that work against each other. Example: we enter and exit the same state multiple times but perform different actions, so we have to fit a model to multiple different outputs for the same input. The best the model can hope to do is roughly as well as picking one action at random, hence an accuracy of 1 / 6 ≈ 0.1667.

Reusing our code from CartPole does not always lead to a good model, so for every situation we have to check whether our model is good enough. Before moving on to the next section, see if you can make the above code more flexible by determining the observation space and action space on the fly.