Gregor Koehler / Jan 11 2018
Code: RL Pong with Keras-RL (GPU)
apt-get installs of the necessary tools for OpenAI's gym.
apt-get update
apt-get install -y build-essential python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
conda install -qy -c anaconda tensorflow-gpu h5py
pip install keras gym[atari]==0.8.2 #scikit-image opencv-python sk-video
pip install keras-rl
Installing OpenAI's gym (Atari games), Keras for the training (on the TensorFlow backend), and h5py to save models. (Previously sk-video was used to create videos of the agents playing, but it's not needed anymore - using Cristi B's solution with gifs.) scikit-image and opencv-python might be obsolete.
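Since the notebook runs on the GPU build of TensorFlow, a quick optional check (a minimal sketch for TF 1.x) that a GPU device is actually visible:

# List the devices TensorFlow can see; a GPU entry such as '/gpu:0' should show up
# if tensorflow-gpu is installed and the driver is working.
from tensorflow.python.client import device_lib
print([device.name for device in device_lib.list_local_devices()])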
All the imports we're gonna need, following Keras-RL's Atari DQN example.
import gym
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Convolution2D, Permute
from keras.optimizers import Adam
import keras.backend as K

from rl.agents.dqn import DQNAgent
from rl.policy import LinearAnnealedPolicy, BoltzmannQPolicy, EpsGreedyQPolicy
from rl.memory import SequentialMemory
from rl.core import Processor
from rl.callbacks import FileLogger, ModelIntervalCheckpoint
Old preprocessing and discounted reward functions, kept for comparison.
def preprocess_image(img):
    # Karpathy-style Pong preprocessing: crop the playing field, downsample by 2,
    # erase the background colors and binarize paddles/ball.
    img = img[35:195]
    img = img[::2, ::2, 0]
    img[img == 144] = 0
    img[img == 109] = 0
    img[img != 0] = 1
    return img.astype(np.float).ravel()

def discounted_rewards(r, gamma=0.99):
    # Discount rewards backwards through time, resetting the running sum whenever a
    # point is scored (reward != 0), since Pong rallies are independent.
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        if r[t] != 0:
            running_add = 0
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
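A quick illustration of the reward discounting on a toy reward sequence (the numbers are made up just to show the reset at each scored point):

# Toy rewards: one rally lost (-1), one rally won (+1).
toy_r = np.array([0., 0., -1., 0., 0., 1.])
print(discounted_rewards(toy_r, gamma=0.99))
# The -1 and +1 are propagated backwards with a factor of 0.99 per step,
# and the running sum is reset after each non-zero reward.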
Test render
env = gym.make('Pong-v0')
env.reset()
fig = plt.figure()
for i in range(20):
    env.step(env.action_space.sample())
arr = env.render(mode='rgb_array')
plt.imshow(arr)
fig
class AtariProcessor(Processor):
    def process_observation(self, observation):
        assert observation.ndim == 3  # (height, width, channel)
        img = Image.fromarray(observation)
        img = img.resize(INPUT_SHAPE).convert('L')  # resize and convert to grayscale
        processed_observation = np.array(img)
        assert processed_observation.shape == INPUT_SHAPE
        return processed_observation.astype('uint8')  # saves storage in experience memory

    def process_state_batch(self, batch):
        # We could perform this processing step in `process_observation`. In this case, however,
        # we would need to store a `float32` array instead, which is 4x more memory intensive than
        # an `uint8` array. This matters if we store 1M observations.
        processed_batch = batch.astype('float32') / 255.
        return processed_batch

    def process_reward(self, reward):
        return np.clip(reward, -1., 1.)
# Get the environment and extract the number of actions.
env_name = 'Pong-v0'
env = gym.make(env_name)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Set input shape for the network
INPUT_SHAPE = (84, 84)
WINDOW_LENGTH = 4
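A quick sanity check of the AtariProcessor on a raw frame (assuming the cells above have run; a raw Pong frame is 210x160x3 and should come out as an 84x84 uint8 image):

processor_check = AtariProcessor()
raw_frame = env.reset()
processed = processor_check.process_observation(raw_frame)
print(raw_frame.shape, '->', processed.shape, processed.dtype)  # (210, 160, 3) -> (84, 84) uint8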
# Next, we build our model. We use the same model that was described by Mnih et al. (2015).
input_shape = (WINDOW_LENGTH,) + INPUT_SHAPE
model = Sequential()
if K.image_dim_ordering() == 'tf':
    # (width, height, channels)
    model.add(Permute((2, 3, 1), input_shape=input_shape))
elif K.image_dim_ordering() == 'th':
    # (channels, width, height)
    model.add(Permute((1, 2, 3), input_shape=input_shape))
else:
    raise RuntimeError('Unknown image_dim_ordering.')
model.add(Convolution2D(32, 8, 8, subsample=(4, 4)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 4, 4, subsample=(2, 2)))
model.add(Activation('relu'))
model.add(Convolution2D(64, 3, 3, subsample=(1, 1)))
model.add(Activation('relu'))
model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())
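As a quick check of the wiring (under the TensorFlow dim ordering used here): the network takes a stack of WINDOW_LENGTH grayscale frames and outputs one Q-value per action.

print(model.input_shape)   # expected: (None, 4, 84, 84)
print(model.output_shape)  # expected: (None, nb_actions), i.e. (None, 6) for Pong-v0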
# Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
# even the metrics!
memory = SequentialMemory(limit=1000000, window_length=WINDOW_LENGTH)
processor = AtariProcessor()

# Select a policy. We use eps-greedy action selection, which means that a random action is selected
# with probability eps. We anneal eps from 1.0 to 0.1 over the course of 1M steps. This is done so that
# the agent initially explores the environment (high eps) and then gradually sticks to what it knows
# (low eps). We also set a dedicated eps value that is used during testing. Note that we set it to 0.2
# here so that the agent still performs some random actions. This ensures that the agent cannot get stuck.
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=1., value_min=.1,
                              value_test=.2, nb_steps=1000000)
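To illustrate what the annealed policy does (a standalone sketch of the same linear schedule, not the library internals): eps drops linearly from value_max to value_min over nb_steps and then stays at value_min.

def eps_schedule(step, value_max=1.0, value_min=0.1, nb_steps=1000000):
    # Linear interpolation, clipped at value_min once nb_steps is reached.
    return max(value_min, value_max - (value_max - value_min) * step / float(nb_steps))

for step in [0, 250000, 500000, 1000000, 2000000]:
    print(step, eps_schedule(step))  # 1.0, 0.775, 0.55, 0.1, 0.1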
# Creating the DQN agent with the specified model, policy, memory etc.
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor,
               nb_steps_warmup=10, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Okay, now it's time to learn something! We capture the interrupt exception so that training
# can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
results_path = '/results/'
weights_filename = results_path + 'dqn_{}_weights.h5f'.format(env_name)
checkpoint_weights_filename = results_path + 'dqn_' + env_name + '_weights_{step}.h5f'
log_filename = results_path + 'dqn_{}_log.json'.format(env_name)
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
callbacks += [FileLogger(log_filename, interval=100)]
dqn.fit(env, callbacks=callbacks, nb_steps=2750000, log_interval=10000)
#dqn.fit(env, callbacks=callbacks, nb_steps=1000, log_interval=100)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)
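After (or between) training runs the agent can be evaluated with Keras-RL's test method; a short example, assuming the weights above have been saved (keep visualize=False on a headless machine):

dqn.load_weights(weights_filename)
dqn.test(env, nb_episodes=5, visualize=False)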
Output artifacts of the first run: checkpoint weights every 250k steps (dqn_Pong-v0_weights_250000.h5f up to dqn_Pong-v0_weights_2750000.h5f), the final weights dqn_Pong-v0_weights.h5f, and the training log dqn_Pong-v0_log.json.
Continue training, loading the previously saved weights (dqn_Pong-v0_weights_final.h5f):
# Creating the DQN agent with the specified model, policy, memory etc.
dqn = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, processor=processor,
               nb_steps_warmup=50000, gamma=.99, target_model_update=10000,
               train_interval=4, delta_clip=1.)
dqn.compile(Adam(lr=.00025), metrics=['mae'])

# Okay, now it's time to learn something! We capture the interrupt exception so that training
# can be prematurely aborted. Notice that you can use the built-in Keras callbacks!
results_path = '/results/'
weights_filename = results_path + 'dqn_{}_weights.h5f'.format(env_name)
checkpoint_weights_filename = results_path + 'dqn_' + env_name + '_weights_{step}.h5f'
log_filename = results_path + 'dqn_{}_log.json'.format(env_name)
callbacks = [ModelIntervalCheckpoint(checkpoint_weights_filename, interval=250000)]
callbacks += [FileLogger(log_filename, interval=100)]
dqn.load_weights('dqn_Pong-v0_weights_final.h5f')
dqn.fit(env, callbacks=callbacks, nb_steps=5750000, log_interval=50000)
#dqn.fit(env, callbacks=callbacks, nb_steps=1000, log_interval=100)

# After training is done, we save the final weights one more time.
dqn.save_weights(weights_filename, overwrite=True)
Output artifacts of the continued run: checkpoint weights every 250k steps (dqn_Pong-v0_weights_250000.h5f up to dqn_Pong-v0_weights_5750000.h5f), the final weights dqn_Pong-v0_weights.h5f, and the training log dqn_Pong-v0_log.json.
Todo:
- Create gifs from play (see the callback sketch below)
- pipe out frames from the test method call and save them to a gif
- save files as .h5 (the difference from .h5f is still unclear)
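One possible way to capture frames during dqn.test and save them as a gif, sketched with a custom Keras-RL callback. This is only an untested outline: it assumes imageio is installed, relies on env.render(mode='rgb_array') for the frames, and the frame rate is an arbitrary choice.

import imageio
from rl.callbacks import Callback

class GifRecorder(Callback):
    # Collects a rendered frame after every action taken during dqn.test().
    def __init__(self, env):
        super(GifRecorder, self).__init__()
        self.env = env
        self.frames = []

    def on_action_end(self, action, logs={}):
        self.frames.append(self.env.render(mode='rgb_array'))

recorder = GifRecorder(env)
dqn.test(env, nb_episodes=1, visualize=False, callbacks=[recorder])
imageio.mimsave('pong_play.gif', recorder.frames, fps=30)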
# Load and inspect the training log written by FileLogger.
import json
from pprint import pprint

data = json.load(open(log_filename))
pprint(data)
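The log can also be plotted instead of just pretty-printed; a small sketch, assuming the loaded dictionary contains an 'episode_reward' list among the per-episode metrics Keras-RL logs:

plt.figure()
plt.plot(data['episode_reward'])
plt.xlabel('episode')
plt.ylabel('episode reward')
plt.title('Training progress (Pong-v0)')
plt.show()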