Reinforcement Learning: Atari's Pong (GPU)

apt-get installs of the system packages OpenAI's gym needs: build tools, video and X11 libraries for rendering, and SWIG for the Atari bindings.

apt-get update
apt-get install -y build-essential python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
Done
apt-get installs
Bash


conda install -qy -c anaconda tensorflow-gpu h5py
pip install keras gym[atari]==0.8.2 scikit-image opencv-python sk-video
Done
setup
Bash


This installs OpenAI's gym with the Atari games, Keras for training (on the TensorFlow backend), sk-video to record videos of the agents playing, and h5py to save models.

scikit-image and opencv-python may no longer be needed by this notebook.
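If in doubt about what actually got installed, a throwaway check (not part of the original notebook) can print the installed versions:

import pkg_resources
# Hypothetical sanity check: confirm the pinned packages are present.
for pkg in ('gym', 'keras', 'scikit-image', 'opencv-python', 'sk-video', 'h5py'):
  print(pkg + ' ' + pkg_resources.get_distribution(pkg).version)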

All imports we're gonna need.

import gym
from PIL import Image
import matplotlib.pyplot as plt
import skvideo.io

import numpy as np
import os

# Keras imports use the 1.x/2.0-era module paths, matching the pinned install
# above; several layers and optimizers are imported for experimentation and
# not all of them are used below.
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam, Adamax, RMSprop
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation, Dropout, Flatten
from keras.layers.convolutional import UpSampling2D, Convolution2D
Done
imports
Python


Let's see what the game looks like. #AtariNostalgia We'll also define an image preprocessing function and test it on a rendered frame.

def preprocess_image(img):
  img = img[35:195]       # crop to the playing field
  img = img[::2,::2,0]    # downsample by a factor of 2, keep one color channel
  img[img == 144] = 0     # erase background (type 1)
  img[img == 109] = 0     # erase background (type 2)
  img[img != 0] = 1       # paddles and ball become 1
  return img.astype(np.float).ravel()  # flatten to a 6400-dim float vector
Done
image preprocessing
Python
env = gym.make('Pong-v0')
env.reset()

fig = plt.figure()

arr = env.render(mode='rgb_array')

plt.imshow(arr)

fig
Done
pong screenshot
Python
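As a quick test of preprocess_image (a small check added here, assuming arr and env from the cell above are still available), the same frame can be cropped, downsampled, and binarized, then displayed as an 80x80 image:

# Visualize what the network actually sees after preprocessing.
processed = preprocess_image(arr).reshape(80, 80)

fig2 = plt.figure()
plt.imshow(processed, cmap='gray')
fig2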


Let's watch a game played by an agent taking only random actions.

env = gym.make('Pong-v0')
env.reset()

writer = skvideo.io.FFmpegWriter("/results/pong_random_actions.mp4")

for _ in range(1000):
  writer.writeFrame(env.render(mode='rgb_array'))
  observation, reward, done, info = env.step(env.action_space.sample())
  if done:  # start a fresh game if one ends mid-recording
    env.reset()

writer.close()
Error
random pong
Python


pong_random_actions.mp4
Download
# Script parameters
input_dim = 80 * 80    # size of the preprocessed frame vector
gamma = 0.99           # discount factor for rewards
update_frequency = 1   # train after every episode
learning_rate = 0.001

# Initialize
env = gym.make("Pong-v0")
# Pong-v0 exposes 6 discrete actions, though only 3 are really distinct
# (NOOP, UP, DOWN); using all 6 works but is redundant.
number_of_inputs = env.action_space.n
#number_of_inputs = 1
observation = env.reset()
prev_x = None
xs, dlogps, drs, probs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
train_X = []
train_y = []

def discounted_rewards(r):
  # Compute discounted returns, resetting the running sum at every nonzero
  # reward: in Pong a nonzero reward means a point was just scored, so each
  # rally is discounted separately.
  discounted_r = np.zeros_like(r)
  running_add = 0
  for t in reversed(xrange(0, r.size)):
    if r[t] != 0: running_add = 0  # point boundary: reset the sum
    running_add = running_add * gamma + r[t]
    discounted_r[t] = running_add
  return discounted_r
Done
init and parameter definitions
Python
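To see what discounted_rewards does, here is a toy example (illustrative only, not part of the training run): two rallies, the first lost, the second won.

# Toy reward vector: -1 ends the first rally, +1 ends the second.
toy = np.array([0., 0., -1., 0., 1.])
print(discounted_rewards(toy))
# -> [-0.9801 -0.99   -1.      0.99    1.    ]
# The running sum resets after each point, so each rally is discounted separately.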


model = Sequential()

# (1, 80, 80) is channels-first; this assumes Theano-style
# image_dim_ordering ('th') in ~/.keras/keras.json.
model.add(Reshape((1, 80, 80), input_shape=(input_dim,)))
# Keras 2 spelling of the same convolution, for reference:
#model.add(Convolution2D(32, 9, 9, strides=(4, 4),
#                        padding='same', activation='relu',
#                        kernel_initializer='he_uniform'))
model.add(Convolution2D(32, 9, 9,
                        subsample=(4, 4), border_mode='same',
                        activation='relu', init='he_uniform'))

model.add(Flatten())
model.add(Dense(16, activation='relu', init='he_uniform'))
model.add(Dense(number_of_inputs, activation='softmax'))  # action probabilities

opt = Adam(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt)
Done
build network
Python
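A quick shape check can catch dim-ordering mistakes before training; model.summary() prints each layer's output shape (the shapes noted below assume channels-first ordering, as discussed above):

model.summary()
# Expected output shapes (channels-first): Reshape -> (1, 80, 80),
# Convolution2D -> (32, 20, 20), Flatten -> (12800,),
# Dense -> (16,), Dense -> (number_of_inputs,)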


pong_model_checkpoint_154.h5
Download
pong_model_checkpoint_233.h5
Download
resume = True
render = False

env = gym.make('Pong-v0')
observation = env.reset()
prev_x = None

# only spawn the video writer if we actually intend to render frames
if render:
  writer = skvideo.io.FFmpegWriter("/results/pong_training_loop.mp4")

if resume:
  model.load_weights('pong_model_checkpoint_233.h5')  # checkpoint downloaded above
  episode_number = 233

for _ in range(1000):  # environment steps per run of this cell
  if render:
    writer.writeFrame(env.render(mode='rgb_array'))
  # preprocessing and taking frame differences as inputs
  cur_x = preprocess_image(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros(input_dim)
  prev_x = cur_x
  # predict action probabilities from the Keras model
  aprob = model.predict(x.reshape([1, x.shape[0]]), batch_size=1).flatten()
  # append features and probabilities for the episode batch
  # (reuse the prediction rather than calling predict twice)
  xs.append(x)
  probs.append(aprob)
  aprob = aprob / np.sum(aprob)
  action = np.random.choice(number_of_inputs, 1, p=aprob)[0]
  y = np.zeros([number_of_inputs])
  y[action] = 1
  # print action
  dlogps.append(np.array(y).astype('float32') - aprob)
  observation, reward, done, info = env.step(action)
  reward_sum += reward
  drs.append(reward)
  # when an episode (one full game to 21) ends, assemble the training arrays
  if done:
    episode_number += 1
    epx = np.vstack(xs)
    epdlogp = np.vstack(dlogps)
    epr = np.vstack(drs)
    discounted_epr = discounted_rewards(epr)
    discounted_epr -= np.mean(discounted_epr)
    discounted_epr /= np.std(discounted_epr)
    epdlogp *= discounted_epr
    # slowly prepare the training batch
    train_X.append(xs) 
    train_y.append(epdlogp)
    xs,dlogps,drs = [],[],[]
    # periodically update the model
    if episode_number % update_frequency == 0: 
      # y_train is still a work in progress; alternative normalizations below
      y_train = probs + learning_rate * np.squeeze(np.vstack(train_y))
      #y_train[y_train<0] = 0
      #y_train[y_train>1] = 1
      #y_train = y_train / np.sum(np.abs(y_train), axis=1, keepdims=True)
      print('Training Snapshot:')
      print(y_train)
      model.train_on_batch(np.squeeze(np.vstack(train_X)), y_train)
      # clear the batch
      train_X = []
      train_y = []
      probs = []
      # save a checkpoint of the model
      if os.path.exists('/results/pong_model_checkpoint.h5'):
        os.remove('/results/pong_model_checkpoint.h5')      
      model.save_weights('/results/pong_model_checkpoint.h5')
    # reset the current environment and print current results
    if running_reward is None:
      running_reward = reward_sum
    else:
      running_reward = running_reward * 0.99 + reward_sum * 0.01
    print('Environment reset imminent.')
    print('Total Episode Reward: %f' % reward_sum)
    print('Running Mean: %f' % running_reward)
    reward_sum = 0
    observation = env.reset()
    prev_x = None
  if reward != 0:
    print(('Episode %d Result: ' % episode_number) + 
         ('Defeat!' if reward == -1 else 'Victory!!'))
  
if render:
  writer.close()
Done
training loop
Python
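Once a checkpoint exists, a short evaluation roll-out can be recorded. This is a sketch under the same assumptions as above (model, preprocess_image, and input_dim as already defined; the checkpoint path is the one written by the training loop, and the output filename is made up for this example):

# Sketch: roll out the current policy greedily and record a video.
env = gym.make('Pong-v0')
observation = env.reset()
prev_x = None
writer = skvideo.io.FFmpegWriter("/results/pong_trained_agent.mp4")
model.load_weights('/results/pong_model_checkpoint.h5')

for _ in range(2000):
  writer.writeFrame(env.render(mode='rgb_array'))
  cur_x = preprocess_image(observation)
  x = cur_x - prev_x if prev_x is not None else np.zeros(input_dim)
  prev_x = cur_x
  aprob = model.predict(x.reshape([1, x.shape[0]]), batch_size=1).flatten()
  action = np.argmax(aprob)  # act greedily instead of sampling
  observation, reward, done, info = env.step(action)
  if done:
    observation = env.reset()
    prev_x = None

writer.close()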