Gregor Koehler / Nov 12 2017
Remix of Reinforcement Learning: Atari's Pong by Gregor Koehler
Reinforcement Learning: Atari's Pong (GPU)
apt-get installs of the necessary tools for OpenAI's gym.
apt-get update
apt-get install -y build-essential python-numpy python-dev cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig
apt-get installs (Bash)
conda install -qy -c anaconda tensorflow-gpu h5py
pip install keras gym[atari]==0.8.2 scikit-image opencv-python sk-video
setup (Bash)
Installing OpenAI's gym (Atari games), Keras for the training (on the TF backend), sk-video to create videos of the agents playing, and h5py to save models.
scikit-image and opencv-python might be redundant here.
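As a quick sanity check of the setup (not part of the original notebook), importing the main packages and printing their versions should succeed before moving on:

import keras   # should report "Using TensorFlow backend." if the setup worked
import h5py
print(keras.__version__, h5py.__version__)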
All imports we're gonna need.
import gym
from PIL import Image
import matplotlib.pyplot as plt
import skvideo.io
import numpy as np
import os
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Reshape
from keras.layers.wrappers import TimeDistributed
from keras.optimizers import Adam, Adamax, RMSprop
from keras.layers.advanced_activations import PReLU
from keras.layers.normalization import BatchNormalization
from keras.layers.core import Activation, Dropout, Flatten
from keras.layers.convolutional import UpSampling2D, Convolution2D
imports (Python)
Let's see what the game looks like. #AtariNostalgia Also let's define an image preprocessing function and test it on the rendered frame.
def preprocess_image(Img):
    Img = Img[35:195]      # crop to the playing field
    Img = Img[::2,::2,0]   # downsample by a factor of 2, keep a single colour channel
    Img[Img == 144] = 0    # erase background (type 1)
    Img[Img == 109] = 0    # erase background (type 2)
    Img[Img != 0] = 1      # paddles and ball are set to 1
    return Img.astype(np.float).ravel()
image preprocessing (Python)
env = gym.make('Pong-v0')
env.reset()
fig = plt.figure()
arr = env.render(mode='rgb_array')
plt.imshow(arr)
fig
pong screenshot (Python)
Let's look at a game played by an agent only taking random actions.
env = gym.make('Pong-v0')
env.reset()
writer = skvideo.io.FFmpegWriter("/results/pong_random_actions.mp4")
for _ in range(1000):
    writer.writeFrame(env.render(mode='rgb_array'))
    env.step(env.action_space.sample())
writer.close()
random pong (Python)
pong_random_actions.mp4
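Before setting the parameters, it's worth peeking at Pong's action space, since the next cell notes that using all of env.action_space.n actions may not be ideal for Pong (only the up/down movements really matter). A quick check, assuming the wrapped Atari env exposes get_action_meanings():

env = gym.make('Pong-v0')
print(env.action_space.n)                   # 6 discrete actions
print(env.unwrapped.get_action_meanings())  # e.g. ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']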
#Script Parameters
input_dim = 80 * 80
gamma = 0.99
update_frequency = 1
learning_rate = 0.001

#Initialize
env = gym.make("Pong-v0")
number_of_inputs = env.action_space.n #This is incorrect for Pong (?)
#number_of_inputs = 1
observation = env.reset()
prev_x = None
xs, dlogps, drs, probs = [],[],[],[]
running_reward = None
reward_sum = 0
episode_number = 0
train_X = []
train_y = []

def discounted_rewards(r):
    # walk backwards through the reward sequence, accumulating the discounted
    # return and resetting it whenever a point is scored (reward != 0)
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(xrange(0, r.size)):
        if r[t] != 0:
            running_add = 0
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
init and parameter definitions (Python)
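To make the reward discounting concrete, here is a quick sanity check on a toy reward sequence (not in the original notebook); with gamma = 0.99 the return is propagated backwards and reset whenever a point is scored:

r = np.array([0., 0., 1., 0., 0., -1.])
print(discounted_rewards(r))
# -> approximately [ 0.9801  0.99  1.  -0.9801  -0.99  -1. ]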
model = Sequential()
model.add(Reshape((1,80,80), input_shape=(input_dim,)))
#model.add(Convolution2D(32, 9, 9, strides=(4, 4),
#                        padding='same', activation='relu',
#                        kernel_initializer='he_uniform'))
model.add(Convolution2D(32, 9, 9, subsample=(4, 4),
                        border_mode='same', activation='relu',
                        init='he_uniform'))
model.add(Flatten())
model.add(Dense(16, activation='relu', init='he_uniform'))
model.add(Dense(number_of_inputs, activation='softmax'))
opt = Adam(lr=learning_rate)
model.compile(loss='categorical_crossentropy', optimizer=opt)
build network (Python)
pong_model_checkpoint_154.h5
pong_model_checkpoint_233.h5
resume = True
render = False

env = gym.make('Pong-v0')
observation = env.reset()
writer = skvideo.io.FFmpegWriter("/results/pong_training_loop.mp4")
if resume:
    model.load_weights('pong_model_checkpoint_233.h5')
    episode_number = 233

for _ in range(1*1000):
    if render:
        writer.writeFrame(env.render(mode='rgb_array'))
    # preprocessing and taking frame differences as inputs
    cur_x = preprocess_image(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(input_dim)
    prev_x = cur_x
    # predict from keras model
    aprob = ((model.predict(x.reshape([1,x.shape[0]]), batch_size=1).flatten()))
    # append features and labels for the episode-batch
    xs.append(x)
    probs.append((model.predict(x.reshape([1,x.shape[0]]), batch_size=1).flatten()))
    aprob = aprob/np.sum(aprob)
    action = np.random.choice(number_of_inputs, 1, p=aprob)[0]
    y = np.zeros([number_of_inputs])
    y[action] = 1
    # print action
    dlogps.append(np.array(y).astype('float32') - aprob)
    observation, reward, done, info = env.step(action)
    reward_sum += reward
    drs.append(reward)
    # if game is over, create episode (?)
    if done:
        episode_number += 1
        epx = np.vstack(xs)
        epdlogp = np.vstack(dlogps)
        epr = np.vstack(drs)
        discounted_epr = discounted_rewards(epr)
        discounted_epr -= np.mean(discounted_epr)
        discounted_epr /= np.std(discounted_epr)
        epdlogp *= discounted_epr
        # slowly prepare the training batch
        train_X.append(xs)
        train_y.append(epdlogp)
        xs,dlogps,drs = [],[],[]
        # periodically update the model
        if episode_number % update_frequency == 0:
            # y_train is still work in progress, ideas below
            y_train = probs + learning_rate * np.squeeze(np.vstack(train_y))
            #y_train[y_train<0] = 0
            #y_train[y_train>1] = 1
            #y_train = y_train / np.sum(np.abs(y_train), axis=1, keepdims=True)
            print 'Training Snapshot:'
            print y_train
            model.train_on_batch(np.squeeze(np.vstack(train_X)), y_train)
            # clear the batch
            train_X = []
            train_y = []
            probs = []
            # save a checkpoint of the model
            if os.path.exists('/results/pong_model_checkpoint.h5'):
                os.remove('/results/pong_model_checkpoint.h5')
            model.save_weights('/results/pong_model_checkpoint.h5')
        # reset the current environment and print current results
        if running_reward is None:
            running_reward = reward_sum
        else:
            running_reward = running_reward * 0.99 + reward_sum * 0.01
        print('Environment reset imminent.')
        print('Total Episode Reward: %f' % reward_sum)
        print('Running Mean: %f' % running_reward)
        reward_sum = 0
        observation = env.reset()
        prev_x = None
    if reward != 0:
        print(('Episode %d Result: ' % episode_number) + ('Defeat!' if reward == -1 else 'Victory!!'))
if render:
    writer.close()
training loop (Python)
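To actually watch a trained agent play, a separate evaluation cell along these lines could load one of the checkpoints above and record a full game to a video, mirroring the random-agent cell (a sketch with assumed file paths, not part of the original notebook):

model.load_weights('pong_model_checkpoint_233.h5')  # assumed path to the uploaded checkpoint
env = gym.make('Pong-v0')
observation = env.reset()
prev_x = None
writer = skvideo.io.FFmpegWriter("/results/pong_trained_agent.mp4")
done = False
while not done:
    writer.writeFrame(env.render(mode='rgb_array'))
    cur_x = preprocess_image(observation)
    x = cur_x - prev_x if prev_x is not None else np.zeros(input_dim)
    prev_x = cur_x
    aprob = model.predict(x.reshape([1, input_dim]), batch_size=1).flatten()
    action = np.argmax(aprob)  # greedy action instead of sampling
    observation, reward, done, info = env.step(action)
writer.close()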