Jun / Sep 25 2019

Chapter 5: Blackjack (Figure 5.2)

Reproducing Figure 5.2 of Sutton & Barto: Monte Carlo with Exploring Starts applied to Blackjack, using ReinforcementLearning.jl and RLIntro.jl.

using ReinforcementLearning, ReinforcementLearningEnvironments
using RLIntro
using RLIntro.BlackJack
using Plots  # heatmap is used below for the policy/value plots

# Exploring starts: every episode begins from a random (state, action) pair,
# as Monte Carlo ES requires.
env = BlackJackEnv(;is_exploring_start=true)

# Number of states and actions: 220 states, 2 actions (hit / stick)
ns, na = length(observation_space(env)), length(action_space(env))
# output: (220, 2)
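The 220 states presumably factor as 2 (usable ace) × 11 (player sum 11–21) × 10 (dealer's showing card), which is why the Q table is reshaped to (2, 11, 10, 2) below. A minimal sketch of such a column-major linear encoding, under that assumed factor order (the names and order are my guesses, not necessarily RLIntro's actual encoding):

# Hypothetical state index, assuming factor order (usable_ace, player_sum, dealer_card);
# RLIntro's real encoding may differ.
function state_index(usable_ace::Bool, player_sum::Int, dealer_card::Int)
    ace = usable_ace ? 1 : 2                          # 2 values
    sum_idx = player_sum - 10                         # 11..21 → 1..11
    ace + 2 * (sum_idx - 1) + 22 * (dealer_card - 1)  # fastest-varying factor first
end

state_index(true, 11, 1)    # 1
state_index(false, 21, 10)  # 220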
agent = Agent(
    π=ExploringStartPolicy(          # forces a random first action each episode
        π=QBasedPolicy(
            learner=MonteCarloLearner(
                approximator=TabularQApproximator(;n_state=ns, n_action=na),
                kind=EVERY_VISIT),   # average returns over every visit to (s, a)
            selector=EpsilonGreedySelector(0.0)  # ε = 0: act greedily after the start
        ),
        actions=1:na
    ),
    buffer=episode_RTSA_buffer()     # per-episode (reward, terminal, state, action) buffer
);
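Under the hood, MonteCarloLearner with kind=EVERY_VISIT averages the return observed after every occurrence of each (state, action) pair in an episode. A conceptual, self-contained sketch of that update in plain Julia (the names are illustrative, not the library's internals):

# Every-visit Monte Carlo update, sketched in plain Julia.
# `episode` is a vector of (state, action, reward) triples, where `reward`
# is the reward received after taking `action` in `state`.
function every_visit_mc!(Q, N, episode; γ=1.0)
    G = 0.0
    for (s, a, r) in Iterators.reverse(episode)  # walk the episode backwards
        G = γ * G + r                            # return following (s, a)
        N[s, a] += 1
        Q[s, a] += (G - Q[s, a]) / N[s, a]       # incremental mean of returns
    end
    Q
end

Q, N = zeros(ns, na), zeros(Int, ns, na)
every_visit_mc!(Q, N, [(1, 1, 0.0), (3, 2, 1.0)])  # toy two-step episode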

run(agent, env, StopAfterEpisode(1_000_000; is_show_progress=false))
# output: EmptyHook() (no hook was passed to run)
# Reshape the flat 220 × 2 Q table into, presumably,
# (usable ace, player sum, dealer card, action).
q_value = reshape(agent.π.π.learner.approximator.table, 2, 11, 10, 2)
# Greedy policy: index of the best action at each state, for each slice of the ace axis.
p1 = map(x -> x.I[3], argmax(q_value[1, :, :, :], dims=3))
p2 = map(x -> x.I[3], argmax(q_value[2, :, :, :], dims=3));
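A note on the map(x -> x.I[3], argmax(...)) idiom: argmax with dims returns an array of CartesianIndex, and .I[3] pulls out the third component, i.e. the index of the best action. A toy illustration:

A = rand(3, 4, 2)               # e.g. (player sum, dealer card, action)
idx = argmax(A, dims=3)         # 3×4×1 array of CartesianIndex
greedy = map(x -> x.I[3], idx)  # keep only the action component
size(greedy)                    # (3, 4, 1); hence the dropdims calls below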
heatmap(dropdims(p1, dims=3))                                    # greedy policy, first ace slice
heatmap(dropdims(maximum(q_value[1, :, :, :]; dims=3), dims=3))  # state value (max over actions), first ace slice
heatmap(dropdims(p2, dims=3))                                    # greedy policy, second ace slice
heatmap(dropdims(maximum(q_value[2, :, :, :]; dims=3), dims=3))  # state value, second ace slice
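To lay the four panels out in a 2×2 grid like Figure 5.2, something like the following should work; the titles are my own labels, and which ace condition each slice of the first axis corresponds to is an assumption I have not verified against RLIntro's encoding:

# Arranging the four panels in a 2×2 grid, Figure-5.2 style.
# Titles assume the first axis of q_value is the usable-ace flag.
plot(
    heatmap(dropdims(p1, dims=3); title="greedy policy (usable ace)"),
    heatmap(dropdims(maximum(q_value[1, :, :, :]; dims=3), dims=3); title="state value (usable ace)"),
    heatmap(dropdims(p2, dims=3); title="greedy policy (no usable ace)"),
    heatmap(dropdims(maximum(q_value[2, :, :, :]; dims=3), dims=3); title="state value (no usable ace)"),
    layout=(2, 2),
)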