# Jun / Sep 25, 2019
#
# Chapter 6: Maximization Bias (Sutton & Barto, Example 6.7)

using ReinforcementLearning, ReinforcementLearningEnvironments
using RLIntro.MaximizationBias
using Plots, StatsBase
# Hook that records, for each episode, how many times the LEFT action was
# taken from state 1 (state A in the book's example).
mutable struct CountOfLeft <: AbstractHook
    counts::Vector{Int}  # one entry per finished episode
    count::Int           # running tally within the current episode
    # Use a typed empty vector (`Int[]`), not `[]` which is Vector{Any}.
    CountOfLeft() = new(Int[], 0)
end

# Before each action is taken: bump the tally when the agent picks LEFT
# while in state 1 (state A).
function (hook::CountOfLeft)(::PreActStage, agent, env, obs_action)
    observation, chosen = obs_action
    in_state_A = get_state(observation) == 1
    if in_state_A && chosen == MaximizationBias.LEFT
        hook.count += 1
    end
end

# After each episode: archive this episode's tally and reset for the next.
function (hook::CountOfLeft)(::PostEpisodeStage, agent, env, obs)
    push!(hook.counts, hook.count)  # store the episode total
    hook.count = 0                  # start the next episode from zero
end
env = MaximizationBiasEnv()
# Number of states and actions exposed by the environment.
ns, na = length(observation_space(env)), length(action_space(env))
# => (3, 10)   # stray REPL output, kept here as a comment
# Build a fresh Double Q-learning agent: two independent tabular SARS
# TD learners (L1, L2) combined by a DoubleLearner, with epsilon-greedy
# action selection (ε = 0.1) and step size 0.1.
init_double_Q_agent() = Agent(
    π=QBasedPolicy(
        learner=DoubleLearner(
            L1=TDLearner(
                approximator=TabularQApproximator(n_state=ns, n_action=na),
                optimizer=Descent(0.1),
                method=:SARS
                ),
            L2=TDLearner(
                approximator=TabularQApproximator(n_state=ns, n_action=na),
                optimizer=Descent(0.1),
                method=:SARS
                )
            ),
        selector=EpsilonGreedySelector(0.1)
        ),
    buffer=episode_RTSA_buffer()
)

# Build a fresh (single) Q-learning agent: one tabular SARS TD learner
# with epsilon-greedy selection (ε = 0.1) and step size 0.1. Counterpart
# of `init_double_Q_agent` for comparing maximization bias.
init_Q_agent() = Agent(
    π=QBasedPolicy(
        learner=TDLearner(
            approximator=TabularQApproximator(n_state=ns, n_action=na),
            optimizer=Descent(0.1),
            method=:SARS
            ),
        selector=EpsilonGreedySelector(0.1)
        ),
    buffer=episode_RTSA_buffer()
)
# init_Q_agent (generic function with 1 method)   # stray REPL output, commented out
# Run many independent trials of the Double-Q agent and plot the averaged
# per-episode count of LEFT actions taken from state A.
stats = Vector{Vector{Int}}()  # typed container instead of `[]` (Vector{Any})
for _ in 1:10000
    hook = CountOfLeft()
    run(init_double_Q_agent(), env, StopAfterEpisode(300);hook=hook)
    push!(stats, hook.counts)
end

# `mean` over a vector of vectors averages element-wise across trials.
plot(mean(stats), legend=:topright, label="double q")
# Same experiment for the plain Q-learning agent, overlaid on the figure.
stats = Vector{Vector{Int}}()  # typed container instead of `[]` (Vector{Any})
for _ in 1:10000
    hook = CountOfLeft()
    run(init_Q_agent(), env, StopAfterEpisode(300);hook=hook)
    push!(stats, hook.counts)
end
plot!(mean(stats), legend=:topright, label="q")

# TODO:
# Add a legal-action restriction so that on the first step only actions
# 1 (left) or 2 (right) may be chosen. With that restriction, the figure
# will match the one in the book exactly.