Jun / Sep 25 2019

Chapter 9: Random Walk

using SparseArrays

using ReinforcementLearning, ReinforcementLearningEnvironments, RLIntro.RandomWalk
using StatsBase, Plots
ACTIONS = collect(Iterators.flatten((-100:-1, 1:100)))  # jump 1–100 states to the left or to the right
NS = 1002  # 1000 non-terminal states plus the 2 terminal states
NA = length(ACTIONS)
200

Here we first define a group-mapping preprocessor that aggregates the 1000 non-terminal states into a small number of groups.

The first and the last state (the two terminal states) are each mapped to a separate group of their own.

Base.@kwdef struct GroupMapping <: AbstractPreprocessor
    n::Int
    n_groups::Int
    n_per_group::Int=div(n, n_groups)
end

function (p::GroupMapping)(x::Int)
    if x == 1
        1
    elseif x == p.n
        p.n_groups + 2
    else
        div(x - 2, p.n_per_group) + 2
    end
end
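
As a quick sanity check (not part of the original output), with NS = 1002 and 10 groups the mapping sends the two terminal states to groups of their own and buckets the remaining states in blocks of 100:

g = GroupMapping(n=NS, n_groups=10)
(g(1), g(2), g(101), g(102), g(1001), g(1002))  # (1, 2, 2, 3, 11, 12)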

Then we define a hook that counts how many times each (raw) state is visited.

struct CountStates <: AbstractHook
    counts::Vector{Int}
    CountStates(n) = new(zeros(Int, n))
end

# count the raw state of the unwrapped environment at every PreActStage
(f::CountStates)(::PreActStage, agent, env, obs_action) = f.counts[get_state(observe(env.env))] += 1

env = RandomWalkEnv(N=NS, actions=ACTIONS)
RandomWalkEnv(1002, 501, 501, [-100, -99, -98, -97, -96, -95, -94, -93, -92, -91 … 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], -1.0, 1.0, DiscreteSpace{Int64}(1, 1002, 1002), DiscreteSpace{Int64}(1, 200, 200))
observe(env)
Observation{Float64,Bool,Int64,NamedTuple{(),Tuple{}}}(0.0, false, 501, NamedTuple())
TRUE_STATE_VALUES = begin
    env = RandomWalkEnv(N=NS, actions=ACTIONS)
    agent = Agent(
        π=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(NS),
                method=:SRS,
                optimizer=Descent(0.01)
                ),
            f=TabularRandomPolicy(fill(1/NA, NS, NA))
            ),
        buffer=episode_RTSA_buffer()
    )
    run(agent, env, StopAfterEpisode(10^5))
    agent.π.learner.approximator.table
end
1002-element Array{Float64,1}:
  0.0
 -0.907603
 -0.907423
 -0.91348
 -0.915209
 -0.912627
 -0.894126
 -0.89234
 -0.90065
 -0.888785
  ⋮
  0.896366
  0.887136
  0.896937
  0.902451
  0.898225
  0.899328
  0.899455
  0.908906
  0.0
plot(TRUE_STATE_VALUES[2:end-1])
struct RecordRMS <: AbstractHook
    rms::Vector{Float64}
    RecordRMS() = new([])
end

function (f::RecordRMS)(::PostEpisodeStage, agent, env, obs)
    push!(f.rms, sqrt(mean((agent.π.learner.approximator.(env.preprocessor.(2:(NS-1))) - TRUE_STATE_VALUES[2:end-1]).^2)))
end
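
In other words, after every episode the hook records the root-mean-square error over the 1000 non-terminal states,

RMS = sqrt( (1/1000) * Σ_{s=2}^{1001} ( v̂(preprocessor(s)) − v_π(s) )² )

where v_π is taken from the TRUE_STATE_VALUES computed above.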
n_groups = 10
env = WrappedEnv(
    env=RandomWalkEnv(N=NS, actions=ACTIONS),
    preprocessor=GroupMapping(n=NS, n_groups=n_groups)
    )
WrappedEnv{RandomWalkEnv,GroupMapping}(RandomWalkEnv(1002, 501, 501, [-100, -99, -98, -97, -96, -95, -94, -93, -92, -91 … 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], -1.0, 1.0, DiscreteSpace{Int64}(1, 1002, 1002), DiscreteSpace{Int64}(1, 200, 200)), GroupMapping(1002, 10, 100))
agent = Agent(
    π=VBasedPolicy(
        learner=MonteCarloLearner(
            approximator=TabularVApproximator(n_groups+2),
            kind=EVERY_VISIT,  # important: the gradient MC algorithm updates at every step of an episode, not only on first visits
            α=2e-5),
        f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
        ),
    buffer=episode_RTSA_buffer()
)
Agent{VBasedPolicy{MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling},TabularRandomPolicy},EpisodeTurnBuffer{(:reward, :terminal, :state, :action),Tuple{Float64,Bool,Int64,Int64},NamedTuple{(:reward, :terminal, :state, :action),Tuple{Array{Float64,1},Array{Bool,1},Array{Int64,1},Array{Int64,1}}}},Symbol}(VBasedPolicy{MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling},TabularRandomPolicy}(MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling}(TabularVApproximator([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 1.0, 2.0e-5, CachedSampleAvg(Dict{Any,SampleAvg}())), TabularRandomPolicy([0.005 0.005 … 0.005 0.005; 0.005 0.005 … 0.005 0.005; … ; 0.005 0.005 … 0.005 0.005; 0.005 0.005 … 0.005 0.005])), NamedTuple{(:reward, :terminal, :state, :action),Tuple{Float64,Bool,Int64,Int64}}[], :DEFAULT)
hook=CountStates(NS)
run(agent, env, StopAfterEpisode(10^5);hook=hook)
CountStates([0, 1172, 1246, 1290, 1302, 1282, 1328, 1355, 1295, 1364 … 1373, 1313, 1336, 1351, 1301, 1283, 1280, 1220, 1193, 0])
plot(hook.counts./sum(hook.counts))
plot(agent.π.learner.approximator.(env.preprocessor(s) for s in 2:NS-1),legend=:bottomright, label="Monte Carlo")
plot!(TRUE_STATE_VALUES[2:end-1], label="true values")
agent = Agent(
    π=VBasedPolicy(
        learner=TDLearner(
                approximator=TabularVApproximator(n_groups+2),
                method=:SRS,
                optimizer=Descent(2e-4)
                ),
        f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
        ),
    buffer=episode_RTSA_buffer()
)
run(agent, env, StopAfterEpisode(10^5))
EmptyHook()
plot(agent.π.learner.approximator.(env.preprocessor(s) for s in 2:NS-1),legend=:bottomright, label="TD Learner")
plot!(TRUE_STATE_VALUES[2:end-1], label="true values")
n_groups = 20
function run_once(n, α)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=GroupMapping(n=NS, n_groups=n_groups)
        )
    agent = Agent(
        π=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(n_groups+2),
                method=:SRS,
                optimizer=Descent(α),
                n=n
                ),
            f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
            ),
        buffer=episode_RTSA_buffer()
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10); hook=hook)
    mean(hook.rms)
end
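
As a hypothetical single call (not shown in the original), the following would return the RMS error of 4-step semi-gradient TD with α = 0.4, averaged over its first 10 episodes:

run_once(4, 0.4)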

A = 0:0.1:1  # step sizes α to sweep over

p = plot(legend=:topleft)
for n in [2^i for i in 0:9]
    plot!(p, A, mean([run_once(n, α) for α in A] for _ in 1:100), label="n = $n")
end
p
struct ScalePreprocessor <: AbstractPreprocessor
    scale::Float64
end

(p::ScalePreprocessor)(x::Number) = p.scale * x
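
For reference, the order-n Fourier and polynomial bases of Section 9.5 represent a scaled state x ∈ [0, 1] by the features cos(iπx) and x^i for i = 0, …, n, which is why the linear approximators below carry order + 1 weights. A hand-rolled sketch of these feature maps (not the package's FourierPreprocessor / PolynomialPreprocessor implementation):

my_fourier_features(x, order) = [cos(i * π * x) for i in 0:order]    # order + 1 Fourier features
my_polynomial_features(x, order) = [x^i for i in 0:order]            # order + 1 polynomial features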
π_target = TabularRandomPolicy(fill(1/NA, NS, NA))

function run_once_MC(preprocessor, order, α)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=preprocessor
        )
    
    agent = Agent(
        π=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=LinearVApproximator(zeros(order+1)),
                α=α,
                kind=EVERY_VISIT
                ),
            f=obs -> obs.meta.state_before_ScalePreprocessor |> π_target
            ),
        buffer=episode_RTSA_buffer(;state_eltype=Vector{Float64})
    )

    hook=RecordRMS()
    run(agent, env, StopAfterEpisode(5000;is_show_progress=false); hook=hook)
    hook.rms
end
run_once_MC (generic function with 1 method)
p = plot(legend=:topright)
for order in [5, 10, 20]
    plot!(p, mean(run_once_MC(Chain(ScalePreprocessor(1/NS), FourierPreprocessor(order)), order, 0.00005) for _ in 1:1), label="Fourier $order", linestyle=:dash)
    plot!(p, mean(run_once_MC(Chain(ScalePreprocessor(1/NS), PolynomialPreprocessor(order)), order, 0.0001) for _ in 1:1), label="Polynomial $order", linestyle=:solid)
end
p
function run_once_MC(preprocessor, α, V)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=preprocessor
        )
    
    agent = Agent(
        π=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=LinearVApproximator(V),
                α=α,
                kind=EVERY_VISIT
                ),
            f=obs -> rand(1:NA)
            ),
        buffer=episode_RTSA_buffer(;state_eltype=SparseMatrixCSC{Float64,Int64})
    )

    hook=RecordRMS()
    run(agent, env, StopAfterEpisode(10000;is_show_progress=false); hook=hook)
    hook.rms
end
run_once_MC (generic function with 1 method)
struct ToSparseMatrixPreprocessor <: AbstractPreprocessor
    m::Int
    n::Int
end

(p::ToSparseMatrixPreprocessor)(xs::Vector{Int}) = sparse(xs, 1:length(xs), ones(length(xs)), p.m, p.n)
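
For example, if two tilings report active tiles 3 and 5, the preprocessor builds a 7×2 one-hot matrix (one column per tiling) with ones at (3, 1) and (5, 2):

ToSparseMatrixPreprocessor(7, 2)([3, 5])  # 7×2 SparseMatrixCSC with 1.0 at (3, 1) and (5, 2)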
plot(run_once_MC(
    Chain(
        TilingPreprocessor([Tiling((range(1-4*(i-1), step=200, length=7),)) for i in 1:1]),
        ToSparseMatrixPreprocessor(7, 1)
        ),
    1e-4/1,
    zeros(7,1)))
plot!(run_once_MC(
    Chain(
        TilingPreprocessor([Tiling((range(1-4*(i-1), step=200, length=7),)) for i in 1:50]),
        ToSparseMatrixPreprocessor(7, 50)
        ),
    1e-4/50,
    zeros(7,50)))