Chapter 9: Random Walk
using SparseArrays
using ReinforcementLearning, ReinforcementLearningEnvironments, RLIntro.RandomWalk
using StatsBase, Plots
ACTIONS = collect(Iterators.flatten((-100:-1, 1:100)))
NS = 1002
NA = length(ACTIONS)
200
Here we define a group mapping preprocessor first. The first and the last states (the two terminal states) are each mapped into a separate group of their own.
Base.@kwdef struct GroupMapping <: AbstractPreprocessor
    n::Int
    n_groups::Int
    n_per_group::Int = div(n, n_groups)
end

function (p::GroupMapping)(x::Int)
    if x == 1
        1
    elseif x == p.n
        p.n_groups + 2
    else
        div(x - 2, p.n_per_group) + 2
    end
end
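A quick sanity check of the mapping (a sketch, assuming NS = 1002 and 10 groups as used below): the 1000 non-terminal states fall into groups 2 through 11 in blocks of 100, while the two terminal states get groups 1 and 12 to themselves.

p = GroupMapping(n=1002, n_groups=10)
@assert p(1) == 1        # left terminal state → its own group
@assert p(2) == 2        # first non-terminal state
@assert p(101) == 2      # last state of the first block of 100
@assert p(102) == 3      # first state of the next block
@assert p(1001) == 11    # last non-terminal state
@assert p(1002) == 12    # right terminal state → its own group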
Then we define a hook to count how many times each state is visited.
struct CountStates <: AbstractHook
    counts::Vector{Int}
    CountStates(n) = new(zeros(Int, n))
end

(f::CountStates)(::PreActStage, agent, env, obs_action) =
    f.counts[get_state(observe(env.env))] += 1
env = RandomWalkEnv(N=NS, actions=ACTIONS)
RandomWalkEnv(1002, 501, 501, [-100, -99, -98, -97, -96, -95, -94, -93, -92, -91 … 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], -1.0, 1.0, DiscreteSpace{Int64}(1, 1002, 1002), DiscreteSpace{Int64}(1, 200, 200))
observe(env)
Observation{Float64,Bool,Int64,NamedTuple{(),Tuple{}}}(0.0, false, 501, NamedTuple())
TRUE_STATE_VALUES = begin
    env = RandomWalkEnv(N=NS, actions=ACTIONS)
    agent = Agent(
        π=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(NS),
                method=:SRS,
                optimizer=Descent(0.01)
            ),
            f=TabularRandomPolicy(fill(1/NA, NS, NA))
        ),
        buffer=episode_RTSA_buffer()
    )
    run(agent, env, StopAfterEpisode(10^5))
    agent.π.learner.approximator.table
end
1002-element Array{Float64,1}:
0.0
-0.907603
-0.907423
-0.91348
-0.915209
-0.912627
-0.894126
-0.89234
-0.90065
-0.888785
⋮
0.896366
0.887136
0.896937
0.902451
0.898225
0.899328
0.899455
0.908906
0.0
plot(TRUE_STATE_VALUES[2:end-1])
struct RecordRMS <: AbstractHook
    rms::Vector{Float64}
    RecordRMS() = new([])
end

function (f::RecordRMS)(::PostEpisodeStage, agent, env, obs)
    estimates = agent.π.learner.approximator.(env.preprocessor.(2:(NS-1)))
    push!(f.rms, sqrt(mean((estimates - TRUE_STATE_VALUES[2:end-1]).^2)))
end
n_groups = 10
env = WrappedEnv(
    env=RandomWalkEnv(N=NS, actions=ACTIONS),
    preprocessor=GroupMapping(n=NS, n_groups=n_groups)
)
WrappedEnv{RandomWalkEnv,GroupMapping}(RandomWalkEnv(1002, 501, 501, [-100, -99, -98, -97, -96, -95, -94, -93, -92, -91 … 91, 92, 93, 94, 95, 96, 97, 98, 99, 100], -1.0, 1.0, DiscreteSpace{Int64}(1, 1002, 1002), DiscreteSpace{Int64}(1, 200, 200)), GroupMapping(1002, 10, 100))
agent = Agent(
    π=VBasedPolicy(
        learner=MonteCarloLearner(
            approximator=TabularVApproximator(n_groups+2),
            kind=EVERY_VISIT,  # this is very important: with state aggregation, the same group is visited many times within one episode
            α=2e-5
        ),
        f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
    ),
    buffer=episode_RTSA_buffer()
)
Agent{VBasedPolicy{MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling},TabularRandomPolicy},EpisodeTurnBuffer{(:reward, :terminal, :state, :action),Tuple{Float64,Bool,Int64,Int64},NamedTuple{(:reward, :terminal, :state, :action),Tuple{Array{Float64,1},Array{Bool,1},Array{Int64,1},Array{Int64,1}}}},Symbol}(VBasedPolicy{MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling},TabularRandomPolicy}(MonteCarloLearner{EveryVisit,TabularVApproximator,CachedSampleAvg,NoSampling}(TabularVApproximator([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]), 1.0, 2.0e-5, CachedSampleAvg(Dict{Any,SampleAvg}())), TabularRandomPolicy([0.005 0.005 … 0.005 0.005; 0.005 0.005 … 0.005 0.005; … ; 0.005 0.005 … 0.005 0.005; 0.005 0.005 … 0.005 0.005])), NamedTuple{(:reward, :terminal, :state, :action),Tuple{Float64,Bool,Int64,Int64}}[], :DEFAULT)
hook = CountStates(NS)
run(agent, env, StopAfterEpisode(10^5); hook=hook)
CountStates([0, 1172, 1246, 1290, 1302, 1282, 1328, 1355, 1295, 1364 … 1373, 1313, 1336, 1351, 1301, 1283, 1280, 1220, 1193, 0])
plot(hook.counts./sum(hook.counts))
plot(agent.π.learner.approximator.(env.preprocessor(s) for s in 2:NS-1), legend=:bottomright, label="Monte Carlo")
plot!(TRUE_STATE_VALUES[2:end-1], label="true values")
agent = Agent(
    π=VBasedPolicy(
        learner=TDLearner(
            approximator=TabularVApproximator(n_groups+2),
            method=:SRS,
            optimizer=Descent(2e-4)
        ),
        f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
    ),
    buffer=episode_RTSA_buffer()
)
run(agent, env, StopAfterEpisode(10^5))
EmptyHook()
plot(agent.π.learner.approximator.(env.preprocessor(s) for s in 2:NS-1), legend=:bottomright, label="TD Learner")
plot!(TRUE_STATE_VALUES[2:end-1], label="true values")
n_groups = 20

function run_once(n, α)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=GroupMapping(n=NS, n_groups=n_groups)
    )
    agent = Agent(
        π=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(n_groups+2),
                method=:SRS,
                optimizer=Descent(α),
                n=n
            ),
            f=TabularRandomPolicy(fill(1/NA, n_groups+2, NA))
        ),
        buffer=episode_RTSA_buffer()
    )
    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10); hook=hook)
    mean(hook.rms)
end

A = 0:0.1:1
p = plot(legend=:topleft)
for n in [2^i for i in 0:9]
    plot!(p, A, mean([run_once(n, α) for α in A] for _ in 1:100), label="n = $n")
end
p
struct ScalePreprocessor <: AbstractPreprocessor
    scale::Float64
end

(p::ScalePreprocessor)(x::Number) = p.scale * x
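For instance (a quick sketch, reusing NS = 1002 from above), ScalePreprocessor(1/NS) squashes each integer state into (0, 1], which is what the Fourier and polynomial bases below expect as input:

sp = ScalePreprocessor(1/NS)
sp(501)  # ≈ 0.5, the middle of the state space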
π_target = TabularRandomPolicy(fill(1/NA, NS, NA))

function run_once_MC(preprocessor, order, α)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=preprocessor
    )
    agent = Agent(
        π=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=LinearVApproximator(zeros(order+1)),
                α=α,
                kind=EVERY_VISIT
            ),
            f=obs -> obs.meta.state_before_ScalePreprocessor |> π_target
        ),
        buffer=episode_RTSA_buffer(;state_eltype=Vector{Float64})
    )
    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(5000; is_show_progress=false); hook=hook)
    hook.rms
end
run_once_MC (generic function with 1 method)
p = plot(legend=:topright)
for order in [5, 10, 20]
    plot!(p, mean(run_once_MC(Chain(ScalePreprocessor(1/NS), FourierPreprocessor(order)), order, 0.00005) for _ in 1:1), label="Fourier $order", linestyle=:dash)
    plot!(p, mean(run_once_MC(Chain(ScalePreprocessor(1/NS), PolynomialPreprocessor(order)), order, 0.0001) for _ in 1:1), label="Polynomial $order", linestyle=:solid)
end
p
function run_once_MC(preprocessor, α, V)
    env = WrappedEnv(
        env=RandomWalkEnv(N=NS, actions=ACTIONS),
        preprocessor=preprocessor
    )
    agent = Agent(
        π=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=LinearVApproximator(V),
                α=α,
                kind=EVERY_VISIT
            ),
            f=obs -> rand(1:NA)
        ),
        buffer=episode_RTSA_buffer(;state_eltype=SparseMatrixCSC{Float64,Int64})
    )
    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10000; is_show_progress=false); hook=hook)
    hook.rms
end
run_once_MC (generic function with 1 method)
struct ToSparseMatrixPreprocessor <: AbstractPreprocessor
    m::Int
    n::Int
end

(p::ToSparseMatrixPreprocessor)(xs::Vector{Int}) = sparse(xs, 1:length(xs), ones(length(xs)), p.m, p.n)
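To see what this produces (a small sketch): given the active tile index from each of n tilings, it builds an m×n sparse one-hot matrix with a single 1.0 per column.

ToSparseMatrixPreprocessor(7, 2)([3, 5])
# 7×2 SparseMatrixCSC with ones at (3, 1) and (5, 2)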
plot(run_once_MC(
    Chain(
        TilingPreprocessor([Tiling((range(1-4*(i-1), step=200, length=7),)) for i in 1:1]),
        ToSparseMatrixPreprocessor(7, 1)
    ),
    1e-4/1,
    zeros(7, 1)
))
plot!(run_once_MC(
    Chain(
        TilingPreprocessor([Tiling((range(1-4*(i-1), step=200, length=7),)) for i in 1:50]),
        ToSparseMatrixPreprocessor(7, 50)
    ),
    1e-4/50,
    zeros(7, 50)
))