using Flux
using Flux: normalise
using Random
using Random: shuffle
using Statistics: mean
using Distributions: logpdf
using Zygote: ignore

using ReinforcementLearningBase
using ReinforcementLearningCore

export VPGPolicy, GaussianNetwork

struct GaussianNetwork
    pre::Chain
    μ::Chain
    σ::Chain
end
Flux.@functor GaussianNetwork
function (m::GaussianNetwork)(S)
    x = m.pre(S)
    # The σ head outputs the log of the standard deviation; exponentiate it to keep it positive.
    m.μ(x), m.σ(x) .|> exp
end
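
# A minimal usage sketch of `GaussianNetwork` (illustrative only: the state
# dimension 3, action dimension 1 and layer width 16 are arbitrary assumptions,
# not part of this file):
#
#     using Flux: Dense, relu
#
#     trunk = Chain(Dense(3, 16, relu))                 # shared feature extractor
#     net = GaussianNetwork(trunk, Chain(Dense(16, 1)), # mean head
#                           Chain(Dense(16, 1)))        # log-std head
#     μ, σ = net(rand(Float32, 3, 5))                   # 5 states → per-sample mean and std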

"""
    VPGPolicy(;kwargs...)

Vanilla Policy Gradient.

# Keyword arguments

- `approximator`, a `NeuralNetworkApproximator` for the policy network
- `baseline`, an optional `NeuralNetworkApproximator` used as a state-value baseline
- `action_space`, the action space of the environment
- `dist`, distribution function of the action (e.g. `Categorical` for a discrete
  action space, `Normal` for a continuous one)
- `γ`, discount factor
- `α_θ`, step size of the policy parameter
- `α_w`, step size of the baseline parameter
- `batch_size`
- `rng`
- `loss`, the most recent policy loss (kept for logging)
- `baseline_loss`, the most recent baseline loss (kept for logging)

If the action space is continuous, the environment should transform the action
value (e.g. with `tanh`) to make sure that `low ≤ value ≤ high`.
"""
Base.@kwdef mutable struct VPGPolicy{
    A<:NeuralNetworkApproximator,
    B<:Union{NeuralNetworkApproximator,Nothing},
    S<:AbstractSpace,
    R<:AbstractRNG,
} <: AbstractPolicy
    approximator::A
    baseline::B = nothing
    action_space::S
    dist::Any
    γ::Float32 = 0.99f0 # discount factor
    α_θ = 1.0f0 # step size of policy
    α_w = 1.0f0 # step size of baseline
    batch_size::Int = 1024
    rng::R = Random.GLOBAL_RNG
    loss::Float32 = 0.0f0
    baseline_loss::Float32 = 0.0f0
end
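
# A minimal construction sketch (illustrative only; the state/action sizes,
# layer widths and hyper-parameters below are assumptions, not part of this
# file). It assumes RLCore's `NeuralNetworkApproximator(; model, optimizer)`
# constructor and RLBase's `DiscreteSpace`:
#
#     using Flux: Dense, relu, ADAM
#     using Distributions: Categorical
#
#     ns, na = 4, 2 # state dimension, number of discrete actions
#     policy = VPGPolicy(
#         approximator = NeuralNetworkApproximator(
#             model = Chain(Dense(ns, 128, relu), Dense(128, na)),
#             optimizer = ADAM(),
#         ),
#         baseline = NeuralNetworkApproximator(
#             model = Chain(Dense(ns, 128, relu), Dense(128, 1)),
#             optimizer = ADAM(),
#         ),
#         action_space = DiscreteSpace(na),
#         dist = Categorical,
#         γ = 0.99f0,
#     )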

"""
About the continuous action space, see
* [Diagonal Gaussian Policies](https://spinningup.openai.com/en/latest/spinningup/rl_intro.html#stochastic-policies)
* [Clipped Action Policy Gradient](https://arxiv.org/pdf/1802.07564.pdf)
"""
function (π::VPGPolicy)(env::AbstractEnv)
    to_dev(x) = send_to_device(device(π.approximator), x)

    logits = env |> get_state |> to_dev |> π.approximator

    if π.action_space isa DiscreteSpace
        dist = logits |> softmax |> π.dist
        action = π.action_space[rand(π.rng, dist)]
    elseif π.action_space isa ContinuousSpace
        dist = π.dist.(logits...) # e.g. Normal.(μ, σ) for a GaussianNetwork approximator
        action = rand.(π.rng, dist)[1] # a single-dimensional action is assumed here
    else
        error("not implemented")
    end
    action
end
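
# Sketch of the continuous branch above, assuming `dist = Normal` from
# Distributions and a `GaussianNetwork` approximator (shapes are illustrative):
#
#     using Distributions: Normal, logpdf
#
#     μ, σ = [0.1f0 -0.3f0], [0.5f0 0.2f0] # one action dim, two states in a batch
#     d = Normal.(μ, σ)                    # element-wise (diagonal) Gaussians
#     a = rand.(d)                         # one sample per entry
#     lp = logpdf.(d, a)                   # per-entry log-probabilities, as used in `update!`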

function (π::VPGPolicy)(env::MultiThreadEnv)
    error("not implemented")
    # TODO: can PG support multiple envs? PG is only updated at the end of an episode.
end

function RLBase.update!(π::VPGPolicy, traj::ElasticCompactSARTSATrajectory)
    (length(traj[:terminal]) > 0 && traj[:terminal][end]) || return

    model = π.approximator
    to_dev(x) = send_to_device(device(model), x)

    states = traj[:state]
    # Convert the ElasticArray to an Array, or the indexing below
    # (`log_prob[CartesianIndex.(A, 1:length(A))]`) fails on the GPU.
    actions = traj[:action] |> Array
    gains = traj[:reward] |> x -> discount_rewards(x, π.γ)

    for idx in Iterators.partition(shuffle(1:length(traj[:terminal])), π.batch_size)
        S = select_last_dim(states, idx) |> to_dev
        A = actions[idx]
        # `gains` is a 1-column array, but the output of the Flux model is a
        # 1-row, n_batch-column array, so unsqueeze it.
        G = gains[idx] |> x -> Flux.unsqueeze(x, 1) |> to_dev

        if π.baseline isa NeuralNetworkApproximator
            gs = gradient(Flux.params(π.baseline)) do
                δ = G - π.baseline(S)
                loss = mean(δ .^ 2) * π.α_w # mse
                ignore() do
                    π.baseline_loss = loss
                end
                loss
            end
            update!(π.baseline, gs)
        elseif π.baseline isa Nothing
            # Normalization. See
            # http://rail.eecs.berkeley.edu/deeprlcourse-fa17/f17docs/hw2_final.pdf
            # https://web.stanford.edu/class/cs234/assignment3/solution.pdf
            # `normalise` should not be used together with a baseline, or the policy loss would be too small.
            δ = G |> x -> normalise(x; dims = 2)
        end

        # `δ` from either branch above (advantage estimate with a baseline,
        # normalised return without one) weights the log-probabilities below.
        gs = gradient(Flux.params(model)) do
            if π.action_space isa DiscreteSpace
                log_prob = S |> model |> logsoftmax
                log_probₐ = log_prob[CartesianIndex.(A, 1:length(A))]
            elseif π.action_space isa ContinuousSpace
                dist = π.dist.(model(S)...) # TODO: this part does not work on GPU. See: https://github.com/JuliaStats/Distributions.jl/issues/1183
                log_probₐ = logpdf.(dist, A)
            end
            loss = -mean(log_probₐ .* δ) * π.α_θ
            ignore() do
                π.loss = loss
            end
            loss
        end
        update!(model, gs)
    end
    empty!(traj)
end
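
# For reference, `gains` above is the discounted reward-to-go produced by
# `discount_rewards(x, γ)`. A standalone sketch of that quantity (illustration
# only, not the library implementation):
#
#     function discounted_returns(rewards, γ)
#         G = zeros(Float32, length(rewards))
#         acc = 0.0f0
#         for t in length(rewards):-1:1
#             acc = rewards[t] + γ * acc
#             G[t] = acc
#         end
#         G
#     end
#
#     discounted_returns([1.0f0, 1.0f0, 1.0f0], 0.9f0) # ≈ [2.71f0, 1.9f0, 1.0f0]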