This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 74c6e5a

fix legal_actions_mask errors (#74)
1 parent bf79285 commit 74c6e5a

File tree: 6 files changed, +40 −90 lines
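Every learner touched here previously masked illegal actions by adding `typemin(eltype(x)) .* (1 .- legal_actions_mask)` to its Q-values or logits. The commit deletes that expression from the learners' forward passes and, in the Q-target code, replaces it with an explicitly built additive mask. A toy sketch of the arithmetic, not code from the repository: `typemin(Float32)` is `-Inf32`, and `-Inf32 * 0` is `NaN`, so the old expression corrupts the entries of the legal actions whenever the values are floating point, which is presumably the source of the `legal_actions_mask` errors.

```julia
# Toy illustration (not from the repository) of the arithmetic being replaced.
q = Float32[1.0, 2.0, 3.0]
legal_actions_mask = [true, false, true]

# Old pattern: -Inf32 * 0 is NaN, so the entries of *legal* actions become NaN.
old = q .+ typemin(eltype(q)) .* (1 .- legal_actions_mask)
# old contains NaN for the two legal actions and -Inf32 for the illegal one.

# New pattern used in this commit: build the additive mask explicitly,
# so legal entries stay untouched and illegal entries become -Inf32.
masked_value = fill(typemin(Float32), size(legal_actions_mask))
masked_value[legal_actions_mask] .= 0
new = q .+ masked_value
# new is [1.0f0, -Inf32, 3.0f0].
```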


src/algorithms/dqns/dqn.jl

Lines changed: 11 additions & 17 deletions
@@ -88,21 +88,15 @@ end
 if `!isnothing(stack_size)`.
 """
 function (learner::DQNLearner)(env)
-    probs =
-        env |>
-        get_state |>
+    env |>
+    get_state |>
+    x ->
+    Flux.unsqueeze(x, ndims(x) + 1) |>
     x ->
-        Flux.unsqueeze(x, ndims(x) + 1) |>
-        x ->
-        send_to_device(device(learner.approximator), x) |>
-        learner.approximator |>
-        vec |>
-        send_to_host
-
-    if ActionStyle(env) === FULL_ACTION_SET
-        probs .+= typemin(eltype(probs)) .* (1 .- get_legal_actions_mask(env))
-    end
-    probs
+    send_to_device(device(learner.approximator), x) |>
+    learner.approximator |>
+    vec |>
+    send_to_host
 end

 function RLBase.update!(learner::DQNLearner, t::AbstractTrajectory)
@@ -133,9 +127,9 @@ function RLBase.update!(learner::DQNLearner, t::AbstractTrajectory)

     target_q = Qₜ(next_states)
     if haskey(t, :next_legal_actions_mask)
-        target_q .+=
-            typemin(eltype(target_q)) .*
-            (1 .- send_to_device(D, t[:next_legal_actions_mask]))
+        masked_value = fill(typemin(Float32), size(experience.next_legal_actions_mask))
+        masked_value[experience.next_legal_actions_mask] .= 0
+        target_q .+= send_to_device(D, masked_value)
     end

     q′ = dropdims(maximum(target_q; dims = 1), dims = 1)
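The pipeline kept in the learner call relies on Julia's precedence rules: an anonymous function `x -> ...` extends as far to the right as possible, so each `x ->` wraps the remainder of the `|>` chain. A small sketch with stand-in functions (hypothetical names, only meant to show how the chain parses; not from the repository):

```julia
# Hypothetical stand-ins; only the shape of the pipeline matters here.
get_state(env) = env                      # pretend the env *is* its state
approximator(x) = Float32[0.1, 0.9, 0.4]  # pretend Q-network
unsqueeze(x) = reshape(x, size(x)..., 1)  # stands in for Flux.unsqueeze(x, ndims(x) + 1)

env = Float32[0.0, 1.0]

q = env |>
    get_state |>
    x ->
    unsqueeze(x) |>
    approximator |>
    vec
# Parses as env |> get_state |> (x -> (unsqueeze(x) |> approximator |> vec)),
# i.e. everything after `x ->` is the closure body, so q == Float32[0.1, 0.9, 0.4].
```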

src/algorithms/dqns/iqn.jl

Lines changed: 4 additions & 8 deletions
@@ -156,11 +156,7 @@ function (learner::IQNLearner)(env)
     τ = rand(learner.device_rng, Float32, learner.K, 1)
     τₑₘ = embed(τ, learner.Nₑₘ)
     quantiles = learner.approximator(state, τₑₘ)
-    probs = vec(mean(quantiles; dims = 2)) |> send_to_host
-    if ActionStyle(env) === FULL_ACTION_SET
-        probs .+= typemin(eltype(probs)) .* (1 .- get_legal_actions_mask(env))
-    end
-    probs
+    vec(mean(quantiles; dims = 2)) |> send_to_host
 end

 embed(x, Nₑₘ) = cos.(Float32(π) .* (1:Nₑₘ) .* reshape(x, 1, :))
@@ -187,9 +183,9 @@ function RLBase.update!(learner::IQNLearner, batch::NamedTuple)
     avg_zₜ = mean(zₜ, dims = 2)

     if !isnothing(batch.next_legal_actions_mask)
-        avg_zₜ .+=
-            typemin(eltype(avg_zₜ)) .*
-            (1 .- send_to_device(D, batch.next_legal_actions_mask))
+        masked_value = fill(typemin(Float32), size(batch.next_legal_actions_mask))
+        masked_value[batch.next_legal_actions_mask] .= 0
+        avg_zₜ .+= send_to_device(D, masked_value)
     end

     aₜ = argmax(avg_zₜ, dims = 1)
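The same three-line masking pattern now appears in dqn.jl, iqn.jl, prioritized_dqn.jl and rainbow.jl. A hypothetical helper that would capture it (not part of this commit, just a refactoring sketch):

```julia
# Hypothetical helper, not in the repository: 0 for legal actions, -Inf32 for illegal ones.
function additive_mask(legal_actions_mask)
    m = fill(typemin(Float32), size(legal_actions_mask))
    m[legal_actions_mask] .= 0
    return m
end

# Usage corresponding to the IQN hunk above:
# avg_zₜ .+= send_to_device(D, additive_mask(batch.next_legal_actions_mask))
```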

src/algorithms/dqns/prioritized_dqn.jl

Lines changed: 11 additions & 17 deletions
@@ -102,21 +102,15 @@ end
 if `!isnothing(stack_size)`.
 """
 function (learner::PrioritizedDQNLearner)(env)
-    probs =
-        env |>
-        get_state |>
+    env |>
+    get_state |>
+    x ->
+    Flux.unsqueeze(x, ndims(x) + 1) |>
     x ->
-        Flux.unsqueeze(x, ndims(x) + 1) |>
-        x ->
-        send_to_device(device(learner.approximator), x) |>
-        learner.approximator |>
-        vec |>
-        send_to_host
-
-    if ActionStyle(env) === FULL_ACTION_SET
-        probs .+= typemin(eltype(probs)) .* (1 .- get_legal_actions_mask(env))
-    end
-    probs
+    send_to_device(device(learner.approximator), x) |>
+    learner.approximator |>
+    vec |>
+    send_to_host
 end

 function RLBase.update!(learner::PrioritizedDQNLearner, batch::NamedTuple)
@@ -141,9 +135,9 @@ function RLBase.update!(learner::PrioritizedDQNLearner, batch::NamedTuple)

     target_q = Qₜ(next_states)
     if !isnothing(batch.next_legal_actions_mask)
-        target_q .+=
-            typemin(eltype(target_q)) .*
-            (1 .- send_to_device(D, batch.next_legal_actions_mask))
+        masked_value = fill(typemin(Float32), size(batch.next_legal_actions_mask))
+        masked_value[batch.next_legal_actions_mask] .= 0
+        target_q .+= send_to_device(D, masked_value)
     end

     q′ = dropdims(maximum(target_q; dims = 1), dims = 1)
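With the additive mask in place, the column-wise maximum that forms the TD target can only be attained by a legal action. A toy check with illustrative values, not code from the repository:

```julia
# actions × batch; in column 1 only action 1 is legal, in column 2 only action 2.
target_q = Float32[1.0 5.0; 4.0 2.0]
next_legal_actions_mask = Bool[1 0; 0 1]

masked_value = fill(typemin(Float32), size(next_legal_actions_mask))
masked_value[next_legal_actions_mask] .= 0
target_q .+= masked_value

q′ = dropdims(maximum(target_q; dims = 1), dims = 1)
# q′ == Float32[1.0, 2.0]: the larger but illegal entries 4.0 and 5.0 never win.
```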

src/algorithms/dqns/rainbow.jl

Lines changed: 4 additions & 8 deletions
@@ -126,11 +126,7 @@ function (learner::RainbowLearner)(env)
     state = Flux.unsqueeze(state, ndims(state) + 1)
     logits = learner.approximator(state)
     q = learner.support .* softmax(reshape(logits, :, learner.n_actions))
-    probs = vec(sum(q, dims = 1)) |> send_to_host
-    if ActionStyle(env) === FULL_ACTION_SET
-        probs .+= typemin(eltype(probs)) .* (1 .- get_legal_actions_mask(env))
-    end
-    probs
+    vec(sum(q, dims = 1)) |> send_to_host
 end

 function RLBase.update!(learner::RainbowLearner, batch::NamedTuple)
@@ -161,9 +157,9 @@ function RLBase.update!(learner::RainbowLearner, batch::NamedTuple)
     next_probs = reshape(softmax(reshape(next_logits, n_atoms, :)), n_atoms, n_actions, :)
     next_q = reshape(sum(support .* next_probs, dims = 1), n_actions, :)
     if !isnothing(batch.next_legal_actions_mask)
-        next_q .+=
-            typemin(eltype(next_q)) .*
-            (1 .- send_to_device(D, batch.next_legal_actions_mask))
+        masked_value = fill(typemin(Float32), size(batch.next_legal_actions_mask))
+        masked_value[batch.next_legal_actions_mask] .= 0
+        next_q .+= send_to_device(D, masked_value)
     end
     next_prob_select = select_best_probs(next_probs, next_q)
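In the Rainbow update the values being masked are the per-action Q-values obtained by collapsing the distributional output against the support, after which the best next action is selected. A toy single-sample version with illustrative numbers, not code from the repository:

```julia
support = Float32[-1.0, 0.0, 1.0]                   # n_atoms = 3
next_probs = Float32[0.1 0.8; 0.2 0.1; 0.7 0.1]     # n_atoms × n_actions
next_q = vec(sum(support .* next_probs, dims = 1))  # ≈ Float32[0.6, -0.7]

next_legal_actions_mask = Bool[0, 1]                # only action 2 is legal
masked_value = fill(typemin(Float32), size(next_legal_actions_mask))
masked_value[next_legal_actions_mask] .= 0
next_q .+= masked_value

argmax(next_q)  # == 2, even though action 1 had the larger raw Q-value
```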

src/algorithms/policy_gradient/A2C.jl

Lines changed: 5 additions & 21 deletions
@@ -28,28 +28,17 @@ Base.@kwdef mutable struct A2CLearner{A<:ActorCritic} <: AbstractLearner
 end

 function (learner::A2CLearner)(env::MultiThreadEnv)
-    logits =
-        learner.approximator.actor(send_to_device(
-            device(learner.approximator),
-            get_state(env),
-        )) |> send_to_host
-
-    if ActionStyle(env[1]) === FULL_ACTION_SET
-        logits .+= typemin(eltype(logits)) .* (1 .- get_legal_actions_mask(env))
-    end
-    logits
+    learner.approximator.actor(send_to_device(
+        device(learner.approximator),
+        get_state(env),
+    )) |> send_to_host
 end

 function (learner::A2CLearner)(env)
     s = get_state(env)
     s = Flux.unsqueeze(s, ndims(s) + 1)
     s = send_to_device(device(learner.approximator), s)
-    logits = learner.approximator.actor(s) |> vec |> send_to_host
-
-    if ActionStyle(env) === FULL_ACTION_SET
-        logits .+= typemin(eltype(logits)) .* (1 .- get_legal_actions_mask(env))
-    end
-    logits
+    learner.approximator.actor(s) |> vec |> send_to_host
 end

 function RLBase.update!(learner::A2CLearner, t::AbstractTrajectory)
@@ -87,11 +76,6 @@ function RLBase.update!(learner::A2CLearner, t::AbstractTrajectory)
     ps = Flux.params(AC)
     gs = gradient(ps) do
         logits = AC.actor(states_flattened)
-        if haskey(t, :legal_actions_mask)
-            logits .+=
-                typemin(eltype(logits)) .*
-                (1 .- flatten_batch(send_to_device(D, t[:legal_actions_mask])))
-        end
         probs = softmax(logits)
         log_probs = logsoftmax(logits)
         log_probs_select = log_probs[actions]
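For A2C (and PPO below) the commit simply drops the logits masking from both the action-selection path and the gradient block; it does not substitute the fill/assign pattern here. As an aside, if one did want to mask logits before softmax, overwriting illegal entries rather than adding a term avoids the same -Inf32 * 0 pitfall. A hypothetical sketch, not something this commit introduces:

```julia
logits = Float32[0.5, 1.5, -0.3]
legal_actions_mask = Bool[1, 0, 1]

# Overwrite illegal entries instead of adding -Inf32 .* (1 .- mask),
# which would turn the legal entries into NaN.
masked_logits = ifelse.(legal_actions_mask, logits, typemin(Float32))
probs = exp.(masked_logits) ./ sum(exp.(masked_logits))   # softmax by hand
# probs[2] == 0.0f0 and probs[1] + probs[3] ≈ 1.0f0
```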

src/algorithms/policy_gradient/ppo.jl

Lines changed: 5 additions & 19 deletions
@@ -75,28 +75,17 @@ function PPOLearner(;
 end

 function (learner::PPOLearner)(env::MultiThreadEnv)
-    logits =
-        learner.approximator.actor(send_to_device(
-            device(learner.approximator),
-            get_state(env),
-        )) |> send_to_host
-
-    if ActionStyle(env[1]) === FULL_ACTION_SET
-        logits .+= typemin(eltype(logits)) .* (1 .- get_legal_actions_mask(env))
-    end
-    logits
+    learner.approximator.actor(send_to_device(
+        device(learner.approximator),
+        get_state(env),
+    )) |> send_to_host
 end

 function (learner::PPOLearner)(env)
     s = get_state(env)
     s = Flux.unsqueeze(s, ndims(s) + 1)
     s = send_to_device(device(learner.approximator), s)
-    logits = learner.approximator.actor(s) |> vec |> send_to_host
-
-    if ActionStyle(env) === FULL_ACTION_SET
-        logits .+= typemin(eltype(logits)) .* (1 .- get_legal_actions_mask(env))
-    end
-    logits
+    learner.approximator.actor(s) |> vec |> send_to_host
 end

 function RLBase.update!(learner::PPOLearner, t::PPOTrajectory)
@@ -154,9 +143,6 @@ function RLBase.update!(learner::PPOLearner, t::PPOTrajectory)
     gs = gradient(ps) do
         v′ = AC.critic(s) |> vec
         logit′ = AC.actor(s)
-        if haskey(t, :legal_actions_mask)
-            logit′ .+= typemin(eltype(logit′)) .* (1 .- lam)
-        end
         p′ = softmax(logit′)
         log_p′ = logsoftmax(logit′)
         log_p′ₐ = log_p′[CartesianIndex.(a, 1:length(a))]
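Unrelated to the mask itself, the line kept at the end of this hunk uses CartesianIndex broadcasting to gather, for each batch column, the log-probability of the action actually taken. A toy version with illustrative numbers, not code from the repository:

```julia
log_p′ = log.(Float32[0.2 0.5; 0.8 0.5])   # n_actions × batch
a = [2, 1]                                 # action 2 was taken in column 1, action 1 in column 2

log_p′ₐ = log_p′[CartesianIndex.(a, 1:length(a))]
# ≈ Float32[log(0.8), log(0.5)]
```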

0 commit comments
