
Commit 2ddf949

Support rlintro (#144)
* rename TabularLearner to TabularRandomPolicy
* sync chapter01
* sync changes related to RLIntro
* add double q learner
* add Value based TDLearner
* add tabular dyna agent
* add LinearApproximator
* add TDλReturnLearner
* sync
* minor bugfix
* fix tests
* minor bugfix due to RLCore
* bump version
Parent: 335662a

27 files changed (+1314 −213 lines)

Project.toml

Lines changed: 3 additions & 2 deletions
@@ -1,13 +1,14 @@
 name = "ReinforcementLearningZoo"
 uuid = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
 authors = ["Jun Tian <tianjun.cpp@gmail.com>"]
-version = "0.3.1"
+version = "0.3.2"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
 CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
 Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c"

@@ -38,7 +39,7 @@ Flux = "0.11"
 IntervalSets = "0.5"
 MacroTools = "0.5"
 ReinforcementLearningBase = "0.9"
-ReinforcementLearningCore = "0.6.3"
+ReinforcementLearningCore = "0.7"
 Requires = "1"
 Setfield = "0.6, 0.7"
 StableRNGs = "1.0"

src/algorithms/algorithms.jl

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+include("tabular/tabular.jl")
 include("dqns/dqns.jl")
 include("policy_gradient/policy_gradient.jl")
 include("searching/searching.jl")

src/algorithms/cfr/external_sampling_mccfr.jl

Lines changed: 1 addition & 2 deletions
@@ -29,7 +29,6 @@ function ExternalSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_
         TabularRandomPolicy(;
             rng = rng,
             table = Dict{state_type,Vector{Float64}}(),
-            is_normalized = true,
         ),
         rng,
     )

@@ -44,7 +43,7 @@ function RLBase.update!(p::ExternalSamplingMCCFRPolicy)
             strategy[m] .= v.cumulative_strategy ./ s
             update!(p.behavior_policy, k => strategy)
         else
-            # The TabularLearner will return uniform distribution by default.
+            # The TabularRandomPolicy will return uniform distribution by default.
             # So we do nothing here.
         end
     end

src/algorithms/cfr/outcome_sampling_mccfr.jl

Lines changed: 1 addition & 2 deletions
@@ -30,7 +30,6 @@ function OutcomeSamplingMCCFRPolicy(; state_type = String, rng = Random.GLOBAL_R
         TabularRandomPolicy(;
             rng = rng,
             table = Dict{state_type,Vector{Float64}}(),
-            is_normalized = true,
         ),
         ϵ,
         rng,

@@ -55,7 +54,7 @@ function RLBase.update!(p::OutcomeSamplingMCCFRPolicy)
             strategy[m] .= v.cumulative_strategy ./ s
             update!(p.behavior_policy, k => strategy)
         else
-            # The TabularLearner will return uniform distribution by default.
+            # The TabularRandomPolicy will return uniform distribution by default.
             # So we do nothing here.
         end
     end

src/algorithms/cfr/tabular_cfr.jl

Lines changed: 2 additions & 3 deletions
@@ -23,7 +23,7 @@ end
 
 mutable struct TabularCFRPolicy{S,T,R<:AbstractRNG} <: AbstractCFRPolicy
     nodes::Dict{S,InfoStateNode}
-    behavior_policy::QBasedPolicy{TabularLearner{S,T},WeightedExplorer{true,R}}
+    behavior_policy::TabularRandomPolicy{S,T,R}
     is_reset_neg_regrets::Bool
     is_linear_averaging::Bool
     weighted_averaging_delay::Int

@@ -70,7 +70,6 @@ function TabularCFRPolicy(;
         TabularRandomPolicy(;
             rng = rng,
             table = Dict{state_type,Vector{Float64}}(),
-            is_normalized = true,
         ),
         is_reset_neg_regrets,
         is_linear_averaging,

@@ -91,7 +90,7 @@ function RLBase.update!(p::TabularCFRPolicy)
             strategy[m] .= v.cumulative_strategy ./ s
             update!(p.behavior_policy, k => strategy)
         else
-            # The TabularLearner will return uniform distribution by default.
+            # The TabularRandomPolicy will return uniform distribution by default.
             # So we do nothing here.
         end
     end
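
Note: in all three CFR policies above, the behavior policy is now a plain TabularRandomPolicy rather than a QBasedPolicy wrapping a TabularLearner, and the is_normalized keyword is gone. Below is a minimal sketch of how such a policy can be built and filled in, assuming only the constructor keywords and the key => probabilities form of update! that appear in the hunks above; TabularRandomPolicy is provided by ReinforcementLearningCore, and any name not shown in those hunks is an assumption for illustration.

    using Random
    using ReinforcementLearningCore  # assumed to export TabularRandomPolicy and update!

    # Behavior policy keyed by string-encoded information states, as in the CFR code.
    policy = TabularRandomPolicy(;
        rng = Random.GLOBAL_RNG,
        table = Dict{String,Vector{Float64}}(),
    )

    # Store an explicit strategy for one (hypothetical) information state.
    # States that were never updated fall back to a uniform distribution,
    # which is why the else branches above do nothing.
    update!(policy, "some_info_state" => [0.25, 0.75])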

src/algorithms/policy_gradient/MAC.jl

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ function (learner::MACLearner)(env)
     learner.approximator.actor(s) |> vec |> send_to_host
 end
 
-function RLBase.update!(learner::MACLearner, t::CircularArraySARTTrajectory)
+function RLBase.update!(learner::MACLearner, t::CircularArraySARTTrajectory, ::AbstractEnv, ::PreActStage)
     length(t) == 0 && return # in the first update, only state & action is inserted into trajectory
     learner.update_step += 1
     if learner.update_step % learner.update_freq == 0
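
Note: the same signature change is applied to the DDPG, PPO, SAC, and TD3 policies below: RLBase.update! now also receives the environment and a PreActStage marker, matching the stage-aware update hooks that come with ReinforcementLearningCore 0.7. A rough sketch of the dispatch pattern follows, using a hypothetical MyLearner; the surrounding types are assumed to be re-exported by ReinforcementLearningCore, and this is an illustration rather than the package's actual code.

    using ReinforcementLearningCore   # assumed to provide the trajectory, env, and stage types
    import ReinforcementLearningBase  # update! is extended on RLBase, as in the hunks above
    const RLBase = ReinforcementLearningBase

    struct MyLearner end  # hypothetical learner, for illustration only

    # This method only matches right before the agent acts; calls made at other
    # stages (e.g. PostActStage, PostEpisodeStage) simply do not hit it.
    function RLBase.update!(
        learner::MyLearner,
        t::CircularArraySARTTrajectory,
        ::AbstractEnv,
        ::PreActStage,
    )
        length(t) == 0 && return  # first call: only a state and action are stored
        # ... sample from t and take a learning step here ...
    end

Because the stage is part of the method signature, each learner can pick the point in the agent/environment loop where it reacts, instead of checking flags at runtime.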

src/algorithms/policy_gradient/ddpg.jl

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ function (p::DDPGPolicy)(env)
     end
 end
 
-function RLBase.update!(p::DDPGPolicy, traj::CircularArraySARTTrajectory)
+function RLBase.update!(p::DDPGPolicy, traj::CircularArraySARTTrajectory, ::AbstractEnv, ::PreActStage)
     length(traj) > p.update_after || return
     p.step % p.update_every == 0 || return
     inds, batch = sample(p.rng, traj, BatchSampler{SARTS}(p.batch_size))

src/algorithms/policy_gradient/ppo.jl

Lines changed: 1 addition & 1 deletion
@@ -182,7 +182,7 @@ function (agent::Agent{<:RandomStartPolicy{<:PPOPolicy}})(env::AbstractEnv)
     end
 end
 
-function RLBase.update!(p::PPOPolicy, t::Union{PPOTrajectory, MaskedPPOTrajectory})
+function RLBase.update!(p::PPOPolicy, t::Union{PPOTrajectory, MaskedPPOTrajectory}, ::AbstractEnv, ::PreActStage)
     length(t) == 0 && return # in the first update, only state & action is inserted into trajectory
     p.update_step += 1
     if p.update_step % p.update_freq == 0

src/algorithms/policy_gradient/sac.jl

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ function evaluate(p::SACPolicy, state)
     return tanh.(z), logp_π
 end
 
-function RLBase.update!(p::SACPolicy, traj::CircularArraySARTTrajectory)
+function RLBase.update!(p::SACPolicy, traj::CircularArraySARTTrajectory, ::AbstractEnv, ::PreActStage)
     length(traj) > p.update_after || return
     p.step % p.update_every == 0 || return
     inds, batch = sample(p.rng, traj, BatchSampler{SARTS}(p.batch_size))

src/algorithms/policy_gradient/td3.jl

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ function (p::TD3Policy)(env)
     end
 end
 
-function RLBase.update!(p::TD3Policy, traj::CircularArraySARTTrajectory)
+function RLBase.update!(p::TD3Policy, traj::CircularArraySARTTrajectory, ::AbstractEnv, ::PreActStage)
     length(traj) > p.update_after || return
     p.step % p.update_every == 0 || return
     inds, batch = sample(p.rng, traj, BatchSampler{SARTS}(p.batch_size))
