
Commit 34aea90

Add MCCFR (#90)
* add outcome_sampling_mccfr
* add esmccfr
* update README.md
1 parent 7829fa5 commit 34aea90

File tree

5 files changed: +198 −2 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ This project aims to provide some implementations of the most typical reinforcement
 - DDPG
 - TD3
 - SAC
-- CFR
+- CFR/OS-MCCFR/ES-MCCFR
 - Minimax
 
 If you are looking for tabular reinforcement learning algorithms, you may refer [ReinforcementLearningAnIntroduction.jl](https://github.com/JuliaReinforcementLearning/ReinforcementLearningAnIntroduction.jl).

src/algorithms/cfr/cfr.jl

Lines changed: 2 additions & 0 deletions
@@ -1 +1,3 @@
 include("tabular_cfr.jl")
+include("outcome_sampling_mccfr.jl")
+include("external_sampling_mccfr.jl")

src/algorithms/cfr/external_sampling_mccfr.jl

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
export ExternalSamplingMCCFRPolicy

using Random
using StatsBase: sample, Weights

"""
    ExternalSamplingMCCFRPolicy

This implementation uses stochastically-weighted averaging.

Ref:

- [MONTE CARLO SAMPLING AND REGRET MINIMIZATION FOR EQUILIBRIUM COMPUTATION AND DECISION-MAKING IN LARGE EXTENSIVE FORM GAMES](http://mlanctot.info/files/papers/PhD_Thesis_MarcLanctot.pdf)
- [Monte Carlo Sampling for Regret Minimization in Extensive Games](https://papers.nips.cc/paper/3713-monte-carlo-sampling-for-regret-minimization-in-extensive-games.pdf)
"""
struct ExternalSamplingMCCFRPolicy{S,T,R<:AbstractRNG} <: AbstractPolicy
    nodes::Dict{S,InfoStateNode}
    behavior_policy::QBasedPolicy{TabularLearner{S,T},WeightedExplorer{true,R}}
end

(p::ExternalSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

RLBase.get_prob(p::ExternalSamplingMCCFRPolicy, env::AbstractEnv) = get_prob(p.behavior_policy, env)

function ExternalSamplingMCCFRPolicy(;
    env::AbstractEnv,
    n_iter::Int,
    rng = Random.GLOBAL_RNG,
)
    @assert NumAgentStyle(env) isa MultiAgent
    @assert DynamicStyle(env) === SEQUENTIAL
    @assert RewardStyle(env) === TERMINAL_REWARD
    @assert ChanceStyle(env) === EXPLICIT_STOCHASTIC
    @assert DefaultStateStyle(env) === Information{String}()

    nodes = init_info_state_nodes(env)

    for i in 1:n_iter
        for p in get_players(env)
            if p != get_chance_player(env)
                external_sampling(copy(env), p, nodes, rng)
            end
        end
    end

    behavior_policy = QBasedPolicy(;
        learner = TabularLearner{String}(),
        explorer = WeightedExplorer(; is_normalized = true, rng = rng),
    )

    for (k, v) in nodes
        s = sum(v.cumulative_strategy)
        if s != 0
            update!(behavior_policy, k => v.cumulative_strategy ./ s)
        end
    end

    ExternalSamplingMCCFRPolicy(nodes, behavior_policy)
end

function external_sampling(env, i, nodes, rng)
    current_player = get_current_player(env)

    if get_terminal(env)
        get_reward(env, i)
    elseif current_player == get_chance_player(env)
        env(rand(rng, get_actions(env)))
        external_sampling(env, i, nodes, rng)
    else
        I = get_state(env)
        node = nodes[I]
        regret_matching!(node)
        σ, rI, sI = node.strategy, node.cumulative_regret, node.cumulative_strategy
        n = length(node.strategy)

        if i == current_player
            u = zeros(n)
            ū = 0  # expected value of the current strategy at this infostate
            for (aᵢ, a) in enumerate(get_legal_actions(env))
                u[aᵢ] = external_sampling(child(env, a), i, nodes, rng)
                ū += σ[aᵢ] * u[aᵢ]
            end
            rI .+= u .- ū  # accumulate counterfactual regrets
            ū
        else
            a′ = sample(rng, Weights(σ, 1.0))
            env(get_legal_actions(env)[a′])
            u = external_sampling(env, i, nodes, rng)
            sI .+= σ  # stochastically-weighted average strategy update
            u
        end
    end
end
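
Not part of the commit, but for orientation, a minimal usage sketch of the new policy. It assumes the surrounding package (and RLBase) are loaded and that an environment satisfying the asserted traits is available, e.g. KuhnPokerEnv from ReinforcementLearningEnvironments; the environment name and iteration count here are illustrative, not taken from this diff.

using Random
using ReinforcementLearningEnvironments: KuhnPokerEnv  # assumed environment, not part of this commit

env = KuhnPokerEnv()
rng = MersenneTwister(123)

# Run external-sampling MCCFR for a fixed number of iterations,
# then query the learned average strategy at the current information state.
p = ExternalSamplingMCCFRPolicy(; env = env, n_iter = 10_000, rng = rng)
get_prob(p, env)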

src/algorithms/cfr/outcome_sampling_mccfr.jl

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
export OutcomeSamplingMCCFRPolicy

using Random
using StatsBase: sample, Weights

"""
    OutcomeSamplingMCCFRPolicy

This implementation uses stochastically-weighted averaging.

Ref:

- [MONTE CARLO SAMPLING AND REGRET MINIMIZATION FOR EQUILIBRIUM COMPUTATION AND DECISION-MAKING IN LARGE EXTENSIVE FORM GAMES](http://mlanctot.info/files/papers/PhD_Thesis_MarcLanctot.pdf)
- [Monte Carlo Sampling for Regret Minimization in Extensive Games](https://papers.nips.cc/paper/3713-monte-carlo-sampling-for-regret-minimization-in-extensive-games.pdf)
"""
struct OutcomeSamplingMCCFRPolicy{S,T,R<:AbstractRNG} <: AbstractPolicy
    nodes::Dict{S,InfoStateNode}
    behavior_policy::QBasedPolicy{TabularLearner{S,T},WeightedExplorer{true,R}}
end

(p::OutcomeSamplingMCCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)

RLBase.get_prob(p::OutcomeSamplingMCCFRPolicy, env::AbstractEnv) = get_prob(p.behavior_policy, env)

function OutcomeSamplingMCCFRPolicy(;
    env::AbstractEnv,
    n_iter::Int,
    rng = Random.GLOBAL_RNG,
    ϵ = 0.6,
)
    @assert NumAgentStyle(env) isa MultiAgent
    @assert DynamicStyle(env) === SEQUENTIAL
    @assert RewardStyle(env) === TERMINAL_REWARD
    @assert ChanceStyle(env) === EXPLICIT_STOCHASTIC
    @assert DefaultStateStyle(env) === Information{String}()

    nodes = init_info_state_nodes(env)

    for i in 1:n_iter
        for p in get_players(env)
            if p != get_chance_player(env)
                outcome_sampling(copy(env), p, nodes, ϵ, 1.0, 1.0, 1.0, rng)
            end
        end
    end

    behavior_policy = QBasedPolicy(;
        learner = TabularLearner{String}(),
        explorer = WeightedExplorer(; is_normalized = true, rng = rng),
    )

    for (k, v) in nodes
        s = sum(v.cumulative_strategy)
        if s != 0
            update!(behavior_policy, k => v.cumulative_strategy ./ s)
        end
    end

    OutcomeSamplingMCCFRPolicy(nodes, behavior_policy)
end

function outcome_sampling(env, i, nodes, ϵ, πᵢ, π₋ᵢ, s, rng)
    current_player = get_current_player(env)

    if get_terminal(env)
        get_reward(env, i) / s, 1.0
    elseif current_player == get_chance_player(env)
        env(rand(rng, get_actions(env)))
        outcome_sampling(env, i, nodes, ϵ, πᵢ, π₋ᵢ, s, rng)
    else
        I = get_state(env)
        node = nodes[I]
        regret_matching!(node)
        σ, rI, sI = node.strategy, node.cumulative_regret, node.cumulative_strategy
        n = length(node.strategy)

        if i == current_player
            # ϵ-greedy sampling: explore uniformly with probability ϵ
            aᵢ = rand(rng) >= ϵ ? sample(rng, Weights(σ, 1.0)) : rand(rng, 1:n)
            pᵢ = σ[aᵢ] * (1 - ϵ) + ϵ / n
            πᵢ′, π₋ᵢ′, s′ = πᵢ * pᵢ, π₋ᵢ, s * pᵢ
        else
            aᵢ = sample(rng, Weights(σ, 1.0))
            pᵢ = σ[aᵢ]
            πᵢ′, π₋ᵢ′, s′ = πᵢ, π₋ᵢ * pᵢ, s * pᵢ
        end

        env(get_legal_actions(env)[aᵢ])
        u, πₜₐᵢₗ = outcome_sampling(env, i, nodes, ϵ, πᵢ′, π₋ᵢ′, s′, rng)

        if i == current_player
            w = u * π₋ᵢ
            rI .+= w * πₜₐᵢₗ .* ((1:n .== aᵢ) .- σ[aᵢ])
        else
            sI .+= π₋ᵢ / s .* σ
        end

        u, πₜₐᵢₗ * σ[aᵢ]
    end
end
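
Again a hedged usage sketch, not taken from the diff: outcome sampling updates a single sampled trajectory per iteration, so it typically needs more iterations than external sampling, and ϵ controls how often the traversing player explores uniformly instead of following its current strategy. KuhnPokerEnv and the numbers below are illustrative assumptions.

using Random
using ReinforcementLearningEnvironments: KuhnPokerEnv  # assumed environment, not part of this commit

env = KuhnPokerEnv()

# One sampled trajectory per iteration; ϵ = 0.6 matches the constructor default.
p = OutcomeSamplingMCCFRPolicy(; env = env, n_iter = 100_000, rng = MersenneTwister(123), ϵ = 0.6)
get_prob(p, env)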

src/algorithms/cfr/tabular_cfr.jl

Lines changed: 3 additions & 1 deletion
@@ -144,6 +144,8 @@ function cfr!(nodes, env, player, reach_probs, chance_player_reach_prob, ratio)
     end
 end
 
+regret_matching!(node::InfoStateNode) = regret_matching!(node.strategy, node.cumulative_regret)
+
 function regret_matching!(strategy, cumulative_regret)
     s = mapreduce(x -> max(0, x), +, cumulative_regret)
     if s > 0
@@ -155,6 +157,6 @@ end
 
 function update_strategy!(nodes)
     for node in values(nodes)
-        regret_matching!(node.strategy, node.cumulative_regret)
+        regret_matching!(node)
     end
 end
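
For reference, regret_matching! normalizes the positive part of the cumulative regrets into the current strategy. Below is a standalone sketch of that computation with made-up numbers; it is independent of InfoStateNode, and the uniform fallback shown is the standard regret-matching convention (the else branch of the repo's implementation is not visible in this hunk).

# Hypothetical cumulative regrets for three actions at one information state.
cumulative_regret = [2.0, -1.0, 3.0]

positive = max.(0, cumulative_regret)   # [2.0, 0.0, 3.0]
s = sum(positive)                       # 5.0

# Positive regrets normalized into a strategy; fall back to uniform when none are positive.
strategy = s > 0 ? positive ./ s : fill(1 / length(cumulative_regret), length(cumulative_regret))
# strategy == [0.4, 0.0, 0.6]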
