This repository was archived by the owner on May 6, 2021. It is now read-only.

Commit 28fcb99

Multi agent related changes (#83)
* add experiment of snake game
* sync
* add Experiment for Minimax
* add Experiment for CFRPolicy
* fix CFR
* add more experiments
* add more experiments
* update dependency
* bump version
* add more info in README.md
* bugfix
* minor bugfix
* automatically decrease steps in CI
* increase steps in JuliaRL_TabularCFR_OpenSpiel
1 parent 39159f4 commit 28fcb99

12 files changed: 277 additions, 9 deletions

.travis.yml

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 language: julia
 os:
   - linux
-  - osx
+  # - osx
 julia:
   - 1.4
   - nightly

Project.toml

Lines changed: 4 additions & 3 deletions
@@ -1,7 +1,7 @@
 name = "ReinforcementLearningZoo"
 uuid = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
 authors = ["Jun Tian <tianjun.cpp@gmail.com>"]
-version = "0.1.6"
+version = "0.1.7"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -32,7 +32,7 @@ Distributions = "0.23"
 Flux = "0.11"
 MacroTools = "0.5"
 ReinforcementLearningBase = "0.8"
-ReinforcementLearningCore = "0.4.1"
+ReinforcementLearningCore = "0.4.2"
 Requires = "1"
 Setfield = "0.6, 0.7"
 StatsBase = "0.32, 0.33"
@@ -41,8 +41,9 @@ Zygote = "0.5"
 julia = "1.4"
 
 [extras]
+OpenSpiel = "ceb70bd2-fe3f-44f0-b81f-41608acaf2f2"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "ReinforcementLearningEnvironments"]
+test = ["Test", "ReinforcementLearningEnvironments", "OpenSpiel"]
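`OpenSpiel` is added as a test-only dependency: it is listed under `[extras]` and referenced from the `test` target, so it is only pulled in when the package tests run. A minimal sketch of triggering those tests with the standard Pkg API (not part of this commit):

    using Pkg
    Pkg.test("ReinforcementLearningZoo")   # installs the extras listed under the `test` target, including OpenSpiel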

README.md

Lines changed: 6 additions & 1 deletion
@@ -24,6 +24,8 @@ This project aims to provide some implementations of the most typical reinforcem
 - PPO
 - DDPG
 - SAC
+- CFR
+- Minimax
 
 If you are looking for tabular reinforcement learning algorithms, you may refer to [ReinforcementLearningAnIntroduction.jl](https://github.com/JuliaReinforcementLearning/ReinforcementLearningAnIntroduction.jl).
 
@@ -45,6 +47,9 @@ Some built-in experiments are exported to help new users to easily run benchmark
 - ``E`JuliaRL_SAC_Pendulum` `` (Thanks to [@rbange](https://github.com/rbange))
 - ``E`JuliaRL_BasicDQN_MountainCar` `` (Thanks to [@felixchalumeau](https://github.com/felixchalumeau))
 - ``E`JuliaRL_DQN_MountainCar` `` (Thanks to [@felixchalumeau](https://github.com/felixchalumeau))
+- ``E`JuliaRL_Minimax_OpenSpiel(tic_tac_toe)` ``
+- ``E`JuliaRL_TabularCFR_OpenSpiel(kuhn_poker)` ``
+- ``E`JuliaRL_DQN_SnakeGame` ``
 - ``E`Dopamine_DQN_Atari(pong)` ``
 - ``E`Dopamine_Rainbow_Atari(pong)` ``
 - ``E`Dopamine_IQN_Atari(pong)` ``
@@ -56,7 +61,7 @@ Some built-in experiments are exported to help new users to easily run benchmark
 - Experiments on `CartPole` usually run faster with CPU only due to the overhead of sending data between CPU and GPU.
 - It shouldn't surprise you that our experiments on `CartPole` are much faster than those written in Python. The secret is that our environment is written in Julia!
 - Remember to set `JULIA_NUM_THREADS` to enable multi-threading when using algorithms like `A2C` and `PPO`.
-- Experiments on `Atari` are only available when you have `ArcadeLearningEnvironment.jl` installed and `using ArcadeLearningEnvironment`.
+- Experiments on `Atari` (`OpenSpiel`, `SnakeGame`) are only available after you have `ArcadeLearningEnvironment.jl` (`OpenSpiel.jl`, `SnakeGames.jl`) installed and have run `using ArcadeLearningEnvironment` (`using OpenSpiel`, `using SnakeGames`).
 
 ### Speed
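For example, once the optional package is loaded, one of the newly exported experiments can be started roughly like this (a sketch; it assumes the ``E`...` `` string macro and the `run` entry point from ReinforcementLearningCore are in scope):

    using ReinforcementLearningCore, ReinforcementLearningZoo
    using OpenSpiel   # loading the optional dependency makes the OpenSpiel experiments available

    run(E`JuliaRL_TabularCFR_OpenSpiel(kuhn_poker)`)   # train TabularCFRPolicy on Kuhn poker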

src/ReinforcementLearningZoo.jl

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ function __init__()
     include("experiments/rl_envs.jl")
     @require ArcadeLearningEnvironment = "b7f77d8d-088d-5e02-8ac0-89aab2acc977" include("experiments/atari.jl")
     @require SnakeGames = "34dccd9f-48d6-4445-aa0f-8c2e373b5429" include("experiments/snake.jl")
+    @require OpenSpiel = "ceb70bd2-fe3f-44f0-b81f-41608acaf2f2" include("experiments/open_spiel.jl")
 end
 end

src/algorithms/algorithms.jl

Lines changed: 2 additions & 0 deletions
@@ -1,2 +1,4 @@
 include("dqns/dqns.jl")
 include("policy_gradient/policy_gradient.jl")
+include("searching/searching.jl")
+include("cfr/cfr.jl")

src/algorithms/cfr/cfr.jl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+include("tabular_cfr.jl")

src/algorithms/cfr/tabular_cfr.jl

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+export TabularCFRPolicy
+
+struct InfoStateNode
+    strategy::Vector{Float64}
+    cumulative_regret::Vector{Float64}
+    cumulative_strategy::Vector{Float64}
+end
+
+InfoStateNode(n) = InfoStateNode(fill(1/n,n), zeros(n), zeros(n))
+
+function init_info_state_nodes(env::AbstractEnv)
+    nodes = Dict{String, InfoStateNode}()
+    walk(env) do x
+        if !get_terminal(x) && get_current_player(x) != get_chance_player(x)
+            get!(nodes, get_state(x), InfoStateNode(length(get_legal_actions(x))))
+        end
+    end
+    nodes
+end
+
+"""
+    TabularCFRPolicy
+
+See more details: [An Introduction to Counterfactual Regret Minimization](http://modelai.gettysburg.edu/2013/cfr/cfr.pdf)
+"""
+struct TabularCFRPolicy{S,T,R<:AbstractRNG} <: AbstractPolicy
+    nodes::Dict{S, InfoStateNode}
+    behavior_policy::QBasedPolicy{TabularLearner{S,T}, WeightedExplorer{true,R}}
+end
+
+(p::TabularCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)
+
+RLBase.get_prob(p::TabularCFRPolicy, env::AbstractEnv) = get_prob(p.behavior_policy, env)
+
+"""
+    TabularCFRPolicy(;n_iter::Int, env::AbstractEnv)
+"""
+function TabularCFRPolicy(;n_iter::Int, env::AbstractEnv, rng=Random.GLOBAL_RNG, is_reset_neg_regrets=false, is_linear_averaging=false)
+    @assert NumAgentStyle(env) isa MultiAgent
+    @assert DynamicStyle(env) === SEQUENTIAL
+    @assert RewardStyle(env) === TERMINAL_REWARD
+    @assert ChanceStyle(env) === EXPLICIT_STOCHASTIC
+    @assert DefaultStateStyle(env) === Information{String}()
+
+    nodes = init_info_state_nodes(env)
+
+    for i in 1:n_iter
+        for p in get_players(env)
+            if p != get_chance_player(env)
+                init_reach_prob = Dict(x=>1.0 for x in get_players(env) if x != get_chance_player(env))
+                cfr!(nodes, env, p, init_reach_prob, 1.0, is_linear_averaging ? i : 1)
+                update_strategy!(nodes)
+
+                if is_reset_neg_regrets
+                    for node in values(nodes)
+                        node.cumulative_regret .= max.(node.cumulative_regret, 0)
+                    end
+                end
+            end
+        end
+    end
+
+    behavior_policy = QBasedPolicy(;learner=TabularLearner{String}(), explorer=WeightedExplorer(;is_normalized=true, rng=rng))
+
+    for (k,v) in nodes
+        s = sum(v.cumulative_strategy)
+        if s != 0
+            update!(behavior_policy, k => v.cumulative_strategy ./ s)
+        end
+    end
+
+    TabularCFRPolicy(nodes, behavior_policy)
+end
+
+function cfr!(nodes, env, player, reach_probs, chance_player_reach_prob, ratio)
+    if get_terminal(env)
+        get_reward(env, player)
+    else
+        if get_current_player(env) == get_chance_player(env)
+            v = 0.
+            for a::ActionProbPair in get_legal_actions(env)
+                v += a.prob * cfr!(nodes, child(env, a), player, reach_probs, chance_player_reach_prob * a.prob, ratio)
+            end
+            v
+        else
+            v = 0.
+            node = nodes[get_state(env)]
+            legal_actions = get_legal_actions(env)
+            U = player == get_current_player(env) ? Vector{Float64}(undef, length(legal_actions)) : nothing
+
+            for (i, action) in enumerate(legal_actions)
+                prob = node.strategy[i]
+                new_reach_probs = copy(reach_probs)
+                new_reach_probs[get_current_player(env)] *= prob
+
+                u = cfr!(nodes, child(env, action), player, new_reach_probs, chance_player_reach_prob, ratio)
+                isnothing(U) || (U[i] = u)
+                v += prob * u
+            end
+
+            if player == get_current_player(env)
+                reach_prob = reach_probs[player]
+                counterfactual_reach_prob = reduce(
+                    *,
+                    (reach_probs[p] for p in get_players(env) if p != player && p != get_chance_player(env));
+                    init=chance_player_reach_prob)
+                node.cumulative_regret .+= counterfactual_reach_prob .* (U .- v)
+                node.cumulative_strategy .+= ratio .* reach_prob .* node.strategy
+            end
+            v
+        end
+    end
+end
+
+function regret_matching!(strategy, cumulative_regret)
+    s = mapreduce(x->max(0,x), +, cumulative_regret)
+    if s > 0
+        strategy .= max.(0., cumulative_regret) ./ s
+    else
+        fill!(strategy, 1/length(strategy))
+    end
+end
+
+function update_strategy!(nodes)
+    for node in values(nodes)
+        regret_matching!(node.strategy, node.cumulative_regret)
+    end
+end
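For intuition about the `regret_matching!` rule above, here is a standalone sketch (plain Julia, hypothetical non-mutating helper, not part of the commit): positive cumulative regrets are normalized into a strategy, and when no regret is positive the strategy falls back to uniform.

    # Non-mutating copy of the regret-matching rule, for illustration only.
    function regret_matching(cumulative_regret::Vector{Float64})
        s = sum(x -> max(0.0, x), cumulative_regret)   # total positive regret
        if s > 0
            max.(0.0, cumulative_regret) ./ s          # play in proportion to positive regret
        else
            fill(1 / length(cumulative_regret), length(cumulative_regret))   # uniform fallback
        end
    end

    regret_matching([2.0, -1.0, 1.0])   # ≈ [0.667, 0.0, 0.333]
    regret_matching([-2.0, -1.0])       # == [0.5, 0.5]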

src/algorithms/searching/minimax.jl

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+export MinimaxPolicy
+
+"""
+    MinimaxPolicy(;value_function, depth::Int)
+The minimax algorithm with [Alpha-beta pruning](https://en.wikipedia.org/wiki/Alpha-beta_pruning)
+## Keyword Arguments
+- `maximum_depth::Int=30`, the maximum depth of search.
+- `value_function=nothing`, estimate the value of `env`. `value_function(env) -> Number`. It is only called after searching for `maximum_depth` and the `env` is not terminated yet.
+"""
+Base.@kwdef mutable struct MinimaxPolicy{F} <: AbstractPolicy
+    maximum_depth::Int = 30
+    value_function::F = nothing
+    v::Float64 = 0.
+end
+
+(p::MinimaxPolicy)(env::AbstractEnv) = p(env, DynamicStyle(env), NumAgentStyle(env))
+
+function (p::MinimaxPolicy)(env::AbstractEnv, ::Sequential, ::MultiAgent{2})
+    if get_terminal(env)
+        rand(get_actions(env)) # just a dummy action
+    else
+        a, v = α_β_search(env, p.value_function, p.maximum_depth, -Inf, Inf, get_current_player(env))
+        p.v = v # for debug only
+        a
+    end
+end
+
+function α_β_search(env::AbstractEnv, value_function, depth, α, β, maximizing_role)
+    if get_terminal(env)
+        nothing, get_reward(env, maximizing_role)
+    elseif depth == 0
+        nothing, value_function(env)
+    elseif get_current_player(env) == maximizing_role
+        legal_actions = get_legal_actions(env)
+        best_action = legal_actions[1]
+        v = -Inf
+        for a in legal_actions
+            node = child(env, a)
+            _, v_node = α_β_search(node, value_function, depth-1, α, β, maximizing_role)
+            if v_node > v
+                v = v_node
+                best_action = a
+            end
+            α = max(α, v)
+            α >= β && break # β cut-off
+        end
+        best_action, v
+    else
+        legal_actions = get_legal_actions(env)
+        best_action = legal_actions[1]
+        v = Inf
+        for a in legal_actions
+            node = child(env, a)
+            _, v_node = α_β_search(node, value_function, depth-1, α, β, maximizing_role)
+            if v_node < v
+                v = v_node
+                best_action = a
+            end
+            β = min(β, v)
+            β <= α && break # α cut-off
+        end
+        best_action, v
+    end
+end
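A minimal usage sketch (hypothetical: `env` stands for any two-player, sequential `AbstractEnv`, and the constant heuristic is illustrative only):

    # Depth-limited alpha-beta search; value_function is consulted once maximum_depth is reached.
    policy = MinimaxPolicy(; maximum_depth = 4, value_function = env -> 0.0)
    action = policy(env)   # runs α_β_search from the current position
    policy.v               # value of the chosen action (kept for debugging)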

src/algorithms/searching/searching.jl

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+include("minimax.jl")

src/experiments/open_spiel.jl

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+using Random
+
+function RLCore.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:Minimax},
+    ::Val{:OpenSpiel},
+    game;
+)
+    env = OpenSpielEnv(string(game))
+    agents = (
+        Agent(policy=MinimaxPolicy(), role=0),
+        Agent(policy=MinimaxPolicy(), role=1)
+    )
+    hooks = (TotalRewardPerEpisode(), TotalRewardPerEpisode())
+    description="""
+    # Play `$game` in OpenSpiel with Minimax
+    """
+    Experiment(agents, env, StopAfterEpisode(1), hooks, description)
+end
+
+function RLCore.Experiment(
+    ::Val{:JuliaRL},
+    ::Val{:TabularCFR},
+    ::Val{:OpenSpiel},
+    game;
+    n_iter=300,
+    seed=123
+)
+    env = OpenSpielEnv(game; default_state_style=RLBase.Information{String}(), is_chance_agent_required=true)
+    rng = MersenneTwister(seed)
+    π = TabularCFRPolicy(;n_iter=n_iter, env=env, rng=rng)
+
+    agents = map(get_players(env)) do p
+        if p == get_chance_player(env)
+            Agent(;policy=RandomPolicy(), role=p)
+        else
+            Agent(;policy=π, role=p)
+        end
+    end
+
+    hooks = [p == get_chance_player(env) ? EmptyHook() : TotalRewardPerEpisode() for p in get_players(env)]
+    description="""
+    # Play `$game` in OpenSpiel with TabularCFRPolicy
+    """
+    Experiment(agents, env, StopAfterEpisode(100_000), hooks, description)
+end
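The same experiments can also be constructed directly from the definitions above (a sketch; it assumes OpenSpiel.jl is installed and that `run` accepts an `Experiment`, as with the other built-in experiments):

    using ReinforcementLearningCore, ReinforcementLearningZoo, OpenSpiel

    ex = Experiment(Val(:JuliaRL), Val(:TabularCFR), Val(:OpenSpiel), "kuhn_poker"; n_iter = 300, seed = 123)
    run(ex)   # trains TabularCFRPolicy, then plays 100_000 episodes with the recorded hooks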
