@@ -6,10 +6,10 @@ struct InfoStateNode
     cumulative_strategy::Vector{Float64}
 end
 
-InfoStateNode(n) = InfoStateNode(fill(1/n, n), zeros(n), zeros(n))
+InfoStateNode(n) = InfoStateNode(fill(1 / n, n), zeros(n), zeros(n))
 
 function init_info_state_nodes(env::AbstractEnv)
-    nodes = Dict{String, InfoStateNode}()
+    nodes = Dict{String,InfoStateNode}()
     walk(env) do x
         if !get_terminal(x) && get_current_player(x) != get_chance_player(x)
             get!(nodes, get_state(x), InfoStateNode(length(get_legal_actions(x))))
@@ -24,8 +24,8 @@
 See more details: [An Introduction to Counterfactual Regret Minimization](http://modelai.gettysburg.edu/2013/cfr/cfr.pdf)
 """
 struct TabularCFRPolicy{S,T,R<:AbstractRNG} <: AbstractPolicy
-    nodes::Dict{S, InfoStateNode}
-    behavior_policy::QBasedPolicy{TabularLearner{S,T}, WeightedExplorer{true,R}}
+    nodes::Dict{S,InfoStateNode}
+    behavior_policy::QBasedPolicy{TabularLearner{S,T},WeightedExplorer{true,R}}
 end
 
 (p::TabularCFRPolicy)(env::AbstractEnv) = p.behavior_policy(env)
@@ -35,7 +35,13 @@ RLBase.get_prob(p::TabularCFRPolicy, env::AbstractEnv) = get_prob(p.behavior_pol
 """
     TabularCFRPolicy(;n_iter::Int, env::AbstractEnv)
 """
-function TabularCFRPolicy(;n_iter::Int, env::AbstractEnv, rng=Random.GLOBAL_RNG, is_reset_neg_regrets=false, is_linear_averaging=false)
+function TabularCFRPolicy(;
+    n_iter::Int,
+    env::AbstractEnv,
+    rng = Random.GLOBAL_RNG,
+    is_reset_neg_regrets = false,
+    is_linear_averaging = false,
+)
     @assert NumAgentStyle(env) isa MultiAgent
     @assert DynamicStyle(env) === SEQUENTIAL
     @assert RewardStyle(env) === TERMINAL_REWARD
@@ -47,7 +53,8 @@ function TabularCFRPolicy(;n_iter::Int, env::AbstractEnv, rng=Random.GLOBAL_RNG,
     for i in 1:n_iter
         for p in get_players(env)
             if p != get_chance_player(env)
-                init_reach_prob = Dict(x=>1.0 for x in get_players(env) if x != get_chance_player(env))
+                init_reach_prob =
+                    Dict(x => 1.0 for x in get_players(env) if x != get_chance_player(env))
                 cfr!(nodes, env, p, init_reach_prob, 1.0, is_linear_averaging ? i : 1)
                 update_strategy!(nodes)
 
@@ -60,9 +67,12 @@ function TabularCFRPolicy(;n_iter::Int, env::AbstractEnv, rng=Random.GLOBAL_RNG,
         end
     end
 
-    behavior_policy = QBasedPolicy(;learner=TabularLearner{String}(), explorer=WeightedExplorer(;is_normalized=true, rng=rng))
+    behavior_policy = QBasedPolicy(;
+        learner = TabularLearner{String}(),
+        explorer = WeightedExplorer(; is_normalized = true, rng = rng),
+    )
 
-    for (k,v) in nodes
+    for (k, v) in nodes
         s = sum(v.cumulative_strategy)
         if s != 0
             update!(behavior_policy, k => v.cumulative_strategy ./ s)
@@ -77,23 +87,39 @@ function cfr!(nodes, env, player, reach_probs, chance_player_reach_prob, ratio)
         get_reward(env, player)
     else
         if get_current_player(env) == get_chance_player(env)
-            v = 0.
+            v = 0.0
             for a::ActionProbPair in get_legal_actions(env)
-                v += a.prob * cfr!(nodes, child(env, a), player, reach_probs, chance_player_reach_prob * a.prob, ratio)
+                v +=
+                    a.prob * cfr!(
+                        nodes,
+                        child(env, a),
+                        player,
+                        reach_probs,
+                        chance_player_reach_prob * a.prob,
+                        ratio,
+                    )
             end
             v
         else
-            v = 0.
+            v = 0.0
             node = nodes[get_state(env)]
             legal_actions = get_legal_actions(env)
-            U = player == get_current_player(env) ? Vector{Float64}(undef, length(legal_actions)) : nothing
+            U = player == get_current_player(env) ?
+                Vector{Float64}(undef, length(legal_actions)) : nothing
 
             for (i, action) in enumerate(legal_actions)
                 prob = node.strategy[i]
                 new_reach_probs = copy(reach_probs)
                 new_reach_probs[get_current_player(env)] *= prob
 
-                u = cfr!(nodes, child(env, action), player, new_reach_probs, chance_player_reach_prob, ratio)
+                u = cfr!(
+                    nodes,
+                    child(env, action),
+                    player,
+                    new_reach_probs,
+                    chance_player_reach_prob,
+                    ratio,
+                )
                 isnothing(U) || (U[i] = u)
                 v += prob * u
             end
@@ -102,8 +128,13 @@ function cfr!(nodes, env, player, reach_probs, chance_player_reach_prob, ratio)
                 reach_prob = reach_probs[player]
                 counterfactual_reach_prob = reduce(
                     *,
-                    (reach_probs[p] for p in get_players(env) if p != player && p != get_chance_player(env));
-                    init=chance_player_reach_prob)
+                    (
+                        reach_probs[p]
+                        for
+                        p in get_players(env) if p != player && p != get_chance_player(env)
+                    );
+                    init = chance_player_reach_prob,
+                )
                 node.cumulative_regret .+= counterfactual_reach_prob .* (U .- v)
                 node.cumulative_strategy .+= ratio .* reach_prob .* node.strategy
             end
@@ -113,16 +144,16 @@ function cfr!(nodes, env, player, reach_probs, chance_player_reach_prob, ratio)
 end
 
 function regret_matching!(strategy, cumulative_regret)
-    s = mapreduce(x->max(0,x), +,cumulative_regret)
+    s = mapreduce(x -> max(0, x), +, cumulative_regret)
     if s > 0
-        strategy .= max.(0., cumulative_regret) ./ s
+        strategy .= max.(0.0, cumulative_regret) ./ s
     else
-        fill!(strategy, 1/length(strategy))
+        fill!(strategy, 1 / length(strategy))
     end
 end
 
 function update_strategy!(nodes)
     for node in values(nodes)
         regret_matching!(node.strategy, node.cumulative_regret)
     end
-end
+end
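
Note (not part of the diff): a minimal usage sketch of the keyword constructor shown above. Only the TabularCFRPolicy keywords and the callable-policy syntax come from the code in this diff; the package import and KuhnPokerEnv are assumptions about the surrounding ReinforcementLearning.jl setup and may need adjusting.

    # Hypothetical usage sketch; adjust names to your own setup.
    using ReinforcementLearning      # assumed to re-export TabularCFRPolicy and KuhnPokerEnv

    env = KuhnPokerEnv()             # a sequential, terminal-reward, multi-agent env (assumption)
    policy = TabularCFRPolicy(; n_iter = 300, env = env)   # runs n_iter CFR iterations during construction

    policy(env)                      # forwards to the internal behavior_policy, as defined in the diff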