@@ -28,8 +28,8 @@ class DBS(Player):
        used when computing discounted frequencies to learn opponent's
        strategy. Must be between 0 and 1. The default is 0.75
    promotion_threshold : int, optional
-        number of observations needed to promote a change in opponent's
-        strategy. The default is 3.
+        number of successive observations needed to promote an
+        opponent behavior as a deterministic rule. The default is 3.
    violation_threshold : int, optional
        number of observations needed to considerate opponent's
        strategy has changed. You can lower it when noise increases.
@@ -58,35 +58,65 @@ class DBS(Player):
    def __init__(self, discount_factor=.75, promotion_threshold=3,
                 violation_threshold=4, reject_threshold=3, tree_depth=5):
        super().__init__()
-
-        # default opponent's policy is TitForTat
+
+        # The opponent's behavior is represented by 3 dicts:
+        # Rd, Rc, and Rp.
+        # Its behavior is modeled by a set of rules. A rule is the move that
+        # the opponent will play (C or D or a probability to play C) after a
+        # given outcome (for instance after (C, D)).
+        # A rule can be deterministic or probabilistic:
+        # - Rc is the set of deterministic rules
+        # - Rp is the set of probabilistic rules
+        # - Rd is the default rule set, which is used for initialization but
+        #   also keeps track of previous policies when a change in the
+        #   opponent's behavior happens, in order to have a smooth transition
+        # - Pi is a set of rules that aggregates all above sets of rules in
+        #   order to fully model the opponent's behavior
+
+        # Default rule set Rd
+        # Default opponent's policy is TitForTat
        self.Rd = create_policy(1, 1, 0, 0)
+        # Set of current deterministic rules Rc
        self.Rc = {}
-        self.Pi = self.Rd  # policy used by MoveGen
+        # Aggregated rule set Pi
+        self.Pi = self.Rd
+        # For each rule in Rd we need to count the number of successive
+        # violations. Those counts are saved in violation_counts.
        self.violation_counts = {}
        self.reject_threshold = reject_threshold
        self.violation_threshold = violation_threshold
        self.promotion_threshold = promotion_threshold
        self.tree_depth = tree_depth
+        # v is a violation count used to know when to clean the default rule
+        # set Rd
        self.v = 0
+        # A discount factor for computing the probabilistic rules
        self.alpha = discount_factor
-        self.history_by_cond = {}
-        # to compute the discount frequencies, we need to keep
-        # up to date an history of what has been played for each
-        # condition:
+
+        # The probabilistic rule set Rp is not saved as an attribute; each
+        # rule is computed only when needed.
+        # The rules are computed as discounted frequencies of the opponent's
+        # past moves. To compute the discounted frequencies, we need to keep
+        # up to date a history of what has been played following each
+        # outcome (or condition):
        # We save it as a dict history_by_cond; keys are conditions
-        # (ex (C,C)) and values are a tuple of 2 lists (G,F)
-        # for a condition j:
+        # (ex (C, C)) and values are a tuple of 2 lists (G, F)
+        # for a condition j and an iteration i in the match:
        # G[i] = 1 if cond j was True at turn i-1 and C has been played
-        # by the opponent; else G[i]=0
-        # F[i] = 1 if cond j was True at turn i-1; else G[i]=0
+        # by the opponent; else G[i] = 0
+        # F[i] = 1 if cond j was True at turn i-1; else F[i] = 0
+        # This representation makes the computation of discounted frequencies
+        # easy and efficient.
        # initial hypothesized policy is TitForTat
-        self.history_by_cond[(C, C)] = ([1], [1])
-        self.history_by_cond[(C, D)] = ([1], [1])
-        self.history_by_cond[(D, C)] = ([0], [1])
-        self.history_by_cond[(D, D)] = ([0], [1])
+        self.history_by_cond = {
+            (C, C): ([1], [1]),
+            (C, D): ([1], [1]),
+            (D, C): ([0], [1]),
+            (D, D): ([0], [1]),
+        }

    def reset(self):
+        """ Reset instance properties. """
        super().reset()
        self.Rd = create_policy(1, 1, 0, 0)
        self.Rc = {}
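
To make the representation described in these new comments easier to picture, here is a minimal standalone sketch of the data layout, with plain string actions and literal dicts standing in for the library's Action constants and its create_policy helper; the names and values are illustrative only, not the module's actual code.

C, D = "C", "D"

# Rd: default rule set, hypothesizing TitForTat. The value is the
# probability that the opponent plays C after each condition.
Rd = {(C, C): 1, (C, D): 1, (D, C): 0, (D, D): 0}

# Rc: deterministic rules promoted from observation (empty at the start).
Rc = {}

# Pi: the aggregated view of the opponent's behavior; initially just Rd.
Pi = dict(Rd)

# history_by_cond: for each condition j, two parallel lists (G, F).
# F[i] = 1 if condition j held at turn i-1; G[i] = 1 if, in addition,
# the opponent then played C.
history_by_cond = {
    (C, C): ([1], [1]),
    (C, D): ([1], [1]),
    (D, C): ([0], [1]),
    (D, D): ([0], [1]),
}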
@@ -101,7 +131,26 @@ def reset(self):

    def should_promote(self, r_plus, promotion_threshold=3):
        """
-
+        This function determines whether the move r_plus is a deterministic
+        behavior of the opponent, in which case it returns True, or whether
+        r_plus is due to a random behavior (or noise) which would require a
+        probabilistic rule, in which case it returns False.
+
+        To do so it looks into the game history: if, the last K times
+        the opponent was in the same situation as in r_plus, it played
+        the same move, then r_plus is considered as a deterministic
+        rule (where K is the user-defined
+        promotion_threshold).
+
+        Parameters
+
+        r_plus : tuple of (tuple of actions.Actions, actions.Actions)
+            example: ((C, C), D)
+            r_plus represents one outcome of the history, and the
+            following move played by the opponent
+        promotion_threshold : int, optional
+            number of successive observations needed to promote an
+            opponent behavior as a deterministic rule. Default is 3.
        """
        if r_plus[1] == C:
            opposite_action = 0
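
A simplified sketch of the promotion test this docstring describes, assuming a flat list of (condition, opponent_move) observations instead of the G/F bookkeeping the class actually keeps; the function and variable names are hypothetical.

def should_promote_sketch(r_plus, observations, promotion_threshold=3):
    condition, move = r_plus
    # Opponent moves recorded for the past occurrences of this condition.
    same_condition = [m for (c, m) in observations if c == condition]
    last_k = same_condition[-promotion_threshold:]
    # Promote only if there are K observations and they all match `move`.
    return (len(last_k) == promotion_threshold
            and all(m == move for m in last_k))

C, D = "C", "D"
obs = [((C, C), D), ((C, D), C), ((C, C), D), ((D, C), C), ((C, C), D)]
# True: the last three times the condition (C, C) occurred,
# the opponent played D.
print(should_promote_sketch(((C, C), D), obs))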
@@ -127,9 +176,17 @@ def should_promote(self, r_plus, promotion_threshold=3):
        return False

    def should_demote(self, r_minus, violation_threshold=4):
+        """
+        Checks if the number of successive violations of a deterministic
+        rule (in the opponent's behavior) exceeds the user-defined
+        violation_threshold.
+        """
        return (self.violation_counts[r_minus[0]] >= violation_threshold)

    def update_history_by_cond(self, opponent_history):
+        """
+        Updates self.history_by_cond between each turn of the game.
+        """
        two_moves_ago = (self.history[-2], opponent_history[-2])
        for outcome, GF in self.history_by_cond.items():
            G, F = GF
@@ -143,7 +200,25 @@ def update_history_by_cond(self, opponent_history):
                G.append(0)
                F.append(0)

-    def compute_prob_rule(self, outcome, alpha):
+    def compute_prob_rule(self, outcome, alpha=1):
+        """
+        Uses the game history to compute the probability of the opponent
+        playing C, in the outcome situation
+        (example: outcome = (C, C)).
+        When alpha = 1, the result is approximately equal to the frequency
+        of the occurrence of outcome -> C.
+        alpha is a discount factor that gives more weight to recent
+        events than to earlier ones.
+
+        Parameters
+
+        outcome : tuple of two actions.Actions
+            in {(C, C), (C, D), (D, C), (D, D)}
+            We want to compute the probability that the opponent plays C
+            following this outcome in the game
+        alpha : float, optional
+            Discount factor. Default is 1.
+        """
        G = self.history_by_cond[outcome][0]
        F = self.history_by_cond[outcome][1]
        discounted_g = 0
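
To make the discounted frequency concrete, here is a small self-contained sketch with a worked number; the newest-first weighting order and the helper name are assumptions for illustration, not the method's exact code (the actual loop continues in the next hunk).

def discounted_prob_of_C(G, F, alpha=0.75):
    discounted_g, discounted_f, alpha_k = 0.0, 0.0, 1.0
    for g, f in zip(reversed(G), reversed(F)):  # newest observation first
        discounted_g += alpha_k * g
        discounted_f += alpha_k * f
        alpha_k *= alpha
    return discounted_g / discounted_f

# The opponent cooperated after this condition on the two most recent
# occasions but defected on an older one, so the estimate leans towards C.
G = [0, 1, 1]  # 1 when the condition held and the opponent then played C
F = [1, 1, 1]  # 1 whenever the condition held
print(discounted_prob_of_C(G, F))  # 1.75 / 2.3125, approximately 0.757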
@@ -153,25 +228,30 @@ def compute_prob_rule(self, outcome, alpha):
            discounted_g += alpha_k * g
            discounted_f += alpha_k * f
            alpha_k = alpha * alpha_k
-        p_cond = discounted_g / discounted_f
+        p_cond = discounted_g / discounted_f
        return p_cond

    def strategy(self, opponent: Player) -> Action:
-
        # First move
        if not self.history:
            return C

        if (len(opponent.history) >= 2):

-            # update history_by_cond
+            # We begin by updating history_by_cond
            # (i.e. update Rp)
            self.update_history_by_cond(opponent.history)

            two_moves_ago = (self.history[-2], opponent.history[-2])
+            # r_plus is the information of what the opponent just played,
+            # following the previous outcome two_moves_ago
            r_plus = (two_moves_ago, opponent.history[-1])
+            # r_minus is the opposite move, following the same outcome
            r_minus = (two_moves_ago, ({C, D} - {opponent.history[-1]}).pop())

+            # If r_plus and r_minus are not in the current set of deterministic
+            # rules, we check if r_plus should be added to it (following the
+            # rule defined in the should_promote function)
            if r_plus[0] not in self.Rc.keys():
                if self.should_promote(r_plus, self.promotion_threshold):
                    self.Rc[r_plus[0]] = action_to_int(r_plus[1])
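
A quick standalone illustration of how r_plus and r_minus are derived from the last two turns, as described in the comments above; string actions stand in for the Action constants and the histories are made up for the example.

C, D = "C", "D"
my_history = [C, C, D]
opp_history = [C, D, C]

two_moves_ago = (my_history[-2], opp_history[-2])              # (C, D)
r_plus = (two_moves_ago, opp_history[-1])                      # ((C, D), C)
# The opposite of the opponent's last move, via set difference:
r_minus = (two_moves_ago, ({C, D} - {opp_history[-1]}).pop())  # ((C, D), D)
print(r_plus, r_minus)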
@@ -187,9 +267,14 @@ def strategy(self, opponent: Player) -> Action:
                    self.violation_counts[r_plus[0]] = 0
                # (if r- in Rc)
                elif r_minus[1] == to_check:
-                    # increment violation count of r-
+                    # Increment violation count of r-
                    self.violation_counts[r_plus[0]] += 1
-                    if self.should_demote(r_minus,self.violation_threshold):
+                    # As we observe that the behavior of the opponent is
+                    # opposed to a rule modeled in Rc, we check if the number
+                    # of consecutive violations of this rule exceeds a
+                    # threshold. If it does, we clean Rc, but we keep the
+                    # rules of Rc in Rd for a smooth transition
+                    if self.should_demote(r_minus, self.violation_threshold):
                        self.Rd.update(self.Rc)
                        self.Rc.clear()
                        self.violation_counts.clear()
@@ -206,25 +291,28 @@ def strategy(self, opponent: Player) -> Action:
                and self.Rd[r_minus[0]] == action_to_int(r_minus[1])
            )

+            # Increment the number of violations of Rd rules
            if r_minus_in_Rd:
                self.v += 1
-
+            # If the number of violations exceeds a threshold, clean Rd
            if (self.v > self.reject_threshold
                    or (r_plus_in_Rc and r_minus_in_Rd)):
                self.Rd.clear()
                self.v = 0

-            # compute Rp for conditions that are neither in Rc or Rd
+            # Compute Rp for conditions that are neither in Rc nor in Rd
            Rp = {}
            all_cond = [(C, C), (C, D), (D, C), (D, D)]
            for outcome in all_cond:
                if ((outcome not in self.Rc.keys())
                        and (outcome not in self.Rd.keys())):
-                    # then we need to compute opponent's C answer probability
+                    # Compute opponent's C answer probability
                    Rp[outcome] = self.compute_prob_rule(outcome, self.alpha)

+            # We aggregate the rules of Rc, Rd, and Rp in a set of rules Pi
            self.Pi = {}
-            # algorithm ensure no duplicate keys -> no key overwriting
+            # The algorithm makes sure that a rule cannot be in two different
+            # sets of rules, so we do not need to check for duplicates.
            self.Pi.update(self.Rc)
            self.Pi.update(self.Rd)
            self.Pi.update(Rp)
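
A tiny sketch of this aggregation step, under the stated assumption that the three rule sets never share a key, so the successive dict.update calls simply merge them; the example values are invented.

C, D = "C", "D"
Rc = {(C, C): 1}                    # deterministic rule: always C after (C, C)
Rd = {(C, D): 0}                    # rule kept from the default/previous policy
Rp = {(D, C): 0.8, (D, D): 0.25}    # discounted-frequency estimates

Pi = {}
Pi.update(Rc)
Pi.update(Rd)
Pi.update(Rp)
print(Pi)  # each of the four conditions is now covered by exactly one rule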