@@ -195,6 +195,11 @@ def check_expected_results(results, atol=0.005, rtol=0.1):
195195 Expected results should take the form of a list of expectations, each
196196 specified by four elements: [dataset, task, metric, expected value]. For
197197 example: [['coco_2014_minival', 'box_proposal', 'AR@1000', 0.387], ...].
198+
199+ The expected value may also be given as a list [mean, std] holding an
200+ empirical mean and standard deviation; the check passes only if the actual
201+ value lies strictly within mean +/- cfg.EXPECTED_RESULTS_SIGMA_TOL * std. For example:
202+ [['coco_2014_minival', 'box_proposal', 'AR@1000', [0.387, 0.001]], ...]
198203 """
199204 # cfg contains a reference set of results that we want to check against
200205 if len (cfg .EXPECTED_RESULTS ) == 0 :
@@ -206,13 +211,28 @@ def check_expected_results(results, atol=0.005, rtol=0.1):
206211 assert metric in results [dataset ][task ], \
207212 'Metric {} not in results' .format (metric )
208213 actual_val = results [dataset ][task ][metric ]
209- err = abs (actual_val - expected_val )
210- tol = atol + rtol * abs (expected_val )
211- msg = (
212- '{} > {} > {} sanity check (actual vs. expected): '
213- '{:.3f} vs. {:.3f}, err={:.3f}, tol={:.3f}'
214- ).format (dataset , task , metric , actual_val , expected_val , err , tol )
215- if err > tol :
214+ ok = False
215+ if isinstance (expected_val , list ):
216+ assert len (expected_val ) == 2 , (
217+ 'Expected result must be in (mean, std) format'
218+ )
219+ mean , std = expected_val
220+ lo = mean - cfg .EXPECTED_RESULTS_SIGMA_TOL * std
221+ hi = mean + cfg .EXPECTED_RESULTS_SIGMA_TOL * std
222+ ok = (lo < actual_val ) and (actual_val < hi )
223+ msg = (
224+ '{} > {} > {} sanity check (actual vs. expected): '
225+ '{:.3f} vs. mean={:.4f}, std={:.4}, range=({:.4f}, {:.4f})'
226+ ).format (dataset , task , metric , actual_val , mean , std , lo , hi )
227+ else :
228+ err = abs (actual_val - expected_val )
229+ tol = atol + rtol * abs (expected_val )
230+ ok = (err > tol )
231+ msg = (
232+ '{} > {} > {} sanity check (actual vs. expected): '
233+ '{:.3f} vs. {:.3f}, err={:.3f}, tol={:.3f}'
234+ ).format (dataset , task , metric , actual_val , expected_val , err , tol )
235+ if not ok :
216236 msg = 'FAIL: ' + msg
217237 logger .error (msg )
218238 if cfg .EXPECTED_RESULTS_EMAIL != '' :
0 commit comments