Skip to content

Commit a9aeafa

Browse files
authored
[CI][Bench] Implement exponentially weighted moving average for SYCL nightly regression CI (#18766)
Median does not respond very fast to changes in performance, and thus is not a suitable metric to be used for regression checking. This PR implements an option to use an exponentially weighted moving average instead. The hope is that this implementation could also be used on the CPU instructions-retired metric when SYCL compute-benchmark tests start reporting instructions retired; this would create a far more robust metric to use in order to spot regressions. **Note** to llvm-reviewers-benchmarking: Observe that the changes here are kept separate from the core benchmarking scripts -- this change should be functionally NFC for the core benchmarking scripts.
1 parent 0bba746 commit a9aeafa

File tree

5 files changed

+113
-8
lines changed

5 files changed

+113
-8
lines changed

devops/actions/run-tests/benchmark/action.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,16 @@ runs:
157157
--preset "$PRESET" \
158158
--timestamp-override "$SAVE_TIMESTAMP" \
159159
--detect-version sycl,compute_runtime
160+
160161
echo "-----"
161162
python3 ./devops/scripts/benchmarks/compare.py to_hist \
163+
--avg-type EWMA \
164+
--cutoff "$(date -u -d '7 days ago' +'%Y%m%d_%H%M%S')" \
162165
--name "$SAVE_NAME" \
163166
--compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \
164167
--results-dir "./llvm-ci-perf-results/results/" \
165-
--regression-filter '^[a-z_]+_sycl '
168+
--regression-filter '^[a-z_]+_sycl ' \
169+
--verbose
166170
echo "-----"
167171

168172
- name: Cache changes to benchmark folder for archival purposes

devops/scripts/benchmarks/compare.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from utils.aggregate import Aggregator, SimpleMedian
1+
from utils.aggregate import Aggregator, SimpleMedian, EWMA
22
from utils.validate import Validate
33
from utils.result import Result, BenchmarkRun
44
from options import options
@@ -13,6 +13,8 @@
1313
from dataclasses import dataclass, asdict
1414

1515

16+
verbose = False
17+
1618
@dataclass
1719
class BenchmarkHistoricAverage:
1820
"""Contains historic average information for 1 benchmark"""
@@ -225,6 +227,11 @@ def perf_diff_entry() -> dict:
225227
elif halfway_round(delta, 2) < -options.regression_threshold:
226228
regression.append(perf_diff_entry())
227229

230+
if verbose:
231+
print(
232+
f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}"
233+
)
234+
228235
return improvement, regression
229236

230237
def to_hist(
@@ -255,8 +262,12 @@ def to_hist(
255262
from the average for this benchmark run.
256263
"""
257264

258-
if avg_type != "median":
259-
print("Only median is currently supported: Refusing to continue.")
265+
if avg_type == "median":
266+
aggregator_type = SimpleMedian
267+
elif avg_type == "EWMA":
268+
aggregator_type = EWMA
269+
else:
270+
print(f"Error: Unsupported avg_type {avg_type}.")
260271
exit(1)
261272

262273
try:
@@ -282,6 +293,7 @@ def to_hist(
282293
result_dir,
283294
compare_result.hostname,
284295
cutoff,
296+
aggregator=aggregator_type,
285297
exclude=[Path(compare_file).stem],
286298
)
287299
return Compare.to_hist_avg(hist_avg, compare_result)
@@ -320,6 +332,11 @@ def to_hist(
320332
help="Timestamp (in YYYYMMDD_HHMMSS) of oldest result to include in historic average calculation",
321333
default="20000101_010101",
322334
)
335+
parser_avg.add_argument(
336+
"--verbose",
337+
action="store_true",
338+
help="Increase output verbosity",
339+
)
323340
parser_avg.add_argument(
324341
"--regression-filter",
325342
type=str,
@@ -329,15 +346,19 @@ def to_hist(
329346

330347
args = parser.parse_args()
331348

349+
if args.verbose:
350+
verbose = True
351+
print("-- Compare.py --")
352+
332353
if args.operation == "to_hist":
333-
if args.avg_type != "median":
334-
print("Only median is currently supported: exiting.")
335-
exit(1)
336354
if not Validate.timestamp(args.cutoff):
337355
raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.")
356+
if args.avg_type not in ["median", "EWMA"]:
357+
print("Only median, EWMA is currently supported: exiting.")
358+
exit(1)
338359

339360
improvements, regressions = Compare.to_hist(
340-
"median", args.name, args.compare_file, args.results_dir, args.cutoff
361+
args.avg_type, args.name, args.compare_file, args.results_dir, args.cutoff
341362
)
342363

343364
# Not all regressions are of concern: if a filter is provided, filter

devops/scripts/benchmarks/options.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class Options:
7272
exit_on_failure: bool = False
7373

7474
# Options intended for CI:
75+
7576
regression_threshold: float = 0.05
7677
# It's necessary in CI to compare or redo benchmark runs. Instead of
7778
# generating a new timestamp each run by default, specify a single timestamp
@@ -94,6 +95,17 @@ class Options:
9495
archive_baseline_days: int = 30 # Archive Baseline_* runs after 30 days
9596
archive_pr_days: int = 7 # Archive other (PR/dev) runs after 7 days
9697

98+
# EWMA Options:
99+
100+
# The smoothing factor is alpha in the EWMA equation. Generally, a higher
101+
# smoothing factor results in newer data having more weight, and a lower
102+
# smoothing factor results in older data having more weight.
103+
#
104+
# Valid values for this smoothing factor ranges from (0, 1). Note that no
105+
# value of smoothing factor will result in older elements having more weight
106+
# than newer elements.
107+
EWMA_smoothing_factor: float = 0.15
108+
97109
detect_versions: DetectVersionsOptions = field(
98110
default_factory=DetectVersionsOptions
99111
)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import sys
2+
import os
3+
4+
sys.path.append(f"{os.path.dirname(__file__)}/../")
5+
from options import options
6+
from utils.aggregate import *
7+
8+
9+
def run_testcase(aggregator: Aggregator, src: list, expected: float) -> bool:
    """Feed every value of src into a fresh aggregator instance and check
    the resulting average.

    Returns True when the aggregator's average equals expected; otherwise
    prints a diagnostic line and returns False.
    """
    instance = aggregator()
    for value in src:
        instance.add(value)
    res = instance.get_avg()
    if res == expected:
        return True
    print(f"Failed: {aggregator}, {src} -- expected {expected}, got {res}")
    return False
18+
19+
20+
def test_EWMA():
    """Exercise the EWMA aggregator against hand-computed reference values.

    The smoothing factor is pinned to 0.5 so every expected value is an
    exact binary fraction and can be compared with ==.
    """
    options.EWMA_smoothing_factor = 0.5
    cases = [
        ([], None),
        ([100], 100),
        ([100, 100, 100, 100, 100], 100),
        ([100, 105, 103, 108, 107], 106.1875),
    ]
    successes = 0
    fails = 0
    for data, expected in cases:
        if run_testcase(EWMA, data, expected):
            successes += 1
        else:
            fails += 1
    print(f"EWMA test: {successes} successes, {fails} fails.")
36+
37+
38+
if __name__ == "__main__":
39+
test_EWMA()

devops/scripts/benchmarks/utils/aggregate.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import statistics
22
from abc import ABC, abstractmethod
33

4+
from options import options
5+
46

57
class Aggregator(ABC):
68
"""
@@ -51,3 +53,30 @@ def add(self, n: float):
5153

5254
def get_avg(self) -> float:
5355
return statistics.median(self.elements)
56+
57+
58+
class EWMA(Aggregator):
    """
    Exponentially weighted moving average based on all elements added to the
    aggregator.

    Newer elements are weighted by options.EWMA_smoothing_factor (alpha in
    the EWMA recurrence); each older element's weight decays by a factor of
    (1 - alpha) per subsequent element.
    """

    def __init__(self, starting_elements: list = None):
        # BUGFIX: the previous signature used a mutable default argument
        # ("starting_elements: list = []"). Python evaluates that literal
        # once, so every EWMA() constructed without arguments shared the
        # SAME list and samples leaked between aggregator instances. Use
        # None as the sentinel and allocate a fresh list per instance.
        self.elements = [] if starting_elements is None else starting_elements

    @staticmethod
    def get_type() -> str:
        # Matches the --avg-type spelling accepted by compare.py.
        return "EWMA"

    def add(self, n: float):
        # Record one sample; the average is computed lazily in get_avg().
        self.elements.append(n)

    def get_avg(self) -> float:
        if len(self.elements) == 0:
            return None  # No elements collected, cannot provide an average

        # Standard EWMA recurrence, seeded with the oldest sample:
        #   ewma_t = alpha * x_t + (1 - alpha) * ewma_{t-1}
        alpha = options.EWMA_smoothing_factor
        ewma_t = self.elements[0]
        for x_t in self.elements[1:]:
            ewma_t = alpha * x_t + (1 - alpha) * ewma_t
        return ewma_t

0 commit comments

Comments
 (0)