Skip to content

Commit a9aeafa

Browse files
authored
[CI][Bench] Implement exponentially weighted moving average for SYCL nightly regression CI (#18766)
Median does not respond very fast to changes in performance, and thus is not a suitable metric to be used for regression checking. This PR implements an option to use an exponentially weighted moving average instead. The hope is that this implementation could also be used on the CPU instructions-retired metric when SYCL compute-benchmark tests start reporting instructions retired; this would create a far more robust metric to use in order to spot regressions. **Note** to llvm-reviewers-benchmarking: Observe that the changes here are kept separate from the core benchmarking scripts -- this change should be functionally NFC for the core benchmarking scripts.
1 parent 0bba746 commit a9aeafa

File tree

5 files changed

+113
-8
lines changed

5 files changed

+113
-8
lines changed

devops/actions/run-tests/benchmark/action.yml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -157,12 +157,16 @@ runs:
157157
--preset "$PRESET" \
158158
--timestamp-override "$SAVE_TIMESTAMP" \
159159
--detect-version sycl,compute_runtime
160+
160161
echo "-----"
161162
python3 ./devops/scripts/benchmarks/compare.py to_hist \
163+
--avg-type EWMA \
164+
--cutoff "$(date -u -d '7 days ago' +'%Y%m%d_%H%M%S')" \
162165
--name "$SAVE_NAME" \
163166
--compare-file "./llvm-ci-perf-results/results/${SAVE_NAME}_${SAVE_TIMESTAMP}.json" \
164167
--results-dir "./llvm-ci-perf-results/results/" \
165-
--regression-filter '^[a-z_]+_sycl '
168+
--regression-filter '^[a-z_]+_sycl ' \
169+
--verbose
166170
echo "-----"
167171

168172
- name: Cache changes to benchmark folder for archival purposes

devops/scripts/benchmarks/compare.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from utils.aggregate import Aggregator, SimpleMedian
1+
from utils.aggregate import Aggregator, SimpleMedian, EWMA
22
from utils.validate import Validate
33
from utils.result import Result, BenchmarkRun
44
from options import options
@@ -13,6 +13,8 @@
1313
from dataclasses import dataclass, asdict
1414

1515

16+
verbose = False
17+
1618
@dataclass
1719
class BenchmarkHistoricAverage:
1820
"""Contains historic average information for 1 benchmark"""
@@ -225,6 +227,11 @@ def perf_diff_entry() -> dict:
225227
elif halfway_round(delta, 2) < -options.regression_threshold:
226228
regression.append(perf_diff_entry())
227229

230+
if verbose:
231+
print(
232+
f"{test.name}: expect {hist_avg[test.name].value}, got {test.value}"
233+
)
234+
228235
return improvement, regression
229236

230237
def to_hist(
@@ -255,8 +262,12 @@ def to_hist(
255262
from the average for this benchmark run.
256263
"""
257264

258-
if avg_type != "median":
259-
print("Only median is currently supported: Refusing to continue.")
265+
if avg_type == "median":
266+
aggregator_type = SimpleMedian
267+
elif avg_type == "EWMA":
268+
aggregator_type = EWMA
269+
else:
270+
print(f"Error: Unsupported avg_type {avg_type}.")
260271
exit(1)
261272

262273
try:
@@ -282,6 +293,7 @@ def to_hist(
282293
result_dir,
283294
compare_result.hostname,
284295
cutoff,
296+
aggregator=aggregator_type,
285297
exclude=[Path(compare_file).stem],
286298
)
287299
return Compare.to_hist_avg(hist_avg, compare_result)
@@ -320,6 +332,11 @@ def to_hist(
320332
help="Timestamp (in YYYYMMDD_HHMMSS) of oldest result to include in historic average calculation",
321333
default="20000101_010101",
322334
)
335+
parser_avg.add_argument(
336+
"--verbose",
337+
action="store_true",
338+
help="Increase output verbosity",
339+
)
323340
parser_avg.add_argument(
324341
"--regression-filter",
325342
type=str,
@@ -329,15 +346,19 @@ def to_hist(
329346

330347
args = parser.parse_args()
331348

349+
if args.verbose:
350+
verbose = True
351+
print("-- Compare.py --")
352+
332353
if args.operation == "to_hist":
333-
if args.avg_type != "median":
334-
print("Only median is currently supported: exiting.")
335-
exit(1)
336354
if not Validate.timestamp(args.cutoff):
337355
raise ValueError("Timestamp must be provided as YYYYMMDD_HHMMSS.")
356+
if args.avg_type not in ["median", "EWMA"]:
357+
print("Only median, EWMA is currently supported: exiting.")
358+
exit(1)
338359

339360
improvements, regressions = Compare.to_hist(
340-
"median", args.name, args.compare_file, args.results_dir, args.cutoff
361+
args.avg_type, args.name, args.compare_file, args.results_dir, args.cutoff
341362
)
342363

343364
# Not all regressions are of concern: if a filter is provided, filter

devops/scripts/benchmarks/options.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ class Options:
7272
exit_on_failure: bool = False
7373

7474
# Options intended for CI:
75+
7576
regression_threshold: float = 0.05
7677
# It's necessary in CI to compare or redo benchmark runs. Instead of
7778
# generating a new timestamp each run by default, specify a single timestamp
@@ -94,6 +95,17 @@ class Options:
9495
archive_baseline_days: int = 30 # Archive Baseline_* runs after 30 days
9596
archive_pr_days: int = 7 # Archive other (PR/dev) runs after 7 days
9697

98+
# EWMA Options:
99+
100+
# The smoothing factor is alpha in the EWMA equation. Generally, a higher
101+
# smoothing factor results in newer data having more weight, and a lower
102+
# smoothing factor results in older data having more weight.
103+
#
104+
# Valid values for this smoothing factor ranges from (0, 1). Note that no
105+
# value of smoothing factor will result in older elements having more weight
106+
# than newer elements.
107+
EWMA_smoothing_factor: float = 0.15
108+
97109
detect_versions: DetectVersionsOptions = field(
98110
default_factory=DetectVersionsOptions
99111
)
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
import sys
2+
import os
3+
4+
sys.path.append(f"{os.path.dirname(__file__)}/../")
5+
from options import options
6+
from utils.aggregate import *
7+
8+
9+
def run_testcase(aggregator: Aggregator, src: list, expected: float) -> bool:
    """Feed every value of src into a fresh aggregator instance and check
    the resulting average.

    Returns True when the aggregator's average equals expected; otherwise
    prints a diagnostic line and returns False.
    """
    instance = aggregator()
    for value in src:
        instance.add(value)
    res = instance.get_avg()
    if res == expected:
        return True
    print(f"Failed: {aggregator}, {src} -- expected {expected}, got {res}")
    return False
18+
19+
20+
def test_EWMA():
    """Exercise the EWMA aggregator against hand-computed reference values.

    The smoothing factor is pinned to 0.5 so every expected value is an
    exact binary fraction and can be compared with ==.
    """
    options.EWMA_smoothing_factor = 0.5
    cases = [
        ([], None),
        ([100], 100),
        ([100, 100, 100, 100, 100], 100),
        ([100, 105, 103, 108, 107], 106.1875),
    ]
    successes = 0
    fails = 0
    for data, expected in cases:
        if run_testcase(EWMA, data, expected):
            successes += 1
        else:
            fails += 1
    print(f"EWMA test: {successes} successes, {fails} fails.")
36+
37+
38+
if __name__ == "__main__":
39+
test_EWMA()

devops/scripts/benchmarks/utils/aggregate.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import statistics
22
from abc import ABC, abstractmethod
33

4+
from options import options
5+
46

57
class Aggregator(ABC):
68
"""
@@ -51,3 +53,30 @@ def add(self, n: float):
5153

5254
def get_avg(self) -> float:
5355
return statistics.median(self.elements)
56+
57+
58+
class EWMA(Aggregator):
    """
    Exponentially weighted moving average based on all elements added to the
    aggregator.

    Newer elements are weighted by options.EWMA_smoothing_factor (alpha in
    the EWMA recurrence); each older element's weight decays by a factor of
    (1 - alpha) per subsequent element.
    """

    def __init__(self, starting_elements: list = None):
        # BUGFIX: the previous signature used a mutable default argument
        # ("starting_elements: list = []"). Python evaluates that literal
        # once, so every EWMA() constructed without arguments shared the
        # SAME list and samples leaked between aggregator instances. Use
        # None as the sentinel and allocate a fresh list per instance.
        self.elements = [] if starting_elements is None else starting_elements

    @staticmethod
    def get_type() -> str:
        # Matches the --avg-type spelling accepted by compare.py.
        return "EWMA"

    def add(self, n: float):
        # Record one sample; the average is computed lazily in get_avg().
        self.elements.append(n)

    def get_avg(self) -> float:
        if len(self.elements) == 0:
            return None  # No elements collected, cannot provide an average

        # Standard EWMA recurrence, seeded with the oldest sample:
        #   ewma_t = alpha * x_t + (1 - alpha) * ewma_{t-1}
        alpha = options.EWMA_smoothing_factor
        ewma_t = self.elements[0]
        for x_t in self.elements[1:]:
            ewma_t = alpha * x_t + (1 - alpha) * ewma_t
        return ewma_t

0 commit comments

Comments
 (0)