feat: store test results for A/B runs

ShadowCurse · ShadowCurse · commit 9c1a1d29de2c · 2025-05-06T15:42:34.000+01:00
Currently, when an A/B test is run, only the results of the B run
are available in the `test_results` directory, because that directory
is shared by both runs and the last run overwrites the data.
Now each run's results are moved into a separate directory
(`test_results/A` and `test_results/B`) once the tests are done.
diff --git a/tests/framework/ab_test.py b/tests/framework/ab_test.py
@@ -21,6 +21,7 @@
 of both invocations is the same, the test passes (with us being alerted to this situtation via a special pipeline that
 does not block PRs). If not, it fails, preventing PRs from introducing new vulnerable dependencies.
 """
+import os
 import statistics
 from pathlib import Path
 from tempfile import TemporaryDirectory
@@ -103,7 +104,7 @@ def git_ab_test(
 
 
 def binary_ab_test(
-    test_runner: Callable[[Path, bool], T],
+    test_runner: Callable[[str, Path, bool], T],
     comparator: Callable[[T, T], U] = default_comparator,
     *,
     a_directory: Path = DEFAULT_A_DIRECTORY,
@@ -113,8 +114,14 @@ def binary_ab_test(
     Similar to `git_ab_test`, but instead of locally checking out different revisions, it operates on
     directories containing firecracker/jailer binaries
     """
-    result_a = test_runner(a_directory, True)
-    result_b = test_runner(b_directory, False)
+    result_a = test_runner("A", a_directory, True)
+    result_b = test_runner("B", b_directory, False)
+
+    # Move the results back to where Buildkite
+    # expects them to be.
+    os.mkdir("test_results")
+    os.rename("A", "test_results/A")
+    os.rename("B", "test_results/B")
 
     return result_a, result_b, comparator(result_a, result_b)
 
@@ -160,7 +167,7 @@ def git_ab_test_host_command(
 
 
 def set_did_not_grow_comparator(
-    set_generator: Callable[[CommandReturn], set]
+    set_generator: Callable[[CommandReturn], set],
 ) -> Callable[[CommandReturn, CommandReturn], bool]:
     """Factory function for comparators to use with git_ab_test_command that converts the command output to sets
     (using the given callable) and then checks that the "B" set is a subset of the "A" set
diff --git a/tools/ab_test.py b/tools/ab_test.py
@@ -180,7 +180,7 @@ def uninteresting_dimensions(processed_emf):
     return uninteresting
 
 
-def collect_data(binary_dir: Path, pytest_opts: str):
+def collect_data(tag: str, binary_dir: Path, pytest_opts: str):
     """Executes the specified test using the provided firecracker binaries"""
     binary_dir = binary_dir.resolve()
 
@@ -195,8 +195,13 @@ def collect_data(binary_dir: Path, pytest_opts: str):
         check=True,
         shell=True,
     )
+
+    # move results into a tag directory
+    os.mkdir(tag)
+    os.rename("test_results", f"{tag}")
+
     return load_data_series(
-        Path("test_results/test-report.json"), binary_dir, reemit=True
+        Path(f"{tag}/test-report.json"), binary_dir, reemit=True
     )
 
 
@@ -346,7 +351,7 @@ def ab_performance_test(
     """Does an A/B-test of the specified test with the given firecracker/jailer binaries"""
 
     return binary_ab_test(
-        lambda bin_dir, _: collect_data(bin_dir, pytest_opts),
+        lambda tag, bin_dir, _: collect_data(tag, bin_dir, pytest_opts),
         lambda ah, be: analyze_data(
             ah,
             be,