Merge pull request #550 from NVIDIA/am/nixl-report

amaslenn · web-flow · commit 4edf2812624e · 2025-06-03T18:59:42.000+02:00
NIXL reporting
diff --git a/src/cloudai/__init__.py b/src/cloudai/__init__.py
@@ -113,7 +113,12 @@
     NeMoRunSlurmCommandGenStrategy,
     NeMoRunTestDefinition,
 )
-from .workloads.nixl_bench import NIXLBenchSlurmCommandGenStrategy, NIXLBenchTestDefinition
+from .workloads.nixl_bench import (
+    NIXLBenchJobStatusRetrievalStrategy,
+    NIXLBenchSlurmCommandGenStrategy,
+    NIXLBenchTestDefinition,
+)
+from .workloads.nixl_bench.report_generation_strategy import NIXLBenchReportGenerationStrategy
 from .workloads.sleep import (
     SleepGradingStrategy,
     SleepKubernetesJsonGenStrategy,
@@ -245,10 +250,15 @@
         SlurmContainerTestDefinition,
         MegatronRunTestDefinition,
         TritonInferenceTestDefinition,
-        NIXLBenchTestDefinition,
     ],
     DefaultJobStatusRetrievalStrategy,
 )
+Registry().add_strategy(
+    JobStatusRetrievalStrategy,
+    [SlurmSystem],
+    [NIXLBenchTestDefinition],
+    NIXLBenchJobStatusRetrievalStrategy,
+)
 Registry().add_strategy(
     JobStatusRetrievalStrategy, [StandaloneSystem], [SleepTestDefinition], DefaultJobStatusRetrievalStrategy
 )
@@ -316,6 +326,7 @@
 Registry().add_report(SlurmContainerTestDefinition, SlurmContainerReportGenerationStrategy)
 Registry().add_report(UCCTestDefinition, UCCTestReportGenerationStrategy)
 Registry().add_report(TritonInferenceTestDefinition, TritonInferenceReportGenerationStrategy)
+Registry().add_report(NIXLBenchTestDefinition, NIXLBenchReportGenerationStrategy)
 
 Registry().add_scenario_report(PerTestReporter)
 Registry().add_scenario_report(StatusReporter)
diff --git a/src/cloudai/report_generator/tool/bokeh_report_tool.py b/src/cloudai/report_generator/tool/bokeh_report_tool.py
@@ -146,7 +146,7 @@ def add_linear_xy_line_plot(
         df: pd.DataFrame,
         sol: Optional[float] = None,
         color: str = "black",
-    ):
+    ) -> bokeh.plotting.figure:
         """
         Add a line plot with linear axes to the report tool.
 
@@ -180,6 +180,7 @@ def add_linear_xy_line_plot(
         self.add_sol_line(p, df, x_column, y_column, sol)
 
         self.plots.append(p)
+        return p
 
     def add_single_point_plot(
         self,
@@ -189,7 +190,7 @@ def add_single_point_plot(
         y_columns: List[Tuple[str, str]],
         x_axis_label: str,
         y_axis_label: str,
-    ):
+    ) -> bokeh.plotting.figure:
         """
         Create a scatter plot for a single data point.
 
@@ -232,7 +233,7 @@ def add_multiple_messages_multi_lines_plot(
         y_columns: List[Tuple[str, str]],
         x_axis_label: str,
         y_axis_label: str,
-    ):
+    ) -> bokeh.plotting.figure:
         """
         Create lines plot for multiple message sizes.
 
@@ -320,7 +321,7 @@ def add_log_x_linear_y_multi_line_plot(
         y_axis_label: str,
         df: pd.DataFrame,
         sol: Optional[float] = None,
-    ):
+    ) -> bokeh.plotting.figure:
         """
         Add a line plot with a logarithmic x-axis and linear y-axis for multiple datasets.
 
@@ -394,6 +395,7 @@ def add_log_x_linear_y_multi_line_plot(
 
         p.legend.location = "bottom_right"
         self.plots.append(p)
+        return p
 
     def finalize_report(self, output_filename: Path):
         """
diff --git a/src/cloudai/workloads/nixl_bench/__init__.py b/src/cloudai/workloads/nixl_bench/__init__.py
@@ -14,11 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .job_status_retrieval_strategy import NIXLBenchJobStatusRetrievalStrategy
 from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
+from .report_generation_strategy import NIXLBenchReportGenerationStrategy
 from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
 
 __all__ = [
     "NIXLBenchCmdArgs",
+    "NIXLBenchJobStatusRetrievalStrategy",
+    "NIXLBenchReportGenerationStrategy",
     "NIXLBenchSlurmCommandGenStrategy",
     "NIXLBenchTestDefinition",
 ]
diff --git a/src/cloudai/workloads/nixl_bench/job_status_retrieval_strategy.py b/src/cloudai/workloads/nixl_bench/job_status_retrieval_strategy.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from cloudai import JobStatusResult, JobStatusRetrievalStrategy
+
+
+class NIXLBenchJobStatusRetrievalStrategy(JobStatusRetrievalStrategy):
+    """Job status retrieval for NIXL Bench, based on the results table printed to 'stdout.txt'."""
+
+    def get_job_status(self, output_path: Path) -> JobStatusResult:
+        """
+        Report success when 'stdout.txt' has the results header followed by a six-column row.
+
+        Args:
+            output_path (Path): Directory expected to contain the job's 'stdout.txt'.
+
+        Returns:
+            JobStatusResult: Successful result, or a failed one whose message names what is missing.
+        """
+        log_file = output_path / "stdout.txt"
+        if not log_file.exists():
+            message = f"stdout.txt file not found in the specified output directory {output_path}."
+            return JobStatusResult(is_successful=False, error_message=message)
+
+        header = "Block Size (B)      Batch Size     Avg Lat. (us)  B/W (MiB/Sec)  B/W (GiB/Sec)  B/W (GB/Sec)"
+        rows = iter(log_file.read_text().splitlines())
+        # Consume lines up to and including the header; the shared iterator then holds only what follows it.
+        if not any(header in row for row in rows):
+            message = f"NIXLBench results table not found in {log_file}."
+            return JobStatusResult(is_successful=False, error_message=message)
+
+        # A header line has more than six tokens, so a repeated header is never mistaken for a data row.
+        if any(len(row.split()) == 6 for row in rows):
+            return JobStatusResult(is_successful=True)
+
+        return JobStatusResult(is_successful=False, error_message=f"NIXLBench data not found in {log_file}.")
diff --git a/src/cloudai/workloads/nixl_bench/report_generation_strategy.py b/src/cloudai/workloads/nixl_bench/report_generation_strategy.py
@@ -0,0 +1,111 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import logging
+from functools import cache
+from pathlib import Path
+from typing import TYPE_CHECKING, ClassVar
+
+from cloudai import ReportGenerationStrategy
+from cloudai._core.test_scenario import METRIC_ERROR
+from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
+from cloudai.util.lazy_imports import lazy
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+
+@cache
+def extract_data(stdout_file: Path) -> pd.DataFrame:
+    """
+    Parse the NIXL Bench results table in ``stdout_file`` into a typed DataFrame (memoized per path).
+
+    A missing file yields an empty frame. Six-token lines after the table header are the data rows;
+    rows whose fields are not numeric are skipped instead of raising ``ValueError`` during conversion.
+    """
+    if not stdout_file.exists():
+        logging.debug(f"{stdout_file} not found")
+        return lazy.pd.DataFrame()
+
+    header_present, data = False, []
+    for line in stdout_file.read_text().splitlines():
+        if "Block Size (B)      Batch Size     Avg Lat. (us)  B/W (MiB/Sec)  B/W (GiB/Sec)  B/W (GB/Sec)" in line:
+            header_present = True
+        elif header_present and len(fields := line.split()) == 6:
+            try:
+                data.append([int(fields[0]), int(fields[1]), *map(float, fields[2:])])
+            except ValueError:
+                logging.debug(f"Skipping non-numeric line in {stdout_file}: {line!r}")
+
+    float_cols = ["avg_lat", "bw_mib_sec", "bw_gib_sec", "bw_gb_sec"]
+    dtypes = {"block_size": int, "batch_size": int, **dict.fromkeys(float_cols, float)}
+    return lazy.pd.DataFrame(data, columns=list(dtypes)).astype(dtypes)  # numeric dtypes even when empty
+
+
+class NIXLBenchReportGenerationStrategy(ReportGenerationStrategy):
+    """Strategy for generating reports from NIXL Bench directories."""
+
+    metrics: ClassVar[list[str]] = ["default", "latency"]
+
+    @property
+    def results_file(self) -> Path:
+        """Path of 'stdout.txt' inside the test run's output directory."""
+        return self.test_run.output_path / "stdout.txt"
+
+    def can_handle_directory(self) -> bool:
+        """Return True when at least one data row is parsed from the results file."""
+        return not extract_data(self.results_file).empty
+
+    def generate_report(self) -> None:
+        """Produce the Bokeh report and 'nixlbench.csv'; do nothing when no data was parsed."""
+        if not self.can_handle_directory():
+            return
+
+        self.generate_bokeh_report()
+        extract_data(self.results_file).to_csv(self.test_run.output_path / "nixlbench.csv", index=False)
+
+    def get_metric(self, metric: str) -> float:
+        """Return mean 'avg_lat' for a metric listed in ``metrics``; METRIC_ERROR if unknown or no data."""
+        logging.debug(f"Getting metric {metric} from {self.results_file.absolute()}")
+        df = extract_data(self.results_file)
+        # Accepted names come from the class-level ``metrics`` list rather than a second hard-coded copy.
+        if df.empty or metric not in self.metrics:
+            return METRIC_ERROR
+
+        return float(lazy.np.mean(df["avg_lat"]))
+
+    def generate_bokeh_report(self) -> None:
+        """Plot latency and bandwidth against block size and finalize them as one HTML report."""
+        df = extract_data(self.results_file)
+        report_tool = BokehReportTool(self.test_run.output_path)
+        # Both plots share the x axis, color and size; only title, y column and y label differ.
+        plots = (
+            ("NIXL Bench Latency", "avg_lat", "Latency (us)"),
+            ("NIXL Bench Bandwidth", "bw_gb_sec", "Bandwidth (GB/Sec)"),
+        )
+        for title, column, y_label in plots:
+            p = report_tool.add_log_x_linear_y_multi_line_plot(
+                title=title,
+                df=df,
+                x_column="block_size",
+                y_columns=[(column, "blue")],
+                x_axis_label="Block Size (B)",
+                y_axis_label=y_label,
+            )
+            p.width, p.height = 800, 500
+        report_tool.finalize_report(Path("cloudai_nixlbench_bokeh_report.html"))
diff --git a/tests/job_status_retrieval_strategy/test_nixlbench_job_status_retrieval_strategy.py b/tests/job_status_retrieval_strategy/test_nixlbench_job_status_retrieval_strategy.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+from cloudai.workloads.nixl_bench import NIXLBenchJobStatusRetrievalStrategy
+
+# Trimmed NIXL Bench stdout: results header, one complete six-column row, then one truncated row.
+LOG_EXTRACT = """
+Num threads (--num_threads=N)                               : 1
+--------------------------------------------------------------------------------
+
+Block Size (B)      Batch Size     Avg Lat. (us)  B/W (MiB/Sec)  B/W (GiB/Sec)  B/W (GB/Sec)
+--------------------------------------------------------------------------------
+4096                1              6.36607        613.604        0.599223       0.643411
+8192                1              6.36806        1226.83        1.19807
+"""
+
+
+class TestNIXLBenchJobStatusRetrievalStrategy:
+    """Exercise each outcome of NIXLBenchJobStatusRetrievalStrategy.get_job_status."""
+
+    def test_no_file(self, tmp_path: Path) -> None:
+        status = NIXLBenchJobStatusRetrievalStrategy().get_job_status(tmp_path)
+        assert not status.is_successful
+        assert status.error_message == f"stdout.txt file not found in the specified output directory {tmp_path}."
+
+    def test_no_header(self, tmp_path: Path) -> None:
+        (tmp_path / "stdout.txt").write_text(LOG_EXTRACT.splitlines()[-1])
+        status = NIXLBenchJobStatusRetrievalStrategy().get_job_status(tmp_path)
+        assert not status.is_successful
+        assert status.error_message == f"NIXLBench results table not found in {tmp_path / 'stdout.txt'}."
+
+    def test_no_data(self, tmp_path: Path) -> None:
+        (tmp_path / "stdout.txt").write_text("\n".join(LOG_EXTRACT.splitlines()[:-2]))
+        status = NIXLBenchJobStatusRetrievalStrategy().get_job_status(tmp_path)
+        assert not status.is_successful
+        assert status.error_message == f"NIXLBench data not found in {tmp_path / 'stdout.txt'}."
+
+    def test_successfull_job(self, tmp_path: Path) -> None:
+        (tmp_path / "stdout.txt").write_text(LOG_EXTRACT)
+        status = NIXLBenchJobStatusRetrievalStrategy().get_job_status(tmp_path)
+        assert status.is_successful
+        assert status.error_message == ""
diff --git a/tests/test_init.py b/tests/test_init.py
@@ -69,8 +69,11 @@
     NeMoRunSlurmCommandGenStrategy,
     NeMoRunTestDefinition,
 )
-from cloudai.workloads.nixl_bench.nixl_bench import NIXLBenchTestDefinition
-from cloudai.workloads.nixl_bench.slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
+from cloudai.workloads.nixl_bench import (
+    NIXLBenchJobStatusRetrievalStrategy,
+    NIXLBenchSlurmCommandGenStrategy,
+    NIXLBenchTestDefinition,
+)
 from cloudai.workloads.sleep import (
     SleepGradingStrategy,
     SleepKubernetesJsonGenStrategy,
@@ -164,7 +167,7 @@ def test_runners():
     (JobStatusRetrievalStrategy, SlurmSystem, UCCTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, SlurmSystem, TritonInferenceTestDefinition): DefaultJobStatusRetrievalStrategy,
-    (JobStatusRetrievalStrategy, SlurmSystem, NIXLBenchTestDefinition): DefaultJobStatusRetrievalStrategy,
+    (JobStatusRetrievalStrategy, SlurmSystem, NIXLBenchTestDefinition): NIXLBenchJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, StandaloneSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, LSFSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
     (JobStatusRetrievalStrategy, RunAISystem, NCCLTestDefinition): DefaultJobStatusRetrievalStrategy,
@@ -186,7 +189,7 @@ def strategy2str(key: tuple) -> str:
     assert len(missing) == 0, f"Missing: {missing}"
     assert len(extra) == 0, f"Extra: {extra}"
     for key, value in ALL_STRATEGIES.items():
-        assert strategies[key] == value
+        assert strategies[key] == value, f"Strategy {strategy2str(key)} is not {value}"
 
 
 def test_installers():
diff --git a/tests/test_test_scenario.py b/tests/test_test_scenario.py
@@ -63,6 +63,7 @@
     NeMoRunReportGenerationStrategy,
     NeMoRunTestDefinition,
 )
+from cloudai.workloads.nixl_bench import NIXLBenchReportGenerationStrategy, NIXLBenchTestDefinition
 from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
 from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
 from cloudai.workloads.triton_inference import TritonInferenceReportGenerationStrategy, TritonInferenceTestDefinition
@@ -456,7 +457,7 @@ def test_default(self):
         assert len(reporters) == 0
 
     def test_default_reporters_size(self):
-        assert len(Registry().reports_map) == 12
+        assert len(Registry().reports_map) == 13
 
     @pytest.mark.parametrize(
         "tdef,expected_reporters",
@@ -473,6 +474,7 @@ def test_default_reporters_size(self):
             (SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}),
             (UCCTestDefinition, {UCCTestReportGenerationStrategy}),
             (TritonInferenceTestDefinition, {TritonInferenceReportGenerationStrategy}),
+            (NIXLBenchTestDefinition, {NIXLBenchReportGenerationStrategy}),
         ],
     )
     def test_custom_reporters(self, tdef: Type[TestDefinition], expected_reporters: Set[ReportGenerationStrategy]):