Skip to content

Commit 4edf281

Browse files
authored
Merge pull request #550 from NVIDIA/am/nixl-report
NIXL reporting
2 parents 98c4e58 + 82f020c commit 4edf281

File tree

8 files changed

+252
-11
lines changed

8 files changed

+252
-11
lines changed

src/cloudai/__init__.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,12 @@
113113
NeMoRunSlurmCommandGenStrategy,
114114
NeMoRunTestDefinition,
115115
)
116-
from .workloads.nixl_bench import NIXLBenchSlurmCommandGenStrategy, NIXLBenchTestDefinition
116+
from .workloads.nixl_bench import (
117+
NIXLBenchJobStatusRetrievalStrategy,
118+
NIXLBenchSlurmCommandGenStrategy,
119+
NIXLBenchTestDefinition,
120+
)
121+
from .workloads.nixl_bench.report_generation_strategy import NIXLBenchReportGenerationStrategy
117122
from .workloads.sleep import (
118123
SleepGradingStrategy,
119124
SleepKubernetesJsonGenStrategy,
@@ -245,10 +250,15 @@
245250
SlurmContainerTestDefinition,
246251
MegatronRunTestDefinition,
247252
TritonInferenceTestDefinition,
248-
NIXLBenchTestDefinition,
249253
],
250254
DefaultJobStatusRetrievalStrategy,
251255
)
256+
Registry().add_strategy(
257+
JobStatusRetrievalStrategy,
258+
[SlurmSystem],
259+
[NIXLBenchTestDefinition],
260+
NIXLBenchJobStatusRetrievalStrategy,
261+
)
252262
Registry().add_strategy(
253263
JobStatusRetrievalStrategy, [StandaloneSystem], [SleepTestDefinition], DefaultJobStatusRetrievalStrategy
254264
)
@@ -316,6 +326,7 @@
316326
Registry().add_report(SlurmContainerTestDefinition, SlurmContainerReportGenerationStrategy)
317327
Registry().add_report(UCCTestDefinition, UCCTestReportGenerationStrategy)
318328
Registry().add_report(TritonInferenceTestDefinition, TritonInferenceReportGenerationStrategy)
329+
Registry().add_report(NIXLBenchTestDefinition, NIXLBenchReportGenerationStrategy)
319330

320331
Registry().add_scenario_report(PerTestReporter)
321332
Registry().add_scenario_report(StatusReporter)

src/cloudai/report_generator/tool/bokeh_report_tool.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def add_linear_xy_line_plot(
146146
df: pd.DataFrame,
147147
sol: Optional[float] = None,
148148
color: str = "black",
149-
):
149+
) -> bokeh.plotting.figure:
150150
"""
151151
Add a line plot with linear axes to the report tool.
152152
@@ -180,6 +180,7 @@ def add_linear_xy_line_plot(
180180
self.add_sol_line(p, df, x_column, y_column, sol)
181181

182182
self.plots.append(p)
183+
return p
183184

184185
def add_single_point_plot(
185186
self,
@@ -189,7 +190,7 @@ def add_single_point_plot(
189190
y_columns: List[Tuple[str, str]],
190191
x_axis_label: str,
191192
y_axis_label: str,
192-
):
193+
) -> bokeh.plotting.figure:
193194
"""
194195
Create a scatter plot for a single data point.
195196
@@ -232,7 +233,7 @@ def add_multiple_messages_multi_lines_plot(
232233
y_columns: List[Tuple[str, str]],
233234
x_axis_label: str,
234235
y_axis_label: str,
235-
):
236+
) -> bokeh.plotting.figure:
236237
"""
237238
Create lines plot for multiple message sizes.
238239
@@ -320,7 +321,7 @@ def add_log_x_linear_y_multi_line_plot(
320321
y_axis_label: str,
321322
df: pd.DataFrame,
322323
sol: Optional[float] = None,
323-
):
324+
) -> bokeh.plotting.figure:
324325
"""
325326
Add a line plot with a logarithmic x-axis and linear y-axis for multiple datasets.
326327
@@ -394,6 +395,7 @@ def add_log_x_linear_y_multi_line_plot(
394395

395396
p.legend.location = "bottom_right"
396397
self.plots.append(p)
398+
return p
397399

398400
def finalize_report(self, output_filename: Path):
399401
"""

src/cloudai/workloads/nixl_bench/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,15 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17+
from .job_status_retrieval_strategy import NIXLBenchJobStatusRetrievalStrategy
1718
from .nixl_bench import NIXLBenchCmdArgs, NIXLBenchTestDefinition
19+
from .report_generation_strategy import NIXLBenchReportGenerationStrategy
1820
from .slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
1921

2022
__all__ = [
2123
"NIXLBenchCmdArgs",
24+
"NIXLBenchJobStatusRetrievalStrategy",
25+
"NIXLBenchReportGenerationStrategy",
2226
"NIXLBenchSlurmCommandGenStrategy",
2327
"NIXLBenchTestDefinition",
2428
]
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from pathlib import Path
18+
19+
from cloudai import JobStatusResult, JobStatusRetrievalStrategy
20+
21+
22+
class NIXLBenchJobStatusRetrievalStrategy(JobStatusRetrievalStrategy):
23+
"""Strategy to retrieve job status for NIXL Bench by checking 'stdout.txt'."""
24+
25+
def get_job_status(self, output_path: Path) -> JobStatusResult:
26+
stdout_path = output_path / "stdout.txt"
27+
if not stdout_path.exists():
28+
return JobStatusResult(
29+
is_successful=False,
30+
error_message=f"stdout.txt file not found in the specified output directory {output_path}.",
31+
)
32+
33+
has_header, has_data = False, False
34+
for line in stdout_path.read_text().splitlines():
35+
if "Block Size (B) Batch Size Avg Lat. (us) B/W (MiB/Sec) B/W (GiB/Sec) B/W (GB/Sec)" in line:
36+
has_header = True
37+
continue
38+
if has_header and len(line.split()) == 6:
39+
has_data = True
40+
break
41+
42+
if has_data:
43+
return JobStatusResult(is_successful=True)
44+
45+
if not has_header:
46+
return JobStatusResult(
47+
is_successful=False,
48+
error_message=f"NIXLBench results table not found in {stdout_path}.",
49+
)
50+
51+
return JobStatusResult(is_successful=False, error_message=f"NIXLBench data not found in {stdout_path}.")
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from __future__ import annotations
18+
19+
import logging
20+
from functools import cache
21+
from pathlib import Path
22+
from typing import TYPE_CHECKING, ClassVar
23+
24+
from cloudai import ReportGenerationStrategy
25+
from cloudai._core.test_scenario import METRIC_ERROR
26+
from cloudai.report_generator.tool.bokeh_report_tool import BokehReportTool
27+
from cloudai.util.lazy_imports import lazy
28+
29+
if TYPE_CHECKING:
30+
import pandas as pd
31+
32+
33+
@cache
34+
def extract_data(stdout_file: Path) -> pd.DataFrame:
35+
if not stdout_file.exists():
36+
logging.debug(f"{stdout_file} not found")
37+
return lazy.pd.DataFrame()
38+
39+
header_present, data = False, []
40+
for line in stdout_file.read_text().splitlines():
41+
if "Block Size (B) Batch Size Avg Lat. (us) B/W (MiB/Sec) B/W (GiB/Sec) B/W (GB/Sec)" in line:
42+
header_present = True
43+
continue
44+
if header_present and len(line.split()) == 6:
45+
data.append(line.split())
46+
47+
df = lazy.pd.DataFrame(
48+
data, columns=["block_size", "batch_size", "avg_lat", "bw_mib_sec", "bw_gib_sec", "bw_gb_sec"]
49+
)
50+
df["block_size"] = df["block_size"].astype(int)
51+
df["batch_size"] = df["batch_size"].astype(int)
52+
df["avg_lat"] = df["avg_lat"].astype(float)
53+
df["bw_mib_sec"] = df["bw_mib_sec"].astype(float)
54+
df["bw_gib_sec"] = df["bw_gib_sec"].astype(float)
55+
df["bw_gb_sec"] = df["bw_gb_sec"].astype(float)
56+
57+
return df
58+
59+
60+
class NIXLBenchReportGenerationStrategy(ReportGenerationStrategy):
61+
"""Strategy for generating reports from NIXL Bench directories."""
62+
63+
metrics: ClassVar[list[str]] = ["default", "latency"]
64+
65+
@property
66+
def results_file(self) -> Path:
67+
return self.test_run.output_path / "stdout.txt"
68+
69+
def can_handle_directory(self) -> bool:
70+
df = extract_data(self.results_file)
71+
return not df.empty
72+
73+
def generate_report(self) -> None:
74+
if not self.can_handle_directory():
75+
return
76+
77+
self.generate_bokeh_report()
78+
df = extract_data(self.results_file)
79+
df.to_csv(self.test_run.output_path / "nixlbench.csv", index=False)
80+
81+
def get_metric(self, metric: str) -> float:
82+
logging.debug(f"Getting metric {metric} from {self.results_file.absolute()}")
83+
df = extract_data(self.results_file)
84+
if df.empty or metric not in {"default", "latency"}:
85+
return METRIC_ERROR
86+
87+
return float(lazy.np.mean(df["avg_lat"]))
88+
89+
def generate_bokeh_report(self) -> None:
90+
df = extract_data(self.results_file)
91+
92+
report_tool = BokehReportTool(self.test_run.output_path)
93+
p = report_tool.add_log_x_linear_y_multi_line_plot(
94+
title="NIXL Bench Latency",
95+
df=df,
96+
x_column="block_size",
97+
y_columns=[("avg_lat", "blue")],
98+
x_axis_label="Block Size (B)",
99+
y_axis_label="Latency (us)",
100+
)
101+
p.width, p.height = 800, 500
102+
p = report_tool.add_log_x_linear_y_multi_line_plot(
103+
title="NIXL Bench Bandwidth",
104+
df=df,
105+
x_column="block_size",
106+
y_columns=[("bw_gb_sec", "blue")],
107+
x_axis_label="Block Size (B)",
108+
y_axis_label="Bandwidth (GB/Sec)",
109+
)
110+
p.width, p.height = 800, 500
111+
report_tool.finalize_report(Path("cloudai_nixlbench_bokeh_report.html"))
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# SPDX-FileCopyrightText: NVIDIA CORPORATION & AFFILIATES
2+
# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
# SPDX-License-Identifier: Apache-2.0
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
from pathlib import Path
18+
19+
from cloudai.workloads.nixl_bench import NIXLBenchJobStatusRetrievalStrategy
20+
21+
LOG_EXTRACT = """
22+
Num threads (--num_threads=N) : 1
23+
--------------------------------------------------------------------------------
24+
25+
Block Size (B) Batch Size Avg Lat. (us) B/W (MiB/Sec) B/W (GiB/Sec) B/W (GB/Sec)
26+
--------------------------------------------------------------------------------
27+
4096 1 6.36607 613.604 0.599223 0.643411
28+
8192 1 6.36806 1226.83 1.19807
29+
"""
30+
31+
32+
class TestNIXLBenchJobStatusRetrievalStrategy:
33+
def setup_method(self) -> None:
34+
self.js = NIXLBenchJobStatusRetrievalStrategy()
35+
36+
def test_no_file(self, tmp_path: Path) -> None:
37+
result = self.js.get_job_status(tmp_path)
38+
assert not result.is_successful
39+
assert result.error_message == f"stdout.txt file not found in the specified output directory {tmp_path}."
40+
41+
def test_no_header(self, tmp_path: Path) -> None:
42+
(tmp_path / "stdout.txt").write_text(LOG_EXTRACT.splitlines()[-1])
43+
result = self.js.get_job_status(tmp_path)
44+
assert not result.is_successful
45+
assert result.error_message == f"NIXLBench results table not found in {tmp_path / 'stdout.txt'}."
46+
47+
def test_no_data(self, tmp_path: Path) -> None:
48+
(tmp_path / "stdout.txt").write_text("\n".join(LOG_EXTRACT.splitlines()[:-2]))
49+
result = self.js.get_job_status(tmp_path)
50+
assert not result.is_successful
51+
assert result.error_message == f"NIXLBench data not found in {tmp_path / 'stdout.txt'}."
52+
53+
def test_successfull_job(self, tmp_path: Path) -> None:
54+
(tmp_path / "stdout.txt").write_text(LOG_EXTRACT)
55+
result = self.js.get_job_status(tmp_path)
56+
assert result.is_successful
57+
assert result.error_message == ""

tests/test_init.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -69,8 +69,11 @@
6969
NeMoRunSlurmCommandGenStrategy,
7070
NeMoRunTestDefinition,
7171
)
72-
from cloudai.workloads.nixl_bench.nixl_bench import NIXLBenchTestDefinition
73-
from cloudai.workloads.nixl_bench.slurm_command_gen_strategy import NIXLBenchSlurmCommandGenStrategy
72+
from cloudai.workloads.nixl_bench import (
73+
NIXLBenchJobStatusRetrievalStrategy,
74+
NIXLBenchSlurmCommandGenStrategy,
75+
NIXLBenchTestDefinition,
76+
)
7477
from cloudai.workloads.sleep import (
7578
SleepGradingStrategy,
7679
SleepKubernetesJsonGenStrategy,
@@ -164,7 +167,7 @@ def test_runners():
164167
(JobStatusRetrievalStrategy, SlurmSystem, UCCTestDefinition): DefaultJobStatusRetrievalStrategy,
165168
(JobStatusRetrievalStrategy, SlurmSystem, MegatronRunTestDefinition): DefaultJobStatusRetrievalStrategy,
166169
(JobStatusRetrievalStrategy, SlurmSystem, TritonInferenceTestDefinition): DefaultJobStatusRetrievalStrategy,
167-
(JobStatusRetrievalStrategy, SlurmSystem, NIXLBenchTestDefinition): DefaultJobStatusRetrievalStrategy,
170+
(JobStatusRetrievalStrategy, SlurmSystem, NIXLBenchTestDefinition): NIXLBenchJobStatusRetrievalStrategy,
168171
(JobStatusRetrievalStrategy, StandaloneSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
169172
(JobStatusRetrievalStrategy, LSFSystem, SleepTestDefinition): DefaultJobStatusRetrievalStrategy,
170173
(JobStatusRetrievalStrategy, RunAISystem, NCCLTestDefinition): DefaultJobStatusRetrievalStrategy,
@@ -186,7 +189,7 @@ def strategy2str(key: tuple) -> str:
186189
assert len(missing) == 0, f"Missing: {missing}"
187190
assert len(extra) == 0, f"Extra: {extra}"
188191
for key, value in ALL_STRATEGIES.items():
189-
assert strategies[key] == value
192+
assert strategies[key] == value, f"Strategy {strategy2str(key)} is not {value}"
190193

191194

192195
def test_installers():

tests/test_test_scenario.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
NeMoRunReportGenerationStrategy,
6464
NeMoRunTestDefinition,
6565
)
66+
from cloudai.workloads.nixl_bench import NIXLBenchReportGenerationStrategy, NIXLBenchTestDefinition
6667
from cloudai.workloads.sleep import SleepReportGenerationStrategy, SleepTestDefinition
6768
from cloudai.workloads.slurm_container import SlurmContainerReportGenerationStrategy, SlurmContainerTestDefinition
6869
from cloudai.workloads.triton_inference import TritonInferenceReportGenerationStrategy, TritonInferenceTestDefinition
@@ -456,7 +457,7 @@ def test_default(self):
456457
assert len(reporters) == 0
457458

458459
def test_default_reporters_size(self):
459-
assert len(Registry().reports_map) == 12
460+
assert len(Registry().reports_map) == 13
460461

461462
@pytest.mark.parametrize(
462463
"tdef,expected_reporters",
@@ -473,6 +474,7 @@ def test_default_reporters_size(self):
473474
(SlurmContainerTestDefinition, {SlurmContainerReportGenerationStrategy}),
474475
(UCCTestDefinition, {UCCTestReportGenerationStrategy}),
475476
(TritonInferenceTestDefinition, {TritonInferenceReportGenerationStrategy}),
477+
(NIXLBenchTestDefinition, {NIXLBenchReportGenerationStrategy}),
476478
],
477479
)
478480
def test_custom_reporters(self, tdef: Type[TestDefinition], expected_reporters: Set[ReportGenerationStrategy]):

0 commit comments

Comments
 (0)