Skip to content

Commit bedeb31

Browse files
authored
Extend OSU to run on all OSes (#5742)
Add OSU benchmark tests for all OSes. Signed-off-by: Judy Ng <njud@amazon.com>
1 parent 165a084 commit bedeb31

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

86 files changed

+1413
-11
lines changed
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
{%- import 'common.jinja2' as common with context -%}
2+
test-suites:
3+
efa:
4+
test_efa.py::test_efa:
5+
dimensions:
6+
- regions: [ "euw1-az1" ] # do not move, unless capacity reservation is moved as well
7+
instances: [ "c5n.18xlarge" ]
8+
oss: {{ common.OSS_COMMERCIAL_X86 }}
9+
schedulers: [ "slurm" ]

tests/integration-tests/tests/efa/test_efa.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,12 @@ def test_efa(
6161
remote_command_executor = RemoteCommandExecutor(cluster)
6262
scheduler_commands = scheduler_commands_factory(remote_command_executor)
6363

64-
_test_efa_installation(scheduler_commands, remote_command_executor, efa_installed=True, partition="efa-enabled")
65-
_test_mpi(remote_command_executor, slots_per_instance, scheduler, scheduler_commands, partition="efa-enabled")
66-
logging.info("Running on Instances: {0}".format(get_compute_nodes_instance_ids(cluster.cfn_name, region)))
64+
# Run the EFA tests only when the OSU benchmarks are not run for this instance (or the OS is rocky8); in the future, decouple the EFA tests from the OSU tests.
65+
# TODO: Remove this condition once OSU benchmark tests are decoupled from EFA tests
66+
if instance not in osu_benchmarks_instances or os == "rocky8":
67+
_test_efa_installation(scheduler_commands, remote_command_executor, efa_installed=True, partition="efa-enabled")
68+
_test_mpi(remote_command_executor, slots_per_instance, scheduler, scheduler_commands, partition="efa-enabled")
69+
logging.info("Running on Instances: {0}".format(get_compute_nodes_instance_ids(cluster.cfn_name, region)))
6770

6871
run_system_analyzer(cluster, scheduler_commands_factory, request, partition="efa-enabled")
6972

@@ -78,6 +81,7 @@ def test_efa(
7881
remote_command_executor,
7982
scheduler_commands,
8083
test_datadir,
84+
os,
8185
instance,
8286
slots_per_instance,
8387
partition="efa-enabled",
@@ -89,6 +93,7 @@ def test_efa(
8993
remote_command_executor,
9094
scheduler_commands,
9195
test_datadir,
96+
os,
9297
instance,
9398
num_instances=max_queue_size,
9499
slots_per_instance=slots_per_instance,
@@ -106,10 +111,13 @@ def test_efa(
106111
slots_per_instance,
107112
partition="efa-enabled",
108113
)
109-
_test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, partition="efa-enabled")
110114

111-
if instance == "p4d.24xlarge" and os != "centos7":
112-
_test_nccl_benchmarks(remote_command_executor, test_datadir, "openmpi", scheduler_commands)
115+
# TODO: Remove this condition once OSU benchmark tests are decoupled from EFA tests
116+
if instance not in osu_benchmarks_instances or os == "rocky8":
117+
_test_shm_transfer_is_enabled(scheduler_commands, remote_command_executor, partition="efa-enabled")
118+
119+
if instance == "p4d.24xlarge" and os != "centos7":
120+
_test_nccl_benchmarks(remote_command_executor, test_datadir, "openmpi", scheduler_commands)
113121

114122
assert_no_errors_in_logs(remote_command_executor, scheduler, skip_ice=True)
115123

@@ -140,7 +148,14 @@ def _test_efa_installation(scheduler_commands, remote_command_executor, efa_inst
140148

141149

142150
def _test_osu_benchmarks_pt2pt(
143-
mpi_version, remote_command_executor, scheduler_commands, test_datadir, instance, slots_per_instance, partition=None
151+
mpi_version,
152+
remote_command_executor,
153+
scheduler_commands,
154+
test_datadir,
155+
os,
156+
instance,
157+
slots_per_instance,
158+
partition=None,
144159
):
145160
# OSU pt2pt benchmarks cannot be executed with more than 2 MPI ranks.
146161
# Run them in 2 instances with 1 proc per instance, defined by map-by parameter.
@@ -161,7 +176,7 @@ def _test_osu_benchmarks_pt2pt(
161176
slots_per_instance,
162177
test_datadir,
163178
)
164-
failures = _check_osu_benchmarks_results(test_datadir, instance, mpi_version, benchmark_name, output)
179+
failures = _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output)
165180
if failures > accepted_number_of_failures:
166181
failed_benchmarks.append(f"{mpi_version}-{benchmark_name}")
167182

@@ -173,6 +188,7 @@ def _test_osu_benchmarks_collective(
173188
remote_command_executor,
174189
scheduler_commands,
175190
test_datadir,
191+
os,
176192
instance,
177193
num_instances,
178194
slots_per_instance,
@@ -195,7 +211,7 @@ def _test_osu_benchmarks_collective(
195211
test_datadir,
196212
timeout=24,
197213
)
198-
failures = _check_osu_benchmarks_results(test_datadir, instance, mpi_version, benchmark_name, output)
214+
failures = _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output)
199215
if failures > accepted_number_of_failures:
200216
failed_benchmarks.append(f"{mpi_version}-{benchmark_name}")
201217

@@ -241,15 +257,16 @@ def _test_osu_benchmarks_multiple_bandwidth(
241257
assert_that(float(max_bandwidth)).is_greater_than(expected_bandwidth)
242258

243259

244-
def _check_osu_benchmarks_results(test_datadir, instance, mpi_version, benchmark_name, output):
260+
def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output):
245261
logging.info(output)
246262
# Check avg latency for all packet sizes
247263
failures = 0
248264
metric_data = []
249265
metric_namespace = "ParallelCluster/test_efa"
250266
for packet_size, value in re.findall(r"(\d+)\s+(\d+)\.", output):
251267
with open(
252-
str(test_datadir / "osu_benchmarks" / "results" / instance / mpi_version / benchmark_name), encoding="utf-8"
268+
str(test_datadir / "osu_benchmarks" / "results" / os / instance / mpi_version / benchmark_name),
269+
encoding="utf-8",
253270
) as result:
254271
previous_result = re.search(rf"{packet_size}\s+(\d+)\.", result.read()).group(1)
255272

0 commit comments

Comments
 (0)