@@ -61,9 +61,12 @@ def test_efa(
61
61
remote_command_executor = RemoteCommandExecutor (cluster )
62
62
scheduler_commands = scheduler_commands_factory (remote_command_executor )
63
63
64
- _test_efa_installation (scheduler_commands , remote_command_executor , efa_installed = True , partition = "efa-enabled" )
65
- _test_mpi (remote_command_executor , slots_per_instance , scheduler , scheduler_commands , partition = "efa-enabled" )
66
- logging .info ("Running on Instances: {0}" .format (get_compute_nodes_instance_ids (cluster .cfn_name , region )))
64
+ # Run the EFA tests only when the instance is not in osu_benchmarks_instances or the OS is rocky8; in the future, decouple the EFA tests from the OSU tests
65
+ # TODO: Remove this condition once OSU benchmark tests are decoupled from EFA tests
66
+ if instance not in osu_benchmarks_instances or os == "rocky8" :
67
+ _test_efa_installation (scheduler_commands , remote_command_executor , efa_installed = True , partition = "efa-enabled" )
68
+ _test_mpi (remote_command_executor , slots_per_instance , scheduler , scheduler_commands , partition = "efa-enabled" )
69
+ logging .info ("Running on Instances: {0}" .format (get_compute_nodes_instance_ids (cluster .cfn_name , region )))
67
70
68
71
run_system_analyzer (cluster , scheduler_commands_factory , request , partition = "efa-enabled" )
69
72
@@ -78,6 +81,7 @@ def test_efa(
78
81
remote_command_executor ,
79
82
scheduler_commands ,
80
83
test_datadir ,
84
+ os ,
81
85
instance ,
82
86
slots_per_instance ,
83
87
partition = "efa-enabled" ,
@@ -89,6 +93,7 @@ def test_efa(
89
93
remote_command_executor ,
90
94
scheduler_commands ,
91
95
test_datadir ,
96
+ os ,
92
97
instance ,
93
98
num_instances = max_queue_size ,
94
99
slots_per_instance = slots_per_instance ,
@@ -106,10 +111,13 @@ def test_efa(
106
111
slots_per_instance ,
107
112
partition = "efa-enabled" ,
108
113
)
109
- _test_shm_transfer_is_enabled (scheduler_commands , remote_command_executor , partition = "efa-enabled" )
110
114
111
- if instance == "p4d.24xlarge" and os != "centos7" :
112
- _test_nccl_benchmarks (remote_command_executor , test_datadir , "openmpi" , scheduler_commands )
115
+ # TODO: Remove this condition once OSU benchmark tests are decoupled from EFA tests
116
+ if instance not in osu_benchmarks_instances or os == "rocky8" :
117
+ _test_shm_transfer_is_enabled (scheduler_commands , remote_command_executor , partition = "efa-enabled" )
118
+
119
+ if instance == "p4d.24xlarge" and os != "centos7" :
120
+ _test_nccl_benchmarks (remote_command_executor , test_datadir , "openmpi" , scheduler_commands )
113
121
114
122
assert_no_errors_in_logs (remote_command_executor , scheduler , skip_ice = True )
115
123
@@ -140,7 +148,14 @@ def _test_efa_installation(scheduler_commands, remote_command_executor, efa_inst
140
148
141
149
142
150
def _test_osu_benchmarks_pt2pt (
143
- mpi_version , remote_command_executor , scheduler_commands , test_datadir , instance , slots_per_instance , partition = None
151
+ mpi_version ,
152
+ remote_command_executor ,
153
+ scheduler_commands ,
154
+ test_datadir ,
155
+ os ,
156
+ instance ,
157
+ slots_per_instance ,
158
+ partition = None ,
144
159
):
145
160
# OSU pt2pt benchmarks cannot be executed with more than 2 MPI ranks.
146
161
# Run them in 2 instances with 1 proc per instance, defined by map-by parameter.
@@ -161,7 +176,7 @@ def _test_osu_benchmarks_pt2pt(
161
176
slots_per_instance ,
162
177
test_datadir ,
163
178
)
164
- failures = _check_osu_benchmarks_results (test_datadir , instance , mpi_version , benchmark_name , output )
179
+ failures = _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output )
165
180
if failures > accepted_number_of_failures :
166
181
failed_benchmarks .append (f"{ mpi_version } -{ benchmark_name } " )
167
182
@@ -173,6 +188,7 @@ def _test_osu_benchmarks_collective(
173
188
remote_command_executor ,
174
189
scheduler_commands ,
175
190
test_datadir ,
191
+ os ,
176
192
instance ,
177
193
num_instances ,
178
194
slots_per_instance ,
@@ -195,7 +211,7 @@ def _test_osu_benchmarks_collective(
195
211
test_datadir ,
196
212
timeout = 24 ,
197
213
)
198
- failures = _check_osu_benchmarks_results (test_datadir , instance , mpi_version , benchmark_name , output )
214
+ failures = _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output )
199
215
if failures > accepted_number_of_failures :
200
216
failed_benchmarks .append (f"{ mpi_version } -{ benchmark_name } " )
201
217
@@ -241,15 +257,16 @@ def _test_osu_benchmarks_multiple_bandwidth(
241
257
assert_that (float (max_bandwidth )).is_greater_than (expected_bandwidth )
242
258
243
259
244
- def _check_osu_benchmarks_results (test_datadir , instance , mpi_version , benchmark_name , output ):
260
+ def _check_osu_benchmarks_results (test_datadir , os , instance , mpi_version , benchmark_name , output ):
245
261
logging .info (output )
246
262
# Check avg latency for all packet sizes
247
263
failures = 0
248
264
metric_data = []
249
265
metric_namespace = "ParallelCluster/test_efa"
250
266
for packet_size , value in re .findall (r"(\d+)\s+(\d+)\." , output ):
251
267
with open (
252
- str (test_datadir / "osu_benchmarks" / "results" / instance / mpi_version / benchmark_name ), encoding = "utf-8"
268
+ str (test_datadir / "osu_benchmarks" / "results" / os / instance / mpi_version / benchmark_name ),
269
+ encoding = "utf-8" ,
253
270
) as result :
254
271
previous_result = re .search (rf"{ packet_size } \s+(\d+)\." , result .read ()).group (1 )
255
272
0 commit comments