Commit 25f4f38

[Integ-Test] Fix test_slurm_scaling by increasing job running time and setting stop_max_delay_secs based on the OS (#6737)
* Use c5.xlarge instead of the hpc5a instance type; the Slurm job is just a sleep.
* We noticed that bootstrap time on Rocky and RHEL is noticeably longer than on other OSes: `c5.xlarge` bootstrap time on Rocky and RHEL is around 6 minutes. Set `stop_max_delay_secs` based on the OS and increase the `wait_for_num_nodes_in_scheduler` timeout to 7 minutes.
* Ensure the job running time is longer than the `_wait_for_node_reset` timeout plus the `_assert_nodes_not_terminated` waiting time.
* Rename a parameter to improve readability.
* Add a 45-second delay to accommodate the node replacement process (~45s between a node reaching the down status and its replacement).
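As a quick sanity check of the third point: the worst-case `_wait_for_node_reset` window (400s on Rocky/RHEL) plus the two minutes that `_assert_nodes_not_terminated` keeps polling still fits inside the job's `sleep 550`. A rough sketch of that arithmetic (variable names here are illustrative, not part of the change):

```python
# Illustrative timing-budget check, not part of the commit.
wait_for_node_reset_secs = 400       # worst-case stop_max_delay_secs (Rocky/RHEL); 300 elsewhere
nodes_not_terminated_secs = 2 * 60   # _assert_nodes_not_terminated polls for waiting_time=2 minutes
job_sleep_secs = 550                 # the test submits "sleep 550"

# The job must still be running while both waits complete (400 + 120 = 520 < 550).
assert job_sleep_secs > wait_for_node_reset_secs + nodes_not_terminated_secs
```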
1 parent: ead8db6

3 files changed: +43 -16 lines changed


tests/integration-tests/configs/develop.yaml

Lines changed: 3 additions & 2 deletions
```diff
@@ -432,8 +432,9 @@ test-suites:
           schedulers: ["slurm"]
     test_slurm.py::test_slurm_scaling:
       dimensions:
-        - regions: ["use2-az2"] # do not move, unless instance type support is moved as well
-          instances: [{{ common.instance("instance_type_1") }}]
+        - regions: ["us-east-2"] # We used to test it with hpc5a.48xlarge; if that's the case, set it to use2-az2
+          # If you want to test with hpc5a.48xlarge, set the instance type to [{{ common.instance("instance_type_1") }}]
+          instances: {{ common.INSTANCES_DEFAULT_X86 }}
           oss: [{{ OS_X86_4 }}]
           schedulers: [ "slurm" ]
     test_slurm.py::test_error_handling:
```

tests/integration-tests/tests/common/hit_common.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -85,7 +85,9 @@ def assert_compute_node_reasons(scheduler_commands, compute_nodes, expected_reason):
         assert_that(node_info).contains(f"Reason={expected_reason}")
 
 
-@retry(wait_fixed=seconds(20), stop_max_delay=minutes(6))
+# TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
+# We must address it and restore the default wait time to 5 minutes.
+@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
 def wait_for_num_nodes_in_scheduler(scheduler_commands, desired, filter_by_partition=None):
     assert_num_nodes_in_scheduler(scheduler_commands, desired, filter_by_partition)
 
```

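In this decorator, `wait_fixed` is the pause between attempts and `stop_max_delay` is the cap on the total retry window, so raising it from 6 to 7 minutes widens the overall deadline without changing the 20-second polling interval. A minimal self-contained sketch of the same pattern, assuming the `retrying` package (the `seconds`/`minutes` helpers below are local stand-ins for the test suite's time utilities and convert to the milliseconds `retrying` expects):

```python
# Sketch only: mirrors the retry pattern used above, assuming the `retrying` package.
from retrying import retry


def seconds(n):
    # Stand-in for the test suite's time helper; `retrying` expects milliseconds.
    return n * 1000


def minutes(n):
    return n * 60 * 1000


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(7))
def wait_until_ready(check):
    # Any exception (including a failed assertion) triggers another attempt
    # 20 seconds later, until the 7-minute overall deadline is exceeded.
    assert check(), "condition not met yet"
```

With these values the condition is evaluated roughly every 20 seconds, about 21 times, before the decorator gives up and re-raises the last failure.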
tests/integration-tests/tests/schedulers/test_slurm.py

Lines changed: 37 additions & 13 deletions
```diff
@@ -257,6 +257,9 @@ def test_slurm_scaling(
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)
     scheduler_commands = scheduler_commands_factory(remote_command_executor)
+    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
+    # We must address it and restore the default wait time to 300s.
+    stop_max_delay_secs = 400 if (os.startswith("rocky") or os.startswith("rhel")) else 300
 
     _assert_cluster_initial_conditions(scheduler_commands, 20, 20, 4)
     _test_online_node_configured_correctly(
@@ -284,18 +287,19 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_replace_down_nodes(
         remote_command_executor,
         scheduler_commands,
         test_datadir,
         cluster.cfn_name,
         region,
-        os,
         partition="ondemand1",
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_keep_or_replace_suspended_nodes(
         scheduler_commands,
@@ -305,6 +309,7 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     assert_no_errors_in_logs(remote_command_executor, scheduler)
 
@@ -1139,7 +1144,14 @@ def _test_partition_states(
 
 
 def _test_reset_terminated_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """
     Test that slurm nodes are reset if instances are terminated manually.
@@ -1162,7 +1174,7 @@ def _test_reset_terminated_nodes(
     # terminate all instances manually
     _terminate_nodes_manually(instance_ids, region)
     # Assert that cluster replaced static node and reset dynamic nodes
-    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes)
+    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
@@ -1172,11 +1184,11 @@ def _test_replace_down_nodes(
     test_datadir,
     cluster_name,
     region,
-    os,
     partition,
     num_static_nodes,
     num_dynamic_nodes,
     dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test that slurm nodes are replaced if nodes are marked DOWN."""
     logging.info("Testing that nodes replaced when set to down state")
@@ -1196,22 +1208,28 @@ def _test_replace_down_nodes(
         remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_slurmd_job.sh"), args=[node])
     # set dynamic to down manually
     _set_nodes_to_down_manually(scheduler_commands, dynamic_nodes)
-    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
-    # We must address it and restore the default wait time to 300s.
-    stop_max_delay_secs = 360 if os.startswith("rocky") else 300
     _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
 def _test_keep_or_replace_suspended_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test keep DRAIN nodes if there is job running, or terminate if no job is running."""
     logging.info(
         "Testing that nodes are NOT terminated when set to suspend state and there is job running on the nodes"
     )
     job_id = submit_initial_job(
         scheduler_commands,
+        # Job running time should be longer than the `_wait_for_node_reset` timeout
+        # plus the `_assert_nodes_not_terminated` waiting time
         "sleep 550",
         partition,
         dynamic_instance_type,
@@ -1224,13 +1242,17 @@ def _test_keep_or_replace_suspended_nodes(
     # Set all nodes to drain, static should be in DRAINED and dynamic in DRAINING
     _set_nodes_to_suspend_state_manually(scheduler_commands, static_nodes + dynamic_nodes)
     # Static nodes in DRAINED are immediately replaced
-    _wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[], stop_max_delay_secs=stop_max_delay_secs
+    )
     # Assert dynamic nodes in DRAINING are not terminated during job run
     _assert_nodes_not_terminated(scheduler_commands, dynamic_nodes)
     # wait until the job is completed and check that the DRAINING dynamic nodes are then terminated
     scheduler_commands.wait_job_completed(job_id)
     scheduler_commands.assert_job_succeeded(job_id)
-    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs
+    )
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))
 
 
@@ -1415,6 +1437,8 @@ def _wait_for_node_reset(
         wait_fixed_secs=wait_fixed_secs,
         stop_max_delay_secs=stop_max_delay_secs,
     )
+    # Add delay to accommodate node replacement process (~45s between node down status and replacement)
+    time.sleep(45)
     logging.info("Assert static nodes are replaced")
     wait_for_compute_nodes_states(
         scheduler_commands,
@@ -1443,10 +1467,10 @@ def _assert_node_addr_host_reset(addr_host_list, nodes):
     assert_that(addr_host_list).contains("{0} {0} {0}".format(nodename))
 
 
-def _assert_nodes_not_terminated(scheduler_commands, nodes, timeout=5):
-    logging.info("Waiting for cluster daemon action")
+def _assert_nodes_not_terminated(scheduler_commands, nodes, waiting_time=2):
+    logging.info("Assert the job still running for {} minutes on DRAINING dynamic nodes.".format(waiting_time))
     start_time = time.time()
-    while time.time() < start_time + 60 * (timeout):
+    while time.time() < start_time + 60 * (waiting_time):
         assert_that(set(nodes) <= set(scheduler_commands.get_compute_nodes())).is_true()
         time.sleep(20)
 
```
