@@ -257,6 +257,9 @@ def test_slurm_scaling(
     cluster = clusters_factory(cluster_config)
     remote_command_executor = RemoteCommandExecutor(cluster)
     scheduler_commands = scheduler_commands_factory(remote_command_executor)
+    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
+    # We must address it and restore the default wait time to 300s.
+    stop_max_delay_secs = 400 if (os.startswith("rocky") or os.startswith("rhel")) else 300

     _assert_cluster_initial_conditions(scheduler_commands, 20, 20, 4)
     _test_online_node_configured_correctly(
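For context on what this new knob controls: `stop_max_delay_secs` is threaded through the `_test_*` helpers below and ultimately bounds how long the node-state waiters keep polling before giving up. A minimal sketch of such a bounded poll, assuming a plain fixed-interval loop (the helper name `wait_until` and its defaults are illustrative, not the suite's actual `wait_for_compute_nodes_states` implementation):

    import time


    def wait_until(check, wait_fixed_secs=20, stop_max_delay_secs=300):
        # Poll `check` every wait_fixed_secs until it returns True or the deadline expires.
        deadline = time.time() + stop_max_delay_secs
        while time.time() < deadline:
            if check():
                return
            time.sleep(wait_fixed_secs)
        raise AssertionError("condition not met within {}s".format(stop_max_delay_secs))

With the change above, Rocky and RHEL clusters get a 400s deadline while every other OS keeps the previous 300s default.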
@@ -284,18 +287,19 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_replace_down_nodes(
         remote_command_executor,
         scheduler_commands,
         test_datadir,
         cluster.cfn_name,
         region,
-        os,
         partition="ondemand1",
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     _test_keep_or_replace_suspended_nodes(
         scheduler_commands,
@@ -305,6 +309,7 @@ def test_slurm_scaling(
         num_static_nodes=2,
         num_dynamic_nodes=3,
         dynamic_instance_type=instance,
+        stop_max_delay_secs=stop_max_delay_secs,
     )
     assert_no_errors_in_logs(remote_command_executor, scheduler)

@@ -1139,7 +1144,14 @@ def _test_partition_states(


 def _test_reset_terminated_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """
     Test that slurm nodes are reset if instances are terminated manually.
@@ -1162,7 +1174,7 @@ def _test_reset_terminated_nodes(
     # terminate all instances manually
     _terminate_nodes_manually(instance_ids, region)
     # Assert that cluster replaced static node and reset dynamic nodes
-    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes)
+    _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))


@@ -1172,11 +1184,11 @@ def _test_replace_down_nodes(
     test_datadir,
     cluster_name,
     region,
-    os,
     partition,
     num_static_nodes,
     num_dynamic_nodes,
     dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test that slurm nodes are replaced if nodes are marked DOWN."""
     logging.info("Testing that nodes replaced when set to down state")
@@ -1196,22 +1208,28 @@ def _test_replace_down_nodes(
         remote_command_executor.run_remote_script(str(test_datadir / "slurm_kill_slurmd_job.sh"), args=[node])
     # set dynamic to down manually
     _set_nodes_to_down_manually(scheduler_commands, dynamic_nodes)
-    # TOFIX We observe in 3.13.0 an increase in the bootstrap time for Rocky and RHEL.
-    # We must address it and restore the default wait time to 300s.
-    stop_max_delay_secs = 360 if os.startswith("rocky") else 300
     _wait_for_node_reset(scheduler_commands, static_nodes, dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs)
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))


 def _test_keep_or_replace_suspended_nodes(
-    scheduler_commands, cluster_name, region, partition, num_static_nodes, num_dynamic_nodes, dynamic_instance_type
+    scheduler_commands,
+    cluster_name,
+    region,
+    partition,
+    num_static_nodes,
+    num_dynamic_nodes,
+    dynamic_instance_type,
+    stop_max_delay_secs,
 ):
     """Test keep DRAIN nodes if there is job running, or terminate if no job is running."""
     logging.info(
         "Testing that nodes are NOT terminated when set to suspend state and there is job running on the nodes"
     )
     job_id = submit_initial_job(
         scheduler_commands,
+        # Job running time should be at least longer than the `_wait_for_node_reset` timeout
+        # plus the `_assert_nodes_not_terminated` time
         "sleep 550",
         partition,
         dynamic_instance_type,
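The new comment above encodes a timing budget: the submitted job must outlive the node-reset wait plus the not-terminated assertion window, so that the DRAINING dynamic nodes still have a running job while they are being checked. A rough, illustrative check with the values introduced in this change (Rocky/RHEL worst case; in practice the waits return as soon as the expected states are reached):

    job_runtime_secs = 550               # "sleep 550" submitted above
    node_reset_deadline_secs = 400       # stop_max_delay_secs on Rocky/RHEL
    not_terminated_window_secs = 2 * 60  # _assert_nodes_not_terminated default (see below)
    assert job_runtime_secs > node_reset_deadline_secs + not_terminated_window_secs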
@@ -1224,13 +1242,17 @@ def _test_keep_or_replace_suspended_nodes(
     # Set all nodes to drain, static should be in DRAINED and dynamic in DRAINING
     _set_nodes_to_suspend_state_manually(scheduler_commands, static_nodes + dynamic_nodes)
     # Static nodes in DRAINED are immediately replaced
-    _wait_for_node_reset(scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[])
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=static_nodes, dynamic_nodes=[], stop_max_delay_secs=stop_max_delay_secs
+    )
     # Assert dynamic nodes in DRAINING are not terminated during job run
     _assert_nodes_not_terminated(scheduler_commands, dynamic_nodes)
     # wait until the job is completed and check that the DRAINING dynamic nodes are then terminated
     scheduler_commands.wait_job_completed(job_id)
     scheduler_commands.assert_job_succeeded(job_id)
-    _wait_for_node_reset(scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes)
+    _wait_for_node_reset(
+        scheduler_commands, static_nodes=[], dynamic_nodes=dynamic_nodes, stop_max_delay_secs=stop_max_delay_secs
+    )
     assert_num_instances_in_cluster(cluster_name, region, len(static_nodes))


@@ -1415,6 +1437,8 @@ def _wait_for_node_reset(
             wait_fixed_secs=wait_fixed_secs,
             stop_max_delay_secs=stop_max_delay_secs,
         )
+        # Add delay to accommodate node replacement process (~45s between node down status and replacement)
+        time.sleep(45)
         logging.info("Assert static nodes are replaced")
         wait_for_compute_nodes_states(
             scheduler_commands,
@@ -1443,10 +1467,10 @@ def _assert_node_addr_host_reset(addr_host_list, nodes):
         assert_that(addr_host_list).contains("{0} {0} {0}".format(nodename))


-def _assert_nodes_not_terminated(scheduler_commands, nodes, timeout=5):
-    logging.info("Waiting for cluster daemon action")
+def _assert_nodes_not_terminated(scheduler_commands, nodes, waiting_time=2):
+    logging.info("Assert the job is still running for {} minutes on DRAINING dynamic nodes.".format(waiting_time))
     start_time = time.time()
-    while time.time() < start_time + 60 * (timeout):
+    while time.time() < start_time + 60 * (waiting_time):
         assert_that(set(nodes) <= set(scheduler_commands.get_compute_nodes())).is_true()
         time.sleep(20)
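Reassembled from the hunk above for readability, the helper after this change reads roughly as follows: it re-checks cluster membership every 20 seconds for `waiting_time` minutes and fails as soon as any of the given nodes disappears from the compute fleet.

    def _assert_nodes_not_terminated(scheduler_commands, nodes, waiting_time=2):
        logging.info("Assert the job is still running for {} minutes on DRAINING dynamic nodes.".format(waiting_time))
        start_time = time.time()
        while time.time() < start_time + 60 * (waiting_time):
            assert_that(set(nodes) <= set(scheduler_commands.get_compute_nodes())).is_true()
            time.sleep(20)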