@@ -13,7 +13,7 @@
 from tests.common.assertions import assert_no_msg_in_logs
 from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config

-MAX_QUEUE_SIZE = 30000
+MAX_QUEUE_SIZE = 5000


 @pytest.mark.parametrize(
@@ -76,12 +76,18 @@ def _datetime_to_minute(dt: datetime):

 def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_target: int, start_time: datetime):
     try:
-        scaling_target_index = capacity_time_series.index(scaling_target)
+        if scaling_target in capacity_time_series:
+            max_scaled_instances = scaling_target
+        else:
+            max_scaled_instances = max(capacity_time_series)
+            logging.warning(f"Cluster scaled to {max_scaled_instances} when expected target was {scaling_target}")
+
+        scaling_target_index = capacity_time_series.index(max_scaled_instances)
         timestamp_at_full_cluster_size = timestamps[scaling_target_index]
         scaling_target_time = datetime.datetime.fromtimestamp(
             float(timestamp_at_full_cluster_size), tz=datetime.timezone.utc
         )
-        return scaling_target_time, int((scaling_target_time - start_time).total_seconds())
+        return scaling_target_time, int((scaling_target_time - start_time).total_seconds()), max_scaled_instances
     except ValueError as e:
         logging.error("Cluster did not scale up to %d nodes", scaling_target)
         raise Exception(
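Reviewer note: a minimal standalone sketch of the new fallback behavior, mirroring the patched helper above. The capacity series, timestamps, and 1000-node target below are hypothetical values for illustration, not output from a real test run.

    import datetime
    import logging

    def get_scaling_time(capacity_time_series, timestamps, scaling_target, start_time):
        # Fall back to the observed peak when the cluster never reaches the target.
        if scaling_target in capacity_time_series:
            max_scaled_instances = scaling_target
        else:
            max_scaled_instances = max(capacity_time_series)
            logging.warning(f"Cluster scaled to {max_scaled_instances} when expected target was {scaling_target}")
        scaling_target_index = capacity_time_series.index(max_scaled_instances)
        scaling_target_time = datetime.datetime.fromtimestamp(
            float(timestamps[scaling_target_index]), tz=datetime.timezone.utc
        )
        # The third tuple element is new; callers unpack it as max_compute_nodes_up.
        return scaling_target_time, int((scaling_target_time - start_time).total_seconds()), max_scaled_instances

    # Hypothetical run: the cluster peaks at 900 nodes, short of the 1000-node target.
    start = datetime.datetime.fromtimestamp(1000.0, tz=datetime.timezone.utc)
    _, scale_up_seconds, max_nodes = get_scaling_time(
        [0, 300, 900, 850], [1000.0, 1060.0, 1120.0, 1180.0], 1000, start
    )
    print(scale_up_seconds, max_nodes)  # 120 900: timing is measured at the observed peak, not the unmet target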
@@ -299,8 +305,8 @@ def _scale_up_and_down(
     get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"), region)

     # Extract scale up duration and timestamp from the monitoring metrics collected above
-    _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
-    scaling_target_time, scale_up_time_scheduler = _get_scaling_time(
+    _, scale_up_time_ec2, _ = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
+    scaling_target_time, scale_up_time_scheduler, max_compute_nodes_up = _get_scaling_time(
         compute_nodes_time_series_up, timestamps, scaling_target, start_time
     )

@@ -331,7 +337,7 @@ def _scale_up_and_down(
         target_cluster_size=0,
     )
     # Extract scale down duration and timestamp from the monitoring metrics collected above
-    _, scale_down_time = _get_scaling_time(ec2_capacity_time_series_down, timestamps, 0, scale_down_start_timestamp)
+    _, scale_down_time, _ = _get_scaling_time(ec2_capacity_time_series_down, timestamps, 0, scale_down_start_timestamp)
     # Summarize the scaling metrics in a report (logs and metrics image)
     scaling_results = {
         "Region": region,
@@ -359,7 +365,8 @@ def _scale_up_and_down(
     # Verify that there was no EC2 over-scaling
     assert_that(max(ec2_capacity_time_series_up)).is_equal_to(scaling_target)
     # Verify that there was no Slurm nodes over-scaling
-    assert_that(max(compute_nodes_time_series_up)).is_equal_to(scaling_target)
+    with soft_assertions():
+        assert_that(max_compute_nodes_up).is_equal_to(scaling_target)
     # Verify all Slurm nodes were removed on scale down
     assert_that(compute_nodes_time_series_down[-1]).is_equal_to(0)

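Reviewer note: soft_assertions() is the assertpy context manager (the same library that provides assert_that); failures inside the block are collected and raised together when the block exits instead of aborting at the first mismatch. A small sketch with made-up values:

    from assertpy import assert_that, soft_assertions

    try:
        with soft_assertions():
            assert_that(900).is_equal_to(1000)  # recorded, does not abort the block
            assert_that(0).is_equal_to(0)       # still evaluated
    except AssertionError as e:
        print(e)  # one combined report listing every failed soft assertion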