|
1 | 1 | import json
|
2 | 2 | import logging
|
| 3 | +from concurrent.futures import ThreadPoolExecutor |
3 | 4 |
|
4 | 5 | import boto3
|
5 | 6 | import pytest
|
@@ -47,6 +48,24 @@ def starccm_installed(headnode):
|
47 | 48 | return False
|
48 | 49 |
|
49 | 50 |
|
def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath):
    """Submit a StarCCM+ Slurm job on ``number_of_nodes`` nodes and return its elapsed time.

    Args:
        remote_command_executor: executor used to run commands/scripts on the head node.
        scheduler_commands: scheduler helper used to track the submitted job.
        test_datadir: directory containing ``starccm.slurm.sh`` and ``starccm.results.sh``.
        number_of_nodes: compute nodes to run on; total tasks = ``number_of_nodes * TASK_VCPUS``.
        podkey: StarCCM+ Power-on-Demand key, forwarded to the job script.
        licpath: StarCCM+ license path, forwarded to the job script.

    Returns:
        float: elapsed seconds reported by ``starccm.results.sh`` for the completed job.
    """
    num_of_tasks = number_of_nodes * TASK_VCPUS
    # Log before submitting so the message is accurate even if submission fails;
    # lazy %-style args avoid eager string formatting.
    logging.info("Submitting StarCCM+ job with %s nodes", number_of_nodes)
    result = remote_command_executor.run_remote_command(
        f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"',
        additional_files=[str(test_datadir / "starccm.slurm.sh")],
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT)
    scheduler_commands.assert_job_succeeded(job_id)
    perf_test_result = remote_command_executor.run_remote_script(
        str(test_datadir / "starccm.results.sh"), args=[job_id], hide=False
    )
    observed_value = float(perf_test_result.stdout)
    logging.info("The elapsed time for %s nodes is %s seconds", number_of_nodes, observed_value)
    return observed_value
| 67 | + |
| 68 | + |
50 | 69 | @pytest.mark.parametrize(
|
51 | 70 | "number_of_nodes",
|
52 | 71 | [[8, 16, 32]],
|
@@ -88,21 +107,23 @@ def test_starccm(
|
88 | 107 | logging.info("StarCCM+ Installed")
|
89 | 108 | podkey, licpath = get_starccm_secrets(region)
|
90 | 109 | performance_degradation = {}
|
91 |
| - for node in number_of_nodes: |
92 |
| - num_of_tasks = node * TASK_VCPUS |
93 |
| - result = remote_command_executor.run_remote_command( |
94 |
| - f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"', |
95 |
| - additional_files=[str(test_datadir / "starccm.slurm.sh")], |
| 110 | + |
| 111 | + # Run 8 and 16 node tests in parallel |
| 112 | + with ThreadPoolExecutor(max_workers=2) as executor: |
| 113 | + future_8 = executor.submit( |
| 114 | + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 8, podkey, licpath |
96 | 115 | )
|
97 |
| - logging.info(f"Submitting StarCCM+ job with {node} nodes") |
98 |
| - job_id = scheduler_commands.assert_job_submitted(result.stdout) |
99 |
| - scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT) |
100 |
| - scheduler_commands.assert_job_succeeded(job_id) |
101 |
| - perf_test_result = remote_command_executor.run_remote_script( |
102 |
| - (str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False |
| 116 | + future_16 = executor.submit( |
| 117 | + run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 16, podkey, licpath |
103 | 118 | )
|
104 |
| - observed_value = float(perf_test_result.stdout) |
105 |
| - logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds") |
| 119 | + observed_value_8 = future_8.result() |
| 120 | + observed_value_16 = future_16.result() |
| 121 | + |
| 122 | + # Run 32 node test |
| 123 | + observed_value_32 = run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, 32, podkey, licpath) |
| 124 | + |
| 125 | + # Check results and log performance degradation |
| 126 | + for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]): |
106 | 127 | baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node]
|
107 | 128 | percentage_difference = perf_test_difference(observed_value, baseline_value)
|
108 | 129 | if percentage_difference < 0:
|
|
0 commit comments