Skip to content

Commit bcadc9e

Browse files
committed
Run 8 and 16 node performance tests in parallel
1 parent 978ecf7 commit bcadc9e

File tree

2 files changed

+65
-26
lines changed

2 files changed

+65
-26
lines changed

tests/integration-tests/tests/performance_tests/test_openfoam.py

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from concurrent.futures.thread import ThreadPoolExecutor
23

34
import pytest
45
from remote_command_executor import RemoteCommandExecutionError, RemoteCommandExecutor
@@ -33,6 +34,23 @@ def openfoam_installed(headnode):
3334
return False
3435

3536

37+
def run_openfoam_test(remote_command_executor, test_datadir, number_of_nodes):
    """Run the OpenFOAM benchmark on ``number_of_nodes`` nodes and return the elapsed seconds.

    Uploads the Slurm wrapper script to the cluster, runs it synchronously,
    then runs the results script, whose final stdout line is the elapsed
    time in whole seconds.
    """
    benchmarks_dir = "/shared/SubspaceBenchmarks"
    logging.info(f"Submitting OpenFOAM job with {number_of_nodes} nodes")
    # Blocks until the benchmark finishes or OPENFOAM_JOB_TIMEOUT expires.
    remote_command_executor.run_remote_command(
        f'bash openfoam.slurm.sh "{benchmarks_dir}" "{number_of_nodes}" 2>&1',
        additional_files=[str(test_datadir / "openfoam.slurm.sh")],
        timeout=OPENFOAM_JOB_TIMEOUT,
    )
    results = remote_command_executor.run_remote_script(str(test_datadir / "openfoam.results.sh"), hide=False)
    # The elapsed time is printed as the last line of the results script output.
    trimmed = results.stdout.strip()
    elapsed_seconds = int(trimmed.rsplit("\n", 1)[-1].strip())
    logging.info(f"The elapsed time for {number_of_nodes} nodes is {elapsed_seconds} seconds")
    return elapsed_seconds
52+
53+
3654
@pytest.mark.parametrize(
3755
"number_of_nodes",
3856
[[8, 16, 32]],
@@ -59,19 +77,19 @@ def test_openfoam(
5977
)
6078
logging.info("OpenFOAM Installed")
6179
performance_degradation = {}
62-
subspace_benchmarks_dir = "/shared/SubspaceBenchmarks"
63-
for node in number_of_nodes:
64-
logging.info(f"Submitting OpenFOAM job with {node} nodes")
65-
remote_command_executor.run_remote_command(
66-
f'bash openfoam.slurm.sh "{subspace_benchmarks_dir}" "{node}" 2>&1',
67-
additional_files=[str(test_datadir / "openfoam.slurm.sh")],
68-
timeout=OPENFOAM_JOB_TIMEOUT,
69-
)
70-
perf_test_result = remote_command_executor.run_remote_script(
71-
(str(test_datadir / "openfoam.results.sh")), hide=False
72-
)
73-
output = perf_test_result.stdout.strip()
74-
observed_value = int(output.split("\n")[-1].strip())
80+
81+
# Run 8 and 16 node tests in parallel
82+
with ThreadPoolExecutor(max_workers=2) as executor:
83+
future_8 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 8)
84+
future_16 = executor.submit(run_openfoam_test, remote_command_executor, test_datadir, 16)
85+
observed_value_8 = future_8.result()
86+
observed_value_16 = future_16.result()
87+
88+
# Run 32 node test
89+
observed_value_32 = run_openfoam_test(remote_command_executor, test_datadir, 32)
90+
91+
# Check results and log performance degradation
92+
for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]):
7593
baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node]
7694
logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds")
7795
percentage_difference = perf_test_difference(observed_value, baseline_value)

tests/integration-tests/tests/performance_tests/test_starccm.py

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import logging
3+
from concurrent.futures import ThreadPoolExecutor
34

45
import boto3
56
import pytest
@@ -47,6 +48,24 @@ def starccm_installed(headnode):
4748
return False
4849

4950

51+
def run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, number_of_nodes, podkey, licpath):
    """Run the StarCCM+ benchmark on ``number_of_nodes`` nodes and return the elapsed seconds.

    Submits the job via ``sbatch`` with ``number_of_nodes * TASK_VCPUS`` tasks,
    waits for it to complete successfully, then runs the results script and
    parses its stdout as a float number of seconds.

    :param remote_command_executor: executor for commands/scripts on the head node
    :param scheduler_commands: Slurm helper (submit assertion, wait, success assertion)
    :param test_datadir: directory containing starccm.slurm.sh and starccm.results.sh
    :param number_of_nodes: cluster size for this run
    :param podkey: StarCCM+ Power-on-Demand license key
    :param licpath: StarCCM+ license server path
    :return: observed elapsed time in seconds (float)
    """
    num_of_tasks = number_of_nodes * TASK_VCPUS
    # Log before submitting, consistent with run_openfoam_test (previously this
    # line ran only after the sbatch command had already been issued).
    logging.info(f"Submitting StarCCM+ job with {number_of_nodes} nodes")
    result = remote_command_executor.run_remote_command(
        f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"',
        additional_files=[str(test_datadir / "starccm.slurm.sh")],
    )
    job_id = scheduler_commands.assert_job_submitted(result.stdout)
    scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT)
    scheduler_commands.assert_job_succeeded(job_id)
    perf_test_result = remote_command_executor.run_remote_script(
        str(test_datadir / "starccm.results.sh"), args=[job_id], hide=False
    )
    observed_value = float(perf_test_result.stdout)
    logging.info(f"The elapsed time for {number_of_nodes} nodes is {observed_value} seconds")
    return observed_value
67+
68+
5069
@pytest.mark.parametrize(
5170
"number_of_nodes",
5271
[[8, 16, 32]],
@@ -88,21 +107,23 @@ def test_starccm(
88107
logging.info("StarCCM+ Installed")
89108
podkey, licpath = get_starccm_secrets(region)
90109
performance_degradation = {}
91-
for node in number_of_nodes:
92-
num_of_tasks = node * TASK_VCPUS
93-
result = remote_command_executor.run_remote_command(
94-
f'sbatch --ntasks={num_of_tasks} starccm.slurm.sh "{podkey}" "{licpath}"',
95-
additional_files=[str(test_datadir / "starccm.slurm.sh")],
110+
111+
# Run 8 and 16 node tests in parallel
112+
with ThreadPoolExecutor(max_workers=2) as executor:
113+
future_8 = executor.submit(
114+
run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 8, podkey, licpath
96115
)
97-
logging.info(f"Submitting StarCCM+ job with {node} nodes")
98-
job_id = scheduler_commands.assert_job_submitted(result.stdout)
99-
scheduler_commands.wait_job_completed(job_id, timeout=STARCCM_JOB_TIMEOUT)
100-
scheduler_commands.assert_job_succeeded(job_id)
101-
perf_test_result = remote_command_executor.run_remote_script(
102-
(str(test_datadir / "starccm.results.sh")), args=[job_id], hide=False
116+
future_16 = executor.submit(
117+
run_starccm_test, remote_command_executor, scheduler_commands, test_datadir, 16, podkey, licpath
103118
)
104-
observed_value = float(perf_test_result.stdout)
105-
logging.info(f"The elapsed time for {node} nodes is {observed_value} seconds")
119+
observed_value_8 = future_8.result()
120+
observed_value_16 = future_16.result()
121+
122+
# Run 32 node test
123+
observed_value_32 = run_starccm_test(remote_command_executor, scheduler_commands, test_datadir, 32, podkey, licpath)
124+
125+
# Check results and log performance degradation
126+
for node, observed_value in zip(number_of_nodes, [observed_value_8, observed_value_16, observed_value_32]):
106127
baseline_value = BASELINE_CLUSTER_SIZE_ELAPSED_SECONDS[os][node]
107128
percentage_difference = perf_test_difference(observed_value, baseline_value)
108129
if percentage_difference < 0:

0 commit comments

Comments
 (0)