Skip to content

Commit ead8db6

Browse files
committed
[Test] Make it easier to detect and evaluate performance degradation/improvement with OSU.
In particular: 1. degradations/improvements are labelled in the logs; 2. the percentage degradation/improvement is included in the logs; 3. results are stored in the test output directory.
1 parent 534c891 commit ead8db6

File tree

2 files changed

+46
-8
lines changed

2 files changed

+46
-8
lines changed

tests/integration-tests/tests/common/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -529,3 +529,12 @@ def assert_no_file_handler_leak(init_compute_ip_to_num_files, remote_command_exe
529529
assert_that(current_compute_ip_to_num_files[compute_ip]).is_equal_to(
530530
init_compute_ip_to_num_files[compute_ip]
531531
)
532+
533+
534+
def write_file(dirname, filename, content):
    """Write ``content`` to ``filename`` inside ``dirname`` and return its path.

    The target directory (including any missing parents) is created first, so
    callers may pass an output location that does not exist yet. An existing
    file with the same name is overwritten.

    :param dirname: directory that will contain the file
    :param filename: name of the file to create inside ``dirname``
    :param content: text to write to the file
    :return: path of the file that was written
    """
    os.makedirs(dirname, exist_ok=True)
    # os.path.join avoids a doubled separator when dirname already ends with one.
    filepath = os.path.join(dirname, filename)
    # Explicit encoding keeps the written bytes independent of the host locale.
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
    logging.info("File written: %s", filepath)
    return filepath

tests/integration-tests/tests/performance_tests/test_osu.py

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,12 @@
1919

2020
from tests.common.assertions import assert_no_errors_in_logs
2121
from tests.common.osu_common import run_individual_osu_benchmark
22-
from tests.common.utils import fetch_instance_slots, get_installed_parallelcluster_version, run_system_analyzer
22+
from tests.common.utils import (
23+
fetch_instance_slots,
24+
get_installed_parallelcluster_version,
25+
run_system_analyzer,
26+
write_file,
27+
)
2328

2429
# We collected OSU benchmarks results for c5n.18xlarge only.
2530
OSU_BENCHMARKS_INSTANCES = ["c5n.18xlarge"]
@@ -61,6 +66,8 @@ def test_osu(
6166

6267
benchmark_failures = []
6368

69+
output_dir = request.config.getoption("output_dir")
70+
6471
# Run OSU benchmarks in efa-enabled queue.
6572
for mpi_version in mpi_variants:
6673
benchmark_failures.extend(
@@ -69,6 +76,7 @@ def test_osu(
6976
remote_command_executor,
7077
scheduler_commands,
7178
test_datadir,
79+
output_dir,
7280
os,
7381
instance,
7482
slots_per_instance,
@@ -81,6 +89,7 @@ def test_osu(
8189
remote_command_executor,
8290
scheduler_commands,
8391
test_datadir,
92+
output_dir,
8493
os,
8594
instance,
8695
num_instances=32,
@@ -108,6 +117,7 @@ def _test_osu_benchmarks_pt2pt(
108117
remote_command_executor,
109118
scheduler_commands,
110119
test_datadir,
120+
output_dir,
111121
os,
112122
instance,
113123
slots_per_instance,
@@ -120,10 +130,11 @@ def _test_osu_benchmarks_pt2pt(
120130
accepted_number_of_failures = 4
121131

122132
failed_benchmarks = []
133+
benchmark_group = "pt2pt"
123134
for benchmark_name in ["osu_latency", "osu_bibw"]:
124135
_, output = run_individual_osu_benchmark(
125136
mpi_version,
126-
"pt2pt",
137+
benchmark_group,
127138
benchmark_name,
128139
partition,
129140
remote_command_executor,
@@ -132,7 +143,9 @@ def _test_osu_benchmarks_pt2pt(
132143
slots_per_instance,
133144
test_datadir,
134145
)
135-
failures = _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output)
146+
failures = _check_osu_benchmarks_results(
147+
test_datadir, output_dir, os, instance, mpi_version, benchmark_name, output
148+
)
136149
if failures > accepted_number_of_failures:
137150
failed_benchmarks.append(f"{mpi_version}-{benchmark_name}")
138151

@@ -144,6 +157,7 @@ def _test_osu_benchmarks_collective(
144157
remote_command_executor,
145158
scheduler_commands,
146159
test_datadir,
160+
output_dir,
147161
os,
148162
instance,
149163
num_instances,
@@ -154,10 +168,11 @@ def _test_osu_benchmarks_collective(
154168
accepted_number_of_failures = 3
155169

156170
failed_benchmarks = []
171+
benchmark_group = "collective"
157172
for benchmark_name in ["osu_allgather", "osu_bcast", "osu_allreduce", "osu_alltoall"]:
158173
_, output = run_individual_osu_benchmark(
159174
mpi_version,
160-
"collective",
175+
benchmark_group,
161176
benchmark_name,
162177
partition,
163178
remote_command_executor,
@@ -167,7 +182,9 @@ def _test_osu_benchmarks_collective(
167182
test_datadir,
168183
timeout=24,
169184
)
170-
failures = _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output)
185+
failures = _check_osu_benchmarks_results(
186+
test_datadir, output_dir, os, instance, mpi_version, benchmark_name, output
187+
)
171188
if failures > accepted_number_of_failures:
172189
failed_benchmarks.append(f"{mpi_version}-{benchmark_name}")
173190

@@ -213,12 +230,13 @@ def _test_osu_benchmarks_multiple_bandwidth(
213230
assert_that(float(max_bandwidth)).is_greater_than(expected_bandwidth)
214231

215232

216-
def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, benchmark_name, output):
233+
def _check_osu_benchmarks_results(test_datadir, output_dir, os, instance, mpi_version, benchmark_name, output):
217234
logging.info(output)
218235
# Check avg latency for all packet sizes
219236
failures = 0
220237
metric_data = []
221238
metric_namespace = "ParallelCluster/test_efa"
239+
evaluation_output = ""
222240
for packet_size, value in re.findall(r"(\d+)\s+(\d+)\.", output):
223241
with open(
224242
str(test_datadir / "osu_benchmarks" / "results" / os / instance / mpi_version / benchmark_name),
@@ -236,11 +254,17 @@ def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, bench
236254

237255
is_failure = int(value) > tolerated_value
238256

257+
percentage_diff = (float(value) - float(tolerated_value)) / float(tolerated_value) * 100
258+
259+
outcome = "DEGRADATION" if percentage_diff > 0 else "IMPROVEMENT"
260+
239261
message = (
240-
f"{mpi_version} - {benchmark_name} - packet size {packet_size}: "
241-
f"tolerated: {tolerated_value}, current: {value}"
262+
f"{outcome} : {mpi_version} - {benchmark_name} - packet size {packet_size}: "
263+
f"tolerated: {tolerated_value}, current: {value}, percentage_diff: {percentage_diff}%"
242264
)
243265

266+
evaluation_output += f"\n{message}"
267+
244268
dimensions = {
245269
"PclusterVersion": get_installed_parallelcluster_version(),
246270
"MpiVariant": mpi_version,
@@ -263,6 +287,11 @@ def _check_osu_benchmarks_results(test_datadir, os, instance, mpi_version, bench
263287
logging.error(message)
264288
else:
265289
logging.info(message)
290+
write_file(
291+
dirname=f"{output_dir}/osu-results",
292+
filename=f"{os}-{instance}-{mpi_version}-{benchmark_name}-evaluation.out",
293+
content=evaluation_output,
294+
)
266295
boto3.client("cloudwatch").put_metric_data(Namespace=metric_namespace, MetricData=metric_data)
267296

268297
return failures

0 commit comments

Comments
 (0)