Skip to content

Commit 6d8e35a

Browse files
judysng authored and enrico-usai committed
Decouple OSU benchmarks from EFA integration tests (#5928)
* Decouple OSU test from EFA tests
Signed-off-by: Judy Ng <njud@amazon.com>
Signed-off-by: Enrico Usai <usai@amazon.com>
1 parent f0a8394 commit 6d8e35a

File tree

101 files changed

+343
-363
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

101 files changed

+343
-363
lines changed

tests/integration-tests/configs/ad_integration.yaml

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,3 @@ test-suites:
88
instances: {{ common.INSTANCES_DEFAULT_X86 }}
99
oss: ["alinux2", "ubuntu2004"]
1010
schedulers: ["slurm"]
11-
benchmarks:
12-
- mpi_variants: ["openmpi"]
13-
num_instances: [100]
14-
osu_benchmarks:
15-
# Available collective benchmarks "osu_allgather", "osu_allreduce", "osu_alltoall", "osu_barrier", "osu_bcast", "osu_gather", "osu_reduce", "osu_reduce_scatter", "osu_scatter"
16-
collective: ["osu_allreduce", "osu_alltoall"]
17-
pt2pt: []

tests/integration-tests/configs/isolated_regions.yaml

Lines changed: 0 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,6 @@ test-suites:
1616
instances: {{ INSTANCES }}
1717
oss: {{ OSS }}
1818
schedulers: {{ SCHEDULERS }}
19-
benchmarks:
20-
- mpi_variants: ["openmpi"]
21-
num_instances: [5]
22-
osu_benchmarks:
23-
collective: ["osu_alltoall"]
2419
# This test cannot be executed in US isolated regions because it relies on a CloudFormation stack using resources
2520
# that are not supported by CloudFormation in ADC, i.e. CapacityReservation and ResourceGroup.
2621
# capacity_reservations:
@@ -153,13 +148,6 @@ test-suites:
153148
instances: {{ INSTANCES }}
154149
oss: {{ OSS }}
155150
schedulers: {{ SCHEDULERS }}
156-
benchmarks:
157-
- mpi_variants: ["openmpi", "intelmpi"]
158-
num_instances: [20] # Change the head node instance type if you'd test more than 30 instances
159-
slots_per_instance: 2
160-
partition: "ht-disabled"
161-
osu_benchmarks:
162-
collective: ["osu_allreduce", "osu_alltoall"]
163151
dns:
164152
test_dns.py::test_hit_no_cluster_dns_mpi:
165153
dimensions:
@@ -460,12 +448,6 @@ test-suites:
460448
instances: {{ INSTANCES }}
461449
oss: {{ OSS }}
462450
schedulers: {{ SCHEDULERS }}
463-
benchmarks:
464-
- mpi_variants: ["openmpi", "intelmpi"]
465-
num_instances: [20] # Change the head node instance type if you'd test more than 30 instances
466-
slots_per_instance: 2
467-
osu_benchmarks:
468-
collective: ["osu_allreduce", "osu_alltoall"]
469451
test_raid.py::test_raid_fault_tolerance_mode:
470452
dimensions:
471453
- regions: {{ REGIONS }}

tests/integration-tests/configs/new_os.yaml

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,6 @@ test-suites:
1616
instances: {{ common.INSTANCES_DEFAULT_X86 }}
1717
oss: {{ NEW_OS }}
1818
schedulers: ["slurm"]
19-
benchmarks:
20-
- mpi_variants: [ "openmpi", "intelmpi" ]
21-
num_instances: [ 4 ]
22-
slots_per_instance: 2
23-
osu_benchmarks:
24-
collective: [ "osu_alltoall" ]
2519
arm_pl:
2620
test_arm_pl.py::test_arm_pl:
2721
dimensions:
@@ -100,12 +94,6 @@ test-suites:
10094
instances: ["m4.xlarge"]
10195
oss: {{ NEW_OS }}
10296
schedulers: ["slurm"]
103-
benchmarks:
104-
- mpi_variants: [ "openmpi", "intelmpi" ]
105-
num_instances: [ 4 ]
106-
slots_per_instance: 2
107-
osu_benchmarks:
108-
collective: [ "osu_alltoall" ]
10997
dns:
11098
test_dns.py::test_hit_no_cluster_dns_mpi:
11199
dimensions:
@@ -237,12 +225,6 @@ test-suites:
237225
instances: {{ common.INSTANCES_DEFAULT_ARM }}
238226
oss: {{ NEW_OS }}
239227
schedulers: ["slurm"]
240-
benchmarks:
241-
- mpi_variants: [ "openmpi", "intelmpi" ]
242-
num_instances: [ 4 ]
243-
slots_per_instance: 2
244-
osu_benchmarks:
245-
collective: [ "osu_alltoall" ]
246228
test_fsx_lustre.py::test_fsx_lustre_configuration_options:
247229
dimensions:
248230
- regions: ["us-east-2"]
@@ -261,12 +243,6 @@ test-suites:
261243
instances: {{ common.INSTANCES_DEFAULT_ARM }}
262244
oss: {{ NEW_OS }}
263245
schedulers: [ "slurm" ]
264-
benchmarks:
265-
- mpi_variants: [ "openmpi", "intelmpi" ]
266-
num_instances: [ 4 ]
267-
slots_per_instance: 2
268-
osu_benchmarks:
269-
collective: [ "osu_alltoall" ]
270246
test_raid.py::test_raid_performance_mode:
271247
dimensions:
272248
- regions: ["ap-south-1"]

tests/integration-tests/configs/osu.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{%- import 'common.jinja2' as common with context -%}
22
test-suites:
3-
efa:
4-
test_efa.py::test_efa:
3+
performance_tests:
4+
test_osu.py::test_osu:
55
dimensions:
66
- regions: [ "euw1-az1" ] # do not move, unless capacity reservation is moved as well
77
instances: [ "c5n.18xlarge" ]

tests/integration-tests/tests/ad_integration/test_ad_integration.py

Lines changed: 13 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
)
4040

4141
from tests.ad_integration.cluster_user import ClusterUser
42-
from tests.common.osu_common import compile_osu
4342
from tests.common.utils import get_sts_endpoint, retrieve_latest_ami, run_system_analyzer
4443
from tests.storage.test_fsx_lustre import create_fsx_ontap, create_fsx_open_zfs
4544

@@ -427,8 +426,7 @@ def _directory_factory(
427426
_delete_certificate(certificate_arn=certificate_arn, region=region)
428427

429428

430-
def _run_user_workloads(users, test_datadir, remote_command_executor, shared_storage_mount_dirs):
431-
compile_osu("openmpi", remote_command_executor)
429+
def _run_user_workloads(users, test_datadir, shared_storage_mount_dirs):
432430
_check_whoami(users)
433431
_check_files_permissions(users, shared_storage_mount_dirs)
434432
job_submission_outputs = [
@@ -660,8 +658,6 @@ def test_ad_integration(
660658
request,
661659
store_secret_in_secret_manager,
662660
clusters_factory,
663-
run_benchmarks,
664-
benchmarks,
665661
):
666662
"""
667663
Verify AD integration works as expected.
@@ -671,18 +667,12 @@ def test_ad_integration(
671667
3. SSH key for AD users is created when the property GenerateSshKeysForUsers is true;
672668
4. AD users can submit workloads;
673669
5. AD users filter out by LdapAccessFilter cannot access to the head node.
674-
675-
Optionally, it executes performance tests using OSU benchmarks.
676670
"""
677671
if not is_directory_supported(region, directory_type):
678672
pytest.skip(f"Skipping the test because directory type {directory_type} is not supported in region {region}")
679673

680-
head_node_instance_type = "c5n.18xlarge" if request.config.getoption("benchmarks") else "c5.xlarge"
681-
compute_instance_type_info = {"name": "c5.xlarge", "num_cores": 4}
682674
fsx_supported = is_fsx_supported(region)
683675
config_params = {
684-
"compute_instance_type": compute_instance_type_info.get("name"),
685-
"head_node_instance_type": head_node_instance_type,
686676
"fsx_supported": fsx_supported,
687677
}
688678
directory_stack_name, nlb_stack_name = directory_factory(
@@ -712,7 +702,7 @@ def test_ad_integration(
712702
)
713703
if fsx_supported:
714704
config_params.update(get_fsx_config_param_vals(fsx_factory, svm_factory))
715-
cluster_config = pcluster_config_reader(benchmarks=benchmarks, **config_params)
705+
cluster_config = pcluster_config_reader(**config_params)
716706
cluster = clusters_factory(cluster_config)
717707

718708
certificate_secret_arn = nlb_stack_parameters.get("CertificateSecretArn")
@@ -759,14 +749,12 @@ def test_ad_integration(
759749
shared_storage_mount_dirs = ["/shared", "/efs"]
760750
if fsx_supported:
761751
shared_storage_mount_dirs.extend(["/fsxlustre", "/fsxontap", "/fsxopenzfs"])
762-
_run_user_workloads(users, test_datadir, remote_command_executor, shared_storage_mount_dirs)
752+
_run_user_workloads(users, test_datadir, shared_storage_mount_dirs)
763753
logging.info("Testing pcluster update and generate ssh keys for user")
764754
_check_ssh_key_generation(users[0], remote_command_executor, scheduler_commands, False)
765755

766756
# Verify access control with ldap access provider.
767-
updated_config_file = pcluster_config_reader(
768-
config_file="pcluster.config.update.yaml", benchmarks=benchmarks, **config_params
769-
)
757+
updated_config_file = pcluster_config_reader(config_file="pcluster.config.update.yaml", **config_params)
770758
cluster.update(str(updated_config_file), force_update="true")
771759
# Reset stateful connection variables after the cluster update
772760
remote_command_executor = RemoteCommandExecutor(cluster)
@@ -780,9 +768,7 @@ def test_ad_integration(
780768

781769
# Verify access control with simple access provider.
782770
# With this test we also verify that AdditionalSssdConfigs is working properly.
783-
updated_config_file = pcluster_config_reader(
784-
config_file="pcluster.config.update2.yaml", benchmarks=benchmarks, **config_params
785-
)
771+
updated_config_file = pcluster_config_reader(config_file="pcluster.config.update2.yaml", **config_params)
786772
cluster.update(str(updated_config_file), force_update="true")
787773
# Reset stateful connection variables after the cluster update
788774
remote_command_executor = RemoteCommandExecutor(cluster)
@@ -795,7 +781,6 @@ def test_ad_integration(
795781
_check_ssh_auth(user=user, expect_success=user.alias != "PclusterUser0")
796782

797783
run_system_analyzer(cluster, scheduler_commands_factory, request)
798-
run_benchmarks(users[0].remote_command_executor(), users[0].scheduler_commands(), diretory_type=directory_type)
799784

800785

801786
def _check_ssh_auth(user, expect_success=True):
@@ -838,12 +823,6 @@ def test_ad_integration_on_login_nodes(
838823
2. SSH key for AD users is created when the property GenerateSshKeysForUsers is true;
839824
3. AD users can submit workloads;
840825
"""
841-
head_node_instance_type = "c5n.18xlarge" if request.config.getoption("benchmarks") else "c5.xlarge"
842-
compute_instance_type_info = {"name": "c5.xlarge", "num_cores": 4}
843-
config_params = {
844-
"compute_instance_type": compute_instance_type_info.get("name"),
845-
"head_node_instance_type": head_node_instance_type,
846-
}
847826
directory_stack_name, nlb_stack_name = directory_factory(
848827
request.config.getoption("directory_stack_name"),
849828
request.config.getoption("ldaps_nlb_stack_name"),
@@ -858,16 +837,14 @@ def test_ad_integration_on_login_nodes(
858837
)
859838
nlb_stack_parameters = get_infra_stack_parameters(nlb_stack_name)
860839
ldap_tls_ca_cert = "/opt/parallelcluster/shared_login_nodes/directory_service/certificate.crt"
861-
config_params.update(
862-
get_ad_config_param_vals(
863-
directory_stack_outputs,
864-
nlb_stack_parameters,
865-
password_secret_arn,
866-
ldap_tls_ca_cert,
867-
directory_type,
868-
directory_protocol,
869-
directory_certificate_verification,
870-
)
840+
config_params = get_ad_config_param_vals(
841+
directory_stack_outputs,
842+
nlb_stack_parameters,
843+
password_secret_arn,
844+
ldap_tls_ca_cert,
845+
directory_type,
846+
directory_protocol,
847+
directory_certificate_verification,
871848
)
872849
cluster_config = pcluster_config_reader(**config_params)
873850
cluster = clusters_factory(cluster_config)

tests/integration-tests/tests/ad_integration/test_ad_integration/test_ad_integration/pcluster.config.update.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Image:
22
Os: {{ os }}
33
HeadNode:
4-
InstanceType: {{ head_node_instance_type }}
4+
InstanceType: {{ instance }}
55
Networking:
66
SubnetId: {{ public_subnet_id }}
77
Ssh:
@@ -15,7 +15,7 @@ Scheduling:
1515
ComputeResources:
1616
- Name: cit
1717
Instances:
18-
- InstanceType: {{ compute_instance_type }}
18+
- InstanceType: {{ instance }}
1919
MinCount: 2
2020
MaxCount: 150
2121
Networking:

tests/integration-tests/tests/ad_integration/test_ad_integration/test_ad_integration/pcluster.config.update2.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Image:
22
Os: {{ os }}
33
HeadNode:
4-
InstanceType: {{ head_node_instance_type }}
4+
InstanceType: {{ instance }}
55
Networking:
66
SubnetId: {{ public_subnet_id }}
77
Ssh:
@@ -15,7 +15,7 @@ Scheduling:
1515
ComputeResources:
1616
- Name: cit
1717
Instances:
18-
- InstanceType: {{ compute_instance_type }}
18+
- InstanceType: {{ instance }}
1919
MinCount: 2
2020
MaxCount: 150
2121
Networking:

tests/integration-tests/tests/ad_integration/test_ad_integration/test_ad_integration/pcluster.config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Image:
22
Os: {{ os }}
33
HeadNode:
4-
InstanceType: {{ head_node_instance_type }}
4+
InstanceType: {{ instance }}
55
Networking:
66
SubnetId: {{ public_subnet_id }}
77
Ssh:
@@ -15,7 +15,7 @@ Scheduling:
1515
ComputeResources:
1616
- Name: cit
1717
Instances:
18-
- InstanceType: {{ compute_instance_type }}
18+
- InstanceType: {{ instance }}
1919
MinCount: 2
2020
MaxCount: 150
2121
Networking:

tests/integration-tests/tests/ad_integration/test_ad_integration/test_ad_integration/workload.sh

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,3 @@ for fspath in shared efs; do
66
# srun has to be used for whoami because slurm_nss plugin only send user information through srun
77
date '+%Y%m%d%H%M%S' > "/$fspath/$(srun whoami)"
88
done
9-
10-
BENCHMARK_NAME=osu_barrier
11-
OSU_BENCHMARK_VERSION=5.7.1
12-
13-
module load openmpi
14-
# Run collective benchmark. The collective operations are close to what a real application looks like.
15-
# NOTE: The test is sized for 4 compute nodes.
16-
# -np total number of processes to run (all CPUs * 4 nodes)
17-
mpirun \
18-
> /shared/"$(date '+%Y%m%d%H%M%S')-$(srun whoami)-${BENCHMARK_NAME}".out \
19-
/shared/openmpi/osu-micro-benchmarks-${OSU_BENCHMARK_VERSION}/mpi/collective/${BENCHMARK_NAME}

tests/integration-tests/tests/ad_integration/test_ad_integration/test_ad_integration_on_login_nodes/pcluster.config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ LoginNodes:
1111
Ssh:
1212
KeyName: {{ key_name }}
1313
HeadNode:
14-
InstanceType: {{ head_node_instance_type }}
14+
InstanceType: {{ instance }}
1515
Networking:
1616
SubnetId: {{ public_subnet_id }}
1717
Ssh:
@@ -25,7 +25,7 @@ Scheduling:
2525
ComputeResources:
2626
- Name: cit
2727
Instances:
28-
- InstanceType: {{ compute_instance_type }}
28+
- InstanceType: {{ instance }}
2929
MinCount: 2
3030
MaxCount: 150
3131
Networking:

0 commit comments

Comments
 (0)