Skip to content

Commit adc63a9

Browse files
[Test] Install Nvidia driver in createami tests
Reason: This increases test coverage of the dependency files in S3 buckets. How: 1. Use GPU instances, in the availability zones with the best GPU capacity. 2. Change the build-image config file. Signed-off-by: Hanwen <hanwenli@amazon.com>
1 parent 6bedd31 commit adc63a9

File tree

4 files changed

+29
-13
lines changed

4 files changed

+29
-13
lines changed

tests/integration-tests/configs/develop.yaml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -149,16 +149,16 @@ test-suites:
149149
oss: [{{ OS_X86_3 }}]
150150
test_createami.py::test_build_image:
151151
dimensions:
152-
- regions: ["eu-west-3"]
153-
instances: {{ common.INSTANCES_DEFAULT_X86 }}
152+
- regions: ["euw3-az1"]
153+
instances: ["g4dn.2xlarge"]
154154
schedulers: [ "slurm" ]
155155
oss: {{ common.OSS_COMMERCIAL_X86 }}
156-
- regions: ["cn-north-1"]
157-
instances: {{ common.INSTANCES_DEFAULT_X86 }}
156+
- regions: ["cnn1-az1"]
157+
instances: ["g4dn.2xlarge"]
158158
schedulers: ["slurm"]
159159
oss: [{{ OS_X86_5 }}]
160160
- regions: ["us-gov-west-1"]
161-
instances: {{ common.INSTANCES_DEFAULT_X86 }}
161+
instances: ["g4dn.2xlarge"]
162162
schedulers: ["slurm"]
163163
oss: [{{ OS_X86_7 }}]
164164
test_createami.py::test_build_image_custom_components:

tests/integration-tests/tests/createami/test_createami.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
from retrying import retry
2828
from time_utils import minutes, seconds
2929
from troposphere import Template, iam
30-
from utils import generate_stack_name, get_arn_partition
30+
from utils import generate_stack_name, get_arn_partition, get_gpu_count
3131

3232
from tests.common.assertions import (
3333
assert_head_node_is_running,
@@ -94,10 +94,11 @@ def test_invalid_config(
9494
assert_that(suppressed.message).contains("Request would have succeeded")
9595

9696

97-
@pytest.mark.usefixtures("instance", "scheduler")
97+
@pytest.mark.usefixtures("scheduler")
9898
def test_build_image(
9999
region,
100100
os,
101+
instance,
101102
pcluster_config_reader,
102103
architecture,
103104
s3_bucket_factory,
@@ -138,9 +139,12 @@ def test_build_image(
138139
else:
139140
# Test vanilla AMIs.
140141
base_ami = retrieve_latest_ami(region, os, ami_type="official", architecture=architecture)
141-
142142
image_config = pcluster_config_reader(
143-
config_file="image.config.yaml", parent_image=base_ami, instance_role=instance_role, bucket_name=bucket_name
143+
config_file="image.config.yaml",
144+
parent_image=base_ami,
145+
instance_role=instance_role,
146+
bucket_name=bucket_name,
147+
gpu_count=get_gpu_count(instance),
144148
)
145149

146150
image = images_factory(image_id, image_config, region)

tests/integration-tests/tests/createami/test_createami/test_build_image/image.config.yaml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,12 @@ Build:
1919
UpdateOsPackages:
2020
Enabled: true
2121
{% endif %}
22-
{% if os in ["ubuntu2204", "rhel9", "rocky9"] %}
23-
# Disable Lustre installation because these newer operating systems release new kernels more often. Lustre usually does not support the latest kernels
2422
Installation:
2523
LustreClient:
26-
Enabled: false
27-
{% endif %}
24+
# Disable Lustre installation on ubuntu2204, rhel9 and rocky9: these newer operating systems release new kernels more often, and Lustre usually does not support the latest kernels.
25+
Enabled: {% if os in ["ubuntu2204", "rhel9", "rocky9"] %} false {% else %} true {% endif %}
26+
NvidiaSoftware:
27+
Enabled: {% if gpu_count > 0 %} true {% else %} false {% endif %}
2828

2929
CustomS3Bucket: {{ bucket_name }}
3030

tests/integration-tests/utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,18 @@ def get_network_interfaces_count(instance_type, region_name=None):
612612
return get_instance_info(instance_type, region_name).get("NetworkInfo").get("MaximumNetworkCards", 1)
613613

614614

615+
def get_gpu_count(instance_type, region_name=None):
    """
    Return how many NVIDIA GPUs the provided instance type has.

    The "GpuInfo" section of the instance info is inspected and the "Count" of every entry
    listed under "Gpus" whose "Manufacturer" is NVIDIA (compared case-insensitively) is summed.
    GPUs from other manufacturers are ignored.

    :param instance_type: instance type forwarded to get_instance_info
    :param region_name: optional region forwarded to get_instance_info
    :return: total NVIDIA GPU count, 0 when "GpuInfo" is missing or empty
    """
    gpu_section = get_instance_info(instance_type, region_name).get("GpuInfo", None)
    # No GPU section at all (or an empty one) means there is nothing to count.
    if not gpu_section:
        return 0
    return sum(
        device.get("Count", 0)
        for device in gpu_section.get("Gpus", [])
        if device.get("Manufacturer", "").upper() == "NVIDIA"
    )
625+
626+
615627
def get_root_volume_id(instance_id, region, os):
616628
"""Return the root EBS volume's ID for the given EC2 instance."""
617629
logging.info("Getting root volume for instance %s", instance_id)

0 commit comments

Comments
 (0)