Skip to content

Commit 955b244

Browse files
hanwen-cluster authored and hanwen-pcluste committed
Add options to install FSx Lustre and Nvidia software
Signed-off-by: Hanwen <hanwenli@amazon.com>
1 parent c389847 commit 955b244

File tree

9 files changed

+195
-49
lines changed

9 files changed

+195
-49
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@ CHANGELOG
33
3.12.0
44
------
55

6+
**ENHANCEMENTS**
7+
- Add new build image configuration section `Build/Installation` to turn on/off Nvidia software and Lustre client installations. By default, Nvidia software, although included in official ParallelCluster AMIs, is not installed by `build-image`. By default, Lustre client is installed.
8+
69
**CHANGES**
710
- The CLI commands `export-cluster-logs` and `export-image-logs` can now by default export the logs to the default ParallelCluster bucket or to the CustomS3Bucket if specified in the config.
811

cli/src/pcluster/config/imagebuilder_config.py

Lines changed: 51 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
from pcluster.validators.imagebuilder_validators import (
3434
AMIVolumeSizeValidator,
3535
ComponentsValidator,
36+
InstanceTypeSoftwareValidator,
3637
SecurityGroupsAndSubnetValidator,
3738
)
3839
from pcluster.validators.kms_validators import KmsKeyIdEncryptedValidator, KmsKeyValidator
@@ -143,6 +144,41 @@ def __init__(
143144
self.enabled = enabled
144145

145146

147+
class LustreClient(Resource):
    """Build-image setting that turns the Lustre client installation on or off."""

    def __init__(self, enabled: bool = None):
        super().__init__()
        # `default=True` is handed to Resource.init_param as the fallback value,
        # matching the changelog note that the Lustre client is installed by default.
        lustre_enabled = Resource.init_param(enabled, default=True)
        self.enabled = lustre_enabled
156+
157+
158+
class NvidiaSoftware(Resource):
    """Build-image setting that turns the Nvidia software installation on or off."""

    def __init__(self, enabled: bool = None):
        super().__init__()
        # `default=False` is handed to Resource.init_param as the fallback value,
        # matching the changelog note that build-image does not install Nvidia software by default.
        nvidia_enabled = Resource.init_param(enabled, default=False)
        self.enabled = nvidia_enabled
167+
168+
169+
class Installation(Resource):
    """Group the optional software installation settings of the ImageBuilder build."""

    def __init__(self, lustre_client: LustreClient = None, nvidia_software: NvidiaSoftware = None):
        super().__init__()
        # A missing (falsy) section is replaced by a freshly constructed one,
        # so both attributes are always populated after construction.
        self.lustre_client = lustre_client if lustre_client else LustreClient()
        self.nvidia_software = nvidia_software if nvidia_software else NvidiaSoftware()
180+
181+
146182
class Build(Resource):
147183
"""Represent the build configuration for the ImageBuilder."""
148184

@@ -157,6 +193,7 @@ def __init__(
157193
components: List[Component] = None,
158194
update_os_packages: UpdateOsPackages = None,
159195
imds: Imds = None,
196+
installation: Installation = None,
160197
):
161198
super().__init__()
162199
self.instance_type = Resource.init_param(instance_type)
@@ -168,13 +205,19 @@ def __init__(
168205
self.components = components
169206
self.update_os_packages = update_os_packages
170207
self.imds = imds or Imds(implied="v2.0")
208+
self.installation = installation or Installation()
171209

172210
def _register_validators(self, context: ValidatorContext = None): # noqa: D102 #pylint: disable=unused-argument
173211
self._register_validator(
174212
InstanceTypeBaseAMICompatibleValidator,
175213
instance_type=self.instance_type,
176214
image=self.parent_image,
177215
)
216+
self._register_validator(
217+
InstanceTypeSoftwareValidator,
218+
instance_type=self.instance_type,
219+
nvidia=self.installation.nvidia_software.enabled,
220+
)
178221
self._register_validator(
179222
ComponentsValidator,
180223
components=self.components,
@@ -282,21 +325,24 @@ def lambda_functions_vpc_config(self):
282325
class ImageBuilderExtraChefAttributes(ExtraChefAttributes):
283326
"""Extra Attributes for ImageBuilder Chef Client."""
284327

285-
def __init__(self, dev_settings: ImagebuilderDevSettings):
286-
super().__init__(dev_settings)
328+
def __init__(self, config: ImageBuilderConfig):
329+
super().__init__(config.dev_settings)
287330
self.region = None
288331
self.nvidia = None
332+
self.lustre = None
289333
self.is_official_ami_build = None
290334
self.custom_node_package = None
291335
self.custom_awsbatchcli_package = None
292336
self.base_os = None
293337
self.disable_kernel_update = None
294338
self.slurm_patches_s3_archive = None
295-
self._set_default(dev_settings)
339+
self._set_default(config)
296340

297-
def _set_default(self, dev_settings: ImagebuilderDevSettings):
341+
def _set_default(self, config: ImageBuilderConfig):
342+
dev_settings = config.dev_settings
298343
self.region = "{{ build.AWSRegion.outputs.stdout }}"
299-
self.nvidia = {"enabled": "no"}
344+
self.nvidia = {"enabled": "yes"} if config.build.installation.nvidia_software.enabled else {"enabled": "no"}
345+
self.lustre = {"enabled": "yes"} if config.build.installation.lustre_client.enabled else {"enabled": "no"}
300346
self.is_official_ami_build = "false"
301347
self.custom_node_package = dev_settings.node_package if dev_settings and dev_settings.node_package else ""
302348
self.custom_awsbatchcli_package = (

cli/src/pcluster/schemas/imagebuilder_schema.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@
2626
ImageBuilderConfig,
2727
ImagebuilderDeploymentSettings,
2828
ImagebuilderDevSettings,
29+
Installation,
30+
LustreClient,
31+
NvidiaSoftware,
2932
UpdateOsPackages,
3033
Volume,
3134
)
@@ -167,6 +170,40 @@ def make_resource(self, data, **kwargs):
167170
return UpdateOsPackages(**data)
168171

169172

173+
class LustreClientSchema(BaseSchema):
    """Represent the schema of the ImageBuilder LustreClient."""

    # Optional boolean flag; when omitted, LustreClient is constructed without it.
    enabled = fields.Bool()

    @post_load
    def make_resource(self, data, **kwargs):
        """Generate a LustreClient resource from the loaded fields."""
        return LustreClient(**data)
182+
183+
184+
class NvidiaSoftwareSchema(BaseSchema):
    """Schema for the NvidiaSoftware section of the ImageBuilder configuration."""

    # Optional boolean flag; when omitted, NvidiaSoftware is constructed without it.
    enabled = fields.Bool()

    @post_load
    def make_resource(self, data, **kwargs):
        """Build a NvidiaSoftware resource from the loaded fields."""
        nvidia_software = NvidiaSoftware(**data)
        return nvidia_software
193+
194+
195+
class InstallationSchema(BaseSchema):
    """Schema for the Installation section of the ImageBuilder configuration."""

    # Each nested section is optional and is loaded through its own schema.
    lustre_client = fields.Nested(LustreClientSchema)
    nvidia_software = fields.Nested(NvidiaSoftwareSchema)

    @post_load
    def make_resource(self, data, **kwargs):
        """Build an Installation resource from the loaded nested sections."""
        installation = Installation(**data)
        return installation
205+
206+
170207
class BuildSchema(BaseSchema):
171208
"""Represent the schema of the ImageBuilder Build."""
172209

@@ -179,6 +216,7 @@ class BuildSchema(BaseSchema):
179216
subnet_id = fields.Str(validate=get_field_validator("subnet_id"))
180217
update_os_packages = fields.Nested(UpdateOsPackagesSchema)
181218
imds = fields.Nested(ImdsSchema)
219+
installation = fields.Nested(InstallationSchema)
182220

183221
@post_load
184222
def make_resource(self, data, **kwargs):

cli/src/pcluster/templates/imagebuilder_stack.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ def _add_cfn_parameters(self):
197197
self,
198198
"CfnParamChefDnaJson",
199199
type="String",
200-
default=ImageBuilderExtraChefAttributes(self.config.dev_settings).dump_json(),
200+
default=ImageBuilderExtraChefAttributes(self.config).dump_json(),
201201
description="ChefAttributes",
202202
)
203203
CfnParameter(

cli/src/pcluster/validators/imagebuilder_validators.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,3 +54,17 @@ def _validate(self, security_group_ids: list, subnet_id: str):
5454
"Subnet id {0} is specified, security groups is required.".format(subnet_id),
5555
FailureLevel.ERROR,
5656
)
57+
58+
59+
class InstanceTypeSoftwareValidator(Validator):
    """Check that the requested software is compatible with the build instance type."""

    def _validate(self, instance_type: str, nvidia: bool):
        # Nothing to verify unless NVIDIA software installation was requested.
        if not nvidia:
            return

        type_info = AWSApi.instance().ec2.get_instance_type_info(instance_type)
        has_no_gpu = type_info.gpu_count() == 0
        if has_no_gpu:
            self._add_failure(
                f"Instance type {instance_type} does not have GPU. "
                "NVIDIA software can only be installed on GPU instances.",
                FailureLevel.ERROR,
            )

cli/tests/pcluster/config/dummy_imagebuilder_config.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@
1919
ImageBuilderConfig,
2020
ImagebuilderDeploymentSettings,
2121
ImagebuilderDevSettings,
22+
Installation,
23+
LustreClient,
24+
NvidiaSoftware,
2225
UpdateOsPackages,
2326
Volume,
2427
)
@@ -39,6 +42,9 @@
3942
"additional_iam_policies": AdditionalIamPolicy,
4043
"update_os_packages": UpdateOsPackages,
4144
"imds": Imds,
45+
"installation": Installation,
46+
"lustre_client": LustreClient,
47+
"nvidia_software": NvidiaSoftware,
4248
}
4349

4450

0 commit comments

Comments
 (0)