From 98a1d7739e51a51ff1b099ee151a2741fdccad99 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Thu, 22 Aug 2024 20:31:36 -0400 Subject: [PATCH 1/6] [CF] Refactoring DNA.json for HeadNode to have common values in a single file * Refactoring ComputeFleet to download common_dna.json and a compute-resource-specific dna * Giving PutObject access to the head node for the s3-do-not-delete bucket * Rename common, headnode and compute dna json * Adding compute-dna.json and downloading it from S3 * Adding stack arn in a write_files entry * Using config version to download and upload json files specific to a cluster-config version so that we can roll back using the previous config version when needed * Reduce cfn-init for the compute fleet to just mention files. This file is never created, but a change in config version in AWS::CloudFormation::Init signifies an update * Give listversionObject and ListBucketVersions access to the compute node * Use cfn-init calls for the Cluster Update path and use cfn-init to create the common-dna file for queues --- cli/src/pcluster/models/s3_bucket.py | 7 ++ .../resources/compute_node/user_data.sh | 27 ++++- .../pcluster/templates/cdk_builder_utils.py | 24 ++++- cli/src/pcluster/templates/cluster_stack.py | 84 +++++++++------ cli/src/pcluster/templates/queues_stack.py | 100 +++++++++++------- 5 files changed, 169 insertions(+), 73 deletions(-) diff --git a/cli/src/pcluster/models/s3_bucket.py b/cli/src/pcluster/models/s3_bucket.py index b3543c279b..a67aa5b9d4 100644 --- a/cli/src/pcluster/models/s3_bucket.py +++ b/cli/src/pcluster/models/s3_bucket.py @@ -42,6 +42,9 @@ class S3FileType(Enum): """Define S3 file types.""" ASSETS = "assets" + HEAD_NODE_DNA_ASSETS = f"{ASSETS}/HeadNode" + COMPUTE_DNA_ASSETS = f"{ASSETS}/ComputeNode" + LOGIN_NODE_DNA_ASSETS = f"{ASSETS}/LoginNode" CONFIGS = "configs" TEMPLATES = "templates" CUSTOM_RESOURCES = "custom_resources" @@ -214,6 +217,10 @@ def upload_cfn_asset(self, asset_file_content, asset_name: str, format=S3FileFor file_type=S3FileType.ASSETS, content=asset_file_content, file_name=asset_name, format=format ) + def upload_dna_cfn_asset(self, asset_file_content, asset_name: str, file_type=S3FileType, format=S3FileFormat.YAML): + """Upload cloudformation assets to S3 bucket.""" + return self.upload_file(file_type=file_type, content=asset_file_content, file_name=asset_name, format=format) + def upload_resources(self, resource_dir, custom_artifacts_name): """ Upload custom resources to S3 bucket. 
diff --git a/cli/src/pcluster/resources/compute_node/user_data.sh b/cli/src/pcluster/resources/compute_node/user_data.sh index 809fce5ac0..f2bd437c12 100644 --- a/cli/src/pcluster/resources/compute_node/user_data.sh +++ b/cli/src/pcluster/resources/compute_node/user_data.sh @@ -51,6 +51,15 @@ datasource_list: [ Ec2, None ] output: all: "| tee -a /var/log/cloud-init-output.log | logger -t user-data -s 2>/dev/ttyS0" write_files: + - path: /tmp/stack-arn.json + permissions: '0644' + owner: root:root + content: | + { + "cluster":{ + "stack_arn": "${AWS::StackId}" + } + } - path: /tmp/bootstrap.sh permissions: '0744' owner: root:root @@ -93,11 +102,23 @@ write_files: fi } + function get_s3_dna_json_files + { + AWS_RETRY_MODE=standard + JSON_FILES="common-dna-${ClusterConfigVersion}.json;common-dna.json ComputeNode/compute-dna-${LaunchTemplateResourceId}-${ClusterConfigVersion}.json;compute-dna.json extra-${ClusterConfigVersion}.json;extra.json" + for file_name_tuple in ${!JSON_FILES[@]}; do + file_names=(${!file_name_tuple//;/ }) + s3_prefix_to_download=${!file_names[0]} + tmp_file_name=${!file_names[1]} + S3_RESULT=$(aws s3api get-object --bucket ${S3_BUCKET} --key "${S3_ARTIFACT_DIR}/assets/${!s3_prefix_to_download}" --region ${AWS::Region} /tmp/${!tmp_file_name} 2>&1 ) || error_exit "${!S3_RESULT}" + done + } + export PATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin:/opt/aws/bin [ -f /etc/parallelcluster/pcluster_cookbook_environment.sh ] && . /etc/parallelcluster/pcluster_cookbook_environment.sh - $CFN_BOOTSTRAP_VIRTUALENV_PATH/cfn-init -s ${AWS::StackName} -v -c deployFiles -r ${LaunchTemplateResourceId} --region ${AWS::Region} --url ${CloudFormationUrl} --role ${CfnInitRole} || error_exit 'Failed to bootstrap the compute node. Please check /var/log/cfn-init.log in the compute node or in CloudWatch logs. Please refer to https://docs.aws.amazon.com/parallelcluster/latest/ug/troubleshooting-v3.html#troubleshooting-v3-get-logs for more details on ParallelCluster logs.' + get_s3_dna_json_files [ -f /etc/profile.d/proxy.sh ] && . 
/etc/profile.d/proxy.sh @@ -139,9 +160,11 @@ write_files: vendor_cookbook fi cd /tmp + mkdir -p /etc/chef/ohai/hints + touch /etc/chef/ohai/hints/ec2.json start=$(date +%s) - + jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) { CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist" FR_CMD="/opt/parallelcluster/scripts/fetch_and_run" diff --git a/cli/src/pcluster/templates/cdk_builder_utils.py b/cli/src/pcluster/templates/cdk_builder_utils.py index 73c14d531c..6324dca1d6 100644 --- a/cli/src/pcluster/templates/cdk_builder_utils.py +++ b/cli/src/pcluster/templates/cdk_builder_utils.py @@ -648,7 +648,13 @@ def _build_policy(self) -> List[iam.PolicyStatement]: iam.PolicyStatement( sid="ResourcesS3Bucket", effect=iam.Effect.ALLOW, - actions=["s3:GetObject", "s3:GetObjectVersion", "s3:GetBucketLocation", "s3:ListBucket"], + actions=[ + "s3:GetObject", + "s3:PutObject", + "s3:GetObjectVersion", + "s3:GetBucketLocation", + "s3:ListBucket", + ], resources=[ self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""), self._format_arn( @@ -997,7 +1003,9 @@ def __init__( node: Union[HeadNode, BaseQueue, LoginNodesPool], shared_storage_infos: dict, name: str, + cluster_bucket: S3Bucket, ): + self._cluster_bucket = cluster_bucket super().__init__(scope, id, config, node, shared_storage_infos, name) def _build_policy(self) -> List[iam.PolicyStatement]: @@ -1023,6 +1031,20 @@ def _build_policy(self) -> List[iam.PolicyStatement]: ) ], ), + iam.PolicyStatement( + sid="S3GetLaunchTemplate", + actions=["s3:GetObject", "s3:ListBucket"], + effect=iam.Effect.ALLOW, + resources=[ + self._format_arn(service="s3", resource=self._cluster_bucket.name, region="", account=""), + self._format_arn( + service="s3", + resource=f"{self._cluster_bucket.name}/{self._cluster_bucket.artifact_directory}/*", + region="", + account="", + ), + ], + ), iam.PolicyStatement( sid="CloudFormation", actions=[ diff --git a/cli/src/pcluster/templates/cluster_stack.py b/cli/src/pcluster/templates/cluster_stack.py index 4627c416b2..90b2dd7dc8 100644 --- a/cli/src/pcluster/templates/cluster_stack.py +++ b/cli/src/pcluster/templates/cluster_stack.py @@ -76,7 +76,7 @@ PCLUSTER_S3_ARTIFACTS_DICT, SLURM_PORTS_RANGE, ) -from pcluster.models.s3_bucket import S3Bucket +from pcluster.models.s3_bucket import S3Bucket, S3FileFormat, S3FileType from pcluster.templates.awsbatch_builder import AwsBatchConstruct from pcluster.templates.cdk_builder_utils import ( CdkLaunchTemplateBuilder, @@ -1256,12 +1256,10 @@ def _add_head_node(self): head_node_launch_template.add_metadata("Comment", "AWS ParallelCluster Head Node") # CloudFormation::Init metadata - dna_json = json.dumps( + common_dna_json = json.dumps( { "cluster": { "stack_name": self._stack_name, - "stack_arn": self.stack.stack_id, - "raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID), "raid_shared_dir": to_comma_separated_string( self.shared_storage_mount_dirs[SharedStorageType.RAID] ), @@ -1306,24 +1304,12 @@ def _add_head_node(self): self.shared_storage_attributes[SharedStorageType.FSX]["FileSystemTypes"] ), "fsx_shared_dirs": 
to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.FSX]), - "volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS), "scheduler": self.config.scheduling.scheduler, - "ephemeral_dir": ( - head_node.local_storage.ephemeral_volume.mount_dir - if head_node.local_storage.ephemeral_volume - else DEFAULT_EPHEMERAL_DIR - ), "ebs_shared_dirs": to_comma_separated_string(self.shared_storage_mount_dirs[SharedStorageType.EBS]), - "proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE", - "node_type": "HeadNode", "cluster_user": OS_MAPPING[self.config.image.os]["user"], - "ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE", "log_group_name": ( self.log_group.log_group_name if self.config.monitoring.logs.cloud_watch.enabled else "NONE" ), - "dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false", - "dcv_port": head_node.dcv.port if head_node.dcv else "NONE", - "enable_intel_hpc_platform": "true" if self.config.is_intel_hpc_platform_enabled else "false", "cw_logging_enabled": "true" if self.config.is_cw_logging_enabled else "false", "log_rotation_enabled": "true" if self.config.is_log_rotation_enabled else "false", "cluster_s3_bucket": self.bucket.name, @@ -1331,24 +1317,15 @@ def _add_head_node(self): self.bucket.artifact_directory, PCLUSTER_S3_ARTIFACTS_DICT.get("config_name") ), "cluster_config_version": self.config.config_version, - "instance_types_data_version": self.config.instance_types_data_version, - "change_set_s3_key": f"{self.bucket.artifact_directory}/configs/" - f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}", - "instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/" - f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}", "custom_node_package": self.config.custom_node_package or "", "custom_awsbatchcli_package": self.config.custom_aws_batch_cli_package or "", - "head_node_imds_secured": str(self.config.head_node.imds.secured).lower(), - "compute_node_bootstrap_timeout": get_attr( - self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT - ), + "head_node_private_ip": "HEAD_NODE_PRIVATE_IP", "disable_sudo_access_for_default_user": ( "true" if self.config.deployment_settings and self.config.deployment_settings.disable_sudo_access_default_user else "false" ), - "launch_template_id": launch_template_id, **( get_slurm_specific_dna_json_for_head_node(self.config, self.scheduler_resources) if self._condition_is_slurm() @@ -1359,7 +1336,44 @@ def _add_head_node(self): }, indent=4, ) - + head_node_specific_dna_json = json.dumps( + { + "cluster": { + "stack_arn": self.stack.stack_id, + "raid_vol_ids": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.RAID), + "volume": get_shared_storage_ids_by_type(self.shared_storage_infos, SharedStorageType.EBS), + "ephemeral_dir": ( + head_node.local_storage.ephemeral_volume.mount_dir + if head_node.local_storage.ephemeral_volume + else DEFAULT_EPHEMERAL_DIR + ), + "proxy": head_node.networking.proxy.http_proxy_address if head_node.networking.proxy else "NONE", + "node_type": "HeadNode", + "ddb_table": self.dynamodb_table_status.ref if not self._condition_is_batch() else "NONE", + "dcv_enabled": "head_node" if self.config.is_dcv_enabled else "false", + "dcv_port": head_node.dcv.port if head_node.dcv else "NONE", + "common_dna_s3_key": f"{self.bucket.artifact_directory}/assets/" + 
f"common-dna-{self.config.config_version}.json", + "instance_types_data_version": self.config.instance_types_data_version, + "change_set_s3_key": f"{self.bucket.artifact_directory}/configs/" + f"{PCLUSTER_S3_ARTIFACTS_DICT.get('change_set_name')}", + "instance_types_data_s3_key": f"{self.bucket.artifact_directory}/configs/" + f"{PCLUSTER_S3_ARTIFACTS_DICT.get('instance_types_data_name')}", + "head_node_imds_secured": str(self.config.head_node.imds.secured).lower(), + "compute_node_bootstrap_timeout": get_attr( + self.config, "dev_settings.timeouts.compute_node_bootstrap_timeout", NODE_BOOTSTRAP_TIMEOUT + ), + "launch_template_id": launch_template_id, + }, + }, + indent=4, + ) + self.bucket.upload_dna_cfn_asset( + file_type=S3FileType.ASSETS, + asset_file_content=json.loads(self.config.extra_chef_attributes), + asset_name=f"extra-{self.config.config_version}.json", + format=S3FileFormat.JSON, + ) cfn_init = { "configSets": { "deployFiles": ["deployConfigFiles"], @@ -1377,8 +1391,15 @@ def _add_head_node(self): # A nosec comment is appended to the following line in order to disable the B108 check. # The file is needed by the product # [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. - "/tmp/dna.json": { # nosec B108 - "content": dna_json, + "/tmp/head-node-dna.json": { # nosec B108 + "content": head_node_specific_dna_json, + "mode": "000644", + "owner": "root", + "group": "root", + "encoding": "plain", + }, + "/tmp/common-dna.json": { # nosec B108 + "content": common_dna_json, "mode": "000644", "owner": "root", "group": "root", @@ -1408,8 +1429,9 @@ def _add_head_node(self): "touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"}, "jq": { "command": ( - 'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json ' - '|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )' + 'jq -s ".[0] * .[1] * .[2]" /tmp/common-dna.json /tmp/head-node-dna.json /tmp/extra.json ' + '> /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json ' + "/tmp/head-node-dna.json /etc/chef/dna.json )" ) }, }, diff --git a/cli/src/pcluster/templates/queues_stack.py b/cli/src/pcluster/templates/queues_stack.py index 19cb4a357f..817aaf0d7f 100644 --- a/cli/src/pcluster/templates/queues_stack.py +++ b/cli/src/pcluster/templates/queues_stack.py @@ -17,6 +17,7 @@ PCLUSTER_QUEUE_NAME_TAG, PCLUSTER_S3_ARTIFACTS_DICT, ) +from pcluster.models.s3_bucket import S3FileFormat, S3FileType from pcluster.templates.cdk_builder_utils import ( CdkLaunchTemplateBuilder, ComputeNodeIamResources, @@ -109,6 +110,7 @@ def _add_compute_iam_resources(self): queue, self._shared_storage_infos, queue.name, + self._cluster_bucket, ) self._compute_instance_profiles = {k: v.instance_profile for k, v in iam_resources.items()} self.managed_compute_instance_roles = {k: v.instance_role for k, v in iam_resources.items()} @@ -199,6 +201,32 @@ def _add_compute_resource_launch_template( instance_role_name = self.managed_compute_instance_roles[queue.name].ref launch_template_id = f"LaunchTemplate{create_hash_suffix(queue.name + compute_resource.name)}" + compute_specific_dna_json_dict = { + "cluster": { + "cluster_name": self.stack_name, + "stack_name": self.stack_name, + "enable_efa": "efa" if compute_resource.efa and compute_resource.efa.enabled else "NONE", + "ephemeral_dir": ( + queue.compute_settings.local_storage.ephemeral_volume.mount_dir + if isinstance(queue, SlurmQueue) and queue.compute_settings.local_storage.ephemeral_volume + else DEFAULT_EPHEMERAL_DIR + ), + "proxy": 
queue.networking.proxy.http_proxy_address if queue.networking.proxy else "NONE", + "node_type": "ComputeFleet", + "scheduler_queue_name": queue.name, + "scheduler_compute_resource_name": compute_resource.name, + "enable_efa_gdr": ("compute" if compute_resource.efa and compute_resource.efa.gdr_support else "NONE"), + "launch_template_id": launch_template_id, + } + } + compute_specific_dna_json = json.dumps(compute_specific_dna_json_dict, indent=4) + + self._cluster_bucket.upload_dna_cfn_asset( + file_type=S3FileType.COMPUTE_DNA_ASSETS, + asset_file_content=compute_specific_dna_json_dict, + asset_name=f"compute-dna-{launch_template_id}-{self._config.config_version}.json", + format=S3FileFormat.JSON, + ) launch_template = ec2.CfnLaunchTemplate( self, launch_template_id, @@ -266,6 +294,9 @@ def _add_compute_resource_launch_template( "LaunchTemplateResourceId": launch_template_id, "CloudFormationUrl": get_service_endpoint("cloudformation", self._config.region), "CfnInitRole": instance_role_name, + "ClusterConfigVersion": self._config.config_version, + "S3_BUCKET": self._cluster_bucket.name, + "S3_ARTIFACT_DIR": self._cluster_bucket.artifact_directory, }, **get_common_user_data_env(queue, self._config), }, @@ -294,18 +325,10 @@ def _add_compute_resource_launch_template( ), ) - dna_json = json.dumps( + common_dna_json = json.dumps( { "cluster": { - "cluster_name": self.stack_name, "stack_name": self.stack_name, - "stack_arn": self.stack_id, - "cluster_s3_bucket": self._cluster_bucket.name, - "cluster_config_s3_key": "{0}/configs/{1}".format( - self._cluster_bucket.artifact_directory, PCLUSTER_S3_ARTIFACTS_DICT.get("config_name") - ), - "cluster_config_version": self._config.config_version, - "enable_efa": "efa" if compute_resource.efa and compute_resource.efa.enabled else "NONE", "raid_shared_dir": to_comma_separated_string( self._shared_storage_mount_dirs[SharedStorageType.RAID] ), @@ -314,7 +337,7 @@ def _add_compute_resource_launch_template( ), "base_os": self._config.image.os, "region": self._config.region, - "shared_storage_type": self._config.head_node.shared_storage_type.lower(), # noqa: E501 pylint: disable=line-too-long + "shared_storage_type": self._config.head_node.shared_storage_type.lower(), "default_user_home": ( self._config.deployment_settings.default_user_home.lower() if ( @@ -332,8 +355,7 @@ def _add_compute_resource_launch_template( use_lower_case=True, ), "efs_iam_authorizations": to_comma_separated_string( - self._shared_storage_attributes[SharedStorageType.EFS]["IamAuthorizations"], - use_lower_case=True, + self._shared_storage_attributes[SharedStorageType.EFS]["IamAuthorizations"], use_lower_case=True ), "efs_access_point_ids": to_comma_separated_string( self._shared_storage_attributes[SharedStorageType.EFS]["AccessPointIds"], @@ -356,50 +378,40 @@ def _add_compute_resource_launch_template( self._shared_storage_mount_dirs[SharedStorageType.FSX] ), "scheduler": self._config.scheduling.scheduler, - "ephemeral_dir": ( - queue.compute_settings.local_storage.ephemeral_volume.mount_dir - if isinstance(queue, SlurmQueue) and queue.compute_settings.local_storage.ephemeral_volume - else DEFAULT_EPHEMERAL_DIR - ), "ebs_shared_dirs": to_comma_separated_string( self._shared_storage_mount_dirs[SharedStorageType.EBS] ), - "proxy": queue.networking.proxy.http_proxy_address if queue.networking.proxy else "NONE", - "slurm_ddb_table": self._dynamodb_table.ref if self._dynamodb_table else "NONE", + "cluster_user": OS_MAPPING[self._config.image.os]["user"], "log_group_name": ( 
self._log_group.log_group_name if self._config.monitoring.logs.cloud_watch.enabled else "NONE" ), - "dns_domain": str(self._cluster_hosted_zone.name) if self._cluster_hosted_zone else "", - "hosted_zone": str(self._cluster_hosted_zone.ref) if self._cluster_hosted_zone else "", - "node_type": "ComputeFleet", - "cluster_user": OS_MAPPING[self._config.image.os]["user"], - "enable_intel_hpc_platform": "true" if self._config.is_intel_hpc_platform_enabled else "false", "cw_logging_enabled": "true" if self._config.is_cw_logging_enabled else "false", "log_rotation_enabled": "true" if self._config.is_log_rotation_enabled else "false", - "scheduler_queue_name": queue.name, - "scheduler_compute_resource_name": compute_resource.name, - "enable_efa_gdr": ( - "compute" if compute_resource.efa and compute_resource.efa.gdr_support else "NONE" + "cluster_s3_bucket": self._cluster_bucket.name, + "cluster_config_s3_key": "{0}/configs/{1}".format( + self._cluster_bucket.artifact_directory, PCLUSTER_S3_ARTIFACTS_DICT.get("config_name") ), + "cluster_config_version": self._config.config_version, "custom_node_package": self._config.custom_node_package or "", "custom_awsbatchcli_package": self._config.custom_aws_batch_cli_package or "", - "use_private_hostname": str( - get_attr(self._config, "scheduling.settings.dns.use_ec2_hostnames", default=False) - ).lower(), "head_node_private_ip": self._head_eni.attr_primary_private_ip_address, - "directory_service": {"enabled": str(self._config.directory_service is not None).lower()}, "disable_sudo_access_for_default_user": ( "true" if self._config.deployment_settings and self._config.deployment_settings.disable_sudo_access_default_user else "false" ), - "launch_template_id": launch_template_id, - } + "directory_service": {"enabled": str(self._config.directory_service is not None).lower()}, + "dns_domain": str(self._cluster_hosted_zone.name) if self._cluster_hosted_zone else "", + "hosted_zone": str(self._cluster_hosted_zone.ref) if self._cluster_hosted_zone else "", + "slurm_ddb_table": self._dynamodb_table.ref if self._dynamodb_table else "NONE", + "use_private_hostname": str( + get_attr(self._config, "scheduling.settings.dns.use_ec2_hostnames", default=False) + ).lower(), + }, }, indent=4, ) - cfn_init = { "configSets": { "deployFiles": ["deployConfigFiles"], @@ -410,8 +422,18 @@ def _add_compute_resource_launch_template( # A nosec comment is appended to the following line in order to disable the B108 check. # The file is needed by the product # [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. - "/tmp/dna.json": { # nosec B108 - "content": dna_json, + # This file is never created. Keeping the content as config version to signify an Update for cfn-hup + "/tmp/common-dna.json": { # nosec B108 + "mode": "000644", + "owner": "root", + "group": "root", + "content": common_dna_json, + }, + # A nosec comment is appended to the following line in order to disable the B108 check. + # The file is needed by the product + # [B108:hardcoded_tmp_directory] Probable insecure usage of temp file/directory. 
+ "/tmp/compute-dna.json": { # nosec B108 + "content": compute_specific_dna_json, "mode": "000644", "owner": "root", "group": "root", @@ -432,8 +454,8 @@ def _add_compute_resource_launch_template( "touch": {"command": "touch /etc/chef/ohai/hints/ec2.json"}, "jq": { "command": ( - 'jq -s ".[0] * .[1]" /tmp/dna.json /tmp/extra.json > /etc/chef/dna.json ' - '|| ( echo "jq not installed"; cp /tmp/dna.json /etc/chef/dna.json )' + 'jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json ' + "/tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json" ) }, }, From e33e7fdef449c88b832a458b9cb0a35814e80c3d Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 11 Sep 2024 18:02:12 -0400 Subject: [PATCH 2/6] [Temp][DFSM] Use idle~ for compute node status --- tests/integration-tests/tests/update/test_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 054c104681..4cf14fc824 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -1335,7 +1335,7 @@ def test_dynamic_file_systems_update( # because the static compute node has a job running. queue1_nodes = scheduler_commands.get_compute_nodes("queue1") assert_compute_node_states( - scheduler_commands, queue1_nodes, expected_states=["draining", "draining!", "drained*", "drained"] + scheduler_commands, queue1_nodes, expected_states=["draining", "draining!", "drained*", "drained", "idle~"] ) logging.info("Checking the status of compute nodes in queue2") From 0f4b601fad6df1259a3e66de5d1fe8e2a685cfeb Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 11 Sep 2024 18:06:09 -0400 Subject: [PATCH 3/6] [Unit test] Update Unit test for testing common and compute specific dna --- cli/tests/pcluster/models/dummy_s3_bucket.py | 4 ++ .../pcluster/templates/test_queues_stack.py | 60 +++++++++++++++---- .../{dna-1.json => common-dna-1.json} | 15 +---- .../{dna-2.json => common-dna-2.json} | 15 +---- .../compute-dna-1.json | 14 +++++ .../compute-dna-2.json | 14 +++++ 6 files changed, 86 insertions(+), 36 deletions(-) rename cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/{dna-1.json => common-dna-1.json} (77%) rename cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/{dna-2.json => common-dna-2.json} (77%) create mode 100644 cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-1.json create mode 100644 cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-2.json diff --git a/cli/tests/pcluster/models/dummy_s3_bucket.py b/cli/tests/pcluster/models/dummy_s3_bucket.py index cf8b260096..7a5810c948 100644 --- a/cli/tests/pcluster/models/dummy_s3_bucket.py +++ b/cli/tests/pcluster/models/dummy_s3_bucket.py @@ -105,6 +105,9 @@ def mock_bucket_object_utils( upload_cfn_asset_mock = mocker.patch( "pcluster.models.s3_bucket.S3Bucket.upload_cfn_asset", side_effect=upload_asset_side_effect ) + upload_dna_cfn_asset_mock = mocker.patch( + "pcluster.models.s3_bucket.S3Bucket.upload_dna_cfn_asset", side_effect=upload_asset_side_effect + ) get_cfn_template_mock = mocker.patch( "pcluster.models.s3_bucket.S3Bucket.get_cfn_template", return_value=fake_template, @@ -135,6 +138,7 @@ def mock_bucket_object_utils( "get_config": get_config_mock, "upload_cfn_template": upload_cfn_template_mock, "upload_cfn_asset": upload_cfn_asset_mock, + 
"upload_dna_cfn_asset": upload_dna_cfn_asset_mock, "get_cfn_template": get_cfn_template_mock, "upload_resources": upload_resources_mock, "delete_s3_artifacts": delete_s3_artifacts_mock, diff --git a/cli/tests/pcluster/templates/test_queues_stack.py b/cli/tests/pcluster/templates/test_queues_stack.py index 18e26a8b2e..1c70f8db41 100644 --- a/cli/tests/pcluster/templates/test_queues_stack.py +++ b/cli/tests/pcluster/templates/test_queues_stack.py @@ -44,6 +44,34 @@ }, "Sid": "S3GetObj", }, + { + "Action": ["s3:GetObject", "s3:ListBucket"], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + {"Ref": "AWS::Partition"}, + ":s3:::parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", + ], + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + {"Ref": "AWS::Partition"}, + ":s3:::parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete/" + "parallelcluster/clusters/dummy-cluster-randomstring123/*", + ], + ] + }, + ], + "Sid": "S3GetLaunchTemplate", + }, { "Action": "cloudformation:DescribeStackResource", "Effect": "Allow", @@ -105,16 +133,18 @@ def test_compute_nodes_iam_permissions( @freeze_time("2024-01-15T15:30:45") @pytest.mark.parametrize( - "config_file_name, expected_compute_node_dna_json_file_name, expected_compute_node_extra_json_file_name", + "config_file_name, expected_common_dna_json_file_name, " + "expected_compute_node_dna_json_file_name, expected_compute_node_extra_json_file_name", [ - ("config-1.yaml", "dna-1.json", "extra-1.json"), - ("config-2.yaml", "dna-2.json", "extra-2.json"), + ("config-1.yaml", "common-dna-1.json", "compute-dna-1.json", "extra-1.json"), + ("config-2.yaml", "common-dna-2.json", "compute-dna-2.json", "extra-2.json"), ], ) def test_compute_nodes_dna_json( mocker, test_datadir, config_file_name, + expected_common_dna_json_file_name, expected_compute_node_dna_json_file_name, expected_compute_node_extra_json_file_name, ): @@ -132,22 +162,32 @@ def test_compute_nodes_dna_json( compute_node_lt_asset = get_asset_content_with_resource_name(cdk_assets, "LaunchTemplateA7211c84b953696f") compute_node_lt = compute_node_lt_asset["Resources"]["LaunchTemplateA7211c84b953696f"] compute_node_cfn_init_files = compute_node_lt["Metadata"]["AWS::CloudFormation::Init"]["deployConfigFiles"]["files"] - compute_node_dna_json = compute_node_cfn_init_files["/tmp/dna.json"] + common_dna_json = compute_node_cfn_init_files["/tmp/common-dna.json"] + compute_node_dna_json = compute_node_cfn_init_files["/tmp/compute-dna.json"] compute_node_extra_json = compute_node_cfn_init_files["/tmp/extra.json"] # Expected dna.json and extra.json + expected_commom_dna_json = load_json_dict(test_datadir / expected_common_dna_json_file_name) expected_compute_node_dna_json = load_json_dict(test_datadir / expected_compute_node_dna_json_file_name) expected_compute_node_extra_json = load_json_dict(test_datadir / expected_compute_node_extra_json_file_name) expected_owner = expected_group = "root" expected_mode = "000644" # Assertions on dna.json - rendered_dna_json_content = render_join(compute_node_dna_json["content"]["Fn::Join"]) - rendered_dna_json_content_as_json = json.loads(rendered_dna_json_content) - assert_that(compute_node_dna_json["owner"]).is_equal_to(expected_owner) - assert_that(compute_node_dna_json["group"]).is_equal_to(expected_group) - assert_that(compute_node_dna_json["mode"]).is_equal_to(expected_mode) - assert_that(rendered_dna_json_content_as_json).is_equal_to(expected_compute_node_dna_json) + + for file_json, expected_json, keys in [ + (common_dna_json, 
expected_commom_dna_json, "Fn::Join"), + (compute_node_dna_json, expected_compute_node_dna_json, None), + ]: + if keys: + rendered_content = render_join(file_json["content"][keys]) + else: + rendered_content = file_json["content"] + rendered_json = json.loads(rendered_content) + assert_that(file_json["owner"]).is_equal_to(expected_owner) + assert_that(file_json["group"]).is_equal_to(expected_group) + assert_that(file_json["mode"]).is_equal_to(expected_mode) + assert_that(rendered_json).is_equal_to(expected_json) # Assertions on extra.json assert_that(compute_node_extra_json["owner"]).is_equal_to(expected_owner) diff --git a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-1.json b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-1.json similarity index 77% rename from cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-1.json rename to cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-1.json index 18140d9c0c..606dec103f 100644 --- a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-1.json +++ b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-1.json @@ -3,7 +3,6 @@ "base_os": "alinux2", "cluster_config_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml", "cluster_config_version": "", - "cluster_name": "clustername", "cluster_s3_bucket": "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "cluster_user": "ec2-user", "custom_awsbatchcli_package": "", @@ -16,15 +15,11 @@ "disable_sudo_access_for_default_user": "false", "dns_domain": "{\"Ref\": \"referencetoclusternameClusterDNSDomain8D0872E1Ref\"}", "ebs_shared_dirs": "", + "efs_access_point_ids": "", "efs_encryption_in_transits": "", "efs_fs_ids": "", "efs_iam_authorizations": "", "efs_shared_dirs": "", - "efs_access_point_ids": "", - "enable_efa": "NONE", - "enable_efa_gdr": "NONE", - "enable_intel_hpc_platform": "false", - "ephemeral_dir": "/scratch", "fsx_dns_names": "", "fsx_fs_ids": "", "fsx_fs_types": "", @@ -35,19 +30,13 @@ "hosted_zone": "{\"Ref\": \"referencetoclusternameRoute53HostedZone2388733DRef\"}", "log_group_name": "/aws/parallelcluster/clustername-202401151530", "log_rotation_enabled": "true", - "node_type": "ComputeFleet", - "proxy": "NONE", "raid_shared_dir": "", "raid_type": "", "region": "us-east-1", "scheduler": "slurm", - "scheduler_compute_resource_name": "cr1", - "scheduler_queue_name": "queue1", "shared_storage_type": "ebs", "slurm_ddb_table": "{\"Ref\": \"referencetoclusternameSlurmDynamoDBTable99119DBERef\"}", - "stack_arn": "{\"Ref\": \"AWS::StackId\"}", "stack_name": "clustername", - "use_private_hostname": "false", - "launch_template_id": "LaunchTemplateA7211c84b953696f" + "use_private_hostname": "false" } } diff --git a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-2.json b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-2.json similarity index 77% rename from cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-2.json rename to cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-2.json index 2f4f04c400..25a148df52 100644 --- a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/dna-2.json +++ b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/common-dna-2.json @@ -3,7 +3,6 @@ 
"base_os": "alinux2", "cluster_config_s3_key": "parallelcluster/clusters/dummy-cluster-randomstring123/configs/cluster-config-with-implied-values.yaml", "cluster_config_version": "", - "cluster_name": "clustername", "cluster_s3_bucket": "parallelcluster-a69601b5ee1fc2f2-v1-do-not-delete", "cluster_user": "ec2-user", "custom_awsbatchcli_package": "", @@ -16,15 +15,11 @@ "disable_sudo_access_for_default_user": "true", "dns_domain": "{\"Ref\": \"referencetoclusternameClusterDNSDomain8D0872E1Ref\"}", "ebs_shared_dirs": "", + "efs_access_point_ids": "", "efs_encryption_in_transits": "", "efs_fs_ids": "", "efs_iam_authorizations": "", "efs_shared_dirs": "", - "efs_access_point_ids": "", - "enable_efa": "NONE", - "enable_efa_gdr": "NONE", - "enable_intel_hpc_platform": "false", - "ephemeral_dir": "/scratch", "fsx_dns_names": "", "fsx_fs_ids": "", "fsx_fs_types": "", @@ -35,19 +30,13 @@ "hosted_zone": "{\"Ref\": \"referencetoclusternameRoute53HostedZone2388733DRef\"}", "log_group_name": "/aws/parallelcluster/clustername-202401151530", "log_rotation_enabled": "true", - "node_type": "ComputeFleet", - "proxy": "NONE", "raid_shared_dir": "", "raid_type": "", "region": "us-east-1", "scheduler": "slurm", - "scheduler_compute_resource_name": "cr1", - "scheduler_queue_name": "queue1", "shared_storage_type": "ebs", "slurm_ddb_table": "{\"Ref\": \"referencetoclusternameSlurmDynamoDBTable99119DBERef\"}", - "stack_arn": "{\"Ref\": \"AWS::StackId\"}", "stack_name": "clustername", - "use_private_hostname": "false", - "launch_template_id": "LaunchTemplateA7211c84b953696f" + "use_private_hostname": "false" } } diff --git a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-1.json b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-1.json new file mode 100644 index 0000000000..681b1e38ed --- /dev/null +++ b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-1.json @@ -0,0 +1,14 @@ +{ + "cluster": { + "cluster_name": "clustername", + "enable_efa": "NONE", + "enable_efa_gdr": "NONE", + "ephemeral_dir": "/scratch", + "launch_template_id": "LaunchTemplateA7211c84b953696f", + "node_type": "ComputeFleet", + "proxy": "NONE", + "scheduler_compute_resource_name": "cr1", + "scheduler_queue_name": "queue1", + "stack_name": "clustername" + } +} diff --git a/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-2.json b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-2.json new file mode 100644 index 0000000000..681b1e38ed --- /dev/null +++ b/cli/tests/pcluster/templates/test_queues_stack/test_compute_nodes_dna_json/compute-dna-2.json @@ -0,0 +1,14 @@ +{ + "cluster": { + "cluster_name": "clustername", + "enable_efa": "NONE", + "enable_efa_gdr": "NONE", + "ephemeral_dir": "/scratch", + "launch_template_id": "LaunchTemplateA7211c84b953696f", + "node_type": "ComputeFleet", + "proxy": "NONE", + "scheduler_compute_resource_name": "cr1", + "scheduler_queue_name": "queue1", + "stack_name": "clustername" + } +} From f7af1222f50467e74eac4adc9127f8a0cc7c04ce Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 11 Sep 2024 19:31:33 -0400 Subject: [PATCH 4/6] [Temp][DFSM] Use idle~ for compute node status --- tests/integration-tests/tests/update/test_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/tests/update/test_update.py b/tests/integration-tests/tests/update/test_update.py index 
4cf14fc824..8095934bf0 100644 --- a/tests/integration-tests/tests/update/test_update.py +++ b/tests/integration-tests/tests/update/test_update.py @@ -1344,7 +1344,7 @@ def test_dynamic_file_systems_update( # or under replacement (drained, draining). queue2_nodes = scheduler_commands.get_compute_nodes("queue2") assert_compute_node_states( - scheduler_commands, queue2_nodes, expected_states=["idle", "drained", "idle%", "drained*", "draining"] + scheduler_commands, queue2_nodes, expected_states=["idle", "drained", "idle%", "drained*", "draining", "idle~"] ) logging.info("Checking that shared storage is visible on the head node") From f612fb971606914fb857b9da7321e79cc021aad1 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 11 Sep 2024 19:54:22 -0400 Subject: [PATCH 5/6] Removing the error of jq not existing as we install it using cookbook --- cli/src/pcluster/resources/compute_node/user_data.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cli/src/pcluster/resources/compute_node/user_data.sh b/cli/src/pcluster/resources/compute_node/user_data.sh index f2bd437c12..5fc4b625f5 100644 --- a/cli/src/pcluster/resources/compute_node/user_data.sh +++ b/cli/src/pcluster/resources/compute_node/user_data.sh @@ -164,7 +164,7 @@ write_files: touch /etc/chef/ohai/hints/ec2.json start=$(date +%s) - jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json || ( echo "jq not installed"; cp /tmp/common-dna.json /tmp/compute-dna.json /etc/chef/dna.json ) + jq -s ".[0] * .[1] * .[2] * .[3]" /tmp/common-dna.json /tmp/compute-dna.json /tmp/stack-arn.json /tmp/extra.json > /etc/chef/dna.json { CINC_CMD="cinc-client --local-mode --config /etc/chef/client.rb --log_level info --logfile /var/log/chef-client.log --force-formatter --no-color --chef-zero-port 8889 --json-attributes /etc/chef/dna.json --override-runlist" FR_CMD="/opt/parallelcluster/scripts/fetch_and_run" From 307d600414de4237e778a2159e8623978e84e737 Mon Sep 17 00:00:00 2001 From: Himani Deshpande Date: Wed, 11 Sep 2024 20:41:07 -0400 Subject: [PATCH 6/6] [Unit test] testing fake dna asset --- cli/tests/pcluster/cli/test_commands.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cli/tests/pcluster/cli/test_commands.py b/cli/tests/pcluster/cli/test_commands.py index fbf42a911d..27243e54ee 100644 --- a/cli/tests/pcluster/cli/test_commands.py +++ b/cli/tests/pcluster/cli/test_commands.py @@ -15,6 +15,7 @@ from pcluster.aws.common import AWSClientError from pcluster.models.cluster import ClusterActionError, ClusterStack +from pcluster.models.s3_bucket import S3FileType from tests.pcluster.aws.dummy_aws_api import mock_aws_api from tests.pcluster.config.dummy_cluster_config import dummy_awsbatch_cluster_config, dummy_slurm_cluster_config from tests.pcluster.models.dummy_s3_bucket import mock_bucket, mock_bucket_object_utils, mock_bucket_utils @@ -135,6 +136,7 @@ def test_setup_bucket_with_resources_success( upload_config_mock = mock_dict.get("upload_config") upload_template_mock = mock_dict.get("upload_cfn_template") upload_asset_mock = mock_dict.get("upload_cfn_asset") + upload_dna_asset_mock = mock_dict.get("upload_dna_cfn_asset") upload_custom_resources_mock = mock_dict.get("upload_resources") # mock bucket utils check_bucket_mock = mock_bucket_utils(mocker, root_service_dir=f"{cluster_name}-abc123")["check_bucket_exists"] @@ -150,6 +152,7 @@ def test_setup_bucket_with_resources_success( cluster.bucket.upload_config(expected_config, 
"fake_config_name") cluster.bucket.upload_cfn_template(expected_template, "fake_template_name") cluster.bucket.upload_cfn_asset(expected_asset, "fake_asset_name") + cluster.bucket.upload_dna_cfn_asset(expected_asset, "fake_asset_name", S3FileType.COMPUTE_DNA_ASSETS) for dir in expected_dirs: cluster.bucket.upload_resources(dir) @@ -159,6 +162,7 @@ def test_setup_bucket_with_resources_success( upload_config_mock.assert_called_with(expected_config, "fake_config_name") upload_template_mock.assert_called_with(expected_template, "fake_template_name") upload_asset_mock.assert_called_with(expected_asset, "fake_asset_name") + upload_dna_asset_mock.assert_called_with(expected_asset, "fake_asset_name", S3FileType.COMPUTE_DNA_ASSETS) upload_custom_resources_mock.assert_has_calls([mocker.call(dir) for dir in expected_dirs]) # assert bucket properties