From bac3c8f4829e8fc8ddebb6ead4b65a96db43d0af Mon Sep 17 00:00:00 2001 From: hanwenli Date: Wed, 11 Jun 2025 08:46:00 -0700 Subject: [PATCH 1/9] Do not error exit when capacity reservation from the existing cluster config has been deleted --- cli/src/pcluster/config/cluster_config.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index 96a747ebd7..1c0b81c856 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -2446,9 +2446,13 @@ def _instance_type_from_capacity_reservation(self): self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None ) if capacity_reservation_id: - capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id]) - if capacity_reservations: - instance_type = capacity_reservations[0].instance_type() + try: + capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id]) + if capacity_reservations: + instance_type = capacity_reservations[0].instance_type() + except AWSClientError: + # In case the CR has expired and we are unable to retrieve the instance type + instance_type = None return instance_type From f4aee623a09bea2ed67082e7783f313c657183b2 Mon Sep 17 00:00:00 2001 From: hanwenli Date: Wed, 11 Jun 2025 12:02:08 -0700 Subject: [PATCH 2/9] Improve integration test to test capacity reservation update without instance type specified --- .../test_on_demand_capacity_reservation.py | 62 +++++++- .../pcluster.config.update.yaml | 143 ++++++++++++++++++ .../pcluster.config.yaml | 1 - 3 files changed, 203 insertions(+), 3 deletions(-) create mode 100644 tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.update.yaml diff --git a/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation.py b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation.py index b732ffd19a..179d26dc66 100644 --- a/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation.py +++ b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation.py @@ -10,11 +10,13 @@ # This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. # See the License for the specific language governing permissions and limitations under the License. import logging +import os +import subprocess import boto3 import pytest from assertpy import assert_that -from utils import describe_cluster_instances, retrieve_cfn_resources +from utils import describe_cluster_instances, retrieve_cfn_resources, wait_for_computefleet_changed @pytest.mark.usefixtures("os", "region") @@ -40,7 +42,50 @@ def test_on_demand_capacity_reservation( pg_capacity_reservation_id=odcr_resources["integTestsPgOdcr"], pg_capacity_reservation_arn=resource_group_arn, ) - cluster = clusters_factory(cluster_config) + + # Apply patch to the repo + logging.info("Applying patch to the repository") + repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../..")) + s3_bucket_file = os.path.join(repo_root, "cli/src/pcluster/models/s3_bucket.py") + + # Backup the original file + with open(s3_bucket_file, "r") as f: + original_content = f.read() + + try: + # Apply the patch - inject the bug that replaces capacity reservation IDs + with open(s3_bucket_file, "r") as f: + content = f.read() + + # Add the bug injection line after the upload_config method definition + modified_content = content.replace( + " def upload_config(self, config, config_name, format=S3FileFormat.YAML):\n" + ' """Upload config file to S3 bucket."""', + " def upload_config(self, config, config_name, format=S3FileFormat.YAML):\n" + ' """Upload config file to S3 bucket."""\n' + ' if config_name == "cluster-config.yaml":\n' + " config = re.sub(r'cr-[0-9a-f]{17}', 'cr-11111111111111111', config)", + ) + + # Write the modified content back + with open(s3_bucket_file, "w") as f: + f.write(modified_content) + + # Install the CLI + logging.info("Installing CLI from local repository") + subprocess.run(["pip", "install", "./cli"], cwd=repo_root, check=True) + + # Create the cluster + cluster = clusters_factory(cluster_config) + finally: + # Revert the patch by restoring the original file + logging.info("Reverting patch from the repository") + with open(s3_bucket_file, "w") as f: + f.write(original_content) + + # Reinstall the CLI + logging.info("Reinstalling CLI from local repository") + subprocess.run(["pip", "install", "./cli"], cwd=repo_root, check=True) _assert_instance_in_capacity_reservation(cluster, region, "open-odcr-id-cr", odcr_resources["integTestsOpenOdcr"]) _assert_instance_in_capacity_reservation(cluster, region, "open-odcr-arn-cr", odcr_resources["integTestsOpenOdcr"]) @@ -64,6 +109,19 @@ def test_on_demand_capacity_reservation( ) _assert_instance_in_capacity_reservation(cluster, region, "pg-odcr-id-cr", odcr_resources["integTestsPgOdcr"]) _assert_instance_in_capacity_reservation(cluster, region, "pg-odcr-arn-cr", odcr_resources["integTestsPgOdcr"]) + cluster.stop() + wait_for_computefleet_changed(cluster, "STOPPED") + updated_config_file = pcluster_config_reader( + config_file="pcluster.config.update.yaml", + placement_group=placement_group_stack.cfn_resources["PlacementGroup"], + open_capacity_reservation_id=odcr_resources["integTestsOpenOdcr"], + open_capacity_reservation_arn=resource_group_arn, + target_capacity_reservation_id=odcr_resources["integTestsTargetOdcr"], + target_capacity_reservation_arn=resource_group_arn, + pg_capacity_reservation_id=odcr_resources["integTestsPgOdcr"], + pg_capacity_reservation_arn=resource_group_arn, + ) + cluster.update(str(updated_config_file)) def _assert_instance_in_capacity_reservation(cluster, region, compute_resource_name, expected_reservation): diff --git a/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.update.yaml b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.update.yaml new file mode 100644 index 0000000000..3b5112eb55 --- /dev/null +++ b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.update.yaml @@ -0,0 +1,143 @@ +Image: + Os: {{ os }} +HeadNode: + InstanceType: r5.xlarge + Networking: + SubnetId: {{ public_subnet_id }} + Ssh: + KeyName: {{ key_name }} +Scheduling: + Scheduler: slurm + SlurmQueues: + - Name: open-odcr-q + ComputeResources: + - Name: open-odcr-id-cr + InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationId: {{ open_capacity_reservation_id }} + - Name: open-odcr-arn-cr + InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }} + - Name: open-odcr-arn-fl-cr + Instances: + - InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }} + - Name: open-odcr-id-pg-cr + InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Enabled: true + CapacityReservationTarget: + CapacityReservationId: {{ open_capacity_reservation_id }} + - Name: open-odcr-arn-pg-cr + InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Enabled: true + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }} + - Name: open-odcr-arn-pg-fl-cr + Instances: + - InstanceType: m5.2xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Enabled: true + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }} + Networking: + SubnetIds: + - {{ public_subnet_id }} + - Name: target-odcr-q + ComputeResources: + - Name: target-odcr-id-cr + InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationId: {{ target_capacity_reservation_id }} + - Name: target-odcr-arn-cr + InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }} + - Name: target-odcr-arn-fl-cr + Instances: + - InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }} + - Name: target-odcr-id-pg-cr + InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Enabled: true + CapacityReservationTarget: + CapacityReservationId: {{ target_capacity_reservation_id }} + - Name: target-odcr-arn-pg-cr + InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }} + - Name: target-odcr-arn-pg-fl-cr + Instances: + - InstanceType: r5.xlarge + MinCount: 0 + MaxCount: 1 + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }} + Networking: + SubnetIds: + - {{ public_subnet_id }} + - Name: pg-odcr-q + ComputeResources: + - Name: pg-odcr-id-cr + InstanceType: m5.xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Name: {{ placement_group }} + CapacityReservationTarget: + CapacityReservationId: {{ pg_capacity_reservation_id }} + - Name: pg-odcr-arn-cr + InstanceType: m5.xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Name: {{ placement_group }} + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ pg_capacity_reservation_arn }} + - Name: pg-odcr-arn-fleet-cr + Instances: + - InstanceType: m5.xlarge + MinCount: 0 + MaxCount: 1 + Networking: + PlacementGroup: + Name: {{ placement_group }} + CapacityReservationTarget: + CapacityReservationResourceGroupArn: {{ pg_capacity_reservation_arn }} + Networking: + SubnetIds: + - {{ public_subnet_id }} + diff --git a/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.yaml b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.yaml index 08094c3a41..43838e4883 100644 --- a/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.yaml +++ b/tests/integration-tests/tests/capacity_reservations/test_on_demand_capacity_reservation/test_on_demand_capacity_reservation/pcluster.config.yaml @@ -12,7 +12,6 @@ Scheduling: - Name: open-odcr-q ComputeResources: - Name: open-odcr-id-cr - InstanceType: m5.2xlarge MinCount: 1 MaxCount: 1 CapacityReservationTarget: From a5f86dce627557d7e4f8419313d54d049b2307e4 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 16 Jun 2025 15:50:32 -0400 Subject: [PATCH 3/9] Update changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1d81f8e12e..6c8f1e5dca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ CHANGELOG **BUG FIXES** - Fix an issue where Security Group validation failed when a rule contained both IPv4 ranges (IpRanges) and security group references (UserIdGroupPairs). +- Fix a bug that update-cluster, update-compute-fleet may fail when compute resources use an expired Capacity Reservation. 3.13.1 ------ From aa590b902ccb46423b63845faeb538fafc2e3496 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 16 Jun 2025 17:37:08 -0400 Subject: [PATCH 4/9] Address mistake in changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6c8f1e5dca..5a62840381 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,11 @@ CHANGELOG **BUG FIXES** - Fix an issue where Security Group validation failed when a rule contained both IPv4 ranges (IpRanges) and security group references (UserIdGroupPairs). + +3.13.2 +------ + +**BUG FIXES** - Fix a bug that update-cluster, update-compute-fleet may fail when compute resources use an expired Capacity Reservation. 3.13.1 From 34ce03d37d31274064a0bac9ac287317413b0169 Mon Sep 17 00:00:00 2001 From: Helena Greebe Date: Thu, 19 Jun 2025 16:03:50 -0400 Subject: [PATCH 5/9] Add instance type property to avoid capacity reservation from being checked during an update --- cli/src/pcluster/config/cluster_config.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/cli/src/pcluster/config/cluster_config.py b/cli/src/pcluster/config/cluster_config.py index 1c0b81c856..e4ba6fd4ca 100644 --- a/cli/src/pcluster/config/cluster_config.py +++ b/cli/src/pcluster/config/cluster_config.py @@ -2385,9 +2385,8 @@ class SlurmComputeResource(_BaseSlurmComputeResource): def __init__(self, instance_type=None, **kwargs): super().__init__(**kwargs) - _instance_type = instance_type if instance_type else self._instance_type_from_capacity_reservation() - self.instance_type = Resource.init_param(_instance_type) self.__instance_type_info = None + self._instance_type = Resource.init_param(instance_type) def is_flexible(self): """Return False because the ComputeResource can not contain multiple instance types.""" @@ -2398,6 +2397,14 @@ def instance_types(self) -> List[str]: """List of instance types under this compute resource.""" return [self.instance_type] + @property + # Do not invoke in update path + def instance_type(self): + """Instance type of this compute resource.""" + if not self._instance_type: + self._instance_type = Resource.init_param(self._instance_type_from_capacity_reservation()) + return self._instance_type + def _register_validators(self, context: ValidatorContext = None): super()._register_validators(context) self._register_validator( @@ -2446,13 +2453,9 @@ def _instance_type_from_capacity_reservation(self): self.capacity_reservation_target.capacity_reservation_id if self.capacity_reservation_target else None ) if capacity_reservation_id: - try: - capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id]) - if capacity_reservations: - instance_type = capacity_reservations[0].instance_type() - except AWSClientError: - # In case the CR has expired and we are unable to retrieve the instance type - instance_type = None + capacity_reservations = AWSApi.instance().ec2.describe_capacity_reservations([capacity_reservation_id]) + if capacity_reservations: + instance_type = capacity_reservations[0].instance_type() return instance_type From 045d889148d95acc3aeccec920fa99c854e4dbc5 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 23 Jun 2025 14:48:50 -0400 Subject: [PATCH 6/9] Revise changelog --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 48c8c9fb7a..84b7cd433f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,7 +14,8 @@ CHANGELOG ------ **BUG FIXES** -- Fix a bug that update-cluster, update-compute-fleet may fail when compute resources use an expired Capacity Reservation. +- Fix a bug which may cause `update-cluster` and `update-compute-fleet` to fail when compute resources reference an expired Capacity Reservation that is no longer accessible via EC2 APIs. +- Fix `build-image` failure on Rocky 9, occurring when the parent image is not the latest version. https://github.com/aws/aws-parallelcluster/issues/6874 3.13.1 ------ From fe76632f51d680a11ffd58b5a8f9b0cd66626834 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 23 Jun 2025 14:50:35 -0400 Subject: [PATCH 7/9] Remove duplicate changelog --- CHANGELOG.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84b7cd433f..b40880950e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,12 +20,6 @@ CHANGELOG 3.13.1 ------ -**BUG FIXES** -- Fix build image failures occurring on non-latest versions of Rocky Linux 9. - -3.13.1 ------- - **CHANGES** - Upgrade Slurm to version 24.05.8. - Upgrade EFA installer to 1.41.0 (from 1.38.1). From 811c280d5ba76e9f98b41f3d705420d3ffe9e050 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 23 Jun 2025 15:18:13 -0400 Subject: [PATCH 8/9] Final version CHANGELOG. --- CHANGELOG.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b40880950e..364f567471 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,8 +14,10 @@ CHANGELOG ------ **BUG FIXES** -- Fix a bug which may cause `update-cluster` and `update-compute-fleet` to fail when compute resources reference an expired Capacity Reservation that is no longer accessible via EC2 APIs. -- Fix `build-image` failure on Rocky 9, occurring when the parent image is not the latest version. https://github.com/aws/aws-parallelcluster/issues/6874 +- Fix a bug which may cause `update-cluster` and `update-compute-fleet` to fail when compute resources reference an expired Capacity Reservation + that is no longer accessible via EC2 APIs. +- Fix `build-image` failure on Rocky 9, occurring when the parent image does not ship the latest kernel version + See https://github.com/aws/aws-parallelcluster/issues/6874. 3.13.1 ------ From 0ca4071e93cd75061c3e3af5b139db00eb82b4d0 Mon Sep 17 00:00:00 2001 From: Xuanqi He Date: Mon, 23 Jun 2025 16:27:14 -0400 Subject: [PATCH 9/9] Correctly ternminate the sentence with a period. --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 364f567471..459a150e91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,7 +16,7 @@ CHANGELOG **BUG FIXES** - Fix a bug which may cause `update-cluster` and `update-compute-fleet` to fail when compute resources reference an expired Capacity Reservation that is no longer accessible via EC2 APIs. -- Fix `build-image` failure on Rocky 9, occurring when the parent image does not ship the latest kernel version +- Fix `build-image` failure on Rocky 9, occurring when the parent image does not ship the latest kernel version. See https://github.com/aws/aws-parallelcluster/issues/6874. 3.13.1