[develop] Address cluster update failure when old capacity reservation has been deleted and changelog update for 3.13.2 #6884

Merged: 10 commits, Jun 23, 2025

CHANGELOG.md (5 additions, 2 deletions)
@@ -10,11 +10,14 @@ CHANGELOG
**BUG FIXES**
- Fix an issue where Security Group validation failed when a rule contained both IPv4 ranges (IpRanges) and security group references (UserIdGroupPairs).

3.13.1
3.13.2
------

**BUG FIXES**
- Fix build image failures occurring on non-latest versions of Rocky Linux 9.
- Fix a bug which may cause `update-cluster` and `update-compute-fleet` to fail when compute resources reference an expired Capacity Reservation
that is no longer accessible via EC2 APIs.
- Fix `build-image` failure on Rocky 9, occurring when the parent image does not ship the latest kernel version.
See https://github.com/aws/aws-parallelcluster/issues/6874.
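
The `update-cluster` / `update-compute-fleet` entry above refers to an eager Capacity Reservation lookup that used to run while the config model was being built. A minimal sketch of the failure mode, assuming a reservation ID that no longer exists (the ID and error handling below are illustrative, not part of this PR):

```python
# Minimal sketch: describing a Capacity Reservation that has expired or been
# deleted can raise a ClientError, which is why an eager lookup could break
# update-cluster / update-compute-fleet.
import boto3
from botocore.exceptions import ClientError

ec2 = boto3.client("ec2")
try:
    reservations = ec2.describe_capacity_reservations(
        CapacityReservationIds=["cr-11111111111111111"]  # hypothetical, nonexistent ID
    )
    instance_type = reservations["CapacityReservations"][0]["InstanceType"]
except ClientError as error:
    # Before 3.13.2, an error like this surfaced during cluster updates even
    # when the instance type was never actually needed.
    print(f"Capacity Reservation lookup failed: {error}")
```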

3.13.1
------

cli/src/pcluster/config/cluster_config.py (9 additions, 2 deletions)
@@ -2385,9 +2385,8 @@ class SlurmComputeResource(_BaseSlurmComputeResource):

def __init__(self, instance_type=None, **kwargs):
super().__init__(**kwargs)
_instance_type = instance_type if instance_type else self._instance_type_from_capacity_reservation()
self.instance_type = Resource.init_param(_instance_type)
self.__instance_type_info = None
self._instance_type = Resource.init_param(instance_type)

def is_flexible(self):
"""Return False because the ComputeResource can not contain multiple instance types."""
@@ -2398,6 +2397,14 @@ def instance_types(self) -> List[str]:
"""List of instance types under this compute resource."""
return [self.instance_type]

@property
# Do not invoke in update path
def instance_type(self):
"""Instance type of this compute resource."""
if not self._instance_type:
self._instance_type = Resource.init_param(self._instance_type_from_capacity_reservation())
return self._instance_type

def _register_validators(self, context: ValidatorContext = None):
super()._register_validators(context)
self._register_validator(
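
Taken together, this hunk replaces the eager lookup in `__init__` with a lazy, cached property: the Capacity Reservation is queried only when `instance_type` is first read, so code paths that never read it cannot fail on a deleted reservation. A compact sketch of the same pattern, with illustrative names only:

```python
# Illustrative sketch of the lazy-resolution pattern above (not the PR's
# actual class).
class ComputeResourceSketch:
    def __init__(self, instance_type=None):
        self._instance_type = instance_type  # may be None; no EC2 call here

    @property
    def instance_type(self):
        if not self._instance_type:
            # Deferred lookup; the result is cached for subsequent accesses.
            self._instance_type = self._instance_type_from_capacity_reservation()
        return self._instance_type

    def _instance_type_from_capacity_reservation(self):
        # Placeholder for the EC2 DescribeCapacityReservations lookup.
        raise NotImplementedError
```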

@@ -10,11 +10,13 @@
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
import os
import subprocess

import boto3
import pytest
from assertpy import assert_that
from utils import describe_cluster_instances, retrieve_cfn_resources
from utils import describe_cluster_instances, retrieve_cfn_resources, wait_for_computefleet_changed


@pytest.mark.usefixtures("os", "region")
@@ -40,7 +42,50 @@ def test_on_demand_capacity_reservation(
pg_capacity_reservation_id=odcr_resources["integTestsPgOdcr"],
pg_capacity_reservation_arn=resource_group_arn,
)
cluster = clusters_factory(cluster_config)

# Apply patch to the repo
logging.info("Applying patch to the repository")
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../.."))
s3_bucket_file = os.path.join(repo_root, "cli/src/pcluster/models/s3_bucket.py")

# Backup the original file
with open(s3_bucket_file, "r") as f:
original_content = f.read()

try:
# Apply the patch - inject the bug that replaces capacity reservation IDs
with open(s3_bucket_file, "r") as f:
content = f.read()

# Add the bug injection line after the upload_config method definition
modified_content = content.replace(
" def upload_config(self, config, config_name, format=S3FileFormat.YAML):\n"
' """Upload config file to S3 bucket."""',
" def upload_config(self, config, config_name, format=S3FileFormat.YAML):\n"
' """Upload config file to S3 bucket."""\n'
' if config_name == "cluster-config.yaml":\n'
" config = re.sub(r'cr-[0-9a-f]{17}', 'cr-11111111111111111', config)",
)

# Write the modified content back
with open(s3_bucket_file, "w") as f:
f.write(modified_content)

# Install the CLI
logging.info("Installing CLI from local repository")
subprocess.run(["pip", "install", "./cli"], cwd=repo_root, check=True)

# Create the cluster
cluster = clusters_factory(cluster_config)
finally:
# Revert the patch by restoring the original file
logging.info("Reverting patch from the repository")
with open(s3_bucket_file, "w") as f:
f.write(original_content)

# Reinstall the CLI
logging.info("Reinstalling CLI from local repository")
subprocess.run(["pip", "install", "./cli"], cwd=repo_root, check=True)

_assert_instance_in_capacity_reservation(cluster, region, "open-odcr-id-cr", odcr_resources["integTestsOpenOdcr"])
_assert_instance_in_capacity_reservation(cluster, region, "open-odcr-arn-cr", odcr_resources["integTestsOpenOdcr"])
@@ -64,6 +109,19 @@ def test_on_demand_capacity_reservation(
)
_assert_instance_in_capacity_reservation(cluster, region, "pg-odcr-id-cr", odcr_resources["integTestsPgOdcr"])
_assert_instance_in_capacity_reservation(cluster, region, "pg-odcr-arn-cr", odcr_resources["integTestsPgOdcr"])
cluster.stop()
wait_for_computefleet_changed(cluster, "STOPPED")
updated_config_file = pcluster_config_reader(
config_file="pcluster.config.update.yaml",
placement_group=placement_group_stack.cfn_resources["PlacementGroup"],
open_capacity_reservation_id=odcr_resources["integTestsOpenOdcr"],
open_capacity_reservation_arn=resource_group_arn,
target_capacity_reservation_id=odcr_resources["integTestsTargetOdcr"],
target_capacity_reservation_arn=resource_group_arn,
pg_capacity_reservation_id=odcr_resources["integTestsPgOdcr"],
pg_capacity_reservation_arn=resource_group_arn,
)
cluster.update(str(updated_config_file))
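
The patch applied earlier in this test rewrites every Capacity Reservation ID in the config uploaded at cluster creation, so the stored config points at a reservation that can no longer be described; the later `cluster.update(...)` then exercises the fixed code path. A standalone illustration of the injected substitution, using a made-up config fragment:

```python
# Standalone illustration of the injected re.sub: every real reservation ID
# is replaced with a nonexistent one, simulating a Capacity Reservation that
# was deleted after cluster creation.
import re

sample = "CapacityReservationId: cr-0123456789abcdef0"  # made-up ID
patched = re.sub(r"cr-[0-9a-f]{17}", "cr-11111111111111111", sample)
print(patched)  # -> CapacityReservationId: cr-11111111111111111
```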


def _assert_instance_in_capacity_reservation(cluster, region, compute_resource_name, expected_reservation):

@@ -0,0 +1,143 @@
Image:
Os: {{ os }}
HeadNode:
InstanceType: r5.xlarge
Networking:
SubnetId: {{ public_subnet_id }}
Ssh:
KeyName: {{ key_name }}
Scheduling:
Scheduler: slurm
SlurmQueues:
- Name: open-odcr-q
ComputeResources:
- Name: open-odcr-id-cr
InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationId: {{ open_capacity_reservation_id }}
- Name: open-odcr-arn-cr
InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }}
- Name: open-odcr-arn-fl-cr
Instances:
- InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }}
- Name: open-odcr-id-pg-cr
InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Enabled: true
CapacityReservationTarget:
CapacityReservationId: {{ open_capacity_reservation_id }}
- Name: open-odcr-arn-pg-cr
InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Enabled: true
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }}
- Name: open-odcr-arn-pg-fl-cr
Instances:
- InstanceType: m5.2xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Enabled: true
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ open_capacity_reservation_arn }}
Networking:
SubnetIds:
- {{ public_subnet_id }}
- Name: target-odcr-q
ComputeResources:
- Name: target-odcr-id-cr
InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationId: {{ target_capacity_reservation_id }}
- Name: target-odcr-arn-cr
InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }}
- Name: target-odcr-arn-fl-cr
Instances:
- InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }}
- Name: target-odcr-id-pg-cr
InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Enabled: true
CapacityReservationTarget:
CapacityReservationId: {{ target_capacity_reservation_id }}
- Name: target-odcr-arn-pg-cr
InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }}
- Name: target-odcr-arn-pg-fl-cr
Instances:
- InstanceType: r5.xlarge
MinCount: 0
MaxCount: 1
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ target_capacity_reservation_arn }}
Networking:
SubnetIds:
- {{ public_subnet_id }}
- Name: pg-odcr-q
ComputeResources:
- Name: pg-odcr-id-cr
InstanceType: m5.xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Name: {{ placement_group }}
CapacityReservationTarget:
CapacityReservationId: {{ pg_capacity_reservation_id }}
- Name: pg-odcr-arn-cr
InstanceType: m5.xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Name: {{ placement_group }}
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ pg_capacity_reservation_arn }}
- Name: pg-odcr-arn-fleet-cr
Instances:
- InstanceType: m5.xlarge
MinCount: 0
MaxCount: 1
Networking:
PlacementGroup:
Name: {{ placement_group }}
CapacityReservationTarget:
CapacityReservationResourceGroupArn: {{ pg_capacity_reservation_arn }}
Networking:
SubnetIds:
- {{ public_subnet_id }}
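
The `{{ ... }}` placeholders in the template above are filled by the test's `pcluster_config_reader` fixture. A rough sketch of rendering it outside the test harness, where the file name, the use of jinja2, and every value are assumptions for illustration only:

```python
# Rough sketch: render the update config template with illustrative values.
from jinja2 import Template

params = {
    "os": "alinux2",
    "key_name": "my-key",
    "public_subnet_id": "subnet-0123456789abcdef0",
    "placement_group": "my-placement-group",
    "open_capacity_reservation_id": "cr-11111111111111111",
    "open_capacity_reservation_arn": "arn:aws:resource-groups:us-east-1:123456789012:group/my-odcr-group",
    "target_capacity_reservation_id": "cr-22222222222222222",
    "target_capacity_reservation_arn": "arn:aws:resource-groups:us-east-1:123456789012:group/my-odcr-group",
    "pg_capacity_reservation_id": "cr-33333333333333333",
    "pg_capacity_reservation_arn": "arn:aws:resource-groups:us-east-1:123456789012:group/my-odcr-group",
}

with open("pcluster.config.update.yaml") as config_template:  # assumed file name
    rendered = Template(config_template.read()).render(**params)
print(rendered)
```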

@@ -12,7 +12,6 @@ Scheduling:
- Name: open-odcr-q
ComputeResources:
- Name: open-odcr-id-cr
InstanceType: m5.2xlarge
MinCount: 1
MaxCount: 1
CapacityReservationTarget: