Skip to content

Commit 0c9627c

Browse files
authored
Fix cluster deletion failure when placement groups are enabled (#6373)
* Revert "Remove wait for shutting-down instances" This reverts commit 4e80c57. * Prevents Lambda function from timing out if instance termination exceeds 15 minutes.
1 parent 7ea861b commit 0c9627c

File tree

2 files changed

+45
-3
lines changed

2 files changed

+45
-3
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@ CHANGELOG
55
------
66

77
**ENHANCEMENTS**
8-
98
- Add support for custom actions on login nodes.
109
- Allow DCV connection on login nodes.
1110
- Add support for ap-southeast-3 region.
@@ -15,8 +14,10 @@ CHANGELOG
1514
**BUG FIXES**
1615
- Fix validator `EfaPlacementGroupValidator` so that it does not suggest configuring a Placement Group when Capacity Blocks are used.
1716
- Fix sporadic cluster creation failures with managed FSx for Lustre.
17+
- Fix cluster deletion failure when placement group is enabled.
1818
- Fix issue with login nodes being marked unhealthy when restricting SSH access.
1919
- Fix `retrieve_supported_regions` so that it can get the correct S3 url.
20+
2021
3.10.1
2122
------
2223

cli/src/pcluster/resources/custom_resources/custom_resources_code/cleanup_resources.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,8 +100,17 @@ def _delete_s3_artifacts(event):
100100

101101

102102
def _terminate_cluster_nodes(event):
103+
"""
104+
Terminate all EC2 instances associated with the given cluster.
105+
106+
This function iterates over all EC2 instances associated with the specified cluster and attempts to terminate them.
107+
It handles retries for instances that fail to terminate initially and ensures that the function does not exceed
108+
the Lambda execution timeout.
109+
"""
103110
try:
104-
logger.info("Compute fleet clean-up: STARTED")
111+
start_time = time.time()
112+
max_exeution_time = 14 * 60 # Maximum allowed time for Lambda function execution (14 minutes) to avoid timeout
113+
logger.info("Compute fleet nodes terminate: STARTED")
105114
stack_name = event["ResourceProperties"]["StackName"]
106115
ec2 = boto3.client("ec2", config=boto3_config)
107116

@@ -117,12 +126,44 @@ def _terminate_cluster_nodes(event):
117126
logger.error("Failed when terminating instances with error %s", e)
118127
completed_successfully = False
119128
continue
120-
logger.info("Compute fleet clean-up: COMPLETED. Instances are either in shutting-down or terminated state")
129+
logger.info("Sleeping for 10 seconds to allow all instances to initiate shut-down")
130+
time.sleep(10)
131+
132+
while _has_shuttingdown_instances(stack_name):
133+
# This logic prevents Lambda function from timing out if instance termination exceeds 15 minutes
134+
# TODO: This approach may cause potential cluster deletion failure when PlacementGroups are enabled
135+
# and instance termination time exceeds 15 minutes simultaneously. Resolve the above potential failure.
136+
if time.time() - start_time > max_exeution_time:
137+
logger.warning(
138+
"Lambda execution time has exceeded 14 minutes, approaching timeout. "
139+
"Returning from Lambda after a 30-second delay; instances may still be in a shutting-down state. "
140+
"Note: Instances in shutting-down state are not recoverable and are not billed during this period."
141+
)
142+
time.sleep(30)
143+
return
144+
logger.info("Waiting for all nodes terminated...")
145+
time.sleep(10)
146+
147+
# Sleep for 30 more seconds to give PlacementGroups the time to update
148+
time.sleep(30)
149+
150+
logger.info("Compute fleet nodes terminate: COMPLETED")
121151
except Exception as e:
122152
logger.error("Failed when terminating instances with error %s", e)
123153
raise
124154

125155

156+
def _has_shuttingdown_instances(stack_name):
    """
    Tell whether any EC2 instance of the cluster is still in the shutting-down state.

    Instances are selected through the ``parallelcluster:cluster-name`` tag and the
    ``shutting-down`` instance state.

    :param stack_name: value of the ``parallelcluster:cluster-name`` tag to filter on
    :return: True if at least one matching reservation is returned by EC2, False otherwise
    """
    ec2 = boto3.client("ec2", config=boto3_config)
    filters = [
        {"Name": "tag:parallelcluster:cluster-name", "Values": [stack_name]},
        {"Name": "instance-state-name", "Values": ["shutting-down"]},
    ]

    # Use paginated requests: unpaginated DescribeInstances calls are susceptible to throttling
    # and timeouts, and this function is polled repeatedly while instances shut down.
    paginator = ec2.get_paginator("describe_instances")
    pages = paginator.paginate(Filters=filters, PaginationConfig={"PageSize": 100})

    # A filtered page may be empty while later pages are not, so inspect pages until one
    # contains a reservation; any() stops fetching further pages as soon as that happens.
    return any(page.get("Reservations") for page in pages)
165+
166+
126167
def _describe_instance_ids_iterator(stack_name, instance_state=("pending", "running", "stopping", "stopped")):
127168
ec2 = boto3.client("ec2", config=boto3_config)
128169
filters = [

0 commit comments

Comments
 (0)