@@ -100,8 +100,17 @@ def _delete_s3_artifacts(event):
100
100
101
101
102
102
def _terminate_cluster_nodes (event ):
103
+ """
104
+ Terminate all EC2 instances associated with the given cluster.
105
+
106
+ This function iterates over all EC2 instances associated with the specified cluster and attempts to terminate them.
107
+ It handles retries for instances that fail to terminate initially and ensures that the function does not exceed
108
+ the Lambda execution timeout.
109
+ """
103
110
try :
104
- logger .info ("Compute fleet clean-up: STARTED" )
111
+ start_time = time .time ()
112
+ max_exeution_time = 14 * 60 # Maximum allowed time for Lambda function execution (14 minutes) to avoid timeout
113
+ logger .info ("Compute fleet nodes terminate: STARTED" )
105
114
stack_name = event ["ResourceProperties" ]["StackName" ]
106
115
ec2 = boto3 .client ("ec2" , config = boto3_config )
107
116
@@ -117,12 +126,44 @@ def _terminate_cluster_nodes(event):
117
126
logger .error ("Failed when terminating instances with error %s" , e )
118
127
completed_successfully = False
119
128
continue
120
- logger .info ("Compute fleet clean-up: COMPLETED. Instances are either in shutting-down or terminated state" )
129
+ logger .info ("Sleeping for 10 seconds to allow all instances to initiate shut-down" )
130
+ time .sleep (10 )
131
+
132
+ while _has_shuttingdown_instances (stack_name ):
133
+ # This logic prevents Lambda function from timing out if instance termination exceeds 15 minutes
134
+ # TODO: This approach may cause potential cluster deletion failure when PlacementGroups are enabled
135
+ # and instance termination time exceeds 15 minutes simultaneously. Resolve the above potential failure.
136
+ if time .time () - start_time > max_exeution_time :
137
+ logger .warning (
138
+ "Lambda execution time has exceeded 14 minutes, approaching timeout. "
139
+ "Returning from Lambda after a 30-second delay; instances may still be in a shutting-down state. "
140
+ "Note: Instances in shutting-down state are not recoverable and are not billed during this period."
141
+ )
142
+ time .sleep (30 )
143
+ return
144
+ logger .info ("Waiting for all nodes terminated..." )
145
+ time .sleep (10 )
146
+
147
+ # Sleep for 30 more seconds to give PlacementGroups the time to update
148
+ time .sleep (30 )
149
+
150
+ logger .info ("Compute fleet nodes terminate: COMPLETED" )
121
151
except Exception as e :
122
152
logger .error ("Failed when terminating instances with error %s" , e )
123
153
raise
124
154
125
155
156
def _has_shuttingdown_instances(stack_name):
    """
    Tell whether any EC2 instance of the given cluster is still in the shutting-down state.

    :param stack_name: value matched against the "parallelcluster:cluster-name" instance tag
    :return: True if at least one matching reservation is reported, False otherwise
    """
    ec2 = boto3.client("ec2", config=boto3_config)
    filters = [
        {"Name": "tag:parallelcluster:cluster-name", "Values": [stack_name]},
        {"Name": "instance-state-name", "Values": ["shutting-down"]},
    ]

    # Examine every result page instead of relying on a single response;
    # any() stops paginating as soon as one page contains a reservation.
    pages = ec2.get_paginator("describe_instances").paginate(Filters=filters)
    return any(page.get("Reservations") for page in pages)
165
+
166
+
126
167
def _describe_instance_ids_iterator (stack_name , instance_state = ("pending" , "running" , "stopping" , "stopped" )):
127
168
ec2 = boto3 .client ("ec2" , config = boto3_config )
128
169
filters = [
0 commit comments