27
27
from remote_command_executor import RemoteCommandExecutor
28
28
from retrying import retry
29
29
from time_utils import seconds
30
- from utils import find_stack_by_tag , generate_stack_name , is_directory_supported , random_alphanumeric
30
+
31
+ from utils import find_stack_by_tag , generate_stack_name , is_directory_supported , random_alphanumeric , get_quarantined_stacks , is_quarantined_stack , quarantine_stacks
31
32
32
33
from tests .ad_integration .cluster_user import ClusterUser
33
34
from tests .common .utils import run_system_analyzer
35
+ from constants import DO_NOT_DELETE_TAG_KEY , MAX_QUARANTINED_STACKS
34
36
35
37
NUM_USERS_TO_CREATE = 5
36
38
NUM_USERS_TO_TEST = 3
37
39
40
+ AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'
41
+
38
42
39
43
def get_infra_stack_outputs (stack_name ):
40
44
cfn = boto3 .client ("cloudformation" )
@@ -117,7 +121,7 @@ def add_tag_to_stack(stack_name, key, value):
117
121
stack = cfn .Stack (stack_name )
118
122
add_tag = True
119
123
for tag in stack .tags :
120
- if tag .get ("Key" ) == "DO-NOT-DELETE" :
124
+ if tag .get ("Key" ) == DO_NOT_DELETE_TAG_KEY :
121
125
add_tag = False
122
126
break
123
127
if add_tag :
@@ -127,6 +131,7 @@ def add_tag_to_stack(stack_name, key, value):
127
131
Tags = [
128
132
{"Key" : key , "Value" : value },
129
133
],
134
+ DisableRollback = True ,
130
135
)
131
136
132
137
@@ -189,7 +194,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair):
189
194
190
195
def _create_directory_stack (cfn_stacks_factory , request , directory_type , region , vpc_stack : CfnVpcStack ):
191
196
directory_stack_name = generate_stack_name (
192
- f"integ-tests-MultiUserInfraStack { directory_type } " , request .config .getoption ("stackname_suffix" )
197
+ f"{ AD_STACK_PREFIX } { directory_type } " , request .config .getoption ("stackname_suffix" )
193
198
)
194
199
195
200
if directory_type not in ("MicrosoftAD" , "SimpleAD" ):
@@ -203,7 +208,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
203
208
stack_parameters = _get_stack_parameters (directory_type , vpc_stack , request .config .getoption ("key_name" ))
204
209
tags = [{"Key" : "parallelcluster:integ-tests-ad-stack" , "Value" : directory_type }]
205
210
if request .config .getoption ("retain_ad_stack" ):
206
- tags .append ({"Key" : "DO-NOT-DELETE" , "Value" : "Retained for integration testing" })
211
+ tags .append ({"Key" : DO_NOT_DELETE_TAG_KEY , "Value" : "Retained for integration testing" })
207
212
208
213
directory_stack = CfnStack (
209
214
name = directory_stack_name ,
@@ -212,12 +217,33 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
212
217
parameters = stack_parameters ,
213
218
capabilities = ["CAPABILITY_IAM" , "CAPABILITY_NAMED_IAM" , "CAPABILITY_AUTO_EXPAND" ],
214
219
tags = tags ,
220
+ disable_rollback = True ,
215
221
)
216
- cfn_stacks_factory .create_stack (directory_stack )
222
+ try :
223
+ cfn_stacks_factory .create_stack (directory_stack , stack_is_under_test = True )
224
+ except BaseException as e :
225
+ logging .error ("Failed to create stack %s" , directory_stack_name )
226
+ # We want to retain the stack in case of failure in order to debug it.
227
+ # We retain a limited number of stacks to contain the costs.
228
+ n_quarantined_ad_stacks = len (get_quarantined_stacks (region , prefix = AD_STACK_PREFIX ))
229
+ if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS :
230
+ logging .warn ("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)" ,
231
+ directory_stack_name , n_quarantined_ad_stacks , MAX_QUARANTINED_STACKS )
232
+ quarantine_stacks (region , stack_names = [directory_stack_name ])
233
+ else :
234
+ logging .warn ("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)" ,
235
+ directory_stack_name , n_quarantined_ad_stacks , MAX_QUARANTINED_STACKS )
236
+ raise e
217
237
logging .info ("Creation of stack %s complete" , directory_stack_name )
218
238
219
239
return directory_stack
220
240
241
def get_retained_ad_stacks_count():
    """
    Count the AD stacks that failed creation and are tagged for retention.

    A stack is counted when all of the following hold:
    - its status is CREATE_FAILED;
    - its name contains AD_STACK_PREFIX;
    - one of its tags has DO_NOT_DELETE_TAG_KEY as key.

    The CloudFormation client is built with the default session region.

    :return: the number of matching stacks.
    """
    cfn = boto3.client("cloudformation")
    paginator = cfn.get_paginator("list_stacks")

    retained_count = 0
    # list_stacks is paginated: walk every page rather than only the first one.
    for page in paginator.paginate(StackStatusFilter=["CREATE_FAILED"]):
        for summary in page.get("StackSummaries", []):
            if AD_STACK_PREFIX not in summary.get("StackName", ""):
                continue
            # The summaries returned by list_stacks do not include tags,
            # so they must be read from describe_stacks. The stack id is used
            # because it is unambiguous even if a name has been reused.
            described = cfn.describe_stacks(StackName=summary["StackId"]).get("Stacks", [])
            tags = (described[0].get("Tags") or []) if described else []
            if any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in tags):
                retained_count += 1
    return retained_count
221
247
222
248
@retry (wait_fixed = seconds (20 ), stop_max_delay = seconds (700 ))
223
249
def _check_ssm_success (ssm_client , command_id , instance_id ):
@@ -243,10 +269,10 @@ def _directory_factory(
243
269
directory_stack_name = created_directory_stacks .get (region , {}).get ("directory" )
244
270
logging .info ("Using directory stack named %s created by another test" , directory_stack_name )
245
271
else :
246
- stack_prefix = f"integ-tests-MultiUserInfraStack { directory_type } "
272
+ stack_prefix = f"{ AD_STACK_PREFIX } { directory_type } "
247
273
directory_stack_name = find_stack_by_tag ("parallelcluster:integ-tests-ad-stack" , region , stack_prefix )
248
274
249
- if not directory_stack_name :
275
+ if not directory_stack_name or is_quarantined_stack ( region , directory_stack_name ) :
250
276
directory_stack = _create_directory_stack (
251
277
cfn_stacks_factory ,
252
278
request ,
@@ -257,7 +283,7 @@ def _directory_factory(
257
283
directory_stack_name = directory_stack .name
258
284
created_directory_stacks [region ]["directory" ] = directory_stack_name
259
285
if request .config .getoption ("retain_ad_stack" ):
260
- add_tag_to_stack (vpc_stack .name , "DO-NOT-DELETE" , "Retained for integration testing" )
286
+ add_tag_to_stack (vpc_stack .name , DO_NOT_DELETE_TAG_KEY , "Retained for integration testing" )
261
287
return directory_stack_name
262
288
263
289
yield _directory_factory
0 commit comments