Skip to content

Commit 42ec878

Browse files
committed
[Test] Quarantine AD stacks on failure for debugging purposes. At most 5 will be quarantined to limit costs.
1 parent 21d14f6 commit 42ec878

File tree

4 files changed

+89
-9
lines changed

4 files changed

+89
-9
lines changed

tests/integration-tests/cfn_stacks_factory.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,14 @@
3131
class CfnStack:
3232
"""Identify a CloudFormation stack."""
3333

34-
def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None):
34+
def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None, disable_rollback=False):
3535
self.name = name
3636
self.region = region
3737
self.template = template
3838
self.parameters = parameters or []
3939
self.capabilities = capabilities or []
4040
self.tags = tags or []
41+
self.disable_rollback=disable_rollback
4142
self.cfn_stack_id = None
4243
self.__cfn_outputs = None
4344
self.__cfn_resources = None
@@ -175,6 +176,7 @@ def create_stack(self, stack, stack_is_under_test=False):
175176
Parameters=stack.parameters,
176177
Capabilities=stack.capabilities,
177178
Tags=stack.tags,
179+
DisableRollback=stack.disable_rollback,
178180
)
179181
else:
180182
result = cfn_client.create_stack(
@@ -183,6 +185,7 @@ def create_stack(self, stack, stack_is_under_test=False):
183185
Parameters=stack.parameters,
184186
Capabilities=stack.capabilities,
185187
Tags=stack.tags,
188+
DisableRollback=stack.disable_rollback,
186189
)
187190
stack.cfn_stack_id = result["StackId"]
188191
self.__created_stacks[id] = stack

tests/integration-tests/constants.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919

2020
UNSUPPORTED_OSES_FOR_DCV = ["alinux2023"]
2121

22+
# Tag key marking a stack that must not be deleted.
DO_NOT_DELETE_TAG_KEY = "DO-NOT-DELETE"

# Tag key marking a stack that has been quarantined.
QUARANTINE_TAG_KEY = "QUARANTINE"

# Upper bound on the number of stacks kept in quarantine at the same time.
MAX_QUARANTINED_STACKS = 5

# Tag set for a quarantined stack: it carries both the do-not-delete and the quarantine tag.
QUARANTINE_TAGS = [
    {"Key": DO_NOT_DELETE_TAG_KEY, "Value": "true"},
    {"Key": QUARANTINE_TAG_KEY, "Value": "true"},
]
# Tag set for a stack that is retained without being quarantined.
RETENTION_TAGS = [
    {"Key": DO_NOT_DELETE_TAG_KEY, "Value": "true"},
]
30+
2231

2332
class NodeType(Enum):
2433
"""Categories of nodes."""

tests/integration-tests/tests/ad_integration/test_ad_integration.py

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,18 @@
2727
from remote_command_executor import RemoteCommandExecutor
2828
from retrying import retry
2929
from time_utils import seconds
30-
from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric
30+
31+
from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric, get_quarantined_stacks, is_quarantined_stack, quarantine_stacks
3132

3233
from tests.ad_integration.cluster_user import ClusterUser
3334
from tests.common.utils import run_system_analyzer
35+
from constants import DO_NOT_DELETE_TAG_KEY, MAX_QUARANTINED_STACKS
3436

3537
NUM_USERS_TO_CREATE = 5
3638
NUM_USERS_TO_TEST = 3
3739

40+
AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'
41+
3842

3943
def get_infra_stack_outputs(stack_name):
4044
cfn = boto3.client("cloudformation")
@@ -117,7 +121,7 @@ def add_tag_to_stack(stack_name, key, value):
117121
stack = cfn.Stack(stack_name)
118122
add_tag = True
119123
for tag in stack.tags:
120-
if tag.get("Key") == "DO-NOT-DELETE":
124+
if tag.get("Key") == DO_NOT_DELETE_TAG_KEY:
121125
add_tag = False
122126
break
123127
if add_tag:
@@ -127,6 +131,7 @@ def add_tag_to_stack(stack_name, key, value):
127131
Tags=[
128132
{"Key": key, "Value": value},
129133
],
134+
DisableRollback=True,
130135
)
131136

132137

@@ -189,7 +194,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair):
189194

190195
def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, vpc_stack: CfnVpcStack):
191196
directory_stack_name = generate_stack_name(
192-
f"integ-tests-MultiUserInfraStack{directory_type}", request.config.getoption("stackname_suffix")
197+
f"{AD_STACK_PREFIX}{directory_type}", request.config.getoption("stackname_suffix")
193198
)
194199

195200
if directory_type not in ("MicrosoftAD", "SimpleAD"):
@@ -203,7 +208,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
203208
stack_parameters = _get_stack_parameters(directory_type, vpc_stack, request.config.getoption("key_name"))
204209
tags = [{"Key": "parallelcluster:integ-tests-ad-stack", "Value": directory_type}]
205210
if request.config.getoption("retain_ad_stack"):
206-
tags.append({"Key": "DO-NOT-DELETE", "Value": "Retained for integration testing"})
211+
tags.append({"Key": DO_NOT_DELETE_TAG_KEY, "Value": "Retained for integration testing"})
207212

208213
directory_stack = CfnStack(
209214
name=directory_stack_name,
@@ -212,12 +217,33 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
212217
parameters=stack_parameters,
213218
capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"],
214219
tags=tags,
220+
disable_rollback=True,
215221
)
216-
cfn_stacks_factory.create_stack(directory_stack)
222+
try:
223+
cfn_stacks_factory.create_stack(directory_stack, stack_is_under_test=True)
224+
except BaseException as e:
225+
logging.error("Failed to create stack %s", directory_stack_name)
226+
# We want to retain the stack in case of failure in order to debug it.
227+
# We retain a limited number of stacks to contain the costs.
228+
n_quarantined_ad_stacks = len(get_quarantined_stacks(region, prefix=AD_STACK_PREFIX))
229+
if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS:
230+
logging.warn("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)",
231+
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
232+
quarantine_stacks(region, stack_names=[directory_stack_name])
233+
else:
234+
logging.warn("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)",
235+
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
236+
raise e
217237
logging.info("Creation of stack %s complete", directory_stack_name)
218238

219239
return directory_stack
220240

241+
def get_retained_ad_stacks_count():
    """
    Count the AD stacks in CREATE_FAILED state that carry the DO-NOT-DELETE tag.

    A stack is counted when its status is CREATE_FAILED, its name contains AD_STACK_PREFIX and one of its
    tags has DO_NOT_DELETE_TAG_KEY as key. The boto3 client is created without an explicit region, so the
    default region of the session applies.

    :return: the number of matching stacks.
    """
    cfn = boto3.client("cloudformation")
    retained_count = 0
    # ListStacks summaries do not include tags, so the tag check must run on DescribeStacks results.
    # The paginator makes sure stacks beyond the first page are taken into account as well.
    for page in cfn.get_paginator("describe_stacks").paginate():
        for stack in page.get("Stacks", []):
            if stack.get("StackStatus") != "CREATE_FAILED":
                continue
            if AD_STACK_PREFIX not in stack.get("StackName", ""):
                continue
            if any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in stack.get("Tags") or []):
                retained_count += 1
    return retained_count
221247

222248
@retry(wait_fixed=seconds(20), stop_max_delay=seconds(700))
223249
def _check_ssm_success(ssm_client, command_id, instance_id):
@@ -243,10 +269,10 @@ def _directory_factory(
243269
directory_stack_name = created_directory_stacks.get(region, {}).get("directory")
244270
logging.info("Using directory stack named %s created by another test", directory_stack_name)
245271
else:
246-
stack_prefix = f"integ-tests-MultiUserInfraStack{directory_type}"
272+
stack_prefix = f"{AD_STACK_PREFIX}{directory_type}"
247273
directory_stack_name = find_stack_by_tag("parallelcluster:integ-tests-ad-stack", region, stack_prefix)
248274

249-
if not directory_stack_name:
275+
if not directory_stack_name or is_quarantined_stack(region, directory_stack_name):
250276
directory_stack = _create_directory_stack(
251277
cfn_stacks_factory,
252278
request,
@@ -257,7 +283,7 @@ def _directory_factory(
257283
directory_stack_name = directory_stack.name
258284
created_directory_stacks[region]["directory"] = directory_stack_name
259285
if request.config.getoption("retain_ad_stack"):
260-
add_tag_to_stack(vpc_stack.name, "DO-NOT-DELETE", "Retained for integration testing")
286+
add_tag_to_stack(vpc_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained for integration testing")
261287
return directory_stack_name
262288

263289
yield _directory_factory

tests/integration-tests/utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from jinja2.sandbox import SandboxedEnvironment
2929
from retrying import retry
3030
from time_utils import minutes, seconds
31+
from constants import QUARANTINE_TAG_KEY, DO_NOT_DELETE_TAG_KEY, QUARANTINE_TAGS
3132

3233
DEFAULT_PARTITION = "aws"
3334
PARTITION_MAP = {
@@ -908,3 +909,44 @@ def find_stack_by_tag(tag, region, stack_prefix):
908909
logging.info(f"Found stack: {name} (created on {creation_date})")
909910
return name
910911
return None
912+
913+
def get_quarantined_stacks(region, prefix=None):
    """
    Return the names of the stacks in the given region that are tagged as quarantined.

    A stack is considered quarantined when one of its tags has QUARANTINE_TAG_KEY as key,
    regardless of the tag value.

    :param region: AWS region to inspect.
    :param prefix: when set, only stacks whose name starts with this prefix are returned.
    :return: list of stack names.
    """
    cfn_client = boto3.client("cloudformation", region_name=region)
    quarantined_stacks = []

    # DescribeStacks is paginated: walk every page, otherwise stacks beyond the first page would be
    # ignored and the number of quarantined stacks underestimated.
    for page in cfn_client.get_paginator("describe_stacks").paginate():
        for stack in page.get("Stacks", []):
            stack_name = stack.get("StackName")
            if not stack_name:
                continue
            if prefix and not stack_name.startswith(prefix):
                continue
            # "Tags" may be missing or null for untagged stacks.
            if any(tag.get("Key") == QUARANTINE_TAG_KEY for tag in stack.get("Tags") or []):
                quarantined_stacks.append(stack_name)
    return quarantined_stacks
926+
927+
def is_quarantined_stack(region, stack_name):
    """
    Tell whether the given stack is among the quarantined stacks of the region.

    :param region: AWS region to inspect.
    :param stack_name: name of the stack to look for.
    :return: True if stack_name is in the list returned by get_quarantined_stacks(region), False otherwise.
    """
    quarantined_names = get_quarantined_stacks(region)
    return stack_name in quarantined_names
929+
930+
def quarantine_stacks(region, stack_names):
    """
    Mark every given stack as quarantined.

    Each stack is passed, one at a time and in the given order, to add_tags_to_stack together with
    QUARANTINE_TAGS.

    :param region: AWS region where the stacks live.
    :param stack_names: iterable of names of the stacks to quarantine.
    """
    for name in stack_names:
        add_tags_to_stack(region, name, QUARANTINE_TAGS)
933+
934+
def add_tags_to_stack(region, stack_name, tags):
    """
    Add the given tags to a CloudFormation stack, keeping the tags it already has.

    The stack is updated in place reusing its current template and parameter values, with rollback disabled.
    When a tag key is already present on the stack, the value passed in `tags` wins.

    :param region: AWS region where the stack lives.
    :param stack_name: name of the stack to tag.
    :param tags: list of {"Key": ..., "Value": ...} dicts to add.
    """
    cfn = boto3.resource("cloudformation", region_name=region)
    stack = cfn.Stack(stack_name)

    # UpdateStack replaces the whole tag set with the one supplied, so the existing tags must be sent
    # again together with the new ones or they would be dropped from the stack.
    merged_tags = {tag["Key"]: tag["Value"] for tag in stack.tags or []}
    merged_tags.update((tag["Key"], tag["Value"]) for tag in tags)

    stack.update(
        UsePreviousTemplate=True,
        Parameters=get_unchanged_stack_parameters(stack),
        # The attribute is None when the stack was created without capabilities.
        Capabilities=stack.capabilities or [],
        DisableRollback=True,
        Tags=[{"Key": key, "Value": value} for key, value in merged_tags.items()],
    )
945+
946+
def get_unchanged_stack_parameters(stack):
    """
    Build the Parameters argument of a stack update that keeps every current parameter value.

    :param stack: object exposing a `parameters` attribute, i.e. a list of dicts with a "ParameterKey"
        entry, or None/empty when the stack has no parameters.
    :return: list with one {"ParameterKey": <key>, "UsePreviousValue": True} dict per current parameter;
        empty list when the stack has no parameters.
    """
    # `parameters` can be None for stacks created without parameters: treat it as an empty list.
    current_parameters = stack.parameters or []
    return [
        {
            "ParameterKey": current_parameter.get("ParameterKey"),
            "UsePreviousValue": True,
        }
        for current_parameter in current_parameters
    ]

0 commit comments

Comments
 (0)