Skip to content

Commit 4b76002

Browse files
committed
SQUASH Quarantine logic
1 parent fd2c90d commit 4b76002

File tree

3 files changed

+61
-14
lines changed

3 files changed

+61
-14
lines changed

tests/integration-tests/constants.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,15 @@
1919

2020
UNSUPPORTED_OSES_FOR_DCV = ["alinux2023"]
2121

22+
# Tag keys placed on CloudFormation stacks that are kept after a failure.
DO_NOT_DELETE_TAG_KEY = "DO-NOT-DELETE"
QUARANTINE_TAG_KEY = "QUARANTINE"

# Upper bound on the number of quarantined stacks kept at the same time.
MAX_QUARANTINED_STACKS = 5

# Tag sets in the Key/Value list form used for CloudFormation stack tags.
# A quarantined stack carries both the do-not-delete tag and the quarantine tag.
QUARANTINE_TAGS = [
    {"Key": DO_NOT_DELETE_TAG_KEY, "Value": "true"},
    {"Key": QUARANTINE_TAG_KEY, "Value": "true"},
]
RETENTION_TAGS = [
    {"Key": DO_NOT_DELETE_TAG_KEY, "Value": "true"},
]
30+
2231

2332
class NodeType(Enum):
2433
"""Categories of nodes."""

tests/integration-tests/tests/ad_integration/test_ad_integration.py

Lines changed: 10 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,20 +28,17 @@
2828
from retrying import retry
2929
from time_utils import seconds
3030

31-
from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric
31+
from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric, get_quarantined_stacks, is_quarantined_stack, quarantine_stacks
3232

3333
from tests.ad_integration.cluster_user import ClusterUser
3434
from tests.common.utils import run_system_analyzer
35+
from constants import DO_NOT_DELETE_TAG_KEY, MAX_QUARANTINED_STACKS
3536

3637
NUM_USERS_TO_CREATE = 5
3738
NUM_USERS_TO_TEST = 3
3839

39-
MAX_QUARANTINED_STACKS = 5
40-
4140
AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack'
4241

43-
DO_NOT_DELETE_TAG_KEY = 'DO-NOT-DELETE'
44-
4542

4643
def get_infra_stack_outputs(stack_name):
4744
cfn = boto3.client("cloudformation")
@@ -228,15 +225,14 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region,
228225
logging.error("Failed to create stack %s", directory_stack_name)
229226
# We want to retain the stack in case of failure in order to debug it.
230227
# We retain a limited number of stack to contain the costs.
231-
n_retained_ad_stacks = get_retained_ad_stacks_count()
232-
if n_retained_ad_stacks < MAX_QUARANTINED_STACKS:
233-
logging.warn("Retaining failed stack %s to debug failure (retained: %d, max: %d)",
234-
directory_stack_name, n_retained_ad_stacks, MAX_QUARANTINED_STACKS)
235-
236-
add_tag_to_stack(directory_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained to debug failure")
228+
n_quarantined_ad_stacks = len(get_quarantined_stacks(region, prefix=AD_STACK_PREFIX))
229+
if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS:
230+
logging.warn("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)",
231+
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
232+
quarantine_stacks(region, stack_names=[directory_stack_name])
237233
else:
238-
logging.warn("Cannot retain failed stack %s for debugging because there are already %d retained (max: %d)",
239-
directory_stack_name, n_retained_ad_stacks, MAX_QUARANTINED_STACKS)
234+
logging.warn("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)",
235+
directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS)
240236
raise e
241237
logging.info("Creation of stack %s complete", directory_stack_name)
242238

@@ -276,7 +272,7 @@ def _directory_factory(
276272
stack_prefix = f"{AD_STACK_PREFIX}{directory_type}"
277273
directory_stack_name = find_stack_by_tag("parallelcluster:integ-tests-ad-stack", region, stack_prefix)
278274

279-
if not directory_stack_name:
275+
if not directory_stack_name or is_quarantined_stack(region, directory_stack_name):
280276
directory_stack = _create_directory_stack(
281277
cfn_stacks_factory,
282278
request,

tests/integration-tests/utils.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from jinja2.sandbox import SandboxedEnvironment
2929
from retrying import retry
3030
from time_utils import minutes, seconds
31+
from constants import QUARANTINE_TAG_KEY, DO_NOT_DELETE_TAG_KEY, QUARANTINE_TAGS
3132

3233
DEFAULT_PARTITION = "aws"
3334
PARTITION_MAP = {
@@ -908,3 +909,44 @@ def find_stack_by_tag(tag, region, stack_prefix):
908909
logging.info(f"Found stack: {name} (created on {creation_date})")
909910
return name
910911
return None
912+
913+
def get_quarantined_stacks(region, prefix=None):
    """
    Return the names of the CloudFormation stacks in a region that carry the quarantine tag.

    :param region: AWS region whose stacks are inspected.
    :param prefix: optional stack name prefix; when given, stacks whose name does not start with it are ignored.
    :return: list with the names of the stacks having a tag whose key is QUARANTINE_TAG_KEY.
    """
    cfn_client = boto3.client("cloudformation", region_name=region)
    quarantined_stacks = []

    # DescribeStacks is a paginated API: a single call only returns the first page,
    # so every page is walked to avoid missing quarantined stacks (and under-counting them).
    for page in cfn_client.get_paginator("describe_stacks").paginate():
        for stack in page.get("Stacks", []):
            stack_name = stack.get("StackName")
            if not stack_name:
                continue
            if prefix and not stack_name.startswith(prefix):
                continue
            # Only the presence of the tag key matters; its value is not inspected.
            if any(tag.get("Key") == QUARANTINE_TAG_KEY for tag in stack.get("Tags") or []):
                quarantined_stacks.append(stack_name)

    return quarantined_stacks
926+
927+
def is_quarantined_stack(region, stack_name):
    """
    Tell whether a stack is quarantined.

    :param region: AWS region where the quarantined stacks are looked up.
    :param stack_name: name of the stack to check.
    :return: True if stack_name is among the names returned by get_quarantined_stacks for the region.
    """
    quarantined_names = get_quarantined_stacks(region)
    return stack_name in quarantined_names
929+
930+
def quarantine_stacks(region, stack_names):
    """
    Quarantine the given stacks by applying QUARANTINE_TAGS to each of them.

    :param region: AWS region where the stacks live.
    :param stack_names: iterable with the names of the stacks to quarantine.
    """
    # Stacks are tagged one at a time, in the order given.
    for name in stack_names:
        add_tags_to_stack(region=region, stack_name=name, tags=QUARANTINE_TAGS)
933+
934+
def add_tags_to_stack(region, stack_name, tags):
    """
    Add tags to an existing CloudFormation stack, reusing its current template and parameter values.

    The tags already on the stack are preserved; a tag in `tags` overrides an existing tag with the same key.

    :param region: AWS region where the stack lives.
    :param stack_name: name of the stack to tag.
    :param tags: list of {"Key": ..., "Value": ...} dicts to add to the stack.
    """
    cfn = boto3.resource("cloudformation", region_name=region)
    stack = cfn.Stack(stack_name)

    # UpdateStack replaces the whole tag set with the one passed in, so the update starts
    # from the tags currently on the stack; otherwise "adding" a tag would drop the others.
    # NOTE(review): keys with the reserved "aws:" prefix are left out because they presumably
    # cannot be passed back by the caller - confirm against the CloudFormation documentation.
    merged_tags = {
        tag["Key"]: tag["Value"] for tag in (stack.tags or []) if not tag["Key"].startswith("aws:")
    }
    merged_tags.update({tag["Key"]: tag["Value"] for tag in tags})

    stack.update(
        UsePreviousTemplate=True,
        Parameters=get_unchanged_stack_parameters(stack),
        # The attribute is None for a stack created without capabilities, and None is not a valid list.
        Capabilities=stack.capabilities or [],
        DisableRollback=True,
        Tags=[{"Key": key, "Value": value} for key, value in merged_tags.items()],
    )
945+
946+
def get_unchanged_stack_parameters(stack):
    """
    Build the parameters list for a stack update that keeps every current parameter value.

    :param stack: object exposing a `parameters` attribute: a list of dicts with a 'ParameterKey' entry,
                  or None/empty when the stack has no parameters.
    :return: list with one {'ParameterKey': <key>, 'UsePreviousValue': True} dict per current parameter.
    """
    # `parameters` may be None for a stack without parameters; treat it as an empty list.
    return [
        {
            'ParameterKey': current_parameter.get('ParameterKey'),
            'UsePreviousValue': True,
        }
        for current_parameter in stack.parameters or []
    ]

0 commit comments

Comments
 (0)