From 3156e9b64aa116c8eb6224a1d20b19852d4fa53b Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 19 Mar 2025 13:51:29 -0400 Subject: [PATCH 1/3] [Test] Reduce noise in logs moving metrics publisher logs to debug --- tests/integration-tests/framework/metrics_publisher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration-tests/framework/metrics_publisher.py b/tests/integration-tests/framework/metrics_publisher.py index 20e774bc52..9103e8b48c 100644 --- a/tests/integration-tests/framework/metrics_publisher.py +++ b/tests/integration-tests/framework/metrics_publisher.py @@ -52,7 +52,7 @@ def __init__(self, region=None): def publish_metrics_to_cloudwatch(self, namespace: str, metrics: List[Metric]): """Pushes a list of metrics to cloudwatch using a single namespace.""" try: - logging.info( + logging.debug( f"publishing metrics to cloudwatch {[metric.generate_metric_data_entry() for metric in metrics]}" ) self.client.put_metric_data( From 21d14f6055733767cc5053db8b79b29476c755ba Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Wed, 19 Mar 2025 14:14:27 -0400 Subject: [PATCH 2/3] [ActiveDirectory] In AD 1-click template fail faster in case of issues. In particular: 1. Make the Ad admin node signal the failure, not only the success; in this way the wait condition handle can fail faster. 2. reduced the number of retries made by adcli from 5 to 3 because in case of issues is not necessary to do that many retries; especially considering that adcli has a 2min retry delay. 3. reduced the condition handle timeout from 900s to 600s as 10min are enough to include AD admin node bootstrap and 3 adcli retries. 4. Execute the post processing lambda only if the AD admin node was able to setup the directory --- cloudformation/ad/ad-integration.yaml | 107 +++++++++++++++----------- 1 file changed, 60 insertions(+), 47 deletions(-) diff --git a/cloudformation/ad/ad-integration.yaml b/cloudformation/ad/ad-integration.yaml index 72bc3127d5..f3e0333165 100644 --- a/cloudformation/ad/ad-integration.yaml +++ b/cloudformation/ad/ad-integration.yaml @@ -402,7 +402,7 @@ Resources: Properties: Count: 1 Handle: !Ref AdDomainAdminNodeWaitConditionHandle - Timeout: 900 + Timeout: 600 AdDomainAdminNode: Type: AWS::EC2::Instance @@ -446,55 +446,66 @@ Resources: #!/bin/bash -e set -o pipefail exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 - yum update -y aws-cfn-bootstrap - /opt/aws/bin/cfn-init -v --stack "${AWS::StackName}" --resource AdDomainAdminNode --configsets setup --region "${AWS::Region}" - echo "Directory Id: ${DirectoryId}" - echo "Domain Name: ${DirectoryDomain}" - echo "Domain DNS IP 1: ${DnsIp1}" - echo "Domain DNS IP 2: ${DnsIp2}" - echo "Domain Certificate Secret: ${DomainCertificateSecretArn}" - echo "Domain Private Key Secret: ${DomainPrivateKeySecretArn}" + function main() { + yum update -y aws-cfn-bootstrap + /opt/aws/bin/cfn-init -v --stack "${AWS::StackName}" --resource AdDomainAdminNode --configsets setup --region "${AWS::Region}" + echo "Directory Id: ${DirectoryId}" + echo "Domain Name: ${DirectoryDomain}" + echo "Domain DNS IP 1: ${DnsIp1}" + echo "Domain DNS IP 2: ${DnsIp2}" + echo "Domain Certificate Secret: ${DomainCertificateSecretArn}" + echo "Domain Private Key Secret: ${DomainPrivateKeySecretArn}" + + echo "Describing directory..." + aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}" + echo "Describing domain controllers..." + aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}" + + ADMIN_PW="${AdminPassword}" + + USERNAMES="ReadOnlyUser,${UserNames}" + echo "Registering Users: $USERNAMES ..." + for username in $(echo $USERNAMES | sed "s/,/ /g") + do + attempt=0 + max_attempts=3 + until [ $attempt -ge $max_attempts ]; do + attempt=$((attempt+1)) + echo "Registering user $username (attempt $attempt/$max_attempts) ..." + echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp1}" --display-name="$username" "$username" && echo "User registered: $username" && break + echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp2}" --display-name="$username" "$username" && echo "User registered: $username" && break + echo "User creation failed, describing directory and controllers for troubleshooting..." + aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}" + aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}" + sleep 10 + done + done + + echo "Creating domain certificate..." + PRIVATE_KEY="${DirectoryDomain}.key" + CERTIFICATE="${DirectoryDomain}.crt" + printf '.\n.\n.\n.\n.\n%s\n.\n' "${DirectoryDomain}" | openssl req -x509 -sha256 -nodes -newkey rsa:2048 -keyout "$PRIVATE_KEY" -days 365 -out "$CERTIFICATE" + + echo "Storing domain private key to Secrets Manager..." + aws secretsmanager put-secret-value --secret-id "${DomainPrivateKeySecretArn}" --secret-string "file://$PRIVATE_KEY" --region "${AWS::Region}" + + echo "Storing domain certificate to Secrets Manager..." + aws secretsmanager put-secret-value --secret-id "${DomainCertificateSecretArn}" --secret-string "file://$CERTIFICATE" --region "${AWS::Region}" + + echo "Deleting private key and certificate from local file system..." + rm -rf "$PRIVATE_KEY" "$CERTIFICATE" + } - echo "Describing directory..." - aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}" - echo "Describing domain controllers..." - aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}" + function signal_success() { + /opt/aws/bin/cfn-signal -e 0 --stack "${AWS::StackName}" --resource "${AdDomainAdminNodeWaitConditionHandle}" --region "${AWS::Region}" + } - ADMIN_PW="${AdminPassword}" + function signal_failure() { + /opt/aws/bin/cfn-signal -e 0 --stack "${AWS::StackName}" --resource "${AdDomainAdminNodeWaitConditionHandle}" --region "${AWS::Region}" + exit 1 + } - USERNAMES="ReadOnlyUser,${UserNames}" - echo "Registering Users: $USERNAMES ..." - for username in $(echo $USERNAMES | sed "s/,/ /g") - do - attempt=0 - max_attempts=5 - until [ $attempt -ge $max_attempts ]; do - attempt=$((attempt+1)) - echo "Registering user $username (attempt $attempt/$max_attempts) ..." - echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp1}" --display-name="$username" "$username" && echo "User registered: $username" && break - echo "$ADMIN_PW" | adcli create-user -v -x -U "${Admin}" --domain-controller="${DnsIp2}" --display-name="$username" "$username" && echo "User registered: $username" && break - echo "User creation failed, describing directory and controllers for troubleshooting..." - aws ds describe-directories --directory-id "${DirectoryId}" --region "${AWS::Region}" - aws ds describe-domain-controllers --directory-id "${DirectoryId}" --region "${AWS::Region}" - sleep 10 - done - done - - echo "Creating domain certificate..." - PRIVATE_KEY="${DirectoryDomain}.key" - CERTIFICATE="${DirectoryDomain}.crt" - printf '.\n.\n.\n.\n.\n%s\n.\n' "${DirectoryDomain}" | openssl req -x509 -sha256 -nodes -newkey rsa:2048 -keyout "$PRIVATE_KEY" -days 365 -out "$CERTIFICATE" - - echo "Storing domain private key to Secrets Manager..." - aws secretsmanager put-secret-value --secret-id "${DomainPrivateKeySecretArn}" --secret-string "file://$PRIVATE_KEY" --region "${AWS::Region}" - - echo "Storing domain certificate to Secrets Manager..." - aws secretsmanager put-secret-value --secret-id "${DomainCertificateSecretArn}" --secret-string "file://$CERTIFICATE" --region "${AWS::Region}" - - echo "Deleting private key and certificate from local file system..." - rm -rf "$PRIVATE_KEY" "$CERTIFICATE" - - /opt/aws/bin/cfn-signal -e "$?" --stack "${AWS::StackName}" --region "${AWS::Region}" "${AdDomainAdminNodeWaitConditionHandle}" + main && signal_success || signal_failure - { DirectoryId: !GetAtt Prep.DirectoryId, DirectoryDomain: !GetAtt Prep.DomainName, @@ -622,6 +633,8 @@ Resources: Post: Type: Custom::PostLambda + DependsOn: + - AdDomainAdminNodeWaitCondition Properties: ServiceToken: !GetAtt PostLambda.Arn AdminNodeInstanceId: !Ref AdDomainAdminNode From 42ec8789567323aae5c315b89c1ea68487d4aafb Mon Sep 17 00:00:00 2001 From: Giacomo Marciani Date: Mon, 17 Mar 2025 17:57:02 -0400 Subject: [PATCH 3/3] [Test] Quarantine AD stacks on failure for debugging purposes. At most 5 will be quarantined to limit costs. --- tests/integration-tests/cfn_stacks_factory.py | 5 ++- tests/integration-tests/constants.py | 9 ++++ .../ad_integration/test_ad_integration.py | 42 +++++++++++++++---- tests/integration-tests/utils.py | 42 +++++++++++++++++++ 4 files changed, 89 insertions(+), 9 deletions(-) diff --git a/tests/integration-tests/cfn_stacks_factory.py b/tests/integration-tests/cfn_stacks_factory.py index beb13dd606..e779f176eb 100644 --- a/tests/integration-tests/cfn_stacks_factory.py +++ b/tests/integration-tests/cfn_stacks_factory.py @@ -31,13 +31,14 @@ class CfnStack: """Identify a CloudFormation stack.""" - def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None): + def __init__(self, name, region, template, parameters=None, capabilities=None, tags=None, disable_rollback=False): self.name = name self.region = region self.template = template self.parameters = parameters or [] self.capabilities = capabilities or [] self.tags = tags or [] + self.disable_rollback=disable_rollback self.cfn_stack_id = None self.__cfn_outputs = None self.__cfn_resources = None @@ -175,6 +176,7 @@ def create_stack(self, stack, stack_is_under_test=False): Parameters=stack.parameters, Capabilities=stack.capabilities, Tags=stack.tags, + DisableRollback=stack.disable_rollback, ) else: result = cfn_client.create_stack( @@ -183,6 +185,7 @@ def create_stack(self, stack, stack_is_under_test=False): Parameters=stack.parameters, Capabilities=stack.capabilities, Tags=stack.tags, + DisableRollback=stack.disable_rollback, ) stack.cfn_stack_id = result["StackId"] self.__created_stacks[id] = stack diff --git a/tests/integration-tests/constants.py b/tests/integration-tests/constants.py index cb822950f0..44cc0b77d2 100644 --- a/tests/integration-tests/constants.py +++ b/tests/integration-tests/constants.py @@ -19,6 +19,15 @@ UNSUPPORTED_OSES_FOR_DCV = ["alinux2023"] +DO_NOT_DELETE_TAG_KEY = 'DO-NOT-DELETE' + +QUARANTINE_TAG_KEY = 'QUARANTINE' + +MAX_QUARANTINED_STACKS = 5 + +QUARANTINE_TAGS = [{ "Key": DO_NOT_DELETE_TAG_KEY, "Value": "true" }, { "Key": QUARANTINE_TAG_KEY, "Value": "true" }] +RETENTION_TAGS = [{ "Key": DO_NOT_DELETE_TAG_KEY, "Value": "true" }] + class NodeType(Enum): """Categories of nodes.""" diff --git a/tests/integration-tests/tests/ad_integration/test_ad_integration.py b/tests/integration-tests/tests/ad_integration/test_ad_integration.py index d8e1afdfed..4bb719bff6 100644 --- a/tests/integration-tests/tests/ad_integration/test_ad_integration.py +++ b/tests/integration-tests/tests/ad_integration/test_ad_integration.py @@ -27,14 +27,18 @@ from remote_command_executor import RemoteCommandExecutor from retrying import retry from time_utils import seconds -from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric + +from utils import find_stack_by_tag, generate_stack_name, is_directory_supported, random_alphanumeric, get_quarantined_stacks, is_quarantined_stack, quarantine_stacks from tests.ad_integration.cluster_user import ClusterUser from tests.common.utils import run_system_analyzer +from constants import DO_NOT_DELETE_TAG_KEY, MAX_QUARANTINED_STACKS NUM_USERS_TO_CREATE = 5 NUM_USERS_TO_TEST = 3 +AD_STACK_PREFIX = 'integ-tests-MultiUserInfraStack' + def get_infra_stack_outputs(stack_name): cfn = boto3.client("cloudformation") @@ -117,7 +121,7 @@ def add_tag_to_stack(stack_name, key, value): stack = cfn.Stack(stack_name) add_tag = True for tag in stack.tags: - if tag.get("Key") == "DO-NOT-DELETE": + if tag.get("Key") == DO_NOT_DELETE_TAG_KEY: add_tag = False break if add_tag: @@ -127,6 +131,7 @@ def add_tag_to_stack(stack_name, key, value): Tags=[ {"Key": key, "Value": value}, ], + DisableRollback=True, ) @@ -189,7 +194,7 @@ def _get_stack_parameters(directory_type, vpc_stack, keypair): def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, vpc_stack: CfnVpcStack): directory_stack_name = generate_stack_name( - f"integ-tests-MultiUserInfraStack{directory_type}", request.config.getoption("stackname_suffix") + f"{AD_STACK_PREFIX}{directory_type}", request.config.getoption("stackname_suffix") ) if directory_type not in ("MicrosoftAD", "SimpleAD"): @@ -203,7 +208,7 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, stack_parameters = _get_stack_parameters(directory_type, vpc_stack, request.config.getoption("key_name")) tags = [{"Key": "parallelcluster:integ-tests-ad-stack", "Value": directory_type}] if request.config.getoption("retain_ad_stack"): - tags.append({"Key": "DO-NOT-DELETE", "Value": "Retained for integration testing"}) + tags.append({"Key": DO_NOT_DELETE_TAG_KEY, "Value": "Retained for integration testing"}) directory_stack = CfnStack( name=directory_stack_name, @@ -212,12 +217,33 @@ def _create_directory_stack(cfn_stacks_factory, request, directory_type, region, parameters=stack_parameters, capabilities=["CAPABILITY_IAM", "CAPABILITY_NAMED_IAM", "CAPABILITY_AUTO_EXPAND"], tags=tags, + disable_rollback=True, ) - cfn_stacks_factory.create_stack(directory_stack) + try: + cfn_stacks_factory.create_stack(directory_stack, stack_is_under_test=True) + except BaseException as e: + logging.error("Failed to create stack %s", directory_stack_name) + # We want to retain the stack in case of failure in order to debug it. + # We retain a limited number of stack to contain the costs. + n_quarantined_ad_stacks = len(get_quarantined_stacks(region, prefix=AD_STACK_PREFIX)) + if n_quarantined_ad_stacks < MAX_QUARANTINED_STACKS: + logging.warn("Quarantining failed stack %s to debug failure (quarantined: %d, max: %d)", + directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS) + quarantine_stacks(region, stack_names=[directory_stack_name]) + else: + logging.warn("Cannot quarantine failed stack %s for debugging because there are already %d quarantined (max: %d)", + directory_stack_name, n_quarantined_ad_stacks, MAX_QUARANTINED_STACKS) + raise e logging.info("Creation of stack %s complete", directory_stack_name) return directory_stack +def get_retained_ad_stacks_count(): + cfn = boto3.client("cloudformation") + failed_stacks = cfn.list_stacks(StackStatusFilter=['CREATE_FAILED'])["StackSummaries"] + failed_ad_stacks = [stack for stack in failed_stacks if AD_STACK_PREFIX in stack.get('StackName')] + return len([stack for stack in failed_ad_stacks if stack.get("Tags") and + any(tag.get("Key") == DO_NOT_DELETE_TAG_KEY for tag in stack.get("Tags"))]) @retry(wait_fixed=seconds(20), stop_max_delay=seconds(700)) def _check_ssm_success(ssm_client, command_id, instance_id): @@ -243,10 +269,10 @@ def _directory_factory( directory_stack_name = created_directory_stacks.get(region, {}).get("directory") logging.info("Using directory stack named %s created by another test", directory_stack_name) else: - stack_prefix = f"integ-tests-MultiUserInfraStack{directory_type}" + stack_prefix = f"{AD_STACK_PREFIX}{directory_type}" directory_stack_name = find_stack_by_tag("parallelcluster:integ-tests-ad-stack", region, stack_prefix) - if not directory_stack_name: + if not directory_stack_name or is_quarantined_stack(region, directory_stack_name): directory_stack = _create_directory_stack( cfn_stacks_factory, request, @@ -257,7 +283,7 @@ def _directory_factory( directory_stack_name = directory_stack.name created_directory_stacks[region]["directory"] = directory_stack_name if request.config.getoption("retain_ad_stack"): - add_tag_to_stack(vpc_stack.name, "DO-NOT-DELETE", "Retained for integration testing") + add_tag_to_stack(vpc_stack.name, DO_NOT_DELETE_TAG_KEY, "Retained for integration testing") return directory_stack_name yield _directory_factory diff --git a/tests/integration-tests/utils.py b/tests/integration-tests/utils.py index 4baba0ac69..e1b2812746 100644 --- a/tests/integration-tests/utils.py +++ b/tests/integration-tests/utils.py @@ -28,6 +28,7 @@ from jinja2.sandbox import SandboxedEnvironment from retrying import retry from time_utils import minutes, seconds +from constants import QUARANTINE_TAG_KEY, DO_NOT_DELETE_TAG_KEY, QUARANTINE_TAGS DEFAULT_PARTITION = "aws" PARTITION_MAP = { @@ -908,3 +909,44 @@ def find_stack_by_tag(tag, region, stack_prefix): logging.info(f"Found stack: {name} (created on {creation_date})") return name return None + +def get_quarantined_stacks(region, prefix=None): + quarantined_stacks = [] + cfn_client = boto3.client("cloudformation", region_name=region) + + for stack in cfn_client.describe_stacks().get("Stacks", []): + stack_name = stack.get("StackName") + if not stack_name: + continue + if prefix and not stack_name.startswith(prefix): + continue + if any(tag.get("Key") == QUARANTINE_TAG_KEY for tag in stack.get("Tags", [])): + quarantined_stacks.append(stack_name) + return quarantined_stacks + +def is_quarantined_stack(region, stack_name): + return stack_name in get_quarantined_stacks(region) + +def quarantine_stacks(region, stack_names): + for stack_name in stack_names: + add_tags_to_stack(region, stack_name, QUARANTINE_TAGS) + +def add_tags_to_stack(region, stack_name, tags): + cfn = boto3.resource("cloudformation", region_name=region) + stack = cfn.Stack(stack_name) + + stack.update( + UsePreviousTemplate=True, + Parameters=get_unchanged_stack_parameters(stack), + Capabilities=stack.capabilities, + DisableRollback=True, + Tags=tags, + ) + +def get_unchanged_stack_parameters(stack): + return [ + { + 'ParameterKey': current_parameter.get('ParameterKey'), + 'UsePreviousValue': True, + } for current_parameter in stack.parameters + ]