From 905cc20e29cd7044efbd2eeccedb4970b5ac48ed Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Tue, 10 Dec 2024 17:44:23 -0500 Subject: [PATCH 1/5] [AD] Increasing the attempt and sleep time when we create AD Domain * using bigger instance --- cloudformation/ad/ad-integration.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cloudformation/ad/ad-integration.yaml b/cloudformation/ad/ad-integration.yaml index 3991ffc52c..5d9aedb4aa 100644 --- a/cloudformation/ad/ad-integration.yaml +++ b/cloudformation/ad/ad-integration.yaml @@ -416,7 +416,7 @@ Resources: IamInstanceProfile: Ref: JoinProfile ImageId: !Ref AdminNodeAmiId - InstanceType: t3.micro + InstanceType: t3.xlarge KeyName: !Ref Keypair LaunchTemplate: LaunchTemplateId: !Ref 'DisableImdsv1LaunchTemplate' @@ -451,14 +451,14 @@ Resources: ADMIN_PW="${AdminPassword}" attempt=0 - max_attempts=5 + max_attempts=8 until [ $attempt -ge $max_attempts ]; do attempt=$((attempt+1)) echo "[DEBUG] Checking domain name resolution for ${DirectoryDomain} ..." dig ${DirectoryDomain} echo "Joining domain (attempt $attempt/$max_attempts) ..." echo "$ADMIN_PW" | sudo realm join -U "${Admin}" "${DirectoryDomain}" --verbose && echo "Domain joined" && break - sleep 10 + sleep 12 done sleep 10 From 2cfec31f174af3a8cbb68d0a4b98e4e50f27fa41 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 12 Dec 2024 19:16:56 -0500 Subject: [PATCH 2/5] Enabling Caching for DNS queries --- cloudformation/ad/ad-integration.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/cloudformation/ad/ad-integration.yaml b/cloudformation/ad/ad-integration.yaml index 5d9aedb4aa..e1c6b0abcf 100644 --- a/cloudformation/ad/ad-integration.yaml +++ b/cloudformation/ad/ad-integration.yaml @@ -446,6 +446,7 @@ Resources: DNS=${DnsIp1} ${DnsIp2} Domains=~. EOF + sudo rm /usr/lib/systemd/resolved.conf.d/resolved-disable-stub-listener.conf service systemd-resolved restart ADMIN_PW="${AdminPassword}" From 6b924bcd639d8db5f4fc956fc8516bf5fb2fb8c7 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 13 Dec 2024 16:31:20 -0500 Subject: [PATCH 3/5] Adding Controller in main configuration files --- cloudformation/ad/ad-integration.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cloudformation/ad/ad-integration.yaml b/cloudformation/ad/ad-integration.yaml index e1c6b0abcf..6687d223fc 100644 --- a/cloudformation/ad/ad-integration.yaml +++ b/cloudformation/ad/ad-integration.yaml @@ -440,8 +440,7 @@ Resources: echo "Domain Certificate Secret: ${DomainCertificateSecretArn}" echo "Domain Private Key Secret: ${DomainPrivateKeySecretArn}" - mkdir -p /etc/systemd/resolved.conf.d - cat << EOF > /etc/systemd/resolved.conf.d/pcluster-ad-domain-dns-server.conf + cat << EOF > /etc/systemd/resolved.conf [Resolve] DNS=${DnsIp1} ${DnsIp2} Domains=~. From 6806146e917c4d26f6367a89a253b4d3336f9d64 Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Fri, 7 Mar 2025 11:07:01 -0500 Subject: [PATCH 4/5] testing Trn1 with Capacity blocks --- .../tests/trainium/test_trainium.py | 2 +- .../test_trainium/pcluster.config.yaml | 49 ++++++++++--------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/tests/integration-tests/tests/trainium/test_trainium.py b/tests/integration-tests/tests/trainium/test_trainium.py index 1ff6b9dc84..cfdb868478 100644 --- a/tests/integration-tests/tests/trainium/test_trainium.py +++ b/tests/integration-tests/tests/trainium/test_trainium.py @@ -42,7 +42,7 @@ def test_trainium( def _test_allreduce_single_node(test_datadir, remote_command_executor, scheduler_commands): - result = scheduler_commands.submit_script(str(test_datadir / "neuron-allreduce.sh"), partition="queue-trn2") + result = scheduler_commands.submit_script(str(test_datadir / "neuron-allreduce.sh"), partition="queue-trn32") job_id = scheduler_commands.assert_job_submitted(result.stdout) scheduler_commands.wait_job_completed(job_id) scheduler_commands.assert_job_succeeded(job_id) diff --git a/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml index 13580b85be..0b46efbe32 100644 --- a/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml +++ b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml @@ -18,13 +18,16 @@ Scheduling: Scheduler: slurm SlurmQueues: - Name: queue-trn32 + CapacityType: CAPACITY_BLOCK ComputeResources: - Name: compute-resource-trn32 - Instances: - - InstanceType: {{instance}} + InstanceType: {{instance}} MinCount: 2 + MaxCount: 2 Efa: Enabled: true + CapacityReservationTarget: + CapacityReservationId: cr-05b0c099ce2534ce3 Networking: SubnetIds: - {{ private_subnet_id }} @@ -42,24 +45,24 @@ Scheduling: - BucketName: {{ bucket_name }} # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are public available - BucketName: aws-parallelcluster-beta - - Name: queue-trn2 - ComputeResources: - - Name: compute-resource-trn2 - Instances: - - InstanceType: trn1.2xlarge - MinCount: 0 # TODO change to 1 once allreduce test is passing - Networking: - SubnetIds: - - {{ private_subnet_id }} - CustomActions: - OnNodeConfigured: - Script: s3://{{ bucket_name }}/neuron-installation.sh - Iam: - # Policy to access to Trainium beta repository info - AdditionalIamPolicies: - - Policy: arn:aws:iam::447714826191:policy/TrainiumPreviewPolicy - S3Access: - # Needed to download post install script - - BucketName: {{ bucket_name }} - # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are public available - - BucketName: aws-parallelcluster-beta +# - Name: queue-trn2 +# ComputeResources: +# - Name: compute-resource-trn2 +# Instances: +# - InstanceType: trn1.2xlarge +# MinCount: 0 # TODO change to 1 once allreduce test is passing +# Networking: +# SubnetIds: +# - {{ private_subnet_id }} +# CustomActions: +# OnNodeConfigured: +# Script: s3://{{ bucket_name }}/neuron-installation.sh +# Iam: +# # Policy to access to Trainium beta repository info +# AdditionalIamPolicies: +# - Policy: arn:aws:iam::447714826191:policy/TrainiumPreviewPolicy +# S3Access: +# # Needed to download post install script +# - BucketName: {{ bucket_name }} +# # Needed to download neuronx packages and neff file --> FIXME to be removed once packages are public available +# - BucketName: aws-parallelcluster-beta From a827e801e56d5d4eb048d52f04166aca5578b27c Mon Sep 17 00:00:00 2001 From: Himani Anil Deshpande Date: Thu, 6 Mar 2025 21:14:13 -0500 Subject: [PATCH 5/5] Adding HN bootstrap timeout --- .../trainium/test_trainium/test_trainium/pcluster.config.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml index 0b46efbe32..20f46ebecd 100644 --- a/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml +++ b/tests/integration-tests/tests/trainium/test_trainium/test_trainium/pcluster.config.yaml @@ -1,3 +1,6 @@ +DevSettings: + Timeouts: + HeadNodeBootstrapTimeout: 2400 Image: Os: {{ os }} HeadNode: