awslabs
diff --git a/‎ai-ml/emr-spark-rapids/README.md
Lines changed: 3 additions & 8 deletions b/‎ai-ml/emr-spark-rapids/README.md
Lines changed: 3 additions & 8 deletions
diff --git a/‎ai-ml/emr-spark-rapids/addons.tf
Lines changed: 9 additions & 3 deletions b/‎ai-ml/emr-spark-rapids/addons.tf
Lines changed: 9 additions & 3 deletions
diff --git a/‎ai-ml/emr-spark-rapids/data.tf
Lines changed: 0 additions & 21 deletions b/‎ai-ml/emr-spark-rapids/data.tf
Lines changed: 0 additions & 21 deletions
diff --git a/‎ai-ml/emr-spark-rapids/eks.tf
Lines changed: 266 additions & 0 deletions b/‎ai-ml/emr-spark-rapids/eks.tf
Lines changed: 266 additions & 0 deletions
diff --git a/‎ai-ml/emr-spark-rapids/emr-eks.tf
Lines changed: 8 additions & 5 deletions b/‎ai-ml/emr-spark-rapids/emr-eks.tf
Lines changed: 8 additions & 5 deletions
@@ -35,8 +35,6 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
 | <a name="module_kubernetes_data_addons"></a> [kubernetes\_data\_addons](#module\_kubernetes\_data\_addons) | ../../workshop/modules/terraform-aws-eks-data-addons | n/a |
 | <a name="module_s3_bucket"></a> [s3\_bucket](#module\_s3\_bucket) | terraform-aws-modules/s3-bucket/aws | ~> 3.0 |
 | <a name="module_vpc"></a> [vpc](#module\_vpc) | terraform-aws-modules/vpc/aws | ~> 5.0 |
-| <a name="module_vpc_endpoints"></a> [vpc\_endpoints](#module\_vpc\_endpoints) | terraform-aws-modules/vpc/aws//modules/vpc-endpoints | ~> 5.0 |
-| <a name="module_vpc_endpoints_sg"></a> [vpc\_endpoints\_sg](#module\_vpc\_endpoints\_sg) | terraform-aws-modules/security-group/aws | ~> 5.0 |
 
 ## Resources
 
@@ -64,15 +62,12 @@ Checkout the [documentation website](https://awslabs.github.io/data-on-eks/docs/
 |------|-------------|------|---------|:--------:|
 | <a name="input_eks_cluster_version"></a> [eks\_cluster\_version](#input\_eks\_cluster\_version) | EKS Cluster version | `string` | `"1.26"` | no |
 | <a name="input_enable_amazon_prometheus"></a> [enable\_amazon\_prometheus](#input\_enable\_amazon\_prometheus) | Enable AWS Managed Prometheus service | `bool` | `true` | no |
-| <a name="input_enable_kubecost"></a> [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost | `bool` | `false` | no |
-| <a name="input_enable_vpc_endpoints"></a> [enable\_vpc\_endpoints](#input\_enable\_vpc\_endpoints) | Enable VPC Endpoints | `string` | `false` | no |
+| <a name="input_enable_kubecost"></a> [enable\_kubecost](#input\_enable\_kubecost) | Enable Kubecost | `bool` | `true` | no |
 | <a name="input_name"></a> [name](#input\_name) | Name of the VPC and EKS Cluster | `string` | `"emr-spark-rapids"` | no |
-| <a name="input_private_subnets"></a> [private\_subnets](#input\_private\_subnets) | Private Subnets CIDRs. 254 IPs per Subnet/AZ for Private NAT + NLB + Airflow + EC2 Jumphost etc. | `list(string)` | <pre>[<br>  "10.1.1.0/24",<br>  "10.1.2.0/24"<br>]</pre> | no |
-| <a name="input_public_subnets"></a> [public\_subnets](#input\_public\_subnets) | Public Subnets CIDRs. 62 IPs per Subnet/AZ | `list(string)` | <pre>[<br>  "10.1.0.0/26",<br>  "10.1.0.64/26"<br>]</pre> | no |
 | <a name="input_region"></a> [region](#input\_region) | Region | `string` | `"us-west-2"` | no |
-| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `string` | `"100.64.0.0/16"` | no |
+| <a name="input_secondary_cidr_blocks"></a> [secondary\_cidr\_blocks](#input\_secondary\_cidr\_blocks) | Secondary CIDR blocks to be attached to VPC | `list(string)` | <pre>[<br>  "100.64.0.0/16"<br>]</pre> | no |
 | <a name="input_tags"></a> [tags](#input\_tags) | Default tags | `map(string)` | `{}` | no |
-| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/16"` | no |
+| <a name="input_vpc_cidr"></a> [vpc\_cidr](#input\_vpc\_cidr) | VPC CIDR. This should be a valid private (RFC 1918) CIDR range | `string` | `"10.1.0.0/21"` | no |
 
 ## Outputs
 
 
@@ -84,6 +84,11 @@ module "eks_blueprints_addons" {
   #---------------------------------------
   enable_karpenter                  = true
   karpenter_enable_spot_termination = true
+  karpenter_node = {
+    iam_role_additional_policies = {
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+    }
+  }
   karpenter = {
     repository_username = data.aws_ecrpublic_authorization_token.token.user_name
     repository_password = data.aws_ecrpublic_authorization_token.token.password
@@ -98,7 +103,7 @@ module "eks_blueprints_addons" {
   }
 
   #---------------------------------------
-  # Adding AWS Load Balancer Controller
+  # AWS Load Balancer Controller
   #---------------------------------------
   enable_aws_load_balancer_controller = true
   #---------------------------------------
@@ -140,8 +145,7 @@ module "eks_blueprints_addons" {
 module "kubernetes_data_addons" {
   # Please note that local source will be replaced once the below repo is public
   # source = "https://github.com/aws-ia/terraform-aws-kubernetes-data-addons"
-  source = "../../workshop/modules/terraform-aws-eks-data-addons"
-
+  source            = "../../workshop/modules/terraform-aws-eks-data-addons"
   oidc_provider_arn = module.eks.oidc_provider_arn
 
   #---------------------------------------------------------------
@@ -155,9 +159,11 @@ module "kubernetes_data_addons" {
   #---------------------------------------------------------------
   # Kubecost Add-on
   #---------------------------------------------------------------
+  # Note: The Kubecost add-on depends on the Kube Prometheus Stack add-on for storing its metrics
   enable_kubecost = var.enable_kubecost
   kubecost_helm_config = {
     values              = [templatefile("${path.module}/helm-values/kubecost-values.yaml", {})]
+    version             = "1.104.5"
     repository_username = data.aws_ecrpublic_authorization_token.token.user_name
     repository_password = data.aws_ecrpublic_authorization_token.token.password
   }
 
@@ -0,0 +1,266 @@
+#---------------------------------------------------------------
+# EKS Cluster
+#---------------------------------------------------------------
+
+module "eks" {
+  source  = "terraform-aws-modules/eks/aws"
+  version = "~> 19.15"
+
+  cluster_name    = local.name
+  cluster_version = var.eks_cluster_version
+
+  cluster_endpoint_public_access = true # If true, the cluster API server is reachable from the internet. You can optionally restrict the CIDR blocks allowed to reach the public endpoint.
+
+  vpc_id = module.vpc.vpc_id
+  # Keep only the private subnets whose CIDR starts with "100." (the secondary CIDR range). These are the subnets where the EKS control plane ENIs will be created
+  subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
+
+  manage_aws_auth_configmap = true
+  aws_auth_roles = [
+    {
+      rolearn  = module.eks_blueprints_addons.karpenter.iam_role_arn # NOTE(review): this role is mapped to system:nodes; confirm this output is the Karpenter node IAM role and not the controller (IRSA) role
+      username = "system:node:{{EC2PrivateDNSName}}"
+      groups = [
+        "system:bootstrappers",
+        "system:nodes",
+      ]
+    },
+    {
+      # Required for EMR on EKS virtual cluster
+      rolearn  = "arn:aws:iam::${data.aws_caller_identity.current.account_id}:role/AWSServiceRoleForAmazonEMRContainers"
+      username = "emr-containers"
+      groups   = []
+    },
+  ]
+
+  #---------------------------------------
+  # Note: These rules can be further restricted to the specific ports required by each add-on and your application
+  #---------------------------------------
+  # Extend cluster security group rules
+  cluster_security_group_additional_rules = {
+    ingress_nodes_ephemeral_ports_tcp = {
+      description                = "Nodes on ephemeral ports"
+      protocol                   = "tcp"
+      from_port                  = 1025
+      to_port                    = 65535
+      type                       = "ingress"
+      source_node_security_group = true
+    }
+  }
+
+  # Extend node-to-node security group rules
+  node_security_group_additional_rules = {
+    ingress_self_all = {
+      description = "Node to node all ports/protocols"
+      protocol    = "-1"
+      from_port   = 0
+      to_port     = 0
+      type        = "ingress"
+      self        = true
+    }
+    # Allows the control plane to talk to worker nodes on all ports. Added to simplify the example and to avoid issues with add-ons communicating with the control plane.
+    # This can be restricted further to specific ports based on the requirements of each add-on, e.g., metrics-server 4443, spark-operator 8080, karpenter 8443 etc.
+    # Change this according to your security requirements if needed
+    ingress_cluster_to_node_all_traffic = {
+      description                   = "Cluster API to Nodegroup all traffic"
+      protocol                      = "-1"
+      from_port                     = 0
+      to_port                       = 0
+      type                          = "ingress"
+      source_cluster_security_group = true
+    }
+  }
+
+  eks_managed_node_group_defaults = {
+    iam_role_additional_policies = {
+      # Not required, but used in the example to access the nodes to inspect mounted volumes
+      AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+    }
+  }
+
+  eks_managed_node_groups = {
+    #  We recommend having a managed node group (MNG) for your critical workloads and add-ons,
+    #  then relying on Karpenter to scale your workloads.
+    #  You can also use nodeSelector and taints/tolerations to spread workloads across the MNG and Karpenter provisioners
+    core_node_group = {
+      name        = "core-node-group"
+      description = "EKS managed node group example launch template"
+      # Keep only the private subnets whose CIDR starts with "100." (the secondary CIDR range). The nodes/node groups are provisioned in these subnets
+      subnet_ids = compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null])
+
+      min_size     = 3
+      max_size     = 9
+      desired_size = 3
+
+      ami_type       = "AL2_x86_64"
+      instance_types = ["m5.xlarge"]
+
+      ebs_optimized = true
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/xvda"
+          ebs = {
+            volume_size = 100
+            volume_type = "gp3"
+          }
+        }
+      }
+
+      labels = {
+        WorkerType                       = "ON_DEMAND"
+        NodeGroupType                    = "core"
+        "nvidia.com/gpu.deploy.operands" = false
+      }
+
+      tags = {
+        Name                     = "core-node-grp",
+        "karpenter.sh/discovery" = local.name
+      }
+    }
+    spark_driver_ng = {
+      name        = "spark-driver-ng"
+      description = "Spark managed node group for Driver pods with cpu and Ubuntu AMI"
+      # Keep only the private subnets whose CIDR starts with "100." (the secondary CIDR range), then use just the first one for this node group
+      subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]
+
+      # Ubuntu image for EKS Cluster 1.26 https://cloud-images.ubuntu.com/aws-eks/
+      ami_id = data.aws_ami.ubuntu.image_id
+
+      # This will ensure the bootstrap user data is used to join the node
+      # By default, EKS managed node groups will not append bootstrap script;
+      # this adds it back in using the default template provided by the module
+      # Note: this assumes the AMI provided is an EKS optimized AMI derivative
+      enable_bootstrap_user_data = true
+
+      min_size     = 1
+      max_size     = 8
+      desired_size = 1
+
+      force_update_version = true
+      instance_types       = ["m5.xlarge"] # 4 vCPU and 16GB
+
+      ebs_optimized = true
+      # This block device is used only for the root volume. Adjust the volume size to your needs.
+      # NOTE: Don't use this volume for Spark workloads
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/sda1"
+          ebs = {
+            volume_size = 100
+            volume_type = "gp3"
+          }
+        }
+      }
+
+      labels = {
+        WorkerType                       = "ON_DEMAND"
+        NodeGroupType                    = "spark-driver-ca"
+        "nvidia.com/gpu.deploy.operands" = false
+      }
+
+      taints = [{
+        key    = "spark-driver-ca"
+        value  = true
+        effect = "NO_SCHEDULE"
+      }]
+
+      tags = {
+        Name = "spark-driver-ca"
+      }
+    }
+    spark_gpu_ng = {
+      name        = "spark-gpu-ng"
+      description = "Spark managed Ubuntu GPU node group for executor pods with launch template"
+      # Keep only the private subnets whose CIDR starts with "100." (the secondary CIDR range), then use just the first one for this node group
+      subnet_ids = [element(compact([for subnet_id, cidr_block in zipmap(module.vpc.private_subnets, module.vpc.private_subnets_cidr_blocks) : substr(cidr_block, 0, 4) == "100." ? subnet_id : null]), 0)]
+
+      # Ubuntu image for EKS Cluster 1.26 https://cloud-images.ubuntu.com/aws-eks/
+      ami_id = data.aws_ami.ubuntu.image_id
+
+      # This will ensure the bootstrap user data is used to join the node
+      # By default, EKS managed node groups will not append bootstrap script;
+      # this adds it back in using the default template provided by the module
+      # Note: this assumes the AMI provided is an EKS optimized AMI derivative
+      enable_bootstrap_user_data = true
+
+      # NVMe instance store volumes are automatically enumerated and assigned a device; the script below formats them as XFS (RAID0 via mdadm when more than one) and mounts them at /local1
+      pre_bootstrap_user_data = <<-EOT
+        echo "Running a custom user data script"
+        set -ex
+        apt-get update
+        apt-get install -y nvme-cli mdadm xfsprogs
+
+        # Fetch the list of NVMe devices
+        DEVICES=$(lsblk -d -o NAME | grep nvme)
+
+        DISK_ARRAY=()
+
+        for DEV in $DEVICES
+        do
+          # Exclude the root disk, /dev/nvme0n1, from the list of devices
+          if [[ $${DEV} != "nvme0n1" ]]; then
+            NVME_INFO=$(nvme id-ctrl --raw-binary "/dev/$${DEV}" | cut -c3073-3104 | tr -s ' ' | sed 's/ $//g')
+            # Check if the device is Amazon EC2 NVMe Instance Storage
+            if [[ $${NVME_INFO} == *"ephemeral"* ]]; then
+              DISK_ARRAY+=("/dev/$${DEV}")
+            fi
+          fi
+        done
+
+        DISK_COUNT=$${#DISK_ARRAY[@]}
+
+        if [ $${DISK_COUNT} -eq 0 ]; then
+          echo "No NVMe SSD disks available. No further action needed."
+        else
+          if [ $${DISK_COUNT} -eq 1 ]; then
+            TARGET_DEV=$${DISK_ARRAY[0]}
+            mkfs.xfs $${TARGET_DEV}
+          else
+            mdadm --create --verbose /dev/md0 --level=0 --raid-devices=$${DISK_COUNT} $${DISK_ARRAY[@]}
+            mkfs.xfs /dev/md0
+            TARGET_DEV=/dev/md0
+          fi
+
+          mkdir -p /local1
+          echo $${TARGET_DEV} /local1 xfs defaults,noatime 1 2 >> /etc/fstab
+          mount -a
+          /usr/bin/chown -hR +999:+1000 /local1
+        fi
+      EOT
+
+      min_size     = 8
+      max_size     = 8
+      desired_size = 8
+
+      capacity_type  = "SPOT"
+      instance_types = ["g5.2xlarge"]
+
+      ebs_optimized = true
+      # This block device is used only for the root volume. Adjust the volume size to your needs.
+      # NOTE: Don't use this volume for Spark workloads
+      # Ubuntu uses /dev/sda1 as root volume
+      block_device_mappings = {
+        xvda = {
+          device_name = "/dev/sda1"
+          ebs = {
+            volume_size = 100
+            volume_type = "gp3"
+          }
+        }
+      }
+
+      labels = {
+        WorkerType    = "SPOT"
+        NodeGroupType = "spark-ubuntu-gpu-ca"
+      }
+
+      taints = [{ key = "spark-ubuntu-gpu-ca", value = true, effect = "NO_SCHEDULE" }]
+
+      tags = {
+        Name = "spark-ubuntu-gpu",
+      }
+    }
+  }
+
+  tags = local.tags
+}
@@ -2,18 +2,21 @@ module "emr_containers" {
   source  = "terraform-aws-modules/emr/aws//modules/virtual-cluster"
   version = "~> 1.0"
 
-  for_each = toset(["data-team-a", "data-team-b"])
+  for_each = toset(["ml-team-a", "ml-team-b"])
 
   eks_cluster_id    = module.eks.cluster_name
   oidc_provider_arn = module.eks.oidc_provider_arn
 
   name      = "${module.eks.cluster_name}-emr-${each.value}"
   namespace = "emr-${each.value}"
 
-  role_name                    = "${module.eks.cluster_name}-emr-${each.value}"
-  iam_role_use_name_prefix     = false
-  iam_role_description         = "EMR Execution Role for emr-${each.value}"
-  iam_role_additional_policies = ["arn:aws:iam::aws:policy/AmazonS3FullAccess"] # Attach additional policies for execution IAM Role
+  role_name                = "${module.eks.cluster_name}-emr-${each.value}"
+  iam_role_use_name_prefix = false
+  iam_role_description     = "EMR Execution Role for emr-${each.value}"
+  # NOTE: S3 full access is attached for testing purposes only. You should replace this policy with one restricted to the S3 buckets your jobs need
+  iam_role_additional_policies = ["arn:aws:iam::aws:policy/AmazonS3FullAccess"]
+
+  cloudwatch_log_group_name = "/emr-on-eks-logs/${module.eks.cluster_name}/emr-${each.value}/"
 
   tags = merge(local.tags, { Name = "emr-${each.value}" })
 }