awslabs
diff --git a/‎ai-ml/jupyterhub/addons.tf
Lines changed: 7 additions & 6 deletions b/‎ai-ml/jupyterhub/addons.tf
Lines changed: 7 additions & 6 deletions
diff --git a/‎ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml
Lines changed: 273 additions & 0 deletions b/‎ai-ml/jupyterhub/helm/jupyterhub/jupyterhub-values-oauth.yaml
Lines changed: 273 additions & 0 deletions
diff --git a/‎ai-ml/jupyterhub/variables.tf
Lines changed: 33 additions & 2 deletions b/‎ai-ml/jupyterhub/variables.tf
Lines changed: 33 additions & 2 deletions
diff --git a/‎website/docs/blueprints/ai-ml/img/keycloak-login.png
48.1 KB b/‎website/docs/blueprints/ai-ml/img/keycloak-login.png
48.1 KB
diff --git a/‎website/docs/blueprints/ai-ml/img/oauth.png
134 KB b/‎website/docs/blueprints/ai-ml/img/oauth.png
134 KB
@@ -1,6 +1,6 @@
 # Use this data source to get the ARN of a certificate in AWS Certificate Manager (ACM)
 data "aws_acm_certificate" "issued" {
-  count    = var.jupyter_hub_auth_mechanism == "cognito" ? 1 : 0
+  count    = var.jupyter_hub_auth_mechanism != "dummy" ? 1 : 0 # look up the certificate for every auth mechanism except "dummy"
   domain   = var.acm_certificate_domain
   statuses = ["ISSUED"]
 }
@@ -267,11 +267,12 @@ module "eks_data_addons" {
     values = [templatefile("${path.module}/helm/jupyterhub/jupyterhub-values-${var.jupyter_hub_auth_mechanism}.yaml", {
       ssl_cert_arn                = try(data.aws_acm_certificate.issued[0].arn, "")
       jupyterdomain               = try("https://${var.jupyterhub_domain}/hub/oauth_callback", "")
-      authorize_url               = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
-      token_url                   = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "")
-      userdata_url                = try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "")
-      client_id                   = try(aws_cognito_user_pool_client.user_pool_client[0].id, "")
-      client_secret               = try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "")
+      authorize_url               = var.oauth_domain != "" ? "${var.oauth_domain}/auth" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/authorize", "")
+      token_url                   = var.oauth_domain != "" ? "${var.oauth_domain}/token" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/token", "")
+      userdata_url                = var.oauth_domain != "" ? "${var.oauth_domain}/userinfo" : try("https://${local.cognito_custom_domain}.auth.${local.region}.amazoncognito.com/oauth2/userInfo", "")
+      username_key                = try(var.oauth_username_key, "")
+      client_id                   = var.oauth_jupyter_client_id != "" ? var.oauth_jupyter_client_id : try(aws_cognito_user_pool_client.user_pool_client[0].id, "")
+      client_secret               = var.oauth_jupyter_client_secret != "" ? var.oauth_jupyter_client_secret : try(aws_cognito_user_pool_client.user_pool_client[0].client_secret, "")
       user_pool_id                = try(aws_cognito_user_pool.pool[0].id, "")
       identity_pool_id            = try(aws_cognito_identity_pool.identity_pool[0].id, "")
       jupyter_single_user_sa_name = kubernetes_service_account_v1.jupyterhub_single_user_sa.metadata[0].name
 
@@ -0,0 +1,273 @@
+hub: # Hub settings: database volume and the generic OAuth authenticator
+  db:
+    pvc:
+      storage: 50Gi
+      storageClassName: gp3
+  authenticatePrometheus: false
+  config:
+    GenericOAuthenticator: # ${...} placeholders are substituted by Terraform templatefile() before this file is parsed as YAML
+      oauth_callback_url: '${jupyterdomain}' # quoted so the substituted text is always read as a string
+      client_id: '${client_id}' # an all-digit id would otherwise be parsed as an integer
+      client_secret: '${client_secret}' # single quotes keep backslashes literal; " #", ": " or a leading YAML indicator can no longer break parsing
+      authorize_url: '${authorize_url}'
+      token_url: '${token_url}'
+      userdata_url: '${userdata_url}'
+      scope:
+        - openid
+        - profile
+      username_key: "${username_key}"
+      login_service: "oauth"
+      allow_all: true # Allows all oauth authenticated users to use Jupyterhub. For finer grained control, you can use `allowed_users`: https://jupyterhub.readthedocs.io/en/stable/tutorial/getting-started/authenticators-users-basics.html#deciding-who-is-allowed
+    JupyterHub:
+      authenticator_class: generic-oauth
+proxy: # HTTPS in "offload" mode; the certificate ARN is handed to the load balancer through a Service annotation
+  https:
+    enabled: true
+    type: "offload"
+  service:
+    annotations: # annotation values are strings, written here in one quoting style
+      service.beta.kubernetes.io/aws-load-balancer-ssl-cert: ${ssl_cert_arn}
+      service.beta.kubernetes.io/aws-load-balancer-ssl-ports: "https"
+      service.beta.kubernetes.io/aws-load-balancer-backend-protocol: "tcp"
+      service.beta.kubernetes.io/aws-load-balancer-connection-idle-timeout: "3600"
+      service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: "ip"
+      service.beta.kubernetes.io/aws-load-balancer-scheme: "internal"
+      service.beta.kubernetes.io/aws-load-balancer-type: "external"
+      service.beta.kubernetes.io/aws-load-balancer-cross-zone-load-balancing-enabled: "true"
+      service.beta.kubernetes.io/aws-load-balancer-ip-address-type: "ipv4"
+
+singleuser:
+  startTimeout: 1200 # 20 mins to spin up a notebook server for GPU including the image pull
+  profileList:
+    - display_name: "Elyra (CPU)" # CPU-only profile with a fixed image
+      description: "Elyra Notebooks | Karpenter Autoscaling"
+      kubespawner_override:
+        image: "public.ecr.aws/data-on-eks/elyra-jupyter:3.15.0"
+        node_selector: # only nodes labelled NodePool=default are eligible
+          NodePool: "default"
+        cpu_guarantee: 2
+        mem_guarantee: "8G"
+        cpu_limit: 4
+        mem_limit: "8G"
+      cmd: null # NOTE(review): sits beside kubespawner_override here, while the GPU/Neuron profiles set cmd inside it - confirm intended
+    - display_name: "Data Engineering (CPU)" # CPU-only profile; the user picks one of the PySpark images below
+      description: "PySpark Notebooks | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pyspark350:
+              display_name: "PySpark 3.5.0 + Python 3.11"
+              default: true # marked as the default choice
+              kubespawner_override:
+                image: "jupyter/pyspark-notebook:spark-3.5.0"
+            pyspark341:
+              display_name: "PySpark 3.4.1 + Python 3.11"
+              kubespawner_override:
+                image: "jupyter/pyspark-notebook:spark-3.4.1"
+      kubespawner_override: # settings shared by both image choices
+        node_selector:
+          NodePool: "default"
+        cpu_guarantee: 2
+        mem_guarantee: "8G"
+        cpu_limit: 4
+        mem_limit: "8G"
+      cmd: null # NOTE(review): sits beside kubespawner_override here, while the GPU/Neuron profiles set cmd inside it - confirm intended
+    # NOTE: this profile selects NodePool=trainium nodes and tolerates the Neuron taints listed under tolerations
+    - display_name: "Trainium (trn1)"
+      description: "Trainium | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch 1.13.1 + torch-neuronx"
+              default: true # marked as the default choice
+              kubespawner_override:
+                image: "public.ecr.aws/data-on-eks/pytorch-neuronx:latest"
+            tflow2101:
+              display_name: "Tensorflow 2.10.1 + tensorflow-neuronx"
+              kubespawner_override:
+                image: "public.ecr.aws/data-on-eks/tensorflow-neuronx:latest"
+      kubespawner_override: # settings shared by both image choices
+        node_selector:
+          NodePool: "trainium"
+          hub.jupyter.org/node-purpose: "user"
+        tolerations:
+          - key: "aws.amazon.com/neuroncore"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "aws.amazon.com/neuron"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated" # Dedicated-node taint, per the optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+        cpu_guarantee: 2
+        mem_guarantee: "10G"
+        cpu_limit: 2
+        mem_limit: "10G"
+        extra_resource_limits: # one aws.amazon.com/neuron device per notebook
+          aws.amazon.com/neuron: "1"
+        cmd: "start-singleuser.sh"
+    - display_name: "Inferentia (inf2)" # selects NodePool=inferentia nodes and tolerates the Neuron taints
+      description: "Inferentia | Karpenter AutoScaling"
+      profile_options:
+        image:
+          display_name: "Image"
+          choices:
+            pytorch1131:
+              display_name: "PyTorch + torch-neuronx"
+              default: true # marked as the default choice
+              kubespawner_override:
+                image: "public.ecr.aws/data-on-eks/pytorch-neuronx:latest"
+            tflow2101:
+              display_name: "Tensorflow + tensorflow-neuronx"
+              kubespawner_override:
+                image: "public.ecr.aws/data-on-eks/tensorflow-neuronx:latest"
+      kubespawner_override: # settings shared by both image choices
+        node_selector:
+          NodePool: "inferentia"
+          hub.jupyter.org/node-purpose: "user"
+        tolerations:
+          - key: "aws.amazon.com/neuroncore"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "aws.amazon.com/neuron"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated" # Dedicated-node taint, per the optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+        cpu_guarantee: 20
+        mem_guarantee: "100G"
+        cpu_limit: 20
+        mem_limit: "100G"
+        extra_resource_limits: # one aws.amazon.com/neuron device per notebook
+          aws.amazon.com/neuron: "1"
+        cmd: null
+    - display_name: "Data Science (GPU + Time-Slicing - G5)" # the profile flagged as default in this list
+      default: true
+      description: "GPU Time-Slicing with Single GPU VMs (G5 2x, 4x, 8x, 16x) | nvidia.com/gpu: 1 | Karpenter AutoScaling"
+      kubespawner_override:
+        # namespace: data-team-a
+        image: "cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only"
+        node_selector:
+          NodePool: "gpu-ts" # TIME-SLICING: Use this config with time-slicing mode
+          hub.jupyter.org/node-purpose: "user"
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated" # Dedicated-node taint, per the optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "1" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_guarantee: 2
+        mem_guarantee: "4G"
+        cpu_limit: 2
+        mem_limit: "4G"
+        cmd: "start-singleuser.sh"
+    # Karpenter doesn't support requesting resources with MIG slices, e.g. nvidia.com/mig-1g.5gb: 1 or nvidia.com/mig-2g.20gb: 1.
+    # Hence, this profile relies on Managed node groups with GPU MIG enabled
+    - display_name: Data Science (GPU + MIG on P4d.24xlarge)
+      description: "GPU MIG with P4d instances | nvidia.com/mig-1g.5gb: 1 | Cluster Autoscaler"
+      kubespawner_override:
+        image: cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only
+        node_selector:
+          provisioner: cluster-autoscaler
+          node.kubernetes.io/instance-type: p4d.24xlarge
+          hub.jupyter.org/node-purpose: user
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated" # According to optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+        extra_resource_limits: # extended resources must be set as limits (a request without a limit is rejected); the request defaults to the limit
+          nvidia.com/mig-1g.5gb: "1" # or nvidia.com/mig-2g.10gb OR nvidia.com/mig-3g.20gb
+          # Alternative entry (use instead of the MIG slice above):
+          # nvidia.com/gpu: "8" # TIME-SLICING: Use a slice of GPU using time-slicing mode
+        cpu_guarantee: 2
+        mem_guarantee: 10G
+        cpu_limit: 2
+        mem_limit: 10G
+        cmd: "start-singleuser.sh"
+    - display_name: "Data Science (GPU - P4d.24xlarge)" # requests eight nvidia.com/gpu devices per notebook
+      description: "GPU with P4d instances | Karpenter Autoscaler"
+      kubespawner_override:
+        image: "cschranz/gpu-jupyter:v1.6_cuda-11.8_ubuntu-22.04_python-only"
+        node_selector:
+          NodePool: "gpu-mig"
+          hub.jupyter.org/node-purpose: "user"
+        tolerations:
+          - key: "nvidia.com/gpu"
+            operator: "Exists"
+            effect: "NoSchedule"
+          - key: "hub.jupyter.org/dedicated" # Dedicated-node taint, per the optimization docs https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+            operator: "Equal"
+            value: "user"
+            effect: "NoSchedule"
+        extra_resource_limits:
+          nvidia.com/gpu: "8"
+        cpu_guarantee: 2
+        mem_guarantee: "10G"
+        cpu_limit: 2
+        mem_limit: "10G"
+        cmd: "start-singleuser.sh"
+  storage: # home directories come from an existing PVC; a second PVC is mounted as a shared folder
+    type: "static"
+    static:
+      pvcName: "efs-persist"
+      subPath: "home/{username}"
+    extraVolumes:
+      - name: "jupyterhub-shared"
+        persistentVolumeClaim:
+          claimName: "efs-persist-shared"
+    extraVolumeMounts:
+      - name: "jupyterhub-shared" # must match the extraVolumes entry name above
+        mountPath: "/home/shared"
+        readOnly: false
+  serviceAccountName: ${jupyter_single_user_sa_name} # filled in by Terraform with the single-user service account name
+  allowPrivilegeEscalation: true # NOTE(review): together with uid 0 and GRANT_SUDO below, notebooks run with root privileges - confirm acceptable
+  extraPodConfig: # Needed so the Jovyan user running in every single-user pod can access the Service Account
+    securityContext:
+      fsGroup: 100
+  extraEnv: # Sudo needed to configure the proper permissions to start the notebook instance
+    GRANT_SUDO: "yes"
+    NOTEBOOK_ARGS: "--allow-root"
+    CHOWN_HOME: "yes"
+    CHOWN_HOME_OPTS: "-R"
+    CHOWN_EXTRA: "/home/shared" # same path as the shared volume mount
+  uid: 0
+  fsGid: 0
+  cmd: null # default for every profile; some profiles override it with "start-singleuser.sh"
+
+# Optimizations configured according to this doc: https://z2jh.jupyter.org/en/latest/administrator/optimization.html
+scheduling:
+  userScheduler:
+    enabled: true
+  podPriority:
+    enabled: true
+  userPlaceholder:
+    enabled: false
+    replicas: 1 # has no effect while userPlaceholder is disabled
+  userPods:
+    nodeAffinity:
+      matchNodePurpose: require # This will force single-user pods to use a specific Karpenter provisioner
+
+prePuller: # both image pre-pulling modes are switched off
+  hook:
+    enabled: false
+  continuous:
+    # NOTE: if used with Karpenter, also add user-placeholders
+    enabled: false
+
+global:
+  safeToShowValues: false # presumably stops the chart from echoing configured values in its output - TODO confirm against the z2jh reference
@@ -36,7 +36,7 @@ variable "secondary_cidr_blocks" {
 # Example of public domain name(<subdomain-name>.<domain-name>.com): eks.jupyter-doeks.dynamic-dns.com
 variable "jupyter_hub_auth_mechanism" {
   type        = string
-  description = "Allowed values: cognito, dummy"
+  description = "Allowed values: cognito, dummy, oauth" # each value selects helm/jupyterhub/jupyterhub-values-<value>.yaml
   default     = "dummy"
 }
 
@@ -53,8 +53,39 @@ variable "acm_certificate_domain" {
   description = "Enter domain name with wildcard and ensure ACM certificate is created for this domain name, e.g. *.example.com"
   default     = ""
 }
+
+# Only needed if auth mechanism is set to cognito or oauth. Domain JupyterHub is served on; it is used to build the callback URL https://<domain>/hub/oauth_callback
 variable "jupyterhub_domain" {
   type        = string
-  description = "Enter sub-domain name for jupyterhub to be hosted,  e.g. eks.example.com. Only needed if auth mechanism is set to cognito"
+  description = "Enter domain name for jupyterhub to be hosted,  e.g. eks.example.com. Only needed if auth mechanism is set to cognito or oauth"
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. Base URL of the OIDC endpoints: "/auth", "/token" and "/userinfo" are appended to it, so leave off any trailing slash
+variable "oauth_domain" {
+  description = "Enter oauth domain and endpoint, e.g. https://keycloak.example.com/realms/master/protocol/openid-connect. Only needed if auth mechanism is set to oauth"
+  type        = string
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. Client id registered for JupyterHub; when non-empty it is used instead of the Cognito user pool client id
+variable "oauth_jupyter_client_id" {
+  description = "Enter oauth client id for jupyterhub, e.g. jupyterhub. Only needed if auth mechanism is set to oauth"
+  type        = string
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. Secret for the client above; marked sensitive, and when non-empty it is used instead of the Cognito client secret
+variable "oauth_jupyter_client_secret" {
+  description = "Enter oauth client secret. Only needed if auth mechanism is set to oauth"
+  type        = string
+  sensitive   = true
+  default     = ""
+}
+
+# Only needed if auth mechanism is set to oauth. Name of the field that holds the username; it is passed to the Helm values template as username_key
+variable "oauth_username_key" {
+  type        = string
+  description = "oauth field for the username. e.g. 'preferred_username' Only needed if auth mechanism is set to oauth"
   default     = ""
 }