Skip to content

Commit 7d2d34c

Browse files
shivam-dubey-1Ubuntu
and
Ubuntu
authored
feat: Observability for RayServe and vLLM GPU (#642)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-71-11.ec2.internal>
1 parent 416447e commit 7d2d34c

File tree

15 files changed

+12872
-0
lines changed

15 files changed

+12872
-0
lines changed

ai-ml/jark-stack/terraform/addons.tf

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,39 @@ module "eks_blueprints_addons" {
143143
values = [templatefile("${path.module}/helm-values/argo-events-values.yaml", {})]
144144
}
145145

146+
#---------------------------------------
147+
# Prometheus and Grafana stack
148+
#---------------------------------------
149+
#---------------------------------------------------------------
150+
# 1- Grafana port-forward `kubectl port-forward svc/kube-prometheus-stack-grafana 8080:80 -n kube-prometheus-stack`
151+
# 2- Grafana Admin user: admin
152+
# 3- Get secret name from Terraform output: `terraform output grafana_secret_name`
153+
# 4- Get admin user password: `aws secretsmanager get-secret-value --secret-id <REPLACE_WITH_SECRET_ID> --region $AWS_REGION --query "SecretString" --output text`
154+
#---------------------------------------------------------------
155+
enable_kube_prometheus_stack = true
156+
kube_prometheus_stack = {
157+
values = [
158+
templatefile("${path.module}/helm-values/kube-prometheus.yaml", {
159+
storage_class_type = kubernetes_storage_class.default_gp3.id
160+
})
161+
]
162+
chart_version = "48.1.1"
163+
set_sensitive = [
164+
{
165+
name = "grafana.adminPassword"
166+
value = data.aws_secretsmanager_secret_version.admin_password_version.secret_string
167+
}
168+
],
169+
}
170+
171+
#---------------------------------------
172+
# CloudWatch metrics for EKS
173+
#---------------------------------------
174+
enable_aws_cloudwatch_metrics = true
175+
aws_cloudwatch_metrics = {
176+
values = [templatefile("${path.module}/helm-values/aws-cloudwatch-metrics-values.yaml", {})]
177+
}
178+
146179
}
147180

148181
#---------------------------------------------------------------
@@ -388,6 +421,32 @@ resource "kubernetes_config_map_v1" "notebook" {
388421
}
389422
}
390423

424+
#---------------------------------------------------------------
425+
# Grafana Admin credentials resources
426+
# Log in to AWS Secrets Manager with the same role as Terraform to extract the Grafana admin password; the secret name starts with "${local.name}-oss-grafana"
427+
#---------------------------------------------------------------
428+
data "aws_secretsmanager_secret_version" "admin_password_version" {
429+
secret_id = aws_secretsmanager_secret.grafana.id
430+
depends_on = [aws_secretsmanager_secret_version.grafana]
431+
}
432+
433+
resource "random_password" "grafana" {
434+
length = 16
435+
special = true
436+
override_special = "@_"
437+
}
438+
439+
#tfsec:ignore:aws-ssm-secret-use-customer-key
440+
resource "aws_secretsmanager_secret" "grafana" {
441+
name_prefix = "${local.name}-oss-grafana"
442+
recovery_window_in_days = 0 # Set to zero for this example to force delete during Terraform destroy
443+
}
444+
445+
resource "aws_secretsmanager_secret_version" "grafana" {
446+
secret_id = aws_secretsmanager_secret.grafana.id
447+
secret_string = random_password.grafana.result
448+
}
449+
391450
data "aws_iam_policy_document" "karpenter_controller_policy" {
392451
statement {
393452
actions = [
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
resources:
2+
limits:
3+
cpu: 500m
4+
memory: 2Gi
5+
requests:
6+
cpu: 200m
7+
memory: 1Gi
8+
9+
# This toleration allows the DaemonSet pods to be scheduled on any node, regardless of the node's taints.
10+
tolerations:
11+
- operator: Exists
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
prometheus:
2+
prometheusSpec:
3+
retention: 5h
4+
scrapeInterval: 30s
5+
evaluationInterval: 30s
6+
scrapeTimeout: 10s
7+
serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
8+
storageSpec:
9+
volumeClaimTemplate:
10+
metadata:
11+
name: data
12+
spec:
13+
storageClassName: ${storage_class_type}
14+
accessModes:
15+
- ReadWriteOnce
16+
resources:
17+
requests:
18+
storage: 50Gi
19+
alertmanager:
20+
enabled: false
21+
22+
grafana:
23+
enabled: true
24+
defaultDashboardsEnabled: true
25+
prometheus:
26+
prometheusSpec:
27+
retention: 5h
28+
scrapeInterval: 30s
29+
evaluationInterval: 30s
30+
scrapeTimeout: 10s
31+
serviceMonitorSelectorNilUsesHelmValues: false # This is required to use the serviceMonitorSelector
32+
storageSpec:
33+
volumeClaimTemplate:
34+
metadata:
35+
name: data
36+
spec:
37+
storageClassName: ${storage_class_type}
38+
accessModes:
39+
- ReadWriteOnce
40+
resources:
41+
requests:
42+
storage: 50Gi
43+
alertmanager:
44+
enabled: false
45+
46+
grafana:
47+
enabled: true
48+
defaultDashboardsEnabled: true
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
apiVersion: monitoring.coreos.com/v1
2+
kind: PodMonitor
3+
metadata:
4+
name: ray-workers-monitor
5+
namespace: kube-prometheus-stack
6+
labels:
7+
# `release: $HELM_RELEASE`: Prometheus can only detect PodMonitor with this label.
8+
release: kube-prometheus-stack
9+
spec:
10+
jobLabel: ray-workers
11+
# Only select Kubernetes Pods in the "rayserve-vllm" namespace.
12+
namespaceSelector:
13+
matchNames:
14+
- rayserve-vllm
15+
# Only select Kubernetes Pods with "matchLabels".
16+
selector:
17+
matchLabels:
18+
ray.io/node-type: worker
19+
# A list of endpoints allowed as part of this PodMonitor.
20+
podMetricsEndpoints:
21+
- port: metrics

0 commit comments

Comments
 (0)