From b159efe3fb8d53b24758036b616be3c0fe8b70c8 Mon Sep 17 00:00:00 2001
From: Justin Rivera
Date: Fri, 11 Jul 2025 20:05:29 +0000
Subject: [PATCH] [CI] Add Terraform resources for daily CronJob that processes LLVM commits

---
 premerge/gke_cluster/main.tf              |   8 ++
 premerge/main.tf                          | 113 ++++++++++++++++++++++
 premerge/operational_metrics_cronjob.yaml |  52 +++++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 premerge/operational_metrics_cronjob.yaml

diff --git a/premerge/gke_cluster/main.tf b/premerge/gke_cluster/main.tf
index 802bad4e..57183162 100644
--- a/premerge/gke_cluster/main.tf
+++ b/premerge/gke_cluster/main.tf
@@ -12,6 +12,10 @@ resource "google_container_cluster" "llvm_premerge" {
   # for adding windows nodes to the cluster.
   networking_mode = "VPC_NATIVE"
   ip_allocation_policy {}
+
+  workload_identity_config {
+    workload_pool = "llvm-premerge-checks.svc.id.goog"
+  }
 }
 
 resource "google_container_node_pool" "llvm_premerge_linux_service" {
@@ -23,6 +27,10 @@ resource "google_container_node_pool" "llvm_premerge_linux_service" {
 
   node_config {
     machine_type = "e2-highcpu-4"
+
+    workload_metadata_config {
+      mode = "GKE_METADATA"
+    }
     # Terraform wants to recreate the node pool everytime whe running
     # terraform apply unless we explicitly set this.
     # TODO(boomanaiden154): Look into why terraform is doing this so we do
diff --git a/premerge/main.tf b/premerge/main.tf
index 022be03b..33e250f3 100644
--- a/premerge/main.tf
+++ b/premerge/main.tf
@@ -190,3 +190,116 @@ resource "kubernetes_manifest" "metrics_deployment" {
 
   depends_on = [kubernetes_namespace.metrics, kubernetes_secret.metrics_secrets]
 }
+
+# Resources for collecting LLVM operational metrics data
+
+# Service accounts and bindings to grant access to the
+# BigQuery API for our cronjob
+resource "google_service_account" "operational_metrics_gsa" {
+  account_id   = "operational-metrics-gsa"
+  display_name = "Operational Metrics GSA"
+}
+
+# NOTE(review): google_project_iam_binding is authoritative for the role, so
+# applying it drops any other roles/bigquery.jobUser members on the project.
+# Confirm nothing else relies on this role, or use google_project_iam_member.
+resource "google_project_iam_binding" "bigquery_jobuser_binding" {
+  project = google_service_account.operational_metrics_gsa.project
+  role    = "roles/bigquery.jobUser"
+
+  members = [
+    "serviceAccount:${google_service_account.operational_metrics_gsa.email}",
+  ]
+
+  depends_on = [google_service_account.operational_metrics_gsa]
+}
+
+resource "kubernetes_namespace" "operational_metrics" {
+  metadata {
+    name = "operational-metrics"
+  }
+  provider = kubernetes.llvm-premerge-us-central
+}
+
+resource "kubernetes_service_account" "operational_metrics_ksa" {
+  metadata {
+    name      = "operational-metrics-ksa"
+    namespace = "operational-metrics"
+    annotations = {
+      "iam.gke.io/gcp-service-account" = google_service_account.operational_metrics_gsa.email
+    }
+  }
+
+  # Must target the same cluster as the operational-metrics namespace.
+  provider   = kubernetes.llvm-premerge-us-central
+  depends_on = [kubernetes_namespace.operational_metrics]
+}
+
+resource "google_service_account_iam_binding" "workload_identity_binding" {
+  service_account_id = google_service_account.operational_metrics_gsa.name
+  role               = "roles/iam.workloadIdentityUser"
+
+  members = [
+    "serviceAccount:${google_service_account.operational_metrics_gsa.project}.svc.id.goog[operational-metrics/operational-metrics-ksa]",
+  ]
+
+  depends_on = [
+    google_service_account.operational_metrics_gsa,
+    kubernetes_service_account.operational_metrics_ksa,
+  ]
+}
+
+# The container for scraping LLVM commits needs persistent storage
+# for a local check-out of llvm/llvm-project
+resource "kubernetes_persistent_volume_claim" "operational_metrics_pvc" {
+  metadata {
+    name      = "operational-metrics-pvc"
+    namespace = "operational-metrics"
+  }
+
+  spec {
+    access_modes = ["ReadWriteOnce"]
+    resources {
+      requests = {
+        storage = "20Gi"
+      }
+    }
+    storage_class_name = "standard-rwo"
+  }
+
+  # GKE's standard-rwo class binds on first consumer, and no pod mounts this
+  # claim until the CronJob first fires, so waiting for Bound stalls apply.
+  wait_until_bound = false
+
+  provider   = kubernetes.llvm-premerge-us-central
+  depends_on = [kubernetes_namespace.operational_metrics]
+}
+
+resource "kubernetes_secret" "operational_metrics_secrets" {
+  metadata {
+    name      = "operational-metrics-secrets"
+    namespace = "operational-metrics"
+  }
+
+  data = {
+    "github-token"           = data.google_secret_manager_secret_version.metrics_github_pat.secret_data
+    "grafana-api-key"        = data.google_secret_manager_secret_version.metrics_grafana_api_key.secret_data
+    "grafana-metrics-userid" = data.google_secret_manager_secret_version.metrics_grafana_metrics_userid.secret_data
+  }
+
+  type       = "Opaque"
+  provider   = kubernetes.llvm-premerge-us-central
+  depends_on = [kubernetes_namespace.operational_metrics]
+}
+
+resource "kubernetes_manifest" "operational_metrics_cronjob" {
+  manifest = yamldecode(file("operational_metrics_cronjob.yaml"))
+  provider = kubernetes.llvm-premerge-us-central
+
+  depends_on = [
+    kubernetes_namespace.operational_metrics,
+    kubernetes_persistent_volume_claim.operational_metrics_pvc,
+    kubernetes_secret.operational_metrics_secrets,
+    kubernetes_service_account.operational_metrics_ksa,
+  ]
+}
diff --git a/premerge/operational_metrics_cronjob.yaml b/premerge/operational_metrics_cronjob.yaml
new file mode 100644
index 00000000..6673d0f1
--- /dev/null
+++ b/premerge/operational_metrics_cronjob.yaml
@@ -0,0 +1,52 @@
+# operational_metrics_cronjob.yaml
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: operational-metrics-cronjob
+  namespace: operational-metrics
+spec:
+  # Midnight PDT
+  schedule: "0 7 * * *"
+  timeZone: "Etc/UTC"
+  concurrencyPolicy: Forbid
+  jobTemplate:
+    spec:
+      template:
+        spec:
+          serviceAccountName: operational-metrics-ksa
+          nodeSelector:
+            iam.gke.io/gke-metadata-server-enabled: "true"
+          volumes:
+            - name: metrics-volume
+              persistentVolumeClaim:
+                claimName: operational-metrics-pvc
+          containers:
+            - name: process-llvm-commits
+              image: ghcr.io/llvm/operations-metrics:latest
+              env:
+                - name: GITHUB_TOKEN
+                  valueFrom:
+                    secretKeyRef:
+                      name: operational-metrics-secrets
+                      key: github-token
+                - name: GRAFANA_API_KEY
+                  valueFrom:
+                    secretKeyRef:
+                      name: operational-metrics-secrets
+                      key: grafana-api-key
+                - name: GRAFANA_METRICS_USERID
+                  valueFrom:
+                    secretKeyRef:
+                      name: operational-metrics-secrets
+                      key: grafana-metrics-userid
+              volumeMounts:
+                - name: metrics-volume
+                  mountPath: "/data"
+              resources:
+                requests:
+                  cpu: "250m"
+                  memory: "256Mi"
+                limits:
+                  cpu: "1"
+                  memory: "512Mi"
+          restartPolicy: OnFailure