feat: allow instrumentation of Termination lambda (#1255)

kayman-mk · web-flow · commit 55af1d16bdf8 · 2025-02-27T11:45:46.000+01:00
## Description

This PR adds the possibility to instrument the internal Lambda function,
e.g. with APM tools. Use the following variables:

- `runner_terminate_ec2_lambda_handler` to replace the `handler` with
your function
- `runner_terminate_ec2_environment_variables` to add environment
variables. The special value `{HANDLER}` is automatically replaced by
the internal handler name to be able to call the "real" handler
- `runner_terminate_ec2_lambda_layer_arns` to add additional
layers to the Lambda function
- `runner_terminate_ec2_lambda_egress_rules` to allow traffic to
external systems. IPv4/6 port 443 is the default
diff --git a/docker_autoscaler_security_group.tf b/docker_autoscaler_security_group.tf
@@ -27,6 +27,8 @@ resource "aws_vpc_security_group_ingress_rule" "docker_autoscaler_ingress" {
   referenced_security_group_id = each.value.security_group
   cidr_ipv4                    = each.value.cidr_block
   cidr_ipv6                    = each.value.ipv6_cidr_block
+
+  tags = local.tags
 }
 
 resource "aws_vpc_security_group_ingress_rule" "docker_autoscaler_internal_traffic" {
@@ -38,6 +40,8 @@ resource "aws_vpc_security_group_ingress_rule" "docker_autoscaler_internal_traff
   ip_protocol                  = "-1"
   description                  = "Allow ALL Ingress traffic between Runner Manager and Docker-autoscaler workers security group"
   referenced_security_group_id = aws_security_group.runner.id
+
+  tags = local.tags
 }
 
 # Egress rules
@@ -55,4 +59,6 @@ resource "aws_vpc_security_group_egress_rule" "docker_autoscaler_egress" {
   referenced_security_group_id = each.value.security_group
   cidr_ipv4                    = each.value.cidr_block
   cidr_ipv6                    = each.value.ipv6_cidr_block
+
+  tags = local.tags
 }
diff --git a/docs/usage.md b/docs/usage.md
@@ -111,7 +111,7 @@ We have seen that the [fork](https://gitlab.com/cki-project/docker-machine/-/tre
 module is using consume more RAM using spot fleets. For comparison, if you launch 50 machines in the same time, it consumes
 ~1.2GB of RAM. In our case, we had to change the `instance_type` of the runner from `t3.micro` to `t3.small`.
 
-#### Configuration example
+#### Spot Fleet Configuration
 
 ```hcl
 module "runner" {
@@ -146,9 +146,11 @@ module "runner" {
 
 ### Scenario: Use of Docker autoscaler
 
-As docker machine is no longer maintained by docker, gitlab recently developed docker autoscaler to replace docker machine (still in beta). An option is available to test it out.
+As docker machine is no longer maintained by docker, gitlab recently developed docker autoscaler to replace docker machine
+(still in beta). An option is available to test it out.
 
-Tested with amazon-linux-2-x86 as runner manager and ubuntu-server-22-lts-x86 for runner worker. The following commands have been added to the original AMI for the runner worker for the docker-autoscaler to work correctly:
+Tested with amazon-linux-2-x86 as runner manager and ubuntu-server-22-lts-x86 for runner worker. The following commands have been
+added to the original AMI for the runner worker for the docker-autoscaler to work correctly:
 
 ```bash
 # Install docker
@@ -170,7 +172,7 @@ apt-get install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin do
 usermod -aG docker ubuntu
 ```
 
-#### Configuration example
+#### Docker Autoscaler Configuration
 
 ```hcl
 module "runner" {
@@ -253,9 +255,7 @@ If a KMS key is set via `kms_key_id`, make sure that you also give proper access
 get errors, e.g. the build cache can't be decrypted or logging via CloudWatch is not possible. For a CloudWatch
 example checkout [kms-policy.json](https://github.com/cattle-ops/terraform-aws-gitlab-runner/blob/main/policies/kms-policy.json)
 
-### Auto Scaling Group
-
-#### Scheduled scaling
+### Auto Scaling Group - Scheduled scaling
 
 When `runner_schedule_enable=true`, the `runner_schedule_config` block can be used to scale the Auto Scaling group.
 
@@ -281,7 +281,7 @@ module "runner" {
 }
 ```
 
-#### Graceful termination / Zero Downtime deployment
+### Graceful termination / Zero Downtime deployment
 
 This module supports zero-downtime deployments by following a structured process:
 
@@ -315,6 +315,26 @@ that executes a provided Lambda function when the runner is terminated to termin
 provisioned by the Docker Machine executor. a `builds/` directory relative to the root module persists that
 contains the packaged Lambda function.
 
+### Instrumenting the Graceful termination Lambda
+
+To instrument the Lambda function (e.g. with an APM tool), set the following variables:
+
+```hcl
+module "runner" {
+  # ...
+  runner_terminate_ec2_environment_variables = {
+    variable1   = "here"
+    variable2   = "are"
+    old_handler = "{HANDLER}" # automatically replaced by the module's built-in handler name
+  }
+  runner_terminate_ec2_lambda_egress_rules = {
+    # ... whatever you need, IPv4/IPv6 port 443 is the default
+  }
+  runner_terminate_ec2_lambda_handler    = "instrumented_handler.from.a.layer"
+  runner_terminate_ec2_lambda_layer_arns = ["arn:aws:lambda:us-east-1:123456789012:layer:instrumented_handler:1"]
+}
+```
+
 ### Access the Runner instance
 
 A few option are provided to access the runner instance:
diff --git a/main.tf b/main.tf
@@ -379,6 +379,12 @@ module "terminate_agent_hook" {
   role_permissions_boundary              = var.iam_permissions_boundary == "" ? null : "arn:${data.aws_partition.current.partition}:iam::${data.aws_caller_identity.current.account_id}:policy/${var.iam_permissions_boundary}"
   kms_key_id                             = local.kms_key_arn
   asg_hook_terminating_heartbeat_timeout = local.runner_worker_graceful_terminate_heartbeat_timeout
+  environment_variables                  = var.runner_terminate_ec2_environment_variables
+  lambda_handler                         = var.runner_terminate_ec2_lambda_handler
+  layer_arns                             = var.runner_terminate_ec2_lambda_layer_arns
+  egress_rules                           = var.runner_terminate_ec2_lambda_egress_rules
+  vpc_id                                 = var.vpc_id
+  subnet_id                              = var.subnet_id
 
   tags = local.tags
 }
diff --git a/modules/terminate-agent-hook/iam.tf b/modules/terminate-agent-hook/iam.tf
@@ -37,7 +37,7 @@ resource "aws_iam_role" "lambda" {
 data "aws_iam_policy_document" "lambda" {
   # checkov:skip=CKV_AWS_111:Write access is limited to the resources needed
   statement {
-    sid = "allow kms access"
+    sid = "AllowKmsAccess"
     actions = [
       "kms:Decrypt", # to decrypt the Lambda environment variables
     ]
@@ -167,3 +167,8 @@ resource "aws_iam_role_policy_attachment" "spot_request_housekeeping" {
   role       = aws_iam_role.lambda.name
   policy_arn = aws_iam_policy.spot_request_housekeeping.arn
 }
+
+resource "aws_iam_role_policy_attachment" "aws_lambda_vpc_access_execution_role" {
+  role       = aws_iam_role.lambda.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" # NOTE(review): partition is hard-coded to "aws"; presumably this breaks in aws-cn / aws-us-gov - confirm and consider data.aws_partition
+}
diff --git a/modules/terminate-agent-hook/locals.tf b/modules/terminate-agent-hook/locals.tf
@@ -0,0 +1,6 @@
+locals {
+  original_lambda_handler = "lambda_function.handler" # the handler value this function used before it became configurable
+  lambda_handler          = var.lambda_handler != null ? var.lambda_handler : local.original_lambda_handler # caller override wins, otherwise the built-in handler
+  # Substitute every "{HANDLER}" placeholder in the user-supplied values with the built-in handler name, so an instrumenting wrapper can call the "real" handler.
+  replaced_environment_variables = { for key, value in var.environment_variables : key => replace(value, "{HANDLER}", local.original_lambda_handler) }
+}
diff --git a/modules/terminate-agent-hook/main.tf b/modules/terminate-agent-hook/main.tf
@@ -15,23 +15,48 @@ data "archive_file" "terminate_runner_instances_lambda" {
   output_file_mode = "0666"
 }
 
+resource "aws_security_group" "terminate_runner_instances" {
+  name        = "${var.environment}-${var.name}"
+  description = "Allowing access to external services for the terminate runner instances lambda"
+
+  vpc_id = var.vpc_id
+
+  tags = var.tags
+}
+
+resource "aws_vpc_security_group_egress_rule" "docker_autoscaler_egress" { # NOTE(review): label looks copy-pasted; these are the terminate Lambda's egress rules
+  for_each = var.egress_rules
+
+  security_group_id = aws_security_group.terminate_runner_instances.id
+
+  from_port   = each.value.from_port
+  to_port     = each.value.to_port
+  ip_protocol = each.value.protocol
+
+  description                  = each.value.description
+  prefix_list_id               = each.value.prefix_list_id
+  referenced_security_group_id = each.value.security_group
+  cidr_ipv4                    = each.value.cidr_block
+  cidr_ipv6                    = each.value.ipv6_cidr_block
+
+  tags = var.tags
+}
+
 # tracing functions can be activated by the user
 # tfsec:ignore:aws-lambda-enable-tracing
 # kics-scan ignore-line
 resource "aws_lambda_function" "terminate_runner_instances" {
   #ts:skip=AC_AWS_0485:Tracing functions can be activated by the user
-  #ts:skip=AC_AWS_0486 There is no need to run this lambda in our VPC
   # checkov:skip=CKV_AWS_50:Tracing functions can be activated by the user
   # checkov:skip=CKV_AWS_115:We do not assign a reserved concurrency as this function can't be called by users
   # checkov:skip=CKV_AWS_116:We should think about having a dead letter queue for this lambda
-  # checkov:skip=CKV_AWS_117:There is no need to run this lambda in our VPC
   # checkov:skip=CKV_AWS_272:Code signing would be a nice enhancement, but I guess we can live without it here
   architectures    = ["x86_64"]
   description      = "Lifecycle hook for terminating GitLab runner agent instances"
   filename         = data.archive_file.terminate_runner_instances_lambda.output_path
   source_code_hash = data.archive_file.terminate_runner_instances_lambda.output_base64sha256
   function_name    = "${var.environment}-${var.name}"
-  handler          = "lambda_function.handler"
+  handler          = local.lambda_handler
   memory_size      = 128
   package_type     = "Zip"
   publish          = true
@@ -40,12 +65,17 @@ resource "aws_lambda_function" "terminate_runner_instances" {
   timeout          = var.timeout
   kms_key_arn      = var.kms_key_id
 
-  tags = var.tags
+  layers = [for layer_arn in var.layer_arns : layer_arn]
 
   environment {
-    variables = {
+    variables = merge({
       NAME_EXECUTOR_INSTANCE = var.name_docker_machine_runners
-    }
+    }, local.replaced_environment_variables)
+  }
+
+  vpc_config {
+    security_group_ids = [aws_security_group.terminate_runner_instances.id]
+    subnet_ids         = [var.subnet_id]
   }
 
   dynamic "tracing_config" {
@@ -55,6 +85,8 @@ resource "aws_lambda_function" "terminate_runner_instances" {
       mode = "Passthrough"
     }
   }
+
+  tags = var.tags
 }
 
 resource "aws_lambda_permission" "current_version_triggers" {
diff --git a/modules/terminate-agent-hook/variables.tf b/modules/terminate-agent-hook/variables.tf
@@ -77,3 +77,45 @@ variable "asg_hook_terminating_heartbeat_timeout" {
     error_message = "AWS only supports heartbeat timeout in the range of 30 to 7200."
   }
 }
+
+variable "environment_variables" {
+  description = "Environment variables to set for the Lambda function. The placeholder `{HANDLER}` inside a value is replaced with the name of the built-in handler of the Lambda function."
+  type        = map(string)
+  default     = {}
+}
+
+variable "layer_arns" {
+  description = "A list of ARNs of Lambda layers to attach to the Lambda function."
+  type        = list(string)
+  default     = []
+}
+
+variable "lambda_handler" {
+  description = "The entry point for the Lambda function."
+  type        = string
+  default     = null
+}
+
+variable "vpc_id" {
+  description = "The VPC used for the runner and runner workers."
+  type        = string
+}
+
+variable "subnet_id" {
+  type        = string
+  description = "The subnet for the lambda function."
+}
+
+variable "egress_rules" {
+  description = "Map of egress rules for the Lambda function."
+  type = map(object({
+    from_port       = optional(number, null)
+    to_port         = optional(number, null)
+    protocol        = string
+    description     = string
+    cidr_block      = optional(string, null)
+    ipv6_cidr_block = optional(string, null)
+    prefix_list_id  = optional(string, null)
+    security_group  = optional(string, null)
+  }))
+}
diff --git a/variables.tf b/variables.tf
@@ -447,6 +447,73 @@ variable "runner_terminate_ec2_timeout_duration" {
   default     = 90
 }
 
+variable "runner_terminate_ec2_environment_variables" {
+  description = "Environment variables to set for the Lambda function. The placeholder `{HANDLER}` inside a value is replaced with the name of the built-in handler of the Lambda function."
+  type        = map(string)
+  default     = {}
+}
+
+variable "runner_terminate_ec2_lambda_handler" {
+  description = "The handler for the terminate Lambda function."
+  type        = string
+  default     = null
+}
+
+variable "runner_terminate_ec2_lambda_layer_arns" {
+  description = "A list of ARNs of Lambda layers to attach to the Lambda function."
+  type        = list(string)
+  default     = []
+}
+
+variable "runner_terminate_ec2_lambda_egress_rules" {
+  description = "Map of egress rules for the Lambda function."
+  type = map(object({
+    from_port       = optional(number, null)
+    to_port         = optional(number, null)
+    protocol        = string
+    description     = string
+    cidr_block      = optional(string, null)
+    ipv6_cidr_block = optional(string, null)
+    prefix_list_id  = optional(string, null)
+    security_group  = optional(string, null)
+  }))
+  default = {
+    allow_https_ipv4 = {
+      cidr_block  = "0.0.0.0/0"
+      from_port   = 443
+      to_port     = 443
+      protocol    = "tcp"
+      description = "Allow HTTPS egress traffic to all destinations (IPv4)"
+    },
+    allow_https_ipv6 = {
+      ipv6_cidr_block = "::/0"
+      from_port       = 443
+      to_port         = 443
+      protocol        = "tcp"
+      description     = "Allow HTTPS egress traffic to all destinations (IPv6)"
+    }
+  }
+
+  validation {
+    condition = alltrue([
+      for rule in values(var.runner_terminate_ec2_lambda_egress_rules) :
+      contains(["-1", "tcp", "udp", "icmp", "icmpv6"], rule.protocol)
+    ])
+    error_message = "Protocol must be '-1', 'tcp', 'udp', 'icmp', or 'icmpv6'."
+  }
+
+  validation {
+    condition = alltrue([
+      for rule in values(var.runner_terminate_ec2_lambda_egress_rules) :
+      (rule.cidr_block != null) ||
+      (rule.ipv6_cidr_block != null) ||
+      (rule.prefix_list_id != null) ||
+      (rule.security_group != null)
+    ])
+    error_message = "At least one destination must be specified."
+  }
+}
+
 /*
  * Runner Worker: The process created by the Runner on the host computing platform to run jobs.
  */