Skip to content

Commit 5da51bb

Browse files
Autoscale crater instances
Crater no longer requires each instance to be uniquely identified in order to keep assigned crates from being duplicated across instances, which means that we can fairly directly copy the stateless configuration onto more instances. This commit adds a CPU-based autoscaler to the GCP managed instance group, and configures a systemd timer via the startup script which will update the crater-agent on the instances. This continues building upon the startup scripts rather than pursuing Ansible automation (perhaps backed by Packer or similar) because the scripts are both pretty simple and it doesn't seem worth the fairly high complexity cost Ansible and Packer would bring at this time. The current autoscaling exists so that, in theory, we shut down instances during idle periods (if we have them), but in practice that's likely more of an opportunistic nice-to-have. Eventually it might allow us to burst up to finish work much faster and then go idle, but it's not clear whether we want slow but constant throughput, or fast throughput that is sometimes unavailable until the next month. Something to continue thinking about.
1 parent d4c0ed7 commit 5da51bb

File tree

3 files changed

+65
-2
lines changed

3 files changed

+65
-2
lines changed

terraform/crater/agent.tf

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,10 @@ resource "google_compute_instance_template" "agent" {
159159
role = aws_iam_role.agent.arn,
160160
docker_url = module.ecr.url
161161
})
162+
update-script = templatefile("update.sh", {
163+
role = aws_iam_role.agent.arn,
164+
docker_url = module.ecr.url
165+
})
162166
}
163167

164168
service_account {
@@ -171,6 +175,23 @@ resource "google_compute_instance_template" "agent" {
171175
}
172176
}
173177

# CPU-based autoscaler for the crater agent managed instance group.
resource "google_compute_region_autoscaler" "agents" {
  name   = "crater-autoscaler"
  target = google_compute_region_instance_group_manager.agents.id

  autoscaling_policy {
    min_replicas    = 1
    max_replicas    = 3
    cooldown_period = 120

    # The utilization target is deliberately very low: crater is either
    # fully busy or fully idle, so any real load should scale the group out
    # to max_replicas, and only an entirely idle group should scale back in.
    cpu_utilization {
      target = 0.1
    }
  }
}
193+
194+
174195
resource "google_compute_region_instance_group_manager" "agents" {
175196
name = "crater-agents"
176197

@@ -180,8 +201,6 @@ resource "google_compute_region_instance_group_manager" "agents" {
180201
instance_template = google_compute_instance_template.agent.id
181202
}
182203

183-
target_size = 1
184-
185204
auto_healing_policies {
186205
health_check = google_compute_health_check.tcp_health.id
187206
initial_delay_sec = 600

terraform/crater/startup-script.sh

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,15 @@ docker pull ${docker_url}
3737

3838
mkdir -p /var/lib/crater-agent-workspace
3939

40+
curl http://metadata.google.internal/computeMetadata/v1/instance/attributes/update-script \
41+
-o /opt/update.sh \
42+
-H "Metadata-Flavor: Google"
43+
44+
chmod +x /opt/update.sh
45+
46+
# Run update task every 5 minutes
47+
sudo systemd-run --unit crater-agent-update --on-calendar='*:0/5' /opt/update.sh
48+
4049
systemd-run \
4150
--unit crater-agent \
4251
docker run --init --rm --name crater-agent \

terraform/crater/update.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
#!/bin/bash
#
# Periodic updater for the crater agent.
#
# Exchanges the instance's GCP identity token for temporary AWS credentials,
# logs Docker in to ECR, pulls the agent image, and restarts the crater-agent
# unit when the pull changed the local image ID.
#
# This file is rendered by Terraform's templatefile(): the role and
# docker_url placeholders are substituted at render time. Every other
# expansion below is deliberately written without braces so that the
# template engine leaves it alone.

set -euo pipefail

# Hold the STS response in a private temp file rather than a fixed name in
# the current directory, and remove it on every exit path (including errors).
credentials="$(mktemp)"
trap 'rm -f -- "$credentials"' EXIT

# Fetch the identity token as its own step: --fail turns HTTP errors into a
# non-zero exit, and a failed assignment aborts the script under `set -e`
# (a failure inside a command argument would not).
identity_token="$(curl --fail --silent --show-error \
  -H "Metadata-Flavor: Google" \
  'http://metadata/computeMetadata/v1/instance/service-accounts/default/identity?audience=aws')"

aws sts assume-role-with-web-identity \
  --role-arn "${role}" \
  --role-session-name "$(hostname)" \
  --duration-seconds 900 \
  --web-identity-token "$identity_token" \
  > "$credentials"

# Assign first, export afterwards, so a jq failure is not masked by `export`.
AWS_ACCESS_KEY_ID="$(jq -r .Credentials.AccessKeyId "$credentials")"
AWS_SECRET_ACCESS_KEY="$(jq -r .Credentials.SecretAccessKey "$credentials")"
AWS_SESSION_TOKEN="$(jq -r .Credentials.SessionToken "$credentials")"
export AWS_ACCESS_KEY_ID AWS_SECRET_ACCESS_KEY AWS_SESSION_TOKEN

# The raw credentials are in the environment now; drop the on-disk copy early.
rm -f -- "$credentials"

# Capture the login command before running it: `eval` of an empty string
# succeeds, so evaluating the substitution directly would hide a failed
# `aws ecr get-login`.
# NOTE(review): `get-login` only exists in AWS CLI v1 and passes the password
# on the docker command line; consider `aws ecr get-login-password` piped to
# `docker login --password-stdin` once the installed CLI version is confirmed.
login_cmd="$(aws ecr get-login --no-include-email --region us-west-1)"
eval "$login_cmd"

# Compare the local image ID before and after the pull to detect an update.
old_id="$(docker images --format '{{.ID}}' "${docker_url}")"
docker pull "${docker_url}"
new_id="$(docker images --format '{{.ID}}' "${docker_url}")"

if [[ "$old_id" != "$new_id" ]]; then
  echo "restarting container..."
  sudo systemctl restart crater-agent
fi

0 commit comments

Comments
 (0)