Skip to content

Commit 8fc7c16

Browse files
committed
Merge branch 'main' into chore/check-template-exists
2 parents 5c97dff + 9006f0f commit 8fc7c16

File tree

24 files changed

+767
-31
lines changed

24 files changed

+767
-31
lines changed

Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ build-and-upload:build-and-upload/orchestrator
6969
build-and-upload:build-and-upload/template-manager
7070
build-and-upload:build-and-upload/envd
7171
build-and-upload:build-and-upload/clickhouse-migrator
72+
build-and-upload:build-and-upload/nomad-nodepool-apm
7273
build-and-upload/clean-nfs-cache:
7374
./scripts/confirm.sh $(TERRAFORM_ENVIRONMENT)
7475
GCP_PROJECT_ID=$(GCP_PROJECT_ID) $(MAKE) -C packages/orchestrator build-and-upload/clean-nfs-cache
@@ -183,6 +184,13 @@ local-infra:
183184
docker compose --file ./packages/local-dev/docker-compose.yaml up --abort-on-container-failure
184185

185186

187+
# Migration helper: detach the old template-manager-system job from Terraform state.
# Runs ./scripts/confirm.sh with $(TERRAFORM_ENVIRONMENT) first (presumably an
# interactive confirmation -- verify against the script), then delegates to the
# target of the same name in iac/provider-gcp.
# TODO: Remove after template-manager migration is complete
migrate-template-manager-detach:
	./scripts/confirm.sh $(TERRAFORM_ENVIRONMENT)
	$(MAKE) -C iac/provider-gcp $@
.PHONY: migrate-template-manager-detach
193+
186194
# TODO 2025-12-29: [ENG-3410] - Remove after migration period (14 days)
187195
define env_var_or_default
188196
$(if $(value $(strip $(1))),$($(strip $(1))),$(2))

go.work

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use (
88
./packages/docker-reverse-proxy
99
./packages/envd
1010
./packages/local-dev
11+
./packages/nomad-nodepool-apm
1112
./packages/orchestrator
1213
./packages/shared
1314

iac/provider-gcp/Makefile

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,14 @@ provider-login:
162162
gcloud --quiet auth configure-docker "$(GCP_REGION)-docker.pkg.dev"
163163
gcloud --quiet auth application-default login
164164

165+
# Migration helper: drop the old template-manager-system job from Terraform state.
# `state rm` only makes Terraform forget the resource; the Nomad job keeps running
# and is simply no longer managed by Terraform.
# The trailing `|| true` lets the target succeed even when `state rm` fails
# (e.g. the resource is already gone from state). NOTE(review): it hides every
# other `state rm` failure as well, so check the command output.
# TODO: Remove after template-manager migration is complete
migrate-template-manager-detach:
	@ printf "Detaching old template-manager-system from Terraform state for env: `tput setaf 2``tput bold`$(ENV)`tput sgr0`\n"
	$(tf_vars) $(TF) state rm module.nomad.nomad_job.template_manager || true
.PHONY: migrate-template-manager-detach
172+
165173
# TODO 2025-12-29: [ENG-3410] - Remove after migration period (14 days)
166174
.PHONY: migrate-clusters-terraform
167175
migrate-clusters-terraform:
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
# Nomad Autoscaler agent, run as a single-instance service job.
#
# This file is rendered with Terraform's templatefile(); the caller supplies
# node_pool, autoscaler_version, bucket_name, nomad_token and
# nomad_nodepool_apm_checksum. Expressions in double curly braces are Nomad
# template expressions and are resolved on the client at runtime.
job "nomad-autoscaler" {
  type      = "service"
  node_pool = "${node_pool}"

  group "autoscaler" {
    count = 1

    network {
      # Dynamically allocated port; the agent's HTTP health API binds to it.
      port "http" {}
    }

    task "autoscaler" {
      driver = "raw_exec"

      config {
        command = "/bin/bash"
        # The plugin artifact is fetched in "file" mode, so it is made executable
        # before the agent starts. `exec` replaces the bash wrapper with the agent
        # process, so signals sent to the task (stop, restart) reach the agent
        # itself instead of the wrapper shell.
        args = ["-c", "chmod +x local/plugins/nomad-nodepool-apm && exec ./local/nomad-autoscaler agent -config local/config.hcl -plugin-dir local/plugins"]
      }

      # Upstream autoscaler release archive, unpacked into local/.
      artifact {
        source      = "https://releases.hashicorp.com/nomad-autoscaler/${autoscaler_version}/nomad-autoscaler_${autoscaler_version}_linux_amd64.zip"
        destination = "local"
      }

      # Custom nodepool APM plugin
      artifact {
        source      = "gcs::https://www.googleapis.com/storage/v1/${bucket_name}/nomad-nodepool-apm"
        destination = "local/plugins/nomad-nodepool-apm"
        mode        = "file"
        options {
          # The download is verified against this MD5 checksum supplied by the caller.
          checksum = "md5:${nomad_nodepool_apm_checksum}"
        }
      }

      # Agent configuration, written to local/config.hcl (the path passed via -config).
      template {
        data        = <<-EOF
          # Nomad Autoscaler configuration

          nomad {
            address = "http://{{ env "NOMAD_IP_http" }}:4646"
            token   = "${nomad_token}"
          }

          # Enable the HTTP health API
          http {
            bind_address = "0.0.0.0"
            bind_port    = {{ env "NOMAD_PORT_http" }}
          }

          # Policy configuration
          # Policies are defined in Nomad job scaling blocks, not files
          policy {
            default_cooldown = "2m"
          }

          # Plugin directory for external plugins
          plugin_dir = "local/plugins"

          # APM plugins configuration - custom plugin for node pool count
          apm "nomad-nodepool-apm" {
            driver = "nomad-nodepool-apm"
            config = {
              nomad_address = "http://{{ env "NOMAD_IP_http" }}:4646"
              nomad_token   = "${nomad_token}"
            }
          }

          # Use built-in nomad-target for job scaling (no config needed, uses nomad block above)
        EOF
        destination = "local/config.hcl"
      }

      resources {
        cpu    = 256
        memory = 256
      }

      # Registered with Nomad's native service discovery; health is probed over
      # the agent's HTTP health API.
      service {
        name     = "nomad-autoscaler"
        port     = "http"
        provider = "nomad"

        check {
          type     = "http"
          path     = "/v1/health"
          interval = "10s"
          timeout  = "2s"
        }
      }
    }
  }
}
93+

iac/provider-gcp/nomad/jobs/template-manager.hcl

Lines changed: 42 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,49 @@
1-
job "template-manager-system" {
2-
type = "system"
1+
job "template-manager" {
2+
type = "service"
33
node_pool = "${node_pool}"
4-
priority = 70
4+
priority = 75
5+
6+
group "template-manager" {
7+
# Count is fetched from current Nomad state to preserve autoscaler-managed value
8+
count = ${current_count}
9+
10+
# Ensure one allocation per node (like a system job)
11+
constraint {
12+
operator = "distinct_hosts"
13+
value = "true"
14+
}
515

6-
# https://developer.hashicorp.com/nomad/docs/job-specification/update
716
%{ if update_stanza }
8-
update {
9-
max_parallel = 1 # Update only 1 node at a time
10-
}
11-
%{ endif }
17+
# Scaling policy to match node count in the pool
18+
# Uses the nomad-nodepool APM plugin
19+
scaling {
20+
enabled = true
21+
min = 2
22+
max = 10000 # Effectively unlimited
1223

13-
group "template-manager" {
24+
policy {
25+
evaluation_interval = "10s"
26+
cooldown = "2m"
27+
28+
check "match_node_count" {
29+
source = "nomad-nodepool-apm"
30+
query = "${node_pool}"
31+
32+
strategy "pass-through" {}
33+
}
34+
}
35+
}
36+
37+
# Rolling update configuration for service jobs
38+
# https://developer.hashicorp.com/nomad/docs/job-specification/update
39+
update {
40+
max_parallel = 1
41+
min_healthy_time = "10s"
42+
healthy_deadline = "2m"
43+
progress_deadline = "80m" # Must be > healthy_deadline and > kill_timeout
44+
auto_revert = false
45+
}
46+
%{ endif }
1447

1548
// Try to restart the task indefinitely
1649
// Tries to restart every 5 seconds

iac/provider-gcp/nomad/main.tf

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,10 +463,25 @@ data "external" "template_manager" {
463463
}
464464
}
465465

466+
# Reads the live count of the template-manager job from Nomad so that the value
# set by the autoscaler survives a Terraform-driven job update instead of being reset.
# min_count is handed to the script as "2" when template_manages_clusters_size_gt_1
# is true (scaling enabled) and "1" otherwise.
data "external" "template_manager_count" {
  query = {
    job_name    = "template-manager"
    min_count   = var.template_manages_clusters_size_gt_1 ? "2" : "1"
    nomad_addr  = "https://nomad.${var.domain_name}"
    nomad_token = var.nomad_acl_token_secret
  }

  program = ["bash", "${path.module}/scripts/get-nomad-job-count.sh"]
}
479+
466480
resource "nomad_job" "template_manager" {
467481
jobspec = templatefile("${path.module}/jobs/template-manager.hcl", {
468482
update_stanza = var.template_manages_clusters_size_gt_1
469483
node_pool = var.builder_node_pool
484+
current_count = tonumber(data.external.template_manager_count.result.count)
470485

471486
gcp_project = var.gcp_project_id
472487
gcp_region = var.gcp_region
@@ -493,6 +508,37 @@ resource "nomad_job" "template_manager" {
493508
shared_chunk_cache_path = var.shared_chunk_cache_path
494509
})
495510
}
511+
512+
# Looks up the nomad-nodepool-apm object in the env pipeline bucket.
# Only evaluated when template_manages_clusters_size_gt_1 is true.
data "google_storage_bucket_object" "nomad_nodepool_apm" {
  count = var.template_manages_clusters_size_gt_1 ? 1 : 0

  bucket = var.fc_env_pipeline_bucket_name
  name   = "nomad-nodepool-apm"
}
518+
519+
# Feeds the bucket object's md5hash to scripts/checksum.sh as `base64`; the
# script's `hex` result is what the autoscaler jobspec consumes.
# Only evaluated when template_manages_clusters_size_gt_1 is true.
data "external" "nomad_nodepool_apm_checksum" {
  count = var.template_manages_clusters_size_gt_1 ? 1 : 0

  query = {
    base64 = data.google_storage_bucket_object.nomad_nodepool_apm[0].md5hash
  }

  program = ["bash", "${path.module}/scripts/checksum.sh"]
}
528+
529+
# Nomad Autoscaler job - needed for dynamic scaling of template-manager.
# Deployed only when template_manages_clusters_size_gt_1 is true; the jobspec is
# rendered from jobs/nomad-autoscaler.hcl with the values below.
resource "nomad_job" "nomad_autoscaler" {
  count = var.template_manages_clusters_size_gt_1 ? 1 : 0

  jobspec = templatefile("${path.module}/jobs/nomad-autoscaler.hcl", {
    autoscaler_version          = var.nomad_autoscaler_version
    bucket_name                 = var.fc_env_pipeline_bucket_name
    node_pool                   = var.api_node_pool
    nomad_nodepool_apm_checksum = data.external.nomad_nodepool_apm_checksum[0].result.hex
    nomad_token                 = var.nomad_acl_token_secret
  })
}
541+
496542
resource "nomad_job" "loki" {
497543
jobspec = templatefile("${path.module}/jobs/loki.hcl", {
498544
gcp_zone = var.gcp_zone
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
# Get current job count from Nomad API to preserve autoscaler-managed values.
# This prevents Terraform from resetting count on job updates.
#
# Intended as the program of a Terraform `external` data source:
#   stdin : JSON object with nomad_addr, nomad_token, job_name, min_count
#   stdout: JSON object {"count": "<n>"}, where n is never below min_count
#   exit  : non-zero (with a message on stderr) on any failure
#
# IMPORTANT: This script fails on Nomad API errors (network, auth, TLS) to prevent
# accidental scale-downs. Only a 404 (job not found) falls back to min_count.
#
# Based on: https://registry.terraform.io/providers/hashicorp/external/latest/docs/data-sources/external#processing-json-in-shell-scripts

set -euo pipefail

# Extract arguments from the input into shell variables.
eval "$(jq -r '@sh "ADDR=\(.nomad_addr) TOKEN=\(.nomad_token) JOB=\(.job_name) MIN=\(.min_count)"')"

# MIN is used in numeric comparisons below and may be emitted as the result,
# so reject anything that is not a plain non-negative integer up front.
if ! [[ "$MIN" =~ ^[0-9]+$ ]]; then
  echo "ERROR: min_count must be a non-negative integer (got: '$MIN')" >&2
  exit 1
fi

# Fetch job info and capture HTTP status code.
# With `set -e`, a failing command substitution would abort the script before the
# exit status could be inspected, so the status is captured through `||`.
CURL_EXIT=0
RESPONSE=$(curl -s -w "\n---HTTP_STATUS:%{http_code}" -H "X-Nomad-Token: $TOKEN" \
  "$ADDR/v1/job/$JOB" 2>&1) || CURL_EXIT=$?

# Handle curl-level failures (network, TLS, DNS, etc.)
if [ "$CURL_EXIT" -ne 0 ]; then
  echo "ERROR: Failed to connect to Nomad API at $ADDR (curl exit code: $CURL_EXIT)" >&2
  echo "This may indicate a network issue, TLS error, or DNS failure." >&2
  echo "Refusing to proceed to prevent accidental scale-down." >&2
  exit 1
fi

# Extract HTTP code and body.
# sed is used instead of grep: grep exits non-zero when nothing matches, which
# under `set -e -o pipefail` would end the script silently (e.g. on an empty body).
HTTP_CODE=$(printf '%s\n' "$RESPONSE" | sed -n 's/^---HTTP_STATUS://p' | tail -n 1)
BODY=$(printf '%s\n' "$RESPONSE" | sed '/^---HTTP_STATUS:/d')

# Handle HTTP error responses
if [ -z "$HTTP_CODE" ]; then
  echo "ERROR: Could not determine HTTP status code from Nomad API response" >&2
  exit 1
fi

if [ "$HTTP_CODE" = "404" ]; then
  # Job doesn't exist yet - use minimum count
  COUNT="$MIN"
elif [ "$HTTP_CODE" -ge 200 ] && [ "$HTTP_CODE" -lt 300 ]; then
  # Success - parse the count from response.
  # A jq failure (e.g. non-JSON body) must reach the error message below rather
  # than tripping `set -e`, so it is mapped to an empty COUNT.
  COUNT=$(printf '%s\n' "$BODY" | jq -r '.TaskGroups[0].Count // empty' 2>/dev/null) || COUNT=""
  if ! [[ "$COUNT" =~ ^[0-9]+$ ]]; then
    echo "ERROR: Failed to parse job count from Nomad API response" >&2
    echo "Response body: $BODY" >&2
    exit 1
  fi
else
  # Any other HTTP error (403, 500, 502, etc.) - fail to prevent bad state
  echo "ERROR: Nomad API returned HTTP $HTTP_CODE for job/$JOB" >&2
  echo "Response: $BODY" >&2
  echo "Refusing to proceed to prevent accidental scale-down." >&2
  exit 1
fi

# Ensure COUNT is at least MIN
if [ "$COUNT" -lt "$MIN" ]; then
  COUNT="$MIN"
fi

# Safely produce a JSON object containing the result value.
jq -n --arg count "$COUNT" '{"count":$count}'
64+

iac/provider-gcp/nomad/variables.tf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,12 @@ variable "template_manages_clusters_size_gt_1" {
290290
type = bool
291291
}
292292

293+
# Nomad Autoscaler release to deploy; defaults to 0.4.5 when the caller sets nothing.
variable "nomad_autoscaler_version" {
  description = "Version of the Nomad Autoscaler to deploy"
  default     = "0.4.5"
  type        = string
}
298+
293299
# Redis
294300
variable "redis_port" {
295301
type = object({

packages/api/go.mod

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ require (
4545
github.com/golang-jwt/jwt/v5 v5.2.2
4646
github.com/golang/protobuf v1.5.4
4747
github.com/google/uuid v1.6.0
48-
github.com/hashicorp/nomad/api v0.0.0-20231208134655-099ee06a607c
48+
github.com/hashicorp/nomad/api v0.0.0-20240813123601-b34a6fe10b82
4949
github.com/jackc/pgx/v5 v5.7.4
5050
github.com/jellydator/ttlcache/v3 v3.4.0
5151
github.com/launchdarkly/go-sdk-common/v3 v3.3.0
@@ -164,6 +164,7 @@ require (
164164
github.com/mdlayher/socket v0.5.1 // indirect
165165
github.com/mfridman/interpolate v0.0.2 // indirect
166166
github.com/mitchellh/go-homedir v1.1.0 // indirect
167+
github.com/mitchellh/go-testing-interface v1.14.2-0.20210821155943-2d9075ca8770 // indirect
167168
github.com/mitchellh/mapstructure v1.5.1-0.20231216201459-8508981c8b6c // indirect
168169
github.com/moby/docker-image-spec v1.3.1 // indirect
169170
github.com/moby/go-archive v0.1.0 // indirect
@@ -195,6 +196,7 @@ require (
195196
github.com/segmentio/asm v1.2.0 // indirect
196197
github.com/sethvargo/go-retry v0.3.0 // indirect
197198
github.com/shirou/gopsutil/v4 v4.25.6 // indirect
199+
github.com/shoenig/test v1.8.2 // indirect
198200
github.com/shopspring/decimal v1.4.0 // indirect
199201
github.com/sirupsen/logrus v1.9.3 // indirect
200202
github.com/speakeasy-api/openapi-overlay v0.9.0 // indirect

packages/api/go.sum

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)