From 28a871b923bd731d8cd2c45ea00e10a6ef8398ee Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Mon, 27 Oct 2025 13:55:32 -0700 Subject: [PATCH 1/6] create sole tenancy nodes --- iac/provider-gcp/main.tf | 6 +- .../.terraform.lock.hcl | 1 + .../nomad-cluster/nodepool-client-isolated.tf | 137 ++++++++++++++++++ iac/provider-gcp/nomad-cluster/variables.tf | 12 ++ iac/provider-gcp/variables.tf | 15 ++ 5 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index ab2f6a8e81..67db525ff8 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -13,7 +13,7 @@ terraform { google = { source = "hashicorp/google" - version = "6.49.3" + version = "~> 6" } cloudflare = { @@ -98,6 +98,10 @@ module "cluster" { build_cluster_cache_disk_size_gb = var.build_cluster_cache_disk_size_gb build_cluster_cache_disk_type = var.build_cluster_cache_disk_type + client_node_type = var.client_node_type + isolated_client_cluster_size = var.isolated_client_cluster_size + isolated_client_cluster_size_max = var.isolated_client_cluster_size_max + api_cluster_size = var.api_cluster_size build_cluster_size = var.build_cluster_size clickhouse_cluster_size = var.clickhouse_cluster_size diff --git a/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl b/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl index 5ba92be9b1..a8adc24083 100644 --- a/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl +++ b/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl @@ -6,6 +6,7 @@ provider "registry.terraform.io/hashicorp/google" { constraints = "6.49.3" hashes = [ "h1:cU8PRPAD6+W3mCWROYqECv+JwmpDiXFDM8VDQmrsyC0=", + "h1:kOOq1McNjfbrTzNoKKT2sZEwNuzAZuAs6dJJ5O9lEfU=", "zh:0ecb3cda6763b671f74dc5a5f1a60a2a2134181bf8869bcafd2608912d0d2940", "zh:1d1b322559f2929baff32ef66c89ba82b7d7808956e05d00235a1efbc8cd5c86", 
"zh:4561e0b401f1760c954e8a59f0da4e8f9ee07b45bd4144ec5524716373f6f01f", diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf new file mode 100644 index 0000000000..e8ffdacd73 --- /dev/null +++ b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf @@ -0,0 +1,137 @@ +locals { + use_isolated_nodes = var.client_node_type != "" + + isolated_client_pool_name = "${var.prefix}${var.client_cluster_name}-isolated" +} + +resource "google_compute_node_template" "client" { + count = local.use_isolated_nodes ? 1 : 0 + name = "${local.isolated_client_pool_name}-node-template" + region = var.gcp_region + node_type = var.client_node_type + description = "Sole tenant node template for orchestrators" +} + +resource "google_compute_node_group" "client" { + count = local.use_isolated_nodes ? 1 : 0 + name = "${local.isolated_client_pool_name}-node-group" + zone = var.gcp_zone + description = "Sole tenant node group for orchestrators" + + initial_size = 1 + node_template = google_compute_node_template.client[0].id +} + +resource "google_compute_instance_template" "isolated_client" { + count = local.use_isolated_nodes ? 1 : 0 + + name_prefix = "${local.isolated_client_pool_name}-" + + instance_description = null + machine_type = var.client_machine_type + min_cpu_platform = var.min_cpu_platform + + labels = merge( + var.labels, + (var.environment != "dev" ? 
{ + goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" + } : {}) + ) + tags = [var.cluster_tag_name] + metadata_startup_script = local.client_startup_script + metadata = { + enable-osconfig = "TRUE", + enable-guest-attributes = "TRUE", + } + + scheduling { + on_host_maintenance = "MIGRATE" + } + + disk { + boot = true + source_image = data.google_compute_image.client_source_image.id + disk_size_gb = 300 + disk_type = "pd-ssd" + } + + disk { + auto_delete = true + boot = false + type = "PERSISTENT" + disk_size_gb = var.client_cluster_cache_disk_size_gb + disk_type = var.client_cluster_cache_disk_type + } + + network_interface { + network = var.network_name + + dynamic "access_config" { + for_each = ["public_ip"] + content {} + } + } + + # For a full list of oAuth 2.0 Scopes, see https://developers.google.com/identity/protocols/googlescopes + service_account { + email = var.google_service_account_email + scopes = [ + "userinfo-email", + "compute-ro", + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring.write", + "https://www.googleapis.com/auth/trace.append", + "https://www.googleapis.com/auth/cloud-platform" + ] + } + + # Per Terraform Docs (https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#using-with-instance-group-manager), + # we need to create a new instance template before we can destroy the old one. Note that any Terraform resource on + # which this Terraform resource depends will also need this lifecycle statement. + lifecycle { + create_before_destroy = true + } + + depends_on = [ + google_storage_bucket_object.setup_config_objects["scripts/run-nomad.sh"], + google_storage_bucket_object.setup_config_objects["scripts/run-consul.sh"] + ] +} + +resource "google_compute_region_instance_group_manager" "isolated_client_pool" { + count = local.use_isolated_nodes ? 
1 : 0 + + name = "${local.isolated_client_pool_name}-rig" + region = var.gcp_region + distribution_policy_zones = [var.gcp_zone] + + target_size = var.isolated_client_cluster_size < var.isolated_client_cluster_size_max ? null : var.isolated_client_cluster_size + + version { + name = google_compute_instance_template.isolated_client[0].id + instance_template = google_compute_instance_template.isolated_client[0].id + } + + auto_healing_policies { + health_check = google_compute_health_check.client_nomad_check.id + initial_delay_sec = 600 + } + + distribution_policy_target_shape = "EVEN" + + # Rollout of a new GCE Instance Template to the isolated client pool: proactive in dev, opportunistic elsewhere, + # replacing instances with a surge of up to 10 and at most 5 unavailable. + update_policy { + type = var.environment == "dev" ? "PROACTIVE" : "OPPORTUNISTIC" + minimal_action = "REPLACE" + max_surge_fixed = 10 + max_surge_percent = null + max_unavailable_fixed = 5 + max_unavailable_percent = null + replacement_method = "SUBSTITUTE" + instance_redistribution_type = "NONE" + } + + base_instance_name = local.isolated_client_pool_name + target_pools = [] +} diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index d799c676a0..3de391f12f 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -331,3 +331,15 @@ variable "api_nat_ips" { variable "api_nat_min_ports_per_vm" { type = number } + +variable "client_node_type" { + type = string +} + +variable "isolated_client_cluster_size" { + type = number +} + +variable "isolated_client_cluster_size_max" { + type = number +} diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 8cb041c5e1..78e63d46f9 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -491,3 +491,18 @@ variable "remote_repository_enabled" { description = "Set to true to enable remote repository cache. 
Can be set via TF_VAR_remote_repository_enabled or REMOTE_REPOSITORY_ENABLED env var." default = false } + +variable "client_node_type" { + type = string + default = "" +} + +variable "isolated_client_cluster_size" { + type = number + default = 1 +} + +variable "isolated_client_cluster_size_max" { + type = number + default = 1 +} From 49b317c76cc430607c1666bb7c74ecaaffe8d019 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Wed, 5 Nov 2025 17:31:36 -0800 Subject: [PATCH 2/6] add a test for terraform --- .github/workflows/pr-tests.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 3bf04ed6d7..f60c11210a 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -46,3 +46,24 @@ jobs: - name: Run tests working-directory: ${{ matrix.package }} run: go test -v ${{ matrix.test_path }} + + validate-iac: + name: Validate terraform + runs-on: ubuntu-24.04 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Parse .tool-versions + uses: wistia/parse-tool-versions@v2.1.1 + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "${{ env.TERRAFORM }}" + + - name: Validate terraform + working-directory: ./iac/provider-gcp + run: | + terraform init + terraform validate From 98939ef569942816f3223d0ba10c6336436b64a4 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Wed, 5 Nov 2025 17:33:52 -0800 Subject: [PATCH 3/6] disable the state backend don't need it to validate --- .github/workflows/pr-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index f60c11210a..3ec0341c4c 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -65,5 +65,5 @@ jobs: - name: Validate terraform working-directory: ./iac/provider-gcp run: | - terraform init + terraform init -backend=false terraform validate 
From 860823f60ca20a1a5c5f86d4feaf00718ca318fd Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Wed, 5 Nov 2025 17:42:47 -0800 Subject: [PATCH 4/6] fix to main, swap default variables --- .../nomad-cluster/nodepool-client-isolated.tf | 17 +++++++---- iac/provider-gcp/variables.tf | 30 ++----------------- 2 files changed, 14 insertions(+), 33 deletions(-) diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf index e8ffdacd73..154fe825d8 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf @@ -55,12 +55,17 @@ resource "google_compute_instance_template" "isolated_client" { disk_type = "pd-ssd" } - disk { - auto_delete = true - boot = false - type = "PERSISTENT" - disk_size_gb = var.client_cluster_cache_disk_size_gb - disk_type = var.client_cluster_cache_disk_type + dynamic "disk" { + for_each = [for n in range(var.client_cluster_cache_disk_count) : {}] + + content { + auto_delete = true + boot = false + disk_size_gb = 375 + interface = "NVME" + disk_type = "local-ssd" + type = "SCRATCH" + } } network_interface { diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 489e99dd87..3e395a6ae4 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -94,18 +94,6 @@ variable "build_cluster_root_disk_size_gb" { default = 200 } -variable "build_cluster_cache_disk_size_gb" { - type = number - description = "The size of the cache disk for the build machines in GB" - default = 200 -} - -variable "build_cluster_cache_disk_type" { - description = "The GCE cache disk type for the build machines." 
- type = string - default = "pd-ssd" -} - variable "clickhouse_cluster_size" { type = number } @@ -281,18 +269,6 @@ variable "allow_sandbox_internet" { default = true } -variable "client_cluster_cache_disk_size_gb" { - type = number - description = "The size of the cache disk for the orchestrator machines in GB" - default = 500 -} - -variable "client_cluster_cache_disk_type" { - description = "The GCE cache disk type for the client machines." - type = string - default = "pd-ssd" -} - variable "orchestrator_node_pool" { type = string default = "default" @@ -494,17 +470,17 @@ variable "remote_repository_enabled" { variable "client_node_type" { type = string - default = "" + default = "n1-node-96-624" } variable "isolated_client_cluster_size" { type = number - default = 1 + default = 0 } variable "isolated_client_cluster_size_max" { type = number - default = 1 + default = 0 } variable "build_cluster_cache_disk_count" { From 137f238570ef6a0fe96a7c174e71fe3d34b383ba Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Wed, 5 Nov 2025 17:54:13 -0800 Subject: [PATCH 5/6] rename, clean up a node group cannot have a zero size --- iac/provider-gcp/main.tf | 5 ++--- .../nomad-cluster/nodepool-client-isolated.tf | 18 +++++++++--------- iac/provider-gcp/nomad-cluster/variables.tf | 6 +----- iac/provider-gcp/variables.tf | 7 +------ 4 files changed, 13 insertions(+), 23 deletions(-) diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index 91a2ff4fd8..111b508ca5 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -94,9 +94,8 @@ module "cluster" { client_cluster_size_max = var.client_cluster_size_max build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb - client_node_type = var.client_node_type - isolated_client_cluster_size = var.isolated_client_cluster_size - isolated_client_cluster_size_max = var.isolated_client_cluster_size_max + client_node_type = var.client_node_type + isolated_client_cluster_target_size = 
var.isolated_client_cluster_target_size api_cluster_size = var.api_cluster_size build_cluster_size = var.build_cluster_size diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf index 154fe825d8..99a75bcd3a 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf @@ -1,10 +1,10 @@ locals { - use_isolated_nodes = var.client_node_type != "" + use_isolated_nodes = var.isolated_client_cluster_target_size > 0 isolated_client_pool_name = "${var.prefix}${var.client_cluster_name}-isolated" } -resource "google_compute_node_template" "client" { +resource "google_compute_node_template" "isolated-client" { count = local.use_isolated_nodes ? 1 : 0 name = "${local.isolated_client_pool_name}-node-template" region = var.gcp_region @@ -12,17 +12,17 @@ resource "google_compute_node_template" "client" { description = "Sole tenant node template for orchestrators" } -resource "google_compute_node_group" "client" { +resource "google_compute_node_group" "isolated-client" { count = local.use_isolated_nodes ? 1 : 0 name = "${local.isolated_client_pool_name}-node-group" zone = var.gcp_zone description = "Sole tenant node group for orchestrators" initial_size = 1 - node_template = google_compute_node_template.client[0].id + node_template = google_compute_node_template.isolated-client[0].id } -resource "google_compute_instance_template" "isolated_client" { +resource "google_compute_instance_template" "isolated-client" { count = local.use_isolated_nodes ? 1 : 0 name_prefix = "${local.isolated_client_pool_name}-" @@ -103,18 +103,18 @@ resource "google_compute_instance_template" "isolated_client" { ] } -resource "google_compute_region_instance_group_manager" "isolated_client_pool" { +resource "google_compute_region_instance_group_manager" "isolated-client-pool" { count = local.use_isolated_nodes ? 
1 : 0 name = "${local.isolated_client_pool_name}-rig" region = var.gcp_region distribution_policy_zones = [var.gcp_zone] - target_size = var.isolated_client_cluster_size < var.isolated_client_cluster_size_max ? null : var.isolated_client_cluster_size + target_size = var.isolated_client_cluster_target_size version { - name = google_compute_instance_template.isolated_client[0].id - instance_template = google_compute_instance_template.isolated_client[0].id + name = google_compute_instance_template.isolated-client[0].id + instance_template = google_compute_instance_template.isolated-client[0].id } auto_healing_policies { diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index 55ddd3c0f7..56bd81da3b 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -320,11 +320,7 @@ variable "client_node_type" { type = string } -variable "isolated_client_cluster_size" { - type = number -} - -variable "isolated_client_cluster_size_max" { +variable "isolated_client_cluster_target_size" { type = number } diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 3e395a6ae4..48d9294191 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -473,12 +473,7 @@ variable "client_node_type" { default = "n1-node-96-624" } -variable "isolated_client_cluster_size" { - type = number - default = 0 -} - -variable "isolated_client_cluster_size_max" { +variable "isolated_client_cluster_target_size" { type = number default = 0 } From a45dda847a77e0bb4d01c2972b10d7e132c3e858 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Wed, 5 Nov 2025 18:37:40 -0800 Subject: [PATCH 6/6] support disk removal, as they seem to be in short supply --- .github/workflows/pr-tests.yml | 3 +++ iac/provider-gcp/main.tf | 1 + .../nomad-cluster/nodepool-client-isolated.tf | 21 +++++++++++++++++-- iac/provider-gcp/nomad-cluster/variables.tf | 4 ++++ 
iac/provider-gcp/variables.tf | 5 +++++ 5 files changed, 32 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 3ec0341c4c..d3f3973974 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -2,6 +2,9 @@ name: Run tests on PRs on: [workflow_call] +permissions: + contents: read + jobs: run-tests: name: Run tests for ${{ matrix.package }} diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index 111b508ca5..cde6adc429 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -96,6 +96,7 @@ module "cluster" { client_node_type = var.client_node_type isolated_client_cluster_target_size = var.isolated_client_cluster_target_size + isolated_client_cluster_disk_count = var.isolated_client_cluster_disk_count api_cluster_size = var.api_cluster_size build_cluster_size = var.build_cluster_size diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf index 99a75bcd3a..ae08726dd2 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf @@ -10,6 +10,16 @@ resource "google_compute_node_template" "isolated-client" { region = var.gcp_region node_type = var.client_node_type description = "Sole tenant node template for orchestrators" + + dynamic "disks" { + for_each = var.isolated_client_cluster_disk_count == 0 ? 
[] : [{}] + + content { + disk_count = var.isolated_client_cluster_disk_count + disk_size_gb = 375 + disk_type = "local-ssd" + } + } } resource "google_compute_node_group" "isolated-client" { @@ -29,7 +39,6 @@ resource "google_compute_instance_template" "isolated-client" { instance_description = null machine_type = var.client_machine_type - min_cpu_platform = var.min_cpu_platform labels = merge( var.labels, @@ -46,6 +55,14 @@ resource "google_compute_instance_template" "isolated-client" { scheduling { on_host_maintenance = "MIGRATE" + + node_affinities { + key = "compute.googleapis.com/node-group-name" + operator = "IN" + values = [ + google_compute_node_group.isolated-client[0].name + ] + } } disk { @@ -56,7 +73,7 @@ resource "google_compute_instance_template" "isolated-client" { } dynamic "disk" { - for_each = [for n in range(var.client_cluster_cache_disk_count) : {}] + for_each = [for n in range(var.isolated_client_cluster_disk_count) : {}] content { auto_delete = true diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index 56bd81da3b..215fbb7bd3 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -324,6 +324,10 @@ variable "isolated_client_cluster_target_size" { type = number } +variable "isolated_client_cluster_disk_count" { + type = number +} + variable "build_cluster_cache_disk_count" { type = number diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 48d9294191..f2ac22f602 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -478,6 +478,11 @@ variable "isolated_client_cluster_target_size" { default = 0 } +variable "isolated_client_cluster_disk_count" { + type = number + default = 0 +} + variable "build_cluster_cache_disk_count" { type = number description = "The number of 375 GB NVME disks to raid together for storing build files."