# Patch overview:
#  - .github/workflows/pr-tests.yml: sets workflow permissions to "contents: read" and adds a validate-iac job that runs "terraform init -backend=false" and "terraform validate" in ./iac/provider-gcp.
#  - iac/provider-gcp/main.tf: changes the google provider constraint from "6.49.3" to "~> 6" and passes three new variables to module "cluster".
#  - iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl: adds one h1 hash for the google provider.
#  - iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf (new file): node template, node group, instance template and regional instance group manager, each created only when isolated_client_cluster_target_size > 0.
#  - variables.tf (module and root): declare client_node_type, isolated_client_cluster_target_size and isolated_client_cluster_disk_count; the root module drops four cache-disk size/type variables.
diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 3bf04ed6d7..d3f3973974 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -2,6 +2,9 @@ name: Run tests on PRs on: [workflow_call] +permissions: + contents: read + jobs: run-tests: name: Run tests for ${{ matrix.package }} @@ -46,3 +49,24 @@ jobs: - name: Run tests working-directory: ${{ matrix.package }} run: go test -v ${{ matrix.test_path }} + + validate-iac: + name: Validate terraform + runs-on: ubuntu-24.04 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Parse .tool-versions + uses: wistia/parse-tool-versions@v2.1.1 + + - name: Setup Terraform + uses: hashicorp/setup-terraform@v3 + with: + terraform_version: "${{ env.TERRAFORM }}" + + - name: Validate terraform + working-directory: ./iac/provider-gcp + run: | + terraform init -backend=false + terraform validate diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index db461a23b6..cde6adc429 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -13,7 +13,7 @@ terraform { google = { source = "hashicorp/google" - version = "6.49.3" + version = "~> 6" } cloudflare = { @@ -94,6 +94,10 @@ module "cluster" { client_cluster_size_max = var.client_cluster_size_max build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb + client_node_type = var.client_node_type + isolated_client_cluster_target_size = var.isolated_client_cluster_target_size + isolated_client_cluster_disk_count = var.isolated_client_cluster_disk_count + api_cluster_size = var.api_cluster_size build_cluster_size = var.build_cluster_size clickhouse_cluster_size = var.clickhouse_cluster_size diff --git a/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl b/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl index 5ba92be9b1..a8adc24083 100644 --- a/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl +++ 
b/iac/provider-gcp/nomad-cluster-disk-image/.terraform.lock.hcl @@ -6,6 +6,7 @@ provider "registry.terraform.io/hashicorp/google" { constraints = "6.49.3" hashes = [ "h1:cU8PRPAD6+W3mCWROYqECv+JwmpDiXFDM8VDQmrsyC0=", + "h1:kOOq1McNjfbrTzNoKKT2sZEwNuzAZuAs6dJJ5O9lEfU=", "zh:0ecb3cda6763b671f74dc5a5f1a60a2a2134181bf8869bcafd2608912d0d2940", "zh:1d1b322559f2929baff32ef66c89ba82b7d7808956e05d00235a1efbc8cd5c86", "zh:4561e0b401f1760c954e8a59f0da4e8f9ee07b45bd4144ec5524716373f6f01f", diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf new file mode 100644 index 0000000000..ae08726dd2 --- /dev/null +++ b/iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf @@ -0,0 +1,159 @@ +locals { + use_isolated_nodes = var.isolated_client_cluster_target_size > 0 + + isolated_client_pool_name = "${var.prefix}${var.client_cluster_name}-isolated" +} + +resource "google_compute_node_template" "isolated-client" { + count = local.use_isolated_nodes ? 1 : 0 + name = "${local.isolated_client_pool_name}-node-template" + region = var.gcp_region + node_type = var.client_node_type + description = "Sole tenant node template for orchestrators" + + dynamic "disks" { + for_each = var.isolated_client_cluster_disk_count == 0 ? [] : [{}] + + content { + disk_count = var.isolated_client_cluster_disk_count + disk_size_gb = 375 + disk_type = "local-ssd" + } + } +} + +resource "google_compute_node_group" "isolated-client" { + count = local.use_isolated_nodes ? 1 : 0 + name = "${local.isolated_client_pool_name}-node-group" + zone = var.gcp_zone + description = "Sole tenant node group for orchestrators" + + initial_size = 1 + node_template = google_compute_node_template.isolated-client[0].id +} + +resource "google_compute_instance_template" "isolated-client" { + count = local.use_isolated_nodes ? 
1 : 0 + + name_prefix = "${local.isolated_client_pool_name}-" + + instance_description = null + machine_type = var.client_machine_type + + labels = merge( + var.labels, + (var.environment != "dev" ? { + goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}" + } : {}) + ) + tags = [var.cluster_tag_name] + metadata_startup_script = local.client_startup_script + metadata = { + enable-osconfig = "TRUE", + enable-guest-attributes = "TRUE", + } + + scheduling { + on_host_maintenance = "MIGRATE" + + node_affinities { + key = "compute.googleapis.com/node-group-name" + operator = "IN" + values = [ + google_compute_node_group.isolated-client[0].name + ] + } + } + + disk { + boot = true + source_image = data.google_compute_image.client_source_image.id + disk_size_gb = 300 + disk_type = "pd-ssd" + } + + dynamic "disk" { + for_each = [for n in range(var.isolated_client_cluster_disk_count) : {}] + + content { + auto_delete = true + boot = false + disk_size_gb = 375 + interface = "NVME" + disk_type = "local-ssd" + type = "SCRATCH" + } + } + + network_interface { + network = var.network_name + + dynamic "access_config" { + for_each = ["public_ip"] + content {} + } + } + + # For a full list of OAuth 2.0 Scopes, see https://developers.google.com/identity/protocols/googlescopes + service_account { + email = var.google_service_account_email + scopes = [ + "userinfo-email", + "compute-ro", + "https://www.googleapis.com/auth/logging.write", + "https://www.googleapis.com/auth/monitoring.write", + "https://www.googleapis.com/auth/trace.append", + "https://www.googleapis.com/auth/cloud-platform" + ] + } + + # Per Terraform Docs (https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#using-with-instance-group-manager), + # we need to create a new instance template before we can destroy the old one. Note that any Terraform resource on + # which this Terraform resource depends will also need this lifecycle statement. 
+ lifecycle { + create_before_destroy = true + } + + depends_on = [ + google_storage_bucket_object.setup_config_objects["scripts/run-nomad.sh"], + google_storage_bucket_object.setup_config_objects["scripts/run-consul.sh"] + ] +} + +resource "google_compute_region_instance_group_manager" "isolated-client-pool" { + count = local.use_isolated_nodes ? 1 : 0 + + name = "${local.isolated_client_pool_name}-rig" + region = var.gcp_region + distribution_policy_zones = [var.gcp_zone] + + target_size = var.isolated_client_cluster_target_size + + version { + name = google_compute_instance_template.isolated-client[0].id + instance_template = google_compute_instance_template.isolated-client[0].id + } + + auto_healing_policies { + health_check = google_compute_health_check.client_nomad_check.id + initial_delay_sec = 600 + } + + distribution_policy_target_shape = "EVEN" + + # Rollout policy for new instance templates on the isolated client pool: the update type is PROACTIVE in dev and + # OPPORTUNISTIC in every other environment, and instances are replaced (REPLACE / SUBSTITUTE), not updated in place. + update_policy { + type = var.environment == "dev" ? 
"PROACTIVE" : "OPPORTUNISTIC" + minimal_action = "REPLACE" + max_surge_fixed = 10 + max_surge_percent = null + max_unavailable_fixed = 5 + max_unavailable_percent = null + replacement_method = "SUBSTITUTE" + instance_redistribution_type = "NONE" + } + + base_instance_name = local.isolated_client_pool_name + target_pools = [] +} diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index 687bb3e209..215fbb7bd3 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -316,6 +316,18 @@ variable "api_nat_min_ports_per_vm" { type = number } +variable "client_node_type" { + type = string +} + +variable "isolated_client_cluster_target_size" { + type = number +} + +variable "isolated_client_cluster_disk_count" { + type = number +} + variable "build_cluster_cache_disk_count" { type = number diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 13f9abf8ab..f2ac22f602 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -94,18 +94,6 @@ variable "build_cluster_root_disk_size_gb" { default = 200 } -variable "build_cluster_cache_disk_size_gb" { - type = number - description = "The size of the cache disk for the build machines in GB" - default = 200 -} - -variable "build_cluster_cache_disk_type" { - description = "The GCE cache disk type for the build machines." - type = string - default = "pd-ssd" -} - variable "clickhouse_cluster_size" { type = number } @@ -281,18 +269,6 @@ variable "allow_sandbox_internet" { default = true } -variable "client_cluster_cache_disk_size_gb" { - type = number - description = "The size of the cache disk for the orchestrator machines in GB" - default = 500 -} - -variable "client_cluster_cache_disk_type" { - description = "The GCE cache disk type for the client machines." 
- type = string - default = "pd-ssd" -} - variable "orchestrator_node_pool" { type = string default = "default" @@ -492,6 +468,21 @@ variable "remote_repository_enabled" { default = false } +variable "client_node_type" { + type = string + default = "n1-node-96-624" +} + +variable "isolated_client_cluster_target_size" { + type = number + default = 0 +} + +variable "isolated_client_cluster_disk_count" { + type = number + default = 0 +} + variable "build_cluster_cache_disk_count" { type = number description = "The number of 375 GB NVME disks to raid together for storing build files."