Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/workflows/pr-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ name: Run tests on PRs

on: [workflow_call]

permissions:
contents: read

jobs:
run-tests:
name: Run tests for ${{ matrix.package }}
Expand Down Expand Up @@ -46,3 +49,24 @@ jobs:
- name: Run tests
working-directory: ${{ matrix.package }}
run: go test -v ${{ matrix.test_path }}

# Job: statically validates the Terraform configuration under ./iac/provider-gcp.
validate-iac:
name: Validate terraform
runs-on: ubuntu-24.04
steps:
- name: Checkout repository
uses: actions/checkout@v4

# NOTE(review): this step is presumably what populates env.TERRAFORM, which the
# "Setup Terraform" step reads below — confirm the exact variable name the action exports.
- name: Parse .tool-versions
uses: wistia/parse-tool-versions@v2.1.1

# Installs the Terraform version taken from the TERRAFORM environment variable.
- name: Setup Terraform
uses: hashicorp/setup-terraform@v3
with:
terraform_version: "${{ env.TERRAFORM }}"

# `terraform init -backend=false` skips backend initialisation, so no state backend
# is configured before `terraform validate` runs.
- name: Validate terraform
working-directory: ./iac/provider-gcp
run: |
terraform init -backend=false
terraform validate
Comment thread Fixed
6 changes: 5 additions & 1 deletion iac/provider-gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ terraform {

google = {
source = "hashicorp/google"
version = "6.49.3"
version = "~> 6"
}

cloudflare = {
Expand Down Expand Up @@ -94,6 +94,10 @@ module "cluster" {
client_cluster_size_max = var.client_cluster_size_max
build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb

client_node_type = var.client_node_type
isolated_client_cluster_target_size = var.isolated_client_cluster_target_size
isolated_client_cluster_disk_count = var.isolated_client_cluster_disk_count

api_cluster_size = var.api_cluster_size
build_cluster_size = var.build_cluster_size
clickhouse_cluster_size = var.clickhouse_cluster_size
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

159 changes: 159 additions & 0 deletions iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Derived values shared by every resource of the isolated client pool.
locals {
  # Common name stem: <prefix><client cluster name>-isolated.
  isolated_client_pool_name = "${var.prefix}${var.client_cluster_name}-isolated"

  # The pool's resources are only created when a positive target size is requested.
  use_isolated_nodes = var.isolated_client_cluster_target_size > 0
}

# Sole-tenant node template for the isolated client pool.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_node_template" "isolated-client" {
  count = local.use_isolated_nodes ? 1 : 0

  name        = "${local.isolated_client_pool_name}-node-template"
  description = "Sole tenant node template for orchestrators"
  region      = var.gcp_region
  node_type   = var.client_node_type

  # Emit exactly one `disks` block unless the requested disk count is zero.
  dynamic "disks" {
    for_each = var.isolated_client_cluster_disk_count != 0 ? [{}] : []

    content {
      disk_type    = "local-ssd"
      disk_size_gb = 375
      disk_count   = var.isolated_client_cluster_disk_count
    }
  }
}

# Sole-tenant node group built from the isolated-client node template.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_node_group" "isolated-client" {
  count = local.use_isolated_nodes ? 1 : 0

  name        = "${local.isolated_client_pool_name}-node-group"
  description = "Sole tenant node group for orchestrators"
  zone        = var.gcp_zone

  node_template = google_compute_node_template.isolated-client[0].id

  # NOTE(review): the group is fixed at one node regardless of
  # var.isolated_client_cluster_target_size — confirm a single node has enough
  # capacity for the requested number of instances.
  initial_size = 1
}

# Instance template for the isolated client pool.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_instance_template" "isolated-client" {
count = local.use_isolated_nodes ? 1 : 0

name_prefix = "${local.isolated_client_pool_name}-"

instance_description = null
machine_type = var.client_machine_type

# Outside the "dev" environment a goog-ops-agent-policy label is merged into var.labels.
labels = merge(
var.labels,
(var.environment != "dev" ? {
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}"
} : {})
)
tags = [var.cluster_tag_name]
metadata_startup_script = local.client_startup_script
metadata = {
enable-osconfig = "TRUE",
enable-guest-attributes = "TRUE",
}

scheduling {
on_host_maintenance = "MIGRATE"

# Node affinity restricts placement to the isolated-client node group, matched by its name.
node_affinities {
key = "compute.googleapis.com/node-group-name"
operator = "IN"
values = [
google_compute_node_group.isolated-client[0].name
]
}
}

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Missing node_affinities for sole-tenant scheduling

The scheduling block for sole tenant instances is missing the required node_affinities configuration. Instances created from this template won't be scheduled on the sole tenant node group (google_compute_node_group.client), defeating the purpose of sole tenancy. The scheduling block should include node_affinities that reference the node group to ensure instances are placed on the dedicated sole tenant nodes.

Fix in Cursor Fix in Web


# Boot disk: 300 GB pd-ssd created from the client source image.
disk {
boot = true
source_image = data.google_compute_image.client_source_image.id
disk_size_gb = 300
disk_type = "pd-ssd"
}

# One 375 GB NVMe local-SSD scratch disk is generated per unit of
# var.isolated_client_cluster_disk_count (no block when the count is 0).
dynamic "disk" {
for_each = [for n in range(var.isolated_client_cluster_disk_count) : {}]

content {
auto_delete = true
boot = false
disk_size_gb = 375
interface = "NVME"
disk_type = "local-ssd"
type = "SCRATCH"
}
}

network_interface {
network = var.network_name

# The single-element list yields exactly one empty access_config block
# (presumably to give each instance an ephemeral external IP — confirm).
dynamic "access_config" {
for_each = ["public_ip"]
content {}
}
}

# For a full list of OAuth 2.0 Scopes, see https://developers.google.com/identity/protocols/googlescopes
service_account {
email = var.google_service_account_email
scopes = [
"userinfo-email",
"compute-ro",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/trace.append",
"https://www.googleapis.com/auth/cloud-platform"
]
}

# Per Terraform Docs (https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#using-with-instance-group-manager),
# we need to create a new instance template before we can destroy the old one. Note that any Terraform resource on
# which this Terraform resource depends will also need this lifecycle statement.
lifecycle {
create_before_destroy = true
}

# Explicit dependency on the uploaded run-nomad.sh and run-consul.sh objects
# (presumably fetched by the startup script — confirm).
depends_on = [
google_storage_bucket_object.setup_config_objects["scripts/run-nomad.sh"],
google_storage_bucket_object.setup_config_objects["scripts/run-consul.sh"]
]
}

# Regional managed instance group that runs the isolated client pool.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_region_instance_group_manager" "isolated-client-pool" {
  count = local.use_isolated_nodes ? 1 : 0

  name               = "${local.isolated_client_pool_name}-rig"
  base_instance_name = local.isolated_client_pool_name
  region             = var.gcp_region
  target_size        = var.isolated_client_cluster_target_size
  target_pools       = []

  # Instances are distributed over the single configured zone.
  distribution_policy_zones        = [var.gcp_zone]
  distribution_policy_target_shape = "EVEN"

  # The instance template id doubles as the version name.
  version {
    instance_template = google_compute_instance_template.isolated-client[0].id
    name              = google_compute_instance_template.isolated-client[0].id
  }

  # Auto-healing uses the client Nomad health check after a 600 second initial delay.
  auto_healing_policies {
    initial_delay_sec = 600
    health_check      = google_compute_health_check.client_nomad_check.id
  }

  # Rollout of a new instance template: PROACTIVE in the "dev" environment,
  # OPPORTUNISTIC everywhere else; instances are replaced (SUBSTITUTE), not restarted.
  update_policy {
    type                         = var.environment == "dev" ? "PROACTIVE" : "OPPORTUNISTIC"
    minimal_action               = "REPLACE"
    replacement_method           = "SUBSTITUTE"
    instance_redistribution_type = "NONE"
    max_surge_fixed              = 10
    max_surge_percent            = null
    max_unavailable_fixed        = 5
    max_unavailable_percent      = null
  }
}
12 changes: 12 additions & 0 deletions iac/provider-gcp/nomad-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,18 @@ variable "api_nat_min_ports_per_vm" {
type = number
}

# Consumed as `node_type` by the isolated-client sole-tenant node template.
variable "client_node_type" {
  description = "Sole-tenant node type used for the isolated client node template."
  type        = string
}

# Drives both the on/off switch (local.use_isolated_nodes) and the instance group target size.
variable "isolated_client_cluster_target_size" {
  description = "Number of instances in the isolated client pool. The pool's resources are only created when this is greater than 0."
  type        = number
}

# Used for the node template's `disks.disk_count` and for the number of scratch disks per instance.
variable "isolated_client_cluster_disk_count" {
  description = "Number of 375 GB local SSD disks for the isolated client pool (per node template and per instance). 0 attaches none."
  type        = number
}

variable "build_cluster_cache_disk_count" {
type = number

Expand Down
39 changes: 15 additions & 24 deletions iac/provider-gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -94,18 +94,6 @@ variable "build_cluster_root_disk_size_gb" {
default = 200
}

# Cache disk capacity for the build machines, expressed in GB (defaults to 200).
variable "build_cluster_cache_disk_size_gb" {
  description = "The size of the cache disk for the build machines in GB"
  type        = number
  default     = 200
}

# GCE disk type of the build machines' cache disk (defaults to pd-ssd).
variable "build_cluster_cache_disk_type" {
  type        = string
  default     = "pd-ssd"
  description = "The GCE cache disk type for the build machines."
}

# Forwarded to module "cluster" as clickhouse_cluster_size.
# No default is declared, so a value must be supplied by the caller.
variable "clickhouse_cluster_size" {
type = number
}
Expand Down Expand Up @@ -281,18 +269,6 @@ variable "allow_sandbox_internet" {
default = true
}

# Cache disk capacity for the orchestrator machines, expressed in GB (defaults to 500).
variable "client_cluster_cache_disk_size_gb" {
  description = "The size of the cache disk for the orchestrator machines in GB"
  type        = number
  default     = 500
}

# GCE disk type of the client machines' cache disk (defaults to pd-ssd).
variable "client_cluster_cache_disk_type" {
  type        = string
  default     = "pd-ssd"
  description = "The GCE cache disk type for the client machines."
}

variable "orchestrator_node_pool" {
type = string
default = "default"
Expand Down Expand Up @@ -492,6 +468,21 @@ variable "remote_repository_enabled" {
default = false
}

# Forwarded to module "cluster", where it becomes the node template's `node_type`.
variable "client_node_type" {
  description = "Sole-tenant node type used for the isolated client node template."
  type        = string
  default     = "n1-node-96-624"
}

# Forwarded to module "cluster"; the default of 0 leaves the isolated client pool disabled.
variable "isolated_client_cluster_target_size" {
  description = "Number of instances in the isolated client pool. The pool's resources are only created when this is greater than 0."
  type        = number
  default     = 0
}

# Forwarded to module "cluster"; the default of 0 attaches no local SSDs.
variable "isolated_client_cluster_disk_count" {
  description = "Number of 375 GB local SSD disks for the isolated client pool (per node template and per instance). 0 attaches none."
  type        = number
  default     = 0

  # A negative value must be rejected up front: range() counts downwards for a
  # negative limit, so the instance template would still receive scratch disks
  # while the node template is handed a negative disk_count.
  validation {
    condition     = var.isolated_client_cluster_disk_count >= 0
    error_message = "The isolated_client_cluster_disk_count value must be zero or a positive number."
  }
}

variable "build_cluster_cache_disk_count" {
type = number
description = "The number of 375 GB NVME disks to raid together for storing build files."
Expand Down
Loading