Skip to content
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion iac/provider-gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ terraform {

google = {
source = "hashicorp/google"
version = "6.49.3"
version = "~> 6"
}

cloudflare = {
Expand Down Expand Up @@ -94,6 +94,10 @@ module "cluster" {
client_cluster_size_max = var.client_cluster_size_max
build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb

client_node_type = var.client_node_type
isolated_client_cluster_size = var.isolated_client_cluster_size
isolated_client_cluster_size_max = var.isolated_client_cluster_size_max

api_cluster_size = var.api_cluster_size
build_cluster_size = var.build_cluster_size
clickhouse_cluster_size = var.clickhouse_cluster_size
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

137 changes: 137 additions & 0 deletions iac/provider-gcp/nomad-cluster/nodepool-client-isolated.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# Naming and feature toggle for the optional isolated (sole-tenant) client pool.
locals {
  # Base name shared by every resource that belongs to the isolated client pool.
  isolated_client_pool_name = "${var.prefix}${var.client_cluster_name}-isolated"

  # The isolated pool is provisioned only when a node type has been supplied;
  # an empty string turns every resource gated on this flag into count = 0.
  use_isolated_nodes = var.client_node_type != ""
}

# Sole-tenant node template for the isolated client pool.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_node_template" "client" {
  count = local.use_isolated_nodes ? 1 : 0

  name        = "${local.isolated_client_pool_name}-node-template"
  description = "Sole tenant node template for orchestrators"

  region    = var.gcp_region
  node_type = var.client_node_type
}

# Sole-tenant node group built from google_compute_node_template.client.
# Created only when local.use_isolated_nodes is true.
resource "google_compute_node_group" "client" {
  count = local.use_isolated_nodes ? 1 : 0

  name        = "${local.isolated_client_pool_name}-node-group"
  description = "Sole tenant node group for orchestrators"
  zone        = var.gcp_zone

  # Index 0 is safe: the template and this group share the same count condition.
  node_template = google_compute_node_template.client[0].id
  initial_size  = 1
}

resource "google_compute_instance_template" "isolated_client" {
count = local.use_isolated_nodes ? 1 : 0

name_prefix = "${local.isolated_client_pool_name}-"

instance_description = null
machine_type = var.client_machine_type
min_cpu_platform = var.min_cpu_platform

labels = merge(
var.labels,
(var.environment != "dev" ? {
goog-ops-agent-policy = "v2-x86-template-1-2-0-${var.gcp_zone}"
} : {})
)
tags = [var.cluster_tag_name]
metadata_startup_script = local.client_startup_script
metadata = {
enable-osconfig = "TRUE",
enable-guest-attributes = "TRUE",
}

scheduling {
  on_host_maintenance = "MIGRATE"

  # Pin instances created from this template to the dedicated sole-tenant
  # node group. Without a node affinity the VMs are scheduled on regular
  # multi-tenant hosts and the node group stays unused.
  # Index 0 is safe: the node group and this template share the same count
  # condition (local.use_isolated_nodes).
  # NOTE(review): this reference makes the template depend on the node group;
  # confirm that interacts acceptably with create_before_destroy on this
  # template.
  node_affinities {
    key      = "compute.googleapis.com/node-group-name"
    operator = "IN"
    values   = [google_compute_node_group.client[0].name]
  }
}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: Missing node_affinities for sole-tenant scheduling

The scheduling block for sole tenant instances is missing the required node_affinities configuration. Instances created from this template won't be scheduled on the sole tenant node group (google_compute_node_group.client), defeating the purpose of sole tenancy. The scheduling block should include node_affinities that reference the node group to ensure instances are placed on the dedicated sole tenant nodes.

Fix in Cursor Fix in Web


disk {
boot = true
source_image = data.google_compute_image.client_source_image.id
disk_size_gb = 300
disk_type = "pd-ssd"
}

disk {
auto_delete = true
boot = false
type = "PERSISTENT"
disk_size_gb = var.client_cluster_cache_disk_size_gb
disk_type = var.client_cluster_cache_disk_type
}

network_interface {
  network = var.network_name

  # A single access_config block with no arguments, as the original
  # one-element dynamic block produced.
  access_config {}
}

# For a full list of oAuth 2.0 Scopes, see https://developers.google.com/identity/protocols/googlescopes
service_account {
email = var.google_service_account_email
scopes = [
"userinfo-email",
"compute-ro",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/trace.append",
"https://www.googleapis.com/auth/cloud-platform"
]
}

# Per Terraform Docs (https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#using-with-instance-group-manager),
# we need to create a new instance template before we can destroy the old one. Note that any Terraform resource on
# which this Terraform resource depends will also need this lifecycle statement.
lifecycle {
create_before_destroy = true
}

depends_on = [
google_storage_bucket_object.setup_config_objects["scripts/run-nomad.sh"],
google_storage_bucket_object.setup_config_objects["scripts/run-consul.sh"]
]
}

resource "google_compute_region_instance_group_manager" "isolated_client_pool" {
count = local.use_isolated_nodes ? 1 : 0

name = "${local.isolated_client_pool_name}-rig"
region = var.gcp_region
distribution_policy_zones = [var.gcp_zone]

# No autoscaler is defined for this instance group manager in this file, so
# the size must always be set explicitly. Leaving it null when
# isolated_client_cluster_size < isolated_client_cluster_size_max would leave
# the group without an explicit size. If an autoscaler is attached later,
# target_size must be unset again.
target_size = var.isolated_client_cluster_size
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Bug: target_size becomes null although no autoscaler is attached

The target_size logic for the isolated_client_pool instance group manager is misconfigured. When isolated_client_cluster_size is less than isolated_client_cluster_size_max, target_size becomes null. This implies autoscaling, but no autoscaling policies are defined, which can lead to the instance group manager failing or behaving unexpectedly.

Fix in Cursor Fix in Web


version {
name = google_compute_instance_template.isolated_client[0].id
instance_template = google_compute_instance_template.isolated_client[0].id
}

auto_healing_policies {
health_check = google_compute_health_check.client_nomad_check.id
initial_delay_sec = 600
}

distribution_policy_target_shape = "EVEN"

# Rollout policy applied when the instance template of this group changes.
# NOTE(review): the previous comment here described the server cluster; this
# manager is the isolated client pool — confirm the intended rollout behaviour.
update_policy {
# PROACTIVE in the "dev" environment, OPPORTUNISTIC in every other environment.
type = var.environment == "dev" ? "PROACTIVE" : "OPPORTUNISTIC"
minimal_action = "REPLACE"
# Surge and unavailability are given as fixed counts; the percentage variants
# are explicitly unset.
max_surge_fixed = 10
max_surge_percent = null
max_unavailable_fixed = 5
max_unavailable_percent = null
replacement_method = "SUBSTITUTE"
instance_redistribution_type = "NONE"
}

base_instance_name = local.isolated_client_pool_name
target_pools = []
}
12 changes: 12 additions & 0 deletions iac/provider-gcp/nomad-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -316,6 +316,18 @@ variable "api_nat_min_ports_per_vm" {
type = number
}

variable "client_node_type" {
  description = "Sole-tenant node type used for the isolated client node template. An empty string disables the isolated client pool."
  type        = string
}

variable "isolated_client_cluster_size" {
  description = "Desired number of instances in the isolated client instance group."
  type        = number
}

variable "isolated_client_cluster_size_max" {
  description = "Maximum number of instances in the isolated client instance group."
  type        = number
}

variable "build_cluster_cache_disk_count" {
type = number

Expand Down
15 changes: 15 additions & 0 deletions iac/provider-gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,21 @@ variable "remote_repository_enabled" {
default = false
}

variable "client_node_type" {
  type        = string
  description = "Sole-tenant node type for the isolated client pool. Leave empty (the default) to disable the isolated client pool."
  default     = ""
}

variable "isolated_client_cluster_size" {
  type        = number
  description = "Desired number of instances in the isolated client instance group."
  default     = 1
}

variable "isolated_client_cluster_size_max" {
  type        = number
  description = "Maximum number of instances in the isolated client instance group."
  default     = 1
}

variable "build_cluster_cache_disk_count" {
type = number
description = "The number of 375 GB NVME disks to raid together for storing build files."
Expand Down
Loading