Skip to content

Commit 2c0f37f

Browse files
committed
feat(env_baseline): dynamic zone selection for GKE T4 GPU and ARM nodes
Restricted GKE Standard clusters to zones that offer nvidia-tesla-t4 accelerators, and restricted ARM node pools to the subset of those zones that also offer the t2a-standard-4 machine type. Replaced the hardcoded us-central1 zones with dynamic discovery using the google_compute_zones, google_compute_accelerator_types, and google_compute_machine_types data sources.
1 parent fe4e572 commit 2c0f37f

1 file changed

Lines changed: 47 additions & 3 deletions

File tree

  • 2-multitenant/modules/env_baseline

2-multitenant/modules/env_baseline/main.tf

Lines changed: 47 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@ locals {
1818
networks_re = "/networks/([^/]*)$"
1919
subnetworks_re = "/subnetworks/([^/]*)$"
2020
projects_re = "projects/([^/]*)/"
21-
regions_re = "regions/([^/]+)"
2221
cluster_project_id = data.google_project.eab_cluster_project.project_id
2322
available_cidr_ranges = var.master_ipv4_cidr_blocks
2423

@@ -30,12 +29,33 @@ locals {
3029

3130
cluster_sa = [for i in merge(module.gke-standard, module.gke-autopilot) : i.service_account][0]
3231

33-
arm_node_pool = { for k, v in local.subnets : k => (regex(local.regions_re, v)[0]) == "us-central1" ?
32+
# Map each local.subnets key to the zones in that subnet's region that support nvidia-tesla-t4
33+
gpu_t4_zones = {
34+
for r_idx, r in data.google_compute_zones.available : r_idx => [
35+
for z in r.names : z if contains([for a in data.google_compute_accelerator_types.t4[z].accelerator_types : a.name], "nvidia-tesla-t4")
36+
]
37+
}
38+
39+
# Map each local.subnets key to the zones in that subnet's region that support t2a-standard-4
40+
arm_zones = {
41+
for r_idx, r in data.google_compute_zones.available : r_idx => [
42+
for z in r.names : z if length(data.google_compute_machine_types.arm[z].machine_types) > 0
43+
]
44+
}
45+
46+
# ARM node pool locations must be a subset of cluster zones (which are restricted to T4 zones)
47+
arm_node_pool_zones = {
48+
for k, v in local.subnets : k => [
49+
for z in local.arm_zones[k] : z if contains(local.gpu_t4_zones[k], z)
50+
]
51+
}
52+
53+
arm_node_pool = { for k, v in local.subnets : k => length(local.arm_node_pool_zones[k]) > 0 ?
3454
[
3555
{
3656
name = "regional-arm64-pool"
3757
machine_type = "t2a-standard-4"
38-
node_locations = "us-central1-a,us-central1-b,us-central1-f"
58+
node_locations = join(",", local.arm_node_pool_zones[k])
3959
strategy = "SURGE"
4060
max_surge = 1
4161
max_unavailable = 0
@@ -169,6 +189,29 @@ data "google_compute_subnetwork" "default" {
169189
self_link = each.value
170190
}
171191

192+
data "google_compute_zones" "available" {
193+
for_each = local.subnets
194+
region = data.google_compute_subnetwork.default[each.key].region
195+
project = local.cluster_project_id
196+
}
197+
198+
locals {
199+
all_zones = distinct(flatten([for z in data.google_compute_zones.available : z.names]))
200+
}
201+
202+
data "google_compute_accelerator_types" "t4" {
203+
for_each = toset(local.all_zones)
204+
zone = each.value
205+
project = local.cluster_project_id
206+
}
207+
208+
data "google_compute_machine_types" "arm" {
209+
for_each = toset(local.all_zones)
210+
zone = each.value
211+
project = local.cluster_project_id
212+
filter = "name = \"t2a-standard-4\""
213+
}
214+
172215
resource "google_access_context_manager_access_level_condition" "access-level-conditions" {
173216
count = var.access_level_name != null ? 1 : 0
174217
access_level = var.access_level_name
@@ -256,6 +299,7 @@ module "gke-standard" {
256299
project_id = local.cluster_project_id
257300
regional = true
258301
region = data.google_compute_subnetwork.default[each.key].region
302+
zones = local.gpu_t4_zones[each.key]
259303
network_project_id = regex(local.projects_re, data.google_compute_subnetwork.default[each.key].id)[0]
260304
network = regex(local.networks_re, data.google_compute_subnetwork.default[each.key].network)[0]
261305
subnetwork = regex(local.subnetworks_re, local.subnets[each.key])[0]

0 commit comments

Comments
 (0)