From d24357a1e43e7eb4f03aa87159fee73bca83ee90 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Mon, 27 Oct 2025 16:10:48 -0700 Subject: [PATCH 1/3] Support local ephemeral nvme disks --- .../nomad-cluster/nodepool-build.tf | 1 + .../nomad-cluster/nodepool-client.tf | 22 +++++++--- .../nomad-cluster/scripts/start-client.sh | 42 ++++++++++++++++++- 3 files changed, 57 insertions(+), 8 deletions(-) diff --git a/iac/provider-gcp/nomad-cluster/nodepool-build.tf b/iac/provider-gcp/nomad-cluster/nodepool-build.tf index aea195d7f0..a058cccfec 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-build.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-build.tf @@ -22,6 +22,7 @@ locals { USE_FILESTORE_CACHE = var.filestore_cache_enabled NODE_POOL = var.build_node_pool BASE_HUGEPAGES_PERCENTAGE = var.build_base_hugepages_percentage + LOCAL_CACHE_DISK_COUNT = local.cache_disk_count }) } diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client.tf b/iac/provider-gcp/nomad-cluster/nodepool-client.tf index c17188b1e8..25fa9239d8 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-client.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-client.tf @@ -1,4 +1,7 @@ locals { + cache_disk_count = 3 # todo: make configurable + cache_disk_size = 375 # todo: make configurable + client_pool_name = "${var.prefix}${var.client_cluster_name}" client_startup_script = templatefile("${path.module}/scripts/start-client.sh", { CLUSTER_TAG_NAME = var.cluster_tag_name @@ -22,6 +25,7 @@ locals { USE_FILESTORE_CACHE = var.filestore_cache_enabled NODE_POOL = var.orchestrator_node_pool BASE_HUGEPAGES_PERCENTAGE = var.orchestrator_base_hugepages_percentage + LOCAL_CACHE_DISK_COUNT = local.cache_disk_count }) } @@ -131,18 +135,24 @@ resource "google_compute_instance_template" "client" { } disk { + auto_delete = true boot = true source_image = data.google_compute_image.client_source_image.id disk_size_gb = 300 disk_type = "pd-ssd" } - disk { - auto_delete = true - boot = false - type = "PERSISTENT" - disk_size_gb = var.client_cluster_cache_disk_size_gb - disk_type = var.client_cluster_cache_disk_type + dynamic "disk" { + for_each = [for n in range(local.cache_disk_count) : {}] + + content { + auto_delete = true + boot = false + disk_size_gb = local.cache_disk_size + interface = "NVME" + disk_type = "local-ssd" + type = "SCRATCH" + } } network_interface { diff --git a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh index 495c0cd929..2bd7df9bfc 100755 --- a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh +++ b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh @@ -16,8 +16,45 @@ set -x exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1 # Add cache disk for orchestrator and swapfile -# TODO: Parametrize this +%{ if LOCAL_CACHE_DISK_COUNT > 0 } +for i in {0..${ LOCAL_CACHE_DISK_COUNT - 1 }}; do + dev_path="/dev/disk/by-id/google-local-nvme-ssd-$i" + part_path="$dev_path-part1" + echo "partitioning drive #$i" + sudo parted --script $dev_path \ + mklabel gpt \ + mkpart primary 0% 100% \ + set 1 raid on + +done + +DISK="/dev/md0" + +echo "creating the array" +for i in {1..10}; do + if sudo mdadm --create --verbose \ + $DISK \ + --raid-devices=${ LOCAL_CACHE_DISK_COUNT } \ + %{ for i in range(LOCAL_CACHE_DISK_COUNT) ~}/dev/disk/by-id/google-local-nvme-ssd-${ i }-part1 %{ endfor }\ + --level=0; then + break + fi + + echo "failed to create array, waiting ... " + sleep 1 +done + +if [ ! -b "$DISK" ]; then + echo "failed to create raid array" + exit 99 +fi + +echo "persisting array configuration" +sudo mdadm --detail --scan --verbose | sudo tee -a /etc/mdadm/mdadm.conf +%{ else } DISK="/dev/disk/by-id/google-persistent-disk-1" +%{ endif } + MOUNT_POINT="/orchestrator" # Step 1: Format the disk with XFS and 65K block size @@ -27,7 +64,8 @@ sudo mkfs.xfs -f -b size=4096 $DISK sudo mkdir -p $MOUNT_POINT # Step 3: Mount the disk with -sudo mount -o noatime $DISK $MOUNT_POINT +echo "$DISK $MOUNT_POINT xfs noatime 0 0" | sudo tee /etc/fstab +sudo mount "$MOUNT_POINT" sudo mkdir -p /orchestrator/sandbox sudo mkdir -p /orchestrator/template From e137aaded3230050ffd4f784e6594e73cb9865d7 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Mon, 27 Oct 2025 16:23:31 -0700 Subject: [PATCH 2/3] append, don't overwrite --- iac/provider-gcp/nomad-cluster/scripts/start-client.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh index 2bd7df9bfc..ae4568d6c7 100755 --- a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh +++ b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh @@ -64,7 +64,7 @@ sudo mkfs.xfs -f -b size=4096 $DISK sudo mkdir -p $MOUNT_POINT # Step 3: Mount the disk with -echo "$DISK $MOUNT_POINT xfs noatime 0 0" | sudo tee /etc/fstab +echo "$DISK $MOUNT_POINT xfs noatime 0 0" | sudo tee -a /etc/fstab sudo mount "$MOUNT_POINT" sudo mkdir -p /orchestrator/sandbox From 4e008cd711f877203c5acd2326220495b0dbd6d9 Mon Sep 17 00:00:00 2001 From: Joe Lombrozo Date: Tue, 28 Oct 2025 11:22:39 -0700 Subject: [PATCH 3/3] remove support for persistent disks --- iac/provider-gcp/main.tf | 11 ++- .../nomad-cluster/nodepool-build.tf | 19 ++-- .../nomad-cluster/nodepool-client.tf | 9 +- .../nomad-cluster/scripts/start-client.sh | 88 +++++++++---------- iac/provider-gcp/nomad-cluster/variables.tf | 34 +++---- iac/provider-gcp/variables.tf | 12 +++ 6 files changed, 90 insertions(+), 83 deletions(-) diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf index ab2f6a8e81..db461a23b6 100644 --- a/iac/provider-gcp/main.tf +++ b/iac/provider-gcp/main.tf @@ -91,12 +91,8 @@ module "cluster" { gcp_zone = var.gcp_zone google_service_account_key = module.init.google_service_account_key - client_cluster_size_max = var.client_cluster_size_max - client_cluster_cache_disk_size_gb = var.client_cluster_cache_disk_size_gb - client_cluster_cache_disk_type = var.client_cluster_cache_disk_type - build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb - build_cluster_cache_disk_size_gb = var.build_cluster_cache_disk_size_gb - build_cluster_cache_disk_type = var.build_cluster_cache_disk_type + client_cluster_size_max = var.client_cluster_size_max + build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb api_cluster_size = var.api_cluster_size build_cluster_size = var.build_cluster_size @@ -105,6 +101,9 @@ module "cluster" { server_cluster_size = var.server_cluster_size loki_cluster_size = var.loki_cluster_size + build_cluster_cache_disk_count = var.build_cluster_cache_disk_count + client_cluster_cache_disk_count = var.client_cluster_cache_disk_count + server_machine_type = var.server_machine_type client_machine_type = var.client_machine_type api_machine_type = var.api_machine_type diff --git a/iac/provider-gcp/nomad-cluster/nodepool-build.tf b/iac/provider-gcp/nomad-cluster/nodepool-build.tf index a058cccfec..ed2ed12348 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-build.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-build.tf @@ -22,7 +22,7 @@ locals { USE_FILESTORE_CACHE = var.filestore_cache_enabled NODE_POOL = var.build_node_pool BASE_HUGEPAGES_PERCENTAGE = var.build_base_hugepages_percentage - LOCAL_CACHE_DISK_COUNT = local.cache_disk_count + LOCAL_CACHE_DISK_COUNT = var.build_cluster_cache_disk_count }) } @@ -120,12 +120,17 @@ resource "google_compute_instance_template" "build" { disk_type = "pd-ssd" } - disk { - auto_delete = true - boot = false - type = "PERSISTENT" - disk_size_gb = var.build_cluster_cache_disk_size_gb - disk_type = var.build_cluster_cache_disk_type + dynamic "disk" { + for_each = [for n in range(var.build_cluster_cache_disk_count) : {}] + + content { + auto_delete = true + boot = false + disk_size_gb = 375 + interface = "NVME" + disk_type = "local-ssd" + type = "SCRATCH" + } } network_interface { diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client.tf b/iac/provider-gcp/nomad-cluster/nodepool-client.tf index 25fa9239d8..040c87187a 100644 --- a/iac/provider-gcp/nomad-cluster/nodepool-client.tf +++ b/iac/provider-gcp/nomad-cluster/nodepool-client.tf @@ -1,7 +1,4 @@ locals { - cache_disk_count = 3 # todo: make configurable - cache_disk_size = 375 # todo: make configurable - client_pool_name = "${var.prefix}${var.client_cluster_name}" client_startup_script = templatefile("${path.module}/scripts/start-client.sh", { CLUSTER_TAG_NAME = var.cluster_tag_name @@ -25,7 +22,7 @@ locals { USE_FILESTORE_CACHE = var.filestore_cache_enabled NODE_POOL = var.orchestrator_node_pool BASE_HUGEPAGES_PERCENTAGE = var.orchestrator_base_hugepages_percentage - LOCAL_CACHE_DISK_COUNT = local.cache_disk_count + LOCAL_CACHE_DISK_COUNT = var.client_cluster_cache_disk_count }) } @@ -143,12 +140,12 @@ resource "google_compute_instance_template" "client" { } dynamic "disk" { - for_each = [for n in range(local.cache_disk_count) : {}] + for_each = [for n in range(var.client_cluster_cache_disk_count) : {}] content { auto_delete = true boot = false - disk_size_gb = local.cache_disk_size + disk_size_gb = 375 interface = "NVME" disk_type = "local-ssd" type = "SCRATCH" diff --git a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh index ae4568d6c7..780cb14a67 100755 --- a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh +++ b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh @@ -16,93 +16,85 @@ set -x exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1 # Add cache disk for orchestrator and swapfile -%{ if LOCAL_CACHE_DISK_COUNT > 0 } for i in {0..${ LOCAL_CACHE_DISK_COUNT - 1 }}; do dev_path="/dev/disk/by-id/google-local-nvme-ssd-$i" - part_path="$dev_path-part1" echo "partitioning drive #$i" - sudo parted --script $dev_path \ + parted --script $dev_path \ mklabel gpt \ mkpart primary 0% 100% \ set 1 raid on - done +%{ if LOCAL_CACHE_DISK_COUNT > 1 } DISK="/dev/md0" echo "creating the array" -for i in {1..10}; do - if sudo mdadm --create --verbose \ - $DISK \ - --raid-devices=${ LOCAL_CACHE_DISK_COUNT } \ - %{ for i in range(LOCAL_CACHE_DISK_COUNT) ~}/dev/disk/by-id/google-local-nvme-ssd-${ i }-part1 %{ endfor }\ - --level=0; then - break - fi - - echo "failed to create array, waiting ... " - sleep 1 +until mdadm --create --verbose \ + $DISK \ + --raid-devices=${ LOCAL_CACHE_DISK_COUNT } \ + %{ for i in range(LOCAL_CACHE_DISK_COUNT) ~}/dev/disk/by-id/google-local-nvme-ssd-${ i }-part1 %{ endfor }\ + --level=0; do + echo "failed to create array, trying again ... " + sleep 1 done -if [ ! -b "$DISK" ]; then - echo "failed to create raid array" - exit 99 -fi - echo "persisting array configuration" -sudo mdadm --detail --scan --verbose | sudo tee -a /etc/mdadm/mdadm.conf +mdadm --detail --scan --verbose | tee -a /etc/mdadm/mdadm.conf %{ else } -DISK="/dev/disk/by-id/google-persistent-disk-1" +DISK="/dev/disk/by-id/google-local-nvme-ssd-0-part1" %{ endif } MOUNT_POINT="/orchestrator" # Step 1: Format the disk with XFS and 65K block size -sudo mkfs.xfs -f -b size=4096 $DISK +until mkfs.xfs -f -b size=4096 $DISK; do + echo "failed to make file system, trying again ... " + sleep 1 +done # Step 2: Create the mount point -sudo mkdir -p $MOUNT_POINT +mkdir -p $MOUNT_POINT # Step 3: Mount the disk with -echo "$DISK $MOUNT_POINT xfs noatime 0 0" | sudo tee -a /etc/fstab -sudo mount "$MOUNT_POINT" +echo "$DISK $MOUNT_POINT xfs noatime 0 0" | tee -a /etc/fstab +mount "$MOUNT_POINT" -sudo mkdir -p /orchestrator/sandbox -sudo mkdir -p /orchestrator/template -sudo mkdir -p /orchestrator/build +mkdir -p /orchestrator/sandbox +mkdir -p /orchestrator/template +mkdir -p /orchestrator/build # Add swapfile SWAPFILE="/swapfile" -sudo fallocate -l 100G $SWAPFILE -sudo chmod 600 $SWAPFILE -sudo mkswap $SWAPFILE -sudo swapon $SWAPFILE +fallocate -l 100G $SWAPFILE +chmod 600 $SWAPFILE +mkswap $SWAPFILE +swapon $SWAPFILE # Make swapfile persistent -echo "$SWAPFILE none swap sw 0 0" | sudo tee -a /etc/fstab +echo "$SWAPFILE none swap sw 0 0" | tee -a /etc/fstab # Set swap settings -sudo sysctl vm.swappiness=10 -sudo sysctl vm.vfs_cache_pressure=50 +sysctl vm.swappiness=10 +sysctl vm.vfs_cache_pressure=50 # TODO: Optimize the mount more according to https://cloud.google.com/filestore/docs/mounting-fileshares %{ if USE_FILESTORE_CACHE } # Mount NFS -sudo mkdir -p "${NFS_MOUNT_PATH}" -echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | sudo tee -a /etc/fstab -sudo mount "${NFS_MOUNT_PATH}" -sudo mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" +mkdir -p "${NFS_MOUNT_PATH}" +echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | tee -a /etc/fstab +mount "${NFS_MOUNT_PATH}" +mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" %{ endif } # Add tmpfs for snapshotting # TODO: Parametrize this -sudo mkdir -p /mnt/snapshot-cache -sudo mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache +mkdir -p /mnt/snapshot-cache +mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache ulimit -n 1048576 export GOMAXPROCS='nproc' -sudo tee -a /etc/sysctl.conf </etc/udev/rules.d/97-nbd-device.rules ACTION=="add|change", KERNEL=="nbd*", OPTIONS:="nowatch" EOH -sudo udevadm control --reload-rules -sudo udevadm trigger +udevadm control --reload-rules +udevadm trigger # Load the nbd module with 4096 devices -sudo modprobe nbd nbds_max=4096 +modprobe nbd nbds_max=4096 # Create the directory for the fc mounts mkdir -p /fc-vm @@ -201,7 +193,7 @@ systemctl restart systemd-resolved # The THP are by default set to madvise # We are allocating the hugepages at the start when the memory is not fragmented yet echo "[Setting up huge pages]" -sudo mkdir -p /mnt/hugepages +mkdir -p /mnt/hugepages mount -t hugetlbfs none /mnt/hugepages # Increase proactive compaction to reduce memory fragmentation for using overcomitted huge pages diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf index d799c676a0..687bb3e209 100644 --- a/iac/provider-gcp/nomad-cluster/variables.tf +++ b/iac/provider-gcp/nomad-cluster/variables.tf @@ -69,14 +69,6 @@ variable "build_cluster_root_disk_size_gb" { type = number } -variable "build_cluster_cache_disk_size_gb" { - type = number -} - -variable "build_cluster_cache_disk_type" { - type = string -} - variable "edge_api_port" { type = object({ name = string @@ -138,14 +130,6 @@ variable "client_machine_type" { type = string } -variable "client_cluster_cache_disk_size_gb" { - type = number -} - -variable "client_cluster_cache_disk_type" { - type = string -} - variable "gcp_project_id" { type = string } @@ -331,3 +315,21 @@ variable "api_nat_ips" { variable "api_nat_min_ports_per_vm" { type = number } + +variable "build_cluster_cache_disk_count" { + type = number + + validation { + condition = var.build_cluster_cache_disk_count > 0 + error_message = "Must include at least 1 build cluster cache disk" + } +} + +variable "client_cluster_cache_disk_count" { + type = number + + validation { + condition = var.client_cluster_cache_disk_count > 0 + error_message = "Must include at least 1 client cluster cache disk" + } +} diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf index 8cb041c5e1..13f9abf8ab 100644 --- a/iac/provider-gcp/variables.tf +++ b/iac/provider-gcp/variables.tf @@ -491,3 +491,15 @@ variable "remote_repository_enabled" { description = "Set to true to enable remote repository cache. Can be set via TF_VAR_remote_repository_enabled or REMOTE_REPOSITORY_ENABLED env var." default = false } + +variable "build_cluster_cache_disk_count" { + type = number + description = "The number of 375 GB NVME disks to raid together for storing build files." + default = 3 +} + +variable "client_cluster_cache_disk_count" { + type = number + description = "The number of 375 GB NVME disks to raid together for storing sandbox files." + default = 3 +}