Replace the persistent cache disks of the build and client node pools with
375 GB local NVMe SSDs (SCRATCH disks). start-client.sh partitions every
attached local SSD, stripes them into /dev/md0 with mdadm RAID 0 when more
than one is configured, formats the result with XFS and mounts it at
/orchestrator. The number of disks is controlled by the new
build_cluster_cache_disk_count / client_cluster_cache_disk_count variables.

Review adjustments folded into this patch (hunk line counts are unchanged):
  * fstab entry for the cache device carries "nofail" so a missing device or
    unassembled array does not block boot.
  * "$dev_path" and "$DISK" are quoted in the new shell lines.
  * The "Step 1" comment now matches the 4096-byte block size actually used,
    and the truncated "Step 3" comment is completed.
  * dynamic "disk" blocks iterate over range(...) directly.
  * Disk-count validation also rejects fractional values, and the error
    messages end with a period.

NOTE(review): the sysctl.conf / udev-rules portion of the start-client.sh hunk
was already incomplete in the copy this patch was taken from. It is reproduced
as found below and must be restored from the original commit before applying.
NOTE(review): alignment whitespace inside the Terraform context lines could
not be recovered; apply with whitespace-insensitive matching or regenerate.

diff --git a/iac/provider-gcp/main.tf b/iac/provider-gcp/main.tf
index ab2f6a8e81..db461a23b6 100644
--- a/iac/provider-gcp/main.tf
+++ b/iac/provider-gcp/main.tf
@@ -91,12 +91,8 @@ module "cluster" {
   gcp_zone = var.gcp_zone
   google_service_account_key = module.init.google_service_account_key
 
-  client_cluster_size_max = var.client_cluster_size_max
-  client_cluster_cache_disk_size_gb = var.client_cluster_cache_disk_size_gb
-  client_cluster_cache_disk_type = var.client_cluster_cache_disk_type
-  build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb
-  build_cluster_cache_disk_size_gb = var.build_cluster_cache_disk_size_gb
-  build_cluster_cache_disk_type = var.build_cluster_cache_disk_type
+  client_cluster_size_max = var.client_cluster_size_max
+  build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb
 
   api_cluster_size = var.api_cluster_size
   build_cluster_size = var.build_cluster_size
@@ -105,6 +101,9 @@ module "cluster" {
   server_cluster_size = var.server_cluster_size
   loki_cluster_size = var.loki_cluster_size
 
+  build_cluster_cache_disk_count = var.build_cluster_cache_disk_count
+  client_cluster_cache_disk_count = var.client_cluster_cache_disk_count
+
   server_machine_type = var.server_machine_type
   client_machine_type = var.client_machine_type
   api_machine_type = var.api_machine_type
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-build.tf b/iac/provider-gcp/nomad-cluster/nodepool-build.tf
index aea195d7f0..ed2ed12348 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-build.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-build.tf
@@ -22,6 +22,7 @@ locals {
     USE_FILESTORE_CACHE = var.filestore_cache_enabled
     NODE_POOL = var.build_node_pool
     BASE_HUGEPAGES_PERCENTAGE = var.build_base_hugepages_percentage
+    LOCAL_CACHE_DISK_COUNT = var.build_cluster_cache_disk_count
   })
 }
 
@@ -119,12 +120,17 @@ resource "google_compute_instance_template" "build" {
     disk_type = "pd-ssd"
   }
 
-  disk {
-    auto_delete = true
-    boot = false
-    type = "PERSISTENT"
-    disk_size_gb = var.build_cluster_cache_disk_size_gb
-    disk_type = var.build_cluster_cache_disk_type
+  dynamic "disk" {
+    for_each = range(var.build_cluster_cache_disk_count)
+
+    content {
+      auto_delete = true
+      boot = false
+      disk_size_gb = 375
+      interface = "NVME"
+      disk_type = "local-ssd"
+      type = "SCRATCH"
+    }
   }
 
   network_interface {
diff --git a/iac/provider-gcp/nomad-cluster/nodepool-client.tf b/iac/provider-gcp/nomad-cluster/nodepool-client.tf
index c17188b1e8..040c87187a 100644
--- a/iac/provider-gcp/nomad-cluster/nodepool-client.tf
+++ b/iac/provider-gcp/nomad-cluster/nodepool-client.tf
@@ -22,6 +22,7 @@ locals {
     USE_FILESTORE_CACHE = var.filestore_cache_enabled
     NODE_POOL = var.orchestrator_node_pool
     BASE_HUGEPAGES_PERCENTAGE = var.orchestrator_base_hugepages_percentage
+    LOCAL_CACHE_DISK_COUNT = var.client_cluster_cache_disk_count
   })
 }
 
@@ -131,18 +132,24 @@ resource "google_compute_instance_template" "client" {
   }
 
   disk {
+    auto_delete = true
     boot = true
     source_image = data.google_compute_image.client_source_image.id
     disk_size_gb = 300
     disk_type = "pd-ssd"
   }
 
-  disk {
-    auto_delete = true
-    boot = false
-    type = "PERSISTENT"
-    disk_size_gb = var.client_cluster_cache_disk_size_gb
-    disk_type = var.client_cluster_cache_disk_type
+  dynamic "disk" {
+    for_each = range(var.client_cluster_cache_disk_count)
+
+    content {
+      auto_delete = true
+      boot = false
+      disk_size_gb = 375
+      interface = "NVME"
+      disk_type = "local-ssd"
+      type = "SCRATCH"
+    }
   }
 
   network_interface {
diff --git a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh
index 495c0cd929..780cb14a67 100755
--- a/iac/provider-gcp/nomad-cluster/scripts/start-client.sh
+++ b/iac/provider-gcp/nomad-cluster/scripts/start-client.sh
@@ -16,55 +16,85 @@ set -x
 exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1
 
 # Add cache disk for orchestrator and swapfile
-# TODO: Parametrize this
-DISK="/dev/disk/by-id/google-persistent-disk-1"
+for i in {0..${ LOCAL_CACHE_DISK_COUNT - 1 }}; do
+  dev_path="/dev/disk/by-id/google-local-nvme-ssd-$i"
+  echo "partitioning drive #$i"
+  parted --script "$dev_path" \
+    mklabel gpt \
+    mkpart primary 0% 100% \
+    set 1 raid on
+done
+
+%{ if LOCAL_CACHE_DISK_COUNT > 1 }
+DISK="/dev/md0"
+
+echo "creating the array"
+until mdadm --create --verbose \
+  "$DISK" \
+  --raid-devices=${ LOCAL_CACHE_DISK_COUNT } \
+  %{ for i in range(LOCAL_CACHE_DISK_COUNT) ~}/dev/disk/by-id/google-local-nvme-ssd-${ i }-part1 %{ endfor }\
+  --level=0; do
+  echo "failed to create array, trying again ... "
+  sleep 1
+done
+
+echo "persisting array configuration"
+mdadm --detail --scan --verbose | tee -a /etc/mdadm/mdadm.conf
+%{ else }
+DISK="/dev/disk/by-id/google-local-nvme-ssd-0-part1"
+%{ endif }
+
 MOUNT_POINT="/orchestrator"
 
-# Step 1: Format the disk with XFS and 65K block size
-sudo mkfs.xfs -f -b size=4096 $DISK
+# Step 1: Format the disk with XFS (4096-byte block size), retrying until mkfs succeeds
+until mkfs.xfs -f -b size=4096 "$DISK"; do
+  echo "failed to make file system, trying again ... "
+  sleep 1
+done
 
 # Step 2: Create the mount point
-sudo mkdir -p $MOUNT_POINT
+mkdir -p $MOUNT_POINT
 
-# Step 3: Mount the disk with
-sudo mount -o noatime $DISK $MOUNT_POINT
+# Step 3: Record the mount in /etc/fstab (noatime; nofail so a missing device does not block boot) and mount it
+echo "$DISK $MOUNT_POINT xfs noatime,nofail 0 0" | tee -a /etc/fstab
+mount "$MOUNT_POINT"
 
-sudo mkdir -p /orchestrator/sandbox
-sudo mkdir -p /orchestrator/template
-sudo mkdir -p /orchestrator/build
+mkdir -p /orchestrator/sandbox
+mkdir -p /orchestrator/template
+mkdir -p /orchestrator/build
 
 # Add swapfile
 SWAPFILE="/swapfile"
-sudo fallocate -l 100G $SWAPFILE
-sudo chmod 600 $SWAPFILE
-sudo mkswap $SWAPFILE
-sudo swapon $SWAPFILE
+fallocate -l 100G $SWAPFILE
+chmod 600 $SWAPFILE
+mkswap $SWAPFILE
+swapon $SWAPFILE
 
 # Make swapfile persistent
-echo "$SWAPFILE none swap sw 0 0" | sudo tee -a /etc/fstab
+echo "$SWAPFILE none swap sw 0 0" | tee -a /etc/fstab
 
 # Set swap settings
-sudo sysctl vm.swappiness=10
-sudo sysctl vm.vfs_cache_pressure=50
+sysctl vm.swappiness=10
+sysctl vm.vfs_cache_pressure=50
 
 # TODO: Optimize the mount more according to https://cloud.google.com/filestore/docs/mounting-fileshares
 %{ if USE_FILESTORE_CACHE }
 # Mount NFS
-sudo mkdir -p "${NFS_MOUNT_PATH}"
-echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | sudo tee -a /etc/fstab
-sudo mount "${NFS_MOUNT_PATH}"
-sudo mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}"
+mkdir -p "${NFS_MOUNT_PATH}"
+echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | tee -a /etc/fstab
+mount "${NFS_MOUNT_PATH}"
+mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}"
 %{ endif }
 
 # Add tmpfs for snapshotting
 # TODO: Parametrize this
-sudo mkdir -p /mnt/snapshot-cache
-sudo mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache
+mkdir -p /mnt/snapshot-cache
+mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache
 
 ulimit -n 1048576
 export GOMAXPROCS='nproc'
 
-sudo tee -a /etc/sysctl.conf
 </etc/udev/rules.d/97-nbd-device.rules
 ACTION=="add|change", KERNEL=="nbd*", OPTIONS:="nowatch"
 EOH
 
-sudo udevadm control --reload-rules
-sudo udevadm trigger
+udevadm control --reload-rules
+udevadm trigger
 
 # Load the nbd module with 4096 devices
-sudo modprobe nbd nbds_max=4096
+modprobe nbd nbds_max=4096
 
 # Create the directory for the fc mounts
 mkdir -p /fc-vm
@@ -163,7 +193,7 @@ systemctl restart systemd-resolved
 # The THP are by default set to madvise
 # We are allocating the hugepages at the start when the memory is not fragmented yet
 echo "[Setting up huge pages]"
-sudo mkdir -p /mnt/hugepages
+mkdir -p /mnt/hugepages
 mount -t hugetlbfs none /mnt/hugepages
 
 # Increase proactive compaction to reduce memory fragmentation for using overcomitted huge pages
diff --git a/iac/provider-gcp/nomad-cluster/variables.tf b/iac/provider-gcp/nomad-cluster/variables.tf
index d799c676a0..687bb3e209 100644
--- a/iac/provider-gcp/nomad-cluster/variables.tf
+++ b/iac/provider-gcp/nomad-cluster/variables.tf
@@ -69,14 +69,6 @@ variable "build_cluster_root_disk_size_gb" {
   type = number
 }
 
-variable "build_cluster_cache_disk_size_gb" {
-  type = number
-}
-
-variable "build_cluster_cache_disk_type" {
-  type = string
-}
-
 variable "edge_api_port" {
   type = object({
     name = string
@@ -138,14 +130,6 @@ variable "client_machine_type" {
   type = string
 }
 
-variable "client_cluster_cache_disk_size_gb" {
-  type = number
-}
-
-variable "client_cluster_cache_disk_type" {
-  type = string
-}
-
 variable "gcp_project_id" {
   type = string
 }
@@ -331,3 +315,21 @@ variable "api_nat_ips" {
 variable "api_nat_min_ports_per_vm" {
   type = number
 }
+
+variable "build_cluster_cache_disk_count" {
+  type = number
+
+  validation {
+    condition = var.build_cluster_cache_disk_count > 0 && floor(var.build_cluster_cache_disk_count) == var.build_cluster_cache_disk_count
+    error_message = "Must include at least 1 build cluster cache disk, given as a whole number."
+  }
+}
+
+variable "client_cluster_cache_disk_count" {
+  type = number
+
+  validation {
+    condition = var.client_cluster_cache_disk_count > 0 && floor(var.client_cluster_cache_disk_count) == var.client_cluster_cache_disk_count
+    error_message = "Must include at least 1 client cluster cache disk, given as a whole number."
+  }
+}
diff --git a/iac/provider-gcp/variables.tf b/iac/provider-gcp/variables.tf
index 8cb041c5e1..13f9abf8ab 100644
--- a/iac/provider-gcp/variables.tf
+++ b/iac/provider-gcp/variables.tf
@@ -491,3 +491,15 @@ variable "remote_repository_enabled" {
   description = "Set to true to enable remote repository cache. Can be set via TF_VAR_remote_repository_enabled or REMOTE_REPOSITORY_ENABLED env var."
   default = false
 }
+
+variable "build_cluster_cache_disk_count" {
+  type = number
+  description = "The number of 375 GB NVMe disks to RAID together for storing build files."
+  default = 3
+}
+
+variable "client_cluster_cache_disk_count" {
+  type = number
+  description = "The number of 375 GB NVMe disks to RAID together for storing sandbox files."
+  default = 3
+}