Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions iac/provider-gcp/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -91,12 +91,8 @@ module "cluster" {
gcp_zone = var.gcp_zone
google_service_account_key = module.init.google_service_account_key

client_cluster_size_max = var.client_cluster_size_max
client_cluster_cache_disk_size_gb = var.client_cluster_cache_disk_size_gb
client_cluster_cache_disk_type = var.client_cluster_cache_disk_type
build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb
build_cluster_cache_disk_size_gb = var.build_cluster_cache_disk_size_gb
build_cluster_cache_disk_type = var.build_cluster_cache_disk_type
client_cluster_size_max = var.client_cluster_size_max
build_cluster_root_disk_size_gb = var.build_cluster_root_disk_size_gb

api_cluster_size = var.api_cluster_size
build_cluster_size = var.build_cluster_size
Expand All @@ -105,6 +101,9 @@ module "cluster" {
server_cluster_size = var.server_cluster_size
loki_cluster_size = var.loki_cluster_size

build_cluster_cache_disk_count = var.build_cluster_cache_disk_count
client_cluster_cache_disk_count = var.client_cluster_cache_disk_count

server_machine_type = var.server_machine_type
client_machine_type = var.client_machine_type
api_machine_type = var.api_machine_type
Expand Down
18 changes: 12 additions & 6 deletions iac/provider-gcp/nomad-cluster/nodepool-build.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ locals {
USE_FILESTORE_CACHE = var.filestore_cache_enabled
NODE_POOL = var.build_node_pool
BASE_HUGEPAGES_PERCENTAGE = var.build_base_hugepages_percentage
LOCAL_CACHE_DISK_COUNT = var.build_cluster_cache_disk_count
})
}

Expand Down Expand Up @@ -119,12 +120,17 @@ resource "google_compute_instance_template" "build" {
disk_type = "pd-ssd"
}

disk {
auto_delete = true
boot = false
type = "PERSISTENT"
disk_size_gb = var.build_cluster_cache_disk_size_gb
disk_type = var.build_cluster_cache_disk_type
dynamic "disk" {
for_each = [for n in range(var.build_cluster_cache_disk_count) : {}]

content {
auto_delete = true
boot = false
disk_size_gb = 375
interface = "NVME"
disk_type = "local-ssd"
type = "SCRATCH"
}
}

network_interface {
Expand Down
19 changes: 13 additions & 6 deletions iac/provider-gcp/nomad-cluster/nodepool-client.tf
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ locals {
USE_FILESTORE_CACHE = var.filestore_cache_enabled
NODE_POOL = var.orchestrator_node_pool
BASE_HUGEPAGES_PERCENTAGE = var.orchestrator_base_hugepages_percentage
LOCAL_CACHE_DISK_COUNT = var.client_cluster_cache_disk_count
})
}

Expand Down Expand Up @@ -131,18 +132,24 @@ resource "google_compute_instance_template" "client" {
}

disk {
auto_delete = true
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want this?

Copy link
Copy Markdown
Contributor Author

@djeebus djeebus Oct 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No reason to keep a boot or cache disk around after we delete the VM instance, right?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe in some strange debugging circumstances, but agree here.
Does this mean we are accumulating the disk somewhere right now?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, it turns out auto_delete is set to true by default. I can remove this if we don't want it to be explicit.

boot = true
source_image = data.google_compute_image.client_source_image.id
disk_size_gb = 300
disk_type = "pd-ssd"
}

disk {
auto_delete = true
boot = false
type = "PERSISTENT"
disk_size_gb = var.client_cluster_cache_disk_size_gb
disk_type = var.client_cluster_cache_disk_type
dynamic "disk" {
for_each = [for n in range(var.client_cluster_cache_disk_count) : {}]

content {
auto_delete = true
boot = false
disk_size_gb = 375
interface = "NVME"
disk_type = "local-ssd"
type = "SCRATCH"
}
}

network_interface {
Expand Down
84 changes: 57 additions & 27 deletions iac/provider-gcp/nomad-cluster/scripts/start-client.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,55 +16,85 @@ set -x
exec > >(tee /var/log/user-data.log | logger -t user-data -s 2>/dev/console) 2>&1

# Add cache disk for orchestrator and swapfile
# TODO: Parametrize this
DISK="/dev/disk/by-id/google-persistent-disk-1"
for i in {0..${ LOCAL_CACHE_DISK_COUNT - 1 }}; do
Comment thread
ValentaTomas marked this conversation as resolved.
dev_path="/dev/disk/by-id/google-local-nvme-ssd-$i"
echo "partitioning drive #$i"
parted --script $dev_path \
mklabel gpt \
mkpart primary 0% 100% \
set 1 raid on
done

%{ if LOCAL_CACHE_DISK_COUNT > 1 }
DISK="/dev/md0"

echo "creating the array"
until mdadm --create --verbose \
$DISK \
--raid-devices=${ LOCAL_CACHE_DISK_COUNT } \
%{ for i in range(LOCAL_CACHE_DISK_COUNT) ~}/dev/disk/by-id/google-local-nvme-ssd-${ i }-part1 %{ endfor }\
--level=0; do
echo "failed to create array, trying again ... "
sleep 1
done

echo "persisting array configuration"
mdadm --detail --scan --verbose | tee -a /etc/mdadm/mdadm.conf
%{ else }
DISK="/dev/disk/by-id/google-local-nvme-ssd-0-part1"
%{ endif }

MOUNT_POINT="/orchestrator"

# Step 1: Format the disk with XFS and a 4 KiB (4096-byte) block size
sudo mkfs.xfs -f -b size=4096 $DISK
until mkfs.xfs -f -b size=4096 $DISK; do
echo "failed to make file system, trying again ... "
sleep 1
done

# Step 2: Create the mount point
sudo mkdir -p $MOUNT_POINT
mkdir -p $MOUNT_POINT

# Step 3: Mount the disk with the noatime option
sudo mount -o noatime $DISK $MOUNT_POINT
echo "$DISK $MOUNT_POINT xfs noatime 0 0" | tee -a /etc/fstab
mount "$MOUNT_POINT"

sudo mkdir -p /orchestrator/sandbox
sudo mkdir -p /orchestrator/template
sudo mkdir -p /orchestrator/build
mkdir -p /orchestrator/sandbox
mkdir -p /orchestrator/template
mkdir -p /orchestrator/build

# Add swapfile
SWAPFILE="/swapfile"
sudo fallocate -l 100G $SWAPFILE
sudo chmod 600 $SWAPFILE
sudo mkswap $SWAPFILE
sudo swapon $SWAPFILE
fallocate -l 100G $SWAPFILE
chmod 600 $SWAPFILE
mkswap $SWAPFILE
swapon $SWAPFILE

# Make swapfile persistent
echo "$SWAPFILE none swap sw 0 0" | sudo tee -a /etc/fstab
echo "$SWAPFILE none swap sw 0 0" | tee -a /etc/fstab

# Set swap settings
sudo sysctl vm.swappiness=10
sudo sysctl vm.vfs_cache_pressure=50
sysctl vm.swappiness=10
sysctl vm.vfs_cache_pressure=50

# TODO: Optimize the mount more according to https://cloud.google.com/filestore/docs/mounting-fileshares
%{ if USE_FILESTORE_CACHE }
# Mount NFS
sudo mkdir -p "${NFS_MOUNT_PATH}"
echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | sudo tee -a /etc/fstab
sudo mount "${NFS_MOUNT_PATH}"
sudo mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}"
mkdir -p "${NFS_MOUNT_PATH}"
echo "${NFS_IP_ADDRESS}:/store ${NFS_MOUNT_PATH} nfs ${NFS_MOUNT_OPTS} 0 0" | tee -a /etc/fstab
mount "${NFS_MOUNT_PATH}"
mkdir -p "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}" && chmod +w "${NFS_MOUNT_PATH}/${NFS_MOUNT_SUBDIR}"
%{ endif }

# Add tmpfs for snapshotting
# TODO: Parametrize this
sudo mkdir -p /mnt/snapshot-cache
sudo mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache
mkdir -p /mnt/snapshot-cache
mount -t tmpfs -o size=65G tmpfs /mnt/snapshot-cache

# Raise the open-file limit for this shell and the processes it starts.
ulimit -n 1048576
# Export the CPU count reported by nproc. The previous single-quoted value
# exported the literal string "nproc", which is not a number.
export GOMAXPROCS="$(nproc)"

sudo tee -a /etc/sysctl.conf <<EOF
tee -a /etc/sysctl.conf <<EOF
# Increase the maximum number of socket connections
net.core.somaxconn = 65535

Expand All @@ -78,7 +108,7 @@ net.ipv4.tcp_max_syn_backlog = 65535
vm.max_map_count=1048576

EOF
sudo sysctl -p
sysctl -p

echo "Disabling inotify for NBD devices"
# https://lore.kernel.org/lkml/20220422054224.19527-1-matthew.ruffell@canonical.com/
Expand All @@ -87,11 +117,11 @@ cat <<EOH >/etc/udev/rules.d/97-nbd-device.rules
ACTION=="add|change", KERNEL=="nbd*", OPTIONS:="nowatch"
EOH

sudo udevadm control --reload-rules
sudo udevadm trigger
udevadm control --reload-rules
udevadm trigger

# Load the nbd module with 4096 devices
sudo modprobe nbd nbds_max=4096
modprobe nbd nbds_max=4096

# Create the directory for the fc mounts
mkdir -p /fc-vm
Expand Down Expand Up @@ -163,7 +193,7 @@ systemctl restart systemd-resolved
# The THP are by default set to madvise
# We are allocating the hugepages at the start when the memory is not fragmented yet
echo "[Setting up huge pages]"
sudo mkdir -p /mnt/hugepages
mkdir -p /mnt/hugepages
mount -t hugetlbfs none /mnt/hugepages
# Increase proactive compaction to reduce memory fragmentation for using overcomitted huge pages

Expand Down
34 changes: 18 additions & 16 deletions iac/provider-gcp/nomad-cluster/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,6 @@ variable "build_cluster_root_disk_size_gb" {
type = number
}

variable "build_cluster_cache_disk_size_gb" {
type = number
}

variable "build_cluster_cache_disk_type" {
type = string
}

variable "edge_api_port" {
type = object({
name = string
Expand Down Expand Up @@ -138,14 +130,6 @@ variable "client_machine_type" {
type = string
}

variable "client_cluster_cache_disk_size_gb" {
type = number
}

variable "client_cluster_cache_disk_type" {
type = string
}

variable "gcp_project_id" {
type = string
}
Expand Down Expand Up @@ -331,3 +315,21 @@ variable "api_nat_ips" {
variable "api_nat_min_ports_per_vm" {
type = number
}

# Number of local SSD cache disks attached to each build cluster node.
# The build node pool creates one 375 GB scratch disk per unit of this count
# and passes the value to the start-up script as LOCAL_CACHE_DISK_COUNT.
variable "build_cluster_cache_disk_count" {
  type        = number
  description = "Number of local NVMe SSD cache disks to attach to each build cluster node."

  validation {
    # `number` also accepts fractions (e.g. 1.5), which would flow into
    # range() and the rendered shell script; require a positive whole number.
    condition = (
      var.build_cluster_cache_disk_count > 0 &&
      floor(var.build_cluster_cache_disk_count) == var.build_cluster_cache_disk_count
    )
    error_message = "Must include at least 1 build cluster cache disk, given as a whole number."
  }
}

# Number of local SSD cache disks attached to each client cluster node.
# The client node pool creates one 375 GB scratch disk per unit of this count
# and passes the value to the start-up script as LOCAL_CACHE_DISK_COUNT.
variable "client_cluster_cache_disk_count" {
  type        = number
  description = "Number of local NVMe SSD cache disks to attach to each client cluster node."

  validation {
    # `number` also accepts fractions (e.g. 1.5), which would flow into
    # range() and the rendered shell script; require a positive whole number.
    condition = (
      var.client_cluster_cache_disk_count > 0 &&
      floor(var.client_cluster_cache_disk_count) == var.client_cluster_cache_disk_count
    )
    error_message = "Must include at least 1 client cluster cache disk, given as a whole number."
  }
}
12 changes: 12 additions & 0 deletions iac/provider-gcp/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -491,3 +491,15 @@ variable "remote_repository_enabled" {
description = "Set to true to enable remote repository cache. Can be set via TF_VAR_remote_repository_enabled or REMOTE_REPOSITORY_ENABLED env var."
default = false
}

# Cache disk count for build cluster nodes; forwarded to the "cluster" module.
variable "build_cluster_cache_disk_count" {
  description = "The number of 375 GB NVME disks to raid together for storing build files."
  default     = 3
  type        = number
}

# Cache disk count for client cluster nodes; forwarded to the "cluster" module.
variable "client_cluster_cache_disk_count" {
  description = "The number of 375 GB NVME disks to raid together for storing sandbox files."
  default     = 3
  type        = number
}
Loading