# Privileged benchmark DaemonSet for the swap_encryption benchmark.
#
# Rendered with Jinja2. Template variables:
#   ds_name, ds_namespace, ds_label  - DaemonSet identity / selector label
#   benchmark_nodepool               - value of the pkb_nodepool node label
#   image                            - container image for the benchmark pod
#   kernel_version                   - kernel.org release fetched for Phase 3b
#
# NOTE: this file is passed to bash verbatim (YAML literal block scalar), so
# shell line continuations must be a SINGLE backslash. A doubled backslash
# (a leftover from when this manifest was an inline Python string) is an
# escaped literal backslash followed by end-of-command and breaks the script.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ ds_name }}
  namespace: {{ ds_namespace }}
  labels:
    app: {{ ds_label }}
spec:
  selector:
    matchLabels:
      app: {{ ds_label }}
  template:
    metadata:
      labels:
        app: {{ ds_label }}
    spec:
      hostPID: true
      hostNetwork: true
      # Pin to the benchmark nodepool — never schedule on the dummy default pool.
      nodeSelector:
        pkb_nodepool: {{ benchmark_nodepool }}
      tolerations:
        - operator: Exists
      containers:
        - name: benchmark
          image: {{ image }}
          command:
            - bash
            - -c
            - |
              echo "[pkb] Installing benchmark tools..."
              # Retry apt-get up to 3 times — transient network failures are
              # common on a freshly-started GKE node. Critical tools (fio,
              # stress-ng) must be present before we write the ready sentinel;
              # a silent || true here would cause /tmp/pkb_ready to appear even
              # when tools are missing, breaking all subsequent phases.
              PKB_APT_OK=0
              for _attempt in 1 2 3; do
                apt-get update -qq 2>&1 || true
                DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
                  fio \
                  stress-ng \
                  sysstat \
                  cryptsetup \
                  mdadm \
                  redis-server \
                  redis-tools \
                  git \
                  wget \
                  curl \
                  make \
                  gcc \
                  bc \
                  flex \
                  bison \
                  libelf-dev \
                  libssl-dev \
                  cgroup-tools \
                  nvme-cli \
                  util-linux \
                  python3-pip \
                  libevent-dev \
                  libpcre3-dev \
                  zlib1g-dev \
                  build-essential \
                  autoconf \
                  automake \
                  libtool \
                  libtool-bin \
                  pkg-config \
                  python3-dev \
                  default-jre-headless \
                  2>&1 && PKB_APT_OK=1 && break
                echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
                sleep 15
              done
              if [ "$PKB_APT_OK" != "1" ] || \
                 ! command -v fio >/dev/null 2>&1 || \
                 ! command -v stress-ng >/dev/null 2>&1; then
                echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
                exit 1
              fi
              echo "[pkb] Installing memtier_benchmark from source..."
              # Pin a stable release tag — building from the moving default
              # branch (HEAD) intermittently broke (memtier_benchmark not found
              # → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the
              # version PKB's memtier package (memtier.MemtierResult.Parse) is
              # validated against and builds cleanly with the apt deps above.
              # Fall back to HEAD only if the tagged clone fails.
              if ! command -v memtier_benchmark >/dev/null 2>&1; then
                (cd /tmp && \
                 rm -rf memtier_benchmark && \
                 ( git clone --depth 1 --branch 2.2.1 \
                     https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \
                   git clone --depth 1 \
                     https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \
                 cd memtier_benchmark && \
                 autoreconf -ivf 2>&1 && \
                 ./configure 2>&1 && \
                 make -j$(nproc) 2>&1 && \
                 make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \
                  echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
              fi
              if command -v memtier_benchmark >/dev/null 2>&1; then
                echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
              fi
              echo "[pkb] Installing esrally (lightweight)..."
              python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
              pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
                pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
                echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
              if command -v esrally >/dev/null 2>&1; then
                echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
              else
                echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
              fi
              echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
              # Phase 3c needs a real search server on :9200. Nothing in apt
              # ships one and the pod has no systemd, so install the OpenSearch
              # bundle (ships its own JDK) and launch the binary directly in the
              # phase. All best-effort: if any step fails the phase probes the
              # endpoint and skips cleanly rather than recording fake timings.
              if [ ! -x /opt/opensearch/bin/opensearch ]; then
                OS_VER=2.15.0
                (cd /opt && \
                 wget -q --timeout=600 -O os.tgz \
                   "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \
                 tar -xzf os.tgz && rm -f os.tgz && \
                 mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \
                  echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
              fi
              if [ -x /opt/opensearch/bin/opensearch ]; then
                # pkbos owns and runs OpenSearch (it refuses to run as root).
                # Give it a home so HOME/temp paths are writable.
                id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
                # printf interprets \n itself; a doubled backslash here would
                # write a literal "\n" and produce a one-line, invalid config.
                printf 'discovery.type: single-node\nnetwork.host: 127.0.0.1\nplugins.security.disabled: true\n' \
                  > /opt/opensearch/config/opensearch.yml
                mkdir -p /opt/opensearch/config/jvm.options.d
                # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
                # On a 252 GB node this still leaves plenty of page cache to
                # pressure into swap during the phase.
                printf -- '-Xms2g\n-Xmx2g\n' \
                  > /opt/opensearch/config/jvm.options.d/pkb-heap.options
                sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
                # CRITICAL: never run the binary as root here (it bails and
                # leaves root-owned files in logs/ that block the pkbos server).
                # Clear any stale logs and chown everything to pkbos LAST.
                rm -f /opt/opensearch/logs/* 2>/dev/null || true
                chown -R pkbos /opt/opensearch 2>/dev/null || true
                echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
              fi
              echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
              PKB_KVER="{{ kernel_version }}"
              PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
              PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
              PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
              PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
              mkdir -p "$PKB_KROOT"
              if [ ! -f "$PKB_KTARBALL" ]; then
                wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \
                  echo "[pkb] WARNING: kernel tarball download failed" >&2
              fi
              if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
                echo "[pkb] Extracting kernel source (xz)..."
                tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \
                  echo "[pkb] WARNING: kernel source extraction failed" >&2
              fi
              echo "[pkb] Unlocking container cgroup swap limits..."
              # GKE cgroup v2 sets memory.swap.max=0 per-container, which
              # prevents swap usage even when the node has a swap device and
              # vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because
              # the kernel can't page out to swap for this cgroup.
              #
              # NOTE: the old approach derived the cgroup path from
              # /proc/self/cgroup, but inside a cgroup namespace that reports
              # "0::/" — so the write targeted the host ROOT cgroup, silently
              # no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
              # /sys is the host cgroup tree (hostPath mount) and this pod is
              # privileged, so instead unlock swap across the entire kubepods
              # hierarchy, which is guaranteed to contain our own container.
              if [ -d /sys/fs/cgroup/kubepods.slice ] || \
                 [ -d /sys/fs/cgroup/kubepods ]; then
                # cgroup v2: write 'max' to every memory.swap.max under kubepods*.
                find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
                  2>/dev/null | while read -r _f; do
                  echo max > "$_f" 2>/dev/null || true
                done
              fi
              # Best-effort: our own namespaced path and the unified root.
              PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
                2>/dev/null)
              for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
                          /sys/fs/cgroup/memory.swap.max; do
                [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
              done
              # cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
              find /sys/fs/cgroup/memory -path '*kubepods*' \
                -name memory.memsw.limit_in_bytes 2>/dev/null \
                | while read -r _f; do
                echo -1 > "$_f" 2>/dev/null || true
              done
              # Verify and surface the result in the pod log. grep -L lists
              # files that do NOT contain 'max' on their first line, i.e. ones
              # still capping swap.
              PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
                -name memory.swap.max 2>/dev/null \
                | xargs -r grep -L '^max' 2>/dev/null | head -1)
              if [ -n "$PKB_STILL_CAPPED" ]; then
                echo "[pkb] WARNING: cgroup swap still capped at $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be OOM-killed before swap is exercised" >&2
              else
                echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
              fi
              echo "[pkb] Tools installed. Writing ready sentinel."
              touch /tmp/pkb_ready
              sleep infinity
          securityContext:
            privileged: true
            capabilities:
              add: ["SYS_ADMIN", "IPC_LOCK"]
          resources:
            requests:
              memory: "512Mi"
          env:
            - name: NODE_NAME
              valueFrom:
                fieldRef:
                  fieldPath: spec.nodeName
          volumeMounts:
            - name: dev
              mountPath: /dev
            - name: sys
              mountPath: /sys
            - name: run
              mountPath: /run
            - name: proc-host
              mountPath: /proc-host
              readOnly: true
            - name: stateful-partition
              mountPath: /mnt/stateful_partition
            - name: lib-modules
              mountPath: /lib/modules
              readOnly: true
      volumes:
        - name: dev
          hostPath:
            path: /dev
        - name: sys
          hostPath:
            path: /sys
        - name: run
          hostPath:
            path: /run
        - name: proc-host
          hostPath:
            path: /proc
        - name: stateful-partition
          hostPath:
            path: /mnt/stateful_partition
            type: DirectoryOrCreate
        - name: lib-modules
          hostPath:
            path: /lib/modules
            type: Directory
# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark.

Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws

== Architecture ==

Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's
container_cluster abstraction, then deploys a privileged DaemonSet whose
pod has full host-device access (/dev, /sys, hostPID). All benchmark
phases execute inside this pod via kubectl exec, so measurements reflect
actual cluster-node behaviour including Kubernetes overhead (kubelet,
containerd cgroup hierarchy, etc.).

  GKE nodes ── dm-crypt with ephemeral key (go/node:swap-encryption)
               swap device: /dev/mapper/swap_encrypted (over dedicated
               hyperdisk or LSSD RAID-0 /dev/md0).
               Single-disk fallback: plain loop device on
               /mnt/stateful_partition — dm-crypt is blocked by COS
               kernel namespace restrictions from inside a pod.

  EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption
               swap device: /dev/nvme1n1 (or auto-detected)

== Benchmark Phases ==

  Phase 1 – fio Microbenchmarks
    Run fio directly on the swap block device (swapoff first) to measure
    the hardware + encryption ceiling: random IOPS (4K), sequential
    bandwidth (1M), and completion latency (iodepth=1).

  Phase 2a – CPU Overhead
    stress-ng drives sustained swap I/O; vmstat and pidstat capture
    swap-in/out rates and per-process CPU cost (kswapd, kcryptd,
    dm-crypt threads on GKE; Nitro offload on EKS).

  Phase 2b – I/O Interference
    Baseline fio on a scratch volume → re-run with concurrent swap
    pressure. IOPS/latency delta = storage contention cost.

  Phase 3a – Redis Latency
    Dataset loaded beyond container memory limit → GET/SET p99 latency
    measured while kernel swaps pages.

  Phase 3b – Kernel Build
    Linux compiled inside a memory-capped cgroup; slowdown ratio vs
    unconstrained baseline.

  Phase 3c – OpenSearch
    Bulk-index + search query under swap pressure (esrally or curl).
"""

import json
import logging
import re
import textwrap
import time
from typing import Any

from absl import flags
from perfkitbenchmarker import configs
from perfkitbenchmarker import errors
from perfkitbenchmarker import sample
from perfkitbenchmarker import vm_util
from perfkitbenchmarker.resources.container_service import kubectl

# Single module-level handle on the absl flag registry (it was previously
# assigned twice, with an orphaned section banner in between).
FLAGS = flags.FLAGS

# ---------------------------------------------------------------------------
# Benchmark identity
# ---------------------------------------------------------------------------

BENCHMARK_NAME = 'swap_encryption'

# Default PKB config: a one-node Kubernetes cluster with a cheap placeholder
# node. The description text is runtime data and is kept verbatim.
BENCHMARK_CONFIG = """
swap_encryption:
  description: >
    GKE vs. EKS swap encryption and LSSD performance comparison.
    Two-step nodepool setup: PKB provisions a minimal cluster with a cheap
    default nodepool (Step 1), then Prepare() adds the real benchmark
    nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a
    node-level startup script that configures dm-crypt swap before any pod
    is scheduled, then removes the default nodepool (Step 2). All benchmark
    phases run inside a privileged DaemonSet pinned to the benchmark nodepool.
  flags: {}
  container_cluster:
    type: Kubernetes
    vm_count: 1
    vm_spec:
      GCP:
        # Cheap placeholder — the benchmark nodepool is created in Prepare().
        machine_type: e2-medium
        boot_disk_size: 20
      AWS:
        # Cheap placeholder — the benchmark nodegroup is added in Prepare().
        machine_type: t3.medium
        boot_disk_size: 20
"""

# ---------------------------------------------------------------------------
# Flags
# ---------------------------------------------------------------------------

_DAEMONSET_IMAGE = flags.DEFINE_string(
    'swap_encryption_daemonset_image',
    'ubuntu:22.04',
    'Container image used for the privileged benchmark DaemonSet pod.',
)


_NODEPOOL = flags.DEFINE_string(
    'swap_encryption_nodepool',
    'benchmark',
    'Name of the node pool to deploy the benchmark DaemonSet on.',
)


_INSTANCE_SIZE_LABEL = flags.DEFINE_string(
    'swap_encryption_instance_size_label',
    '',
    'Human-readable label for the current instance size being tested, e.g. '
    '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that '
    'results from multiple PKB runs across different instance sizes can be '
    'collated and compared. Defaults to the value reported by the cloud '
    'metadata endpoint inside the pod.',
)


_COLLECT_COST = flags.DEFINE_boolean(
    'swap_encryption_collect_cost',
    False,
    'When True, emit a cost_estimate_usd sample using on-demand pricing '
    'for the instance type detected at runtime.',
)


_FAIL_ON_DEGRADED = flags.DEFINE_boolean(
    'swap_encryption_fail_on_degraded',
    True,
    'When True (default), raise an error at the end of Run() if the run was '
    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
    'swap-pressure phase was OOM-killed before completing. This prevents PKB '
    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
    'empty or meaningless data. Set False to keep the legacy behaviour of '
    'always returning whatever partial samples were collected.',
)


_PHASES = flags.DEFINE_list(
    'swap_encryption_phases',
    ['all'],
    'Which Run() phases to execute, for fast iteration against an '
    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). '
    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
    '3b (kernel build), 3c (opensearch). Default "all" runs everything. '
    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
    'Phases not listed are skipped and do not affect the degraded-run gate '
    '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
)


_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
    'swap_encryption_benchmark_machine_type',
    'n4-highmem-32',
    'Machine type for the benchmark nodepool created in Prepare(). '
    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
    '(LSSD RAID-0). The matching swap setup is selected automatically.',
)


_BENCHMARK_LSSD = flags.DEFINE_boolean(
    'swap_encryption_lssd',
    False,
    'Force LSSD RAID-0 swap path even when the machine type name does not '
    'contain "lssd". Auto-detected from machine type when False.',
)


_LSSD_COUNT = flags.DEFINE_integer(
    'swap_encryption_lssd_count',
    1,
    'Number of local NVMe SSDs to attach as raw block devices '
    '(--local-nvme-ssd-block count=N). Must match the fixed local SSD '
    'count for the chosen machine type: c4-standard-8-lssd=1, '
    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). '
    'Default 1 covers most single-lssd machine types.',
)


_NODE_IMAGE_TYPE = flags.DEFINE_string(
    'swap_encryption_node_image_type',
    'UBUNTU_CONTAINERD',
    'GKE node image type for the benchmark nodepool. '
    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
    'down device-mapper at the kernel LSM level and cryptsetup hangs '
    'indefinitely from any pod context (even privileged, even via nsenter '
    'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup '
    'from privileged pods without restriction. '
    'Use COS_CONTAINERD only when dm-crypt is disabled '
    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. '
    'AL2 on EKS.',
)


_BOOT_DISK_TYPE = flags.DEFINE_string(
    'swap_encryption_boot_disk_type',
    'hyperdisk-balanced',
    'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced '
    'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 '
    'dev/test machines, which do not support hyperdisk-balanced.',
)


_BOOT_DISK_IOPS = flags.DEFINE_integer(
    'swap_encryption_boot_disk_iops',
    80000,
    'Provisioned IOPS for the boot disk (hyperdisk-balanced only). '
    '80 000 is the COS max-IOPS target. Ignored for pd-ssd.',
)


_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
    'swap_encryption_boot_disk_throughput',
    1200,
    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
    'only). Must be set together with iops. 1200 MB/s pairs with 80 000 '
    'IOPS for production; use 140 (minimum) for dev/test. Ignored for '
    'pd-ssd.',
)


_BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
    'swap_encryption_boot_disk_size_gb',
    500,
    'Boot disk size in GiB for the benchmark nodepool. 500 GiB is '
    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
    '(see Engineer Assignments table in execution-plan.md). '
    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
)


_ADD_SWAP_DISK = flags.DEFINE_boolean(
    'swap_encryption_add_swap_disk',
    False,
    'Attach a dedicated second disk to the benchmark nodepool for use as '
    'the swap device. Required for dm-crypt measurement on single-boot-disk '
    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
    'from pod namespaces. The second disk is provisioned via '
    '--additional-node-disk using the same type/IOPS/throughput as the boot '
    'disk flags.',
)
_SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
    'swap_encryption_swap_disk_size_gb',
    500,
    'Size in GiB of the dedicated swap disk when '
    '--swap_encryption_add_swap_disk is True. Must satisfy the '
    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
)


# Identity of the benchmark DaemonSet; also passed to the manifest template.
_DS_NAME = 'pkb-swap-benchmark'
_DS_NAMESPACE = 'default'
_DS_LABEL = 'pkb-swap-benchmark'

# Module-level run state. These are lists so they can be mutated in place
# (cleared / appended to) without `global` statements.
_active_pod: list[str] = []  # single-element list so closures can mutate it
_degraded_reasons: list[str] = []
_pod_lost: list[str] = []
_oom_events: list[str] = []

# Value of the pkb_nodepool node label the DaemonSet is pinned to.
# NOTE(review): this duplicates the default of --swap_encryption_nodepool;
# confirm whether the flag or this constant is meant to be authoritative.
_BENCHMARK_NODEPOOL = 'benchmark'
_DEFAULT_NODEPOOL = 'default-pool'


def _daemonset_yaml(image: str) -> str:
  """Render the privileged benchmark DaemonSet manifest.

  The manifest lives in the PKB data file
  data/cluster/swap_encryption_daemonset.yaml.j2 and is rendered with Jinja2
  rather than kept as an inline string. The DaemonSet is pinned to the
  benchmark nodepool through its nodeSelector.

  Args:
    image: Container image for the benchmark pod.

  Returns:
    The rendered manifest as a YAML string.
  """
  # NOTE(review): _KERNEL_VERSION is not defined in the portion of this module
  # visible here — confirm the flag exists, otherwise this raises NameError.
  return vm_util.ReadAndRenderJinja2Template(
      'cluster/swap_encryption_daemonset.yaml.j2',
      ds_name=_DS_NAME,
      ds_namespace=_DS_NAMESPACE,
      ds_label=_DS_LABEL,
      benchmark_nodepool=_BENCHMARK_NODEPOOL,
      image=image,
      kernel_version=_KERNEL_VERSION.value,
  )


def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
  """Merge user overrides into the default swap_encryption config."""
  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)


def Prepare(spec) -> None:
  """Set up the benchmark nodepool and deploy the benchmark DaemonSet.

  PKB has already provisioned the cluster with a cheap placeholder node.
  This function then performs, in order:

    2a.  (clusters exposing a `project` attribute, i.e. the GCP path) create
         the benchmark nodepool.
    2b.  Wait for the benchmark node; optionally attach a dedicated swap
         disk (--swap_encryption_add_swap_disk).
         On other clusters (EKS) instead re-label every existing node with
         pkb_nodepool=<benchmark> so the DaemonSet nodeSelector matches, and
         ensure the io2 volume.
    2c.  Deploy the privileged DaemonSet and wait for its pod to be ready.
    2d.  (GCP path only) delete the placeholder default nodepool and
         re-resolve the pod name, since the pod may be rescheduled.

  The default pool is deleted only AFTER the pod is ready: deleting it while
  the benchmark node is still joining caused API-server i/o timeouts.

  Args:
    spec: The PKB benchmark spec; only `spec.container_cluster` is used.

  Raises:
    errors.Benchmarks.PrepareException: if the benchmark pod does not become
      ready (previously this was silently logged as "pod ready: None").
  """
  cluster = spec.container_cluster
  # Only GCP cluster objects carry a project; it selects the two-step path.
  has_project = bool(getattr(cluster, 'project', None))

  if has_project:
    # ── Step 2a: add real benchmark nodepool ────────────────────────────────
    logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
    _create_benchmark_node_pool(cluster)

    # ── Step 2b: wait for the benchmark node to join and be Ready ──────────
    logging.info('[swap_encryption] Step 2b: waiting for benchmark node')
    _wait_for_benchmark_node()

    # ── Step 2b2: attach dedicated swap disk (if requested) ────────────────
    # --additional-node-disk is not available in all gcloud versions, so the
    # disk is created + attached after the node is up.
    if _ADD_SWAP_DISK.value:
      logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk')
      _attach_swap_disk(cluster)
  else:
    # AWS / EKS: nodepool management is external. PKB's cluster creation
    # labels nodes pkb_nodepool=default, so re-label all existing nodes here
    # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark).
    logging.info(
        '[swap_encryption] EKS cluster — labelling existing nodes with '
        'pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
        _BENCHMARK_NODEPOOL)
    kubectl.RunKubectlCommand([
        'label', 'nodes', '--all', '--overwrite',
        f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
    ])
    # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
    # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
    _ensure_io2_volume()

  # ── Step 2c: deploy DaemonSet ─────────────────────────────────────────────
  logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet')
  _deploy_daemonset()

  pod = _wait_for_benchmark_pod()
  if pod is None:
    raise errors.Benchmarks.PrepareException(
        '[swap_encryption] benchmark pod did not become ready after the '
        'DaemonSet was deployed')
  logging.info('[swap_encryption] Benchmark pod ready: %s', pod)

  if not has_project:
    return

  # ── Step 2d: now safe to remove the dummy default nodepool ────────────────
  logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool')
  _delete_default_node_pool(cluster)
  # The DaemonSet pod may be evicted and rescheduled under a new name during
  # the nodepool deletion, so re-resolve it to avoid stale references.
  logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod '
               'after nodepool deletion')
  pod = _wait_for_benchmark_pod()
  if pod is None:
    raise errors.Benchmarks.PrepareException(
        '[swap_encryption] benchmark pod did not come back after the '
        'default nodepool was deleted')
  logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod)
+ _ensure_io2_volume() + + # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── + # Deploy and wait for the pod BEFORE deleting the default nodepool. + # Deleting the default pool while the benchmark node is still joining causes + # a temporary API server i/o timeout (control plane busy with two nodepool + # ops simultaneously). Once the pod is Running the cluster is fully stable. + logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet') + _deploy_daemonset() + + pod = _wait_for_benchmark_pod() + logging.info('[swap_encryption] Benchmark pod ready: %s', pod) + + # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── + if getattr(cluster, 'project', None): + logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool') + _delete_default_node_pool(cluster) + # The DaemonSet pod may be evicted and rescheduled with a new name during + # the nodepool deletion (cluster control plane briefly interrupts pod + # lifecycle). Re-resolve the pod name to avoid stale-reference errors on + # all subsequent _pod_exec calls. + logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod ' + 'after nodepool deletion') + pod = _wait_for_benchmark_pod() + logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) + + +def _phase_selected(token: str) -> bool: + """Return True if phase `token` should run given --swap_encryption_phases. + + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ('all' in selected) or (token.lower() in selected) + + +def Run(spec) -> list[sample.Sample]: + """Execute all benchmark phases with gate logic. + + Execution is structured in three gated tiers matching the execution plan: + + Tier 1 (Gate 1) — fio microbenchmarks + Raw I/O ceiling of the swap device. 
def Run(spec) -> list[sample.Sample]:
  """Run the benchmark and report a run-status sample.

  The execution plan structures the benchmark in three gated tiers (fio
  microbenchmarks; stress-ng CPU overhead + I/O interference; Redis, kernel
  build and OpenSearch workloads). The body below resolves the benchmark
  pod, detects the swap device, optionally emits a cost sample, and then
  applies the final degradation gate; it does not itself invoke any tier.

  Args:
    spec: The PKB benchmark spec (unused here).

  Returns:
    The collected samples, ending with a `swap_encryption_run_status` sample
    whose value is 1.0 for a clean run and 0.0 for a degraded one.

  Raises:
    errors.Benchmarks.RunError: if the benchmark pod is not ready, or if the
      run was degraded and --swap_encryption_fail_on_degraded is True.
  """
  pod = _wait_for_benchmark_pod()
  if pod is None:
    # Without this guard None would be stored as the active pod and handed to
    # every helper below.
    raise errors.Benchmarks.RunError(
        '[swap_encryption] benchmark pod is not ready; cannot run')

  # Reset the module-level trackers so state from a previous run in the same
  # process cannot leak into this one.
  _active_pod.clear()
  _active_pod.append(pod)
  _degraded_reasons.clear()
  _pod_lost.clear()
  _oom_events.clear()
  original_pod = pod

  swap_dev = _detect_swap_device(pod)
  base_meta = _build_metadata(pod, swap_dev)
  results: list[sample.Sample] = []
  t_run_start = time.time()

  logging.info('[swap_encryption] swap device: %s', swap_dev)

  # ── Cost estimate ─────────────────────────────────────────────────────────
  if _COLLECT_COST.value:
    elapsed = time.time() - t_run_start
    results += _collect_cost_sample(pod, elapsed, base_meta)

  # ── Final degradation gate ────────────────────────────────────────────────
  # A catastrophic failure (pod replaced, pod lost, OOM kill) must not be
  # reported as SUCCEEDED with empty or meaningless metrics, so the tracked
  # conditions are turned into explicit degradation reasons here.
  if _active_pod and _active_pod[0] != original_pod:
    _degraded_reasons.append(
        f'benchmark pod was replaced during the run '
        f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap '
        f'pressure; phases executed after the eviction ran against a '
        f'freshly-initialised pod (empty /tmp, swap re-setup) and may be '
        f'invalid')
  if _pod_lost:
    _degraded_reasons.append(
        f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) '
        f'— the pod died (node memory-pressure eviction or container exit) and '
        f'any phase running at or after that point (e.g. kernel-build baseline, '
        f'OpenSearch) produced invalid data')
  if _oom_events:
    _degraded_reasons.append(
        f'OOM kill(s) (rc=137) occurred during the run on pod(s) '
        f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by '
        f'the OOM killer (the container may have restarted in place), so the '
        f'affected phase(s) produced no or partial data')

  degraded = bool(_degraded_reasons)
  status_meta = dict(
      base_meta,
      degraded=degraded,
      degraded_reasons='; '.join(_degraded_reasons) or 'none',
      # +1 counts the status sample itself.
      num_samples=len(results) + 1)
  results.append(sample.Sample(
      'swap_encryption_run_status',
      0.0 if degraded else 1.0,
      'status',
      status_meta))

  if not degraded:
    logging.info('[swap_encryption] Run completed cleanly (%d samples)',
                 len(results))
    return results

  msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(_degraded_reasons)
  logging.error(msg)
  if _FAIL_ON_DEGRADED.value:
    # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.
    # NOTE(review): samples are handed to PKB via the return value, so a
    # raising Run() presumably publishes nothing — confirm. They are logged
    # here so the partial data is at least recoverable from the run log.
    for collected in results:
      logging.info('[swap_encryption] sample (degraded run): %s', collected)
    raise errors.Benchmarks.RunError(msg)
  return results
def Cleanup(spec) -> None:
  """Tear down swap configuration in the pod, then remove the DaemonSet.

  All in-pod steps are best-effort (`ignore_failure=True`) so that a broken
  pod never prevents the DaemonSet and the swap disk from being removed.

  Args:
    spec: The PKB benchmark spec; only `spec.container_cluster` is used.
  """
  # Short timeout: during cleanup the pod may legitimately be gone already.
  pod = _wait_for_benchmark_pod(timeout=30)
  if pod is not None:
    # Stop the workloads FIRST: swapoff has to page everything back into RAM,
    # which can stall or OOM while stress-ng/fio are still holding memory.
    _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true",
              ignore_failure=True)
    _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True)
    # Remove the dm-crypt mapping used for encrypted swap.
    _pod_exec(pod, textwrap.dedent("""
        swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
        dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
    """), ignore_failure=True)
    # Clean up loop device backing files (single-disk fallback path).
    _pod_exec(pod, textwrap.dedent("""
        for backing in /var/pkb_swap_backing /run/pkb_swap_backing \
                       /mnt/stateful_partition/pkb_swap_backing
        do
          losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \
          while read dev
          do
            losetup -d "$dev" 2>/dev/null || true
          done
          rm -f "$backing"
        done
    """), ignore_failure=True)

  _delete_daemonset()

  # Detach and delete the dedicated swap disk if one was provisioned.
  cluster = spec.container_cluster
  if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None):
    _detach_and_delete_swap_disk(cluster)


def _deploy_daemonset() -> None:
  """Render the benchmark DaemonSet manifest and `kubectl apply` it."""
  manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value)
  with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as manifest_file:
    manifest_file.write(manifest)
    # Close to flush the manifest to disk before kubectl reads it by name.
    manifest_file.close()
    kubectl.RunKubectlCommand(['apply', '-f', manifest_file.name])
  logging.info('[swap_encryption] DaemonSet applied')
# Seconds between polls while waiting for the benchmark pod.
_POD_POLL_INTERVAL_S = 15

# Lower-cased substrings of `kubectl exec` stderr meaning the pod or its
# container is gone and the pod phase must be re-resolved. 'not found' covers
# both "container not found" and `Error from server (NotFound): pods ... not
# found` (pod deleted or replaced while we were polling it).
_POD_GONE_MARKERS = ('not found', 'unable to upgrade connection')


def _find_running_benchmark_pod(last_phase: str) -> tuple[str | None, str]:
  """Look for a benchmark pod in phase Running.

  Args:
    last_phase: The phase most recently logged, used to avoid repeating the
      same log line on every poll.

  Returns:
    (name of the first Running pod or None, updated last logged phase).
  """
  # Tab-separated name/phase output: kubectl exits 0 even when no pods exist,
  # which avoids jsonpath index errors.
  out, _, rc = kubectl.RunKubectlCommand([
      'get', 'pods',
      '-l', f'app={_DS_LABEL}',
      '-n', _DS_NAMESPACE,
      '-o',
      r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}',
  ], raise_on_failure=False)

  if rc != 0 or not out.strip():
    logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...')
    return None, last_phase

  for line in out.strip().splitlines():
    fields = line.split('\t')
    if len(fields) != 2:
      continue
    pod_name, phase = fields[0].strip(), fields[1].strip()
    if phase == 'Running':
      logging.info('[swap_encryption] Pod %s is Running – '
                   'waiting for tool install to finish...', pod_name)
      return pod_name, last_phase
    if phase != last_phase:
      logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase)
      last_phase = phase
      if phase == 'Pending':
        _log_pod_events(pod_name)
  return None, last_phase


def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
  """Wait until the DaemonSet pod is Running AND its tools are installed.

  The benchmark container installs its tools on start and touches
  /tmp/pkb_ready when done. Commands must not be exec-ed before that
  sentinel exists, otherwise tools such as cryptsetup / fio may be missing.

  Args:
    timeout: Maximum number of seconds to wait.

  Returns:
    The name of the ready pod, or None if none became ready in time.
  """
  deadline = time.time() + timeout
  last_phase = ''
  ready_pod = None  # pod name once its phase is Running

  while time.time() < deadline:
    # ── Step 1: wait for Running phase ──────────────────────────────────────
    if ready_pod is None:
      ready_pod, last_phase = _find_running_benchmark_pod(last_phase)

    # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
    if ready_pod is not None:
      _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([
          'exec', ready_pod, '-n', _DS_NAMESPACE,
          '--', 'test', '-f', '/tmp/pkb_ready',
      ], raise_on_failure=False)
      if sentinel_rc == 0:
        logging.info(
            '[swap_encryption] Pod %s ready (tools installed)', ready_pod)
        return ready_pod

      stderr_lower = sentinel_err.lower()
      if any(marker in stderr_lower for marker in _POD_GONE_MARKERS):
        # The container crashed or the pod was deleted/replaced: forget it and
        # re-resolve the pod on the next iteration instead of polling a dead
        # pod until the timeout.
        logging.warning('[swap_encryption] Pod %s: container not running (%s) '
                        '— will re-check pod state', ready_pod,
                        sentinel_err.strip())
        ready_pod = None
        last_phase = ''
      else:
        logging.info(
            '[swap_encryption] Pod %s: still installing tools...', ready_pod)

    # Never sleep past the deadline, so short timeouts are honoured.
    time.sleep(max(0.0, min(_POD_POLL_INTERVAL_S, deadline - time.time())))

  logging.warning(
      '[swap_encryption] Benchmark pod not ready after %ds', timeout)
  return None
+ if ('container not found' in sentinel_err + or 'unable to upgrade connection' in sentinel_err): + logging.warning('[swap_encryption] Pod %s: container not running (%s) ' + '— will re-check pod state', ready_pod, sentinel_err.strip()) + ready_pod = None + last_phase = '' + else: + logging.info( + '[swap_encryption] Pod %s: still installing tools...', ready_pod) + + time.sleep(15) + + logging.warning( + '[swap_encryption] Benchmark pod not ready after %ds', timeout) + return None + + +def _log_pod_events(pod_name: str) -> None: + """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand([ + 'describe', 'pod', pod_name, + '-n', _DS_NAMESPACE, + ], raise_on_failure=False) + # Only log the Events section to keep output manageable + in_events = False + lines = [] + for line in events_out.splitlines(): + if line.startswith('Events:'): + in_events = True + if in_events: + lines.append(line) + if lines: + logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])) + else: + logging.info('[swap_encryption] kubectl describe output:\n%s', + events_out[-2000:] if len(events_out) > 2000 else events_out) + + +def _delete_daemonset() -> None: + """Delete the benchmark DaemonSet.""" + kubectl.RunKubectlCommand([ + 'delete', 'daemonset', _DS_NAME, + '-n', _DS_NAMESPACE, + '--ignore-not-found', + ], raise_on_failure=False) + logging.info('[swap_encryption] DaemonSet deleted') + + +def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: + """Return a bash startup script for the benchmark nodepool. + + NOTE: This function is not currently used. GKE reserves the + `startup-script` node metadata key, so dm-crypt setup is performed + from within the privileged DaemonSet pod instead (see + _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. 
+ + Args: + enable_dmcrypt: When True, wrap the swap device in dm-crypt plain + mode (aes-xts-plain64, ephemeral random key) matching GKE's + go/node:swap-encryption implementation. + lssd: When True, build a RAID-0 array across all local SSDs before + setting up swap (matches go/gke-swap-lssd). + + Returns: + A bash script string suitable for running as root at node boot. + """ + dmcrypt_str = 'true' if enable_dmcrypt else 'false' + lssd_str = 'true' if lssd else 'false' + + return textwrap.dedent(f"""\ + #!/bin/bash + # PKB swap_encryption_benchmark — nodepool startup script. + # Configures swap once at node boot so all benchmark phases see a + # pre-warmed swap device. Runs as root on the COS host. + set -euo pipefail + ENABLE_DMCRYPT={dmcrypt_str} + LSSD={lssd_str} + + _wait_dev() {{ + local d=$1 i + for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done + echo "[pkb-startup] device $d not ready" >&2; return 1 + }} + + _boot_dev() {{ + lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1 + }} + + if $LSSD; then + BOOT=$(_boot_dev) + # Collect all non-rotational non-boot block devices (local SSDs) + DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true) + N=$(echo "$DEVS" | grep -c /dev/ || true) + if [ "$N" -gt 1 ]; then + modprobe raid0 || true + # shellcheck disable=SC2086 + mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force + TARGET=/dev/md0 + elif [ "$N" -eq 1 ]; then + TARGET=$(echo "$DEVS" | head -1) + else + echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2 + exit 0 + fi + else + BOOT=$(_boot_dev) + RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true) + if [ -z "$RAW" ]; then + echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2 + exit 0 + fi + TARGET=/dev/$RAW + fi + + _wait_dev "$TARGET" + + if $ENABLE_DMCRYPT; then + modprobe dm-crypt || true + dd if=/dev/urandom bs=32 count=1 
2>/dev/null | \\ + cryptsetup open --type plain \\ + --cipher aes-xts-plain64 --key-size 256 \\ + --key-file=- "$TARGET" pkb_swap + SWAP_DEV=/dev/mapper/pkb_swap + else + SWAP_DEV=$TARGET + fi + + mkswap "$SWAP_DEV" + swapon "$SWAP_DEV" + echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)" + """) + + +_HYPERDISK_MAX_IOPS_PER_MBPS = 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s + + +def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: + """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. + + Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed + 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails + with "Requested provisioned throughput is too low for the provisioned iops". + Clamp throughput UP to the minimum the requested IOPS need (plus a small + margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk + creation. + """ + min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) + if throughput < min_tput: + logging.warning( + '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for ' + '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d', + throughput, iops, min_tput, min_tput) + return min_tput + return throughput + + +def _create_benchmark_node_pool(cluster) -> None: + """Add the benchmark nodepool to the existing cluster (Step 2 of setup). + + Uses: + --swap_encryption_benchmark_machine_type (default n4-highmem-32) + --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_boot_disk_iops (default 80000) + --swap_encryption_enable_dmcrypt (default True) + + The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet + nodeSelector targets it exclusively. 
dm-crypt swap setup is performed + from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / + _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata + because GKE reserves that metadata key and rejects it at the API level. + """ + machine_type = _BENCHMARK_MACHINE_TYPE.value + # Auto-detect LSSD from machine type name; flag overrides only when True. + is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower() + + # Determine zone/region from the cluster object. + zone_flags: list[str] = [] + if getattr(cluster, 'zones', None): + zone_flags = ['--zone', cluster.zones[0]] + elif getattr(cluster, 'region', None): + zone_flags = ['--region', cluster.region] + + # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). + # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on + # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk + # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable + # headroom and matches the Config 2 spec in the Engineer Assignments table). + disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value + + disk_type = _BOOT_DISK_TYPE.value + cmd = [ + 'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL, + '--cluster', cluster.name, + '--project', cluster.project, + '--machine-type', machine_type, + '--image-type', _NODE_IMAGE_TYPE.value, + '--disk-type', disk_type, + '--disk-size', str(disk_size_gb), + '--num-nodes', '1', + '--node-labels', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '--no-enable-autoupgrade', + '--no-enable-autorepair', + ] + zone_flags + + # IOPS and throughput provisioning only applies to hyperdisk-* types AND + # only when the boot disk is also the swap device (non-LSSD configs). + # For LSSD machines the boot disk is OS-only; swap is on local NVMe. + # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the + # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). 
+ if disk_type.startswith('hyperdisk') and not is_lssd: + cmd += [ + '--boot-disk-provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--boot-disk-provisioned-throughput', + str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, + _BOOT_DISK_THROUGHPUT.value)), + ] + + # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm + # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). + if is_lssd: + cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] + + logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / ' + 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' + 'add_swap_disk=%s', + _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, + disk_size_gb, _BOOT_DISK_IOPS.value, + _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value) + + # LSSD nodepools take longer to provision than PD-only nodepools because + # GKE must also initialise the local NVMe devices before marking nodes Ready. + # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. + stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200, + raise_on_failure=False) + + if rc != 0: + # Idempotent prepare: if the nodepool already exists (e.g. re-running + # --run_stage=prepare,run to redeploy the DaemonSet onto an existing + # cluster), reuse it instead of failing. gcloud returns a 409 / + # "Already exists" message in this case. + low = (stderr or '').lower() + if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low: + logging.info('[swap_encryption] Benchmark nodepool already exists — ' + 'reusing it (idempotent prepare); proceeding to DaemonSet') + return + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create benchmark nodepool ' + f'(rc={rc}): {stderr}' + ) + logging.info('[swap_encryption] Benchmark nodepool ready') + + +def _wait_for_benchmark_node(timeout: int = 900) -> None: + """Block until a node labelled pkb_nodepool=benchmark is Ready. 
+ + gcloud container node-pools create returns as soon as the API accepts the + request — the actual node VM may take another 2-4 minutes to boot, join the + cluster, and pass its readiness checks. Deploying the DaemonSet before that + point leaves the pod Pending indefinitely because the nodeSelector finds no + eligible node. + + This function polls kubectl every 15 s until at least one node with + pkb_nodepool=benchmark has Ready=True, then returns. + """ + deadline = time.time() + timeout + logging.info('[swap_encryption] Waiting for benchmark node ' + '(pkb_nodepool=benchmark) to be Ready...') + while time.time() < deadline: + out, _, rc = kubectl.RunKubectlCommand([ + 'get', 'nodes', + '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', r'jsonpath={range .items[*]}' + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}', + ], raise_on_failure=False) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split('\t') + if len(parts) == 2 and parts[1].strip() == 'True': + logging.info('[swap_encryption] Benchmark node ready: %s', + parts[0].strip()) + return + + logging.info('[swap_encryption] Benchmark node not yet Ready — ' + 'retrying in 15 s...') + time.sleep(15) + + raise errors.Benchmarks.RunError( + '[swap_encryption] Timed out waiting for benchmark node ' + f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready ' + f'after {timeout}s' + ) + + +def _attach_swap_disk(cluster) -> None: + """Create a dedicated hyperdisk and attach it to the benchmark node. + + gcloud container node-pools create --additional-node-disk is not available + in all gcloud SDK versions, so we use gcloud compute to create the disk and + attach it after the node is ready. In GKE the Kubernetes node name is the + same as the GCE instance name, so no translation is needed. 
+ + After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe + nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. + + The disk is named pkb-swap- to avoid name collisions across + concurrent runs. Cleanup deletes it in Cleanup() if it exists. + """ + # Resolve zone from cluster + zone = None + if getattr(cluster, 'zones', None): + zone = cluster.zones[0] + elif getattr(cluster, 'region', None): + zone = cluster.region + if not zone: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot attach swap disk: cluster zone unknown') + + project = cluster.project + disk_name = f'pkb-swap-{cluster.name}' + disk_type = _BOOT_DISK_TYPE.value + disk_size_gb = _SWAP_DISK_SIZE_GB.value + + # ── Step 1: get the GCE instance name of the benchmark node ─────────────── + node_out, _, rc = kubectl.RunKubectlCommand([ + 'get', 'nodes', + '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', 'jsonpath={.items[0].metadata.name}', + ], raise_on_failure=False) + instance_name = node_out.strip() + if rc != 0 or not instance_name: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot find benchmark node for swap disk attach') + logging.info('[swap_encryption] Benchmark node instance: %s', instance_name) + + # ── Step 2: create the hyperdisk ────────────────────────────────────────── + logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)', + disk_name, disk_size_gb, disk_type) + create_cmd = [ + 'gcloud', 'compute', 'disks', 'create', disk_name, + '--project', project, + '--zone', zone, + '--type', disk_type, + '--size', f'{disk_size_gb}GB', + '--quiet', + ] + if disk_type.startswith('hyperdisk'): + create_cmd += [ + '--provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--provisioned-throughput', + str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, + _BOOT_DISK_THROUGHPUT.value)), + ] + _, stderr, rc = vm_util.IssueCommand(create_cmd, timeout=120, + raise_on_failure=False) + if rc != 0: + raise 
errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}') + + # ── Step 3: attach the disk to the node VM ──────────────────────────────── + logging.info('[swap_encryption] Attaching swap disk %s to %s', + disk_name, instance_name) + attach_cmd = [ + 'gcloud', 'compute', 'instances', 'attach-disk', instance_name, + '--project', project, + '--zone', zone, + '--disk', disk_name, + '--device-name', 'pkb-swap', + '--quiet', + ] + _, stderr, rc = vm_util.IssueCommand(attach_cmd, timeout=120, + raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' + f'{stderr}') + logging.info('[swap_encryption] Swap disk attached: %s → %s', + disk_name, instance_name) + + +def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: + """Detach (if attached) and delete a GCE disk, robustly, with retries. + + Finds the attached instance from the disk's own `users` field rather than + kubectl — kubectl is often unavailable during teardown (cluster being + deleted), which previously left the disk attached and undeletable, so it + leaked. Returns True if the disk is gone (deleted or already absent). 
+ """ + for attempt in range(1, 5): + users, _, rc = vm_util.IssueCommand( + ['gcloud', 'compute', 'disks', 'describe', disk_name, + '--project', project, '--zone', zone, '--format=value(users)'], + timeout=60, raise_on_failure=False) + if rc != 0: + logging.info('[swap_encryption] Swap disk %s not present — nothing to ' + 'delete', disk_name) + return True # already gone + user = users.strip() + if user: + inst = user.split('/')[-1] + logging.info('[swap_encryption] Detaching swap disk %s from %s', + disk_name, inst) + vm_util.IssueCommand( + ['gcloud', 'compute', 'instances', 'detach-disk', inst, + '--project', project, '--zone', zone, '--disk', disk_name, + '--quiet'], timeout=120, raise_on_failure=False) + _, derr, drc = vm_util.IssueCommand( + ['gcloud', 'compute', 'disks', 'delete', disk_name, + '--project', project, '--zone', zone, '--quiet'], + timeout=180, raise_on_failure=False) + if drc == 0: + logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) + return True + logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed ' + '(%s); retrying in 10s', attempt, derr.strip()[:160]) + time.sleep(10) + logging.error('[swap_encryption] Could NOT delete swap disk %s after retries ' + '— delete it manually: gcloud compute disks delete %s ' + '--zone %s --quiet', disk_name, disk_name, zone) + return False + + +def _detach_and_delete_swap_disk(cluster) -> None: + """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" + zone = None + if getattr(cluster, 'zones', None): + zone = cluster.zones[0] + elif getattr(cluster, 'region', None): + zone = cluster.region + if not zone or not getattr(cluster, 'project', None): + return + _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone) + + +def _delete_default_node_pool(cluster) -> None: + """Delete the dummy default nodepool after the benchmark pool is ready. 
+ + The default nodepool (e2-medium) was only needed to satisfy GKE's + requirement that a cluster must have at least one nodepool at creation time. + Removing it stops the clock on its cost immediately. + """ + zone_flags: list[str] = [] + if getattr(cluster, 'zones', None): + zone_flags = ['--zone', cluster.zones[0]] + elif getattr(cluster, 'region', None): + zone_flags = ['--region', cluster.region] + + cmd = [ + 'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL, + '--cluster', cluster.name, + '--project', cluster.project, + '--quiet', + ] + zone_flags + + logging.info( + '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL) + stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=300, + raise_on_failure=False) + if rc != 0: + logging.warning('[swap_encryption] Could not delete default nodepool ' + '(rc=%d): %s', rc, stderr) + else: + logging.info('[swap_encryption] Default nodepool deleted') + + +def _is_pod_gone(pod: str) -> bool: + """Return True if the named pod no longer exists in the cluster. + + Used to distinguish OOM-killed container processes (pod still alive, rc=137) + from OOM-evicted pods (pod gone, DaemonSet will create a replacement). + """ + try: + _, err, rc = kubectl.RunKubectlCommand( + ['get', 'pod', pod, '-n', _DS_NAMESPACE, + '-o', 'jsonpath={.metadata.name}'], + raise_on_failure=False, timeout=15, + ) + return rc != 0 and 'not found' in (err or '').lower() + except Exception: # pylint: disable=broad-except + return False + + +def _pod_exec( + pod: str, + cmd: str, + ignore_failure: bool = False, + timeout: int = 300, + _retries: int = 2, +) -> tuple[str, str]: + """Run a shell command inside the benchmark pod via kubectl exec. + + Args: + pod: Pod name returned by _wait_for_benchmark_pod. + cmd: Shell command string passed to bash -c. + ignore_failure: When True, non-zero exit codes are logged but not + raised. + timeout: Seconds before PKB kills the kubectl exec process. 
Default + 300 s matches PKB's IssueCommand default. Pass a larger value for + long-running jobs (fio, stress-ng, kernel build). + _retries: Number of automatic retries on transient GKE websocket + resets ("connection reset by peer"). Set to 0 to disable retries + for idempotent-sensitive commands. + + Returns: + Tuple of (stdout, stderr) strings. + """ + _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close') + # Errors that indicate the container/pod is gone and needs recovery. + # 'not found' covers "Error from server (NotFound): pods ... not found" + # which occurs when the DaemonSet pod was evicted and recreated under a + # new name (e.g. after OOM-triggered node pressure eviction). + # 'deleted state' covers "cannot exec in a deleted state" — the container + # was OOM-killed and is mid-termination (not yet recreated). + _CONTAINER_GONE_ERRORS = ('container not found', 'procReady not received', + 'unable to upgrade connection', 'not found', + 'deleted state') + # Use the globally-tracked active pod name — it may have been updated by + # a previous _recover_pod call when eviction replaced the pod. + active = _active_pod[0] if _active_pod else pod + + for attempt in range(_retries + 1): + out, err, rc = kubectl.RunKubectlCommand( + ['exec', active, '-n', _DS_NAMESPACE, + '--', 'bash', '-c', cmd], + raise_on_failure=False, + raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets + timeout=timeout, + ) + is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) + if is_transient and attempt < _retries: + logging.warning( + '[swap_encryption] kubectl exec connection reset (attempt %d/%d); ' + 'retrying in 10 s', attempt + 1, _retries + 1) + time.sleep(10) + continue + # rc=137 (SIGKILL): the OOM killer terminated the container process. + # Two sub-cases: + # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. + # B) Container OOM restart: pod still exists, container restarts in place. 
+ # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, + # tools must be re-installed before subsequent commands can run.) + # In both cases we call _recover_pod to wait for tools + sentinel, and + # we do NOT retry the OOM-triggering command itself. + if rc == 137: + # Record the OOM so the run-level gate can flag it even if the container + # restarts in place under the same pod name (which leaves both the + # "pod replaced" and "pod NotFound" checks silent). + if active not in _oom_events: + _oom_events.append(active) + # CRITICAL: sleep before checking pod state. Kubernetes takes a few + # seconds to mark a just-evicted pod as Terminating / NotFound. Without + # this delay _recover_pod sees the pod still in "Running" phase, returns + # the old pod name immediately, and every subsequent command fails with + # "Error from server (NotFound): pods … not found". + logging.warning( + '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update ' + 'pod state before recovery check') + time.sleep(15) + pod_gone = _is_pod_gone(active) + if pod_gone: + logging.warning( + '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — ' + 'recovering pod name for subsequent commands (not retrying this cmd)') + else: + logging.warning( + '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — ' + 'waiting for container restart and tool re-install before continuing') + new_pod = _recover_pod(active) + if new_pod != active: + logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + + is_container_gone = (rc != 0 and + any(e in err.lower() for e in _CONTAINER_GONE_ERRORS)) + if is_container_gone: + # Record the loss for the run-level degradation gate REGARDLESS of retry + # budget or ignore_failure. 
A "pods … not found" on a best-effort command + # (kernel build, opensearch, cleanup of a dead pod) still means the pod + # died; without this the gate stays blind because _active_pod is only + # renamed on the retry path below, which _retries=0 callers never reach. + if active and active not in _pod_lost: + _pod_lost.append(active) + logging.error( + '[swap_encryption] Benchmark pod %s is gone (%s) — recording run ' + 'as degraded', active, (err or '').strip()[:160]) + if attempt < _retries: + logging.warning( + '[swap_encryption] Container gone/restarting (attempt %d/%d) — ' + 'waiting for pod to recover...', attempt + 1, _retries + 1) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + continue + break + + if rc != 0 and not ignore_failure: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] _pod_exec failed (rc={rc}): {err}') + return out, err + + +def _recover_pod(pod: str, timeout_sec: int = 600) -> str: + """Wait for a DaemonSet container to recover after OOM kill or eviction. + + Handles two scenarios: + 1. Container OOM restart: same pod name, container restarting in place. + DaemonSet restartPolicy=Always brings it back under the same pod name. + 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates + a new pod with a DIFFERENT name. We detect this by checking whether + the named pod still exists; if not, we search by the DaemonSet label + selector for a Running pod. + + Returns the (possibly new) pod name once it is Running and ready. + """ + deadline = time.time() + timeout_sec + logging.info('[swap_encryption] Waiting for pod %s to recover ' + '(up to %ds)...', pod, timeout_sec) + + # Phase 1: wait for a Running pod — either the named one (container + # restart) or a replacement pod found via label selector (eviction). 
+ # + # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a + # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp + # (the pod is "Terminating") while status.phase may still read "Running" for + # several seconds. Checking only status.phase causes a false-positive: we + # return the old pod name immediately and every subsequent command fails with + # "Error from server (NotFound)". Checking deletionTimestamp catches this. + recovered_pod = pod + while time.time() < deadline: + # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not + # stdout. When the pod is gone, status_out is empty and the error text + # lives entirely in status_err. Discarding stderr (using _) means the + # 'not found' check below never fires and we spin until deadline. + status_out, status_err, status_rc = kubectl.RunKubectlCommand( + ['get', 'pod', pod, '-n', _DS_NAMESPACE, + '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'], + raise_on_failure=False, timeout=30, + ) + # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) + fields = status_out.strip().split('|') + phase = fields[0].strip() if fields else '' + is_terminating = len(fields) > 1 and bool(fields[1].strip()) + + # Pod is genuinely Running and NOT being deleted — recovery complete. + if status_rc == 0 and phase == 'Running' and not is_terminating: + break + + # Pod no longer exists, OR it exists but is being terminated (Terminating + # state or deletionTimestamp set) — look for a replacement pod by label. 
+ pod_gone_or_terminating = ( + (status_rc != 0 and 'not found' in (status_out + status_err).lower()) + or is_terminating + ) + if pod_gone_or_terminating: + label_out, _, label_rc = kubectl.RunKubectlCommand( + ['get', 'pods', '-n', _DS_NAMESPACE, + '-l', f'app={_DS_LABEL}', + '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}'], + raise_on_failure=False, timeout=30, + ) + new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip() + and p.strip() != pod] # exclude the dying pod + if label_rc == 0 and new_pods: + recovered_pod = new_pods[0] + logging.info('[swap_encryption] Original pod %s gone/terminating; ' + 'found replacement %s', pod, recovered_pod) + break + + time.sleep(10) + else: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] No Running pod found (original: {pod}) ' + f'within {timeout_sec}s after OOM kill / eviction') + + # Phase 2: wait for init script to finish (sentinel written last). + while time.time() < deadline: + ready_out, _, ready_rc = kubectl.RunKubectlCommand( + ['exec', recovered_pod, '-n', _DS_NAMESPACE, + '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'], + raise_on_failure=False, timeout=30, + ) + if ready_rc == 0 and 'READY' in ready_out: + logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod) + return recovered_pod + time.sleep(15) + + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] Pod {recovered_pod} did not become ready ' + f'within {timeout_sec}s after OOM kill / eviction') + + +_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { + # GCP (on-demand, us-central1 unless noted) + 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD + 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD + 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM + 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM + 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM + 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD + # AWS + 'i4i.4xlarge': 1.4960, 
# 16 vCPU, 128 GB RAM, NVMe Instance Store + 'i4i.2xlarge': 0.7480, + 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store + 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store + 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store +} + + +def _collect_cost_sample( + pod: str, elapsed_sec: float, base_meta: dict +) -> list[sample.Sample]: + """Emit a cost_estimate_usd sample for the benchmark run (gap 7). + + Instance type is read from cloud metadata inside the pod. Price is looked + up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and + a warning is logged. + + Args: + pod: Benchmark pod name. + elapsed_sec: Wall-clock seconds the benchmark phases took. + base_meta: Shared metadata dict. + + Returns: + A list of zero or one sample.Sample. + """ + # Detect instance type from cloud metadata + instance_type = '' + + # GCP: machine type is the last segment of the metadata URL value + gcp_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' + '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_type = gcp_type_out.strip().split('/')[-1] + + if not instance_type: + # AWS: instance-type is a plain string + aws_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_type = aws_type_out.strip() + + # Allow explicit override (useful when running on custom/renamed machine + # types or when the pod was unavailable during cost collection). + if _INSTANCE_SIZE_LABEL.value: + instance_type = _INSTANCE_SIZE_LABEL.value + + # Last resort: fall back to the benchmark machine type flag. This ensures + # cost tracking works even when the pod was evicted before cost collection + # ran (in which case the metadata curl above returned empty). 
+ if not instance_type and _BENCHMARK_MACHINE_TYPE.value: + instance_type = _BENCHMARK_MACHINE_TYPE.value + logging.info( + '[swap_encryption] Instance type from metadata unavailable; ' + 'using --swap_encryption_benchmark_machine_type=%s for cost tracking', + instance_type, + ) + + price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) + if price is None: + logging.warning( + '[swap_encryption] Unknown instance type "%s" – skipping cost sample. ' + 'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.', + instance_type, + ) + return [] + + hours = elapsed_sec / 3600.0 + cost = hours * price + meta = dict( + base_meta, + instance_type=instance_type, + price_usd_per_hr=price, + benchmark_elapsed_sec=round(elapsed_sec, 1), + ) + return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)] + + +def _detect_swap_device(pod: str) -> str: + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value + + # /proc/swaps is the source of truth: it lists the swap device that is + # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, + # because a stale dm-crypt mapping from a previous run on a reused node can + # still exist as a /dev node while being non-functional (fio/swapoff then + # fail with "No such device or address"). So read the active device from + # /proc/swaps first; only fall back to the mapper path if /proc/swaps is + # somehow empty but the mapper is genuinely present. + dm_out, _ = _pod_exec( + pod, + textwrap.dedent(""" + ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) + if [ -n "$ACTIVE" ] + then + echo "$ACTIVE" + elif test -e /dev/mapper/swap_encrypted + then + echo /dev/mapper/swap_encrypted + fi + """), + ignore_failure=True, + ) + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' + if dev: + return dev + raise ValueError( + 'No active swap device found in the benchmark pod. ' + 'Use --swap_encryption_device to specify one.' 
+ ) + + +def _build_metadata(pod: str, swap_dev: str) -> dict: + """Collect node environment, encryption type, and config into a dict.""" + + kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True) + mem_out, _ = _pod_exec( + pod, "awk '/MemTotal/{print $2}' /proc/meminfo", + ignore_failure=True, + ) + swap_out, _ = _pod_exec( + pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", + ignore_failure=True, + ) + + try: + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 + + # Encryption type — key off dm-crypt presence + the swap target, NOT the + # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- + # encrypted; only the AWS targets (instance_store / io2) are. + enc = 'unknown' + if '/dev/mapper/' in swap_dev: + table_out, _ = _pod_exec( + pod, + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, + ) + enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' + elif _SWAP_TYPE.value in ('instance_store', 'io2'): + enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card + elif not _ENABLE_DMCRYPT.value: + enc = 'none' # GKE plain swap (encryption OFF) + + cloud = _detect_cloud(pod) + + # Gap 6: instance size label for multi-size comparison runs. + # If the flag is set use it directly; otherwise try to read it from + # cloud metadata so that the field is always populated. 
+ instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' + '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split('/')[-1] + if not instance_label: + aws_type_out, _ = _pod_exec( + pod, + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_label = aws_type_out.strip() + + return { + 'benchmark': BENCHMARK_NAME, + 'execution_mode': 'kubernetes_privileged_pod', + 'cloud': cloud, + 'instance_size': instance_label, + 'kernel_version': kernel_out.strip(), + 'host_memory_gb': mem_gb, + 'swap_device': swap_dev, + 'swap_size_gb': swap_gb, + 'swap_encryption': enc, + # Test-matrix columns: storage target, encryption on/off, image, IOPS + 'storage_target': _SWAP_TYPE.value, + 'boot_disk_type': _BOOT_DISK_TYPE.value, + 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, + 'node_image_type': _NODE_IMAGE_TYPE.value, + 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, + 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, + # Other config + 'zswap_enabled': _ENABLE_ZSWAP.value, + 'min_free_kbytes': _MIN_FREE_KBYTES.value, + 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, + # Requested config value only. The *effective* stress-ng footprint may + # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the + # actual value it ran with as 'stress_vm_bytes' so the two never conflict. 
+ 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, + 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, + 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, + 'nodepool': _NODEPOOL.value, + } From 0f75d80697af1a6f7943f0e61bd12910f0568c56 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Fri, 19 Jun 2026 13:18:44 +0530 Subject: [PATCH 02/17] formatting the code as per standard --- .../swap_encryption_benchmark.py | 1007 +++++++++++------ 1 file changed, 648 insertions(+), 359 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 5bdc933bba..026831efe0 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -63,14 +63,13 @@ Bulk-index + search query under swap pressure (esrally or curl). """ -import json import logging -import re import textwrap import time from typing import Any from absl import flags +from perfkitbenchmarker import benchmark_spec as bm_spec_lib from perfkitbenchmarker import configs from perfkitbenchmarker import errors from perfkitbenchmarker import sample @@ -79,15 +78,12 @@ FLAGS = flags.FLAGS +_BenchmarkSpec = bm_spec_lib.BenchmarkSpec + # --------------------------------------------------------------------------- # Benchmark identity # --------------------------------------------------------------------------- - - -FLAGS = flags.FLAGS - - BENCHMARK_NAME = 'swap_encryption' @@ -277,15 +273,91 @@ 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', ) +_ENABLE_DMCRYPT = flags.DEFINE_boolean( + 'swap_encryption_enable_dmcrypt', + True, + 'When True (default), wrap the swap device in dm-crypt plain mode ' + "(aes-xts-plain64, ephemeral random key) matching GKE's " + 'go/node:swap-encryption implementation. 
Set False to measure plain ' + '(unencrypted) swap overhead as a baseline.', +) -_DS_NAME = 'pkb-swap-benchmark' +_SWAP_DEVICE = flags.DEFINE_string( + 'swap_encryption_device', + '', + 'Explicit block device path to use as the swap device, e.g. ' + '/dev/nvme1n1 or /dev/mapper/swap_encrypted. When empty (default), ' + 'the device is auto-detected from /proc/swaps inside the benchmark pod.', +) +_SWAP_TYPE = flags.DEFINE_string( + 'swap_encryption_swap_type', + 'hyperdisk', + 'Storage target for the swap device. One of: hyperdisk (default), ' + 'lssd, instance_store, io2.', +) -_DS_NAMESPACE = 'default' +_KERNEL_VERSION = flags.DEFINE_string( + 'swap_encryption_kernel_version', + '', + 'Kernel version string to embed in the DaemonSet pod spec as a label. ' + 'When empty (default) the version is not pinned.', +) + +_ENABLE_ZSWAP = flags.DEFINE_boolean( + 'swap_encryption_enable_zswap', + False, + 'When True, enable zswap compressed swap cache on the benchmark node.', +) +_MIN_FREE_KBYTES = flags.DEFINE_integer( + 'swap_encryption_min_free_kbytes', + 0, + 'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. ' + '0 (default) leaves the kernel default unchanged.', +) + +_FIO_RUNTIME_SEC = flags.DEFINE_integer( + 'swap_encryption_fio_runtime_sec', + 60, + 'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.', +) +_STRESS_VM_BYTES = flags.DEFINE_string( + 'swap_encryption_stress_vm_bytes', + '28G', + 'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. ' + 'Should exceed available node RAM to force sustained paging.', +) + +_STRESS_VM_BYTES_LIST = flags.DEFINE_list( + 'swap_encryption_stress_vm_bytes_list', + [], + 'Comma-separated list of --vm-bytes values to sweep in Phase 2a, ' + 'e.g. "14G,28G,56G". 
Overrides --swap_encryption_stress_vm_bytes.', +) + +_STRESS_TIMEOUT_SEC = flags.DEFINE_integer( + 'swap_encryption_stress_timeout_sec', + 300, + 'Maximum seconds to wait for the stress-ng swap-pressure phase.', +) + +_DS_NAME = 'pkb-swap-benchmark' +_DS_NAMESPACE = 'default' _DS_LABEL = 'pkb-swap-benchmark' +# Transient kubectl errors that are safe to retry. +_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close') + +# Errors indicating the container/pod is gone and needs recovery. +_CONTAINER_GONE_KUBECTL_ERRORS = ( + 'container not found', + 'procReady not received', + 'unable to upgrade connection', + 'not found', + 'deleted state', +) _active_pod: list[str] = [] # single-element list so closures can mutate it @@ -298,10 +370,7 @@ _oom_events: list[str] = [] - _BENCHMARK_NODEPOOL = 'benchmark' - - _DEFAULT_NODEPOOL = 'default-pool' @@ -325,10 +394,11 @@ def _daemonset_yaml(image: str) -> str: def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: + """Load and return benchmark config spec.""" return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) -def Prepare(spec) -> None: +def Prepare(spec: _BenchmarkSpec) -> None: """Two-step nodepool setup then DaemonSet deployment. Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap @@ -345,8 +415,27 @@ def Prepare(spec) -> None: cluster = spec.container_cluster # ── Step 2a: add real benchmark nodepool ──────────────────────────────── - if getattr(cluster, 'project', None): - # GCP path: true two-step nodepool setup + if not getattr(cluster, 'project', None): + # Guard: AWS / EKS path — nodepool management is external. + # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet + # nodeSelector (pkb_nodepool=benchmark) before deploying the pod. 
+ logging.info( + '[swap_encryption] EKS cluster — labelling existing nodes with ' + 'pkb_nodepool=%s so the DaemonSet nodeSelector matches.', + _BENCHMARK_NODEPOOL, + ) + kubectl.RunKubectlCommand([ + 'label', + 'nodes', + '--all', + '--overwrite', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + ]) + # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs + # on io2 hardware-encrypted storage (no-op unless swap_type=io2). + _ensure_io2_volume() + else: + # GCP path: true two-step nodepool setup. logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') _create_benchmark_node_pool(cluster) @@ -355,26 +444,9 @@ def Prepare(spec) -> None: _wait_for_benchmark_node() # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── - # --additional-node-disk is not available in all gcloud versions, so we - # create + attach the disk after the node is up using gcloud compute. if _ADD_SWAP_DISK.value: logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk') _attach_swap_disk(cluster) - else: - # AWS / EKS: nodepool management is external. PKB's cluster creation - # labels nodes pkb_nodepool=default, so re-label all existing nodes here - # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark). - logging.info( - '[swap_encryption] EKS cluster — labelling existing nodes with ' - 'pkb_nodepool=%s so the DaemonSet nodeSelector matches.', - _BENCHMARK_NODEPOOL) - kubectl.RunKubectlCommand([ - 'label', 'nodes', '--all', '--overwrite', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - ]) - # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs - # on io2 hardware-encrypted storage (no-op unless swap_type=io2). - _ensure_io2_volume() # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── # Deploy and wait for the pod BEFORE deleting the default nodepool. @@ -395,8 +467,10 @@ def Prepare(spec) -> None: # the nodepool deletion (cluster control plane briefly interrupts pod # lifecycle). 
Re-resolve the pod name to avoid stale-reference errors on # all subsequent _pod_exec calls. - logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod ' - 'after nodepool deletion') + logging.info( + '[swap_encryption] Step 2d: re-resolving benchmark pod ' + 'after nodepool deletion' + ) pod = _wait_for_benchmark_pod() logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) @@ -411,7 +485,7 @@ def _phase_selected(token: str) -> bool: return (not selected) or ('all' in selected) or (token.lower() in selected) -def Run(spec) -> list[sample.Sample]: +def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: """Execute all benchmark phases with gate logic. Execution is structured in three gated tiers matching the execution plan: @@ -432,6 +506,10 @@ def Run(spec) -> list[sample.Sample]: application-level swap performance when the raw device is inaccessible. """ pod = _wait_for_benchmark_pod() + if pod is None: + raise errors.Benchmarks.RunError( + '[swap_encryption] Benchmark pod never became ready.' + ) # Initialise the module-level active-pod tracker so _pod_exec and # _recover_pod can transparently redirect to a replacement pod if the # original is evicted during the run. @@ -461,37 +539,43 @@ def Run(spec) -> list[sample.Sample]: # Detect those conditions here and surface them explicitly. 
if _active_pod and _active_pod[0] != original_pod: _degraded_reasons.append( - f'benchmark pod was replaced during the run ' + 'benchmark pod was replaced during the run ' f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap ' - f'pressure; phases executed after the eviction ran against a ' - f'freshly-initialised pod (empty /tmp, swap re-setup) and may be ' - f'invalid') + 'pressure; phases executed after the eviction ran against a ' + 'freshly-initialised pod (empty /tmp, swap re-setup) and may be ' + 'invalid' + ) if _pod_lost: _degraded_reasons.append( - f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) ' - f'— the pod died (node memory-pressure eviction or container exit) and ' - f'any phase running at or after that point (e.g. kernel-build baseline, ' - f'OpenSearch) produced invalid data') + f'pod(s) NotFound during run: {", ".join(_pod_lost)} — pod died' + ' (eviction/exit); phases at/after that point (e.g.' + ' kernel-build, OpenSearch) produced invalid data' + ) if _oom_events: _degraded_reasons.append( - f'OOM kill(s) (rc=137) occurred during the run on pod(s) ' + 'OOM kill(s) (rc=137) occurred during the run on pod(s) ' f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by ' - f'the OOM killer (the container may have restarted in place), so the ' - f'affected phase(s) produced no or partial data') + 'the OOM killer (the container may have restarted in place), so the ' + 'affected phase(s) produced no or partial data' + ) degraded = bool(_degraded_reasons) - results.append(sample.Sample( - 'swap_encryption_run_status', - 0.0 if degraded else 1.0, - 'status', - dict(base_meta, - degraded=degraded, - degraded_reasons='; '.join(_degraded_reasons) or 'none', - num_samples=len(results) + 1))) + results.append( + sample.Sample( + 'swap_encryption_run_status', + 0.0 if degraded else 1.0, + 'status', + dict( + base_meta, + degraded=degraded, + degraded_reasons='; '.join(_degraded_reasons) or 'none', + 
num_samples=len(results) + 1, + ), + ) + ) if degraded: - msg = ('[swap_encryption] RUN DEGRADED — ' - + '; '.join(_degraded_reasons)) + msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(_degraded_reasons) logging.error(msg) if _FAIL_ON_DEGRADED.value: # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The @@ -499,23 +583,31 @@ def Run(spec) -> list[sample.Sample]: # is recorded, so no data is lost. raise errors.Benchmarks.RunError(msg) else: - logging.info('[swap_encryption] Run completed cleanly (%d samples)', - len(results)) + logging.info( + '[swap_encryption] Run completed cleanly (%d samples)', len(results) + ) return results -def Cleanup(spec) -> None: +def Cleanup(spec: _BenchmarkSpec) -> None: """Remove the DaemonSet and tear down any swap configuration.""" pod = _wait_for_benchmark_pod(timeout=30) if pod: _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True) - _pod_exec(pod, textwrap.dedent(""" - swapoff /dev/mapper/swap_encrypted 2>/dev/null || true - dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true - """), ignore_failure=True) + _pod_exec( + pod, + textwrap.dedent(""" + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync \ + swap_encrypted 2>/dev/null || true + """), + ignore_failure=True, + ) # Clean up loop device backing files (single-disk fallback path). 
- _pod_exec(pod, textwrap.dedent(""" + _pod_exec( + pod, + textwrap.dedent(""" for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ /mnt/stateful_partition/pkb_swap_backing do @@ -526,9 +618,14 @@ def Cleanup(spec) -> None: done rm -f "$backing" done - """), ignore_failure=True) - _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True) + """), + ignore_failure=True, + ) + _pod_exec( + pod, + "pkill -9 'stress-ng|fio' 2>/dev/null || true", + ignore_failure=True, + ) _delete_daemonset() @@ -561,18 +658,25 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: """ deadline = time.time() + timeout last_phase = '' - ready_pod = None # pod name once phase == Running + ready_pod = None # pod name once phase == Running while time.time() < deadline: # ── Step 1: wait for Running phase ────────────────────────────────────── if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'pods', - '-l', f'app={_DS_LABEL}', - '-n', _DS_NAMESPACE, - '-o', - r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}', - ], raise_on_failure=False) + out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'pods', + '-l', + f'app={_DS_LABEL}', + '-n', + _DS_NAMESPACE, + '-o', + r'jsonpath={range .items[*]}{.metadata.name}' + r'{"\t"}{.status.phase}{"\n"}{end}', + ], + raise_on_failure=False, + ) if rc == 0 and out.strip(): for line in out.strip().splitlines(): @@ -580,12 +684,17 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: if len(parts) == 2: pod_name, phase = parts[0].strip(), parts[1].strip() if phase == 'Running': - logging.info('[swap_encryption] Pod %s is Running – ' - 'waiting for tool install to finish...', pod_name) + logging.info( + '[swap_encryption] Pod %s is Running – ' + 'waiting for tool install to finish...', + pod_name, + ) ready_pod = pod_name break if phase != last_phase: - logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase) + logging.info( + 
'[swap_encryption] Pod %s phase: %s', pod_name, phase + ) last_phase = phase if phase in ('Pending',): _log_pod_events(pod_name) @@ -594,39 +703,63 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── if ready_pod is not None: - sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([ - 'exec', ready_pod, '-n', _DS_NAMESPACE, - '--', 'test', '-f', '/tmp/pkb_ready', - ], raise_on_failure=False) + sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( + [ + 'exec', + ready_pod, + '-n', + _DS_NAMESPACE, + '--', + 'test', + '-f', + '/tmp/pkb_ready', + ], + raise_on_failure=False, + ) if sentinel_rc == 0: logging.info( - '[swap_encryption] Pod %s ready (tools installed)', ready_pod) + '[swap_encryption] Pod %s ready (tools installed)', ready_pod + ) return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff or - # exited) — treat it as a hard reset: re-check pod phase on next iteration. - if ('container not found' in sentinel_err - or 'unable to upgrade connection' in sentinel_err): - logging.warning('[swap_encryption] Pod %s: container not running (%s) ' - '— will re-check pod state', ready_pod, sentinel_err.strip()) + # "container not found" means the container crashed (CrashLoopBackOff + # or exited) — hard reset: re-check pod phase on next iteration. 
+ if ( + 'container not found' in sentinel_err + or 'unable to upgrade connection' in sentinel_err + ): + logging.warning( + '[swap_encryption] Pod %s: container not running (%s)' + ' — will re-check pod state', + ready_pod, + sentinel_err.strip(), + ) ready_pod = None last_phase = '' else: logging.info( - '[swap_encryption] Pod %s: still installing tools...', ready_pod) + '[swap_encryption] Pod %s: still installing tools...', ready_pod + ) time.sleep(15) logging.warning( - '[swap_encryption] Benchmark pod not ready after %ds', timeout) + '[swap_encryption] Benchmark pod not ready after %ds', timeout + ) return None def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand([ - 'describe', 'pod', pod_name, - '-n', _DS_NAMESPACE, - ], raise_on_failure=False) + """Dump recent Kubernetes events for the pod to diagnose startup hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand( + [ + 'describe', + 'pod', + pod_name, + '-n', + _DS_NAMESPACE, + ], + raise_on_failure=False, + ) # Only log the Events section to keep output manageable in_events = False lines = [] @@ -638,106 +771,30 @@ def _log_pod_events(pod_name: str) -> None: if lines: logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])) else: - logging.info('[swap_encryption] kubectl describe output:\n%s', - events_out[-2000:] if len(events_out) > 2000 else events_out) + logging.info( + '[swap_encryption] kubectl describe output:\n%s', + events_out[-2000:] if len(events_out) > 2000 else events_out, + ) def _delete_daemonset() -> None: """Delete the benchmark DaemonSet.""" - kubectl.RunKubectlCommand([ - 'delete', 'daemonset', _DS_NAME, - '-n', _DS_NAMESPACE, - '--ignore-not-found', - ], raise_on_failure=False) + kubectl.RunKubectlCommand( + [ + 'delete', + 'daemonset', + _DS_NAME, + '-n', + _DS_NAMESPACE, + '--ignore-not-found', + ], + raise_on_failure=False, + ) 
logging.info('[swap_encryption] DaemonSet deleted') -def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str: - """Return a bash startup script for the benchmark nodepool. - - NOTE: This function is not currently used. GKE reserves the - `startup-script` node metadata key, so dm-crypt setup is performed - from within the privileged DaemonSet pod instead (see - _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference. - - Args: - enable_dmcrypt: When True, wrap the swap device in dm-crypt plain - mode (aes-xts-plain64, ephemeral random key) matching GKE's - go/node:swap-encryption implementation. - lssd: When True, build a RAID-0 array across all local SSDs before - setting up swap (matches go/gke-swap-lssd). - - Returns: - A bash script string suitable for running as root at node boot. - """ - dmcrypt_str = 'true' if enable_dmcrypt else 'false' - lssd_str = 'true' if lssd else 'false' - - return textwrap.dedent(f"""\ - #!/bin/bash - # PKB swap_encryption_benchmark — nodepool startup script. - # Configures swap once at node boot so all benchmark phases see a - # pre-warmed swap device. Runs as root on the COS host. 
- set -euo pipefail - ENABLE_DMCRYPT={dmcrypt_str} - LSSD={lssd_str} - - _wait_dev() {{ - local d=$1 i - for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done - echo "[pkb-startup] device $d not ready" >&2; return 1 - }} - - _boot_dev() {{ - lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1 - }} - - if $LSSD; then - BOOT=$(_boot_dev) - # Collect all non-rotational non-boot block devices (local SSDs) - DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true) - N=$(echo "$DEVS" | grep -c /dev/ || true) - if [ "$N" -gt 1 ]; then - modprobe raid0 || true - # shellcheck disable=SC2086 - mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force - TARGET=/dev/md0 - elif [ "$N" -eq 1 ]; then - TARGET=$(echo "$DEVS" | head -1) - else - echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2 - exit 0 - fi - else - BOOT=$(_boot_dev) - RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true) - if [ -z "$RAW" ]; then - echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2 - exit 0 - fi - TARGET=/dev/$RAW - fi - - _wait_dev "$TARGET" - - if $ENABLE_DMCRYPT; then - modprobe dm-crypt || true - dd if=/dev/urandom bs=32 count=1 2>/dev/null | \\ - cryptsetup open --type plain \\ - --cipher aes-xts-plain64 --key-size 256 \\ - --key-file=- "$TARGET" pkb_swap - SWAP_DEV=/dev/mapper/pkb_swap - else - SWAP_DEV=$TARGET - fi - - mkswap "$SWAP_DEV" - swapon "$SWAP_DEV" - echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)" - """) - - -_HYPERDISK_MAX_IOPS_PER_MBPS = 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s +# GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput. 
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256 def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: @@ -755,7 +812,11 @@ def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: logging.warning( '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for ' '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d', - throughput, iops, min_tput, min_tput) + throughput, + iops, + min_tput, + min_tput, + ) return min_tput return throughput @@ -795,15 +856,27 @@ def _create_benchmark_node_pool(cluster) -> None: disk_type = _BOOT_DISK_TYPE.value cmd = [ - 'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL, - '--cluster', cluster.name, - '--project', cluster.project, - '--machine-type', machine_type, - '--image-type', _NODE_IMAGE_TYPE.value, - '--disk-type', disk_type, - '--disk-size', str(disk_size_gb), - '--num-nodes', '1', - '--node-labels', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + 'gcloud', + 'container', + 'node-pools', + 'create', + _BENCHMARK_NODEPOOL, + '--cluster', + cluster.name, + '--project', + cluster.project, + '--machine-type', + machine_type, + '--image-type', + _NODE_IMAGE_TYPE.value, + '--disk-type', + disk_type, + '--disk-size', + str(disk_size_gb), + '--num-nodes', + '1', + '--node-labels', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', '--no-enable-autoupgrade', '--no-enable-autorepair', ] + zone_flags @@ -815,10 +888,14 @@ def _create_benchmark_node_pool(cluster) -> None: # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). 
if disk_type.startswith('hyperdisk') and not is_lssd: cmd += [ - '--boot-disk-provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--boot-disk-provisioned-iops', + str(_BOOT_DISK_IOPS.value), '--boot-disk-provisioned-throughput', - str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, - _BOOT_DISK_THROUGHPUT.value)), + str( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ), ] # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm @@ -826,18 +903,26 @@ def _create_benchmark_node_pool(cluster) -> None: if is_lssd: cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] - logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / ' - 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' - 'add_swap_disk=%s', - _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, - disk_size_gb, _BOOT_DISK_IOPS.value, - _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value) + logging.info( + '[swap_encryption] Creating benchmark nodepool: %s / %s / ' + 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' + 'add_swap_disk=%s', + _BENCHMARK_NODEPOOL, + machine_type, + _NODE_IMAGE_TYPE.value, + disk_size_gb, + _BOOT_DISK_IOPS.value, + _ENABLE_DMCRYPT.value, + is_lssd, + _ADD_SWAP_DISK.value, + ) # LSSD nodepools take longer to provision than PD-only nodepools because # GKE must also initialise the local NVMe devices before marking nodes Ready. # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. - stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200, - raise_on_failure=False) + stdout, stderr, rc = vm_util.IssueCommand( + cmd, timeout=1200, raise_on_failure=False + ) if rc != 0: # Idempotent prepare: if the nodepool already exists (e.g. re-running @@ -846,11 +931,13 @@ def _create_benchmark_node_pool(cluster) -> None: # "Already exists" message in this case. 
low = (stderr or '').lower() if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low: - logging.info('[swap_encryption] Benchmark nodepool already exists — ' - 'reusing it (idempotent prepare); proceeding to DaemonSet') + logging.info( + '[swap_encryption] Benchmark nodepool already exists — ' + 'reusing it (idempotent prepare); proceeding to DaemonSet' + ) return raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create benchmark nodepool ' + '[swap_encryption] Failed to create benchmark nodepool ' f'(rc={rc}): {stderr}' ) logging.info('[swap_encryption] Benchmark nodepool ready') @@ -869,28 +956,38 @@ def _wait_for_benchmark_node(timeout: int = 900) -> None: pkb_nodepool=benchmark has Ready=True, then returns. """ deadline = time.time() + timeout - logging.info('[swap_encryption] Waiting for benchmark node ' - '(pkb_nodepool=benchmark) to be Ready...') + logging.info( + '[swap_encryption] Waiting for benchmark node ' + '(pkb_nodepool=benchmark) to be Ready...' 
+ ) while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'nodes', - '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', r'jsonpath={range .items[*]}' - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}', - ], raise_on_failure=False) + out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'nodes', + '-l', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', + r'jsonpath={range .items[*]}' + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}', + ], + raise_on_failure=False, + ) if rc == 0 and out.strip(): for line in out.strip().splitlines(): parts = line.split('\t') if len(parts) == 2 and parts[1].strip() == 'True': - logging.info('[swap_encryption] Benchmark node ready: %s', - parts[0].strip()) + logging.info( + '[swap_encryption] Benchmark node ready: %s', parts[0].strip() + ) return - logging.info('[swap_encryption] Benchmark node not yet Ready — ' - 'retrying in 15 s...') + logging.info( + '[swap_encryption] Benchmark node not yet Ready — retrying in 15 s...' 
+ ) time.sleep(15) raise errors.Benchmarks.RunError( @@ -922,7 +1019,8 @@ def _attach_swap_disk(cluster) -> None: zone = cluster.region if not zone: raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot attach swap disk: cluster zone unknown') + '[swap_encryption] Cannot attach swap disk: cluster zone unknown' + ) project = cluster.project disk_name = f'pkb-swap-{cluster.name}' @@ -930,60 +1028,97 @@ def _attach_swap_disk(cluster) -> None: disk_size_gb = _SWAP_DISK_SIZE_GB.value # ── Step 1: get the GCE instance name of the benchmark node ─────────────── - node_out, _, rc = kubectl.RunKubectlCommand([ - 'get', 'nodes', - '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', 'jsonpath={.items[0].metadata.name}', - ], raise_on_failure=False) + node_out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'nodes', + '-l', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', + 'jsonpath={.items[0].metadata.name}', + ], + raise_on_failure=False, + ) instance_name = node_out.strip() if rc != 0 or not instance_name: raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot find benchmark node for swap disk attach') + '[swap_encryption] Cannot find benchmark node for swap disk attach' + ) logging.info('[swap_encryption] Benchmark node instance: %s', instance_name) # ── Step 2: create the hyperdisk ────────────────────────────────────────── - logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)', - disk_name, disk_size_gb, disk_type) + logging.info( + '[swap_encryption] Creating swap disk %s (%dGiB %s)', + disk_name, + disk_size_gb, + disk_type, + ) create_cmd = [ - 'gcloud', 'compute', 'disks', 'create', disk_name, - '--project', project, - '--zone', zone, - '--type', disk_type, - '--size', f'{disk_size_gb}GB', + 'gcloud', + 'compute', + 'disks', + 'create', + disk_name, + '--project', + project, + '--zone', + zone, + '--type', + disk_type, + '--size', + f'{disk_size_gb}GB', '--quiet', ] if disk_type.startswith('hyperdisk'): create_cmd += [ - 
'--provisioned-iops', str(_BOOT_DISK_IOPS.value), + '--provisioned-iops', + str(_BOOT_DISK_IOPS.value), '--provisioned-throughput', - str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value, - _BOOT_DISK_THROUGHPUT.value)), + str( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ), ] - _, stderr, rc = vm_util.IssueCommand(create_cmd, timeout=120, - raise_on_failure=False) + _, stderr, rc = vm_util.IssueCommand( + create_cmd, timeout=120, raise_on_failure=False + ) if rc != 0: raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}') + f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}' + ) # ── Step 3: attach the disk to the node VM ──────────────────────────────── - logging.info('[swap_encryption] Attaching swap disk %s to %s', - disk_name, instance_name) + logging.info( + '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name + ) attach_cmd = [ - 'gcloud', 'compute', 'instances', 'attach-disk', instance_name, - '--project', project, - '--zone', zone, - '--disk', disk_name, - '--device-name', 'pkb-swap', + 'gcloud', + 'compute', + 'instances', + 'attach-disk', + instance_name, + '--project', + project, + '--zone', + zone, + '--disk', + disk_name, + '--device-name', + 'pkb-swap', '--quiet', ] - _, stderr, rc = vm_util.IssueCommand(attach_cmd, timeout=120, - raise_on_failure=False) + _, stderr, rc = vm_util.IssueCommand( + attach_cmd, timeout=120, raise_on_failure=False + ) if rc != 0: raise errors.Benchmarks.RunError( f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' - f'{stderr}') - logging.info('[swap_encryption] Swap disk attached: %s → %s', - disk_name, instance_name) + f'{stderr}' + ) + logging.info( + '[swap_encryption] Swap disk attached: %s → %s', disk_name, instance_name + ) def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: @@ -996,35 +1131,85 @@ def _delete_disk_by_name(disk_name: str, 
project: str, zone: str) -> bool: """ for attempt in range(1, 5): users, _, rc = vm_util.IssueCommand( - ['gcloud', 'compute', 'disks', 'describe', disk_name, - '--project', project, '--zone', zone, '--format=value(users)'], - timeout=60, raise_on_failure=False) + [ + 'gcloud', + 'compute', + 'disks', + 'describe', + disk_name, + '--project', + project, + '--zone', + zone, + '--format=value(users)', + ], + timeout=60, + raise_on_failure=False, + ) if rc != 0: - logging.info('[swap_encryption] Swap disk %s not present — nothing to ' - 'delete', disk_name) + logging.info( + '[swap_encryption] Swap disk %s not present — nothing to delete', + disk_name, + ) return True # already gone user = users.strip() if user: inst = user.split('/')[-1] - logging.info('[swap_encryption] Detaching swap disk %s from %s', - disk_name, inst) + logging.info( + '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst + ) vm_util.IssueCommand( - ['gcloud', 'compute', 'instances', 'detach-disk', inst, - '--project', project, '--zone', zone, '--disk', disk_name, - '--quiet'], timeout=120, raise_on_failure=False) + [ + 'gcloud', + 'compute', + 'instances', + 'detach-disk', + inst, + '--project', + project, + '--zone', + zone, + '--disk', + disk_name, + '--quiet', + ], + timeout=120, + raise_on_failure=False, + ) _, derr, drc = vm_util.IssueCommand( - ['gcloud', 'compute', 'disks', 'delete', disk_name, - '--project', project, '--zone', zone, '--quiet'], - timeout=180, raise_on_failure=False) + [ + 'gcloud', + 'compute', + 'disks', + 'delete', + disk_name, + '--project', + project, + '--zone', + zone, + '--quiet', + ], + timeout=180, + raise_on_failure=False, + ) if drc == 0: logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) return True - logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed ' - '(%s); retrying in 10s', attempt, derr.strip()[:160]) + logging.warning( + '[swap_encryption] Swap disk delete attempt %d/4 failed ' + '(%s); retrying in 
10s', + attempt, + derr.strip()[:160], + ) time.sleep(10) - logging.error('[swap_encryption] Could NOT delete swap disk %s after retries ' - '— delete it manually: gcloud compute disks delete %s ' - '--zone %s --quiet', disk_name, disk_name, zone) + logging.error( + '[swap_encryption] Could NOT delete swap disk %s after retries ' + '— delete it manually: gcloud compute disks delete %s ' + '--zone %s --quiet', + disk_name, + disk_name, + zone, + ) return False @@ -1054,19 +1239,30 @@ def _delete_default_node_pool(cluster) -> None: zone_flags = ['--region', cluster.region] cmd = [ - 'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL, - '--cluster', cluster.name, - '--project', cluster.project, + 'gcloud', + 'container', + 'node-pools', + 'delete', + _DEFAULT_NODEPOOL, + '--cluster', + cluster.name, + '--project', + cluster.project, '--quiet', ] + zone_flags logging.info( - '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL) - stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=300, - raise_on_failure=False) + '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL + ) + stdout, stderr, rc = vm_util.IssueCommand( + cmd, timeout=300, raise_on_failure=False + ) if rc != 0: - logging.warning('[swap_encryption] Could not delete default nodepool ' - '(rc=%d): %s', rc, stderr) + logging.warning( + '[swap_encryption] Could not delete default nodepool (rc=%d): %s', + rc, + stderr, + ) else: logging.info('[swap_encryption] Default nodepool deleted') @@ -1079,9 +1275,17 @@ def _is_pod_gone(pod: str) -> bool: """ try: _, err, rc = kubectl.RunKubectlCommand( - ['get', 'pod', pod, '-n', _DS_NAMESPACE, - '-o', 'jsonpath={.metadata.name}'], - raise_on_failure=False, timeout=15, + [ + 'get', + 'pod', + pod, + '-n', + _DS_NAMESPACE, + '-o', + 'jsonpath={.metadata.name}', + ], + raise_on_failure=False, + timeout=15, ) return rc != 0 and 'not found' in (err or '').lower() except Exception: # pylint: disable=broad-except @@ -1112,33 
+1316,27 @@ def _pod_exec( Returns: Tuple of (stdout, stderr) strings. """ - _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close') - # Errors that indicate the container/pod is gone and needs recovery. - # 'not found' covers "Error from server (NotFound): pods ... not found" - # which occurs when the DaemonSet pod was evicted and recreated under a - # new name (e.g. after OOM-triggered node pressure eviction). - # 'deleted state' covers "cannot exec in a deleted state" — the container - # was OOM-killed and is mid-termination (not yet recreated). - _CONTAINER_GONE_ERRORS = ('container not found', 'procReady not received', - 'unable to upgrade connection', 'not found', - 'deleted state') + # Use module-level constants for error strings (defined at top of module). # Use the globally-tracked active pod name — it may have been updated by # a previous _recover_pod call when eviction replaced the pod. active = _active_pod[0] if _active_pod else pod for attempt in range(_retries + 1): out, err, rc = kubectl.RunKubectlCommand( - ['exec', active, '-n', _DS_NAMESPACE, - '--', 'bash', '-c', cmd], + ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd], raise_on_failure=False, - raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets + # Retry loop in _pod_exec handles transient resets. + raise_on_timeout=False, timeout=timeout, ) - is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS) + is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS) if is_transient and attempt < _retries: logging.warning( '[swap_encryption] kubectl exec connection reset (attempt %d/%d); ' - 'retrying in 10 s', attempt + 1, _retries + 1) + 'retrying in 10 s', + attempt + 1, + _retries + 1, + ) time.sleep(10) continue # rc=137 (SIGKILL): the OOM killer terminated the container process. @@ -1162,27 +1360,33 @@ def _pod_exec( # "Error from server (NotFound): pods … not found". 
logging.warning( '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update ' - 'pod state before recovery check') + 'pod state before recovery check' + ) time.sleep(15) pod_gone = _is_pod_gone(active) if pod_gone: logging.warning( - '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — ' - 'recovering pod name for subsequent commands (not retrying this cmd)') + '[swap_encryption] OOM-eviction (rc=137, pod gone) —' + ' recovering pod name (cmd not retried)' + ) else: logging.warning( - '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — ' - 'waiting for container restart and tool re-install before continuing') + '[swap_encryption] OOM-kill (rc=137, pod exists) —' + ' waiting for container restart before continuing' + ) new_pod = _recover_pod(active) if new_pod != active: - logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + logging.info( + '[swap_encryption] Pod name updated: %s → %s', active, new_pod + ) if _active_pod: _active_pod[0] = new_pod active = new_pod break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. - is_container_gone = (rc != 0 and - any(e in err.lower() for e in _CONTAINER_GONE_ERRORS)) + is_container_gone = rc != 0 and any( + e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS + ) if is_container_gone: # Record the loss for the run-level degradation gate REGARDLESS of retry # budget or ignore_failure. 
A "pods … not found" on a best-effort command @@ -1193,14 +1397,22 @@ def _pod_exec( _pod_lost.append(active) logging.error( '[swap_encryption] Benchmark pod %s is gone (%s) — recording run ' - 'as degraded', active, (err or '').strip()[:160]) + 'as degraded', + active, + (err or '').strip()[:160], + ) if attempt < _retries: logging.warning( '[swap_encryption] Container gone/restarting (attempt %d/%d) — ' - 'waiting for pod to recover...', attempt + 1, _retries + 1) + 'waiting for pod to recover...', + attempt + 1, + _retries + 1, + ) new_pod = _recover_pod(active) if new_pod != active: - logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod) + logging.info( + '[swap_encryption] Pod name updated: %s → %s', active, new_pod + ) if _active_pod: _active_pod[0] = new_pod active = new_pod @@ -1209,7 +1421,8 @@ def _pod_exec( if rc != 0 and not ignore_failure: raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] _pod_exec failed (rc={rc}): {err}') + f'[swap_encryption] _pod_exec failed (rc={rc}): {err}' + ) return out, err @@ -1227,8 +1440,11 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str: Returns the (possibly new) pod name once it is Running and ready. """ deadline = time.time() + timeout_sec - logging.info('[swap_encryption] Waiting for pod %s to recover ' - '(up to %ds)...', pod, timeout_sec) + logging.info( + '[swap_encryption] Waiting for pod %s to recover (up to %ds)...', + pod, + timeout_sec, + ) # Phase 1: wait for a Running pod — either the named one (container # restart) or a replacement pod found via label selector (eviction). @@ -1246,9 +1462,17 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str: # lives entirely in status_err. Discarding stderr (using _) means the # 'not found' check below never fires and we spin until deadline. 
status_out, status_err, status_rc = kubectl.RunKubectlCommand( - ['get', 'pod', pod, '-n', _DS_NAMESPACE, - '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'], - raise_on_failure=False, timeout=30, + [ + 'get', + 'pod', + pod, + '-n', + _DS_NAMESPACE, + '-o', + 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}', + ], + raise_on_failure=False, + timeout=30, ) # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) fields = status_out.strip().split('|') @@ -1262,62 +1486,91 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str: # Pod no longer exists, OR it exists but is being terminated (Terminating # state or deletionTimestamp set) — look for a replacement pod by label. pod_gone_or_terminating = ( - (status_rc != 0 and 'not found' in (status_out + status_err).lower()) - or is_terminating - ) + status_rc != 0 and 'not found' in (status_out + status_err).lower() + ) or is_terminating if pod_gone_or_terminating: label_out, _, label_rc = kubectl.RunKubectlCommand( - ['get', 'pods', '-n', _DS_NAMESPACE, - '-l', f'app={_DS_LABEL}', - '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}'], - raise_on_failure=False, timeout=30, + [ + 'get', + 'pods', + '-n', + _DS_NAMESPACE, + '-l', + f'app={_DS_LABEL}', + '-o', + ( + 'jsonpath={range .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}' + ), + ], + raise_on_failure=False, + timeout=30, ) - new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip() - and p.strip() != pod] # exclude the dying pod + new_pods = [ + p.strip() + for p in label_out.strip().splitlines() + if p.strip() and p.strip() != pod + ] # exclude the dying pod if label_rc == 0 and new_pods: recovered_pod = new_pods[0] - logging.info('[swap_encryption] Original pod %s gone/terminating; ' - 'found replacement %s', pod, recovered_pod) + logging.info( + '[swap_encryption] Original pod %s gone/terminating; ' + 'found replacement %s', + pod, + 
recovered_pod, + ) break time.sleep(10) else: raise errors.VmUtil.IssueCommandError( f'[swap_encryption] No Running pod found (original: {pod}) ' - f'within {timeout_sec}s after OOM kill / eviction') + f'within {timeout_sec}s after OOM kill / eviction' + ) # Phase 2: wait for init script to finish (sentinel written last). while time.time() < deadline: ready_out, _, ready_rc = kubectl.RunKubectlCommand( - ['exec', recovered_pod, '-n', _DS_NAMESPACE, - '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'], - raise_on_failure=False, timeout=30, + [ + 'exec', + recovered_pod, + '-n', + _DS_NAMESPACE, + '--', + 'bash', + '-c', + 'test -f /tmp/pkb_ready && echo READY', + ], + raise_on_failure=False, + timeout=30, ) if ready_rc == 0 and 'READY' in ready_out: - logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod) + logging.info( + '[swap_encryption] Pod %s recovered and ready', recovered_pod + ) return recovered_pod time.sleep(15) raise errors.VmUtil.IssueCommandError( f'[swap_encryption] Pod {recovered_pod} did not become ready ' - f'within {timeout_sec}s after OOM kill / eviction') + f'within {timeout_sec}s after OOM kill / eviction' + ) _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD - 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD - 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM - 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM - 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM - 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD + 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD + 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM + 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM + 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM + 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD # AWS - 'i4i.4xlarge': 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store - 'i4i.2xlarge': 0.7480, - 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance 
Store - 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store - 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store + 'i4i.4xlarge': 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store + 'i4i.2xlarge': 0.7480, + 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store + 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store + 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store } @@ -1342,11 +1595,13 @@ def _collect_cost_sample( instance_type = '' # GCP: machine type is the last segment of the metadata URL value + _gcp_meta_url = ( + 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type' + ) gcp_type_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' - '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + f'curl -s -m 3 --fail {_gcp_meta_url}' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) if gcp_type_out.strip(): @@ -1434,16 +1689,18 @@ def _detect_swap_device(pod: str) -> str: ) -def _build_metadata(pod: str, swap_dev: str) -> dict: +def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]: """Collect node environment, encryption type, and config into a dict.""" kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True) mem_out, _ = _pod_exec( - pod, "awk '/MemTotal/{print $2}' /proc/meminfo", + pod, + "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True, ) swap_out, _ = _pod_exec( - pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", + pod, + "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True, ) @@ -1468,9 +1725,9 @@ def _build_metadata(pod: str, swap_dev: str) -> dict: ) enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' elif _SWAP_TYPE.value in ('instance_store', 'io2'): - enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card + enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card elif not 
_ENABLE_DMCRYPT.value: - enc = 'none' # GKE plain swap (encryption OFF) + enc = 'none' # GKE plain swap (encryption OFF) cloud = _detect_cloud(pod) @@ -1479,11 +1736,14 @@ def _build_metadata(pod: str, swap_dev: str) -> dict: # cloud metadata so that the field is always populated. instance_label = _INSTANCE_SIZE_LABEL.value if not instance_label: + _gcp_mt_url = ( + 'http://metadata.google.internal' + '/computeMetadata/v1/instance/machine-type' + ) gcp_type_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type ' - '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + f'curl -s -m 3 --fail {_gcp_mt_url}' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) if gcp_type_out.strip(): @@ -1527,3 +1787,32 @@ def _build_metadata(pod: str, swap_dev: str) -> dict: 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, 'nodepool': _NODEPOOL.value, } + + +def _detect_cloud(pod: str) -> str: + """Detect whether the benchmark pod is running on GCP or AWS. + + Queries the cloud instance metadata endpoint inside the pod. Returns + 'GCP' if the GCP metadata server responds, 'AWS' otherwise. + """ + gcp_out, _ = _pod_exec( + pod, + 'curl -s -m 2 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/project/project-id' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_out.strip(): + return 'GCP' + return 'AWS' + + +def _ensure_io2_volume() -> None: + """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). + + Only executed when --swap_encryption_swap_type=io2. Full implementation + is deferred to PR2 (swap-capability layer). 
+ """ + if _SWAP_TYPE.value != 'io2': + return + logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2') \ No newline at end of file From 6489d06d58e3c98c35ff877eb503c9172f0fab67 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 19 Jun 2025 10:46:41 +0000 Subject: [PATCH 03/17] swap_encryption: add GKE kubelet memorySwapBehavior config Per Ajay's review comment on PR #6758: - Add _GKE_KUBELET_MEMORY_SWAP flag (default LimitedSwap) so the benchmark nodepool is created with kubeletConfig.memorySwapBehavior set via --system-config-from-file, enabling pod-level swap usage. - Wrap gcloud IssueCommand in try/finally to clean up the temp YAML. - Update nodepool creation log to include kubelet_swap value. --- .../swap_encryption_benchmark.py | 123 ++++++++++++------ 1 file changed, 84 insertions(+), 39 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 026831efe0..215a9b40f3 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -64,6 +64,8 @@ """ import logging +import os +import tempfile import textwrap import time from typing import Any @@ -282,6 +284,18 @@ '(unencrypted) swap overhead as a baseline.', ) +_GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string( + 'swap_encryption_gke_kubelet_memory_swap', + 'LimitedSwap', + 'Value for kubeletConfig.memorySwapBehavior injected via ' + '--system-config-from-file when creating the GKE benchmark nodepool. ' + 'LimitedSwap (default) — the kubelet allows pods to use swap up to their ' + 'memory limit; required for the DaemonSet pod to drive kernel swapping. ' + 'NoSwap — disables swap at the kubelet level (use for a baseline run that ' + 'confirms zero swap activity). 
Set empty string to omit the flag entirely ' + 'and rely on the cluster-level default.', +) + _SWAP_DEVICE = flags.DEFINE_string( 'swap_encryption_device', '', @@ -547,9 +561,10 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: ) if _pod_lost: _degraded_reasons.append( - f'pod(s) NotFound during run: {", ".join(_pod_lost)} — pod died' - ' (eviction/exit); phases at/after that point (e.g.' - ' kernel-build, OpenSearch) produced invalid data' + 'benchmark pod(s) went NotFound during the run' + f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' + ' eviction or container exit) and any phase running at or after that' + ' point (e.g. kernel-build baseline, OpenSearch) produced invalid data' ) if _oom_events: _degraded_reasons.append( @@ -598,10 +613,9 @@ def Cleanup(spec: _BenchmarkSpec) -> None: _pod_exec( pod, textwrap.dedent(""" - swapoff /dev/mapper/swap_encrypted 2>/dev/null || true - dmsetup remove --noudevrules --noudevsync \ - swap_encrypted 2>/dev/null || true - """), + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true + """), ignore_failure=True, ) # Clean up loop device backing files (single-disk fallback path). 
@@ -622,9 +636,7 @@ def Cleanup(spec: _BenchmarkSpec) -> None: ignore_failure=True, ) _pod_exec( - pod, - "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True, + pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", ignore_failure=True ) _delete_daemonset() @@ -672,8 +684,10 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: '-n', _DS_NAMESPACE, '-o', - r'jsonpath={range .items[*]}{.metadata.name}' - r'{"\t"}{.status.phase}{"\n"}{end}', + ( + r'jsonpath={range' + r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' + ), ], raise_on_failure=False, ) @@ -721,15 +735,15 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: '[swap_encryption] Pod %s ready (tools installed)', ready_pod ) return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff - # or exited) — hard reset: re-check pod phase on next iteration. + # "container not found" means the container crashed (CrashLoopBackOff or + # exited) — treat it as a hard reset: re-check pod phase on next iteration. if ( 'container not found' in sentinel_err or 'unable to upgrade connection' in sentinel_err ): logging.warning( - '[swap_encryption] Pod %s: container not running (%s)' - ' — will re-check pod state', + '[swap_encryption] Pod %s: container not running (%s) ' + '— will re-check pod state', ready_pod, sentinel_err.strip(), ) @@ -749,7 +763,7 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to diagnose startup hangs.""" + """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" events_out, _, _ = kubectl.RunKubectlCommand( [ 'describe', @@ -793,8 +807,9 @@ def _delete_daemonset() -> None: logging.info('[swap_encryption] DaemonSet deleted') -# GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput. 
-_HYPERDISK_MAX_IOPS_PER_MBPS = 256 +_HYPERDISK_MAX_IOPS_PER_MBPS = ( + 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s +) def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: @@ -903,10 +918,36 @@ def _create_benchmark_node_pool(cluster) -> None: if is_lssd: cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] + # ── GKE kubelet swap config ─────────────────────────────────────────────── + # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark + # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap + # so that the kubelet allocates swap to the DaemonSet pod. Without this flag + # the Linux kernel swap device may exist but the kubelet blocks pod-level + # swap usage and the benchmark pod cannot drive swap I/O. + # + # Passed as --system-config-from-file pointing to a temp YAML, which is the + # same mechanism PKB's gke_node_system_config flag uses: + # perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py + swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value + system_config_tmp = None + if swap_behavior: + kubelet_yaml = f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\n' + system_config_tmp = tempfile.NamedTemporaryFile( + mode='w', suffix='.yaml', delete=False + ) + system_config_tmp.write(kubelet_yaml) + system_config_tmp.flush() + cmd += ['--system-config-from-file', system_config_tmp.name] + logging.info( + '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)', + swap_behavior, + system_config_tmp.name, + ) + logging.info( '[swap_encryption] Creating benchmark nodepool: %s / %s / ' 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' - 'add_swap_disk=%s', + 'add_swap_disk=%s / kubelet_swap=%s', _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, @@ -915,14 +956,22 @@ def _create_benchmark_node_pool(cluster) -> None: _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value, + swap_behavior or 'unset', ) # LSSD nodepools take longer to provision than PD-only 
nodepools because # GKE must also initialise the local NVMe devices before marking nodes Ready. # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. - stdout, stderr, rc = vm_util.IssueCommand( - cmd, timeout=1200, raise_on_failure=False - ) + try: + stdout, stderr, rc = vm_util.IssueCommand( + cmd, timeout=1200, raise_on_failure=False + ) + finally: + if system_config_tmp is not None: + try: + os.unlink(system_config_tmp.name) + except OSError: + pass if rc != 0: # Idempotent prepare: if the nodepool already exists (e.g. re-running @@ -1325,8 +1374,7 @@ def _pod_exec( out, err, rc = kubectl.RunKubectlCommand( ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd], raise_on_failure=False, - # Retry loop in _pod_exec handles transient resets. - raise_on_timeout=False, + raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets timeout=timeout, ) is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS) @@ -1366,13 +1414,15 @@ def _pod_exec( pod_gone = _is_pod_gone(active) if pod_gone: logging.warning( - '[swap_encryption] OOM-eviction (rc=137, pod gone) —' - ' recovering pod name (cmd not retried)' + '[swap_encryption] OOM-eviction detected (rc=137, pod gone) —' + ' recovering pod name for subsequent commands (not retrying this' + ' cmd)' ) else: logging.warning( - '[swap_encryption] OOM-kill (rc=137, pod exists) —' - ' waiting for container restart before continuing' + '[swap_encryption] Container OOM-killed (rc=137, pod still exists)' + ' — waiting for container restart and tool re-install before' + ' continuing' ) new_pod = _recover_pod(active) if new_pod != active: @@ -1595,12 +1645,10 @@ def _collect_cost_sample( instance_type = '' # GCP: machine type is the last segment of the metadata URL value - _gcp_meta_url = ( - 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type' - ) gcp_type_out, _ = _pod_exec( pod, - f'curl -s -m 3 --fail {_gcp_meta_url}' + 'curl -s -m 3 
--fail' + ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) @@ -1736,13 +1784,10 @@ def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]: # cloud metadata so that the field is always populated. instance_label = _INSTANCE_SIZE_LABEL.value if not instance_label: - _gcp_mt_url = ( - 'http://metadata.google.internal' - '/computeMetadata/v1/instance/machine-type' - ) gcp_type_out, _ = _pod_exec( pod, - f'curl -s -m 3 --fail {_gcp_mt_url}' + 'curl -s -m 3 --fail' + ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) @@ -1815,4 +1860,4 @@ def _ensure_io2_volume() -> None: """ if _SWAP_TYPE.value != 'io2': return - logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2') \ No newline at end of file + logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2') From 95df3acfae53599ab6978b2a34ab46a6079121fa Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 19 Jun 2025 10:46:42 +0000 Subject: [PATCH 04/17] refactor(swap_encryption): use PKB GcloudCommand instead of raw vm_util.IssueCommand Replace all raw ['gcloud', ...] list + vm_util.IssueCommand calls in swap_encryption_benchmark.py with PKB's existing GcloudCommand infrastructure: - _create_benchmark_node_pool: cluster._GcloudCommand() + cmd.flags + cmd.Issue - _delete_default_node_pool: cluster._GcloudCommand() + cmd.Issue - _attach_swap_disk: gcp_util.GcloudCommand(_GcpZonalResource) for create+attach - _delete_disk_by_name: gcp_util.GcloudCommand for describe/detach/delete Add _GcpZonalResource shim: pins zone for gcloud compute operations. GcloudCommand auto-injects --project and --zone/--region, handles auth token refresh -- matching PKB standards. 
--- .../swap_encryption_benchmark.py | 228 ++++++------------ 1 file changed, 79 insertions(+), 149 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 215a9b40f3..e325220c03 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -77,6 +77,7 @@ from perfkitbenchmarker import sample from perfkitbenchmarker import vm_util from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.providers.gcp import util as gcp_util FLAGS = flags.FLAGS @@ -388,6 +389,22 @@ _DEFAULT_NODEPOOL = 'default-pool' +class _GcpZonalResource: + """Minimal resource shim for gcp_util.GcloudCommand on compute operations. + + gcp_util.GcloudCommand auto-injects --project and --zone from the resource + object passed to it. GkeCluster._GcloudCommand() handles container/* + operations correctly but also switches --zone → --region for multi-zone + clusters, which is wrong for gcloud compute commands (--region creates + regional resources, not zonal ones). This shim pins a single zone so all + gcloud compute calls target the correct AZ. + """ + + def __init__(self, project: str, zone: str) -> None: + self.project = project + self.zone = zone + + def _daemonset_yaml(image: str) -> str: """Render the privileged benchmark DaemonSet manifest. @@ -856,12 +873,6 @@ def _create_benchmark_node_pool(cluster) -> None: is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower() # Determine zone/region from the cluster object. - zone_flags: list[str] = [] - if getattr(cluster, 'zones', None): - zone_flags = ['--zone', cluster.zones[0]] - elif getattr(cluster, 'region', None): - zone_flags = ['--region', cluster.region] - # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). 
# Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk @@ -870,31 +881,25 @@ def _create_benchmark_node_pool(cluster) -> None: disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value disk_type = _BOOT_DISK_TYPE.value - cmd = [ - 'gcloud', + + # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region, + # and auth token refresh. GkeCluster._GcloudCommand also handles the + # zone → region promotion for multi-zone / regional clusters. + cmd = cluster._GcloudCommand( 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL, '--cluster', cluster.name, - '--project', - cluster.project, - '--machine-type', - machine_type, - '--image-type', - _NODE_IMAGE_TYPE.value, - '--disk-type', - disk_type, - '--disk-size', - str(disk_size_gb), - '--num-nodes', - '1', - '--node-labels', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '--no-enable-autoupgrade', - '--no-enable-autorepair', - ] + zone_flags + ) + cmd.flags['machine-type'] = machine_type + cmd.flags['image-type'] = _NODE_IMAGE_TYPE.value + cmd.flags['disk-type'] = disk_type + cmd.flags['disk-size'] = disk_size_gb + cmd.flags['num-nodes'] = 1 + cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}' + cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair'] # IOPS and throughput provisioning only applies to hyperdisk-* types AND # only when the boot disk is also the swap device (non-LSSD configs). @@ -902,21 +907,17 @@ def _create_benchmark_node_pool(cluster) -> None: # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). 
if disk_type.startswith('hyperdisk') and not is_lssd: - cmd += [ - '--boot-disk-provisioned-iops', - str(_BOOT_DISK_IOPS.value), - '--boot-disk-provisioned-throughput', - str( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ), - ] + # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by + # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks). + cmd.flags['boot-disk-provisioned-iops'] = _BOOT_DISK_IOPS.value + cmd.flags['boot-disk-provisioned-throughput'] = _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). if is_lssd: - cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'] + cmd.flags['local-nvme-ssd-block'] = f'count={_LSSD_COUNT.value}' # ── GKE kubelet swap config ─────────────────────────────────────────────── # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark @@ -937,7 +938,7 @@ def _create_benchmark_node_pool(cluster) -> None: ) system_config_tmp.write(kubelet_yaml) system_config_tmp.flush() - cmd += ['--system-config-from-file', system_config_tmp.name] + cmd.flags['system-config-from-file'] = system_config_tmp.name logging.info( '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)', swap_behavior, @@ -963,9 +964,7 @@ def _create_benchmark_node_pool(cluster) -> None: # GKE must also initialise the local NVMe devices before marking nodes Ready. # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. 
try: - stdout, stderr, rc = vm_util.IssueCommand( - cmd, timeout=1200, raise_on_failure=False - ) + _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) finally: if system_config_tmp is not None: try: @@ -1102,36 +1101,22 @@ def _attach_swap_disk(cluster) -> None: disk_size_gb, disk_type, ) - create_cmd = [ - 'gcloud', - 'compute', - 'disks', - 'create', - disk_name, - '--project', - project, - '--zone', - zone, - '--type', - disk_type, - '--size', - f'{disk_size_gb}GB', - '--quiet', - ] - if disk_type.startswith('hyperdisk'): - create_cmd += [ - '--provisioned-iops', - str(_BOOT_DISK_IOPS.value), - '--provisioned-throughput', - str( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ), - ] - _, stderr, rc = vm_util.IssueCommand( - create_cmd, timeout=120, raise_on_failure=False + # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project + # and --zone (always zonal — gcloud compute --region creates regional + # resources, which is not what we want for a node-attached swap disk). 
+ gcp_res = _GcpZonalResource(project, zone) + create_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'create', disk_name ) + create_cmd.flags['type'] = disk_type + create_cmd.flags['size'] = f'{disk_size_gb}GB' + create_cmd.args.append('--quiet') + if disk_type.startswith('hyperdisk'): + create_cmd.flags['provisioned-iops'] = _BOOT_DISK_IOPS.value + create_cmd.flags['provisioned-throughput'] = _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) if rc != 0: raise errors.Benchmarks.RunError( f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}' @@ -1141,25 +1126,13 @@ def _attach_swap_disk(cluster) -> None: logging.info( '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name ) - attach_cmd = [ - 'gcloud', - 'compute', - 'instances', - 'attach-disk', - instance_name, - '--project', - project, - '--zone', - zone, - '--disk', - disk_name, - '--device-name', - 'pkb-swap', - '--quiet', - ] - _, stderr, rc = vm_util.IssueCommand( - attach_cmd, timeout=120, raise_on_failure=False + attach_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'instances', 'attach-disk', instance_name ) + attach_cmd.flags['disk'] = disk_name + attach_cmd.flags['device-name'] = 'pkb-swap' + attach_cmd.args.append('--quiet') + _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) if rc != 0: raise errors.Benchmarks.RunError( f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' @@ -1179,22 +1152,12 @@ def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: leaked. Returns True if the disk is gone (deleted or already absent). 
""" for attempt in range(1, 5): - users, _, rc = vm_util.IssueCommand( - [ - 'gcloud', - 'compute', - 'disks', - 'describe', - disk_name, - '--project', - project, - '--zone', - zone, - '--format=value(users)', - ], - timeout=60, - raise_on_failure=False, + gcp_res = _GcpZonalResource(project, zone) + describe_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'describe', disk_name ) + describe_cmd.flags['format'] = 'value(users)' + users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) if rc != 0: logging.info( '[swap_encryption] Swap disk %s not present — nothing to delete', @@ -1207,40 +1170,17 @@ def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: logging.info( '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst ) - vm_util.IssueCommand( - [ - 'gcloud', - 'compute', - 'instances', - 'detach-disk', - inst, - '--project', - project, - '--zone', - zone, - '--disk', - disk_name, - '--quiet', - ], - timeout=120, - raise_on_failure=False, + detach_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'instances', 'detach-disk', inst ) - _, derr, drc = vm_util.IssueCommand( - [ - 'gcloud', - 'compute', - 'disks', - 'delete', - disk_name, - '--project', - project, - '--zone', - zone, - '--quiet', - ], - timeout=180, - raise_on_failure=False, + detach_cmd.flags['disk'] = disk_name + detach_cmd.args.append('--quiet') + detach_cmd.Issue(timeout=120, raise_on_failure=False) + delete_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'delete', disk_name ) + delete_cmd.args.append('--quiet') + _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) if drc == 0: logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) return True @@ -1281,31 +1221,21 @@ def _delete_default_node_pool(cluster) -> None: requirement that a cluster must have at least one nodepool at creation time. Removing it stops the clock on its cost immediately. 
""" - zone_flags: list[str] = [] - if getattr(cluster, 'zones', None): - zone_flags = ['--zone', cluster.zones[0]] - elif getattr(cluster, 'region', None): - zone_flags = ['--region', cluster.region] - - cmd = [ - 'gcloud', + # Use PKB's GcloudCommand: auto-injects --project, --zone/--region. + cmd = cluster._GcloudCommand( 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL, '--cluster', cluster.name, - '--project', - cluster.project, - '--quiet', - ] + zone_flags + ) + cmd.args.append('--quiet') logging.info( '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL ) - stdout, stderr, rc = vm_util.IssueCommand( - cmd, timeout=300, raise_on_failure=False - ) + _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) if rc != 0: logging.warning( '[swap_encryption] Could not delete default nodepool (rc=%d): %s', From 48da7c4caae74f6d778dcdd98573414d404d90d5 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 19 Jun 2025 10:46:43 +0000 Subject: [PATCH 05/17] refactor(swap_encryption): use kubernetes_commands.ApplyManifest and fix imports Replace manual temp-file + kubectl apply in _deploy_daemonset() with PKB's kubernetes_commands.ApplyManifest(): - Remove _daemonset_yaml() helper - _deploy_daemonset() delegates to kubernetes_commands.ApplyManifest( 'cluster/swap_encryption_daemonset.yaml.j2', **kwargs) - Add kubernetes_commands import; remove vm_util import (now unused) - Fix import order: providers.gcp before resources.container_service --- .../swap_encryption_benchmark.py | 45 ++++++++----------- 1 file changed, 18 insertions(+), 27 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index e325220c03..5767f8eb71 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -75,9 +75,9 @@ from perfkitbenchmarker import configs from perfkitbenchmarker 
import errors from perfkitbenchmarker import sample -from perfkitbenchmarker import vm_util -from perfkitbenchmarker.resources.container_service import kubectl from perfkitbenchmarker.providers.gcp import util as gcp_util +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.resources.container_service import kubernetes_commands FLAGS = flags.FLAGS @@ -405,25 +405,6 @@ def __init__(self, project: str, zone: str) -> None: self.zone = zone -def _daemonset_yaml(image: str) -> str: - """Render the privileged benchmark DaemonSet manifest. - - The manifest is a PKB data file rendered with Jinja2 - (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline - string, per PKB conventions. The DaemonSet is pinned to the benchmark - nodepool via nodeSelector so it never lands on the dummy default pool. - """ - return vm_util.ReadAndRenderJinja2Template( - 'cluster/swap_encryption_daemonset.yaml.j2', - ds_name=_DS_NAME, - ds_namespace=_DS_NAMESPACE, - ds_label=_DS_LABEL, - benchmark_nodepool=_BENCHMARK_NODEPOOL, - image=image, - kernel_version=_KERNEL_VERSION.value, - ) - - def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: """Load and return benchmark config spec.""" return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) @@ -665,12 +646,22 @@ def Cleanup(spec: _BenchmarkSpec) -> None: def _deploy_daemonset() -> None: - """Apply the benchmark DaemonSet manifest to the cluster.""" - manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value) - with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as f: - f.write(manifest) - f.close() - kubectl.RunKubectlCommand(['apply', '-f', f.name]) + """Apply the benchmark DaemonSet manifest to the cluster. 
+ + Uses kubernetes_commands.ApplyManifest which renders the Jinja2 template + from perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2, + writes it to a temp file, and calls kubectl apply -f — the standard PKB + pattern for deploying manifests. + """ + kubernetes_commands.ApplyManifest( + 'cluster/swap_encryption_daemonset.yaml.j2', + ds_name=_DS_NAME, + ds_namespace=_DS_NAMESPACE, + ds_label=_DS_LABEL, + benchmark_nodepool=_BENCHMARK_NODEPOOL, + image=_DAEMONSET_IMAGE.value, + kernel_version=_KERNEL_VERSION.value, + ) logging.info('[swap_encryption] DaemonSet applied') From 3cd49537a59e14aa5e48e93178da7c637fd75c2e Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 19 Jun 2025 10:46:44 +0000 Subject: [PATCH 06/17] fix(swap_encryption): add linuxConfig.swapConfig to system-config and remove cgroup hack Address Ajay review comments on PR #6776: Comment #r3457877984 (linuxConfig.swapConfig): Extend --system-config-from-file YAML with linuxConfig blocks: linuxConfig.swapConfig.enabled: true -- GKE sets up node-level swap dedicatedLocalSsdProfile.diskCount: N -- LSSD: use local NVMe for swap linuxConfig.sysctl: vm.swappiness=100, vm.min_free_kbytes=200, vm.watermark_scale_factor=500 Ref: https://cloud.google.com/kubernetes-engine/docs/how-to/node-memory-swap Comment #r3457928855 (cgroup hack): Remove memory.swap.max=max loop from swap_encryption_daemonset.yaml.j2. With kubeletConfig.memorySwapBehavior=LimitedSwap the kubelet manages per-container swap allocation; the cgroup hack is unnecessary. 
--- .../cluster/swap_encryption_daemonset.yaml.j2 | 57 ++++--------------- .../swap_encryption_benchmark.py | 34 ++++++++++- 2 files changed, 42 insertions(+), 49 deletions(-) diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 index c40ec79dff..62b773ccfd 100644 --- a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 +++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 @@ -166,53 +166,15 @@ spec: tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\ echo "[pkb] WARNING: kernel source extraction failed" >&2 fi - echo "[pkb] Unlocking container cgroup swap limits..." - # GKE cgroup v2 sets memory.swap.max=0 per-container, which - # prevents swap usage even when the node has a swap device and - # vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because - # the kernel can't page out to swap for this cgroup. - # - # NOTE: the old approach derived the cgroup path from - # /proc/self/cgroup, but inside a cgroup namespace that reports - # "0::/" — so the write targeted the host ROOT cgroup, silently - # no-op'd, and swap stayed locked (the OOM-in-15s symptom above). - # /sys is the host cgroup tree (hostPath mount) and this pod is - # privileged, so instead unlock swap across the entire kubepods - # hierarchy, which is guaranteed to contain our own container. - if [ -d /sys/fs/cgroup/kubepods.slice ] || \ - [ -d /sys/fs/cgroup/kubepods ]; then - # cgroup v2: write 'max' to every memory.swap.max under kubepods*. - find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \ - 2>/dev/null | while read -r _f; do - echo max > "$_f" 2>/dev/null || true - done - fi - # Best-effort: our own namespaced path and the unified root. 
- PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \ - 2>/dev/null) - for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \ - /sys/fs/cgroup/memory.swap.max; do - [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; } - done - # cgroup v1 fallback: lift the combined RAM+swap hard ceiling. - find /sys/fs/cgroup/memory -path '*kubepods*' \ - -name memory.memsw.limit_in_bytes 2>/dev/null \ - | while read -r _f; do - echo -1 > "$_f" 2>/dev/null || true - done - # Verify and surface the result in the pod log. grep -L lists - # files that do NOT contain 'max' on their first line, i.e. ones - # still capping swap. - PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \ - -name memory.swap.max 2>/dev/null \ - | xargs -r grep -L '^max' 2>/dev/null | head -1) - if [ -n "$PKB_STILL_CAPPED" ]; then - echo "[pkb] WARNING: cgroup swap still capped at \ - $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \ - OOM-killed before swap is exercised" >&2 - else - echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)" - fi + # Container cgroup swap limits are managed by the kubelet when + # kubeletConfig.memorySwapBehavior=LimitedSwap is set via + # --system-config-from-file (GKE) or kubelet-config.json (EKS). + # Manually writing memory.swap.max=max across kubepods is not + # required and is superseded by the kubelet swap config. + # Reference: Ajay's review comment go/pkb-swap-encryption-pr1 + # #r3457928855 — https://github.com/GoogleCloudPlatform/ + # PerfKitBenchmarker/pull/6776#discussion_r3457928855 + echo "[pkb] Swap limits managed by kubelet (LimitedSwap config)." echo "[pkb] Tools installed. Writing ready sentinel." 
touch /tmp/pkb_ready sleep infinity @@ -264,3 +226,4 @@ spec: hostPath: path: /lib/modules type: Directory + \ No newline at end of file diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 5767f8eb71..f8076ac4e7 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -923,7 +923,34 @@ def _create_benchmark_node_pool(cluster) -> None: swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value system_config_tmp = None if swap_behavior: - kubelet_yaml = f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\n' + # Build system-config YAML for --system-config-from-file. + # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): + # kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods. + # linuxConfig.swapConfig: GKE enables node-level swap device. + # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use + # the local NVMe as the swap device (avoids boot-disk overhead). + # linuxConfig.sysctl: swap aggressiveness tuning so the benchmark + # workloads can drive sustained swap I/O. 
+ # Reference: + # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ + # node-memory-swap#enable + if is_lssd: + swap_config_block = ( + ' swapConfig:\n' + ' enabled: true\n' + ' dedicatedLocalSsdProfile:\n' + f' diskCount: {_LSSD_COUNT.value}\n' + ) + else: + swap_config_block = ' swapConfig:\n enabled: true\n' + kubelet_yaml = ( + f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\nlinuxConfig:\n' + + swap_config_block + + ' sysctl:\n' + ' vm.min_free_kbytes: 200\n' + ' vm.watermark_scale_factor: 500\n' + ' vm.swappiness: 100\n' + ) system_config_tmp = tempfile.NamedTemporaryFile( mode='w', suffix='.yaml', delete=False ) @@ -931,9 +958,12 @@ def _create_benchmark_node_pool(cluster) -> None: system_config_tmp.flush() cmd.flags['system-config-from-file'] = system_config_tmp.name logging.info( - '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)', + '[swap_encryption] system-config-from-file: ' + 'kubelet_swap=%s lssd=%s (written to %s):\n%s', swap_behavior, + is_lssd, system_config_tmp.name, + kubelet_yaml, ) logging.info( From 122505240178962daf942fb237da6ddd9951348a Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Wed, 24 Jun 2026 13:13:52 +0530 Subject: [PATCH 07/17] fix(swap_encryption): lean DaemonSet + Phase 1 fio microbenchmarks --- .../cluster/swap_encryption_daemonset.yaml.j2 | 169 +- .../swap_encryption_benchmark.py | 3031 +++++++++-------- 2 files changed, 1672 insertions(+), 1528 deletions(-) diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 index 62b773ccfd..29cacfb3ce 100644 --- a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 +++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 @@ -28,154 +28,46 @@ spec: - bash - -c - | - echo "[pkb] Installing benchmark tools..." - # Retry apt-get up to 3 times — transient network failures are - # common on a freshly-started GKE node. 
Critical tools (fio, - # stress-ng) must be present before we write the ready sentinel; - # a silent || true here would cause /tmp/pkb_ready to appear even - # when tools are missing, breaking all subsequent phases. + echo "[pkb] Installing measurement tools..." + # Only the tools needed for Phase 1 (raw-device fio) and Phase 2 + # (CPU/I/O overhead) are installed here. Workload benchmarks + # (redis, opensearch, kernel-build) run in separate pods via + # existing PKB benchmark modules and are NOT installed here. PKB_APT_OK=0 for _attempt in 1 2 3; do apt-get update -qq 2>&1 || true - DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\ - fio \\ - stress-ng \\ - sysstat \\ - cryptsetup \\ - mdadm \\ - redis-server \\ - redis-tools \\ - git \\ - wget \\ - curl \\ - make \\ - gcc \\ - bc \\ - flex \\ - bison \\ - libelf-dev \\ - libssl-dev \\ - cgroup-tools \\ - nvme-cli \\ - util-linux \\ - python3-pip \\ - libevent-dev \\ - libssl-dev \\ - libpcre3-dev \\ - zlib1g-dev \\ - build-essential \\ - autoconf \\ - automake \\ - libtool \\ - libtool-bin \\ - pkg-config \\ - python3-dev \\ - default-jre-headless \\ + DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + fio \ + cryptsetup \ + mdadm \ + sysstat \ + nvme-cli \ 2>&1 && PKB_APT_OK=1 && break echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2 sleep 15 done - if [ "$PKB_APT_OK" != "1" ] || \\ - ! command -v fio >/dev/null 2>&1 || \\ - ! command -v stress-ng >/dev/null 2>&1; then - echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2 + if [ "$PKB_APT_OK" != "1" ] || ! command -v fio >/dev/null 2>&1; then + echo "[pkb] FATAL: fio not installed after 3 attempts" >&2 exit 1 fi - echo "[pkb] Installing memtier_benchmark from source..." - # Pin a stable release tag — building from the moving default - # branch (HEAD) intermittently broke (memtier_benchmark not found - # → Phase 3a lost its P50/P90/P99 latency). 
2.2.1 matches the - # version PKB's memtier package (memtier.MemtierResult.Parse) is - # validated against and builds cleanly with the apt deps above. - # Fall back to HEAD only if the tagged clone fails. - if ! command -v memtier_benchmark >/dev/null 2>&1; then - (cd /tmp && \\ - rm -rf memtier_benchmark && \\ - ( git clone --depth 1 --branch 2.2.1 \\ - https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\ - git clone --depth 1 \\ - https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\ - cd memtier_benchmark && \\ - autoreconf -ivf 2>&1 && \\ - ./configure 2>&1 && \\ - make -j$(nproc) 2>&1 && \\ - make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\ - echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used" - fi - if command -v memtier_benchmark >/dev/null 2>&1; then - echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)" - fi - echo "[pkb] Installing esrally (lightweight)..." - python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true - pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ - pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\ - echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used" - if command -v esrally >/dev/null 2>&1; then - echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)" - else - echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2 - fi - echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..." - # Phase 3c needs a real search server on :9200. Nothing in apt - # ships one and the pod has no systemd, so install the OpenSearch - # bundle (ships its own JDK) and launch the binary directly in the - # phase. 
All best-effort: if any step fails the phase probes the - # endpoint and skips cleanly rather than recording fake timings. - if [ ! -x /opt/opensearch/bin/opensearch ]; then - OS_VER=2.15.0 - (cd /opt && \\ - wget -q --timeout=600 -O os.tgz \\ - "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\ - tar -xzf os.tgz && rm -f os.tgz && \\ - mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\ - echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2 - fi - if [ -x /opt/opensearch/bin/opensearch ]; then - # pkbos owns and runs OpenSearch (it refuses to run as root). - # Give it a home so HOME/temp paths are writable. - id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true - printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\ - > /opt/opensearch/config/opensearch.yml - mkdir -p /opt/opensearch/config/jvm.options.d - # 2 GB heap: 512 MB was too small and OpenSearch aborted early. - # On a 252 GB node this still leaves plenty of page cache to - # pressure into swap during the phase. - printf -- '-Xms2g\\n-Xmx2g\\n' \\ - > /opt/opensearch/config/jvm.options.d/pkb-heap.options - sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true - # CRITICAL: never run the binary as root here (it bails and - # leaves root-owned files in logs/ that block the pkbos server). - # Clear any stale logs and chown everything to pkbos LAST. - rm -f /opt/opensearch/logs/* 2>/dev/null || true - chown -R pkbos /opt/opensearch 2>/dev/null || true - echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)" - fi - echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..." 
- PKB_KVER="{{ kernel_version }}" - PKB_KROOT="/mnt/stateful_partition/pkb_kernel" - PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz" - PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER" - PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz" - mkdir -p "$PKB_KROOT" - if [ ! -f "$PKB_KTARBALL" ]; then - wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\ - echo "[pkb] WARNING: kernel tarball download failed" >&2 - fi - if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then - echo "[pkb] Extracting kernel source (xz)..." - tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\ - echo "[pkb] WARNING: kernel source extraction failed" >&2 + echo "[pkb] fio installed: $(fio --version 2>&1 | head -1)" + echo "[pkb] Verifying swap device is active..." + PKB_SWAP_FOUND=0 + for _attempt in $(seq 1 30); do + if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then + PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps) + echo "[pkb] Swap device active: $PKB_SWAP_DEV" + PKB_SWAP_FOUND=1 + break + fi + echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2 + sleep 5 + done + if [ "$PKB_SWAP_FOUND" != "1" ]; then + echo "[pkb] WARNING: no active swap device after 150s — " \ + "check linuxConfig.swapConfig / kubelet swap config." >&2 fi - # Container cgroup swap limits are managed by the kubelet when - # kubeletConfig.memorySwapBehavior=LimitedSwap is set via - # --system-config-from-file (GKE) or kubelet-config.json (EKS). - # Manually writing memory.swap.max=max across kubepods is not - # required and is superseded by the kubelet swap config. - # Reference: Ajay's review comment go/pkb-swap-encryption-pr1 - # #r3457928855 — https://github.com/GoogleCloudPlatform/ - # PerfKitBenchmarker/pull/6776#discussion_r3457928855 - echo "[pkb] Swap limits managed by kubelet (LimitedSwap config)." - echo "[pkb] Tools installed. Writing ready sentinel." + echo "[pkb] Measurement tools ready. Writing ready sentinel." 
touch /tmp/pkb_ready sleep infinity securityContext: @@ -226,4 +118,3 @@ spec: hostPath: path: /lib/modules type: Directory - \ No newline at end of file diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index f8076ac4e7..e596abf963 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -63,6 +63,7 @@ Bulk-index + search query under swap pressure (esrally or curl). """ +import json import logging import os import tempfile @@ -87,7 +88,7 @@ # Benchmark identity # --------------------------------------------------------------------------- -BENCHMARK_NAME = 'swap_encryption' +BENCHMARK_NAME = "swap_encryption" BENCHMARK_CONFIG = """ @@ -117,261 +118,254 @@ _DAEMONSET_IMAGE = flags.DEFINE_string( - 'swap_encryption_daemonset_image', - 'ubuntu:22.04', - 'Container image used for the privileged benchmark DaemonSet pod.', + "swap_encryption_daemonset_image", + "ubuntu:22.04", + "Container image used for the privileged benchmark DaemonSet pod.", ) _NODEPOOL = flags.DEFINE_string( - 'swap_encryption_nodepool', - 'benchmark', - 'Name of the node pool to deploy the benchmark DaemonSet on.', + "swap_encryption_nodepool", + "benchmark", + "Name of the node pool to deploy the benchmark DaemonSet on.", ) _INSTANCE_SIZE_LABEL = flags.DEFINE_string( - 'swap_encryption_instance_size_label', - '', - 'Human-readable label for the current instance size being tested, e.g. ' + "swap_encryption_instance_size_label", + "", + "Human-readable label for the current instance size being tested, e.g. " '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that ' - 'results from multiple PKB runs across different instance sizes can be ' - 'collated and compared. 
Defaults to the value reported by the cloud ' - 'metadata endpoint inside the pod.', + "results from multiple PKB runs across different instance sizes can be " + "collated and compared. Defaults to the value reported by the cloud " + "metadata endpoint inside the pod.", ) _COLLECT_COST = flags.DEFINE_boolean( - 'swap_encryption_collect_cost', + "swap_encryption_collect_cost", False, - 'When True, emit a cost_estimate_usd sample using on-demand pricing ' - 'for the instance type detected at runtime.', + "When True, emit a cost_estimate_usd sample using on-demand pricing " + "for the instance type detected at runtime.", ) _FAIL_ON_DEGRADED = flags.DEFINE_boolean( - 'swap_encryption_fail_on_degraded', + "swap_encryption_fail_on_degraded", True, - 'When True (default), raise an error at the end of Run() if the run was ' - 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' - 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' - 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' - 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' - 'empty or meaningless data. Set False to keep the legacy behaviour of ' - 'always returning whatever partial samples were collected.', + "When True (default), raise an error at the end of Run() if the run was " + "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and " + "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng " + "swap-pressure phase was OOM-killed before completing. This prevents PKB " + "from reporting SUCCEEDED for a run whose post-eviction phases produced " + "empty or meaningless data. Set False to keep the legacy behaviour of " + "always returning whatever partial samples were collected.", ) _PHASES = flags.DEFINE_list( - 'swap_encryption_phases', - ['all'], - 'Which Run() phases to execute, for fast iteration against an ' - 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). 
' - 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' - 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' + "swap_encryption_phases", + ["all"], + "Which Run() phases to execute, for fast iteration against an " + "already-provisioned cluster (e.g. --run_stage=run --run_uri=...). " + "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng " + "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), " '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' - 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' - 'Phases not listed are skipped and do not affect the degraded-run gate ' + "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. " + "Phases not listed are skipped and do not affect the degraded-run gate " '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").', ) _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( - 'swap_encryption_benchmark_machine_type', - 'n4-highmem-32', - 'Machine type for the benchmark nodepool created in Prepare(). ' - 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' - '(LSSD RAID-0). The matching swap setup is selected automatically.', + "swap_encryption_benchmark_machine_type", + "n4-highmem-32", + "Machine type for the benchmark nodepool created in Prepare(). " + "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd " + "(LSSD RAID-0). The matching swap setup is selected automatically.", ) _BENCHMARK_LSSD = flags.DEFINE_boolean( - 'swap_encryption_lssd', + "swap_encryption_lssd", False, - 'Force LSSD RAID-0 swap path even when the machine type name does not ' + "Force LSSD RAID-0 swap path even when the machine type name does not " 'contain "lssd". 
Auto-detected from machine type when False.', ) _LSSD_COUNT = flags.DEFINE_integer( - 'swap_encryption_lssd_count', + "swap_encryption_lssd_count", 1, - 'Number of local NVMe SSDs to attach as raw block devices ' - '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' - 'count for the chosen machine type: c4-standard-8-lssd=1, ' - 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' - 'Default 1 covers most single-lssd machine types.', + "Number of local NVMe SSDs to attach as raw block devices " + "(--local-nvme-ssd-block count=N). Must match the fixed local SSD " + "count for the chosen machine type: c4-standard-8-lssd=1, " + "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). " + "Default 1 covers most single-lssd machine types.", ) _NODE_IMAGE_TYPE = flags.DEFINE_string( - 'swap_encryption_node_image_type', - 'UBUNTU_CONTAINERD', - 'GKE node image type for the benchmark nodepool. ' - 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' - 'down device-mapper at the kernel LSM level and cryptsetup hangs ' - 'indefinitely from any pod context (even privileged, even via nsenter ' - 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' - 'from privileged pods without restriction. ' - 'Use COS_CONTAINERD only when dm-crypt is disabled ' - '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' - 'AL2 on EKS.', + "swap_encryption_node_image_type", + "UBUNTU_CONTAINERD", + "GKE node image type for the benchmark nodepool. " + "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks " + "down device-mapper at the kernel LSM level and cryptsetup hangs " + "indefinitely from any pod context (even privileged, even via nsenter " + "into the host mount namespace). Ubuntu GKE nodes allow cryptsetup " + "from privileged pods without restriction. " + "Use COS_CONTAINERD only when dm-crypt is disabled " + "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. 
" + "AL2 on EKS.", ) _BOOT_DISK_TYPE = flags.DEFINE_string( - 'swap_encryption_boot_disk_type', - 'hyperdisk-balanced', - 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' - 'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 ' - 'dev/test machines, which do not support hyperdisk-balanced.', + "swap_encryption_boot_disk_type", + "hyperdisk-balanced", + "Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced " + "for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 " + "dev/test machines, which do not support hyperdisk-balanced.", ) _BOOT_DISK_IOPS = flags.DEFINE_integer( - 'swap_encryption_boot_disk_iops', + "swap_encryption_boot_disk_iops", 80000, - 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' - '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', + "Provisioned IOPS for the boot disk (hyperdisk-balanced only). " + "80 000 is the COS max-IOPS target. Ignored for pd-ssd.", ) _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( - 'swap_encryption_boot_disk_throughput', + "swap_encryption_boot_disk_throughput", 1200, - 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' - 'only). Must be set together with iops. 1200 MB/s pairs with 80 000 ' - 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' - 'pd-ssd.', + "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced " + "only). Must be set together with iops. 1200 MB/s pairs with 80 000 " + "IOPS for production; use 140 (minimum) for dev/test. Ignored for " + "pd-ssd.", ) _BOOT_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_boot_disk_size_gb', + "swap_encryption_boot_disk_size_gb", 500, - 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' - 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' - '(see Engineer Assignments table in execution-plan.md). 
' - 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', + "Boot disk size in GiB for the benchmark nodepool. 500 GiB is " + "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run " + "(see Engineer Assignments table in execution-plan.md). " + "For LSSD configs the boot disk is smaller; 100 GiB is fine.", ) _ADD_SWAP_DISK = flags.DEFINE_boolean( - 'swap_encryption_add_swap_disk', + "swap_encryption_add_swap_disk", False, - 'Attach a dedicated second disk to the benchmark nodepool for use as ' - 'the swap device. Required for dm-crypt measurement on single-boot-disk ' - 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' - 'from pod namespaces. The second disk is provisioned via ' - '--additional-node-disk using the same type/IOPS/throughput as the boot ' - 'disk flags.', + "Attach a dedicated second disk to the benchmark nodepool for use as " + "the swap device. Required for dm-crypt measurement on single-boot-disk " + "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper " + "from pod namespaces. The second disk is provisioned via " + "--additional-node-disk using the same type/IOPS/throughput as the boot " + "disk flags.", ) _SWAP_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_swap_disk_size_gb', + "swap_encryption_swap_disk_size_gb", 500, - 'Size in GiB of the dedicated swap disk when ' - '--swap_encryption_add_swap_disk is True. Must satisfy the ' - 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', + "Size in GiB of the dedicated swap disk when " + "--swap_encryption_add_swap_disk is True. 
Must satisfy the " + "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.", ) _ENABLE_DMCRYPT = flags.DEFINE_boolean( - 'swap_encryption_enable_dmcrypt', + "swap_encryption_enable_dmcrypt", True, - 'When True (default), wrap the swap device in dm-crypt plain mode ' + "When True (default), wrap the swap device in dm-crypt plain mode " "(aes-xts-plain64, ephemeral random key) matching GKE's " - 'go/node:swap-encryption implementation. Set False to measure plain ' - '(unencrypted) swap overhead as a baseline.', + "go/node:swap-encryption implementation. Set False to measure plain " + "(unencrypted) swap overhead as a baseline.", ) _GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string( - 'swap_encryption_gke_kubelet_memory_swap', - 'LimitedSwap', - 'Value for kubeletConfig.memorySwapBehavior injected via ' - '--system-config-from-file when creating the GKE benchmark nodepool. ' - 'LimitedSwap (default) — the kubelet allows pods to use swap up to their ' - 'memory limit; required for the DaemonSet pod to drive kernel swapping. ' - 'NoSwap — disables swap at the kubelet level (use for a baseline run that ' - 'confirms zero swap activity). Set empty string to omit the flag entirely ' - 'and rely on the cluster-level default.', + "swap_encryption_gke_kubelet_memory_swap", + "LimitedSwap", + "Value for kubeletConfig.memorySwapBehavior injected via " + "--system-config-from-file when creating the GKE benchmark nodepool. " + "LimitedSwap (default) — the kubelet allows pods to use swap up to their " + "memory limit; required for the DaemonSet pod to drive kernel swapping. " + "NoSwap — disables swap at the kubelet level (use for a baseline run that " + "confirms zero swap activity). Set empty string to omit the flag entirely " + "and rely on the cluster-level default.", ) _SWAP_DEVICE = flags.DEFINE_string( - 'swap_encryption_device', - '', - 'Explicit block device path to use as the swap device, e.g. ' - '/dev/nvme1n1 or /dev/mapper/swap_encrypted. 
When empty (default), ' - 'the device is auto-detected from /proc/swaps inside the benchmark pod.', + "swap_encryption_device", + "", + "Explicit block device path to use as the swap device, e.g. " + "/dev/nvme1n1 or /dev/mapper/swap_encrypted. When empty (default), " + "the device is auto-detected from /proc/swaps inside the benchmark pod.", ) _SWAP_TYPE = flags.DEFINE_string( - 'swap_encryption_swap_type', - 'hyperdisk', - 'Storage target for the swap device. One of: hyperdisk (default), ' - 'lssd, instance_store, io2.', -) - -_KERNEL_VERSION = flags.DEFINE_string( - 'swap_encryption_kernel_version', - '', - 'Kernel version string to embed in the DaemonSet pod spec as a label. ' - 'When empty (default) the version is not pinned.', + "swap_encryption_swap_type", + "hyperdisk", + "Storage target for the swap device. One of: hyperdisk (default), " + "lssd, instance_store, io2.", ) _ENABLE_ZSWAP = flags.DEFINE_boolean( - 'swap_encryption_enable_zswap', + "swap_encryption_enable_zswap", False, - 'When True, enable zswap compressed swap cache on the benchmark node.', + "When True, enable zswap compressed swap cache on the benchmark node.", ) _MIN_FREE_KBYTES = flags.DEFINE_integer( - 'swap_encryption_min_free_kbytes', + "swap_encryption_min_free_kbytes", 0, - 'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. ' - '0 (default) leaves the kernel default unchanged.', + "Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. " + "0 (default) leaves the kernel default unchanged.", ) _FIO_RUNTIME_SEC = flags.DEFINE_integer( - 'swap_encryption_fio_runtime_sec', + "swap_encryption_fio_runtime_sec", 60, - 'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.', + "Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.", ) _STRESS_VM_BYTES = flags.DEFINE_string( - 'swap_encryption_stress_vm_bytes', - '28G', - 'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. 
' - 'Should exceed available node RAM to force sustained paging.', + "swap_encryption_stress_vm_bytes", + "28G", + "stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. " + "Should exceed available node RAM to force sustained paging.", ) _STRESS_VM_BYTES_LIST = flags.DEFINE_list( - 'swap_encryption_stress_vm_bytes_list', + "swap_encryption_stress_vm_bytes_list", [], - 'Comma-separated list of --vm-bytes values to sweep in Phase 2a, ' + "Comma-separated list of --vm-bytes values to sweep in Phase 2a, " 'e.g. "14G,28G,56G". Overrides --swap_encryption_stress_vm_bytes.', ) _STRESS_TIMEOUT_SEC = flags.DEFINE_integer( - 'swap_encryption_stress_timeout_sec', + "swap_encryption_stress_timeout_sec", 300, - 'Maximum seconds to wait for the stress-ng swap-pressure phase.', + "Maximum seconds to wait for the stress-ng swap-pressure phase.", ) -_DS_NAME = 'pkb-swap-benchmark' -_DS_NAMESPACE = 'default' -_DS_LABEL = 'pkb-swap-benchmark' +_DS_NAME = "pkb-swap-benchmark" +_DS_NAMESPACE = "default" +_DS_LABEL = "pkb-swap-benchmark" # Transient kubectl errors that are safe to retry. -_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close') +_TRANSIENT_KUBECTL_ERRORS = ("connection reset by peer", "websocket: close") # Errors indicating the container/pod is gone and needs recovery. _CONTAINER_GONE_KUBECTL_ERRORS = ( - 'container not found', - 'procReady not received', - 'unable to upgrade connection', - 'not found', - 'deleted state', + "container not found", + "procReady not received", + "unable to upgrade connection", + "not found", + "deleted state", ) _active_pod: list[str] = [] # single-element list so closures can mutate it @@ -385,241 +379,267 @@ _oom_events: list[str] = [] -_BENCHMARK_NODEPOOL = 'benchmark' -_DEFAULT_NODEPOOL = 'default-pool' +_BENCHMARK_NODEPOOL = "benchmark" +_DEFAULT_NODEPOOL = "default-pool" class _GcpZonalResource: - """Minimal resource shim for gcp_util.GcloudCommand on compute operations. 
+ """Minimal resource shim for gcp_util.GcloudCommand on compute operations. - gcp_util.GcloudCommand auto-injects --project and --zone from the resource - object passed to it. GkeCluster._GcloudCommand() handles container/* - operations correctly but also switches --zone → --region for multi-zone - clusters, which is wrong for gcloud compute commands (--region creates - regional resources, not zonal ones). This shim pins a single zone so all - gcloud compute calls target the correct AZ. - """ + gcp_util.GcloudCommand auto-injects --project and --zone from the resource + object passed to it. GkeCluster._GcloudCommand() handles container/* + operations correctly but also switches --zone → --region for multi-zone + clusters, which is wrong for gcloud compute commands (--region creates + regional resources, not zonal ones). This shim pins a single zone so all + gcloud compute calls target the correct AZ. + """ - def __init__(self, project: str, zone: str) -> None: - self.project = project - self.zone = zone + def __init__(self, project: str, zone: str) -> None: + self.project = project + self.zone = zone def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: - """Load and return benchmark config spec.""" - return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + """Load and return benchmark config spec.""" + return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) def Prepare(spec: _BenchmarkSpec) -> None: - """Two-step nodepool setup then DaemonSet deployment. - - Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap - e2-medium default nodepool. - - Step 2 (this function): - a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures - dm-crypt swap at the OS level — before any pod is scheduled. - b. Delete the dummy default nodepool to stop its cost immediately. - c. 
Deploy the privileged DaemonSet (pinned via nodeSelector to the - benchmark nodepool) and wait for tools to install. - """ - cluster = spec.container_cluster - - # ── Step 2a: add real benchmark nodepool ──────────────────────────────── - if not getattr(cluster, 'project', None): - # Guard: AWS / EKS path — nodepool management is external. - # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet - # nodeSelector (pkb_nodepool=benchmark) before deploying the pod. - logging.info( - '[swap_encryption] EKS cluster — labelling existing nodes with ' - 'pkb_nodepool=%s so the DaemonSet nodeSelector matches.', - _BENCHMARK_NODEPOOL, - ) - kubectl.RunKubectlCommand([ - 'label', - 'nodes', - '--all', - '--overwrite', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - ]) - # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs - # on io2 hardware-encrypted storage (no-op unless swap_type=io2). - _ensure_io2_volume() - else: - # GCP path: true two-step nodepool setup. - logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') - _create_benchmark_node_pool(cluster) - - # ── Step 2b: wait for the benchmark node to join and be Ready ───────── - logging.info('[swap_encryption] Step 2b: waiting for benchmark node') - _wait_for_benchmark_node() - - # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── - if _ADD_SWAP_DISK.value: - logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk') - _attach_swap_disk(cluster) - - # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── - # Deploy and wait for the pod BEFORE deleting the default nodepool. - # Deleting the default pool while the benchmark node is still joining causes - # a temporary API server i/o timeout (control plane busy with two nodepool - # ops simultaneously). Once the pod is Running the cluster is fully stable. 
- logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet') - _deploy_daemonset() - - pod = _wait_for_benchmark_pod() - logging.info('[swap_encryption] Benchmark pod ready: %s', pod) - - # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── - if getattr(cluster, 'project', None): - logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool') - _delete_default_node_pool(cluster) - # The DaemonSet pod may be evicted and rescheduled with a new name during - # the nodepool deletion (cluster control plane briefly interrupts pod - # lifecycle). Re-resolve the pod name to avoid stale-reference errors on - # all subsequent _pod_exec calls. - logging.info( - '[swap_encryption] Step 2d: re-resolving benchmark pod ' - 'after nodepool deletion' - ) + """Two-step nodepool setup then DaemonSet deployment. + + Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap + e2-medium default nodepool. + + Step 2 (this function): + a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with + COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures + dm-crypt swap at the OS level — before any pod is scheduled. + b. Delete the dummy default nodepool to stop its cost immediately. + c. Deploy the privileged DaemonSet (pinned via nodeSelector to the + benchmark nodepool) and wait for tools to install. + """ + cluster = spec.container_cluster + + # ── Step 2a: add real benchmark nodepool ──────────────────────────────── + if not getattr(cluster, "project", None): + # Guard: AWS / EKS path — nodepool management is external. + # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet + # nodeSelector (pkb_nodepool=benchmark) before deploying the pod. 
+ logging.info( + "[swap_encryption] EKS cluster — labelling existing nodes with " + "pkb_nodepool=%s so the DaemonSet nodeSelector matches.", + _BENCHMARK_NODEPOOL, + ) + kubectl.RunKubectlCommand([ + "label", + "nodes", + "--all", + "--overwrite", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + ]) + # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs + # on io2 hardware-encrypted storage (no-op unless swap_type=io2). + _ensure_io2_volume() + else: + # GCP path: true two-step nodepool setup. + logging.info("[swap_encryption] Step 2a: creating benchmark nodepool") + _create_benchmark_node_pool(cluster) + + # ── Step 2b: wait for the benchmark node to join and be Ready ───────── + logging.info("[swap_encryption] Step 2b: waiting for benchmark node") + _wait_for_benchmark_node() + + # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── + if _ADD_SWAP_DISK.value: + logging.info( + "[swap_encryption] Step 2b2: attaching dedicated swap disk" + ) + _attach_swap_disk(cluster) + + # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── + # Deploy and wait for the pod BEFORE deleting the default nodepool. + # Deleting the default pool while the benchmark node is still joining causes + # a temporary API server i/o timeout (control plane busy with two nodepool + # ops simultaneously). Once the pod is Running the cluster is fully stable. 
+ logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet") + _deploy_daemonset() + pod = _wait_for_benchmark_pod() - logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod) + logging.info("[swap_encryption] Benchmark pod ready: %s", pod) + + # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── + if getattr(cluster, "project", None): + logging.info( + "[swap_encryption] Step 2d: deleting dummy default nodepool" + ) + _delete_default_node_pool(cluster) + # The DaemonSet pod may be evicted and rescheduled with a new name during + # the nodepool deletion (cluster control plane briefly interrupts pod + # lifecycle). Re-resolve the pod name to avoid stale-reference errors on + # all subsequent _pod_exec calls. + logging.info( + "[swap_encryption] Step 2d: re-resolving benchmark pod " + "after nodepool deletion" + ) + pod = _wait_for_benchmark_pod() + logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod) def _phase_selected(token: str) -> bool: - """Return True if phase `token` should run given --swap_encryption_phases. + """Return True if phase `token` should run given --swap_encryption_phases. - 'all' (the default) selects every phase. Otherwise only the comma-separated - tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. - """ - selected = [p.strip().lower() for p in _PHASES.value if p.strip()] - return (not selected) or ('all' in selected) or (token.lower() in selected) + 'all' (the default) selects every phase. Otherwise only the comma-separated + tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ("all" in selected) or (token.lower() in selected) def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: - """Execute all benchmark phases with gate logic. + """Execute all benchmark phases with gate logic. 
- Execution is structured in three gated tiers matching the execution plan: + Execution is structured in three gated tiers matching the execution plan: - Tier 1 (Gate 1) — fio microbenchmarks - Raw I/O ceiling of the swap device. Gate 1 fails if fio produces - zero samples (device not found, O_DIRECT error, etc.). + Tier 1 (Gate 1) — fio microbenchmarks + Raw I/O ceiling of the swap device. Gate 1 fails if fio produces + zero samples (device not found, O_DIRECT error, etc.). - Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference - Requires an active swap device (Gate 1 must pass). Gate 2 fails if - stress-ng does not complete within timeout. + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference + Requires an active swap device (Gate 1 must pass). Gate 2 fails if + stress-ng does not complete within timeout. - Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) - Independent of Tier 2 results; always attempted if Gate 1 passed. - Individual workload failures are logged but do not abort the others. + Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) + Independent of Tier 2 results; always attempted if Gate 1 passed. + Individual workload failures are logged but do not abort the others. - If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring - application-level swap performance when the raw device is inaccessible. - """ - pod = _wait_for_benchmark_pod() - if pod is None: - raise errors.Benchmarks.RunError( - '[swap_encryption] Benchmark pod never became ready.' - ) - # Initialise the module-level active-pod tracker so _pod_exec and - # _recover_pod can transparently redirect to a replacement pod if the - # original is evicted during the run. 
- _active_pod.clear() - _active_pod.append(pod) - _degraded_reasons.clear() - _pod_lost.clear() - _oom_events.clear() - original_pod = pod - swap_dev = _detect_swap_device(pod) - base_meta = _build_metadata(pod, swap_dev) - results: list[sample.Sample] = [] - t_run_start = time.time() - - logging.info('[swap_encryption] swap device: %s', swap_dev) - - # ── Cost estimate ───────────────────────────────────────────────────────── - if _COLLECT_COST.value: - elapsed = time.time() - t_run_start - results += _collect_cost_sample(pod, elapsed, base_meta) - - # ── Final degradation gate ──────────────────────────────────────────────── - # The phase try/except blocks above keep the run alive so partial data is - # still collected, but that means a catastrophic failure (pod OOM-evicted - # mid-run, no fio data, stress-ng killed before it could drive swap I/O) - # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. - # Detect those conditions here and surface them explicitly. - if _active_pod and _active_pod[0] != original_pod: - _degraded_reasons.append( - 'benchmark pod was replaced during the run ' - f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap ' - 'pressure; phases executed after the eviction ran against a ' - 'freshly-initialised pod (empty /tmp, swap re-setup) and may be ' - 'invalid' - ) - if _pod_lost: - _degraded_reasons.append( - 'benchmark pod(s) went NotFound during the run' - f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' - ' eviction or container exit) and any phase running at or after that' - ' point (e.g. 
kernel-build baseline, OpenSearch) produced invalid data' - ) - if _oom_events: - _degraded_reasons.append( - 'OOM kill(s) (rc=137) occurred during the run on pod(s) ' - f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by ' - 'the OOM killer (the container may have restarted in place), so the ' - 'affected phase(s) produced no or partial data' - ) + If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring + application-level swap performance when the raw device is inaccessible. + """ + pod = _wait_for_benchmark_pod() + if pod is None: + raise errors.Benchmarks.RunError( + "[swap_encryption] Benchmark pod never became ready." + ) + # Initialise the module-level active-pod tracker so _pod_exec and + # _recover_pod can transparently redirect to a replacement pod if the + # original is evicted during the run. + _active_pod.clear() + _active_pod.append(pod) + _degraded_reasons.clear() + _pod_lost.clear() + _oom_events.clear() + original_pod = pod + swap_dev = _detect_swap_device(pod) + base_meta = _build_metadata(pod, swap_dev) + results: list[sample.Sample] = [] + t_run_start = time.time() + + logging.info("[swap_encryption] swap device: %s", swap_dev) + + # ── Phase 1: fio microbenchmarks on raw swap device ───────────────────────── + if _phase_selected("fio"): + logging.info( + "[swap_encryption] Phase 1: fio microbenchmarks on %s", swap_dev + ) + try: + phase1_samples = _run_phase1_fio(pod, swap_dev, base_meta) + results += phase1_samples + if not phase1_samples: + _degraded_reasons.append( + "Phase 1 (fio) produced no samples — " + "check fio install and swap device accessibility" + ) + logging.error("[swap_encryption] Phase 1: no samples produced") + except Exception as e: # pylint: disable=broad-except + _degraded_reasons.append(f"Phase 1 fio failed: {e}") + logging.error("[swap_encryption] Phase 1 fio error: %s", e) + + # ── Cost estimate ───────────────────────────────────────────────────────── + if _COLLECT_COST.value: + 
elapsed = time.time() - t_run_start + results += _collect_cost_sample(pod, elapsed, base_meta) + + # ── Final degradation gate ──────────────────────────────────────────────── + # The phase try/except blocks above keep the run alive so partial data is + # still collected, but that means a catastrophic failure (pod OOM-evicted + # mid-run, no fio data, stress-ng killed before it could drive swap I/O) + # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. + # Detect those conditions here and surface them explicitly. + if _active_pod and _active_pod[0] != original_pod: + _degraded_reasons.append( + f"benchmark pod was replaced during the run ({original_pod} →" + f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;" + " phases executed after the eviction ran against a" + " freshly-initialised pod (empty /tmp, swap re-setup) and may be" + " invalid" + ) + if _pod_lost: + _degraded_reasons.append( + "benchmark pod(s) went NotFound during the run" + f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' + " eviction or container exit) and any phase running at or after" + " that" + " point (e.g. 
kernel-build baseline, OpenSearch) produced invalid" + " data" + ) + if _oom_events: + _degraded_reasons.append( + "OOM kill(s) (rc=137) occurred during the run on pod(s) " + f'{", ".join(_oom_events)} — a phase exceeded memory and was' + " killed by " + "the OOM killer (the container may have restarted in place), so" + " the " + "affected phase(s) produced no or partial data" + ) - degraded = bool(_degraded_reasons) - results.append( - sample.Sample( - 'swap_encryption_run_status', - 0.0 if degraded else 1.0, - 'status', - dict( - base_meta, - degraded=degraded, - degraded_reasons='; '.join(_degraded_reasons) or 'none', - num_samples=len(results) + 1, - ), - ) - ) - - if degraded: - msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(_degraded_reasons) - logging.error(msg) - if _FAIL_ON_DEGRADED.value: - # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The - # samples collected so far are still published by PKB before the failure - # is recorded, so no data is lost. - raise errors.Benchmarks.RunError(msg) - else: - logging.info( - '[swap_encryption] Run completed cleanly (%d samples)', len(results) + degraded = bool(_degraded_reasons) + results.append( + sample.Sample( + "swap_encryption_run_status", + 0.0 if degraded else 1.0, + "status", + dict( + base_meta, + degraded=degraded, + degraded_reasons="; ".join(_degraded_reasons) or "none", + num_samples=len(results) + 1, + ), + ) ) - return results + if degraded: + msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons) + logging.error(msg) + if _FAIL_ON_DEGRADED.value: + # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED. The + # samples collected so far are still published by PKB before the failure + # is recorded, so no data is lost. 
+ raise errors.Benchmarks.RunError(msg) + else: + logging.info( + "[swap_encryption] Run completed cleanly (%d samples)", len(results) + ) + + return results def Cleanup(spec: _BenchmarkSpec) -> None: - """Remove the DaemonSet and tear down any swap configuration.""" - pod = _wait_for_benchmark_pod(timeout=30) - if pod: - _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True) - _pod_exec( - pod, - textwrap.dedent(""" + """Remove the DaemonSet and tear down any swap configuration.""" + pod = _wait_for_benchmark_pod(timeout=30) + if pod: + _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True) + _pod_exec( + pod, + textwrap.dedent(""" swapoff /dev/mapper/swap_encrypted 2>/dev/null || true dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true """), - ignore_failure=True, - ) - # Clean up loop device backing files (single-disk fallback path). - _pod_exec( - pod, - textwrap.dedent(""" + ignore_failure=True, + ) + # Clean up loop device backing files (single-disk fallback path). + _pod_exec( + pod, + textwrap.dedent(""" for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ /mnt/stateful_partition/pkb_swap_backing do @@ -631,188 +651,230 @@ def Cleanup(spec: _BenchmarkSpec) -> None: rm -f "$backing" done """), - ignore_failure=True, - ) - _pod_exec( - pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", ignore_failure=True - ) + ignore_failure=True, + ) + _pod_exec( + pod, + "pkill -9 'stress-ng|fio' 2>/dev/null || true", + ignore_failure=True, + ) - _delete_daemonset() + _delete_daemonset() - # Detach and delete the dedicated swap disk if one was provisioned. - cluster = spec.container_cluster - if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None): - _detach_and_delete_swap_disk(cluster) + # Detach and delete the dedicated swap disk if one was provisioned. 
+ cluster = spec.container_cluster + if _ADD_SWAP_DISK.value and getattr(cluster, "project", None): + _detach_and_delete_swap_disk(cluster) -def _deploy_daemonset() -> None: - """Apply the benchmark DaemonSet manifest to the cluster. - - Uses kubernetes_commands.ApplyManifest which renders the Jinja2 template - from perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2, - writes it to a temp file, and calls kubectl apply -f — the standard PKB - pattern for deploying manifests. - """ - kubernetes_commands.ApplyManifest( - 'cluster/swap_encryption_daemonset.yaml.j2', - ds_name=_DS_NAME, - ds_namespace=_DS_NAMESPACE, - ds_label=_DS_LABEL, - benchmark_nodepool=_BENCHMARK_NODEPOOL, - image=_DAEMONSET_IMAGE.value, - kernel_version=_KERNEL_VERSION.value, - ) - logging.info('[swap_encryption] DaemonSet applied') - - -def _wait_for_benchmark_pod(timeout: int = 900) -> str | None: - """Wait until the DaemonSet pod is Running AND tools are installed. - - The benchmark container installs apt packages on first start and writes - /tmp/pkb_ready when done (~2-4 min on a cold node). We must wait for - that sentinel before exec-ing any commands, otherwise tools like - cryptsetup / fio may not yet be on PATH. - - Uses tab-separated name/phase output so kubectl always exits 0 regardless - of whether any pods are present, avoiding jsonpath index errors. 
- """ - deadline = time.time() + timeout - last_phase = '' - ready_pod = None # pod name once phase == Running - - while time.time() < deadline: - # ── Step 1: wait for Running phase ────────────────────────────────────── - if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pods', - '-l', - f'app={_DS_LABEL}', - '-n', - _DS_NAMESPACE, - '-o', - ( - r'jsonpath={range' - r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' - ), - ], - raise_on_failure=False, - ) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2: - pod_name, phase = parts[0].strip(), parts[1].strip() - if phase == 'Running': - logging.info( - '[swap_encryption] Pod %s is Running – ' - 'waiting for tool install to finish...', - pod_name, - ) - ready_pod = pod_name - break - if phase != last_phase: - logging.info( - '[swap_encryption] Pod %s phase: %s', pod_name, phase - ) - last_phase = phase - if phase in ('Pending',): - _log_pod_events(pod_name) - else: - logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...') - - # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── - if ready_pod is not None: - sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( - [ - 'exec', - ready_pod, - '-n', - _DS_NAMESPACE, - '--', - 'test', - '-f', - '/tmp/pkb_ready', - ], - raise_on_failure=False, - ) - if sentinel_rc == 0: - logging.info( - '[swap_encryption] Pod %s ready (tools installed)', ready_pod - ) - return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff or - # exited) — treat it as a hard reset: re-check pod phase on next iteration. 
- if ( - 'container not found' in sentinel_err - or 'unable to upgrade connection' in sentinel_err - ): - logging.warning( - '[swap_encryption] Pod %s: container not running (%s) ' - '— will re-check pod state', - ready_pod, - sentinel_err.strip(), - ) - ready_pod = None - last_phase = '' - else: - logging.info( - '[swap_encryption] Pod %s: still installing tools...', ready_pod - ) +def _configure_eks_kubelet_swap(spec) -> None: + """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap. + + NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm + integration) to merge. When that lands, EKS node pools should include + a preBootstrapCommands block writing nodeadm config with + memorySwapBehavior: LimitedSwap before kubelet starts:: + + apiVersion: node.eks.aws/v1alpha1 + kind: NodeConfig + spec: + kubelet: + config: + memorySwapBehavior: LimitedSwap + failSwapOn: false + + GKE equivalent: linuxConfig.swapConfig + kubeletConfig.memorySwapBehavior + via --system-config-from-file, already implemented in + _create_benchmark_node_pool. + + See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 + """ + logging.warning( + "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is " + "deferred (blocked on PR #6780 — SwapConfigSpec). " + "EKS nodes will use default kubelet swap settings until that PR merges." + ) + - time.sleep(15) +def _deploy_daemonset() -> None: + """Apply the swap-infra DaemonSet manifest to the cluster. + + The DaemonSet is intentionally lean: it only verifies the node-level swap + device is active (configured via linuxConfig.swapConfig on GKE or + kubelet-config.json on EKS) and writes /tmp/pkb_ready. No benchmark + tooling is installed here — workloads are delegated to existing PKB + benchmark modules (kubernetes_fio, kubernetes_redis_memtier, etc.) which + manage their own tool installs inside separate benchmark pods. 
+ + Uses kubernetes_commands.ApplyManifest to render the Jinja2 template from + perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 and + apply it via kubectl — the standard PKB pattern for deploying manifests. + """ + kubernetes_commands.ApplyManifest( + "cluster/swap_encryption_daemonset.yaml.j2", + ds_name=_DS_NAME, + ds_namespace=_DS_NAMESPACE, + ds_label=_DS_LABEL, + benchmark_nodepool=_BENCHMARK_NODEPOOL, + image=_DAEMONSET_IMAGE.value, + ) + logging.info("[swap_encryption] Swap-infra DaemonSet applied") + + +def _wait_for_benchmark_pod(timeout: int = 600) -> str | None: + """Wait until the swap-infra DaemonSet pod is Running AND swap is active. + + The DaemonSet installs fio and a small set of measurement tools then + verifies the swap device before writing /tmp/pkb_ready (~1-2 min on a + cold apt cache). Default timeout 600 s covers worst-case APT latency + on a freshly-started node. + + Uses tab-separated name/phase output so kubectl always exits 0 regardless + of whether any pods are present, avoiding jsonpath index errors. 
+ """ + deadline = time.time() + timeout + last_phase = "" + ready_pod = None # pod name once phase == Running + + while time.time() < deadline: + # ── Step 1: wait for Running phase ────────────────────────────────────── + if ready_pod is None: + out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "pods", + "-l", + f"app={_DS_LABEL}", + "-n", + _DS_NAMESPACE, + "-o", + ( + r"jsonpath={range" + r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' + ), + ], + raise_on_failure=False, + ) + + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split("\t") + if len(parts) == 2: + pod_name, phase = parts[0].strip(), parts[1].strip() + if phase == "Running": + logging.info( + "[swap_encryption] Pod %s is Running – " + "waiting for swap device readiness sentinel...", + pod_name, + ) + ready_pod = pod_name + break + if phase != last_phase: + logging.info( + "[swap_encryption] Pod %s phase: %s", + pod_name, + phase, + ) + last_phase = phase + if phase in ("Pending",): + _log_pod_events(pod_name) + else: + logging.info( + "[swap_encryption] Waiting for DaemonSet pod to appear..." + ) + + # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── + if ready_pod is not None: + sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( + [ + "exec", + ready_pod, + "-n", + _DS_NAMESPACE, + "--", + "test", + "-f", + "/tmp/pkb_ready", + ], + raise_on_failure=False, + ) + if sentinel_rc == 0: + logging.info( + "[swap_encryption] Pod %s ready (swap device active)", + ready_pod, + ) + return ready_pod + # "container not found" means the container crashed (CrashLoopBackOff or + # exited) — treat it as a hard reset: re-check pod phase on next iteration. 
+ if ( + "container not found" in sentinel_err + or "unable to upgrade connection" in sentinel_err + ): + logging.warning( + "[swap_encryption] Pod %s: container not running (%s) " + "— will re-check pod state", + ready_pod, + sentinel_err.strip(), + ) + ready_pod = None + last_phase = "" + else: + logging.info( + "[swap_encryption] Pod %s: still installing tools...", + ready_pod, + ) + + time.sleep(15) - logging.warning( - '[swap_encryption] Benchmark pod not ready after %ds', timeout - ) - return None + logging.warning( + "[swap_encryption] Benchmark pod not ready after %ds", timeout + ) + return None def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand( - [ - 'describe', - 'pod', - pod_name, - '-n', - _DS_NAMESPACE, - ], - raise_on_failure=False, - ) - # Only log the Events section to keep output manageable - in_events = False - lines = [] - for line in events_out.splitlines(): - if line.startswith('Events:'): - in_events = True - if in_events: - lines.append(line) - if lines: - logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])) - else: - logging.info( - '[swap_encryption] kubectl describe output:\n%s', - events_out[-2000:] if len(events_out) > 2000 else events_out, + """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand( + [ + "describe", + "pod", + pod_name, + "-n", + _DS_NAMESPACE, + ], + raise_on_failure=False, ) + # Only log the Events section to keep output manageable + in_events = False + lines = [] + for line in events_out.splitlines(): + if line.startswith("Events:"): + in_events = True + if in_events: + lines.append(line) + if lines: + logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30])) + else: + logging.info( + "[swap_encryption] kubectl describe output:\n%s", + events_out[-2000:] if len(events_out) > 2000 else 
events_out, + ) def _delete_daemonset() -> None: - """Delete the benchmark DaemonSet.""" - kubectl.RunKubectlCommand( - [ - 'delete', - 'daemonset', - _DS_NAME, - '-n', - _DS_NAMESPACE, - '--ignore-not-found', - ], - raise_on_failure=False, - ) - logging.info('[swap_encryption] DaemonSet deleted') + """Delete the benchmark DaemonSet.""" + kubectl.RunKubectlCommand( + [ + "delete", + "daemonset", + _DS_NAME, + "-n", + _DS_NAMESPACE, + "--ignore-not-found", + ], + raise_on_failure=False, + ) + logging.info("[swap_encryption] DaemonSet deleted") _HYPERDISK_MAX_IOPS_PER_MBPS = ( @@ -821,475 +883,495 @@ def _delete_daemonset() -> None: def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: - """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. - - Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed - 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails - with "Requested provisioned throughput is too low for the provisioned iops". - Clamp throughput UP to the minimum the requested IOPS need (plus a small - margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk - creation. - """ - min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) - if throughput < min_tput: - logging.warning( - '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for ' - '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d', - throughput, - iops, - min_tput, - min_tput, - ) - return min_tput - return throughput + """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. + + Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed + 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails + with "Requested provisioned throughput is too low for the provisioned iops". 
+ Clamp throughput UP to the minimum the requested IOPS need (plus a small + margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk + creation. + """ + min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) + if throughput < min_tput: + logging.warning( + "[swap_encryption] boot/swap disk throughput %d MiB/s is too low" + " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);" + " raising to %d", + throughput, + iops, + min_tput, + min_tput, + ) + return min_tput + return throughput def _create_benchmark_node_pool(cluster) -> None: - """Add the benchmark nodepool to the existing cluster (Step 2 of setup). - - Uses: - --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default COS_CONTAINERD) - --swap_encryption_boot_disk_iops (default 80000) - --swap_encryption_enable_dmcrypt (default True) - - The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet - nodeSelector targets it exclusively. dm-crypt swap setup is performed - from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / - _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata - because GKE reserves that metadata key and rejects it at the API level. - """ - machine_type = _BENCHMARK_MACHINE_TYPE.value - # Auto-detect LSSD from machine type name; flag overrides only when True. - is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower() - - # Determine zone/region from the cluster object. - # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). - # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on - # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk - # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable - # headroom and matches the Config 2 spec in the Engineer Assignments table). 
- disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value - - disk_type = _BOOT_DISK_TYPE.value - - # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region, - # and auth token refresh. GkeCluster._GcloudCommand also handles the - # zone → region promotion for multi-zone / regional clusters. - cmd = cluster._GcloudCommand( - 'container', - 'node-pools', - 'create', - _BENCHMARK_NODEPOOL, - '--cluster', - cluster.name, - ) - cmd.flags['machine-type'] = machine_type - cmd.flags['image-type'] = _NODE_IMAGE_TYPE.value - cmd.flags['disk-type'] = disk_type - cmd.flags['disk-size'] = disk_size_gb - cmd.flags['num-nodes'] = 1 - cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}' - cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair'] - - # IOPS and throughput provisioning only applies to hyperdisk-* types AND - # only when the boot disk is also the swap device (non-LSSD configs). - # For LSSD machines the boot disk is OS-only; swap is on local NVMe. - # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the - # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). - if disk_type.startswith('hyperdisk') and not is_lssd: - # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by - # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks). - cmd.flags['boot-disk-provisioned-iops'] = _BOOT_DISK_IOPS.value - cmd.flags['boot-disk-provisioned-throughput'] = _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + """Add the benchmark nodepool to the existing cluster (Step 2 of setup). + + Uses: + --swap_encryption_benchmark_machine_type (default n4-highmem-32) + --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_boot_disk_iops (default 80000) + --swap_encryption_enable_dmcrypt (default True) + + The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet + nodeSelector targets it exclusively. 
dm-crypt swap setup is performed + from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / + _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata + because GKE reserves that metadata key and rejects it at the API level. + """ + machine_type = _BENCHMARK_MACHINE_TYPE.value + # Auto-detect LSSD from machine type name; flag overrides only when True. + is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower() + + # Determine zone/region from the cluster object. + # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). + # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on + # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk + # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable + # headroom and matches the Config 2 spec in the Engineer Assignments table). + disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value + + disk_type = _BOOT_DISK_TYPE.value + + # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region, + # and auth token refresh. GkeCluster._GcloudCommand also handles the + # zone → region promotion for multi-zone / regional clusters. + cmd = cluster._GcloudCommand( + "container", + "node-pools", + "create", + _BENCHMARK_NODEPOOL, + "--cluster", + cluster.name, ) + cmd.flags["machine-type"] = machine_type + cmd.flags["image-type"] = _NODE_IMAGE_TYPE.value + cmd.flags["disk-type"] = disk_type + cmd.flags["disk-size"] = disk_size_gb + cmd.flags["num-nodes"] = 1 + cmd.flags["node-labels"] = f"pkb_nodepool={_BENCHMARK_NODEPOOL}" + cmd.args += ["--no-enable-autoupgrade", "--no-enable-autorepair"] + + # IOPS and throughput provisioning only applies to hyperdisk-* types AND + # only when the boot disk is also the swap device (non-LSSD configs). + # For LSSD machines the boot disk is OS-only; swap is on local NVMe. 
+ # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the + # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). + if disk_type.startswith("hyperdisk") and not is_lssd: + # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by + # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks). + cmd.flags["boot-disk-provisioned-iops"] = _BOOT_DISK_IOPS.value + cmd.flags["boot-disk-provisioned-throughput"] = ( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ) - # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm - # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). - if is_lssd: - cmd.flags['local-nvme-ssd-block'] = f'count={_LSSD_COUNT.value}' - - # ── GKE kubelet swap config ─────────────────────────────────────────────── - # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark - # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap - # so that the kubelet allocates swap to the DaemonSet pod. Without this flag - # the Linux kernel swap device may exist but the kubelet blocks pod-level - # swap usage and the benchmark pod cannot drive swap I/O. - # - # Passed as --system-config-from-file pointing to a temp YAML, which is the - # same mechanism PKB's gke_node_system_config flag uses: - # perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py - swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value - system_config_tmp = None - if swap_behavior: - # Build system-config YAML for --system-config-from-file. - # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): - # kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods. - # linuxConfig.swapConfig: GKE enables node-level swap device. - # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use - # the local NVMe as the swap device (avoids boot-disk overhead). 
- # linuxConfig.sysctl: swap aggressiveness tuning so the benchmark - # workloads can drive sustained swap I/O. - # Reference: - # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ - # node-memory-swap#enable + # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm + # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). if is_lssd: - swap_config_block = ( - ' swapConfig:\n' - ' enabled: true\n' - ' dedicatedLocalSsdProfile:\n' - f' diskCount: {_LSSD_COUNT.value}\n' - ) - else: - swap_config_block = ' swapConfig:\n enabled: true\n' - kubelet_yaml = ( - f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\nlinuxConfig:\n' - + swap_config_block - + ' sysctl:\n' - ' vm.min_free_kbytes: 200\n' - ' vm.watermark_scale_factor: 500\n' - ' vm.swappiness: 100\n' - ) - system_config_tmp = tempfile.NamedTemporaryFile( - mode='w', suffix='.yaml', delete=False - ) - system_config_tmp.write(kubelet_yaml) - system_config_tmp.flush() - cmd.flags['system-config-from-file'] = system_config_tmp.name + cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}" + + # ── GKE kubelet swap config ─────────────────────────────────────────────── + # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark + # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap + # so that the kubelet allocates swap to the DaemonSet pod. Without this flag + # the Linux kernel swap device may exist but the kubelet blocks pod-level + # swap usage and the benchmark pod cannot drive swap I/O. + # + # Passed as --system-config-from-file pointing to a temp YAML, which is the + # same mechanism PKB's gke_node_system_config flag uses: + # perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py + swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value + system_config_tmp = None + if swap_behavior: + # Build system-config YAML for --system-config-from-file. 
+ # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): + # kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods. + # linuxConfig.swapConfig: GKE enables node-level swap device. + # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use + # the local NVMe as the swap device (avoids boot-disk overhead). + # linuxConfig.sysctl: swap aggressiveness tuning so the benchmark + # workloads can drive sustained swap I/O. + # Reference: + # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ + # node-memory-swap#enable + if is_lssd: + swap_config_block = ( + " swapConfig:\n" + " enabled: true\n" + " dedicatedLocalSsdProfile:\n" + f" diskCount: {_LSSD_COUNT.value}\n" + ) + else: + swap_config_block = " swapConfig:\n enabled: true\n" + kubelet_yaml = ( + "kubeletConfig:\n memorySwapBehavior:" + f" {swap_behavior}\nlinuxConfig:\n" + + swap_config_block + + " sysctl:\n" + " vm.min_free_kbytes: 200\n" + " vm.watermark_scale_factor: 500\n" + " vm.swappiness: 100\n" + ) + system_config_tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) + system_config_tmp.write(kubelet_yaml) + system_config_tmp.flush() + cmd.flags["system-config-from-file"] = system_config_tmp.name + logging.info( + "[swap_encryption] system-config-from-file: " + "kubelet_swap=%s lssd=%s (written to %s):\n%s", + swap_behavior, + is_lssd, + system_config_tmp.name, + kubelet_yaml, + ) + logging.info( - '[swap_encryption] system-config-from-file: ' - 'kubelet_swap=%s lssd=%s (written to %s):\n%s', - swap_behavior, + "[swap_encryption] Creating benchmark nodepool: %s / %s / " + "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / " + "add_swap_disk=%s / kubelet_swap=%s", + _BENCHMARK_NODEPOOL, + machine_type, + _NODE_IMAGE_TYPE.value, + disk_size_gb, + _BOOT_DISK_IOPS.value, + _ENABLE_DMCRYPT.value, is_lssd, - system_config_tmp.name, - kubelet_yaml, - ) - - logging.info( - '[swap_encryption] Creating benchmark nodepool: %s / %s / ' - 'image=%s / 
disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / ' - 'add_swap_disk=%s / kubelet_swap=%s', - _BENCHMARK_NODEPOOL, - machine_type, - _NODE_IMAGE_TYPE.value, - disk_size_gb, - _BOOT_DISK_IOPS.value, - _ENABLE_DMCRYPT.value, - is_lssd, - _ADD_SWAP_DISK.value, - swap_behavior or 'unset', - ) - - # LSSD nodepools take longer to provision than PD-only nodepools because - # GKE must also initialise the local NVMe devices before marking nodes Ready. - # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. - try: - _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) - finally: - if system_config_tmp is not None: - try: - os.unlink(system_config_tmp.name) - except OSError: - pass - - if rc != 0: - # Idempotent prepare: if the nodepool already exists (e.g. re-running - # --run_stage=prepare,run to redeploy the DaemonSet onto an existing - # cluster), reuse it instead of failing. gcloud returns a 409 / - # "Already exists" message in this case. - low = (stderr or '').lower() - if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low: - logging.info( - '[swap_encryption] Benchmark nodepool already exists — ' - 'reusing it (idempotent prepare); proceeding to DaemonSet' - ) - return - raise errors.Benchmarks.RunError( - '[swap_encryption] Failed to create benchmark nodepool ' - f'(rc={rc}): {stderr}' + _ADD_SWAP_DISK.value, + swap_behavior or "unset", ) - logging.info('[swap_encryption] Benchmark nodepool ready') + # LSSD nodepools take longer to provision than PD-only nodepools because + # GKE must also initialise the local NVMe devices before marking nodes Ready. + # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. 
+ try: + _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) + finally: + if system_config_tmp is not None: + try: + os.unlink(system_config_tmp.name) + except OSError: + pass -def _wait_for_benchmark_node(timeout: int = 900) -> None: - """Block until a node labelled pkb_nodepool=benchmark is Ready. - - gcloud container node-pools create returns as soon as the API accepts the - request — the actual node VM may take another 2-4 minutes to boot, join the - cluster, and pass its readiness checks. Deploying the DaemonSet before that - point leaves the pod Pending indefinitely because the nodeSelector finds no - eligible node. - - This function polls kubectl every 15 s until at least one node with - pkb_nodepool=benchmark has Ready=True, then returns. - """ - deadline = time.time() + timeout - logging.info( - '[swap_encryption] Waiting for benchmark node ' - '(pkb_nodepool=benchmark) to be Ready...' - ) - while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'nodes', - '-l', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', - r'jsonpath={range .items[*]}' - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}', - ], - raise_on_failure=False, - ) + if rc != 0: + # Idempotent prepare: if the nodepool already exists (e.g. re-running + # --run_stage=prepare,run to redeploy the DaemonSet onto an existing + # cluster), reuse it instead of failing. gcloud returns a 409 / + # "Already exists" message in this case. 
+ low = (stderr or "").lower() + if ( + "already exists" in low + or "alreadyexists" in low + or "code=409" in low + ): + logging.info( + "[swap_encryption] Benchmark nodepool already exists — " + "reusing it (idempotent prepare); proceeding to DaemonSet" + ) + return + raise errors.Benchmarks.RunError( + "[swap_encryption] Failed to create benchmark nodepool " + f"(rc={rc}): {stderr}" + ) + logging.info("[swap_encryption] Benchmark nodepool ready") - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2 and parts[1].strip() == 'True': - logging.info( - '[swap_encryption] Benchmark node ready: %s', parts[0].strip() - ) - return +def _wait_for_benchmark_node(timeout: int = 900) -> None: + """Block until a node labelled pkb_nodepool=benchmark is Ready. + + gcloud container node-pools create returns as soon as the API accepts the + request — the actual node VM may take another 2-4 minutes to boot, join the + cluster, and pass its readiness checks. Deploying the DaemonSet before that + point leaves the pod Pending indefinitely because the nodeSelector finds no + eligible node. + + This function polls kubectl every 15 s until at least one node with + pkb_nodepool=benchmark has Ready=True, then returns. + """ + deadline = time.time() + timeout logging.info( - '[swap_encryption] Benchmark node not yet Ready — retrying in 15 s...' + "[swap_encryption] Waiting for benchmark node " + "(pkb_nodepool=benchmark) to be Ready..." 
) - time.sleep(15) + while time.time() < deadline: + out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "nodes", + "-l", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + "-o", + r"jsonpath={range .items[*]}" + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}', + ], + raise_on_failure=False, + ) - raise errors.Benchmarks.RunError( - '[swap_encryption] Timed out waiting for benchmark node ' - f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready ' - f'after {timeout}s' - ) + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split("\t") + if len(parts) == 2 and parts[1].strip() == "True": + logging.info( + "[swap_encryption] Benchmark node ready: %s", + parts[0].strip(), + ) + return + logging.info( + "[swap_encryption] Benchmark node not yet Ready — retrying in 15" + " s..." + ) + time.sleep(15) -def _attach_swap_disk(cluster) -> None: - """Create a dedicated hyperdisk and attach it to the benchmark node. - - gcloud container node-pools create --additional-node-disk is not available - in all gcloud SDK versions, so we use gcloud compute to create the disk and - attach it after the node is ready. In GKE the Kubernetes node name is the - same as the GCE instance name, so no translation is needed. - - After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe - nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. - - The disk is named pkb-swap- to avoid name collisions across - concurrent runs. Cleanup deletes it in Cleanup() if it exists. 
- """ - # Resolve zone from cluster - zone = None - if getattr(cluster, 'zones', None): - zone = cluster.zones[0] - elif getattr(cluster, 'region', None): - zone = cluster.region - if not zone: raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot attach swap disk: cluster zone unknown' + "[swap_encryption] Timed out waiting for benchmark node " + f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready " + f"after {timeout}s" ) - project = cluster.project - disk_name = f'pkb-swap-{cluster.name}' - disk_type = _BOOT_DISK_TYPE.value - disk_size_gb = _SWAP_DISK_SIZE_GB.value - - # ── Step 1: get the GCE instance name of the benchmark node ─────────────── - node_out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'nodes', - '-l', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', - 'jsonpath={.items[0].metadata.name}', - ], - raise_on_failure=False, - ) - instance_name = node_out.strip() - if rc != 0 or not instance_name: - raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot find benchmark node for swap disk attach' + +def _attach_swap_disk(cluster) -> None: + """Create a dedicated hyperdisk and attach it to the benchmark node. + + gcloud container node-pools create --additional-node-disk is not available + in all gcloud SDK versions, so we use gcloud compute to create the disk and + attach it after the node is ready. In GKE the Kubernetes node name is the + same as the GCE instance name, so no translation is needed. + + After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe + nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. + + The disk is named pkb-swap- to avoid name collisions across + concurrent runs. Cleanup deletes it in Cleanup() if it exists. 
+ """ + # Resolve zone from cluster + zone = None + if getattr(cluster, "zones", None): + zone = cluster.zones[0] + elif getattr(cluster, "region", None): + zone = cluster.region + if not zone: + raise errors.Benchmarks.RunError( + "[swap_encryption] Cannot attach swap disk: cluster zone unknown" + ) + + project = cluster.project + disk_name = f"pkb-swap-{cluster.name}" + disk_type = _BOOT_DISK_TYPE.value + disk_size_gb = _SWAP_DISK_SIZE_GB.value + + # ── Step 1: get the GCE instance name of the benchmark node ─────────────── + node_out, _, rc = kubectl.RunKubectlCommand( + [ + "get", + "nodes", + "-l", + f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + "-o", + "jsonpath={.items[0].metadata.name}", + ], + raise_on_failure=False, ) - logging.info('[swap_encryption] Benchmark node instance: %s', instance_name) - - # ── Step 2: create the hyperdisk ────────────────────────────────────────── - logging.info( - '[swap_encryption] Creating swap disk %s (%dGiB %s)', - disk_name, - disk_size_gb, - disk_type, - ) - # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project - # and --zone (always zonal — gcloud compute --region creates regional - # resources, which is not what we want for a node-attached swap disk). 
- gcp_res = _GcpZonalResource(project, zone) - create_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'create', disk_name - ) - create_cmd.flags['type'] = disk_type - create_cmd.flags['size'] = f'{disk_size_gb}GB' - create_cmd.args.append('--quiet') - if disk_type.startswith('hyperdisk'): - create_cmd.flags['provisioned-iops'] = _BOOT_DISK_IOPS.value - create_cmd.flags['provisioned-throughput'] = _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + instance_name = node_out.strip() + if rc != 0 or not instance_name: + raise errors.Benchmarks.RunError( + "[swap_encryption] Cannot find benchmark node for swap disk attach" + ) + logging.info("[swap_encryption] Benchmark node instance: %s", instance_name) + + # ── Step 2: create the hyperdisk ────────────────────────────────────────── + logging.info( + "[swap_encryption] Creating swap disk %s (%dGiB %s)", + disk_name, + disk_size_gb, + disk_type, ) - _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}' + # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project + # and --zone (always zonal — gcloud compute --region creates regional + # resources, which is not what we want for a node-attached swap disk). 
+ gcp_res = _GcpZonalResource(project, zone) + create_cmd = gcp_util.GcloudCommand( + gcp_res, "compute", "disks", "create", disk_name ) + create_cmd.flags["type"] = disk_type + create_cmd.flags["size"] = f"{disk_size_gb}GB" + create_cmd.args.append("--quiet") + if disk_type.startswith("hyperdisk"): + create_cmd.flags["provisioned-iops"] = _BOOT_DISK_IOPS.value + create_cmd.flags["provisioned-throughput"] = ( + _valid_hyperdisk_throughput( + _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value + ) + ) + _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f"[swap_encryption] Failed to create swap disk {disk_name}:" + f" {stderr}" + ) - # ── Step 3: attach the disk to the node VM ──────────────────────────────── - logging.info( - '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name - ) - attach_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'instances', 'attach-disk', instance_name - ) - attach_cmd.flags['disk'] = disk_name - attach_cmd.flags['device-name'] = 'pkb-swap' - attach_cmd.args.append('--quiet') - _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to attach swap disk to {instance_name}: ' - f'{stderr}' + # ── Step 3: attach the disk to the node VM ──────────────────────────────── + logging.info( + "[swap_encryption] Attaching swap disk %s to %s", + disk_name, + instance_name, ) - logging.info( - '[swap_encryption] Swap disk attached: %s → %s', disk_name, instance_name - ) - - -def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: - """Detach (if attached) and delete a GCE disk, robustly, with retries. - - Finds the attached instance from the disk's own `users` field rather than - kubectl — kubectl is often unavailable during teardown (cluster being - deleted), which previously left the disk attached and undeletable, so it - leaked. 
Returns True if the disk is gone (deleted or already absent). - """ - for attempt in range(1, 5): - gcp_res = _GcpZonalResource(project, zone) - describe_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'describe', disk_name + attach_cmd = gcp_util.GcloudCommand( + gcp_res, "compute", "instances", "attach-disk", instance_name ) - describe_cmd.flags['format'] = 'value(users)' - users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) + attach_cmd.flags["disk"] = disk_name + attach_cmd.flags["device-name"] = "pkb-swap" + attach_cmd.args.append("--quiet") + _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) if rc != 0: - logging.info( - '[swap_encryption] Swap disk %s not present — nothing to delete', - disk_name, - ) - return True # already gone - user = users.strip() - if user: - inst = user.split('/')[-1] - logging.info( - '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst - ) - detach_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'instances', 'detach-disk', inst - ) - detach_cmd.flags['disk'] = disk_name - detach_cmd.args.append('--quiet') - detach_cmd.Issue(timeout=120, raise_on_failure=False) - delete_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'delete', disk_name + raise errors.Benchmarks.RunError( + f"[swap_encryption] Failed to attach swap disk to {instance_name}: " + f"{stderr}" + ) + logging.info( + "[swap_encryption] Swap disk attached: %s → %s", + disk_name, + instance_name, ) - delete_cmd.args.append('--quiet') - _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) - if drc == 0: - logging.info('[swap_encryption] Swap disk deleted: %s', disk_name) - return True - logging.warning( - '[swap_encryption] Swap disk delete attempt %d/4 failed ' - '(%s); retrying in 10s', - attempt, - derr.strip()[:160], + + +def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: + """Detach (if attached) and delete a GCE disk, robustly, with retries. 
+ + Finds the attached instance from the disk's own `users` field rather than + kubectl — kubectl is often unavailable during teardown (cluster being + deleted), which previously left the disk attached and undeletable, so it + leaked. Returns True if the disk is gone (deleted or already absent). + """ + for attempt in range(1, 5): + gcp_res = _GcpZonalResource(project, zone) + describe_cmd = gcp_util.GcloudCommand( + gcp_res, "compute", "disks", "describe", disk_name + ) + describe_cmd.flags["format"] = "value(users)" + users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) + if rc != 0: + logging.info( + "[swap_encryption] Swap disk %s not present — nothing to" + " delete", + disk_name, + ) + return True # already gone + user = users.strip() + if user: + inst = user.split("/")[-1] + logging.info( + "[swap_encryption] Detaching swap disk %s from %s", + disk_name, + inst, + ) + detach_cmd = gcp_util.GcloudCommand( + gcp_res, "compute", "instances", "detach-disk", inst + ) + detach_cmd.flags["disk"] = disk_name + detach_cmd.args.append("--quiet") + detach_cmd.Issue(timeout=120, raise_on_failure=False) + delete_cmd = gcp_util.GcloudCommand( + gcp_res, "compute", "disks", "delete", disk_name + ) + delete_cmd.args.append("--quiet") + _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) + if drc == 0: + logging.info("[swap_encryption] Swap disk deleted: %s", disk_name) + return True + logging.warning( + "[swap_encryption] Swap disk delete attempt %d/4 failed " + "(%s); retrying in 10s", + attempt, + derr.strip()[:160], + ) + time.sleep(10) + logging.error( + "[swap_encryption] Could NOT delete swap disk %s after retries " + "— delete it manually: gcloud compute disks delete %s " + "--zone %s --quiet", + disk_name, + disk_name, + zone, ) - time.sleep(10) - logging.error( - '[swap_encryption] Could NOT delete swap disk %s after retries ' - '— delete it manually: gcloud compute disks delete %s ' - '--zone %s --quiet', - disk_name, - disk_name, 
- zone, - ) - return False + return False def _detach_and_delete_swap_disk(cluster) -> None: - """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" - zone = None - if getattr(cluster, 'zones', None): - zone = cluster.zones[0] - elif getattr(cluster, 'region', None): - zone = cluster.region - if not zone or not getattr(cluster, 'project', None): - return - _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone) + """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" + zone = None + if getattr(cluster, "zones", None): + zone = cluster.zones[0] + elif getattr(cluster, "region", None): + zone = cluster.region + if not zone or not getattr(cluster, "project", None): + return + _delete_disk_by_name(f"pkb-swap-{cluster.name}", cluster.project, zone) def _delete_default_node_pool(cluster) -> None: - """Delete the dummy default nodepool after the benchmark pool is ready. - - The default nodepool (e2-medium) was only needed to satisfy GKE's - requirement that a cluster must have at least one nodepool at creation time. - Removing it stops the clock on its cost immediately. - """ - # Use PKB's GcloudCommand: auto-injects --project, --zone/--region. - cmd = cluster._GcloudCommand( - 'container', - 'node-pools', - 'delete', - _DEFAULT_NODEPOOL, - '--cluster', - cluster.name, - ) - cmd.args.append('--quiet') - - logging.info( - '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL - ) - _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) - if rc != 0: - logging.warning( - '[swap_encryption] Could not delete default nodepool (rc=%d): %s', - rc, - stderr, + """Delete the dummy default nodepool after the benchmark pool is ready. + + The default nodepool (e2-medium) was only needed to satisfy GKE's + requirement that a cluster must have at least one nodepool at creation time. + Removing it stops the clock on its cost immediately. 
+ """ + # Use PKB's GcloudCommand: auto-injects --project, --zone/--region. + cmd = cluster._GcloudCommand( + "container", + "node-pools", + "delete", + _DEFAULT_NODEPOOL, + "--cluster", + cluster.name, ) - else: - logging.info('[swap_encryption] Default nodepool deleted') + cmd.args.append("--quiet") + logging.info( + "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL + ) + _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) + if rc != 0: + logging.warning( + "[swap_encryption] Could not delete default nodepool (rc=%d): %s", + rc, + stderr, + ) + else: + logging.info("[swap_encryption] Default nodepool deleted") -def _is_pod_gone(pod: str) -> bool: - """Return True if the named pod no longer exists in the cluster. - Used to distinguish OOM-killed container processes (pod still alive, rc=137) - from OOM-evicted pods (pod gone, DaemonSet will create a replacement). - """ - try: - _, err, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pod', - pod, - '-n', - _DS_NAMESPACE, - '-o', - 'jsonpath={.metadata.name}', - ], - raise_on_failure=False, - timeout=15, - ) - return rc != 0 and 'not found' in (err or '').lower() - except Exception: # pylint: disable=broad-except - return False +def _is_pod_gone(pod: str) -> bool: + """Return True if the named pod no longer exists in the cluster. + + Used to distinguish OOM-killed container processes (pod still alive, rc=137) + from OOM-evicted pods (pod gone, DaemonSet will create a replacement). + """ + try: + _, err, rc = kubectl.RunKubectlCommand( + [ + "get", + "pod", + pod, + "-n", + _DS_NAMESPACE, + "-o", + "jsonpath={.metadata.name}", + ], + raise_on_failure=False, + timeout=15, + ) + return rc != 0 and "not found" in (err or "").lower() + except Exception: # pylint: disable=broad-except + return False def _pod_exec( @@ -1299,375 +1381,544 @@ def _pod_exec( timeout: int = 300, _retries: int = 2, ) -> tuple[str, str]: - """Run a shell command inside the benchmark pod via kubectl exec. 
- - Args: - pod: Pod name returned by _wait_for_benchmark_pod. - cmd: Shell command string passed to bash -c. - ignore_failure: When True, non-zero exit codes are logged but not - raised. - timeout: Seconds before PKB kills the kubectl exec process. Default - 300 s matches PKB's IssueCommand default. Pass a larger value for - long-running jobs (fio, stress-ng, kernel build). - _retries: Number of automatic retries on transient GKE websocket - resets ("connection reset by peer"). Set to 0 to disable retries - for idempotent-sensitive commands. - - Returns: - Tuple of (stdout, stderr) strings. - """ - # Use module-level constants for error strings (defined at top of module). - # Use the globally-tracked active pod name — it may have been updated by - # a previous _recover_pod call when eviction replaced the pod. - active = _active_pod[0] if _active_pod else pod - - for attempt in range(_retries + 1): - out, err, rc = kubectl.RunKubectlCommand( - ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd], - raise_on_failure=False, - raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets - timeout=timeout, - ) - is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS) - if is_transient and attempt < _retries: - logging.warning( - '[swap_encryption] kubectl exec connection reset (attempt %d/%d); ' - 'retrying in 10 s', - attempt + 1, - _retries + 1, - ) - time.sleep(10) - continue - # rc=137 (SIGKILL): the OOM killer terminated the container process. - # Two sub-cases: - # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. - # B) Container OOM restart: pod still exists, container restarts in place. - # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, - # tools must be re-installed before subsequent commands can run.) - # In both cases we call _recover_pod to wait for tools + sentinel, and - # we do NOT retry the OOM-triggering command itself. 
- if rc == 137: - # Record the OOM so the run-level gate can flag it even if the container - # restarts in place under the same pod name (which leaves both the - # "pod replaced" and "pod NotFound" checks silent). - if active not in _oom_events: - _oom_events.append(active) - # CRITICAL: sleep before checking pod state. Kubernetes takes a few - # seconds to mark a just-evicted pod as Terminating / NotFound. Without - # this delay _recover_pod sees the pod still in "Running" phase, returns - # the old pod name immediately, and every subsequent command fails with - # "Error from server (NotFound): pods … not found". - logging.warning( - '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update ' - 'pod state before recovery check' - ) - time.sleep(15) - pod_gone = _is_pod_gone(active) - if pod_gone: - logging.warning( - '[swap_encryption] OOM-eviction detected (rc=137, pod gone) —' - ' recovering pod name for subsequent commands (not retrying this' - ' cmd)' + """Run a shell command inside the benchmark pod via kubectl exec. + + Args: + pod: Pod name returned by _wait_for_benchmark_pod. + cmd: Shell command string passed to bash -c. + ignore_failure: When True, non-zero exit codes are logged but not + raised. + timeout: Seconds before PKB kills the kubectl exec process. Default + 300 s matches PKB's IssueCommand default. Pass a larger value for + long-running jobs (fio, stress-ng, kernel build). + _retries: Number of automatic retries on transient GKE websocket + resets ("connection reset by peer"). Set to 0 to disable retries + for idempotent-sensitive commands. + + Returns: + Tuple of (stdout, stderr) strings. + """ + # Use module-level constants for error strings (defined at top of module). + # Use the globally-tracked active pod name — it may have been updated by + # a previous _recover_pod call when eviction replaced the pod. 
+ active = _active_pod[0] if _active_pod else pod + + for attempt in range(_retries + 1): + out, err, rc = kubectl.RunKubectlCommand( + ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd], + raise_on_failure=False, + raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets + timeout=timeout, ) - else: - logging.warning( - '[swap_encryption] Container OOM-killed (rc=137, pod still exists)' - ' — waiting for container restart and tool re-install before' - ' continuing' + is_transient = rc != 0 and any( + e in err for e in _TRANSIENT_KUBECTL_ERRORS ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - '[swap_encryption] Pod name updated: %s → %s', active, new_pod + if is_transient and attempt < _retries: + logging.warning( + "[swap_encryption] kubectl exec connection reset (attempt" + " %d/%d); retrying in 10 s", + attempt + 1, + _retries + 1, + ) + time.sleep(10) + continue + # rc=137 (SIGKILL): the OOM killer terminated the container process. + # Two sub-cases: + # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. + # B) Container OOM restart: pod still exists, container restarts in place. + # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, + # tools must be re-installed before subsequent commands can run.) + # In both cases we call _recover_pod to wait for tools + sentinel, and + # we do NOT retry the OOM-triggering command itself. + if rc == 137: + # Record the OOM so the run-level gate can flag it even if the container + # restarts in place under the same pod name (which leaves both the + # "pod replaced" and "pod NotFound" checks silent). + if active not in _oom_events: + _oom_events.append(active) + # CRITICAL: sleep before checking pod state. Kubernetes takes a few + # seconds to mark a just-evicted pod as Terminating / NotFound. 
Without + # this delay _recover_pod sees the pod still in "Running" phase, returns + # the old pod name immediately, and every subsequent command fails with + # "Error from server (NotFound): pods … not found". + logging.warning( + "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to" + " update pod state before recovery check" + ) + time.sleep(15) + pod_gone = _is_pod_gone(active) + if pod_gone: + logging.warning( + "[swap_encryption] OOM-eviction detected (rc=137, pod gone)" + " — recovering pod name for subsequent commands (not" + " retrying this cmd)" + ) + else: + logging.warning( + "[swap_encryption] Container OOM-killed (rc=137, pod still" + " exists) — waiting for container restart and tool" + " re-install before continuing" + ) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info( + "[swap_encryption] Pod name updated: %s → %s", + active, + new_pod, + ) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + + is_container_gone = rc != 0 and any( + e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS + ) + if is_container_gone: + # Record the loss for the run-level degradation gate REGARDLESS of retry + # budget or ignore_failure. A "pods … not found" on a best-effort command + # (kernel build, opensearch, cleanup of a dead pod) still means the pod + # died; without this the gate stays blind because _active_pod is only + # renamed on the retry path below, which _retries=0 callers never reach. 
+ if active and active not in _pod_lost: + _pod_lost.append(active) + logging.error( + "[swap_encryption] Benchmark pod %s is gone (%s) —" + " recording run as degraded", + active, + (err or "").strip()[:160], + ) + if attempt < _retries: + logging.warning( + "[swap_encryption] Container gone/restarting (attempt" + " %d/%d) — waiting for pod to recover...", + attempt + 1, + _retries + 1, + ) + new_pod = _recover_pod(active) + if new_pod != active: + logging.info( + "[swap_encryption] Pod name updated: %s → %s", + active, + new_pod, + ) + if _active_pod: + _active_pod[0] = new_pod + active = new_pod + continue + break + + if rc != 0 and not ignore_failure: + raise errors.VmUtil.IssueCommandError( + f"[swap_encryption] _pod_exec failed (rc={rc}): {err}" ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + return out, err - is_container_gone = rc != 0 and any( - e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS + +def _recover_pod(pod: str, timeout_sec: int = 600) -> str: + """Wait for a DaemonSet container to recover after OOM kill or eviction. + + Handles two scenarios: + 1. Container OOM restart: same pod name, container restarting in place. + DaemonSet restartPolicy=Always brings it back under the same pod name. + 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates + a new pod with a DIFFERENT name. We detect this by checking whether + the named pod still exists; if not, we search by the DaemonSet label + selector for a Running pod. + + Returns the (possibly new) pod name once it is Running and ready. + """ + deadline = time.time() + timeout_sec + logging.info( + "[swap_encryption] Waiting for pod %s to recover (up to %ds)...", + pod, + timeout_sec, ) - if is_container_gone: - # Record the loss for the run-level degradation gate REGARDLESS of retry - # budget or ignore_failure. 
A "pods … not found" on a best-effort command - # (kernel build, opensearch, cleanup of a dead pod) still means the pod - # died; without this the gate stays blind because _active_pod is only - # renamed on the retry path below, which _retries=0 callers never reach. - if active and active not in _pod_lost: - _pod_lost.append(active) - logging.error( - '[swap_encryption] Benchmark pod %s is gone (%s) — recording run ' - 'as degraded', - active, - (err or '').strip()[:160], + + # Phase 1: wait for a Running pod — either the named one (container + # restart) or a replacement pod found via label selector (eviction). + # + # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a + # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp + # (the pod is "Terminating") while status.phase may still read "Running" for + # several seconds. Checking only status.phase causes a false-positive: we + # return the old pod name immediately and every subsequent command fails with + # "Error from server (NotFound)". Checking deletionTimestamp catches this. + recovered_pod = pod + while time.time() < deadline: + # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not + # stdout. When the pod is gone, status_out is empty and the error text + # lives entirely in status_err. Discarding stderr (using _) means the + # 'not found' check below never fires and we spin until deadline. 
+ status_out, status_err, status_rc = kubectl.RunKubectlCommand( + [ + "get", + "pod", + pod, + "-n", + _DS_NAMESPACE, + "-o", + "jsonpath={.status.phase}|{.metadata.deletionTimestamp}", + ], + raise_on_failure=False, + timeout=30, ) - if attempt < _retries: - logging.warning( - '[swap_encryption] Container gone/restarting (attempt %d/%d) — ' - 'waiting for pod to recover...', - attempt + 1, - _retries + 1, + # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) + fields = status_out.strip().split("|") + phase = fields[0].strip() if fields else "" + is_terminating = len(fields) > 1 and bool(fields[1].strip()) + + # Pod is genuinely Running and NOT being deleted — recovery complete. + if status_rc == 0 and phase == "Running" and not is_terminating: + break + + # Pod no longer exists, OR it exists but is being terminated (Terminating + # state or deletionTimestamp set) — look for a replacement pod by label. + pod_gone_or_terminating = ( + status_rc != 0 and "not found" in (status_out + status_err).lower() + ) or is_terminating + if pod_gone_or_terminating: + label_out, _, label_rc = kubectl.RunKubectlCommand( + [ + "get", + "pods", + "-n", + _DS_NAMESPACE, + "-l", + f"app={_DS_LABEL}", + "-o", + ( + 'jsonpath={range .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}' + ), + ], + raise_on_failure=False, + timeout=30, + ) + new_pods = [ + p.strip() + for p in label_out.strip().splitlines() + if p.strip() and p.strip() != pod + ] # exclude the dying pod + if label_rc == 0 and new_pods: + recovered_pod = new_pods[0] + logging.info( + "[swap_encryption] Original pod %s gone/terminating; " + "found replacement %s", + pod, + recovered_pod, + ) + break + + time.sleep(10) + else: + raise errors.VmUtil.IssueCommandError( + f"[swap_encryption] No Running pod found (original: {pod}) " + f"within {timeout_sec}s after OOM kill / eviction" + ) + + # Phase 2: wait for init script to finish (sentinel written last). 
+ while time.time() < deadline: + ready_out, _, ready_rc = kubectl.RunKubectlCommand( + [ + "exec", + recovered_pod, + "-n", + _DS_NAMESPACE, + "--", + "bash", + "-c", + "test -f /tmp/pkb_ready && echo READY", + ], + raise_on_failure=False, + timeout=30, ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - '[swap_encryption] Pod name updated: %s → %s', active, new_pod - ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - continue - break - - if rc != 0 and not ignore_failure: + if ready_rc == 0 and "READY" in ready_out: + logging.info( + "[swap_encryption] Pod %s recovered (swap device active)", + recovered_pod, + ) + return recovered_pod + time.sleep(15) + raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] _pod_exec failed (rc={rc}): {err}' + f"[swap_encryption] Pod {recovered_pod} did not become ready " + f"within {timeout_sec}s after OOM kill / eviction" ) - return out, err -def _recover_pod(pod: str, timeout_sec: int = 600) -> str: - """Wait for a DaemonSet container to recover after OOM kill or eviction. - - Handles two scenarios: - 1. Container OOM restart: same pod name, container restarting in place. - DaemonSet restartPolicy=Always brings it back under the same pod name. - 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates - a new pod with a DIFFERENT name. We detect this by checking whether - the named pod still exists; if not, we search by the DaemonSet label - selector for a Running pod. - - Returns the (possibly new) pod name once it is Running and ready. - """ - deadline = time.time() + timeout_sec - logging.info( - '[swap_encryption] Waiting for pod %s to recover (up to %ds)...', - pod, - timeout_sec, - ) - - # Phase 1: wait for a Running pod — either the named one (container - # restart) or a replacement pod found via label selector (eviction). - # - # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a - # single call. 
When a pod is evicted, Kubernetes first sets deletionTimestamp - # (the pod is "Terminating") while status.phase may still read "Running" for - # several seconds. Checking only status.phase causes a false-positive: we - # return the old pod name immediately and every subsequent command fails with - # "Error from server (NotFound)". Checking deletionTimestamp catches this. - recovered_pod = pod - while time.time() < deadline: - # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not - # stdout. When the pod is gone, status_out is empty and the error text - # lives entirely in status_err. Discarding stderr (using _) means the - # 'not found' check below never fires and we spin until deadline. - status_out, status_err, status_rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pod', - pod, - '-n', - _DS_NAMESPACE, - '-o', - 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}', - ], - raise_on_failure=False, +def _run_phase1_fio( + pod: str, swap_dev: str, base_meta: dict[str, Any] +) -> list[sample.Sample]: + """Run fio microbenchmarks on the raw swap block device (Phase 1). + + Calls swapoff before running fio so measurements reflect the raw + hardware + encryption ceiling with no swap-daemon overhead. Re-enables + swap unconditionally after all jobs complete. + + Jobs: + 4k_randread iodepth=32 → random read IOPS + 4k_randwrite iodepth=32 → random write IOPS + 1m_seqread iodepth=8 → sequential read bandwidth + 1m_seqwrite iodepth=8 → sequential write bandwidth + 4k_lat_read iodepth=1 → completion latency floor (read) + + Args: + pod: Benchmark pod name. + swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted. + base_meta: Shared metadata dict from _build_metadata(). + + Returns: + List of Sample objects with IOPS, bandwidth and latency metrics. 
+ """ + samples: list[sample.Sample] = [] + + # swapoff before fio — running fio with --direct=1 on an active swap + # device races with kernel page-reclaim on the same dm-crypt target + # and can cause kernel panics on some kernels. + logging.info("[swap_encryption] Phase 1: swapoff %s", swap_dev) + _pod_exec( + pod, + f"swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true", timeout=30, + ignore_failure=True, ) - # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) - fields = status_out.strip().split('|') - phase = fields[0].strip() if fields else '' - is_terminating = len(fields) > 1 and bool(fields[1].strip()) - - # Pod is genuinely Running and NOT being deleted — recovery complete. - if status_rc == 0 and phase == 'Running' and not is_terminating: - break - - # Pod no longer exists, OR it exists but is being terminated (Terminating - # state or deletionTimestamp set) — look for a replacement pod by label. - pod_gone_or_terminating = ( - status_rc != 0 and 'not found' in (status_out + status_err).lower() - ) or is_terminating - if pod_gone_or_terminating: - label_out, _, label_rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pods', - '-n', - _DS_NAMESPACE, - '-l', - f'app={_DS_LABEL}', - '-o', - ( - 'jsonpath={range .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}' - ), - ], - raise_on_failure=False, - timeout=30, - ) - new_pods = [ - p.strip() - for p in label_out.strip().splitlines() - if p.strip() and p.strip() != pod - ] # exclude the dying pod - if label_rc == 0 and new_pods: - recovered_pod = new_pods[0] - logging.info( - '[swap_encryption] Original pod %s gone/terminating; ' - 'found replacement %s', + + # (name, rw_mode, block_size, iodepth) + fio_jobs = [ + ("4k_randread", "randread", "4k", 32), + ("4k_randwrite", "randwrite", "4k", 32), + ("1m_seqread", "read", "1m", 8), + ("1m_seqwrite", "write", "1m", 8), + ("4k_lat_read", "randread", "4k", 1), + ] + + runtime = _FIO_RUNTIME_SEC.value + try: + 
for name, rw, bs, iodepth in fio_jobs: + cmd = ( + f"fio --name={name} --filename={swap_dev}" + f" --rw={rw} --bs={bs} --iodepth={iodepth}" + " --ioengine=libaio --direct=1" + f" --runtime={runtime} --time_based --group_reporting" + " --output-format=json 2>/dev/null" + ) + logging.info("[swap_encryption] Phase 1: fio job %s", name) + out, _ = _pod_exec(pod, cmd, timeout=runtime + 120) + samples += _parse_fio_json(out, name, base_meta) + finally: + # Always re-enable swap so subsequent phases can drive swap I/O. + logging.info("[swap_encryption] Phase 1: swapon %s", swap_dev) + _pod_exec( pod, - recovered_pod, + f"swapon {swap_dev} 2>/dev/null || true", + timeout=30, + ignore_failure=True, ) - break - time.sleep(10) - else: - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] No Running pod found (original: {pod}) ' - f'within {timeout_sec}s after OOM kill / eviction' + logging.info( + "[swap_encryption] Phase 1 complete (%d samples)", len(samples) ) + return samples - # Phase 2: wait for init script to finish (sentinel written last). - while time.time() < deadline: - ready_out, _, ready_rc = kubectl.RunKubectlCommand( - [ - 'exec', - recovered_pod, - '-n', - _DS_NAMESPACE, - '--', - 'bash', - '-c', - 'test -f /tmp/pkb_ready && echo READY', - ], - raise_on_failure=False, - timeout=30, - ) - if ready_rc == 0 and 'READY' in ready_out: - logging.info( - '[swap_encryption] Pod %s recovered and ready', recovered_pod - ) - return recovered_pod - time.sleep(15) - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] Pod {recovered_pod} did not become ready ' - f'within {timeout_sec}s after OOM kill / eviction' - ) +def _parse_fio_json( + fio_output: str, job_name: str, base_meta: dict[str, Any] +) -> list[sample.Sample]: + """Parse fio --output-format=json output into PKB Sample objects. + + Extracts per-direction (read/write) IOPS, bandwidth (MB/s) and completion + latency (mean + p50/p99/p999 percentiles). 
+ + Args: + fio_output: Raw stdout from fio with --output-format=json. + job_name: Short identifier embedded in metric names, e.g. '4k_randread'. + base_meta: Shared metadata dict copied into each sample. + + Returns: + List of Sample objects; empty if output cannot be parsed or is zero. + """ + # fio sometimes emits kernel warnings before the JSON object. + json_start = fio_output.find("{") + if json_start == -1: + logging.warning( + "[swap_encryption] Phase 1: no JSON in fio output for %s", job_name + ) + return [] + + try: + data = json.loads(fio_output[json_start:]) + except json.JSONDecodeError as e: + logging.warning( + "[swap_encryption] Phase 1: fio JSON parse error (%s): %s", + job_name, + e, + ) + return [] + + jobs = data.get("jobs", []) + if not jobs: + return [] + + job = jobs[0] + samples: list[sample.Sample] = [] + meta = dict(base_meta, fio_job=job_name) + + for direction in ("read", "write"): + d = job.get(direction, {}) + iops = float(d.get("iops", 0)) + bw_kbps = float(d.get("bw", 0)) # fio reports KiB/s + bw_mbps = bw_kbps / 1024.0 + + # Skip directions with near-zero throughput (e.g. write on a randread job). + if iops < 1 and bw_kbps < 1: + continue + + prefix = f"phase1_fio_{job_name}_{direction}" + samples.append(sample.Sample(f"{prefix}_iops", iops, "IOPS", meta)) + samples.append( + sample.Sample(f"{prefix}_bw_mbps", bw_mbps, "MB/s", meta) + ) + + # Completion latency — fio reports nanoseconds; emit microseconds. 
+ clat = d.get("clat_ns", d.get("lat_ns", {})) + lat_mean_ns = float(clat.get("mean", 0)) + if lat_mean_ns > 0: + samples.append( + sample.Sample( + f"{prefix}_lat_mean_us", lat_mean_ns / 1000.0, "us", meta + ) + ) + for pct_key, label in ( + ("50.000000", "p50"), + ("99.000000", "p99"), + ("99.900000", "p999"), + ): + val_ns = clat.get("percentile", {}).get(pct_key, 0) + if val_ns: + samples.append( + sample.Sample( + f"{prefix}_lat_{label}_us", + val_ns / 1000.0, + "us", + meta, + ) + ) + + return samples _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) - 'c4-standard-8-lssd': 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD - 'c4-standard-8': 0.5008, # 8 vCPU, 32 GB RAM, no LSSD - 'n4-highmem-32': 3.0256, # 32 vCPU, 256 GB RAM - 'n2-highmem-32': 2.5216, # 32 vCPU, 256 GB RAM - 'n2-standard-32': 1.5264, # 32 vCPU, 120 GB RAM - 'z3-highmem-8': 2.7248, # 8 vCPU + 4× LSSD + "c4-standard-8-lssd": 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD + "c4-standard-8": 0.5008, # 8 vCPU, 32 GB RAM, no LSSD + "n4-highmem-32": 3.0256, # 32 vCPU, 256 GB RAM + "n2-highmem-32": 2.5216, # 32 vCPU, 256 GB RAM + "n2-standard-32": 1.5264, # 32 vCPU, 120 GB RAM + "z3-highmem-8": 2.7248, # 8 vCPU + 4× LSSD # AWS - 'i4i.4xlarge': 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store - 'i4i.2xlarge': 0.7480, - 'm6id.4xlarge': 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store - 'm6i.4xlarge': 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store - 'r6i.4xlarge': 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store + "i4i.4xlarge": 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store + "i4i.2xlarge": 0.7480, + "m6id.4xlarge": 0.9072, # 16 vCPU, 64 GB RAM, NVMe Instance Store + "m6i.4xlarge": 0.7680, # 16 vCPU, 64 GB RAM, no Instance Store + "r6i.4xlarge": 1.0080, # 16 vCPU, 128 GB RAM, no Instance Store } def _collect_cost_sample( pod: str, elapsed_sec: float, base_meta: dict ) -> list[sample.Sample]: - """Emit a cost_estimate_usd sample for the benchmark run (gap 7). 
- - Instance type is read from cloud metadata inside the pod. Price is looked - up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and - a warning is logged. - - Args: - pod: Benchmark pod name. - elapsed_sec: Wall-clock seconds the benchmark phases took. - base_meta: Shared metadata dict. - - Returns: - A list of zero or one sample.Sample. - """ - # Detect instance type from cloud metadata - instance_type = '' - - # GCP: machine type is the last segment of the metadata URL value - gcp_type_out, _ = _pod_exec( - pod, - 'curl -s -m 3 --fail' - ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_type_out.strip(): - instance_type = gcp_type_out.strip().split('/')[-1] - - if not instance_type: - # AWS: instance-type is a plain string - aws_type_out, _ = _pod_exec( + """Emit a cost_estimate_usd sample for the benchmark run (gap 7). + + Instance type is read from cloud metadata inside the pod. Price is looked + up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and + a warning is logged. + + Args: + pod: Benchmark pod name. + elapsed_sec: Wall-clock seconds the benchmark phases took. + base_meta: Shared metadata dict. + + Returns: + A list of zero or one sample.Sample. + """ + # Detect instance type from cloud metadata + instance_type = "" + + # GCP: machine type is the last segment of the metadata URL value + gcp_type_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', + "curl -s -m 3 --fail" + " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', ignore_failure=True, ) - instance_type = aws_type_out.strip() - - # Allow explicit override (useful when running on custom/renamed machine - # types or when the pod was unavailable during cost collection). 
- if _INSTANCE_SIZE_LABEL.value: - instance_type = _INSTANCE_SIZE_LABEL.value - - # Last resort: fall back to the benchmark machine type flag. This ensures - # cost tracking works even when the pod was evicted before cost collection - # ran (in which case the metadata curl above returned empty). - if not instance_type and _BENCHMARK_MACHINE_TYPE.value: - instance_type = _BENCHMARK_MACHINE_TYPE.value - logging.info( - '[swap_encryption] Instance type from metadata unavailable; ' - 'using --swap_encryption_benchmark_machine_type=%s for cost tracking', - instance_type, - ) + if gcp_type_out.strip(): + instance_type = gcp_type_out.strip().split("/")[-1] - price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) - if price is None: - logging.warning( - '[swap_encryption] Unknown instance type "%s" – skipping cost sample. ' - 'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.', - instance_type, - ) - return [] + if not instance_type: + # AWS: instance-type is a plain string + aws_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail " + "http://169.254.169.254/latest/meta-data/instance-type " + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_type = aws_type_out.strip() + + # Allow explicit override (useful when running on custom/renamed machine + # types or when the pod was unavailable during cost collection). + if _INSTANCE_SIZE_LABEL.value: + instance_type = _INSTANCE_SIZE_LABEL.value + + # Last resort: fall back to the benchmark machine type flag. This ensures + # cost tracking works even when the pod was evicted before cost collection + # ran (in which case the metadata curl above returned empty). 
+ if not instance_type and _BENCHMARK_MACHINE_TYPE.value: + instance_type = _BENCHMARK_MACHINE_TYPE.value + logging.info( + "[swap_encryption] Instance type from metadata unavailable; using" + " --swap_encryption_benchmark_machine_type=%s for cost tracking", + instance_type, + ) - hours = elapsed_sec / 3600.0 - cost = hours * price - meta = dict( - base_meta, - instance_type=instance_type, - price_usd_per_hr=price, - benchmark_elapsed_sec=round(elapsed_sec, 1), - ) - return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)] + price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) + if price is None: + logging.warning( + '[swap_encryption] Unknown instance type "%s" – skipping cost' + " sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost" + " tracking.", + instance_type, + ) + return [] + + hours = elapsed_sec / 3600.0 + cost = hours * price + meta = dict( + base_meta, + instance_type=instance_type, + price_usd_per_hr=price, + benchmark_elapsed_sec=round(elapsed_sec, 1), + ) + return [sample.Sample("cost_estimate_usd", cost, "USD", meta)] def _detect_swap_device(pod: str) -> str: - """Return the active swap device path on the cluster node.""" - if _SWAP_DEVICE.value: - return _SWAP_DEVICE.value - - # /proc/swaps is the source of truth: it lists the swap device that is - # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, - # because a stale dm-crypt mapping from a previous run on a reused node can - # still exist as a /dev node while being non-functional (fio/swapoff then - # fail with "No such device or address"). So read the active device from - # /proc/swaps first; only fall back to the mapper path if /proc/swaps is - # somehow empty but the mapper is genuinely present. 
- dm_out, _ = _pod_exec( - pod, - textwrap.dedent(""" + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value + + # /proc/swaps is the source of truth: it lists the swap device that is + # ACTUALLY active. We must NOT just `test -e /dev/mapper/swap_encrypted`, + # because a stale dm-crypt mapping from a previous run on a reused node can + # still exist as a /dev node while being non-functional (fio/swapoff then + # fail with "No such device or address"). So read the active device from + # /proc/swaps first; only fall back to the mapper path if /proc/swaps is + # somehow empty but the mapper is genuinely present. + dm_out, _ = _pod_exec( + pod, + textwrap.dedent(""" ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) if [ -n "$ACTIVE" ] then @@ -1677,138 +1928,140 @@ def _detect_swap_device(pod: str) -> str: echo /dev/mapper/swap_encrypted fi """), - ignore_failure=True, - ) - dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' - if dev: - return dev - raise ValueError( - 'No active swap device found in the benchmark pod. ' - 'Use --swap_encryption_device to specify one.' - ) + ignore_failure=True, + ) + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else "" + if dev: + return dev + raise ValueError( + "No active swap device found in the benchmark pod. " + "Use --swap_encryption_device to specify one." 
+ ) def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]: - """Collect node environment, encryption type, and config into a dict.""" - - kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True) - mem_out, _ = _pod_exec( - pod, - "awk '/MemTotal/{print $2}' /proc/meminfo", - ignore_failure=True, - ) - swap_out, _ = _pod_exec( - pod, - "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", - ignore_failure=True, - ) - - try: - mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) - except ValueError: - mem_gb = 0 - try: - swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) - except ValueError: - swap_gb = 0 - - # Encryption type — key off dm-crypt presence + the swap target, NOT the - # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- - # encrypted; only the AWS targets (instance_store / io2) are. - enc = 'unknown' - if '/dev/mapper/' in swap_dev: - table_out, _ = _pod_exec( - pod, - f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', - ignore_failure=True, - ) - enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' - elif _SWAP_TYPE.value in ('instance_store', 'io2'): - enc = 'nitro_hardware_offload' # AWS: encrypted by the Nitro card - elif not _ENABLE_DMCRYPT.value: - enc = 'none' # GKE plain swap (encryption OFF) - - cloud = _detect_cloud(pod) - - # Gap 6: instance size label for multi-size comparison runs. - # If the flag is set use it directly; otherwise try to read it from - # cloud metadata so that the field is always populated. 
- instance_label = _INSTANCE_SIZE_LABEL.value - if not instance_label: - gcp_type_out, _ = _pod_exec( + """Collect node environment, encryption type, and config into a dict.""" + + kernel_out, _ = _pod_exec(pod, "uname -r", ignore_failure=True) + mem_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail' - ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True, ) - if gcp_type_out.strip(): - instance_label = gcp_type_out.strip().split('/')[-1] - if not instance_label: - aws_type_out, _ = _pod_exec( + swap_out, _ = _pod_exec( pod, - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', + "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True, ) - instance_label = aws_type_out.strip() - - return { - 'benchmark': BENCHMARK_NAME, - 'execution_mode': 'kubernetes_privileged_pod', - 'cloud': cloud, - 'instance_size': instance_label, - 'kernel_version': kernel_out.strip(), - 'host_memory_gb': mem_gb, - 'swap_device': swap_dev, - 'swap_size_gb': swap_gb, - 'swap_encryption': enc, - # Test-matrix columns: storage target, encryption on/off, image, IOPS - 'storage_target': _SWAP_TYPE.value, - 'boot_disk_type': _BOOT_DISK_TYPE.value, - 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, - 'node_image_type': _NODE_IMAGE_TYPE.value, - 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, - 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, - # Other config - 'zswap_enabled': _ENABLE_ZSWAP.value, - 'min_free_kbytes': _MIN_FREE_KBYTES.value, - 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, - # Requested config value only. The *effective* stress-ng footprint may - # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the - # actual value it ran with as 'stress_vm_bytes' so the two never conflict. 
- 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, - 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, - 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, - 'nodepool': _NODEPOOL.value, - } + + try: + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 + + # Encryption type — key off dm-crypt presence + the swap target, NOT the + # device path. A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro- + # encrypted; only the AWS targets (instance_store / io2) are. + enc = "unknown" + if "/dev/mapper/" in swap_dev: + table_out, _ = _pod_exec( + pod, + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, + ) + enc = "dm-crypt-plain" if "crypt" in table_out.lower() else "dm-other" + elif _SWAP_TYPE.value in ("instance_store", "io2"): + enc = "nitro_hardware_offload" # AWS: encrypted by the Nitro card + elif not _ENABLE_DMCRYPT.value: + enc = "none" # GKE plain swap (encryption OFF) + + cloud = _detect_cloud(pod) + + # Gap 6: instance size label for multi-size comparison runs. + # If the flag is set use it directly; otherwise try to read it from + # cloud metadata so that the field is always populated. 
+ instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail" + " http://metadata.google.internal/computeMetadata/v1/instance/machine-type" + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split("/")[-1] + if not instance_label: + aws_type_out, _ = _pod_exec( + pod, + "curl -s -m 3 --fail " + "http://169.254.169.254/latest/meta-data/instance-type " + '2>/dev/null || echo ""', + ignore_failure=True, + ) + instance_label = aws_type_out.strip() + + return { + "benchmark": BENCHMARK_NAME, + "execution_mode": "kubernetes_privileged_pod", + "cloud": cloud, + "instance_size": instance_label, + "kernel_version": kernel_out.strip(), + "host_memory_gb": mem_gb, + "swap_device": swap_dev, + "swap_size_gb": swap_gb, + "swap_encryption": enc, + # Test-matrix columns: storage target, encryption on/off, image, IOPS + "storage_target": _SWAP_TYPE.value, + "boot_disk_type": _BOOT_DISK_TYPE.value, + "dmcrypt_enabled": _ENABLE_DMCRYPT.value, + "node_image_type": _NODE_IMAGE_TYPE.value, + "boot_disk_iops_target": _BOOT_DISK_IOPS.value, + "benchmark_machine_type": _BENCHMARK_MACHINE_TYPE.value, + # Other config + "zswap_enabled": _ENABLE_ZSWAP.value, + "min_free_kbytes": _MIN_FREE_KBYTES.value, + "fio_runtime_sec": _FIO_RUNTIME_SEC.value, + # Requested config value only. The *effective* stress-ng footprint may + # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the + # actual value it ran with as 'stress_vm_bytes' so the two never conflict. + "stress_vm_bytes_requested": _STRESS_VM_BYTES.value, + "stress_vm_bytes_list": _STRESS_VM_BYTES_LIST.value, + "stress_timeout_sec": _STRESS_TIMEOUT_SEC.value, + "nodepool": _NODEPOOL.value, + } def _detect_cloud(pod: str) -> str: - """Detect whether the benchmark pod is running on GCP or AWS. - - Queries the cloud instance metadata endpoint inside the pod. 
Returns - 'GCP' if the GCP metadata server responds, 'AWS' otherwise. - """ - gcp_out, _ = _pod_exec( - pod, - 'curl -s -m 2 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/project/project-id' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_out.strip(): - return 'GCP' - return 'AWS' + """Detect whether the benchmark pod is running on GCP or AWS. + + Queries the cloud instance metadata endpoint inside the pod. Returns + 'GCP' if the GCP metadata server responds, 'AWS' otherwise. + """ + gcp_out, _ = _pod_exec( + pod, + "curl -s -m 2 --fail " + "http://metadata.google.internal/computeMetadata/v1/project/project-id" + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, + ) + if gcp_out.strip(): + return "GCP" + return "AWS" def _ensure_io2_volume() -> None: - """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). - - Only executed when --swap_encryption_swap_type=io2. Full implementation - is deferred to PR2 (swap-capability layer). - """ - if _SWAP_TYPE.value != 'io2': - return - logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2') + """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). + + Only executed when --swap_encryption_swap_type=io2. Full implementation + is deferred to PR2 (swap-capability layer). 
+ """ + if _SWAP_TYPE.value != "io2": + return + logging.info( + "[swap_encryption] io2 swap volume provisioning deferred to PR2" + ) From b8b4300db55afaf1e6b3ad37a73847681ab68faa Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Thu, 25 Jun 2026 20:28:04 +0530 Subject: [PATCH 08/17] fix(swap_encryption): lean DaemonSet + Phase 1 fio microbenchmarks --- .../swap_encryption_benchmark.py | 121 +++++++----------- 1 file changed, 48 insertions(+), 73 deletions(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index e596abf963..e30854e188 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -97,7 +97,7 @@ GKE vs. EKS swap encryption and LSSD performance comparison. Two-step nodepool setup: PKB provisions a minimal cluster with a cheap default nodepool (Step 1), then Prepare() adds the real benchmark - nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a + nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a node-level startup script that configures dm-crypt swap before any pod is scheduled, then removes the default nodepool (Step 2). All benchmark phases run inside a privileged DaemonSet pinned to the benchmark nodepool. @@ -286,17 +286,6 @@ "(unencrypted) swap overhead as a baseline.", ) -_GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string( - "swap_encryption_gke_kubelet_memory_swap", - "LimitedSwap", - "Value for kubeletConfig.memorySwapBehavior injected via " - "--system-config-from-file when creating the GKE benchmark nodepool. " - "LimitedSwap (default) — the kubelet allows pods to use swap up to their " - "memory limit; required for the DaemonSet pod to drive kernel swapping. " - "NoSwap — disables swap at the kubelet level (use for a baseline run that " - "confirms zero swap activity). 
Set empty string to omit the flag entirely " - "and rely on the cluster-level default.", -) _SWAP_DEVICE = flags.DEFINE_string( "swap_encryption_device", @@ -412,7 +401,7 @@ def Prepare(spec: _BenchmarkSpec) -> None: Step 2 (this function): a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures + UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures dm-crypt swap at the OS level — before any pod is scheduled. b. Delete the dummy default nodepool to stop its cost immediately. c. Deploy the privileged DaemonSet (pinned via nodeSelector to the @@ -683,9 +672,9 @@ def _configure_eks_kubelet_swap(spec) -> None: memorySwapBehavior: LimitedSwap failSwapOn: false - GKE equivalent: linuxConfig.swapConfig + kubeletConfig.memorySwapBehavior - via --system-config-from-file, already implemented in - _create_benchmark_node_pool. + GKE equivalent: linuxConfig.swapConfig via --system-config-from-file + (swapConfig automatically enables memorySwapBehavior=LimitedSwap), + already implemented in _create_benchmark_node_pool. 
See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 """ @@ -912,7 +901,7 @@ def _create_benchmark_node_pool(cluster) -> None: Uses: --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default COS_CONTAINERD) + --swap_encryption_node_image_type (default UBUNTU_CONTAINERD) --swap_encryption_boot_disk_iops (default 80000) --swap_encryption_enable_dmcrypt (default True) @@ -975,67 +964,54 @@ def _create_benchmark_node_pool(cluster) -> None: if is_lssd: cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}" - # ── GKE kubelet swap config ─────────────────────────────────────────────── - # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark - # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap - # so that the kubelet allocates swap to the DaemonSet pod. Without this flag - # the Linux kernel swap device may exist but the kubelet blocks pod-level - # swap usage and the benchmark pod cannot drive swap I/O. - # - # Passed as --system-config-from-file pointing to a temp YAML, which is the - # same mechanism PKB's gke_node_system_config flag uses: - # perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py - swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value + # ── GKE swap system-config ─────────────────────────────────────────────── + # Pass linuxConfig.swapConfig + linuxConfig.sysctl via --system-config-from-file. + # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): + # linuxConfig.swapConfig: GKE enables node-level swap device and + # automatically sets kubeletConfig.memorySwapBehavior=LimitedSwap. + # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use the + # local NVMe as the swap device (avoids boot-disk overhead). + # linuxConfig.sysctl: swap aggressiveness tuning so benchmark workloads + # can drive sustained swap I/O. 
+ # Reference: + # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ + # node-memory-swap#enable system_config_tmp = None - if swap_behavior: - # Build system-config YAML for --system-config-from-file. - # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): - # kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods. - # linuxConfig.swapConfig: GKE enables node-level swap device. - # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use - # the local NVMe as the swap device (avoids boot-disk overhead). - # linuxConfig.sysctl: swap aggressiveness tuning so the benchmark - # workloads can drive sustained swap I/O. - # Reference: - # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ - # node-memory-swap#enable - if is_lssd: - swap_config_block = ( - " swapConfig:\n" - " enabled: true\n" - " dedicatedLocalSsdProfile:\n" - f" diskCount: {_LSSD_COUNT.value}\n" - ) - else: - swap_config_block = " swapConfig:\n enabled: true\n" - kubelet_yaml = ( - "kubeletConfig:\n memorySwapBehavior:" - f" {swap_behavior}\nlinuxConfig:\n" - + swap_config_block - + " sysctl:\n" - " vm.min_free_kbytes: 200\n" - " vm.watermark_scale_factor: 500\n" - " vm.swappiness: 100\n" - ) - system_config_tmp = tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", delete=False - ) - system_config_tmp.write(kubelet_yaml) - system_config_tmp.flush() - cmd.flags["system-config-from-file"] = system_config_tmp.name - logging.info( - "[swap_encryption] system-config-from-file: " - "kubelet_swap=%s lssd=%s (written to %s):\n%s", - swap_behavior, - is_lssd, - system_config_tmp.name, - kubelet_yaml, + if is_lssd: + swap_config_block = ( + " swapConfig:\n" + " enabled: true\n" + " dedicatedLocalSsdProfile:\n" + f" diskCount: {_LSSD_COUNT.value}\n" ) + else: + swap_config_block = " swapConfig:\n enabled: true\n" + swap_config_yaml = ( + "linuxConfig:\n" + + swap_config_block + + " sysctl:\n" + " vm.min_free_kbytes: 200\n" + " vm.watermark_scale_factor: 500\n" + " 
vm.swappiness: 100\n" + ) + system_config_tmp = tempfile.NamedTemporaryFile( + mode="w", suffix=".yaml", delete=False + ) + system_config_tmp.write(swap_config_yaml) + system_config_tmp.flush() + cmd.flags["system-config-from-file"] = system_config_tmp.name + logging.info( + "[swap_encryption] system-config-from-file: " + "lssd=%s (written to %s):\n%s", + is_lssd, + system_config_tmp.name, + swap_config_yaml, + ) logging.info( "[swap_encryption] Creating benchmark nodepool: %s / %s / " "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / " - "add_swap_disk=%s / kubelet_swap=%s", + "add_swap_disk=%s", _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value, @@ -1044,7 +1020,6 @@ def _create_benchmark_node_pool(cluster) -> None: _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value, - swap_behavior or "unset", ) # LSSD nodepools take longer to provision than PD-only nodepools because From a3f9aa2513d25b7d264d826c1dfbadfc3552ad7e Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 16:33:09 +0530 Subject: [PATCH 09/17] refactor(swap_encryption/pr1): extract infra into BaseResource classes - Add SwapDaemonSet(resource.BaseResource) in resources/container_service/swap_daemonset.py - _Create(): apply Jinja2 manifest + wait for Running + /tmp/pkb_ready - _Delete(): in-pod swapoff/dmsetup/losetup/pkill teardown; kubectl delete - PodExec(): transient-reset retry, rc=137 OOM detection, pod recovery - Add SwapNodePool(resource.BaseResource) in resources/container_service/swap_nodepool.py - _Create(): gcloud node-pools create with linuxConfig.swapConfig + optional swap disk - _Delete(): detach+delete disk; delete nodepool - DeleteDefaultPool(): remove dummy e2-medium pool after DaemonSet pod Running - Rewrite benchmark to thin pattern: Prepare() uses resource.Create() + spec.resources - Cleanup() is empty - PKB framework auto-deletes spec.resources - Run() uses daemonset.PodExec() throughout - Addresses Zac review: resources pattern, no infra code in benchmark 
file - Fix COS_CONTAINERD -> UBUNTU_CONTAINERD (r3472549985) - swapConfig auto-enables memorySwapBehavior=LimitedSwap (r3472513706) --- .../swap_encryption_benchmark.py | 2035 ++++------------- .../container_service/swap_daemonset.py | 609 +++++ .../container_service/swap_nodepool.py | 575 +++++ 3 files changed, 1682 insertions(+), 1537 deletions(-) create mode 100644 perfkitbenchmarker/resources/container_service/swap_daemonset.py create mode 100644 perfkitbenchmarker/resources/container_service/swap_nodepool.py diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index e30854e188..7f981b1bb7 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -35,38 +35,44 @@ EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption swap device: /dev/nvme1n1 (or auto-detected) -== Benchmark Phases == +== Resource pattern == - Phase 1 – fio Microbenchmarks - Run fio directly on the swap block device (swapoff first) to measure - the hardware + encryption ceiling: random IOPS (4K), sequential - bandwidth (1M), and completion latency (iodepth=1). +Infrastructure lifecycle lives in two BaseResource subclasses: - Phase 2a – CPU Overhead - stress-ng drives sustained swap I/O; vmstat and pidstat capture - swap-in/out rates and per-process CPU cost (kswapd, kcryptd, - dm-crypt threads on GKE; Nitro offload on EKS). + SwapNodePool (perfkitbenchmarker/resources/container_service/swap_nodepool.py) + _Create(): gcloud container node-pools create with linuxConfig.swapConfig + + sysctl via --system-config-from-file; waits for node Ready; + optionally creates and attaches a dedicated swap disk. + _Delete(): detach+delete disk; delete the nodepool. 
+ DeleteDefaultPool(): remove the dummy e2-medium default pool after the + DaemonSet pod is Running (separate step to avoid API-server + contention during nodepool ops). - Phase 2b – I/O Interference - Baseline fio on a scratch volume → re-run with concurrent swap - pressure. IOPS/latency delta = storage contention cost. + SwapDaemonSet (perfkitbenchmarker/resources/container_service/swap_daemonset.py) + _Create(): apply Jinja2 manifest; wait for Running + /tmp/pkb_ready. + _Delete(): in-pod swapoff / dmsetup / losetup teardown; kubectl delete. + PodExec(): kubectl exec wrapper with transient-reset retry, OOM-kill + detection (rc=137), and automatic pod recovery. - Phase 3a – Redis Latency - Dataset loaded beyond container memory limit → GET/SET p99 latency - measured while kernel swaps pages. +Both resources are added to spec.resources in Prepare() and are auto-deleted +by the PKB framework in Cleanup(). - Phase 3b – Kernel Build - Linux compiled inside a memory-capped cgroup; slowdown ratio vs - unconstrained baseline. +== Benchmark Phases == - Phase 3c – OpenSearch - Bulk-index + search query under swap pressure (esrally or curl). + Phase 1 – fio Microbenchmarks (this PR) + Run fio directly on the swap block device (swapoff first) to measure + the hardware + encryption ceiling: random IOPS (4K), sequential + bandwidth (1M), and completion latency (iodepth=1). 
+ + Phase 2a – CPU Overhead (PR2/PR4) + Phase 2b – I/O Interference (PR4) + Phase 3a – Redis Latency (PR5) + Phase 3b – Kernel Build (PR5) + Phase 3c – OpenSearch (PR5) """ import json import logging -import os -import tempfile import textwrap import time from typing import Any @@ -76,9 +82,9 @@ from perfkitbenchmarker import configs from perfkitbenchmarker import errors from perfkitbenchmarker import sample -from perfkitbenchmarker.providers.gcp import util as gcp_util from perfkitbenchmarker.resources.container_service import kubectl -from perfkitbenchmarker.resources.container_service import kubernetes_commands +from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod +from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod FLAGS = flags.FLAGS @@ -88,7 +94,7 @@ # Benchmark identity # --------------------------------------------------------------------------- -BENCHMARK_NAME = "swap_encryption" +BENCHMARK_NAME = 'swap_encryption' BENCHMARK_CONFIG = """ @@ -118,274 +124,234 @@ _DAEMONSET_IMAGE = flags.DEFINE_string( - "swap_encryption_daemonset_image", - "ubuntu:22.04", - "Container image used for the privileged benchmark DaemonSet pod.", + 'swap_encryption_daemonset_image', + 'ubuntu:22.04', + 'Container image used for the privileged benchmark DaemonSet pod.', ) _NODEPOOL = flags.DEFINE_string( - "swap_encryption_nodepool", - "benchmark", - "Name of the node pool to deploy the benchmark DaemonSet on.", + 'swap_encryption_nodepool', + 'benchmark', + 'Name of the node pool to deploy the benchmark DaemonSet on.', ) _INSTANCE_SIZE_LABEL = flags.DEFINE_string( - "swap_encryption_instance_size_label", - "", - "Human-readable label for the current instance size being tested, e.g. " + 'swap_encryption_instance_size_label', + '', + 'Human-readable label for the current instance size being tested, e.g. ' '"n4-highmem-32" or "i4i.4xlarge". 
Stored in sample metadata so that ' - "results from multiple PKB runs across different instance sizes can be " - "collated and compared. Defaults to the value reported by the cloud " - "metadata endpoint inside the pod.", + 'results from multiple PKB runs across different instance sizes can be ' + 'collated and compared. Defaults to the value reported by the cloud ' + 'metadata endpoint inside the pod.', ) _COLLECT_COST = flags.DEFINE_boolean( - "swap_encryption_collect_cost", + 'swap_encryption_collect_cost', False, - "When True, emit a cost_estimate_usd sample using on-demand pricing " - "for the instance type detected at runtime.", + 'When True, emit a cost_estimate_usd sample using on-demand pricing ' + 'for the instance type detected at runtime.', ) _FAIL_ON_DEGRADED = flags.DEFINE_boolean( - "swap_encryption_fail_on_degraded", + 'swap_encryption_fail_on_degraded', True, - "When True (default), raise an error at the end of Run() if the run was " - "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and " - "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng " - "swap-pressure phase was OOM-killed before completing. This prevents PKB " - "from reporting SUCCEEDED for a run whose post-eviction phases produced " - "empty or meaningless data. Set False to keep the legacy behaviour of " - "always returning whatever partial samples were collected.", + 'When True (default), raise an error at the end of Run() if the run was ' + 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' + 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' + 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' + 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' + 'empty or meaningless data. 
Set False to keep the legacy behaviour of ' + 'always returning whatever partial samples were collected.', ) _PHASES = flags.DEFINE_list( - "swap_encryption_phases", - ["all"], - "Which Run() phases to execute, for fast iteration against an " - "already-provisioned cluster (e.g. --run_stage=run --run_uri=...). " - "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng " - "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), " + 'swap_encryption_phases', + ['all'], + 'Which Run() phases to execute, for fast iteration against an ' + 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). ' + 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' + 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' - "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. " - "Phases not listed are skipped and do not affect the degraded-run gate " + 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' + 'Phases not listed are skipped and do not affect the degraded-run gate ' '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").', ) _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( - "swap_encryption_benchmark_machine_type", - "n4-highmem-32", - "Machine type for the benchmark nodepool created in Prepare(). " - "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd " - "(LSSD RAID-0). The matching swap setup is selected automatically.", + 'swap_encryption_benchmark_machine_type', + 'n4-highmem-32', + 'Machine type for the benchmark nodepool created in Prepare(). ' + 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' + '(LSSD RAID-0). 
The matching swap setup is selected automatically.', ) _BENCHMARK_LSSD = flags.DEFINE_boolean( - "swap_encryption_lssd", + 'swap_encryption_lssd', False, - "Force LSSD RAID-0 swap path even when the machine type name does not " + 'Force LSSD RAID-0 swap path even when the machine type name does not ' 'contain "lssd". Auto-detected from machine type when False.', ) _LSSD_COUNT = flags.DEFINE_integer( - "swap_encryption_lssd_count", + 'swap_encryption_lssd_count', 1, - "Number of local NVMe SSDs to attach as raw block devices " - "(--local-nvme-ssd-block count=N). Must match the fixed local SSD " - "count for the chosen machine type: c4-standard-8-lssd=1, " - "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). " - "Default 1 covers most single-lssd machine types.", + 'Number of local NVMe SSDs to attach as raw block devices ' + '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' + 'count for the chosen machine type: c4-standard-8-lssd=1, ' + 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' + 'Default 1 covers most single-lssd machine types.', ) _NODE_IMAGE_TYPE = flags.DEFINE_string( - "swap_encryption_node_image_type", - "UBUNTU_CONTAINERD", - "GKE node image type for the benchmark nodepool. " - "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks " - "down device-mapper at the kernel LSM level and cryptsetup hangs " - "indefinitely from any pod context (even privileged, even via nsenter " - "into the host mount namespace). Ubuntu GKE nodes allow cryptsetup " - "from privileged pods without restriction. " - "Use COS_CONTAINERD only when dm-crypt is disabled " - "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. " - "AL2 on EKS.", + 'swap_encryption_node_image_type', + 'UBUNTU_CONTAINERD', + 'GKE node image type for the benchmark nodepool. 
' + 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' + 'down device-mapper at the kernel LSM level and cryptsetup hangs ' + 'indefinitely from any pod context (even privileged, even via nsenter ' + 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' + 'from privileged pods without restriction. ' + 'Use COS_CONTAINERD only when dm-crypt is disabled ' + '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' + 'AL2 on EKS.', ) _BOOT_DISK_TYPE = flags.DEFINE_string( - "swap_encryption_boot_disk_type", - "hyperdisk-balanced", - "Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced " - "for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 " - "dev/test machines, which do not support hyperdisk-balanced.", + 'swap_encryption_boot_disk_type', + 'hyperdisk-balanced', + 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' + 'for production machines (n4, c3, c4 families). Use pd-ssd for n2/e2 ' + 'dev/test machines, which do not support hyperdisk-balanced.', ) _BOOT_DISK_IOPS = flags.DEFINE_integer( - "swap_encryption_boot_disk_iops", + 'swap_encryption_boot_disk_iops', 80000, - "Provisioned IOPS for the boot disk (hyperdisk-balanced only). " - "80 000 is the COS max-IOPS target. Ignored for pd-ssd.", + 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' + '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', ) _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( - "swap_encryption_boot_disk_throughput", + 'swap_encryption_boot_disk_throughput', 1200, - "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced " - "only). Must be set together with iops. 1200 MB/s pairs with 80 000 " - "IOPS for production; use 140 (minimum) for dev/test. Ignored for " - "pd-ssd.", + 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' + 'only). Must be set together with iops. 
1200 MB/s pairs with 80 000 ' + 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' + 'pd-ssd.', ) _BOOT_DISK_SIZE_GB = flags.DEFINE_integer( - "swap_encryption_boot_disk_size_gb", + 'swap_encryption_boot_disk_size_gb', 500, - "Boot disk size in GiB for the benchmark nodepool. 500 GiB is " - "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run " - "(see Engineer Assignments table in execution-plan.md). " - "For LSSD configs the boot disk is smaller; 100 GiB is fine.", + 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' + 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' + '(see Engineer Assignments table in execution-plan.md). ' + 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', ) _ADD_SWAP_DISK = flags.DEFINE_boolean( - "swap_encryption_add_swap_disk", + 'swap_encryption_add_swap_disk', False, - "Attach a dedicated second disk to the benchmark nodepool for use as " - "the swap device. Required for dm-crypt measurement on single-boot-disk " - "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper " - "from pod namespaces. The second disk is provisioned via " - "--additional-node-disk using the same type/IOPS/throughput as the boot " - "disk flags.", + 'Attach a dedicated second disk to the benchmark nodepool for use as ' + 'the swap device. Required for dm-crypt measurement on single-boot-disk ' + 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' + 'from pod namespaces. The second disk is provisioned via ' + '--additional-node-disk using the same type/IOPS/throughput as the boot ' + 'disk flags.', ) _SWAP_DISK_SIZE_GB = flags.DEFINE_integer( - "swap_encryption_swap_disk_size_gb", + 'swap_encryption_swap_disk_size_gb', 500, - "Size in GiB of the dedicated swap disk when " - "--swap_encryption_add_swap_disk is True. 
Must satisfy the " - "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.", + 'Size in GiB of the dedicated swap disk when ' + '--swap_encryption_add_swap_disk is True. Must satisfy the ' + 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', ) _ENABLE_DMCRYPT = flags.DEFINE_boolean( - "swap_encryption_enable_dmcrypt", + 'swap_encryption_enable_dmcrypt', True, - "When True (default), wrap the swap device in dm-crypt plain mode " - "(aes-xts-plain64, ephemeral random key) matching GKE's " - "go/node:swap-encryption implementation. Set False to measure plain " - "(unencrypted) swap overhead as a baseline.", + 'When True (default), wrap the swap device in dm-crypt plain mode ' + '(aes-xts-plain64, ephemeral random key) matching GKE\'s ' + 'go/node:swap-encryption implementation. Set False to measure plain ' + '(unencrypted) swap overhead as a baseline.', ) _SWAP_DEVICE = flags.DEFINE_string( - "swap_encryption_device", - "", - "Explicit block device path to use as the swap device, e.g. " - "/dev/nvme1n1 or /dev/mapper/swap_encrypted. When empty (default), " - "the device is auto-detected from /proc/swaps inside the benchmark pod.", + 'swap_encryption_device', + '', + 'Explicit block device path to use as the swap device, e.g. ' + '/dev/nvme1n1 or /dev/mapper/swap_encrypted. When empty (default), ' + 'the device is auto-detected from /proc/swaps inside the benchmark pod.', ) _SWAP_TYPE = flags.DEFINE_string( - "swap_encryption_swap_type", - "hyperdisk", - "Storage target for the swap device. One of: hyperdisk (default), " - "lssd, instance_store, io2.", + 'swap_encryption_swap_type', + 'hyperdisk', + 'Storage target for the swap device. 
One of: hyperdisk (default), ' + 'lssd, instance_store, io2.', ) _ENABLE_ZSWAP = flags.DEFINE_boolean( - "swap_encryption_enable_zswap", + 'swap_encryption_enable_zswap', False, - "When True, enable zswap compressed swap cache on the benchmark node.", + 'When True, enable zswap compressed swap cache on the benchmark node.', ) _MIN_FREE_KBYTES = flags.DEFINE_integer( - "swap_encryption_min_free_kbytes", + 'swap_encryption_min_free_kbytes', 0, - "Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. " - "0 (default) leaves the kernel default unchanged.", + 'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. ' + '0 (default) leaves the kernel default unchanged.', ) _FIO_RUNTIME_SEC = flags.DEFINE_integer( - "swap_encryption_fio_runtime_sec", + 'swap_encryption_fio_runtime_sec', 60, - "Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.", + 'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.', ) _STRESS_VM_BYTES = flags.DEFINE_string( - "swap_encryption_stress_vm_bytes", - "28G", - "stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. " - "Should exceed available node RAM to force sustained paging.", + 'swap_encryption_stress_vm_bytes', + '28G', + 'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. ' + 'Should exceed available node RAM to force sustained paging.', ) _STRESS_VM_BYTES_LIST = flags.DEFINE_list( - "swap_encryption_stress_vm_bytes_list", + 'swap_encryption_stress_vm_bytes_list', [], - "Comma-separated list of --vm-bytes values to sweep in Phase 2a, " + 'Comma-separated list of --vm-bytes values to sweep in Phase 2a, ' 'e.g. "14G,28G,56G". 
Overrides --swap_encryption_stress_vm_bytes.', ) _STRESS_TIMEOUT_SEC = flags.DEFINE_integer( - "swap_encryption_stress_timeout_sec", + 'swap_encryption_stress_timeout_sec', 300, - "Maximum seconds to wait for the stress-ng swap-pressure phase.", -) - -_DS_NAME = "pkb-swap-benchmark" -_DS_NAMESPACE = "default" -_DS_LABEL = "pkb-swap-benchmark" - -# Transient kubectl errors that are safe to retry. -_TRANSIENT_KUBECTL_ERRORS = ("connection reset by peer", "websocket: close") - -# Errors indicating the container/pod is gone and needs recovery. -_CONTAINER_GONE_KUBECTL_ERRORS = ( - "container not found", - "procReady not received", - "unable to upgrade connection", - "not found", - "deleted state", + 'Maximum seconds to wait for the stress-ng swap-pressure phase.', ) -_active_pod: list[str] = [] # single-element list so closures can mutate it - - -_degraded_reasons: list[str] = [] - - -_pod_lost: list[str] = [] - - -_oom_events: list[str] = [] - -_BENCHMARK_NODEPOOL = "benchmark" -_DEFAULT_NODEPOOL = "default-pool" - - -class _GcpZonalResource: - """Minimal resource shim for gcp_util.GcloudCommand on compute operations. - - gcp_util.GcloudCommand auto-injects --project and --zone from the resource - object passed to it. GkeCluster._GcloudCommand() handles container/* - operations correctly but also switches --zone → --region for multi-zone - clusters, which is wrong for gcloud compute commands (--region creates - regional resources, not zonal ones). This shim pins a single zone so all - gcloud compute calls target the correct AZ. - """ - - def __init__(self, project: str, zone: str) -> None: - self.project = project - self.zone = zone +# DaemonSet constants used by both SwapDaemonSet construction and the EKS path. 
+_DS_NAME = 'pkb-swap-benchmark' +_DS_NAMESPACE = 'default' +_DS_LABEL = 'pkb-swap-benchmark' +_BENCHMARK_NODEPOOL = 'benchmark' def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: @@ -400,88 +366,86 @@ def Prepare(spec: _BenchmarkSpec) -> None: e2-medium default nodepool. Step 2 (this function): - a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with - UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures - dm-crypt swap at the OS level — before any pod is scheduled. - b. Delete the dummy default nodepool to stop its cost immediately. - c. Deploy the privileged DaemonSet (pinned via nodeSelector to the - benchmark nodepool) and wait for tools to install. + a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk). + EKS: label existing nodes with pkb_nodepool=benchmark. + b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel. + c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running. + d. GCP: re-resolve pod name in case default-pool deletion evicts the pod. + + Both resources are appended to spec.resources for auto-cleanup. """ cluster = spec.container_cluster - - # ── Step 2a: add real benchmark nodepool ──────────────────────────────── - if not getattr(cluster, "project", None): - # Guard: AWS / EKS path — nodepool management is external. - # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet - # nodeSelector (pkb_nodepool=benchmark) before deploying the pod. 
+ is_gcp = getattr(cluster, 'project', None) is not None + + if is_gcp: + # ── Step 2a (GCP): create benchmark nodepool + wait for node ────────── + logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') + nodepool = _np_mod.SwapNodePool( + cluster=cluster, + machine_type=_BENCHMARK_MACHINE_TYPE.value, + node_image_type=_NODE_IMAGE_TYPE.value, + disk_type=_BOOT_DISK_TYPE.value, + disk_size_gb=_BOOT_DISK_SIZE_GB.value, + disk_iops=_BOOT_DISK_IOPS.value, + disk_throughput=_BOOT_DISK_THROUGHPUT.value, + lssd=_BENCHMARK_LSSD.value, + lssd_count=_LSSD_COUNT.value, + add_swap_disk=_ADD_SWAP_DISK.value, + swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value, + ) + nodepool.Create() + spec.resources.append(nodepool) + else: + # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ── logging.info( - "[swap_encryption] EKS cluster — labelling existing nodes with " - "pkb_nodepool=%s so the DaemonSet nodeSelector matches.", + '[swap_encryption] EKS cluster — labelling existing nodes with' + ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.', _BENCHMARK_NODEPOOL, ) kubectl.RunKubectlCommand([ - "label", - "nodes", - "--all", - "--overwrite", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", + 'label', + 'nodes', + '--all', + '--overwrite', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', ]) - # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs - # on io2 hardware-encrypted storage (no-op unless swap_type=io2). _ensure_io2_volume() - else: - # GCP path: true two-step nodepool setup. 
- logging.info("[swap_encryption] Step 2a: creating benchmark nodepool") - _create_benchmark_node_pool(cluster) - - # ── Step 2b: wait for the benchmark node to join and be Ready ───────── - logging.info("[swap_encryption] Step 2b: waiting for benchmark node") - _wait_for_benchmark_node() - - # ── Step 2b2: attach dedicated swap disk (if requested) ─────────────── - if _ADD_SWAP_DISK.value: - logging.info( - "[swap_encryption] Step 2b2: attaching dedicated swap disk" - ) - _attach_swap_disk(cluster) - - # ── Step 2c: deploy DaemonSet ──────────────────────────────────────────── - # Deploy and wait for the pod BEFORE deleting the default nodepool. - # Deleting the default pool while the benchmark node is still joining causes - # a temporary API server i/o timeout (control plane busy with two nodepool - # ops simultaneously). Once the pod is Running the cluster is fully stable. - logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet") - _deploy_daemonset() - pod = _wait_for_benchmark_pod() - logging.info("[swap_encryption] Benchmark pod ready: %s", pod) + # ── Step 2b: deploy DaemonSet and wait for pod ──────────────────────────── + # Deploy BEFORE deleting the default pool: deleting the default pool while + # the benchmark node is still joining causes a brief API-server I/O timeout. + # The pod being Running means the cluster is fully stable. 
+ logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet') + daemonset = _ds_mod.SwapDaemonSet( + name=_DS_NAME, + namespace=_DS_NAMESPACE, + label=_DS_LABEL, + nodepool=_BENCHMARK_NODEPOOL, + image=_DAEMONSET_IMAGE.value, + ) + daemonset.Create() + spec.resources.append(daemonset) + logging.info( + '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name + ) - # ── Step 2d: now safe to remove the dummy default nodepool ─────────────── - if getattr(cluster, "project", None): + # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ── + if is_gcp: logging.info( - "[swap_encryption] Step 2d: deleting dummy default nodepool" + '[swap_encryption] Step 2c: deleting dummy default nodepool' ) - _delete_default_node_pool(cluster) - # The DaemonSet pod may be evicted and rescheduled with a new name during - # the nodepool deletion (cluster control plane briefly interrupts pod - # lifecycle). Re-resolve the pod name to avoid stale-reference errors on - # all subsequent _pod_exec calls. + nodepool.DeleteDefaultPool() + # The pod may be evicted and rescheduled with a new name during the + # default nodepool deletion. Re-resolve to avoid stale references. logging.info( - "[swap_encryption] Step 2d: re-resolving benchmark pod " - "after nodepool deletion" + '[swap_encryption] Step 2d: re-resolving benchmark pod after' + ' nodepool deletion' + ) + daemonset.WaitForPod() + logging.info( + '[swap_encryption] Benchmark pod (post-deletion): %s', + daemonset.pod_name, ) - pod = _wait_for_benchmark_pod() - logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod) - - -def _phase_selected(token: str) -> bool: - """Return True if phase `token` should run given --swap_encryption_phases. - - 'all' (the default) selects every phase. Otherwise only the comma-separated - tokens listed in the flag run. Tokens: fio, 2a, 2b, 3a, 3b, 3c. 
- """ - selected = [p.strip().lower() for p in _PHASES.value if p.strip()] - return (not selected) or ("all" in selected) or (token.lower() in selected) def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: @@ -493,167 +457,147 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: Raw I/O ceiling of the swap device. Gate 1 fails if fio produces zero samples (device not found, O_DIRECT error, etc.). - Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference - Requires an active swap device (Gate 1 must pass). Gate 2 fails if - stress-ng does not complete within timeout. + Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference (PR4) + Requires an active swap device (Gate 1 must pass). - Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch) - Independent of Tier 2 results; always attempted if Gate 1 passed. - Individual workload failures are logged but do not abort the others. + Tier 3 (Gate 3) — real-world workloads (PR5) + Independent of Tier 2 results. - If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring - application-level swap performance when the raw device is inaccessible. + If Gate 1 fails, Tiers 2 and 3 are skipped. """ - pod = _wait_for_benchmark_pod() + daemonset = _get_daemonset(spec) + + pod = daemonset.WaitForPod() if pod is None: raise errors.Benchmarks.RunError( - "[swap_encryption] Benchmark pod never became ready." + '[swap_encryption] Benchmark pod never became ready.' ) - # Initialise the module-level active-pod tracker so _pod_exec and - # _recover_pod can transparently redirect to a replacement pod if the - # original is evicted during the run. - _active_pod.clear() - _active_pod.append(pod) - _degraded_reasons.clear() - _pod_lost.clear() - _oom_events.clear() + # Reset per-run accumulators before starting phases. 
+ daemonset.oom_events.clear() + daemonset.pod_lost.clear() original_pod = pod - swap_dev = _detect_swap_device(pod) - base_meta = _build_metadata(pod, swap_dev) + degraded_reasons: list[str] = [] + + swap_dev = _detect_swap_device(daemonset) + base_meta = _build_metadata(daemonset, swap_dev) results: list[sample.Sample] = [] t_run_start = time.time() - logging.info("[swap_encryption] swap device: %s", swap_dev) + logging.info('[swap_encryption] swap device: %s', swap_dev) - # ── Phase 1: fio microbenchmarks on raw swap device ───────────────────────── - if _phase_selected("fio"): + # ── Phase 1: fio microbenchmarks on raw swap device ─────────────────────── + if _phase_selected('fio'): logging.info( - "[swap_encryption] Phase 1: fio microbenchmarks on %s", swap_dev + '[swap_encryption] Phase 1: fio microbenchmarks on %s', swap_dev ) try: - phase1_samples = _run_phase1_fio(pod, swap_dev, base_meta) + phase1_samples = _run_phase1_fio(daemonset, swap_dev, base_meta) results += phase1_samples if not phase1_samples: - _degraded_reasons.append( - "Phase 1 (fio) produced no samples — " - "check fio install and swap device accessibility" + degraded_reasons.append( + 'Phase 1 (fio) produced no samples — ' + 'check fio install and swap device accessibility' ) - logging.error("[swap_encryption] Phase 1: no samples produced") + logging.error('[swap_encryption] Phase 1: no samples produced') except Exception as e: # pylint: disable=broad-except - _degraded_reasons.append(f"Phase 1 fio failed: {e}") - logging.error("[swap_encryption] Phase 1 fio error: %s", e) + degraded_reasons.append(f'Phase 1 fio failed: {e}') + logging.error('[swap_encryption] Phase 1 fio error: %s', e) # ── Cost estimate ───────────────────────────────────────────────────────── if _COLLECT_COST.value: elapsed = time.time() - t_run_start - results += _collect_cost_sample(pod, elapsed, base_meta) + results += _collect_cost_sample(daemonset, elapsed, base_meta) # ── Final degradation gate 
──────────────────────────────────────────────── - # The phase try/except blocks above keep the run alive so partial data is - # still collected, but that means a catastrophic failure (pod OOM-evicted - # mid-run, no fio data, stress-ng killed before it could drive swap I/O) - # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics. - # Detect those conditions here and surface them explicitly. - if _active_pod and _active_pod[0] != original_pod: - _degraded_reasons.append( - f"benchmark pod was replaced during the run ({original_pod} →" - f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;" - " phases executed after the eviction ran against a" - " freshly-initialised pod (empty /tmp, swap re-setup) and may be" - " invalid" - ) - if _pod_lost: - _degraded_reasons.append( - "benchmark pod(s) went NotFound during the run" - f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure' - " eviction or container exit) and any phase running at or after" - " that" - " point (e.g. 
kernel-build baseline, OpenSearch) produced invalid" - " data" - ) - if _oom_events: - _degraded_reasons.append( - "OOM kill(s) (rc=137) occurred during the run on pod(s) " - f'{", ".join(_oom_events)} — a phase exceeded memory and was' - " killed by " - "the OOM killer (the container may have restarted in place), so" - " the " - "affected phase(s) produced no or partial data" - ) - - degraded = bool(_degraded_reasons) + if daemonset.pod_name and daemonset.pod_name != original_pod: + degraded_reasons.append( + f'benchmark pod was replaced during the run ({original_pod} →' + f' {daemonset.pod_name}) — it was OOM-evicted under swap' + ' pressure; phases executed after the eviction ran against a' + ' freshly-initialised pod (empty /tmp, swap re-setup) and may' + ' be invalid' + ) + if daemonset.pod_lost: + degraded_reasons.append( + 'benchmark pod(s) went NotFound during the run' + f' ({", ".join(daemonset.pod_lost)}) — the pod died (node' + ' memory-pressure eviction or container exit) and any phase' + ' running at or after that point produced invalid data' + ) + if daemonset.oom_events: + degraded_reasons.append( + 'OOM kill(s) (rc=137) occurred during the run on pod(s) ' + f'{", ".join(daemonset.oom_events)} — a phase exceeded memory' + ' and was killed by the OOM killer; the affected phase(s)' + ' produced no or partial data' + ) + + degraded = bool(degraded_reasons) results.append( sample.Sample( - "swap_encryption_run_status", + 'swap_encryption_run_status', 0.0 if degraded else 1.0, - "status", + 'status', dict( base_meta, degraded=degraded, - degraded_reasons="; ".join(_degraded_reasons) or "none", + degraded_reasons='; '.join(degraded_reasons) or 'none', num_samples=len(results) + 1, ), ) ) if degraded: - msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons) + msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons) logging.error(msg) if _FAIL_ON_DEGRADED.value: - # Raise so PKB marks the benchmark FAILED instead of 
SUCCEEDED. The - # samples collected so far are still published by PKB before the failure - # is recorded, so no data is lost. raise errors.Benchmarks.RunError(msg) else: logging.info( - "[swap_encryption] Run completed cleanly (%d samples)", len(results) + '[swap_encryption] Run completed cleanly (%d samples)', + len(results), ) return results def Cleanup(spec: _BenchmarkSpec) -> None: - """Remove the DaemonSet and tear down any swap configuration.""" - pod = _wait_for_benchmark_pod(timeout=30) - if pod: - _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True) - _pod_exec( - pod, - textwrap.dedent(""" - swapoff /dev/mapper/swap_encrypted 2>/dev/null || true - dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true - """), - ignore_failure=True, - ) - # Clean up loop device backing files (single-disk fallback path). - _pod_exec( - pod, - textwrap.dedent(""" - for backing in /var/pkb_swap_backing /run/pkb_swap_backing \ - /mnt/stateful_partition/pkb_swap_backing - do - losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \ - while read dev - do - losetup -d "$dev" 2>/dev/null || true - done - rm -f "$backing" - done - """), - ignore_failure=True, - ) - _pod_exec( - pod, - "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True, + """Resources in spec.resources are auto-deleted by the PKB framework. + + SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove, + losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet. + SwapNodePool._Delete() detaches+deletes the swap disk (if any) then + deletes the benchmark nodepool. 
+ """ + + +# --------------------------------------------------------------------------- +# Internal helpers +# --------------------------------------------------------------------------- + + +def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet: + """Retrieve the SwapDaemonSet resource from spec.resources.""" + daemonset = next( + (r for r in spec.resources if isinstance(r, _ds_mod.SwapDaemonSet)), + None, + ) + if daemonset is None: + raise errors.Benchmarks.RunError( + '[swap_encryption] SwapDaemonSet not found in spec.resources —' + ' was Prepare() called?' ) + return daemonset - _delete_daemonset() - # Detach and delete the dedicated swap disk if one was provisioned. - cluster = spec.container_cluster - if _ADD_SWAP_DISK.value and getattr(cluster, "project", None): - _detach_and_delete_swap_disk(cluster) +def _phase_selected(token: str) -> bool: + """Return True if phase `token` should run given --swap_encryption_phases. + + 'all' (the default) selects every phase. Otherwise only the + comma-separated tokens listed in the flag run. + """ + selected = [p.strip().lower() for p in _PHASES.value if p.strip()] + return (not selected) or ('all' in selected) or (token.lower() in selected) def _configure_eks_kubelet_swap(spec) -> None: @@ -674,956 +618,161 @@ def _configure_eks_kubelet_swap(spec) -> None: GKE equivalent: linuxConfig.swapConfig via --system-config-from-file (swapConfig automatically enables memorySwapBehavior=LimitedSwap), - already implemented in _create_benchmark_node_pool. + already implemented in SwapNodePool._CreateNodePool(). See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 """ logging.warning( - "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is " - "deferred (blocked on PR #6780 — SwapConfigSpec). " - "EKS nodes will use default kubelet swap settings until that PR merges." + '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is ' + 'deferred (blocked on PR #6780 — SwapConfigSpec). 
' + 'EKS nodes will use default kubelet swap settings until that PR merges.' ) -def _deploy_daemonset() -> None: - """Apply the swap-infra DaemonSet manifest to the cluster. - - The DaemonSet is intentionally lean: it only verifies the node-level swap - device is active (configured via linuxConfig.swapConfig on GKE or - kubelet-config.json on EKS) and writes /tmp/pkb_ready. No benchmark - tooling is installed here — workloads are delegated to existing PKB - benchmark modules (kubernetes_fio, kubernetes_redis_memtier, etc.) which - manage their own tool installs inside separate benchmark pods. - - Uses kubernetes_commands.ApplyManifest to render the Jinja2 template from - perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 and - apply it via kubectl — the standard PKB pattern for deploying manifests. - """ - kubernetes_commands.ApplyManifest( - "cluster/swap_encryption_daemonset.yaml.j2", - ds_name=_DS_NAME, - ds_namespace=_DS_NAMESPACE, - ds_label=_DS_LABEL, - benchmark_nodepool=_BENCHMARK_NODEPOOL, - image=_DAEMONSET_IMAGE.value, - ) - logging.info("[swap_encryption] Swap-infra DaemonSet applied") - - -def _wait_for_benchmark_pod(timeout: int = 600) -> str | None: - """Wait until the swap-infra DaemonSet pod is Running AND swap is active. - - The DaemonSet installs fio and a small set of measurement tools then - verifies the swap device before writing /tmp/pkb_ready (~1-2 min on a - cold apt cache). Default timeout 600 s covers worst-case APT latency - on a freshly-started node. - - Uses tab-separated name/phase output so kubectl always exits 0 regardless - of whether any pods are present, avoiding jsonpath index errors. 
- """ - deadline = time.time() + timeout - last_phase = "" - ready_pod = None # pod name once phase == Running - - while time.time() < deadline: - # ── Step 1: wait for Running phase ────────────────────────────────────── - if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "pods", - "-l", - f"app={_DS_LABEL}", - "-n", - _DS_NAMESPACE, - "-o", - ( - r"jsonpath={range" - r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}' - ), - ], - raise_on_failure=False, - ) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split("\t") - if len(parts) == 2: - pod_name, phase = parts[0].strip(), parts[1].strip() - if phase == "Running": - logging.info( - "[swap_encryption] Pod %s is Running – " - "waiting for swap device readiness sentinel...", - pod_name, - ) - ready_pod = pod_name - break - if phase != last_phase: - logging.info( - "[swap_encryption] Pod %s phase: %s", - pod_name, - phase, - ) - last_phase = phase - if phase in ("Pending",): - _log_pod_events(pod_name) - else: - logging.info( - "[swap_encryption] Waiting for DaemonSet pod to appear..." - ) - - # ── Step 2: poll for /tmp/pkb_ready sentinel ──────────────────────────── - if ready_pod is not None: - sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( - [ - "exec", - ready_pod, - "-n", - _DS_NAMESPACE, - "--", - "test", - "-f", - "/tmp/pkb_ready", - ], - raise_on_failure=False, - ) - if sentinel_rc == 0: - logging.info( - "[swap_encryption] Pod %s ready (swap device active)", - ready_pod, - ) - return ready_pod - # "container not found" means the container crashed (CrashLoopBackOff or - # exited) — treat it as a hard reset: re-check pod phase on next iteration. 
- if ( - "container not found" in sentinel_err - or "unable to upgrade connection" in sentinel_err - ): - logging.warning( - "[swap_encryption] Pod %s: container not running (%s) " - "— will re-check pod state", - ready_pod, - sentinel_err.strip(), - ) - ready_pod = None - last_phase = "" - else: - logging.info( - "[swap_encryption] Pod %s: still installing tools...", - ready_pod, - ) - - time.sleep(15) - - logging.warning( - "[swap_encryption] Benchmark pod not ready after %ds", timeout - ) - return None - - -def _log_pod_events(pod_name: str) -> None: - """Dump recent Kubernetes events for the pod to help diagnose startup hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand( - [ - "describe", - "pod", - pod_name, - "-n", - _DS_NAMESPACE, - ], - raise_on_failure=False, - ) - # Only log the Events section to keep output manageable - in_events = False - lines = [] - for line in events_out.splitlines(): - if line.startswith("Events:"): - in_events = True - if in_events: - lines.append(line) - if lines: - logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30])) - else: - logging.info( - "[swap_encryption] kubectl describe output:\n%s", - events_out[-2000:] if len(events_out) > 2000 else events_out, - ) - - -def _delete_daemonset() -> None: - """Delete the benchmark DaemonSet.""" - kubectl.RunKubectlCommand( - [ - "delete", - "daemonset", - _DS_NAME, - "-n", - _DS_NAMESPACE, - "--ignore-not-found", - ], - raise_on_failure=False, - ) - logging.info("[swap_encryption] DaemonSet deleted") - - -_HYPERDISK_MAX_IOPS_PER_MBPS = ( - 256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s -) - - -def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: - """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint. - - Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed - 256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails - with "Requested provisioned throughput is too low for the provisioned iops". 
- Clamp throughput UP to the minimum the requested IOPS need (plus a small - margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk - creation. - """ - min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) - if throughput < min_tput: - logging.warning( - "[swap_encryption] boot/swap disk throughput %d MiB/s is too low" - " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);" - " raising to %d", - throughput, - iops, - min_tput, - min_tput, - ) - return min_tput - return throughput - - -def _create_benchmark_node_pool(cluster) -> None: - """Add the benchmark nodepool to the existing cluster (Step 2 of setup). - - Uses: - --swap_encryption_benchmark_machine_type (default n4-highmem-32) - --swap_encryption_node_image_type (default UBUNTU_CONTAINERD) - --swap_encryption_boot_disk_iops (default 80000) - --swap_encryption_enable_dmcrypt (default True) - - The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet - nodeSelector targets it exclusively. dm-crypt swap setup is performed - from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap / - _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata - because GKE reserves that metadata key and rejects it at the API level. - """ - machine_type = _BENCHMARK_MACHINE_TYPE.value - # Auto-detect LSSD from machine type name; flag overrides only when True. - is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower() - - # Determine zone/region from the cluster object. - # LSSD configs only need a small boot disk (OS only; swap is on local NVMe). - # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on - # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk - # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable - # headroom and matches the Config 2 spec in the Engineer Assignments table). 
- disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value - - disk_type = _BOOT_DISK_TYPE.value - - # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region, - # and auth token refresh. GkeCluster._GcloudCommand also handles the - # zone → region promotion for multi-zone / regional clusters. - cmd = cluster._GcloudCommand( - "container", - "node-pools", - "create", - _BENCHMARK_NODEPOOL, - "--cluster", - cluster.name, - ) - cmd.flags["machine-type"] = machine_type - cmd.flags["image-type"] = _NODE_IMAGE_TYPE.value - cmd.flags["disk-type"] = disk_type - cmd.flags["disk-size"] = disk_size_gb - cmd.flags["num-nodes"] = 1 - cmd.flags["node-labels"] = f"pkb_nodepool={_BENCHMARK_NODEPOOL}" - cmd.args += ["--no-enable-autoupgrade", "--no-enable-autorepair"] - - # IOPS and throughput provisioning only applies to hyperdisk-* types AND - # only when the boot disk is also the swap device (non-LSSD configs). - # For LSSD machines the boot disk is OS-only; swap is on local NVMe. - # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the - # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max). - if disk_type.startswith("hyperdisk") and not is_lssd: - # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by - # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks). - cmd.flags["boot-disk-provisioned-iops"] = _BOOT_DISK_IOPS.value - cmd.flags["boot-disk-provisioned-throughput"] = ( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ) - - # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm - # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block). - if is_lssd: - cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}" - - # ── GKE swap system-config ─────────────────────────────────────────────── - # Pass linuxConfig.swapConfig + linuxConfig.sysctl via --system-config-from-file. 
- # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984): - # linuxConfig.swapConfig: GKE enables node-level swap device and - # automatically sets kubeletConfig.memorySwapBehavior=LimitedSwap. - # For LSSD machines, dedicatedLocalSsdProfile tells GKE to use the - # local NVMe as the swap device (avoids boot-disk overhead). - # linuxConfig.sysctl: swap aggressiveness tuning so benchmark workloads - # can drive sustained swap I/O. - # Reference: - # https://docs.cloud.google.com/kubernetes-engine/docs/how-to/ - # node-memory-swap#enable - system_config_tmp = None - if is_lssd: - swap_config_block = ( - " swapConfig:\n" - " enabled: true\n" - " dedicatedLocalSsdProfile:\n" - f" diskCount: {_LSSD_COUNT.value}\n" - ) - else: - swap_config_block = " swapConfig:\n enabled: true\n" - swap_config_yaml = ( - "linuxConfig:\n" - + swap_config_block - + " sysctl:\n" - " vm.min_free_kbytes: 200\n" - " vm.watermark_scale_factor: 500\n" - " vm.swappiness: 100\n" - ) - system_config_tmp = tempfile.NamedTemporaryFile( - mode="w", suffix=".yaml", delete=False - ) - system_config_tmp.write(swap_config_yaml) - system_config_tmp.flush() - cmd.flags["system-config-from-file"] = system_config_tmp.name - logging.info( - "[swap_encryption] system-config-from-file: " - "lssd=%s (written to %s):\n%s", - is_lssd, - system_config_tmp.name, - swap_config_yaml, - ) - - logging.info( - "[swap_encryption] Creating benchmark nodepool: %s / %s / " - "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / " - "add_swap_disk=%s", - _BENCHMARK_NODEPOOL, - machine_type, - _NODE_IMAGE_TYPE.value, - disk_size_gb, - _BOOT_DISK_IOPS.value, - _ENABLE_DMCRYPT.value, - is_lssd, - _ADD_SWAP_DISK.value, - ) - - # LSSD nodepools take longer to provision than PD-only nodepools because - # GKE must also initialise the local NVMe devices before marking nodes Ready. - # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs. 
- try: - _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) - finally: - if system_config_tmp is not None: - try: - os.unlink(system_config_tmp.name) - except OSError: - pass - - if rc != 0: - # Idempotent prepare: if the nodepool already exists (e.g. re-running - # --run_stage=prepare,run to redeploy the DaemonSet onto an existing - # cluster), reuse it instead of failing. gcloud returns a 409 / - # "Already exists" message in this case. - low = (stderr or "").lower() - if ( - "already exists" in low - or "alreadyexists" in low - or "code=409" in low - ): - logging.info( - "[swap_encryption] Benchmark nodepool already exists — " - "reusing it (idempotent prepare); proceeding to DaemonSet" - ) - return - raise errors.Benchmarks.RunError( - "[swap_encryption] Failed to create benchmark nodepool " - f"(rc={rc}): {stderr}" - ) - logging.info("[swap_encryption] Benchmark nodepool ready") - - -def _wait_for_benchmark_node(timeout: int = 900) -> None: - """Block until a node labelled pkb_nodepool=benchmark is Ready. - - gcloud container node-pools create returns as soon as the API accepts the - request — the actual node VM may take another 2-4 minutes to boot, join the - cluster, and pass its readiness checks. Deploying the DaemonSet before that - point leaves the pod Pending indefinitely because the nodeSelector finds no - eligible node. +def _ensure_io2_volume() -> None: + """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). - This function polls kubectl every 15 s until at least one node with - pkb_nodepool=benchmark has Ready=True, then returns. + Only executed when --swap_encryption_swap_type=io2. Full implementation + is deferred to PR2 (swap-capability layer). """ - deadline = time.time() + timeout + if _SWAP_TYPE.value != 'io2': + return logging.info( - "[swap_encryption] Waiting for benchmark node " - "(pkb_nodepool=benchmark) to be Ready..." 
- ) - while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "nodes", - "-l", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", - "-o", - r"jsonpath={range .items[*]}" - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}', - ], - raise_on_failure=False, - ) - - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split("\t") - if len(parts) == 2 and parts[1].strip() == "True": - logging.info( - "[swap_encryption] Benchmark node ready: %s", - parts[0].strip(), - ) - return - - logging.info( - "[swap_encryption] Benchmark node not yet Ready — retrying in 15" - " s..." - ) - time.sleep(15) - - raise errors.Benchmarks.RunError( - "[swap_encryption] Timed out waiting for benchmark node " - f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready " - f"after {timeout}s" + '[swap_encryption] io2 swap volume provisioning deferred to PR2' ) -def _attach_swap_disk(cluster) -> None: - """Create a dedicated hyperdisk and attach it to the benchmark node. - - gcloud container node-pools create --additional-node-disk is not available - in all gcloud SDK versions, so we use gcloud compute to create the disk and - attach it after the node is ready. In GKE the Kubernetes node name is the - same as the GCE instance name, so no translation is needed. - - After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe - nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk. - - The disk is named pkb-swap- to avoid name collisions across - concurrent runs. Cleanup deletes it in Cleanup() if it exists. 
- """ - # Resolve zone from cluster - zone = None - if getattr(cluster, "zones", None): - zone = cluster.zones[0] - elif getattr(cluster, "region", None): - zone = cluster.region - if not zone: - raise errors.Benchmarks.RunError( - "[swap_encryption] Cannot attach swap disk: cluster zone unknown" - ) - - project = cluster.project - disk_name = f"pkb-swap-{cluster.name}" - disk_type = _BOOT_DISK_TYPE.value - disk_size_gb = _SWAP_DISK_SIZE_GB.value - - # ── Step 1: get the GCE instance name of the benchmark node ─────────────── - node_out, _, rc = kubectl.RunKubectlCommand( - [ - "get", - "nodes", - "-l", - f"pkb_nodepool={_BENCHMARK_NODEPOOL}", - "-o", - "jsonpath={.items[0].metadata.name}", - ], - raise_on_failure=False, - ) - instance_name = node_out.strip() - if rc != 0 or not instance_name: - raise errors.Benchmarks.RunError( - "[swap_encryption] Cannot find benchmark node for swap disk attach" - ) - logging.info("[swap_encryption] Benchmark node instance: %s", instance_name) - - # ── Step 2: create the hyperdisk ────────────────────────────────────────── - logging.info( - "[swap_encryption] Creating swap disk %s (%dGiB %s)", - disk_name, - disk_size_gb, - disk_type, - ) - # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project - # and --zone (always zonal — gcloud compute --region creates regional - # resources, which is not what we want for a node-attached swap disk). 
- gcp_res = _GcpZonalResource(project, zone) - create_cmd = gcp_util.GcloudCommand( - gcp_res, "compute", "disks", "create", disk_name - ) - create_cmd.flags["type"] = disk_type - create_cmd.flags["size"] = f"{disk_size_gb}GB" - create_cmd.args.append("--quiet") - if disk_type.startswith("hyperdisk"): - create_cmd.flags["provisioned-iops"] = _BOOT_DISK_IOPS.value - create_cmd.flags["provisioned-throughput"] = ( - _valid_hyperdisk_throughput( - _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value - ) - ) - _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f"[swap_encryption] Failed to create swap disk {disk_name}:" - f" {stderr}" - ) +def _detect_swap_device( + daemonset: _ds_mod.SwapDaemonSet, +) -> str: + """Return the active swap device path on the cluster node.""" + if _SWAP_DEVICE.value: + return _SWAP_DEVICE.value - # ── Step 3: attach the disk to the node VM ──────────────────────────────── - logging.info( - "[swap_encryption] Attaching swap disk %s to %s", - disk_name, - instance_name, - ) - attach_cmd = gcp_util.GcloudCommand( - gcp_res, "compute", "instances", "attach-disk", instance_name - ) - attach_cmd.flags["disk"] = disk_name - attach_cmd.flags["device-name"] = "pkb-swap" - attach_cmd.args.append("--quiet") - _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f"[swap_encryption] Failed to attach swap disk to {instance_name}: " - f"{stderr}" - ) - logging.info( - "[swap_encryption] Swap disk attached: %s → %s", - disk_name, - instance_name, + # /proc/swaps is the source of truth — it lists the device ACTUALLY active. + # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping + # from a previous run on a reused node can still appear as a /dev node while + # being non-functional (fio/swapoff fail with "No such device or address"). 
+ dm_out, _ = daemonset.PodExec( + textwrap.dedent(""" + ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) + if [ -n "$ACTIVE" ] + then + echo "$ACTIVE" + elif test -e /dev/mapper/swap_encrypted + then + echo /dev/mapper/swap_encrypted + fi + """), + ignore_failure=True, ) - - -def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool: - """Detach (if attached) and delete a GCE disk, robustly, with retries. - - Finds the attached instance from the disk's own `users` field rather than - kubectl — kubectl is often unavailable during teardown (cluster being - deleted), which previously left the disk attached and undeletable, so it - leaked. Returns True if the disk is gone (deleted or already absent). - """ - for attempt in range(1, 5): - gcp_res = _GcpZonalResource(project, zone) - describe_cmd = gcp_util.GcloudCommand( - gcp_res, "compute", "disks", "describe", disk_name - ) - describe_cmd.flags["format"] = "value(users)" - users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) - if rc != 0: - logging.info( - "[swap_encryption] Swap disk %s not present — nothing to" - " delete", - disk_name, - ) - return True # already gone - user = users.strip() - if user: - inst = user.split("/")[-1] - logging.info( - "[swap_encryption] Detaching swap disk %s from %s", - disk_name, - inst, - ) - detach_cmd = gcp_util.GcloudCommand( - gcp_res, "compute", "instances", "detach-disk", inst - ) - detach_cmd.flags["disk"] = disk_name - detach_cmd.args.append("--quiet") - detach_cmd.Issue(timeout=120, raise_on_failure=False) - delete_cmd = gcp_util.GcloudCommand( - gcp_res, "compute", "disks", "delete", disk_name - ) - delete_cmd.args.append("--quiet") - _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) - if drc == 0: - logging.info("[swap_encryption] Swap disk deleted: %s", disk_name) - return True - logging.warning( - "[swap_encryption] Swap disk delete attempt %d/4 failed " - "(%s); retrying in 10s", - attempt, - 
derr.strip()[:160], - ) - time.sleep(10) - logging.error( - "[swap_encryption] Could NOT delete swap disk %s after retries " - "— delete it manually: gcloud compute disks delete %s " - "--zone %s --quiet", - disk_name, - disk_name, - zone, + dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' + if dev: + return dev + raise ValueError( + 'No active swap device found in the benchmark pod. ' + 'Use --swap_encryption_device to specify one.' ) - return False -def _detach_and_delete_swap_disk(cluster) -> None: - """Detach and delete the dedicated swap disk created by _attach_swap_disk.""" - zone = None - if getattr(cluster, "zones", None): - zone = cluster.zones[0] - elif getattr(cluster, "region", None): - zone = cluster.region - if not zone or not getattr(cluster, "project", None): - return - _delete_disk_by_name(f"pkb-swap-{cluster.name}", cluster.project, zone) - - -def _delete_default_node_pool(cluster) -> None: - """Delete the dummy default nodepool after the benchmark pool is ready. - - The default nodepool (e2-medium) was only needed to satisfy GKE's - requirement that a cluster must have at least one nodepool at creation time. - Removing it stops the clock on its cost immediately. - """ - # Use PKB's GcloudCommand: auto-injects --project, --zone/--region. 
- cmd = cluster._GcloudCommand( - "container", - "node-pools", - "delete", - _DEFAULT_NODEPOOL, - "--cluster", - cluster.name, +def _build_metadata( + daemonset: _ds_mod.SwapDaemonSet, swap_dev: str +) -> dict[str, Any]: + """Collect node environment, encryption type, and config into a dict.""" + kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True) + mem_out, _ = daemonset.PodExec( + "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True ) - cmd.args.append("--quiet") - - logging.info( - "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL + swap_out, _ = daemonset.PodExec( + "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True ) - _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) - if rc != 0: - logging.warning( - "[swap_encryption] Could not delete default nodepool (rc=%d): %s", - rc, - stderr, - ) - else: - logging.info("[swap_encryption] Default nodepool deleted") - -def _is_pod_gone(pod: str) -> bool: - """Return True if the named pod no longer exists in the cluster. - - Used to distinguish OOM-killed container processes (pod still alive, rc=137) - from OOM-evicted pods (pod gone, DaemonSet will create a replacement). - """ try: - _, err, rc = kubectl.RunKubectlCommand( - [ - "get", - "pod", - pod, - "-n", - _DS_NAMESPACE, - "-o", - "jsonpath={.metadata.name}", - ], - raise_on_failure=False, - timeout=15, - ) - return rc != 0 and "not found" in (err or "").lower() - except Exception: # pylint: disable=broad-except - return False - - -def _pod_exec( - pod: str, - cmd: str, - ignore_failure: bool = False, - timeout: int = 300, - _retries: int = 2, -) -> tuple[str, str]: - """Run a shell command inside the benchmark pod via kubectl exec. - - Args: - pod: Pod name returned by _wait_for_benchmark_pod. - cmd: Shell command string passed to bash -c. - ignore_failure: When True, non-zero exit codes are logged but not - raised. - timeout: Seconds before PKB kills the kubectl exec process. 
Default - 300 s matches PKB's IssueCommand default. Pass a larger value for - long-running jobs (fio, stress-ng, kernel build). - _retries: Number of automatic retries on transient GKE websocket - resets ("connection reset by peer"). Set to 0 to disable retries - for idempotent-sensitive commands. - - Returns: - Tuple of (stdout, stderr) strings. - """ - # Use module-level constants for error strings (defined at top of module). - # Use the globally-tracked active pod name — it may have been updated by - # a previous _recover_pod call when eviction replaced the pod. - active = _active_pod[0] if _active_pod else pod - - for attempt in range(_retries + 1): - out, err, rc = kubectl.RunKubectlCommand( - ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd], - raise_on_failure=False, - raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets - timeout=timeout, - ) - is_transient = rc != 0 and any( - e in err for e in _TRANSIENT_KUBECTL_ERRORS - ) - if is_transient and attempt < _retries: - logging.warning( - "[swap_encryption] kubectl exec connection reset (attempt" - " %d/%d); retrying in 10 s", - attempt + 1, - _retries + 1, - ) - time.sleep(10) - continue - # rc=137 (SIGKILL): the OOM killer terminated the container process. - # Two sub-cases: - # A) Pod eviction: pod is gone, DaemonSet recreates it under a new name. - # B) Container OOM restart: pod still exists, container restarts in place. - # (DaemonSet restartPolicy=Always restarts the container, /tmp is lost, - # tools must be re-installed before subsequent commands can run.) - # In both cases we call _recover_pod to wait for tools + sentinel, and - # we do NOT retry the OOM-triggering command itself. - if rc == 137: - # Record the OOM so the run-level gate can flag it even if the container - # restarts in place under the same pod name (which leaves both the - # "pod replaced" and "pod NotFound" checks silent). 
- if active not in _oom_events: - _oom_events.append(active) - # CRITICAL: sleep before checking pod state. Kubernetes takes a few - # seconds to mark a just-evicted pod as Terminating / NotFound. Without - # this delay _recover_pod sees the pod still in "Running" phase, returns - # the old pod name immediately, and every subsequent command fails with - # "Error from server (NotFound): pods … not found". - logging.warning( - "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to" - " update pod state before recovery check" - ) - time.sleep(15) - pod_gone = _is_pod_gone(active) - if pod_gone: - logging.warning( - "[swap_encryption] OOM-eviction detected (rc=137, pod gone)" - " — recovering pod name for subsequent commands (not" - " retrying this cmd)" - ) - else: - logging.warning( - "[swap_encryption] Container OOM-killed (rc=137, pod still" - " exists) — waiting for container restart and tool" - " re-install before continuing" - ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - "[swap_encryption] Pod name updated: %s → %s", - active, - new_pod, - ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - break # Do NOT retry — the OOM cmd itself is not re-run on the new pod. + mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) + except ValueError: + mem_gb = 0 + try: + swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) + except ValueError: + swap_gb = 0 - is_container_gone = rc != 0 and any( - e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS - ) - if is_container_gone: - # Record the loss for the run-level degradation gate REGARDLESS of retry - # budget or ignore_failure. A "pods … not found" on a best-effort command - # (kernel build, opensearch, cleanup of a dead pod) still means the pod - # died; without this the gate stays blind because _active_pod is only - # renamed on the retry path below, which _retries=0 callers never reach. 
- if active and active not in _pod_lost: - _pod_lost.append(active) - logging.error( - "[swap_encryption] Benchmark pod %s is gone (%s) —" - " recording run as degraded", - active, - (err or "").strip()[:160], - ) - if attempt < _retries: - logging.warning( - "[swap_encryption] Container gone/restarting (attempt" - " %d/%d) — waiting for pod to recover...", - attempt + 1, - _retries + 1, - ) - new_pod = _recover_pod(active) - if new_pod != active: - logging.info( - "[swap_encryption] Pod name updated: %s → %s", - active, - new_pod, - ) - if _active_pod: - _active_pod[0] = new_pod - active = new_pod - continue - break - - if rc != 0 and not ignore_failure: - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] _pod_exec failed (rc={rc}): {err}" + # Encryption type — key off dm-crypt presence + swap target. + enc = 'unknown' + if '/dev/mapper/' in swap_dev: + table_out, _ = daemonset.PodExec( + f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', + ignore_failure=True, ) - return out, err - - -def _recover_pod(pod: str, timeout_sec: int = 600) -> str: - """Wait for a DaemonSet container to recover after OOM kill or eviction. + enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' + elif _SWAP_TYPE.value in ('instance_store', 'io2'): + enc = 'nitro_hardware_offload' + elif not _ENABLE_DMCRYPT.value: + enc = 'none' - Handles two scenarios: - 1. Container OOM restart: same pod name, container restarting in place. - DaemonSet restartPolicy=Always brings it back under the same pod name. - 2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates - a new pod with a DIFFERENT name. We detect this by checking whether - the named pod still exists; if not, we search by the DaemonSet label - selector for a Running pod. + cloud = _detect_cloud(daemonset) - Returns the (possibly new) pod name once it is Running and ready. 
- """ - deadline = time.time() + timeout_sec - logging.info( - "[swap_encryption] Waiting for pod %s to recover (up to %ds)...", - pod, - timeout_sec, - ) - - # Phase 1: wait for a Running pod — either the named one (container - # restart) or a replacement pod found via label selector (eviction). - # - # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a - # single call. When a pod is evicted, Kubernetes first sets deletionTimestamp - # (the pod is "Terminating") while status.phase may still read "Running" for - # several seconds. Checking only status.phase causes a false-positive: we - # return the old pod name immediately and every subsequent command fails with - # "Error from server (NotFound)". Checking deletionTimestamp catches this. - recovered_pod = pod - while time.time() < deadline: - # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not - # stdout. When the pod is gone, status_out is empty and the error text - # lives entirely in status_err. Discarding stderr (using _) means the - # 'not found' check below never fires and we spin until deadline. - status_out, status_err, status_rc = kubectl.RunKubectlCommand( - [ - "get", - "pod", - pod, - "-n", - _DS_NAMESPACE, - "-o", - "jsonpath={.status.phase}|{.metadata.deletionTimestamp}", - ], - raise_on_failure=False, - timeout=30, + instance_label = _INSTANCE_SIZE_LABEL.value + if not instance_label: + gcp_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail' + ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, ) - # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating) - fields = status_out.strip().split("|") - phase = fields[0].strip() if fields else "" - is_terminating = len(fields) > 1 and bool(fields[1].strip()) - - # Pod is genuinely Running and NOT being deleted — recovery complete. 
- if status_rc == 0 and phase == "Running" and not is_terminating: - break - - # Pod no longer exists, OR it exists but is being terminated (Terminating - # state or deletionTimestamp set) — look for a replacement pod by label. - pod_gone_or_terminating = ( - status_rc != 0 and "not found" in (status_out + status_err).lower() - ) or is_terminating - if pod_gone_or_terminating: - label_out, _, label_rc = kubectl.RunKubectlCommand( - [ - "get", - "pods", - "-n", - _DS_NAMESPACE, - "-l", - f"app={_DS_LABEL}", - "-o", - ( - 'jsonpath={range .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}' - ), - ], - raise_on_failure=False, - timeout=30, - ) - new_pods = [ - p.strip() - for p in label_out.strip().splitlines() - if p.strip() and p.strip() != pod - ] # exclude the dying pod - if label_rc == 0 and new_pods: - recovered_pod = new_pods[0] - logging.info( - "[swap_encryption] Original pod %s gone/terminating; " - "found replacement %s", - pod, - recovered_pod, - ) - break - - time.sleep(10) - else: - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] No Running pod found (original: {pod}) " - f"within {timeout_sec}s after OOM kill / eviction" + if gcp_type_out.strip(): + instance_label = gcp_type_out.strip().split('/')[-1] + if not instance_label: + aws_type_out, _ = daemonset.PodExec( + 'curl -s -m 3 --fail ' + 'http://169.254.169.254/latest/meta-data/instance-type ' + '2>/dev/null || echo ""', + ignore_failure=True, ) + instance_label = aws_type_out.strip() - # Phase 2: wait for init script to finish (sentinel written last). 
- while time.time() < deadline: - ready_out, _, ready_rc = kubectl.RunKubectlCommand( - [ - "exec", - recovered_pod, - "-n", - _DS_NAMESPACE, - "--", - "bash", - "-c", - "test -f /tmp/pkb_ready && echo READY", - ], - raise_on_failure=False, - timeout=30, - ) - if ready_rc == 0 and "READY" in ready_out: - logging.info( - "[swap_encryption] Pod %s recovered (swap device active)", - recovered_pod, - ) - return recovered_pod - time.sleep(15) + return { + 'benchmark': BENCHMARK_NAME, + 'execution_mode': 'kubernetes_privileged_pod', + 'cloud': cloud, + 'instance_size': instance_label, + 'kernel_version': kernel_out.strip(), + 'host_memory_gb': mem_gb, + 'swap_device': swap_dev, + 'swap_size_gb': swap_gb, + 'swap_encryption': enc, + 'storage_target': _SWAP_TYPE.value, + 'boot_disk_type': _BOOT_DISK_TYPE.value, + 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, + 'node_image_type': _NODE_IMAGE_TYPE.value, + 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, + 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, + 'zswap_enabled': _ENABLE_ZSWAP.value, + 'min_free_kbytes': _MIN_FREE_KBYTES.value, + 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, + 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, + 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, + 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, + 'nodepool': _NODEPOOL.value, + } - raise errors.VmUtil.IssueCommandError( - f"[swap_encryption] Pod {recovered_pod} did not become ready " - f"within {timeout_sec}s after OOM kill / eviction" + +def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str: + """Detect whether the benchmark pod is running on GCP or AWS.""" + gcp_out, _ = daemonset.PodExec( + 'curl -s -m 2 --fail ' + 'http://metadata.google.internal/computeMetadata/v1/project/project-id' + ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', + ignore_failure=True, ) + if gcp_out.strip(): + return 'GCP' + return 'AWS' def _run_phase1_fio( - pod: str, swap_dev: str, base_meta: dict[str, Any] + daemonset: 
_ds_mod.SwapDaemonSet, + swap_dev: str, + base_meta: dict[str, Any], ) -> list[sample.Sample]: """Run fio microbenchmarks on the raw swap block device (Phase 1). @@ -1639,7 +788,7 @@ def _run_phase1_fio( 4k_lat_read iodepth=1 → completion latency floor (read) Args: - pod: Benchmark pod name. + daemonset: Active SwapDaemonSet resource. swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted. base_meta: Shared metadata dict from _build_metadata(). @@ -1648,51 +797,48 @@ def _run_phase1_fio( """ samples: list[sample.Sample] = [] - # swapoff before fio — running fio with --direct=1 on an active swap - # device races with kernel page-reclaim on the same dm-crypt target - # and can cause kernel panics on some kernels. - logging.info("[swap_encryption] Phase 1: swapoff %s", swap_dev) - _pod_exec( - pod, - f"swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true", + # swapoff before fio — running fio with --direct=1 on an active swap device + # races with kernel page-reclaim on the same dm-crypt target. 
+ logging.info('[swap_encryption] Phase 1: swapoff %s', swap_dev) + daemonset.PodExec( + f'swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true', timeout=30, ignore_failure=True, ) # (name, rw_mode, block_size, iodepth) fio_jobs = [ - ("4k_randread", "randread", "4k", 32), - ("4k_randwrite", "randwrite", "4k", 32), - ("1m_seqread", "read", "1m", 8), - ("1m_seqwrite", "write", "1m", 8), - ("4k_lat_read", "randread", "4k", 1), + ('4k_randread', 'randread', '4k', 32), + ('4k_randwrite', 'randwrite', '4k', 32), + ('1m_seqread', 'read', '1m', 8), + ('1m_seqwrite', 'write', '1m', 8), + ('4k_lat_read', 'randread', '4k', 1), ] runtime = _FIO_RUNTIME_SEC.value try: for name, rw, bs, iodepth in fio_jobs: cmd = ( - f"fio --name={name} --filename={swap_dev}" - f" --rw={rw} --bs={bs} --iodepth={iodepth}" - " --ioengine=libaio --direct=1" - f" --runtime={runtime} --time_based --group_reporting" - " --output-format=json 2>/dev/null" + f'fio --name={name} --filename={swap_dev}' + f' --rw={rw} --bs={bs} --iodepth={iodepth}' + ' --ioengine=libaio --direct=1' + f' --runtime={runtime} --time_based --group_reporting' + ' --output-format=json 2>/dev/null' ) - logging.info("[swap_encryption] Phase 1: fio job %s", name) - out, _ = _pod_exec(pod, cmd, timeout=runtime + 120) + logging.info('[swap_encryption] Phase 1: fio job %s', name) + out, _ = daemonset.PodExec(cmd, timeout=runtime + 120) samples += _parse_fio_json(out, name, base_meta) finally: # Always re-enable swap so subsequent phases can drive swap I/O. 
- logging.info("[swap_encryption] Phase 1: swapon %s", swap_dev) - _pod_exec( - pod, - f"swapon {swap_dev} 2>/dev/null || true", + logging.info('[swap_encryption] Phase 1: swapon %s', swap_dev) + daemonset.PodExec( + f'swapon {swap_dev} 2>/dev/null || true', timeout=30, ignore_failure=True, ) logging.info( - "[swap_encryption] Phase 1 complete (%d samples)", len(samples) + '[swap_encryption] Phase 1 complete (%d samples)', len(samples) ) return samples @@ -1714,10 +860,10 @@ def _parse_fio_json( List of Sample objects; empty if output cannot be parsed or is zero. """ # fio sometimes emits kernel warnings before the JSON object. - json_start = fio_output.find("{") + json_start = fio_output.find('{') if json_start == -1: logging.warning( - "[swap_encryption] Phase 1: no JSON in fio output for %s", job_name + '[swap_encryption] Phase 1: no JSON in fio output for %s', job_name ) return [] @@ -1725,13 +871,13 @@ def _parse_fio_json( data = json.loads(fio_output[json_start:]) except json.JSONDecodeError as e: logging.warning( - "[swap_encryption] Phase 1: fio JSON parse error (%s): %s", + '[swap_encryption] Phase 1: fio JSON parse error (%s): %s', job_name, e, ) return [] - jobs = data.get("jobs", []) + jobs = data.get('jobs', []) if not jobs: return [] @@ -1739,43 +885,43 @@ def _parse_fio_json( samples: list[sample.Sample] = [] meta = dict(base_meta, fio_job=job_name) - for direction in ("read", "write"): + for direction in ('read', 'write'): d = job.get(direction, {}) - iops = float(d.get("iops", 0)) - bw_kbps = float(d.get("bw", 0)) # fio reports KiB/s + iops = float(d.get('iops', 0)) + bw_kbps = float(d.get('bw', 0)) # fio reports KiB/s bw_mbps = bw_kbps / 1024.0 - # Skip directions with near-zero throughput (e.g. write on a randread job). + # Skip directions with near-zero throughput. 
if iops < 1 and bw_kbps < 1: continue - prefix = f"phase1_fio_{job_name}_{direction}" - samples.append(sample.Sample(f"{prefix}_iops", iops, "IOPS", meta)) + prefix = f'phase1_fio_{job_name}_{direction}' + samples.append(sample.Sample(f'{prefix}_iops', iops, 'IOPS', meta)) samples.append( - sample.Sample(f"{prefix}_bw_mbps", bw_mbps, "MB/s", meta) + sample.Sample(f'{prefix}_bw_mbps', bw_mbps, 'MB/s', meta) ) # Completion latency — fio reports nanoseconds; emit microseconds. - clat = d.get("clat_ns", d.get("lat_ns", {})) - lat_mean_ns = float(clat.get("mean", 0)) + clat = d.get('clat_ns', d.get('lat_ns', {})) + lat_mean_ns = float(clat.get('mean', 0)) if lat_mean_ns > 0: samples.append( sample.Sample( - f"{prefix}_lat_mean_us", lat_mean_ns / 1000.0, "us", meta + f'{prefix}_lat_mean_us', lat_mean_ns / 1000.0, 'us', meta ) ) for pct_key, label in ( - ("50.000000", "p50"), - ("99.000000", "p99"), - ("99.900000", "p999"), + ('50.000000', 'p50'), + ('99.000000', 'p99'), + ('99.900000', 'p999'), ): - val_ns = clat.get("percentile", {}).get(pct_key, 0) + val_ns = clat.get('percentile', {}).get(pct_key, 0) if val_ns: samples.append( sample.Sample( - f"{prefix}_lat_{label}_us", + f'{prefix}_lat_{label}_us', val_ns / 1000.0, - "us", + 'us', meta, ) ) @@ -1785,258 +931,73 @@ def _parse_fio_json( _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { # GCP (on-demand, us-central1 unless noted) - "c4-standard-8-lssd": 0.5888, # 8 vCPU, 32 GB RAM + 1×375 GB LSSD - "c4-standard-8": 0.5008, # 8 vCPU, 32 GB RAM, no LSSD - "n4-highmem-32": 3.0256, # 32 vCPU, 256 GB RAM - "n2-highmem-32": 2.5216, # 32 vCPU, 256 GB RAM - "n2-standard-32": 1.5264, # 32 vCPU, 120 GB RAM - "z3-highmem-8": 2.7248, # 8 vCPU + 4× LSSD + 'c4-standard-8-lssd': 0.5888, + 'c4-standard-8': 0.5008, + 'n4-highmem-32': 3.0256, + 'n2-highmem-32': 2.5216, + 'n2-standard-32': 1.5264, + 'z3-highmem-8': 2.7248, # AWS - "i4i.4xlarge": 1.4960, # 16 vCPU, 128 GB RAM, NVMe Instance Store - "i4i.2xlarge": 0.7480, - "m6id.4xlarge": 
def _collect_cost_sample(
    daemonset: _ds_mod.SwapDaemonSet,
    elapsed_sec: float,
    base_meta: dict,
) -> list[sample.Sample]:
    """Emit a cost_estimate_usd sample for the benchmark run.

    The instance type is resolved in this order, first non-empty value wins:
      1. The explicit _INSTANCE_SIZE_LABEL flag value.
      2. The GCP metadata server (last path segment of machine-type).
      3. The AWS instance metadata endpoint (instance-type).
      4. The --swap_encryption_benchmark_machine_type flag value.

    The explicit override is checked first so that no in-pod command is issued
    when the answer is already known; the metadata probes are only needed when
    no override was supplied.

    Args:
      daemonset: Benchmark DaemonSet used to run the metadata probes in-pod.
      elapsed_sec: Wall-clock seconds the benchmark phases took.
      base_meta: Shared metadata dict; copied, never mutated.

    Returns:
      A single-element list holding the cost sample, or an empty list when the
      resolved instance type has no entry in _INSTANCE_PRICE_USD_PER_HR.
    """
    instance_type = _INSTANCE_SIZE_LABEL.value or ''

    if not instance_type:
        # GCP returns 'projects/<n>/machineTypes/<type>'; keep the last part.
        gcp_type_out, _ = daemonset.PodExec(
            'curl -s -m 3 --fail'
            ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
            ignore_failure=True,
        )
        if gcp_type_out.strip():
            instance_type = gcp_type_out.strip().split('/')[-1]

    if not instance_type:
        # AWS returns the instance type as a plain string.
        aws_type_out, _ = daemonset.PodExec(
            'curl -s -m 3 --fail '
            'http://169.254.169.254/latest/meta-data/instance-type '
            '2>/dev/null || echo ""',
            ignore_failure=True,
        )
        instance_type = aws_type_out.strip()

    if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
        instance_type = _BENCHMARK_MACHINE_TYPE.value
        logging.info(
            '[swap_encryption] Instance type from metadata unavailable; using'
            ' --swap_encryption_benchmark_machine_type=%s for cost tracking',
            instance_type,
        )

    price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
    if price is None:
        logging.warning(
            '[swap_encryption] Unknown instance type "%s" — skipping cost'
            ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost'
            ' tracking.',
            instance_type,
        )
        return []

    hours = elapsed_sec / 3600.0
    meta = dict(
        base_meta,
        instance_type=instance_type,
        price_usd_per_hr=price,
        benchmark_elapsed_sec=round(elapsed_sec, 1),
    )
    return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)]
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SwapDaemonSet: PKB BaseResource for the swap-encryption privileged DaemonSet. + +Manages the full lifecycle of the privileged benchmark pod used by the +swap_encryption benchmark: + + _Create() — apply the Jinja2 manifest via kubernetes_commands.ApplyManifest + and wait for the pod to reach Running + /tmp/pkb_ready. + _Delete() — run in-pod cleanup (swapoff, dmsetup remove, losetup teardown, + pkill fio/stress-ng) then kubectl delete daemonset. + PodExec() — kubectl exec wrapper with transient-reset retry, OOM-kill (rc=137) + detection, and automatic RecoverPod() after eviction or container + restart. + WaitForPod() — polls for Running phase + sentinel; updates self.pod_name. + RecoverPod() — waits for DaemonSet to recreate / restart the container, + checking deletionTimestamp to avoid false-positive Running state. + +Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern +(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses, +not in benchmark files. +""" + +import logging +import textwrap +import time +from typing import Optional + +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker.resources.container_service import kubectl +from perfkitbenchmarker.resources.container_service import kubernetes_commands + +# Transient kubectl errors that are safe to retry automatically. +_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close') + +# Errors indicating the container / pod is gone and needs full recovery. 
_CONTAINER_GONE_KUBECTL_ERRORS = (
    'container not found',
    'procready not received',
    'unable to upgrade connection',
    'not found',
    'deleted state',
)

# The bare 'not found' marker above also matches ordinary command stderr
# (e.g. "bash: fio: command not found"), so on its own it is only a hint and
# must be confirmed against the API server before a pod is declared lost.
_GENERIC_NOT_FOUND_MARKER = 'not found'


class SwapDaemonSet(resource.BaseResource):
    """PKB resource for the swap-encryption benchmark privileged DaemonSet.

    The DaemonSet runs a single privileged pod on the benchmark nodepool.
    It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli),
    verifies the swap device is active, then writes /tmp/pkb_ready.  All
    benchmark phases execute commands inside this pod via PodExec().

    Attributes:
      name: DaemonSet metadata.name (e.g. 'pkb-swap-benchmark').
      namespace: Kubernetes namespace (typically 'default').
      label: Pod label value for app= selector.
      nodepool: pkb_nodepool label value pinning the DaemonSet to the
        benchmark node.
      image: Container image (e.g. 'ubuntu:22.04').
      pod_name: Name of the currently active pod; updated by WaitForPod /
        RecoverPod on eviction.
      oom_events: Pod names that triggered rc=137 OOM-kill; read by Run()
        for the degradation gate.
      pod_lost: Pod names that went NotFound during PodExec; read by Run()
        for the degradation gate.
    """

    RESOURCE_TYPE = 'SwapDaemonSet'
    REQUIRED_ATTRS = []

    def __init__(
        self,
        name: str,
        namespace: str,
        label: str,
        nodepool: str,
        image: str,
    ) -> None:
        """Store the manifest parameters; no cluster call is made here."""
        super().__init__()
        self.name = name
        self.namespace = namespace
        self.label = label
        self.nodepool = nodepool
        self.image = image
        # Active pod tracking — updated by WaitForPod / RecoverPod.
        self.pod_name: Optional[str] = None
        # Per-run accumulators read by Run() for the degradation gate.
        self.oom_events: list[str] = []
        self.pod_lost: list[str] = []

    # ── PKB lifecycle ─────────────────────────────────────────────────────

    def _Create(self) -> None:
        """Apply the DaemonSet manifest and wait for the pod to be ready.

        Raises:
          errors.Benchmarks.PrepareException: WaitForPod timed out.
        """
        kubernetes_commands.ApplyManifest(
            'cluster/swap_encryption_daemonset.yaml.j2',
            ds_name=self.name,
            ds_namespace=self.namespace,
            ds_label=self.label,
            benchmark_nodepool=self.nodepool,
            image=self.image,
        )
        logging.info('[swap_encryption] Swap-infra DaemonSet applied')
        if self.WaitForPod() is None:
            raise errors.Benchmarks.PrepareException(
                '[swap_encryption] DaemonSet pod did not become ready within'
                ' timeout'
            )

    def _Delete(self) -> None:
        """Run in-pod teardown then delete the DaemonSet.

        Runs swapoff, dmsetup remove, losetup cleanup, and pkill inside the
        pod (best-effort, ignore_failure=True) before deleting the DaemonSet.
        A failure of the in-pod cleanup is logged and never prevents the
        DaemonSet itself from being deleted.
        """
        # Try to get the pod name quickly if not set.
        if self.pod_name is None:
            self.WaitForPod(timeout=30)

        cleanup_scripts = (
            'swapoff -a 2>/dev/null || true',
            textwrap.dedent("""\
                swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
                dmsetup remove --noudevrules --noudevsync \
                    swap_encrypted 2>/dev/null || true
                """),
            textwrap.dedent("""\
                for backing in \
                    /var/pkb_swap_backing \
                    /run/pkb_swap_backing \
                    /mnt/stateful_partition/pkb_swap_backing
                do
                    losetup -j "$backing" 2>/dev/null \
                        | awk -F: '{print $1}' \
                        | while read dev
                    do losetup -d "$dev" 2>/dev/null || true; done
                    rm -f "$backing"
                done
                """),
            "pkill -9 'stress-ng|fio' 2>/dev/null || true",
        )
        if self.pod_name:
            try:
                for script in cleanup_scripts:
                    self.PodExec(script, ignore_failure=True, _retries=0)
            except errors.VmUtil.IssueCommandError as e:
                # PodExec can still raise through RecoverPod (rc=137 path)
                # even with ignore_failure=True.  Teardown is best-effort:
                # fall through so the DaemonSet is deleted regardless.
                logging.warning(
                    '[swap_encryption] In-pod cleanup aborted: %s', e
                )

        kubectl.RunKubectlCommand(
            [
                'delete',
                'daemonset',
                self.name,
                '-n',
                self.namespace,
                '--ignore-not-found',
            ],
            raise_on_failure=False,
        )
        logging.info('[swap_encryption] DaemonSet deleted')

    # ── Pod lifecycle helpers ─────────────────────────────────────────────

    def WaitForPod(self, timeout: int = 600) -> Optional[str]:
        """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists.

        Two-phase poll, repeated every 15 s until the deadline:
          1. Wait for status.phase == Running.
          2. kubectl exec test -f /tmp/pkb_ready.

        If the sentinel exec reports that the container is not running, the
        poll drops back to step 1 on the next iteration.

        Args:
          timeout: Maximum seconds to wait.

        Returns:
          Pod name on success; None on timeout.  Also updates self.pod_name.
        """
        deadline = time.time() + timeout
        last_phase = ''
        ready_pod = None

        while time.time() < deadline:
            # Step 1: wait for Running phase.
            if ready_pod is None:
                out, _, rc = kubectl.RunKubectlCommand(
                    [
                        'get',
                        'pods',
                        '-l',
                        f'app={self.label}',
                        '-n',
                        self.namespace,
                        '-o',
                        (
                            r'jsonpath={range .items[*]}'
                            r'{.metadata.name}{"\t"}'
                            r'{.status.phase}{"\n"}{end}'
                        ),
                    ],
                    raise_on_failure=False,
                )
                if rc == 0 and out.strip():
                    for line in out.strip().splitlines():
                        parts = line.split('\t')
                        if len(parts) != 2:
                            continue
                        pod_name = parts[0].strip()
                        phase = parts[1].strip()
                        if phase == 'Running':
                            logging.info(
                                '[swap_encryption] Pod %s is Running'
                                ' — waiting for sentinel...',
                                pod_name,
                            )
                            ready_pod = pod_name
                            break
                        # Only log (and dump events) on a phase change to
                        # keep the poll loop from flooding the log.
                        if phase != last_phase:
                            logging.info(
                                '[swap_encryption] Pod %s phase: %s',
                                pod_name,
                                phase,
                            )
                            last_phase = phase
                            if phase == 'Pending':
                                self._LogPodEvents(pod_name)
                else:
                    logging.info(
                        '[swap_encryption] Waiting for DaemonSet pod to'
                        ' appear...'
                    )

            # Step 2: poll for /tmp/pkb_ready sentinel.
            if ready_pod is not None:
                _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
                    [
                        'exec',
                        ready_pod,
                        '-n',
                        self.namespace,
                        '--',
                        'test',
                        '-f',
                        '/tmp/pkb_ready',
                    ],
                    raise_on_failure=False,
                )
                if sentinel_rc == 0:
                    logging.info(
                        '[swap_encryption] Pod %s ready (swap device active)',
                        ready_pod,
                    )
                    self.pod_name = ready_pod
                    return ready_pod
                # Container crashed (CrashLoopBackOff / exited) — reset and
                # re-check pod phase on the next iteration.
                if 'container not found' in sentinel_err or (
                    'unable to upgrade connection' in sentinel_err
                ):
                    logging.warning(
                        '[swap_encryption] Pod %s: container not running'
                        ' (%s) — will re-check pod state',
                        ready_pod,
                        sentinel_err.strip(),
                    )
                    ready_pod = None
                    last_phase = ''
                else:
                    logging.info(
                        '[swap_encryption] Pod %s: still installing tools...',
                        ready_pod,
                    )

            time.sleep(15)

        logging.warning(
            '[swap_encryption] Benchmark pod not ready after %ds', timeout
        )
        return None

    def _LogPodEvents(self, pod_name: str) -> None:
        """Dump recent Kubernetes events for a pod to help diagnose hangs.

        Logs the first 30 lines of the 'Events:' section of kubectl describe;
        if that section is absent, logs the last 2000 characters of the raw
        describe output instead.
        """
        events_out, _, _ = kubectl.RunKubectlCommand(
            ['describe', 'pod', pod_name, '-n', self.namespace],
            raise_on_failure=False,
        )
        lines = []
        in_events = False
        for line in events_out.splitlines():
            if line.startswith('Events:'):
                in_events = True
            if in_events:
                lines.append(line)
        if lines:
            logging.info(
                '[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])
            )
        else:
            logging.info(
                '[swap_encryption] kubectl describe output:\n%s',
                events_out[-2000:],
            )

    def _IsPodGone(self, pod: str) -> bool:
        """Return True if the named pod no longer exists in the cluster.

        Any failure to query the API server is treated as "not gone" so that a
        flaky control plane cannot by itself mark a pod as lost.
        """
        try:
            _, err, rc = kubectl.RunKubectlCommand(
                [
                    'get',
                    'pod',
                    pod,
                    '-n',
                    self.namespace,
                    '-o',
                    'jsonpath={.metadata.name}',
                ],
                raise_on_failure=False,
                timeout=15,
            )
            return rc != 0 and 'not found' in (err or '').lower()
        except Exception:  # pylint: disable=broad-except
            return False

    def _IsContainerGoneError(self, err_lower: str, pod: str) -> bool:
        """Classify lower-cased kubectl stderr as 'container/pod is gone'.

        Specific markers are trusted as-is.  The generic 'not found' marker is
        only accepted when the API server confirms the pod no longer exists,
        because the same text appears in ordinary command failures.
        """
        for marker in _CONTAINER_GONE_KUBECTL_ERRORS:
            if marker != _GENERIC_NOT_FOUND_MARKER and marker in err_lower:
                return True
        if _GENERIC_NOT_FOUND_MARKER in err_lower:
            return self._IsPodGone(pod)
        return False

    def _AdoptRecoveredPod(self, lost_pod: str) -> str:
        """Run RecoverPod for lost_pod and make the result the active pod."""
        new_pod = self.RecoverPod(lost_pod)
        if new_pod != lost_pod:
            logging.info(
                '[swap_encryption] Pod name updated: %s → %s',
                lost_pod,
                new_pod,
            )
        self.pod_name = new_pod
        return new_pod

    def PodExec(
        self,
        cmd: str,
        ignore_failure: bool = False,
        timeout: int = 300,
        _retries: int = 2,
    ) -> tuple[str, str]:
        """Run a shell command inside the benchmark pod via kubectl exec.

        Handles:
          - Transient websocket resets: automatic retry (up to _retries).
          - OOM kill (rc=137): records to self.oom_events, calls RecoverPod,
            does NOT retry the OOM-triggering command itself.
          - Container/pod gone: records to self.pod_lost, calls RecoverPod,
            retries the command on the recovered pod.

        Uses self.pod_name as the active pod; RecoverPod updates it on
        eviction.

        Args:
          cmd: Shell command string passed to bash -c.
          ignore_failure: When True, non-zero exit codes are not raised.
          timeout: Seconds before PKB kills the kubectl exec process.  Pass a
            larger value for long-running jobs (fio, stress-ng, kernel build).
          _retries: Max automatic retries on transient resets / pod loss.

        Returns:
          Tuple of (stdout, stderr) strings from the last attempt.

        Raises:
          errors.VmUtil.IssueCommandError: the command failed and
            ignore_failure is False, no active pod is known, or RecoverPod
            timed out (the latter regardless of ignore_failure).
        """
        active = self.pod_name
        if not active:
            # Without this guard None would be placed in the kubectl argv.
            message = '[swap_encryption] PodExec called with no active pod'
            if ignore_failure:
                logging.warning(message)
                return '', message
            raise errors.VmUtil.IssueCommandError(message)

        max_retries = max(_retries, 0)
        out, err, rc = '', '', 0
        for attempt in range(max_retries + 1):
            out, err, rc = kubectl.RunKubectlCommand(
                [
                    'exec',
                    active,
                    '-n',
                    self.namespace,
                    '--',
                    'bash',
                    '-c',
                    cmd,
                ],
                raise_on_failure=False,
                raise_on_timeout=False,
                timeout=timeout,
            )
            if rc == 0:
                break  # Success: never re-run the command.

            err_lower = (err or '').lower()

            # Retry transient websocket resets.
            is_transient = any(
                marker in err_lower for marker in _TRANSIENT_KUBECTL_ERRORS
            )
            if is_transient and attempt < max_retries:
                logging.warning(
                    '[swap_encryption] kubectl exec connection reset (attempt'
                    ' %d/%d); retrying in 10 s',
                    attempt + 1,
                    max_retries + 1,
                )
                time.sleep(10)
                continue

            # rc=137 (SIGKILL): OOM killer terminated the container process.
            # Do NOT retry — log, recover, and stop so the caller can decide.
            if rc == 137:
                if active not in self.oom_events:
                    self.oom_events.append(active)
                # Kubernetes takes a few seconds to update pod state after
                # eviction — sleep first to avoid a false-positive Running.
                logging.warning(
                    '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes'
                    ' to update pod state before recovery check'
                )
                time.sleep(15)
                if self._IsPodGone(active):
                    logging.warning(
                        '[swap_encryption] OOM-eviction detected (rc=137, pod'
                        ' gone) — recovering pod name for subsequent commands'
                    )
                else:
                    logging.warning(
                        '[swap_encryption] Container OOM-killed (rc=137, pod'
                        ' still exists) — waiting for container restart'
                    )
                active = self._AdoptRecoveredPod(active)
                break  # OOM cmd is never re-run on the recovered pod.

            # Container or pod gone: record loss, recover, retry the command.
            if self._IsContainerGoneError(err_lower, active):
                if active not in self.pod_lost:
                    self.pod_lost.append(active)
                    logging.error(
                        '[swap_encryption] Benchmark pod %s is gone (%s) —'
                        ' recording run as degraded',
                        active,
                        (err or '').strip()[:160],
                    )
                if attempt < max_retries:
                    logging.warning(
                        '[swap_encryption] Container gone/restarting (attempt'
                        ' %d/%d) — waiting for pod to recover...',
                        attempt + 1,
                        max_retries + 1,
                    )
                    active = self._AdoptRecoveredPod(active)
                    continue

            # Ordinary command failure, or retries exhausted.
            break

        if rc != 0 and not ignore_failure:
            raise errors.VmUtil.IssueCommandError(
                f'[swap_encryption] PodExec failed (rc={rc}): {err}'
            )
        return out, err

    def _FindReplacementPod(self, original_pod: str) -> Optional[str]:
        """Return a Running pod of this DaemonSet other than original_pod."""
        label_out, _, label_rc = kubectl.RunKubectlCommand(
            [
                'get',
                'pods',
                '-n',
                self.namespace,
                '-l',
                f'app={self.label}',
                '-o',
                (
                    'jsonpath={range'
                    ' .items[?(@.status.phase=="Running")]}'
                    '{.metadata.name}{"\\n"}{end}'
                ),
            ],
            raise_on_failure=False,
            timeout=30,
        )
        if label_rc != 0:
            return None
        for candidate in label_out.strip().splitlines():
            candidate = candidate.strip()
            if candidate and candidate != original_pod:
                return candidate
        return None

    def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str:
        """Wait for the DaemonSet to recover after OOM kill or eviction.

        Handles two scenarios:
          1. Container OOM restart: same pod name, container restarting in
             place.
          2. Pod eviction/deletion: pod is gone; the DaemonSet creates a new
             pod with a DIFFERENT name.

        Checks metadata.deletionTimestamp in addition to status.phase to
        catch the Terminating state where phase may still read Running.

        Args:
          pod: Original pod name to monitor.
          timeout_sec: Maximum seconds to wait for recovery (both phases
            share the same deadline).

        Returns:
          The (possibly new) pod name once Running and /tmp/pkb_ready is
          present.  self.pod_name is updated to the same value.

        Raises:
          errors.VmUtil.IssueCommandError: no Running pod was found, or the
            pod did not write the sentinel, before the deadline.
        """
        deadline = time.time() + timeout_sec
        logging.info(
            '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
            pod,
            timeout_sec,
        )

        # Phase 1: find a Running pod that is NOT being terminated.
        recovered_pod = pod
        while time.time() < deadline:
            # Query both phase and deletionTimestamp in a single call.
            status_out, status_err, status_rc = kubectl.RunKubectlCommand(
                [
                    'get',
                    'pod',
                    pod,
                    '-n',
                    self.namespace,
                    '-o',
                    'jsonpath={.status.phase}|{.metadata.deletionTimestamp}',
                ],
                raise_on_failure=False,
                timeout=30,
            )
            fields = status_out.strip().split('|')
            phase = fields[0].strip() if fields else ''
            is_terminating = len(fields) > 1 and bool(fields[1].strip())

            # Genuine Running (not being deleted) — move to Phase 2.
            if status_rc == 0 and phase == 'Running' and not is_terminating:
                break

            # Pod gone or Terminating — look for a replacement by label.
            pod_is_gone = (
                status_rc != 0
                and 'not found' in (status_out + status_err).lower()
            )
            if pod_is_gone or is_terminating:
                replacement = self._FindReplacementPod(pod)
                if replacement:
                    recovered_pod = replacement
                    logging.info(
                        '[swap_encryption] Original pod %s gone/terminating;'
                        ' found replacement %s',
                        pod,
                        recovered_pod,
                    )
                    break

            time.sleep(10)
        else:
            # The loop ran out of time without hitting either break.
            raise errors.VmUtil.IssueCommandError(
                f'[swap_encryption] No Running pod found (original: {pod})'
                f' within {timeout_sec}s after OOM kill / eviction'
            )

        # Phase 2: wait for init script to finish (sentinel written last).
        while time.time() < deadline:
            ready_out, _, ready_rc = kubectl.RunKubectlCommand(
                [
                    'exec',
                    recovered_pod,
                    '-n',
                    self.namespace,
                    '--',
                    'bash',
                    '-c',
                    'test -f /tmp/pkb_ready && echo READY',
                ],
                raise_on_failure=False,
                timeout=30,
            )
            if ready_rc == 0 and 'READY' in ready_out:
                logging.info(
                    '[swap_encryption] Pod %s recovered (swap device active)',
                    recovered_pod,
                )
                self.pod_name = recovered_pod
                return recovered_pod
            time.sleep(15)

        raise errors.VmUtil.IssueCommandError(
            f'[swap_encryption] Pod {recovered_pod} did not become ready'
            f' within {timeout_sec}s after OOM kill / eviction'
        )
All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SwapNodePool: PKB BaseResource for the swap-encryption benchmark nodepool. + +Manages the lifecycle of: + + GKE nodepool — gcloud container node-pools create with UBUNTU_CONTAINERD, + linuxConfig.swapConfig + sysctl via --system-config-from-file. + For LSSD machines: --local-nvme-ssd-block and + dedicatedLocalSsdProfile in the swap YAML. + For hyperdisk configs: boot-disk-provisioned-iops/throughput. + + Swap disk — Optional dedicated hyperdisk attached post-nodepool creation + (for dm-crypt measurement on machines where the boot disk + cannot be used as a swap device directly). + + Default pool — DeleteDefaultPool() removes the dummy e2-medium pool created + at cluster time once the DaemonSet pod is Running. + +Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern +(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses. +""" + +import logging +import os +import tempfile +import time + +from perfkitbenchmarker import errors +from perfkitbenchmarker import resource +from perfkitbenchmarker.providers.gcp import util as gcp_util +from perfkitbenchmarker.resources.container_service import kubectl + +# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s. 
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256 + +_BENCHMARK_NODEPOOL = 'benchmark' +_DEFAULT_NODEPOOL = 'default-pool' + + +def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: + """Return a throughput (MiB/s) satisfying GCP Hyperdisk Balanced constraints. + + Clamps throughput UP to the minimum required by the requested IOPS so that + a mismatched flag pair cannot abort nodepool / disk creation with: + "Requested provisioned throughput is too low for the provisioned iops". + """ + min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) + if throughput < min_tput: + logging.warning( + '[swap_encryption] boot/swap disk throughput %d MiB/s is too low' + ' for %d IOPS; clamping to minimum %d MiB/s', + throughput, + iops, + min_tput, + ) + return min_tput + return throughput + + +class _GcpZonalResource: + """Minimal resource shim for gcp_util.GcloudCommand on compute operations. + + gcp_util.GcloudCommand auto-injects --project and --zone from the resource + object. GkeCluster._GcloudCommand() switches --zone → --region for + multi-zone clusters, which is wrong for gcloud compute commands (--region + creates regional resources). This shim pins a single zone so all + gcloud compute calls target the correct AZ. + """ + + def __init__(self, project: str, zone: str) -> None: + self.project = project + self.zone = zone + + +class SwapNodePool(resource.BaseResource): + """PKB resource for the swap-encryption benchmark GKE nodepool and disk. + + _Create() runs the full setup sequence: + 1. gcloud container node-pools create with linuxConfig.swapConfig. + 2. Wait for the node to become Ready. + 3. (Optional) Create and attach a dedicated swap disk. + + _Delete() tears down in reverse: + 1. (Optional) Detach and delete the swap disk. + 2. gcloud container node-pools delete. 
+ + DeleteDefaultPool() is a separate step called from Prepare() AFTER the + DaemonSet pod is Running, since deleting the default pool while the + benchmark node is still joining can trigger a brief API-server timeout. + + Attributes: + cluster: PKB GkeCluster (or subclass) object; provides _GcloudCommand, + name, project, zones/region. + machine_type: GKE machine type (e.g. 'n4-highmem-32'). + node_image_type: GKE image type (e.g. 'UBUNTU_CONTAINERD'). + disk_type: Boot disk type (e.g. 'hyperdisk-balanced' or 'pd-ssd'). + disk_size_gb: Boot disk size in GiB (500 for hyperdisk, 100 for LSSD). + disk_iops: Provisioned IOPS (hyperdisk-balanced only). + disk_throughput: Provisioned throughput MiB/s (hyperdisk-balanced only). + lssd: True if the machine type uses local NVMe SSDs. Auto-detected from + machine_type name when False. + lssd_count: Number of local NVMe SSDs (--local-nvme-ssd-block count=N). + add_swap_disk: True to create+attach a dedicated second disk for swap. + swap_disk_size_gb: Size of the dedicated swap disk in GiB. + """ + + RESOURCE_TYPE = 'SwapNodePool' + REQUIRED_ATTRS = [] + + def __init__( + self, + cluster, + machine_type: str, + node_image_type: str, + disk_type: str, + disk_size_gb: int, + disk_iops: int, + disk_throughput: int, + lssd: bool, + lssd_count: int, + add_swap_disk: bool, + swap_disk_size_gb: int, + ) -> None: + super().__init__() + self.cluster = cluster + self.machine_type = machine_type + self.node_image_type = node_image_type + self.disk_type = disk_type + self.disk_size_gb = disk_size_gb + self.disk_iops = disk_iops + self.disk_throughput = disk_throughput + # Auto-detect LSSD from machine type name; explicit flag overrides. 
+ self.lssd = lssd or 'lssd' in machine_type.lower() + self.lssd_count = lssd_count + self.add_swap_disk = add_swap_disk + self.swap_disk_size_gb = swap_disk_size_gb + + # ── PKB lifecycle ───────────────────────────────────────────────────────── + + def _Create(self) -> None: + """Create the benchmark nodepool, wait for node, optionally attach disk.""" + self._CreateNodePool() + self._WaitForNode() + if self.add_swap_disk: + self._AttachDisk() + + def _Delete(self) -> None: + """Detach+delete the swap disk (if any) then delete the nodepool.""" + if self.add_swap_disk: + self._DetachAndDeleteDisk() + self._DeleteNodePool() + + # ── Nodepool helpers ────────────────────────────────────────────────────── + + def _CreateNodePool(self) -> None: + """gcloud container node-pools create with linuxConfig.swapConfig YAML. + + Per Ajay review comment r3472513706: + linuxConfig.swapConfig automatically enables + kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set + kubeletConfig explicitly. + For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE + to use local NVMe as the swap device. + Per Ajay review comment r3472549985: + UBUNTU_CONTAINERD is required for dm-crypt measurement. + """ + is_lssd = self.lssd + # LSSD configs use a small boot disk (OS only; swap is on local NVMe). + disk_size_gb = 100 if is_lssd else self.disk_size_gb + + cmd = self.cluster._GcloudCommand( + 'container', + 'node-pools', + 'create', + _BENCHMARK_NODEPOOL, + '--cluster', + self.cluster.name, + ) + cmd.flags['machine-type'] = self.machine_type + cmd.flags['image-type'] = self.node_image_type + cmd.flags['disk-type'] = self.disk_type + cmd.flags['disk-size'] = disk_size_gb + cmd.flags['num-nodes'] = 1 + cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}' + cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair'] + + # IOPS / throughput only for hyperdisk non-LSSD configs. 
+ if self.disk_type.startswith('hyperdisk') and not is_lssd: + cmd.flags['boot-disk-provisioned-iops'] = self.disk_iops + cmd.flags['boot-disk-provisioned-throughput'] = ( + _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput) + ) + + # Expose local NVMe as raw block devices for fio/mdadm direct access. + if is_lssd: + cmd.flags['local-nvme-ssd-block'] = f'count={self.lssd_count}' + + # Build linuxConfig YAML for --system-config-from-file. + if is_lssd: + swap_config_block = ( + ' swapConfig:\n' + ' enabled: true\n' + ' dedicatedLocalSsdProfile:\n' + f' diskCount: {self.lssd_count}\n' + ) + else: + swap_config_block = ' swapConfig:\n enabled: true\n' + swap_config_yaml = ( + 'linuxConfig:\n' + + swap_config_block + + ' sysctl:\n' + ' vm.min_free_kbytes: 200\n' + ' vm.watermark_scale_factor: 500\n' + ' vm.swappiness: 100\n' + ) + + system_config_tmp = None + try: + system_config_tmp = tempfile.NamedTemporaryFile( + mode='w', suffix='.yaml', delete=False + ) + system_config_tmp.write(swap_config_yaml) + system_config_tmp.flush() + cmd.flags['system-config-from-file'] = system_config_tmp.name + logging.info( + '[swap_encryption] system-config-from-file: lssd=%s' + ' (written to %s):\n%s', + is_lssd, + system_config_tmp.name, + swap_config_yaml, + ) + logging.info( + '[swap_encryption] Creating benchmark nodepool: %s / %s /' + ' image=%s / disk=%dGiB / iops=%d / lssd=%s /' + ' add_swap_disk=%s', + _BENCHMARK_NODEPOOL, + self.machine_type, + self.node_image_type, + disk_size_gb, + self.disk_iops, + is_lssd, + self.add_swap_disk, + ) + # LSSD nodepools take longer to provision (NVMe init before Ready). + _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) + finally: + if system_config_tmp is not None: + try: + os.unlink(system_config_tmp.name) + except OSError: + pass + + if rc != 0: + low = (stderr or '').lower() + # Idempotent prepare: if the nodepool already exists (re-running + # --run_stage=prepare,run), reuse it instead of failing. 
+ if ( + 'already exists' in low + or 'alreadyexists' in low + or 'code=409' in low + ): + logging.info( + '[swap_encryption] Benchmark nodepool already exists —' + ' reusing (idempotent prepare)' + ) + return + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create benchmark nodepool' + f' (rc={rc}): {stderr}' + ) + logging.info('[swap_encryption] Benchmark nodepool ready') + + def _WaitForNode(self, timeout: int = 900) -> None: + """Block until a node labelled pkb_nodepool=benchmark is Ready. + + gcloud container node-pools create returns when the API accepts the + request; the node VM may take another 2-4 min to boot and pass + readiness checks. Deploying the DaemonSet before the node is Ready + leaves the pod Pending indefinitely. + """ + deadline = time.time() + timeout + logging.info( + '[swap_encryption] Waiting for benchmark node' + ' (pkb_nodepool=benchmark) to be Ready...' + ) + while time.time() < deadline: + out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'nodes', + '-l', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', + ( + r'jsonpath={range .items[*]}' + r'{.metadata.name}{"\t"}' + r'{range .status.conditions[?(@.type=="Ready")]}' + r'{.status}{"\n"}{end}{end}' + ), + ], + raise_on_failure=False, + ) + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split('\t') + if len(parts) == 2 and parts[1].strip() == 'True': + logging.info( + '[swap_encryption] Benchmark node ready: %s', + parts[0].strip(), + ) + return + logging.info( + '[swap_encryption] Benchmark node not yet Ready —' + ' retrying in 15 s...' + ) + time.sleep(15) + raise errors.Benchmarks.RunError( + f'[swap_encryption] Timed out waiting for benchmark node' + f' (pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready' + f' after {timeout}s' + ) + + # ── Dedicated swap disk helpers ─────────────────────────────────────────── + + def _AttachDisk(self) -> None: + """Create a dedicated hyperdisk and attach it to the benchmark node. 
+ + gcloud container node-pools create --additional-node-disk is not + available in all gcloud SDK versions, so we create the disk via + gcloud compute and attach it after the node is Ready. In GKE the + Kubernetes node name equals the GCE instance name. + + The disk is named pkb-swap- to avoid collisions across + concurrent PKB runs. _Delete() calls _DetachAndDeleteDisk() to clean + up. + """ + cluster = self.cluster + zone = self._cluster_zone() + if not zone: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot attach swap disk: cluster zone unknown' + ) + project = cluster.project + disk_name = f'pkb-swap-{cluster.name}' + + # Get the GCE instance name from the benchmark node's Kubernetes name. + node_out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'nodes', + '-l', + f'pkb_nodepool={_BENCHMARK_NODEPOOL}', + '-o', + 'jsonpath={.items[0].metadata.name}', + ], + raise_on_failure=False, + ) + instance_name = node_out.strip() + if rc != 0 or not instance_name: + raise errors.Benchmarks.RunError( + '[swap_encryption] Cannot find benchmark node for swap disk' + ' attach' + ) + logging.info( + '[swap_encryption] Benchmark node instance: %s', instance_name + ) + + # Create the disk. 
+ logging.info( + '[swap_encryption] Creating swap disk %s (%dGiB %s)', + disk_name, + self.swap_disk_size_gb, + self.disk_type, + ) + gcp_res = _GcpZonalResource(project, zone) + create_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'create', disk_name + ) + create_cmd.flags['type'] = self.disk_type + create_cmd.flags['size'] = f'{self.swap_disk_size_gb}GB' + create_cmd.args.append('--quiet') + if self.disk_type.startswith('hyperdisk'): + create_cmd.flags['provisioned-iops'] = self.disk_iops + create_cmd.flags['provisioned-throughput'] = ( + _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput) + ) + _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to create swap disk {disk_name}:' + f' {stderr}' + ) + + # Attach the disk to the benchmark node VM. + logging.info( + '[swap_encryption] Attaching swap disk %s to %s', + disk_name, + instance_name, + ) + attach_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'instances', 'attach-disk', instance_name + ) + attach_cmd.flags['disk'] = disk_name + attach_cmd.flags['device-name'] = 'pkb-swap' + attach_cmd.args.append('--quiet') + _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) + if rc != 0: + raise errors.Benchmarks.RunError( + f'[swap_encryption] Failed to attach swap disk to' + f' {instance_name}: {stderr}' + ) + logging.info( + '[swap_encryption] Swap disk attached: %s → %s', + disk_name, + instance_name, + ) + + def _DetachAndDeleteDisk(self) -> None: + """Detach and delete the dedicated swap disk created by _AttachDisk.""" + zone = self._cluster_zone() + cluster = self.cluster + if not zone or not getattr(cluster, 'project', None): + return + disk_name = f'pkb-swap-{cluster.name}' + self._DeleteDiskByName(disk_name, cluster.project, zone) + + def _DeleteDiskByName( + self, disk_name: str, project: str, zone: str + ) -> bool: + """Detach (if attached) and delete a GCE 
disk, robustly, with retries. + + Finds the attached instance from the disk's own `users` field rather + than kubectl — kubectl is often unavailable during teardown (cluster + being deleted), which previously left the disk attached and + undeletable. Returns True if the disk is gone. + """ + for attempt in range(1, 5): + gcp_res = _GcpZonalResource(project, zone) + describe_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'describe', disk_name + ) + describe_cmd.flags['format'] = 'value(users)' + users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) + if rc != 0: + logging.info( + '[swap_encryption] Swap disk %s not present —' + ' nothing to delete', + disk_name, + ) + return True # Already gone. + user = users.strip() + if user: + inst = user.split('/')[-1] + logging.info( + '[swap_encryption] Detaching swap disk %s from %s', + disk_name, + inst, + ) + detach_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'instances', 'detach-disk', inst + ) + detach_cmd.flags['disk'] = disk_name + detach_cmd.args.append('--quiet') + detach_cmd.Issue(timeout=120, raise_on_failure=False) + delete_cmd = gcp_util.GcloudCommand( + gcp_res, 'compute', 'disks', 'delete', disk_name + ) + delete_cmd.args.append('--quiet') + _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) + if drc == 0: + logging.info( + '[swap_encryption] Swap disk deleted: %s', disk_name + ) + return True + logging.warning( + '[swap_encryption] Swap disk delete attempt %d/4 failed' + ' (%s); retrying in 10 s', + attempt, + derr.strip()[:160], + ) + time.sleep(10) + logging.error( + '[swap_encryption] Could NOT delete swap disk %s after retries' + ' — delete it manually:\n' + ' gcloud compute disks delete %s --zone %s --quiet', + disk_name, + disk_name, + zone, + ) + return False + + def _DeleteNodePool(self) -> None: + """Delete the benchmark nodepool.""" + cmd = self.cluster._GcloudCommand( + 'container', + 'node-pools', + 'delete', + _BENCHMARK_NODEPOOL, + 
'--cluster', + self.cluster.name, + ) + cmd.args.append('--quiet') + logging.info( + '[swap_encryption] Deleting benchmark nodepool: %s', + _BENCHMARK_NODEPOOL, + ) + _, stderr, rc = cmd.Issue(timeout=600, raise_on_failure=False) + if rc != 0: + logging.warning( + '[swap_encryption] Could not delete benchmark nodepool' + ' (rc=%d): %s', + rc, + stderr, + ) + else: + logging.info('[swap_encryption] Benchmark nodepool deleted') + + def DeleteDefaultPool(self) -> None: + """Delete the dummy e2-medium default nodepool. + + Called from Prepare() AFTER the DaemonSet pod is Running. The default + pool (e2-medium) was only needed to satisfy GKE's requirement that a + cluster must have at least one nodepool at creation time. Removing it + stops its cost immediately. + + Deleting the default pool BEFORE the DaemonSet pod is Running can + trigger a brief API-server I/O timeout (control plane busy with two + concurrent nodepool ops). Calling this method from Prepare() after + daemonset.WaitForPod() ensures the cluster is fully stable. 
+ """ + cmd = self.cluster._GcloudCommand( + 'container', + 'node-pools', + 'delete', + _DEFAULT_NODEPOOL, + '--cluster', + self.cluster.name, + ) + cmd.args.append('--quiet') + logging.info( + '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL + ) + _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) + if rc != 0: + logging.warning( + '[swap_encryption] Could not delete default nodepool' + ' (rc=%d): %s', + rc, + stderr, + ) + else: + logging.info('[swap_encryption] Default nodepool deleted') + + # ── Internal helpers ────────────────────────────────────────────────────── + + def _cluster_zone(self) -> str: + """Return the first zone (or region) from the cluster object.""" + cluster = self.cluster + if getattr(cluster, 'zones', None): + return cluster.zones[0] + if getattr(cluster, 'region', None): + return cluster.region + return '' From fb026f8dd14ed384a3d8bf701762e6109b80e1a4 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 17:30:19 +0530 Subject: [PATCH 10/17] refactor(swap_encryption/pr1): correct PKB structure - swap_config as NodepoolSpec field BREAKING: replaces SwapNodePool (standalone nodepool lifecycle) with the correct PKB pattern: swap configuration declared in BENCHMARK_CONFIG and applied by the existing GKE cluster creation flow. 
New files: - resources/container_service/swap_config.py - GkeSwapConfig(BaseResource): WriteLinuxConfigYaml(), ValidHyperdiskThroughput() - EksSwapConfig(BaseResource): stub for nodeadm config (deferred to PR #6780) Core framework changes: - configs/container_spec.py: add SwapConfigSpec(BaseSpec) + _SwapConfigDecoder + swap_config field on NodepoolSpec - resources/container_service/container.py: add swap_config attr to BaseNodePoolConfig - resources/container_service/container_cluster.py: propagate swap_config in _InitializeNodePool() (mirrors sandbox_config pattern) - providers/gcp/google_kubernetes_engine.py: _AddNodeParamsToCmd() reads nodepool_config.swap_config - applies --system-config-from-file, UBUNTU_CONTAINERD, --no-enable-autorepair, boot-disk-provisioned-iops/throughput Thin benchmark: - BENCHMARK_CONFIG declares benchmark nodepool with swap_config (no separate nodepool create needed - GKE cluster creation handles it) - Prepare(): deploy SwapDaemonSet + delete default-pool - Run(): verify swap_active + swap_encrypted; report samples - Cleanup(): empty (PKB auto-deletes spec.resources) Addresses Ajay reviews: - r3457826290: swap as base resource plugged into GKE cluster creation flow - r3457877984: linuxConfig.swapConfig via --system-config-from-file (GkeSwapConfig) - r3457928855: removed memory.swap.max hack - r3457964593: UBUNTU_CONTAINERD set per-nodepool in _AddNodeParamsToCmd - r3472513706: swapConfig auto-enables memorySwapBehavior=LimitedSwap - r3472549985: UBUNTU_CONTAINERD required for dm-crypt --- perfkitbenchmarker/configs/container_spec.py | 96 ++ .../swap_encryption_benchmark.py | 1123 +++-------------- .../providers/gcp/google_kubernetes_engine.py | 20 +- .../resources/container_service/container.py | 4 + .../container_service/container_cluster.py | 1 + .../container_service/swap_config.py | 259 ++++ 6 files changed, 573 insertions(+), 930 deletions(-) create mode 100644 perfkitbenchmarker/resources/container_service/swap_config.py diff 
--git a/perfkitbenchmarker/configs/container_spec.py b/perfkitbenchmarker/configs/container_spec.py index 1f808ad066..cb20ef883a 100644 --- a/perfkitbenchmarker/configs/container_spec.py +++ b/perfkitbenchmarker/configs/container_spec.py @@ -243,6 +243,7 @@ def __init__( self.vm_spec: virtual_machine_spec.BaseVmSpec self.machine_families: list[str] | None self.sandbox_config: SandboxSpec | None + self.swap_config: SwapConfigSpec | None @classmethod def _GetOptionDecoderConstructions(cls): @@ -273,6 +274,7 @@ def _GetOptionDecoderConstructions(cls): ), 'vm_spec': (spec.PerCloudConfigDecoder, {}), 'sandbox_config': (_SandboxDecoder, {'default': None}), + 'swap_config': (_SwapConfigDecoder, {'default': None}), }) return result @@ -333,6 +335,100 @@ def Decode(self, value, component_full_name, flag_values): return result +class SwapConfigSpec(spec.BaseSpec): + """Configurable swap options for a GKE/EKS nodepool. + + Declared in BENCHMARK_CONFIG under nodepools..swap_config. + Consumed by the cloud provider's _AddNodeParamsToCmd() / equivalent to + apply the cloud-specific swap configuration during nodepool creation. + + Attributes: + enabled: Whether to enable swap on the nodepool (default True). + swappiness: vm.swappiness sysctl value (0-200, default 100). + min_free_kbytes: vm.min_free_kbytes sysctl (default 200). + watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500). + lssd: True if the nodepool uses local NVMe SSDs for the swap device. + lssd_count: Number of local NVMe SSDs (GKE dedicatedLocalSsdProfile). + boot_disk_iops: Provisioned IOPS for hyperdisk-balanced (0 = not set). + boot_disk_throughput: Provisioned throughput MiB/s for hyperdisk-balanced. 
+ """ + + def __init__(self, *args, **kwargs): + self.enabled: bool = True + self.swappiness: int = 100 + self.min_free_kbytes: int = 200 + self.watermark_scale_factor: int = 500 + self.lssd: bool = False + self.lssd_count: int = 0 + self.boot_disk_iops: int = 0 + self.boot_disk_throughput: int = 0 + super().__init__(*args, **kwargs) + + @classmethod + def _GetOptionDecoderConstructions(cls): + result = super()._GetOptionDecoderConstructions() + result.update({ + 'enabled': ( + option_decoders.BooleanDecoder, + {'default': True}, + ), + 'swappiness': ( + option_decoders.IntDecoder, + {'default': 100, 'min': 0, 'max': 200}, + ), + 'min_free_kbytes': ( + option_decoders.IntDecoder, + {'default': 200, 'min': 0}, + ), + 'watermark_scale_factor': ( + option_decoders.IntDecoder, + {'default': 500, 'min': 0}, + ), + 'lssd': ( + option_decoders.BooleanDecoder, + {'default': False}, + ), + 'lssd_count': ( + option_decoders.IntDecoder, + {'default': 0, 'min': 0}, + ), + 'boot_disk_iops': ( + option_decoders.IntDecoder, + {'default': 0, 'min': 0}, + ), + 'boot_disk_throughput': ( + option_decoders.IntDecoder, + {'default': 0, 'min': 0}, + ), + }) + return result + + +class _SwapConfigDecoder(option_decoders.TypeVerifier): + """Decodes the swap_config option of a NodepoolSpec.""" + + def Decode(self, value, component_full_name, flag_values): + """Decodes the swap_config dictionary into a SwapConfigSpec. + + Args: + value: dict. Keys match SwapConfigSpec._GetOptionDecoderConstructions. + component_full_name: str. Fully qualified name of the parent component. + flag_values: flags.FlagValues. Runtime flags propagated to BaseSpec. + + Returns: + SwapConfigSpec instance. + + Raises: + errors.Config.InvalidValue upon invalid input value. 
+ """ + super().Decode(value, component_full_name, flag_values) + return SwapConfigSpec( + self._GetOptionFullName(component_full_name), + flag_values=flag_values, + **value, + ) + + class SandboxSpec(spec.BaseSpec): """Configurable options for sandboxed node pools.""" diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 7f981b1bb7..3322795eec 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -11,993 +11,260 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""swap_encryption_benchmark: verifies encrypted swap on GKE/EKS nodepools. -"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark. +Architecture: + BENCHMARK_CONFIG declares a 'benchmark' nodepool with swap_config. + GkeCluster._AddNodeParamsToCmd() reads nodepool_config.swap_config and + applies --system-config-from-file (linuxConfig.swapConfig + sysctl) + sets + UBUNTU_CONTAINERD + boot-disk-provisioned-iops/throughput automatically + during cluster creation. No separate nodepool lifecycle management needed. -Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws + Prepare() deploys a privileged SwapDaemonSet on the swap-enabled nodepool + for in-pod benchmark execution (fio / stress-ng / kernel build in later PRs). -== Architecture == + Run() verifies swap is active and dm-crypt encryption is configured, then + reports swap device metadata as PKB samples. -Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's -container_cluster abstraction, then deploys a privileged DaemonSet whose -pod has full host-device access (/dev, /sys, hostPID). 
All benchmark -phases execute inside this pod via kubectl exec, so measurements reflect -actual cluster-node behaviour including Kubernetes overhead (kubelet, -containerd cgroup hierarchy, etc.). + Cleanup() is empty — PKB auto-deletes spec.resources (SwapDaemonSet). - GKE nodes ── dm-crypt with ephemeral key (go/node:swap-encryption) - swap device: /dev/mapper/swap_encrypted (over dedicated - hyperdisk or LSSD RAID-0 /dev/md0). - Single-disk fallback: plain loop device on - /mnt/stateful_partition — dm-crypt is blocked by COS - kernel namespace restrictions from inside a pod. - - EKS nodes ── NVMe Instance Store, Nitro hardware-offloaded encryption - swap device: /dev/nvme1n1 (or auto-detected) - -== Resource pattern == - -Infrastructure lifecycle lives in two BaseResource subclasses: - - SwapNodePool (perfkitbenchmarker/resources/container_service/swap_nodepool.py) - _Create(): gcloud container node-pools create with linuxConfig.swapConfig - + sysctl via --system-config-from-file; waits for node Ready; - optionally creates and attaches a dedicated swap disk. - _Delete(): detach+delete disk; delete the nodepool. - DeleteDefaultPool(): remove the dummy e2-medium default pool after the - DaemonSet pod is Running (separate step to avoid API-server - contention during nodepool ops). - - SwapDaemonSet (perfkitbenchmarker/resources/container_service/swap_daemonset.py) - _Create(): apply Jinja2 manifest; wait for Running + /tmp/pkb_ready. - _Delete(): in-pod swapoff / dmsetup / losetup teardown; kubectl delete. - PodExec(): kubectl exec wrapper with transient-reset retry, OOM-kill - detection (rc=137), and automatic pod recovery. - -Both resources are added to spec.resources in Prepare() and are auto-deleted -by the PKB framework in Cleanup(). 
- -== Benchmark Phases == - - Phase 1 – fio Microbenchmarks (this PR) - Run fio directly on the swap block device (swapoff first) to measure - the hardware + encryption ceiling: random IOPS (4K), sequential - bandwidth (1M), and completion latency (iodepth=1). - - Phase 2a – CPU Overhead (PR2/PR4) - Phase 2b – I/O Interference (PR4) - Phase 3a – Redis Latency (PR5) - Phase 3b – Kernel Build (PR5) - Phase 3c – OpenSearch (PR5) +Subsequent PRs add phases: + PR3: fio microbenchmarks on raw swap device (Tier 1) + PR4: stress-ng CPU overhead + I/O interference (Tier 2) + PR5: kernel build under cgroup memory constraint (Phase 3b) """ -import json import logging -import textwrap -import time from typing import Any from absl import flags -from perfkitbenchmarker import benchmark_spec as bm_spec_lib +from perfkitbenchmarker import benchmark_spec from perfkitbenchmarker import configs -from perfkitbenchmarker import errors from perfkitbenchmarker import sample -from perfkitbenchmarker.resources.container_service import kubectl -from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod -from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod +from perfkitbenchmarker.resources.container_service import swap_daemonset FLAGS = flags.FLAGS -_BenchmarkSpec = bm_spec_lib.BenchmarkSpec - -# --------------------------------------------------------------------------- -# Benchmark identity -# --------------------------------------------------------------------------- - BENCHMARK_NAME = 'swap_encryption' - - BENCHMARK_CONFIG = """ swap_encryption: description: > - GKE vs. EKS swap encryption and LSSD performance comparison. 
- Two-step nodepool setup: PKB provisions a minimal cluster with a cheap - default nodepool (Step 1), then Prepare() adds the real benchmark - nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a - node-level startup script that configures dm-crypt swap before any pod - is scheduled, then removes the default nodepool (Step 2). All benchmark - phases run inside a privileged DaemonSet pinned to the benchmark nodepool. - flags: {} + Verify dm-crypt encrypted swap on GKE/EKS. Subsequent PRs add fio, + stress-ng, and kernel build phases. container_cluster: + cloud: GCP type: Kubernetes vm_count: 1 vm_spec: GCP: - # Cheap placeholder — the benchmark nodepool is created in Prepare(). machine_type: e2-medium - boot_disk_size: 20 - AWS: - # Cheap placeholder — the benchmark nodegroup is added in Prepare(). - machine_type: t3.medium - boot_disk_size: 20 + zone: us-central1-a + nodepools: + benchmark: + vm_count: 1 + vm_spec: + GCP: + machine_type: n4-highmem-32 + boot_disk_type: hyperdisk-balanced + boot_disk_size: 500 + zone: us-central1-a + swap_config: + enabled: true + swappiness: 100 + min_free_kbytes: 200 + watermark_scale_factor: 500 + boot_disk_iops: 160000 + boot_disk_throughput: 2400 """ - -_DAEMONSET_IMAGE = flags.DEFINE_string( - 'swap_encryption_daemonset_image', - 'ubuntu:22.04', - 'Container image used for the privileged benchmark DaemonSet pod.', -) - - -_NODEPOOL = flags.DEFINE_string( - 'swap_encryption_nodepool', - 'benchmark', - 'Name of the node pool to deploy the benchmark DaemonSet on.', -) - - -_INSTANCE_SIZE_LABEL = flags.DEFINE_string( - 'swap_encryption_instance_size_label', - '', - 'Human-readable label for the current instance size being tested, e.g. ' - '"n4-highmem-32" or "i4i.4xlarge". Stored in sample metadata so that ' - 'results from multiple PKB runs across different instance sizes can be ' - 'collated and compared. 
Defaults to the value reported by the cloud ' - 'metadata endpoint inside the pod.', +_MACHINE_TYPE = flags.DEFINE_string( + 'swap_encryption_machine_type', + None, + 'Override machine type for the benchmark nodepool.', ) - - -_COLLECT_COST = flags.DEFINE_boolean( - 'swap_encryption_collect_cost', - False, - 'When True, emit a cost_estimate_usd sample using on-demand pricing ' - 'for the instance type detected at runtime.', +_DISK_TYPE = flags.DEFINE_string( + 'swap_encryption_disk_type', + None, + 'Override disk type for the benchmark nodepool.', ) +_BenchmarkSpec = benchmark_spec.BenchmarkSpec +_BENCHMARK_NODEPOOL = 'benchmark' +_DEFAULT_POOL = 'default-pool' -_FAIL_ON_DEGRADED = flags.DEFINE_boolean( - 'swap_encryption_fail_on_degraded', - True, - 'When True (default), raise an error at the end of Run() if the run was ' - 'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and ' - 'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng ' - 'swap-pressure phase was OOM-killed before completing. This prevents PKB ' - 'from reporting SUCCEEDED for a run whose post-eviction phases produced ' - 'empty or meaningless data. Set False to keep the legacy behaviour of ' - 'always returning whatever partial samples were collected.', -) - - -_PHASES = flags.DEFINE_list( - 'swap_encryption_phases', - ['all'], - 'Which Run() phases to execute, for fast iteration against an ' - 'already-provisioned cluster (e.g. --run_stage=run --run_uri=...). ' - 'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng ' - 'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), ' - '3b (kernel build), 3c (opensearch). Default "all" runs everything. ' - 'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. ' - 'Phases not listed are skipped and do not affect the degraded-run gate ' - '(e.g. 
skipping fio will not be reported as "Gate 1 produced no samples").', -) - - -_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string( - 'swap_encryption_benchmark_machine_type', - 'n4-highmem-32', - 'Machine type for the benchmark nodepool created in Prepare(). ' - 'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd ' - '(LSSD RAID-0). The matching swap setup is selected automatically.', -) - - -_BENCHMARK_LSSD = flags.DEFINE_boolean( - 'swap_encryption_lssd', - False, - 'Force LSSD RAID-0 swap path even when the machine type name does not ' - 'contain "lssd". Auto-detected from machine type when False.', -) - - -_LSSD_COUNT = flags.DEFINE_integer( - 'swap_encryption_lssd_count', - 1, - 'Number of local NVMe SSDs to attach as raw block devices ' - '(--local-nvme-ssd-block count=N). Must match the fixed local SSD ' - 'count for the chosen machine type: c4-standard-8-lssd=1, ' - 'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS). ' - 'Default 1 covers most single-lssd machine types.', -) - - -_NODE_IMAGE_TYPE = flags.DEFINE_string( - 'swap_encryption_node_image_type', - 'UBUNTU_CONTAINERD', - 'GKE node image type for the benchmark nodepool. ' - 'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks ' - 'down device-mapper at the kernel LSM level and cryptsetup hangs ' - 'indefinitely from any pod context (even privileged, even via nsenter ' - 'into the host mount namespace). Ubuntu GKE nodes allow cryptsetup ' - 'from privileged pods without restriction. ' - 'Use COS_CONTAINERD only when dm-crypt is disabled ' - '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead. ' - 'AL2 on EKS.', -) - - -_BOOT_DISK_TYPE = flags.DEFINE_string( - 'swap_encryption_boot_disk_type', - 'hyperdisk-balanced', - 'Disk type for the benchmark nodepool boot disk. Use hyperdisk-balanced ' - 'for production machines (n4, c3, c4 families). 
Use pd-ssd for n2/e2 ' - 'dev/test machines, which do not support hyperdisk-balanced.', -) - - -_BOOT_DISK_IOPS = flags.DEFINE_integer( - 'swap_encryption_boot_disk_iops', - 80000, - 'Provisioned IOPS for the boot disk (hyperdisk-balanced only). ' - '80 000 is the COS max-IOPS target. Ignored for pd-ssd.', -) - - -_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer( - 'swap_encryption_boot_disk_throughput', - 1200, - 'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced ' - 'only). Must be set together with iops. 1200 MB/s pairs with 80 000 ' - 'IOPS for production; use 140 (minimum) for dev/test. Ignored for ' - 'pd-ssd.', -) - - -_BOOT_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_boot_disk_size_gb', - 500, - 'Boot disk size in GiB for the benchmark nodepool. 500 GiB is ' - 'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run ' - '(see Engineer Assignments table in execution-plan.md). ' - 'For LSSD configs the boot disk is smaller; 100 GiB is fine.', -) - - -_ADD_SWAP_DISK = flags.DEFINE_boolean( - 'swap_encryption_add_swap_disk', - False, - 'Attach a dedicated second disk to the benchmark nodepool for use as ' - 'the swap device. Required for dm-crypt measurement on single-boot-disk ' - 'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper ' - 'from pod namespaces. The second disk is provisioned via ' - '--additional-node-disk using the same type/IOPS/throughput as the boot ' - 'disk flags.', -) - - -_SWAP_DISK_SIZE_GB = flags.DEFINE_integer( - 'swap_encryption_swap_disk_size_gb', - 500, - 'Size in GiB of the dedicated swap disk when ' - '--swap_encryption_add_swap_disk is True. 
Must satisfy the ' - 'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.', -) - -_ENABLE_DMCRYPT = flags.DEFINE_boolean( - 'swap_encryption_enable_dmcrypt', - True, - 'When True (default), wrap the swap device in dm-crypt plain mode ' - '(aes-xts-plain64, ephemeral random key) matching GKE\'s ' - 'go/node:swap-encryption implementation. Set False to measure plain ' - '(unencrypted) swap overhead as a baseline.', -) - - -_SWAP_DEVICE = flags.DEFINE_string( - 'swap_encryption_device', - '', - 'Explicit block device path to use as the swap device, e.g. ' - '/dev/nvme1n1 or /dev/mapper/swap_encrypted. When empty (default), ' - 'the device is auto-detected from /proc/swaps inside the benchmark pod.', -) -_SWAP_TYPE = flags.DEFINE_string( - 'swap_encryption_swap_type', - 'hyperdisk', - 'Storage target for the swap device. One of: hyperdisk (default), ' - 'lssd, instance_store, io2.', -) +def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: + """Load and return benchmark config spec.""" + config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) + nodepool = config['container_cluster']['nodepools'][_BENCHMARK_NODEPOOL] + if _MACHINE_TYPE.value: + for cloud in nodepool['vm_spec']: + nodepool['vm_spec'][cloud]['machine_type'] = _MACHINE_TYPE.value + if _DISK_TYPE.value: + for cloud in nodepool['vm_spec']: + nodepool['vm_spec'][cloud]['boot_disk_type'] = _DISK_TYPE.value + return config -_ENABLE_ZSWAP = flags.DEFINE_boolean( - 'swap_encryption_enable_zswap', - False, - 'When True, enable zswap compressed swap cache on the benchmark node.', -) -_MIN_FREE_KBYTES = flags.DEFINE_integer( - 'swap_encryption_min_free_kbytes', - 0, - 'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. 
' - '0 (default) leaves the kernel default unchanged.', -) +def CheckPrerequisites(_) -> None: + """Verifies that benchmark setup is correct.""" -_FIO_RUNTIME_SEC = flags.DEFINE_integer( - 'swap_encryption_fio_runtime_sec', - 60, - 'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.', -) -_STRESS_VM_BYTES = flags.DEFINE_string( - 'swap_encryption_stress_vm_bytes', - '28G', - 'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor. ' - 'Should exceed available node RAM to force sustained paging.', -) +def Prepare(spec: _BenchmarkSpec) -> None: + """Deploys the privileged benchmark DaemonSet on the swap-enabled nodepool. -_STRESS_VM_BYTES_LIST = flags.DEFINE_list( - 'swap_encryption_stress_vm_bytes_list', - [], - 'Comma-separated list of --vm-bytes values to sweep in Phase 2a, ' - 'e.g. "14G,28G,56G". Overrides --swap_encryption_stress_vm_bytes.', -) + The swap-enabled 'benchmark' nodepool is already created by GKE cluster + creation (swap_config declared in BENCHMARK_CONFIG). Prepare() deploys the + privileged DaemonSet used for in-pod command execution across all phases. -_STRESS_TIMEOUT_SEC = flags.DEFINE_integer( - 'swap_encryption_stress_timeout_sec', - 300, - 'Maximum seconds to wait for the stress-ng swap-pressure phase.', -) - -# DaemonSet constants used by both SwapDaemonSet construction and the EKS path. -_DS_NAME = 'pkb-swap-benchmark' -_DS_NAMESPACE = 'default' -_DS_LABEL = 'pkb-swap-benchmark' -_BENCHMARK_NODEPOOL = 'benchmark' + After the DaemonSet pod is Running the dummy e2-medium default-pool is + deleted to stop its cost. + Args: + spec: PKB BenchmarkSpec with spec.container_cluster already created. 
+ """ + cluster = spec.container_cluster + daemonset = swap_daemonset.SwapDaemonSet(cluster=cluster) + daemonset.Create() + spec.resources.append(daemonset) + pod = daemonset.WaitForPod() + logging.info('[swap_encryption] Benchmark pod ready: %s', pod) + _delete_default_pool(cluster) -def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: - """Load and return benchmark config spec.""" - return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME) +def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: + """Verify swap is active and dm-crypt encryption is configured. + + Returns: + PKB samples: swap_active, swap_encrypted, swap_cipher, swap_total_kb. + """ + daemonset = _get_daemonset(spec) + daemonset.WaitForPod() + daemonset.oom_events.clear() + daemonset.pod_lost.clear() + + swap_dev = _detect_swap_device(daemonset) + base_meta = _build_metadata(daemonset, swap_dev) + results: list[sample.Sample] = [] + + # ── Verify swap is active ────────────────────────────────────────────────── + try: + swap_out, _ = daemonset.PodExec('cat /proc/swaps') + active = any( + l and not l.startswith('Filename') for l in swap_out.splitlines() + ) + results.append(sample.Sample('swap_active', int(active), 'bool', base_meta)) + logging.info('[swap_encryption] swap_active=%s /proc/swaps:\n%s', active, swap_out) + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] Could not read /proc/swaps: %s', e) -def Prepare(spec: _BenchmarkSpec) -> None: - """Two-step nodepool setup then DaemonSet deployment. - - Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap - e2-medium default nodepool. - - Step 2 (this function): - a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk). - EKS: label existing nodes with pkb_nodepool=benchmark. - b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel. - c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running. - d. 
GCP: re-resolve pod name in case default-pool deletion evicts the pod. - - Both resources are appended to spec.resources for auto-cleanup. - """ - cluster = spec.container_cluster - is_gcp = getattr(cluster, 'project', None) is not None - - if is_gcp: - # ── Step 2a (GCP): create benchmark nodepool + wait for node ────────── - logging.info('[swap_encryption] Step 2a: creating benchmark nodepool') - nodepool = _np_mod.SwapNodePool( - cluster=cluster, - machine_type=_BENCHMARK_MACHINE_TYPE.value, - node_image_type=_NODE_IMAGE_TYPE.value, - disk_type=_BOOT_DISK_TYPE.value, - disk_size_gb=_BOOT_DISK_SIZE_GB.value, - disk_iops=_BOOT_DISK_IOPS.value, - disk_throughput=_BOOT_DISK_THROUGHPUT.value, - lssd=_BENCHMARK_LSSD.value, - lssd_count=_LSSD_COUNT.value, - add_swap_disk=_ADD_SWAP_DISK.value, - swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value, - ) - nodepool.Create() - spec.resources.append(nodepool) - else: - # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ── - logging.info( - '[swap_encryption] EKS cluster — labelling existing nodes with' - ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.', - _BENCHMARK_NODEPOOL, - ) - kubectl.RunKubectlCommand([ - 'label', - 'nodes', - '--all', - '--overwrite', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - ]) - _ensure_io2_volume() - - # ── Step 2b: deploy DaemonSet and wait for pod ──────────────────────────── - # Deploy BEFORE deleting the default pool: deleting the default pool while - # the benchmark node is still joining causes a brief API-server I/O timeout. - # The pod being Running means the cluster is fully stable. 
- logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet') - daemonset = _ds_mod.SwapDaemonSet( - name=_DS_NAME, - namespace=_DS_NAMESPACE, - label=_DS_LABEL, - nodepool=_BENCHMARK_NODEPOOL, - image=_DAEMONSET_IMAGE.value, + # ── Verify dm-crypt encryption ───────────────────────────────────────────── + if swap_dev: + try: + dm_out, _ = daemonset.PodExec( + f'dmsetup status {swap_dev} 2>/dev/null || echo not_encrypted' + ) + encrypted = 'crypt' in dm_out.lower() + cipher = _parse_cipher(dm_out) + meta = {**base_meta, 'dmsetup_status': dm_out.strip()[:200]} + results.append(sample.Sample('swap_encrypted', int(encrypted), 'bool', meta)) + if cipher: + results.append(sample.Sample('swap_cipher', 0, cipher, meta)) + logging.info('[swap_encryption] encrypted=%s cipher=%s', encrypted, cipher) + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] dm-crypt check failed: %s', e) + + # ── Swap size ────────────────────────────────────────────────────────────── + try: + sz_out, _ = daemonset.PodExec( + "awk '/^SwapTotal/ {print $2}' /proc/meminfo" ) - daemonset.Create() - spec.resources.append(daemonset) + swap_kb = int(sz_out.strip() or '0') + results.append(sample.Sample('swap_total_kb', swap_kb, 'KB', base_meta)) logging.info( - '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name + '[swap_encryption] SwapTotal: %d KB (%.1f GiB)', + swap_kb, swap_kb / 1024 / 1024, ) + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] Could not read SwapTotal: %s', e) - # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ── - if is_gcp: - logging.info( - '[swap_encryption] Step 2c: deleting dummy default nodepool' - ) - nodepool.DeleteDefaultPool() - # The pod may be evicted and rescheduled with a new name during the - # default nodepool deletion. Re-resolve to avoid stale references. 
- logging.info( - '[swap_encryption] Step 2d: re-resolving benchmark pod after' - ' nodepool deletion' - ) - daemonset.WaitForPod() - logging.info( - '[swap_encryption] Benchmark pod (post-deletion): %s', - daemonset.pod_name, - ) - - -def Run(spec: _BenchmarkSpec) -> list[sample.Sample]: - """Execute all benchmark phases with gate logic. - - Execution is structured in three gated tiers matching the execution plan: - - Tier 1 (Gate 1) — fio microbenchmarks - Raw I/O ceiling of the swap device. Gate 1 fails if fio produces - zero samples (device not found, O_DIRECT error, etc.). - - Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference (PR4) - Requires an active swap device (Gate 1 must pass). - - Tier 3 (Gate 3) — real-world workloads (PR5) - Independent of Tier 2 results. - - If Gate 1 fails, Tiers 2 and 3 are skipped. - """ - daemonset = _get_daemonset(spec) - - pod = daemonset.WaitForPod() - if pod is None: - raise errors.Benchmarks.RunError( - '[swap_encryption] Benchmark pod never became ready.' - ) - # Reset per-run accumulators before starting phases. 
- daemonset.oom_events.clear() - daemonset.pod_lost.clear() - original_pod = pod - degraded_reasons: list[str] = [] - - swap_dev = _detect_swap_device(daemonset) - base_meta = _build_metadata(daemonset, swap_dev) - results: list[sample.Sample] = [] - t_run_start = time.time() - - logging.info('[swap_encryption] swap device: %s', swap_dev) - - # ── Phase 1: fio microbenchmarks on raw swap device ─────────────────────── - if _phase_selected('fio'): - logging.info( - '[swap_encryption] Phase 1: fio microbenchmarks on %s', swap_dev - ) - try: - phase1_samples = _run_phase1_fio(daemonset, swap_dev, base_meta) - results += phase1_samples - if not phase1_samples: - degraded_reasons.append( - 'Phase 1 (fio) produced no samples — ' - 'check fio install and swap device accessibility' - ) - logging.error('[swap_encryption] Phase 1: no samples produced') - except Exception as e: # pylint: disable=broad-except - degraded_reasons.append(f'Phase 1 fio failed: {e}') - logging.error('[swap_encryption] Phase 1 fio error: %s', e) - - # ── Cost estimate ───────────────────────────────────────────────────────── - if _COLLECT_COST.value: - elapsed = time.time() - t_run_start - results += _collect_cost_sample(daemonset, elapsed, base_meta) - - # ── Final degradation gate ──────────────────────────────────────────────── - if daemonset.pod_name and daemonset.pod_name != original_pod: - degraded_reasons.append( - f'benchmark pod was replaced during the run ({original_pod} →' - f' {daemonset.pod_name}) — it was OOM-evicted under swap' - ' pressure; phases executed after the eviction ran against a' - ' freshly-initialised pod (empty /tmp, swap re-setup) and may' - ' be invalid' - ) - if daemonset.pod_lost: - degraded_reasons.append( - 'benchmark pod(s) went NotFound during the run' - f' ({", ".join(daemonset.pod_lost)}) — the pod died (node' - ' memory-pressure eviction or container exit) and any phase' - ' running at or after that point produced invalid data' - ) - if daemonset.oom_events: - 
degraded_reasons.append( - 'OOM kill(s) (rc=137) occurred during the run on pod(s) ' - f'{", ".join(daemonset.oom_events)} — a phase exceeded memory' - ' and was killed by the OOM killer; the affected phase(s)' - ' produced no or partial data' - ) - - degraded = bool(degraded_reasons) + if daemonset.oom_events: results.append( - sample.Sample( - 'swap_encryption_run_status', - 0.0 if degraded else 1.0, - 'status', - dict( - base_meta, - degraded=degraded, - degraded_reasons='; '.join(degraded_reasons) or 'none', - num_samples=len(results) + 1, - ), - ) + sample.Sample('oom_events', len(daemonset.oom_events), 'count', base_meta) ) - - if degraded: - msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons) - logging.error(msg) - if _FAIL_ON_DEGRADED.value: - raise errors.Benchmarks.RunError(msg) - else: - logging.info( - '[swap_encryption] Run completed cleanly (%d samples)', - len(results), - ) - - return results - - -def Cleanup(spec: _BenchmarkSpec) -> None: - """Resources in spec.resources are auto-deleted by the PKB framework. - - SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove, - losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet. - SwapNodePool._Delete() detaches+deletes the swap disk (if any) then - deletes the benchmark nodepool. 
- """ + return results -# --------------------------------------------------------------------------- -# Internal helpers -# --------------------------------------------------------------------------- +def Cleanup(_: _BenchmarkSpec) -> None: + """Empty — PKB auto-deletes spec.resources (SwapDaemonSet).""" -def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet: - """Retrieve the SwapDaemonSet resource from spec.resources.""" - daemonset = next( - (r for r in spec.resources if isinstance(r, _ds_mod.SwapDaemonSet)), - None, - ) - if daemonset is None: - raise errors.Benchmarks.RunError( - '[swap_encryption] SwapDaemonSet not found in spec.resources —' - ' was Prepare() called?' - ) - return daemonset - - -def _phase_selected(token: str) -> bool: - """Return True if phase `token` should run given --swap_encryption_phases. - - 'all' (the default) selects every phase. Otherwise only the - comma-separated tokens listed in the flag run. - """ - selected = [p.strip().lower() for p in _PHASES.value if p.strip()] - return (not selected) or ('all' in selected) or (token.lower() in selected) - - -def _configure_eks_kubelet_swap(spec) -> None: - """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap. - - NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm - integration) to merge. When that lands, EKS node pools should include - a preBootstrapCommands block writing nodeadm config with - memorySwapBehavior: LimitedSwap before kubelet starts:: - - apiVersion: node.eks.aws/v1alpha1 - kind: NodeConfig - spec: - kubelet: - config: - memorySwapBehavior: LimitedSwap - failSwapOn: false - - GKE equivalent: linuxConfig.swapConfig via --system-config-from-file - (swapConfig automatically enables memorySwapBehavior=LimitedSwap), - already implemented in SwapNodePool._CreateNodePool(). 
- - See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780 - """ - logging.warning( - '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is ' - 'deferred (blocked on PR #6780 — SwapConfigSpec). ' - 'EKS nodes will use default kubelet swap settings until that PR merges.' - ) +# ── Helpers ──────────────────────────────────────────────────────────────────── -def _ensure_io2_volume() -> None: - """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2). - - Only executed when --swap_encryption_swap_type=io2. Full implementation - is deferred to PR2 (swap-capability layer). - """ - if _SWAP_TYPE.value != 'io2': - return - logging.info( - '[swap_encryption] io2 swap volume provisioning deferred to PR2' - ) +def _get_daemonset(spec: _BenchmarkSpec) -> swap_daemonset.SwapDaemonSet: + for r in spec.resources: + if isinstance(r, swap_daemonset.SwapDaemonSet): + return r + raise RuntimeError('[swap_encryption] SwapDaemonSet not found in spec.resources') -def _detect_swap_device( - daemonset: _ds_mod.SwapDaemonSet, -) -> str: - """Return the active swap device path on the cluster node.""" - if _SWAP_DEVICE.value: - return _SWAP_DEVICE.value - - # /proc/swaps is the source of truth — it lists the device ACTUALLY active. - # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping - # from a previous run on a reused node can still appear as a /dev node while - # being non-functional (fio/swapoff fail with "No such device or address"). - dm_out, _ = daemonset.PodExec( - textwrap.dedent(""" - ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null) - if [ -n "$ACTIVE" ] - then - echo "$ACTIVE" - elif test -e /dev/mapper/swap_encrypted - then - echo /dev/mapper/swap_encrypted - fi - """), - ignore_failure=True, - ) - dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else '' - if dev: - return dev - raise ValueError( - 'No active swap device found in the benchmark pod. 
' - 'Use --swap_encryption_device to specify one.' - ) +def _detect_swap_device(ds: swap_daemonset.SwapDaemonSet) -> str: + """Return the first active swap device name (e.g. 'dm-0') or ''.""" + try: + out, _ = ds.PodExec("awk 'NR>1 {print $1}' /proc/swaps") + dev = out.strip().split('\n')[0].strip() + return dev.split('/')[-1] if dev else '' + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] _detect_swap_device: %s', e) + return '' def _build_metadata( - daemonset: _ds_mod.SwapDaemonSet, swap_dev: str + ds: swap_daemonset.SwapDaemonSet, swap_dev: str ) -> dict[str, Any]: - """Collect node environment, encryption type, and config into a dict.""" - kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True) - mem_out, _ = daemonset.PodExec( - "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True - ) - swap_out, _ = daemonset.PodExec( - "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True + """Build base metadata dict for all samples.""" + meta: dict[str, Any] = {'swap_device': swap_dev or 'unknown'} + try: + kver, _ = ds.PodExec('uname -r') + meta['kernel_version'] = kver.strip() + except Exception: # pylint: disable=broad-except + pass + return meta + + +def _parse_cipher(dmsetup_status: str) -> str: + """Extract cipher name from dmsetup status output.""" + parts = dmsetup_status.split() + try: + idx = parts.index('crypt') + return parts[idx + 1] if idx + 1 < len(parts) else '' + except ValueError: + return '' + + +def _delete_default_pool(cluster) -> None: + """Delete the dummy e2-medium default-pool once the benchmark pod is Running. + + GKE requires at least one nodepool at cluster creation time; the e2-medium + default-pool satisfies that requirement. Deleting it before the DaemonSet + pod is Running can trigger a brief API-server timeout while two concurrent + nodepool operations are in progress. 
+ """ + try: + cmd = cluster._GcloudCommand( # pylint: disable=protected-access + 'container', 'node-pools', 'delete', _DEFAULT_POOL, + '--cluster', cluster.name, ) - - try: - mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1) - except ValueError: - mem_gb = 0 - try: - swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1) - except ValueError: - swap_gb = 0 - - # Encryption type — key off dm-crypt presence + swap target. - enc = 'unknown' - if '/dev/mapper/' in swap_dev: - table_out, _ = daemonset.PodExec( - f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""', - ignore_failure=True, - ) - enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other' - elif _SWAP_TYPE.value in ('instance_store', 'io2'): - enc = 'nitro_hardware_offload' - elif not _ENABLE_DMCRYPT.value: - enc = 'none' - - cloud = _detect_cloud(daemonset) - - instance_label = _INSTANCE_SIZE_LABEL.value - if not instance_label: - gcp_type_out, _ = daemonset.PodExec( - 'curl -s -m 3 --fail' - ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_type_out.strip(): - instance_label = gcp_type_out.strip().split('/')[-1] - if not instance_label: - aws_type_out, _ = daemonset.PodExec( - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', - ignore_failure=True, - ) - instance_label = aws_type_out.strip() - - return { - 'benchmark': BENCHMARK_NAME, - 'execution_mode': 'kubernetes_privileged_pod', - 'cloud': cloud, - 'instance_size': instance_label, - 'kernel_version': kernel_out.strip(), - 'host_memory_gb': mem_gb, - 'swap_device': swap_dev, - 'swap_size_gb': swap_gb, - 'swap_encryption': enc, - 'storage_target': _SWAP_TYPE.value, - 'boot_disk_type': _BOOT_DISK_TYPE.value, - 'dmcrypt_enabled': _ENABLE_DMCRYPT.value, - 'node_image_type': _NODE_IMAGE_TYPE.value, - 'boot_disk_iops_target': _BOOT_DISK_IOPS.value, 
- 'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value, - 'zswap_enabled': _ENABLE_ZSWAP.value, - 'min_free_kbytes': _MIN_FREE_KBYTES.value, - 'fio_runtime_sec': _FIO_RUNTIME_SEC.value, - 'stress_vm_bytes_requested': _STRESS_VM_BYTES.value, - 'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value, - 'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value, - 'nodepool': _NODEPOOL.value, - } - - -def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str: - """Detect whether the benchmark pod is running on GCP or AWS.""" - gcp_out, _ = daemonset.PodExec( - 'curl -s -m 2 --fail ' - 'http://metadata.google.internal/computeMetadata/v1/project/project-id' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_out.strip(): - return 'GCP' - return 'AWS' - - -def _run_phase1_fio( - daemonset: _ds_mod.SwapDaemonSet, - swap_dev: str, - base_meta: dict[str, Any], -) -> list[sample.Sample]: - """Run fio microbenchmarks on the raw swap block device (Phase 1). - - Calls swapoff before running fio so measurements reflect the raw - hardware + encryption ceiling with no swap-daemon overhead. Re-enables - swap unconditionally after all jobs complete. - - Jobs: - 4k_randread iodepth=32 → random read IOPS - 4k_randwrite iodepth=32 → random write IOPS - 1m_seqread iodepth=8 → sequential read bandwidth - 1m_seqwrite iodepth=8 → sequential write bandwidth - 4k_lat_read iodepth=1 → completion latency floor (read) - - Args: - daemonset: Active SwapDaemonSet resource. - swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted. - base_meta: Shared metadata dict from _build_metadata(). - - Returns: - List of Sample objects with IOPS, bandwidth and latency metrics. - """ - samples: list[sample.Sample] = [] - - # swapoff before fio — running fio with --direct=1 on an active swap device - # races with kernel page-reclaim on the same dm-crypt target. 
- logging.info('[swap_encryption] Phase 1: swapoff %s', swap_dev) - daemonset.PodExec( - f'swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true', - timeout=30, - ignore_failure=True, - ) - - # (name, rw_mode, block_size, iodepth) - fio_jobs = [ - ('4k_randread', 'randread', '4k', 32), - ('4k_randwrite', 'randwrite', '4k', 32), - ('1m_seqread', 'read', '1m', 8), - ('1m_seqwrite', 'write', '1m', 8), - ('4k_lat_read', 'randread', '4k', 1), - ] - - runtime = _FIO_RUNTIME_SEC.value - try: - for name, rw, bs, iodepth in fio_jobs: - cmd = ( - f'fio --name={name} --filename={swap_dev}' - f' --rw={rw} --bs={bs} --iodepth={iodepth}' - ' --ioengine=libaio --direct=1' - f' --runtime={runtime} --time_based --group_reporting' - ' --output-format=json 2>/dev/null' - ) - logging.info('[swap_encryption] Phase 1: fio job %s', name) - out, _ = daemonset.PodExec(cmd, timeout=runtime + 120) - samples += _parse_fio_json(out, name, base_meta) - finally: - # Always re-enable swap so subsequent phases can drive swap I/O. - logging.info('[swap_encryption] Phase 1: swapon %s', swap_dev) - daemonset.PodExec( - f'swapon {swap_dev} 2>/dev/null || true', - timeout=30, - ignore_failure=True, - ) - - logging.info( - '[swap_encryption] Phase 1 complete (%d samples)', len(samples) - ) - return samples - - -def _parse_fio_json( - fio_output: str, job_name: str, base_meta: dict[str, Any] -) -> list[sample.Sample]: - """Parse fio --output-format=json output into PKB Sample objects. - - Extracts per-direction (read/write) IOPS, bandwidth (MB/s) and completion - latency (mean + p50/p99/p999 percentiles). - - Args: - fio_output: Raw stdout from fio with --output-format=json. - job_name: Short identifier embedded in metric names, e.g. '4k_randread'. - base_meta: Shared metadata dict copied into each sample. - - Returns: - List of Sample objects; empty if output cannot be parsed or is zero. - """ - # fio sometimes emits kernel warnings before the JSON object. 
- json_start = fio_output.find('{') - if json_start == -1: - logging.warning( - '[swap_encryption] Phase 1: no JSON in fio output for %s', job_name - ) - return [] - - try: - data = json.loads(fio_output[json_start:]) - except json.JSONDecodeError as e: - logging.warning( - '[swap_encryption] Phase 1: fio JSON parse error (%s): %s', - job_name, - e, - ) - return [] - - jobs = data.get('jobs', []) - if not jobs: - return [] - - job = jobs[0] - samples: list[sample.Sample] = [] - meta = dict(base_meta, fio_job=job_name) - - for direction in ('read', 'write'): - d = job.get(direction, {}) - iops = float(d.get('iops', 0)) - bw_kbps = float(d.get('bw', 0)) # fio reports KiB/s - bw_mbps = bw_kbps / 1024.0 - - # Skip directions with near-zero throughput. - if iops < 1 and bw_kbps < 1: - continue - - prefix = f'phase1_fio_{job_name}_{direction}' - samples.append(sample.Sample(f'{prefix}_iops', iops, 'IOPS', meta)) - samples.append( - sample.Sample(f'{prefix}_bw_mbps', bw_mbps, 'MB/s', meta) - ) - - # Completion latency — fio reports nanoseconds; emit microseconds. 
- clat = d.get('clat_ns', d.get('lat_ns', {})) - lat_mean_ns = float(clat.get('mean', 0)) - if lat_mean_ns > 0: - samples.append( - sample.Sample( - f'{prefix}_lat_mean_us', lat_mean_ns / 1000.0, 'us', meta - ) - ) - for pct_key, label in ( - ('50.000000', 'p50'), - ('99.000000', 'p99'), - ('99.900000', 'p999'), - ): - val_ns = clat.get('percentile', {}).get(pct_key, 0) - if val_ns: - samples.append( - sample.Sample( - f'{prefix}_lat_{label}_us', - val_ns / 1000.0, - 'us', - meta, - ) - ) - - return samples - - -_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = { - # GCP (on-demand, us-central1 unless noted) - 'c4-standard-8-lssd': 0.5888, - 'c4-standard-8': 0.5008, - 'n4-highmem-32': 3.0256, - 'n2-highmem-32': 2.5216, - 'n2-standard-32': 1.5264, - 'z3-highmem-8': 2.7248, - # AWS - 'i4i.4xlarge': 1.4960, - 'i4i.2xlarge': 0.7480, - 'm6id.4xlarge': 0.9072, - 'm6i.4xlarge': 0.7680, - 'r6i.4xlarge': 1.0080, -} - - -def _collect_cost_sample( - daemonset: _ds_mod.SwapDaemonSet, - elapsed_sec: float, - base_meta: dict, -) -> list[sample.Sample]: - """Emit a cost_estimate_usd sample for the benchmark run.""" - instance_type = '' - - gcp_type_out, _ = daemonset.PodExec( - 'curl -s -m 3 --fail' - ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type' - ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""', - ignore_failure=True, - ) - if gcp_type_out.strip(): - instance_type = gcp_type_out.strip().split('/')[-1] - - if not instance_type: - aws_type_out, _ = daemonset.PodExec( - 'curl -s -m 3 --fail ' - 'http://169.254.169.254/latest/meta-data/instance-type ' - '2>/dev/null || echo ""', - ignore_failure=True, - ) - instance_type = aws_type_out.strip() - - if _INSTANCE_SIZE_LABEL.value: - instance_type = _INSTANCE_SIZE_LABEL.value - - if not instance_type and _BENCHMARK_MACHINE_TYPE.value: - instance_type = _BENCHMARK_MACHINE_TYPE.value - logging.info( - '[swap_encryption] Instance type from metadata unavailable; using' - ' 
--swap_encryption_benchmark_machine_type=%s for cost tracking', - instance_type, - ) - - price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type) - if price is None: - logging.warning( - '[swap_encryption] Unknown instance type "%s" — skipping cost' - ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost' - ' tracking.', - instance_type, - ) - return [] - - hours = elapsed_sec / 3600.0 - meta = dict( - base_meta, - instance_type=instance_type, - price_usd_per_hr=price, - benchmark_elapsed_sec=round(elapsed_sec, 1), - ) - return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)] + cmd.args.append('--quiet') + logging.info('[swap_encryption] Deleting default nodepool: %s', _DEFAULT_POOL) + _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) + if rc != 0: + logging.warning( + '[swap_encryption] Could not delete default nodepool (rc=%d): %s', + rc, stderr, + ) + else: + logging.info('[swap_encryption] Default nodepool deleted') + except Exception as e: # pylint: disable=broad-except + logging.warning('[swap_encryption] _delete_default_pool failed: %s', e) diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index f943a53ff1..52bcdc82c2 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -37,6 +37,7 @@ from perfkitbenchmarker.resources.container_service import kubectl from perfkitbenchmarker.resources.container_service import kubernetes_cluster from perfkitbenchmarker.resources.container_service import kubernetes_commands +from perfkitbenchmarker.resources.container_service import swap_config as swap_config_lib FLAGS = flags.FLAGS @@ -570,13 +571,28 @@ def _AddNodeParamsToCmd( ): cmd.args.append('--enable-fast-socket') - if FLAGS.gke_node_system_config is not None: + # Per-nodepool swap config takes precedence over the global flag. 
+ if nodepool_config.swap_config is not None: + gke_swap = swap_config_lib.GkeSwapConfig.from_spec(nodepool_config.swap_config) + cmd.flags['system-config-from-file'] = gke_swap.WriteLinuxConfigYaml() + # dm-crypt requires UBUNTU_CONTAINERD (Ajay r3472549985). + cmd.flags['image-type'] = 'UBUNTU_CONTAINERD' + # Prevent GKE from replacing the node after swap setup is complete. + cmd.args.append('--no-enable-autorepair') + sc = nodepool_config.swap_config + if sc.boot_disk_iops and not sc.lssd: + cmd.flags['boot-disk-provisioned-iops'] = sc.boot_disk_iops + cmd.flags['boot-disk-provisioned-throughput'] = ( + gke_swap.ValidHyperdiskThroughput() + ) + elif FLAGS.gke_node_system_config is not None: + # Fall back to global flag when no per-nodepool swap config is set. cmd.flags['system-config-from-file'] = FLAGS.gke_node_system_config if nodepool_config.sandbox_config is not None: cmd.flags['sandbox'] = nodepool_config.sandbox_config.ToSandboxFlag() - if self.image_type: + if self.image_type and 'image-type' not in cmd.flags: cmd.flags['image-type'] = self.image_type cmd.flags['node-labels'] = f'pkb_nodepool={nodepool_config.name}' diff --git a/perfkitbenchmarker/resources/container_service/container.py b/perfkitbenchmarker/resources/container_service/container.py index 3e05a1ec2b..b652eaab32 100644 --- a/perfkitbenchmarker/resources/container_service/container.py +++ b/perfkitbenchmarker/resources/container_service/container.py @@ -187,6 +187,10 @@ def __init__( # Defined by GceVirtualMachineConfig. Used by google_kubernetes_engine # pylint: disable=g-missing-from-attributes self.sandbox_config: container_spec_lib.SandboxSpec | None = None + # Set by container_cluster._InitializeNodePool() when NodepoolSpec + # declares swap_config. Consumed by _AddNodeParamsToCmd() in the cloud + # provider to apply swap configuration during nodepool creation. 
+ self.swap_config: container_spec_lib.SwapConfigSpec | None = None self.max_local_disks: int | None self.ssd_interface: str | None self.threads_per_core: int diff --git a/perfkitbenchmarker/resources/container_service/container_cluster.py b/perfkitbenchmarker/resources/container_service/container_cluster.py index 9458662c98..ed67ff7adb 100644 --- a/perfkitbenchmarker/resources/container_service/container_cluster.py +++ b/perfkitbenchmarker/resources/container_service/container_cluster.py @@ -116,6 +116,7 @@ def _InitializeNodePool( nodepool_spec.machine_families, ) nodepool_config.sandbox_config = nodepool_spec.sandbox_config + nodepool_config.swap_config = nodepool_spec.swap_config nodepool_config.zone = zone nodepool_config.num_nodes = nodepool_spec.vm_count if nodepool_spec.min_vm_count is None: diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py new file mode 100644 index 0000000000..8606929308 --- /dev/null +++ b/perfkitbenchmarker/resources/container_service/swap_config.py @@ -0,0 +1,259 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""GkeSwapConfig and EksSwapConfig: swap configuration as PKB BaseResource. + +These resources encapsulate cloud-specific swap configuration for GKE and EKS +nodepools. 
They are referenced via NodepoolSpec.swap_config (declared in the +benchmark BENCHMARK_CONFIG YAML) and consumed by the cloud provider's +_AddNodeParamsToCmd() during cluster/nodepool creation. + +Usage in BENCHMARK_CONFIG: + container_cluster: + nodepools: + benchmark: + vm_spec: + GCP: + machine_type: n4-highmem-32 + boot_disk_type: hyperdisk-balanced + boot_disk_size: 500 + swap_config: + enabled: true + swappiness: 100 + min_free_kbytes: 200 + watermark_scale_factor: 500 + boot_disk_iops: 160000 + boot_disk_throughput: 2400 + +GkeCluster._AddNodeParamsToCmd() creates a GkeSwapConfig from the +SwapConfigSpec and calls WriteLinuxConfigYaml() to obtain the path for +--system-config-from-file. No separate resource.Create() call is needed +for the swap config itself — it is applied as part of nodepool creation. +""" + +import logging +import os +import tempfile + +from perfkitbenchmarker import resource + +# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s. +_HYPERDISK_MAX_IOPS_PER_MBPS = 256 + + +class GkeSwapConfig(resource.BaseResource): + """GKE swap configuration for a nodepool. + + Encapsulates the linuxConfig (swapConfig + sysctl) YAML for + --system-config-from-file and optional Hyperdisk IOPS/throughput overrides. + + Consumed by GkeCluster._AddNodeParamsToCmd() when nodepool_config.swap_config + is set. _Create() and _Delete() are no-ops because the swap config is applied + as part of the gcloud node-pools create command; the nodepool itself manages + the lifecycle. + + Attributes: + swappiness: vm.swappiness sysctl value (0-200, default 100). + min_free_kbytes: vm.min_free_kbytes sysctl (default 200). + watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500). + lssd: True if the nodepool uses local NVMe SSDs for swap device. + lssd_count: Number of local NVMe SSDs (dedicatedLocalSsdProfile.diskCount). + boot_disk_iops: Provisioned IOPS for hyperdisk-balanced (0 = not set). 
+ boot_disk_throughput: Provisioned throughput MiB/s for hyperdisk-balanced. + """ + + RESOURCE_TYPE = 'GkeSwapConfig' + REQUIRED_ATTRS = [] + + def __init__( + self, + swappiness: int = 100, + min_free_kbytes: int = 200, + watermark_scale_factor: int = 500, + lssd: bool = False, + lssd_count: int = 0, + boot_disk_iops: int = 0, + boot_disk_throughput: int = 0, + ) -> None: + super().__init__() + self.swappiness = swappiness + self.min_free_kbytes = min_free_kbytes + self.watermark_scale_factor = watermark_scale_factor + self.lssd = lssd + self.lssd_count = lssd_count + self.boot_disk_iops = boot_disk_iops + self.boot_disk_throughput = boot_disk_throughput + self._yaml_path: str | None = None + + @classmethod + def from_spec(cls, swap_spec) -> 'GkeSwapConfig': + """Create a GkeSwapConfig from a SwapConfigSpec decoded from BENCHMARK_CONFIG.""" + return cls( + swappiness=swap_spec.swappiness, + min_free_kbytes=swap_spec.min_free_kbytes, + watermark_scale_factor=swap_spec.watermark_scale_factor, + lssd=swap_spec.lssd, + lssd_count=swap_spec.lssd_count, + boot_disk_iops=swap_spec.boot_disk_iops, + boot_disk_throughput=swap_spec.boot_disk_throughput, + ) + + def _Create(self) -> None: + """No-op: swap config is applied during nodepool creation.""" + + def _Delete(self) -> None: + """No-op: cleaned up when the nodepool is deleted.""" + self._CleanupYaml() + + def WriteLinuxConfigYaml(self) -> str: + """Write the GKE linuxConfig YAML to a tempfile; return the path. + + Called by GkeCluster._AddNodeParamsToCmd() to supply + --system-config-from-file. The caller is responsible for deleting the + tempfile via CleanupYaml() after the gcloud command completes. + + Per Ajay review r3472513706: + linuxConfig.swapConfig.enabled=true automatically sets + kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set + kubeletConfig explicitly. + For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE to + use local NVMe as the swap device. 
+ + Returns: + Absolute path to the written tempfile. + """ + if self.lssd and self.lssd_count > 0: + swap_block = ( + ' swapConfig:\n' + ' enabled: true\n' + ' dedicatedLocalSsdProfile:\n' + f' diskCount: {self.lssd_count}\n' + ) + else: + swap_block = ' swapConfig:\n enabled: true\n' + + yaml_content = ( + 'linuxConfig:\n' + + swap_block + + ' sysctl:\n' + + f' vm.swappiness: {self.swappiness}\n' + + f' vm.min_free_kbytes: {self.min_free_kbytes}\n' + + f' vm.watermark_scale_factor: {self.watermark_scale_factor}\n' + ) + + tmp = tempfile.NamedTemporaryFile( + mode='w', suffix='.yaml', delete=False + ) + try: + tmp.write(yaml_content) + tmp.flush() + self._yaml_path = tmp.name + finally: + tmp.close() + + logging.info( + '[swap_config] Wrote linuxConfig YAML (lssd=%s, lssd_count=%d)' + ' to %s:\n%s', + self.lssd, + self.lssd_count, + self._yaml_path, + yaml_content, + ) + return self._yaml_path + + def ValidHyperdiskThroughput(self) -> int: + """Return clamped throughput satisfying GCP Hyperdisk Balanced constraints. + + GCP Hyperdisk Balanced requires: provisioned_iops <= 256 × throughput_MiB_s. + Clamps throughput UP so a mismatched pair cannot abort nodepool creation. 
+ """ + if not self.boot_disk_iops or not self.boot_disk_throughput: + return self.boot_disk_throughput + min_tput = -(-int(self.boot_disk_iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) + if self.boot_disk_throughput < min_tput: + logging.warning( + '[swap_config] boot disk throughput %d MiB/s too low for %d IOPS;' + ' clamping to minimum %d MiB/s', + self.boot_disk_throughput, + self.boot_disk_iops, + min_tput, + ) + return min_tput + return self.boot_disk_throughput + + def CleanupYaml(self) -> None: + """Delete the linuxConfig tempfile if it was written.""" + if self._yaml_path and os.path.exists(self._yaml_path): + try: + os.unlink(self._yaml_path) + logging.info( + '[swap_config] Cleaned up YAML tempfile: %s', self._yaml_path + ) + except OSError: + pass + self._yaml_path = None + + def _CleanupYaml(self) -> None: + self.CleanupYaml() + + +class EksSwapConfig(resource.BaseResource): + """EKS swap configuration for a nodepool (stub). + + Configures kubelet LimitedSwap via nodeadm bootstrap configuration. + Full implementation deferred to PR #6780. + + Attributes: + memory_swap_behavior: kubelet memorySwapBehavior value ('LimitedSwap'). + fail_swap_on: kubelet failSwapOn setting (False to allow swap on EKS). + """ + + RESOURCE_TYPE = 'EksSwapConfig' + REQUIRED_ATTRS = [] + + def __init__( + self, + memory_swap_behavior: str = 'LimitedSwap', + fail_swap_on: bool = False, + ) -> None: + super().__init__() + self.memory_swap_behavior = memory_swap_behavior + self.fail_swap_on = fail_swap_on + + @classmethod + def from_spec(cls, swap_spec) -> 'EksSwapConfig': + """Create an EksSwapConfig from a SwapConfigSpec.""" + return cls() + + def _Create(self) -> None: + """Stub: EKS kubelet LimitedSwap config via nodeadm (deferred to PR #6780).""" + logging.warning( + '[swap_config] EksSwapConfig._Create() is a stub. ' + 'EKS kubelet LimitedSwap config via nodeadm not yet implemented ' + '(deferred to PR #6780). Swap will not be enabled on EKS nodes.' 
+ ) + + def _Delete(self) -> None: + """No-op.""" + + def GetNodeadmConfig(self) -> str: + """Return nodeadm bootstrap YAML for kubelet swap settings.""" + return ( + 'apiVersion: node.eks.aws/v1alpha1\n' + 'kind: NodeConfig\n' + 'spec:\n' + ' kubelet:\n' + ' config:\n' + f' memorySwapBehavior: {self.memory_swap_behavior}\n' + f' failSwapOn: {str(self.fail_swap_on).lower()}\n' + ) From a3060e9b52a4bf1c2b55b41886dd8dc826f709c3 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 17:36:07 +0530 Subject: [PATCH 11/17] correct PKB structure --- .../container_service/swap_nodepool.py | 575 ------------------ 1 file changed, 575 deletions(-) delete mode 100644 perfkitbenchmarker/resources/container_service/swap_nodepool.py diff --git a/perfkitbenchmarker/resources/container_service/swap_nodepool.py b/perfkitbenchmarker/resources/container_service/swap_nodepool.py deleted file mode 100644 index 44e5cb396a..0000000000 --- a/perfkitbenchmarker/resources/container_service/swap_nodepool.py +++ /dev/null @@ -1,575 +0,0 @@ -# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""SwapNodePool: PKB BaseResource for the swap-encryption benchmark nodepool. - -Manages the lifecycle of: - - GKE nodepool — gcloud container node-pools create with UBUNTU_CONTAINERD, - linuxConfig.swapConfig + sysctl via --system-config-from-file. 
- For LSSD machines: --local-nvme-ssd-block and - dedicatedLocalSsdProfile in the swap YAML. - For hyperdisk configs: boot-disk-provisioned-iops/throughput. - - Swap disk — Optional dedicated hyperdisk attached post-nodepool creation - (for dm-crypt measurement on machines where the boot disk - cannot be used as a swap device directly). - - Default pool — DeleteDefaultPool() removes the dummy e2-medium pool created - at cluster time once the DaemonSet pod is Running. - -Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern -(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses. -""" - -import logging -import os -import tempfile -import time - -from perfkitbenchmarker import errors -from perfkitbenchmarker import resource -from perfkitbenchmarker.providers.gcp import util as gcp_util -from perfkitbenchmarker.resources.container_service import kubectl - -# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s. -_HYPERDISK_MAX_IOPS_PER_MBPS = 256 - -_BENCHMARK_NODEPOOL = 'benchmark' -_DEFAULT_NODEPOOL = 'default-pool' - - -def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int: - """Return a throughput (MiB/s) satisfying GCP Hyperdisk Balanced constraints. - - Clamps throughput UP to the minimum required by the requested IOPS so that - a mismatched flag pair cannot abort nodepool / disk creation with: - "Requested provisioned throughput is too low for the provisioned iops". - """ - min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS) # ceil(iops/256) - if throughput < min_tput: - logging.warning( - '[swap_encryption] boot/swap disk throughput %d MiB/s is too low' - ' for %d IOPS; clamping to minimum %d MiB/s', - throughput, - iops, - min_tput, - ) - return min_tput - return throughput - - -class _GcpZonalResource: - """Minimal resource shim for gcp_util.GcloudCommand on compute operations. - - gcp_util.GcloudCommand auto-injects --project and --zone from the resource - object. 
GkeCluster._GcloudCommand() switches --zone → --region for - multi-zone clusters, which is wrong for gcloud compute commands (--region - creates regional resources). This shim pins a single zone so all - gcloud compute calls target the correct AZ. - """ - - def __init__(self, project: str, zone: str) -> None: - self.project = project - self.zone = zone - - -class SwapNodePool(resource.BaseResource): - """PKB resource for the swap-encryption benchmark GKE nodepool and disk. - - _Create() runs the full setup sequence: - 1. gcloud container node-pools create with linuxConfig.swapConfig. - 2. Wait for the node to become Ready. - 3. (Optional) Create and attach a dedicated swap disk. - - _Delete() tears down in reverse: - 1. (Optional) Detach and delete the swap disk. - 2. gcloud container node-pools delete. - - DeleteDefaultPool() is a separate step called from Prepare() AFTER the - DaemonSet pod is Running, since deleting the default pool while the - benchmark node is still joining can trigger a brief API-server timeout. - - Attributes: - cluster: PKB GkeCluster (or subclass) object; provides _GcloudCommand, - name, project, zones/region. - machine_type: GKE machine type (e.g. 'n4-highmem-32'). - node_image_type: GKE image type (e.g. 'UBUNTU_CONTAINERD'). - disk_type: Boot disk type (e.g. 'hyperdisk-balanced' or 'pd-ssd'). - disk_size_gb: Boot disk size in GiB (500 for hyperdisk, 100 for LSSD). - disk_iops: Provisioned IOPS (hyperdisk-balanced only). - disk_throughput: Provisioned throughput MiB/s (hyperdisk-balanced only). - lssd: True if the machine type uses local NVMe SSDs. Auto-detected from - machine_type name when False. - lssd_count: Number of local NVMe SSDs (--local-nvme-ssd-block count=N). - add_swap_disk: True to create+attach a dedicated second disk for swap. - swap_disk_size_gb: Size of the dedicated swap disk in GiB. 
- """ - - RESOURCE_TYPE = 'SwapNodePool' - REQUIRED_ATTRS = [] - - def __init__( - self, - cluster, - machine_type: str, - node_image_type: str, - disk_type: str, - disk_size_gb: int, - disk_iops: int, - disk_throughput: int, - lssd: bool, - lssd_count: int, - add_swap_disk: bool, - swap_disk_size_gb: int, - ) -> None: - super().__init__() - self.cluster = cluster - self.machine_type = machine_type - self.node_image_type = node_image_type - self.disk_type = disk_type - self.disk_size_gb = disk_size_gb - self.disk_iops = disk_iops - self.disk_throughput = disk_throughput - # Auto-detect LSSD from machine type name; explicit flag overrides. - self.lssd = lssd or 'lssd' in machine_type.lower() - self.lssd_count = lssd_count - self.add_swap_disk = add_swap_disk - self.swap_disk_size_gb = swap_disk_size_gb - - # ── PKB lifecycle ───────────────────────────────────────────────────────── - - def _Create(self) -> None: - """Create the benchmark nodepool, wait for node, optionally attach disk.""" - self._CreateNodePool() - self._WaitForNode() - if self.add_swap_disk: - self._AttachDisk() - - def _Delete(self) -> None: - """Detach+delete the swap disk (if any) then delete the nodepool.""" - if self.add_swap_disk: - self._DetachAndDeleteDisk() - self._DeleteNodePool() - - # ── Nodepool helpers ────────────────────────────────────────────────────── - - def _CreateNodePool(self) -> None: - """gcloud container node-pools create with linuxConfig.swapConfig YAML. - - Per Ajay review comment r3472513706: - linuxConfig.swapConfig automatically enables - kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set - kubeletConfig explicitly. - For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE - to use local NVMe as the swap device. - Per Ajay review comment r3472549985: - UBUNTU_CONTAINERD is required for dm-crypt measurement. - """ - is_lssd = self.lssd - # LSSD configs use a small boot disk (OS only; swap is on local NVMe). 
- disk_size_gb = 100 if is_lssd else self.disk_size_gb - - cmd = self.cluster._GcloudCommand( - 'container', - 'node-pools', - 'create', - _BENCHMARK_NODEPOOL, - '--cluster', - self.cluster.name, - ) - cmd.flags['machine-type'] = self.machine_type - cmd.flags['image-type'] = self.node_image_type - cmd.flags['disk-type'] = self.disk_type - cmd.flags['disk-size'] = disk_size_gb - cmd.flags['num-nodes'] = 1 - cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}' - cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair'] - - # IOPS / throughput only for hyperdisk non-LSSD configs. - if self.disk_type.startswith('hyperdisk') and not is_lssd: - cmd.flags['boot-disk-provisioned-iops'] = self.disk_iops - cmd.flags['boot-disk-provisioned-throughput'] = ( - _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput) - ) - - # Expose local NVMe as raw block devices for fio/mdadm direct access. - if is_lssd: - cmd.flags['local-nvme-ssd-block'] = f'count={self.lssd_count}' - - # Build linuxConfig YAML for --system-config-from-file. 
- if is_lssd: - swap_config_block = ( - ' swapConfig:\n' - ' enabled: true\n' - ' dedicatedLocalSsdProfile:\n' - f' diskCount: {self.lssd_count}\n' - ) - else: - swap_config_block = ' swapConfig:\n enabled: true\n' - swap_config_yaml = ( - 'linuxConfig:\n' - + swap_config_block - + ' sysctl:\n' - ' vm.min_free_kbytes: 200\n' - ' vm.watermark_scale_factor: 500\n' - ' vm.swappiness: 100\n' - ) - - system_config_tmp = None - try: - system_config_tmp = tempfile.NamedTemporaryFile( - mode='w', suffix='.yaml', delete=False - ) - system_config_tmp.write(swap_config_yaml) - system_config_tmp.flush() - cmd.flags['system-config-from-file'] = system_config_tmp.name - logging.info( - '[swap_encryption] system-config-from-file: lssd=%s' - ' (written to %s):\n%s', - is_lssd, - system_config_tmp.name, - swap_config_yaml, - ) - logging.info( - '[swap_encryption] Creating benchmark nodepool: %s / %s /' - ' image=%s / disk=%dGiB / iops=%d / lssd=%s /' - ' add_swap_disk=%s', - _BENCHMARK_NODEPOOL, - self.machine_type, - self.node_image_type, - disk_size_gb, - self.disk_iops, - is_lssd, - self.add_swap_disk, - ) - # LSSD nodepools take longer to provision (NVMe init before Ready). - _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False) - finally: - if system_config_tmp is not None: - try: - os.unlink(system_config_tmp.name) - except OSError: - pass - - if rc != 0: - low = (stderr or '').lower() - # Idempotent prepare: if the nodepool already exists (re-running - # --run_stage=prepare,run), reuse it instead of failing. 
- if ( - 'already exists' in low - or 'alreadyexists' in low - or 'code=409' in low - ): - logging.info( - '[swap_encryption] Benchmark nodepool already exists —' - ' reusing (idempotent prepare)' - ) - return - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create benchmark nodepool' - f' (rc={rc}): {stderr}' - ) - logging.info('[swap_encryption] Benchmark nodepool ready') - - def _WaitForNode(self, timeout: int = 900) -> None: - """Block until a node labelled pkb_nodepool=benchmark is Ready. - - gcloud container node-pools create returns when the API accepts the - request; the node VM may take another 2-4 min to boot and pass - readiness checks. Deploying the DaemonSet before the node is Ready - leaves the pod Pending indefinitely. - """ - deadline = time.time() + timeout - logging.info( - '[swap_encryption] Waiting for benchmark node' - ' (pkb_nodepool=benchmark) to be Ready...' - ) - while time.time() < deadline: - out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'nodes', - '-l', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', - ( - r'jsonpath={range .items[*]}' - r'{.metadata.name}{"\t"}' - r'{range .status.conditions[?(@.type=="Ready")]}' - r'{.status}{"\n"}{end}{end}' - ), - ], - raise_on_failure=False, - ) - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2 and parts[1].strip() == 'True': - logging.info( - '[swap_encryption] Benchmark node ready: %s', - parts[0].strip(), - ) - return - logging.info( - '[swap_encryption] Benchmark node not yet Ready —' - ' retrying in 15 s...' - ) - time.sleep(15) - raise errors.Benchmarks.RunError( - f'[swap_encryption] Timed out waiting for benchmark node' - f' (pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready' - f' after {timeout}s' - ) - - # ── Dedicated swap disk helpers ─────────────────────────────────────────── - - def _AttachDisk(self) -> None: - """Create a dedicated hyperdisk and attach it to the benchmark node. 
- - gcloud container node-pools create --additional-node-disk is not - available in all gcloud SDK versions, so we create the disk via - gcloud compute and attach it after the node is Ready. In GKE the - Kubernetes node name equals the GCE instance name. - - The disk is named pkb-swap- to avoid collisions across - concurrent PKB runs. _Delete() calls _DetachAndDeleteDisk() to clean - up. - """ - cluster = self.cluster - zone = self._cluster_zone() - if not zone: - raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot attach swap disk: cluster zone unknown' - ) - project = cluster.project - disk_name = f'pkb-swap-{cluster.name}' - - # Get the GCE instance name from the benchmark node's Kubernetes name. - node_out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'nodes', - '-l', - f'pkb_nodepool={_BENCHMARK_NODEPOOL}', - '-o', - 'jsonpath={.items[0].metadata.name}', - ], - raise_on_failure=False, - ) - instance_name = node_out.strip() - if rc != 0 or not instance_name: - raise errors.Benchmarks.RunError( - '[swap_encryption] Cannot find benchmark node for swap disk' - ' attach' - ) - logging.info( - '[swap_encryption] Benchmark node instance: %s', instance_name - ) - - # Create the disk. 
- logging.info( - '[swap_encryption] Creating swap disk %s (%dGiB %s)', - disk_name, - self.swap_disk_size_gb, - self.disk_type, - ) - gcp_res = _GcpZonalResource(project, zone) - create_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'create', disk_name - ) - create_cmd.flags['type'] = self.disk_type - create_cmd.flags['size'] = f'{self.swap_disk_size_gb}GB' - create_cmd.args.append('--quiet') - if self.disk_type.startswith('hyperdisk'): - create_cmd.flags['provisioned-iops'] = self.disk_iops - create_cmd.flags['provisioned-throughput'] = ( - _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput) - ) - _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to create swap disk {disk_name}:' - f' {stderr}' - ) - - # Attach the disk to the benchmark node VM. - logging.info( - '[swap_encryption] Attaching swap disk %s to %s', - disk_name, - instance_name, - ) - attach_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'instances', 'attach-disk', instance_name - ) - attach_cmd.flags['disk'] = disk_name - attach_cmd.flags['device-name'] = 'pkb-swap' - attach_cmd.args.append('--quiet') - _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False) - if rc != 0: - raise errors.Benchmarks.RunError( - f'[swap_encryption] Failed to attach swap disk to' - f' {instance_name}: {stderr}' - ) - logging.info( - '[swap_encryption] Swap disk attached: %s → %s', - disk_name, - instance_name, - ) - - def _DetachAndDeleteDisk(self) -> None: - """Detach and delete the dedicated swap disk created by _AttachDisk.""" - zone = self._cluster_zone() - cluster = self.cluster - if not zone or not getattr(cluster, 'project', None): - return - disk_name = f'pkb-swap-{cluster.name}' - self._DeleteDiskByName(disk_name, cluster.project, zone) - - def _DeleteDiskByName( - self, disk_name: str, project: str, zone: str - ) -> bool: - """Detach (if attached) and delete a GCE 
disk, robustly, with retries. - - Finds the attached instance from the disk's own `users` field rather - than kubectl — kubectl is often unavailable during teardown (cluster - being deleted), which previously left the disk attached and - undeletable. Returns True if the disk is gone. - """ - for attempt in range(1, 5): - gcp_res = _GcpZonalResource(project, zone) - describe_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'describe', disk_name - ) - describe_cmd.flags['format'] = 'value(users)' - users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False) - if rc != 0: - logging.info( - '[swap_encryption] Swap disk %s not present —' - ' nothing to delete', - disk_name, - ) - return True # Already gone. - user = users.strip() - if user: - inst = user.split('/')[-1] - logging.info( - '[swap_encryption] Detaching swap disk %s from %s', - disk_name, - inst, - ) - detach_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'instances', 'detach-disk', inst - ) - detach_cmd.flags['disk'] = disk_name - detach_cmd.args.append('--quiet') - detach_cmd.Issue(timeout=120, raise_on_failure=False) - delete_cmd = gcp_util.GcloudCommand( - gcp_res, 'compute', 'disks', 'delete', disk_name - ) - delete_cmd.args.append('--quiet') - _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False) - if drc == 0: - logging.info( - '[swap_encryption] Swap disk deleted: %s', disk_name - ) - return True - logging.warning( - '[swap_encryption] Swap disk delete attempt %d/4 failed' - ' (%s); retrying in 10 s', - attempt, - derr.strip()[:160], - ) - time.sleep(10) - logging.error( - '[swap_encryption] Could NOT delete swap disk %s after retries' - ' — delete it manually:\n' - ' gcloud compute disks delete %s --zone %s --quiet', - disk_name, - disk_name, - zone, - ) - return False - - def _DeleteNodePool(self) -> None: - """Delete the benchmark nodepool.""" - cmd = self.cluster._GcloudCommand( - 'container', - 'node-pools', - 'delete', - _BENCHMARK_NODEPOOL, - 
'--cluster', - self.cluster.name, - ) - cmd.args.append('--quiet') - logging.info( - '[swap_encryption] Deleting benchmark nodepool: %s', - _BENCHMARK_NODEPOOL, - ) - _, stderr, rc = cmd.Issue(timeout=600, raise_on_failure=False) - if rc != 0: - logging.warning( - '[swap_encryption] Could not delete benchmark nodepool' - ' (rc=%d): %s', - rc, - stderr, - ) - else: - logging.info('[swap_encryption] Benchmark nodepool deleted') - - def DeleteDefaultPool(self) -> None: - """Delete the dummy e2-medium default nodepool. - - Called from Prepare() AFTER the DaemonSet pod is Running. The default - pool (e2-medium) was only needed to satisfy GKE's requirement that a - cluster must have at least one nodepool at creation time. Removing it - stops its cost immediately. - - Deleting the default pool BEFORE the DaemonSet pod is Running can - trigger a brief API-server I/O timeout (control plane busy with two - concurrent nodepool ops). Calling this method from Prepare() after - daemonset.WaitForPod() ensures the cluster is fully stable. 
- """ - cmd = self.cluster._GcloudCommand( - 'container', - 'node-pools', - 'delete', - _DEFAULT_NODEPOOL, - '--cluster', - self.cluster.name, - ) - cmd.args.append('--quiet') - logging.info( - '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL - ) - _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False) - if rc != 0: - logging.warning( - '[swap_encryption] Could not delete default nodepool' - ' (rc=%d): %s', - rc, - stderr, - ) - else: - logging.info('[swap_encryption] Default nodepool deleted') - - # ── Internal helpers ────────────────────────────────────────────────────── - - def _cluster_zone(self) -> str: - """Return the first zone (or region) from the cluster object.""" - cluster = self.cluster - if getattr(cluster, 'zones', None): - return cluster.zones[0] - if getattr(cluster, 'region', None): - return cluster.region - return '' From 1befef2127b4308e8be1f5767ecc4a6ba06e6741 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 17:53:25 +0530 Subject: [PATCH 12/17] correct PKB structure --- .../providers/gcp/google_kubernetes_engine.py | 16 +- .../container_service/swap_daemonset.py | 1012 ++++++++--------- 2 files changed, 517 insertions(+), 511 deletions(-) diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py index 52bcdc82c2..86d8d7142a 100644 --- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py +++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py @@ -429,11 +429,15 @@ def _CreateNodePools(self): cmd = self._GcloudCommand( 'container', 'node-pools', 'create', name, '--cluster', self.name ) - self._AddNodeParamsToCmd( - nodepool, - cmd, - ) - self._IssueResourceCreationCommand(cmd) + self._AddNodeParamsToCmd(nodepool, cmd) + # If swap_config wrote a linuxConfig tempfile, clean it up after Issue(). 
+ swap_cfg = getattr(nodepool, '_gke_swap_config', None) + try: + self._IssueResourceCreationCommand(cmd) + finally: + if swap_cfg is not None: + swap_cfg.CleanupYaml() + nodepool._gke_swap_config = None # pylint: disable=protected-access self._CreateCustomComputeClass(nodepool) def _CreateCustomComputeClass( @@ -575,6 +579,8 @@ def _AddNodeParamsToCmd( if nodepool_config.swap_config is not None: gke_swap = swap_config_lib.GkeSwapConfig.from_spec(nodepool_config.swap_config) cmd.flags['system-config-from-file'] = gke_swap.WriteLinuxConfigYaml() + # Store on nodepool so _CreateNodePools() can clean up the tempfile. + nodepool_config._gke_swap_config = gke_swap # pylint: disable=protected-access # dm-crypt requires UBUNTU_CONTAINERD (Ajay r3472549985). cmd.flags['image-type'] = 'UBUNTU_CONTAINERD' # Prevent GKE from replacing the node after swap setup is complete. diff --git a/perfkitbenchmarker/resources/container_service/swap_daemonset.py b/perfkitbenchmarker/resources/container_service/swap_daemonset.py index ab23c8d6aa..48e3b9c890 100644 --- a/perfkitbenchmarker/resources/container_service/swap_daemonset.py +++ b/perfkitbenchmarker/resources/container_service/swap_daemonset.py @@ -17,12 +17,12 @@ swap_encryption benchmark: _Create() — apply the Jinja2 manifest via kubernetes_commands.ApplyManifest - and wait for the pod to reach Running + /tmp/pkb_ready. + and wait for the pod to reach Running + /tmp/pkb_ready. _Delete() — run in-pod cleanup (swapoff, dmsetup remove, losetup teardown, - pkill fio/stress-ng) then kubectl delete daemonset. + pkill fio/stress-ng) then kubectl delete daemonset. PodExec() — kubectl exec wrapper with transient-reset retry, OOM-kill (rc=137) - detection, and automatic RecoverPod() after eviction or container - restart. + detection, and automatic RecoverPod() after eviction or container + restart. WaitForPod() — polls for Running phase + sentinel; updates self.pod_name. 
RecoverPod() — waits for DaemonSet to recreate / restart the container, checking deletionTimestamp to avoid false-positive Running state. @@ -47,563 +47,563 @@ # Errors indicating the container / pod is gone and needs full recovery. _CONTAINER_GONE_KUBECTL_ERRORS = ( - 'container not found', - 'procready not received', - 'unable to upgrade connection', - 'not found', - 'deleted state', + 'container not found', + 'procready not received', + 'unable to upgrade connection', + 'not found', + 'deleted state', ) class SwapDaemonSet(resource.BaseResource): - """PKB resource for the swap-encryption benchmark privileged DaemonSet. + """PKB resource for the swap-encryption benchmark privileged DaemonSet. - The DaemonSet runs a single privileged pod on the benchmark nodepool. - It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli), - verifies the swap device is active, then writes /tmp/pkb_ready. All - benchmark phases execute commands inside this pod via PodExec(). + The DaemonSet runs a single privileged pod on the benchmark nodepool. + It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli), + verifies the swap device is active, then writes /tmp/pkb_ready. All + benchmark phases execute commands inside this pod via PodExec(). - Attributes: + Attributes: name: DaemonSet metadata.name (e.g. 'pkb-swap-benchmark'). namespace: Kubernetes namespace (typically 'default'). label: Pod label value for app= selector. nodepool: pkb_nodepool label value pinning the DaemonSet to the - benchmark node. + benchmark node. image: Container image (e.g. 'ubuntu:22.04'). pod_name: Name of the currently active pod; updated by WaitForPod / - RecoverPod on eviction. + RecoverPod on eviction. oom_events: Pod names that triggered rc=137 OOM-kill; read by Run() - for the degradation gate. + for the degradation gate. pod_lost: Pod names that went NotFound during PodExec; read by Run() - for the degradation gate. + for the degradation gate. 
+ """ + + RESOURCE_TYPE = 'SwapDaemonSet' + REQUIRED_ATTRS = [] + + def __init__( + self, + name: str, + namespace: str, + label: str, + nodepool: str, + image: str, + ) -> None: + super().__init__() + self.name = name + self.namespace = namespace + self.label = label + self.nodepool = nodepool + self.image = image + # Active pod tracking — updated by WaitForPod / RecoverPod. + self.pod_name: Optional[str] = None + # Per-run accumulators read by Run() for the degradation gate. + self.oom_events: list[str] = [] + self.pod_lost: list[str] = [] + + # ── PKB lifecycle ───────────────────────────────────────────────────────── + + def _Create(self) -> None: + """Apply the DaemonSet manifest and wait for the pod to be ready.""" + kubernetes_commands.ApplyManifest( + 'cluster/swap_encryption_daemonset.yaml.j2', + ds_name=self.name, + ds_namespace=self.namespace, + ds_label=self.label, + benchmark_nodepool=self.nodepool, + image=self.image, + ) + logging.info('[swap_encryption] Swap-infra DaemonSet applied') + pod = self.WaitForPod() + if pod is None: + raise errors.Benchmarks.PrepareException( + '[swap_encryption] DaemonSet pod did not become ready within' + ' timeout' + ) + + def _Delete(self) -> None: + """Run in-pod teardown then delete the DaemonSet. + + Runs swapoff, dmsetup remove, losetup cleanup, and pkill inside the + pod (best-effort, ignore_failure=True) before deleting the DaemonSet. + This mirrors the original Cleanup() logic so no swap state is leaked. """ - - RESOURCE_TYPE = 'SwapDaemonSet' - REQUIRED_ATTRS = [] - - def __init__( - self, - name: str, - namespace: str, - label: str, - nodepool: str, - image: str, - ) -> None: - super().__init__() - self.name = name - self.namespace = namespace - self.label = label - self.nodepool = nodepool - self.image = image - # Active pod tracking — updated by WaitForPod / RecoverPod. - self.pod_name: Optional[str] = None - # Per-run accumulators read by Run() for the degradation gate. 
- self.oom_events: list[str] = [] - self.pod_lost: list[str] = [] - - # ── PKB lifecycle ───────────────────────────────────────────────────────── - - def _Create(self) -> None: - """Apply the DaemonSet manifest and wait for the pod to be ready.""" - kubernetes_commands.ApplyManifest( - 'cluster/swap_encryption_daemonset.yaml.j2', - ds_name=self.name, - ds_namespace=self.namespace, - ds_label=self.label, - benchmark_nodepool=self.nodepool, - image=self.image, - ) - logging.info('[swap_encryption] Swap-infra DaemonSet applied') - pod = self.WaitForPod() - if pod is None: - raise errors.Benchmarks.PrepareException( - '[swap_encryption] DaemonSet pod did not become ready within' - ' timeout' - ) - - def _Delete(self) -> None: - """Run in-pod teardown then delete the DaemonSet. - - Runs swapoff, dmsetup remove, losetup cleanup, and pkill inside the - pod (best-effort, ignore_failure=True) before deleting the DaemonSet. - This mirrors the original Cleanup() logic so no swap state is leaked. - """ - # Try to get the pod name quickly if not set. - if self.pod_name is None: - self.WaitForPod(timeout=30) - - if self.pod_name: - self.PodExec( - 'swapoff -a 2>/dev/null || true', - ignore_failure=True, - _retries=0, - ) - self.PodExec( - textwrap.dedent("""\ - swapoff /dev/mapper/swap_encrypted 2>/dev/null || true - dmsetup remove --noudevrules --noudevsync \ + # Try to get the pod name quickly if not set. 
+ if self.pod_name is None: + self.WaitForPod(timeout=30) + + if self.pod_name: + self.PodExec( + 'swapoff -a 2>/dev/null || true', + ignore_failure=True, + _retries=0, + ) + self.PodExec( + textwrap.dedent("""\ + swapoff /dev/mapper/swap_encrypted 2>/dev/null || true + dmsetup remove --noudevrules --noudevsync \ swap_encrypted 2>/dev/null || true - """), - ignore_failure=True, - _retries=0, - ) - self.PodExec( - textwrap.dedent("""\ - for backing in \ - /var/pkb_swap_backing \ - /run/pkb_swap_backing \ - /mnt/stateful_partition/pkb_swap_backing - do + """), + ignore_failure=True, + _retries=0, + ) + self.PodExec( + textwrap.dedent("""\ + for backing in \ + /var/pkb_swap_backing \ + /run/pkb_swap_backing \ + /mnt/stateful_partition/pkb_swap_backing + do losetup -j "$backing" 2>/dev/null \ - | awk -F: '{print $1}' \ - | while read dev + | awk -F: '{print $1}' \ + | while read dev do losetup -d "$dev" 2>/dev/null || true; done rm -f "$backing" - done - """), - ignore_failure=True, - _retries=0, - ) - self.PodExec( - "pkill -9 'stress-ng|fio' 2>/dev/null || true", - ignore_failure=True, - _retries=0, - ) - - kubectl.RunKubectlCommand( - [ - 'delete', - 'daemonset', - self.name, - '-n', - self.namespace, - '--ignore-not-found', - ], - raise_on_failure=False, - ) - logging.info('[swap_encryption] DaemonSet deleted') - - # ── Pod lifecycle helpers ───────────────────────────────────────────────── - - def WaitForPod(self, timeout: int = 600) -> Optional[str]: - """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists. 
- - Two-phase poll: + done + """), + ignore_failure=True, + _retries=0, + ) + self.PodExec( + "pkill -9 'stress-ng|fio' 2>/dev/null || true", + ignore_failure=True, + _retries=0, + ) + + kubectl.RunKubectlCommand( + [ + 'delete', + 'daemonset', + self.name, + '-n', + self.namespace, + '--ignore-not-found', + ], + raise_on_failure=False, + ) + logging.info('[swap_encryption] DaemonSet deleted') + + # ── Pod lifecycle helpers ───────────────────────────────────────────────── + + def WaitForPod(self, timeout: int = 600) -> Optional[str]: + """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists. + + Two-phase poll: 1. Wait for status.phase == Running. 2. kubectl exec test -f /tmp/pkb_ready. - The DaemonSet init script writes /tmp/pkb_ready only after verifying - the swap device is active (up to 150 s) and installing all measurement - tools (~1-2 min on cold APT cache). The default 600 s covers - worst-case APT latency on a freshly-booted node. + The DaemonSet init script writes /tmp/pkb_ready only after verifying + the swap device is active (up to 150 s) and installing all measurement + tools (~1-2 min on cold APT cache). The default 600 s covers + worst-case APT latency on a freshly-booted node. - Args: + Args: timeout: Maximum seconds to wait. - Returns: + Returns: Pod name on success; None on timeout. Also updates self.pod_name. - """ - deadline = time.time() + timeout - last_phase = '' - ready_pod = None - - while time.time() < deadline: - # Step 1: wait for Running phase. - if ready_pod is None: - out, _, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pods', - '-l', - f'app={self.label}', - '-n', - self.namespace, - '-o', - ( - r'jsonpath={range .items[*]}' - r'{.metadata.name}{"\t"}' - r'{.status.phase}{"\n"}{end}' - ), - ], - raise_on_failure=False, + """ + deadline = time.time() + timeout + last_phase = '' + ready_pod = None + + while time.time() < deadline: + # Step 1: wait for Running phase. 
+ if ready_pod is None: + out, _, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'pods', + '-l', + f'app={self.label}', + '-n', + self.namespace, + '-o', + ( + r'jsonpath={range .items[*]}' + r'{.metadata.name}{"\t"}' + r'{.status.phase}{"\n"}{end}' + ), + ], + raise_on_failure=False, + ) + if rc == 0 and out.strip(): + for line in out.strip().splitlines(): + parts = line.split('\t') + if len(parts) == 2: + pod_name = parts[0].strip() + phase = parts[1].strip() + if phase == 'Running': + logging.info( + '[swap_encryption] Pod %s is Running' + ' — waiting for sentinel...', + pod_name, ) - if rc == 0 and out.strip(): - for line in out.strip().splitlines(): - parts = line.split('\t') - if len(parts) == 2: - pod_name = parts[0].strip() - phase = parts[1].strip() - if phase == 'Running': - logging.info( - '[swap_encryption] Pod %s is Running' - ' — waiting for sentinel...', - pod_name, - ) - ready_pod = pod_name - break - if phase != last_phase: - logging.info( - '[swap_encryption] Pod %s phase: %s', - pod_name, - phase, - ) - last_phase = phase - if phase == 'Pending': - self._LogPodEvents(pod_name) - else: - logging.info( - '[swap_encryption] Waiting for DaemonSet pod to' - ' appear...' - ) - - # Step 2: poll for /tmp/pkb_ready sentinel. - if ready_pod is not None: - _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( - [ - 'exec', - ready_pod, - '-n', - self.namespace, - '--', - 'test', - '-f', - '/tmp/pkb_ready', - ], - raise_on_failure=False, + ready_pod = pod_name + break + if phase != last_phase: + logging.info( + '[swap_encryption] Pod %s phase: %s', + pod_name, + phase, ) - if sentinel_rc == 0: - logging.info( - '[swap_encryption] Pod %s ready (swap device active)', - ready_pod, - ) - self.pod_name = ready_pod - return ready_pod - # Container crashed (CrashLoopBackOff / exited) — reset and - # re-check pod phase on the next iteration. 
- if 'container not found' in sentinel_err or ( - 'unable to upgrade connection' in sentinel_err - ): - logging.warning( - '[swap_encryption] Pod %s: container not running' - ' (%s) — will re-check pod state', - ready_pod, - sentinel_err.strip(), - ) - ready_pod = None - last_phase = '' - else: - logging.info( - '[swap_encryption] Pod %s: still installing tools...', - ready_pod, - ) - - time.sleep(15) - - logging.warning( - '[swap_encryption] Benchmark pod not ready after %ds', timeout - ) - return None - - def _LogPodEvents(self, pod_name: str) -> None: - """Dump recent Kubernetes events for a pod to help diagnose hangs.""" - events_out, _, _ = kubectl.RunKubectlCommand( - ['describe', 'pod', pod_name, '-n', self.namespace], - raise_on_failure=False, + last_phase = phase + if phase == 'Pending': + self._LogPodEvents(pod_name) + else: + logging.info( + '[swap_encryption] Waiting for DaemonSet pod to' + ' appear...' + ) + + # Step 2: poll for /tmp/pkb_ready sentinel. + if ready_pod is not None: + _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand( + [ + 'exec', + ready_pod, + '-n', + self.namespace, + '--', + 'test', + '-f', + '/tmp/pkb_ready', + ], + raise_on_failure=False, ) - in_events = False - lines = [] - for line in events_out.splitlines(): - if line.startswith('Events:'): - in_events = True - if in_events: - lines.append(line) - if lines: - logging.info( - '[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30]) - ) + if sentinel_rc == 0: + logging.info( + '[swap_encryption] Pod %s ready (swap device active)', + ready_pod, + ) + self.pod_name = ready_pod + return ready_pod + # Container crashed (CrashLoopBackOff / exited) — reset and + # re-check pod phase on the next iteration. 
+ if 'container not found' in sentinel_err or ( + 'unable to upgrade connection' in sentinel_err + ): + logging.warning( + '[swap_encryption] Pod %s: container not running' + ' (%s) — will re-check pod state', + ready_pod, + sentinel_err.strip(), + ) + ready_pod = None + last_phase = '' else: - logging.info( - '[swap_encryption] kubectl describe output:\n%s', - events_out[-2000:] if len(events_out) > 2000 else events_out, - ) - - def _IsPodGone(self, pod: str) -> bool: - """Return True if the named pod no longer exists in the cluster.""" - try: - _, err, rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pod', - pod, - '-n', - self.namespace, - '-o', - 'jsonpath={.metadata.name}', - ], - raise_on_failure=False, - timeout=15, - ) - return rc != 0 and 'not found' in (err or '').lower() - except Exception: # pylint: disable=broad-except - return False - - def PodExec( - self, - cmd: str, - ignore_failure: bool = False, - timeout: int = 300, - _retries: int = 2, - ) -> tuple[str, str]: - """Run a shell command inside the benchmark pod via kubectl exec. 
- - Handles: + logging.info( + '[swap_encryption] Pod %s: still installing tools...', + ready_pod, + ) + + time.sleep(15) + + logging.warning( + '[swap_encryption] Benchmark pod not ready after %ds', timeout + ) + return None + + def _LogPodEvents(self, pod_name: str) -> None: + """Dump recent Kubernetes events for a pod to help diagnose hangs.""" + events_out, _, _ = kubectl.RunKubectlCommand( + ['describe', 'pod', pod_name, '-n', self.namespace], + raise_on_failure=False, + ) + in_events = False + lines = [] + for line in events_out.splitlines(): + if line.startswith('Events:'): + in_events = True + if in_events: + lines.append(line) + if lines: + logging.info( + '[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30]) + ) + else: + logging.info( + '[swap_encryption] kubectl describe output:\n%s', + events_out[-2000:] if len(events_out) > 2000 else events_out, + ) + + def _IsPodGone(self, pod: str) -> bool: + """Return True if the named pod no longer exists in the cluster.""" + try: + _, err, rc = kubectl.RunKubectlCommand( + [ + 'get', + 'pod', + pod, + '-n', + self.namespace, + '-o', + 'jsonpath={.metadata.name}', + ], + raise_on_failure=False, + timeout=15, + ) + return rc != 0 and 'not found' in (err or '').lower() + except Exception: # pylint: disable=broad-except + return False + + def PodExec( + self, + cmd: str, + ignore_failure: bool = False, + timeout: int = 300, + _retries: int = 2, + ) -> tuple[str, str]: + """Run a shell command inside the benchmark pod via kubectl exec. + + Handles: - Transient GKE websocket resets: automatic retry (up to _retries). - OOM kill (rc=137): records to self.oom_events, calls RecoverPod, - does NOT retry the OOM-triggering command itself. + does NOT retry the OOM-triggering command itself. - Container/pod gone: records to self.pod_lost, calls RecoverPod, - retries the command on the recovered pod. + retries the command on the recovered pod. - Uses self.pod_name as the active pod; RecoverPod updates it on eviction. 
+ Uses self.pod_name as the active pod; RecoverPod updates it on eviction. - Args: + Args: cmd: Shell command string passed to bash -c. ignore_failure: When True, non-zero exit codes are logged but not - raised. + raised. timeout: Seconds before PKB kills the kubectl exec process. Pass a - larger value for long-running jobs (fio, stress-ng, kernel build). + larger value for long-running jobs (fio, stress-ng, kernel build). _retries: Max automatic retries on transient websocket resets. - Returns: + Returns: Tuple of (stdout, stderr) strings. - """ - active = self.pod_name - - for attempt in range(_retries + 1): - out, err, rc = kubectl.RunKubectlCommand( - [ - 'exec', - active, - '-n', - self.namespace, - '--', - 'bash', - '-c', - cmd, - ], - raise_on_failure=False, - raise_on_timeout=False, - timeout=timeout, + """ + active = self.pod_name + + for attempt in range(_retries + 1): + out, err, rc = kubectl.RunKubectlCommand( + [ + 'exec', + active, + '-n', + self.namespace, + '--', + 'bash', + '-c', + cmd, + ], + raise_on_failure=False, + raise_on_timeout=False, + timeout=timeout, + ) + + # Retry transient GKE websocket resets. + is_transient = rc != 0 and any( + e in err for e in _TRANSIENT_KUBECTL_ERRORS + ) + if is_transient and attempt < _retries: + logging.warning( + '[swap_encryption] kubectl exec connection reset (attempt' + ' %d/%d); retrying in 10 s', + attempt + 1, + _retries + 1, + ) + time.sleep(10) + continue + + # rc=137 (SIGKILL): OOM killer terminated the container process. + # Do NOT retry — log, recover, and return so the caller can decide. + if rc == 137: + if active not in self.oom_events: + self.oom_events.append(active) + # Kubernetes takes a few seconds to update pod state after + # eviction — sleep before checking to avoid false-positive Running. 
+ logging.warning( + '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes' + ' to update pod state before recovery check' + ) + time.sleep(15) + if self._IsPodGone(active): + logging.warning( + '[swap_encryption] OOM-eviction detected (rc=137, pod' + ' gone) — recovering pod name for subsequent commands' + ) + else: + logging.warning( + '[swap_encryption] Container OOM-killed (rc=137, pod' + ' still exists) — waiting for container restart' + ) + new_pod = self.RecoverPod(active) + if new_pod != active: + logging.info( + '[swap_encryption] Pod name updated: %s → %s', + active, + new_pod, + ) + self.pod_name = new_pod + active = new_pod + break # OOM cmd is never re-run on the recovered pod. + + # Container or pod gone: record loss, try RecoverPod, retry cmd. + is_container_gone = rc != 0 and any( + e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS + ) + if is_container_gone: + if active and active not in self.pod_lost: + self.pod_lost.append(active) + logging.error( + '[swap_encryption] Benchmark pod %s is gone (%s) —' + ' recording run as degraded', + active, + (err or '').strip()[:160], + ) + if attempt < _retries: + logging.warning( + '[swap_encryption] Container gone/restarting (attempt' + ' %d/%d) — waiting for pod to recover...', + attempt + 1, + _retries + 1, + ) + new_pod = self.RecoverPod(active) + if new_pod != active: + logging.info( + '[swap_encryption] Pod name updated: %s → %s', + active, + new_pod, ) + self.pod_name = new_pod + active = new_pod + continue + break - # Retry transient GKE websocket resets. - is_transient = rc != 0 and any( - e in err for e in _TRANSIENT_KUBECTL_ERRORS - ) - if is_transient and attempt < _retries: - logging.warning( - '[swap_encryption] kubectl exec connection reset (attempt' - ' %d/%d); retrying in 10 s', - attempt + 1, - _retries + 1, - ) - time.sleep(10) - continue - - # rc=137 (SIGKILL): OOM killer terminated the container process. - # Do NOT retry — log, recover, and return so the caller can decide. 
- if rc == 137: - if active not in self.oom_events: - self.oom_events.append(active) - # Kubernetes takes a few seconds to update pod state after - # eviction — sleep before checking to avoid false-positive Running. - logging.warning( - '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes' - ' to update pod state before recovery check' - ) - time.sleep(15) - if self._IsPodGone(active): - logging.warning( - '[swap_encryption] OOM-eviction detected (rc=137, pod' - ' gone) — recovering pod name for subsequent commands' - ) - else: - logging.warning( - '[swap_encryption] Container OOM-killed (rc=137, pod' - ' still exists) — waiting for container restart' - ) - new_pod = self.RecoverPod(active) - if new_pod != active: - logging.info( - '[swap_encryption] Pod name updated: %s → %s', - active, - new_pod, - ) - self.pod_name = new_pod - active = new_pod - break # OOM cmd is never re-run on the recovered pod. - - # Container or pod gone: record loss, try RecoverPod, retry cmd. - is_container_gone = rc != 0 and any( - e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS - ) - if is_container_gone: - if active and active not in self.pod_lost: - self.pod_lost.append(active) - logging.error( - '[swap_encryption] Benchmark pod %s is gone (%s) —' - ' recording run as degraded', - active, - (err or '').strip()[:160], - ) - if attempt < _retries: - logging.warning( - '[swap_encryption] Container gone/restarting (attempt' - ' %d/%d) — waiting for pod to recover...', - attempt + 1, - _retries + 1, - ) - new_pod = self.RecoverPod(active) - if new_pod != active: - logging.info( - '[swap_encryption] Pod name updated: %s → %s', - active, - new_pod, - ) - self.pod_name = new_pod - active = new_pod - continue - break - - if rc != 0 and not ignore_failure: - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] PodExec failed (rc={rc}): {err}' - ) - return out, err + if rc != 0 and not ignore_failure: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] PodExec 
failed (rc={rc}): {err}' + ) + return out, err - def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str: - """Wait for the DaemonSet to recover after OOM kill or eviction. + def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str: + """Wait for the DaemonSet to recover after OOM kill or eviction. - Handles two scenarios: + Handles two scenarios: 1. Container OOM restart: same pod name, container restarting in place (DaemonSet restartPolicy=Always). 2. Pod eviction/deletion: pod is gone; DaemonSet creates a new pod with a DIFFERENT name. - Checks metadata.deletionTimestamp in addition to status.phase to - catch the Terminating state where phase may still read Running. + Checks metadata.deletionTimestamp in addition to status.phase to + catch the Terminating state where phase may still read Running. - Args: + Args: pod: Original pod name to monitor. timeout_sec: Maximum seconds to wait for recovery. - Returns: + Returns: The (possibly new) pod name once Running and /tmp/pkb_ready is present. - """ - deadline = time.time() + timeout_sec - logging.info( - '[swap_encryption] Waiting for pod %s to recover (up to %ds)...', - pod, - timeout_sec, + """ + deadline = time.time() + timeout_sec + logging.info( + '[swap_encryption] Waiting for pod %s to recover (up to %ds)...', + pod, + timeout_sec, + ) + + # Phase 1: find a Running pod that is NOT being terminated. + recovered_pod = pod + while time.time() < deadline: + # Query both phase and deletionTimestamp in a single call. + status_out, status_err, status_rc = kubectl.RunKubectlCommand( + [ + 'get', + 'pod', + pod, + '-n', + self.namespace, + '-o', + 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}', + ], + raise_on_failure=False, + timeout=30, + ) + fields = status_out.strip().split('|') + phase = fields[0].strip() if fields else '' + is_terminating = len(fields) > 1 and bool(fields[1].strip()) + + # Genuine Running (not being deleted) — move to Phase 2. 
+ if status_rc == 0 and phase == 'Running' and not is_terminating: + break + + # Pod gone or Terminating — look for a replacement by label. + pod_gone_or_terminating = ( + status_rc != 0 + and 'not found' in (status_out + status_err).lower() + ) or is_terminating + if pod_gone_or_terminating: + label_out, _, label_rc = kubectl.RunKubectlCommand( + [ + 'get', + 'pods', + '-n', + self.namespace, + '-l', + f'app={self.label}', + '-o', + ( + 'jsonpath={range' + ' .items[?(@.status.phase=="Running")]}' + '{.metadata.name}{"\\n"}{end}' + ), + ], + raise_on_failure=False, + timeout=30, ) - - # Phase 1: find a Running pod that is NOT being terminated. - recovered_pod = pod - while time.time() < deadline: - # Query both phase and deletionTimestamp in a single call. - status_out, status_err, status_rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pod', - pod, - '-n', - self.namespace, - '-o', - 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}', - ], - raise_on_failure=False, - timeout=30, - ) - fields = status_out.strip().split('|') - phase = fields[0].strip() if fields else '' - is_terminating = len(fields) > 1 and bool(fields[1].strip()) - - # Genuine Running (not being deleted) — move to Phase 2. - if status_rc == 0 and phase == 'Running' and not is_terminating: - break - - # Pod gone or Terminating — look for a replacement by label. 
- pod_gone_or_terminating = ( - status_rc != 0 - and 'not found' in (status_out + status_err).lower() - ) or is_terminating - if pod_gone_or_terminating: - label_out, _, label_rc = kubectl.RunKubectlCommand( - [ - 'get', - 'pods', - '-n', - self.namespace, - '-l', - f'app={self.label}', - '-o', - ( - 'jsonpath={range' - ' .items[?(@.status.phase=="Running")]}' - '{.metadata.name}{"\\n"}{end}' - ), - ], - raise_on_failure=False, - timeout=30, - ) - new_pods = [ - p.strip() - for p in label_out.strip().splitlines() - if p.strip() and p.strip() != pod - ] - if label_rc == 0 and new_pods: - recovered_pod = new_pods[0] - logging.info( - '[swap_encryption] Original pod %s gone/terminating;' - ' found replacement %s', - pod, - recovered_pod, - ) - break - - time.sleep(10) - else: - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] No Running pod found (original: {pod})' - f' within {timeout_sec}s after OOM kill / eviction' - ) - - # Phase 2: wait for init script to finish (sentinel written last). 
- while time.time() < deadline: - ready_out, _, ready_rc = kubectl.RunKubectlCommand( - [ - 'exec', - recovered_pod, - '-n', - self.namespace, - '--', - 'bash', - '-c', - 'test -f /tmp/pkb_ready && echo READY', - ], - raise_on_failure=False, - timeout=30, - ) - if ready_rc == 0 and 'READY' in ready_out: - logging.info( - '[swap_encryption] Pod %s recovered (swap device active)', - recovered_pod, - ) - self.pod_name = recovered_pod - return recovered_pod - time.sleep(15) - - raise errors.VmUtil.IssueCommandError( - f'[swap_encryption] Pod {recovered_pod} did not become ready' - f' within {timeout_sec}s after OOM kill / eviction' + new_pods = [ + p.strip() + for p in label_out.strip().splitlines() + if p.strip() and p.strip() != pod + ] + if label_rc == 0 and new_pods: + recovered_pod = new_pods[0] + logging.info( + '[swap_encryption] Original pod %s gone/terminating;' + ' found replacement %s', + pod, + recovered_pod, + ) + break + + time.sleep(10) + else: + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] No Running pod found (original: {pod})' + f' within {timeout_sec}s after OOM kill / eviction' + ) + + # Phase 2: wait for init script to finish (sentinel written last). 
+ while time.time() < deadline: + ready_out, _, ready_rc = kubectl.RunKubectlCommand( + [ + 'exec', + recovered_pod, + '-n', + self.namespace, + '--', + 'bash', + '-c', + 'test -f /tmp/pkb_ready && echo READY', + ], + raise_on_failure=False, + timeout=30, + ) + if ready_rc == 0 and 'READY' in ready_out: + logging.info( + '[swap_encryption] Pod %s recovered (swap device active)', + recovered_pod, ) + self.pod_name = recovered_pod + return recovered_pod + time.sleep(15) + + raise errors.VmUtil.IssueCommandError( + f'[swap_encryption] Pod {recovered_pod} did not become ready' + f' within {timeout_sec}s after OOM kill / eviction' + ) From d997254919f816a9481c2ca56b04f1061f98ea82 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Mon, 29 Jun 2026 18:17:42 +0530 Subject: [PATCH 13/17] refactor(swap_encryption/pr1): correct PKB structure - swap_config as NodepoolSpec field BREAKING: replaces SwapNodePool (standalone nodepool lifecycle) with the correct PKB pattern: swap configuration declared in BENCHMARK_CONFIG and applied by the existing GKE cluster creation flow. 
New files: - resources/container_service/swap_config.py - GkeSwapConfig(BaseResource): WriteLinuxConfigYaml(), ValidHyperdiskThroughput() - EksSwapConfig(BaseResource): stub for nodeadm config (deferred to PR #6780) Core framework changes: - configs/container_spec.py: add SwapConfigSpec(BaseSpec) + _SwapConfigDecoder + swap_config field on NodepoolSpec - resources/container_service/container.py: add swap_config attr to BaseNodePoolConfig - resources/container_service/container_cluster.py: propagate swap_config in _InitializeNodePool() (mirrors sandbox_config pattern) - providers/gcp/google_kubernetes_engine.py: _AddNodeParamsToCmd() reads nodepool_config.swap_config - applies --system-config-from-file, UBUNTU_CONTAINERD, --no-enable-autorepair, boot-disk-provisioned-iops/throughput Thin benchmark: - BENCHMARK_CONFIG declares benchmark nodepool with swap_config (no separate nodepool create needed - GKE cluster creation handles it) - Prepare(): deploy SwapDaemonSet + delete default-pool - Run(): verify swap_active + swap_encrypted; report samples - Cleanup(): empty (PKB auto-deletes spec.resources) Addresses Ajay reviews: - r3457826290: swap as base resource plugged into GKE cluster creation flow - r3457877984: linuxConfig.swapConfig via --system-config-from-file (GkeSwapConfig) - r3457928855: removed memory.swap.max hack - r3457964593: UBUNTU_CONTAINERD set per-nodepool in _AddNodeParamsToCmd - r3472513706: swapConfig auto-enables memorySwapBehavior=LimitedSwap - r3472549985: UBUNTU_CONTAINERD required for dm-crypt --- .../swap_encryption_benchmark.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py index 3322795eec..d5b4ec08db 100644 --- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py +++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py @@ -88,9 +88,18 @@ 'Override 
disk type for the benchmark nodepool.', ) +_DAEMONSET_IMAGE = flags.DEFINE_string( + 'swap_encryption_daemonset_image', + 'ubuntu:22.04', + 'Container image for the privileged benchmark DaemonSet.', +) + _BenchmarkSpec = benchmark_spec.BenchmarkSpec _BENCHMARK_NODEPOOL = 'benchmark' _DEFAULT_POOL = 'default-pool' +_DS_NAME = 'pkb-swap-benchmark' +_DS_NAMESPACE = 'default' +_DS_LABEL = 'pkb-swap-benchmark' def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]: @@ -124,7 +133,13 @@ def Prepare(spec: _BenchmarkSpec) -> None: spec: PKB BenchmarkSpec with spec.container_cluster already created. """ cluster = spec.container_cluster - daemonset = swap_daemonset.SwapDaemonSet(cluster=cluster) + daemonset = swap_daemonset.SwapDaemonSet( + name=_DS_NAME, + namespace=_DS_NAMESPACE, + label=_DS_LABEL, + nodepool=_BENCHMARK_NODEPOOL, + image=_DAEMONSET_IMAGE.value, + ) daemonset.Create() spec.resources.append(daemonset) pod = daemonset.WaitForPod() From a65db709104d7ca7518d002957df2f672bbdb326 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Tue, 30 Jun 2026 09:48:11 +0530 Subject: [PATCH 14/17] refactor(swap_config): add BaseSwapConfig abstract base class GkeSwapConfig and EksSwapConfig now both inherit from BaseSwapConfig(BaseResource). Common sysctl attrs (swappiness, min_free_kbytes, watermark_scale_factor) live in the base class. Cloud-specific attrs remain in each subclass. Addresses Zac review: GkeSwapConfig & EksSwapConfig should inherit from BaseSwapConfig. 
--- .../container_service/swap_config.py | 99 +++++++++++++++---- 1 file changed, 80 insertions(+), 19 deletions(-) diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py index 8606929308..ca36dbad8b 100644 --- a/perfkitbenchmarker/resources/container_service/swap_config.py +++ b/perfkitbenchmarker/resources/container_service/swap_config.py @@ -11,13 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -"""GkeSwapConfig and EksSwapConfig: swap configuration as PKB BaseResource. +"""Swap configuration as PKB BaseResource: BaseSwapConfig, GkeSwapConfig, EksSwapConfig. These resources encapsulate cloud-specific swap configuration for GKE and EKS nodepools. They are referenced via NodepoolSpec.swap_config (declared in the benchmark BENCHMARK_CONFIG YAML) and consumed by the cloud provider's _AddNodeParamsToCmd() during cluster/nodepool creation. +Class hierarchy: + BaseSwapConfig(BaseResource) — common sysctl attrs + abstract from_spec() + GkeSwapConfig(BaseSwapConfig) — linuxConfig YAML for --system-config-from-file + EksSwapConfig(BaseSwapConfig) — nodeadm kubelet config (deferred to PR #6780) + Usage in BENCHMARK_CONFIG: container_cluster: nodepools: @@ -51,16 +56,58 @@ _HYPERDISK_MAX_IOPS_PER_MBPS = 256 -class GkeSwapConfig(resource.BaseResource): +class BaseSwapConfig(resource.BaseResource): + """Abstract base class for cloud-specific nodepool swap configuration. + + Subclasses (GkeSwapConfig, EksSwapConfig) implement the cloud-specific + method for applying swap configuration during nodepool creation. + + Common sysctl attributes (vm.swappiness, vm.min_free_kbytes, + vm.watermark_scale_factor) are shared across all cloud providers. 
+ + _Create() and _Delete() are no-ops: the swap config is applied as a + parameter to nodepool creation, not as a standalone cloud resource. + """ + + RESOURCE_TYPE = 'BaseSwapConfig' + REQUIRED_ATTRS = [] + + def __init__( + self, + swappiness: int = 100, + min_free_kbytes: int = 200, + watermark_scale_factor: int = 500, + ) -> None: + super().__init__() + self.swappiness = swappiness + self.min_free_kbytes = min_free_kbytes + self.watermark_scale_factor = watermark_scale_factor + + @classmethod + def from_spec(cls, swap_spec) -> 'BaseSwapConfig': + """Create a BaseSwapConfig subclass from a SwapConfigSpec. + + Subclasses must override this to instantiate with cloud-specific attrs. + """ + raise NotImplementedError( + f'{cls.__name__}.from_spec() must be implemented by subclasses.' + ) + + def _Create(self) -> None: + """No-op: swap config is applied during nodepool creation.""" + + def _Delete(self) -> None: + """No-op: cleaned up when the nodepool is deleted.""" + + +class GkeSwapConfig(BaseSwapConfig): """GKE swap configuration for a nodepool. Encapsulates the linuxConfig (swapConfig + sysctl) YAML for --system-config-from-file and optional Hyperdisk IOPS/throughput overrides. Consumed by GkeCluster._AddNodeParamsToCmd() when nodepool_config.swap_config - is set. _Create() and _Delete() are no-ops because the swap config is applied - as part of the gcloud node-pools create command; the nodepool itself manages - the lifecycle. + is set. Attributes: swappiness: vm.swappiness sysctl value (0-200, default 100). 
@@ -85,10 +132,11 @@ def __init__( boot_disk_iops: int = 0, boot_disk_throughput: int = 0, ) -> None: - super().__init__() - self.swappiness = swappiness - self.min_free_kbytes = min_free_kbytes - self.watermark_scale_factor = watermark_scale_factor + super().__init__( + swappiness=swappiness, + min_free_kbytes=min_free_kbytes, + watermark_scale_factor=watermark_scale_factor, + ) self.lssd = lssd self.lssd_count = lssd_count self.boot_disk_iops = boot_disk_iops @@ -108,11 +156,8 @@ def from_spec(cls, swap_spec) -> 'GkeSwapConfig': boot_disk_throughput=swap_spec.boot_disk_throughput, ) - def _Create(self) -> None: - """No-op: swap config is applied during nodepool creation.""" - def _Delete(self) -> None: - """No-op: cleaned up when the nodepool is deleted.""" + """Cleans up any written YAML tempfile.""" self._CleanupYaml() def WriteLinuxConfigYaml(self) -> str: @@ -207,13 +252,16 @@ def _CleanupYaml(self) -> None: self.CleanupYaml() -class EksSwapConfig(resource.BaseResource): +class EksSwapConfig(BaseSwapConfig): """EKS swap configuration for a nodepool (stub). Configures kubelet LimitedSwap via nodeadm bootstrap configuration. Full implementation deferred to PR #6780. Attributes: + swappiness: vm.swappiness sysctl value (inherited from BaseSwapConfig). + min_free_kbytes: vm.min_free_kbytes sysctl (inherited from BaseSwapConfig). + watermark_scale_factor: vm.watermark_scale_factor (inherited from BaseSwapConfig). memory_swap_behavior: kubelet memorySwapBehavior value ('LimitedSwap'). fail_swap_on: kubelet failSwapOn setting (False to allow swap on EKS). 
""" @@ -223,17 +271,28 @@ class EksSwapConfig(resource.BaseResource): def __init__( self, + swappiness: int = 100, + min_free_kbytes: int = 200, + watermark_scale_factor: int = 500, memory_swap_behavior: str = 'LimitedSwap', fail_swap_on: bool = False, ) -> None: - super().__init__() + super().__init__( + swappiness=swappiness, + min_free_kbytes=min_free_kbytes, + watermark_scale_factor=watermark_scale_factor, + ) self.memory_swap_behavior = memory_swap_behavior self.fail_swap_on = fail_swap_on @classmethod def from_spec(cls, swap_spec) -> 'EksSwapConfig': """Create an EksSwapConfig from a SwapConfigSpec.""" - return cls() + return cls( + swappiness=swap_spec.swappiness, + min_free_kbytes=swap_spec.min_free_kbytes, + watermark_scale_factor=swap_spec.watermark_scale_factor, + ) def _Create(self) -> None: """Stub: EKS kubelet LimitedSwap config via nodeadm (deferred to PR #6780).""" @@ -243,9 +302,6 @@ def _Create(self) -> None: '(deferred to PR #6780). Swap will not be enabled on EKS nodes.' 
) - def _Delete(self) -> None: - """No-op.""" - def GetNodeadmConfig(self) -> str: """Return nodeadm bootstrap YAML for kubelet swap settings.""" return ( @@ -256,4 +312,9 @@ def GetNodeadmConfig(self) -> str: ' config:\n' f' memorySwapBehavior: {self.memory_swap_behavior}\n' f' failSwapOn: {str(self.fail_swap_on).lower()}\n' + ' containerd:\n' + ' config:\n' + f' vm.swappiness: {self.swappiness}\n' + f' vm.min_free_kbytes: {self.min_free_kbytes}\n' + f' vm.watermark_scale_factor: {self.watermark_scale_factor}\n' ) From 8e6e719369ced00d960677328f6d453b0fd4d49b Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Wed, 1 Jul 2026 11:15:44 +0530 Subject: [PATCH 15/17] test(swap_config): add unit tests for BaseSwapConfig, GkeSwapConfig, EksSwapConfig and GKE wiring --- .../swap_encryption_benchmark_test.py | 141 ++++++++++ .../gcp/google_kubernetes_engine_test.py | 156 +++++++++++ .../container_service/swap_config_test.py | 260 ++++++++++++++++++ 3 files changed, 557 insertions(+) create mode 100644 tests/linux_benchmarks/swap_encryption_benchmark_test.py create mode 100644 tests/resources/container_service/swap_config_test.py diff --git a/tests/linux_benchmarks/swap_encryption_benchmark_test.py b/tests/linux_benchmarks/swap_encryption_benchmark_test.py new file mode 100644 index 0000000000..9a29939cde --- /dev/null +++ b/tests/linux_benchmarks/swap_encryption_benchmark_test.py @@ -0,0 +1,141 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for perfkitbenchmarker.linux_benchmarks.swap_encryption_benchmark.""" + +import unittest +from unittest import mock + +from perfkitbenchmarker.linux_benchmarks import swap_encryption_benchmark +from tests import pkb_common_test_case + + +class GetConfigTest(pkb_common_test_case.PkbCommonTestCase): + """Tests that BENCHMARK_CONFIG is well-formed and loadable.""" + + def test_get_config_returns_dict(self): + config = swap_encryption_benchmark.GetConfig({}) + self.assertIsInstance(config, dict) + + def test_get_config_has_container_cluster(self): + # configs.LoadConfig returns the inner benchmark dict directly (no benchmark + # name wrapper), so top-level keys are 'container_cluster', 'description', etc. + config = swap_encryption_benchmark.GetConfig({}) + self.assertIn('container_cluster', config) + + def test_get_config_benchmark_nodepool_present(self): + config = swap_encryption_benchmark.GetConfig({}) + nodepools = config['container_cluster']['nodepools'] + self.assertIn( + swap_encryption_benchmark._BENCHMARK_NODEPOOL, + nodepools, + ) + + def test_get_config_swap_config_present_on_benchmark_nodepool(self): + config = swap_encryption_benchmark.GetConfig({}) + nodepool = config['container_cluster']['nodepools'][ + swap_encryption_benchmark._BENCHMARK_NODEPOOL + ] + self.assertIn('swap_config', nodepool) + self.assertTrue(nodepool['swap_config'].get('enabled', False)) + + +class ParseCipherTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for _parse_cipher() output parsing.""" + + def test_parse_cipher_standard_aes_xts(self): + # Typical dmsetup status line: - crypt ... 
+ status = '0 67108864 crypt aes-xts-plain64 0 8:16 0 1 sector_size:4096' + self.assertEqual( + swap_encryption_benchmark._parse_cipher(status), 'aes-xts-plain64' + ) + + def test_parse_cipher_returns_empty_when_no_crypt_token(self): + status = '0 67108864 linear 8:16 0' + self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '') + + def test_parse_cipher_returns_empty_on_empty_string(self): + self.assertEqual(swap_encryption_benchmark._parse_cipher(''), '') + + def test_parse_cipher_crypt_at_end_returns_empty(self): + # 'crypt' present but no token after it. + status = 'something crypt' + self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '') + + def test_parse_cipher_not_encrypted_string(self): + # Output from the benchmark when dm-crypt not active. + status = 'not_encrypted' + self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '') + + +class DetectSwapDeviceTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for _detect_swap_device() with mocked PodExec.""" + + def _make_ds(self, pod_exec_output): + ds = mock.Mock() + ds.PodExec.return_value = (pod_exec_output, '') + return ds + + def test_detect_swap_device_returns_device_basename(self): + # /proc/swaps first device column (after header skip via awk NR>1). 
+ ds = self._make_ds('/dev/dm-0\n') + result = swap_encryption_benchmark._detect_swap_device(ds) + self.assertEqual(result, 'dm-0') + + def test_detect_swap_device_returns_first_device_when_multiple(self): + ds = self._make_ds('/dev/dm-0\n/dev/dm-1\n') + result = swap_encryption_benchmark._detect_swap_device(ds) + self.assertEqual(result, 'dm-0') + + def test_detect_swap_device_returns_empty_when_no_swap(self): + ds = self._make_ds('') + result = swap_encryption_benchmark._detect_swap_device(ds) + self.assertEqual(result, '') + + def test_detect_swap_device_returns_empty_on_pod_exec_exception(self): + ds = mock.Mock() + ds.PodExec.side_effect = Exception('pod not found') + result = swap_encryption_benchmark._detect_swap_device(ds) + self.assertEqual(result, '') + + +class BuildMetadataTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for _build_metadata() with mocked PodExec.""" + + def test_build_metadata_includes_swap_device(self): + ds = mock.Mock() + ds.PodExec.return_value = ('5.15.0-gke-1234\n', '') + meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0') + self.assertEqual(meta['swap_device'], 'dm-0') + + def test_build_metadata_swap_device_unknown_when_empty(self): + ds = mock.Mock() + ds.PodExec.return_value = ('5.15.0\n', '') + meta = swap_encryption_benchmark._build_metadata(ds, '') + self.assertEqual(meta['swap_device'], 'unknown') + + def test_build_metadata_includes_kernel_version(self): + ds = mock.Mock() + ds.PodExec.return_value = ('5.15.0-gke-1234\n', '') + meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0') + self.assertEqual(meta['kernel_version'], '5.15.0-gke-1234') + + def test_build_metadata_kernel_version_absent_on_pod_exec_exception(self): + ds = mock.Mock() + ds.PodExec.side_effect = Exception('timeout') + meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0') + self.assertNotIn('kernel_version', meta) + + +if __name__ == '__main__': + unittest.main() diff --git 
a/tests/providers/gcp/google_kubernetes_engine_test.py b/tests/providers/gcp/google_kubernetes_engine_test.py index dbf8232f5e..421d99cd02 100644 --- a/tests/providers/gcp/google_kubernetes_engine_test.py +++ b/tests/providers/gcp/google_kubernetes_engine_test.py @@ -34,6 +34,7 @@ from perfkitbenchmarker.resources.container_service import container from perfkitbenchmarker.resources.container_service import kubectl from perfkitbenchmarker.resources.container_service import kubernetes_commands +from perfkitbenchmarker.resources.container_service import swap_config as swap_config_lib from tests import pkb_common_test_case FLAGS = flgs.FLAGS @@ -949,5 +950,160 @@ def testCreateWithPerNodepoolAutoscaling(self): self.assertIn('--max-nodes 10', nodepool_cmd) +class GoogleKubernetesEngineSwapConfigTestCase(PatchedObjectsTestCase): + """Tests that _AddNodeParamsToCmd wires swap_config flags correctly.""" + + @staticmethod + def _make_swap_spec( + boot_disk_iops=160000, + boot_disk_throughput=2400, + lssd=False, + lssd_count=0, + ): + """Build a ContainerClusterSpec with swap_config on the benchmark nodepool.""" + return container_spec.ContainerClusterSpec( + 'NAME', + **{ + 'cloud': 'GCP', + 'vm_spec': { + 'GCP': { + 'machine_type': 'e2-medium', + 'zone': 'us-central1-a', + }, + }, + 'nodepools': { + 'benchmark': { + 'vm_spec': { + 'GCP': { + 'machine_type': 'n4-highmem-32', + 'zone': 'us-central1-a', + }, + }, + 'swap_config': { + 'enabled': True, + 'swappiness': 100, + 'min_free_kbytes': 200, + 'watermark_scale_factor': 500, + 'lssd': lssd, + 'lssd_count': lssd_count, + 'boot_disk_iops': boot_disk_iops, + 'boot_disk_throughput': boot_disk_throughput, + }, + }, + }, + }, + ) + + def setUp(self): + super().setUp() + # Avoid real tempfile creation in GKE command-generation tests. + # GkeSwapConfig implementation is tested separately in swap_config_test.py. 
+ self.enter_context( + mock.patch.object( + swap_config_lib.GkeSwapConfig, + 'WriteLinuxConfigYaml', + return_value='/tmp/fake_linux_config.yaml', + ) + ) + + def test_swap_config_sets_system_config_from_file_flag(self): + spec = self._make_swap_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertIsNotNone(nodepool_cmd) + self.assertIn('--system-config-from-file', nodepool_cmd) + self.assertIn('/tmp/fake_linux_config.yaml', nodepool_cmd) + + def test_swap_config_sets_ubuntu_containerd_image_type(self): + spec = self._make_swap_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertIn('UBUNTU_CONTAINERD', nodepool_cmd) + + def test_swap_config_sets_no_enable_autorepair(self): + spec = self._make_swap_spec() + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertIn('--no-enable-autorepair', nodepool_cmd) + + def test_swap_config_with_boot_disk_iops_sets_provisioned_flags(self): + spec = self._make_swap_spec(boot_disk_iops=160000, boot_disk_throughput=2400) + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertIn('--boot-disk-provisioned-iops', nodepool_cmd) + self.assertIn('--boot-disk-provisioned-throughput', nodepool_cmd) + + def test_swap_config_lssd_omits_boot_disk_provisioned_flags(self): + # When lssd=True the swap device is local NVMe, not 
the boot disk. + spec = self._make_swap_spec(lssd=True, lssd_count=2, boot_disk_iops=0) + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertNotIn('--boot-disk-provisioned-iops', nodepool_cmd) + self.assertNotIn('--boot-disk-provisioned-throughput', nodepool_cmd) + + def test_nodepool_without_swap_config_omits_all_swap_flags(self): + spec = container_spec.ContainerClusterSpec( + 'NAME', + **{ + 'cloud': 'GCP', + 'vm_spec': { + 'GCP': { + 'machine_type': 'e2-medium', + 'zone': 'us-central1-a', + }, + }, + 'nodepools': { + 'benchmark': { + 'vm_spec': { + 'GCP': { + 'machine_type': 'n4-highmem-32', + 'zone': 'us-central1-a', + }, + }, + }, + }, + }, + ) + with self.patch_critical_objects() as issue_command: + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + nodepool_cmd = issue_command.GetCommandWithSubstring( + 'node-pools create benchmark' + ) + self.assertNotIn('--system-config-from-file', nodepool_cmd) + self.assertNotIn('UBUNTU_CONTAINERD', nodepool_cmd) + self.assertNotIn('--no-enable-autorepair', nodepool_cmd) + + def test_cleanup_yaml_called_after_nodepool_create(self): + spec = self._make_swap_spec() + with mock.patch.object( + swap_config_lib.GkeSwapConfig, 'CleanupYaml' + ) as mock_cleanup: + with self.patch_critical_objects(): + cluster = google_kubernetes_engine.GkeCluster(spec) + cluster._Create() + mock_cleanup.assert_called_once() + + if __name__ == '__main__': unittest.main() diff --git a/tests/resources/container_service/swap_config_test.py b/tests/resources/container_service/swap_config_test.py new file mode 100644 index 0000000000..f71ba04d8e --- /dev/null +++ b/tests/resources/container_service/swap_config_test.py @@ -0,0 +1,260 @@ +# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for perfkitbenchmarker.resources.container_service.swap_config.""" + +import os +import unittest +from unittest import mock + +from perfkitbenchmarker.resources.container_service import swap_config +from tests import pkb_common_test_case + + +class BaseSwapConfigTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for the abstract BaseSwapConfig class.""" + + def test_default_attrs(self): + cfg = swap_config.BaseSwapConfig() + self.assertEqual(cfg.swappiness, 100) + self.assertEqual(cfg.min_free_kbytes, 200) + self.assertEqual(cfg.watermark_scale_factor, 500) + + def test_custom_attrs(self): + cfg = swap_config.BaseSwapConfig( + swappiness=60, min_free_kbytes=400, watermark_scale_factor=200 + ) + self.assertEqual(cfg.swappiness, 60) + self.assertEqual(cfg.min_free_kbytes, 400) + self.assertEqual(cfg.watermark_scale_factor, 200) + + def test_from_spec_raises_not_implemented(self): + with self.assertRaises(NotImplementedError): + swap_config.BaseSwapConfig.from_spec(mock.Mock()) + + def test_create_is_noop(self): + cfg = swap_config.BaseSwapConfig() + cfg._Create() # Must not raise. + + def test_delete_is_noop(self): + cfg = swap_config.BaseSwapConfig() + cfg._Delete() # Must not raise. 
+ + +class GkeSwapConfigTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for GkeSwapConfig: YAML generation, Hyperdisk clamping, lifecycle.""" + + def _make_spec(self, **kwargs): + """Return a mock SwapConfigSpec with sensible defaults.""" + spec = mock.Mock() + spec.swappiness = kwargs.get('swappiness', 100) + spec.min_free_kbytes = kwargs.get('min_free_kbytes', 200) + spec.watermark_scale_factor = kwargs.get('watermark_scale_factor', 500) + spec.lssd = kwargs.get('lssd', False) + spec.lssd_count = kwargs.get('lssd_count', 0) + spec.boot_disk_iops = kwargs.get('boot_disk_iops', 0) + spec.boot_disk_throughput = kwargs.get('boot_disk_throughput', 0) + return spec + + # ── from_spec ───────────────────────────────────────────────────────────── + + def test_from_spec_maps_all_attrs(self): + spec = self._make_spec( + swappiness=60, + min_free_kbytes=400, + watermark_scale_factor=200, + lssd=True, + lssd_count=2, + boot_disk_iops=160000, + boot_disk_throughput=2400, + ) + cfg = swap_config.GkeSwapConfig.from_spec(spec) + self.assertEqual(cfg.swappiness, 60) + self.assertEqual(cfg.min_free_kbytes, 400) + self.assertEqual(cfg.watermark_scale_factor, 200) + self.assertTrue(cfg.lssd) + self.assertEqual(cfg.lssd_count, 2) + self.assertEqual(cfg.boot_disk_iops, 160000) + self.assertEqual(cfg.boot_disk_throughput, 2400) + + # ── WriteLinuxConfigYaml ────────────────────────────────────────────────── + + def test_write_linux_config_yaml_basic_content(self): + cfg = swap_config.GkeSwapConfig( + swappiness=80, min_free_kbytes=300, watermark_scale_factor=400 + ) + path = cfg.WriteLinuxConfigYaml() + try: + with open(path) as f: + content = f.read() + self.assertIn('linuxConfig:', content) + self.assertIn('swapConfig:', content) + self.assertIn('enabled: true', content) + self.assertIn('vm.swappiness: 80', content) + self.assertIn('vm.min_free_kbytes: 300', content) + self.assertIn('vm.watermark_scale_factor: 400', content) + finally: + cfg.CleanupYaml() + + def 
test_write_linux_config_yaml_no_lssd_has_no_disk_profile(self): + cfg = swap_config.GkeSwapConfig(lssd=False) + path = cfg.WriteLinuxConfigYaml() + try: + with open(path) as f: + content = f.read() + self.assertNotIn('dedicatedLocalSsdProfile', content) + self.assertNotIn('diskCount', content) + finally: + cfg.CleanupYaml() + + def test_write_linux_config_yaml_lssd_includes_disk_profile(self): + cfg = swap_config.GkeSwapConfig(lssd=True, lssd_count=2) + path = cfg.WriteLinuxConfigYaml() + try: + with open(path) as f: + content = f.read() + self.assertIn('dedicatedLocalSsdProfile:', content) + self.assertIn('diskCount: 2', content) + finally: + cfg.CleanupYaml() + + def test_write_linux_config_yaml_returns_existing_file_path(self): + cfg = swap_config.GkeSwapConfig() + path = cfg.WriteLinuxConfigYaml() + try: + self.assertTrue(os.path.isfile(path)) + finally: + cfg.CleanupYaml() + + # ── CleanupYaml ─────────────────────────────────────────────────────────── + + def test_cleanup_yaml_removes_tempfile(self): + cfg = swap_config.GkeSwapConfig() + path = cfg.WriteLinuxConfigYaml() + self.assertTrue(os.path.exists(path)) + cfg.CleanupYaml() + self.assertFalse(os.path.exists(path)) + + def test_cleanup_yaml_noop_before_write(self): + cfg = swap_config.GkeSwapConfig() + cfg.CleanupYaml() # Must not raise. + + def test_cleanup_yaml_noop_on_second_call(self): + cfg = swap_config.GkeSwapConfig() + cfg.WriteLinuxConfigYaml() + cfg.CleanupYaml() + cfg.CleanupYaml() # Second call must not raise. + + # ── ValidHyperdiskThroughput ────────────────────────────────────────────── + + def test_valid_hyperdisk_throughput_no_clamp_needed(self): + # min_throughput = ceil(160000 / 256) = 625; 2400 > 625 → unchanged. + cfg = swap_config.GkeSwapConfig( + boot_disk_iops=160000, boot_disk_throughput=2400 + ) + self.assertEqual(cfg.ValidHyperdiskThroughput(), 2400) + + def test_valid_hyperdisk_throughput_clamps_up(self): + # min_throughput = ceil(160000 / 256) = 625; 100 < 625 → clamp to 625. 
+ cfg = swap_config.GkeSwapConfig( + boot_disk_iops=160000, boot_disk_throughput=100 + ) + self.assertEqual(cfg.ValidHyperdiskThroughput(), 625) + + def test_valid_hyperdisk_throughput_no_iops_returns_throughput(self): + # iops=0 means no constraint → return throughput unchanged. + cfg = swap_config.GkeSwapConfig(boot_disk_iops=0, boot_disk_throughput=500) + self.assertEqual(cfg.ValidHyperdiskThroughput(), 500) + + def test_valid_hyperdisk_throughput_both_zero_returns_zero(self): + cfg = swap_config.GkeSwapConfig(boot_disk_iops=0, boot_disk_throughput=0) + self.assertEqual(cfg.ValidHyperdiskThroughput(), 0) + + def test_valid_hyperdisk_throughput_exact_minimum_no_clamp(self): + # iops=256, throughput=1 → min=1; exactly at boundary → unchanged. + cfg = swap_config.GkeSwapConfig(boot_disk_iops=256, boot_disk_throughput=1) + self.assertEqual(cfg.ValidHyperdiskThroughput(), 1) + + +class EksSwapConfigTest(pkb_common_test_case.PkbCommonTestCase): + """Tests for EksSwapConfig: nodeadm YAML output and from_spec mapping.""" + + def _make_spec(self, **kwargs): + spec = mock.Mock() + spec.swappiness = kwargs.get('swappiness', 100) + spec.min_free_kbytes = kwargs.get('min_free_kbytes', 200) + spec.watermark_scale_factor = kwargs.get('watermark_scale_factor', 500) + return spec + + # ── from_spec ───────────────────────────────────────────────────────────── + + def test_from_spec_maps_sysctl_attrs(self): + spec = self._make_spec( + swappiness=60, min_free_kbytes=400, watermark_scale_factor=200 + ) + cfg = swap_config.EksSwapConfig.from_spec(spec) + self.assertEqual(cfg.swappiness, 60) + self.assertEqual(cfg.min_free_kbytes, 400) + self.assertEqual(cfg.watermark_scale_factor, 200) + + def test_from_spec_eks_specific_attrs_use_defaults(self): + # from_spec does not accept memory_swap_behavior / fail_swap_on from spec. 
+ cfg = swap_config.EksSwapConfig.from_spec(self._make_spec()) + self.assertEqual(cfg.memory_swap_behavior, 'LimitedSwap') + self.assertFalse(cfg.fail_swap_on) + + # ── GetNodeadmConfig ────────────────────────────────────────────────────── + + def test_get_nodeadm_config_api_version(self): + cfg = swap_config.EksSwapConfig() + self.assertIn('apiVersion: node.eks.aws/v1alpha1', cfg.GetNodeadmConfig()) + + def test_get_nodeadm_config_memory_swap_behavior(self): + cfg = swap_config.EksSwapConfig() + self.assertIn('memorySwapBehavior: LimitedSwap', cfg.GetNodeadmConfig()) + + def test_get_nodeadm_config_fail_swap_on_false(self): + cfg = swap_config.EksSwapConfig(fail_swap_on=False) + self.assertIn('failSwapOn: false', cfg.GetNodeadmConfig()) + + def test_get_nodeadm_config_sysctl_keys_present(self): + cfg = swap_config.EksSwapConfig() + output = cfg.GetNodeadmConfig() + self.assertIn('vm.swappiness:', output) + self.assertIn('vm.min_free_kbytes:', output) + self.assertIn('vm.watermark_scale_factor:', output) + + def test_get_nodeadm_config_reflects_custom_sysctl_values(self): + cfg = swap_config.EksSwapConfig( + swappiness=60, min_free_kbytes=400, watermark_scale_factor=200 + ) + output = cfg.GetNodeadmConfig() + self.assertIn('vm.swappiness: 60', output) + self.assertIn('vm.min_free_kbytes: 400', output) + self.assertIn('vm.watermark_scale_factor: 200', output) + + # ── _Create stub ────────────────────────────────────────────────────────── + + def test_create_logs_deferred_warning(self): + cfg = swap_config.EksSwapConfig() + with self.assertLogs(level='WARNING') as log_ctx: + cfg._Create() + combined = ' '.join(log_ctx.output).lower() + self.assertTrue( + 'stub' in combined or 'deferred' in combined, + msg=f'Expected "stub" or "deferred" in log output: {log_ctx.output}', + ) + + +if __name__ == '__main__': + unittest.main() From 7056b1a14e3267e7631e0c82e903f93e70c233ab Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Wed, 1 Jul 2026 12:56:11 +0530 Subject: [PATCH 
16/17] fix(swap_config): quote sysctl values as strings in GKE linuxConfig YAML --- .../resources/container_service/swap_config.py | 6 +++--- tests/resources/container_service/swap_config_test.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py index ca36dbad8b..a38b7adccf 100644 --- a/perfkitbenchmarker/resources/container_service/swap_config.py +++ b/perfkitbenchmarker/resources/container_service/swap_config.py @@ -191,9 +191,9 @@ def WriteLinuxConfigYaml(self) -> str: 'linuxConfig:\n' + swap_block + ' sysctl:\n' - + f' vm.swappiness: {self.swappiness}\n' - + f' vm.min_free_kbytes: {self.min_free_kbytes}\n' - + f' vm.watermark_scale_factor: {self.watermark_scale_factor}\n' + + f' vm.swappiness: "{self.swappiness}"\n' + + f' vm.min_free_kbytes: "{self.min_free_kbytes}"\n' + + f' vm.watermark_scale_factor: "{self.watermark_scale_factor}"\n' ) tmp = tempfile.NamedTemporaryFile( diff --git a/tests/resources/container_service/swap_config_test.py b/tests/resources/container_service/swap_config_test.py index f71ba04d8e..0d965ed2ce 100644 --- a/tests/resources/container_service/swap_config_test.py +++ b/tests/resources/container_service/swap_config_test.py @@ -100,9 +100,9 @@ def test_write_linux_config_yaml_basic_content(self): self.assertIn('linuxConfig:', content) self.assertIn('swapConfig:', content) self.assertIn('enabled: true', content) - self.assertIn('vm.swappiness: 80', content) - self.assertIn('vm.min_free_kbytes: 300', content) - self.assertIn('vm.watermark_scale_factor: 400', content) + self.assertIn('vm.swappiness: "80"', content) + self.assertIn('vm.min_free_kbytes: "300"', content) + self.assertIn('vm.watermark_scale_factor: "400"', content) finally: cfg.CleanupYaml() From c8a8dd91b022388348183435b34fa6ceb7f751e0 Mon Sep 17 00:00:00 2001 From: DevVegeta Date: Wed, 1 Jul 2026 23:50:29 +0530 Subject: [PATCH 
17/17] fix(swap_config): raise default min_free_kbytes from 200 to 67584 (the GKE minimum) --- .../resources/container_service/swap_config.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py index a38b7adccf..6184220520 100644 --- a/perfkitbenchmarker/resources/container_service/swap_config.py +++ b/perfkitbenchmarker/resources/container_service/swap_config.py @@ -35,7 +35,7 @@ swap_config: enabled: true swappiness: 100 - min_free_kbytes: 200 + min_free_kbytes: 67584 watermark_scale_factor: 500 boot_disk_iops: 160000 boot_disk_throughput: 2400 @@ -75,7 +75,7 @@ class BaseSwapConfig(resource.BaseResource): def __init__( self, swappiness: int = 100, - min_free_kbytes: int = 200, + min_free_kbytes: int = 67584, watermark_scale_factor: int = 500, ) -> None: super().__init__() @@ -111,7 +111,7 @@ class GkeSwapConfig(BaseSwapConfig): Attributes: swappiness: vm.swappiness sysctl value (0-200, default 100). - min_free_kbytes: vm.min_free_kbytes sysctl (default 200). + min_free_kbytes: vm.min_free_kbytes sysctl (default 67584, the GKE minimum). watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500). lssd: True if the nodepool uses local NVMe SSDs for swap device. lssd_count: Number of local NVMe SSDs (dedicatedLocalSsdProfile.diskCount). @@ -125,7 +125,7 @@ class GkeSwapConfig(BaseSwapConfig): def __init__( self, swappiness: int = 100, - min_free_kbytes: int = 200, + min_free_kbytes: int = 67584, watermark_scale_factor: int = 500, lssd: bool = False, lssd_count: int = 0, @@ -272,7 +272,7 @@ class EksSwapConfig(BaseSwapConfig): def __init__( self, swappiness: int = 100, - min_free_kbytes: int = 200, + min_free_kbytes: int = 67584, watermark_scale_factor: int = 500, memory_swap_behavior: str = 'LimitedSwap', fail_swap_on: bool = False,