From cf0f1d8af8e2e595155cb01f168fe4b511a7dbd3 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Fri, 19 Jun 2026 09:43:22 +0530
Subject: [PATCH 01/17] PR1: swap-encryption benchmark - shared DaemonSet/pod
 infra (layer 1/5); manifest moved to data/cluster and rendered via vm_util

---
 .../cluster/swap_encryption_daemonset.yaml.j2 |  266 +++
 .../swap_encryption_benchmark.py              | 1529 +++++++++++++++++
 2 files changed, 1795 insertions(+)
 create mode 100644 perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
 create mode 100644 perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py

diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
new file mode 100644
index 0000000000..c40ec79dff
--- /dev/null
+++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
@@ -0,0 +1,266 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: {{ ds_name }}
+  namespace: {{ ds_namespace }}
+  labels:
+    app: {{ ds_label }}
+spec:
+  selector:
+    matchLabels:
+      app: {{ ds_label }}
+  template:
+    metadata:
+      labels:
+        app: {{ ds_label }}
+    spec:
+      hostPID: true
+      hostNetwork: true
+      # Pin to the benchmark nodepool — never schedule on the dummy default pool.
+      nodeSelector:
+        pkb_nodepool: {{ benchmark_nodepool }}
+      tolerations:
+      - operator: Exists
+      containers:
+      - name: benchmark
+        image: {{ image }}
+        command:
+        - bash
+        - -c
+        - |
+          echo "[pkb] Installing benchmark tools..."
+          # Retry apt-get up to 3 times — transient network failures are
+          # common on a freshly-started GKE node.  Critical tools (fio,
+          # stress-ng) must be present before we write the ready sentinel;
+          # a silent || true here would cause /tmp/pkb_ready to appear even
+          # when tools are missing, breaking all subsequent phases.
+          PKB_APT_OK=0
+          for _attempt in 1 2 3; do
+            apt-get update -qq 2>&1 || true
+            DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
+              fio \
+              stress-ng \
+              sysstat \
+              cryptsetup \
+              mdadm \
+              redis-server \
+              redis-tools \
+              git \
+              wget \
+              curl \
+              make \
+              gcc \
+              bc \
+              flex \
+              bison \
+              libelf-dev \
+              libssl-dev \
+              cgroup-tools \
+              nvme-cli \
+              util-linux \
+              python3-pip \
+              libevent-dev \
+              libssl-dev \
+              libpcre3-dev \
+              zlib1g-dev \
+              build-essential \
+              autoconf \
+              automake \
+              libtool \
+              libtool-bin \
+              pkg-config \
+              python3-dev \
+              default-jre-headless \
+              2>&1 && PKB_APT_OK=1 && break
+            echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
+            sleep 15
+          done
+          if [ "$PKB_APT_OK" != "1" ] || \
+             ! command -v fio >/dev/null 2>&1 || \
+             ! command -v stress-ng >/dev/null 2>&1; then
+            echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
+            exit 1
+          fi
+          echo "[pkb] Installing memtier_benchmark from source..."
+          # Pin a stable release tag — building from the moving default
+          # branch (HEAD) intermittently broke (memtier_benchmark not found
+          # → Phase 3a lost its P50/P90/P99 latency).  2.2.1 matches the
+          # version PKB's memtier package (memtier.MemtierResult.Parse) is
+          # validated against and builds cleanly with the apt deps above.
+          # Fall back to HEAD only if the tagged clone fails.
+          if ! command -v memtier_benchmark >/dev/null 2>&1; then
+            (cd /tmp && \
+              rm -rf memtier_benchmark && \
+              ( git clone --depth 1 --branch 2.2.1 \
+                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \
+                git clone --depth 1 \
+                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \
+              cd memtier_benchmark && \
+              autoreconf -ivf 2>&1 && \
+              ./configure 2>&1 && \
+              make -j$(nproc) 2>&1 && \
+              make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \
+              echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
+          fi
+          if command -v memtier_benchmark >/dev/null 2>&1; then
+            echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
+          fi
+          echo "[pkb] Installing esrally (lightweight)..."
+          python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
+          pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
+            pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \
+            echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
+          if command -v esrally >/dev/null 2>&1; then
+            echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
+          else
+            echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
+          fi
+          echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
+          # Phase 3c needs a real search server on :9200.  Nothing in apt
+          # ships one and the pod has no systemd, so install the OpenSearch
+          # bundle (ships its own JDK) and launch the binary directly in the
+          # phase.  All best-effort: if any step fails the phase probes the
+          # endpoint and skips cleanly rather than recording fake timings.
+          if [ ! -x /opt/opensearch/bin/opensearch ]; then
+            OS_VER=2.15.0
+            (cd /opt && \
+              wget -q --timeout=600 -O os.tgz \
+                "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \
+              tar -xzf os.tgz && rm -f os.tgz && \
+              mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \
+              echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
+          fi
+          if [ -x /opt/opensearch/bin/opensearch ]; then
+            # pkbos owns and runs OpenSearch (it refuses to run as root).
+            # Give it a home so HOME/temp paths are writable.
+            id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
+            printf 'discovery.type: single-node\nnetwork.host: 127.0.0.1\nplugins.security.disabled: true\n' \
+              > /opt/opensearch/config/opensearch.yml
+            mkdir -p /opt/opensearch/config/jvm.options.d
+            # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
+            # On a 252 GB node this still leaves plenty of page cache to
+            # pressure into swap during the phase.
+            printf -- '-Xms2g\n-Xmx2g\n' \
+              > /opt/opensearch/config/jvm.options.d/pkb-heap.options
+            sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
+            # CRITICAL: never run the binary as root here (it bails and
+            # leaves root-owned files in logs/ that block the pkbos server).
+            # Clear any stale logs and chown everything to pkbos LAST.
+            rm -f /opt/opensearch/logs/* 2>/dev/null || true
+            chown -R pkbos /opt/opensearch 2>/dev/null || true
+            echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
+          fi
+          echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
+          PKB_KVER="{{ kernel_version }}"
+          PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
+          PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
+          PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
+          PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
+          mkdir -p "$PKB_KROOT"
+          if [ ! -f "$PKB_KTARBALL" ]; then
+            wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \
+              echo "[pkb] WARNING: kernel tarball download failed" >&2
+          fi
+          if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
+            echo "[pkb] Extracting kernel source (xz)..."
+            tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \
+              echo "[pkb] WARNING: kernel source extraction failed" >&2
+          fi
+          echo "[pkb] Unlocking container cgroup swap limits..."
+          # GKE cgroup v2 sets memory.swap.max=0 per-container, which
+          # prevents swap usage even when the node has a swap device and
+          # vm.swappiness>0.  Stress-ng gets OOM-killed in ~15s because
+          # the kernel can't page out to swap for this cgroup.
+          #
+          # NOTE: the old approach derived the cgroup path from
+          # /proc/self/cgroup, but inside a cgroup namespace that reports
+          # "0::/" — so the write targeted the host ROOT cgroup, silently
+          # no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
+          # /sys is the host cgroup tree (hostPath mount) and this pod is
+          # privileged, so instead unlock swap across the entire kubepods
+          # hierarchy, which is guaranteed to contain our own container.
+          if [ -d /sys/fs/cgroup/kubepods.slice ] || \
+             [ -d /sys/fs/cgroup/kubepods ]; then
+            # cgroup v2: write 'max' to every memory.swap.max under kubepods*.
+            find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
+              2>/dev/null | while read -r _f; do
+                echo max > "$_f" 2>/dev/null || true
+              done
+          fi
+          # Best-effort: our own namespaced path and the unified root.
+          PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
+            2>/dev/null)
+          for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
+                      /sys/fs/cgroup/memory.swap.max; do
+            [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
+          done
+          # cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
+          find /sys/fs/cgroup/memory -path '*kubepods*' \
+            -name memory.memsw.limit_in_bytes 2>/dev/null \
+            | while read -r _f; do
+                echo -1 > "$_f" 2>/dev/null || true
+              done
+          # Verify and surface the result in the pod log.  grep -L lists
+          # files that do NOT contain 'max' on their first line, i.e. ones
+          # still capping swap.
+          PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
+            -name memory.swap.max 2>/dev/null \
+            | xargs -r grep -L '^max' 2>/dev/null | head -1)
+          if [ -n "$PKB_STILL_CAPPED" ]; then
+            echo "[pkb] WARNING: cgroup swap still capped at" \
+              "$PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null)" \
+              "— stress-ng may be OOM-killed before swap is exercised" >&2
+          else
+            echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
+          fi
+          echo "[pkb] Tools installed. Writing ready sentinel."
+          touch /tmp/pkb_ready
+          sleep infinity
+        securityContext:
+          privileged: true
+          capabilities:
+            add: ["SYS_ADMIN", "IPC_LOCK"]
+        resources:
+          requests:
+            memory: "512Mi"
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        volumeMounts:
+        - name: dev
+          mountPath: /dev
+        - name: sys
+          mountPath: /sys
+        - name: run
+          mountPath: /run
+        - name: proc-host
+          mountPath: /proc-host
+          readOnly: true
+        - name: stateful-partition
+          mountPath: /mnt/stateful_partition
+        - name: lib-modules
+          mountPath: /lib/modules
+          readOnly: true
+      volumes:
+      - name: dev
+        hostPath:
+          path: /dev
+      - name: sys
+        hostPath:
+          path: /sys
+      - name: run
+        hostPath:
+          path: /run
+      - name: proc-host
+        hostPath:
+          path: /proc
+      - name: stateful-partition
+        hostPath:
+          path: /mnt/stateful_partition
+          type: DirectoryOrCreate
+      - name: lib-modules
+        hostPath:
+          path: /lib/modules
+          type: Directory
diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
new file mode 100644
index 0000000000..5bdc933bba
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -0,0 +1,1529 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark.
+
+Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws
+
+== Architecture ==
+
+Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's
+container_cluster abstraction, then deploys a privileged DaemonSet whose
+pod has full host-device access (/dev, /sys, hostPID).  All benchmark
+phases execute inside this pod via kubectl exec, so measurements reflect
+actual cluster-node behaviour including Kubernetes overhead (kubelet,
+containerd cgroup hierarchy, etc.).
+
+  GKE nodes  ── dm-crypt with ephemeral key (go/node:swap-encryption)
+                 swap device: /dev/mapper/swap_encrypted (over dedicated
+                 hyperdisk or LSSD RAID-0 /dev/md0).
+                 Single-disk fallback: plain loop device on
+                 /mnt/stateful_partition — dm-crypt is blocked by COS
+                 kernel namespace restrictions from inside a pod.
+
+  EKS nodes  ── NVMe Instance Store, Nitro hardware-offloaded encryption
+                 swap device: /dev/nvme1n1 (or auto-detected)
+
+== Benchmark Phases ==
+
+  Phase 1 – fio Microbenchmarks
+    Run fio directly on the swap block device (swapoff first) to measure
+    the hardware + encryption ceiling: random IOPS (4K), sequential
+    bandwidth (1M), and completion latency (iodepth=1).
+
+  Phase 2a – CPU Overhead
+    stress-ng drives sustained swap I/O; vmstat and pidstat capture
+    swap-in/out rates and per-process CPU cost (kswapd, kcryptd,
+    dm-crypt threads on GKE; Nitro offload on EKS).
+
+  Phase 2b – I/O Interference
+    Baseline fio on a scratch volume → re-run with concurrent swap
+    pressure.  IOPS/latency delta = storage contention cost.
+
+  Phase 3a – Redis Latency
+    Dataset loaded beyond container memory limit → GET/SET p99 latency
+    measured while kernel swaps pages.
+
+  Phase 3b – Kernel Build
+    Linux compiled inside a memory-capped cgroup; slowdown ratio vs
+    unconstrained baseline.
+
+  Phase 3c – OpenSearch
+    Bulk-index + search query under swap pressure (esrally or curl).
+"""
+
+import json
+import logging
+import re
+import textwrap
+import time
+from typing import Any
+
+from absl import flags
+from perfkitbenchmarker import configs
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import sample
+from perfkitbenchmarker import vm_util
+from perfkitbenchmarker.resources.container_service import kubectl
+
+FLAGS = flags.FLAGS
+
+# ---------------------------------------------------------------------------
+# Benchmark identity
+# ---------------------------------------------------------------------------
+
+
+
+# Name of this benchmark.  GetConfig() passes it to configs.LoadConfig()
+# together with BENCHMARK_CONFIG, whose top-level key is the same string.
+# (FLAGS is already bound once above, right after the imports.)
+BENCHMARK_NAME = 'swap_encryption'
+
+
+BENCHMARK_CONFIG = """
+swap_encryption:
+  description: >
+    GKE vs. EKS swap encryption and LSSD performance comparison.
+    Two-step nodepool setup: PKB provisions a minimal cluster with a cheap
+    default nodepool (Step 1), then Prepare() adds the real benchmark
+    nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a
+    node-level startup script that configures dm-crypt swap before any pod
+    is scheduled, then removes the default nodepool (Step 2).  All benchmark
+    phases run inside a privileged DaemonSet pinned to the benchmark nodepool.
+  flags: {}
+  container_cluster:
+    type: Kubernetes
+    vm_count: 1
+    vm_spec:
+      GCP:
+        # Cheap placeholder — the benchmark nodepool is created in Prepare().
+        machine_type: e2-medium
+        boot_disk_size: 20
+      AWS:
+        # Cheap placeholder — the benchmark nodegroup is added in Prepare().
+        machine_type: t3.medium
+        boot_disk_size: 20
+"""
+
+
+_DAEMONSET_IMAGE = flags.DEFINE_string(
+    'swap_encryption_daemonset_image',
+    'ubuntu:22.04',
+    'Container image used for the privileged benchmark DaemonSet pod.',
+)
+
+
+_NODEPOOL = flags.DEFINE_string(
+    'swap_encryption_nodepool',
+    'benchmark',
+    'Name of the node pool to deploy the benchmark DaemonSet on.',
+)
+
+
+_INSTANCE_SIZE_LABEL = flags.DEFINE_string(
+    'swap_encryption_instance_size_label',
+    '',
+    'Human-readable label for the current instance size being tested, e.g. '
+    '"n4-highmem-32" or "i4i.4xlarge".  Stored in sample metadata so that '
+    'results from multiple PKB runs across different instance sizes can be '
+    'collated and compared.  Defaults to the value reported by the cloud '
+    'metadata endpoint inside the pod.',
+)
+
+
+_COLLECT_COST = flags.DEFINE_boolean(
+    'swap_encryption_collect_cost',
+    False,
+    'When True, emit a cost_estimate_usd sample using on-demand pricing '
+    'for the instance type detected at runtime.',
+)
+
+
+_FAIL_ON_DEGRADED = flags.DEFINE_boolean(
+    'swap_encryption_fail_on_degraded',
+    True,
+    'When True (default), raise an error at the end of Run() if the run was '
+    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
+    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
+    'swap-pressure phase was OOM-killed before completing.  This prevents PKB '
+    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
+    'empty or meaningless data.  Set False to keep the legacy behaviour of '
+    'always returning whatever partial samples were collected.',
+)
+
+
+_PHASES = flags.DEFINE_list(
+    'swap_encryption_phases',
+    ['all'],
+    'Which Run() phases to execute, for fast iteration against an '
+    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  '
+    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
+    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
+    '3b (kernel build), 3c (opensearch).  Default "all" runs everything.  '
+    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
+    'Phases not listed are skipped and do not affect the degraded-run gate '
+    '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
+)
+
+
+_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
+    'swap_encryption_benchmark_machine_type',
+    'n4-highmem-32',
+    'Machine type for the benchmark nodepool created in Prepare(). '
+    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
+    '(LSSD RAID-0).  The matching swap setup is selected automatically.',
+)
+
+
+_BENCHMARK_LSSD = flags.DEFINE_boolean(
+    'swap_encryption_lssd',
+    False,
+    'Force LSSD RAID-0 swap path even when the machine type name does not '
+    'contain "lssd".  Auto-detected from machine type when False.',
+)
+
+
+_LSSD_COUNT = flags.DEFINE_integer(
+    'swap_encryption_lssd_count',
+    1,
+    'Number of local NVMe SSDs to attach as raw block devices '
+    '(--local-nvme-ssd-block count=N).  Must match the fixed local SSD '
+    'count for the chosen machine type: c4-standard-8-lssd=1, '
+    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  '
+    'Default 1 covers most single-lssd machine types.',
+)
+
+
+_NODE_IMAGE_TYPE = flags.DEFINE_string(
+    'swap_encryption_node_image_type',
+    'UBUNTU_CONTAINERD',
+    'GKE node image type for the benchmark nodepool.  '
+    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
+    'down device-mapper at the kernel LSM level and cryptsetup hangs '
+    'indefinitely from any pod context (even privileged, even via nsenter '
+    'into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup '
+    'from privileged pods without restriction.  '
+    'Use COS_CONTAINERD only when dm-crypt is disabled '
+    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  '
+    'AL2 on EKS.',
+)
+
+
+_BOOT_DISK_TYPE = flags.DEFINE_string(
+    'swap_encryption_boot_disk_type',
+    'hyperdisk-balanced',
+    'Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced '
+    'for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 '
+    'dev/test machines, which do not support hyperdisk-balanced.',
+)
+
+
+_BOOT_DISK_IOPS = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_iops',
+    80000,
+    'Provisioned IOPS for the boot disk (hyperdisk-balanced only).  '
+    '80 000 is the COS max-IOPS target.  Ignored for pd-ssd.',
+)
+
+
+_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_throughput',
+    1200,
+    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
+    'only).  Must be set together with iops.  1200 MB/s pairs with 80 000 '
+    'IOPS for production; use 140 (minimum) for dev/test.  Ignored for '
+    'pd-ssd.',
+)
+
+
+_BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_size_gb',
+    500,
+    'Boot disk size in GiB for the benchmark nodepool.  500 GiB is '
+    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
+    '(see Engineer Assignments table in execution-plan.md).  '
+    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
+)
+
+
+_ADD_SWAP_DISK = flags.DEFINE_boolean(
+    'swap_encryption_add_swap_disk',
+    False,
+    'Attach a dedicated second disk to the benchmark nodepool for use as '
+    'the swap device.  Required for dm-crypt measurement on single-boot-disk '
+    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
+    'from pod namespaces.  The second disk is provisioned via '
+    '--additional-node-disk using the same type/IOPS/throughput as the boot '
+    'disk flags.',
+)
+
+
+_SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
+    'swap_encryption_swap_disk_size_gb',
+    500,
+    'Size in GiB of the dedicated swap disk when '
+    '--swap_encryption_add_swap_disk is True.  Must satisfy the '
+    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
+)
+
+
+_DS_NAME = 'pkb-swap-benchmark'  # DaemonSet metadata.name (ds_name)
+
+
+_DS_NAMESPACE = 'default'  # DaemonSet namespace (ds_namespace)
+
+
+_DS_LABEL = 'pkb-swap-benchmark'  # "app" label + selector (ds_label)
+
+
+_active_pod: list[str] = []  # single-element list so closures can mutate it
+
+
+_degraded_reasons: list[str] = []  # why Run() deems the run degraded
+
+
+_pod_lost: list[str] = []  # pods Run() reports as having gone NotFound
+
+
+_oom_events: list[str] = []  # pods Run() reports as OOM-killed (rc=137)
+
+
+_BENCHMARK_NODEPOOL = 'benchmark'  # pkb_nodepool label the DaemonSet selects
+
+
+_DEFAULT_NODEPOOL = 'default-pool'
+
+
+def _daemonset_yaml(image: str) -> str:
+  """Render the privileged benchmark DaemonSet from its Jinja2 template.
+
+  The manifest lives in data/cluster/swap_encryption_daemonset.yaml.j2.  Its
+  nodeSelector is filled with the benchmark nodepool name, so the pod is
+  never scheduled on the dummy default pool.
+  """
+  template_args = dict(
+      ds_name=_DS_NAME,
+      ds_namespace=_DS_NAMESPACE,
+      ds_label=_DS_LABEL,
+      benchmark_nodepool=_BENCHMARK_NODEPOOL,
+      image=image,
+      kernel_version=_KERNEL_VERSION.value,
+  )
+  return vm_util.ReadAndRenderJinja2Template(
+      'cluster/swap_encryption_daemonset.yaml.j2', **template_args)
+
+
+def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
+  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def Prepare(spec) -> None:
+  """Set up the benchmark nodepool, then deploy the privileged DaemonSet.
+
+  Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap
+  placeholder default nodepool.
+
+  Step 2 (this function); "GCP" means the cluster has a truthy `project`:
+    a. GCP: create the benchmark nodepool.
+    b. GCP: wait for the benchmark node, then attach the dedicated swap disk
+       if --swap_encryption_add_swap_disk is set.  Otherwise (EKS): label
+       all nodes pkb_nodepool=benchmark and call _ensure_io2_volume().
+    c. Deploy the DaemonSet and wait for the benchmark pod.
+    d. GCP: delete the default nodepool, then re-resolve the pod name.
+  """
+  cluster = spec.container_cluster
+
+  # ── Step 2a: add real benchmark nodepool ────────────────────────────────
+  if getattr(cluster, 'project', None):
+    # GCP path: true two-step nodepool setup
+    logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
+    _create_benchmark_node_pool(cluster)
+
+    # ── Step 2b: wait for the benchmark node to join and be Ready ─────────
+    logging.info('[swap_encryption] Step 2b: waiting for benchmark node')
+    _wait_for_benchmark_node()
+
+    # ── Step 2b2: attach dedicated swap disk (if requested) ───────────────
+    # --additional-node-disk is not available in all gcloud versions, so we
+    # create + attach the disk after the node is up using gcloud compute.
+    if _ADD_SWAP_DISK.value:
+      logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk')
+      _attach_swap_disk(cluster)
+  else:
+    # AWS / EKS: nodepool management is external.  PKB's cluster creation
+    # labels nodes pkb_nodepool=default, so re-label all existing nodes here
+    # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark).
+    logging.info(
+        '[swap_encryption] EKS cluster — labelling existing nodes with '
+        'pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
+        _BENCHMARK_NODEPOOL)
+    kubectl.RunKubectlCommand([
+        'label', 'nodes', '--all', '--overwrite',
+        f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+    ])
+    # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
+    # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
+    _ensure_io2_volume()
+
+  # ── Step 2c: deploy DaemonSet ────────────────────────────────────────────
+  # Deploy and wait for the pod BEFORE deleting the default nodepool.
+  # Deleting the default pool while the benchmark node is still joining causes
+  # a temporary API server i/o timeout (control plane busy with two nodepool
+  # ops simultaneously).  Once the pod is Running the cluster is fully stable.
+  logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet')
+  _deploy_daemonset()
+
+  pod = _wait_for_benchmark_pod()
+  logging.info('[swap_encryption] Benchmark pod ready: %s', pod)
+
+  # ── Step 2d: now safe to remove the dummy default nodepool ───────────────
+  if getattr(cluster, 'project', None):
+    logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool')
+    _delete_default_node_pool(cluster)
+    # The DaemonSet pod may be evicted and rescheduled with a new name during
+    # the nodepool deletion (cluster control plane briefly interrupts pod
+    # lifecycle).  Re-resolve the pod name to avoid stale-reference errors on
+    # all subsequent _pod_exec calls.
+    logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod '
+                 'after nodepool deletion')
+    pod = _wait_for_benchmark_pod()
+    logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod)
+
+
+def _phase_selected(token: str) -> bool:
+  """Tell whether phase `token` is enabled by --swap_encryption_phases.
+
+  With 'all' (the default) or no usable entries, every phase is enabled;
+  otherwise only the listed tokens run.  Tokens: fio, 2a, 2b, 3a, 3b, 3c.
+  """
+  wanted = {name.strip().lower() for name in _PHASES.value if name.strip()}
+  return not wanted or 'all' in wanted or token.lower() in wanted
+
+
+def Run(spec) -> list[sample.Sample]:
+  """Execute all benchmark phases with gate logic.
+
+  Execution is structured in three gated tiers matching the execution plan:
+    Tier 1 (Gate 1) — fio microbenchmarks: raw I/O ceiling of the swap device.
+      Fails if fio produces zero samples (device not found, O_DIRECT error).
+    Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference.  Requires
+      Gate 1 to pass; fails if stress-ng does not complete within timeout.
+    Tier 3 (Gate 3) — Redis, kernel build, OpenSearch.  Attempted whenever
+      Gate 1 passed; individual failures are logged, not fatal to the others.
+  If Gate 1 fails, Tiers 2 and 3 are skipped.
+
+  Returns:
+    The collected samples; a swap_encryption_run_status sample (1.0 clean,
+    0.0 degraded) is always appended last.
+
+  Raises:
+    errors.Benchmarks.RunError: The run was degraded and
+      --swap_encryption_fail_on_degraded is True.
+  """
+  pod = _wait_for_benchmark_pod()
+  # Initialise the module-level active-pod tracker so _pod_exec and
+  # _recover_pod can transparently redirect to a replacement pod if the
+  # original is evicted during the run.
+  _active_pod.clear()
+  _active_pod.append(pod)
+  _degraded_reasons.clear()
+  _pod_lost.clear()
+  _oom_events.clear()
+  original_pod = pod
+  swap_dev = _detect_swap_device(pod)
+  base_meta = _build_metadata(pod, swap_dev)
+  results: list[sample.Sample] = []
+  t_run_start = time.time()
+
+  logging.info('[swap_encryption] swap device: %s', swap_dev)
+
+  # ── Cost estimate ─────────────────────────────────────────────────────────
+  if _COLLECT_COST.value:
+    elapsed = time.time() - t_run_start
+    results += _collect_cost_sample(pod, elapsed, base_meta)
+
+  # ── Final degradation gate ────────────────────────────────────────────────
+  # The phase try/except blocks above keep the run alive so partial data is
+  # still collected, but that means a catastrophic failure (pod OOM-evicted
+  # mid-run, no fio data, stress-ng killed before it could drive swap I/O)
+  # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics.
+  # Detect those conditions here and surface them explicitly.
+  if _active_pod and _active_pod[0] != original_pod:
+    _degraded_reasons.append(
+        f'benchmark pod was replaced during the run '
+        f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap '
+        f'pressure; phases executed after the eviction ran against a '
+        f'freshly-initialised pod (empty /tmp, swap re-setup) and may be '
+        f'invalid')
+  if _pod_lost:
+    _degraded_reasons.append(
+        f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) '
+        f'— the pod died (node memory-pressure eviction or container exit) and '
+        f'any phase running at or after that point (e.g. kernel-build baseline, '
+        f'OpenSearch) produced invalid data')
+  if _oom_events:
+    _degraded_reasons.append(
+        f'OOM kill(s) (rc=137) occurred during the run on pod(s) '
+        f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by '
+        f'the OOM killer (the container may have restarted in place), so the '
+        f'affected phase(s) produced no or partial data')
+
+  degraded = bool(_degraded_reasons)
+  results.append(sample.Sample(
+      'swap_encryption_run_status',
+      0.0 if degraded else 1.0,
+      'status',
+      dict(base_meta,
+           degraded=degraded,
+           degraded_reasons='; '.join(_degraded_reasons) or 'none',
+           num_samples=len(results) + 1)))
+
+  if degraded:
+    msg = ('[swap_encryption] RUN DEGRADED — '
+           + '; '.join(_degraded_reasons))
+    logging.error(msg)
+    if _FAIL_ON_DEGRADED.value:
+      # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.
+      # NOTE(review): `results` is only handed to PKB via the return below,
+      # so raising here presumably drops these samples — confirm.
+      raise errors.Benchmarks.RunError(msg)
+  else:
+    logging.info('[swap_encryption] Run completed cleanly (%d samples)',
+                 len(results))
+
+  return results
+
+
+def Cleanup(spec) -> None:
+  """Stop workloads, tear down swap, and delete the DaemonSet and swap disk."""
+  pod = _wait_for_benchmark_pod(timeout=30)
+  if pod:
+    # Kill stray workloads FIRST: swapoff cannot drain swap they still occupy.
+    _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true",
+             ignore_failure=True)
+    _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True)
+    _pod_exec(pod, textwrap.dedent("""
+      swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+      dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    """), ignore_failure=True)
+    # Clean up loop device backing files (single-disk fallback path).
+    _pod_exec(pod, textwrap.dedent("""
+      for backing in /var/pkb_swap_backing /run/pkb_swap_backing \
+                     /mnt/stateful_partition/pkb_swap_backing
+      do
+        losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \
+          while read dev
+          do
+            losetup -d "$dev" 2>/dev/null || true
+          done
+        rm -f "$backing"
+      done
+    """), ignore_failure=True)
+  # Outside the `if`: the DaemonSet is removed even when no pod became ready.
+  _delete_daemonset()
+  # Detach and delete the dedicated swap disk if one was provisioned.
+  cluster = spec.container_cluster
+  if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None):
+    _detach_and_delete_swap_disk(cluster)
+
+
+def _deploy_daemonset() -> None:
+  """Write the DaemonSet manifest to a temp file and `kubectl apply` it."""
+  rendered = _daemonset_yaml(image=_DAEMONSET_IMAGE.value)
+  with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as tmp:
+    tmp.write(rendered)
+    tmp.close()  # Flush to disk before kubectl reads the file by name.
+    kubectl.RunKubectlCommand(['apply', '-f', tmp.name])
+  logging.info('[swap_encryption] DaemonSet applied')
+
+
+def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
+  """Wait until the DaemonSet pod is Running AND tools are installed.
+
+  The container writes /tmp/pkb_ready once its apt installs finish (~2-4 min
+  on a cold node); exec-ing earlier may find cryptsetup / fio not on PATH.
+
+  Args:
+    timeout: Maximum number of seconds to keep polling (one poll per 15 s).
+
+  Returns:
+    The ready pod's name, or None if no pod became ready before the deadline.
+  """
+  deadline = time.time() + timeout
+  last_phase = ''
+  ready_pod = None  # Pod name once its phase is Running.
+  while time.time() < deadline:
+    # ── Step 1: wait for Running phase ──────────────────────────────────────
+    if ready_pod is None:
+      # range/end prints nothing (rather than an index error) with no pods.
+      out, _, rc = kubectl.RunKubectlCommand([
+          'get', 'pods',
+          '-l', f'app={_DS_LABEL}',
+          '-n', _DS_NAMESPACE,
+          '-o',
+          r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}',
+      ], raise_on_failure=False)
+      if rc == 0 and out.strip():
+        for line in out.strip().splitlines():
+          parts = line.split('\t')
+          if len(parts) == 2:
+            pod_name, phase = parts[0].strip(), parts[1].strip()
+            if phase == 'Running':
+              logging.info('[swap_encryption] Pod %s is Running – '
+                           'waiting for tool install to finish...', pod_name)
+              ready_pod = pod_name
+              break
+            if phase != last_phase:
+              logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase)
+              last_phase = phase
+              if phase in ('Pending',):
+                _log_pod_events(pod_name)
+      else:
+        logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...')
+
+    # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
+    if ready_pod is not None:
+      _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([
+          'exec', ready_pod, '-n', _DS_NAMESPACE,
+          '--', 'test', '-f', '/tmp/pkb_ready',
+      ], raise_on_failure=False)
+      if sentinel_rc == 0:
+        logging.info(
+            '[swap_encryption] Pod %s ready (tools installed)', ready_pod)
+        return ready_pod
+      # `test -f` prints nothing itself, so stderr is kubectl's: a crashed
+      # container or a deleted/replaced pod — rediscover the pod by label.
+      err_low = sentinel_err.lower()
+      if 'not found' in err_low or 'unable to upgrade connection' in err_low:
+        logging.warning('[swap_encryption] Pod %s: container not running (%s) '
+                        '— will re-check pod state', ready_pod, sentinel_err.strip())
+        ready_pod = None
+        last_phase = ''
+      else:
+        logging.info(
+            '[swap_encryption] Pod %s: still installing tools...', ready_pod)
+
+    time.sleep(15)
+
+  logging.warning(
+      '[swap_encryption] Benchmark pod not ready after %ds', timeout)
+  return None
+
+
+def _log_pod_events(pod_name: str) -> None:
+  """Log the Events section of `kubectl describe pod` for the given pod.
+
+  When the output has no line starting with "Events:", its last 2000
+  characters are logged instead.
+  """
+  describe_cmd = ['describe', 'pod', pod_name, '-n', _DS_NAMESPACE]
+  described, _, _ = kubectl.RunKubectlCommand(
+      describe_cmd, raise_on_failure=False)
+  all_lines = described.splitlines()
+  start = next((i for i, text in enumerate(all_lines)
+                if text.startswith('Events:')), None)
+  if start is None:
+    logging.info('[swap_encryption] kubectl describe output:\n%s',
+                 described[-2000:])
+    return
+  # From the "Events:" header onward, capped at 30 lines.
+  logging.info('[swap_encryption] Pod events:\n%s',
+               '\n'.join(all_lines[start:start + 30]))
+
+
+def _delete_daemonset() -> None:
+  """Best-effort `kubectl delete` of the benchmark DaemonSet."""
+  delete_args = ['delete', 'daemonset', _DS_NAME, '-n', _DS_NAMESPACE]
+  # A missing DaemonSet is tolerated and the return code is not inspected.
+  kubectl.RunKubectlCommand(
+      delete_args + ['--ignore-not-found'], raise_on_failure=False)
+  # Logged unconditionally, whatever kubectl returned.
+  logging.info('[swap_encryption] DaemonSet deleted')
+
+
+def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str:
+  """Return a bash startup script that configures swap at node boot.
+
+  NOTE: currently unused and kept as reference.  GKE reserves the
+  `startup-script` node metadata key, so swap is set up from the privileged
+  DaemonSet pod instead (_setup_gke_hyperdisk_swap / _setup_gke_lssd_swap).
+
+  The script picks a target device, optionally maps it through dm-crypt, then
+  runs mkswap + swapon; it exits 0 untouched when no candidate device exists.
+
+  Args:
+    enable_dmcrypt: Truthy → open the target with `cryptsetup --type plain`
+      (aes-xts-plain64, random 256-bit key), per go/node:swap-encryption.
+    lssd: Truthy → use the non-rotational non-boot disks, as RAID-0 /dev/md0
+      when more than one (go/gke-swap-lssd); else the first non-boot disk.
+
+  Returns:
+    The script text (dedented, starting with the `#!/bin/bash` line).
+  """
+  dmcrypt_str = 'true' if enable_dmcrypt else 'false'
+  lssd_str = 'true' if lssd else 'false'
+  return textwrap.dedent(f"""\
+    #!/bin/bash
+    # PKB swap_encryption_benchmark — nodepool startup script.
+    # Configures swap once at node boot so all benchmark phases see a
+    # pre-warmed swap device.  Runs as root on the COS host.
+    set -euo pipefail
+    ENABLE_DMCRYPT={dmcrypt_str}
+    LSSD={lssd_str}
+
+    _wait_dev() {{
+      local d=$1 i
+      for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done
+      echo "[pkb-startup] device $d not ready" >&2; return 1
+    }}
+
+    _boot_dev() {{
+      lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1
+    }}
+
+    if $LSSD; then
+      BOOT=$(_boot_dev)
+      # Collect all non-rotational non-boot block devices (local SSDs)
+      DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true)
+      N=$(echo "$DEVS" | grep -c /dev/ || true)
+      if [ "$N" -gt 1 ]; then
+        modprobe raid0 || true
+        # shellcheck disable=SC2086
+        mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force
+        TARGET=/dev/md0
+      elif [ "$N" -eq 1 ]; then
+        TARGET=$(echo "$DEVS" | head -1)
+      else
+        echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2
+        exit 0
+      fi
+    else
+      BOOT=$(_boot_dev)
+      RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true)
+      if [ -z "$RAW" ]; then
+        echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2
+        exit 0
+      fi
+      TARGET=/dev/$RAW
+    fi
+
+    _wait_dev "$TARGET"
+
+    if $ENABLE_DMCRYPT; then
+      modprobe dm-crypt || true
+      dd if=/dev/urandom bs=32 count=1 2>/dev/null | \\
+        cryptsetup open --type plain \\
+          --cipher aes-xts-plain64 --key-size 256 \\
+          --key-file=- "$TARGET" pkb_swap
+      SWAP_DEV=/dev/mapper/pkb_swap
+    else
+      SWAP_DEV=$TARGET
+    fi
+
+    mkswap "$SWAP_DEV"
+    swapon "$SWAP_DEV"
+    echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)"
+  """)
+
+
+# GCP Hyperdisk Balanced: provisioned IOPS <= 256 x throughput (MiB/s).
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256
+
+
+def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
+  """Raise `throughput` to the minimum Hyperdisk accepts for `iops`.
+
+  Hyperdisk Balanced rejects a disk whose provisioned IOPS exceed 256 x its
+  provisioned throughput (MiB/s), e.g. 80000 IOPS with 300 MiB/s.  Returns
+  `throughput` unchanged when it is not below ceil(iops / 256); otherwise
+  logs a warning and returns exactly ceil(iops / 256), with no extra margin.
+  """
+  whole, remainder = divmod(int(iops), _HYPERDISK_MAX_IOPS_PER_MBPS)
+  floor_tput = whole + (1 if remainder else 0)  # ceil(iops / 256)
+  if not throughput < floor_tput:
+    return throughput
+  logging.warning(
+      '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for '
+      '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d',
+      throughput, iops, floor_tput, floor_tput)
+  return floor_tput
+
+
+def _create_benchmark_node_pool(cluster) -> None:
+  """Create the benchmark nodepool on the existing cluster (Step 2 of setup).
+
+  Runs `gcloud container node-pools create` for a single node labelled
+  pkb_nodepool=<_BENCHMARK_NODEPOOL>, which the DaemonSet nodeSelector
+  targets.  No startup-script is injected via node metadata because GKE
+  reserves that key; dm-crypt swap setup happens inside the privileged
+  DaemonSet pod (see _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap).
+
+  Args:
+    cluster: Cluster object providing `name` and `project`, plus `zones` or
+      `region` (checked in that order) to choose --zone / --region.
+
+  Raises:
+    errors.Benchmarks.RunError: gcloud failed and its stderr does not say
+      that the nodepool already exists (that case is reused instead).
+  """
+  machine_type = _BENCHMARK_MACHINE_TYPE.value
+  image_type = _NODE_IMAGE_TYPE.value
+  disk_type = _BOOT_DISK_TYPE.value
+  # LSSD is auto-detected from the machine type name; the flag can only
+  # force it on, never off.
+  is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower()
+
+  # Location: first zone if the cluster has zones, else its region, else none.
+  if getattr(cluster, 'zones', None):
+    location = ['--zone', cluster.zones[0]]
+  elif getattr(cluster, 'region', None):
+    location = ['--region', cluster.region]
+  else:
+    location = []
+
+  # LSSD machines swap on local NVMe, so their boot disk is a fixed 100 GiB
+  # (OS only); otherwise the size comes from the _BOOT_DISK_SIZE_GB flag.
+  # NOTE(review): confirm hyperdisk-balanced IOPS-per-GiB caps in GCP docs.
+  if is_lssd:
+    disk_size_gb = 100
+  else:
+    disk_size_gb = _BOOT_DISK_SIZE_GB.value
+
+  cmd = [
+      'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL,
+      '--cluster', cluster.name,
+      '--project', cluster.project,
+      '--machine-type', machine_type,
+      '--image-type', image_type,
+      '--disk-type', disk_type,
+      '--disk-size', str(disk_size_gb),
+      '--num-nodes', '1',
+      '--node-labels', f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+      '--no-enable-autoupgrade',
+      '--no-enable-autorepair',
+      *location,
+  ]
+
+  # Provisioned IOPS/throughput are passed only for hyperdisk-* boot disks
+  # that also back swap (non-LSSD); an LSSD node's boot disk is OS-only.
+  if disk_type.startswith('hyperdisk') and not is_lssd:
+    boot_iops = _BOOT_DISK_IOPS.value
+    boot_tput = _valid_hyperdisk_throughput(boot_iops,
+                                            _BOOT_DISK_THROUGHPUT.value)
+    cmd.extend(['--boot-disk-provisioned-iops', str(boot_iops),
+                '--boot-disk-provisioned-throughput', str(boot_tput)])
+
+  # LSSD machines expose local NVMe as raw block devices so fio/mdadm can
+  # use them directly (go/gke-swap-lssd uses local-nvme-ssd-block).
+  if is_lssd:
+    cmd.extend(['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}'])
+
+  logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / '
+               'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
+               'add_swap_disk=%s',
+               _BENCHMARK_NODEPOOL, machine_type, image_type,
+               disk_size_gb, _BOOT_DISK_IOPS.value,
+               _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value)
+
+  # 1200 s (20 min) timeout: LSSD nodepools provision more slowly than
+  # PD-only ones because the local NVMe devices must be initialised too.
+  _, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200,
+                                       raise_on_failure=False)
+  if rc == 0:
+    logging.info('[swap_encryption] Benchmark nodepool ready')
+    return
+  # Idempotent prepare: on a re-run (e.g. --run_stage=prepare,run against an
+  # existing cluster) gcloud reports 409 / "Already exists"; reuse the pool.
+  stderr_low = (stderr or '').lower()
+  already_there = any(marker in stderr_low for marker in
+                      ('already exists', 'alreadyexists', 'code=409'))
+  if not already_there:
+    raise errors.Benchmarks.RunError(
+        f'[swap_encryption] Failed to create benchmark nodepool '
+        f'(rc={rc}): {stderr}')
+  logging.info('[swap_encryption] Benchmark nodepool already exists — '
+               'reusing it (idempotent prepare); proceeding to DaemonSet')
+
+
+def _wait_for_benchmark_node(timeout: int = 900) -> None:
+  """Poll until a node labelled pkb_nodepool=<_BENCHMARK_NODEPOOL> is Ready.
+
+  The DaemonSet's nodeSelector needs an eligible node, so this must finish
+  before the DaemonSet is deployed.  kubectl is polled every 15 s and the
+  function returns as soon as any matching node reports Ready=True.
+
+  Args:
+    timeout: Maximum number of seconds to wait.
+
+  Raises:
+    errors.Benchmarks.RunError: No matching node became Ready in time.
+  """
+  # One output line per node: "<name>\t<status of its Ready condition>".
+  node_query = [
+      'get', 'nodes',
+      '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+      '-o', r'jsonpath={range .items[*]}'
+             r'{.metadata.name}{"\t"}'
+             r'{range .status.conditions[?(@.type=="Ready")]}'
+             r'{.status}{"\n"}{end}{end}',
+  ]
+  give_up_at = time.time() + timeout
+  logging.info('[swap_encryption] Waiting for benchmark node '
+               '(pkb_nodepool=benchmark) to be Ready...')
+  while time.time() < give_up_at:
+    out, _, rc = kubectl.RunKubectlCommand(
+        list(node_query), raise_on_failure=False)  # Fresh copy per poll.
+    rows = out.strip().splitlines() if rc == 0 else []
+    for row in rows:
+      cells = row.split('\t')
+      if len(cells) == 2 and cells[1].strip() == 'True':
+        logging.info('[swap_encryption] Benchmark node ready: %s',
+                     cells[0].strip())
+        return
+    logging.info('[swap_encryption] Benchmark node not yet Ready — '
+                 'retrying in 15 s...')
+    time.sleep(15)
+  raise errors.Benchmarks.RunError(
+      '[swap_encryption] Timed out waiting for benchmark node '
+      f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready '
+      f'after {timeout}s')
+
+
+def _attach_swap_disk(cluster) -> None:
+  """Create the dedicated swap disk and attach it to the benchmark node.
+
+  `gcloud compute disks create` + `instances attach-disk` are used after the
+  node is up because a node-pool level additional-disk flag is not available
+  in all gcloud SDK versions.  The Kubernetes node name is passed to gcloud
+  as the GCE instance name (the two are the same on GKE).
+
+  The disk is named pkb-swap-<cluster.name> and attached with device name
+  `pkb-swap`; Cleanup() removes it via _detach_and_delete_swap_disk.
+  Expected to appear in the pod as /dev/sdb (or /dev/nvme1n1 on NVMe nodes),
+  where _setup_gke_hyperdisk_swap detects it via lsblk (not verified here).
+
+  NOTE(review): a region is passed as --zone when the cluster has no zones
+  — confirm regional clusters are not expected here.
+
+  Args:
+    cluster: Cluster object providing `name`, `project`, `zones`/`region`.
+
+  Raises:
+    errors.Benchmarks.RunError: The location is unknown, no benchmark node
+      is found, or the disk cannot be created or attached.
+  """
+  # Location: first zone if present, otherwise the region (possibly None).
+  if getattr(cluster, 'zones', None):
+    zone = cluster.zones[0]
+  else:
+    zone = getattr(cluster, 'region', None)
+  if not zone:
+    raise errors.Benchmarks.RunError(
+        '[swap_encryption] Cannot attach swap disk: cluster zone unknown')
+
+  project = cluster.project
+  disk_name = f'pkb-swap-{cluster.name}'
+  disk_type = _BOOT_DISK_TYPE.value
+  disk_size_gb = _SWAP_DISK_SIZE_GB.value
+  # Flags shared by both gcloud compute invocations below.
+  scope = ['--project', project, '--zone', zone]
+
+  # ── Step 1: get the GCE instance name of the benchmark node ───────────────
+  # The jsonpath selects only the first node carrying the nodepool label.
+  node_out, _, rc = kubectl.RunKubectlCommand(
+      ['get', 'nodes', '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+       '-o', 'jsonpath={.items[0].metadata.name}'],
+      raise_on_failure=False)
+  instance_name = node_out.strip()
+  if rc != 0 or not instance_name:
+    raise errors.Benchmarks.RunError(
+        '[swap_encryption] Cannot find benchmark node for swap disk attach')
+  logging.info('[swap_encryption] Benchmark node instance: %s', instance_name)
+
+  # ── Step 2: create the disk ───────────────────────────────────────────────
+  logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)',
+               disk_name, disk_size_gb, disk_type)
+  create_cmd = ['gcloud', 'compute', 'disks', 'create', disk_name, *scope,
+                '--type', disk_type, '--size', f'{disk_size_gb}GB', '--quiet']
+  if disk_type.startswith('hyperdisk'):
+    # Hyperdisk takes explicit IOPS/throughput; the boot-disk flags are reused.
+    swap_iops = _BOOT_DISK_IOPS.value
+    swap_tput = _valid_hyperdisk_throughput(swap_iops,
+                                            _BOOT_DISK_THROUGHPUT.value)
+    create_cmd.extend(['--provisioned-iops', str(swap_iops),
+                       '--provisioned-throughput', str(swap_tput)])
+  _, create_err, create_rc = vm_util.IssueCommand(
+      create_cmd, timeout=120, raise_on_failure=False)
+  if create_rc != 0:
+    raise errors.Benchmarks.RunError(
+        f'[swap_encryption] Failed to create swap disk {disk_name}: '
+        f'{create_err}')
+
+  # ── Step 3: attach the disk to the node VM ────────────────────────────────
+  logging.info('[swap_encryption] Attaching swap disk %s to %s',
+               disk_name, instance_name)
+  attach_cmd = ['gcloud', 'compute', 'instances', 'attach-disk',
+                instance_name, *scope,
+                '--disk', disk_name, '--device-name', 'pkb-swap', '--quiet']
+  _, attach_err, attach_rc = vm_util.IssueCommand(
+      attach_cmd, timeout=120, raise_on_failure=False)
+  if attach_rc != 0:
+    raise errors.Benchmarks.RunError(
+        f'[swap_encryption] Failed to attach swap disk to {instance_name}: '
+        f'{attach_err}')
+  logging.info('[swap_encryption] Swap disk attached: %s → %s',
+               disk_name, instance_name)
+
+
+def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
+  """Detach (if attached) and delete a GCE disk; True once the disk is gone.
+
+  Up to 4 attempts.  The attached instance is read from the disk's own `users`
+  field because kubectl is often unavailable during teardown.  Only a "not
+  found" describe error means already gone (returns True); any other describe
+  failure falls through to the delete attempt.  Returns False if all fail.
+  """
+  where = ['--project', project, '--zone', zone]
+  for attempt in range(1, 5):
+    users, uerr, rc = vm_util.IssueCommand(
+        ['gcloud', 'compute', 'disks', 'describe', disk_name, *where,
+         '--format=value(users)'], timeout=60, raise_on_failure=False)
+    if rc != 0 and 'not found' in (uerr or '').lower():
+      logging.info('[swap_encryption] Swap disk %s not present — nothing to '
+                   'delete', disk_name)
+      return True  # already gone
+    inst = (users or '').strip().split('/')[-1] if rc == 0 else ''
+    if inst:
+      logging.info('[swap_encryption] Detaching swap disk %s from %s',
+                   disk_name, inst)
+      vm_util.IssueCommand(
+          ['gcloud', 'compute', 'instances', 'detach-disk', inst, *where,
+           '--disk', disk_name, '--quiet'], timeout=120, raise_on_failure=False)
+    _, derr, drc = vm_util.IssueCommand(
+        ['gcloud', 'compute', 'disks', 'delete', disk_name, *where, '--quiet'],
+        timeout=180, raise_on_failure=False)
+    if drc == 0:
+      logging.info('[swap_encryption] Swap disk deleted: %s', disk_name)
+      return True
+    retry = attempt < 4  # No wait (or "retrying" claim) after the last attempt.
+    logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed '
+                    '(%s)%s', attempt, derr.strip()[:160],
+                    '; retrying in 10s' if retry else '')
+    if retry:
+      time.sleep(10)
+  logging.error('[swap_encryption] Could NOT delete swap disk %s after retries '
+                '— delete it manually: gcloud compute disks delete %s '
+                '--zone %s --quiet', disk_name, disk_name, zone)
+  return False
+
+
+def _detach_and_delete_swap_disk(cluster) -> None:
+  """Remove the pkb-swap-<cluster.name> disk made by _attach_swap_disk.
+
+  Silently does nothing when the cluster has no zone/region or no project.
+  """
+  zones = getattr(cluster, 'zones', None)
+  # Same location rule as _attach_swap_disk: first zone, else the region.
+  zone = zones[0] if zones else getattr(cluster, 'region', None)
+  if zone and getattr(cluster, 'project', None):
+    _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone)
+
+
+def _delete_default_node_pool(cluster) -> None:
+  """Best-effort delete of the default nodepool (_DEFAULT_NODEPOOL).
+
+  The default pool only exists because a GKE cluster needs at least one
+  nodepool at creation time; deleting it once the benchmark pool is ready
+  stops its cost.  A gcloud failure is logged as a warning, never raised.
+
+  Args:
+    cluster: Cluster object providing `name`, `project`, `zones`/`region`.
+  """
+  if getattr(cluster, 'zones', None):
+    location = ['--zone', cluster.zones[0]]
+  elif getattr(cluster, 'region', None):
+    location = ['--region', cluster.region]
+  else:
+    location = []  # Neither known: no location flag is passed.
+  delete_cmd = [
+      'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL,
+      '--cluster', cluster.name, '--project', cluster.project, '--quiet',
+      *location]
+  logging.info(
+      '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL)
+  _, stderr, rc = vm_util.IssueCommand(
+      delete_cmd, timeout=300, raise_on_failure=False)
+  if rc == 0:
+    logging.info('[swap_encryption] Default nodepool deleted')
+    return
+  logging.warning('[swap_encryption] Could not delete default nodepool '
+                  '(rc=%d): %s', rc, stderr)
+
+
+def _is_pod_gone(pod: str) -> bool:
+  """Report whether kubectl says the named pod no longer exists.
+
+  Tells an OOM-killed container (rc=137, pod still present) from an evicted
+  pod (pod gone).  Any kubectl exception is swallowed and means "not gone".
+  """
+  lookup = ['get', 'pod', pod, '-n', _DS_NAMESPACE,
+            '-o', 'jsonpath={.metadata.name}']
+  try:
+    _, stderr, rc = kubectl.RunKubectlCommand(
+        lookup, raise_on_failure=False, timeout=15)
+    pod_missing = rc != 0 and 'not found' in (stderr or '').lower()
+  except Exception:  # pylint: disable=broad-except
+    return False
+  return pod_missing
+
+
+def _pod_exec(
+    pod: str,
+    cmd: str,
+    ignore_failure: bool = False,
+    timeout: int = 300,
+    _retries: int = 2,
+) -> tuple[str, str]:
+  """Run a shell command inside the benchmark pod via kubectl exec.
+
+  The command goes to the pod tracked in the module-level _active_pod list
+  when that list is non-empty, so a caller holding a stale name still reaches
+  the replacement pod.  OOM kills and pod losses are appended to _oom_events
+  / _pod_lost for the run-level degradation gate.
+
+  Args:
+    pod: Pod name returned by _wait_for_benchmark_pod; used only when
+      _active_pod is empty.
+    cmd: Shell command string passed to bash -c.
+    ignore_failure: When True, a non-zero exit code does not raise; the
+      (stdout, stderr) of the last attempt is still returned.
+    timeout: Seconds allowed for each kubectl exec attempt.  Pass a larger
+      value for long-running jobs (fio, stress-ng, kernel build).
+    _retries: Extra attempts allowed after a transient connection reset or a
+      container-gone error (rc=137 is never retried).  Pass 0 for commands
+      that must not be re-run.
+
+  Returns:
+    Tuple of (stdout, stderr) strings from the last attempt.
+
+  Raises:
+    errors.VmUtil.IssueCommandError: The last attempt's rc is non-zero and
+      ignore_failure is False.
+  """
+  _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close')
+  # Substrings that mean the pod/container is gone.  They MUST be lower-case:
+  # they are matched against err.lower().  'not found' covers kubectl's
+  # NotFound error after an eviction, 'deleted state' a dying container.
+  # NOTE(review): 'not found' also matches a command's own stderr (e.g. bash
+  # "command not found") — confirm that false positive is acceptable.
+  _CONTAINER_GONE_ERRORS = ('container not found', 'procready not received',
+                             'unable to upgrade connection', 'not found',
+                             'deleted state')
+  # Prefer the tracked active pod; an earlier recovery may have replaced `pod`.
+  active = _active_pod[0] if _active_pod else pod
+
+  for attempt in range(_retries + 1):
+    out, err, rc = kubectl.RunKubectlCommand(
+        ['exec', active, '-n', _DS_NAMESPACE,
+         '--', 'bash', '-c', cmd],
+        raise_on_failure=False,
+        raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
+        timeout=timeout,
+    )
+    # Transient websocket resets are retried without touching pod state.
+    is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS)
+    if is_transient and attempt < _retries:
+      logging.warning(
+          '[swap_encryption] kubectl exec connection reset (attempt %d/%d); '
+          'retrying in 10 s', attempt + 1, _retries + 1)
+      time.sleep(10)
+      continue
+    # rc=137 (SIGKILL) is treated as an OOM kill: either the pod was evicted
+    # (the DaemonSet recreates it under a new name) or the container restarts
+    # in place (same name, /tmp lost, tools re-installed).  Both paths wait in
+    # _recover_pod and never re-run the command that triggered the OOM.
+    if rc == 137:
+      # Record it for the run-level gate: an in-place restart keeps the pod
+      # name, so neither the "replaced" nor the "NotFound" check would fire.
+      if active not in _oom_events:
+        _oom_events.append(active)
+      # Sleep before checking pod state: a just-evicted pod can still read as
+      # Running for a few seconds, which would make _recover_pod return the
+      # old name and every later command fail with NotFound.
+      logging.warning(
+          '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update '
+          'pod state before recovery check')
+      time.sleep(15)
+      pod_gone = _is_pod_gone(active)
+      if pod_gone:
+        logging.warning(
+            '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — '
+            'recovering pod name for subsequent commands (not retrying this cmd)')
+      else:
+        logging.warning(
+            '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — '
+            'waiting for container restart and tool re-install before continuing')
+      new_pod = _recover_pod(active)
+      if new_pod != active:
+        logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod)
+        if _active_pod:
+          _active_pod[0] = new_pod
+        active = new_pod
+      break  # Do NOT retry — the OOM cmd itself is not re-run on the new pod.
+
+    is_container_gone = (rc != 0 and
+                         any(e in err.lower() for e in _CONTAINER_GONE_ERRORS))
+    if is_container_gone:
+      # Record the loss regardless of retry budget or ignore_failure, so the
+      # degradation gate still sees it for _retries=0 / best-effort callers
+      # that never reach the rename path below.
+      if active and active not in _pod_lost:
+        _pod_lost.append(active)
+        logging.error(
+            '[swap_encryption] Benchmark pod %s is gone (%s) — recording run '
+            'as degraded', active, (err or '').strip()[:160])
+      if attempt < _retries:
+        logging.warning(
+            '[swap_encryption] Container gone/restarting (attempt %d/%d) — '
+            'waiting for pod to recover...', attempt + 1, _retries + 1)
+        new_pod = _recover_pod(active)
+        if new_pod != active:
+          logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod)
+          if _active_pod:
+            _active_pod[0] = new_pod
+          active = new_pod
+        continue
+    break
+
+  # Only the last attempt's rc decides whether to raise.
+  if rc != 0 and not ignore_failure:
+    raise errors.VmUtil.IssueCommandError(
+        f'[swap_encryption] _pod_exec failed (rc={rc}): {err}')
+  return out, err
+
+
+def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
+  """Wait for the benchmark pod to recover after an OOM kill or eviction.
+
+  Covers an in-place container restart (the pod keeps its name) and pod
+  eviction/deletion (the replacement pod has a DIFFERENT name and is looked
+  up via the DaemonSet label selector).  One deadline bounds both phases.
+
+  Args:
+    pod: Name of the pod that was being used when the failure was seen.
+    timeout_sec: Total seconds allowed for Running plus the ready sentinel.
+
+  Returns:
+    The (possibly new) pod name once it is Running and /tmp/pkb_ready exists.
+
+  Raises:
+    errors.VmUtil.IssueCommandError: If either phase exceeds the deadline.
+  """
+  deadline = time.time() + timeout_sec
+  logging.info('[swap_encryption] Waiting for pod %s to recover '
+               '(up to %ds)...', pod, timeout_sec)
+
+  # Phase 1: wait for a Running pod — either the named one (container
+  # restart) or a replacement pod found via label selector (eviction).
+  # IMPORTANT: query status.phase AND metadata.deletionTimestamp together.
+  # An evicted pod gets deletionTimestamp set ("Terminating") while its phase
+  # may still read "Running" for several seconds; trusting the phase alone
+  # would hand back the dying pod and every later command would then fail
+  # with "Error from server (NotFound)".
+  recovered_pod = pod
+  while time.time() < deadline:
+    # IMPORTANT: keep stderr — kubectl writes "not found" there, and stdout
+    # is empty when the pod is gone.  Discarding it (using _) would mean the
+    # 'not found' check below never fires and we spin until the deadline.
+    status_out, status_err, status_rc = kubectl.RunKubectlCommand(
+        ['get', 'pod', pod, '-n', _DS_NAMESPACE,
+         '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'],
+        raise_on_failure=False, timeout=30,
+    )
+    # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating)
+    fields = status_out.strip().split('|')
+    phase = fields[0].strip() if fields else ''
+    is_terminating = len(fields) > 1 and bool(fields[1].strip())
+
+    # Pod is genuinely Running and NOT being deleted — recovery complete.
+    if status_rc == 0 and phase == 'Running' and not is_terminating:
+      break
+
+    # Pod is gone or terminating — look for a replacement pod by label.
+    pod_gone_or_terminating = (
+        (status_rc != 0 and 'not found' in (status_out + status_err).lower())
+        or is_terminating
+    )
+    if pod_gone_or_terminating:
+      label_out, _, label_rc = kubectl.RunKubectlCommand(
+          ['get', 'pods', '-n', _DS_NAMESPACE,
+           '-l', f'app={_DS_LABEL}',
+           '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}'
+                 '{.metadata.name}{"\\n"}{end}'],
+          raise_on_failure=False, timeout=30,
+      )
+      new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip()
+                  and p.strip() != pod]  # exclude the dying pod
+      if label_rc == 0 and new_pods:
+        recovered_pod = new_pods[0]
+        logging.info('[swap_encryption] Original pod %s gone/terminating; '
+                     'found replacement %s', pod, recovered_pod)
+        break
+
+    time.sleep(10)
+  else:
+    raise errors.VmUtil.IssueCommandError(
+        f'[swap_encryption] No Running pod found (original: {pod}) '
+        f'within {timeout_sec}s after OOM kill / eviction')
+
+  # Phase 2: wait for init script to finish (sentinel written last).
+  while time.time() < deadline:
+    ready_out, _, ready_rc = kubectl.RunKubectlCommand(
+        ['exec', recovered_pod, '-n', _DS_NAMESPACE,
+         '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'],
+        raise_on_failure=False, timeout=30,
+    )
+    if ready_rc == 0 and 'READY' in ready_out:
+      logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod)
+      return recovered_pod
+    time.sleep(15)
+
+  raise errors.VmUtil.IssueCommandError(
+      f'[swap_encryption] Pod {recovered_pod} did not become ready '
+      f'within {timeout_sec}s after OOM kill / eviction')
+
+
+_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
+    # GCP on-demand USD/hr (us-central1 unless noted); key = machine type.
+    'c4-standard-8-lssd': 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
+    'c4-standard-8':      0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
+    'n4-highmem-32':      3.0256,  # 32 vCPU, 256 GB RAM
+    'n2-highmem-32':      2.5216,  # 32 vCPU, 256 GB RAM
+    'n2-standard-32':     1.5264,  # 32 vCPU, 128 GB RAM
+    'z3-highmem-8':       2.7248,  # 8 vCPU + 4× LSSD
+    # AWS USD/hr; key = instance type.  NOTE(review): region not stated.
+    'i4i.4xlarge':        1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
+    'i4i.2xlarge':        0.7480,
+    'm6id.4xlarge':       0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
+    'm6i.4xlarge':        0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
+    'r6i.4xlarge':        1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
+}
+
+
+def _collect_cost_sample(
+    pod: str, elapsed_sec: float, base_meta: dict
+) -> list[sample.Sample]:
+  """Emit a cost_estimate_usd sample for the benchmark run (gap 7).
+
+  Instance type precedence: the _INSTANCE_SIZE_LABEL flag override, then
+  cloud metadata read inside the pod (GCP, then AWS), then the
+  --swap_encryption_benchmark_machine_type flag.  Price is looked up from
+  _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and a
+  warning is logged.
+
+  Args:
+    pod: Benchmark pod name.
+    elapsed_sec: Wall-clock seconds the benchmark phases took.
+    base_meta: Shared metadata dict.
+
+  Returns:
+    A list of zero or one sample.Sample.
+  """
+  # The explicit override always wins, so resolve it first: no exec into a
+  # pod that may already be gone when the answer is known up front.
+  instance_type = _INSTANCE_SIZE_LABEL.value or ''
+  if not instance_type:
+    # GCP: machine type is the last segment of the metadata URL value
+    gcp_type_out, _ = _pod_exec(
+        pod,
+        'curl -s -m 3 --fail '
+        'http://metadata.google.internal/computeMetadata/v1/instance/machine-type '
+        '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if gcp_type_out.strip():
+      instance_type = gcp_type_out.strip().split('/')[-1]
+
+  if not instance_type:
+    # AWS: request an IMDSv2 session token first — nodes that require IMDSv2
+    # reject token-less reads.  With no token, fall back to a plain request.
+    aws_type_out, _ = _pod_exec(
+        pod,
+        'TOKEN=$(curl -s -m 3 --fail -X PUT '
+        'http://169.254.169.254/latest/api/token '
+        '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null || true); '
+        'curl -s -m 3 --fail ${TOKEN:+-H "X-aws-ec2-metadata-token: $TOKEN"} '
+        'http://169.254.169.254/latest/meta-data/instance-type '
+        '2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    instance_type = aws_type_out.strip()
+
+  # Last resort: the benchmark machine type flag, so cost tracking still works
+  # when the pod was evicted before cost collection (metadata came back empty).
+  if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
+    instance_type = _BENCHMARK_MACHINE_TYPE.value
+    logging.info(
+        '[swap_encryption] Instance type from metadata unavailable; '
+        'using --swap_encryption_benchmark_machine_type=%s for cost tracking',
+        instance_type,
+    )
+
+  price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
+  if price is None:
+    logging.warning(
+        '[swap_encryption] Unknown instance type "%s" – skipping cost sample. '
+        'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.',
+        instance_type,
+    )
+    return []
+
+  cost = elapsed_sec / 3600.0 * price
+  meta = dict(
+      base_meta,
+      instance_type=instance_type,
+      price_usd_per_hr=price,
+      benchmark_elapsed_sec=round(elapsed_sec, 1),
+  )
+  return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)]
+
+
+def _detect_swap_device(pod: str) -> str:
+  """Resolve the swap device path to benchmark on the cluster node.
+
+  An explicit --swap_encryption_device value is returned untouched;
+  otherwise the device is probed from inside the benchmark pod.
+  """
+  explicit_dev = _SWAP_DEVICE.value
+  if explicit_dev:
+    return explicit_dev
+
+  # Trust /proc/swaps (what is ACTUALLY active) over a bare
+  # `test -e /dev/mapper/swap_encrypted`: a stale dm-crypt mapping left on a
+  # reused node can still exist as a /dev node while being non-functional.
+  # The mapper path is only the fallback for an empty /proc/swaps.
+  probe_script = textwrap.dedent("""
+        ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
+        if [ -n "$ACTIVE" ]
+        then
+          echo "$ACTIVE"
+        elif test -e /dev/mapper/swap_encrypted
+        then
+          echo /dev/mapper/swap_encrypted
+        fi
+      """)
+  probe_out, _ = _pod_exec(pod, probe_script, ignore_failure=True)
+  reported = probe_out.strip()
+  if not reported:
+    raise ValueError(
+        'No active swap device found in the benchmark pod. '
+        'Use --swap_encryption_device to specify one.'
+    )
+  # strip() above guarantees the last line is non-empty.
+  return reported.splitlines()[-1].strip()
+
+
+def _build_metadata(pod: str, swap_dev: str) -> dict:
+  """Collect node environment, encryption type, and config into a dict.
+
+  Direct pod probes use ignore_failure=True; unparsable sizes become 0.
+  """
+  kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True)
+  mem_out, _ = _pod_exec(
+      pod, "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True
+  )
+  swap_out, _ = _pod_exec(
+      pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
+      ignore_failure=True,
+  )
+
+  try:
+    mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
+  except ValueError:
+    mem_gb = 0
+  try:
+    swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
+  except ValueError:
+    swap_gb = 0
+
+  # Encryption type — key off dm-crypt presence + the swap target, NOT the
+  # device path: a GKE plain Local SSD is /dev/nvme0n1 yet not Nitro-encrypted.
+  enc = 'unknown'
+  if '/dev/mapper/' in swap_dev:
+    table_out, _ = _pod_exec(
+        pod,
+        f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
+  elif _SWAP_TYPE.value in ('instance_store', 'io2'):
+    enc = 'nitro_hardware_offload'   # AWS: encrypted by the Nitro card
+  elif not _ENABLE_DMCRYPT.value:
+    enc = 'none'                      # GKE plain swap (encryption OFF)
+
+  cloud = _detect_cloud(pod)
+  # Gap 6: instance size label — flag value, else cloud metadata (GCP, AWS).
+  instance_label = _INSTANCE_SIZE_LABEL.value
+  if not instance_label:
+    gcp_type_out, _ = _pod_exec(
+        pod,
+        'curl -s -m 3 --fail '
+        'http://metadata.google.internal/computeMetadata/v1/instance/machine-type '
+        '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if gcp_type_out.strip():
+      instance_label = gcp_type_out.strip().split('/')[-1]
+  if not instance_label:
+    # AWS: send an IMDSv2 session token; IMDSv2-only nodes refuse bare reads.
+    aws_type_out, _ = _pod_exec(
+        pod,
+        'TOKEN=$(curl -s -m 3 --fail -X PUT '
+        'http://169.254.169.254/latest/api/token '
+        '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null || true); '
+        'curl -s -m 3 --fail ${TOKEN:+-H "X-aws-ec2-metadata-token: $TOKEN"} '
+        'http://169.254.169.254/latest/meta-data/instance-type '
+        '2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    instance_label = aws_type_out.strip()
+
+  return {
+      'benchmark': BENCHMARK_NAME,
+      'execution_mode': 'kubernetes_privileged_pod',
+      'cloud': cloud,
+      'instance_size': instance_label,
+      'kernel_version': kernel_out.strip(),
+      'host_memory_gb': mem_gb,
+      'swap_device': swap_dev,
+      'swap_size_gb': swap_gb,
+      'swap_encryption': enc,
+      # Test-matrix columns: storage target, encryption on/off, image, IOPS
+      'storage_target': _SWAP_TYPE.value,
+      'boot_disk_type': _BOOT_DISK_TYPE.value,
+      'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
+      'node_image_type': _NODE_IMAGE_TYPE.value,
+      'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
+      'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
+      # Other config
+      'zswap_enabled': _ENABLE_ZSWAP.value,
+      'min_free_kbytes': _MIN_FREE_KBYTES.value,
+      'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
+      # Requested value only; the effective footprint may be autoscaled (see
+      # _autoscale_vm_bytes) and Phase 2a records it as 'stress_vm_bytes'.
+      'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
+      'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
+      'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
+      'nodepool': _NODEPOOL.value,
+  }

From 0f75d80697af1a6f7943f0e61bd12910f0568c56 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Fri, 19 Jun 2026 13:18:44 +0530
Subject: [PATCH 02/17] Reformat to style standard; add flags and type hints

---
 .../swap_encryption_benchmark.py              | 1007 +++++++++++------
 1 file changed, 648 insertions(+), 359 deletions(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 5bdc933bba..026831efe0 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -63,14 +63,13 @@
     Bulk-index + search query under swap pressure (esrally or curl).
 """
 
-import json
 import logging
-import re
 import textwrap
 import time
 from typing import Any
 
 from absl import flags
+from perfkitbenchmarker import benchmark_spec as bm_spec_lib
 from perfkitbenchmarker import configs
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import sample
@@ -79,15 +78,12 @@
 
 FLAGS = flags.FLAGS
 
+_BenchmarkSpec = bm_spec_lib.BenchmarkSpec
+
 # ---------------------------------------------------------------------------
 # Benchmark identity
 # ---------------------------------------------------------------------------
 
-
-
-FLAGS = flags.FLAGS
-
-
 BENCHMARK_NAME = 'swap_encryption'
 
 
@@ -277,15 +273,91 @@
     'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
 )
 
+_ENABLE_DMCRYPT = flags.DEFINE_boolean(
+    'swap_encryption_enable_dmcrypt',
+    True,
+    'When True (default), wrap the swap device in dm-crypt plain mode '
+    "(aes-xts-plain64, ephemeral random key) matching GKE's "
+    'go/node:swap-encryption implementation.  Set False to measure plain '
+    '(unencrypted) swap overhead as a baseline.',
+)
 
-_DS_NAME = 'pkb-swap-benchmark'
+_SWAP_DEVICE = flags.DEFINE_string(
+    'swap_encryption_device',
+    '',
+    'Explicit block device path to use as the swap device, e.g. '
+    '/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), '
+    'the device is auto-detected from /proc/swaps inside the benchmark pod.',
+)
 
+_SWAP_TYPE = flags.DEFINE_string(
+    'swap_encryption_swap_type',
+    'hyperdisk',
+    'Storage target for the swap device.  One of: hyperdisk (default), '
+    'lssd, instance_store, io2.',
+)
 
-_DS_NAMESPACE = 'default'
+_KERNEL_VERSION = flags.DEFINE_string(
+    'swap_encryption_kernel_version',
+    '',
+    'Kernel version string to embed in the DaemonSet pod spec as a label.  '
+    'When empty (default) the version is not pinned.',
+)
+
+_ENABLE_ZSWAP = flags.DEFINE_boolean(
+    'swap_encryption_enable_zswap',
+    False,
+    'When True, enable zswap compressed swap cache on the benchmark node.',
+)
 
+_MIN_FREE_KBYTES = flags.DEFINE_integer(
+    'swap_encryption_min_free_kbytes',
+    0,
+    'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. '
+    '0 (default) leaves the kernel default unchanged.',
+)
+
+_FIO_RUNTIME_SEC = flags.DEFINE_integer(
+    'swap_encryption_fio_runtime_sec',
+    60,
+    'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.',
+)
 
+_STRESS_VM_BYTES = flags.DEFINE_string(
+    'swap_encryption_stress_vm_bytes',
+    '28G',
+    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  '
+    'Should exceed available node RAM to force sustained paging.',
+)
+
+_STRESS_VM_BYTES_LIST = flags.DEFINE_list(
+    'swap_encryption_stress_vm_bytes_list',
+    [],
+    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
+    'e.g. "14G,28G,56G".  Overrides --swap_encryption_stress_vm_bytes.',
+)
+
+_STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
+    'swap_encryption_stress_timeout_sec',
+    300,
+    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
+)
+
+_DS_NAME = 'pkb-swap-benchmark'
+_DS_NAMESPACE = 'default'
 _DS_LABEL = 'pkb-swap-benchmark'
 
+# Transient kubectl errors that are safe to retry.
+_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close')
+
+# Errors indicating the container/pod is gone and needs recovery.
+_CONTAINER_GONE_KUBECTL_ERRORS = (
+    'container not found',
+    'procReady not received',
+    'unable to upgrade connection',
+    'not found',
+    'deleted state',
+)
 
 _active_pod: list[str] = []  # single-element list so closures can mutate it
 
@@ -298,10 +370,7 @@
 
 _oom_events: list[str] = []
 
-
 _BENCHMARK_NODEPOOL = 'benchmark'
-
-
 _DEFAULT_NODEPOOL = 'default-pool'
 
 
@@ -325,10 +394,11 @@ def _daemonset_yaml(image: str) -> str:
 
 
 def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
+  """Load and return benchmark config spec."""
   return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
 
 
-def Prepare(spec) -> None:
+def Prepare(spec: _BenchmarkSpec) -> None:
   """Two-step nodepool setup then DaemonSet deployment.
 
   Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap
@@ -345,8 +415,27 @@ def Prepare(spec) -> None:
   cluster = spec.container_cluster
 
   # ── Step 2a: add real benchmark nodepool ────────────────────────────────
-  if getattr(cluster, 'project', None):
-    # GCP path: true two-step nodepool setup
+  if not getattr(cluster, 'project', None):
+    # Guard: AWS / EKS path — nodepool management is external.
+    # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet
+    # nodeSelector (pkb_nodepool=benchmark) before deploying the pod.
+    logging.info(
+        '[swap_encryption] EKS cluster — labelling existing nodes with '
+        'pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
+        _BENCHMARK_NODEPOOL,
+    )
+    kubectl.RunKubectlCommand([
+        'label',
+        'nodes',
+        '--all',
+        '--overwrite',
+        f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+    ])
+    # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
+    # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
+    _ensure_io2_volume()
+  else:
+    # GCP path: true two-step nodepool setup.
     logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
     _create_benchmark_node_pool(cluster)
 
@@ -355,26 +444,9 @@ def Prepare(spec) -> None:
     _wait_for_benchmark_node()
 
     # ── Step 2b2: attach dedicated swap disk (if requested) ───────────────
-    # --additional-node-disk is not available in all gcloud versions, so we
-    # create + attach the disk after the node is up using gcloud compute.
     if _ADD_SWAP_DISK.value:
       logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk')
       _attach_swap_disk(cluster)
-  else:
-    # AWS / EKS: nodepool management is external.  PKB's cluster creation
-    # labels nodes pkb_nodepool=default, so re-label all existing nodes here
-    # to match the DaemonSet nodeSelector (pkb_nodepool=benchmark).
-    logging.info(
-        '[swap_encryption] EKS cluster — labelling existing nodes with '
-        'pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
-        _BENCHMARK_NODEPOOL)
-    kubectl.RunKubectlCommand([
-        'label', 'nodes', '--all', '--overwrite',
-        f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-    ])
-    # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
-    # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
-    _ensure_io2_volume()
 
   # ── Step 2c: deploy DaemonSet ────────────────────────────────────────────
   # Deploy and wait for the pod BEFORE deleting the default nodepool.
@@ -395,8 +467,10 @@ def Prepare(spec) -> None:
     # the nodepool deletion (cluster control plane briefly interrupts pod
     # lifecycle).  Re-resolve the pod name to avoid stale-reference errors on
     # all subsequent _pod_exec calls.
-    logging.info('[swap_encryption] Step 2d: re-resolving benchmark pod '
-                 'after nodepool deletion')
+    logging.info(
+        '[swap_encryption] Step 2d: re-resolving benchmark pod '
+        'after nodepool deletion'
+    )
     pod = _wait_for_benchmark_pod()
     logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod)
 
@@ -411,7 +485,7 @@ def _phase_selected(token: str) -> bool:
   return (not selected) or ('all' in selected) or (token.lower() in selected)
 
 
-def Run(spec) -> list[sample.Sample]:
+def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
   """Execute all benchmark phases with gate logic.
 
   Execution is structured in three gated tiers matching the execution plan:
@@ -432,6 +506,10 @@ def Run(spec) -> list[sample.Sample]:
   application-level swap performance when the raw device is inaccessible.
   """
   pod = _wait_for_benchmark_pod()
+  if pod is None:
+    raise errors.Benchmarks.RunError(
+        '[swap_encryption] Benchmark pod never became ready.'
+    )
   # Initialise the module-level active-pod tracker so _pod_exec and
   # _recover_pod can transparently redirect to a replacement pod if the
   # original is evicted during the run.
@@ -461,37 +539,43 @@ def Run(spec) -> list[sample.Sample]:
   # Detect those conditions here and surface them explicitly.
   if _active_pod and _active_pod[0] != original_pod:
     _degraded_reasons.append(
-        f'benchmark pod was replaced during the run '
+        'benchmark pod was replaced during the run '
         f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap '
-        f'pressure; phases executed after the eviction ran against a '
-        f'freshly-initialised pod (empty /tmp, swap re-setup) and may be '
-        f'invalid')
+        'pressure; phases executed after the eviction ran against a '
+        'freshly-initialised pod (empty /tmp, swap re-setup) and may be '
+        'invalid'
+    )
   if _pod_lost:
     _degraded_reasons.append(
-        f'benchmark pod(s) went NotFound during the run ({", ".join(_pod_lost)}) '
-        f'— the pod died (node memory-pressure eviction or container exit) and '
-        f'any phase running at or after that point (e.g. kernel-build baseline, '
-        f'OpenSearch) produced invalid data')
+        f'pod(s) NotFound during run: {", ".join(_pod_lost)} — pod died'
+        ' (eviction/exit); phases at/after that point (e.g.'
+        ' kernel-build, OpenSearch) produced invalid data'
+    )
   if _oom_events:
     _degraded_reasons.append(
-        f'OOM kill(s) (rc=137) occurred during the run on pod(s) '
+        'OOM kill(s) (rc=137) occurred during the run on pod(s) '
         f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by '
-        f'the OOM killer (the container may have restarted in place), so the '
-        f'affected phase(s) produced no or partial data')
+        'the OOM killer (the container may have restarted in place), so the '
+        'affected phase(s) produced no or partial data'
+    )
 
   degraded = bool(_degraded_reasons)
-  results.append(sample.Sample(
-      'swap_encryption_run_status',
-      0.0 if degraded else 1.0,
-      'status',
-      dict(base_meta,
-           degraded=degraded,
-           degraded_reasons='; '.join(_degraded_reasons) or 'none',
-           num_samples=len(results) + 1)))
+  results.append(
+      sample.Sample(
+          'swap_encryption_run_status',
+          0.0 if degraded else 1.0,
+          'status',
+          dict(
+              base_meta,
+              degraded=degraded,
+              degraded_reasons='; '.join(_degraded_reasons) or 'none',
+              num_samples=len(results) + 1,
+          ),
+      )
+  )
 
   if degraded:
-    msg = ('[swap_encryption] RUN DEGRADED — '
-           + '; '.join(_degraded_reasons))
+    msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(_degraded_reasons)
     logging.error(msg)
     if _FAIL_ON_DEGRADED.value:
       # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.  The
@@ -499,23 +583,31 @@ def Run(spec) -> list[sample.Sample]:
       # is recorded, so no data is lost.
       raise errors.Benchmarks.RunError(msg)
   else:
-    logging.info('[swap_encryption] Run completed cleanly (%d samples)',
-                 len(results))
+    logging.info(
+        '[swap_encryption] Run completed cleanly (%d samples)', len(results)
+    )
 
   return results
 
 
-def Cleanup(spec) -> None:
+def Cleanup(spec: _BenchmarkSpec) -> None:
   """Remove the DaemonSet and tear down any swap configuration."""
   pod = _wait_for_benchmark_pod(timeout=30)
   if pod:
     _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True)
-    _pod_exec(pod, textwrap.dedent("""
-      swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
-      dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
-    """), ignore_failure=True)
+    _pod_exec(
+        pod,
+        textwrap.dedent("""
+          swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+          dmsetup remove --noudevrules --noudevsync \
+            swap_encrypted 2>/dev/null || true
+        """),
+        ignore_failure=True,
+    )
     # Clean up loop device backing files (single-disk fallback path).
-    _pod_exec(pod, textwrap.dedent("""
+    _pod_exec(
+        pod,
+        textwrap.dedent("""
       for backing in /var/pkb_swap_backing /run/pkb_swap_backing \
                      /mnt/stateful_partition/pkb_swap_backing
       do
@@ -526,9 +618,14 @@ def Cleanup(spec) -> None:
           done
         rm -f "$backing"
       done
-    """), ignore_failure=True)
-    _pod_exec(pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true",
-             ignore_failure=True)
+    """),
+        ignore_failure=True,
+    )
+    _pod_exec(
+        pod,
+        "pkill -9 'stress-ng|fio' 2>/dev/null || true",
+        ignore_failure=True,
+    )
 
   _delete_daemonset()
 
@@ -561,18 +658,25 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
   """
   deadline = time.time() + timeout
   last_phase = ''
-  ready_pod  = None   # pod name once phase == Running
+  ready_pod = None  # pod name once phase == Running
 
   while time.time() < deadline:
     # ── Step 1: wait for Running phase ──────────────────────────────────────
     if ready_pod is None:
-      out, _, rc = kubectl.RunKubectlCommand([
-          'get', 'pods',
-          '-l', f'app={_DS_LABEL}',
-          '-n', _DS_NAMESPACE,
-          '-o',
-          r'jsonpath={range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}',
-      ], raise_on_failure=False)
+      out, _, rc = kubectl.RunKubectlCommand(
+          [
+              'get',
+              'pods',
+              '-l',
+              f'app={_DS_LABEL}',
+              '-n',
+              _DS_NAMESPACE,
+              '-o',
+              r'jsonpath={range .items[*]}{.metadata.name}'
+              r'{"\t"}{.status.phase}{"\n"}{end}',
+          ],
+          raise_on_failure=False,
+      )
 
       if rc == 0 and out.strip():
         for line in out.strip().splitlines():
@@ -580,12 +684,17 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
           if len(parts) == 2:
             pod_name, phase = parts[0].strip(), parts[1].strip()
             if phase == 'Running':
-              logging.info('[swap_encryption] Pod %s is Running – '
-                           'waiting for tool install to finish...', pod_name)
+              logging.info(
+                  '[swap_encryption] Pod %s is Running – '
+                  'waiting for tool install to finish...',
+                  pod_name,
+              )
               ready_pod = pod_name
               break
             if phase != last_phase:
-              logging.info('[swap_encryption] Pod %s phase: %s', pod_name, phase)
+              logging.info(
+                  '[swap_encryption] Pod %s phase: %s', pod_name, phase
+              )
               last_phase = phase
               if phase in ('Pending',):
                 _log_pod_events(pod_name)
@@ -594,39 +703,63 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
 
     # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
     if ready_pod is not None:
-      sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand([
-          'exec', ready_pod, '-n', _DS_NAMESPACE,
-          '--', 'test', '-f', '/tmp/pkb_ready',
-      ], raise_on_failure=False)
+      sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
+          [
+              'exec',
+              ready_pod,
+              '-n',
+              _DS_NAMESPACE,
+              '--',
+              'test',
+              '-f',
+              '/tmp/pkb_ready',
+          ],
+          raise_on_failure=False,
+      )
       if sentinel_rc == 0:
         logging.info(
-            '[swap_encryption] Pod %s ready (tools installed)', ready_pod)
+            '[swap_encryption] Pod %s ready (tools installed)', ready_pod
+        )
         return ready_pod
-      # "container not found" means the container crashed (CrashLoopBackOff or
-      # exited) — treat it as a hard reset: re-check pod phase on next iteration.
-      if ('container not found' in sentinel_err
-          or 'unable to upgrade connection' in sentinel_err):
-        logging.warning('[swap_encryption] Pod %s: container not running (%s) '
-                        '— will re-check pod state', ready_pod, sentinel_err.strip())
+      # "container not found" means the container crashed (CrashLoopBackOff
+      # or exited) — hard reset: re-check pod phase on next iteration.
+      if (
+          'container not found' in sentinel_err
+          or 'unable to upgrade connection' in sentinel_err
+      ):
+        logging.warning(
+            '[swap_encryption] Pod %s: container not running (%s)'
+            ' — will re-check pod state',
+            ready_pod,
+            sentinel_err.strip(),
+        )
         ready_pod = None
         last_phase = ''
       else:
         logging.info(
-            '[swap_encryption] Pod %s: still installing tools...', ready_pod)
+            '[swap_encryption] Pod %s: still installing tools...', ready_pod
+        )
 
     time.sleep(15)
 
   logging.warning(
-      '[swap_encryption] Benchmark pod not ready after %ds', timeout)
+      '[swap_encryption] Benchmark pod not ready after %ds', timeout
+  )
   return None
 
 
 def _log_pod_events(pod_name: str) -> None:
-  """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
-  events_out, _, _ = kubectl.RunKubectlCommand([
-      'describe', 'pod', pod_name,
-      '-n', _DS_NAMESPACE,
-  ], raise_on_failure=False)
+  """Dump recent Kubernetes events for the pod to diagnose startup hangs."""
+  events_out, _, _ = kubectl.RunKubectlCommand(
+      [
+          'describe',
+          'pod',
+          pod_name,
+          '-n',
+          _DS_NAMESPACE,
+      ],
+      raise_on_failure=False,
+  )
   # Only log the Events section to keep output manageable
   in_events = False
   lines = []
@@ -638,106 +771,30 @@ def _log_pod_events(pod_name: str) -> None:
   if lines:
     logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30]))
   else:
-    logging.info('[swap_encryption] kubectl describe output:\n%s',
-                 events_out[-2000:] if len(events_out) > 2000 else events_out)
+    logging.info(
+        '[swap_encryption] kubectl describe output:\n%s',
+        events_out[-2000:] if len(events_out) > 2000 else events_out,
+    )
 
 
 def _delete_daemonset() -> None:
   """Delete the benchmark DaemonSet."""
-  kubectl.RunKubectlCommand([
-      'delete', 'daemonset', _DS_NAME,
-      '-n', _DS_NAMESPACE,
-      '--ignore-not-found',
-  ], raise_on_failure=False)
+  kubectl.RunKubectlCommand(
+      [
+          'delete',
+          'daemonset',
+          _DS_NAME,
+          '-n',
+          _DS_NAMESPACE,
+          '--ignore-not-found',
+      ],
+      raise_on_failure=False,
+  )
   logging.info('[swap_encryption] DaemonSet deleted')
 
 
-def _build_node_startup_script(enable_dmcrypt: bool, lssd: bool) -> str:
-  """Return a bash startup script for the benchmark nodepool.
-
-  NOTE: This function is not currently used. GKE reserves the
-  `startup-script` node metadata key, so dm-crypt setup is performed
-  from within the privileged DaemonSet pod instead (see
-  _setup_gke_hyperdisk_swap / _setup_gke_lssd_swap). Kept as reference.
-
-  Args:
-    enable_dmcrypt: When True, wrap the swap device in dm-crypt plain
-      mode (aes-xts-plain64, ephemeral random key) matching GKE's
-      go/node:swap-encryption implementation.
-    lssd: When True, build a RAID-0 array across all local SSDs before
-      setting up swap (matches go/gke-swap-lssd).
-
-  Returns:
-    A bash script string suitable for running as root at node boot.
-  """
-  dmcrypt_str = 'true' if enable_dmcrypt else 'false'
-  lssd_str = 'true' if lssd else 'false'
-
-  return textwrap.dedent(f"""\
-    #!/bin/bash
-    # PKB swap_encryption_benchmark — nodepool startup script.
-    # Configures swap once at node boot so all benchmark phases see a
-    # pre-warmed swap device.  Runs as root on the COS host.
-    set -euo pipefail
-    ENABLE_DMCRYPT={dmcrypt_str}
-    LSSD={lssd_str}
-
-    _wait_dev() {{
-      local d=$1 i
-      for i in $(seq 1 30); do [ -b "$d" ] && return 0; sleep 2; done
-      echo "[pkb-startup] device $d not ready" >&2; return 1
-    }}
-
-    _boot_dev() {{
-      lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1 || echo nvme0n1
-    }}
-
-    if $LSSD; then
-      BOOT=$(_boot_dev)
-      # Collect all non-rotational non-boot block devices (local SSDs)
-      DEVS=$(lsblk -d -o NAME,ROTA | awk '$2=="0"{{print "/dev/"$1}}' | grep -v "/dev/$BOOT" || true)
-      N=$(echo "$DEVS" | grep -c /dev/ || true)
-      if [ "$N" -gt 1 ]; then
-        modprobe raid0 || true
-        # shellcheck disable=SC2086
-        mdadm --create /dev/md0 --level=0 --raid-devices="$N" $DEVS --force
-        TARGET=/dev/md0
-      elif [ "$N" -eq 1 ]; then
-        TARGET=$(echo "$DEVS" | head -1)
-      else
-        echo "[pkb-startup] no LSSD devices found; skipping swap setup" >&2
-        exit 0
-      fi
-    else
-      BOOT=$(_boot_dev)
-      RAW=$(lsblk -d -o NAME,TYPE | awk '$2=="disk"{{print $1}}' | grep -v "^$BOOT$" | head -1 || true)
-      if [ -z "$RAW" ]; then
-        echo "[pkb-startup] no secondary disk found for hyperdisk swap" >&2
-        exit 0
-      fi
-      TARGET=/dev/$RAW
-    fi
-
-    _wait_dev "$TARGET"
-
-    if $ENABLE_DMCRYPT; then
-      modprobe dm-crypt || true
-      dd if=/dev/urandom bs=32 count=1 2>/dev/null | \\
-        cryptsetup open --type plain \\
-          --cipher aes-xts-plain64 --key-size 256 \\
-          --key-file=- "$TARGET" pkb_swap
-      SWAP_DEV=/dev/mapper/pkb_swap
-    else
-      SWAP_DEV=$TARGET
-    fi
-
-    mkswap "$SWAP_DEV"
-    swapon "$SWAP_DEV"
-    echo "[pkb-startup] swap active on $SWAP_DEV (dmcrypt=$ENABLE_DMCRYPT lssd=$LSSD)"
-  """)
-
-
-_HYPERDISK_MAX_IOPS_PER_MBPS = 256  # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s
+# GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput.
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256
 
 
 def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
@@ -755,7 +812,11 @@ def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
     logging.warning(
         '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for '
         '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d',
-        throughput, iops, min_tput, min_tput)
+        throughput,
+        iops,
+        min_tput,
+        min_tput,
+    )
     return min_tput
   return throughput
 
@@ -795,15 +856,27 @@ def _create_benchmark_node_pool(cluster) -> None:
 
   disk_type = _BOOT_DISK_TYPE.value
   cmd = [
-      'gcloud', 'container', 'node-pools', 'create', _BENCHMARK_NODEPOOL,
-      '--cluster',      cluster.name,
-      '--project',      cluster.project,
-      '--machine-type', machine_type,
-      '--image-type',   _NODE_IMAGE_TYPE.value,
-      '--disk-type',    disk_type,
-      '--disk-size',    str(disk_size_gb),
-      '--num-nodes',    '1',
-      '--node-labels',  f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+      'gcloud',
+      'container',
+      'node-pools',
+      'create',
+      _BENCHMARK_NODEPOOL,
+      '--cluster',
+      cluster.name,
+      '--project',
+      cluster.project,
+      '--machine-type',
+      machine_type,
+      '--image-type',
+      _NODE_IMAGE_TYPE.value,
+      '--disk-type',
+      disk_type,
+      '--disk-size',
+      str(disk_size_gb),
+      '--num-nodes',
+      '1',
+      '--node-labels',
+      f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
       '--no-enable-autoupgrade',
       '--no-enable-autorepair',
   ] + zone_flags
@@ -815,10 +888,14 @@ def _create_benchmark_node_pool(cluster) -> None:
   # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max).
   if disk_type.startswith('hyperdisk') and not is_lssd:
     cmd += [
-        '--boot-disk-provisioned-iops', str(_BOOT_DISK_IOPS.value),
+        '--boot-disk-provisioned-iops',
+        str(_BOOT_DISK_IOPS.value),
         '--boot-disk-provisioned-throughput',
-        str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value,
-                                        _BOOT_DISK_THROUGHPUT.value)),
+        str(
+            _valid_hyperdisk_throughput(
+                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+            )
+        ),
     ]
 
   # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm
@@ -826,18 +903,26 @@ def _create_benchmark_node_pool(cluster) -> None:
   if is_lssd:
     cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}']
 
-  logging.info('[swap_encryption] Creating benchmark nodepool: %s / %s / '
-               'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
-               'add_swap_disk=%s',
-               _BENCHMARK_NODEPOOL, machine_type, _NODE_IMAGE_TYPE.value,
-               disk_size_gb, _BOOT_DISK_IOPS.value,
-               _ENABLE_DMCRYPT.value, is_lssd, _ADD_SWAP_DISK.value)
+  logging.info(
+      '[swap_encryption] Creating benchmark nodepool: %s / %s / '
+      'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
+      'add_swap_disk=%s',
+      _BENCHMARK_NODEPOOL,
+      machine_type,
+      _NODE_IMAGE_TYPE.value,
+      disk_size_gb,
+      _BOOT_DISK_IOPS.value,
+      _ENABLE_DMCRYPT.value,
+      is_lssd,
+      _ADD_SWAP_DISK.value,
+  )
 
   # LSSD nodepools take longer to provision than PD-only nodepools because
   # GKE must also initialise the local NVMe devices before marking nodes Ready.
   # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
-  stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=1200,
-                                            raise_on_failure=False)
+  stdout, stderr, rc = vm_util.IssueCommand(
+      cmd, timeout=1200, raise_on_failure=False
+  )
 
   if rc != 0:
     # Idempotent prepare: if the nodepool already exists (e.g. re-running
@@ -846,11 +931,13 @@ def _create_benchmark_node_pool(cluster) -> None:
     # "Already exists" message in this case.
     low = (stderr or '').lower()
     if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low:
-      logging.info('[swap_encryption] Benchmark nodepool already exists — '
-                   'reusing it (idempotent prepare); proceeding to DaemonSet')
+      logging.info(
+          '[swap_encryption] Benchmark nodepool already exists — '
+          'reusing it (idempotent prepare); proceeding to DaemonSet'
+      )
       return
     raise errors.Benchmarks.RunError(
-        f'[swap_encryption] Failed to create benchmark nodepool '
+        '[swap_encryption] Failed to create benchmark nodepool '
         f'(rc={rc}): {stderr}'
     )
   logging.info('[swap_encryption] Benchmark nodepool ready')
@@ -869,28 +956,38 @@ def _wait_for_benchmark_node(timeout: int = 900) -> None:
   pkb_nodepool=benchmark has Ready=True, then returns.
   """
   deadline = time.time() + timeout
-  logging.info('[swap_encryption] Waiting for benchmark node '
-               '(pkb_nodepool=benchmark) to be Ready...')
+  logging.info(
+      '[swap_encryption] Waiting for benchmark node '
+      '(pkb_nodepool=benchmark) to be Ready...'
+  )
   while time.time() < deadline:
-    out, _, rc = kubectl.RunKubectlCommand([
-        'get', 'nodes',
-        '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-        '-o', r'jsonpath={range .items[*]}'
-               r'{.metadata.name}{"\t"}'
-               r'{range .status.conditions[?(@.type=="Ready")]}'
-               r'{.status}{"\n"}{end}{end}',
-    ], raise_on_failure=False)
+    out, _, rc = kubectl.RunKubectlCommand(
+        [
+            'get',
+            'nodes',
+            '-l',
+            f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+            '-o',
+            r'jsonpath={range .items[*]}'
+            r'{.metadata.name}{"\t"}'
+            r'{range .status.conditions[?(@.type=="Ready")]}'
+            r'{.status}{"\n"}{end}{end}',
+        ],
+        raise_on_failure=False,
+    )
 
     if rc == 0 and out.strip():
       for line in out.strip().splitlines():
         parts = line.split('\t')
         if len(parts) == 2 and parts[1].strip() == 'True':
-          logging.info('[swap_encryption] Benchmark node ready: %s',
-                       parts[0].strip())
+          logging.info(
+              '[swap_encryption] Benchmark node ready: %s', parts[0].strip()
+          )
           return
 
-    logging.info('[swap_encryption] Benchmark node not yet Ready — '
-                 'retrying in 15 s...')
+    logging.info(
+        '[swap_encryption] Benchmark node not yet Ready — retrying in 15 s...'
+    )
     time.sleep(15)
 
   raise errors.Benchmarks.RunError(
@@ -922,7 +1019,8 @@ def _attach_swap_disk(cluster) -> None:
     zone = cluster.region
   if not zone:
     raise errors.Benchmarks.RunError(
-        '[swap_encryption] Cannot attach swap disk: cluster zone unknown')
+        '[swap_encryption] Cannot attach swap disk: cluster zone unknown'
+    )
 
   project = cluster.project
   disk_name = f'pkb-swap-{cluster.name}'
@@ -930,60 +1028,97 @@ def _attach_swap_disk(cluster) -> None:
   disk_size_gb = _SWAP_DISK_SIZE_GB.value
 
   # ── Step 1: get the GCE instance name of the benchmark node ───────────────
-  node_out, _, rc = kubectl.RunKubectlCommand([
-      'get', 'nodes',
-      '-l', f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-      '-o', 'jsonpath={.items[0].metadata.name}',
-  ], raise_on_failure=False)
+  node_out, _, rc = kubectl.RunKubectlCommand(
+      [
+          'get',
+          'nodes',
+          '-l',
+          f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
+          '-o',
+          'jsonpath={.items[0].metadata.name}',
+      ],
+      raise_on_failure=False,
+  )
   instance_name = node_out.strip()
   if rc != 0 or not instance_name:
     raise errors.Benchmarks.RunError(
-        '[swap_encryption] Cannot find benchmark node for swap disk attach')
+        '[swap_encryption] Cannot find benchmark node for swap disk attach'
+    )
   logging.info('[swap_encryption] Benchmark node instance: %s', instance_name)
 
   # ── Step 2: create the hyperdisk ──────────────────────────────────────────
-  logging.info('[swap_encryption] Creating swap disk %s (%dGiB %s)',
-               disk_name, disk_size_gb, disk_type)
+  logging.info(
+      '[swap_encryption] Creating swap disk %s (%dGiB %s)',
+      disk_name,
+      disk_size_gb,
+      disk_type,
+  )
   create_cmd = [
-      'gcloud', 'compute', 'disks', 'create', disk_name,
-      '--project', project,
-      '--zone', zone,
-      '--type', disk_type,
-      '--size', f'{disk_size_gb}GB',
+      'gcloud',
+      'compute',
+      'disks',
+      'create',
+      disk_name,
+      '--project',
+      project,
+      '--zone',
+      zone,
+      '--type',
+      disk_type,
+      '--size',
+      f'{disk_size_gb}GB',
       '--quiet',
   ]
   if disk_type.startswith('hyperdisk'):
     create_cmd += [
-        '--provisioned-iops', str(_BOOT_DISK_IOPS.value),
+        '--provisioned-iops',
+        str(_BOOT_DISK_IOPS.value),
         '--provisioned-throughput',
-        str(_valid_hyperdisk_throughput(_BOOT_DISK_IOPS.value,
-                                        _BOOT_DISK_THROUGHPUT.value)),
+        str(
+            _valid_hyperdisk_throughput(
+                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+            )
+        ),
     ]
-  _, stderr, rc = vm_util.IssueCommand(create_cmd, timeout=120,
-                                       raise_on_failure=False)
+  _, stderr, rc = vm_util.IssueCommand(
+      create_cmd, timeout=120, raise_on_failure=False
+  )
   if rc != 0:
     raise errors.Benchmarks.RunError(
-        f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}')
+        f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}'
+    )
 
   # ── Step 3: attach the disk to the node VM ────────────────────────────────
-  logging.info('[swap_encryption] Attaching swap disk %s to %s',
-               disk_name, instance_name)
+  logging.info(
+      '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name
+  )
   attach_cmd = [
-      'gcloud', 'compute', 'instances', 'attach-disk', instance_name,
-      '--project', project,
-      '--zone', zone,
-      '--disk', disk_name,
-      '--device-name', 'pkb-swap',
+      'gcloud',
+      'compute',
+      'instances',
+      'attach-disk',
+      instance_name,
+      '--project',
+      project,
+      '--zone',
+      zone,
+      '--disk',
+      disk_name,
+      '--device-name',
+      'pkb-swap',
       '--quiet',
   ]
-  _, stderr, rc = vm_util.IssueCommand(attach_cmd, timeout=120,
-                                       raise_on_failure=False)
+  _, stderr, rc = vm_util.IssueCommand(
+      attach_cmd, timeout=120, raise_on_failure=False
+  )
   if rc != 0:
     raise errors.Benchmarks.RunError(
         f'[swap_encryption] Failed to attach swap disk to {instance_name}: '
-        f'{stderr}')
-  logging.info('[swap_encryption] Swap disk attached: %s → %s',
-               disk_name, instance_name)
+        f'{stderr}'
+    )
+  logging.info(
+      '[swap_encryption] Swap disk attached: %s → %s', disk_name, instance_name
+  )
 
 
 def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
@@ -996,35 +1131,85 @@ def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
   """
   for attempt in range(1, 5):
     users, _, rc = vm_util.IssueCommand(
-        ['gcloud', 'compute', 'disks', 'describe', disk_name,
-         '--project', project, '--zone', zone, '--format=value(users)'],
-        timeout=60, raise_on_failure=False)
+        [
+            'gcloud',
+            'compute',
+            'disks',
+            'describe',
+            disk_name,
+            '--project',
+            project,
+            '--zone',
+            zone,
+            '--format=value(users)',
+        ],
+        timeout=60,
+        raise_on_failure=False,
+    )
     if rc != 0:
-      logging.info('[swap_encryption] Swap disk %s not present — nothing to '
-                   'delete', disk_name)
+      logging.info(
+          '[swap_encryption] Swap disk %s not present — nothing to delete',
+          disk_name,
+      )
       return True  # already gone
     user = users.strip()
     if user:
       inst = user.split('/')[-1]
-      logging.info('[swap_encryption] Detaching swap disk %s from %s',
-                   disk_name, inst)
+      logging.info(
+          '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst
+      )
       vm_util.IssueCommand(
-          ['gcloud', 'compute', 'instances', 'detach-disk', inst,
-           '--project', project, '--zone', zone, '--disk', disk_name,
-           '--quiet'], timeout=120, raise_on_failure=False)
+          [
+              'gcloud',
+              'compute',
+              'instances',
+              'detach-disk',
+              inst,
+              '--project',
+              project,
+              '--zone',
+              zone,
+              '--disk',
+              disk_name,
+              '--quiet',
+          ],
+          timeout=120,
+          raise_on_failure=False,
+      )
     _, derr, drc = vm_util.IssueCommand(
-        ['gcloud', 'compute', 'disks', 'delete', disk_name,
-         '--project', project, '--zone', zone, '--quiet'],
-        timeout=180, raise_on_failure=False)
+        [
+            'gcloud',
+            'compute',
+            'disks',
+            'delete',
+            disk_name,
+            '--project',
+            project,
+            '--zone',
+            zone,
+            '--quiet',
+        ],
+        timeout=180,
+        raise_on_failure=False,
+    )
     if drc == 0:
       logging.info('[swap_encryption] Swap disk deleted: %s', disk_name)
       return True
-    logging.warning('[swap_encryption] Swap disk delete attempt %d/4 failed '
-                    '(%s); retrying in 10s', attempt, derr.strip()[:160])
+    logging.warning(
+        '[swap_encryption] Swap disk delete attempt %d/4 failed '
+        '(%s); retrying in 10s',
+        attempt,
+        derr.strip()[:160],
+    )
     time.sleep(10)
-  logging.error('[swap_encryption] Could NOT delete swap disk %s after retries '
-                '— delete it manually: gcloud compute disks delete %s '
-                '--zone %s --quiet', disk_name, disk_name, zone)
+  logging.error(
+      '[swap_encryption] Could NOT delete swap disk %s after retries '
+      '— delete it manually: gcloud compute disks delete %s '
+      '--zone %s --quiet',
+      disk_name,
+      disk_name,
+      zone,
+  )
   return False
 
 
@@ -1054,19 +1239,30 @@ def _delete_default_node_pool(cluster) -> None:
     zone_flags = ['--region', cluster.region]
 
   cmd = [
-      'gcloud', 'container', 'node-pools', 'delete', _DEFAULT_NODEPOOL,
-      '--cluster', cluster.name,
-      '--project', cluster.project,
+      'gcloud',
+      'container',
+      'node-pools',
+      'delete',
+      _DEFAULT_NODEPOOL,
+      '--cluster',
+      cluster.name,
+      '--project',
+      cluster.project,
       '--quiet',
   ] + zone_flags
 
   logging.info(
-      '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL)
-  stdout, stderr, rc = vm_util.IssueCommand(cmd, timeout=300,
-                                            raise_on_failure=False)
+      '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL
+  )
+  stdout, stderr, rc = vm_util.IssueCommand(
+      cmd, timeout=300, raise_on_failure=False
+  )
   if rc != 0:
-    logging.warning('[swap_encryption] Could not delete default nodepool '
-                    '(rc=%d): %s', rc, stderr)
+    logging.warning(
+        '[swap_encryption] Could not delete default nodepool (rc=%d): %s',
+        rc,
+        stderr,
+    )
   else:
     logging.info('[swap_encryption] Default nodepool deleted')
 
@@ -1079,9 +1275,17 @@ def _is_pod_gone(pod: str) -> bool:
   """
   try:
     _, err, rc = kubectl.RunKubectlCommand(
-        ['get', 'pod', pod, '-n', _DS_NAMESPACE,
-         '-o', 'jsonpath={.metadata.name}'],
-        raise_on_failure=False, timeout=15,
+        [
+            'get',
+            'pod',
+            pod,
+            '-n',
+            _DS_NAMESPACE,
+            '-o',
+            'jsonpath={.metadata.name}',
+        ],
+        raise_on_failure=False,
+        timeout=15,
     )
     return rc != 0 and 'not found' in (err or '').lower()
   except Exception:  # pylint: disable=broad-except
@@ -1112,33 +1316,27 @@ def _pod_exec(
   Returns:
     Tuple of (stdout, stderr) strings.
   """
-  _TRANSIENT_ERRORS = ('connection reset by peer', 'websocket: close')
-  # Errors that indicate the container/pod is gone and needs recovery.
-  # 'not found' covers "Error from server (NotFound): pods ... not found"
-  # which occurs when the DaemonSet pod was evicted and recreated under a
-  # new name (e.g. after OOM-triggered node pressure eviction).
-  # 'deleted state' covers "cannot exec in a deleted state" — the container
-  # was OOM-killed and is mid-termination (not yet recreated).
-  _CONTAINER_GONE_ERRORS = ('container not found', 'procReady not received',
-                             'unable to upgrade connection', 'not found',
-                             'deleted state')
+  # Use module-level constants for error strings (defined at top of module).
   # Use the globally-tracked active pod name — it may have been updated by
   # a previous _recover_pod call when eviction replaced the pod.
   active = _active_pod[0] if _active_pod else pod
 
   for attempt in range(_retries + 1):
     out, err, rc = kubectl.RunKubectlCommand(
-        ['exec', active, '-n', _DS_NAMESPACE,
-         '--', 'bash', '-c', cmd],
+        ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd],
         raise_on_failure=False,
-        raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
+        # Retry loop in _pod_exec handles transient resets.
+        raise_on_timeout=False,
         timeout=timeout,
     )
-    is_transient = rc != 0 and any(e in err for e in _TRANSIENT_ERRORS)
+    is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS)
     if is_transient and attempt < _retries:
       logging.warning(
           '[swap_encryption] kubectl exec connection reset (attempt %d/%d); '
-          'retrying in 10 s', attempt + 1, _retries + 1)
+          'retrying in 10 s',
+          attempt + 1,
+          _retries + 1,
+      )
       time.sleep(10)
       continue
     # rc=137 (SIGKILL): the OOM killer terminated the container process.
@@ -1162,27 +1360,33 @@ def _pod_exec(
       # "Error from server (NotFound): pods … not found".
       logging.warning(
           '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update '
-          'pod state before recovery check')
+          'pod state before recovery check'
+      )
       time.sleep(15)
       pod_gone = _is_pod_gone(active)
       if pod_gone:
         logging.warning(
-            '[swap_encryption] OOM-eviction detected (rc=137, pod gone) — '
-            'recovering pod name for subsequent commands (not retrying this cmd)')
+            '[swap_encryption] OOM-eviction (rc=137, pod gone) —'
+            ' recovering pod name (cmd not retried)'
+        )
       else:
         logging.warning(
-            '[swap_encryption] Container OOM-killed (rc=137, pod still exists) — '
-            'waiting for container restart and tool re-install before continuing')
+            '[swap_encryption] OOM-kill (rc=137, pod exists) —'
+            ' waiting for container restart before continuing'
+        )
       new_pod = _recover_pod(active)
       if new_pod != active:
-        logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod)
+        logging.info(
+            '[swap_encryption] Pod name updated: %s → %s', active, new_pod
+        )
         if _active_pod:
           _active_pod[0] = new_pod
         active = new_pod
       break  # Do NOT retry — the OOM cmd itself is not re-run on the new pod.
 
-    is_container_gone = (rc != 0 and
-                         any(e in err.lower() for e in _CONTAINER_GONE_ERRORS))
+    is_container_gone = rc != 0 and any(
+        e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
+    )
     if is_container_gone:
       # Record the loss for the run-level degradation gate REGARDLESS of retry
       # budget or ignore_failure.  A "pods … not found" on a best-effort command
@@ -1193,14 +1397,22 @@ def _pod_exec(
         _pod_lost.append(active)
         logging.error(
             '[swap_encryption] Benchmark pod %s is gone (%s) — recording run '
-            'as degraded', active, (err or '').strip()[:160])
+            'as degraded',
+            active,
+            (err or '').strip()[:160],
+        )
       if attempt < _retries:
         logging.warning(
             '[swap_encryption] Container gone/restarting (attempt %d/%d) — '
-            'waiting for pod to recover...', attempt + 1, _retries + 1)
+            'waiting for pod to recover...',
+            attempt + 1,
+            _retries + 1,
+        )
         new_pod = _recover_pod(active)
         if new_pod != active:
-          logging.info('[swap_encryption] Pod name updated: %s → %s', active, new_pod)
+          logging.info(
+              '[swap_encryption] Pod name updated: %s → %s', active, new_pod
+          )
           if _active_pod:
             _active_pod[0] = new_pod
           active = new_pod
@@ -1209,7 +1421,8 @@ def _pod_exec(
 
   if rc != 0 and not ignore_failure:
     raise errors.VmUtil.IssueCommandError(
-        f'[swap_encryption] _pod_exec failed (rc={rc}): {err}')
+        f'[swap_encryption] _pod_exec failed (rc={rc}): {err}'
+    )
   return out, err
 
 
@@ -1227,8 +1440,11 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
   Returns the (possibly new) pod name once it is Running and ready.
   """
   deadline = time.time() + timeout_sec
-  logging.info('[swap_encryption] Waiting for pod %s to recover '
-               '(up to %ds)...', pod, timeout_sec)
+  logging.info(
+      '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
+      pod,
+      timeout_sec,
+  )
 
   # Phase 1: wait for a Running pod — either the named one (container
   # restart) or a replacement pod found via label selector (eviction).
@@ -1246,9 +1462,17 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
     # lives entirely in status_err.  Discarding stderr (using _) means the
     # 'not found' check below never fires and we spin until deadline.
     status_out, status_err, status_rc = kubectl.RunKubectlCommand(
-        ['get', 'pod', pod, '-n', _DS_NAMESPACE,
-         '-o', 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'],
-        raise_on_failure=False, timeout=30,
+        [
+            'get',
+            'pod',
+            pod,
+            '-n',
+            _DS_NAMESPACE,
+            '-o',
+            'jsonpath={.status.phase}|{.metadata.deletionTimestamp}',
+        ],
+        raise_on_failure=False,
+        timeout=30,
     )
     # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating)
     fields = status_out.strip().split('|')
@@ -1262,62 +1486,91 @@ def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
     # Pod no longer exists, OR it exists but is being terminated (Terminating
     # state or deletionTimestamp set) — look for a replacement pod by label.
     pod_gone_or_terminating = (
-        (status_rc != 0 and 'not found' in (status_out + status_err).lower())
-        or is_terminating
-    )
+        status_rc != 0 and 'not found' in (status_out + status_err).lower()
+    ) or is_terminating
     if pod_gone_or_terminating:
       label_out, _, label_rc = kubectl.RunKubectlCommand(
-          ['get', 'pods', '-n', _DS_NAMESPACE,
-           '-l', f'app={_DS_LABEL}',
-           '-o', 'jsonpath={range .items[?(@.status.phase=="Running")]}'
-                 '{.metadata.name}{"\\n"}{end}'],
-          raise_on_failure=False, timeout=30,
+          [
+              'get',
+              'pods',
+              '-n',
+              _DS_NAMESPACE,
+              '-l',
+              f'app={_DS_LABEL}',
+              '-o',
+              (
+                  'jsonpath={range .items[?(@.status.phase=="Running")]}'
+                  '{.metadata.name}{"\\n"}{end}'
+              ),
+          ],
+          raise_on_failure=False,
+          timeout=30,
       )
-      new_pods = [p.strip() for p in label_out.strip().splitlines() if p.strip()
-                  and p.strip() != pod]  # exclude the dying pod
+      new_pods = [
+          p.strip()
+          for p in label_out.strip().splitlines()
+          if p.strip() and p.strip() != pod
+      ]  # exclude the dying pod
       if label_rc == 0 and new_pods:
         recovered_pod = new_pods[0]
-        logging.info('[swap_encryption] Original pod %s gone/terminating; '
-                     'found replacement %s', pod, recovered_pod)
+        logging.info(
+            '[swap_encryption] Original pod %s gone/terminating; '
+            'found replacement %s',
+            pod,
+            recovered_pod,
+        )
         break
 
     time.sleep(10)
   else:
     raise errors.VmUtil.IssueCommandError(
         f'[swap_encryption] No Running pod found (original: {pod}) '
-        f'within {timeout_sec}s after OOM kill / eviction')
+        f'within {timeout_sec}s after OOM kill / eviction'
+    )
 
   # Phase 2: wait for init script to finish (sentinel written last).
   while time.time() < deadline:
     ready_out, _, ready_rc = kubectl.RunKubectlCommand(
-        ['exec', recovered_pod, '-n', _DS_NAMESPACE,
-         '--', 'bash', '-c', 'test -f /tmp/pkb_ready && echo READY'],
-        raise_on_failure=False, timeout=30,
+        [
+            'exec',
+            recovered_pod,
+            '-n',
+            _DS_NAMESPACE,
+            '--',
+            'bash',
+            '-c',
+            'test -f /tmp/pkb_ready && echo READY',
+        ],
+        raise_on_failure=False,
+        timeout=30,
     )
     if ready_rc == 0 and 'READY' in ready_out:
-      logging.info('[swap_encryption] Pod %s recovered and ready', recovered_pod)
+      logging.info(
+          '[swap_encryption] Pod %s recovered and ready', recovered_pod
+      )
       return recovered_pod
     time.sleep(15)
 
   raise errors.VmUtil.IssueCommandError(
       f'[swap_encryption] Pod {recovered_pod} did not become ready '
-      f'within {timeout_sec}s after OOM kill / eviction')
+      f'within {timeout_sec}s after OOM kill / eviction'
+  )
 
 
 _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
     # GCP  (on-demand, us-central1 unless noted)
     'c4-standard-8-lssd': 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
-    'c4-standard-8':      0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
-    'n4-highmem-32':      3.0256,  # 32 vCPU, 256 GB RAM
-    'n2-highmem-32':      2.5216,  # 32 vCPU, 256 GB RAM
-    'n2-standard-32':     1.5264,  # 32 vCPU, 120 GB RAM
-    'z3-highmem-8':       2.7248,  # 8 vCPU + 4× LSSD
+    'c4-standard-8': 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
+    'n4-highmem-32': 3.0256,  # 32 vCPU, 256 GB RAM
+    'n2-highmem-32': 2.5216,  # 32 vCPU, 256 GB RAM
+    'n2-standard-32': 1.5264,  # 32 vCPU, 120 GB RAM
+    'z3-highmem-8': 2.7248,  # 8 vCPU + 4× LSSD
     # AWS
-    'i4i.4xlarge':        1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
-    'i4i.2xlarge':        0.7480,
-    'm6id.4xlarge':       0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
-    'm6i.4xlarge':        0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
-    'r6i.4xlarge':        1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
+    'i4i.4xlarge': 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
+    'i4i.2xlarge': 0.7480,
+    'm6id.4xlarge': 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
+    'm6i.4xlarge': 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
+    'r6i.4xlarge': 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
 }
 
 
@@ -1342,11 +1595,13 @@ def _collect_cost_sample(
   instance_type = ''
 
   # GCP: machine type is the last segment of the metadata URL value
+  _gcp_meta_url = (
+      'http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
+  )
   gcp_type_out, _ = _pod_exec(
       pod,
-      'curl -s -m 3 --fail '
-      'http://metadata.google.internal/computeMetadata/v1/instance/machine-type '
-      '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+      f'curl -s -m 3 --fail {_gcp_meta_url}'
+      ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
       ignore_failure=True,
   )
   if gcp_type_out.strip():
@@ -1434,16 +1689,18 @@ def _detect_swap_device(pod: str) -> str:
   )
 
 
-def _build_metadata(pod: str, swap_dev: str) -> dict:
+def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
   """Collect node environment, encryption type, and config into a dict."""
 
   kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True)
   mem_out, _ = _pod_exec(
-      pod, "awk '/MemTotal/{print $2}' /proc/meminfo",
+      pod,
+      "awk '/MemTotal/{print $2}' /proc/meminfo",
       ignore_failure=True,
   )
   swap_out, _ = _pod_exec(
-      pod, "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
+      pod,
+      "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
       ignore_failure=True,
   )
 
@@ -1468,9 +1725,9 @@ def _build_metadata(pod: str, swap_dev: str) -> dict:
     )
     enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
   elif _SWAP_TYPE.value in ('instance_store', 'io2'):
-    enc = 'nitro_hardware_offload'   # AWS: encrypted by the Nitro card
+    enc = 'nitro_hardware_offload'  # AWS: encrypted by the Nitro card
   elif not _ENABLE_DMCRYPT.value:
-    enc = 'none'                      # GKE plain swap (encryption OFF)
+    enc = 'none'  # GKE plain swap (encryption OFF)
 
   cloud = _detect_cloud(pod)
 
@@ -1479,11 +1736,14 @@ def _build_metadata(pod: str, swap_dev: str) -> dict:
   # cloud metadata so that the field is always populated.
   instance_label = _INSTANCE_SIZE_LABEL.value
   if not instance_label:
+    _gcp_mt_url = (
+        'http://metadata.google.internal'
+        '/computeMetadata/v1/instance/machine-type'
+    )
     gcp_type_out, _ = _pod_exec(
         pod,
-        'curl -s -m 3 --fail '
-        'http://metadata.google.internal/computeMetadata/v1/instance/machine-type '
-        '-H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        f'curl -s -m 3 --fail {_gcp_mt_url}'
+        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
         ignore_failure=True,
     )
     if gcp_type_out.strip():
@@ -1527,3 +1787,32 @@ def _build_metadata(pod: str, swap_dev: str) -> dict:
       'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
       'nodepool': _NODEPOOL.value,
   }
+
+
+def _detect_cloud(pod: str) -> str:
+  """Detect whether the benchmark pod is running on GCP or AWS.
+
+  Queries the cloud instance metadata endpoint inside the pod.  Returns
+  'GCP' if the GCP metadata server responds, 'AWS' otherwise.
+  """
+  gcp_out, _ = _pod_exec(
+      pod,
+      'curl -s -m 2 --fail '
+      'http://metadata.google.internal/computeMetadata/v1/project/project-id'
+      ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+      ignore_failure=True,
+  )
+  if gcp_out.strip():
+    return 'GCP'
+  return 'AWS'
+
+
+def _ensure_io2_volume() -> None:
+  """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2).
+
+  Only executed when --swap_encryption_swap_type=io2.  Full implementation
+  is deferred to PR2 (swap-capability layer).
+  """
+  if _SWAP_TYPE.value != 'io2':
+    return
+  logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')
\ No newline at end of file

From 6489d06d58e3c98c35ff877eb503c9172f0fab67 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Thu, 19 Jun 2025 10:46:41 +0000
Subject: [PATCH 03/17] swap_encryption: add GKE kubelet memorySwapBehavior
 config

Per Ajay's review comment on PR #6758:
- Add _GKE_KUBELET_MEMORY_SWAP flag (default LimitedSwap) so the
  benchmark nodepool is created with kubeletConfig.memorySwapBehavior
  set via --system-config-from-file, enabling pod-level swap usage.
- Wrap gcloud IssueCommand in try/finally to clean up the temp YAML.
- Update nodepool creation log to include kubelet_swap value.

Also included (no change to benchmark logic): reword several log and
warning messages, inline the GCP metadata URL in the curl commands, and
add the missing newline at end of file.
---
 .../swap_encryption_benchmark.py              | 123 ++++++++++++------
 1 file changed, 84 insertions(+), 39 deletions(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 026831efe0..215a9b40f3 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -64,6 +64,8 @@
 """
 
 import logging
+import os
+import tempfile
 import textwrap
 import time
 from typing import Any
@@ -282,6 +284,18 @@
     '(unencrypted) swap overhead as a baseline.',
 )
 
+_GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string(
+    'swap_encryption_gke_kubelet_memory_swap',
+    'LimitedSwap',
+    'Value for kubeletConfig.memorySwapBehavior injected via '
+    '--system-config-from-file when creating the GKE benchmark nodepool.  '
+    'LimitedSwap (default) — the kubelet allows pods to use swap up to their '
+    'memory limit; required for the DaemonSet pod to drive kernel swapping.  '
+    'NoSwap — disables swap at the kubelet level (use for a baseline run that '
+    'confirms zero swap activity).  Set empty string to omit the flag entirely '
+    'and rely on the cluster-level default.',
+)
+
 _SWAP_DEVICE = flags.DEFINE_string(
     'swap_encryption_device',
     '',
@@ -547,9 +561,10 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
     )
   if _pod_lost:
     _degraded_reasons.append(
-        f'pod(s) NotFound during run: {", ".join(_pod_lost)} — pod died'
-        ' (eviction/exit); phases at/after that point (e.g.'
-        ' kernel-build, OpenSearch) produced invalid data'
+        'benchmark pod(s) went NotFound during the run'
+        f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure'
+        ' eviction or container exit) and any phase running at or after that'
+        ' point (e.g. kernel-build baseline, OpenSearch) produced invalid data'
     )
   if _oom_events:
     _degraded_reasons.append(
@@ -598,10 +613,9 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
     _pod_exec(
         pod,
         textwrap.dedent("""
-          swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
-          dmsetup remove --noudevrules --noudevsync \
-            swap_encrypted 2>/dev/null || true
-        """),
+      swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+      dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    """),
         ignore_failure=True,
     )
     # Clean up loop device backing files (single-disk fallback path).
@@ -622,9 +636,7 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
         ignore_failure=True,
     )
     _pod_exec(
-        pod,
-        "pkill -9 'stress-ng|fio' 2>/dev/null || true",
-        ignore_failure=True,
+        pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", ignore_failure=True
     )
 
   _delete_daemonset()
@@ -672,8 +684,10 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
               '-n',
               _DS_NAMESPACE,
               '-o',
-              r'jsonpath={range .items[*]}{.metadata.name}'
-              r'{"\t"}{.status.phase}{"\n"}{end}',
+              (
+                  r'jsonpath={range'
+                  r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
+              ),
           ],
           raise_on_failure=False,
       )
@@ -721,15 +735,15 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
             '[swap_encryption] Pod %s ready (tools installed)', ready_pod
         )
         return ready_pod
-      # "container not found" means the container crashed (CrashLoopBackOff
-      # or exited) — hard reset: re-check pod phase on next iteration.
+      # "container not found" means the container crashed (CrashLoopBackOff or
+      # exited) — treat it as a hard reset: re-check pod phase on next iteration.
       if (
           'container not found' in sentinel_err
           or 'unable to upgrade connection' in sentinel_err
       ):
         logging.warning(
-            '[swap_encryption] Pod %s: container not running (%s)'
-            ' — will re-check pod state',
+            '[swap_encryption] Pod %s: container not running (%s) '
+            '— will re-check pod state',
             ready_pod,
             sentinel_err.strip(),
         )
@@ -749,7 +763,7 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
 
 
 def _log_pod_events(pod_name: str) -> None:
-  """Dump recent Kubernetes events for the pod to diagnose startup hangs."""
+  """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
   events_out, _, _ = kubectl.RunKubectlCommand(
       [
           'describe',
@@ -793,8 +807,9 @@ def _delete_daemonset() -> None:
   logging.info('[swap_encryption] DaemonSet deleted')
 
 
-# GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput.
-_HYPERDISK_MAX_IOPS_PER_MBPS = 256
+_HYPERDISK_MAX_IOPS_PER_MBPS = (
+    256  # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s
+)
 
 
 def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
@@ -903,10 +918,36 @@ def _create_benchmark_node_pool(cluster) -> None:
   if is_lssd:
     cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}']
 
+  # ── GKE kubelet swap config ───────────────────────────────────────────────
+  # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
+  # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
+  # so that the kubelet allocates swap to the DaemonSet pod.  Without this flag
+  # the Linux kernel swap device may exist but the kubelet blocks pod-level
+  # swap usage and the benchmark pod cannot drive swap I/O.
+  #
+  # Passed as --system-config-from-file pointing to a temp YAML, which is the
+  # same mechanism PKB's gke_node_system_config flag uses:
+  #   perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+  swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
+  system_config_tmp = None
+  if swap_behavior:
+    kubelet_yaml = f'kubeletConfig:\n  memorySwapBehavior: {swap_behavior}\n'
+    system_config_tmp = tempfile.NamedTemporaryFile(
+        mode='w', suffix='.yaml', delete=False
+    )
+    system_config_tmp.write(kubelet_yaml)
+    system_config_tmp.flush()
+    cmd += ['--system-config-from-file', system_config_tmp.name]
+    logging.info(
+        '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)',
+        swap_behavior,
+        system_config_tmp.name,
+    )
+
   logging.info(
       '[swap_encryption] Creating benchmark nodepool: %s / %s / '
       'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
-      'add_swap_disk=%s',
+      'add_swap_disk=%s / kubelet_swap=%s',
       _BENCHMARK_NODEPOOL,
       machine_type,
       _NODE_IMAGE_TYPE.value,
@@ -915,14 +956,22 @@ def _create_benchmark_node_pool(cluster) -> None:
       _ENABLE_DMCRYPT.value,
       is_lssd,
       _ADD_SWAP_DISK.value,
+      swap_behavior or 'unset',
   )
 
   # LSSD nodepools take longer to provision than PD-only nodepools because
   # GKE must also initialise the local NVMe devices before marking nodes Ready.
   # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
-  stdout, stderr, rc = vm_util.IssueCommand(
-      cmd, timeout=1200, raise_on_failure=False
-  )
+  try:
+    stdout, stderr, rc = vm_util.IssueCommand(
+        cmd, timeout=1200, raise_on_failure=False
+    )
+  finally:
+    if system_config_tmp is not None:
+      try:
+        os.unlink(system_config_tmp.name)
+      except OSError:
+        pass
 
   if rc != 0:
     # Idempotent prepare: if the nodepool already exists (e.g. re-running
@@ -1325,8 +1374,7 @@ def _pod_exec(
     out, err, rc = kubectl.RunKubectlCommand(
         ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd],
         raise_on_failure=False,
-        # Retry loop in _pod_exec handles transient resets.
-        raise_on_timeout=False,
+        raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
         timeout=timeout,
     )
     is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS)
@@ -1366,13 +1414,15 @@ def _pod_exec(
       pod_gone = _is_pod_gone(active)
       if pod_gone:
         logging.warning(
-            '[swap_encryption] OOM-eviction (rc=137, pod gone) —'
-            ' recovering pod name (cmd not retried)'
+            '[swap_encryption] OOM-eviction detected (rc=137, pod gone) —'
+            ' recovering pod name for subsequent commands (not retrying this'
+            ' cmd)'
         )
       else:
         logging.warning(
-            '[swap_encryption] OOM-kill (rc=137, pod exists) —'
-            ' waiting for container restart before continuing'
+            '[swap_encryption] Container OOM-killed (rc=137, pod still exists)'
+            ' — waiting for container restart and tool re-install before'
+            ' continuing'
         )
       new_pod = _recover_pod(active)
       if new_pod != active:
@@ -1595,12 +1645,10 @@ def _collect_cost_sample(
   instance_type = ''
 
   # GCP: machine type is the last segment of the metadata URL value
-  _gcp_meta_url = (
-      'http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
-  )
   gcp_type_out, _ = _pod_exec(
       pod,
-      f'curl -s -m 3 --fail {_gcp_meta_url}'
+      'curl -s -m 3 --fail'
+      ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
       ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
       ignore_failure=True,
   )
@@ -1736,13 +1784,10 @@ def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
   # cloud metadata so that the field is always populated.
   instance_label = _INSTANCE_SIZE_LABEL.value
   if not instance_label:
-    _gcp_mt_url = (
-        'http://metadata.google.internal'
-        '/computeMetadata/v1/instance/machine-type'
-    )
     gcp_type_out, _ = _pod_exec(
         pod,
-        f'curl -s -m 3 --fail {_gcp_mt_url}'
+        'curl -s -m 3 --fail'
+        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
         ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
         ignore_failure=True,
     )
@@ -1815,4 +1860,4 @@ def _ensure_io2_volume() -> None:
   """
   if _SWAP_TYPE.value != 'io2':
     return
-  logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')
\ No newline at end of file
+  logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')

From 95df3acfae53599ab6978b2a34ab46a6079121fa Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Thu, 19 Jun 2025 10:46:42 +0000
Subject: [PATCH 04/17] refactor(swap_encryption): use PKB GcloudCommand
 instead of raw vm_util.IssueCommand

Replace all raw ['gcloud', ...] list + vm_util.IssueCommand calls in
swap_encryption_benchmark.py with PKB's existing GcloudCommand infrastructure:

- _create_benchmark_node_pool: cluster._GcloudCommand() + cmd.flags + cmd.Issue
- _delete_default_node_pool: cluster._GcloudCommand() + cmd.Issue
- _attach_swap_disk: gcp_util.GcloudCommand(_GcpZonalResource) for create+attach
- _delete_disk_by_name: gcp_util.GcloudCommand for describe/detach/delete

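Add _GcpZonalResource shim: pins a single zone for gcloud compute
operations, because cluster._GcloudCommand() switches --zone to --region
for multi-zone clusters, which is wrong for zonal disk commands.
GcloudCommand auto-injects --project and --zone/--region, handles auth
token refresh -- matching PKB standards.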
---
 .../swap_encryption_benchmark.py              | 228 ++++++------------
 1 file changed, 79 insertions(+), 149 deletions(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 215a9b40f3..e325220c03 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -77,6 +77,7 @@
 from perfkitbenchmarker import sample
 from perfkitbenchmarker import vm_util
 from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.providers.gcp import util as gcp_util
 
 FLAGS = flags.FLAGS
 
@@ -388,6 +389,22 @@
 _DEFAULT_NODEPOOL = 'default-pool'
 
 
+class _GcpZonalResource:
+  """Minimal resource shim for gcp_util.GcloudCommand on compute operations.
+
+  gcp_util.GcloudCommand auto-injects --project and --zone from the resource
+  object passed to it.  GkeCluster._GcloudCommand() handles container/*
+  operations correctly but also switches --zone → --region for multi-zone
+  clusters, which is wrong for gcloud compute commands (--region creates
+  regional resources, not zonal ones).  This shim pins a single zone so all
+  gcloud compute calls target the correct AZ.
+  """
+
+  def __init__(self, project: str, zone: str) -> None:
+    self.project = project
+    self.zone = zone
+
+
 def _daemonset_yaml(image: str) -> str:
   """Render the privileged benchmark DaemonSet manifest.
 
@@ -856,12 +873,6 @@ def _create_benchmark_node_pool(cluster) -> None:
   is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower()
 
   # Determine zone/region from the cluster object.
-  zone_flags: list[str] = []
-  if getattr(cluster, 'zones', None):
-    zone_flags = ['--zone', cluster.zones[0]]
-  elif getattr(cluster, 'region', None):
-    zone_flags = ['--region', cluster.region]
-
   # LSSD configs only need a small boot disk (OS only; swap is on local NVMe).
   # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on
   # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk
@@ -870,31 +881,25 @@ def _create_benchmark_node_pool(cluster) -> None:
   disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value
 
   disk_type = _BOOT_DISK_TYPE.value
-  cmd = [
-      'gcloud',
+
+  # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region,
+  # and auth token refresh.  GkeCluster._GcloudCommand also handles the
+  # zone → region promotion for multi-zone / regional clusters.
+  cmd = cluster._GcloudCommand(
       'container',
       'node-pools',
       'create',
       _BENCHMARK_NODEPOOL,
       '--cluster',
       cluster.name,
-      '--project',
-      cluster.project,
-      '--machine-type',
-      machine_type,
-      '--image-type',
-      _NODE_IMAGE_TYPE.value,
-      '--disk-type',
-      disk_type,
-      '--disk-size',
-      str(disk_size_gb),
-      '--num-nodes',
-      '1',
-      '--node-labels',
-      f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-      '--no-enable-autoupgrade',
-      '--no-enable-autorepair',
-  ] + zone_flags
+  )
+  cmd.flags['machine-type'] = machine_type
+  cmd.flags['image-type'] = _NODE_IMAGE_TYPE.value
+  cmd.flags['disk-type'] = disk_type
+  cmd.flags['disk-size'] = disk_size_gb
+  cmd.flags['num-nodes'] = 1
+  cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
+  cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair']
 
   # IOPS and throughput provisioning only applies to hyperdisk-* types AND
   # only when the boot disk is also the swap device (non-LSSD configs).
@@ -902,21 +907,17 @@ def _create_benchmark_node_pool(cluster) -> None:
   # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the
   # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max).
   if disk_type.startswith('hyperdisk') and not is_lssd:
-    cmd += [
-        '--boot-disk-provisioned-iops',
-        str(_BOOT_DISK_IOPS.value),
-        '--boot-disk-provisioned-throughput',
-        str(
-            _valid_hyperdisk_throughput(
-                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
-            )
-        ),
-    ]
+    # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by
+    # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks).
+    cmd.flags['boot-disk-provisioned-iops'] = _BOOT_DISK_IOPS.value
+    cmd.flags['boot-disk-provisioned-throughput'] = _valid_hyperdisk_throughput(
+        _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+    )
 
   # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm
   # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block).
   if is_lssd:
-    cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}']
+    cmd.flags['local-nvme-ssd-block'] = f'count={_LSSD_COUNT.value}'
 
   # ── GKE kubelet swap config ───────────────────────────────────────────────
   # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
@@ -937,7 +938,7 @@ def _create_benchmark_node_pool(cluster) -> None:
     )
     system_config_tmp.write(kubelet_yaml)
     system_config_tmp.flush()
-    cmd += ['--system-config-from-file', system_config_tmp.name]
+    cmd.flags['system-config-from-file'] = system_config_tmp.name
     logging.info(
         '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)',
         swap_behavior,
@@ -963,9 +964,7 @@ def _create_benchmark_node_pool(cluster) -> None:
   # GKE must also initialise the local NVMe devices before marking nodes Ready.
   # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
   try:
-    stdout, stderr, rc = vm_util.IssueCommand(
-        cmd, timeout=1200, raise_on_failure=False
-    )
+    _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
   finally:
     if system_config_tmp is not None:
       try:
@@ -1102,36 +1101,22 @@ def _attach_swap_disk(cluster) -> None:
       disk_size_gb,
       disk_type,
   )
-  create_cmd = [
-      'gcloud',
-      'compute',
-      'disks',
-      'create',
-      disk_name,
-      '--project',
-      project,
-      '--zone',
-      zone,
-      '--type',
-      disk_type,
-      '--size',
-      f'{disk_size_gb}GB',
-      '--quiet',
-  ]
-  if disk_type.startswith('hyperdisk'):
-    create_cmd += [
-        '--provisioned-iops',
-        str(_BOOT_DISK_IOPS.value),
-        '--provisioned-throughput',
-        str(
-            _valid_hyperdisk_throughput(
-                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
-            )
-        ),
-    ]
-  _, stderr, rc = vm_util.IssueCommand(
-      create_cmd, timeout=120, raise_on_failure=False
+  # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project
+  # and --zone (always zonal — gcloud compute --region creates regional
+  # resources, which is not what we want for a node-attached swap disk).
+  gcp_res = _GcpZonalResource(project, zone)
+  create_cmd = gcp_util.GcloudCommand(
+      gcp_res, 'compute', 'disks', 'create', disk_name
   )
+  create_cmd.flags['type'] = disk_type
+  create_cmd.flags['size'] = f'{disk_size_gb}GB'
+  create_cmd.args.append('--quiet')
+  if disk_type.startswith('hyperdisk'):
+    create_cmd.flags['provisioned-iops'] = _BOOT_DISK_IOPS.value
+    create_cmd.flags['provisioned-throughput'] = _valid_hyperdisk_throughput(
+        _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+    )
+  _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False)
   if rc != 0:
     raise errors.Benchmarks.RunError(
         f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}'
@@ -1141,25 +1126,13 @@ def _attach_swap_disk(cluster) -> None:
   logging.info(
       '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name
   )
-  attach_cmd = [
-      'gcloud',
-      'compute',
-      'instances',
-      'attach-disk',
-      instance_name,
-      '--project',
-      project,
-      '--zone',
-      zone,
-      '--disk',
-      disk_name,
-      '--device-name',
-      'pkb-swap',
-      '--quiet',
-  ]
-  _, stderr, rc = vm_util.IssueCommand(
-      attach_cmd, timeout=120, raise_on_failure=False
+  attach_cmd = gcp_util.GcloudCommand(
+      gcp_res, 'compute', 'instances', 'attach-disk', instance_name
   )
+  attach_cmd.flags['disk'] = disk_name
+  attach_cmd.flags['device-name'] = 'pkb-swap'
+  attach_cmd.args.append('--quiet')
+  _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False)
   if rc != 0:
     raise errors.Benchmarks.RunError(
         f'[swap_encryption] Failed to attach swap disk to {instance_name}: '
@@ -1179,22 +1152,12 @@ def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
   leaked.  Returns True if the disk is gone (deleted or already absent).
   """
   for attempt in range(1, 5):
-    users, _, rc = vm_util.IssueCommand(
-        [
-            'gcloud',
-            'compute',
-            'disks',
-            'describe',
-            disk_name,
-            '--project',
-            project,
-            '--zone',
-            zone,
-            '--format=value(users)',
-        ],
-        timeout=60,
-        raise_on_failure=False,
+    gcp_res = _GcpZonalResource(project, zone)
+    describe_cmd = gcp_util.GcloudCommand(
+        gcp_res, 'compute', 'disks', 'describe', disk_name
     )
+    describe_cmd.flags['format'] = 'value(users)'
+    users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False)
     if rc != 0:
       logging.info(
           '[swap_encryption] Swap disk %s not present — nothing to delete',
@@ -1207,40 +1170,17 @@ def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
       logging.info(
           '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst
       )
-      vm_util.IssueCommand(
-          [
-              'gcloud',
-              'compute',
-              'instances',
-              'detach-disk',
-              inst,
-              '--project',
-              project,
-              '--zone',
-              zone,
-              '--disk',
-              disk_name,
-              '--quiet',
-          ],
-          timeout=120,
-          raise_on_failure=False,
+      detach_cmd = gcp_util.GcloudCommand(
+          gcp_res, 'compute', 'instances', 'detach-disk', inst
       )
-    _, derr, drc = vm_util.IssueCommand(
-        [
-            'gcloud',
-            'compute',
-            'disks',
-            'delete',
-            disk_name,
-            '--project',
-            project,
-            '--zone',
-            zone,
-            '--quiet',
-        ],
-        timeout=180,
-        raise_on_failure=False,
+      detach_cmd.flags['disk'] = disk_name
+      detach_cmd.args.append('--quiet')
+      detach_cmd.Issue(timeout=120, raise_on_failure=False)
+    delete_cmd = gcp_util.GcloudCommand(
+        gcp_res, 'compute', 'disks', 'delete', disk_name
     )
+    delete_cmd.args.append('--quiet')
+    _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
     if drc == 0:
       logging.info('[swap_encryption] Swap disk deleted: %s', disk_name)
       return True
@@ -1281,31 +1221,21 @@ def _delete_default_node_pool(cluster) -> None:
   requirement that a cluster must have at least one nodepool at creation time.
   Removing it stops the clock on its cost immediately.
   """
-  zone_flags: list[str] = []
-  if getattr(cluster, 'zones', None):
-    zone_flags = ['--zone', cluster.zones[0]]
-  elif getattr(cluster, 'region', None):
-    zone_flags = ['--region', cluster.region]
-
-  cmd = [
-      'gcloud',
+  # Use PKB's GcloudCommand: auto-injects --project, --zone/--region.
+  cmd = cluster._GcloudCommand(
       'container',
       'node-pools',
       'delete',
       _DEFAULT_NODEPOOL,
       '--cluster',
       cluster.name,
-      '--project',
-      cluster.project,
-      '--quiet',
-  ] + zone_flags
+  )
+  cmd.args.append('--quiet')
 
   logging.info(
       '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL
   )
-  stdout, stderr, rc = vm_util.IssueCommand(
-      cmd, timeout=300, raise_on_failure=False
-  )
+  _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
   if rc != 0:
     logging.warning(
         '[swap_encryption] Could not delete default nodepool (rc=%d): %s',

From 48da7c4caae74f6d778dcdd98573414d404d90d5 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Thu, 19 Jun 2025 10:46:43 +0000
Subject: [PATCH 05/17] refactor(swap_encryption): use
 kubernetes_commands.ApplyManifest and fix imports

Replace manual temp-file + kubectl apply in _deploy_daemonset() with
PKB's kubernetes_commands.ApplyManifest():

- Remove _daemonset_yaml() helper
- _deploy_daemonset() delegates to kubernetes_commands.ApplyManifest(
    'cluster/swap_encryption_daemonset.yaml.j2', **kwargs)
- Add kubernetes_commands import; remove vm_util import (now unused)
- Fix import order: providers.gcp before resources.container_service
---
 .../swap_encryption_benchmark.py              | 45 ++++++++-----------
 1 file changed, 18 insertions(+), 27 deletions(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index e325220c03..5767f8eb71 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -75,9 +75,9 @@
 from perfkitbenchmarker import configs
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import sample
-from perfkitbenchmarker import vm_util
-from perfkitbenchmarker.resources.container_service import kubectl
 from perfkitbenchmarker.providers.gcp import util as gcp_util
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.resources.container_service import kubernetes_commands
 
 FLAGS = flags.FLAGS
 
@@ -405,25 +405,6 @@ def __init__(self, project: str, zone: str) -> None:
     self.zone = zone
 
 
-def _daemonset_yaml(image: str) -> str:
-  """Render the privileged benchmark DaemonSet manifest.
-
-  The manifest is a PKB data file rendered with Jinja2
-  (data/cluster/swap_encryption_daemonset.yaml.j2) rather than an inline
-  string, per PKB conventions.  The DaemonSet is pinned to the benchmark
-  nodepool via nodeSelector so it never lands on the dummy default pool.
-  """
-  return vm_util.ReadAndRenderJinja2Template(
-      'cluster/swap_encryption_daemonset.yaml.j2',
-      ds_name=_DS_NAME,
-      ds_namespace=_DS_NAMESPACE,
-      ds_label=_DS_LABEL,
-      benchmark_nodepool=_BENCHMARK_NODEPOOL,
-      image=image,
-      kernel_version=_KERNEL_VERSION.value,
-  )
-
-
 def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
   """Load and return benchmark config spec."""
   return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
@@ -665,12 +646,22 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
 
 
 def _deploy_daemonset() -> None:
-  """Apply the benchmark DaemonSet manifest to the cluster."""
-  manifest = _daemonset_yaml(image=_DAEMONSET_IMAGE.value)
-  with vm_util.NamedTemporaryFile(mode='w', suffix='.yaml') as f:
-    f.write(manifest)
-    f.close()
-    kubectl.RunKubectlCommand(['apply', '-f', f.name])
+  """Apply the benchmark DaemonSet manifest to the cluster.
+
+  Uses kubernetes_commands.ApplyManifest which renders the Jinja2 template
+  from perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2,
+  writes it to a temp file, and calls kubectl apply -f — the standard PKB
+  pattern for deploying manifests.
+  """
+  kubernetes_commands.ApplyManifest(
+      'cluster/swap_encryption_daemonset.yaml.j2',
+      ds_name=_DS_NAME,
+      ds_namespace=_DS_NAMESPACE,
+      ds_label=_DS_LABEL,
+      benchmark_nodepool=_BENCHMARK_NODEPOOL,
+      image=_DAEMONSET_IMAGE.value,
+      kernel_version=_KERNEL_VERSION.value,
+  )
   logging.info('[swap_encryption] DaemonSet applied')
 
 

From 3cd49537a59e14aa5e48e93178da7c637fd75c2e Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Thu, 19 Jun 2025 10:46:44 +0000
Subject: [PATCH 06/17] fix(swap_encryption): add linuxConfig.swapConfig to
 system-config and remove cgroup hack

Address Ajay's review comments on PR #6776:

Comment #r3457877984 (linuxConfig.swapConfig):
Extend --system-config-from-file YAML with linuxConfig blocks:
  linuxConfig.swapConfig.enabled: true -- GKE sets up node-level swap
  dedicatedLocalSsdProfile.diskCount: N -- LSSD: use local NVMe for swap
  linuxConfig.sysctl: vm.swappiness=100, vm.min_free_kbytes=200,
    vm.watermark_scale_factor=500
Ref: https://cloud.google.com/kubernetes-engine/docs/how-to/node-memory-swap

Comment #r3457928855 (cgroup hack):
Remove memory.swap.max=max loop from swap_encryption_daemonset.yaml.j2.
With kubeletConfig.memorySwapBehavior=LimitedSwap the kubelet manages
per-container swap allocation; the cgroup hack is unnecessary.
---
 .../cluster/swap_encryption_daemonset.yaml.j2 | 57 ++++---------------
 .../swap_encryption_benchmark.py              | 34 ++++++++++-
 2 files changed, 42 insertions(+), 49 deletions(-)

diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
index c40ec79dff..62b773ccfd 100644
--- a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
+++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
@@ -166,53 +166,15 @@ spec:
             tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
               echo "[pkb] WARNING: kernel source extraction failed" >&2
           fi
-          echo "[pkb] Unlocking container cgroup swap limits..."
-          # GKE cgroup v2 sets memory.swap.max=0 per-container, which
-          # prevents swap usage even when the node has a swap device and
-          # vm.swappiness>0.  Stress-ng gets OOM-killed in ~15s because
-          # the kernel can't page out to swap for this cgroup.
-          #
-          # NOTE: the old approach derived the cgroup path from
-          # /proc/self/cgroup, but inside a cgroup namespace that reports
-          # "0::/" — so the write targeted the host ROOT cgroup, silently
-          # no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
-          # /sys is the host cgroup tree (hostPath mount) and this pod is
-          # privileged, so instead unlock swap across the entire kubepods
-          # hierarchy, which is guaranteed to contain our own container.
-          if [ -d /sys/fs/cgroup/kubepods.slice ] || \
-             [ -d /sys/fs/cgroup/kubepods ]; then
-            # cgroup v2: write 'max' to every memory.swap.max under kubepods*.
-            find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
-              2>/dev/null | while read -r _f; do
-                echo max > "$_f" 2>/dev/null || true
-              done
-          fi
-          # Best-effort: our own namespaced path and the unified root.
-          PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
-            2>/dev/null)
-          for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
-                      /sys/fs/cgroup/memory.swap.max; do
-            [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
-          done
-          # cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
-          find /sys/fs/cgroup/memory -path '*kubepods*' \
-            -name memory.memsw.limit_in_bytes 2>/dev/null \
-            | while read -r _f; do
-                echo -1 > "$_f" 2>/dev/null || true
-              done
-          # Verify and surface the result in the pod log.  grep -L lists
-          # files that do NOT contain 'max' on their first line, i.e. ones
-          # still capping swap.
-          PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
-            -name memory.swap.max 2>/dev/null \
-            | xargs -r grep -L '^max' 2>/dev/null | head -1)
-          if [ -n "$PKB_STILL_CAPPED" ]; then
-            echo "[pkb] WARNING: cgroup swap still capped at \
-            $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
-            OOM-killed before swap is exercised" >&2
-          else
-            echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
-          fi
+          # Container cgroup swap limits are managed by the kubelet when
+          # kubeletConfig.memorySwapBehavior=LimitedSwap is set via
+          # --system-config-from-file (GKE) or kubelet-config.json (EKS).
+          # Manually writing memory.swap.max=max across kubepods is not
+          # required and is superseded by the kubelet swap config.
+          # Reference: Ajay's review comment go/pkb-swap-encryption-pr1
+          # #r3457928855 — https://github.com/GoogleCloudPlatform/
+          # PerfKitBenchmarker/pull/6776#discussion_r3457928855
+          echo "[pkb] Swap limits managed by kubelet (LimitedSwap config)."
           echo "[pkb] Tools installed. Writing ready sentinel."
           touch /tmp/pkb_ready
           sleep infinity
@@ -264,3 +226,4 @@ spec:
         hostPath:
           path: /lib/modules
           type: Directory
+                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
                           
\ No newline at end of file
diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 5767f8eb71..f8076ac4e7 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -923,7 +923,34 @@ def _create_benchmark_node_pool(cluster) -> None:
   swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
   system_config_tmp = None
   if swap_behavior:
-    kubelet_yaml = f'kubeletConfig:\n  memorySwapBehavior: {swap_behavior}\n'
+    # Build system-config YAML for --system-config-from-file.
+    # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
+    #   kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods.
+    #   linuxConfig.swapConfig: GKE enables node-level swap device.
+    #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use
+    #     the local NVMe as the swap device (avoids boot-disk overhead).
+    #   linuxConfig.sysctl: swap aggressiveness tuning so the benchmark
+    #     workloads can drive sustained swap I/O.
+    # Reference:
+    #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
+    #   node-memory-swap#enable
+    if is_lssd:
+      swap_config_block = (
+          '  swapConfig:\n'
+          '    enabled: true\n'
+          '    dedicatedLocalSsdProfile:\n'
+          f'      diskCount: {_LSSD_COUNT.value}\n'
+      )
+    else:
+      swap_config_block = '  swapConfig:\n    enabled: true\n'
+    kubelet_yaml = (
+        f'kubeletConfig:\n  memorySwapBehavior: {swap_behavior}\nlinuxConfig:\n'
+        + swap_config_block
+        + '  sysctl:\n'
+        '    vm.min_free_kbytes: 200\n'
+        '    vm.watermark_scale_factor: 500\n'
+        '    vm.swappiness: 100\n'
+    )
     system_config_tmp = tempfile.NamedTemporaryFile(
         mode='w', suffix='.yaml', delete=False
     )
@@ -931,9 +958,12 @@ def _create_benchmark_node_pool(cluster) -> None:
     system_config_tmp.flush()
     cmd.flags['system-config-from-file'] = system_config_tmp.name
     logging.info(
-        '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)',
+        '[swap_encryption] system-config-from-file: '
+        'kubelet_swap=%s lssd=%s (written to %s):\n%s',
         swap_behavior,
+        is_lssd,
         system_config_tmp.name,
+        kubelet_yaml,
     )
 
   logging.info(

From 122505240178962daf942fb237da6ddd9951348a Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Wed, 24 Jun 2026 13:13:52 +0530
Subject: [PATCH 07/17] fix(swap_encryption): lean DaemonSet + Phase 1 fio
 microbenchmarks

---
 .../cluster/swap_encryption_daemonset.yaml.j2 |  169 +-
 .../swap_encryption_benchmark.py              | 3031 +++++++++--------
 2 files changed, 1672 insertions(+), 1528 deletions(-)

diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
index 62b773ccfd..29cacfb3ce 100644
--- a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
+++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
@@ -28,154 +28,46 @@ spec:
         - bash
         - -c
         - |
-          echo "[pkb] Installing benchmark tools..."
-          # Retry apt-get up to 3 times — transient network failures are
-          # common on a freshly-started GKE node.  Critical tools (fio,
-          # stress-ng) must be present before we write the ready sentinel;
-          # a silent || true here would cause /tmp/pkb_ready to appear even
-          # when tools are missing, breaking all subsequent phases.
+          echo "[pkb] Installing measurement tools..."
+          # Only the tools needed for Phase 1 (raw-device fio) and Phase 2
+          # (CPU/I/O overhead) are installed here.  Workload benchmarks
+          # (redis, opensearch, kernel-build) run in separate pods via
+          # existing PKB benchmark modules and are NOT installed here.
           PKB_APT_OK=0
           for _attempt in 1 2 3; do
             apt-get update -qq 2>&1 || true
-            DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\
-              fio \\
-              stress-ng \\
-              sysstat \\
-              cryptsetup \\
-              mdadm \\
-              redis-server \\
-              redis-tools \\
-              git \\
-              wget \\
-              curl \\
-              make \\
-              gcc \\
-              bc \\
-              flex \\
-              bison \\
-              libelf-dev \\
-              libssl-dev \\
-              cgroup-tools \\
-              nvme-cli \\
-              util-linux \\
-              python3-pip \\
-              libevent-dev \\
-              libssl-dev \\
-              libpcre3-dev \\
-              zlib1g-dev \\
-              build-essential \\
-              autoconf \\
-              automake \\
-              libtool \\
-              libtool-bin \\
-              pkg-config \\
-              python3-dev \\
-              default-jre-headless \\
+            DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
+              fio \
+              cryptsetup \
+              mdadm \
+              sysstat \
+              nvme-cli \
               2>&1 && PKB_APT_OK=1 && break
             echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
             sleep 15
           done
-          if [ "$PKB_APT_OK" != "1" ] || \\
-             ! command -v fio >/dev/null 2>&1 || \\
-             ! command -v stress-ng >/dev/null 2>&1; then
-            echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
+          if [ "$PKB_APT_OK" != "1" ] || ! command -v fio >/dev/null 2>&1; then
+            echo "[pkb] FATAL: fio not installed after 3 attempts" >&2
             exit 1
           fi
-          echo "[pkb] Installing memtier_benchmark from source..."
-          # Pin a stable release tag — building from the moving default
-          # branch (HEAD) intermittently broke (memtier_benchmark not found
-          # → Phase 3a lost its P50/P90/P99 latency).  2.2.1 matches the
-          # version PKB's memtier package (memtier.MemtierResult.Parse) is
-          # validated against and builds cleanly with the apt deps above.
-          # Fall back to HEAD only if the tagged clone fails.
-          if ! command -v memtier_benchmark >/dev/null 2>&1; then
-            (cd /tmp && \\
-              rm -rf memtier_benchmark && \\
-              ( git clone --depth 1 --branch 2.2.1 \\
-                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\
-                git clone --depth 1 \\
-                  https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\
-              cd memtier_benchmark && \\
-              autoreconf -ivf 2>&1 && \\
-              ./configure 2>&1 && \\
-              make -j$(nproc) 2>&1 && \\
-              make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\
-              echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
-          fi
-          if command -v memtier_benchmark >/dev/null 2>&1; then
-            echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
-          fi
-          echo "[pkb] Installing esrally (lightweight)..."
-          python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
-          pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
-            pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
-            echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
-          if command -v esrally >/dev/null 2>&1; then
-            echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
-          else
-            echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
-          fi
-          echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
-          # Phase 3c needs a real search server on :9200.  Nothing in apt
-          # ships one and the pod has no systemd, so install the OpenSearch
-          # bundle (ships its own JDK) and launch the binary directly in the
-          # phase.  All best-effort: if any step fails the phase probes the
-          # endpoint and skips cleanly rather than recording fake timings.
-          if [ ! -x /opt/opensearch/bin/opensearch ]; then
-            OS_VER=2.15.0
-            (cd /opt && \\
-              wget -q --timeout=600 -O os.tgz \\
-                "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\
-              tar -xzf os.tgz && rm -f os.tgz && \\
-              mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\
-              echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
-          fi
-          if [ -x /opt/opensearch/bin/opensearch ]; then
-            # pkbos owns and runs OpenSearch (it refuses to run as root).
-            # Give it a home so HOME/temp paths are writable.
-            id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
-            printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\
-              > /opt/opensearch/config/opensearch.yml
-            mkdir -p /opt/opensearch/config/jvm.options.d
-            # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
-            # On a 252 GB node this still leaves plenty of page cache to
-            # pressure into swap during the phase.
-            printf -- '-Xms2g\\n-Xmx2g\\n' \\
-              > /opt/opensearch/config/jvm.options.d/pkb-heap.options
-            sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
-            # CRITICAL: never run the binary as root here (it bails and
-            # leaves root-owned files in logs/ that block the pkbos server).
-            # Clear any stale logs and chown everything to pkbos LAST.
-            rm -f /opt/opensearch/logs/* 2>/dev/null || true
-            chown -R pkbos /opt/opensearch 2>/dev/null || true
-            echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
-          fi
-          echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
-          PKB_KVER="{{ kernel_version }}"
-          PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
-          PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
-          PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
-          PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
-          mkdir -p "$PKB_KROOT"
-          if [ ! -f "$PKB_KTARBALL" ]; then
-            wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\
-              echo "[pkb] WARNING: kernel tarball download failed" >&2
-          fi
-          if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
-            echo "[pkb] Extracting kernel source (xz)..."
-            tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
-              echo "[pkb] WARNING: kernel source extraction failed" >&2
+          echo "[pkb] fio installed: $(fio --version 2>&1 | head -1)"
+          echo "[pkb] Verifying swap device is active..."
+          PKB_SWAP_FOUND=0
+          for _attempt in $(seq 1 30); do
+            if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then
+              PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps)
+              echo "[pkb] Swap device active: $PKB_SWAP_DEV"
+              PKB_SWAP_FOUND=1
+              break
+            fi
+            echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2
+            sleep 5
+          done
+          if [ "$PKB_SWAP_FOUND" != "1" ]; then
+            echo "[pkb] WARNING: no active swap device after 150s — " \
+                 "check linuxConfig.swapConfig / kubelet swap config." >&2
           fi
-          # Container cgroup swap limits are managed by the kubelet when
-          # kubeletConfig.memorySwapBehavior=LimitedSwap is set via
-          # --system-config-from-file (GKE) or kubelet-config.json (EKS).
-          # Manually writing memory.swap.max=max across kubepods is not
-          # required and is superseded by the kubelet swap config.
-          # Reference: Ajay's review comment go/pkb-swap-encryption-pr1
-          # #r3457928855 — https://github.com/GoogleCloudPlatform/
-          # PerfKitBenchmarker/pull/6776#discussion_r3457928855
-          echo "[pkb] Swap limits managed by kubelet (LimitedSwap config)."
-          echo "[pkb] Tools installed. Writing ready sentinel."
+          echo "[pkb] Measurement tools ready. Writing ready sentinel."
           touch /tmp/pkb_ready
           sleep infinity
         securityContext:
@@ -226,4 +118,3 @@ spec:
         hostPath:
           path: /lib/modules
           type: Directory
-                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               
                           
\ No newline at end of file
diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index f8076ac4e7..e596abf963 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -63,6 +63,7 @@
     Bulk-index + search query under swap pressure (esrally or curl).
 """
 
+import json
 import logging
 import os
 import tempfile
@@ -87,7 +88,7 @@
 # Benchmark identity
 # ---------------------------------------------------------------------------
 
-BENCHMARK_NAME = 'swap_encryption'
+BENCHMARK_NAME = "swap_encryption"
 
 
 BENCHMARK_CONFIG = """
@@ -117,261 +118,254 @@
 
 
 _DAEMONSET_IMAGE = flags.DEFINE_string(
-    'swap_encryption_daemonset_image',
-    'ubuntu:22.04',
-    'Container image used for the privileged benchmark DaemonSet pod.',
+    "swap_encryption_daemonset_image",
+    "ubuntu:22.04",
+    "Container image used for the privileged benchmark DaemonSet pod.",
 )
 
 
 _NODEPOOL = flags.DEFINE_string(
-    'swap_encryption_nodepool',
-    'benchmark',
-    'Name of the node pool to deploy the benchmark DaemonSet on.',
+    "swap_encryption_nodepool",
+    "benchmark",
+    "Name of the node pool to deploy the benchmark DaemonSet on.",
 )
 
 
 _INSTANCE_SIZE_LABEL = flags.DEFINE_string(
-    'swap_encryption_instance_size_label',
-    '',
-    'Human-readable label for the current instance size being tested, e.g. '
+    "swap_encryption_instance_size_label",
+    "",
+    "Human-readable label for the current instance size being tested, e.g. "
     '"n4-highmem-32" or "i4i.4xlarge".  Stored in sample metadata so that '
-    'results from multiple PKB runs across different instance sizes can be '
-    'collated and compared.  Defaults to the value reported by the cloud '
-    'metadata endpoint inside the pod.',
+    "results from multiple PKB runs across different instance sizes can be "
+    "collated and compared.  Defaults to the value reported by the cloud "
+    "metadata endpoint inside the pod.",
 )
 
 
 _COLLECT_COST = flags.DEFINE_boolean(
-    'swap_encryption_collect_cost',
+    "swap_encryption_collect_cost",
     False,
-    'When True, emit a cost_estimate_usd sample using on-demand pricing '
-    'for the instance type detected at runtime.',
+    "When True, emit a cost_estimate_usd sample using on-demand pricing "
+    "for the instance type detected at runtime.",
 )
 
 
 _FAIL_ON_DEGRADED = flags.DEFINE_boolean(
-    'swap_encryption_fail_on_degraded',
+    "swap_encryption_fail_on_degraded",
     True,
-    'When True (default), raise an error at the end of Run() if the run was '
-    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
-    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
-    'swap-pressure phase was OOM-killed before completing.  This prevents PKB '
-    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
-    'empty or meaningless data.  Set False to keep the legacy behaviour of '
-    'always returning whatever partial samples were collected.',
+    "When True (default), raise an error at the end of Run() if the run was "
+    "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and "
+    "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng "
+    "swap-pressure phase was OOM-killed before completing.  This prevents PKB "
+    "from reporting SUCCEEDED for a run whose post-eviction phases produced "
+    "empty or meaningless data.  Set False to keep the legacy behaviour of "
+    "always returning whatever partial samples were collected.",
 )
 
 
 _PHASES = flags.DEFINE_list(
-    'swap_encryption_phases',
-    ['all'],
-    'Which Run() phases to execute, for fast iteration against an '
-    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  '
-    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
-    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
+    "swap_encryption_phases",
+    ["all"],
+    "Which Run() phases to execute, for fast iteration against an "
+    "already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  "
+    "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng "
+    "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), "
     '3b (kernel build), 3c (opensearch).  Default "all" runs everything.  '
-    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
-    'Phases not listed are skipped and do not affect the degraded-run gate '
+    "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. "
+    "Phases not listed are skipped and do not affect the degraded-run gate "
     '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
 )
 
 
 _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
-    'swap_encryption_benchmark_machine_type',
-    'n4-highmem-32',
-    'Machine type for the benchmark nodepool created in Prepare(). '
-    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
-    '(LSSD RAID-0).  The matching swap setup is selected automatically.',
+    "swap_encryption_benchmark_machine_type",
+    "n4-highmem-32",
+    "Machine type for the benchmark nodepool created in Prepare(). "
+    "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd "
+    "(LSSD RAID-0).  The matching swap setup is selected automatically.",
 )
 
 
 _BENCHMARK_LSSD = flags.DEFINE_boolean(
-    'swap_encryption_lssd',
+    "swap_encryption_lssd",
     False,
-    'Force LSSD RAID-0 swap path even when the machine type name does not '
+    "Force LSSD RAID-0 swap path even when the machine type name does not "
     'contain "lssd".  Auto-detected from machine type when False.',
 )
 
 
 _LSSD_COUNT = flags.DEFINE_integer(
-    'swap_encryption_lssd_count',
+    "swap_encryption_lssd_count",
     1,
-    'Number of local NVMe SSDs to attach as raw block devices '
-    '(--local-nvme-ssd-block count=N).  Must match the fixed local SSD '
-    'count for the chosen machine type: c4-standard-8-lssd=1, '
-    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  '
-    'Default 1 covers most single-lssd machine types.',
+    "Number of local NVMe SSDs to attach as raw block devices "
+    "(--local-nvme-ssd-block count=N).  Must match the fixed local SSD "
+    "count for the chosen machine type: c4-standard-8-lssd=1, "
+    "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  "
+    "Default 1 covers most single-lssd machine types.",
 )
 
 
 _NODE_IMAGE_TYPE = flags.DEFINE_string(
-    'swap_encryption_node_image_type',
-    'UBUNTU_CONTAINERD',
-    'GKE node image type for the benchmark nodepool.  '
-    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
-    'down device-mapper at the kernel LSM level and cryptsetup hangs '
-    'indefinitely from any pod context (even privileged, even via nsenter '
-    'into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup '
-    'from privileged pods without restriction.  '
-    'Use COS_CONTAINERD only when dm-crypt is disabled '
-    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  '
-    'AL2 on EKS.',
+    "swap_encryption_node_image_type",
+    "UBUNTU_CONTAINERD",
+    "GKE node image type for the benchmark nodepool.  "
+    "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks "
+    "down device-mapper at the kernel LSM level and cryptsetup hangs "
+    "indefinitely from any pod context (even privileged, even via nsenter "
+    "into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup "
+    "from privileged pods without restriction.  "
+    "Use COS_CONTAINERD only when dm-crypt is disabled "
+    "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  "
+    "AL2 on EKS.",
 )
 
 
 _BOOT_DISK_TYPE = flags.DEFINE_string(
-    'swap_encryption_boot_disk_type',
-    'hyperdisk-balanced',
-    'Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced '
-    'for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 '
-    'dev/test machines, which do not support hyperdisk-balanced.',
+    "swap_encryption_boot_disk_type",
+    "hyperdisk-balanced",
+    "Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced "
+    "for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 "
+    "dev/test machines, which do not support hyperdisk-balanced.",
 )
 
 
 _BOOT_DISK_IOPS = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_iops',
+    "swap_encryption_boot_disk_iops",
     80000,
-    'Provisioned IOPS for the boot disk (hyperdisk-balanced only).  '
-    '80 000 is the COS max-IOPS target.  Ignored for pd-ssd.',
+    "Provisioned IOPS for the boot disk (hyperdisk-balanced only).  "
+    "80 000 is the COS max-IOPS target.  Ignored for pd-ssd.",
 )
 
 
 _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_throughput',
+    "swap_encryption_boot_disk_throughput",
     1200,
-    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
-    'only).  Must be set together with iops.  1200 MB/s pairs with 80 000 '
-    'IOPS for production; use 140 (minimum) for dev/test.  Ignored for '
-    'pd-ssd.',
+    "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced "
+    "only).  Must be set together with iops.  1200 MB/s pairs with 80 000 "
+    "IOPS for production; use 140 (minimum) for dev/test.  Ignored for "
+    "pd-ssd.",
 )
 
 
 _BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_size_gb',
+    "swap_encryption_boot_disk_size_gb",
     500,
-    'Boot disk size in GiB for the benchmark nodepool.  500 GiB is '
-    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
-    '(see Engineer Assignments table in execution-plan.md).  '
-    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
+    "Boot disk size in GiB for the benchmark nodepool.  500 GiB is "
+    "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run "
+    "(see Engineer Assignments table in execution-plan.md).  "
+    "For LSSD configs the boot disk is smaller; 100 GiB is fine.",
 )
 
 
 _ADD_SWAP_DISK = flags.DEFINE_boolean(
-    'swap_encryption_add_swap_disk',
+    "swap_encryption_add_swap_disk",
     False,
-    'Attach a dedicated second disk to the benchmark nodepool for use as '
-    'the swap device.  Required for dm-crypt measurement on single-boot-disk '
-    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
-    'from pod namespaces.  The second disk is provisioned via '
-    '--additional-node-disk using the same type/IOPS/throughput as the boot '
-    'disk flags.',
+    "Attach a dedicated second disk to the benchmark nodepool for use as "
+    "the swap device.  Required for dm-crypt measurement on single-boot-disk "
+    "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper "
+    "from pod namespaces.  The second disk is provisioned via "
+    "--additional-node-disk using the same type/IOPS/throughput as the boot "
+    "disk flags.",
 )
 
 
 _SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
-    'swap_encryption_swap_disk_size_gb',
+    "swap_encryption_swap_disk_size_gb",
     500,
-    'Size in GiB of the dedicated swap disk when '
-    '--swap_encryption_add_swap_disk is True.  Must satisfy the '
-    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
+    "Size in GiB of the dedicated swap disk when "
+    "--swap_encryption_add_swap_disk is True.  Must satisfy the "
+    "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.",
 )
 
 _ENABLE_DMCRYPT = flags.DEFINE_boolean(
-    'swap_encryption_enable_dmcrypt',
+    "swap_encryption_enable_dmcrypt",
     True,
-    'When True (default), wrap the swap device in dm-crypt plain mode '
+    "When True (default), wrap the swap device in dm-crypt plain mode "
     "(aes-xts-plain64, ephemeral random key) matching GKE's "
-    'go/node:swap-encryption implementation.  Set False to measure plain '
-    '(unencrypted) swap overhead as a baseline.',
+    "go/node:swap-encryption implementation.  Set False to measure plain "
+    "(unencrypted) swap overhead as a baseline.",
 )
 
 _GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string(
-    'swap_encryption_gke_kubelet_memory_swap',
-    'LimitedSwap',
-    'Value for kubeletConfig.memorySwapBehavior injected via '
-    '--system-config-from-file when creating the GKE benchmark nodepool.  '
-    'LimitedSwap (default) — the kubelet allows pods to use swap up to their '
-    'memory limit; required for the DaemonSet pod to drive kernel swapping.  '
-    'NoSwap — disables swap at the kubelet level (use for a baseline run that '
-    'confirms zero swap activity).  Set empty string to omit the flag entirely '
-    'and rely on the cluster-level default.',
+    "swap_encryption_gke_kubelet_memory_swap",
+    "LimitedSwap",
+    "Value for kubeletConfig.memorySwapBehavior injected via "
+    "--system-config-from-file when creating the GKE benchmark nodepool.  "
+    "LimitedSwap (default) — the kubelet allows pods to use swap up to their "
+    "memory limit; required for the DaemonSet pod to drive kernel swapping.  "
+    "NoSwap — disables swap at the kubelet level (use for a baseline run that "
+    "confirms zero swap activity).  Set empty string to omit the flag entirely "
+    "and rely on the cluster-level default.",
 )
 
 _SWAP_DEVICE = flags.DEFINE_string(
-    'swap_encryption_device',
-    '',
-    'Explicit block device path to use as the swap device, e.g. '
-    '/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), '
-    'the device is auto-detected from /proc/swaps inside the benchmark pod.',
+    "swap_encryption_device",
+    "",
+    "Explicit block device path to use as the swap device, e.g. "
+    "/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), "
+    "the device is auto-detected from /proc/swaps inside the benchmark pod.",
 )
 
 _SWAP_TYPE = flags.DEFINE_string(
-    'swap_encryption_swap_type',
-    'hyperdisk',
-    'Storage target for the swap device.  One of: hyperdisk (default), '
-    'lssd, instance_store, io2.',
-)
-
-_KERNEL_VERSION = flags.DEFINE_string(
-    'swap_encryption_kernel_version',
-    '',
-    'Kernel version string to embed in the DaemonSet pod spec as a label.  '
-    'When empty (default) the version is not pinned.',
+    "swap_encryption_swap_type",
+    "hyperdisk",
+    "Storage target for the swap device.  One of: hyperdisk (default), "
+    "lssd, instance_store, io2.",
 )
 
 _ENABLE_ZSWAP = flags.DEFINE_boolean(
-    'swap_encryption_enable_zswap',
+    "swap_encryption_enable_zswap",
     False,
-    'When True, enable zswap compressed swap cache on the benchmark node.',
+    "When True, enable zswap compressed swap cache on the benchmark node.",
 )
 
 _MIN_FREE_KBYTES = flags.DEFINE_integer(
-    'swap_encryption_min_free_kbytes',
+    "swap_encryption_min_free_kbytes",
     0,
-    'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. '
-    '0 (default) leaves the kernel default unchanged.',
+    "Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. "
+    "0 (default) leaves the kernel default unchanged.",
 )
 
 _FIO_RUNTIME_SEC = flags.DEFINE_integer(
-    'swap_encryption_fio_runtime_sec',
+    "swap_encryption_fio_runtime_sec",
     60,
-    'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.',
+    "Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.",
 )
 
 _STRESS_VM_BYTES = flags.DEFINE_string(
-    'swap_encryption_stress_vm_bytes',
-    '28G',
-    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  '
-    'Should exceed available node RAM to force sustained paging.',
+    "swap_encryption_stress_vm_bytes",
+    "28G",
+    "stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  "
+    "Should exceed available node RAM to force sustained paging.",
 )
 
 _STRESS_VM_BYTES_LIST = flags.DEFINE_list(
-    'swap_encryption_stress_vm_bytes_list',
+    "swap_encryption_stress_vm_bytes_list",
     [],
-    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
+    "Comma-separated list of --vm-bytes values to sweep in Phase 2a, "
     'e.g. "14G,28G,56G".  Overrides --swap_encryption_stress_vm_bytes.',
 )
 
 _STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
-    'swap_encryption_stress_timeout_sec',
+    "swap_encryption_stress_timeout_sec",
     300,
-    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
+    "Maximum seconds to wait for the stress-ng swap-pressure phase.",
 )
 
-_DS_NAME = 'pkb-swap-benchmark'
-_DS_NAMESPACE = 'default'
-_DS_LABEL = 'pkb-swap-benchmark'
+_DS_NAME = "pkb-swap-benchmark"
+_DS_NAMESPACE = "default"
+_DS_LABEL = "pkb-swap-benchmark"
 
 # Transient kubectl errors that are safe to retry.
-_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close')
+_TRANSIENT_KUBECTL_ERRORS = ("connection reset by peer", "websocket: close")
 
 # Errors indicating the container/pod is gone and needs recovery.
 _CONTAINER_GONE_KUBECTL_ERRORS = (
-    'container not found',
-    'procReady not received',
-    'unable to upgrade connection',
-    'not found',
-    'deleted state',
+    "container not found",
+    "procReady not received",
+    "unable to upgrade connection",
+    "not found",
+    "deleted state",
 )
 
 _active_pod: list[str] = []  # single-element list so closures can mutate it
@@ -385,241 +379,267 @@
 
 _oom_events: list[str] = []
 
-_BENCHMARK_NODEPOOL = 'benchmark'
-_DEFAULT_NODEPOOL = 'default-pool'
+_BENCHMARK_NODEPOOL = "benchmark"
+_DEFAULT_NODEPOOL = "default-pool"
 
 
 class _GcpZonalResource:
-  """Minimal resource shim for gcp_util.GcloudCommand on compute operations.
+    """Minimal resource shim for gcp_util.GcloudCommand on compute operations.
 
-  gcp_util.GcloudCommand auto-injects --project and --zone from the resource
-  object passed to it.  GkeCluster._GcloudCommand() handles container/*
-  operations correctly but also switches --zone → --region for multi-zone
-  clusters, which is wrong for gcloud compute commands (--region creates
-  regional resources, not zonal ones).  This shim pins a single zone so all
-  gcloud compute calls target the correct AZ.
-  """
+    gcp_util.GcloudCommand auto-injects --project and --zone from the resource
+    object passed to it.  GkeCluster._GcloudCommand() handles container/*
+    operations correctly but also switches --zone → --region for multi-zone
+    clusters, which is wrong for gcloud compute commands (--region creates
+    regional resources, not zonal ones).  This shim pins a single zone so all
+    gcloud compute calls target the correct AZ.
+    """
 
-  def __init__(self, project: str, zone: str) -> None:
-    self.project = project
-    self.zone = zone
+    def __init__(self, project: str, zone: str) -> None:
+        self.project = project
+        self.zone = zone
 
 
 def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
-  """Load and return benchmark config spec."""
-  return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+    """Load and return benchmark config spec."""
+    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
 
 
 def Prepare(spec: _BenchmarkSpec) -> None:
-  """Two-step nodepool setup then DaemonSet deployment.
-
-  Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap
-  e2-medium default nodepool.
-
-  Step 2 (this function):
-    a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with
-       COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures
-       dm-crypt swap at the OS level — before any pod is scheduled.
-    b. Delete the dummy default nodepool to stop its cost immediately.
-    c. Deploy the privileged DaemonSet (pinned via nodeSelector to the
-       benchmark nodepool) and wait for tools to install.
-  """
-  cluster = spec.container_cluster
-
-  # ── Step 2a: add real benchmark nodepool ────────────────────────────────
-  if not getattr(cluster, 'project', None):
-    # Guard: AWS / EKS path — nodepool management is external.
-    # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet
-    # nodeSelector (pkb_nodepool=benchmark) before deploying the pod.
-    logging.info(
-        '[swap_encryption] EKS cluster — labelling existing nodes with '
-        'pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
-        _BENCHMARK_NODEPOOL,
-    )
-    kubectl.RunKubectlCommand([
-        'label',
-        'nodes',
-        '--all',
-        '--overwrite',
-        f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-    ])
-    # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
-    # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
-    _ensure_io2_volume()
-  else:
-    # GCP path: true two-step nodepool setup.
-    logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
-    _create_benchmark_node_pool(cluster)
-
-    # ── Step 2b: wait for the benchmark node to join and be Ready ─────────
-    logging.info('[swap_encryption] Step 2b: waiting for benchmark node')
-    _wait_for_benchmark_node()
-
-    # ── Step 2b2: attach dedicated swap disk (if requested) ───────────────
-    if _ADD_SWAP_DISK.value:
-      logging.info('[swap_encryption] Step 2b2: attaching dedicated swap disk')
-      _attach_swap_disk(cluster)
-
-  # ── Step 2c: deploy DaemonSet ────────────────────────────────────────────
-  # Deploy and wait for the pod BEFORE deleting the default nodepool.
-  # Deleting the default pool while the benchmark node is still joining causes
-  # a temporary API server i/o timeout (control plane busy with two nodepool
-  # ops simultaneously).  Once the pod is Running the cluster is fully stable.
-  logging.info('[swap_encryption] Step 2c: deploying privileged DaemonSet')
-  _deploy_daemonset()
-
-  pod = _wait_for_benchmark_pod()
-  logging.info('[swap_encryption] Benchmark pod ready: %s', pod)
-
-  # ── Step 2d: now safe to remove the dummy default nodepool ───────────────
-  if getattr(cluster, 'project', None):
-    logging.info('[swap_encryption] Step 2d: deleting dummy default nodepool')
-    _delete_default_node_pool(cluster)
-    # The DaemonSet pod may be evicted and rescheduled with a new name during
-    # the nodepool deletion (cluster control plane briefly interrupts pod
-    # lifecycle).  Re-resolve the pod name to avoid stale-reference errors on
-    # all subsequent _pod_exec calls.
-    logging.info(
-        '[swap_encryption] Step 2d: re-resolving benchmark pod '
-        'after nodepool deletion'
-    )
+    """Two-step nodepool setup then DaemonSet deployment.
+
+    Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap
+    e2-medium default nodepool.
+
+    Step 2 (this function):
+      a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with
+         COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures
+         dm-crypt swap at the OS level — before any pod is scheduled.
+      b. Deploy the privileged DaemonSet (pinned via nodeSelector to the
+         benchmark nodepool) and wait for tools to install.
+      c. Finally, delete the dummy default nodepool to stop its cost.
+    """
+    cluster = spec.container_cluster
+
+    # ── Step 2a: add real benchmark nodepool ────────────────────────────────
+    if not getattr(cluster, "project", None):
+        # Guard: AWS / EKS path — nodepool management is external.
+        # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet
+        # nodeSelector (pkb_nodepool=benchmark) before deploying the pod.
+        logging.info(
+            "[swap_encryption] EKS cluster — labelling existing nodes with "
+            "pkb_nodepool=%s so the DaemonSet nodeSelector matches.",
+            _BENCHMARK_NODEPOOL,
+        )
+        kubectl.RunKubectlCommand([
+            "label",
+            "nodes",
+            "--all",
+            "--overwrite",
+            f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
+        ])
+        # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
+        # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
+        _ensure_io2_volume()
+    else:
+        # GCP path: true two-step nodepool setup.
+        logging.info("[swap_encryption] Step 2a: creating benchmark nodepool")
+        _create_benchmark_node_pool(cluster)
+
+        # ── Step 2b: wait for the benchmark node to join and be Ready ─────────
+        logging.info("[swap_encryption] Step 2b: waiting for benchmark node")
+        _wait_for_benchmark_node()
+
+        # ── Step 2b2: attach dedicated swap disk (if requested) ───────────────
+        if _ADD_SWAP_DISK.value:
+            logging.info(
+                "[swap_encryption] Step 2b2: attaching dedicated swap disk"
+            )
+            _attach_swap_disk(cluster)
+
+    # ── Step 2c: deploy DaemonSet ────────────────────────────────────────────
+    # Deploy and wait for the pod BEFORE deleting the default nodepool.
+    # Deleting the default pool while the benchmark node is still joining causes
+    # a temporary API server i/o timeout (control plane busy with two nodepool
+    # ops simultaneously).  Once the pod is Running the cluster is fully stable.
+    logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet")
+    _deploy_daemonset()
+
     pod = _wait_for_benchmark_pod()
-    logging.info('[swap_encryption] Benchmark pod (post-deletion): %s', pod)
+    logging.info("[swap_encryption] Benchmark pod ready: %s", pod)
+
+    # ── Step 2d: now safe to remove the dummy default nodepool ───────────────
+    if getattr(cluster, "project", None):
+        logging.info(
+            "[swap_encryption] Step 2d: deleting dummy default nodepool"
+        )
+        _delete_default_node_pool(cluster)
+        # The DaemonSet pod may be evicted and rescheduled with a new name during
+        # the nodepool deletion (cluster control plane briefly interrupts pod
+        # lifecycle).  Re-resolve the pod name to avoid stale-reference errors on
+        # all subsequent _pod_exec calls.
+        logging.info(
+            "[swap_encryption] Step 2d: re-resolving benchmark pod "
+            "after nodepool deletion"
+        )
+        pod = _wait_for_benchmark_pod()
+        logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod)
 
 
 def _phase_selected(token: str) -> bool:
-  """Return True if phase `token` should run given --swap_encryption_phases.
+    """Return True if phase `token` should run given --swap_encryption_phases.
 
-  'all' (the default) selects every phase.  Otherwise only the comma-separated
-  tokens listed in the flag run.  Tokens: fio, 2a, 2b, 3a, 3b, 3c.
-  """
-  selected = [p.strip().lower() for p in _PHASES.value if p.strip()]
-  return (not selected) or ('all' in selected) or (token.lower() in selected)
+    'all' (the default) selects every phase.  Otherwise only the comma-separated
+    tokens listed in the flag run.  Tokens: fio, 2a, 2b, 3a, 3b, 3c.
+    """
+    selected = [p.strip().lower() for p in _PHASES.value if p.strip()]
+    return (not selected) or ("all" in selected) or (token.lower() in selected)
 
 
 def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
-  """Execute all benchmark phases with gate logic.
+    """Execute all benchmark phases with gate logic.
 
-  Execution is structured in three gated tiers matching the execution plan:
+    Execution is structured in three gated tiers matching the execution plan:
 
-    Tier 1 (Gate 1) — fio microbenchmarks
-      Raw I/O ceiling of the swap device.  Gate 1 fails if fio produces
-      zero samples (device not found, O_DIRECT error, etc.).
+      Tier 1 (Gate 1) — fio microbenchmarks
+        Raw I/O ceiling of the swap device.  Gate 1 fails if fio produces
+        zero samples (device not found, O_DIRECT error, etc.).
 
-    Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference
-      Requires an active swap device (Gate 1 must pass).  Gate 2 fails if
-      stress-ng does not complete within timeout.
+      Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference
+        Requires an active swap device (Gate 1 must pass).  Gate 2 fails if
+        stress-ng does not complete within timeout.
 
-    Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch)
-      Independent of Tier 2 results; always attempted if Gate 1 passed.
-      Individual workload failures are logged but do not abort the others.
+      Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch)
+        Independent of Tier 2 results; always attempted if Gate 1 passed.
+        Individual workload failures are logged but do not abort the others.
 
-  If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring
-  application-level swap performance when the raw device is inaccessible.
-  """
-  pod = _wait_for_benchmark_pod()
-  if pod is None:
-    raise errors.Benchmarks.RunError(
-        '[swap_encryption] Benchmark pod never became ready.'
-    )
-  # Initialise the module-level active-pod tracker so _pod_exec and
-  # _recover_pod can transparently redirect to a replacement pod if the
-  # original is evicted during the run.
-  _active_pod.clear()
-  _active_pod.append(pod)
-  _degraded_reasons.clear()
-  _pod_lost.clear()
-  _oom_events.clear()
-  original_pod = pod
-  swap_dev = _detect_swap_device(pod)
-  base_meta = _build_metadata(pod, swap_dev)
-  results: list[sample.Sample] = []
-  t_run_start = time.time()
-
-  logging.info('[swap_encryption] swap device: %s', swap_dev)
-
-  # ── Cost estimate ─────────────────────────────────────────────────────────
-  if _COLLECT_COST.value:
-    elapsed = time.time() - t_run_start
-    results += _collect_cost_sample(pod, elapsed, base_meta)
-
-  # ── Final degradation gate ────────────────────────────────────────────────
-  # The phase try/except blocks above keep the run alive so partial data is
-  # still collected, but that means a catastrophic failure (pod OOM-evicted
-  # mid-run, no fio data, stress-ng killed before it could drive swap I/O)
-  # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics.
-  # Detect those conditions here and surface them explicitly.
-  if _active_pod and _active_pod[0] != original_pod:
-    _degraded_reasons.append(
-        'benchmark pod was replaced during the run '
-        f'({original_pod} → {_active_pod[0]}) — it was OOM-evicted under swap '
-        'pressure; phases executed after the eviction ran against a '
-        'freshly-initialised pod (empty /tmp, swap re-setup) and may be '
-        'invalid'
-    )
-  if _pod_lost:
-    _degraded_reasons.append(
-        'benchmark pod(s) went NotFound during the run'
-        f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure'
-        ' eviction or container exit) and any phase running at or after that'
-        ' point (e.g. kernel-build baseline, OpenSearch) produced invalid data'
-    )
-  if _oom_events:
-    _degraded_reasons.append(
-        'OOM kill(s) (rc=137) occurred during the run on pod(s) '
-        f'{", ".join(_oom_events)} — a phase exceeded memory and was killed by '
-        'the OOM killer (the container may have restarted in place), so the '
-        'affected phase(s) produced no or partial data'
-    )
+    If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring
+    application-level swap performance when the raw device is inaccessible.
+    """
+    pod = _wait_for_benchmark_pod()
+    if pod is None:
+        raise errors.Benchmarks.RunError(
+            "[swap_encryption] Benchmark pod never became ready."
+        )
+    # Initialise the module-level active-pod tracker so _pod_exec and
+    # _recover_pod can transparently redirect to a replacement pod if the
+    # original is evicted during the run.
+    _active_pod.clear()
+    _active_pod.append(pod)
+    _degraded_reasons.clear()
+    _pod_lost.clear()
+    _oom_events.clear()
+    original_pod = pod
+    swap_dev = _detect_swap_device(pod)
+    base_meta = _build_metadata(pod, swap_dev)
+    results: list[sample.Sample] = []
+    t_run_start = time.time()
+
+    logging.info("[swap_encryption] swap device: %s", swap_dev)
+
+    # ── Phase 1: fio microbenchmarks on raw swap device ─────────────────────────
+    if _phase_selected("fio"):
+        logging.info(
+            "[swap_encryption] Phase 1: fio microbenchmarks on %s", swap_dev
+        )
+        try:
+            phase1_samples = _run_phase1_fio(pod, swap_dev, base_meta)
+            results += phase1_samples
+            if not phase1_samples:
+                _degraded_reasons.append(
+                    "Phase 1 (fio) produced no samples — "
+                    "check fio install and swap device accessibility"
+                )
+                logging.error("[swap_encryption] Phase 1: no samples produced")
+        except Exception as e:  # pylint: disable=broad-except
+            _degraded_reasons.append(f"Phase 1 fio failed: {e}")
+            logging.error("[swap_encryption] Phase 1 fio error: %s", e)
+
+    # ── Cost estimate ─────────────────────────────────────────────────────────
+    if _COLLECT_COST.value:
+        elapsed = time.time() - t_run_start
+        results += _collect_cost_sample(pod, elapsed, base_meta)
+
+    # ── Final degradation gate ────────────────────────────────────────────────
+    # The phase try/except blocks above keep the run alive so partial data is
+    # still collected, but that means a catastrophic failure (pod OOM-evicted
+    # mid-run, no fio data, stress-ng killed before it could drive swap I/O)
+    # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics.
+    # Detect those conditions here and surface them explicitly.
+    if _active_pod and _active_pod[0] != original_pod:
+        _degraded_reasons.append(
+            f"benchmark pod was replaced during the run ({original_pod} →"
+            f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;"
+            " phases executed after the eviction ran against a"
+            " freshly-initialised pod (empty /tmp, swap re-setup) and may be"
+            " invalid"
+        )
+    if _pod_lost:
+        _degraded_reasons.append(
+            "benchmark pod(s) went NotFound during the run"
+            f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure'
+            " eviction or container exit) and any phase running at or after"
+            " that"
+            " point (e.g. kernel-build baseline, OpenSearch) produced invalid"
+            " data"
+        )
+    if _oom_events:
+        _degraded_reasons.append(
+            "OOM kill(s) (rc=137) occurred during the run on pod(s) "
+            f'{", ".join(_oom_events)} — a phase exceeded memory and was'
+            " killed by "
+            "the OOM killer (the container may have restarted in place), so"
+            " the "
+            "affected phase(s) produced no or partial data"
+        )
 
-  degraded = bool(_degraded_reasons)
-  results.append(
-      sample.Sample(
-          'swap_encryption_run_status',
-          0.0 if degraded else 1.0,
-          'status',
-          dict(
-              base_meta,
-              degraded=degraded,
-              degraded_reasons='; '.join(_degraded_reasons) or 'none',
-              num_samples=len(results) + 1,
-          ),
-      )
-  )
-
-  if degraded:
-    msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(_degraded_reasons)
-    logging.error(msg)
-    if _FAIL_ON_DEGRADED.value:
-      # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.  The
-      # samples collected so far are still published by PKB before the failure
-      # is recorded, so no data is lost.
-      raise errors.Benchmarks.RunError(msg)
-  else:
-    logging.info(
-        '[swap_encryption] Run completed cleanly (%d samples)', len(results)
+    degraded = bool(_degraded_reasons)
+    results.append(
+        sample.Sample(
+            "swap_encryption_run_status",
+            0.0 if degraded else 1.0,
+            "status",
+            dict(
+                base_meta,
+                degraded=degraded,
+                degraded_reasons="; ".join(_degraded_reasons) or "none",
+                num_samples=len(results) + 1,
+            ),
+        )
     )
 
-  return results
+    if degraded:
+        msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons)
+        logging.error(msg)
+        if _FAIL_ON_DEGRADED.value:
+            # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.
+            # NOTE(review): `results` is not returned on this path — confirm
+            # PKB still publishes these samples; otherwise they are lost.
+            raise errors.Benchmarks.RunError(msg)
+    else:
+        logging.info(
+            "[swap_encryption] Run completed cleanly (%d samples)", len(results)
+        )
+
+    return results
 
 
 def Cleanup(spec: _BenchmarkSpec) -> None:
-  """Remove the DaemonSet and tear down any swap configuration."""
-  pod = _wait_for_benchmark_pod(timeout=30)
-  if pod:
-    _pod_exec(pod, 'swapoff -a 2>/dev/null || true', ignore_failure=True)
-    _pod_exec(
-        pod,
-        textwrap.dedent("""
+    """Remove the DaemonSet and tear down any swap configuration."""
+    pod = _wait_for_benchmark_pod(timeout=30)
+    if pod:
+        _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True)
+        _pod_exec(
+            pod,
+            textwrap.dedent("""
       swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
       dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
     """),
-        ignore_failure=True,
-    )
-    # Clean up loop device backing files (single-disk fallback path).
-    _pod_exec(
-        pod,
-        textwrap.dedent("""
+            ignore_failure=True,
+        )
+        # Clean up loop device backing files (single-disk fallback path).
+        _pod_exec(
+            pod,
+            textwrap.dedent("""
       for backing in /var/pkb_swap_backing /run/pkb_swap_backing \
                      /mnt/stateful_partition/pkb_swap_backing
       do
@@ -631,188 +651,230 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
         rm -f "$backing"
       done
     """),
-        ignore_failure=True,
-    )
-    _pod_exec(
-        pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", ignore_failure=True
-    )
+            ignore_failure=True,
+        )
+        _pod_exec(
+            pod,
+            "pkill -9 'stress-ng|fio' 2>/dev/null || true",
+            ignore_failure=True,
+        )
 
-  _delete_daemonset()
+    _delete_daemonset()
 
-  # Detach and delete the dedicated swap disk if one was provisioned.
-  cluster = spec.container_cluster
-  if _ADD_SWAP_DISK.value and getattr(cluster, 'project', None):
-    _detach_and_delete_swap_disk(cluster)
+    # Detach and delete the dedicated swap disk if one was provisioned.
+    cluster = spec.container_cluster
+    if _ADD_SWAP_DISK.value and getattr(cluster, "project", None):
+        _detach_and_delete_swap_disk(cluster)
 
 
-def _deploy_daemonset() -> None:
-  """Apply the benchmark DaemonSet manifest to the cluster.
-
-  Uses kubernetes_commands.ApplyManifest which renders the Jinja2 template
-  from perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2,
-  writes it to a temp file, and calls kubectl apply -f — the standard PKB
-  pattern for deploying manifests.
-  """
-  kubernetes_commands.ApplyManifest(
-      'cluster/swap_encryption_daemonset.yaml.j2',
-      ds_name=_DS_NAME,
-      ds_namespace=_DS_NAMESPACE,
-      ds_label=_DS_LABEL,
-      benchmark_nodepool=_BENCHMARK_NODEPOOL,
-      image=_DAEMONSET_IMAGE.value,
-      kernel_version=_KERNEL_VERSION.value,
-  )
-  logging.info('[swap_encryption] DaemonSet applied')
-
-
-def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
-  """Wait until the DaemonSet pod is Running AND tools are installed.
-
-  The benchmark container installs apt packages on first start and writes
-  /tmp/pkb_ready when done (~2-4 min on a cold node).  We must wait for
-  that sentinel before exec-ing any commands, otherwise tools like
-  cryptsetup / fio may not yet be on PATH.
-
-  Uses tab-separated name/phase output so kubectl always exits 0 regardless
-  of whether any pods are present, avoiding jsonpath index errors.
-  """
-  deadline = time.time() + timeout
-  last_phase = ''
-  ready_pod = None  # pod name once phase == Running
-
-  while time.time() < deadline:
-    # ── Step 1: wait for Running phase ──────────────────────────────────────
-    if ready_pod is None:
-      out, _, rc = kubectl.RunKubectlCommand(
-          [
-              'get',
-              'pods',
-              '-l',
-              f'app={_DS_LABEL}',
-              '-n',
-              _DS_NAMESPACE,
-              '-o',
-              (
-                  r'jsonpath={range'
-                  r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
-              ),
-          ],
-          raise_on_failure=False,
-      )
-
-      if rc == 0 and out.strip():
-        for line in out.strip().splitlines():
-          parts = line.split('\t')
-          if len(parts) == 2:
-            pod_name, phase = parts[0].strip(), parts[1].strip()
-            if phase == 'Running':
-              logging.info(
-                  '[swap_encryption] Pod %s is Running – '
-                  'waiting for tool install to finish...',
-                  pod_name,
-              )
-              ready_pod = pod_name
-              break
-            if phase != last_phase:
-              logging.info(
-                  '[swap_encryption] Pod %s phase: %s', pod_name, phase
-              )
-              last_phase = phase
-              if phase in ('Pending',):
-                _log_pod_events(pod_name)
-      else:
-        logging.info('[swap_encryption] Waiting for DaemonSet pod to appear...')
-
-    # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
-    if ready_pod is not None:
-      sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
-          [
-              'exec',
-              ready_pod,
-              '-n',
-              _DS_NAMESPACE,
-              '--',
-              'test',
-              '-f',
-              '/tmp/pkb_ready',
-          ],
-          raise_on_failure=False,
-      )
-      if sentinel_rc == 0:
-        logging.info(
-            '[swap_encryption] Pod %s ready (tools installed)', ready_pod
-        )
-        return ready_pod
-      # "container not found" means the container crashed (CrashLoopBackOff or
-      # exited) — treat it as a hard reset: re-check pod phase on next iteration.
-      if (
-          'container not found' in sentinel_err
-          or 'unable to upgrade connection' in sentinel_err
-      ):
-        logging.warning(
-            '[swap_encryption] Pod %s: container not running (%s) '
-            '— will re-check pod state',
-            ready_pod,
-            sentinel_err.strip(),
-        )
-        ready_pod = None
-        last_phase = ''
-      else:
-        logging.info(
-            '[swap_encryption] Pod %s: still installing tools...', ready_pod
-        )
+def _configure_eks_kubelet_swap(spec) -> None:
+    """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap.
+
+    NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm
+    integration) to merge.  When that lands, EKS node pools should include
+    a preBootstrapCommands block writing nodeadm config with
+    memorySwapBehavior: LimitedSwap before kubelet starts::
+
+      apiVersion: node.eks.aws/v1alpha1
+      kind: NodeConfig
+      spec:
+        kubelet:
+          config:
+            memorySwapBehavior: LimitedSwap
+            failSwapOn: false
+
+    GKE equivalent: linuxConfig.swapConfig + kubeletConfig.memorySwapBehavior
+    via --system-config-from-file, already implemented in
+    _create_benchmark_node_pool.
+
+    See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780
+    """
+    logging.warning(
+        "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is "
+        "deferred (blocked on PR #6780 — SwapConfigSpec). "
+        "EKS nodes will use default kubelet swap settings until that PR merges."
+    )
+
 
-    time.sleep(15)
+def _deploy_daemonset() -> None:
+    """Apply the swap-infra DaemonSet manifest to the cluster.
+
+    The DaemonSet is intentionally lean: it only verifies the node-level swap
+    device is active (configured via linuxConfig.swapConfig on GKE or
+    kubelet-config.json on EKS) and writes /tmp/pkb_ready.  No benchmark
+    tooling is installed here — workloads are delegated to existing PKB
+    benchmark modules (kubernetes_fio, kubernetes_redis_memtier, etc.) which
+    manage their own tool installs inside separate benchmark pods.
+
+    Uses kubernetes_commands.ApplyManifest to render the Jinja2 template from
+    perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 and
+    apply it via kubectl — the standard PKB pattern for deploying manifests.
+    """
+    kubernetes_commands.ApplyManifest(
+        "cluster/swap_encryption_daemonset.yaml.j2",
+        ds_name=_DS_NAME,
+        ds_namespace=_DS_NAMESPACE,
+        ds_label=_DS_LABEL,
+        benchmark_nodepool=_BENCHMARK_NODEPOOL,
+        image=_DAEMONSET_IMAGE.value,
+    )
+    logging.info("[swap_encryption] Swap-infra DaemonSet applied")
+
+
+def _wait_for_benchmark_pod(timeout: int = 600) -> str | None:
+    """Wait until the swap-infra DaemonSet pod is Running AND swap is active.
+
+    The DaemonSet installs fio and a small set of measurement tools then
+    verifies the swap device before writing /tmp/pkb_ready (~1-2 min on a
+    cold apt cache).  Default timeout 600 s covers worst-case APT latency
+    on a freshly-started node.
+
+    Uses tab-separated name/phase output so kubectl always exits 0 regardless
+    of whether any pods are present, avoiding jsonpath index errors.
+    """
+    deadline = time.time() + timeout
+    last_phase = ""
+    ready_pod = None  # pod name once phase == Running
+
+    while time.time() < deadline:
+        # ── Step 1: wait for Running phase ──────────────────────────────────────
+        if ready_pod is None:
+            out, _, rc = kubectl.RunKubectlCommand(
+                [
+                    "get",
+                    "pods",
+                    "-l",
+                    f"app={_DS_LABEL}",
+                    "-n",
+                    _DS_NAMESPACE,
+                    "-o",
+                    (
+                        r"jsonpath={range"
+                        r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
+                    ),
+                ],
+                raise_on_failure=False,
+            )
+
+            if rc == 0 and out.strip():
+                for line in out.strip().splitlines():
+                    parts = line.split("\t")
+                    if len(parts) == 2:
+                        pod_name, phase = parts[0].strip(), parts[1].strip()
+                        if phase == "Running":
+                            logging.info(
+                                "[swap_encryption] Pod %s is Running – "
+                                "waiting for swap device readiness sentinel...",
+                                pod_name,
+                            )
+                            ready_pod = pod_name
+                            break
+                        if phase != last_phase:
+                            logging.info(
+                                "[swap_encryption] Pod %s phase: %s",
+                                pod_name,
+                                phase,
+                            )
+                            last_phase = phase
+                            if phase in ("Pending",):
+                                _log_pod_events(pod_name)
+            else:
+                logging.info(
+                    "[swap_encryption] Waiting for DaemonSet pod to appear..."
+                )
+
+        # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
+        if ready_pod is not None:
+            sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
+                [
+                    "exec",
+                    ready_pod,
+                    "-n",
+                    _DS_NAMESPACE,
+                    "--",
+                    "test",
+                    "-f",
+                    "/tmp/pkb_ready",
+                ],
+                raise_on_failure=False,
+            )
+            if sentinel_rc == 0:
+                logging.info(
+                    "[swap_encryption] Pod %s ready (swap device active)",
+                    ready_pod,
+                )
+                return ready_pod
+            # "container not found" means the container crashed (CrashLoopBackOff or
+            # exited) — treat it as a hard reset: re-check pod phase on next iteration.
+            if (
+                "container not found" in sentinel_err
+                or "unable to upgrade connection" in sentinel_err
+            ):
+                logging.warning(
+                    "[swap_encryption] Pod %s: container not running (%s) "
+                    "— will re-check pod state",
+                    ready_pod,
+                    sentinel_err.strip(),
+                )
+                ready_pod = None
+                last_phase = ""
+            else:
+                logging.info(
+                    "[swap_encryption] Pod %s: still installing tools...",
+                    ready_pod,
+                )
+
+        time.sleep(15)
 
-  logging.warning(
-      '[swap_encryption] Benchmark pod not ready after %ds', timeout
-  )
-  return None
+    logging.warning(
+        "[swap_encryption] Benchmark pod not ready after %ds", timeout
+    )
+    return None
 
 
 def _log_pod_events(pod_name: str) -> None:
-  """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
-  events_out, _, _ = kubectl.RunKubectlCommand(
-      [
-          'describe',
-          'pod',
-          pod_name,
-          '-n',
-          _DS_NAMESPACE,
-      ],
-      raise_on_failure=False,
-  )
-  # Only log the Events section to keep output manageable
-  in_events = False
-  lines = []
-  for line in events_out.splitlines():
-    if line.startswith('Events:'):
-      in_events = True
-    if in_events:
-      lines.append(line)
-  if lines:
-    logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30]))
-  else:
-    logging.info(
-        '[swap_encryption] kubectl describe output:\n%s',
-        events_out[-2000:] if len(events_out) > 2000 else events_out,
+    """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
+    events_out, _, _ = kubectl.RunKubectlCommand(
+        [
+            "describe",
+            "pod",
+            pod_name,
+            "-n",
+            _DS_NAMESPACE,
+        ],
+        raise_on_failure=False,
     )
+    # Only log the Events section to keep output manageable
+    in_events = False
+    lines = []
+    for line in events_out.splitlines():
+        if line.startswith("Events:"):
+            in_events = True
+        if in_events:
+            lines.append(line)
+    if lines:
+        logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30]))
+    else:
+        logging.info(
+            "[swap_encryption] kubectl describe output:\n%s",
+            events_out[-2000:] if len(events_out) > 2000 else events_out,
+        )
 
 
 def _delete_daemonset() -> None:
-  """Delete the benchmark DaemonSet."""
-  kubectl.RunKubectlCommand(
-      [
-          'delete',
-          'daemonset',
-          _DS_NAME,
-          '-n',
-          _DS_NAMESPACE,
-          '--ignore-not-found',
-      ],
-      raise_on_failure=False,
-  )
-  logging.info('[swap_encryption] DaemonSet deleted')
+    """Delete the benchmark DaemonSet."""
+    kubectl.RunKubectlCommand(
+        [
+            "delete",
+            "daemonset",
+            _DS_NAME,
+            "-n",
+            _DS_NAMESPACE,
+            "--ignore-not-found",
+        ],
+        raise_on_failure=False,
+    )
+    logging.info("[swap_encryption] DaemonSet deleted")
 
 
 _HYPERDISK_MAX_IOPS_PER_MBPS = (
@@ -821,475 +883,495 @@ def _delete_daemonset() -> None:
 
 
 def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
-  """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint.
-
-  Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed
-  256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails
-  with "Requested provisioned throughput is too low for the provisioned iops".
-  Clamp throughput UP to the minimum the requested IOPS need (plus a small
-  margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk
-  creation.
-  """
-  min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS)  # ceil(iops/256)
-  if throughput < min_tput:
-    logging.warning(
-        '[swap_encryption] boot/swap disk throughput %d MiB/s is too low for '
-        '%d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s); raising to %d',
-        throughput,
-        iops,
-        min_tput,
-        min_tput,
-    )
-    return min_tput
-  return throughput
+    """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint.
+
+    Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed
+    256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails
+    with "Requested provisioned throughput is too low for the provisioned iops".
+    Clamp throughput UP to the minimum the requested IOPS need, with no extra
+    margin, and warn, so a mismatched flag pairing cannot abort node-pool/disk
+    creation.
+    """
+    min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS)  # ceil(iops/256)
+    if throughput < min_tput:
+        logging.warning(
+            "[swap_encryption] boot/swap disk throughput %d MiB/s is too low"
+            " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);"
+            " raising to %d",
+            throughput,
+            iops,
+            min_tput,
+            min_tput,
+        )
+        return min_tput
+    return throughput
 
 
 def _create_benchmark_node_pool(cluster) -> None:
-  """Add the benchmark nodepool to the existing cluster (Step 2 of setup).
-
-  Uses:
-    --swap_encryption_benchmark_machine_type  (default n4-highmem-32)
-    --swap_encryption_node_image_type         (default COS_CONTAINERD)
-    --swap_encryption_boot_disk_iops          (default 80000)
-    --swap_encryption_enable_dmcrypt          (default True)
-
-  The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet
-  nodeSelector targets it exclusively.  dm-crypt swap setup is performed
-  from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap /
-  _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata
-  because GKE reserves that metadata key and rejects it at the API level.
-  """
-  machine_type = _BENCHMARK_MACHINE_TYPE.value
-  # Auto-detect LSSD from machine type name; flag overrides only when True.
-  is_lssd = _BENCHMARK_LSSD.value or 'lssd' in machine_type.lower()
-
-  # Determine zone/region from the cluster object.
-  # LSSD configs only need a small boot disk (OS only; swap is on local NVMe).
-  # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on
-  # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk
-  # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable
-  # headroom and matches the Config 2 spec in the Engineer Assignments table).
-  disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value
-
-  disk_type = _BOOT_DISK_TYPE.value
-
-  # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region,
-  # and auth token refresh.  GkeCluster._GcloudCommand also handles the
-  # zone → region promotion for multi-zone / regional clusters.
-  cmd = cluster._GcloudCommand(
-      'container',
-      'node-pools',
-      'create',
-      _BENCHMARK_NODEPOOL,
-      '--cluster',
-      cluster.name,
-  )
-  cmd.flags['machine-type'] = machine_type
-  cmd.flags['image-type'] = _NODE_IMAGE_TYPE.value
-  cmd.flags['disk-type'] = disk_type
-  cmd.flags['disk-size'] = disk_size_gb
-  cmd.flags['num-nodes'] = 1
-  cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
-  cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair']
-
-  # IOPS and throughput provisioning only applies to hyperdisk-* types AND
-  # only when the boot disk is also the swap device (non-LSSD configs).
-  # For LSSD machines the boot disk is OS-only; swap is on local NVMe.
-  # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the
-  # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max).
-  if disk_type.startswith('hyperdisk') and not is_lssd:
-    # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by
-    # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks).
-    cmd.flags['boot-disk-provisioned-iops'] = _BOOT_DISK_IOPS.value
-    cmd.flags['boot-disk-provisioned-throughput'] = _valid_hyperdisk_throughput(
-        _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+    """Add the benchmark nodepool to the existing cluster (Step 2 of setup).
+
+    Uses:
+      --swap_encryption_benchmark_machine_type  (default n4-highmem-32)
+      --swap_encryption_node_image_type         (default COS_CONTAINERD)
+      --swap_encryption_boot_disk_iops          (default 80000)
+      --swap_encryption_enable_dmcrypt          (default True)
+
+    The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet
+    nodeSelector targets it exclusively.  dm-crypt swap setup is performed
+    from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap /
+    _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata
+    because GKE reserves that metadata key and rejects it at the API level.
+    """
+    machine_type = _BENCHMARK_MACHINE_TYPE.value
+    # Auto-detect LSSD from machine type name; flag overrides only when True.
+    is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower()
+
+    # Boot-disk sizing (zone/region are injected later by _GcloudCommand).
+    # LSSD configs only need a small boot disk (OS only; swap is on local NVMe).
+    # Hyperdisk configs need 500 GiB to hit 80 000 IOPS: the provisionable IOPS
+    # ceiling on hyperdisk-balanced scales with disk size, so a 100 GiB disk
+    # cannot reach it, while 500 GiB gives comfortable headroom and matches
+    # the Config 2 spec in the Engineer Assignments table.
+    disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value
+
+    disk_type = _BOOT_DISK_TYPE.value
+
+    # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region,
+    # and auth token refresh.  GkeCluster._GcloudCommand also handles the
+    # zone → region promotion for multi-zone / regional clusters.
+    cmd = cluster._GcloudCommand(
+        "container",
+        "node-pools",
+        "create",
+        _BENCHMARK_NODEPOOL,
+        "--cluster",
+        cluster.name,
     )
+    cmd.flags["machine-type"] = machine_type
+    cmd.flags["image-type"] = _NODE_IMAGE_TYPE.value
+    cmd.flags["disk-type"] = disk_type
+    cmd.flags["disk-size"] = disk_size_gb
+    cmd.flags["num-nodes"] = 1
+    cmd.flags["node-labels"] = f"pkb_nodepool={_BENCHMARK_NODEPOOL}"
+    cmd.args += ["--no-enable-autoupgrade", "--no-enable-autorepair"]
+
+    # IOPS and throughput provisioning only applies to hyperdisk-* types AND
+    # only when the boot disk is also the swap device (non-LSSD configs).
+    # For LSSD machines the boot disk is OS-only; swap is on local NVMe.
+    # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the
+    # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max).
+    if disk_type.startswith("hyperdisk") and not is_lssd:
+        # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by
+        # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks).
+        cmd.flags["boot-disk-provisioned-iops"] = _BOOT_DISK_IOPS.value
+        cmd.flags["boot-disk-provisioned-throughput"] = (
+            _valid_hyperdisk_throughput(
+                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+            )
+        )
 
-  # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm
-  # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block).
-  if is_lssd:
-    cmd.flags['local-nvme-ssd-block'] = f'count={_LSSD_COUNT.value}'
-
-  # ── GKE kubelet swap config ───────────────────────────────────────────────
-  # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
-  # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
-  # so that the kubelet allocates swap to the DaemonSet pod.  Without this flag
-  # the Linux kernel swap device may exist but the kubelet blocks pod-level
-  # swap usage and the benchmark pod cannot drive swap I/O.
-  #
-  # Passed as --system-config-from-file pointing to a temp YAML, which is the
-  # same mechanism PKB's gke_node_system_config flag uses:
-  #   perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
-  swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
-  system_config_tmp = None
-  if swap_behavior:
-    # Build system-config YAML for --system-config-from-file.
-    # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
-    #   kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods.
-    #   linuxConfig.swapConfig: GKE enables node-level swap device.
-    #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use
-    #     the local NVMe as the swap device (avoids boot-disk overhead).
-    #   linuxConfig.sysctl: swap aggressiveness tuning so the benchmark
-    #     workloads can drive sustained swap I/O.
-    # Reference:
-    #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
-    #   node-memory-swap#enable
+    # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm
+    # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block).
     if is_lssd:
-      swap_config_block = (
-          '  swapConfig:\n'
-          '    enabled: true\n'
-          '    dedicatedLocalSsdProfile:\n'
-          f'      diskCount: {_LSSD_COUNT.value}\n'
-      )
-    else:
-      swap_config_block = '  swapConfig:\n    enabled: true\n'
-    kubelet_yaml = (
-        f'kubeletConfig:\n  memorySwapBehavior: {swap_behavior}\nlinuxConfig:\n'
-        + swap_config_block
-        + '  sysctl:\n'
-        '    vm.min_free_kbytes: 200\n'
-        '    vm.watermark_scale_factor: 500\n'
-        '    vm.swappiness: 100\n'
-    )
-    system_config_tmp = tempfile.NamedTemporaryFile(
-        mode='w', suffix='.yaml', delete=False
-    )
-    system_config_tmp.write(kubelet_yaml)
-    system_config_tmp.flush()
-    cmd.flags['system-config-from-file'] = system_config_tmp.name
+        cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}"
+
+    # ── GKE kubelet swap config ───────────────────────────────────────────────
+    # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
+    # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
+    # so that the kubelet allocates swap to the DaemonSet pod.  Without this flag
+    # the Linux kernel swap device may exist but the kubelet blocks pod-level
+    # swap usage and the benchmark pod cannot drive swap I/O.
+    #
+    # Passed as --system-config-from-file pointing to a temp YAML, which is the
+    # same mechanism PKB's gke_node_system_config flag uses:
+    #   perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+    swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
+    system_config_tmp = None
+    if swap_behavior:
+        # Build system-config YAML for --system-config-from-file.
+        # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
+        #   kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods.
+        #   linuxConfig.swapConfig: GKE enables node-level swap device.
+        #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use
+        #     the local NVMe as the swap device (avoids boot-disk overhead).
+        #   linuxConfig.sysctl: swap aggressiveness tuning so the benchmark
+        #     workloads can drive sustained swap I/O.
+        # Reference:
+        #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
+        #   node-memory-swap#enable
+        if is_lssd:
+            swap_config_block = (
+                "  swapConfig:\n"
+                "    enabled: true\n"
+                "    dedicatedLocalSsdProfile:\n"
+                f"      diskCount: {_LSSD_COUNT.value}\n"
+            )
+        else:
+            swap_config_block = "  swapConfig:\n    enabled: true\n"
+        kubelet_yaml = (
+            "kubeletConfig:\n  memorySwapBehavior:"
+            f" {swap_behavior}\nlinuxConfig:\n"
+            + swap_config_block
+            + "  sysctl:\n"
+            "    vm.min_free_kbytes: 200\n"
+            "    vm.watermark_scale_factor: 500\n"
+            "    vm.swappiness: 100\n"
+        )
+        system_config_tmp = tempfile.NamedTemporaryFile(
+            mode="w", suffix=".yaml", delete=False
+        )
+        system_config_tmp.write(kubelet_yaml)
+        system_config_tmp.flush()
+        cmd.flags["system-config-from-file"] = system_config_tmp.name
+        logging.info(
+            "[swap_encryption] system-config-from-file: "
+            "kubelet_swap=%s lssd=%s (written to %s):\n%s",
+            swap_behavior,
+            is_lssd,
+            system_config_tmp.name,
+            kubelet_yaml,
+        )
+
     logging.info(
-        '[swap_encryption] system-config-from-file: '
-        'kubelet_swap=%s lssd=%s (written to %s):\n%s',
-        swap_behavior,
+        "[swap_encryption] Creating benchmark nodepool: %s / %s / "
+        "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / "
+        "add_swap_disk=%s / kubelet_swap=%s",
+        _BENCHMARK_NODEPOOL,
+        machine_type,
+        _NODE_IMAGE_TYPE.value,
+        disk_size_gb,
+        _BOOT_DISK_IOPS.value,
+        _ENABLE_DMCRYPT.value,
         is_lssd,
-        system_config_tmp.name,
-        kubelet_yaml,
-    )
-
-  logging.info(
-      '[swap_encryption] Creating benchmark nodepool: %s / %s / '
-      'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
-      'add_swap_disk=%s / kubelet_swap=%s',
-      _BENCHMARK_NODEPOOL,
-      machine_type,
-      _NODE_IMAGE_TYPE.value,
-      disk_size_gb,
-      _BOOT_DISK_IOPS.value,
-      _ENABLE_DMCRYPT.value,
-      is_lssd,
-      _ADD_SWAP_DISK.value,
-      swap_behavior or 'unset',
-  )
-
-  # LSSD nodepools take longer to provision than PD-only nodepools because
-  # GKE must also initialise the local NVMe devices before marking nodes Ready.
-  # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
-  try:
-    _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
-  finally:
-    if system_config_tmp is not None:
-      try:
-        os.unlink(system_config_tmp.name)
-      except OSError:
-        pass
-
-  if rc != 0:
-    # Idempotent prepare: if the nodepool already exists (e.g. re-running
-    # --run_stage=prepare,run to redeploy the DaemonSet onto an existing
-    # cluster), reuse it instead of failing.  gcloud returns a 409 /
-    # "Already exists" message in this case.
-    low = (stderr or '').lower()
-    if 'already exists' in low or 'alreadyexists' in low or 'code=409' in low:
-      logging.info(
-          '[swap_encryption] Benchmark nodepool already exists — '
-          'reusing it (idempotent prepare); proceeding to DaemonSet'
-      )
-      return
-    raise errors.Benchmarks.RunError(
-        '[swap_encryption] Failed to create benchmark nodepool '
-        f'(rc={rc}): {stderr}'
+        _ADD_SWAP_DISK.value,
+        swap_behavior or "unset",
     )
-  logging.info('[swap_encryption] Benchmark nodepool ready')
 
+    # LSSD nodepools take longer to provision than PD-only nodepools because
+    # GKE must also initialise the local NVMe devices before marking nodes Ready.
+    # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
+    try:
+        _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
+    finally:
+        if system_config_tmp is not None:
+            try:
+                os.unlink(system_config_tmp.name)
+            except OSError:
+                pass
 
-def _wait_for_benchmark_node(timeout: int = 900) -> None:
-  """Block until a node labelled pkb_nodepool=benchmark is Ready.
-
-  gcloud container node-pools create returns as soon as the API accepts the
-  request — the actual node VM may take another 2-4 minutes to boot, join the
-  cluster, and pass its readiness checks.  Deploying the DaemonSet before that
-  point leaves the pod Pending indefinitely because the nodeSelector finds no
-  eligible node.
-
-  This function polls kubectl every 15 s until at least one node with
-  pkb_nodepool=benchmark has Ready=True, then returns.
-  """
-  deadline = time.time() + timeout
-  logging.info(
-      '[swap_encryption] Waiting for benchmark node '
-      '(pkb_nodepool=benchmark) to be Ready...'
-  )
-  while time.time() < deadline:
-    out, _, rc = kubectl.RunKubectlCommand(
-        [
-            'get',
-            'nodes',
-            '-l',
-            f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-            '-o',
-            r'jsonpath={range .items[*]}'
-            r'{.metadata.name}{"\t"}'
-            r'{range .status.conditions[?(@.type=="Ready")]}'
-            r'{.status}{"\n"}{end}{end}',
-        ],
-        raise_on_failure=False,
-    )
+    if rc != 0:
+        # Idempotent prepare: if the nodepool already exists (e.g. re-running
+        # --run_stage=prepare,run to redeploy the DaemonSet onto an existing
+        # cluster), reuse it instead of failing.  gcloud returns a 409 /
+        # "Already exists" message in this case.
+        low = (stderr or "").lower()
+        if (
+            "already exists" in low
+            or "alreadyexists" in low
+            or "code=409" in low
+        ):
+            logging.info(
+                "[swap_encryption] Benchmark nodepool already exists — "
+                "reusing it (idempotent prepare); proceeding to DaemonSet"
+            )
+            return
+        raise errors.Benchmarks.RunError(
+            "[swap_encryption] Failed to create benchmark nodepool "
+            f"(rc={rc}): {stderr}"
+        )
+    logging.info("[swap_encryption] Benchmark nodepool ready")
 
-    if rc == 0 and out.strip():
-      for line in out.strip().splitlines():
-        parts = line.split('\t')
-        if len(parts) == 2 and parts[1].strip() == 'True':
-          logging.info(
-              '[swap_encryption] Benchmark node ready: %s', parts[0].strip()
-          )
-          return
 
+def _wait_for_benchmark_node(timeout: int = 900) -> None:
+    """Block until a node in the benchmark nodepool reports Ready=True.
+
+    Per the original note, node-pool creation can return before the node VM
+    has booted and joined the cluster, and a DaemonSet deployed that early
+    stays Pending.  Polls `kubectl get nodes` by label roughly every 15 s.
+
+    Args:
+      timeout: Maximum number of seconds to keep polling.
+    Raises:
+      errors.Benchmarks.RunError: No matching node became Ready in time.
+    """
+    deadline = time.time() + timeout
     logging.info(
-        '[swap_encryption] Benchmark node not yet Ready — retrying in 15 s...'
+        "[swap_encryption] Waiting for benchmark node "
+        "(pkb_nodepool=benchmark) to be Ready..."
     )
-    time.sleep(15)
+    while time.time() < deadline:
+        out, _, rc = kubectl.RunKubectlCommand(
+            [
+                "get",
+                "nodes",
+                "-l",
+                f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
+                "-o",
+                r"jsonpath={range .items[*]}"
+                r'{.metadata.name}{"\t"}'
+                r'{range .status.conditions[?(@.type=="Ready")]}'
+                r'{.status}{"\n"}{end}{end}',
+            ],
+            raise_on_failure=False,
+        )
 
-  raise errors.Benchmarks.RunError(
-      '[swap_encryption] Timed out waiting for benchmark node '
-      f'(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready '
-      f'after {timeout}s'
-  )
+        if rc == 0 and out.strip():
+            for line in out.strip().splitlines():
+                parts = line.split("\t")
+                if len(parts) == 2 and parts[1].strip() == "True":
+                    logging.info(
+                        "[swap_encryption] Benchmark node ready: %s",
+                        parts[0].strip(),
+                    )
+                    return
 
+        logging.info(
+            "[swap_encryption] Benchmark node not yet Ready — retrying in 15"
+            " s..."
+        )
+        time.sleep(15)
 
-def _attach_swap_disk(cluster) -> None:
-  """Create a dedicated hyperdisk and attach it to the benchmark node.
-
-  gcloud container node-pools create --additional-node-disk is not available
-  in all gcloud SDK versions, so we use gcloud compute to create the disk and
-  attach it after the node is ready.  In GKE the Kubernetes node name is the
-  same as the GCE instance name, so no translation is needed.
-
-  After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe
-  nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk.
-
-  The disk is named pkb-swap-<cluster-name> to avoid name collisions across
-  concurrent runs.  Cleanup deletes it in Cleanup() if it exists.
-  """
-  # Resolve zone from cluster
-  zone = None
-  if getattr(cluster, 'zones', None):
-    zone = cluster.zones[0]
-  elif getattr(cluster, 'region', None):
-    zone = cluster.region
-  if not zone:
     raise errors.Benchmarks.RunError(
-        '[swap_encryption] Cannot attach swap disk: cluster zone unknown'
+        "[swap_encryption] Timed out waiting for benchmark node "
+        f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready "
+        f"after {timeout}s"
     )
 
-  project = cluster.project
-  disk_name = f'pkb-swap-{cluster.name}'
-  disk_type = _BOOT_DISK_TYPE.value
-  disk_size_gb = _SWAP_DISK_SIZE_GB.value
-
-  # ── Step 1: get the GCE instance name of the benchmark node ───────────────
-  node_out, _, rc = kubectl.RunKubectlCommand(
-      [
-          'get',
-          'nodes',
-          '-l',
-          f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-          '-o',
-          'jsonpath={.items[0].metadata.name}',
-      ],
-      raise_on_failure=False,
-  )
-  instance_name = node_out.strip()
-  if rc != 0 or not instance_name:
-    raise errors.Benchmarks.RunError(
-        '[swap_encryption] Cannot find benchmark node for swap disk attach'
+
+def _attach_swap_disk(cluster) -> None:
+    """Create a dedicated swap disk and attach it to the benchmark node.
+
+    gcloud container node-pools create --additional-node-disk is not available
+    in all gcloud SDK versions, so we use gcloud compute to create the disk and
+    attach it after the node is ready.  The Kubernetes node name is passed to
+    gcloud as the GCE instance name (assumed identical on GKE).
+
+    The disk type comes from _BOOT_DISK_TYPE (not necessarily a hyperdisk) and
+    the device name is "pkb-swap"; per the original note it then appears in
+    the pod as /dev/sdb, or /dev/nvme1n1 on NVMe nodes.
+
+    Named pkb-swap-<cluster-name>; _detach_and_delete_swap_disk removes it.
+    """
+    # Resolve zone from cluster
+    zone = None
+    if getattr(cluster, "zones", None):
+        zone = cluster.zones[0]
+    elif getattr(cluster, "region", None):
+        zone = cluster.region
+    if not zone:
+        raise errors.Benchmarks.RunError(
+            "[swap_encryption] Cannot attach swap disk: cluster zone unknown"
+        )
+
+    project = cluster.project
+    disk_name = f"pkb-swap-{cluster.name}"
+    disk_type = _BOOT_DISK_TYPE.value
+    disk_size_gb = _SWAP_DISK_SIZE_GB.value
+
+    # ── Step 1: get the GCE instance name of the benchmark node ───────────────
+    node_out, _, rc = kubectl.RunKubectlCommand(
+        [
+            "get",
+            "nodes",
+            "-l",
+            f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
+            "-o",
+            "jsonpath={.items[0].metadata.name}",
+        ],
+        raise_on_failure=False,
     )
-  logging.info('[swap_encryption] Benchmark node instance: %s', instance_name)
-
-  # ── Step 2: create the hyperdisk ──────────────────────────────────────────
-  logging.info(
-      '[swap_encryption] Creating swap disk %s (%dGiB %s)',
-      disk_name,
-      disk_size_gb,
-      disk_type,
-  )
-  # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project
-  # and --zone (always zonal — gcloud compute --region creates regional
-  # resources, which is not what we want for a node-attached swap disk).
-  gcp_res = _GcpZonalResource(project, zone)
-  create_cmd = gcp_util.GcloudCommand(
-      gcp_res, 'compute', 'disks', 'create', disk_name
-  )
-  create_cmd.flags['type'] = disk_type
-  create_cmd.flags['size'] = f'{disk_size_gb}GB'
-  create_cmd.args.append('--quiet')
-  if disk_type.startswith('hyperdisk'):
-    create_cmd.flags['provisioned-iops'] = _BOOT_DISK_IOPS.value
-    create_cmd.flags['provisioned-throughput'] = _valid_hyperdisk_throughput(
-        _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+    instance_name = node_out.strip()
+    if rc != 0 or not instance_name:
+        raise errors.Benchmarks.RunError(
+            "[swap_encryption] Cannot find benchmark node for swap disk attach"
+        )
+    logging.info("[swap_encryption] Benchmark node instance: %s", instance_name)
+
+    # ── Step 2: create the hyperdisk ──────────────────────────────────────────
+    logging.info(
+        "[swap_encryption] Creating swap disk %s (%dGiB %s)",
+        disk_name,
+        disk_size_gb,
+        disk_type,
     )
-  _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False)
-  if rc != 0:
-    raise errors.Benchmarks.RunError(
-        f'[swap_encryption] Failed to create swap disk {disk_name}: {stderr}'
+    # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project
+    # and --zone (always zonal — gcloud compute --region creates regional
+    # resources, which is not what we want for a node-attached swap disk).
+    gcp_res = _GcpZonalResource(project, zone)
+    create_cmd = gcp_util.GcloudCommand(
+        gcp_res, "compute", "disks", "create", disk_name
     )
+    create_cmd.flags["type"] = disk_type
+    create_cmd.flags["size"] = f"{disk_size_gb}GB"
+    create_cmd.args.append("--quiet")
+    if disk_type.startswith("hyperdisk"):
+        create_cmd.flags["provisioned-iops"] = _BOOT_DISK_IOPS.value
+        create_cmd.flags["provisioned-throughput"] = (
+            _valid_hyperdisk_throughput(
+                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
+            )
+        )
+    _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False)
+    if rc != 0:
+        raise errors.Benchmarks.RunError(
+            f"[swap_encryption] Failed to create swap disk {disk_name}:"
+            f" {stderr}"
+        )
 
-  # ── Step 3: attach the disk to the node VM ────────────────────────────────
-  logging.info(
-      '[swap_encryption] Attaching swap disk %s to %s', disk_name, instance_name
-  )
-  attach_cmd = gcp_util.GcloudCommand(
-      gcp_res, 'compute', 'instances', 'attach-disk', instance_name
-  )
-  attach_cmd.flags['disk'] = disk_name
-  attach_cmd.flags['device-name'] = 'pkb-swap'
-  attach_cmd.args.append('--quiet')
-  _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False)
-  if rc != 0:
-    raise errors.Benchmarks.RunError(
-        f'[swap_encryption] Failed to attach swap disk to {instance_name}: '
-        f'{stderr}'
+    # ── Step 3: attach the disk to the node VM ────────────────────────────────
+    logging.info(
+        "[swap_encryption] Attaching swap disk %s to %s",
+        disk_name,
+        instance_name,
     )
-  logging.info(
-      '[swap_encryption] Swap disk attached: %s → %s', disk_name, instance_name
-  )
-
-
-def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
-  """Detach (if attached) and delete a GCE disk, robustly, with retries.
-
-  Finds the attached instance from the disk's own `users` field rather than
-  kubectl — kubectl is often unavailable during teardown (cluster being
-  deleted), which previously left the disk attached and undeletable, so it
-  leaked.  Returns True if the disk is gone (deleted or already absent).
-  """
-  for attempt in range(1, 5):
-    gcp_res = _GcpZonalResource(project, zone)
-    describe_cmd = gcp_util.GcloudCommand(
-        gcp_res, 'compute', 'disks', 'describe', disk_name
+    attach_cmd = gcp_util.GcloudCommand(
+        gcp_res, "compute", "instances", "attach-disk", instance_name
     )
-    describe_cmd.flags['format'] = 'value(users)'
-    users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False)
+    attach_cmd.flags["disk"] = disk_name
+    attach_cmd.flags["device-name"] = "pkb-swap"
+    attach_cmd.args.append("--quiet")
+    _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False)
     if rc != 0:
-      logging.info(
-          '[swap_encryption] Swap disk %s not present — nothing to delete',
-          disk_name,
-      )
-      return True  # already gone
-    user = users.strip()
-    if user:
-      inst = user.split('/')[-1]
-      logging.info(
-          '[swap_encryption] Detaching swap disk %s from %s', disk_name, inst
-      )
-      detach_cmd = gcp_util.GcloudCommand(
-          gcp_res, 'compute', 'instances', 'detach-disk', inst
-      )
-      detach_cmd.flags['disk'] = disk_name
-      detach_cmd.args.append('--quiet')
-      detach_cmd.Issue(timeout=120, raise_on_failure=False)
-    delete_cmd = gcp_util.GcloudCommand(
-        gcp_res, 'compute', 'disks', 'delete', disk_name
+        raise errors.Benchmarks.RunError(
+            f"[swap_encryption] Failed to attach swap disk to {instance_name}: "
+            f"{stderr}"
+        )
+    logging.info(
+        "[swap_encryption] Swap disk attached: %s → %s",
+        disk_name,
+        instance_name,
     )
-    delete_cmd.args.append('--quiet')
-    _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
-    if drc == 0:
-      logging.info('[swap_encryption] Swap disk deleted: %s', disk_name)
-      return True
-    logging.warning(
-        '[swap_encryption] Swap disk delete attempt %d/4 failed '
-        '(%s); retrying in 10s',
-        attempt,
-        derr.strip()[:160],
+
+
+def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
+    """Detach (if attached) and delete a GCE disk, retrying up to 4 times.
+
+    The attached instance comes from the disk's own `users` field, not from
+    kubectl, which is often unavailable during teardown.  Only a describe
+    error containing "not found" counts as "already absent"; other describe
+    failures fall through to the delete/retry path.  Returns True if gone.
+    """
+    for attempt in range(1, 5):
+        gcp_res = _GcpZonalResource(project, zone)
+        describe_cmd = gcp_util.GcloudCommand(
+            gcp_res, "compute", "disks", "describe", disk_name
+        )
+        describe_cmd.flags["format"] = "value(users)"
+        users, err, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False)
+        if rc != 0 and "not found" in (err or "").lower():
+            logging.info(
+                "[swap_encryption] Swap disk %s not present — nothing to"
+                " delete",
+                disk_name,
+            )
+            return True  # already gone
+        user = (users or "").strip()
+        if user:
+            inst = user.split("/")[-1]
+            logging.info(
+                "[swap_encryption] Detaching swap disk %s from %s",
+                disk_name,
+                inst,
+            )
+            detach_cmd = gcp_util.GcloudCommand(
+                gcp_res, "compute", "instances", "detach-disk", inst
+            )
+            detach_cmd.flags["disk"] = disk_name
+            detach_cmd.args.append("--quiet")
+            detach_cmd.Issue(timeout=120, raise_on_failure=False)
+        delete_cmd = gcp_util.GcloudCommand(
+            gcp_res, "compute", "disks", "delete", disk_name
+        )
+        delete_cmd.args.append("--quiet")
+        _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
+        if drc == 0:
+            logging.info("[swap_encryption] Swap disk deleted: %s", disk_name)
+            return True
+        logging.warning(
+            "[swap_encryption] Swap disk delete attempt %d/4 failed "
+            "(%s); retrying in 10s",
+            attempt,
+            derr.strip()[:160],
+        )
+        time.sleep(10)
+    logging.error(
+        "[swap_encryption] Could NOT delete swap disk %s after retries "
+        "— delete it manually: gcloud compute disks delete %s "
+        "--zone %s --quiet",
+        disk_name,
+        disk_name,
+        zone,
     )
-    time.sleep(10)
-  logging.error(
-      '[swap_encryption] Could NOT delete swap disk %s after retries '
-      '— delete it manually: gcloud compute disks delete %s '
-      '--zone %s --quiet',
-      disk_name,
-      disk_name,
-      zone,
-  )
-  return False
+    return False
 
 
 def _detach_and_delete_swap_disk(cluster) -> None:
-  """Detach and delete the dedicated swap disk created by _attach_swap_disk."""
-  zone = None
-  if getattr(cluster, 'zones', None):
-    zone = cluster.zones[0]
-  elif getattr(cluster, 'region', None):
-    zone = cluster.region
-  if not zone or not getattr(cluster, 'project', None):
-    return
-  _delete_disk_by_name(f'pkb-swap-{cluster.name}', cluster.project, zone)
+    """Remove the pkb-swap-<cluster-name> disk made by _attach_swap_disk."""
+    zones = getattr(cluster, "zones", None)
+    # Prefer the first configured zone; otherwise fall back to the region.
+    location = zones[0] if zones else getattr(cluster, "region", None)
+    project = getattr(cluster, "project", None)
+    if not (location and project):
+        # Not enough information to address the disk; nothing to do.
+        return
+    _delete_disk_by_name(f"pkb-swap-{cluster.name}", project, location)
 
 
 def _delete_default_node_pool(cluster) -> None:
-  """Delete the dummy default nodepool after the benchmark pool is ready.
-
-  The default nodepool (e2-medium) was only needed to satisfy GKE's
-  requirement that a cluster must have at least one nodepool at creation time.
-  Removing it stops the clock on its cost immediately.
-  """
-  # Use PKB's GcloudCommand: auto-injects --project, --zone/--region.
-  cmd = cluster._GcloudCommand(
-      'container',
-      'node-pools',
-      'delete',
-      _DEFAULT_NODEPOOL,
-      '--cluster',
-      cluster.name,
-  )
-  cmd.args.append('--quiet')
-
-  logging.info(
-      '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL
-  )
-  _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
-  if rc != 0:
-    logging.warning(
-        '[swap_encryption] Could not delete default nodepool (rc=%d): %s',
-        rc,
-        stderr,
+    """Remove the dummy default nodepool once the benchmark pool is ready.
+
+    That pool only exists because GKE needs one nodepool at cluster creation
+    time; deleting it stops its cost.  cluster._GcloudCommand is used so the
+    project and location flags are injected automatically.  Best-effort: a
+    failed delete is logged as a warning and never raised.
+    """
+    delete_cmd = cluster._GcloudCommand(
+        "container",
+        "node-pools",
+        "delete",
+        _DEFAULT_NODEPOOL,
+        "--cluster",
+        cluster.name,
     )
-  else:
-    logging.info('[swap_encryption] Default nodepool deleted')
+    delete_cmd.args.append("--quiet")
 
+    logging.info(
+        "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL
+    )
+    _, stderr, rc = delete_cmd.Issue(timeout=300, raise_on_failure=False)
+    if rc == 0:
+        logging.info("[swap_encryption] Default nodepool deleted")
+        return
+    logging.warning(
+        "[swap_encryption] Could not delete default nodepool (rc=%d): %s",
+        rc,
+        stderr,
+    )
 
-def _is_pod_gone(pod: str) -> bool:
-  """Return True if the named pod no longer exists in the cluster.
 
-  Used to distinguish OOM-killed container processes (pod still alive, rc=137)
-  from OOM-evicted pods (pod gone, DaemonSet will create a replacement).
-  """
-  try:
-    _, err, rc = kubectl.RunKubectlCommand(
-        [
-            'get',
-            'pod',
-            pod,
-            '-n',
-            _DS_NAMESPACE,
-            '-o',
-            'jsonpath={.metadata.name}',
-        ],
-        raise_on_failure=False,
-        timeout=15,
-    )
-    return rc != 0 and 'not found' in (err or '').lower()
-  except Exception:  # pylint: disable=broad-except
-    return False
+def _is_pod_gone(pod: str) -> bool:
+    """Report whether kubectl says the named pod does not exist.
+
+    Lets callers tell an OOM-killed container (pod still present, rc=137)
+    apart from an OOM-evicted pod (pod gone; the DaemonSet replaces it).
+
+    Args:
+      pod: Name of the pod to look up in the _DS_NAMESPACE namespace.
+
+    Returns:
+      True only if the lookup fails with "not found" in stderr; False
+      otherwise, including when the probe itself raises.
+    """
+    get_name = ["get", "pod", pod, "-n", _DS_NAMESPACE]
+    get_name += ["-o", "jsonpath={.metadata.name}"]
+    # Best-effort probe: any failure to query counts as "not gone".
+    try:
+        _, stderr, exit_code = kubectl.RunKubectlCommand(
+            get_name, raise_on_failure=False, timeout=15
+        )
+        return exit_code != 0 and "not found" in (stderr or "").lower()
+    except Exception:  # pylint: disable=broad-except
+        return False
 
 
 def _pod_exec(
@@ -1299,375 +1381,544 @@ def _pod_exec(
     timeout: int = 300,
     _retries: int = 2,
 ) -> tuple[str, str]:
-  """Run a shell command inside the benchmark pod via kubectl exec.
-
-  Args:
-    pod: Pod name returned by _wait_for_benchmark_pod.
-    cmd: Shell command string passed to bash -c.
-    ignore_failure: When True, non-zero exit codes are logged but not
-      raised.
-    timeout: Seconds before PKB kills the kubectl exec process. Default
-      300 s matches PKB's IssueCommand default. Pass a larger value for
-      long-running jobs (fio, stress-ng, kernel build).
-    _retries: Number of automatic retries on transient GKE websocket
-      resets ("connection reset by peer").  Set to 0 to disable retries
-      for idempotent-sensitive commands.
-
-  Returns:
-    Tuple of (stdout, stderr) strings.
-  """
-  # Use module-level constants for error strings (defined at top of module).
-  # Use the globally-tracked active pod name — it may have been updated by
-  # a previous _recover_pod call when eviction replaced the pod.
-  active = _active_pod[0] if _active_pod else pod
-
-  for attempt in range(_retries + 1):
-    out, err, rc = kubectl.RunKubectlCommand(
-        ['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd],
-        raise_on_failure=False,
-        raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
-        timeout=timeout,
-    )
-    is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS)
-    if is_transient and attempt < _retries:
-      logging.warning(
-          '[swap_encryption] kubectl exec connection reset (attempt %d/%d); '
-          'retrying in 10 s',
-          attempt + 1,
-          _retries + 1,
-      )
-      time.sleep(10)
-      continue
-    # rc=137 (SIGKILL): the OOM killer terminated the container process.
-    # Two sub-cases:
-    #   A) Pod eviction: pod is gone, DaemonSet recreates it under a new name.
-    #   B) Container OOM restart: pod still exists, container restarts in place.
-    #      (DaemonSet restartPolicy=Always restarts the container, /tmp is lost,
-    #      tools must be re-installed before subsequent commands can run.)
-    # In both cases we call _recover_pod to wait for tools + sentinel, and
-    # we do NOT retry the OOM-triggering command itself.
-    if rc == 137:
-      # Record the OOM so the run-level gate can flag it even if the container
-      # restarts in place under the same pod name (which leaves both the
-      # "pod replaced" and "pod NotFound" checks silent).
-      if active not in _oom_events:
-        _oom_events.append(active)
-      # CRITICAL: sleep before checking pod state.  Kubernetes takes a few
-      # seconds to mark a just-evicted pod as Terminating / NotFound.  Without
-      # this delay _recover_pod sees the pod still in "Running" phase, returns
-      # the old pod name immediately, and every subsequent command fails with
-      # "Error from server (NotFound): pods … not found".
-      logging.warning(
-          '[swap_encryption] rc=137 — sleeping 15s for Kubernetes to update '
-          'pod state before recovery check'
-      )
-      time.sleep(15)
-      pod_gone = _is_pod_gone(active)
-      if pod_gone:
-        logging.warning(
-            '[swap_encryption] OOM-eviction detected (rc=137, pod gone) —'
-            ' recovering pod name for subsequent commands (not retrying this'
-            ' cmd)'
+    """Run a shell command inside the benchmark pod via kubectl exec.
+
+    Args:
+      pod: Pod name returned by _wait_for_benchmark_pod.
+      cmd: Shell command string passed to bash -c.
+      ignore_failure: When True, non-zero exit codes are logged but not
+        raised.
+      timeout: Seconds before PKB kills the kubectl exec process. Default
+        300 s matches PKB's IssueCommand default. Pass a larger value for
+        long-running jobs (fio, stress-ng, kernel build).
+      _retries: Number of automatic retries on transient GKE websocket
+        resets ("connection reset by peer").  Set to 0 to disable retries
+        for idempotent-sensitive commands.
+
+    Returns:
+      Tuple of (stdout, stderr) strings.
+    """
+    # Use module-level constants for error strings (defined at top of module).
+    # Use the globally-tracked active pod name — it may have been updated by
+    # a previous _recover_pod call when eviction replaced the pod.
+    active = _active_pod[0] if _active_pod else pod
+
+    for attempt in range(_retries + 1):
+        out, err, rc = kubectl.RunKubectlCommand(
+            ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd],
+            raise_on_failure=False,
+            raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
+            timeout=timeout,
         )
-      else:
-        logging.warning(
-            '[swap_encryption] Container OOM-killed (rc=137, pod still exists)'
-            ' — waiting for container restart and tool re-install before'
-            ' continuing'
+        is_transient = rc != 0 and any(
+            e in err for e in _TRANSIENT_KUBECTL_ERRORS
         )
-      new_pod = _recover_pod(active)
-      if new_pod != active:
-        logging.info(
-            '[swap_encryption] Pod name updated: %s → %s', active, new_pod
+        if is_transient and attempt < _retries:
+            logging.warning(
+                "[swap_encryption] kubectl exec connection reset (attempt"
+                " %d/%d); retrying in 10 s",
+                attempt + 1,
+                _retries + 1,
+            )
+            time.sleep(10)
+            continue
+        # rc=137 (SIGKILL): the OOM killer terminated the container process.
+        # Two sub-cases:
+        #   A) Pod eviction: pod is gone, DaemonSet recreates it under a new name.
+        #   B) Container OOM restart: pod still exists, container restarts in place.
+        #      (DaemonSet restartPolicy=Always restarts the container, /tmp is lost,
+        #      tools must be re-installed before subsequent commands can run.)
+        # In both cases we call _recover_pod to wait for tools + sentinel, and
+        # we do NOT retry the OOM-triggering command itself.
+        if rc == 137:
+            # Record the OOM so the run-level gate can flag it even if the container
+            # restarts in place under the same pod name (which leaves both the
+            # "pod replaced" and "pod NotFound" checks silent).
+            if active not in _oom_events:
+                _oom_events.append(active)
+            # CRITICAL: sleep before checking pod state.  Kubernetes takes a few
+            # seconds to mark a just-evicted pod as Terminating / NotFound.  Without
+            # this delay _recover_pod sees the pod still in "Running" phase, returns
+            # the old pod name immediately, and every subsequent command fails with
+            # "Error from server (NotFound): pods … not found".
+            logging.warning(
+                "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to"
+                " update pod state before recovery check"
+            )
+            time.sleep(15)
+            pod_gone = _is_pod_gone(active)
+            if pod_gone:
+                logging.warning(
+                    "[swap_encryption] OOM-eviction detected (rc=137, pod gone)"
+                    " — recovering pod name for subsequent commands (not"
+                    " retrying this cmd)"
+                )
+            else:
+                logging.warning(
+                    "[swap_encryption] Container OOM-killed (rc=137, pod still"
+                    " exists) — waiting for container restart and tool"
+                    " re-install before continuing"
+                )
+            new_pod = _recover_pod(active)
+            if new_pod != active:
+                logging.info(
+                    "[swap_encryption] Pod name updated: %s → %s",
+                    active,
+                    new_pod,
+                )
+                if _active_pod:
+                    _active_pod[0] = new_pod
+                active = new_pod
+            break  # Do NOT retry — the OOM cmd itself is not re-run on the new pod.
+
+        is_container_gone = rc != 0 and any(
+            e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
+        )
+        if is_container_gone:
+            # Record the loss for the run-level degradation gate REGARDLESS of retry
+            # budget or ignore_failure.  A "pods … not found" on a best-effort command
+            # (kernel build, opensearch, cleanup of a dead pod) still means the pod
+            # died; without this the gate stays blind because _active_pod is only
+            # renamed on the retry path below, which _retries=0 callers never reach.
+            if active and active not in _pod_lost:
+                _pod_lost.append(active)
+                logging.error(
+                    "[swap_encryption] Benchmark pod %s is gone (%s) —"
+                    " recording run as degraded",
+                    active,
+                    (err or "").strip()[:160],
+                )
+            if attempt < _retries:
+                logging.warning(
+                    "[swap_encryption] Container gone/restarting (attempt"
+                    " %d/%d) — waiting for pod to recover...",
+                    attempt + 1,
+                    _retries + 1,
+                )
+                new_pod = _recover_pod(active)
+                if new_pod != active:
+                    logging.info(
+                        "[swap_encryption] Pod name updated: %s → %s",
+                        active,
+                        new_pod,
+                    )
+                    if _active_pod:
+                        _active_pod[0] = new_pod
+                    active = new_pod
+                continue
+        break
+
+    if rc != 0 and not ignore_failure:
+        raise errors.VmUtil.IssueCommandError(
+            f"[swap_encryption] _pod_exec failed (rc={rc}): {err}"
         )
-        if _active_pod:
-          _active_pod[0] = new_pod
-        active = new_pod
-      break  # Do NOT retry — the OOM cmd itself is not re-run on the new pod.
+    return out, err
 
-    is_container_gone = rc != 0 and any(
-        e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
+
+def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
+    """Wait for a DaemonSet container to recover after OOM kill or eviction.
+
+    Handles two scenarios:
+    1. Container OOM restart: same pod name, container restarting in place.
+       DaemonSet restartPolicy=Always brings it back under the same pod name.
+    2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates
+       a new pod with a DIFFERENT name.  We detect this by checking whether
+       the named pod still exists; if not, we search by the DaemonSet label
+       selector for a Running pod.
+
+    Returns the (possibly new) pod name once it is Running and ready.
+    """
+    deadline = time.time() + timeout_sec
+    logging.info(
+        "[swap_encryption] Waiting for pod %s to recover (up to %ds)...",
+        pod,
+        timeout_sec,
     )
-    if is_container_gone:
-      # Record the loss for the run-level degradation gate REGARDLESS of retry
-      # budget or ignore_failure.  A "pods … not found" on a best-effort command
-      # (kernel build, opensearch, cleanup of a dead pod) still means the pod
-      # died; without this the gate stays blind because _active_pod is only
-      # renamed on the retry path below, which _retries=0 callers never reach.
-      if active and active not in _pod_lost:
-        _pod_lost.append(active)
-        logging.error(
-            '[swap_encryption] Benchmark pod %s is gone (%s) — recording run '
-            'as degraded',
-            active,
-            (err or '').strip()[:160],
+
+    # Phase 1: wait for a Running pod — either the named one (container
+    # restart) or a replacement pod found via label selector (eviction).
+    #
+    # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a
+    # single call.  When a pod is evicted, Kubernetes first sets deletionTimestamp
+    # (the pod is "Terminating") while status.phase may still read "Running" for
+    # several seconds.  Checking only status.phase causes a false-positive: we
+    # return the old pod name immediately and every subsequent command fails with
+    # "Error from server (NotFound)".  Checking deletionTimestamp catches this.
+    recovered_pod = pod
+    while time.time() < deadline:
+        # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not
+        # stdout.  When the pod is gone, status_out is empty and the error text
+        # lives entirely in status_err.  Discarding stderr (using _) means the
+        # 'not found' check below never fires and we spin until deadline.
+        status_out, status_err, status_rc = kubectl.RunKubectlCommand(
+            [
+                "get",
+                "pod",
+                pod,
+                "-n",
+                _DS_NAMESPACE,
+                "-o",
+                "jsonpath={.status.phase}|{.metadata.deletionTimestamp}",
+            ],
+            raise_on_failure=False,
+            timeout=30,
         )
-      if attempt < _retries:
-        logging.warning(
-            '[swap_encryption] Container gone/restarting (attempt %d/%d) — '
-            'waiting for pod to recover...',
-            attempt + 1,
-            _retries + 1,
+        # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating)
+        fields = status_out.strip().split("|")
+        phase = fields[0].strip() if fields else ""
+        is_terminating = len(fields) > 1 and bool(fields[1].strip())
+
+        # Pod is genuinely Running and NOT being deleted — recovery complete.
+        if status_rc == 0 and phase == "Running" and not is_terminating:
+            break
+
+        # Pod no longer exists, OR it exists but is being terminated (Terminating
+        # state or deletionTimestamp set) — look for a replacement pod by label.
+        pod_gone_or_terminating = (
+            status_rc != 0 and "not found" in (status_out + status_err).lower()
+        ) or is_terminating
+        if pod_gone_or_terminating:
+            label_out, _, label_rc = kubectl.RunKubectlCommand(
+                [
+                    "get",
+                    "pods",
+                    "-n",
+                    _DS_NAMESPACE,
+                    "-l",
+                    f"app={_DS_LABEL}",
+                    "-o",
+                    (
+                        'jsonpath={range .items[?(@.status.phase=="Running")]}'
+                        '{.metadata.name}{"\\n"}{end}'
+                    ),
+                ],
+                raise_on_failure=False,
+                timeout=30,
+            )
+            new_pods = [
+                p.strip()
+                for p in label_out.strip().splitlines()
+                if p.strip() and p.strip() != pod
+            ]  # exclude the dying pod
+            if label_rc == 0 and new_pods:
+                recovered_pod = new_pods[0]
+                logging.info(
+                    "[swap_encryption] Original pod %s gone/terminating; "
+                    "found replacement %s",
+                    pod,
+                    recovered_pod,
+                )
+                break
+
+        time.sleep(10)
+    else:
+        raise errors.VmUtil.IssueCommandError(
+            f"[swap_encryption] No Running pod found (original: {pod}) "
+            f"within {timeout_sec}s after OOM kill / eviction"
+        )
+
+    # Phase 2: wait for init script to finish (sentinel written last).
+    while time.time() < deadline:
+        ready_out, _, ready_rc = kubectl.RunKubectlCommand(
+            [
+                "exec",
+                recovered_pod,
+                "-n",
+                _DS_NAMESPACE,
+                "--",
+                "bash",
+                "-c",
+                "test -f /tmp/pkb_ready && echo READY",
+            ],
+            raise_on_failure=False,
+            timeout=30,
         )
-        new_pod = _recover_pod(active)
-        if new_pod != active:
-          logging.info(
-              '[swap_encryption] Pod name updated: %s → %s', active, new_pod
-          )
-          if _active_pod:
-            _active_pod[0] = new_pod
-          active = new_pod
-        continue
-    break
-
-  if rc != 0 and not ignore_failure:
+        if ready_rc == 0 and "READY" in ready_out:
+            logging.info(
+                "[swap_encryption] Pod %s recovered (swap device active)",
+                recovered_pod,
+            )
+            return recovered_pod
+        time.sleep(15)
+
     raise errors.VmUtil.IssueCommandError(
-        f'[swap_encryption] _pod_exec failed (rc={rc}): {err}'
+        f"[swap_encryption] Pod {recovered_pod} did not become ready "
+        f"within {timeout_sec}s after OOM kill / eviction"
     )
-  return out, err
 
 
-def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
-  """Wait for a DaemonSet container to recover after OOM kill or eviction.
-
-  Handles two scenarios:
-  1. Container OOM restart: same pod name, container restarting in place.
-     DaemonSet restartPolicy=Always brings it back under the same pod name.
-  2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates
-     a new pod with a DIFFERENT name.  We detect this by checking whether
-     the named pod still exists; if not, we search by the DaemonSet label
-     selector for a Running pod.
-
-  Returns the (possibly new) pod name once it is Running and ready.
-  """
-  deadline = time.time() + timeout_sec
-  logging.info(
-      '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
-      pod,
-      timeout_sec,
-  )
-
-  # Phase 1: wait for a Running pod — either the named one (container
-  # restart) or a replacement pod found via label selector (eviction).
-  #
-  # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a
-  # single call.  When a pod is evicted, Kubernetes first sets deletionTimestamp
-  # (the pod is "Terminating") while status.phase may still read "Running" for
-  # several seconds.  Checking only status.phase causes a false-positive: we
-  # return the old pod name immediately and every subsequent command fails with
-  # "Error from server (NotFound)".  Checking deletionTimestamp catches this.
-  recovered_pod = pod
-  while time.time() < deadline:
-    # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not
-    # stdout.  When the pod is gone, status_out is empty and the error text
-    # lives entirely in status_err.  Discarding stderr (using _) means the
-    # 'not found' check below never fires and we spin until deadline.
-    status_out, status_err, status_rc = kubectl.RunKubectlCommand(
-        [
-            'get',
-            'pod',
-            pod,
-            '-n',
-            _DS_NAMESPACE,
-            '-o',
-            'jsonpath={.status.phase}|{.metadata.deletionTimestamp}',
-        ],
-        raise_on_failure=False,
+def _run_phase1_fio(
+    pod: str, swap_dev: str, base_meta: dict[str, Any]
+) -> list[sample.Sample]:
+    """Run fio microbenchmarks on the raw swap block device (Phase 1).
+
+    Calls swapoff before running fio so measurements reflect the raw
+    hardware + encryption ceiling with no swap-daemon overhead.  Re-enables
+    swap unconditionally after all jobs complete.
+
+    Jobs:
+      4k_randread   iodepth=32  → random read IOPS
+      4k_randwrite  iodepth=32  → random write IOPS
+      1m_seqread    iodepth=8   → sequential read bandwidth
+      1m_seqwrite   iodepth=8   → sequential write bandwidth
+      4k_lat_read   iodepth=1   → completion latency floor (read)
+
+    Args:
+      pod: Benchmark pod name.
+      swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted.
+      base_meta: Shared metadata dict from _build_metadata().
+
+    Returns:
+      List of Sample objects with IOPS, bandwidth and latency metrics.
+    """
+    samples: list[sample.Sample] = []
+
+    # swapoff before fio — running fio with --direct=1 on an active swap
+    # device races with kernel page-reclaim on the same dm-crypt target
+    # and can cause kernel panics on some kernels.
+    logging.info("[swap_encryption] Phase 1: swapoff %s", swap_dev)
+    _pod_exec(
+        pod,
+        f"swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true",
         timeout=30,
+        ignore_failure=True,
     )
-    # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating)
-    fields = status_out.strip().split('|')
-    phase = fields[0].strip() if fields else ''
-    is_terminating = len(fields) > 1 and bool(fields[1].strip())
-
-    # Pod is genuinely Running and NOT being deleted — recovery complete.
-    if status_rc == 0 and phase == 'Running' and not is_terminating:
-      break
-
-    # Pod no longer exists, OR it exists but is being terminated (Terminating
-    # state or deletionTimestamp set) — look for a replacement pod by label.
-    pod_gone_or_terminating = (
-        status_rc != 0 and 'not found' in (status_out + status_err).lower()
-    ) or is_terminating
-    if pod_gone_or_terminating:
-      label_out, _, label_rc = kubectl.RunKubectlCommand(
-          [
-              'get',
-              'pods',
-              '-n',
-              _DS_NAMESPACE,
-              '-l',
-              f'app={_DS_LABEL}',
-              '-o',
-              (
-                  'jsonpath={range .items[?(@.status.phase=="Running")]}'
-                  '{.metadata.name}{"\\n"}{end}'
-              ),
-          ],
-          raise_on_failure=False,
-          timeout=30,
-      )
-      new_pods = [
-          p.strip()
-          for p in label_out.strip().splitlines()
-          if p.strip() and p.strip() != pod
-      ]  # exclude the dying pod
-      if label_rc == 0 and new_pods:
-        recovered_pod = new_pods[0]
-        logging.info(
-            '[swap_encryption] Original pod %s gone/terminating; '
-            'found replacement %s',
+
+    # (name, rw_mode, block_size, iodepth)
+    fio_jobs = [
+        ("4k_randread", "randread", "4k", 32),
+        ("4k_randwrite", "randwrite", "4k", 32),
+        ("1m_seqread", "read", "1m", 8),
+        ("1m_seqwrite", "write", "1m", 8),
+        ("4k_lat_read", "randread", "4k", 1),
+    ]
+
+    runtime = _FIO_RUNTIME_SEC.value
+    try:
+        for name, rw, bs, iodepth in fio_jobs:
+            cmd = (
+                f"fio --name={name} --filename={swap_dev}"
+                f" --rw={rw} --bs={bs} --iodepth={iodepth}"
+                " --ioengine=libaio --direct=1"
+                f" --runtime={runtime} --time_based --group_reporting"
+                " --output-format=json 2>/dev/null"
+            )
+            logging.info("[swap_encryption] Phase 1: fio job %s", name)
+            out, _ = _pod_exec(pod, cmd, timeout=runtime + 120)
+            samples += _parse_fio_json(out, name, base_meta)
+    finally:
+        # Always re-enable swap so subsequent phases can drive swap I/O.
+        logging.info("[swap_encryption] Phase 1: swapon %s", swap_dev)
+        _pod_exec(
             pod,
-            recovered_pod,
+            f"swapon {swap_dev} 2>/dev/null || true",
+            timeout=30,
+            ignore_failure=True,
         )
-        break
 
-    time.sleep(10)
-  else:
-    raise errors.VmUtil.IssueCommandError(
-        f'[swap_encryption] No Running pod found (original: {pod}) '
-        f'within {timeout_sec}s after OOM kill / eviction'
+    logging.info(
+        "[swap_encryption] Phase 1 complete (%d samples)", len(samples)
     )
+    return samples
 
-  # Phase 2: wait for init script to finish (sentinel written last).
-  while time.time() < deadline:
-    ready_out, _, ready_rc = kubectl.RunKubectlCommand(
-        [
-            'exec',
-            recovered_pod,
-            '-n',
-            _DS_NAMESPACE,
-            '--',
-            'bash',
-            '-c',
-            'test -f /tmp/pkb_ready && echo READY',
-        ],
-        raise_on_failure=False,
-        timeout=30,
-    )
-    if ready_rc == 0 and 'READY' in ready_out:
-      logging.info(
-          '[swap_encryption] Pod %s recovered and ready', recovered_pod
-      )
-      return recovered_pod
-    time.sleep(15)
 
-  raise errors.VmUtil.IssueCommandError(
-      f'[swap_encryption] Pod {recovered_pod} did not become ready '
-      f'within {timeout_sec}s after OOM kill / eviction'
-  )
+def _parse_fio_json(
+    fio_output: str, job_name: str, base_meta: dict[str, Any]
+) -> list[sample.Sample]:
+    """Parse fio --output-format=json output into PKB Sample objects.
+
+    Extracts per-direction (read/write) IOPS, bandwidth (MB/s) and completion
+    latency (mean + p50/p99/p999 percentiles).
+
+    Args:
+      fio_output: Raw stdout from fio with --output-format=json.
+      job_name: Short identifier embedded in metric names, e.g. '4k_randread'.
+      base_meta: Shared metadata dict copied into each sample.
+
+    Returns:
+      List of Sample objects; empty if output cannot be parsed or is zero.
+    """
+    # fio sometimes emits kernel warnings before the JSON object.
+    json_start = fio_output.find("{")
+    if json_start == -1:
+        logging.warning(
+            "[swap_encryption] Phase 1: no JSON in fio output for %s", job_name
+        )
+        return []
+
+    try:
+        data = json.loads(fio_output[json_start:])
+    except json.JSONDecodeError as e:
+        logging.warning(
+            "[swap_encryption] Phase 1: fio JSON parse error (%s): %s",
+            job_name,
+            e,
+        )
+        return []
+
+    jobs = data.get("jobs", [])
+    if not jobs:
+        return []
+
+    job = jobs[0]
+    samples: list[sample.Sample] = []
+    meta = dict(base_meta, fio_job=job_name)
+
+    for direction in ("read", "write"):
+        d = job.get(direction, {})
+        iops = float(d.get("iops", 0))
+        bw_kbps = float(d.get("bw", 0))  # fio reports KiB/s
+        bw_mbps = bw_kbps / 1024.0
+
+        # Skip directions with near-zero throughput (e.g. write on a randread job).
+        if iops < 1 and bw_kbps < 1:
+            continue
+
+        prefix = f"phase1_fio_{job_name}_{direction}"
+        samples.append(sample.Sample(f"{prefix}_iops", iops, "IOPS", meta))
+        samples.append(
+            sample.Sample(f"{prefix}_bw_mbps", bw_mbps, "MB/s", meta)
+        )
+
+        # Completion latency — fio reports nanoseconds; emit microseconds.
+        clat = d.get("clat_ns", d.get("lat_ns", {}))
+        lat_mean_ns = float(clat.get("mean", 0))
+        if lat_mean_ns > 0:
+            samples.append(
+                sample.Sample(
+                    f"{prefix}_lat_mean_us", lat_mean_ns / 1000.0, "us", meta
+                )
+            )
+            for pct_key, label in (
+                ("50.000000", "p50"),
+                ("99.000000", "p99"),
+                ("99.900000", "p999"),
+            ):
+                val_ns = clat.get("percentile", {}).get(pct_key, 0)
+                if val_ns:
+                    samples.append(
+                        sample.Sample(
+                            f"{prefix}_lat_{label}_us",
+                            val_ns / 1000.0,
+                            "us",
+                            meta,
+                        )
+                    )
+
+    return samples
 
 
 _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
     # GCP  (on-demand, us-central1 unless noted)
-    'c4-standard-8-lssd': 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
-    'c4-standard-8': 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
-    'n4-highmem-32': 3.0256,  # 32 vCPU, 256 GB RAM
-    'n2-highmem-32': 2.5216,  # 32 vCPU, 256 GB RAM
-    'n2-standard-32': 1.5264,  # 32 vCPU, 120 GB RAM
-    'z3-highmem-8': 2.7248,  # 8 vCPU + 4× LSSD
+    "c4-standard-8-lssd": 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
+    "c4-standard-8": 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
+    "n4-highmem-32": 3.0256,  # 32 vCPU, 256 GB RAM
+    "n2-highmem-32": 2.5216,  # 32 vCPU, 256 GB RAM
+    "n2-standard-32": 1.5264,  # 32 vCPU, 120 GB RAM
+    "z3-highmem-8": 2.7248,  # 8 vCPU + 4× LSSD
     # AWS
-    'i4i.4xlarge': 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
-    'i4i.2xlarge': 0.7480,
-    'm6id.4xlarge': 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
-    'm6i.4xlarge': 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
-    'r6i.4xlarge': 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
+    "i4i.4xlarge": 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
+    "i4i.2xlarge": 0.7480,
+    "m6id.4xlarge": 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
+    "m6i.4xlarge": 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
+    "r6i.4xlarge": 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
 }
 
 
 def _collect_cost_sample(
     pod: str, elapsed_sec: float, base_meta: dict
 ) -> list[sample.Sample]:
-  """Emit a cost_estimate_usd sample for the benchmark run (gap 7).
-
-  Instance type is read from cloud metadata inside the pod.  Price is looked
-  up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and
-  a warning is logged.
-
-  Args:
-    pod: Benchmark pod name.
-    elapsed_sec: Wall-clock seconds the benchmark phases took.
-    base_meta: Shared metadata dict.
-
-  Returns:
-    A list of zero or one sample.Sample.
-  """
-  # Detect instance type from cloud metadata
-  instance_type = ''
-
-  # GCP: machine type is the last segment of the metadata URL value
-  gcp_type_out, _ = _pod_exec(
-      pod,
-      'curl -s -m 3 --fail'
-      ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
-      ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-      ignore_failure=True,
-  )
-  if gcp_type_out.strip():
-    instance_type = gcp_type_out.strip().split('/')[-1]
-
-  if not instance_type:
-    # AWS: instance-type is a plain string
-    aws_type_out, _ = _pod_exec(
+    """Emit a cost_estimate_usd sample for the benchmark run (gap 7).
+
+    Instance type is read from cloud metadata inside the pod.  Price is looked
+    up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and
+    a warning is logged.
+
+    Args:
+      pod: Benchmark pod name.
+      elapsed_sec: Wall-clock seconds the benchmark phases took.
+      base_meta: Shared metadata dict.
+
+    Returns:
+      A list of zero or one sample.Sample.
+    """
+    # Detect instance type from cloud metadata
+    instance_type = ""
+
+    # GCP: machine type is the last segment of the metadata URL value
+    gcp_type_out, _ = _pod_exec(
         pod,
-        'curl -s -m 3 --fail '
-        'http://169.254.169.254/latest/meta-data/instance-type '
-        '2>/dev/null || echo ""',
+        "curl -s -m 3 --fail"
+        " http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
+        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
         ignore_failure=True,
     )
-    instance_type = aws_type_out.strip()
-
-  # Allow explicit override (useful when running on custom/renamed machine
-  # types or when the pod was unavailable during cost collection).
-  if _INSTANCE_SIZE_LABEL.value:
-    instance_type = _INSTANCE_SIZE_LABEL.value
-
-  # Last resort: fall back to the benchmark machine type flag.  This ensures
-  # cost tracking works even when the pod was evicted before cost collection
-  # ran (in which case the metadata curl above returned empty).
-  if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
-    instance_type = _BENCHMARK_MACHINE_TYPE.value
-    logging.info(
-        '[swap_encryption] Instance type from metadata unavailable; '
-        'using --swap_encryption_benchmark_machine_type=%s for cost tracking',
-        instance_type,
-    )
+    if gcp_type_out.strip():
+        instance_type = gcp_type_out.strip().split("/")[-1]
 
-  price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
-  if price is None:
-    logging.warning(
-        '[swap_encryption] Unknown instance type "%s" – skipping cost sample. '
-        'Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost tracking.',
-        instance_type,
-    )
-    return []
+    if not instance_type:
+        # AWS: instance-type is a plain string
+        aws_type_out, _ = _pod_exec(
+            pod,
+            "curl -s -m 3 --fail "
+            "http://169.254.169.254/latest/meta-data/instance-type "
+            '2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        instance_type = aws_type_out.strip()
+
+    # Allow explicit override (useful when running on custom/renamed machine
+    # types or when the pod was unavailable during cost collection).
+    if _INSTANCE_SIZE_LABEL.value:
+        instance_type = _INSTANCE_SIZE_LABEL.value
+
+    # Last resort: fall back to the benchmark machine type flag.  This ensures
+    # cost tracking works even when the pod was evicted before cost collection
+    # ran (in which case the metadata curl above returned empty).
+    if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
+        instance_type = _BENCHMARK_MACHINE_TYPE.value
+        logging.info(
+            "[swap_encryption] Instance type from metadata unavailable; using"
+            " --swap_encryption_benchmark_machine_type=%s for cost tracking",
+            instance_type,
+        )
 
-  hours = elapsed_sec / 3600.0
-  cost = hours * price
-  meta = dict(
-      base_meta,
-      instance_type=instance_type,
-      price_usd_per_hr=price,
-      benchmark_elapsed_sec=round(elapsed_sec, 1),
-  )
-  return [sample.Sample('cost_estimate_usd', cost, 'USD', meta)]
+    price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
+    if price is None:
+        logging.warning(
+            '[swap_encryption] Unknown instance type "%s" – skipping cost'
+            " sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost"
+            " tracking.",
+            instance_type,
+        )
+        return []
+
+    hours = elapsed_sec / 3600.0
+    cost = hours * price
+    meta = dict(
+        base_meta,
+        instance_type=instance_type,
+        price_usd_per_hr=price,
+        benchmark_elapsed_sec=round(elapsed_sec, 1),
+    )
+    return [sample.Sample("cost_estimate_usd", cost, "USD", meta)]
 
 
 def _detect_swap_device(pod: str) -> str:
-  """Return the active swap device path on the cluster node."""
-  if _SWAP_DEVICE.value:
-    return _SWAP_DEVICE.value
-
-  # /proc/swaps is the source of truth: it lists the swap device that is
-  # ACTUALLY active.  We must NOT just `test -e /dev/mapper/swap_encrypted`,
-  # because a stale dm-crypt mapping from a previous run on a reused node can
-  # still exist as a /dev node while being non-functional (fio/swapoff then
-  # fail with "No such device or address").  So read the active device from
-  # /proc/swaps first; only fall back to the mapper path if /proc/swaps is
-  # somehow empty but the mapper is genuinely present.
-  dm_out, _ = _pod_exec(
-      pod,
-      textwrap.dedent("""
+    """Return the active swap device path on the cluster node."""
+    if _SWAP_DEVICE.value:
+        return _SWAP_DEVICE.value
+
+    # /proc/swaps is the source of truth: it lists the swap device that is
+    # ACTUALLY active.  We must NOT just `test -e /dev/mapper/swap_encrypted`,
+    # because a stale dm-crypt mapping from a previous run on a reused node can
+    # still exist as a /dev node while being non-functional (fio/swapoff then
+    # fail with "No such device or address").  So read the active device from
+    # /proc/swaps first; only fall back to the mapper path if /proc/swaps is
+    # somehow empty but the mapper is genuinely present.
+    dm_out, _ = _pod_exec(
+        pod,
+        textwrap.dedent("""
         ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
         if [ -n "$ACTIVE" ]
         then
@@ -1677,138 +1928,140 @@ def _detect_swap_device(pod: str) -> str:
           echo /dev/mapper/swap_encrypted
         fi
       """),
-      ignore_failure=True,
-  )
-  dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ''
-  if dev:
-    return dev
-  raise ValueError(
-      'No active swap device found in the benchmark pod. '
-      'Use --swap_encryption_device to specify one.'
-  )
+        ignore_failure=True,
+    )
+    dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ""
+    if dev:
+        return dev
+    raise ValueError(
+        "No active swap device found in the benchmark pod. "
+        "Use --swap_encryption_device to specify one."
+    )
 
 
 def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
-  """Collect node environment, encryption type, and config into a dict."""
-
-  kernel_out, _ = _pod_exec(pod, 'uname -r', ignore_failure=True)
-  mem_out, _ = _pod_exec(
-      pod,
-      "awk '/MemTotal/{print $2}' /proc/meminfo",
-      ignore_failure=True,
-  )
-  swap_out, _ = _pod_exec(
-      pod,
-      "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
-      ignore_failure=True,
-  )
-
-  try:
-    mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
-  except ValueError:
-    mem_gb = 0
-  try:
-    swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
-  except ValueError:
-    swap_gb = 0
-
-  # Encryption type — key off dm-crypt presence + the swap target, NOT the
-  # device path.  A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro-
-  # encrypted; only the AWS targets (instance_store / io2) are.
-  enc = 'unknown'
-  if '/dev/mapper/' in swap_dev:
-    table_out, _ = _pod_exec(
-        pod,
-        f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
-        ignore_failure=True,
-    )
-    enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
-  elif _SWAP_TYPE.value in ('instance_store', 'io2'):
-    enc = 'nitro_hardware_offload'  # AWS: encrypted by the Nitro card
-  elif not _ENABLE_DMCRYPT.value:
-    enc = 'none'  # GKE plain swap (encryption OFF)
-
-  cloud = _detect_cloud(pod)
-
-  # Gap 6: instance size label for multi-size comparison runs.
-  # If the flag is set use it directly; otherwise try to read it from
-  # cloud metadata so that the field is always populated.
-  instance_label = _INSTANCE_SIZE_LABEL.value
-  if not instance_label:
-    gcp_type_out, _ = _pod_exec(
+    """Collect node environment, encryption type, and config into a dict."""
+
+    kernel_out, _ = _pod_exec(pod, "uname -r", ignore_failure=True)
+    mem_out, _ = _pod_exec(
         pod,
-        'curl -s -m 3 --fail'
-        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
-        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        "awk '/MemTotal/{print $2}' /proc/meminfo",
         ignore_failure=True,
     )
-    if gcp_type_out.strip():
-      instance_label = gcp_type_out.strip().split('/')[-1]
-  if not instance_label:
-    aws_type_out, _ = _pod_exec(
+    swap_out, _ = _pod_exec(
         pod,
-        'curl -s -m 3 --fail '
-        'http://169.254.169.254/latest/meta-data/instance-type '
-        '2>/dev/null || echo ""',
+        "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
         ignore_failure=True,
     )
-    instance_label = aws_type_out.strip()
-
-  return {
-      'benchmark': BENCHMARK_NAME,
-      'execution_mode': 'kubernetes_privileged_pod',
-      'cloud': cloud,
-      'instance_size': instance_label,
-      'kernel_version': kernel_out.strip(),
-      'host_memory_gb': mem_gb,
-      'swap_device': swap_dev,
-      'swap_size_gb': swap_gb,
-      'swap_encryption': enc,
-      # Test-matrix columns: storage target, encryption on/off, image, IOPS
-      'storage_target': _SWAP_TYPE.value,
-      'boot_disk_type': _BOOT_DISK_TYPE.value,
-      'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
-      'node_image_type': _NODE_IMAGE_TYPE.value,
-      'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
-      'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
-      # Other config
-      'zswap_enabled': _ENABLE_ZSWAP.value,
-      'min_free_kbytes': _MIN_FREE_KBYTES.value,
-      'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
-      # Requested config value only.  The *effective* stress-ng footprint may
-      # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the
-      # actual value it ran with as 'stress_vm_bytes' so the two never conflict.
-      'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
-      'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
-      'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
-      'nodepool': _NODEPOOL.value,
-  }
+
+    try:
+        mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        mem_gb = 0
+    try:
+        swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        swap_gb = 0
+
+    # Encryption type — key off dm-crypt presence + the swap target, NOT the
+    # device path.  A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro-
+    # encrypted; only the AWS targets (instance_store / io2) are.
+    enc = "unknown"
+    if "/dev/mapper/" in swap_dev:
+        table_out, _ = _pod_exec(
+            pod,
+            f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        enc = "dm-crypt-plain" if "crypt" in table_out.lower() else "dm-other"
+    elif _SWAP_TYPE.value in ("instance_store", "io2"):
+        enc = "nitro_hardware_offload"  # AWS: encrypted by the Nitro card
+    elif not _ENABLE_DMCRYPT.value:
+        enc = "none"  # GKE plain swap (encryption OFF)
+
+    cloud = _detect_cloud(pod)
+
+    # Gap 6: instance size label for multi-size comparison runs.
+    # If the flag is set use it directly; otherwise try to read it from
+    # cloud metadata so that the field is always populated.
+    instance_label = _INSTANCE_SIZE_LABEL.value
+    if not instance_label:
+        gcp_type_out, _ = _pod_exec(
+            pod,
+            "curl -s -m 3 --fail"
+            " http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
+            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        if gcp_type_out.strip():
+            instance_label = gcp_type_out.strip().split("/")[-1]
+    if not instance_label:
+        aws_type_out, _ = _pod_exec(
+            pod,
+            "curl -s -m 3 --fail "
+            "http://169.254.169.254/latest/meta-data/instance-type "
+            '2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        instance_label = aws_type_out.strip()
+
+    return {
+        "benchmark": BENCHMARK_NAME,
+        "execution_mode": "kubernetes_privileged_pod",
+        "cloud": cloud,
+        "instance_size": instance_label,
+        "kernel_version": kernel_out.strip(),
+        "host_memory_gb": mem_gb,
+        "swap_device": swap_dev,
+        "swap_size_gb": swap_gb,
+        "swap_encryption": enc,
+        # Test-matrix columns: storage target, encryption on/off, image, IOPS
+        "storage_target": _SWAP_TYPE.value,
+        "boot_disk_type": _BOOT_DISK_TYPE.value,
+        "dmcrypt_enabled": _ENABLE_DMCRYPT.value,
+        "node_image_type": _NODE_IMAGE_TYPE.value,
+        "boot_disk_iops_target": _BOOT_DISK_IOPS.value,
+        "benchmark_machine_type": _BENCHMARK_MACHINE_TYPE.value,
+        # Other config
+        "zswap_enabled": _ENABLE_ZSWAP.value,
+        "min_free_kbytes": _MIN_FREE_KBYTES.value,
+        "fio_runtime_sec": _FIO_RUNTIME_SEC.value,
+        # Requested config value only.  The *effective* stress-ng footprint may
+        # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the
+        # actual value it ran with as 'stress_vm_bytes' so the two never conflict.
+        "stress_vm_bytes_requested": _STRESS_VM_BYTES.value,
+        "stress_vm_bytes_list": _STRESS_VM_BYTES_LIST.value,
+        "stress_timeout_sec": _STRESS_TIMEOUT_SEC.value,
+        "nodepool": _NODEPOOL.value,
+    }
 
 
 def _detect_cloud(pod: str) -> str:
-  """Detect whether the benchmark pod is running on GCP or AWS.
-
-  Queries the cloud instance metadata endpoint inside the pod.  Returns
-  'GCP' if the GCP metadata server responds, 'AWS' otherwise.
-  """
-  gcp_out, _ = _pod_exec(
-      pod,
-      'curl -s -m 2 --fail '
-      'http://metadata.google.internal/computeMetadata/v1/project/project-id'
-      ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-      ignore_failure=True,
-  )
-  if gcp_out.strip():
-    return 'GCP'
-  return 'AWS'
+    """Detect whether the benchmark pod is running on GCP or AWS.
+
+    Queries the cloud instance metadata endpoint inside the pod.  Returns
+    'GCP' if the GCP metadata server responds, 'AWS' otherwise.
+    """
+    gcp_out, _ = _pod_exec(
+        pod,
+        "curl -s -m 2 --fail "
+        "http://metadata.google.internal/computeMetadata/v1/project/project-id"
+        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if gcp_out.strip():
+        return "GCP"
+    return "AWS"
 
 
 def _ensure_io2_volume() -> None:
-  """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2).
-
-  Only executed when --swap_encryption_swap_type=io2.  Full implementation
-  is deferred to PR2 (swap-capability layer).
-  """
-  if _SWAP_TYPE.value != 'io2':
-    return
-  logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')
+    """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2).
+
+    Only executed when --swap_encryption_swap_type=io2.  Full implementation
+    is deferred to PR2 (swap-capability layer).
+    """
+    if _SWAP_TYPE.value != "io2":
+        return
+    logging.info(
+        "[swap_encryption] io2 swap volume provisioning deferred to PR2"
+    )

From b8b4300db55afaf1e6b3ad37a73847681ab68faa Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Thu, 25 Jun 2026 20:28:04 +0530
Subject: [PATCH 08/17] fix(swap_encryption): lean DaemonSet + Phase 1 fio
 microbenchmarks

---
 .../swap_encryption_benchmark.py              | 121 +++++++-----------
 1 file changed, 48 insertions(+), 73 deletions(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index e596abf963..e30854e188 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -97,7 +97,7 @@
     GKE vs. EKS swap encryption and LSSD performance comparison.
     Two-step nodepool setup: PKB provisions a minimal cluster with a cheap
     default nodepool (Step 1), then Prepare() adds the real benchmark
-    nodepool (n4-highmem-32 / c4-*-lssd, COS_CONTAINERD, 80k IOPS) with a
+    nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a
     node-level startup script that configures dm-crypt swap before any pod
     is scheduled, then removes the default nodepool (Step 2).  All benchmark
     phases run inside a privileged DaemonSet pinned to the benchmark nodepool.
@@ -286,17 +286,6 @@
     "(unencrypted) swap overhead as a baseline.",
 )
 
-_GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string(
-    "swap_encryption_gke_kubelet_memory_swap",
-    "LimitedSwap",
-    "Value for kubeletConfig.memorySwapBehavior injected via "
-    "--system-config-from-file when creating the GKE benchmark nodepool.  "
-    "LimitedSwap (default) — the kubelet allows pods to use swap up to their "
-    "memory limit; required for the DaemonSet pod to drive kernel swapping.  "
-    "NoSwap — disables swap at the kubelet level (use for a baseline run that "
-    "confirms zero swap activity).  Set empty string to omit the flag entirely "
-    "and rely on the cluster-level default.",
-)
 
 _SWAP_DEVICE = flags.DEFINE_string(
     "swap_encryption_device",
@@ -412,7 +401,7 @@ def Prepare(spec: _BenchmarkSpec) -> None:
 
     Step 2 (this function):
       a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with
-         COS_CONTAINERD, 80 000 IOPS, and a node startup script that configures
+         UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures
          dm-crypt swap at the OS level — before any pod is scheduled.
       b. Delete the dummy default nodepool to stop its cost immediately.
       c. Deploy the privileged DaemonSet (pinned via nodeSelector to the
@@ -683,9 +672,9 @@ def _configure_eks_kubelet_swap(spec) -> None:
             memorySwapBehavior: LimitedSwap
             failSwapOn: false
 
-    GKE equivalent: linuxConfig.swapConfig + kubeletConfig.memorySwapBehavior
-    via --system-config-from-file, already implemented in
-    _create_benchmark_node_pool.
+    GKE equivalent: linuxConfig.swapConfig via --system-config-from-file
+    (swapConfig automatically enables memorySwapBehavior=LimitedSwap),
+    already implemented in _create_benchmark_node_pool.
 
     See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780
     """
@@ -912,7 +901,7 @@ def _create_benchmark_node_pool(cluster) -> None:
 
     Uses:
       --swap_encryption_benchmark_machine_type  (default n4-highmem-32)
-      --swap_encryption_node_image_type         (default COS_CONTAINERD)
+      --swap_encryption_node_image_type         (default UBUNTU_CONTAINERD)
       --swap_encryption_boot_disk_iops          (default 80000)
       --swap_encryption_enable_dmcrypt          (default True)
 
@@ -975,67 +964,54 @@ def _create_benchmark_node_pool(cluster) -> None:
     if is_lssd:
         cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}"
 
-    # ── GKE kubelet swap config ───────────────────────────────────────────────
-    # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
-    # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
-    # so that the kubelet allocates swap to the DaemonSet pod.  Without this flag
-    # the Linux kernel swap device may exist but the kubelet blocks pod-level
-    # swap usage and the benchmark pod cannot drive swap I/O.
-    #
-    # Passed as --system-config-from-file pointing to a temp YAML, which is the
-    # same mechanism PKB's gke_node_system_config flag uses:
-    #   perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
-    swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
+    # ── GKE swap system-config ───────────────────────────────────────────────
+    # Pass linuxConfig.swapConfig + linuxConfig.sysctl via --system-config-from-file.
+    # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
+    #   linuxConfig.swapConfig: GKE enables node-level swap device and
+    #     automatically sets kubeletConfig.memorySwapBehavior=LimitedSwap.
+    #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use the
+    #     local NVMe as the swap device (avoids boot-disk overhead).
+    #   linuxConfig.sysctl: swap aggressiveness tuning so benchmark workloads
+    #     can drive sustained swap I/O.
+    # Reference:
+    #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
+    #   node-memory-swap#enable
     system_config_tmp = None
-    if swap_behavior:
-        # Build system-config YAML for --system-config-from-file.
-        # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
-        #   kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods.
-        #   linuxConfig.swapConfig: GKE enables node-level swap device.
-        #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use
-        #     the local NVMe as the swap device (avoids boot-disk overhead).
-        #   linuxConfig.sysctl: swap aggressiveness tuning so the benchmark
-        #     workloads can drive sustained swap I/O.
-        # Reference:
-        #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
-        #   node-memory-swap#enable
-        if is_lssd:
-            swap_config_block = (
-                "  swapConfig:\n"
-                "    enabled: true\n"
-                "    dedicatedLocalSsdProfile:\n"
-                f"      diskCount: {_LSSD_COUNT.value}\n"
-            )
-        else:
-            swap_config_block = "  swapConfig:\n    enabled: true\n"
-        kubelet_yaml = (
-            "kubeletConfig:\n  memorySwapBehavior:"
-            f" {swap_behavior}\nlinuxConfig:\n"
-            + swap_config_block
-            + "  sysctl:\n"
-            "    vm.min_free_kbytes: 200\n"
-            "    vm.watermark_scale_factor: 500\n"
-            "    vm.swappiness: 100\n"
-        )
-        system_config_tmp = tempfile.NamedTemporaryFile(
-            mode="w", suffix=".yaml", delete=False
-        )
-        system_config_tmp.write(kubelet_yaml)
-        system_config_tmp.flush()
-        cmd.flags["system-config-from-file"] = system_config_tmp.name
-        logging.info(
-            "[swap_encryption] system-config-from-file: "
-            "kubelet_swap=%s lssd=%s (written to %s):\n%s",
-            swap_behavior,
-            is_lssd,
-            system_config_tmp.name,
-            kubelet_yaml,
+    if is_lssd:
+        swap_config_block = (
+            "  swapConfig:\n"
+            "    enabled: true\n"
+            "    dedicatedLocalSsdProfile:\n"
+            f"      diskCount: {_LSSD_COUNT.value}\n"
         )
+    else:
+        swap_config_block = "  swapConfig:\n    enabled: true\n"
+    swap_config_yaml = (
+        "linuxConfig:\n"
+        + swap_config_block
+        + "  sysctl:\n"
+        "    vm.min_free_kbytes: 200\n"
+        "    vm.watermark_scale_factor: 500\n"
+        "    vm.swappiness: 100\n"
+    )
+    system_config_tmp = tempfile.NamedTemporaryFile(
+        mode="w", suffix=".yaml", delete=False
+    )
+    system_config_tmp.write(swap_config_yaml)
+    system_config_tmp.flush()
+    cmd.flags["system-config-from-file"] = system_config_tmp.name
+    logging.info(
+        "[swap_encryption] system-config-from-file: "
+        "lssd=%s (written to %s):\n%s",
+        is_lssd,
+        system_config_tmp.name,
+        swap_config_yaml,
+    )
 
     logging.info(
         "[swap_encryption] Creating benchmark nodepool: %s / %s / "
         "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / "
-        "add_swap_disk=%s / kubelet_swap=%s",
+        "add_swap_disk=%s",
         _BENCHMARK_NODEPOOL,
         machine_type,
         _NODE_IMAGE_TYPE.value,
@@ -1044,7 +1020,6 @@ def _create_benchmark_node_pool(cluster) -> None:
         _ENABLE_DMCRYPT.value,
         is_lssd,
         _ADD_SWAP_DISK.value,
-        swap_behavior or "unset",
     )
 
     # LSSD nodepools take longer to provision than PD-only nodepools because

From a3f9aa2513d25b7d264d826c1dfbadfc3552ad7e Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Mon, 29 Jun 2026 16:33:09 +0530
Subject: [PATCH 09/17] refactor(swap_encryption/pr1): extract infra into
 BaseResource classes

- Add SwapDaemonSet(resource.BaseResource) in resources/container_service/swap_daemonset.py
  - _Create(): apply Jinja2 manifest + wait for Running + /tmp/pkb_ready
  - _Delete(): in-pod swapoff/dmsetup/losetup/pkill teardown; kubectl delete
  - PodExec(): transient-reset retry, rc=137 OOM detection, pod recovery
- Add SwapNodePool(resource.BaseResource) in resources/container_service/swap_nodepool.py
  - _Create(): gcloud node-pools create with linuxConfig.swapConfig + optional swap disk
  - _Delete(): detach+delete disk; delete nodepool
  - DeleteDefaultPool(): remove dummy e2-medium pool after DaemonSet pod Running
- Rewrite benchmark to thin pattern: Prepare() uses resource.Create() + spec.resources
  - Cleanup() is empty: the PKB framework auto-deletes everything in spec.resources
  - Run() uses daemonset.PodExec() throughout
- Addresses Zac's review: use the resources pattern; no infra code in the benchmark file
- Fix COS_CONTAINERD -> UBUNTU_CONTAINERD (r3472549985)
- swapConfig auto-enables memorySwapBehavior=LimitedSwap (r3472513706)
---
 .../swap_encryption_benchmark.py              | 2035 ++++-------------
 .../container_service/swap_daemonset.py       |  609 +++++
 .../container_service/swap_nodepool.py        |  575 +++++
 3 files changed, 1682 insertions(+), 1537 deletions(-)
 create mode 100644 perfkitbenchmarker/resources/container_service/swap_daemonset.py
 create mode 100644 perfkitbenchmarker/resources/container_service/swap_nodepool.py

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index e30854e188..7f981b1bb7 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -35,38 +35,44 @@
   EKS nodes  ── NVMe Instance Store, Nitro hardware-offloaded encryption
                  swap device: /dev/nvme1n1 (or auto-detected)
 
-== Benchmark Phases ==
+== Resource pattern ==
 
-  Phase 1 – fio Microbenchmarks
-    Run fio directly on the swap block device (swapoff first) to measure
-    the hardware + encryption ceiling: random IOPS (4K), sequential
-    bandwidth (1M), and completion latency (iodepth=1).
+Infrastructure lifecycle lives in two BaseResource subclasses:
 
-  Phase 2a – CPU Overhead
-    stress-ng drives sustained swap I/O; vmstat and pidstat capture
-    swap-in/out rates and per-process CPU cost (kswapd, kcryptd,
-    dm-crypt threads on GKE; Nitro offload on EKS).
+  SwapNodePool  (perfkitbenchmarker/resources/container_service/swap_nodepool.py)
+    _Create():  gcloud container node-pools create with linuxConfig.swapConfig
+                + sysctl via --system-config-from-file; waits for node Ready;
+                optionally creates and attaches a dedicated swap disk.
+    _Delete():  detach+delete disk; delete the nodepool.
+    DeleteDefaultPool(): remove the dummy e2-medium default pool after the
+                DaemonSet pod is Running (separate step to avoid API-server
+                contention during nodepool ops).
 
-  Phase 2b – I/O Interference
-    Baseline fio on a scratch volume → re-run with concurrent swap
-    pressure.  IOPS/latency delta = storage contention cost.
+  SwapDaemonSet  (perfkitbenchmarker/resources/container_service/swap_daemonset.py)
+    _Create():  apply Jinja2 manifest; wait for Running + /tmp/pkb_ready.
+    _Delete():  in-pod swapoff / dmsetup / losetup teardown; kubectl delete.
+    PodExec():  kubectl exec wrapper with transient-reset retry, OOM-kill
+                detection (rc=137), and automatic pod recovery.
 
-  Phase 3a – Redis Latency
-    Dataset loaded beyond container memory limit → GET/SET p99 latency
-    measured while kernel swaps pages.
+Both resources are added to spec.resources in Prepare() and are auto-deleted
+by the PKB framework in Cleanup().
 
-  Phase 3b – Kernel Build
-    Linux compiled inside a memory-capped cgroup; slowdown ratio vs
-    unconstrained baseline.
+== Benchmark Phases ==
 
-  Phase 3c – OpenSearch
-    Bulk-index + search query under swap pressure (esrally or curl).
+  Phase 1 – fio Microbenchmarks (this PR)
+    Run fio directly on the swap block device (swapoff first) to measure
+    the hardware + encryption ceiling: random IOPS (4K), sequential
+    bandwidth (1M), and completion latency (iodepth=1).
+
+  Phase 2a – CPU Overhead  (PR2/PR4)
+  Phase 2b – I/O Interference  (PR4)
+  Phase 3a – Redis Latency  (PR5)
+  Phase 3b – Kernel Build  (PR5)
+  Phase 3c – OpenSearch  (PR5)
 """
 
 import json
 import logging
-import os
-import tempfile
 import textwrap
 import time
 from typing import Any
@@ -76,9 +82,9 @@
 from perfkitbenchmarker import configs
 from perfkitbenchmarker import errors
 from perfkitbenchmarker import sample
-from perfkitbenchmarker.providers.gcp import util as gcp_util
 from perfkitbenchmarker.resources.container_service import kubectl
-from perfkitbenchmarker.resources.container_service import kubernetes_commands
+from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod
+from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod
 
 FLAGS = flags.FLAGS
 
@@ -88,7 +94,7 @@
 # Benchmark identity
 # ---------------------------------------------------------------------------
 
-BENCHMARK_NAME = "swap_encryption"
+BENCHMARK_NAME = 'swap_encryption'
 
 
 BENCHMARK_CONFIG = """
@@ -118,274 +124,234 @@
 
 
 _DAEMONSET_IMAGE = flags.DEFINE_string(
-    "swap_encryption_daemonset_image",
-    "ubuntu:22.04",
-    "Container image used for the privileged benchmark DaemonSet pod.",
+    'swap_encryption_daemonset_image',
+    'ubuntu:22.04',
+    'Container image used for the privileged benchmark DaemonSet pod.',
 )
 
 
 _NODEPOOL = flags.DEFINE_string(
-    "swap_encryption_nodepool",
-    "benchmark",
-    "Name of the node pool to deploy the benchmark DaemonSet on.",
+    'swap_encryption_nodepool',
+    'benchmark',
+    'Name of the node pool to deploy the benchmark DaemonSet on.',
 )
 
 
 _INSTANCE_SIZE_LABEL = flags.DEFINE_string(
-    "swap_encryption_instance_size_label",
-    "",
-    "Human-readable label for the current instance size being tested, e.g. "
+    'swap_encryption_instance_size_label',
+    '',
+    'Human-readable label for the current instance size being tested, e.g. '
     '"n4-highmem-32" or "i4i.4xlarge".  Stored in sample metadata so that '
-    "results from multiple PKB runs across different instance sizes can be "
-    "collated and compared.  Defaults to the value reported by the cloud "
-    "metadata endpoint inside the pod.",
+    'results from multiple PKB runs across different instance sizes can be '
+    'collated and compared.  Defaults to the value reported by the cloud '
+    'metadata endpoint inside the pod.',
 )
 
 
 _COLLECT_COST = flags.DEFINE_boolean(
-    "swap_encryption_collect_cost",
+    'swap_encryption_collect_cost',
     False,
-    "When True, emit a cost_estimate_usd sample using on-demand pricing "
-    "for the instance type detected at runtime.",
+    'When True, emit a cost_estimate_usd sample using on-demand pricing '
+    'for the instance type detected at runtime.',
 )
 
 
 _FAIL_ON_DEGRADED = flags.DEFINE_boolean(
-    "swap_encryption_fail_on_degraded",
+    'swap_encryption_fail_on_degraded',
     True,
-    "When True (default), raise an error at the end of Run() if the run was "
-    "catastrophically degraded — e.g. the benchmark pod was OOM-evicted and "
-    "replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng "
-    "swap-pressure phase was OOM-killed before completing.  This prevents PKB "
-    "from reporting SUCCEEDED for a run whose post-eviction phases produced "
-    "empty or meaningless data.  Set False to keep the legacy behaviour of "
-    "always returning whatever partial samples were collected.",
+    'When True (default), raise an error at the end of Run() if the run was '
+    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
+    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
+    'swap-pressure phase was OOM-killed before completing.  This prevents PKB '
+    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
+    'empty or meaningless data.  Set False to keep the legacy behaviour of '
+    'always returning whatever partial samples were collected.',
 )
 
 
 _PHASES = flags.DEFINE_list(
-    "swap_encryption_phases",
-    ["all"],
-    "Which Run() phases to execute, for fast iteration against an "
-    "already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  "
-    "Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng "
-    "CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), "
+    'swap_encryption_phases',
+    ['all'],
+    'Which Run() phases to execute, for fast iteration against an '
+    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  '
+    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
+    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
     '3b (kernel build), 3c (opensearch).  Default "all" runs everything.  '
-    "Example: --swap_encryption_phases=2a runs only the swap-pressure phase. "
-    "Phases not listed are skipped and do not affect the degraded-run gate "
+    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
+    'Phases not listed are skipped and do not affect the degraded-run gate '
     '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
 )
 
 
 _BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
-    "swap_encryption_benchmark_machine_type",
-    "n4-highmem-32",
-    "Machine type for the benchmark nodepool created in Prepare(). "
-    "Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd "
-    "(LSSD RAID-0).  The matching swap setup is selected automatically.",
+    'swap_encryption_benchmark_machine_type',
+    'n4-highmem-32',
+    'Machine type for the benchmark nodepool created in Prepare(). '
+    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
+    '(LSSD RAID-0).  The matching swap setup is selected automatically.',
 )
 
 
 _BENCHMARK_LSSD = flags.DEFINE_boolean(
-    "swap_encryption_lssd",
+    'swap_encryption_lssd',
     False,
-    "Force LSSD RAID-0 swap path even when the machine type name does not "
+    'Force LSSD RAID-0 swap path even when the machine type name does not '
     'contain "lssd".  Auto-detected from machine type when False.',
 )
 
 
 _LSSD_COUNT = flags.DEFINE_integer(
-    "swap_encryption_lssd_count",
+    'swap_encryption_lssd_count',
     1,
-    "Number of local NVMe SSDs to attach as raw block devices "
-    "(--local-nvme-ssd-block count=N).  Must match the fixed local SSD "
-    "count for the chosen machine type: c4-standard-8-lssd=1, "
-    "c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  "
-    "Default 1 covers most single-lssd machine types.",
+    'Number of local NVMe SSDs to attach as raw block devices '
+    '(--local-nvme-ssd-block count=N).  Must match the fixed local SSD '
+    'count for the chosen machine type: c4-standard-8-lssd=1, '
+    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  '
+    'Default 1 covers most single-lssd machine types.',
 )
 
 
 _NODE_IMAGE_TYPE = flags.DEFINE_string(
-    "swap_encryption_node_image_type",
-    "UBUNTU_CONTAINERD",
-    "GKE node image type for the benchmark nodepool.  "
-    "UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks "
-    "down device-mapper at the kernel LSM level and cryptsetup hangs "
-    "indefinitely from any pod context (even privileged, even via nsenter "
-    "into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup "
-    "from privileged pods without restriction.  "
-    "Use COS_CONTAINERD only when dm-crypt is disabled "
-    "(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  "
-    "AL2 on EKS.",
+    'swap_encryption_node_image_type',
+    'UBUNTU_CONTAINERD',
+    'GKE node image type for the benchmark nodepool.  '
+    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
+    'down device-mapper at the kernel LSM level and cryptsetup hangs '
+    'indefinitely from any pod context (even privileged, even via nsenter '
+    'into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup '
+    'from privileged pods without restriction.  '
+    'Use COS_CONTAINERD only when dm-crypt is disabled '
+    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  '
+    'AL2 on EKS.',
 )
 
 
 _BOOT_DISK_TYPE = flags.DEFINE_string(
-    "swap_encryption_boot_disk_type",
-    "hyperdisk-balanced",
-    "Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced "
-    "for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 "
-    "dev/test machines, which do not support hyperdisk-balanced.",
+    'swap_encryption_boot_disk_type',
+    'hyperdisk-balanced',
+    'Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced '
+    'for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 '
+    'dev/test machines, which do not support hyperdisk-balanced.',
 )
 
 
 _BOOT_DISK_IOPS = flags.DEFINE_integer(
-    "swap_encryption_boot_disk_iops",
+    'swap_encryption_boot_disk_iops',
     80000,
-    "Provisioned IOPS for the boot disk (hyperdisk-balanced only).  "
-    "80 000 is the COS max-IOPS target.  Ignored for pd-ssd.",
+    'Provisioned IOPS for the boot disk (hyperdisk-balanced only).  '
+    '80 000 is the COS max-IOPS target.  Ignored for pd-ssd.',
 )
 
 
 _BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
-    "swap_encryption_boot_disk_throughput",
+    'swap_encryption_boot_disk_throughput',
     1200,
-    "Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced "
-    "only).  Must be set together with iops.  1200 MB/s pairs with 80 000 "
-    "IOPS for production; use 140 (minimum) for dev/test.  Ignored for "
-    "pd-ssd.",
+    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
+    'only).  Must be set together with iops.  1200 MB/s pairs with 80 000 '
+    'IOPS for production; use 140 (minimum) for dev/test.  Ignored for '
+    'pd-ssd.',
 )
 
 
 _BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
-    "swap_encryption_boot_disk_size_gb",
+    'swap_encryption_boot_disk_size_gb',
     500,
-    "Boot disk size in GiB for the benchmark nodepool.  500 GiB is "
-    "required for the n4-highmem-32 + hyperdisk-balanced Config 2 run "
-    "(see Engineer Assignments table in execution-plan.md).  "
-    "For LSSD configs the boot disk is smaller; 100 GiB is fine.",
+    'Boot disk size in GiB for the benchmark nodepool.  500 GiB is '
+    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
+    '(see Engineer Assignments table in execution-plan.md).  '
+    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
 )
 
 
 _ADD_SWAP_DISK = flags.DEFINE_boolean(
-    "swap_encryption_add_swap_disk",
+    'swap_encryption_add_swap_disk',
     False,
-    "Attach a dedicated second disk to the benchmark nodepool for use as "
-    "the swap device.  Required for dm-crypt measurement on single-boot-disk "
-    "machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper "
-    "from pod namespaces.  The second disk is provisioned via "
-    "--additional-node-disk using the same type/IOPS/throughput as the boot "
-    "disk flags.",
+    'Attach a dedicated second disk to the benchmark nodepool for use as '
+    'the swap device.  Required for dm-crypt measurement on single-boot-disk '
+    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
+    'from pod namespaces.  The second disk is provisioned via '
+    '--additional-node-disk using the same type/IOPS/throughput as the boot '
+    'disk flags.',
 )
 
 
 _SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
-    "swap_encryption_swap_disk_size_gb",
+    'swap_encryption_swap_disk_size_gb',
     500,
-    "Size in GiB of the dedicated swap disk when "
-    "--swap_encryption_add_swap_disk is True.  Must satisfy the "
-    "hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.",
+    'Size in GiB of the dedicated swap disk when '
+    '--swap_encryption_add_swap_disk is True.  Must satisfy the '
+    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
 )
 
 _ENABLE_DMCRYPT = flags.DEFINE_boolean(
-    "swap_encryption_enable_dmcrypt",
+    'swap_encryption_enable_dmcrypt',
     True,
-    "When True (default), wrap the swap device in dm-crypt plain mode "
-    "(aes-xts-plain64, ephemeral random key) matching GKE's "
-    "go/node:swap-encryption implementation.  Set False to measure plain "
-    "(unencrypted) swap overhead as a baseline.",
+    'When True (default), wrap the swap device in dm-crypt plain mode '
+    '(aes-xts-plain64, ephemeral random key) matching GKE\'s '
+    'go/node:swap-encryption implementation.  Set False to measure plain '
+    '(unencrypted) swap overhead as a baseline.',
 )
 
 
 _SWAP_DEVICE = flags.DEFINE_string(
-    "swap_encryption_device",
-    "",
-    "Explicit block device path to use as the swap device, e.g. "
-    "/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), "
-    "the device is auto-detected from /proc/swaps inside the benchmark pod.",
+    'swap_encryption_device',
+    '',
+    'Explicit block device path to use as the swap device, e.g. '
+    '/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), '
+    'the device is auto-detected from /proc/swaps inside the benchmark pod.',
 )
 
 _SWAP_TYPE = flags.DEFINE_string(
-    "swap_encryption_swap_type",
-    "hyperdisk",
-    "Storage target for the swap device.  One of: hyperdisk (default), "
-    "lssd, instance_store, io2.",
+    'swap_encryption_swap_type',
+    'hyperdisk',
+    'Storage target for the swap device.  One of: hyperdisk (default), '
+    'lssd, instance_store, io2.',
 )
 
 _ENABLE_ZSWAP = flags.DEFINE_boolean(
-    "swap_encryption_enable_zswap",
+    'swap_encryption_enable_zswap',
     False,
-    "When True, enable zswap compressed swap cache on the benchmark node.",
+    'When True, enable zswap compressed swap cache on the benchmark node.',
 )
 
 _MIN_FREE_KBYTES = flags.DEFINE_integer(
-    "swap_encryption_min_free_kbytes",
+    'swap_encryption_min_free_kbytes',
     0,
-    "Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. "
-    "0 (default) leaves the kernel default unchanged.",
+    'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. '
+    '0 (default) leaves the kernel default unchanged.',
 )
 
 _FIO_RUNTIME_SEC = flags.DEFINE_integer(
-    "swap_encryption_fio_runtime_sec",
+    'swap_encryption_fio_runtime_sec',
     60,
-    "Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.",
+    'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.',
 )
 
 _STRESS_VM_BYTES = flags.DEFINE_string(
-    "swap_encryption_stress_vm_bytes",
-    "28G",
-    "stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  "
-    "Should exceed available node RAM to force sustained paging.",
+    'swap_encryption_stress_vm_bytes',
+    '28G',
+    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  '
+    'Should exceed available node RAM to force sustained paging.',
 )
 
 _STRESS_VM_BYTES_LIST = flags.DEFINE_list(
-    "swap_encryption_stress_vm_bytes_list",
+    'swap_encryption_stress_vm_bytes_list',
     [],
-    "Comma-separated list of --vm-bytes values to sweep in Phase 2a, "
+    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
     'e.g. "14G,28G,56G".  Overrides --swap_encryption_stress_vm_bytes.',
 )
 
 _STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
-    "swap_encryption_stress_timeout_sec",
+    'swap_encryption_stress_timeout_sec',
     300,
-    "Maximum seconds to wait for the stress-ng swap-pressure phase.",
-)
-
-_DS_NAME = "pkb-swap-benchmark"
-_DS_NAMESPACE = "default"
-_DS_LABEL = "pkb-swap-benchmark"
-
-# Transient kubectl errors that are safe to retry.
-_TRANSIENT_KUBECTL_ERRORS = ("connection reset by peer", "websocket: close")
-
-# Errors indicating the container/pod is gone and needs recovery.
-_CONTAINER_GONE_KUBECTL_ERRORS = (
-    "container not found",
-    "procReady not received",
-    "unable to upgrade connection",
-    "not found",
-    "deleted state",
+    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
 )
 
-_active_pod: list[str] = []  # single-element list so closures can mutate it
-
-
-_degraded_reasons: list[str] = []
-
-
-_pod_lost: list[str] = []
-
-
-_oom_events: list[str] = []
-
-_BENCHMARK_NODEPOOL = "benchmark"
-_DEFAULT_NODEPOOL = "default-pool"
-
-
-class _GcpZonalResource:
-    """Minimal resource shim for gcp_util.GcloudCommand on compute operations.
-
-    gcp_util.GcloudCommand auto-injects --project and --zone from the resource
-    object passed to it.  GkeCluster._GcloudCommand() handles container/*
-    operations correctly but also switches --zone → --region for multi-zone
-    clusters, which is wrong for gcloud compute commands (--region creates
-    regional resources, not zonal ones).  This shim pins a single zone so all
-    gcloud compute calls target the correct AZ.
-    """
-
-    def __init__(self, project: str, zone: str) -> None:
-        self.project = project
-        self.zone = zone
+# DaemonSet constants used by both SwapDaemonSet construction and the EKS path.
+_DS_NAME = 'pkb-swap-benchmark'
+_DS_NAMESPACE = 'default'
+_DS_LABEL = 'pkb-swap-benchmark'
+_BENCHMARK_NODEPOOL = 'benchmark'
 
 
 def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
@@ -400,88 +366,86 @@ def Prepare(spec: _BenchmarkSpec) -> None:
     e2-medium default nodepool.
 
     Step 2 (this function):
-      a. Create the benchmark nodepool (n4-highmem-32 or c4-*-lssd) with
-         UBUNTU_CONTAINERD, 80 000 IOPS, and a node startup script that configures
-         dm-crypt swap at the OS level — before any pod is scheduled.
-      b. Delete the dummy default nodepool to stop its cost immediately.
-      c. Deploy the privileged DaemonSet (pinned via nodeSelector to the
-         benchmark nodepool) and wait for tools to install.
+      a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk).
+         EKS: label existing nodes with pkb_nodepool=benchmark.
+      b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel.
+      c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running.
+      d. GCP: re-resolve pod name in case default-pool deletion evicts the pod.
+
+    Both resources are appended to spec.resources for auto-cleanup.
     """
     cluster = spec.container_cluster
-
-    # ── Step 2a: add real benchmark nodepool ────────────────────────────────
-    if not getattr(cluster, "project", None):
-        # Guard: AWS / EKS path — nodepool management is external.
-        # PKB labels nodes pkb_nodepool=default; re-label to match the DaemonSet
-        # nodeSelector (pkb_nodepool=benchmark) before deploying the pod.
+    is_gcp = getattr(cluster, 'project', None) is not None
+
+    if is_gcp:
+        # ── Step 2a (GCP): create benchmark nodepool + wait for node ──────────
+        logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
+        nodepool = _np_mod.SwapNodePool(
+            cluster=cluster,
+            machine_type=_BENCHMARK_MACHINE_TYPE.value,
+            node_image_type=_NODE_IMAGE_TYPE.value,
+            disk_type=_BOOT_DISK_TYPE.value,
+            disk_size_gb=_BOOT_DISK_SIZE_GB.value,
+            disk_iops=_BOOT_DISK_IOPS.value,
+            disk_throughput=_BOOT_DISK_THROUGHPUT.value,
+            lssd=_BENCHMARK_LSSD.value,
+            lssd_count=_LSSD_COUNT.value,
+            add_swap_disk=_ADD_SWAP_DISK.value,
+            swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value,
+        )
+        nodepool.Create()
+        spec.resources.append(nodepool)
+    else:
+        # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ──
         logging.info(
-            "[swap_encryption] EKS cluster — labelling existing nodes with "
-            "pkb_nodepool=%s so the DaemonSet nodeSelector matches.",
+            '[swap_encryption] EKS cluster — labelling existing nodes with'
+            ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
             _BENCHMARK_NODEPOOL,
         )
         kubectl.RunKubectlCommand([
-            "label",
-            "nodes",
-            "--all",
-            "--overwrite",
-            f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
+            'label',
+            'nodes',
+            '--all',
+            '--overwrite',
+            f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
         ])
-        # io2 test-matrix row: create + attach a real io2 EBS volume so swap runs
-        # on io2 hardware-encrypted storage (no-op unless swap_type=io2).
         _ensure_io2_volume()
-    else:
-        # GCP path: true two-step nodepool setup.
-        logging.info("[swap_encryption] Step 2a: creating benchmark nodepool")
-        _create_benchmark_node_pool(cluster)
-
-        # ── Step 2b: wait for the benchmark node to join and be Ready ─────────
-        logging.info("[swap_encryption] Step 2b: waiting for benchmark node")
-        _wait_for_benchmark_node()
-
-        # ── Step 2b2: attach dedicated swap disk (if requested) ───────────────
-        if _ADD_SWAP_DISK.value:
-            logging.info(
-                "[swap_encryption] Step 2b2: attaching dedicated swap disk"
-            )
-            _attach_swap_disk(cluster)
-
-    # ── Step 2c: deploy DaemonSet ────────────────────────────────────────────
-    # Deploy and wait for the pod BEFORE deleting the default nodepool.
-    # Deleting the default pool while the benchmark node is still joining causes
-    # a temporary API server i/o timeout (control plane busy with two nodepool
-    # ops simultaneously).  Once the pod is Running the cluster is fully stable.
-    logging.info("[swap_encryption] Step 2c: deploying privileged DaemonSet")
-    _deploy_daemonset()
 
-    pod = _wait_for_benchmark_pod()
-    logging.info("[swap_encryption] Benchmark pod ready: %s", pod)
+    # ── Step 2b: deploy DaemonSet and wait for pod ────────────────────────────
+    # Deploy BEFORE deleting the default pool: deleting the default pool while
+    # the benchmark node is still joining causes a brief API-server I/O timeout.
+    # The pod being Running means the cluster is fully stable.
+    logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet')
+    daemonset = _ds_mod.SwapDaemonSet(
+        name=_DS_NAME,
+        namespace=_DS_NAMESPACE,
+        label=_DS_LABEL,
+        nodepool=_BENCHMARK_NODEPOOL,
+        image=_DAEMONSET_IMAGE.value,
+    )
+    daemonset.Create()
+    spec.resources.append(daemonset)
+    logging.info(
+        '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name
+    )
 
-    # ── Step 2d: now safe to remove the dummy default nodepool ───────────────
-    if getattr(cluster, "project", None):
+    # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ──
+    if is_gcp:
         logging.info(
-            "[swap_encryption] Step 2d: deleting dummy default nodepool"
+            '[swap_encryption] Step 2c: deleting dummy default nodepool'
         )
-        _delete_default_node_pool(cluster)
-        # The DaemonSet pod may be evicted and rescheduled with a new name during
-        # the nodepool deletion (cluster control plane briefly interrupts pod
-        # lifecycle).  Re-resolve the pod name to avoid stale-reference errors on
-        # all subsequent _pod_exec calls.
+        nodepool.DeleteDefaultPool()
+        # The pod may be evicted and rescheduled with a new name during the
+        # default nodepool deletion.  Re-resolve to avoid stale references.
         logging.info(
-            "[swap_encryption] Step 2d: re-resolving benchmark pod "
-            "after nodepool deletion"
+            '[swap_encryption] Step 2d: re-resolving benchmark pod after'
+            ' nodepool deletion'
+        )
+        daemonset.WaitForPod()
+        logging.info(
+            '[swap_encryption] Benchmark pod (post-deletion): %s',
+            daemonset.pod_name,
         )
-        pod = _wait_for_benchmark_pod()
-        logging.info("[swap_encryption] Benchmark pod (post-deletion): %s", pod)
-
-
-def _phase_selected(token: str) -> bool:
-    """Return True if phase `token` should run given --swap_encryption_phases.
-
-    'all' (the default) selects every phase.  Otherwise only the comma-separated
-    tokens listed in the flag run.  Tokens: fio, 2a, 2b, 3a, 3b, 3c.
-    """
-    selected = [p.strip().lower() for p in _PHASES.value if p.strip()]
-    return (not selected) or ("all" in selected) or (token.lower() in selected)
 
 
 def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
@@ -493,167 +457,147 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
         Raw I/O ceiling of the swap device.  Gate 1 fails if fio produces
         zero samples (device not found, O_DIRECT error, etc.).
 
-      Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference
-        Requires an active swap device (Gate 1 must pass).  Gate 2 fails if
-        stress-ng does not complete within timeout.
+      Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference (PR4)
+        Requires an active swap device (Gate 1 must pass).
 
-      Tier 3 (Gate 3) — real-world workloads (Redis, kernel build, OpenSearch)
-        Independent of Tier 2 results; always attempted if Gate 1 passed.
-        Individual workload failures are logged but do not abort the others.
+      Tier 3 (Gate 3) — real-world workloads (PR5)
+        Independent of Tier 2 results.
 
-    If Gate 1 fails, Tiers 2 and 3 are skipped — there is no point measuring
-    application-level swap performance when the raw device is inaccessible.
+    If Gate 1 fails, Tiers 2 and 3 are skipped.
     """
-    pod = _wait_for_benchmark_pod()
+    daemonset = _get_daemonset(spec)
+
+    pod = daemonset.WaitForPod()
     if pod is None:
         raise errors.Benchmarks.RunError(
-            "[swap_encryption] Benchmark pod never became ready."
+            '[swap_encryption] Benchmark pod never became ready.'
         )
-    # Initialise the module-level active-pod tracker so _pod_exec and
-    # _recover_pod can transparently redirect to a replacement pod if the
-    # original is evicted during the run.
-    _active_pod.clear()
-    _active_pod.append(pod)
-    _degraded_reasons.clear()
-    _pod_lost.clear()
-    _oom_events.clear()
+    # Reset per-run accumulators before starting phases.
+    daemonset.oom_events.clear()
+    daemonset.pod_lost.clear()
     original_pod = pod
-    swap_dev = _detect_swap_device(pod)
-    base_meta = _build_metadata(pod, swap_dev)
+    degraded_reasons: list[str] = []
+
+    swap_dev = _detect_swap_device(daemonset)
+    base_meta = _build_metadata(daemonset, swap_dev)
     results: list[sample.Sample] = []
     t_run_start = time.time()
 
-    logging.info("[swap_encryption] swap device: %s", swap_dev)
+    logging.info('[swap_encryption] swap device: %s', swap_dev)
 
-    # ── Phase 1: fio microbenchmarks on raw swap device ─────────────────────────
-    if _phase_selected("fio"):
+    # ── Phase 1: fio microbenchmarks on raw swap device ───────────────────────
+    if _phase_selected('fio'):
         logging.info(
-            "[swap_encryption] Phase 1: fio microbenchmarks on %s", swap_dev
+            '[swap_encryption] Phase 1: fio microbenchmarks on %s', swap_dev
         )
         try:
-            phase1_samples = _run_phase1_fio(pod, swap_dev, base_meta)
+            phase1_samples = _run_phase1_fio(daemonset, swap_dev, base_meta)
             results += phase1_samples
             if not phase1_samples:
-                _degraded_reasons.append(
-                    "Phase 1 (fio) produced no samples — "
-                    "check fio install and swap device accessibility"
+                degraded_reasons.append(
+                    'Phase 1 (fio) produced no samples — '
+                    'check fio install and swap device accessibility'
                 )
-                logging.error("[swap_encryption] Phase 1: no samples produced")
+                logging.error('[swap_encryption] Phase 1: no samples produced')
         except Exception as e:  # pylint: disable=broad-except
-            _degraded_reasons.append(f"Phase 1 fio failed: {e}")
-            logging.error("[swap_encryption] Phase 1 fio error: %s", e)
+            degraded_reasons.append(f'Phase 1 fio failed: {e}')
+            logging.error('[swap_encryption] Phase 1 fio error: %s', e)
 
     # ── Cost estimate ─────────────────────────────────────────────────────────
     if _COLLECT_COST.value:
         elapsed = time.time() - t_run_start
-        results += _collect_cost_sample(pod, elapsed, base_meta)
+        results += _collect_cost_sample(daemonset, elapsed, base_meta)
 
     # ── Final degradation gate ────────────────────────────────────────────────
-    # The phase try/except blocks above keep the run alive so partial data is
-    # still collected, but that means a catastrophic failure (pod OOM-evicted
-    # mid-run, no fio data, stress-ng killed before it could drive swap I/O)
-    # would otherwise be reported by PKB as SUCCEEDED with empty/garbage metrics.
-    # Detect those conditions here and surface them explicitly.
-    if _active_pod and _active_pod[0] != original_pod:
-        _degraded_reasons.append(
-            f"benchmark pod was replaced during the run ({original_pod} →"
-            f" {_active_pod[0]}) — it was OOM-evicted under swap pressure;"
-            " phases executed after the eviction ran against a"
-            " freshly-initialised pod (empty /tmp, swap re-setup) and may be"
-            " invalid"
-        )
-    if _pod_lost:
-        _degraded_reasons.append(
-            "benchmark pod(s) went NotFound during the run"
-            f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure'
-            " eviction or container exit) and any phase running at or after"
-            " that"
-            " point (e.g. kernel-build baseline, OpenSearch) produced invalid"
-            " data"
-        )
-    if _oom_events:
-        _degraded_reasons.append(
-            "OOM kill(s) (rc=137) occurred during the run on pod(s) "
-            f'{", ".join(_oom_events)} — a phase exceeded memory and was'
-            " killed by "
-            "the OOM killer (the container may have restarted in place), so"
-            " the "
-            "affected phase(s) produced no or partial data"
-        )
-
-    degraded = bool(_degraded_reasons)
+    if daemonset.pod_name and daemonset.pod_name != original_pod:
+        degraded_reasons.append(
+            f'benchmark pod was replaced during the run ({original_pod} →'
+            f' {daemonset.pod_name}) — it was OOM-evicted under swap'
+            ' pressure; phases executed after the eviction ran against a'
+            ' freshly-initialised pod (empty /tmp, swap re-setup) and may'
+            ' be invalid'
+        )
+    if daemonset.pod_lost:
+        degraded_reasons.append(
+            'benchmark pod(s) went NotFound during the run'
+            f' ({", ".join(daemonset.pod_lost)}) — the pod died (node'
+            ' memory-pressure eviction or container exit) and any phase'
+            ' running at or after that point produced invalid data'
+        )
+    if daemonset.oom_events:
+        degraded_reasons.append(
+            'OOM kill(s) (rc=137) occurred during the run on pod(s) '
+            f'{", ".join(daemonset.oom_events)} — a phase exceeded memory'
+            ' and was killed by the OOM killer; the affected phase(s)'
+            ' produced no or partial data'
+        )
+
+    degraded = bool(degraded_reasons)
     results.append(
         sample.Sample(
-            "swap_encryption_run_status",
+            'swap_encryption_run_status',
             0.0 if degraded else 1.0,
-            "status",
+            'status',
             dict(
                 base_meta,
                 degraded=degraded,
-                degraded_reasons="; ".join(_degraded_reasons) or "none",
+                degraded_reasons='; '.join(degraded_reasons) or 'none',
                 num_samples=len(results) + 1,
             ),
         )
     )
 
     if degraded:
-        msg = "[swap_encryption] RUN DEGRADED — " + "; ".join(_degraded_reasons)
+        msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons)
         logging.error(msg)
         if _FAIL_ON_DEGRADED.value:
-            # Raise so PKB marks the benchmark FAILED instead of SUCCEEDED.  The
-            # samples collected so far are still published by PKB before the failure
-            # is recorded, so no data is lost.
             raise errors.Benchmarks.RunError(msg)
     else:
         logging.info(
-            "[swap_encryption] Run completed cleanly (%d samples)", len(results)
+            '[swap_encryption] Run completed cleanly (%d samples)',
+            len(results),
         )
 
     return results
 
 
 def Cleanup(spec: _BenchmarkSpec) -> None:
-    """Remove the DaemonSet and tear down any swap configuration."""
-    pod = _wait_for_benchmark_pod(timeout=30)
-    if pod:
-        _pod_exec(pod, "swapoff -a 2>/dev/null || true", ignore_failure=True)
-        _pod_exec(
-            pod,
-            textwrap.dedent("""
-      swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
-      dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
-    """),
-            ignore_failure=True,
-        )
-        # Clean up loop device backing files (single-disk fallback path).
-        _pod_exec(
-            pod,
-            textwrap.dedent("""
-      for backing in /var/pkb_swap_backing /run/pkb_swap_backing \
-                     /mnt/stateful_partition/pkb_swap_backing
-      do
-        losetup -j "$backing" 2>/dev/null | awk -F: '{print $1}' | \
-          while read dev
-          do
-            losetup -d "$dev" 2>/dev/null || true
-          done
-        rm -f "$backing"
-      done
-    """),
-            ignore_failure=True,
-        )
-        _pod_exec(
-            pod,
-            "pkill -9 'stress-ng|fio' 2>/dev/null || true",
-            ignore_failure=True,
+    """Resources in spec.resources are auto-deleted by the PKB framework.
+
+    SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove,
+    losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet.
+    SwapNodePool._Delete() detaches+deletes the swap disk (if any) then
+    deletes the benchmark nodepool.
+    """
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet:
+    """Retrieve the SwapDaemonSet resource from spec.resources."""
+    daemonset = next(
+        (r for r in spec.resources if isinstance(r, _ds_mod.SwapDaemonSet)),
+        None,
+    )
+    if daemonset is None:
+        raise errors.Benchmarks.RunError(
+            '[swap_encryption] SwapDaemonSet not found in spec.resources —'
+            ' was Prepare() called?'
         )
+    return daemonset
 
-    _delete_daemonset()
 
-    # Detach and delete the dedicated swap disk if one was provisioned.
-    cluster = spec.container_cluster
-    if _ADD_SWAP_DISK.value and getattr(cluster, "project", None):
-        _detach_and_delete_swap_disk(cluster)
+def _phase_selected(token: str) -> bool:
+    """Return True if phase `token` should run given --swap_encryption_phases.
+
+    'all' (the default) selects every phase.  Otherwise only the
+    comma-separated tokens listed in the flag run.
+    """
+    selected = [p.strip().lower() for p in _PHASES.value if p.strip()]
+    return (not selected) or ('all' in selected) or (token.lower() in selected)
 
 
 def _configure_eks_kubelet_swap(spec) -> None:
@@ -674,956 +618,161 @@ def _configure_eks_kubelet_swap(spec) -> None:
 
     GKE equivalent: linuxConfig.swapConfig via --system-config-from-file
     (swapConfig automatically enables memorySwapBehavior=LimitedSwap),
-    already implemented in _create_benchmark_node_pool.
+    already implemented in SwapNodePool._CreateNodePool().
 
     See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780
     """
     logging.warning(
-        "[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is "
-        "deferred (blocked on PR #6780 — SwapConfigSpec). "
-        "EKS nodes will use default kubelet swap settings until that PR merges."
+        '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is '
+        'deferred (blocked on PR #6780 — SwapConfigSpec). '
+        'EKS nodes will use default kubelet swap settings until that PR merges.'
     )
 
 
-def _deploy_daemonset() -> None:
-    """Apply the swap-infra DaemonSet manifest to the cluster.
-
-    The DaemonSet is intentionally lean: it only verifies the node-level swap
-    device is active (configured via linuxConfig.swapConfig on GKE or
-    kubelet-config.json on EKS) and writes /tmp/pkb_ready.  No benchmark
-    tooling is installed here — workloads are delegated to existing PKB
-    benchmark modules (kubernetes_fio, kubernetes_redis_memtier, etc.) which
-    manage their own tool installs inside separate benchmark pods.
-
-    Uses kubernetes_commands.ApplyManifest to render the Jinja2 template from
-    perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 and
-    apply it via kubectl — the standard PKB pattern for deploying manifests.
-    """
-    kubernetes_commands.ApplyManifest(
-        "cluster/swap_encryption_daemonset.yaml.j2",
-        ds_name=_DS_NAME,
-        ds_namespace=_DS_NAMESPACE,
-        ds_label=_DS_LABEL,
-        benchmark_nodepool=_BENCHMARK_NODEPOOL,
-        image=_DAEMONSET_IMAGE.value,
-    )
-    logging.info("[swap_encryption] Swap-infra DaemonSet applied")
-
-
-def _wait_for_benchmark_pod(timeout: int = 600) -> str | None:
-    """Wait until the swap-infra DaemonSet pod is Running AND swap is active.
-
-    The DaemonSet installs fio and a small set of measurement tools then
-    verifies the swap device before writing /tmp/pkb_ready (~1-2 min on a
-    cold apt cache).  Default timeout 600 s covers worst-case APT latency
-    on a freshly-started node.
-
-    Uses tab-separated name/phase output so kubectl always exits 0 regardless
-    of whether any pods are present, avoiding jsonpath index errors.
-    """
-    deadline = time.time() + timeout
-    last_phase = ""
-    ready_pod = None  # pod name once phase == Running
-
-    while time.time() < deadline:
-        # ── Step 1: wait for Running phase ──────────────────────────────────────
-        if ready_pod is None:
-            out, _, rc = kubectl.RunKubectlCommand(
-                [
-                    "get",
-                    "pods",
-                    "-l",
-                    f"app={_DS_LABEL}",
-                    "-n",
-                    _DS_NAMESPACE,
-                    "-o",
-                    (
-                        r"jsonpath={range"
-                        r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
-                    ),
-                ],
-                raise_on_failure=False,
-            )
-
-            if rc == 0 and out.strip():
-                for line in out.strip().splitlines():
-                    parts = line.split("\t")
-                    if len(parts) == 2:
-                        pod_name, phase = parts[0].strip(), parts[1].strip()
-                        if phase == "Running":
-                            logging.info(
-                                "[swap_encryption] Pod %s is Running – "
-                                "waiting for swap device readiness sentinel...",
-                                pod_name,
-                            )
-                            ready_pod = pod_name
-                            break
-                        if phase != last_phase:
-                            logging.info(
-                                "[swap_encryption] Pod %s phase: %s",
-                                pod_name,
-                                phase,
-                            )
-                            last_phase = phase
-                            if phase in ("Pending",):
-                                _log_pod_events(pod_name)
-            else:
-                logging.info(
-                    "[swap_encryption] Waiting for DaemonSet pod to appear..."
-                )
-
-        # ── Step 2: poll for /tmp/pkb_ready sentinel ────────────────────────────
-        if ready_pod is not None:
-            sentinel_out, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
-                [
-                    "exec",
-                    ready_pod,
-                    "-n",
-                    _DS_NAMESPACE,
-                    "--",
-                    "test",
-                    "-f",
-                    "/tmp/pkb_ready",
-                ],
-                raise_on_failure=False,
-            )
-            if sentinel_rc == 0:
-                logging.info(
-                    "[swap_encryption] Pod %s ready (swap device active)",
-                    ready_pod,
-                )
-                return ready_pod
-            # "container not found" means the container crashed (CrashLoopBackOff or
-            # exited) — treat it as a hard reset: re-check pod phase on next iteration.
-            if (
-                "container not found" in sentinel_err
-                or "unable to upgrade connection" in sentinel_err
-            ):
-                logging.warning(
-                    "[swap_encryption] Pod %s: container not running (%s) "
-                    "— will re-check pod state",
-                    ready_pod,
-                    sentinel_err.strip(),
-                )
-                ready_pod = None
-                last_phase = ""
-            else:
-                logging.info(
-                    "[swap_encryption] Pod %s: still installing tools...",
-                    ready_pod,
-                )
-
-        time.sleep(15)
-
-    logging.warning(
-        "[swap_encryption] Benchmark pod not ready after %ds", timeout
-    )
-    return None
-
-
-def _log_pod_events(pod_name: str) -> None:
-    """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
-    events_out, _, _ = kubectl.RunKubectlCommand(
-        [
-            "describe",
-            "pod",
-            pod_name,
-            "-n",
-            _DS_NAMESPACE,
-        ],
-        raise_on_failure=False,
-    )
-    # Only log the Events section to keep output manageable
-    in_events = False
-    lines = []
-    for line in events_out.splitlines():
-        if line.startswith("Events:"):
-            in_events = True
-        if in_events:
-            lines.append(line)
-    if lines:
-        logging.info("[swap_encryption] Pod events:\n%s", "\n".join(lines[:30]))
-    else:
-        logging.info(
-            "[swap_encryption] kubectl describe output:\n%s",
-            events_out[-2000:] if len(events_out) > 2000 else events_out,
-        )
-
-
-def _delete_daemonset() -> None:
-    """Delete the benchmark DaemonSet."""
-    kubectl.RunKubectlCommand(
-        [
-            "delete",
-            "daemonset",
-            _DS_NAME,
-            "-n",
-            _DS_NAMESPACE,
-            "--ignore-not-found",
-        ],
-        raise_on_failure=False,
-    )
-    logging.info("[swap_encryption] DaemonSet deleted")
-
-
-_HYPERDISK_MAX_IOPS_PER_MBPS = (
-    256  # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s
-)
-
-
-def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
-    """Return a throughput (MiB/s) that satisfies GCP's Hyperdisk constraint.
-
-    Hyperdisk Balanced rejects disk creation when provisioned IOPS exceed
-    256 x provisioned throughput (MiB/s) — e.g. 80000 IOPS with 300 MiB/s fails
-    with "Requested provisioned throughput is too low for the provisioned iops".
-    Clamp throughput UP to the minimum the requested IOPS need (plus a small
-    margin) and warn, so a mismatched flag pairing cannot abort node-pool/disk
-    creation.
-    """
-    min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS)  # ceil(iops/256)
-    if throughput < min_tput:
-        logging.warning(
-            "[swap_encryption] boot/swap disk throughput %d MiB/s is too low"
-            " for %d IOPS (Hyperdisk needs >= ceil(iops/256) = %d MiB/s);"
-            " raising to %d",
-            throughput,
-            iops,
-            min_tput,
-            min_tput,
-        )
-        return min_tput
-    return throughput
-
-
-def _create_benchmark_node_pool(cluster) -> None:
-    """Add the benchmark nodepool to the existing cluster (Step 2 of setup).
-
-    Uses:
-      --swap_encryption_benchmark_machine_type  (default n4-highmem-32)
-      --swap_encryption_node_image_type         (default UBUNTU_CONTAINERD)
-      --swap_encryption_boot_disk_iops          (default 80000)
-      --swap_encryption_enable_dmcrypt          (default True)
-
-    The nodepool is labelled pkb_nodepool=benchmark so the DaemonSet
-    nodeSelector targets it exclusively.  dm-crypt swap setup is performed
-    from within the privileged DaemonSet pod (see _setup_gke_hyperdisk_swap /
-    _setup_gke_lssd_swap) — we do NOT inject a startup-script via node metadata
-    because GKE reserves that metadata key and rejects it at the API level.
-    """
-    machine_type = _BENCHMARK_MACHINE_TYPE.value
-    # Auto-detect LSSD from machine type name; flag overrides only when True.
-    is_lssd = _BENCHMARK_LSSD.value or "lssd" in machine_type.lower()
-
-    # Determine zone/region from the cluster object.
-    # LSSD configs only need a small boot disk (OS only; swap is on local NVMe).
-    # Hyperdisk configs need 500 GiB to hit 80 000 IOPS (the IOPS/GiB ratio on
-    # hyperdisk-balanced is 1:1 up to the provisioned ceiling, so a 100 GiB disk
-    # can only provision up to 100 000 IOPS but a 500 GiB gives comfortable
-    # headroom and matches the Config 2 spec in the Engineer Assignments table).
-    disk_size_gb = 100 if is_lssd else _BOOT_DISK_SIZE_GB.value
-
-    disk_type = _BOOT_DISK_TYPE.value
-
-    # Use PKB's GcloudCommand wrapper: auto-injects --project, --zone/--region,
-    # and auth token refresh.  GkeCluster._GcloudCommand also handles the
-    # zone → region promotion for multi-zone / regional clusters.
-    cmd = cluster._GcloudCommand(
-        "container",
-        "node-pools",
-        "create",
-        _BENCHMARK_NODEPOOL,
-        "--cluster",
-        cluster.name,
-    )
-    cmd.flags["machine-type"] = machine_type
-    cmd.flags["image-type"] = _NODE_IMAGE_TYPE.value
-    cmd.flags["disk-type"] = disk_type
-    cmd.flags["disk-size"] = disk_size_gb
-    cmd.flags["num-nodes"] = 1
-    cmd.flags["node-labels"] = f"pkb_nodepool={_BENCHMARK_NODEPOOL}"
-    cmd.args += ["--no-enable-autoupgrade", "--no-enable-autorepair"]
-
-    # IOPS and throughput provisioning only applies to hyperdisk-* types AND
-    # only when the boot disk is also the swap device (non-LSSD configs).
-    # For LSSD machines the boot disk is OS-only; swap is on local NVMe.
-    # Provisioning 80k IOPS on a 100 GiB boot disk would exceed the
-    # hyperdisk-balanced per-GiB cap (80 IOPS/GiB × 100 GiB = 8 000 max).
-    if disk_type.startswith("hyperdisk") and not is_lssd:
-        # Hyperdisk boot-disk IOPS/throughput provisioning — not covered by
-        # GkeCluster._AddNodeParamsToCmd (which only handles secondary disks).
-        cmd.flags["boot-disk-provisioned-iops"] = _BOOT_DISK_IOPS.value
-        cmd.flags["boot-disk-provisioned-throughput"] = (
-            _valid_hyperdisk_throughput(
-                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
-            )
-        )
-
-    # For LSSD machines, expose local NVMe as raw block devices so fio/mdadm
-    # can access them directly (go/gke-swap-lssd uses local-nvme-ssd-block).
-    if is_lssd:
-        cmd.flags["local-nvme-ssd-block"] = f"count={_LSSD_COUNT.value}"
-
-    # ── GKE swap system-config ───────────────────────────────────────────────
-    # Pass linuxConfig.swapConfig + linuxConfig.sysctl via --system-config-from-file.
-    # Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
-    #   linuxConfig.swapConfig: GKE enables node-level swap device and
-    #     automatically sets kubeletConfig.memorySwapBehavior=LimitedSwap.
-    #     For LSSD machines, dedicatedLocalSsdProfile tells GKE to use the
-    #     local NVMe as the swap device (avoids boot-disk overhead).
-    #   linuxConfig.sysctl: swap aggressiveness tuning so benchmark workloads
-    #     can drive sustained swap I/O.
-    # Reference:
-    #   https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
-    #   node-memory-swap#enable
-    system_config_tmp = None
-    if is_lssd:
-        swap_config_block = (
-            "  swapConfig:\n"
-            "    enabled: true\n"
-            "    dedicatedLocalSsdProfile:\n"
-            f"      diskCount: {_LSSD_COUNT.value}\n"
-        )
-    else:
-        swap_config_block = "  swapConfig:\n    enabled: true\n"
-    swap_config_yaml = (
-        "linuxConfig:\n"
-        + swap_config_block
-        + "  sysctl:\n"
-        "    vm.min_free_kbytes: 200\n"
-        "    vm.watermark_scale_factor: 500\n"
-        "    vm.swappiness: 100\n"
-    )
-    system_config_tmp = tempfile.NamedTemporaryFile(
-        mode="w", suffix=".yaml", delete=False
-    )
-    system_config_tmp.write(swap_config_yaml)
-    system_config_tmp.flush()
-    cmd.flags["system-config-from-file"] = system_config_tmp.name
-    logging.info(
-        "[swap_encryption] system-config-from-file: "
-        "lssd=%s (written to %s):\n%s",
-        is_lssd,
-        system_config_tmp.name,
-        swap_config_yaml,
-    )
-
-    logging.info(
-        "[swap_encryption] Creating benchmark nodepool: %s / %s / "
-        "image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / "
-        "add_swap_disk=%s",
-        _BENCHMARK_NODEPOOL,
-        machine_type,
-        _NODE_IMAGE_TYPE.value,
-        disk_size_gb,
-        _BOOT_DISK_IOPS.value,
-        _ENABLE_DMCRYPT.value,
-        is_lssd,
-        _ADD_SWAP_DISK.value,
-    )
-
-    # LSSD nodepools take longer to provision than PD-only nodepools because
-    # GKE must also initialise the local NVMe devices before marking nodes Ready.
-    # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
-    try:
-        _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
-    finally:
-        if system_config_tmp is not None:
-            try:
-                os.unlink(system_config_tmp.name)
-            except OSError:
-                pass
-
-    if rc != 0:
-        # Idempotent prepare: if the nodepool already exists (e.g. re-running
-        # --run_stage=prepare,run to redeploy the DaemonSet onto an existing
-        # cluster), reuse it instead of failing.  gcloud returns a 409 /
-        # "Already exists" message in this case.
-        low = (stderr or "").lower()
-        if (
-            "already exists" in low
-            or "alreadyexists" in low
-            or "code=409" in low
-        ):
-            logging.info(
-                "[swap_encryption] Benchmark nodepool already exists — "
-                "reusing it (idempotent prepare); proceeding to DaemonSet"
-            )
-            return
-        raise errors.Benchmarks.RunError(
-            "[swap_encryption] Failed to create benchmark nodepool "
-            f"(rc={rc}): {stderr}"
-        )
-    logging.info("[swap_encryption] Benchmark nodepool ready")
-
-
-def _wait_for_benchmark_node(timeout: int = 900) -> None:
-    """Block until a node labelled pkb_nodepool=benchmark is Ready.
-
-    gcloud container node-pools create returns as soon as the API accepts the
-    request — the actual node VM may take another 2-4 minutes to boot, join the
-    cluster, and pass its readiness checks.  Deploying the DaemonSet before that
-    point leaves the pod Pending indefinitely because the nodeSelector finds no
-    eligible node.
+def _ensure_io2_volume() -> None:
+    """Placeholder for io2 EBS swap-volume setup on EKS; currently only logs.
 
-    This function polls kubectl every 15 s until at least one node with
-    pkb_nodepool=benchmark has Ready=True, then returns.
+    Only executed when --swap_encryption_swap_type=io2.  Full implementation
+    is deferred to PR2 (swap-capability layer).
     """
-    deadline = time.time() + timeout
+    if _SWAP_TYPE.value != 'io2':
+        return
     logging.info(
-        "[swap_encryption] Waiting for benchmark node "
-        "(pkb_nodepool=benchmark) to be Ready..."
-    )
-    while time.time() < deadline:
-        out, _, rc = kubectl.RunKubectlCommand(
-            [
-                "get",
-                "nodes",
-                "-l",
-                f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
-                "-o",
-                r"jsonpath={range .items[*]}"
-                r'{.metadata.name}{"\t"}'
-                r'{range .status.conditions[?(@.type=="Ready")]}'
-                r'{.status}{"\n"}{end}{end}',
-            ],
-            raise_on_failure=False,
-        )
-
-        if rc == 0 and out.strip():
-            for line in out.strip().splitlines():
-                parts = line.split("\t")
-                if len(parts) == 2 and parts[1].strip() == "True":
-                    logging.info(
-                        "[swap_encryption] Benchmark node ready: %s",
-                        parts[0].strip(),
-                    )
-                    return
-
-        logging.info(
-            "[swap_encryption] Benchmark node not yet Ready — retrying in 15"
-            " s..."
-        )
-        time.sleep(15)
-
-    raise errors.Benchmarks.RunError(
-        "[swap_encryption] Timed out waiting for benchmark node "
-        f"(pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready "
-        f"after {timeout}s"
+        '[swap_encryption] io2 swap volume provisioning deferred to PR2'
     )
 
 
-def _attach_swap_disk(cluster) -> None:
-    """Create a dedicated hyperdisk and attach it to the benchmark node.
-
-    gcloud container node-pools create --additional-node-disk is not available
-    in all gcloud SDK versions, so we use gcloud compute to create the disk and
-    attach it after the node is ready.  In GKE the Kubernetes node name is the
-    same as the GCE instance name, so no translation is needed.
-
-    After attachment the disk appears as /dev/sdb (or /dev/nvme1n1 on NVMe
-    nodes) inside the pod, and _setup_gke_hyperdisk_swap detects it via lsblk.
-
-    The disk is named pkb-swap-<cluster-name> to avoid name collisions across
-    concurrent runs.  Cleanup deletes it in Cleanup() if it exists.
-    """
-    # Resolve zone from cluster
-    zone = None
-    if getattr(cluster, "zones", None):
-        zone = cluster.zones[0]
-    elif getattr(cluster, "region", None):
-        zone = cluster.region
-    if not zone:
-        raise errors.Benchmarks.RunError(
-            "[swap_encryption] Cannot attach swap disk: cluster zone unknown"
-        )
-
-    project = cluster.project
-    disk_name = f"pkb-swap-{cluster.name}"
-    disk_type = _BOOT_DISK_TYPE.value
-    disk_size_gb = _SWAP_DISK_SIZE_GB.value
-
-    # ── Step 1: get the GCE instance name of the benchmark node ───────────────
-    node_out, _, rc = kubectl.RunKubectlCommand(
-        [
-            "get",
-            "nodes",
-            "-l",
-            f"pkb_nodepool={_BENCHMARK_NODEPOOL}",
-            "-o",
-            "jsonpath={.items[0].metadata.name}",
-        ],
-        raise_on_failure=False,
-    )
-    instance_name = node_out.strip()
-    if rc != 0 or not instance_name:
-        raise errors.Benchmarks.RunError(
-            "[swap_encryption] Cannot find benchmark node for swap disk attach"
-        )
-    logging.info("[swap_encryption] Benchmark node instance: %s", instance_name)
-
-    # ── Step 2: create the hyperdisk ──────────────────────────────────────────
-    logging.info(
-        "[swap_encryption] Creating swap disk %s (%dGiB %s)",
-        disk_name,
-        disk_size_gb,
-        disk_type,
-    )
-    # Use PKB's GcloudCommand via _GcpZonalResource: auto-injects --project
-    # and --zone (always zonal — gcloud compute --region creates regional
-    # resources, which is not what we want for a node-attached swap disk).
-    gcp_res = _GcpZonalResource(project, zone)
-    create_cmd = gcp_util.GcloudCommand(
-        gcp_res, "compute", "disks", "create", disk_name
-    )
-    create_cmd.flags["type"] = disk_type
-    create_cmd.flags["size"] = f"{disk_size_gb}GB"
-    create_cmd.args.append("--quiet")
-    if disk_type.startswith("hyperdisk"):
-        create_cmd.flags["provisioned-iops"] = _BOOT_DISK_IOPS.value
-        create_cmd.flags["provisioned-throughput"] = (
-            _valid_hyperdisk_throughput(
-                _BOOT_DISK_IOPS.value, _BOOT_DISK_THROUGHPUT.value
-            )
-        )
-    _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False)
-    if rc != 0:
-        raise errors.Benchmarks.RunError(
-            f"[swap_encryption] Failed to create swap disk {disk_name}:"
-            f" {stderr}"
-        )
+def _detect_swap_device(
+    daemonset: _ds_mod.SwapDaemonSet,
+) -> str:
+    """Return the active swap device path on the cluster node."""
+    if _SWAP_DEVICE.value:
+        return _SWAP_DEVICE.value
 
-    # ── Step 3: attach the disk to the node VM ────────────────────────────────
-    logging.info(
-        "[swap_encryption] Attaching swap disk %s to %s",
-        disk_name,
-        instance_name,
-    )
-    attach_cmd = gcp_util.GcloudCommand(
-        gcp_res, "compute", "instances", "attach-disk", instance_name
-    )
-    attach_cmd.flags["disk"] = disk_name
-    attach_cmd.flags["device-name"] = "pkb-swap"
-    attach_cmd.args.append("--quiet")
-    _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False)
-    if rc != 0:
-        raise errors.Benchmarks.RunError(
-            f"[swap_encryption] Failed to attach swap disk to {instance_name}: "
-            f"{stderr}"
-        )
-    logging.info(
-        "[swap_encryption] Swap disk attached: %s → %s",
-        disk_name,
-        instance_name,
+    # /proc/swaps is the source of truth — it lists the device ACTUALLY active.
+    # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping
+    # from a previous run on a reused node can still appear as a /dev node while
+    # being non-functional (fio/swapoff fail with "No such device or address").
+    dm_out, _ = daemonset.PodExec(
+        textwrap.dedent("""
+            ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
+            if [ -n "$ACTIVE" ]
+            then
+              echo "$ACTIVE"
+            elif test -e /dev/mapper/swap_encrypted
+            then
+              echo /dev/mapper/swap_encrypted
+            fi
+        """),
+        ignore_failure=True,
     )
-
-
-def _delete_disk_by_name(disk_name: str, project: str, zone: str) -> bool:
-    """Detach (if attached) and delete a GCE disk, robustly, with retries.
-
-    Finds the attached instance from the disk's own `users` field rather than
-    kubectl — kubectl is often unavailable during teardown (cluster being
-    deleted), which previously left the disk attached and undeletable, so it
-    leaked.  Returns True if the disk is gone (deleted or already absent).
-    """
-    for attempt in range(1, 5):
-        gcp_res = _GcpZonalResource(project, zone)
-        describe_cmd = gcp_util.GcloudCommand(
-            gcp_res, "compute", "disks", "describe", disk_name
-        )
-        describe_cmd.flags["format"] = "value(users)"
-        users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False)
-        if rc != 0:
-            logging.info(
-                "[swap_encryption] Swap disk %s not present — nothing to"
-                " delete",
-                disk_name,
-            )
-            return True  # already gone
-        user = users.strip()
-        if user:
-            inst = user.split("/")[-1]
-            logging.info(
-                "[swap_encryption] Detaching swap disk %s from %s",
-                disk_name,
-                inst,
-            )
-            detach_cmd = gcp_util.GcloudCommand(
-                gcp_res, "compute", "instances", "detach-disk", inst
-            )
-            detach_cmd.flags["disk"] = disk_name
-            detach_cmd.args.append("--quiet")
-            detach_cmd.Issue(timeout=120, raise_on_failure=False)
-        delete_cmd = gcp_util.GcloudCommand(
-            gcp_res, "compute", "disks", "delete", disk_name
-        )
-        delete_cmd.args.append("--quiet")
-        _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
-        if drc == 0:
-            logging.info("[swap_encryption] Swap disk deleted: %s", disk_name)
-            return True
-        logging.warning(
-            "[swap_encryption] Swap disk delete attempt %d/4 failed "
-            "(%s); retrying in 10s",
-            attempt,
-            derr.strip()[:160],
-        )
-        time.sleep(10)
-    logging.error(
-        "[swap_encryption] Could NOT delete swap disk %s after retries "
-        "— delete it manually: gcloud compute disks delete %s "
-        "--zone %s --quiet",
-        disk_name,
-        disk_name,
-        zone,
+    dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ''
+    if dev:
+        return dev
+    raise ValueError(
+        'No active swap device found in the benchmark pod. '
+        'Use --swap_encryption_device to specify one.'
     )
-    return False
 
 
-def _detach_and_delete_swap_disk(cluster) -> None:
-    """Detach and delete the dedicated swap disk created by _attach_swap_disk."""
-    zone = None
-    if getattr(cluster, "zones", None):
-        zone = cluster.zones[0]
-    elif getattr(cluster, "region", None):
-        zone = cluster.region
-    if not zone or not getattr(cluster, "project", None):
-        return
-    _delete_disk_by_name(f"pkb-swap-{cluster.name}", cluster.project, zone)
-
-
-def _delete_default_node_pool(cluster) -> None:
-    """Delete the dummy default nodepool after the benchmark pool is ready.
-
-    The default nodepool (e2-medium) was only needed to satisfy GKE's
-    requirement that a cluster must have at least one nodepool at creation time.
-    Removing it stops the clock on its cost immediately.
-    """
-    # Use PKB's GcloudCommand: auto-injects --project, --zone/--region.
-    cmd = cluster._GcloudCommand(
-        "container",
-        "node-pools",
-        "delete",
-        _DEFAULT_NODEPOOL,
-        "--cluster",
-        cluster.name,
+def _build_metadata(
+    daemonset: _ds_mod.SwapDaemonSet, swap_dev: str
+) -> dict[str, Any]:
+    """Collect node environment, encryption type, and config into a dict."""
+    kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True)
+    mem_out, _ = daemonset.PodExec(
+        "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True
     )
-    cmd.args.append("--quiet")
-
-    logging.info(
-        "[swap_encryption] Deleting default nodepool: %s", _DEFAULT_NODEPOOL
+    swap_out, _ = daemonset.PodExec(
+        "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True
     )
-    _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
-    if rc != 0:
-        logging.warning(
-            "[swap_encryption] Could not delete default nodepool (rc=%d): %s",
-            rc,
-            stderr,
-        )
-    else:
-        logging.info("[swap_encryption] Default nodepool deleted")
 
-
-def _is_pod_gone(pod: str) -> bool:
-    """Return True if the named pod no longer exists in the cluster.
-
-    Used to distinguish OOM-killed container processes (pod still alive, rc=137)
-    from OOM-evicted pods (pod gone, DaemonSet will create a replacement).
-    """
     try:
-        _, err, rc = kubectl.RunKubectlCommand(
-            [
-                "get",
-                "pod",
-                pod,
-                "-n",
-                _DS_NAMESPACE,
-                "-o",
-                "jsonpath={.metadata.name}",
-            ],
-            raise_on_failure=False,
-            timeout=15,
-        )
-        return rc != 0 and "not found" in (err or "").lower()
-    except Exception:  # pylint: disable=broad-except
-        return False
-
-
-def _pod_exec(
-    pod: str,
-    cmd: str,
-    ignore_failure: bool = False,
-    timeout: int = 300,
-    _retries: int = 2,
-) -> tuple[str, str]:
-    """Run a shell command inside the benchmark pod via kubectl exec.
-
-    Args:
-      pod: Pod name returned by _wait_for_benchmark_pod.
-      cmd: Shell command string passed to bash -c.
-      ignore_failure: When True, non-zero exit codes are logged but not
-        raised.
-      timeout: Seconds before PKB kills the kubectl exec process. Default
-        300 s matches PKB's IssueCommand default. Pass a larger value for
-        long-running jobs (fio, stress-ng, kernel build).
-      _retries: Number of automatic retries on transient GKE websocket
-        resets ("connection reset by peer").  Set to 0 to disable retries
-        for idempotent-sensitive commands.
-
-    Returns:
-      Tuple of (stdout, stderr) strings.
-    """
-    # Use module-level constants for error strings (defined at top of module).
-    # Use the globally-tracked active pod name — it may have been updated by
-    # a previous _recover_pod call when eviction replaced the pod.
-    active = _active_pod[0] if _active_pod else pod
-
-    for attempt in range(_retries + 1):
-        out, err, rc = kubectl.RunKubectlCommand(
-            ["exec", active, "-n", _DS_NAMESPACE, "--", "bash", "-c", cmd],
-            raise_on_failure=False,
-            raise_on_timeout=False,  # let _pod_exec's own retry loop handle transient resets
-            timeout=timeout,
-        )
-        is_transient = rc != 0 and any(
-            e in err for e in _TRANSIENT_KUBECTL_ERRORS
-        )
-        if is_transient and attempt < _retries:
-            logging.warning(
-                "[swap_encryption] kubectl exec connection reset (attempt"
-                " %d/%d); retrying in 10 s",
-                attempt + 1,
-                _retries + 1,
-            )
-            time.sleep(10)
-            continue
-        # rc=137 (SIGKILL): the OOM killer terminated the container process.
-        # Two sub-cases:
-        #   A) Pod eviction: pod is gone, DaemonSet recreates it under a new name.
-        #   B) Container OOM restart: pod still exists, container restarts in place.
-        #      (DaemonSet restartPolicy=Always restarts the container, /tmp is lost,
-        #      tools must be re-installed before subsequent commands can run.)
-        # In both cases we call _recover_pod to wait for tools + sentinel, and
-        # we do NOT retry the OOM-triggering command itself.
-        if rc == 137:
-            # Record the OOM so the run-level gate can flag it even if the container
-            # restarts in place under the same pod name (which leaves both the
-            # "pod replaced" and "pod NotFound" checks silent).
-            if active not in _oom_events:
-                _oom_events.append(active)
-            # CRITICAL: sleep before checking pod state.  Kubernetes takes a few
-            # seconds to mark a just-evicted pod as Terminating / NotFound.  Without
-            # this delay _recover_pod sees the pod still in "Running" phase, returns
-            # the old pod name immediately, and every subsequent command fails with
-            # "Error from server (NotFound): pods … not found".
-            logging.warning(
-                "[swap_encryption] rc=137 — sleeping 15s for Kubernetes to"
-                " update pod state before recovery check"
-            )
-            time.sleep(15)
-            pod_gone = _is_pod_gone(active)
-            if pod_gone:
-                logging.warning(
-                    "[swap_encryption] OOM-eviction detected (rc=137, pod gone)"
-                    " — recovering pod name for subsequent commands (not"
-                    " retrying this cmd)"
-                )
-            else:
-                logging.warning(
-                    "[swap_encryption] Container OOM-killed (rc=137, pod still"
-                    " exists) — waiting for container restart and tool"
-                    " re-install before continuing"
-                )
-            new_pod = _recover_pod(active)
-            if new_pod != active:
-                logging.info(
-                    "[swap_encryption] Pod name updated: %s → %s",
-                    active,
-                    new_pod,
-                )
-                if _active_pod:
-                    _active_pod[0] = new_pod
-                active = new_pod
-            break  # Do NOT retry — the OOM cmd itself is not re-run on the new pod.
+        mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        mem_gb = 0
+    try:
+        swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        swap_gb = 0
 
-        is_container_gone = rc != 0 and any(
-            e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
-        )
-        if is_container_gone:
-            # Record the loss for the run-level degradation gate REGARDLESS of retry
-            # budget or ignore_failure.  A "pods … not found" on a best-effort command
-            # (kernel build, opensearch, cleanup of a dead pod) still means the pod
-            # died; without this the gate stays blind because _active_pod is only
-            # renamed on the retry path below, which _retries=0 callers never reach.
-            if active and active not in _pod_lost:
-                _pod_lost.append(active)
-                logging.error(
-                    "[swap_encryption] Benchmark pod %s is gone (%s) —"
-                    " recording run as degraded",
-                    active,
-                    (err or "").strip()[:160],
-                )
-            if attempt < _retries:
-                logging.warning(
-                    "[swap_encryption] Container gone/restarting (attempt"
-                    " %d/%d) — waiting for pod to recover...",
-                    attempt + 1,
-                    _retries + 1,
-                )
-                new_pod = _recover_pod(active)
-                if new_pod != active:
-                    logging.info(
-                        "[swap_encryption] Pod name updated: %s → %s",
-                        active,
-                        new_pod,
-                    )
-                    if _active_pod:
-                        _active_pod[0] = new_pod
-                    active = new_pod
-                continue
-        break
-
-    if rc != 0 and not ignore_failure:
-        raise errors.VmUtil.IssueCommandError(
-            f"[swap_encryption] _pod_exec failed (rc={rc}): {err}"
+    # Encryption type — key off dm-crypt presence + swap target.
+    enc = 'unknown'
+    if '/dev/mapper/' in swap_dev:
+        table_out, _ = daemonset.PodExec(
+            f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
+            ignore_failure=True,
         )
-    return out, err
-
-
-def _recover_pod(pod: str, timeout_sec: int = 600) -> str:
-    """Wait for a DaemonSet container to recover after OOM kill or eviction.
+        enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
+    elif _SWAP_TYPE.value in ('instance_store', 'io2'):
+        enc = 'nitro_hardware_offload'
+    elif not _ENABLE_DMCRYPT.value:
+        enc = 'none'
 
-    Handles two scenarios:
-    1. Container OOM restart: same pod name, container restarting in place.
-       DaemonSet restartPolicy=Always brings it back under the same pod name.
-    2. Pod eviction/deletion: the pod is gone entirely; the DaemonSet creates
-       a new pod with a DIFFERENT name.  We detect this by checking whether
-       the named pod still exists; if not, we search by the DaemonSet label
-       selector for a Running pod.
+    cloud = _detect_cloud(daemonset)
 
-    Returns the (possibly new) pod name once it is Running and ready.
-    """
-    deadline = time.time() + timeout_sec
-    logging.info(
-        "[swap_encryption] Waiting for pod %s to recover (up to %ds)...",
-        pod,
-        timeout_sec,
-    )
-
-    # Phase 1: wait for a Running pod — either the named one (container
-    # restart) or a replacement pod found via label selector (eviction).
-    #
-    # IMPORTANT: we query BOTH status.phase AND metadata.deletionTimestamp in a
-    # single call.  When a pod is evicted, Kubernetes first sets deletionTimestamp
-    # (the pod is "Terminating") while status.phase may still read "Running" for
-    # several seconds.  Checking only status.phase causes a false-positive: we
-    # return the old pod name immediately and every subsequent command fails with
-    # "Error from server (NotFound)".  Checking deletionTimestamp catches this.
-    recovered_pod = pod
-    while time.time() < deadline:
-        # IMPORTANT: capture stderr — kubectl writes "not found" to stderr, not
-        # stdout.  When the pod is gone, status_out is empty and the error text
-        # lives entirely in status_err.  Discarding stderr (using _) means the
-        # 'not found' check below never fires and we spin until deadline.
-        status_out, status_err, status_rc = kubectl.RunKubectlCommand(
-            [
-                "get",
-                "pod",
-                pod,
-                "-n",
-                _DS_NAMESPACE,
-                "-o",
-                "jsonpath={.status.phase}|{.metadata.deletionTimestamp}",
-            ],
-            raise_on_failure=False,
-            timeout=30,
+    instance_label = _INSTANCE_SIZE_LABEL.value
+    if not instance_label:
+        gcp_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail'
+            ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
+            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+            ignore_failure=True,
         )
-        # Parse "Running|" (no deletionTimestamp) vs "Running|2026-…" (terminating)
-        fields = status_out.strip().split("|")
-        phase = fields[0].strip() if fields else ""
-        is_terminating = len(fields) > 1 and bool(fields[1].strip())
-
-        # Pod is genuinely Running and NOT being deleted — recovery complete.
-        if status_rc == 0 and phase == "Running" and not is_terminating:
-            break
-
-        # Pod no longer exists, OR it exists but is being terminated (Terminating
-        # state or deletionTimestamp set) — look for a replacement pod by label.
-        pod_gone_or_terminating = (
-            status_rc != 0 and "not found" in (status_out + status_err).lower()
-        ) or is_terminating
-        if pod_gone_or_terminating:
-            label_out, _, label_rc = kubectl.RunKubectlCommand(
-                [
-                    "get",
-                    "pods",
-                    "-n",
-                    _DS_NAMESPACE,
-                    "-l",
-                    f"app={_DS_LABEL}",
-                    "-o",
-                    (
-                        'jsonpath={range .items[?(@.status.phase=="Running")]}'
-                        '{.metadata.name}{"\\n"}{end}'
-                    ),
-                ],
-                raise_on_failure=False,
-                timeout=30,
-            )
-            new_pods = [
-                p.strip()
-                for p in label_out.strip().splitlines()
-                if p.strip() and p.strip() != pod
-            ]  # exclude the dying pod
-            if label_rc == 0 and new_pods:
-                recovered_pod = new_pods[0]
-                logging.info(
-                    "[swap_encryption] Original pod %s gone/terminating; "
-                    "found replacement %s",
-                    pod,
-                    recovered_pod,
-                )
-                break
-
-        time.sleep(10)
-    else:
-        raise errors.VmUtil.IssueCommandError(
-            f"[swap_encryption] No Running pod found (original: {pod}) "
-            f"within {timeout_sec}s after OOM kill / eviction"
+        if gcp_type_out.strip():
+            instance_label = gcp_type_out.strip().split('/')[-1]
+    if not instance_label:
+        aws_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail '
+            'http://169.254.169.254/latest/meta-data/instance-type '
+            '2>/dev/null || echo ""',
+            ignore_failure=True,
         )
+        instance_label = aws_type_out.strip()
 
-    # Phase 2: wait for init script to finish (sentinel written last).
-    while time.time() < deadline:
-        ready_out, _, ready_rc = kubectl.RunKubectlCommand(
-            [
-                "exec",
-                recovered_pod,
-                "-n",
-                _DS_NAMESPACE,
-                "--",
-                "bash",
-                "-c",
-                "test -f /tmp/pkb_ready && echo READY",
-            ],
-            raise_on_failure=False,
-            timeout=30,
-        )
-        if ready_rc == 0 and "READY" in ready_out:
-            logging.info(
-                "[swap_encryption] Pod %s recovered (swap device active)",
-                recovered_pod,
-            )
-            return recovered_pod
-        time.sleep(15)
+    return {
+        'benchmark': BENCHMARK_NAME,
+        'execution_mode': 'kubernetes_privileged_pod',
+        'cloud': cloud,
+        'instance_size': instance_label,
+        'kernel_version': kernel_out.strip(),
+        'host_memory_gb': mem_gb,
+        'swap_device': swap_dev,
+        'swap_size_gb': swap_gb,
+        'swap_encryption': enc,
+        'storage_target': _SWAP_TYPE.value,
+        'boot_disk_type': _BOOT_DISK_TYPE.value,
+        'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
+        'node_image_type': _NODE_IMAGE_TYPE.value,
+        'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
+        'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
+        'zswap_enabled': _ENABLE_ZSWAP.value,
+        'min_free_kbytes': _MIN_FREE_KBYTES.value,
+        'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
+        'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
+        'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
+        'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
+        'nodepool': _NODEPOOL.value,
+    }
 
-    raise errors.VmUtil.IssueCommandError(
-        f"[swap_encryption] Pod {recovered_pod} did not become ready "
-        f"within {timeout_sec}s after OOM kill / eviction"
+
+def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str:
+    """Return 'GCP' if the GCE metadata server responds, else 'AWS'."""
+    gcp_out, _ = daemonset.PodExec(
+        'curl -s -m 2 --fail '
+        'http://metadata.google.internal/computeMetadata/v1/project/project-id'
+        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        ignore_failure=True,
     )
+    if gcp_out.strip():
+        return 'GCP'
+    return 'AWS'
 
 
 def _run_phase1_fio(
-    pod: str, swap_dev: str, base_meta: dict[str, Any]
+    daemonset: _ds_mod.SwapDaemonSet,
+    swap_dev: str,
+    base_meta: dict[str, Any],
 ) -> list[sample.Sample]:
     """Run fio microbenchmarks on the raw swap block device (Phase 1).
 
@@ -1639,7 +788,7 @@ def _run_phase1_fio(
       4k_lat_read   iodepth=1   → completion latency floor (read)
 
     Args:
-      pod: Benchmark pod name.
+      daemonset: Active SwapDaemonSet resource.
       swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted.
       base_meta: Shared metadata dict from _build_metadata().
 
@@ -1648,51 +797,48 @@ def _run_phase1_fio(
     """
     samples: list[sample.Sample] = []
 
-    # swapoff before fio — running fio with --direct=1 on an active swap
-    # device races with kernel page-reclaim on the same dm-crypt target
-    # and can cause kernel panics on some kernels.
-    logging.info("[swap_encryption] Phase 1: swapoff %s", swap_dev)
-    _pod_exec(
-        pod,
-        f"swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true",
+    # swapoff before fio — running fio with --direct=1 on an active swap device
+    # races with kernel page-reclaim on the same dm-crypt target.
+    logging.info('[swap_encryption] Phase 1: swapoff %s', swap_dev)
+    daemonset.PodExec(
+        f'swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true',
         timeout=30,
         ignore_failure=True,
     )
 
     # (name, rw_mode, block_size, iodepth)
     fio_jobs = [
-        ("4k_randread", "randread", "4k", 32),
-        ("4k_randwrite", "randwrite", "4k", 32),
-        ("1m_seqread", "read", "1m", 8),
-        ("1m_seqwrite", "write", "1m", 8),
-        ("4k_lat_read", "randread", "4k", 1),
+        ('4k_randread', 'randread', '4k', 32),
+        ('4k_randwrite', 'randwrite', '4k', 32),
+        ('1m_seqread', 'read', '1m', 8),
+        ('1m_seqwrite', 'write', '1m', 8),
+        ('4k_lat_read', 'randread', '4k', 1),
     ]
 
     runtime = _FIO_RUNTIME_SEC.value
     try:
         for name, rw, bs, iodepth in fio_jobs:
             cmd = (
-                f"fio --name={name} --filename={swap_dev}"
-                f" --rw={rw} --bs={bs} --iodepth={iodepth}"
-                " --ioengine=libaio --direct=1"
-                f" --runtime={runtime} --time_based --group_reporting"
-                " --output-format=json 2>/dev/null"
+                f'fio --name={name} --filename={swap_dev}'
+                f' --rw={rw} --bs={bs} --iodepth={iodepth}'
+                ' --ioengine=libaio --direct=1'
+                f' --runtime={runtime} --time_based --group_reporting'
+                ' --output-format=json 2>/dev/null'
             )
-            logging.info("[swap_encryption] Phase 1: fio job %s", name)
-            out, _ = _pod_exec(pod, cmd, timeout=runtime + 120)
+            logging.info('[swap_encryption] Phase 1: fio job %s', name)
+            out, _ = daemonset.PodExec(cmd, timeout=runtime + 120)
             samples += _parse_fio_json(out, name, base_meta)
     finally:
         # Always re-enable swap so subsequent phases can drive swap I/O.
-        logging.info("[swap_encryption] Phase 1: swapon %s", swap_dev)
-        _pod_exec(
-            pod,
-            f"swapon {swap_dev} 2>/dev/null || true",
+        logging.info('[swap_encryption] Phase 1: swapon %s', swap_dev)
+        daemonset.PodExec(
+            f'swapon {swap_dev} 2>/dev/null || true',
             timeout=30,
             ignore_failure=True,
         )
 
     logging.info(
-        "[swap_encryption] Phase 1 complete (%d samples)", len(samples)
+        '[swap_encryption] Phase 1 complete (%d samples)', len(samples)
     )
     return samples
 
@@ -1714,10 +860,10 @@ def _parse_fio_json(
       List of Sample objects; empty if output cannot be parsed or is zero.
     """
     # fio sometimes emits kernel warnings before the JSON object.
-    json_start = fio_output.find("{")
+    json_start = fio_output.find('{')
     if json_start == -1:
         logging.warning(
-            "[swap_encryption] Phase 1: no JSON in fio output for %s", job_name
+            '[swap_encryption] Phase 1: no JSON in fio output for %s', job_name
         )
         return []
 
@@ -1725,13 +871,13 @@ def _parse_fio_json(
         data = json.loads(fio_output[json_start:])
     except json.JSONDecodeError as e:
         logging.warning(
-            "[swap_encryption] Phase 1: fio JSON parse error (%s): %s",
+            '[swap_encryption] Phase 1: fio JSON parse error (%s): %s',
             job_name,
             e,
         )
         return []
 
-    jobs = data.get("jobs", [])
+    jobs = data.get('jobs', [])
     if not jobs:
         return []
 
@@ -1739,43 +885,43 @@ def _parse_fio_json(
     samples: list[sample.Sample] = []
     meta = dict(base_meta, fio_job=job_name)
 
-    for direction in ("read", "write"):
+    for direction in ('read', 'write'):
         d = job.get(direction, {})
-        iops = float(d.get("iops", 0))
-        bw_kbps = float(d.get("bw", 0))  # fio reports KiB/s
+        iops = float(d.get('iops', 0))
+        bw_kbps = float(d.get('bw', 0))  # fio reports KiB/s
         bw_mbps = bw_kbps / 1024.0
 
-        # Skip directions with near-zero throughput (e.g. write on a randread job).
+        # Skip directions with near-zero throughput.
         if iops < 1 and bw_kbps < 1:
             continue
 
-        prefix = f"phase1_fio_{job_name}_{direction}"
-        samples.append(sample.Sample(f"{prefix}_iops", iops, "IOPS", meta))
+        prefix = f'phase1_fio_{job_name}_{direction}'
+        samples.append(sample.Sample(f'{prefix}_iops', iops, 'IOPS', meta))
         samples.append(
-            sample.Sample(f"{prefix}_bw_mbps", bw_mbps, "MB/s", meta)
+            sample.Sample(f'{prefix}_bw_mbps', bw_mbps, 'MB/s', meta)
         )
 
         # Completion latency — fio reports nanoseconds; emit microseconds.
-        clat = d.get("clat_ns", d.get("lat_ns", {}))
-        lat_mean_ns = float(clat.get("mean", 0))
+        clat = d.get('clat_ns', d.get('lat_ns', {}))
+        lat_mean_ns = float(clat.get('mean', 0))
         if lat_mean_ns > 0:
             samples.append(
                 sample.Sample(
-                    f"{prefix}_lat_mean_us", lat_mean_ns / 1000.0, "us", meta
+                    f'{prefix}_lat_mean_us', lat_mean_ns / 1000.0, 'us', meta
                 )
             )
             for pct_key, label in (
-                ("50.000000", "p50"),
-                ("99.000000", "p99"),
-                ("99.900000", "p999"),
+                ('50.000000', 'p50'),
+                ('99.000000', 'p99'),
+                ('99.900000', 'p999'),
             ):
-                val_ns = clat.get("percentile", {}).get(pct_key, 0)
+                val_ns = clat.get('percentile', {}).get(pct_key, 0)
                 if val_ns:
                     samples.append(
                         sample.Sample(
-                            f"{prefix}_lat_{label}_us",
+                            f'{prefix}_lat_{label}_us',
                             val_ns / 1000.0,
-                            "us",
+                            'us',
                             meta,
                         )
                     )
@@ -1785,258 +931,73 @@ def _parse_fio_json(
 
 _INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
     # GCP  (on-demand, us-central1 unless noted)
-    "c4-standard-8-lssd": 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
-    "c4-standard-8": 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
-    "n4-highmem-32": 3.0256,  # 32 vCPU, 256 GB RAM
-    "n2-highmem-32": 2.5216,  # 32 vCPU, 256 GB RAM
-    "n2-standard-32": 1.5264,  # 32 vCPU, 120 GB RAM
-    "z3-highmem-8": 2.7248,  # 8 vCPU + 4× LSSD
+    'c4-standard-8-lssd': 0.5888,
+    'c4-standard-8': 0.5008,
+    'n4-highmem-32': 3.0256,
+    'n2-highmem-32': 2.5216,
+    'n2-standard-32': 1.5264,
+    'z3-highmem-8': 2.7248,
     # AWS
-    "i4i.4xlarge": 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
-    "i4i.2xlarge": 0.7480,
-    "m6id.4xlarge": 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
-    "m6i.4xlarge": 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
-    "r6i.4xlarge": 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
+    'i4i.4xlarge': 1.4960,
+    'i4i.2xlarge': 0.7480,
+    'm6id.4xlarge': 0.9072,
+    'm6i.4xlarge': 0.7680,
+    'r6i.4xlarge': 1.0080,
 }
 
 
 def _collect_cost_sample(
-    pod: str, elapsed_sec: float, base_meta: dict
+    daemonset: _ds_mod.SwapDaemonSet,
+    elapsed_sec: float,
+    base_meta: dict,
 ) -> list[sample.Sample]:
-    """Emit a cost_estimate_usd sample for the benchmark run (gap 7).
-
-    Instance type is read from cloud metadata inside the pod.  Price is looked
-    up from _INSTANCE_PRICE_USD_PER_HR; if unknown, the sample is omitted and
-    a warning is logged.
+    """Emit a cost_estimate_usd sample for the benchmark run."""
+    instance_type = ''
 
-    Args:
-      pod: Benchmark pod name.
-      elapsed_sec: Wall-clock seconds the benchmark phases took.
-      base_meta: Shared metadata dict.
-
-    Returns:
-      A list of zero or one sample.Sample.
-    """
-    # Detect instance type from cloud metadata
-    instance_type = ""
-
-    # GCP: machine type is the last segment of the metadata URL value
-    gcp_type_out, _ = _pod_exec(
-        pod,
-        "curl -s -m 3 --fail"
-        " http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
+    gcp_type_out, _ = daemonset.PodExec(
+        'curl -s -m 3 --fail'
+        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
         ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
         ignore_failure=True,
     )
     if gcp_type_out.strip():
-        instance_type = gcp_type_out.strip().split("/")[-1]
+        instance_type = gcp_type_out.strip().split('/')[-1]
 
     if not instance_type:
-        # AWS: instance-type is a plain string
-        aws_type_out, _ = _pod_exec(
-            pod,
-            "curl -s -m 3 --fail "
-            "http://169.254.169.254/latest/meta-data/instance-type "
+        aws_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail '
+            'http://169.254.169.254/latest/meta-data/instance-type '
             '2>/dev/null || echo ""',
             ignore_failure=True,
         )
         instance_type = aws_type_out.strip()
 
-    # Allow explicit override (useful when running on custom/renamed machine
-    # types or when the pod was unavailable during cost collection).
     if _INSTANCE_SIZE_LABEL.value:
         instance_type = _INSTANCE_SIZE_LABEL.value
 
-    # Last resort: fall back to the benchmark machine type flag.  This ensures
-    # cost tracking works even when the pod was evicted before cost collection
-    # ran (in which case the metadata curl above returned empty).
     if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
         instance_type = _BENCHMARK_MACHINE_TYPE.value
         logging.info(
-            "[swap_encryption] Instance type from metadata unavailable; using"
-            " --swap_encryption_benchmark_machine_type=%s for cost tracking",
+            '[swap_encryption] Instance type from metadata unavailable; using'
+            ' --swap_encryption_benchmark_machine_type=%s for cost tracking',
             instance_type,
         )
 
     price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
     if price is None:
         logging.warning(
-            '[swap_encryption] Unknown instance type "%s" – skipping cost'
-            " sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost"
-            " tracking.",
+            '[swap_encryption] Unknown instance type "%s" — skipping cost'
+            ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost'
+            ' tracking.',
             instance_type,
         )
         return []
 
     hours = elapsed_sec / 3600.0
-    cost = hours * price
     meta = dict(
         base_meta,
         instance_type=instance_type,
         price_usd_per_hr=price,
         benchmark_elapsed_sec=round(elapsed_sec, 1),
     )
-    return [sample.Sample("cost_estimate_usd", cost, "USD", meta)]
-
-
-def _detect_swap_device(pod: str) -> str:
-    """Return the active swap device path on the cluster node."""
-    if _SWAP_DEVICE.value:
-        return _SWAP_DEVICE.value
-
-    # /proc/swaps is the source of truth: it lists the swap device that is
-    # ACTUALLY active.  We must NOT just `test -e /dev/mapper/swap_encrypted`,
-    # because a stale dm-crypt mapping from a previous run on a reused node can
-    # still exist as a /dev node while being non-functional (fio/swapoff then
-    # fail with "No such device or address").  So read the active device from
-    # /proc/swaps first; only fall back to the mapper path if /proc/swaps is
-    # somehow empty but the mapper is genuinely present.
-    dm_out, _ = _pod_exec(
-        pod,
-        textwrap.dedent("""
-        ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
-        if [ -n "$ACTIVE" ]
-        then
-          echo "$ACTIVE"
-        elif test -e /dev/mapper/swap_encrypted
-        then
-          echo /dev/mapper/swap_encrypted
-        fi
-      """),
-        ignore_failure=True,
-    )
-    dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ""
-    if dev:
-        return dev
-    raise ValueError(
-        "No active swap device found in the benchmark pod. "
-        "Use --swap_encryption_device to specify one."
-    )
-
-
-def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
-    """Collect node environment, encryption type, and config into a dict."""
-
-    kernel_out, _ = _pod_exec(pod, "uname -r", ignore_failure=True)
-    mem_out, _ = _pod_exec(
-        pod,
-        "awk '/MemTotal/{print $2}' /proc/meminfo",
-        ignore_failure=True,
-    )
-    swap_out, _ = _pod_exec(
-        pod,
-        "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps",
-        ignore_failure=True,
-    )
-
-    try:
-        mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
-    except ValueError:
-        mem_gb = 0
-    try:
-        swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
-    except ValueError:
-        swap_gb = 0
-
-    # Encryption type — key off dm-crypt presence + the swap target, NOT the
-    # device path.  A GKE plain Local SSD is /dev/nvme0n1 but is NOT Nitro-
-    # encrypted; only the AWS targets (instance_store / io2) are.
-    enc = "unknown"
-    if "/dev/mapper/" in swap_dev:
-        table_out, _ = _pod_exec(
-            pod,
-            f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        enc = "dm-crypt-plain" if "crypt" in table_out.lower() else "dm-other"
-    elif _SWAP_TYPE.value in ("instance_store", "io2"):
-        enc = "nitro_hardware_offload"  # AWS: encrypted by the Nitro card
-    elif not _ENABLE_DMCRYPT.value:
-        enc = "none"  # GKE plain swap (encryption OFF)
-
-    cloud = _detect_cloud(pod)
-
-    # Gap 6: instance size label for multi-size comparison runs.
-    # If the flag is set use it directly; otherwise try to read it from
-    # cloud metadata so that the field is always populated.
-    instance_label = _INSTANCE_SIZE_LABEL.value
-    if not instance_label:
-        gcp_type_out, _ = _pod_exec(
-            pod,
-            "curl -s -m 3 --fail"
-            " http://metadata.google.internal/computeMetadata/v1/instance/machine-type"
-            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        if gcp_type_out.strip():
-            instance_label = gcp_type_out.strip().split("/")[-1]
-    if not instance_label:
-        aws_type_out, _ = _pod_exec(
-            pod,
-            "curl -s -m 3 --fail "
-            "http://169.254.169.254/latest/meta-data/instance-type "
-            '2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        instance_label = aws_type_out.strip()
-
-    return {
-        "benchmark": BENCHMARK_NAME,
-        "execution_mode": "kubernetes_privileged_pod",
-        "cloud": cloud,
-        "instance_size": instance_label,
-        "kernel_version": kernel_out.strip(),
-        "host_memory_gb": mem_gb,
-        "swap_device": swap_dev,
-        "swap_size_gb": swap_gb,
-        "swap_encryption": enc,
-        # Test-matrix columns: storage target, encryption on/off, image, IOPS
-        "storage_target": _SWAP_TYPE.value,
-        "boot_disk_type": _BOOT_DISK_TYPE.value,
-        "dmcrypt_enabled": _ENABLE_DMCRYPT.value,
-        "node_image_type": _NODE_IMAGE_TYPE.value,
-        "boot_disk_iops_target": _BOOT_DISK_IOPS.value,
-        "benchmark_machine_type": _BENCHMARK_MACHINE_TYPE.value,
-        # Other config
-        "zswap_enabled": _ENABLE_ZSWAP.value,
-        "min_free_kbytes": _MIN_FREE_KBYTES.value,
-        "fio_runtime_sec": _FIO_RUNTIME_SEC.value,
-        # Requested config value only.  The *effective* stress-ng footprint may
-        # be autoscaled per node (see _autoscale_vm_bytes); Phase 2a records the
-        # actual value it ran with as 'stress_vm_bytes' so the two never conflict.
-        "stress_vm_bytes_requested": _STRESS_VM_BYTES.value,
-        "stress_vm_bytes_list": _STRESS_VM_BYTES_LIST.value,
-        "stress_timeout_sec": _STRESS_TIMEOUT_SEC.value,
-        "nodepool": _NODEPOOL.value,
-    }
-
-
-def _detect_cloud(pod: str) -> str:
-    """Detect whether the benchmark pod is running on GCP or AWS.
-
-    Queries the cloud instance metadata endpoint inside the pod.  Returns
-    'GCP' if the GCP metadata server responds, 'AWS' otherwise.
-    """
-    gcp_out, _ = _pod_exec(
-        pod,
-        "curl -s -m 2 --fail "
-        "http://metadata.google.internal/computeMetadata/v1/project/project-id"
-        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-        ignore_failure=True,
-    )
-    if gcp_out.strip():
-        return "GCP"
-    return "AWS"
-
-
-def _ensure_io2_volume() -> None:
-    """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2).
-
-    Only executed when --swap_encryption_swap_type=io2.  Full implementation
-    is deferred to PR2 (swap-capability layer).
-    """
-    if _SWAP_TYPE.value != "io2":
-        return
-    logging.info(
-        "[swap_encryption] io2 swap volume provisioning deferred to PR2"
-    )
+    return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)]
diff --git a/perfkitbenchmarker/resources/container_service/swap_daemonset.py b/perfkitbenchmarker/resources/container_service/swap_daemonset.py
new file mode 100644
index 0000000000..ab23c8d6aa
--- /dev/null
+++ b/perfkitbenchmarker/resources/container_service/swap_daemonset.py
@@ -0,0 +1,609 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SwapDaemonSet: PKB BaseResource for the swap-encryption privileged DaemonSet.
+
+Manages the full lifecycle of the privileged benchmark pod used by the
+swap_encryption benchmark:
+
+  _Create()   — apply the Jinja2 manifest via kubernetes_commands.ApplyManifest
+                and wait for the pod to reach Running + /tmp/pkb_ready.
+  _Delete()   — run in-pod cleanup (swapoff, dmsetup remove, losetup teardown,
+                pkill fio/stress-ng) then kubectl delete daemonset.
+  PodExec()   — kubectl exec wrapper with transient-reset retry, OOM-kill (rc=137)
+                detection, and automatic RecoverPod() after eviction or container
+                restart.
+  WaitForPod()  — polls for Running phase + sentinel; updates self.pod_name.
+  RecoverPod()  — waits for DaemonSet to recreate / restart the container,
+                  checking deletionTimestamp to avoid false-positive Running state.
+
+Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern
+(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses,
+not in benchmark files.
+"""
+
+import logging
+import textwrap
+import time
+from typing import Optional
+
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import resource
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.resources.container_service import kubernetes_commands
+
+# kubectl stderr substrings for transient resets that PodExec retries.
+_TRANSIENT_KUBECTL_ERRORS = ('connection reset by peer', 'websocket: close')
+
+# Substrings indicating the container / pod is gone and needs full recovery.
+_CONTAINER_GONE_KUBECTL_ERRORS = (
+    'container not found',
+    'procready not received',  # presumably matched on lowered stderr
+    'unable to upgrade connection',
+    'not found',  # NOTE(review): also matches "command not found"; confirm
+    'deleted state',
+)
+
+
+class SwapDaemonSet(resource.BaseResource):
+    """PKB resource for the swap-encryption benchmark privileged DaemonSet.
+
+    The DaemonSet runs a single privileged pod on the benchmark nodepool.
+    It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli),
+    verifies the swap device is active, then writes /tmp/pkb_ready.  All
+    benchmark phases execute commands inside this pod via PodExec().
+
+    Attributes:
+      name: DaemonSet metadata.name (e.g. 'pkb-swap-benchmark').
+      namespace: Kubernetes namespace (typically 'default').
+      label: Pod label value for app= selector.
+      nodepool: pkb_nodepool label value pinning the DaemonSet to the
+        benchmark node.
+      image: Container image (e.g. 'ubuntu:22.04').
+      pod_name: Name of the currently active pod; updated by WaitForPod /
+        RecoverPod on eviction.
+      oom_events: Pod names that triggered rc=137 OOM-kill; read by Run()
+        for the degradation gate.
+      pod_lost: Pod names that went NotFound during PodExec; read by Run()
+        for the degradation gate.
+    """
+
+    RESOURCE_TYPE = 'SwapDaemonSet'
+    REQUIRED_ATTRS = []
+
+    def __init__(
+        self,
+        name: str,
+        namespace: str,
+        label: str,
+        nodepool: str,
+        image: str,
+    ) -> None:
+        """Store the DaemonSet identity and reset per-run pod tracking."""
+        super().__init__()
+        # Identity and manifest inputs, in the order _Create passes them on.
+        self.name, self.namespace, self.label = name, namespace, label
+        self.nodepool, self.image = nodepool, image
+        # Pod names recorded by PodExec (per its docstring): rc=137 OOM kills
+        # and pods that went away mid-command.
+        self.oom_events: list[str] = []
+        self.pod_lost: list[str] = []
+        # Name of the pod that commands are sent to; set by WaitForPod.
+        self.pod_name: Optional[str] = None
+
+    # ── PKB lifecycle ─────────────────────────────────────────────────────────
+
+    def _Create(self) -> None:
+        """Apply the DaemonSet manifest; raise if its pod never gets ready."""
+        kubernetes_commands.ApplyManifest(
+            'cluster/swap_encryption_daemonset.yaml.j2',
+            ds_name=self.name,
+            ds_namespace=self.namespace,
+            ds_label=self.label,
+            benchmark_nodepool=self.nodepool,
+            image=self.image,
+        )
+        logging.info('[swap_encryption] Swap-infra DaemonSet applied')
+        if self.WaitForPod() is not None:
+            return
+        raise errors.Benchmarks.PrepareException(
+            '[swap_encryption] DaemonSet pod did not become ready within'
+            ' timeout'
+        )
+
+    def _Delete(self) -> None:
+        """Run best-effort in-pod teardown, then delete the DaemonSet.
+
+        Runs swapoff, dmsetup remove, losetup cleanup and pkill in the pod,
+        each with ignore_failure=True and _retries=0.  Backslash-newlines in
+        the scripts are consumed by Python, so bash sees those lines joined.
+        """
+        # No pod recorded yet: allow WaitForPod 30 s to find one to clean up.
+        if self.pod_name is None:
+            self.WaitForPod(timeout=30)
+
+        if self.pod_name:
+            self.PodExec(
+                'swapoff -a 2>/dev/null || true',
+                ignore_failure=True,
+                _retries=0,
+            )
+            self.PodExec(
+                textwrap.dedent("""\
+                    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+                    dmsetup remove --noudevrules --noudevsync \
+                      swap_encrypted 2>/dev/null || true
+                """),
+                ignore_failure=True,
+                _retries=0,
+            )
+            self.PodExec(
+                textwrap.dedent("""\
+                    for backing in \
+                        /var/pkb_swap_backing \
+                        /run/pkb_swap_backing \
+                        /mnt/stateful_partition/pkb_swap_backing
+                    do
+                      losetup -j "$backing" 2>/dev/null \
+                        | awk -F: '{print $1}' \
+                        | while read dev
+                          do losetup -d "$dev" 2>/dev/null || true; done
+                      rm -f "$backing"
+                    done
+                """),
+                ignore_failure=True,
+                _retries=0,
+            )
+            self.PodExec(
+                "pkill -9 'stress-ng|fio' 2>/dev/null || true",
+                ignore_failure=True,
+                _retries=0,
+            )
+
+        kubectl.RunKubectlCommand(
+            [
+                'delete',
+                'daemonset',
+                self.name,
+                '-n',
+                self.namespace,
+                '--ignore-not-found',
+            ],
+            raise_on_failure=False,
+        )
+        logging.info('[swap_encryption] DaemonSet deleted')
+
+    # ── Pod lifecycle helpers ─────────────────────────────────────────────────
+
+    def WaitForPod(self, timeout: int = 600) -> Optional[str]:
+        """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists.
+
+        Two-phase poll, repeated every 15 s until the deadline:
+          1. List pods with label app=<label>; latch the first Running one.
+          2. kubectl exec test -f /tmp/pkb_ready in the latched pod.
+
+        If the exec fails with any _CONTAINER_GONE_KUBECTL_ERRORS substring
+        (container crashed, pod deleted or replaced), the latched pod is
+        dropped and step 1 runs again, so a recreated pod is picked up.
+        The default 600 s is meant to cover tool installation on a cold node.
+
+        Args:
+          timeout: Maximum seconds to wait.
+
+        Returns:
+          Pod name on success; None on timeout.  Also updates self.pod_name.
+        """
+        deadline = time.time() + timeout
+        last_phase = ''
+        ready_pod = None
+
+        while time.time() < deadline:
+            # Step 1: wait for Running phase.
+            if ready_pod is None:
+                out, _, rc = kubectl.RunKubectlCommand(
+                    [
+                        'get',
+                        'pods',
+                        '-l',
+                        f'app={self.label}',
+                        '-n',
+                        self.namespace,
+                        '-o',
+                        (
+                            r'jsonpath={range .items[*]}'
+                            r'{.metadata.name}{"\t"}'
+                            r'{.status.phase}{"\n"}{end}'
+                        ),
+                    ],
+                    raise_on_failure=False,
+                )
+                if rc == 0 and out.strip():
+                    for line in out.strip().splitlines():
+                        parts = line.split('\t')
+                        if len(parts) == 2:
+                            pod_name = parts[0].strip()
+                            phase = parts[1].strip()
+                            if phase == 'Running':
+                                logging.info(
+                                    '[swap_encryption] Pod %s is Running'
+                                    ' — waiting for sentinel...',
+                                    pod_name,
+                                )
+                                ready_pod = pod_name
+                                break
+                            if phase != last_phase:
+                                logging.info(
+                                    '[swap_encryption] Pod %s phase: %s',
+                                    pod_name,
+                                    phase,
+                                )
+                                last_phase = phase
+                                if phase == 'Pending':
+                                    self._LogPodEvents(pod_name)
+                else:
+                    logging.info(
+                        '[swap_encryption] Waiting for DaemonSet pod to'
+                        ' appear...'
+                    )
+
+            # Step 2: poll for /tmp/pkb_ready sentinel.
+            if ready_pod is not None:
+                _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
+                    [
+                        'exec',
+                        ready_pod,
+                        '-n',
+                        self.namespace,
+                        '--',
+                        'test',
+                        '-f',
+                        '/tmp/pkb_ready',
+                    ],
+                    raise_on_failure=False,
+                )
+                if sentinel_rc == 0:
+                    logging.info(
+                        '[swap_encryption] Pod %s ready (swap device active)',
+                        ready_pod,
+                    )
+                    self.pod_name = ready_pod
+                    return ready_pod
+                # Container/pod gone (crashed, deleted, replaced): re-resolve.
+                if any(
+                    gone in sentinel_err.lower()
+                    for gone in _CONTAINER_GONE_KUBECTL_ERRORS
+                ):
+                    logging.warning(
+                        '[swap_encryption] Pod %s: container not running'
+                        ' (%s) — will re-check pod state',
+                        ready_pod,
+                        sentinel_err.strip(),
+                    )
+                    ready_pod = None
+                    last_phase = ''
+                else:
+                    logging.info(
+                        '[swap_encryption] Pod %s: still installing tools...',
+                        ready_pod,
+                    )
+
+            time.sleep(15)
+
+        logging.warning(
+            '[swap_encryption] Benchmark pod not ready after %ds', timeout
+        )
+        return None
+
+    def _LogPodEvents(self, pod_name: str) -> None:
+        """Log the pod's `kubectl describe` Events section to diagnose hangs.
+
+        Logs the last 2000 chars of the output when no 'Events:' line exists.
+        """
+        describe_out, _, _ = kubectl.RunKubectlCommand(
+            ['describe', 'pod', pod_name, '-n', self.namespace],
+            raise_on_failure=False,
+        )
+        rows = describe_out.splitlines()
+        first_event = next(
+            (n for n, row in enumerate(rows) if row.startswith('Events:')),
+            None,
+        )
+        if first_event is None:
+            logging.info(
+                '[swap_encryption] kubectl describe output:\n%s',
+                describe_out[-2000:],
+            )
+            return
+        tail = rows[first_event:][:30]
+        logging.info('[swap_encryption] Pod events:\n%s', '\n'.join(tail))
+
+    def _IsPodGone(self, pod: str) -> bool:
+        """Report whether kubectl says the named pod no longer exists.
+
+        Best-effort: any exception while querying yields False ("not gone").
+        """
+        query = ['get', 'pod', pod, '-n', self.namespace]
+        query += ['-o', 'jsonpath={.metadata.name}']
+        try:
+            _, stderr, exit_code = kubectl.RunKubectlCommand(
+                query, raise_on_failure=False, timeout=15
+            )
+            if exit_code == 0:
+                # The lookup succeeded, so the pod still exists.
+                return False
+            # A failed lookup only counts when stderr mentions 'not found'.
+            return 'not found' in (stderr or '').lower()
+        except Exception:  # pylint: disable=broad-except
+            # The query itself failed: do not claim the pod is gone.
+            return False
+
+    def PodExec(
+        self,
+        cmd: str,
+        ignore_failure: bool = False,
+        timeout: int = 300,
+        _retries: int = 2,
+    ) -> tuple[str, str]:
+        """Run a shell command inside the benchmark pod via kubectl exec.
+
+        Starts on self.pod_name and handles three failure classes:
+          - stderr matches _TRANSIENT_KUBECTL_ERRORS: sleep 10 s and retry.
+          - rc=137: record pod in self.oom_events, sleep 15 s, call RecoverPod,
+            and stop WITHOUT re-running the command.
+          - stderr matches _CONTAINER_GONE_KUBECTL_ERRORS: record pod in
+            self.pod_lost, call RecoverPod, and retry on the returned pod.
+
+        Args:
+          cmd: Shell command string passed to bash -c.
+          ignore_failure: When True, a non-zero final rc is not raised.
+          timeout: Seconds allowed for the kubectl exec call; pass a larger
+            value for long-running jobs (fio, stress-ng, kernel build).
+          _retries: Max retries for the transient and container-gone cases.
+
+        Returns:
+          Tuple of (stdout, stderr) strings from the last attempt.
+
+        Raises:
+          errors.VmUtil.IssueCommandError: final rc != 0, ignore_failure False.
+        """
+        active = self.pod_name
+
+        for attempt in range(_retries + 1):
+            out, err, rc = kubectl.RunKubectlCommand(
+                [
+                    'exec',
+                    active,
+                    '-n',
+                    self.namespace,
+                    '--',
+                    'bash',
+                    '-c',
+                    cmd,
+                ],
+                raise_on_failure=False,
+                raise_on_timeout=False,
+                timeout=timeout,
+            )
+
+            # Retry transient GKE websocket resets.
+            is_transient = rc != 0 and any(
+                e in err for e in _TRANSIENT_KUBECTL_ERRORS
+            )
+            if is_transient and attempt < _retries:
+                logging.warning(
+                    '[swap_encryption] kubectl exec connection reset (attempt'
+                    ' %d/%d); retrying in 10 s',
+                    attempt + 1,
+                    _retries + 1,
+                )
+                time.sleep(10)
+                continue
+
+            # rc=137 (SIGKILL): OOM killer terminated the container process.
+            # Do NOT retry — log, recover, and return so the caller can decide.
+            if rc == 137:
+                if active not in self.oom_events:
+                    self.oom_events.append(active)
+                # Kubernetes takes a few seconds to update pod state after
+                # eviction — sleep before checking to avoid false-positive Running.
+                logging.warning(
+                    '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes'
+                    ' to update pod state before recovery check'
+                )
+                time.sleep(15)
+                if self._IsPodGone(active):
+                    logging.warning(
+                        '[swap_encryption] OOM-eviction detected (rc=137, pod'
+                        ' gone) — recovering pod name for subsequent commands'
+                    )
+                else:
+                    logging.warning(
+                        '[swap_encryption] Container OOM-killed (rc=137, pod'
+                        ' still exists) — waiting for container restart'
+                    )
+                new_pod = self.RecoverPod(active)
+                if new_pod != active:
+                    logging.info(
+                        '[swap_encryption] Pod name updated: %s → %s',
+                        active,
+                        new_pod,
+                    )
+                    self.pod_name = new_pod
+                    active = new_pod
+                break  # OOM cmd is never re-run on the recovered pod.
+
+            # Container or pod gone: record loss, try RecoverPod, retry cmd.
+            is_container_gone = rc != 0 and any(
+                e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
+            )
+            if is_container_gone:
+                if active and active not in self.pod_lost:
+                    self.pod_lost.append(active)
+                    logging.error(
+                        '[swap_encryption] Benchmark pod %s is gone (%s) —'
+                        ' recording run as degraded',
+                        active,
+                        (err or '').strip()[:160],
+                    )
+                if attempt < _retries:
+                    logging.warning(
+                        '[swap_encryption] Container gone/restarting (attempt'
+                        ' %d/%d) — waiting for pod to recover...',
+                        attempt + 1,
+                        _retries + 1,
+                    )
+                    new_pod = self.RecoverPod(active)
+                    if new_pod != active:
+                        logging.info(
+                            '[swap_encryption] Pod name updated: %s → %s',
+                            active,
+                            new_pod,
+                        )
+                        self.pod_name = new_pod
+                        active = new_pod
+                    continue
+            break
+
+        if rc != 0 and not ignore_failure:
+            raise errors.VmUtil.IssueCommandError(
+                f'[swap_encryption] PodExec failed (rc={rc}): {err}'
+            )
+        return out, err
+
+    def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str:
+        """Wait for the DaemonSet to recover after OOM kill or eviction.
+
+        Handles two scenarios:
+          1. Container OOM restart: same pod name, container restarting in
+             place (DaemonSet restartPolicy=Always).
+          2. Pod eviction/deletion: pod is gone; DaemonSet creates a new pod
+             with a DIFFERENT name.
+
+        Checks metadata.deletionTimestamp in addition to status.phase to
+        catch the Terminating state where phase may still read Running.
+        Every poll passes raise_on_timeout=False so that one slow kubectl
+        call counts as "not ready yet" instead of aborting the recovery.
+
+        Args:
+          pod: Original pod name to monitor.
+          timeout_sec: Maximum seconds to wait for recovery.
+
+        Returns:
+          The (possibly new) pod name once Running and /tmp/pkb_ready is
+          present.  self.pod_name is updated to the same value.
+
+        Raises:
+          errors.VmUtil.IssueCommandError: No ready pod before the deadline.
+        """
+        deadline = time.time() + timeout_sec
+        logging.info(
+            '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
+            pod,
+            timeout_sec,
+        )
+        status_fmt = 'jsonpath={.status.phase}|{.metadata.deletionTimestamp}'
+        running_names_fmt = (
+            'jsonpath={range .items[?(@.status.phase=="Running")]}'
+            '{.metadata.name}{"\\n"}{end}'
+        )
+
+        # Phase 1: find a Running pod that is NOT being terminated.
+        recovered_pod = pod
+        while time.time() < deadline:
+            status_out, status_err, status_rc = kubectl.RunKubectlCommand(
+                ['get', 'pod', pod, '-n', self.namespace, '-o', status_fmt],
+                raise_on_failure=False,
+                raise_on_timeout=False,
+                timeout=30,
+            )
+            fields = status_out.strip().split('|')
+            phase = fields[0].strip() if fields else ''
+            is_terminating = len(fields) > 1 and bool(fields[1].strip())
+
+            # Genuine Running (not being deleted) — move to Phase 2.
+            if status_rc == 0 and phase == 'Running' and not is_terminating:
+                break
+
+            # Pod gone or Terminating — look for a replacement by label.
+            pod_gone_or_terminating = (
+                status_rc != 0
+                and 'not found' in (status_out + status_err).lower()
+            ) or is_terminating
+            if pod_gone_or_terminating:
+                label_out, _, label_rc = kubectl.RunKubectlCommand(
+                    [
+                        'get',
+                        'pods',
+                        '-n',
+                        self.namespace,
+                        '-l',
+                        f'app={self.label}',
+                        '-o',
+                        running_names_fmt,
+                    ],
+                    raise_on_failure=False,
+                    raise_on_timeout=False,
+                    timeout=30,
+                )
+                new_pods = [
+                    p.strip()
+                    for p in label_out.strip().splitlines()
+                    if p.strip() and p.strip() != pod
+                ]
+                if label_rc == 0 and new_pods:
+                    recovered_pod = new_pods[0]
+                    logging.info(
+                        '[swap_encryption] Original pod %s gone/terminating;'
+                        ' found replacement %s',
+                        pod,
+                        recovered_pod,
+                    )
+                    break
+
+            time.sleep(10)
+        else:
+            raise errors.VmUtil.IssueCommandError(
+                f'[swap_encryption] No Running pod found (original: {pod})'
+                f' within {timeout_sec}s after OOM kill / eviction'
+            )
+
+        # Phase 2: wait for init script to finish (sentinel written last).
+        while time.time() < deadline:
+            ready_out, _, ready_rc = kubectl.RunKubectlCommand(
+                [
+                    'exec',
+                    recovered_pod,
+                    '-n',
+                    self.namespace,
+                    '--',
+                    'bash',
+                    '-c',
+                    'test -f /tmp/pkb_ready && echo READY',
+                ],
+                raise_on_failure=False,
+                raise_on_timeout=False,
+                timeout=30,
+            )
+            if ready_rc == 0 and 'READY' in ready_out:
+                logging.info(
+                    '[swap_encryption] Pod %s recovered (swap device active)',
+                    recovered_pod,
+                )
+                self.pod_name = recovered_pod
+                return recovered_pod
+            time.sleep(15)
+
+        raise errors.VmUtil.IssueCommandError(
+            f'[swap_encryption] Pod {recovered_pod} did not become ready'
+            f' within {timeout_sec}s after OOM kill / eviction'
+        )
diff --git a/perfkitbenchmarker/resources/container_service/swap_nodepool.py b/perfkitbenchmarker/resources/container_service/swap_nodepool.py
new file mode 100644
index 0000000000..44e5cb396a
--- /dev/null
+++ b/perfkitbenchmarker/resources/container_service/swap_nodepool.py
@@ -0,0 +1,575 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SwapNodePool: PKB BaseResource for the swap-encryption benchmark nodepool.
+
+Manages the lifecycle of:
+
+  GKE nodepool  — gcloud container node-pools create with UBUNTU_CONTAINERD,
+                  linuxConfig.swapConfig + sysctl via --system-config-from-file.
+                  For LSSD machines: --local-nvme-ssd-block and
+                  dedicatedLocalSsdProfile in the swap YAML.
+                  For hyperdisk configs: boot-disk-provisioned-iops/throughput.
+
+  Swap disk     — Optional dedicated hyperdisk attached post-nodepool creation
+                  (for dm-crypt measurement on machines where the boot disk
+                  cannot be used as a swap device directly).
+
+  Default pool  — DeleteDefaultPool() removes the dummy e2-medium pool created
+                  at cluster time once the DaemonSet pod is Running.
+
+Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern
+(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses.
+"""
+
+import logging
+import os
+import tempfile
+import time
+
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import resource
+from perfkitbenchmarker.providers.gcp import util as gcp_util
+from perfkitbenchmarker.resources.container_service import kubectl
+
+# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s.
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256
+# Node pool names: the pool this module creates, and the one it can delete.
+_BENCHMARK_NODEPOOL = 'benchmark'
+_DEFAULT_NODEPOOL = 'default-pool'
+
+
+def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
+    """Raise `throughput` to the minimum allowed for `iops`, when needed.
+
+    The minimum is ceil(iops / _HYPERDISK_MAX_IOPS_PER_MBPS) MiB/s.  A lower
+    request is replaced by that minimum and a warning is logged.
+    """
+    whole, leftover = divmod(int(iops), _HYPERDISK_MAX_IOPS_PER_MBPS)
+    floor_mibps = whole + 1 if leftover else whole
+    if throughput < floor_mibps:
+        logging.warning(
+            '[swap_encryption] boot/swap disk throughput %d MiB/s is too low'
+            ' for %d IOPS; clamping to minimum %d MiB/s',
+            throughput,
+            iops,
+            floor_mibps,
+        )
+        throughput = floor_mibps
+    return throughput
+
+
+class _GcpZonalResource:
+    """Bare holder of `project` and `zone` for gcp_util.GcloudCommand.
+
+    The gcloud compute calls in this module pass this shim instead of the
+    cluster object.  Per the original author, GkeCluster._GcloudCommand()
+    switches --zone to --region for multi-zone clusters, which is wrong for
+    gcloud compute commands (--region creates regional resources); pinning
+    one zone here keeps every compute call zonal.
+    """
+
+    def __init__(self, project: str, zone: str) -> None:
+        """Store the project and the single zone to target."""
+        self.project, self.zone = project, zone
+
+
+class SwapNodePool(resource.BaseResource):
+    """PKB resource for the swap-encryption benchmark GKE nodepool and disk.
+
+    _Create() runs the full setup sequence:
+      1. gcloud container node-pools create with linuxConfig.swapConfig.
+      2. Wait for the node to become Ready.
+      3. (Optional) Create and attach a dedicated swap disk.
+
+    _Delete() tears down in reverse:
+      1. (Optional) Detach and delete the swap disk.
+      2. gcloud container node-pools delete.
+
+    DeleteDefaultPool() is a separate step called from Prepare() AFTER the
+    DaemonSet pod is Running, since deleting the default pool while the
+    benchmark node is still joining can trigger a brief API-server timeout.
+
+    Attributes:
+      cluster: PKB GkeCluster (or subclass) object; provides _GcloudCommand,
+        name, project, zones/region.
+      machine_type: GKE machine type (e.g. 'n4-highmem-32').
+      node_image_type: GKE image type (e.g. 'UBUNTU_CONTAINERD').
+      disk_type: Boot disk type (e.g. 'hyperdisk-balanced' or 'pd-ssd').
+      disk_size_gb: Boot disk size in GiB (500 for hyperdisk, 100 for LSSD).
+      disk_iops: Provisioned IOPS (hyperdisk-balanced only).
+      disk_throughput: Provisioned throughput MiB/s (hyperdisk-balanced only).
+      lssd: True if the machine type uses local NVMe SSDs.  Auto-detected from
+        machine_type name when False.
+      lssd_count: Number of local NVMe SSDs (--local-nvme-ssd-block count=N).
+      add_swap_disk: True to create+attach a dedicated second disk for swap.
+      swap_disk_size_gb: Size of the dedicated swap disk in GiB.
+    """
+
+    RESOURCE_TYPE = 'SwapNodePool'
+    REQUIRED_ATTRS = []
+
+    def __init__(
+        self,
+        cluster,
+        machine_type: str,
+        node_image_type: str,
+        disk_type: str,
+        disk_size_gb: int,
+        disk_iops: int,
+        disk_throughput: int,
+        lssd: bool,
+        lssd_count: int,
+        add_swap_disk: bool,
+        swap_disk_size_gb: int,
+    ) -> None:
+        """Store the nodepool configuration on the instance."""
+        super().__init__()
+        self.cluster = cluster
+        self.machine_type = machine_type
+        self.node_image_type = node_image_type
+        self.disk_type, self.disk_size_gb = disk_type, disk_size_gb
+        self.disk_iops, self.disk_throughput = disk_iops, disk_throughput
+        # An 'lssd' substring in the machine type enables LSSD by itself;
+        # the flag can force it on but cannot switch that detection off.
+        self.lssd = lssd or 'lssd' in machine_type.lower()
+        self.lssd_count = lssd_count
+        self.add_swap_disk = add_swap_disk
+        self.swap_disk_size_gb = swap_disk_size_gb
+
+    # ── PKB lifecycle ─────────────────────────────────────────────────────────
+
+    def _Create(self) -> None:
+        """Create the nodepool, await its node, attach the optional disk."""
+        for setup_step in (self._CreateNodePool, self._WaitForNode):
+            setup_step()
+        if self.add_swap_disk:
+            self._AttachDisk()
+
+    def _Delete(self) -> None:
+        """Delete the swap disk (if one was requested), then the nodepool."""
+        steps = [self._DetachAndDeleteDisk] if self.add_swap_disk else []
+        for step in [*steps, self._DeleteNodePool]:
+            step()
+
+    # ── Nodepool helpers ──────────────────────────────────────────────────────
+
+    def _CreateNodePool(self) -> None:
+        """Create the benchmark nodepool with a linuxConfig.swapConfig YAML.
+
+        Writes the YAML to a temp file for --system-config-from-file.  An
+        "already exists" error from gcloud is treated as success (re-run).
+
+        Notes from code review (not verifiable from this file):
+          linuxConfig.swapConfig automatically enables
+          kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set
+          kubeletConfig explicitly.
+          For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE
+          to use local NVMe as the swap device.
+          UBUNTU_CONTAINERD is required for dm-crypt measurement.
+
+        Raises:
+          errors.Benchmarks.RunError: gcloud failed for any other reason.
+        """
+        is_lssd = self.lssd
+        # LSSD configs use a small boot disk (OS only; swap is on local NVMe).
+        disk_size_gb = 100 if is_lssd else self.disk_size_gb
+
+        cmd = self.cluster._GcloudCommand(
+            'container',
+            'node-pools',
+            'create',
+            _BENCHMARK_NODEPOOL,
+            '--cluster',
+            self.cluster.name,
+        )
+        cmd.flags['machine-type'] = self.machine_type
+        cmd.flags['image-type'] = self.node_image_type
+        cmd.flags['disk-type'] = self.disk_type
+        cmd.flags['disk-size'] = disk_size_gb
+        cmd.flags['num-nodes'] = 1
+        cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
+        cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair']
+
+        # IOPS / throughput only for hyperdisk non-LSSD configs.
+        if self.disk_type.startswith('hyperdisk') and not is_lssd:
+            cmd.flags['boot-disk-provisioned-iops'] = self.disk_iops
+            cmd.flags['boot-disk-provisioned-throughput'] = (
+                _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput)
+            )
+
+        # Expose local NVMe as raw block devices for fio/mdadm direct access.
+        if is_lssd:
+            cmd.flags['local-nvme-ssd-block'] = f'count={self.lssd_count}'
+
+        # Build linuxConfig YAML for --system-config-from-file.
+        swap_config_block = '  swapConfig:\n    enabled: true\n'
+        if is_lssd:
+            swap_config_block += (
+                '    dedicatedLocalSsdProfile:\n'
+                f'      diskCount: {self.lssd_count}\n'
+            )
+        swap_config_yaml = (
+            'linuxConfig:\n'
+            + swap_config_block
+            + '  sysctl:\n'
+            '    vm.min_free_kbytes: 200\n'
+            '    vm.watermark_scale_factor: 500\n'
+            '    vm.swappiness: 100\n'
+        )
+
+        config_path = None
+        try:
+            # Closing the handle flushes the YAML before gcloud reads the path.
+            with tempfile.NamedTemporaryFile(
+                mode='w', suffix='.yaml', delete=False
+            ) as system_config_tmp:
+                config_path = system_config_tmp.name
+                system_config_tmp.write(swap_config_yaml)
+            cmd.flags['system-config-from-file'] = config_path
+            logging.info(
+                '[swap_encryption] system-config-from-file: lssd=%s'
+                ' (written to %s):\n%s',
+                is_lssd,
+                config_path,
+                swap_config_yaml,
+            )
+            logging.info(
+                '[swap_encryption] Creating benchmark nodepool: %s / %s /'
+                ' image=%s / disk=%dGiB / iops=%d / lssd=%s /'
+                ' add_swap_disk=%s',
+                _BENCHMARK_NODEPOOL,
+                self.machine_type,
+                self.node_image_type,
+                disk_size_gb,
+                self.disk_iops,
+                is_lssd,
+                self.add_swap_disk,
+            )
+            # LSSD nodepools take longer to provision (NVMe init before Ready).
+            _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
+        finally:
+            if config_path is not None:
+                try:
+                    os.unlink(config_path)
+                except OSError:
+                    pass  # Best-effort cleanup of the temp file.
+
+        if rc != 0:
+            low = (stderr or '').lower()
+            # Idempotent prepare: if the nodepool already exists (re-running
+            # --run_stage=prepare,run), reuse it instead of failing.
+            exists_markers = ('already exists', 'alreadyexists', 'code=409')
+            if any(marker in low for marker in exists_markers):
+                logging.info(
+                    '[swap_encryption] Benchmark nodepool already exists —'
+                    ' reusing (idempotent prepare)'
+                )
+                return
+            raise errors.Benchmarks.RunError(
+                f'[swap_encryption] Failed to create benchmark nodepool'
+                f' (rc={rc}): {stderr}'
+            )
+        logging.info('[swap_encryption] Benchmark nodepool ready')
+
+    def _WaitForNode(self, timeout: int = 900) -> None:
+        """Poll until a node labelled pkb_nodepool=benchmark reports Ready.
+
+        Runs `kubectl get nodes` for the pool label every 15 s and returns as
+        soon as any listed node has a Ready condition with status "True".  A
+        failed kubectl call is simply retried on the next poll.
+
+        Raises:
+          errors.Benchmarks.RunError: no node became Ready within `timeout` s.
+        """
+        # One output row per node: "<name>\t<status of its Ready condition>".
+        ready_query = (
+            r'jsonpath={range .items[*]}'
+            r'{.metadata.name}{"\t"}'
+            r'{range .status.conditions[?(@.type=="Ready")]}'
+            r'{.status}{"\n"}{end}{end}'
+        )
+        selector = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
+        give_up_at = time.time() + timeout
+        logging.info(
+            '[swap_encryption] Waiting for benchmark node'
+            ' (pkb_nodepool=benchmark) to be Ready...'
+        )
+        while time.time() < give_up_at:
+            listing, _, exit_code = kubectl.RunKubectlCommand(
+                ['get', 'nodes', '-l', selector, '-o', ready_query],
+                raise_on_failure=False,
+            )
+            # A failed call is treated like an empty listing.
+            rows = listing.strip().splitlines() if exit_code == 0 else []
+            for row in rows:
+                cells = row.split('\t')
+                if len(cells) != 2 or cells[1].strip() != 'True':
+                    continue
+                logging.info(
+                    '[swap_encryption] Benchmark node ready: %s',
+                    cells[0].strip(),
+                )
+                return
+            logging.info(
+                '[swap_encryption] Benchmark node not yet Ready —'
+                ' retrying in 15 s...'
+            )
+            time.sleep(15)
+        raise errors.Benchmarks.RunError(
+            f'[swap_encryption] Timed out waiting for benchmark node'
+            f' (pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready'
+            f' after {timeout}s'
+        )
+
+    # ── Dedicated swap disk helpers ───────────────────────────────────────────
+
+    def _AttachDisk(self) -> None:
+        """Create the dedicated swap disk and attach it to the benchmark node.
+
+        Steps, each raising RunError on failure:
+          1. Resolve the zone via _cluster_zone().
+          2. Read the benchmark node's name with kubectl; it doubles as the
+             GCE instance name (the original author notes they match in GKE).
+          3. gcloud compute disks create pkb-swap-<cluster name>, typed as
+             self.disk_type, with provisioned IOPS/throughput for hyperdisk.
+          4. gcloud compute instances attach-disk, device name pkb-swap.
+
+        The original author chose this route because node-pools create
+        --additional-node-disk is not available in all gcloud SDK versions.
+        """
+        zone = self._cluster_zone()
+        if not zone:
+            raise errors.Benchmarks.RunError(
+                '[swap_encryption] Cannot attach swap disk: cluster zone unknown'
+            )
+        owner = self.cluster
+        zonal = _GcpZonalResource(owner.project, zone)
+        disk_name = f'pkb-swap-{owner.name}'
+
+        # Step 2: the node carrying the pool label is the attach target.
+        selector = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
+        node_out, _, node_rc = kubectl.RunKubectlCommand(
+            ['get', 'nodes', '-l', selector]
+            + ['-o', 'jsonpath={.items[0].metadata.name}'],
+            raise_on_failure=False,
+        )
+        instance_name = node_out.strip()
+        if node_rc != 0 or not instance_name:
+            raise errors.Benchmarks.RunError(
+                '[swap_encryption] Cannot find benchmark node for swap disk'
+                ' attach'
+            )
+        logging.info(
+            '[swap_encryption] Benchmark node instance: %s', instance_name
+        )
+
+        # Step 3: create the disk in the pinned zone.
+        logging.info(
+            '[swap_encryption] Creating swap disk %s (%dGiB %s)',
+            disk_name,
+            self.swap_disk_size_gb,
+            self.disk_type,
+        )
+        create_cmd = gcp_util.GcloudCommand(
+            zonal, 'compute', 'disks', 'create', disk_name
+        )
+        create_cmd.flags['type'] = self.disk_type
+        create_cmd.flags['size'] = f'{self.swap_disk_size_gb}GB'
+        create_cmd.args.append('--quiet')
+        if self.disk_type.startswith('hyperdisk'):
+            create_cmd.flags['provisioned-iops'] = self.disk_iops
+            create_cmd.flags['provisioned-throughput'] = (
+                _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput)
+            )
+        _, create_err, create_rc = create_cmd.Issue(
+            timeout=120, raise_on_failure=False
+        )
+        if create_rc != 0:
+            raise errors.Benchmarks.RunError(
+                f'[swap_encryption] Failed to create swap disk {disk_name}:'
+                f' {create_err}'
+            )
+
+        # Step 4: attach the new disk to the node's VM.
+        logging.info(
+            '[swap_encryption] Attaching swap disk %s to %s',
+            disk_name,
+            instance_name,
+        )
+        attach_cmd = gcp_util.GcloudCommand(
+            zonal, 'compute', 'instances', 'attach-disk', instance_name
+        )
+        attach_cmd.flags['disk'] = disk_name
+        attach_cmd.flags['device-name'] = 'pkb-swap'
+        attach_cmd.args.append('--quiet')
+        _, attach_err, attach_rc = attach_cmd.Issue(
+            timeout=120, raise_on_failure=False
+        )
+        if attach_rc != 0:
+            raise errors.Benchmarks.RunError(
+                f'[swap_encryption] Failed to attach swap disk to'
+                f' {instance_name}: {attach_err}'
+            )
+        logging.info(
+            '[swap_encryption] Swap disk attached: %s → %s',
+            disk_name,
+            instance_name,
+        )
+
+    def _DetachAndDeleteDisk(self) -> None:
+        """Delete this cluster's swap disk; no-op without zone or project."""
+        zone = self._cluster_zone()
+        owner = self.cluster
+        # Without both a zone and a project there is nothing to target.
+        project = getattr(owner, 'project', None) if zone else None
+        if project:
+            self._DeleteDiskByName(f'pkb-swap-{owner.name}', project, zone)
+
+    def _DeleteDiskByName(
+        self, disk_name: str, project: str, zone: str
+    ) -> bool:
+        """Detach (if attached) and delete a GCE disk, with up to 4 attempts.
+
+        Finds the attached instance from the disk's own `users` field rather
+        than kubectl, which is often unavailable during teardown.  Only a
+        "not found" describe error counts as already gone; after any other
+        describe failure the delete is still attempted.  Returns True if gone.
+        """
+        gcp_res = _GcpZonalResource(project, zone)
+        for attempt in range(1, 5):
+            describe_cmd = gcp_util.GcloudCommand(
+                gcp_res, 'compute', 'disks', 'describe', disk_name
+            )
+            describe_cmd.flags['format'] = 'value(users)'
+            described = describe_cmd.Issue(timeout=60, raise_on_failure=False)
+            users, err, rc = described
+            if rc != 0 and 'not found' in (err or '').lower():
+                logging.info(
+                    '[swap_encryption] Swap disk %s not present —'
+                    ' nothing to delete',
+                    disk_name,
+                )
+                return True  # Already gone.
+            inst = users.strip().split('/')[-1] if rc == 0 else ''
+            if inst:
+                logging.info(
+                    '[swap_encryption] Detaching swap disk %s from %s',
+                    disk_name,
+                    inst,
+                )
+                detach_cmd = gcp_util.GcloudCommand(
+                    gcp_res, 'compute', 'instances', 'detach-disk', inst
+                )
+                detach_cmd.flags['disk'] = disk_name
+                detach_cmd.args.append('--quiet')
+                detach_cmd.Issue(timeout=120, raise_on_failure=False)
+            delete_cmd = gcp_util.GcloudCommand(
+                gcp_res, 'compute', 'disks', 'delete', disk_name
+            )
+            delete_cmd.args.append('--quiet')
+            _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
+            if drc == 0:
+                logging.info(
+                    '[swap_encryption] Swap disk deleted: %s', disk_name
+                )
+                return True
+            logging.warning(
+                '[swap_encryption] Swap disk delete attempt %d/4 failed'
+                ' (%s); retrying in 10 s',
+                attempt,
+                derr.strip()[:160],
+            )
+            time.sleep(10)
+        logging.error(
+            '[swap_encryption] Could NOT delete swap disk %s after retries'
+            ' — delete it manually:\n'
+            '  gcloud compute disks delete %s --zone %s --quiet',
+            disk_name,
+            disk_name,
+            zone,
+        )
+        return False
+
+    def _DeleteNodePool(self) -> None:
+        """Issue node-pools delete for the benchmark pool; failure only warns."""
+        pool = _BENCHMARK_NODEPOOL
+        owner = self.cluster
+        delete_cmd = owner._GcloudCommand(
+            'container', 'node-pools', 'delete', pool, '--cluster', owner.name
+        )
+        delete_cmd.args.append('--quiet')
+        logging.info(
+            '[swap_encryption] Deleting benchmark nodepool: %s',
+            pool,
+        )
+        _, err_text, exit_code = delete_cmd.Issue(
+            timeout=600, raise_on_failure=False
+        )
+        if exit_code == 0:
+            logging.info('[swap_encryption] Benchmark nodepool deleted')
+            return
+        # Best effort: a failed delete is reported but never raised.
+        logging.warning(
+            '[swap_encryption] Could not delete benchmark nodepool'
+            ' (rc=%d): %s',
+            exit_code,
+            err_text,
+        )
+
+    def DeleteDefaultPool(self) -> None:
+        """Delete the dummy e2-medium default nodepool.
+
+        Called from Prepare() AFTER the DaemonSet pod is Running.  The default
+        pool (e2-medium) was only needed to satisfy GKE's requirement that a
+        cluster must have at least one nodepool at creation time.  Removing it
+        stops its cost immediately.
+
+        Deleting the default pool BEFORE the DaemonSet pod is Running can
+        trigger a brief API-server I/O timeout (control plane busy with two
+        concurrent nodepool ops), so call it only after daemonset.WaitForPod().
+        Best-effort: a non-zero gcloud exit is logged as a warning, not raised.
+        """
+        cmd = self.cluster._GcloudCommand(
+            'container',
+            'node-pools',
+            'delete',
+            _DEFAULT_NODEPOOL,
+            '--cluster',
+            self.cluster.name,
+        )
+        cmd.args.append('--quiet')  # skip gcloud's interactive confirmation
+        logging.info(
+            '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL
+        )
+        _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
+        if rc != 0:
+            logging.warning(
+                '[swap_encryption] Could not delete default nodepool'
+                ' (rc=%d): %s',
+                rc,
+                stderr,
+            )
+        else:
+            logging.info('[swap_encryption] Default nodepool deleted')
+
+    # ── Internal helpers ──────────────────────────────────────────────────────
+
+    def _cluster_zone(self) -> str:
+        """Return the cluster's first zone, else its region, else ''."""
+        cluster = self.cluster
+        if getattr(cluster, 'zones', None):
+            return cluster.zones[0]
+        if getattr(cluster, 'region', None):
+            return cluster.region
+        return ''

From fb026f8dd14ed384a3d8bf701762e6109b80e1a4 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Mon, 29 Jun 2026 17:30:19 +0530
Subject: [PATCH 10/17] refactor(swap_encryption/pr1): correct PKB structure -
 swap_config as NodepoolSpec field

BREAKING: replaces SwapNodePool (standalone nodepool lifecycle) with the
correct PKB pattern: swap configuration declared in BENCHMARK_CONFIG and
applied by the existing GKE cluster creation flow.

New files:
- resources/container_service/swap_config.py
  - GkeSwapConfig(BaseResource): WriteLinuxConfigYaml(), ValidHyperdiskThroughput()
  - EksSwapConfig(BaseResource): stub for nodeadm config (deferred to PR #6780)

Core framework changes:
- configs/container_spec.py: add SwapConfigSpec(BaseSpec) + _SwapConfigDecoder
  + swap_config field on NodepoolSpec
- resources/container_service/container.py: add swap_config attr to BaseNodePoolConfig
- resources/container_service/container_cluster.py: propagate swap_config in
  _InitializeNodePool() (mirrors sandbox_config pattern)
- providers/gcp/google_kubernetes_engine.py: _AddNodeParamsToCmd() reads
  nodepool_config.swap_config - applies --system-config-from-file,
  UBUNTU_CONTAINERD, --no-enable-autorepair, boot-disk-provisioned-iops/throughput

Thin benchmark:
- BENCHMARK_CONFIG declares benchmark nodepool with swap_config (no separate
  nodepool create needed - GKE cluster creation handles it)
- Prepare(): deploy SwapDaemonSet + delete default-pool
- Run(): verify swap_active + swap_encrypted; report samples
- Cleanup(): empty (PKB auto-deletes spec.resources)

Addresses Ajay reviews:
- r3457826290: swap as base resource plugged into GKE cluster creation flow
- r3457877984: linuxConfig.swapConfig via --system-config-from-file (GkeSwapConfig)
- r3457928855: removed memory.swap.max hack
- r3457964593: UBUNTU_CONTAINERD set per-nodepool in _AddNodeParamsToCmd
- r3472513706: swapConfig auto-enables memorySwapBehavior=LimitedSwap
- r3472549985: UBUNTU_CONTAINERD required for dm-crypt
---
 perfkitbenchmarker/configs/container_spec.py  |   96 ++
 .../swap_encryption_benchmark.py              | 1123 +++--------------
 .../providers/gcp/google_kubernetes_engine.py |   20 +-
 .../resources/container_service/container.py  |    4 +
 .../container_service/container_cluster.py    |    1 +
 .../container_service/swap_config.py          |  259 ++++
 6 files changed, 573 insertions(+), 930 deletions(-)
 create mode 100644 perfkitbenchmarker/resources/container_service/swap_config.py

diff --git a/perfkitbenchmarker/configs/container_spec.py b/perfkitbenchmarker/configs/container_spec.py
index 1f808ad066..cb20ef883a 100644
--- a/perfkitbenchmarker/configs/container_spec.py
+++ b/perfkitbenchmarker/configs/container_spec.py
@@ -243,6 +243,7 @@ def __init__(
     self.vm_spec: virtual_machine_spec.BaseVmSpec
     self.machine_families: list[str] | None
     self.sandbox_config: SandboxSpec | None
+    self.swap_config: SwapConfigSpec | None
 
   @classmethod
   def _GetOptionDecoderConstructions(cls):
@@ -273,6 +274,7 @@ def _GetOptionDecoderConstructions(cls):
         ),
         'vm_spec': (spec.PerCloudConfigDecoder, {}),
         'sandbox_config': (_SandboxDecoder, {'default': None}),
+        'swap_config': (_SwapConfigDecoder, {'default': None}),
     })
     return result
 
@@ -333,6 +335,100 @@ def Decode(self, value, component_full_name, flag_values):
     return result
 
 
+class SwapConfigSpec(spec.BaseSpec):
+  """Configurable swap options for a GKE/EKS nodepool.
+
+  Declared in BENCHMARK_CONFIG under nodepools.<name>.swap_config.
+  Consumed by the cloud provider's _AddNodeParamsToCmd() / equivalent to
+  apply the cloud-specific swap configuration during nodepool creation.
+
+  Attributes:
+    enabled: Whether to enable swap on the nodepool (default True).
+    swappiness: vm.swappiness sysctl value (0-200, default 100).
+    min_free_kbytes: vm.min_free_kbytes sysctl (default 200).
+    watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500).
+    lssd: True if swap lives on the nodepool's local NVMe SSDs (default False).
+    lssd_count: Local NVMe SSD count, GKE dedicatedLocalSsdProfile (default 0).
+    boot_disk_iops: Provisioned IOPS for hyperdisk-balanced (0 = not set).
+    boot_disk_throughput: Provisioned MiB/s for hyperdisk-balanced (default 0).
+  """
+
+  def __init__(self, *args, **kwargs):  # defaults mirror the decoders below
+    self.enabled: bool = True
+    self.swappiness: int = 100
+    self.min_free_kbytes: int = 200
+    self.watermark_scale_factor: int = 500
+    self.lssd: bool = False
+    self.lssd_count: int = 0
+    self.boot_disk_iops: int = 0
+    self.boot_disk_throughput: int = 0
+    super().__init__(*args, **kwargs)
+
+  @classmethod
+  def _GetOptionDecoderConstructions(cls):
+    result = super()._GetOptionDecoderConstructions()
+    result.update({
+        'enabled': (
+            option_decoders.BooleanDecoder,
+            {'default': True},
+        ),
+        'swappiness': (
+            option_decoders.IntDecoder,
+            {'default': 100, 'min': 0, 'max': 200},
+        ),
+        'min_free_kbytes': (
+            option_decoders.IntDecoder,
+            {'default': 200, 'min': 0},
+        ),
+        'watermark_scale_factor': (
+            option_decoders.IntDecoder,
+            {'default': 500, 'min': 0},
+        ),
+        'lssd': (
+            option_decoders.BooleanDecoder,
+            {'default': False},
+        ),
+        'lssd_count': (
+            option_decoders.IntDecoder,
+            {'default': 0, 'min': 0},
+        ),
+        'boot_disk_iops': (
+            option_decoders.IntDecoder,
+            {'default': 0, 'min': 0},
+        ),
+        'boot_disk_throughput': (
+            option_decoders.IntDecoder,
+            {'default': 0, 'min': 0},
+        ),
+    })
+    return result
+
+
+class _SwapConfigDecoder(option_decoders.TypeVerifier):
+  """Decodes the swap_config option of a NodepoolSpec."""
+
+  def __init__(self, **kwargs):
+    super().__init__((dict,), **kwargs)  # swap_config must be a dict
+
+  def Decode(self, value, component_full_name, flag_values):
+    """Decodes the swap_config dictionary into a SwapConfigSpec.
+
+    Args:
+      value: dict. Keys match SwapConfigSpec._GetOptionDecoderConstructions.
+      component_full_name: str. Fully qualified name of the parent component.
+      flag_values: flags.FlagValues. Runtime flags propagated to BaseSpec.
+
+    Returns:
+      SwapConfigSpec instance.
+
+    Raises:
+      errors.Config.InvalidValue upon invalid input value.
+    """
+    super().Decode(value, component_full_name, flag_values)
+    full_name = self._GetOptionFullName(component_full_name)
+    return SwapConfigSpec(full_name, flag_values=flag_values, **value)
+
+
 class SandboxSpec(spec.BaseSpec):
   """Configurable options for sandboxed node pools."""
 
diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 7f981b1bb7..3322795eec 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -11,993 +11,260 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""swap_encryption_benchmark: verifies encrypted swap on GKE/EKS nodepools.
 
-"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark.
+Architecture:
+  BENCHMARK_CONFIG declares a 'benchmark' nodepool with swap_config.
+  GkeCluster._AddNodeParamsToCmd() reads nodepool_config.swap_config and
+  applies --system-config-from-file (linuxConfig.swapConfig + sysctl) + sets
+  UBUNTU_CONTAINERD + boot-disk-provisioned-iops/throughput automatically
+  during cluster creation. No separate nodepool lifecycle management needed.
 
-Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws
+  Prepare() deploys a privileged SwapDaemonSet on the swap-enabled nodepool
+  for in-pod benchmark execution (fio / stress-ng / kernel build in later PRs).
 
-== Architecture ==
+  Run() verifies swap is active and dm-crypt encryption is configured, then
+  reports swap device metadata as PKB samples.
 
-Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's
-container_cluster abstraction, then deploys a privileged DaemonSet whose
-pod has full host-device access (/dev, /sys, hostPID).  All benchmark
-phases execute inside this pod via kubectl exec, so measurements reflect
-actual cluster-node behaviour including Kubernetes overhead (kubelet,
-containerd cgroup hierarchy, etc.).
+  Cleanup() is empty — PKB auto-deletes spec.resources (SwapDaemonSet).
 
-  GKE nodes  ── dm-crypt with ephemeral key (go/node:swap-encryption)
-                 swap device: /dev/mapper/swap_encrypted (over dedicated
-                 hyperdisk or LSSD RAID-0 /dev/md0).
-                 Single-disk fallback: plain loop device on
-                 /mnt/stateful_partition — dm-crypt is blocked by COS
-                 kernel namespace restrictions from inside a pod.
-
-  EKS nodes  ── NVMe Instance Store, Nitro hardware-offloaded encryption
-                 swap device: /dev/nvme1n1 (or auto-detected)
-
-== Resource pattern ==
-
-Infrastructure lifecycle lives in two BaseResource subclasses:
-
-  SwapNodePool  (perfkitbenchmarker/resources/container_service/swap_nodepool.py)
-    _Create():  gcloud container node-pools create with linuxConfig.swapConfig
-                + sysctl via --system-config-from-file; waits for node Ready;
-                optionally creates and attaches a dedicated swap disk.
-    _Delete():  detach+delete disk; delete the nodepool.
-    DeleteDefaultPool(): remove the dummy e2-medium default pool after the
-                DaemonSet pod is Running (separate step to avoid API-server
-                contention during nodepool ops).
-
-  SwapDaemonSet  (perfkitbenchmarker/resources/container_service/swap_daemonset.py)
-    _Create():  apply Jinja2 manifest; wait for Running + /tmp/pkb_ready.
-    _Delete():  in-pod swapoff / dmsetup / losetup teardown; kubectl delete.
-    PodExec():  kubectl exec wrapper with transient-reset retry, OOM-kill
-                detection (rc=137), and automatic pod recovery.
-
-Both resources are added to spec.resources in Prepare() and are auto-deleted
-by the PKB framework in Cleanup().
-
-== Benchmark Phases ==
-
-  Phase 1 – fio Microbenchmarks (this PR)
-    Run fio directly on the swap block device (swapoff first) to measure
-    the hardware + encryption ceiling: random IOPS (4K), sequential
-    bandwidth (1M), and completion latency (iodepth=1).
-
-  Phase 2a – CPU Overhead  (PR2/PR4)
-  Phase 2b – I/O Interference  (PR4)
-  Phase 3a – Redis Latency  (PR5)
-  Phase 3b – Kernel Build  (PR5)
-  Phase 3c – OpenSearch  (PR5)
+Subsequent PRs add phases:
+  PR3: fio microbenchmarks on raw swap device (Tier 1)
+  PR4: stress-ng CPU overhead + I/O interference (Tier 2)
+  PR5: kernel build under cgroup memory constraint (Phase 3b)
 """
 
-import json
 import logging
-import textwrap
-import time
 from typing import Any
 
 from absl import flags
-from perfkitbenchmarker import benchmark_spec as bm_spec_lib
+from perfkitbenchmarker import benchmark_spec
 from perfkitbenchmarker import configs
-from perfkitbenchmarker import errors
 from perfkitbenchmarker import sample
-from perfkitbenchmarker.resources.container_service import kubectl
-from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod
-from perfkitbenchmarker.resources.container_service import swap_nodepool as _np_mod
+from perfkitbenchmarker.resources.container_service import swap_daemonset
 
 FLAGS = flags.FLAGS
 
-_BenchmarkSpec = bm_spec_lib.BenchmarkSpec
-
-# ---------------------------------------------------------------------------
-# Benchmark identity
-# ---------------------------------------------------------------------------
-
 BENCHMARK_NAME = 'swap_encryption'
-
-
 BENCHMARK_CONFIG = """
 swap_encryption:
   description: >
-    GKE vs. EKS swap encryption and LSSD performance comparison.
-    Two-step nodepool setup: PKB provisions a minimal cluster with a cheap
-    default nodepool (Step 1), then Prepare() adds the real benchmark
-    nodepool (n4-highmem-32 / c4-*-lssd, UBUNTU_CONTAINERD, 80k IOPS) with a
-    node-level startup script that configures dm-crypt swap before any pod
-    is scheduled, then removes the default nodepool (Step 2).  All benchmark
-    phases run inside a privileged DaemonSet pinned to the benchmark nodepool.
-  flags: {}
+    Verify dm-crypt encrypted swap on GKE/EKS. Subsequent PRs add fio,
+    stress-ng, and kernel build phases.
   container_cluster:
+    cloud: GCP
     type: Kubernetes
     vm_count: 1
     vm_spec:
       GCP:
-        # Cheap placeholder — the benchmark nodepool is created in Prepare().
         machine_type: e2-medium
-        boot_disk_size: 20
-      AWS:
-        # Cheap placeholder — the benchmark nodegroup is added in Prepare().
-        machine_type: t3.medium
-        boot_disk_size: 20
+        zone: us-central1-a
+    nodepools:
+      benchmark:
+        vm_count: 1
+        vm_spec:
+          GCP:
+            machine_type: n4-highmem-32
+            boot_disk_type: hyperdisk-balanced
+            boot_disk_size: 500
+            zone: us-central1-a
+        swap_config:
+          enabled: true
+          swappiness: 100
+          min_free_kbytes: 200
+          watermark_scale_factor: 500
+          boot_disk_iops: 160000
+          boot_disk_throughput: 2400
 """
 
-
-_DAEMONSET_IMAGE = flags.DEFINE_string(
-    'swap_encryption_daemonset_image',
-    'ubuntu:22.04',
-    'Container image used for the privileged benchmark DaemonSet pod.',
-)
-
-
-_NODEPOOL = flags.DEFINE_string(
-    'swap_encryption_nodepool',
-    'benchmark',
-    'Name of the node pool to deploy the benchmark DaemonSet on.',
-)
-
-
-_INSTANCE_SIZE_LABEL = flags.DEFINE_string(
-    'swap_encryption_instance_size_label',
-    '',
-    'Human-readable label for the current instance size being tested, e.g. '
-    '"n4-highmem-32" or "i4i.4xlarge".  Stored in sample metadata so that '
-    'results from multiple PKB runs across different instance sizes can be '
-    'collated and compared.  Defaults to the value reported by the cloud '
-    'metadata endpoint inside the pod.',
+_MACHINE_TYPE = flags.DEFINE_string(
+    'swap_encryption_machine_type',
+    None,
+    'Override machine type for the benchmark nodepool.',
 )
-
-
-_COLLECT_COST = flags.DEFINE_boolean(
-    'swap_encryption_collect_cost',
-    False,
-    'When True, emit a cost_estimate_usd sample using on-demand pricing '
-    'for the instance type detected at runtime.',
+_DISK_TYPE = flags.DEFINE_string(
+    'swap_encryption_disk_type',
+    None,
+    'Override disk type for the benchmark nodepool.',
 )
 
+_BenchmarkSpec = benchmark_spec.BenchmarkSpec
+_BENCHMARK_NODEPOOL = 'benchmark'
+_DEFAULT_POOL = 'default-pool'
 
-_FAIL_ON_DEGRADED = flags.DEFINE_boolean(
-    'swap_encryption_fail_on_degraded',
-    True,
-    'When True (default), raise an error at the end of Run() if the run was '
-    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
-    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
-    'swap-pressure phase was OOM-killed before completing.  This prevents PKB '
-    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
-    'empty or meaningless data.  Set False to keep the legacy behaviour of '
-    'always returning whatever partial samples were collected.',
-)
-
-
-_PHASES = flags.DEFINE_list(
-    'swap_encryption_phases',
-    ['all'],
-    'Which Run() phases to execute, for fast iteration against an '
-    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  '
-    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
-    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
-    '3b (kernel build), 3c (opensearch).  Default "all" runs everything.  '
-    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
-    'Phases not listed are skipped and do not affect the degraded-run gate '
-    '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
-)
-
-
-_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
-    'swap_encryption_benchmark_machine_type',
-    'n4-highmem-32',
-    'Machine type for the benchmark nodepool created in Prepare(). '
-    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
-    '(LSSD RAID-0).  The matching swap setup is selected automatically.',
-)
-
-
-_BENCHMARK_LSSD = flags.DEFINE_boolean(
-    'swap_encryption_lssd',
-    False,
-    'Force LSSD RAID-0 swap path even when the machine type name does not '
-    'contain "lssd".  Auto-detected from machine type when False.',
-)
-
-
-_LSSD_COUNT = flags.DEFINE_integer(
-    'swap_encryption_lssd_count',
-    1,
-    'Number of local NVMe SSDs to attach as raw block devices '
-    '(--local-nvme-ssd-block count=N).  Must match the fixed local SSD '
-    'count for the chosen machine type: c4-standard-8-lssd=1, '
-    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  '
-    'Default 1 covers most single-lssd machine types.',
-)
-
-
-_NODE_IMAGE_TYPE = flags.DEFINE_string(
-    'swap_encryption_node_image_type',
-    'UBUNTU_CONTAINERD',
-    'GKE node image type for the benchmark nodepool.  '
-    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
-    'down device-mapper at the kernel LSM level and cryptsetup hangs '
-    'indefinitely from any pod context (even privileged, even via nsenter '
-    'into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup '
-    'from privileged pods without restriction.  '
-    'Use COS_CONTAINERD only when dm-crypt is disabled '
-    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  '
-    'AL2 on EKS.',
-)
-
-
-_BOOT_DISK_TYPE = flags.DEFINE_string(
-    'swap_encryption_boot_disk_type',
-    'hyperdisk-balanced',
-    'Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced '
-    'for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 '
-    'dev/test machines, which do not support hyperdisk-balanced.',
-)
-
-
-_BOOT_DISK_IOPS = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_iops',
-    80000,
-    'Provisioned IOPS for the boot disk (hyperdisk-balanced only).  '
-    '80 000 is the COS max-IOPS target.  Ignored for pd-ssd.',
-)
-
-
-_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_throughput',
-    1200,
-    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
-    'only).  Must be set together with iops.  1200 MB/s pairs with 80 000 '
-    'IOPS for production; use 140 (minimum) for dev/test.  Ignored for '
-    'pd-ssd.',
-)
-
-
-_BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
-    'swap_encryption_boot_disk_size_gb',
-    500,
-    'Boot disk size in GiB for the benchmark nodepool.  500 GiB is '
-    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
-    '(see Engineer Assignments table in execution-plan.md).  '
-    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
-)
-
-
-_ADD_SWAP_DISK = flags.DEFINE_boolean(
-    'swap_encryption_add_swap_disk',
-    False,
-    'Attach a dedicated second disk to the benchmark nodepool for use as '
-    'the swap device.  Required for dm-crypt measurement on single-boot-disk '
-    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
-    'from pod namespaces.  The second disk is provisioned via '
-    '--additional-node-disk using the same type/IOPS/throughput as the boot '
-    'disk flags.',
-)
-
-
-_SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
-    'swap_encryption_swap_disk_size_gb',
-    500,
-    'Size in GiB of the dedicated swap disk when '
-    '--swap_encryption_add_swap_disk is True.  Must satisfy the '
-    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
-)
-
-_ENABLE_DMCRYPT = flags.DEFINE_boolean(
-    'swap_encryption_enable_dmcrypt',
-    True,
-    'When True (default), wrap the swap device in dm-crypt plain mode '
-    '(aes-xts-plain64, ephemeral random key) matching GKE\'s '
-    'go/node:swap-encryption implementation.  Set False to measure plain '
-    '(unencrypted) swap overhead as a baseline.',
-)
-
-
-_SWAP_DEVICE = flags.DEFINE_string(
-    'swap_encryption_device',
-    '',
-    'Explicit block device path to use as the swap device, e.g. '
-    '/dev/nvme1n1 or /dev/mapper/swap_encrypted.  When empty (default), '
-    'the device is auto-detected from /proc/swaps inside the benchmark pod.',
-)
 
-_SWAP_TYPE = flags.DEFINE_string(
-    'swap_encryption_swap_type',
-    'hyperdisk',
-    'Storage target for the swap device.  One of: hyperdisk (default), '
-    'lssd, instance_store, io2.',
-)
+def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
+  """Load the benchmark config, applying machine/disk type flag overrides."""
+  config = configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+  nodepool = config['container_cluster']['nodepools'][_BENCHMARK_NODEPOOL]
+  if _MACHINE_TYPE.value:  # unset/empty flag keeps the config value
+    for cloud in nodepool['vm_spec']:
+      nodepool['vm_spec'][cloud]['machine_type'] = _MACHINE_TYPE.value
+  if _DISK_TYPE.value:  # unset/empty flag keeps the config value
+    for cloud in nodepool['vm_spec']:
+      nodepool['vm_spec'][cloud]['boot_disk_type'] = _DISK_TYPE.value
+  return config
 
-_ENABLE_ZSWAP = flags.DEFINE_boolean(
-    'swap_encryption_enable_zswap',
-    False,
-    'When True, enable zswap compressed swap cache on the benchmark node.',
-)
 
-_MIN_FREE_KBYTES = flags.DEFINE_integer(
-    'swap_encryption_min_free_kbytes',
-    0,
-    'Value to write to /proc/sys/vm/min_free_kbytes before benchmarking. '
-    '0 (default) leaves the kernel default unchanged.',
-)
+def CheckPrerequisites(_) -> None:
+  """Verifies that benchmark setup is correct (currently checks nothing)."""
 
-_FIO_RUNTIME_SEC = flags.DEFINE_integer(
-    'swap_encryption_fio_runtime_sec',
-    60,
-    'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.',
-)
 
-_STRESS_VM_BYTES = flags.DEFINE_string(
-    'swap_encryption_stress_vm_bytes',
-    '28G',
-    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  '
-    'Should exceed available node RAM to force sustained paging.',
-)
+def Prepare(spec: _BenchmarkSpec) -> None:
+  """Deploys the privileged benchmark DaemonSet on the swap-enabled nodepool.
 
-_STRESS_VM_BYTES_LIST = flags.DEFINE_list(
-    'swap_encryption_stress_vm_bytes_list',
-    [],
-    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
-    'e.g. "14G,28G,56G".  Overrides --swap_encryption_stress_vm_bytes.',
-)
+  The swap-enabled 'benchmark' nodepool is already created by GKE cluster
+  creation (swap_config declared in BENCHMARK_CONFIG). Prepare() deploys the
+  privileged DaemonSet used for in-pod command execution across all phases.
 
-_STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
-    'swap_encryption_stress_timeout_sec',
-    300,
-    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
-)
-
-# DaemonSet constants used by both SwapDaemonSet construction and the EKS path.
-_DS_NAME = 'pkb-swap-benchmark'
-_DS_NAMESPACE = 'default'
-_DS_LABEL = 'pkb-swap-benchmark'
-_BENCHMARK_NODEPOOL = 'benchmark'
+  After the DaemonSet pod is Running the dummy e2-medium default-pool is
+  deleted to stop its cost.
 
+  Args:
+    spec: PKB BenchmarkSpec with spec.container_cluster already created.
+  """
+  cluster = spec.container_cluster
+  daemonset = swap_daemonset.SwapDaemonSet(cluster=cluster)
+  daemonset.Create()
+  spec.resources.append(daemonset)
+  pod = daemonset.WaitForPod()
+  logging.info('[swap_encryption] Benchmark pod ready: %s', pod)
+  _delete_default_pool(cluster)
 
-def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
-    """Load and return benchmark config spec."""
-    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
 
+def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
+  """Verify swap is active and dm-crypt encryption is configured.
+
+  Returns:
+    PKB samples: swap_active, swap_encrypted, swap_cipher, swap_total_kb.
+  """
+  daemonset = _get_daemonset(spec)
+  daemonset.WaitForPod()
+  daemonset.oom_events.clear()
+  daemonset.pod_lost.clear()
+
+  swap_dev = _detect_swap_device(daemonset)
+  base_meta = _build_metadata(daemonset, swap_dev)
+  results: list[sample.Sample] = []
+
+  # ── Verify swap is active ──────────────────────────────────────────────────
+  try:
+    swap_out, _ = daemonset.PodExec('cat /proc/swaps')
+    active = any(
+        l and not l.startswith('Filename') for l in swap_out.splitlines()
+    )
+    results.append(sample.Sample('swap_active', int(active), 'bool', base_meta))
+    logging.info('[swap_encryption] swap_active=%s /proc/swaps:\n%s', active, swap_out)
+  except Exception as e:  # pylint: disable=broad-except
+    logging.warning('[swap_encryption] Could not read /proc/swaps: %s', e)
 
-def Prepare(spec: _BenchmarkSpec) -> None:
-    """Two-step nodepool setup then DaemonSet deployment.
-
-    Step 1 (handled by PKB infrastructure): cluster provisioned with a cheap
-    e2-medium default nodepool.
-
-    Step 2 (this function):
-      a. GCP: Create SwapNodePool (benchmark nodepool + optional swap disk).
-         EKS: label existing nodes with pkb_nodepool=benchmark.
-      b. Create SwapDaemonSet: deploy manifest + wait for Running + sentinel.
-      c. GCP: DeleteDefaultPool() — safe now that DaemonSet pod is Running.
-      d. GCP: re-resolve pod name in case default-pool deletion evicts the pod.
-
-    Both resources are appended to spec.resources for auto-cleanup.
-    """
-    cluster = spec.container_cluster
-    is_gcp = getattr(cluster, 'project', None) is not None
-
-    if is_gcp:
-        # ── Step 2a (GCP): create benchmark nodepool + wait for node ──────────
-        logging.info('[swap_encryption] Step 2a: creating benchmark nodepool')
-        nodepool = _np_mod.SwapNodePool(
-            cluster=cluster,
-            machine_type=_BENCHMARK_MACHINE_TYPE.value,
-            node_image_type=_NODE_IMAGE_TYPE.value,
-            disk_type=_BOOT_DISK_TYPE.value,
-            disk_size_gb=_BOOT_DISK_SIZE_GB.value,
-            disk_iops=_BOOT_DISK_IOPS.value,
-            disk_throughput=_BOOT_DISK_THROUGHPUT.value,
-            lssd=_BENCHMARK_LSSD.value,
-            lssd_count=_LSSD_COUNT.value,
-            add_swap_disk=_ADD_SWAP_DISK.value,
-            swap_disk_size_gb=_SWAP_DISK_SIZE_GB.value,
-        )
-        nodepool.Create()
-        spec.resources.append(nodepool)
-    else:
-        # ── Step 2a (EKS): label existing nodes to match DaemonSet selector ──
-        logging.info(
-            '[swap_encryption] EKS cluster — labelling existing nodes with'
-            ' pkb_nodepool=%s so the DaemonSet nodeSelector matches.',
-            _BENCHMARK_NODEPOOL,
-        )
-        kubectl.RunKubectlCommand([
-            'label',
-            'nodes',
-            '--all',
-            '--overwrite',
-            f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-        ])
-        _ensure_io2_volume()
-
-    # ── Step 2b: deploy DaemonSet and wait for pod ────────────────────────────
-    # Deploy BEFORE deleting the default pool: deleting the default pool while
-    # the benchmark node is still joining causes a brief API-server I/O timeout.
-    # The pod being Running means the cluster is fully stable.
-    logging.info('[swap_encryption] Step 2b: deploying privileged DaemonSet')
-    daemonset = _ds_mod.SwapDaemonSet(
-        name=_DS_NAME,
-        namespace=_DS_NAMESPACE,
-        label=_DS_LABEL,
-        nodepool=_BENCHMARK_NODEPOOL,
-        image=_DAEMONSET_IMAGE.value,
+  # ── Verify dm-crypt encryption ─────────────────────────────────────────────
+  if swap_dev:
+    try:
+      dm_cmd = f'dmsetup status {swap_dev} 2>/dev/null || echo not_encrypted'
+      dm_out, _ = daemonset.PodExec(dm_cmd)
+      # Token match: the 'not_encrypted' fallback itself contains 'crypt'.
+      encrypted = 'crypt' in dm_out.lower().split()
+      cipher = _parse_cipher(dm_out)
+      meta = {**base_meta, 'dmsetup_status': dm_out.strip()[:200]}
+      results.append(sample.Sample('swap_encrypted', int(encrypted), 'bool', meta))
+      if cipher:
+        results.append(sample.Sample('swap_cipher', 0, cipher, meta))
+      logging.info('[swap_encryption] encrypted=%s cipher=%s', encrypted, cipher)
+    except Exception as e:  # pylint: disable=broad-except
+      logging.warning('[swap_encryption] dm-crypt check failed: %s', e)
+
+  # ── Swap size ──────────────────────────────────────────────────────────────
+  try:
+    sz_out, _ = daemonset.PodExec(
+        "awk '/^SwapTotal/ {print $2}' /proc/meminfo"
     )
-    daemonset.Create()
-    spec.resources.append(daemonset)
+    swap_kb = int(sz_out.strip() or '0')
+    results.append(sample.Sample('swap_total_kb', swap_kb, 'KB', base_meta))
     logging.info(
-        '[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name
+        '[swap_encryption] SwapTotal: %d KB (%.1f GiB)',
+        swap_kb, swap_kb / 1024 / 1024,
     )
+  except Exception as e:  # pylint: disable=broad-except
+    logging.warning('[swap_encryption] Could not read SwapTotal: %s', e)
 
-    # ── Step 2c+d (GCP): delete dummy default nodepool, re-resolve pod name ──
-    if is_gcp:
-        logging.info(
-            '[swap_encryption] Step 2c: deleting dummy default nodepool'
-        )
-        nodepool.DeleteDefaultPool()
-        # The pod may be evicted and rescheduled with a new name during the
-        # default nodepool deletion.  Re-resolve to avoid stale references.
-        logging.info(
-            '[swap_encryption] Step 2d: re-resolving benchmark pod after'
-            ' nodepool deletion'
-        )
-        daemonset.WaitForPod()
-        logging.info(
-            '[swap_encryption] Benchmark pod (post-deletion): %s',
-            daemonset.pod_name,
-        )
-
-
-def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
-    """Execute all benchmark phases with gate logic.
-
-    Execution is structured in three gated tiers matching the execution plan:
-
-      Tier 1 (Gate 1) — fio microbenchmarks
-        Raw I/O ceiling of the swap device.  Gate 1 fails if fio produces
-        zero samples (device not found, O_DIRECT error, etc.).
-
-      Tier 2 (Gate 2) — stress-ng CPU overhead + I/O interference (PR4)
-        Requires an active swap device (Gate 1 must pass).
-
-      Tier 3 (Gate 3) — real-world workloads (PR5)
-        Independent of Tier 2 results.
-
-    If Gate 1 fails, Tiers 2 and 3 are skipped.
-    """
-    daemonset = _get_daemonset(spec)
-
-    pod = daemonset.WaitForPod()
-    if pod is None:
-        raise errors.Benchmarks.RunError(
-            '[swap_encryption] Benchmark pod never became ready.'
-        )
-    # Reset per-run accumulators before starting phases.
-    daemonset.oom_events.clear()
-    daemonset.pod_lost.clear()
-    original_pod = pod
-    degraded_reasons: list[str] = []
-
-    swap_dev = _detect_swap_device(daemonset)
-    base_meta = _build_metadata(daemonset, swap_dev)
-    results: list[sample.Sample] = []
-    t_run_start = time.time()
-
-    logging.info('[swap_encryption] swap device: %s', swap_dev)
-
-    # ── Phase 1: fio microbenchmarks on raw swap device ───────────────────────
-    if _phase_selected('fio'):
-        logging.info(
-            '[swap_encryption] Phase 1: fio microbenchmarks on %s', swap_dev
-        )
-        try:
-            phase1_samples = _run_phase1_fio(daemonset, swap_dev, base_meta)
-            results += phase1_samples
-            if not phase1_samples:
-                degraded_reasons.append(
-                    'Phase 1 (fio) produced no samples — '
-                    'check fio install and swap device accessibility'
-                )
-                logging.error('[swap_encryption] Phase 1: no samples produced')
-        except Exception as e:  # pylint: disable=broad-except
-            degraded_reasons.append(f'Phase 1 fio failed: {e}')
-            logging.error('[swap_encryption] Phase 1 fio error: %s', e)
-
-    # ── Cost estimate ─────────────────────────────────────────────────────────
-    if _COLLECT_COST.value:
-        elapsed = time.time() - t_run_start
-        results += _collect_cost_sample(daemonset, elapsed, base_meta)
-
-    # ── Final degradation gate ────────────────────────────────────────────────
-    if daemonset.pod_name and daemonset.pod_name != original_pod:
-        degraded_reasons.append(
-            f'benchmark pod was replaced during the run ({original_pod} →'
-            f' {daemonset.pod_name}) — it was OOM-evicted under swap'
-            ' pressure; phases executed after the eviction ran against a'
-            ' freshly-initialised pod (empty /tmp, swap re-setup) and may'
-            ' be invalid'
-        )
-    if daemonset.pod_lost:
-        degraded_reasons.append(
-            'benchmark pod(s) went NotFound during the run'
-            f' ({", ".join(daemonset.pod_lost)}) — the pod died (node'
-            ' memory-pressure eviction or container exit) and any phase'
-            ' running at or after that point produced invalid data'
-        )
-    if daemonset.oom_events:
-        degraded_reasons.append(
-            'OOM kill(s) (rc=137) occurred during the run on pod(s) '
-            f'{", ".join(daemonset.oom_events)} — a phase exceeded memory'
-            ' and was killed by the OOM killer; the affected phase(s)'
-            ' produced no or partial data'
-        )
-
-    degraded = bool(degraded_reasons)
+  if daemonset.oom_events:
     results.append(
-        sample.Sample(
-            'swap_encryption_run_status',
-            0.0 if degraded else 1.0,
-            'status',
-            dict(
-                base_meta,
-                degraded=degraded,
-                degraded_reasons='; '.join(degraded_reasons) or 'none',
-                num_samples=len(results) + 1,
-            ),
-        )
+        sample.Sample('oom_events', len(daemonset.oom_events), 'count', base_meta)
     )
-
-    if degraded:
-        msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons)
-        logging.error(msg)
-        if _FAIL_ON_DEGRADED.value:
-            raise errors.Benchmarks.RunError(msg)
-    else:
-        logging.info(
-            '[swap_encryption] Run completed cleanly (%d samples)',
-            len(results),
-        )
-
-    return results
-
-
-def Cleanup(spec: _BenchmarkSpec) -> None:
-    """Resources in spec.resources are auto-deleted by the PKB framework.
-
-    SwapDaemonSet._Delete() runs in-pod teardown (swapoff, dmsetup remove,
-    losetup cleanup, pkill fio/stress-ng) then deletes the DaemonSet.
-    SwapNodePool._Delete() detaches+deletes the swap disk (if any) then
-    deletes the benchmark nodepool.
-    """
+  return results
 
 
-# ---------------------------------------------------------------------------
-# Internal helpers
-# ---------------------------------------------------------------------------
+def Cleanup(_: _BenchmarkSpec) -> None:
+  """No-op: PKB auto-deletes spec.resources, which includes the SwapDaemonSet."""
 
 
-def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet:
-    """Retrieve the SwapDaemonSet resource from spec.resources."""
-    daemonset = next(
-        (r for r in spec.resources if isinstance(r, _ds_mod.SwapDaemonSet)),
-        None,
-    )
-    if daemonset is None:
-        raise errors.Benchmarks.RunError(
-            '[swap_encryption] SwapDaemonSet not found in spec.resources —'
-            ' was Prepare() called?'
-        )
-    return daemonset
-
-
-def _phase_selected(token: str) -> bool:
-    """Return True if phase `token` should run given --swap_encryption_phases.
-
-    'all' (the default) selects every phase.  Otherwise only the
-    comma-separated tokens listed in the flag run.
-    """
-    selected = [p.strip().lower() for p in _PHASES.value if p.strip()]
-    return (not selected) or ('all' in selected) or (token.lower() in selected)
-
-
-def _configure_eks_kubelet_swap(spec) -> None:
-    """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap.
-
-    NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm
-    integration) to merge.  When that lands, EKS node pools should include
-    a preBootstrapCommands block writing nodeadm config with
-    memorySwapBehavior: LimitedSwap before kubelet starts::
-
-      apiVersion: node.eks.aws/v1alpha1
-      kind: NodeConfig
-      spec:
-        kubelet:
-          config:
-            memorySwapBehavior: LimitedSwap
-            failSwapOn: false
-
-    GKE equivalent: linuxConfig.swapConfig via --system-config-from-file
-    (swapConfig automatically enables memorySwapBehavior=LimitedSwap),
-    already implemented in SwapNodePool._CreateNodePool().
-
-    See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780
-    """
-    logging.warning(
-        '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is '
-        'deferred (blocked on PR #6780 — SwapConfigSpec). '
-        'EKS nodes will use default kubelet swap settings until that PR merges.'
-    )
+# ── Helpers ────────────────────────────────────────────────────────────────────
 
 
-def _ensure_io2_volume() -> None:
-    """Create and attach an io2 EBS volume for swap on EKS (no-op if not io2).
-
-    Only executed when --swap_encryption_swap_type=io2.  Full implementation
-    is deferred to PR2 (swap-capability layer).
-    """
-    if _SWAP_TYPE.value != 'io2':
-        return
-    logging.info(
-        '[swap_encryption] io2 swap volume provisioning deferred to PR2'
-    )
+def _get_daemonset(spec: _BenchmarkSpec) -> swap_daemonset.SwapDaemonSet:
+  for r in spec.resources:  # Return the first SwapDaemonSet held by the spec.
+    if isinstance(r, swap_daemonset.SwapDaemonSet):
+      return r
+  raise RuntimeError('[swap_encryption] SwapDaemonSet not found in spec.resources')
 
 
-def _detect_swap_device(
-    daemonset: _ds_mod.SwapDaemonSet,
-) -> str:
-    """Return the active swap device path on the cluster node."""
-    if _SWAP_DEVICE.value:
-        return _SWAP_DEVICE.value
-
-    # /proc/swaps is the source of truth — it lists the device ACTUALLY active.
-    # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping
-    # from a previous run on a reused node can still appear as a /dev node while
-    # being non-functional (fio/swapoff fail with "No such device or address").
-    dm_out, _ = daemonset.PodExec(
-        textwrap.dedent("""
-            ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
-            if [ -n "$ACTIVE" ]
-            then
-              echo "$ACTIVE"
-            elif test -e /dev/mapper/swap_encrypted
-            then
-              echo /dev/mapper/swap_encrypted
-            fi
-        """),
-        ignore_failure=True,
-    )
-    dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ''
-    if dev:
-        return dev
-    raise ValueError(
-        'No active swap device found in the benchmark pod. '
-        'Use --swap_encryption_device to specify one.'
-    )
+def _detect_swap_device(ds: swap_daemonset.SwapDaemonSet) -> str:
+  """Return the basename of the first /proc/swaps entry (e.g. 'dm-0') or ''."""
+  try:
+    stdout, _ = ds.PodExec("awk 'NR>1 {print $1}' /proc/swaps")
+    first_entry = stdout.strip().partition('\n')[0].strip()
+    return first_entry.rpartition('/')[2]
+  except Exception as err:  # pylint: disable=broad-except
+    logging.warning('[swap_encryption] _detect_swap_device: %s', err)
+    return ''
 
 
 def _build_metadata(
-    daemonset: _ds_mod.SwapDaemonSet, swap_dev: str
+    ds: swap_daemonset.SwapDaemonSet, swap_dev: str
 ) -> dict[str, Any]:
-    """Collect node environment, encryption type, and config into a dict."""
-    kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True)
-    mem_out, _ = daemonset.PodExec(
-        "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True
-    )
-    swap_out, _ = daemonset.PodExec(
-        "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True
+  """Base metadata for every sample: swap device, plus kernel version if readable."""
+  meta: dict[str, Any] = {'swap_device': swap_dev or 'unknown'}
+  try:
+    kver, _ = ds.PodExec('uname -r')
+    meta['kernel_version'] = kver.strip()
+  except Exception as e:  # pylint: disable=broad-except
+    logging.warning('[swap_encryption] _build_metadata: uname -r failed: %s', e)
+  return meta
+
+
+def _parse_cipher(dmsetup_status: str) -> str:
+  """Return the token right after the first 'crypt' in dmsetup output, or ''."""
+  tokens = dmsetup_status.split()
+  if 'crypt' not in tokens:
+    return ''
+  # The slice is empty when 'crypt' is the final token, so no index check is needed.
+  following = tokens[tokens.index('crypt') + 1:]
+  return following[0] if following else ''
+
+
+def _delete_default_pool(cluster) -> None:
+  """Delete the dummy e2-medium default-pool once the benchmark pod is Running.
+
+  GKE requires at least one nodepool at cluster creation time; the e2-medium
+  default-pool satisfies that requirement. Deleting it before the DaemonSet
+  pod is Running can trigger a brief API-server timeout while two concurrent
+  nodepool operations are in progress.
+  """
+  try:
+    cmd = cluster._GcloudCommand(  # pylint: disable=protected-access
+        'container', 'node-pools', 'delete', _DEFAULT_POOL,
+        '--cluster', cluster.name,
     )
-
-    try:
-        mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
-    except ValueError:
-        mem_gb = 0
-    try:
-        swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
-    except ValueError:
-        swap_gb = 0
-
-    # Encryption type — key off dm-crypt presence + swap target.
-    enc = 'unknown'
-    if '/dev/mapper/' in swap_dev:
-        table_out, _ = daemonset.PodExec(
-            f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
-    elif _SWAP_TYPE.value in ('instance_store', 'io2'):
-        enc = 'nitro_hardware_offload'
-    elif not _ENABLE_DMCRYPT.value:
-        enc = 'none'
-
-    cloud = _detect_cloud(daemonset)
-
-    instance_label = _INSTANCE_SIZE_LABEL.value
-    if not instance_label:
-        gcp_type_out, _ = daemonset.PodExec(
-            'curl -s -m 3 --fail'
-            ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
-            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        if gcp_type_out.strip():
-            instance_label = gcp_type_out.strip().split('/')[-1]
-    if not instance_label:
-        aws_type_out, _ = daemonset.PodExec(
-            'curl -s -m 3 --fail '
-            'http://169.254.169.254/latest/meta-data/instance-type '
-            '2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        instance_label = aws_type_out.strip()
-
-    return {
-        'benchmark': BENCHMARK_NAME,
-        'execution_mode': 'kubernetes_privileged_pod',
-        'cloud': cloud,
-        'instance_size': instance_label,
-        'kernel_version': kernel_out.strip(),
-        'host_memory_gb': mem_gb,
-        'swap_device': swap_dev,
-        'swap_size_gb': swap_gb,
-        'swap_encryption': enc,
-        'storage_target': _SWAP_TYPE.value,
-        'boot_disk_type': _BOOT_DISK_TYPE.value,
-        'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
-        'node_image_type': _NODE_IMAGE_TYPE.value,
-        'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
-        'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
-        'zswap_enabled': _ENABLE_ZSWAP.value,
-        'min_free_kbytes': _MIN_FREE_KBYTES.value,
-        'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
-        'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
-        'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
-        'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
-        'nodepool': _NODEPOOL.value,
-    }
-
-
-def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str:
-    """Detect whether the benchmark pod is running on GCP or AWS."""
-    gcp_out, _ = daemonset.PodExec(
-        'curl -s -m 2 --fail '
-        'http://metadata.google.internal/computeMetadata/v1/project/project-id'
-        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-        ignore_failure=True,
-    )
-    if gcp_out.strip():
-        return 'GCP'
-    return 'AWS'
-
-
-def _run_phase1_fio(
-    daemonset: _ds_mod.SwapDaemonSet,
-    swap_dev: str,
-    base_meta: dict[str, Any],
-) -> list[sample.Sample]:
-    """Run fio microbenchmarks on the raw swap block device (Phase 1).
-
-    Calls swapoff before running fio so measurements reflect the raw
-    hardware + encryption ceiling with no swap-daemon overhead.  Re-enables
-    swap unconditionally after all jobs complete.
-
-    Jobs:
-      4k_randread   iodepth=32  → random read IOPS
-      4k_randwrite  iodepth=32  → random write IOPS
-      1m_seqread    iodepth=8   → sequential read bandwidth
-      1m_seqwrite   iodepth=8   → sequential write bandwidth
-      4k_lat_read   iodepth=1   → completion latency floor (read)
-
-    Args:
-      daemonset: Active SwapDaemonSet resource.
-      swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted.
-      base_meta: Shared metadata dict from _build_metadata().
-
-    Returns:
-      List of Sample objects with IOPS, bandwidth and latency metrics.
-    """
-    samples: list[sample.Sample] = []
-
-    # swapoff before fio — running fio with --direct=1 on an active swap device
-    # races with kernel page-reclaim on the same dm-crypt target.
-    logging.info('[swap_encryption] Phase 1: swapoff %s', swap_dev)
-    daemonset.PodExec(
-        f'swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true',
-        timeout=30,
-        ignore_failure=True,
-    )
-
-    # (name, rw_mode, block_size, iodepth)
-    fio_jobs = [
-        ('4k_randread', 'randread', '4k', 32),
-        ('4k_randwrite', 'randwrite', '4k', 32),
-        ('1m_seqread', 'read', '1m', 8),
-        ('1m_seqwrite', 'write', '1m', 8),
-        ('4k_lat_read', 'randread', '4k', 1),
-    ]
-
-    runtime = _FIO_RUNTIME_SEC.value
-    try:
-        for name, rw, bs, iodepth in fio_jobs:
-            cmd = (
-                f'fio --name={name} --filename={swap_dev}'
-                f' --rw={rw} --bs={bs} --iodepth={iodepth}'
-                ' --ioengine=libaio --direct=1'
-                f' --runtime={runtime} --time_based --group_reporting'
-                ' --output-format=json 2>/dev/null'
-            )
-            logging.info('[swap_encryption] Phase 1: fio job %s', name)
-            out, _ = daemonset.PodExec(cmd, timeout=runtime + 120)
-            samples += _parse_fio_json(out, name, base_meta)
-    finally:
-        # Always re-enable swap so subsequent phases can drive swap I/O.
-        logging.info('[swap_encryption] Phase 1: swapon %s', swap_dev)
-        daemonset.PodExec(
-            f'swapon {swap_dev} 2>/dev/null || true',
-            timeout=30,
-            ignore_failure=True,
-        )
-
-    logging.info(
-        '[swap_encryption] Phase 1 complete (%d samples)', len(samples)
-    )
-    return samples
-
-
-def _parse_fio_json(
-    fio_output: str, job_name: str, base_meta: dict[str, Any]
-) -> list[sample.Sample]:
-    """Parse fio --output-format=json output into PKB Sample objects.
-
-    Extracts per-direction (read/write) IOPS, bandwidth (MB/s) and completion
-    latency (mean + p50/p99/p999 percentiles).
-
-    Args:
-      fio_output: Raw stdout from fio with --output-format=json.
-      job_name: Short identifier embedded in metric names, e.g. '4k_randread'.
-      base_meta: Shared metadata dict copied into each sample.
-
-    Returns:
-      List of Sample objects; empty if output cannot be parsed or is zero.
-    """
-    # fio sometimes emits kernel warnings before the JSON object.
-    json_start = fio_output.find('{')
-    if json_start == -1:
-        logging.warning(
-            '[swap_encryption] Phase 1: no JSON in fio output for %s', job_name
-        )
-        return []
-
-    try:
-        data = json.loads(fio_output[json_start:])
-    except json.JSONDecodeError as e:
-        logging.warning(
-            '[swap_encryption] Phase 1: fio JSON parse error (%s): %s',
-            job_name,
-            e,
-        )
-        return []
-
-    jobs = data.get('jobs', [])
-    if not jobs:
-        return []
-
-    job = jobs[0]
-    samples: list[sample.Sample] = []
-    meta = dict(base_meta, fio_job=job_name)
-
-    for direction in ('read', 'write'):
-        d = job.get(direction, {})
-        iops = float(d.get('iops', 0))
-        bw_kbps = float(d.get('bw', 0))  # fio reports KiB/s
-        bw_mbps = bw_kbps / 1024.0
-
-        # Skip directions with near-zero throughput.
-        if iops < 1 and bw_kbps < 1:
-            continue
-
-        prefix = f'phase1_fio_{job_name}_{direction}'
-        samples.append(sample.Sample(f'{prefix}_iops', iops, 'IOPS', meta))
-        samples.append(
-            sample.Sample(f'{prefix}_bw_mbps', bw_mbps, 'MB/s', meta)
-        )
-
-        # Completion latency — fio reports nanoseconds; emit microseconds.
-        clat = d.get('clat_ns', d.get('lat_ns', {}))
-        lat_mean_ns = float(clat.get('mean', 0))
-        if lat_mean_ns > 0:
-            samples.append(
-                sample.Sample(
-                    f'{prefix}_lat_mean_us', lat_mean_ns / 1000.0, 'us', meta
-                )
-            )
-            for pct_key, label in (
-                ('50.000000', 'p50'),
-                ('99.000000', 'p99'),
-                ('99.900000', 'p999'),
-            ):
-                val_ns = clat.get('percentile', {}).get(pct_key, 0)
-                if val_ns:
-                    samples.append(
-                        sample.Sample(
-                            f'{prefix}_lat_{label}_us',
-                            val_ns / 1000.0,
-                            'us',
-                            meta,
-                        )
-                    )
-
-    return samples
-
-
-_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
-    # GCP  (on-demand, us-central1 unless noted)
-    'c4-standard-8-lssd': 0.5888,
-    'c4-standard-8': 0.5008,
-    'n4-highmem-32': 3.0256,
-    'n2-highmem-32': 2.5216,
-    'n2-standard-32': 1.5264,
-    'z3-highmem-8': 2.7248,
-    # AWS
-    'i4i.4xlarge': 1.4960,
-    'i4i.2xlarge': 0.7480,
-    'm6id.4xlarge': 0.9072,
-    'm6i.4xlarge': 0.7680,
-    'r6i.4xlarge': 1.0080,
-}
-
-
-def _collect_cost_sample(
-    daemonset: _ds_mod.SwapDaemonSet,
-    elapsed_sec: float,
-    base_meta: dict,
-) -> list[sample.Sample]:
-    """Emit a cost_estimate_usd sample for the benchmark run."""
-    instance_type = ''
-
-    gcp_type_out, _ = daemonset.PodExec(
-        'curl -s -m 3 --fail'
-        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
-        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
-        ignore_failure=True,
-    )
-    if gcp_type_out.strip():
-        instance_type = gcp_type_out.strip().split('/')[-1]
-
-    if not instance_type:
-        aws_type_out, _ = daemonset.PodExec(
-            'curl -s -m 3 --fail '
-            'http://169.254.169.254/latest/meta-data/instance-type '
-            '2>/dev/null || echo ""',
-            ignore_failure=True,
-        )
-        instance_type = aws_type_out.strip()
-
-    if _INSTANCE_SIZE_LABEL.value:
-        instance_type = _INSTANCE_SIZE_LABEL.value
-
-    if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
-        instance_type = _BENCHMARK_MACHINE_TYPE.value
-        logging.info(
-            '[swap_encryption] Instance type from metadata unavailable; using'
-            ' --swap_encryption_benchmark_machine_type=%s for cost tracking',
-            instance_type,
-        )
-
-    price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
-    if price is None:
-        logging.warning(
-            '[swap_encryption] Unknown instance type "%s" — skipping cost'
-            ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost'
-            ' tracking.',
-            instance_type,
-        )
-        return []
-
-    hours = elapsed_sec / 3600.0
-    meta = dict(
-        base_meta,
-        instance_type=instance_type,
-        price_usd_per_hr=price,
-        benchmark_elapsed_sec=round(elapsed_sec, 1),
-    )
-    return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)]
+    cmd.args.append('--quiet')
+    logging.info('[swap_encryption] Deleting default nodepool: %s', _DEFAULT_POOL)
+    _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
+    if rc != 0:
+      logging.warning(
+          '[swap_encryption] Could not delete default nodepool (rc=%d): %s',
+          rc, stderr,
+      )
+    else:
+      logging.info('[swap_encryption] Default nodepool deleted')
+  except Exception as e:  # pylint: disable=broad-except
+    logging.warning('[swap_encryption] _delete_default_pool failed: %s', e)
diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
index f943a53ff1..52bcdc82c2 100644
--- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
@@ -37,6 +37,7 @@
 from perfkitbenchmarker.resources.container_service import kubectl
 from perfkitbenchmarker.resources.container_service import kubernetes_cluster
 from perfkitbenchmarker.resources.container_service import kubernetes_commands
+from perfkitbenchmarker.resources.container_service import swap_config as swap_config_lib
 
 FLAGS = flags.FLAGS
 
@@ -570,13 +571,28 @@ def _AddNodeParamsToCmd(
     ):
       cmd.args.append('--enable-fast-socket')
 
-    if FLAGS.gke_node_system_config is not None:
+    # Per-nodepool swap config takes precedence over the global flag.
+    if nodepool_config.swap_config is not None:
+      gke_swap = swap_config_lib.GkeSwapConfig.from_spec(nodepool_config.swap_config)
+      cmd.flags['system-config-from-file'] = gke_swap.WriteLinuxConfigYaml()
+      # dm-crypt requires UBUNTU_CONTAINERD (Ajay r3472549985).
+      cmd.flags['image-type'] = 'UBUNTU_CONTAINERD'
+      # Prevent GKE from replacing the node after swap setup is complete.
+      cmd.args.append('--no-enable-autorepair')
+      sc = nodepool_config.swap_config
+      if sc.boot_disk_iops and not sc.lssd:
+        cmd.flags['boot-disk-provisioned-iops'] = sc.boot_disk_iops
+        cmd.flags['boot-disk-provisioned-throughput'] = (
+            gke_swap.ValidHyperdiskThroughput()
+        )
+    elif FLAGS.gke_node_system_config is not None:
+      # Fall back to global flag when no per-nodepool swap config is set.
       cmd.flags['system-config-from-file'] = FLAGS.gke_node_system_config
 
     if nodepool_config.sandbox_config is not None:
       cmd.flags['sandbox'] = nodepool_config.sandbox_config.ToSandboxFlag()
 
-    if self.image_type:
+    if self.image_type and 'image-type' not in cmd.flags:
       cmd.flags['image-type'] = self.image_type
 
     cmd.flags['node-labels'] = f'pkb_nodepool={nodepool_config.name}'
diff --git a/perfkitbenchmarker/resources/container_service/container.py b/perfkitbenchmarker/resources/container_service/container.py
index 3e05a1ec2b..b652eaab32 100644
--- a/perfkitbenchmarker/resources/container_service/container.py
+++ b/perfkitbenchmarker/resources/container_service/container.py
@@ -187,6 +187,10 @@ def __init__(
     # Defined by GceVirtualMachineConfig. Used by google_kubernetes_engine
     # pylint: disable=g-missing-from-attributes
     self.sandbox_config: container_spec_lib.SandboxSpec | None = None
+    # Set by container_cluster._InitializeNodePool() when NodepoolSpec
+    # declares swap_config. Consumed by _AddNodeParamsToCmd() in the cloud
+    # provider to apply swap configuration during nodepool creation.
+    self.swap_config: container_spec_lib.SwapConfigSpec | None = None
     self.max_local_disks: int | None
     self.ssd_interface: str | None
     self.threads_per_core: int
diff --git a/perfkitbenchmarker/resources/container_service/container_cluster.py b/perfkitbenchmarker/resources/container_service/container_cluster.py
index 9458662c98..ed67ff7adb 100644
--- a/perfkitbenchmarker/resources/container_service/container_cluster.py
+++ b/perfkitbenchmarker/resources/container_service/container_cluster.py
@@ -116,6 +116,7 @@ def _InitializeNodePool(
         nodepool_spec.machine_families,
     )
     nodepool_config.sandbox_config = nodepool_spec.sandbox_config
+    nodepool_config.swap_config = nodepool_spec.swap_config
     nodepool_config.zone = zone
     nodepool_config.num_nodes = nodepool_spec.vm_count
     if nodepool_spec.min_vm_count is None:
diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py
new file mode 100644
index 0000000000..8606929308
--- /dev/null
+++ b/perfkitbenchmarker/resources/container_service/swap_config.py
@@ -0,0 +1,259 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GkeSwapConfig and EksSwapConfig: swap configuration as PKB BaseResource.
+
+These resources encapsulate cloud-specific swap configuration for GKE and EKS
+nodepools. They are referenced via NodepoolSpec.swap_config (declared in the
+benchmark BENCHMARK_CONFIG YAML) and consumed by the cloud provider's
+_AddNodeParamsToCmd() during cluster/nodepool creation.
+
+Usage in BENCHMARK_CONFIG:
+  container_cluster:
+    nodepools:
+      benchmark:
+        vm_spec:
+          GCP:
+            machine_type: n4-highmem-32
+            boot_disk_type: hyperdisk-balanced
+            boot_disk_size: 500
+        swap_config:
+          enabled: true
+          swappiness: 100
+          min_free_kbytes: 200
+          watermark_scale_factor: 500
+          boot_disk_iops: 160000
+          boot_disk_throughput: 2400
+
+GkeCluster._AddNodeParamsToCmd() creates a GkeSwapConfig from the
+SwapConfigSpec and calls WriteLinuxConfigYaml() to obtain the path for
+--system-config-from-file. No separate resource.Create() call is needed
+for the swap config itself — it is applied as part of nodepool creation.
+"""
+
+import logging
+import os
+import tempfile
+
+from perfkitbenchmarker import resource
+
+# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s.
+_HYPERDISK_MAX_IOPS_PER_MBPS = 256
+
+
+class GkeSwapConfig(resource.BaseResource):
+  """GKE swap configuration for a nodepool.
+
+  Encapsulates the linuxConfig (swapConfig + sysctl) YAML for
+  --system-config-from-file and optional Hyperdisk IOPS/throughput overrides.
+
+  Consumed by GkeCluster._AddNodeParamsToCmd() when nodepool_config.swap_config
+  is set. _Create() is a no-op and _Delete() only removes the YAML tempfile,
+  because the swap config is applied as part of the gcloud node-pools create
+  command; the nodepool itself manages the lifecycle.
+
+  Attributes:
+    swappiness: vm.swappiness sysctl value (0-200, default 100).
+    min_free_kbytes: vm.min_free_kbytes sysctl (default 200).
+    watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500).
+    lssd: True if the nodepool uses local NVMe SSDs for swap device.
+    lssd_count: Number of local NVMe SSDs (dedicatedLocalSsdProfile.diskCount).
+    boot_disk_iops: Provisioned IOPS for hyperdisk-balanced (0 = not set).
+    boot_disk_throughput: Provisioned throughput MiB/s for hyperdisk-balanced.
+  """
+
+  RESOURCE_TYPE = 'GkeSwapConfig'
+  REQUIRED_ATTRS = []
+
+  def __init__(
+      self,
+      swappiness: int = 100,
+      min_free_kbytes: int = 200,
+      watermark_scale_factor: int = 500,
+      lssd: bool = False,
+      lssd_count: int = 0,
+      boot_disk_iops: int = 0,
+      boot_disk_throughput: int = 0,
+  ) -> None:
+    super().__init__()
+    self.swappiness = swappiness
+    self.min_free_kbytes = min_free_kbytes
+    self.watermark_scale_factor = watermark_scale_factor
+    self.lssd = lssd
+    self.lssd_count = lssd_count
+    self.boot_disk_iops = boot_disk_iops
+    self.boot_disk_throughput = boot_disk_throughput
+    self._yaml_path: str | None = None
+
+  @classmethod
+  def from_spec(cls, swap_spec) -> 'GkeSwapConfig':
+    """Build a GkeSwapConfig from the fields of a decoded SwapConfigSpec."""
+    return cls(
+        swappiness=swap_spec.swappiness,
+        min_free_kbytes=swap_spec.min_free_kbytes,
+        watermark_scale_factor=swap_spec.watermark_scale_factor,
+        lssd=swap_spec.lssd,
+        lssd_count=swap_spec.lssd_count,
+        boot_disk_iops=swap_spec.boot_disk_iops,
+        boot_disk_throughput=swap_spec.boot_disk_throughput,
+    )
+
+  def _Create(self) -> None:
+    """No-op: swap config is applied during nodepool creation."""
+
+  def _Delete(self) -> None:
+    """Remove any leftover linuxConfig tempfile (no cloud state to delete)."""
+    self._CleanupYaml()
+
+  def WriteLinuxConfigYaml(self) -> str:
+    """Write the GKE linuxConfig YAML to a tempfile; return the path.
+
+    Called by GkeCluster._AddNodeParamsToCmd() to supply
+    --system-config-from-file. The caller is responsible for deleting the
+    tempfile via CleanupYaml() after the gcloud command completes.
+
+    Per Ajay review r3472513706:
+      linuxConfig.swapConfig.enabled=true automatically sets
+      kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set
+      kubeletConfig explicitly.
+    For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE to
+    use local NVMe as the swap device.
+
+    Returns:
+      Absolute path to the written tempfile.
+    """
+    if self.lssd and self.lssd_count > 0:
+      swap_block = (
+          '  swapConfig:\n'
+          '    enabled: true\n'
+          '    dedicatedLocalSsdProfile:\n'
+          f'      diskCount: {self.lssd_count}\n'
+      )
+    else:
+      swap_block = '  swapConfig:\n    enabled: true\n'
+
+    yaml_content = (
+        'linuxConfig:\n'
+        + swap_block
+        + '  sysctl:\n'
+        + f'    vm.swappiness: {self.swappiness}\n'
+        + f'    vm.min_free_kbytes: {self.min_free_kbytes}\n'
+        + f'    vm.watermark_scale_factor: {self.watermark_scale_factor}\n'
+    )
+
+    tmp = tempfile.NamedTemporaryFile(
+        mode='w', suffix='.yaml', delete=False
+    )
+    try:
+      tmp.write(yaml_content)
+      tmp.flush()
+      self._yaml_path = tmp.name
+    finally:
+      tmp.close()
+
+    logging.info(
+        '[swap_config] Wrote linuxConfig YAML (lssd=%s, lssd_count=%d)'
+        ' to %s:\n%s',
+        self.lssd,
+        self.lssd_count,
+        self._yaml_path,
+        yaml_content,
+    )
+    return self._yaml_path
+
+  def ValidHyperdiskThroughput(self) -> int:
+    """Return clamped throughput satisfying GCP Hyperdisk Balanced constraints.
+
+    GCP Hyperdisk Balanced requires: provisioned_iops <= 256 × throughput_MiB_s.
+    Clamps throughput UP to ceil(iops / 256); unchanged if either value is 0.
+    """
+    if not self.boot_disk_iops or not self.boot_disk_throughput:
+      return self.boot_disk_throughput
+    min_tput = -(-int(self.boot_disk_iops) // _HYPERDISK_MAX_IOPS_PER_MBPS)
+    if self.boot_disk_throughput < min_tput:
+      logging.warning(
+          '[swap_config] boot disk throughput %d MiB/s too low for %d IOPS;'
+          ' clamping to minimum %d MiB/s',
+          self.boot_disk_throughput,
+          self.boot_disk_iops,
+          min_tput,
+      )
+      return min_tput
+    return self.boot_disk_throughput
+
+  def CleanupYaml(self) -> None:
+    """Delete the linuxConfig tempfile if it was written."""
+    if self._yaml_path and os.path.exists(self._yaml_path):
+      try:
+        os.unlink(self._yaml_path)
+        logging.info(
+            '[swap_config] Cleaned up YAML tempfile: %s', self._yaml_path
+        )
+      except OSError:
+        pass  # Deliberately best-effort: unlink failures are ignored.
+      self._yaml_path = None
+
+  def _CleanupYaml(self) -> None:
+    self.CleanupYaml()
+
+
+class EksSwapConfig(resource.BaseResource):
+  """EKS swap configuration for a nodepool (stub).
+
+  Configures kubelet LimitedSwap via nodeadm bootstrap configuration.
+  Full implementation deferred to PR #6780.
+
+  Attributes:
+    memory_swap_behavior: kubelet memorySwapBehavior value ('LimitedSwap').
+    fail_swap_on: kubelet failSwapOn setting (False to allow swap on EKS).
+  """
+
+  RESOURCE_TYPE = 'EksSwapConfig'
+  REQUIRED_ATTRS = []
+
+  def __init__(
+      self,
+      memory_swap_behavior: str = 'LimitedSwap',
+      fail_swap_on: bool = False,
+  ) -> None:
+    super().__init__()
+    self.memory_swap_behavior = memory_swap_behavior
+    self.fail_swap_on = fail_swap_on
+
+  @classmethod
+  def from_spec(cls, swap_spec) -> 'EksSwapConfig':
+    """Return a default EksSwapConfig; swap_spec is not consulted yet."""
+    return cls()
+
+  def _Create(self) -> None:
+    """Stub: EKS kubelet LimitedSwap config via nodeadm (deferred to PR #6780)."""
+    logging.warning(
+        '[swap_config] EksSwapConfig._Create() is a stub. '
+        'EKS kubelet LimitedSwap config via nodeadm not yet implemented '
+        '(deferred to PR #6780). Swap will not be enabled on EKS nodes.'
+    )
+
+  def _Delete(self) -> None:
+    """No-op."""
+
+  def GetNodeadmConfig(self) -> str:
+    """Return nodeadm bootstrap YAML for kubelet swap settings."""
+    return (
+        'apiVersion: node.eks.aws/v1alpha1\n'
+        'kind: NodeConfig\n'
+        'spec:\n'
+        '  kubelet:\n'
+        '    config:\n'
+        f'      memorySwapBehavior: {self.memory_swap_behavior}\n'
+        f'      failSwapOn: {str(self.fail_swap_on).lower()}\n'
+    )

From a3060e9b52a4bf1c2b55b41886dd8dc826f709c3 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Mon, 29 Jun 2026 17:36:07 +0530
Subject: [PATCH 11/17] correct PKB structure

---
 .../container_service/swap_nodepool.py        | 575 ------------------
 1 file changed, 575 deletions(-)
 delete mode 100644 perfkitbenchmarker/resources/container_service/swap_nodepool.py

diff --git a/perfkitbenchmarker/resources/container_service/swap_nodepool.py b/perfkitbenchmarker/resources/container_service/swap_nodepool.py
deleted file mode 100644
index 44e5cb396a..0000000000
--- a/perfkitbenchmarker/resources/container_service/swap_nodepool.py
+++ /dev/null
@@ -1,575 +0,0 @@
-# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""SwapNodePool: PKB BaseResource for the swap-encryption benchmark nodepool.
-
-Manages the lifecycle of:
-
-  GKE nodepool  — gcloud container node-pools create with UBUNTU_CONTAINERD,
-                  linuxConfig.swapConfig + sysctl via --system-config-from-file.
-                  For LSSD machines: --local-nvme-ssd-block and
-                  dedicatedLocalSsdProfile in the swap YAML.
-                  For hyperdisk configs: boot-disk-provisioned-iops/throughput.
-
-  Swap disk     — Optional dedicated hyperdisk attached post-nodepool creation
-                  (for dm-crypt measurement on machines where the boot disk
-                  cannot be used as a swap device directly).
-
-  Default pool  — DeleteDefaultPool() removes the dummy e2-medium pool created
-                  at cluster time once the DaemonSet pod is Running.
-
-Extracted from swap_encryption_benchmark.py to satisfy PKB resource pattern
-(go/pkb-resources): infrastructure lifecycle belongs in BaseResource subclasses.
-"""
-
-import logging
-import os
-import tempfile
-import time
-
-from perfkitbenchmarker import errors
-from perfkitbenchmarker import resource
-from perfkitbenchmarker.providers.gcp import util as gcp_util
-from perfkitbenchmarker.resources.container_service import kubectl
-
-# GCP Hyperdisk Balanced constraint: provisioned_iops <= 256 × throughput_MiB_s.
-_HYPERDISK_MAX_IOPS_PER_MBPS = 256
-
-_BENCHMARK_NODEPOOL = 'benchmark'
-_DEFAULT_NODEPOOL = 'default-pool'
-
-
-def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
-    """Return a throughput (MiB/s) satisfying GCP Hyperdisk Balanced constraints.
-
-    Clamps throughput UP to the minimum required by the requested IOPS so that
-    a mismatched flag pair cannot abort nodepool / disk creation with:
-      "Requested provisioned throughput is too low for the provisioned iops".
-    """
-    min_tput = -(-int(iops) // _HYPERDISK_MAX_IOPS_PER_MBPS)  # ceil(iops/256)
-    if throughput < min_tput:
-        logging.warning(
-            '[swap_encryption] boot/swap disk throughput %d MiB/s is too low'
-            ' for %d IOPS; clamping to minimum %d MiB/s',
-            throughput,
-            iops,
-            min_tput,
-        )
-        return min_tput
-    return throughput
-
-
-class _GcpZonalResource:
-    """Minimal resource shim for gcp_util.GcloudCommand on compute operations.
-
-    gcp_util.GcloudCommand auto-injects --project and --zone from the resource
-    object.  GkeCluster._GcloudCommand() switches --zone → --region for
-    multi-zone clusters, which is wrong for gcloud compute commands (--region
-    creates regional resources).  This shim pins a single zone so all
-    gcloud compute calls target the correct AZ.
-    """
-
-    def __init__(self, project: str, zone: str) -> None:
-        self.project = project
-        self.zone = zone
-
-
-class SwapNodePool(resource.BaseResource):
-    """PKB resource for the swap-encryption benchmark GKE nodepool and disk.
-
-    _Create() runs the full setup sequence:
-      1. gcloud container node-pools create with linuxConfig.swapConfig.
-      2. Wait for the node to become Ready.
-      3. (Optional) Create and attach a dedicated swap disk.
-
-    _Delete() tears down in reverse:
-      1. (Optional) Detach and delete the swap disk.
-      2. gcloud container node-pools delete.
-
-    DeleteDefaultPool() is a separate step called from Prepare() AFTER the
-    DaemonSet pod is Running, since deleting the default pool while the
-    benchmark node is still joining can trigger a brief API-server timeout.
-
-    Attributes:
-      cluster: PKB GkeCluster (or subclass) object; provides _GcloudCommand,
-        name, project, zones/region.
-      machine_type: GKE machine type (e.g. 'n4-highmem-32').
-      node_image_type: GKE image type (e.g. 'UBUNTU_CONTAINERD').
-      disk_type: Boot disk type (e.g. 'hyperdisk-balanced' or 'pd-ssd').
-      disk_size_gb: Boot disk size in GiB (500 for hyperdisk, 100 for LSSD).
-      disk_iops: Provisioned IOPS (hyperdisk-balanced only).
-      disk_throughput: Provisioned throughput MiB/s (hyperdisk-balanced only).
-      lssd: True if the machine type uses local NVMe SSDs.  Auto-detected from
-        machine_type name when False.
-      lssd_count: Number of local NVMe SSDs (--local-nvme-ssd-block count=N).
-      add_swap_disk: True to create+attach a dedicated second disk for swap.
-      swap_disk_size_gb: Size of the dedicated swap disk in GiB.
-    """
-
-    RESOURCE_TYPE = 'SwapNodePool'
-    REQUIRED_ATTRS = []
-
-    def __init__(
-        self,
-        cluster,
-        machine_type: str,
-        node_image_type: str,
-        disk_type: str,
-        disk_size_gb: int,
-        disk_iops: int,
-        disk_throughput: int,
-        lssd: bool,
-        lssd_count: int,
-        add_swap_disk: bool,
-        swap_disk_size_gb: int,
-    ) -> None:
-        super().__init__()
-        self.cluster = cluster
-        self.machine_type = machine_type
-        self.node_image_type = node_image_type
-        self.disk_type = disk_type
-        self.disk_size_gb = disk_size_gb
-        self.disk_iops = disk_iops
-        self.disk_throughput = disk_throughput
-        # Auto-detect LSSD from machine type name; explicit flag overrides.
-        self.lssd = lssd or 'lssd' in machine_type.lower()
-        self.lssd_count = lssd_count
-        self.add_swap_disk = add_swap_disk
-        self.swap_disk_size_gb = swap_disk_size_gb
-
-    # ── PKB lifecycle ─────────────────────────────────────────────────────────
-
-    def _Create(self) -> None:
-        """Create the benchmark nodepool, wait for node, optionally attach disk."""
-        self._CreateNodePool()
-        self._WaitForNode()
-        if self.add_swap_disk:
-            self._AttachDisk()
-
-    def _Delete(self) -> None:
-        """Detach+delete the swap disk (if any) then delete the nodepool."""
-        if self.add_swap_disk:
-            self._DetachAndDeleteDisk()
-        self._DeleteNodePool()
-
-    # ── Nodepool helpers ──────────────────────────────────────────────────────
-
-    def _CreateNodePool(self) -> None:
-        """gcloud container node-pools create with linuxConfig.swapConfig YAML.
-
-        Per Ajay review comment r3472513706:
-          linuxConfig.swapConfig automatically enables
-          kubeletConfig.memorySwapBehavior=LimitedSwap — no need to set
-          kubeletConfig explicitly.
-          For LSSD machines, dedicatedLocalSsdProfile.diskCount instructs GKE
-          to use local NVMe as the swap device.
-        Per Ajay review comment r3472549985:
-          UBUNTU_CONTAINERD is required for dm-crypt measurement.
-        """
-        is_lssd = self.lssd
-        # LSSD configs use a small boot disk (OS only; swap is on local NVMe).
-        disk_size_gb = 100 if is_lssd else self.disk_size_gb
-
-        cmd = self.cluster._GcloudCommand(
-            'container',
-            'node-pools',
-            'create',
-            _BENCHMARK_NODEPOOL,
-            '--cluster',
-            self.cluster.name,
-        )
-        cmd.flags['machine-type'] = self.machine_type
-        cmd.flags['image-type'] = self.node_image_type
-        cmd.flags['disk-type'] = self.disk_type
-        cmd.flags['disk-size'] = disk_size_gb
-        cmd.flags['num-nodes'] = 1
-        cmd.flags['node-labels'] = f'pkb_nodepool={_BENCHMARK_NODEPOOL}'
-        cmd.args += ['--no-enable-autoupgrade', '--no-enable-autorepair']
-
-        # IOPS / throughput only for hyperdisk non-LSSD configs.
-        if self.disk_type.startswith('hyperdisk') and not is_lssd:
-            cmd.flags['boot-disk-provisioned-iops'] = self.disk_iops
-            cmd.flags['boot-disk-provisioned-throughput'] = (
-                _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput)
-            )
-
-        # Expose local NVMe as raw block devices for fio/mdadm direct access.
-        if is_lssd:
-            cmd.flags['local-nvme-ssd-block'] = f'count={self.lssd_count}'
-
-        # Build linuxConfig YAML for --system-config-from-file.
-        if is_lssd:
-            swap_config_block = (
-                '  swapConfig:\n'
-                '    enabled: true\n'
-                '    dedicatedLocalSsdProfile:\n'
-                f'      diskCount: {self.lssd_count}\n'
-            )
-        else:
-            swap_config_block = '  swapConfig:\n    enabled: true\n'
-        swap_config_yaml = (
-            'linuxConfig:\n'
-            + swap_config_block
-            + '  sysctl:\n'
-            '    vm.min_free_kbytes: 200\n'
-            '    vm.watermark_scale_factor: 500\n'
-            '    vm.swappiness: 100\n'
-        )
-
-        system_config_tmp = None
-        try:
-            system_config_tmp = tempfile.NamedTemporaryFile(
-                mode='w', suffix='.yaml', delete=False
-            )
-            system_config_tmp.write(swap_config_yaml)
-            system_config_tmp.flush()
-            cmd.flags['system-config-from-file'] = system_config_tmp.name
-            logging.info(
-                '[swap_encryption] system-config-from-file: lssd=%s'
-                ' (written to %s):\n%s',
-                is_lssd,
-                system_config_tmp.name,
-                swap_config_yaml,
-            )
-            logging.info(
-                '[swap_encryption] Creating benchmark nodepool: %s / %s /'
-                ' image=%s / disk=%dGiB / iops=%d / lssd=%s /'
-                ' add_swap_disk=%s',
-                _BENCHMARK_NODEPOOL,
-                self.machine_type,
-                self.node_image_type,
-                disk_size_gb,
-                self.disk_iops,
-                is_lssd,
-                self.add_swap_disk,
-            )
-            # LSSD nodepools take longer to provision (NVMe init before Ready).
-            _, stderr, rc = cmd.Issue(timeout=1200, raise_on_failure=False)
-        finally:
-            if system_config_tmp is not None:
-                try:
-                    os.unlink(system_config_tmp.name)
-                except OSError:
-                    pass
-
-        if rc != 0:
-            low = (stderr or '').lower()
-            # Idempotent prepare: if the nodepool already exists (re-running
-            # --run_stage=prepare,run), reuse it instead of failing.
-            if (
-                'already exists' in low
-                or 'alreadyexists' in low
-                or 'code=409' in low
-            ):
-                logging.info(
-                    '[swap_encryption] Benchmark nodepool already exists —'
-                    ' reusing (idempotent prepare)'
-                )
-                return
-            raise errors.Benchmarks.RunError(
-                f'[swap_encryption] Failed to create benchmark nodepool'
-                f' (rc={rc}): {stderr}'
-            )
-        logging.info('[swap_encryption] Benchmark nodepool ready')
-
-    def _WaitForNode(self, timeout: int = 900) -> None:
-        """Block until a node labelled pkb_nodepool=benchmark is Ready.
-
-        gcloud container node-pools create returns when the API accepts the
-        request; the node VM may take another 2-4 min to boot and pass
-        readiness checks.  Deploying the DaemonSet before the node is Ready
-        leaves the pod Pending indefinitely.
-        """
-        deadline = time.time() + timeout
-        logging.info(
-            '[swap_encryption] Waiting for benchmark node'
-            ' (pkb_nodepool=benchmark) to be Ready...'
-        )
-        while time.time() < deadline:
-            out, _, rc = kubectl.RunKubectlCommand(
-                [
-                    'get',
-                    'nodes',
-                    '-l',
-                    f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-                    '-o',
-                    (
-                        r'jsonpath={range .items[*]}'
-                        r'{.metadata.name}{"\t"}'
-                        r'{range .status.conditions[?(@.type=="Ready")]}'
-                        r'{.status}{"\n"}{end}{end}'
-                    ),
-                ],
-                raise_on_failure=False,
-            )
-            if rc == 0 and out.strip():
-                for line in out.strip().splitlines():
-                    parts = line.split('\t')
-                    if len(parts) == 2 and parts[1].strip() == 'True':
-                        logging.info(
-                            '[swap_encryption] Benchmark node ready: %s',
-                            parts[0].strip(),
-                        )
-                        return
-            logging.info(
-                '[swap_encryption] Benchmark node not yet Ready —'
-                ' retrying in 15 s...'
-            )
-            time.sleep(15)
-        raise errors.Benchmarks.RunError(
-            f'[swap_encryption] Timed out waiting for benchmark node'
-            f' (pkb_nodepool={_BENCHMARK_NODEPOOL}) to become Ready'
-            f' after {timeout}s'
-        )
-
-    # ── Dedicated swap disk helpers ───────────────────────────────────────────
-
-    def _AttachDisk(self) -> None:
-        """Create a dedicated hyperdisk and attach it to the benchmark node.
-
-        gcloud container node-pools create --additional-node-disk is not
-        available in all gcloud SDK versions, so we create the disk via
-        gcloud compute and attach it after the node is Ready.  In GKE the
-        Kubernetes node name equals the GCE instance name.
-
-        The disk is named pkb-swap-<cluster-name> to avoid collisions across
-        concurrent PKB runs.  _Delete() calls _DetachAndDeleteDisk() to clean
-        up.
-        """
-        cluster = self.cluster
-        zone = self._cluster_zone()
-        if not zone:
-            raise errors.Benchmarks.RunError(
-                '[swap_encryption] Cannot attach swap disk: cluster zone unknown'
-            )
-        project = cluster.project
-        disk_name = f'pkb-swap-{cluster.name}'
-
-        # Get the GCE instance name from the benchmark node's Kubernetes name.
-        node_out, _, rc = kubectl.RunKubectlCommand(
-            [
-                'get',
-                'nodes',
-                '-l',
-                f'pkb_nodepool={_BENCHMARK_NODEPOOL}',
-                '-o',
-                'jsonpath={.items[0].metadata.name}',
-            ],
-            raise_on_failure=False,
-        )
-        instance_name = node_out.strip()
-        if rc != 0 or not instance_name:
-            raise errors.Benchmarks.RunError(
-                '[swap_encryption] Cannot find benchmark node for swap disk'
-                ' attach'
-            )
-        logging.info(
-            '[swap_encryption] Benchmark node instance: %s', instance_name
-        )
-
-        # Create the disk.
-        logging.info(
-            '[swap_encryption] Creating swap disk %s (%dGiB %s)',
-            disk_name,
-            self.swap_disk_size_gb,
-            self.disk_type,
-        )
-        gcp_res = _GcpZonalResource(project, zone)
-        create_cmd = gcp_util.GcloudCommand(
-            gcp_res, 'compute', 'disks', 'create', disk_name
-        )
-        create_cmd.flags['type'] = self.disk_type
-        create_cmd.flags['size'] = f'{self.swap_disk_size_gb}GB'
-        create_cmd.args.append('--quiet')
-        if self.disk_type.startswith('hyperdisk'):
-            create_cmd.flags['provisioned-iops'] = self.disk_iops
-            create_cmd.flags['provisioned-throughput'] = (
-                _valid_hyperdisk_throughput(self.disk_iops, self.disk_throughput)
-            )
-        _, stderr, rc = create_cmd.Issue(timeout=120, raise_on_failure=False)
-        if rc != 0:
-            raise errors.Benchmarks.RunError(
-                f'[swap_encryption] Failed to create swap disk {disk_name}:'
-                f' {stderr}'
-            )
-
-        # Attach the disk to the benchmark node VM.
-        logging.info(
-            '[swap_encryption] Attaching swap disk %s to %s',
-            disk_name,
-            instance_name,
-        )
-        attach_cmd = gcp_util.GcloudCommand(
-            gcp_res, 'compute', 'instances', 'attach-disk', instance_name
-        )
-        attach_cmd.flags['disk'] = disk_name
-        attach_cmd.flags['device-name'] = 'pkb-swap'
-        attach_cmd.args.append('--quiet')
-        _, stderr, rc = attach_cmd.Issue(timeout=120, raise_on_failure=False)
-        if rc != 0:
-            raise errors.Benchmarks.RunError(
-                f'[swap_encryption] Failed to attach swap disk to'
-                f' {instance_name}: {stderr}'
-            )
-        logging.info(
-            '[swap_encryption] Swap disk attached: %s → %s',
-            disk_name,
-            instance_name,
-        )
-
-    def _DetachAndDeleteDisk(self) -> None:
-        """Detach and delete the dedicated swap disk created by _AttachDisk."""
-        zone = self._cluster_zone()
-        cluster = self.cluster
-        if not zone or not getattr(cluster, 'project', None):
-            return
-        disk_name = f'pkb-swap-{cluster.name}'
-        self._DeleteDiskByName(disk_name, cluster.project, zone)
-
-    def _DeleteDiskByName(
-        self, disk_name: str, project: str, zone: str
-    ) -> bool:
-        """Detach (if attached) and delete a GCE disk, robustly, with retries.
-
-        Finds the attached instance from the disk's own `users` field rather
-        than kubectl — kubectl is often unavailable during teardown (cluster
-        being deleted), which previously left the disk attached and
-        undeletable.  Returns True if the disk is gone.
-        """
-        for attempt in range(1, 5):
-            gcp_res = _GcpZonalResource(project, zone)
-            describe_cmd = gcp_util.GcloudCommand(
-                gcp_res, 'compute', 'disks', 'describe', disk_name
-            )
-            describe_cmd.flags['format'] = 'value(users)'
-            users, _, rc = describe_cmd.Issue(timeout=60, raise_on_failure=False)
-            if rc != 0:
-                logging.info(
-                    '[swap_encryption] Swap disk %s not present —'
-                    ' nothing to delete',
-                    disk_name,
-                )
-                return True  # Already gone.
-            user = users.strip()
-            if user:
-                inst = user.split('/')[-1]
-                logging.info(
-                    '[swap_encryption] Detaching swap disk %s from %s',
-                    disk_name,
-                    inst,
-                )
-                detach_cmd = gcp_util.GcloudCommand(
-                    gcp_res, 'compute', 'instances', 'detach-disk', inst
-                )
-                detach_cmd.flags['disk'] = disk_name
-                detach_cmd.args.append('--quiet')
-                detach_cmd.Issue(timeout=120, raise_on_failure=False)
-            delete_cmd = gcp_util.GcloudCommand(
-                gcp_res, 'compute', 'disks', 'delete', disk_name
-            )
-            delete_cmd.args.append('--quiet')
-            _, derr, drc = delete_cmd.Issue(timeout=180, raise_on_failure=False)
-            if drc == 0:
-                logging.info(
-                    '[swap_encryption] Swap disk deleted: %s', disk_name
-                )
-                return True
-            logging.warning(
-                '[swap_encryption] Swap disk delete attempt %d/4 failed'
-                ' (%s); retrying in 10 s',
-                attempt,
-                derr.strip()[:160],
-            )
-            time.sleep(10)
-        logging.error(
-            '[swap_encryption] Could NOT delete swap disk %s after retries'
-            ' — delete it manually:\n'
-            '  gcloud compute disks delete %s --zone %s --quiet',
-            disk_name,
-            disk_name,
-            zone,
-        )
-        return False
-
-    def _DeleteNodePool(self) -> None:
-        """Delete the benchmark nodepool."""
-        cmd = self.cluster._GcloudCommand(
-            'container',
-            'node-pools',
-            'delete',
-            _BENCHMARK_NODEPOOL,
-            '--cluster',
-            self.cluster.name,
-        )
-        cmd.args.append('--quiet')
-        logging.info(
-            '[swap_encryption] Deleting benchmark nodepool: %s',
-            _BENCHMARK_NODEPOOL,
-        )
-        _, stderr, rc = cmd.Issue(timeout=600, raise_on_failure=False)
-        if rc != 0:
-            logging.warning(
-                '[swap_encryption] Could not delete benchmark nodepool'
-                ' (rc=%d): %s',
-                rc,
-                stderr,
-            )
-        else:
-            logging.info('[swap_encryption] Benchmark nodepool deleted')
-
-    def DeleteDefaultPool(self) -> None:
-        """Delete the dummy e2-medium default nodepool.
-
-        Called from Prepare() AFTER the DaemonSet pod is Running.  The default
-        pool (e2-medium) was only needed to satisfy GKE's requirement that a
-        cluster must have at least one nodepool at creation time.  Removing it
-        stops its cost immediately.
-
-        Deleting the default pool BEFORE the DaemonSet pod is Running can
-        trigger a brief API-server I/O timeout (control plane busy with two
-        concurrent nodepool ops).  Calling this method from Prepare() after
-        daemonset.WaitForPod() ensures the cluster is fully stable.
-        """
-        cmd = self.cluster._GcloudCommand(
-            'container',
-            'node-pools',
-            'delete',
-            _DEFAULT_NODEPOOL,
-            '--cluster',
-            self.cluster.name,
-        )
-        cmd.args.append('--quiet')
-        logging.info(
-            '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_NODEPOOL
-        )
-        _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
-        if rc != 0:
-            logging.warning(
-                '[swap_encryption] Could not delete default nodepool'
-                ' (rc=%d): %s',
-                rc,
-                stderr,
-            )
-        else:
-            logging.info('[swap_encryption] Default nodepool deleted')
-
-    # ── Internal helpers ──────────────────────────────────────────────────────
-
-    def _cluster_zone(self) -> str:
-        """Return the first zone (or region) from the cluster object."""
-        cluster = self.cluster
-        if getattr(cluster, 'zones', None):
-            return cluster.zones[0]
-        if getattr(cluster, 'region', None):
-            return cluster.region
-        return ''

From 1befef2127b4308e8be1f5767ecc4a6ba06e6741 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Mon, 29 Jun 2026 17:53:25 +0530
Subject: [PATCH 12/17] correct PKB structure

---
 .../providers/gcp/google_kubernetes_engine.py |   16 +-
 .../container_service/swap_daemonset.py       | 1012 ++++++++---------
 2 files changed, 517 insertions(+), 511 deletions(-)

diff --git a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
index 52bcdc82c2..86d8d7142a 100644
--- a/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
+++ b/perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
@@ -429,11 +429,15 @@ def _CreateNodePools(self):
       cmd = self._GcloudCommand(
           'container', 'node-pools', 'create', name, '--cluster', self.name
       )
-      self._AddNodeParamsToCmd(
-          nodepool,
-          cmd,
-      )
-      self._IssueResourceCreationCommand(cmd)
+      self._AddNodeParamsToCmd(nodepool, cmd)
+      # If swap_config wrote a linuxConfig tempfile, clean it up after Issue().
+      swap_cfg = getattr(nodepool, '_gke_swap_config', None)
+      try:
+        self._IssueResourceCreationCommand(cmd)
+      finally:
+        if swap_cfg is not None:
+          swap_cfg.CleanupYaml()
+          nodepool._gke_swap_config = None  # pylint: disable=protected-access
       self._CreateCustomComputeClass(nodepool)
 
   def _CreateCustomComputeClass(
@@ -575,6 +579,8 @@ def _AddNodeParamsToCmd(
     if nodepool_config.swap_config is not None:
       gke_swap = swap_config_lib.GkeSwapConfig.from_spec(nodepool_config.swap_config)
       cmd.flags['system-config-from-file'] = gke_swap.WriteLinuxConfigYaml()
+      # Store on nodepool so _CreateNodePools() can clean up the tempfile.
+      nodepool_config._gke_swap_config = gke_swap  # pylint: disable=protected-access
       # dm-crypt requires UBUNTU_CONTAINERD (Ajay r3472549985).
       cmd.flags['image-type'] = 'UBUNTU_CONTAINERD'
       # Prevent GKE from replacing the node after swap setup is complete.
diff --git a/perfkitbenchmarker/resources/container_service/swap_daemonset.py b/perfkitbenchmarker/resources/container_service/swap_daemonset.py
index ab23c8d6aa..48e3b9c890 100644
--- a/perfkitbenchmarker/resources/container_service/swap_daemonset.py
+++ b/perfkitbenchmarker/resources/container_service/swap_daemonset.py
@@ -17,12 +17,12 @@
 swap_encryption benchmark:
 
   _Create()   — apply the Jinja2 manifest via kubernetes_commands.ApplyManifest
-                and wait for the pod to reach Running + /tmp/pkb_ready.
+        and wait for the pod to reach Running + /tmp/pkb_ready.
   _Delete()   — run in-pod cleanup (swapoff, dmsetup remove, losetup teardown,
-                pkill fio/stress-ng) then kubectl delete daemonset.
+        pkill fio/stress-ng) then kubectl delete daemonset.
   PodExec()   — kubectl exec wrapper with transient-reset retry, OOM-kill (rc=137)
-                detection, and automatic RecoverPod() after eviction or container
-                restart.
+        detection, and automatic RecoverPod() after eviction or container
+        restart.
   WaitForPod()  — polls for Running phase + sentinel; updates self.pod_name.
   RecoverPod()  — waits for DaemonSet to recreate / restart the container,
                   checking deletionTimestamp to avoid false-positive Running state.
@@ -47,563 +47,563 @@
 
 # Errors indicating the container / pod is gone and needs full recovery.
 _CONTAINER_GONE_KUBECTL_ERRORS = (
-    'container not found',
-    'procready not received',
-    'unable to upgrade connection',
-    'not found',
-    'deleted state',
+  'container not found',
+  'procready not received',
+  'unable to upgrade connection',
+  'not found',
+  'deleted state',
 )
 
 
 class SwapDaemonSet(resource.BaseResource):
-    """PKB resource for the swap-encryption benchmark privileged DaemonSet.
+  """PKB resource for the swap-encryption benchmark privileged DaemonSet.
 
-    The DaemonSet runs a single privileged pod on the benchmark nodepool.
-    It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli),
-    verifies the swap device is active, then writes /tmp/pkb_ready.  All
-    benchmark phases execute commands inside this pod via PodExec().
+  The DaemonSet runs a single privileged pod on the benchmark nodepool.
+  It installs measurement tools (fio, cryptsetup, mdadm, sysstat, nvme-cli),
+  verifies the swap device is active, then writes /tmp/pkb_ready.  All
+  benchmark phases execute commands inside this pod via PodExec().
 
-    Attributes:
+  Attributes:
       name: DaemonSet metadata.name (e.g. 'pkb-swap-benchmark').
       namespace: Kubernetes namespace (typically 'default').
       label: Pod label value for app= selector.
       nodepool: pkb_nodepool label value pinning the DaemonSet to the
-        benchmark node.
+    benchmark node.
       image: Container image (e.g. 'ubuntu:22.04').
       pod_name: Name of the currently active pod; updated by WaitForPod /
-        RecoverPod on eviction.
+    RecoverPod on eviction.
       oom_events: Pod names that triggered rc=137 OOM-kill; read by Run()
-        for the degradation gate.
+    for the degradation gate.
       pod_lost: Pod names that went NotFound during PodExec; read by Run()
-        for the degradation gate.
+    for the degradation gate.
+  """
+
+  RESOURCE_TYPE = 'SwapDaemonSet'
+  REQUIRED_ATTRS = []
+
+  def __init__(
+    self,
+    name: str,
+    namespace: str,
+    label: str,
+    nodepool: str,
+    image: str,
+  ) -> None:
+    super().__init__()
+    self.name = name
+    self.namespace = namespace
+    self.label = label
+    self.nodepool = nodepool
+    self.image = image
+    # Active pod tracking — updated by WaitForPod / RecoverPod.
+    self.pod_name: Optional[str] = None
+    # Per-run accumulators read by Run() for the degradation gate.
+    self.oom_events: list[str] = []
+    self.pod_lost: list[str] = []
+
+  # ── PKB lifecycle ─────────────────────────────────────────────────────────
+
+  def _Create(self) -> None:
+    """Apply the DaemonSet manifest and wait for the pod to be ready."""
+    kubernetes_commands.ApplyManifest(
+      'cluster/swap_encryption_daemonset.yaml.j2',
+      ds_name=self.name,
+      ds_namespace=self.namespace,
+      ds_label=self.label,
+      benchmark_nodepool=self.nodepool,
+      image=self.image,
+    )
+    logging.info('[swap_encryption] Swap-infra DaemonSet applied')
+    pod = self.WaitForPod()
+    if pod is None:
+      raise errors.Benchmarks.PrepareException(
+        '[swap_encryption] DaemonSet pod did not become ready within'
+        ' timeout'
+      )
+
+  def _Delete(self) -> None:
+    """Run in-pod teardown then delete the DaemonSet.
+
+    Runs swapoff, dmsetup remove, losetup cleanup, and pkill inside the
+    pod (best-effort, ignore_failure=True) before deleting the DaemonSet.
+    This mirrors the original Cleanup() logic so no swap state is leaked.
     """
-
-    RESOURCE_TYPE = 'SwapDaemonSet'
-    REQUIRED_ATTRS = []
-
-    def __init__(
-        self,
-        name: str,
-        namespace: str,
-        label: str,
-        nodepool: str,
-        image: str,
-    ) -> None:
-        super().__init__()
-        self.name = name
-        self.namespace = namespace
-        self.label = label
-        self.nodepool = nodepool
-        self.image = image
-        # Active pod tracking — updated by WaitForPod / RecoverPod.
-        self.pod_name: Optional[str] = None
-        # Per-run accumulators read by Run() for the degradation gate.
-        self.oom_events: list[str] = []
-        self.pod_lost: list[str] = []
-
-    # ── PKB lifecycle ─────────────────────────────────────────────────────────
-
-    def _Create(self) -> None:
-        """Apply the DaemonSet manifest and wait for the pod to be ready."""
-        kubernetes_commands.ApplyManifest(
-            'cluster/swap_encryption_daemonset.yaml.j2',
-            ds_name=self.name,
-            ds_namespace=self.namespace,
-            ds_label=self.label,
-            benchmark_nodepool=self.nodepool,
-            image=self.image,
-        )
-        logging.info('[swap_encryption] Swap-infra DaemonSet applied')
-        pod = self.WaitForPod()
-        if pod is None:
-            raise errors.Benchmarks.PrepareException(
-                '[swap_encryption] DaemonSet pod did not become ready within'
-                ' timeout'
-            )
-
-    def _Delete(self) -> None:
-        """Run in-pod teardown then delete the DaemonSet.
-
-        Runs swapoff, dmsetup remove, losetup cleanup, and pkill inside the
-        pod (best-effort, ignore_failure=True) before deleting the DaemonSet.
-        This mirrors the original Cleanup() logic so no swap state is leaked.
-        """
-        # Try to get the pod name quickly if not set.
-        if self.pod_name is None:
-            self.WaitForPod(timeout=30)
-
-        if self.pod_name:
-            self.PodExec(
-                'swapoff -a 2>/dev/null || true',
-                ignore_failure=True,
-                _retries=0,
-            )
-            self.PodExec(
-                textwrap.dedent("""\
-                    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
-                    dmsetup remove --noudevrules --noudevsync \
+    # Try to get the pod name quickly if not set.
+    if self.pod_name is None:
+      self.WaitForPod(timeout=30)
+
+    if self.pod_name:
+      self.PodExec(
+        'swapoff -a 2>/dev/null || true',
+        ignore_failure=True,
+        _retries=0,
+      )
+      self.PodExec(
+        textwrap.dedent("""\
+          swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+          dmsetup remove --noudevrules --noudevsync \
                       swap_encrypted 2>/dev/null || true
-                """),
-                ignore_failure=True,
-                _retries=0,
-            )
-            self.PodExec(
-                textwrap.dedent("""\
-                    for backing in \
-                        /var/pkb_swap_backing \
-                        /run/pkb_swap_backing \
-                        /mnt/stateful_partition/pkb_swap_backing
-                    do
+        """),
+        ignore_failure=True,
+        _retries=0,
+      )
+      self.PodExec(
+        textwrap.dedent("""\
+          for backing in \
+            /var/pkb_swap_backing \
+            /run/pkb_swap_backing \
+            /mnt/stateful_partition/pkb_swap_backing
+          do
                       losetup -j "$backing" 2>/dev/null \
-                        | awk -F: '{print $1}' \
-                        | while read dev
+            | awk -F: '{print $1}' \
+            | while read dev
                           do losetup -d "$dev" 2>/dev/null || true; done
                       rm -f "$backing"
-                    done
-                """),
-                ignore_failure=True,
-                _retries=0,
-            )
-            self.PodExec(
-                "pkill -9 'stress-ng|fio' 2>/dev/null || true",
-                ignore_failure=True,
-                _retries=0,
-            )
-
-        kubectl.RunKubectlCommand(
-            [
-                'delete',
-                'daemonset',
-                self.name,
-                '-n',
-                self.namespace,
-                '--ignore-not-found',
-            ],
-            raise_on_failure=False,
-        )
-        logging.info('[swap_encryption] DaemonSet deleted')
-
-    # ── Pod lifecycle helpers ─────────────────────────────────────────────────
-
-    def WaitForPod(self, timeout: int = 600) -> Optional[str]:
-        """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists.
-
-        Two-phase poll:
+          done
+        """),
+        ignore_failure=True,
+        _retries=0,
+      )
+      self.PodExec(
+        "pkill -9 'stress-ng|fio' 2>/dev/null || true",
+        ignore_failure=True,
+        _retries=0,
+      )
+
+    kubectl.RunKubectlCommand(
+      [
+        'delete',
+        'daemonset',
+        self.name,
+        '-n',
+        self.namespace,
+        '--ignore-not-found',
+      ],
+      raise_on_failure=False,
+    )
+    logging.info('[swap_encryption] DaemonSet deleted')
+
+  # ── Pod lifecycle helpers ─────────────────────────────────────────────────
+
+  def WaitForPod(self, timeout: int = 600) -> Optional[str]:
+    """Wait until the DaemonSet pod is Running AND /tmp/pkb_ready exists.
+
+    Two-phase poll:
           1. Wait for status.phase == Running.
           2. kubectl exec test -f /tmp/pkb_ready.
 
-        The DaemonSet init script writes /tmp/pkb_ready only after verifying
-        the swap device is active (up to 150 s) and installing all measurement
-        tools (~1-2 min on cold APT cache).  The default 600 s covers
-        worst-case APT latency on a freshly-booted node.
+    The DaemonSet init script writes /tmp/pkb_ready only after verifying
+    the swap device is active (up to 150 s) and installing all measurement
+    tools (~1-2 min on cold APT cache).  The default 600 s covers
+    worst-case APT latency on a freshly-booted node.
 
-        Args:
+    Args:
           timeout: Maximum seconds to wait.
 
-        Returns:
+    Returns:
           Pod name on success; None on timeout.  Also updates self.pod_name.
-        """
-        deadline = time.time() + timeout
-        last_phase = ''
-        ready_pod = None
-
-        while time.time() < deadline:
-            # Step 1: wait for Running phase.
-            if ready_pod is None:
-                out, _, rc = kubectl.RunKubectlCommand(
-                    [
-                        'get',
-                        'pods',
-                        '-l',
-                        f'app={self.label}',
-                        '-n',
-                        self.namespace,
-                        '-o',
-                        (
-                            r'jsonpath={range .items[*]}'
-                            r'{.metadata.name}{"\t"}'
-                            r'{.status.phase}{"\n"}{end}'
-                        ),
-                    ],
-                    raise_on_failure=False,
+    """
+    deadline = time.time() + timeout
+    last_phase = ''
+    ready_pod = None
+
+    while time.time() < deadline:
+      # Step 1: wait for Running phase.
+      if ready_pod is None:
+        out, _, rc = kubectl.RunKubectlCommand(
+          [
+            'get',
+            'pods',
+            '-l',
+            f'app={self.label}',
+            '-n',
+            self.namespace,
+            '-o',
+            (
+              r'jsonpath={range .items[*]}'
+              r'{.metadata.name}{"\t"}'
+              r'{.status.phase}{"\n"}{end}'
+            ),
+          ],
+          raise_on_failure=False,
+        )
+        if rc == 0 and out.strip():
+          for line in out.strip().splitlines():
+            parts = line.split('\t')
+            if len(parts) == 2:
+              pod_name = parts[0].strip()
+              phase = parts[1].strip()
+              if phase == 'Running':
+                logging.info(
+                  '[swap_encryption] Pod %s is Running'
+                  ' — waiting for sentinel...',
+                  pod_name,
                 )
-                if rc == 0 and out.strip():
-                    for line in out.strip().splitlines():
-                        parts = line.split('\t')
-                        if len(parts) == 2:
-                            pod_name = parts[0].strip()
-                            phase = parts[1].strip()
-                            if phase == 'Running':
-                                logging.info(
-                                    '[swap_encryption] Pod %s is Running'
-                                    ' — waiting for sentinel...',
-                                    pod_name,
-                                )
-                                ready_pod = pod_name
-                                break
-                            if phase != last_phase:
-                                logging.info(
-                                    '[swap_encryption] Pod %s phase: %s',
-                                    pod_name,
-                                    phase,
-                                )
-                                last_phase = phase
-                                if phase == 'Pending':
-                                    self._LogPodEvents(pod_name)
-                else:
-                    logging.info(
-                        '[swap_encryption] Waiting for DaemonSet pod to'
-                        ' appear...'
-                    )
-
-            # Step 2: poll for /tmp/pkb_ready sentinel.
-            if ready_pod is not None:
-                _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
-                    [
-                        'exec',
-                        ready_pod,
-                        '-n',
-                        self.namespace,
-                        '--',
-                        'test',
-                        '-f',
-                        '/tmp/pkb_ready',
-                    ],
-                    raise_on_failure=False,
+                ready_pod = pod_name
+                break
+              if phase != last_phase:
+                logging.info(
+                  '[swap_encryption] Pod %s phase: %s',
+                  pod_name,
+                  phase,
                 )
-                if sentinel_rc == 0:
-                    logging.info(
-                        '[swap_encryption] Pod %s ready (swap device active)',
-                        ready_pod,
-                    )
-                    self.pod_name = ready_pod
-                    return ready_pod
-                # Container crashed (CrashLoopBackOff / exited) — reset and
-                # re-check pod phase on the next iteration.
-                if 'container not found' in sentinel_err or (
-                    'unable to upgrade connection' in sentinel_err
-                ):
-                    logging.warning(
-                        '[swap_encryption] Pod %s: container not running'
-                        ' (%s) — will re-check pod state',
-                        ready_pod,
-                        sentinel_err.strip(),
-                    )
-                    ready_pod = None
-                    last_phase = ''
-                else:
-                    logging.info(
-                        '[swap_encryption] Pod %s: still installing tools...',
-                        ready_pod,
-                    )
-
-            time.sleep(15)
-
-        logging.warning(
-            '[swap_encryption] Benchmark pod not ready after %ds', timeout
-        )
-        return None
-
-    def _LogPodEvents(self, pod_name: str) -> None:
-        """Dump recent Kubernetes events for a pod to help diagnose hangs."""
-        events_out, _, _ = kubectl.RunKubectlCommand(
-            ['describe', 'pod', pod_name, '-n', self.namespace],
-            raise_on_failure=False,
+                last_phase = phase
+                if phase == 'Pending':
+                  self._LogPodEvents(pod_name)
+        else:
+          logging.info(
+            '[swap_encryption] Waiting for DaemonSet pod to'
+            ' appear...'
+          )
+
+      # Step 2: poll for /tmp/pkb_ready sentinel.
+      if ready_pod is not None:
+        _, sentinel_err, sentinel_rc = kubectl.RunKubectlCommand(
+          [
+            'exec',
+            ready_pod,
+            '-n',
+            self.namespace,
+            '--',
+            'test',
+            '-f',
+            '/tmp/pkb_ready',
+          ],
+          raise_on_failure=False,
         )
-        in_events = False
-        lines = []
-        for line in events_out.splitlines():
-            if line.startswith('Events:'):
-                in_events = True
-            if in_events:
-                lines.append(line)
-        if lines:
-            logging.info(
-                '[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])
-            )
+        if sentinel_rc == 0:
+          logging.info(
+            '[swap_encryption] Pod %s ready (swap device active)',
+            ready_pod,
+          )
+          self.pod_name = ready_pod
+          return ready_pod
+        # Container crashed (CrashLoopBackOff / exited) — reset and
+        # re-check pod phase on the next iteration.
+        if 'container not found' in sentinel_err or (
+          'unable to upgrade connection' in sentinel_err
+        ):
+          logging.warning(
+            '[swap_encryption] Pod %s: container not running'
+            ' (%s) — will re-check pod state',
+            ready_pod,
+            sentinel_err.strip(),
+          )
+          ready_pod = None
+          last_phase = ''
         else:
-            logging.info(
-                '[swap_encryption] kubectl describe output:\n%s',
-                events_out[-2000:] if len(events_out) > 2000 else events_out,
-            )
-
-    def _IsPodGone(self, pod: str) -> bool:
-        """Return True if the named pod no longer exists in the cluster."""
-        try:
-            _, err, rc = kubectl.RunKubectlCommand(
-                [
-                    'get',
-                    'pod',
-                    pod,
-                    '-n',
-                    self.namespace,
-                    '-o',
-                    'jsonpath={.metadata.name}',
-                ],
-                raise_on_failure=False,
-                timeout=15,
-            )
-            return rc != 0 and 'not found' in (err or '').lower()
-        except Exception:  # pylint: disable=broad-except
-            return False
-
-    def PodExec(
-        self,
-        cmd: str,
-        ignore_failure: bool = False,
-        timeout: int = 300,
-        _retries: int = 2,
-    ) -> tuple[str, str]:
-        """Run a shell command inside the benchmark pod via kubectl exec.
-
-        Handles:
+          logging.info(
+            '[swap_encryption] Pod %s: still installing tools...',
+            ready_pod,
+          )
+
+      time.sleep(15)
+
+    logging.warning(
+      '[swap_encryption] Benchmark pod not ready after %ds', timeout
+    )
+    return None
+
+  def _LogPodEvents(self, pod_name: str) -> None:
+    """Dump recent Kubernetes events for a pod to help diagnose hangs."""
+    events_out, _, _ = kubectl.RunKubectlCommand(
+      ['describe', 'pod', pod_name, '-n', self.namespace],
+      raise_on_failure=False,
+    )
+    in_events = False
+    lines = []
+    for line in events_out.splitlines():
+      if line.startswith('Events:'):
+        in_events = True
+      if in_events:
+        lines.append(line)
+    if lines:
+      logging.info(
+        '[swap_encryption] Pod events:\n%s', '\n'.join(lines[:30])
+      )
+    else:
+      logging.info(
+        '[swap_encryption] kubectl describe output:\n%s',
+        events_out[-2000:] if len(events_out) > 2000 else events_out,
+      )
+
+  def _IsPodGone(self, pod: str) -> bool:
+    """Return True if the named pod no longer exists in the cluster."""
+    try:
+      _, err, rc = kubectl.RunKubectlCommand(
+        [
+          'get',
+          'pod',
+          pod,
+          '-n',
+          self.namespace,
+          '-o',
+          'jsonpath={.metadata.name}',
+        ],
+        raise_on_failure=False,
+        timeout=15,
+      )
+      return rc != 0 and 'not found' in (err or '').lower()
+    except Exception:  # pylint: disable=broad-except
+      return False
+
+  def PodExec(
+    self,
+    cmd: str,
+    ignore_failure: bool = False,
+    timeout: int = 300,
+    _retries: int = 2,
+  ) -> tuple[str, str]:
+    """Run a shell command inside the benchmark pod via kubectl exec.
+
+    Handles:
           - Transient GKE websocket resets: automatic retry (up to _retries).
           - OOM kill (rc=137): records to self.oom_events, calls RecoverPod,
-            does NOT retry the OOM-triggering command itself.
+      does NOT retry the OOM-triggering command itself.
           - Container/pod gone: records to self.pod_lost, calls RecoverPod,
-            retries the command on the recovered pod.
+      retries the command on the recovered pod.
 
-        Uses self.pod_name as the active pod; RecoverPod updates it on eviction.
+    Uses self.pod_name as the active pod; RecoverPod updates it on eviction.
 
-        Args:
+    Args:
           cmd: Shell command string passed to bash -c.
           ignore_failure: When True, non-zero exit codes are logged but not
-            raised.
+      raised.
           timeout: Seconds before PKB kills the kubectl exec process.  Pass a
-            larger value for long-running jobs (fio, stress-ng, kernel build).
+      larger value for long-running jobs (fio, stress-ng, kernel build).
           _retries: Max automatic retries on transient websocket resets.
 
-        Returns:
+    Returns:
           Tuple of (stdout, stderr) strings.
-        """
-        active = self.pod_name
-
-        for attempt in range(_retries + 1):
-            out, err, rc = kubectl.RunKubectlCommand(
-                [
-                    'exec',
-                    active,
-                    '-n',
-                    self.namespace,
-                    '--',
-                    'bash',
-                    '-c',
-                    cmd,
-                ],
-                raise_on_failure=False,
-                raise_on_timeout=False,
-                timeout=timeout,
+    """
+    active = self.pod_name
+
+    for attempt in range(_retries + 1):
+      out, err, rc = kubectl.RunKubectlCommand(
+        [
+          'exec',
+          active,
+          '-n',
+          self.namespace,
+          '--',
+          'bash',
+          '-c',
+          cmd,
+        ],
+        raise_on_failure=False,
+        raise_on_timeout=False,
+        timeout=timeout,
+      )
+
+      # Retry transient GKE websocket resets.
+      is_transient = rc != 0 and any(
+        e in err for e in _TRANSIENT_KUBECTL_ERRORS
+      )
+      if is_transient and attempt < _retries:
+        logging.warning(
+          '[swap_encryption] kubectl exec connection reset (attempt'
+          ' %d/%d); retrying in 10 s',
+          attempt + 1,
+          _retries + 1,
+        )
+        time.sleep(10)
+        continue
+
+      # rc=137 (SIGKILL): OOM killer terminated the container process.
+      # Do NOT retry — log, recover, and return so the caller can decide.
+      if rc == 137:
+        if active not in self.oom_events:
+          self.oom_events.append(active)
+        # Kubernetes takes a few seconds to update pod state after
+        # eviction — sleep before checking to avoid false-positive Running.
+        logging.warning(
+          '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes'
+          ' to update pod state before recovery check'
+        )
+        time.sleep(15)
+        if self._IsPodGone(active):
+          logging.warning(
+            '[swap_encryption] OOM-eviction detected (rc=137, pod'
+            ' gone) — recovering pod name for subsequent commands'
+          )
+        else:
+          logging.warning(
+            '[swap_encryption] Container OOM-killed (rc=137, pod'
+            ' still exists) — waiting for container restart'
+          )
+        new_pod = self.RecoverPod(active)
+        if new_pod != active:
+          logging.info(
+            '[swap_encryption] Pod name updated: %s → %s',
+            active,
+            new_pod,
+          )
+          self.pod_name = new_pod
+          active = new_pod
+        break  # OOM cmd is never re-run on the recovered pod.
+
+      # Container or pod gone: record loss, try RecoverPod, retry cmd.
+      is_container_gone = rc != 0 and any(
+        e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
+      )
+      if is_container_gone:
+        if active and active not in self.pod_lost:
+          self.pod_lost.append(active)
+          logging.error(
+            '[swap_encryption] Benchmark pod %s is gone (%s) —'
+            ' recording run as degraded',
+            active,
+            (err or '').strip()[:160],
+          )
+        if attempt < _retries:
+          logging.warning(
+            '[swap_encryption] Container gone/restarting (attempt'
+            ' %d/%d) — waiting for pod to recover...',
+            attempt + 1,
+            _retries + 1,
+          )
+          new_pod = self.RecoverPod(active)
+          if new_pod != active:
+            logging.info(
+              '[swap_encryption] Pod name updated: %s → %s',
+              active,
+              new_pod,
             )
+            self.pod_name = new_pod
+            active = new_pod
+          continue
+      break
 
-            # Retry transient GKE websocket resets.
-            is_transient = rc != 0 and any(
-                e in err for e in _TRANSIENT_KUBECTL_ERRORS
-            )
-            if is_transient and attempt < _retries:
-                logging.warning(
-                    '[swap_encryption] kubectl exec connection reset (attempt'
-                    ' %d/%d); retrying in 10 s',
-                    attempt + 1,
-                    _retries + 1,
-                )
-                time.sleep(10)
-                continue
-
-            # rc=137 (SIGKILL): OOM killer terminated the container process.
-            # Do NOT retry — log, recover, and return so the caller can decide.
-            if rc == 137:
-                if active not in self.oom_events:
-                    self.oom_events.append(active)
-                # Kubernetes takes a few seconds to update pod state after
-                # eviction — sleep before checking to avoid false-positive Running.
-                logging.warning(
-                    '[swap_encryption] rc=137 — sleeping 15 s for Kubernetes'
-                    ' to update pod state before recovery check'
-                )
-                time.sleep(15)
-                if self._IsPodGone(active):
-                    logging.warning(
-                        '[swap_encryption] OOM-eviction detected (rc=137, pod'
-                        ' gone) — recovering pod name for subsequent commands'
-                    )
-                else:
-                    logging.warning(
-                        '[swap_encryption] Container OOM-killed (rc=137, pod'
-                        ' still exists) — waiting for container restart'
-                    )
-                new_pod = self.RecoverPod(active)
-                if new_pod != active:
-                    logging.info(
-                        '[swap_encryption] Pod name updated: %s → %s',
-                        active,
-                        new_pod,
-                    )
-                    self.pod_name = new_pod
-                    active = new_pod
-                break  # OOM cmd is never re-run on the recovered pod.
-
-            # Container or pod gone: record loss, try RecoverPod, retry cmd.
-            is_container_gone = rc != 0 and any(
-                e in err.lower() for e in _CONTAINER_GONE_KUBECTL_ERRORS
-            )
-            if is_container_gone:
-                if active and active not in self.pod_lost:
-                    self.pod_lost.append(active)
-                    logging.error(
-                        '[swap_encryption] Benchmark pod %s is gone (%s) —'
-                        ' recording run as degraded',
-                        active,
-                        (err or '').strip()[:160],
-                    )
-                if attempt < _retries:
-                    logging.warning(
-                        '[swap_encryption] Container gone/restarting (attempt'
-                        ' %d/%d) — waiting for pod to recover...',
-                        attempt + 1,
-                        _retries + 1,
-                    )
-                    new_pod = self.RecoverPod(active)
-                    if new_pod != active:
-                        logging.info(
-                            '[swap_encryption] Pod name updated: %s → %s',
-                            active,
-                            new_pod,
-                        )
-                        self.pod_name = new_pod
-                        active = new_pod
-                    continue
-            break
-
-        if rc != 0 and not ignore_failure:
-            raise errors.VmUtil.IssueCommandError(
-                f'[swap_encryption] PodExec failed (rc={rc}): {err}'
-            )
-        return out, err
+    if rc != 0 and not ignore_failure:
+      raise errors.VmUtil.IssueCommandError(
+        f'[swap_encryption] PodExec failed (rc={rc}): {err}'
+      )
+    return out, err
 
-    def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str:
-        """Wait for the DaemonSet to recover after OOM kill or eviction.
+  def RecoverPod(self, pod: str, timeout_sec: int = 600) -> str:
+    """Wait for the DaemonSet to recover after OOM kill or eviction.
 
-        Handles two scenarios:
+    Handles two scenarios:
           1. Container OOM restart: same pod name, container restarting in
              place (DaemonSet restartPolicy=Always).
           2. Pod eviction/deletion: pod is gone; DaemonSet creates a new pod
              with a DIFFERENT name.
 
-        Checks metadata.deletionTimestamp in addition to status.phase to
-        catch the Terminating state where phase may still read Running.
+    Checks metadata.deletionTimestamp in addition to status.phase to
+    catch the Terminating state where phase may still read Running.
 
-        Args:
+    Args:
           pod: Original pod name to monitor.
           timeout_sec: Maximum seconds to wait for recovery.
 
-        Returns:
+    Returns:
           The (possibly new) pod name once Running and /tmp/pkb_ready is
           present.
-        """
-        deadline = time.time() + timeout_sec
-        logging.info(
-            '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
-            pod,
-            timeout_sec,
+    """
+    deadline = time.time() + timeout_sec
+    logging.info(
+      '[swap_encryption] Waiting for pod %s to recover (up to %ds)...',
+      pod,
+      timeout_sec,
+    )
+
+    # Phase 1: find a Running pod that is NOT being terminated.
+    recovered_pod = pod
+    while time.time() < deadline:
+      # Query both phase and deletionTimestamp in a single call.
+      status_out, status_err, status_rc = kubectl.RunKubectlCommand(
+        [
+          'get',
+          'pod',
+          pod,
+          '-n',
+          self.namespace,
+          '-o',
+          'jsonpath={.status.phase}|{.metadata.deletionTimestamp}',
+        ],
+        raise_on_failure=False,
+        timeout=30,
+      )
+      fields = status_out.strip().split('|')
+      phase = fields[0].strip() if fields else ''
+      is_terminating = len(fields) > 1 and bool(fields[1].strip())
+
+      # Genuine Running (not being deleted) — move to Phase 2.
+      if status_rc == 0 and phase == 'Running' and not is_terminating:
+        break
+
+      # Pod gone or Terminating — look for a replacement by label.
+      pod_gone_or_terminating = (
+        status_rc != 0
+        and 'not found' in (status_out + status_err).lower()
+      ) or is_terminating
+      if pod_gone_or_terminating:
+        label_out, _, label_rc = kubectl.RunKubectlCommand(
+          [
+            'get',
+            'pods',
+            '-n',
+            self.namespace,
+            '-l',
+            f'app={self.label}',
+            '-o',
+            (
+              'jsonpath={range'
+              ' .items[?(@.status.phase=="Running")]}'
+              '{.metadata.name}{"\\n"}{end}'
+            ),
+          ],
+          raise_on_failure=False,
+          timeout=30,
         )
-
-        # Phase 1: find a Running pod that is NOT being terminated.
-        recovered_pod = pod
-        while time.time() < deadline:
-            # Query both phase and deletionTimestamp in a single call.
-            status_out, status_err, status_rc = kubectl.RunKubectlCommand(
-                [
-                    'get',
-                    'pod',
-                    pod,
-                    '-n',
-                    self.namespace,
-                    '-o',
-                    'jsonpath={.status.phase}|{.metadata.deletionTimestamp}',
-                ],
-                raise_on_failure=False,
-                timeout=30,
-            )
-            fields = status_out.strip().split('|')
-            phase = fields[0].strip() if fields else ''
-            is_terminating = len(fields) > 1 and bool(fields[1].strip())
-
-            # Genuine Running (not being deleted) — move to Phase 2.
-            if status_rc == 0 and phase == 'Running' and not is_terminating:
-                break
-
-            # Pod gone or Terminating — look for a replacement by label.
-            pod_gone_or_terminating = (
-                status_rc != 0
-                and 'not found' in (status_out + status_err).lower()
-            ) or is_terminating
-            if pod_gone_or_terminating:
-                label_out, _, label_rc = kubectl.RunKubectlCommand(
-                    [
-                        'get',
-                        'pods',
-                        '-n',
-                        self.namespace,
-                        '-l',
-                        f'app={self.label}',
-                        '-o',
-                        (
-                            'jsonpath={range'
-                            ' .items[?(@.status.phase=="Running")]}'
-                            '{.metadata.name}{"\\n"}{end}'
-                        ),
-                    ],
-                    raise_on_failure=False,
-                    timeout=30,
-                )
-                new_pods = [
-                    p.strip()
-                    for p in label_out.strip().splitlines()
-                    if p.strip() and p.strip() != pod
-                ]
-                if label_rc == 0 and new_pods:
-                    recovered_pod = new_pods[0]
-                    logging.info(
-                        '[swap_encryption] Original pod %s gone/terminating;'
-                        ' found replacement %s',
-                        pod,
-                        recovered_pod,
-                    )
-                    break
-
-            time.sleep(10)
-        else:
-            raise errors.VmUtil.IssueCommandError(
-                f'[swap_encryption] No Running pod found (original: {pod})'
-                f' within {timeout_sec}s after OOM kill / eviction'
-            )
-
-        # Phase 2: wait for init script to finish (sentinel written last).
-        while time.time() < deadline:
-            ready_out, _, ready_rc = kubectl.RunKubectlCommand(
-                [
-                    'exec',
-                    recovered_pod,
-                    '-n',
-                    self.namespace,
-                    '--',
-                    'bash',
-                    '-c',
-                    'test -f /tmp/pkb_ready && echo READY',
-                ],
-                raise_on_failure=False,
-                timeout=30,
-            )
-            if ready_rc == 0 and 'READY' in ready_out:
-                logging.info(
-                    '[swap_encryption] Pod %s recovered (swap device active)',
-                    recovered_pod,
-                )
-                self.pod_name = recovered_pod
-                return recovered_pod
-            time.sleep(15)
-
-        raise errors.VmUtil.IssueCommandError(
-            f'[swap_encryption] Pod {recovered_pod} did not become ready'
-            f' within {timeout_sec}s after OOM kill / eviction'
+        new_pods = [
+          p.strip()
+          for p in label_out.strip().splitlines()
+          if p.strip() and p.strip() != pod
+        ]
+        if label_rc == 0 and new_pods:
+          recovered_pod = new_pods[0]
+          logging.info(
+            '[swap_encryption] Original pod %s gone/terminating;'
+            ' found replacement %s',
+            pod,
+            recovered_pod,
+          )
+          break
+
+      time.sleep(10)
+    else:
+      raise errors.VmUtil.IssueCommandError(
+        f'[swap_encryption] No Running pod found (original: {pod})'
+        f' within {timeout_sec}s after OOM kill / eviction'
+      )
+
+    # Phase 2: wait for init script to finish (sentinel written last).
+    while time.time() < deadline:
+      ready_out, _, ready_rc = kubectl.RunKubectlCommand(
+        [
+          'exec',
+          recovered_pod,
+          '-n',
+          self.namespace,
+          '--',
+          'bash',
+          '-c',
+          'test -f /tmp/pkb_ready && echo READY',
+        ],
+        raise_on_failure=False,
+        timeout=30,
+      )
+      if ready_rc == 0 and 'READY' in ready_out:
+        logging.info(
+          '[swap_encryption] Pod %s recovered (swap device active)',
+          recovered_pod,
         )
+        self.pod_name = recovered_pod
+        return recovered_pod
+      time.sleep(15)
+
+    raise errors.VmUtil.IssueCommandError(
+      f'[swap_encryption] Pod {recovered_pod} did not become ready'
+      f' within {timeout_sec}s after OOM kill / eviction'
+    )

From d997254919f816a9481c2ca56b04f1061f98ea82 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Mon, 29 Jun 2026 18:17:42 +0530
Subject: [PATCH 13/17] refactor(swap_encryption/pr1): correct PKB structure -
 swap_config as NodepoolSpec field

BREAKING: replaces SwapNodePool (standalone nodepool lifecycle) with the
correct PKB pattern: swap configuration declared in BENCHMARK_CONFIG and
applied by the existing GKE cluster creation flow.

New files:
- resources/container_service/swap_config.py
  - GkeSwapConfig(BaseResource): WriteLinuxConfigYaml(), ValidHyperdiskThroughput()
  - EksSwapConfig(BaseResource): stub for nodeadm config (deferred to PR #6780)

Core framework changes:
- configs/container_spec.py: add SwapConfigSpec(BaseSpec) + _SwapConfigDecoder
  + swap_config field on NodepoolSpec
- resources/container_service/container.py: add swap_config attr to BaseNodePoolConfig
- resources/container_service/container_cluster.py: propagate swap_config in
  _InitializeNodePool() (mirrors sandbox_config pattern)
- providers/gcp/google_kubernetes_engine.py: _AddNodeParamsToCmd() reads
  nodepool_config.swap_config - applies --system-config-from-file,
  UBUNTU_CONTAINERD, --no-enable-autorepair, boot-disk-provisioned-iops/throughput

Thin benchmark:
- BENCHMARK_CONFIG declares benchmark nodepool with swap_config (no separate
  nodepool create needed - GKE cluster creation handles it)
- Prepare(): deploy SwapDaemonSet + delete default-pool
- Run(): verify swap_active + swap_encrypted; report samples
- Cleanup(): empty (PKB auto-deletes spec.resources)

Addresses Ajay's review comments:
- r3457826290: swap as base resource plugged into GKE cluster creation flow
- r3457877984: linuxConfig.swapConfig via --system-config-from-file (GkeSwapConfig)
- r3457928855: removed memory.swap.max hack
- r3457964593: UBUNTU_CONTAINERD set per-nodepool in _AddNodeParamsToCmd
- r3472513706: swapConfig auto-enables memorySwapBehavior=LimitedSwap
- r3472549985: UBUNTU_CONTAINERD required for dm-crypt
---
 .../swap_encryption_benchmark.py                | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
index 3322795eec..d5b4ec08db 100644
--- a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -88,9 +88,18 @@
     'Override disk type for the benchmark nodepool.',
 )
 
+_DAEMONSET_IMAGE = flags.DEFINE_string(
+    'swap_encryption_daemonset_image',
+    'ubuntu:22.04',
+    'Container image for the privileged benchmark DaemonSet.',
+)
+
 _BenchmarkSpec = benchmark_spec.BenchmarkSpec
 _BENCHMARK_NODEPOOL = 'benchmark'
 _DEFAULT_POOL = 'default-pool'
+_DS_NAME = 'pkb-swap-benchmark'
+_DS_NAMESPACE = 'default'
+_DS_LABEL = 'pkb-swap-benchmark'
 
 
 def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
@@ -124,7 +133,13 @@ def Prepare(spec: _BenchmarkSpec) -> None:
     spec: PKB BenchmarkSpec with spec.container_cluster already created.
   """
   cluster = spec.container_cluster
-  daemonset = swap_daemonset.SwapDaemonSet(cluster=cluster)
+  daemonset = swap_daemonset.SwapDaemonSet(
+      name=_DS_NAME,
+      namespace=_DS_NAMESPACE,
+      label=_DS_LABEL,
+      nodepool=_BENCHMARK_NODEPOOL,
+      image=_DAEMONSET_IMAGE.value,
+  )
   daemonset.Create()
   spec.resources.append(daemonset)
   pod = daemonset.WaitForPod()

From a65db709104d7ca7518d002957df2f672bbdb326 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Tue, 30 Jun 2026 09:48:11 +0530
Subject: [PATCH 14/17] refactor(swap_config): add BaseSwapConfig abstract base
 class

GkeSwapConfig and EksSwapConfig now both inherit from BaseSwapConfig(BaseResource).
Common sysctl attrs (swappiness, min_free_kbytes, watermark_scale_factor) live in
the base class. Cloud-specific attrs remain in each subclass.

Addresses Zac's review: GkeSwapConfig & EksSwapConfig should inherit from BaseSwapConfig.
---
 .../container_service/swap_config.py          | 99 +++++++++++++++----
 1 file changed, 80 insertions(+), 19 deletions(-)

diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py
index 8606929308..ca36dbad8b 100644
--- a/perfkitbenchmarker/resources/container_service/swap_config.py
+++ b/perfkitbenchmarker/resources/container_service/swap_config.py
@@ -11,13 +11,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""GkeSwapConfig and EksSwapConfig: swap configuration as PKB BaseResource.
+"""Swap configuration as PKB BaseResource: BaseSwapConfig, GkeSwapConfig, EksSwapConfig.
 
 These resources encapsulate cloud-specific swap configuration for GKE and EKS
 nodepools. They are referenced via NodepoolSpec.swap_config (declared in the
 benchmark BENCHMARK_CONFIG YAML) and consumed by the cloud provider's
 _AddNodeParamsToCmd() during cluster/nodepool creation.
 
+Class hierarchy:
+  BaseSwapConfig(BaseResource)   — common sysctl attrs + abstract from_spec()
+    GkeSwapConfig(BaseSwapConfig) — linuxConfig YAML for --system-config-from-file
+    EksSwapConfig(BaseSwapConfig) — nodeadm kubelet config (deferred to PR #6780)
+
 Usage in BENCHMARK_CONFIG:
   container_cluster:
     nodepools:
@@ -51,16 +56,58 @@
 _HYPERDISK_MAX_IOPS_PER_MBPS = 256
 
 
-class GkeSwapConfig(resource.BaseResource):
+class BaseSwapConfig(resource.BaseResource):
+  """Abstract base class for cloud-specific nodepool swap configuration.
+
+  Subclasses (GkeSwapConfig, EksSwapConfig) implement the cloud-specific
+  method for applying swap configuration during nodepool creation.
+
+  Common sysctl attributes (vm.swappiness, vm.min_free_kbytes,
+  vm.watermark_scale_factor) are shared across all cloud providers.
+
+  _Create() and _Delete() are no-ops: the swap config is applied as a
+  parameter to nodepool creation, not as a standalone cloud resource.
+  """
+
+  RESOURCE_TYPE = 'BaseSwapConfig'
+  REQUIRED_ATTRS = []
+
+  def __init__(
+      self,
+      swappiness: int = 100,
+      min_free_kbytes: int = 200,
+      watermark_scale_factor: int = 500,
+  ) -> None:
+    super().__init__()
+    self.swappiness = swappiness
+    self.min_free_kbytes = min_free_kbytes
+    self.watermark_scale_factor = watermark_scale_factor
+
+  @classmethod
+  def from_spec(cls, swap_spec) -> 'BaseSwapConfig':
+    """Create a BaseSwapConfig subclass from a SwapConfigSpec.
+
+    Subclasses must override this to instantiate with cloud-specific attrs.
+    """
+    raise NotImplementedError(
+        f'{cls.__name__}.from_spec() must be implemented by subclasses.'
+    )
+
+  def _Create(self) -> None:
+    """No-op: swap config is applied during nodepool creation."""
+
+  def _Delete(self) -> None:
+    """No-op: cleaned up when the nodepool is deleted."""
+
+
+class GkeSwapConfig(BaseSwapConfig):
   """GKE swap configuration for a nodepool.
 
   Encapsulates the linuxConfig (swapConfig + sysctl) YAML for
   --system-config-from-file and optional Hyperdisk IOPS/throughput overrides.
 
   Consumed by GkeCluster._AddNodeParamsToCmd() when nodepool_config.swap_config
-  is set. _Create() and _Delete() are no-ops because the swap config is applied
-  as part of the gcloud node-pools create command; the nodepool itself manages
-  the lifecycle.
+  is set.
 
   Attributes:
     swappiness: vm.swappiness sysctl value (0-200, default 100).
@@ -85,10 +132,11 @@ def __init__(
       boot_disk_iops: int = 0,
       boot_disk_throughput: int = 0,
   ) -> None:
-    super().__init__()
-    self.swappiness = swappiness
-    self.min_free_kbytes = min_free_kbytes
-    self.watermark_scale_factor = watermark_scale_factor
+    super().__init__(
+        swappiness=swappiness,
+        min_free_kbytes=min_free_kbytes,
+        watermark_scale_factor=watermark_scale_factor,
+    )
     self.lssd = lssd
     self.lssd_count = lssd_count
     self.boot_disk_iops = boot_disk_iops
@@ -108,11 +156,8 @@ def from_spec(cls, swap_spec) -> 'GkeSwapConfig':
         boot_disk_throughput=swap_spec.boot_disk_throughput,
     )
 
-  def _Create(self) -> None:
-    """No-op: swap config is applied during nodepool creation."""
-
   def _Delete(self) -> None:
-    """No-op: cleaned up when the nodepool is deleted."""
+    """Cleans up any written YAML tempfile."""
     self._CleanupYaml()
 
   def WriteLinuxConfigYaml(self) -> str:
@@ -207,13 +252,16 @@ def _CleanupYaml(self) -> None:
     self.CleanupYaml()
 
 
-class EksSwapConfig(resource.BaseResource):
+class EksSwapConfig(BaseSwapConfig):
   """EKS swap configuration for a nodepool (stub).
 
   Configures kubelet LimitedSwap via nodeadm bootstrap configuration.
   Full implementation deferred to PR #6780.
 
   Attributes:
+    swappiness: vm.swappiness sysctl value (inherited from BaseSwapConfig).
+    min_free_kbytes: vm.min_free_kbytes sysctl (inherited from BaseSwapConfig).
+    watermark_scale_factor: vm.watermark_scale_factor (inherited from BaseSwapConfig).
     memory_swap_behavior: kubelet memorySwapBehavior value ('LimitedSwap').
     fail_swap_on: kubelet failSwapOn setting (False to allow swap on EKS).
   """
@@ -223,17 +271,28 @@ class EksSwapConfig(resource.BaseResource):
 
   def __init__(
       self,
+      swappiness: int = 100,
+      min_free_kbytes: int = 200,
+      watermark_scale_factor: int = 500,
       memory_swap_behavior: str = 'LimitedSwap',
       fail_swap_on: bool = False,
   ) -> None:
-    super().__init__()
+    super().__init__(
+        swappiness=swappiness,
+        min_free_kbytes=min_free_kbytes,
+        watermark_scale_factor=watermark_scale_factor,
+    )
     self.memory_swap_behavior = memory_swap_behavior
     self.fail_swap_on = fail_swap_on
 
   @classmethod
   def from_spec(cls, swap_spec) -> 'EksSwapConfig':
     """Create an EksSwapConfig from a SwapConfigSpec."""
-    return cls()
+    return cls(
+        swappiness=swap_spec.swappiness,
+        min_free_kbytes=swap_spec.min_free_kbytes,
+        watermark_scale_factor=swap_spec.watermark_scale_factor,
+    )
 
   def _Create(self) -> None:
     """Stub: EKS kubelet LimitedSwap config via nodeadm (deferred to PR #6780)."""
@@ -243,9 +302,6 @@ def _Create(self) -> None:
         '(deferred to PR #6780). Swap will not be enabled on EKS nodes.'
     )
 
-  def _Delete(self) -> None:
-    """No-op."""
-
   def GetNodeadmConfig(self) -> str:
     """Return nodeadm bootstrap YAML for kubelet swap settings."""
     return (
@@ -256,4 +312,9 @@ def GetNodeadmConfig(self) -> str:
         '    config:\n'
         f'      memorySwapBehavior: {self.memory_swap_behavior}\n'
         f'      failSwapOn: {str(self.fail_swap_on).lower()}\n'
+        '  containerd:\n'
+        '    config:\n'
+        f'      vm.swappiness: {self.swappiness}\n'
+        f'      vm.min_free_kbytes: {self.min_free_kbytes}\n'
+        f'      vm.watermark_scale_factor: {self.watermark_scale_factor}\n'
     )

From 8e6e719369ced00d960677328f6d453b0fd4d49b Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Wed, 1 Jul 2026 11:15:44 +0530
Subject: [PATCH 15/17] test(swap_config): add unit tests for BaseSwapConfig,
 GkeSwapConfig, EksSwapConfig and GKE wiring

---
 .../swap_encryption_benchmark_test.py         | 141 ++++++++++
 .../gcp/google_kubernetes_engine_test.py      | 156 +++++++++++
 .../container_service/swap_config_test.py     | 260 ++++++++++++++++++
 3 files changed, 557 insertions(+)
 create mode 100644 tests/linux_benchmarks/swap_encryption_benchmark_test.py
 create mode 100644 tests/resources/container_service/swap_config_test.py

diff --git a/tests/linux_benchmarks/swap_encryption_benchmark_test.py b/tests/linux_benchmarks/swap_encryption_benchmark_test.py
new file mode 100644
index 0000000000..9a29939cde
--- /dev/null
+++ b/tests/linux_benchmarks/swap_encryption_benchmark_test.py
@@ -0,0 +1,141 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for perfkitbenchmarker.linux_benchmarks.swap_encryption_benchmark."""
+
+import unittest
+from unittest import mock
+
+from perfkitbenchmarker.linux_benchmarks import swap_encryption_benchmark
+from tests import pkb_common_test_case
+
+
+class GetConfigTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests that BENCHMARK_CONFIG is well-formed and loadable."""
+
+  def test_get_config_returns_dict(self):
+    config = swap_encryption_benchmark.GetConfig({})
+    self.assertIsInstance(config, dict)
+
+  def test_get_config_has_container_cluster(self):
+    # configs.LoadConfig returns the inner benchmark dict directly (no benchmark
+    # name wrapper), so top-level keys are 'container_cluster', 'description', etc.
+    config = swap_encryption_benchmark.GetConfig({})
+    self.assertIn('container_cluster', config)
+
+  def test_get_config_benchmark_nodepool_present(self):
+    config = swap_encryption_benchmark.GetConfig({})
+    nodepools = config['container_cluster']['nodepools']
+    self.assertIn(
+        swap_encryption_benchmark._BENCHMARK_NODEPOOL,
+        nodepools,
+    )
+
+  def test_get_config_swap_config_present_on_benchmark_nodepool(self):
+    config = swap_encryption_benchmark.GetConfig({})
+    nodepool = config['container_cluster']['nodepools'][
+        swap_encryption_benchmark._BENCHMARK_NODEPOOL
+    ]
+    self.assertIn('swap_config', nodepool)
+    self.assertTrue(nodepool['swap_config'].get('enabled', False))
+
+
+class ParseCipherTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for _parse_cipher() output parsing."""
+
+  def test_parse_cipher_standard_aes_xts(self):
+    # dmsetup-style crypt line: <start> <length> crypt <cipher> <other fields>
+    status = '0 67108864 crypt aes-xts-plain64 0 8:16 0 1 sector_size:4096'
+    self.assertEqual(
+        swap_encryption_benchmark._parse_cipher(status), 'aes-xts-plain64'
+    )
+
+  def test_parse_cipher_returns_empty_when_no_crypt_token(self):
+    status = '0 67108864 linear 8:16 0'
+    self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '')
+
+  def test_parse_cipher_returns_empty_on_empty_string(self):
+    self.assertEqual(swap_encryption_benchmark._parse_cipher(''), '')
+
+  def test_parse_cipher_crypt_at_end_returns_empty(self):
+    # 'crypt' present but no token after it.
+    status = 'something crypt'
+    self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '')
+
+  def test_parse_cipher_not_encrypted_string(self):
+    # Output from the benchmark when dm-crypt not active.
+    status = 'not_encrypted'
+    self.assertEqual(swap_encryption_benchmark._parse_cipher(status), '')
+
+
+class DetectSwapDeviceTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for _detect_swap_device() with mocked PodExec."""
+
+  def _make_ds(self, pod_exec_output):
+    ds = mock.Mock()
+    ds.PodExec.return_value = (pod_exec_output, '')
+    return ds
+
+  def test_detect_swap_device_returns_device_basename(self):
+    # /proc/swaps first device column (after header skip via awk NR>1).
+    ds = self._make_ds('/dev/dm-0\n')
+    result = swap_encryption_benchmark._detect_swap_device(ds)
+    self.assertEqual(result, 'dm-0')
+
+  def test_detect_swap_device_returns_first_device_when_multiple(self):
+    ds = self._make_ds('/dev/dm-0\n/dev/dm-1\n')
+    result = swap_encryption_benchmark._detect_swap_device(ds)
+    self.assertEqual(result, 'dm-0')
+
+  def test_detect_swap_device_returns_empty_when_no_swap(self):
+    ds = self._make_ds('')
+    result = swap_encryption_benchmark._detect_swap_device(ds)
+    self.assertEqual(result, '')
+
+  def test_detect_swap_device_returns_empty_on_pod_exec_exception(self):
+    ds = mock.Mock()
+    ds.PodExec.side_effect = Exception('pod not found')
+    result = swap_encryption_benchmark._detect_swap_device(ds)
+    self.assertEqual(result, '')
+
+
+class BuildMetadataTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for _build_metadata() with mocked PodExec."""
+
+  def test_build_metadata_includes_swap_device(self):
+    ds = mock.Mock()
+    ds.PodExec.return_value = ('5.15.0-gke-1234\n', '')
+    meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0')
+    self.assertEqual(meta['swap_device'], 'dm-0')
+
+  def test_build_metadata_swap_device_unknown_when_empty(self):
+    ds = mock.Mock()
+    ds.PodExec.return_value = ('5.15.0\n', '')
+    meta = swap_encryption_benchmark._build_metadata(ds, '')
+    self.assertEqual(meta['swap_device'], 'unknown')
+
+  def test_build_metadata_includes_kernel_version(self):
+    ds = mock.Mock()
+    ds.PodExec.return_value = ('5.15.0-gke-1234\n', '')
+    meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0')
+    self.assertEqual(meta['kernel_version'], '5.15.0-gke-1234')
+
+  def test_build_metadata_kernel_version_absent_on_pod_exec_exception(self):
+    ds = mock.Mock()
+    ds.PodExec.side_effect = Exception('timeout')
+    meta = swap_encryption_benchmark._build_metadata(ds, 'dm-0')
+    self.assertNotIn('kernel_version', meta)
+
+
+if __name__ == '__main__':
+  unittest.main()
diff --git a/tests/providers/gcp/google_kubernetes_engine_test.py b/tests/providers/gcp/google_kubernetes_engine_test.py
index dbf8232f5e..421d99cd02 100644
--- a/tests/providers/gcp/google_kubernetes_engine_test.py
+++ b/tests/providers/gcp/google_kubernetes_engine_test.py
@@ -34,6 +34,7 @@
 from perfkitbenchmarker.resources.container_service import container
 from perfkitbenchmarker.resources.container_service import kubectl
 from perfkitbenchmarker.resources.container_service import kubernetes_commands
+from perfkitbenchmarker.resources.container_service import swap_config as swap_config_lib
 from tests import pkb_common_test_case
 
 FLAGS = flgs.FLAGS
@@ -949,5 +950,160 @@ def testCreateWithPerNodepoolAutoscaling(self):
       self.assertIn('--max-nodes 10', nodepool_cmd)
 
 
+class GoogleKubernetesEngineSwapConfigTestCase(PatchedObjectsTestCase):
+  """Tests that _AddNodeParamsToCmd wires swap_config flags correctly."""
+
+  @staticmethod
+  def _make_swap_spec(
+      boot_disk_iops=160000,
+      boot_disk_throughput=2400,
+      lssd=False,
+      lssd_count=0,
+  ):
+    """Build a ContainerClusterSpec with swap_config on the benchmark nodepool."""
+    return container_spec.ContainerClusterSpec(
+        'NAME',
+        **{
+            'cloud': 'GCP',
+            'vm_spec': {
+                'GCP': {
+                    'machine_type': 'e2-medium',
+                    'zone': 'us-central1-a',
+                },
+            },
+            'nodepools': {
+                'benchmark': {
+                    'vm_spec': {
+                        'GCP': {
+                            'machine_type': 'n4-highmem-32',
+                            'zone': 'us-central1-a',
+                        },
+                    },
+                    'swap_config': {
+                        'enabled': True,
+                        'swappiness': 100,
+                        'min_free_kbytes': 200,
+                        'watermark_scale_factor': 500,
+                        'lssd': lssd,
+                        'lssd_count': lssd_count,
+                        'boot_disk_iops': boot_disk_iops,
+                        'boot_disk_throughput': boot_disk_throughput,
+                    },
+                },
+            },
+        },
+    )
+
+  def setUp(self):
+    super().setUp()
+    # Avoid real tempfile creation in GKE command-generation tests.
+    # GkeSwapConfig implementation is tested separately in swap_config_test.py.
+    self.enter_context(
+        mock.patch.object(
+            swap_config_lib.GkeSwapConfig,
+            'WriteLinuxConfigYaml',
+            return_value='/tmp/fake_linux_config.yaml',
+        )
+    )
+
+  def test_swap_config_sets_system_config_from_file_flag(self):
+    spec = self._make_swap_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertIsNotNone(nodepool_cmd)
+      self.assertIn('--system-config-from-file', nodepool_cmd)
+      self.assertIn('/tmp/fake_linux_config.yaml', nodepool_cmd)
+
+  def test_swap_config_sets_ubuntu_containerd_image_type(self):
+    spec = self._make_swap_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertIn('UBUNTU_CONTAINERD', nodepool_cmd)
+
+  def test_swap_config_sets_no_enable_autorepair(self):
+    spec = self._make_swap_spec()
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertIn('--no-enable-autorepair', nodepool_cmd)
+
+  def test_swap_config_with_boot_disk_iops_sets_provisioned_flags(self):
+    spec = self._make_swap_spec(boot_disk_iops=160000, boot_disk_throughput=2400)
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertIn('--boot-disk-provisioned-iops', nodepool_cmd)
+      self.assertIn('--boot-disk-provisioned-throughput', nodepool_cmd)
+
+  def test_swap_config_lssd_omits_boot_disk_provisioned_flags(self):
+    # When lssd=True the swap device is local NVMe, not the boot disk.
+    spec = self._make_swap_spec(lssd=True, lssd_count=2, boot_disk_iops=0)
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertNotIn('--boot-disk-provisioned-iops', nodepool_cmd)
+      self.assertNotIn('--boot-disk-provisioned-throughput', nodepool_cmd)
+
+  def test_nodepool_without_swap_config_omits_all_swap_flags(self):
+    spec = container_spec.ContainerClusterSpec(
+        'NAME',
+        **{
+            'cloud': 'GCP',
+            'vm_spec': {
+                'GCP': {
+                    'machine_type': 'e2-medium',
+                    'zone': 'us-central1-a',
+                },
+            },
+            'nodepools': {
+                'benchmark': {
+                    'vm_spec': {
+                        'GCP': {
+                            'machine_type': 'n4-highmem-32',
+                            'zone': 'us-central1-a',
+                        },
+                    },
+                },
+            },
+        },
+    )
+    with self.patch_critical_objects() as issue_command:
+      cluster = google_kubernetes_engine.GkeCluster(spec)
+      cluster._Create()
+      nodepool_cmd = issue_command.GetCommandWithSubstring(
+          'node-pools create benchmark'
+      )
+      self.assertNotIn('--system-config-from-file', nodepool_cmd)
+      self.assertNotIn('UBUNTU_CONTAINERD', nodepool_cmd)
+      self.assertNotIn('--no-enable-autorepair', nodepool_cmd)
+
+  def test_cleanup_yaml_called_after_nodepool_create(self):
+    spec = self._make_swap_spec()
+    with mock.patch.object(
+        swap_config_lib.GkeSwapConfig, 'CleanupYaml'
+    ) as mock_cleanup:
+      with self.patch_critical_objects():
+        cluster = google_kubernetes_engine.GkeCluster(spec)
+        cluster._Create()
+    mock_cleanup.assert_called_once()
+
+
 if __name__ == '__main__':
   unittest.main()
diff --git a/tests/resources/container_service/swap_config_test.py b/tests/resources/container_service/swap_config_test.py
new file mode 100644
index 0000000000..f71ba04d8e
--- /dev/null
+++ b/tests/resources/container_service/swap_config_test.py
@@ -0,0 +1,260 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for perfkitbenchmarker.resources.container_service.swap_config."""
+
+import os
+import unittest
+from unittest import mock
+
+from perfkitbenchmarker.resources.container_service import swap_config
+from tests import pkb_common_test_case
+
+
+class BaseSwapConfigTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for the abstract BaseSwapConfig class."""
+
+  def test_default_attrs(self):
+    cfg = swap_config.BaseSwapConfig()
+    self.assertEqual(cfg.swappiness, 100)
+    self.assertEqual(cfg.min_free_kbytes, 200)
+    self.assertEqual(cfg.watermark_scale_factor, 500)
+
+  def test_custom_attrs(self):
+    cfg = swap_config.BaseSwapConfig(
+        swappiness=60, min_free_kbytes=400, watermark_scale_factor=200
+    )
+    self.assertEqual(cfg.swappiness, 60)
+    self.assertEqual(cfg.min_free_kbytes, 400)
+    self.assertEqual(cfg.watermark_scale_factor, 200)
+
+  def test_from_spec_raises_not_implemented(self):
+    with self.assertRaises(NotImplementedError):
+      swap_config.BaseSwapConfig.from_spec(mock.Mock())
+
+  def test_create_is_noop(self):
+    cfg = swap_config.BaseSwapConfig()
+    cfg._Create()  # Must not raise.
+
+  def test_delete_is_noop(self):
+    cfg = swap_config.BaseSwapConfig()
+    cfg._Delete()  # Must not raise.
+
+
+class GkeSwapConfigTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for GkeSwapConfig: YAML generation, Hyperdisk clamping, lifecycle."""
+
+  def _make_spec(self, **kwargs):
+    """Return a mock.Mock spec; kwargs override the default attributes."""
+    spec = mock.Mock()
+    spec.swappiness = kwargs.get('swappiness', 100)
+    spec.min_free_kbytes = kwargs.get('min_free_kbytes', 200)
+    spec.watermark_scale_factor = kwargs.get('watermark_scale_factor', 500)
+    spec.lssd = kwargs.get('lssd', False)
+    spec.lssd_count = kwargs.get('lssd_count', 0)
+    spec.boot_disk_iops = kwargs.get('boot_disk_iops', 0)
+    spec.boot_disk_throughput = kwargs.get('boot_disk_throughput', 0)
+    return spec
+
+  # ── from_spec ─────────────────────────────────────────────────────────────
+
+  def test_from_spec_maps_all_attrs(self):
+    spec = self._make_spec(
+        swappiness=60,
+        min_free_kbytes=400,
+        watermark_scale_factor=200,
+        lssd=True,
+        lssd_count=2,
+        boot_disk_iops=160000,
+        boot_disk_throughput=2400,
+    )
+    cfg = swap_config.GkeSwapConfig.from_spec(spec)
+    self.assertEqual(cfg.swappiness, 60)
+    self.assertEqual(cfg.min_free_kbytes, 400)
+    self.assertEqual(cfg.watermark_scale_factor, 200)
+    self.assertTrue(cfg.lssd)
+    self.assertEqual(cfg.lssd_count, 2)
+    self.assertEqual(cfg.boot_disk_iops, 160000)
+    self.assertEqual(cfg.boot_disk_throughput, 2400)
+
+  # ── WriteLinuxConfigYaml ──────────────────────────────────────────────────
+
+  def test_write_linux_config_yaml_basic_content(self):
+    cfg = swap_config.GkeSwapConfig(
+        swappiness=80, min_free_kbytes=300, watermark_scale_factor=400
+    )
+    path = cfg.WriteLinuxConfigYaml()
+    try:
+      with open(path) as f:
+        content = f.read()
+      self.assertIn('linuxConfig:', content)
+      self.assertIn('swapConfig:', content)
+      self.assertIn('enabled: true', content)
+      self.assertIn('vm.swappiness: 80', content)
+      self.assertIn('vm.min_free_kbytes: 300', content)
+      self.assertIn('vm.watermark_scale_factor: 400', content)
+    finally:
+      cfg.CleanupYaml()
+
+  def test_write_linux_config_yaml_no_lssd_has_no_disk_profile(self):
+    cfg = swap_config.GkeSwapConfig(lssd=False)
+    path = cfg.WriteLinuxConfigYaml()
+    try:
+      with open(path) as f:
+        content = f.read()
+      self.assertNotIn('dedicatedLocalSsdProfile', content)
+      self.assertNotIn('diskCount', content)
+    finally:
+      cfg.CleanupYaml()
+
+  def test_write_linux_config_yaml_lssd_includes_disk_profile(self):
+    cfg = swap_config.GkeSwapConfig(lssd=True, lssd_count=2)
+    path = cfg.WriteLinuxConfigYaml()
+    try:
+      with open(path) as f:
+        content = f.read()
+      self.assertIn('dedicatedLocalSsdProfile:', content)
+      self.assertIn('diskCount: 2', content)
+    finally:
+      cfg.CleanupYaml()
+
+  def test_write_linux_config_yaml_returns_existing_file_path(self):
+    cfg = swap_config.GkeSwapConfig()
+    path = cfg.WriteLinuxConfigYaml()
+    try:
+      self.assertTrue(os.path.isfile(path))
+    finally:
+      cfg.CleanupYaml()
+
+  # ── CleanupYaml ───────────────────────────────────────────────────────────
+
+  def test_cleanup_yaml_removes_tempfile(self):
+    cfg = swap_config.GkeSwapConfig()
+    path = cfg.WriteLinuxConfigYaml()
+    self.assertTrue(os.path.exists(path))
+    cfg.CleanupYaml()
+    self.assertFalse(os.path.exists(path))
+
+  def test_cleanup_yaml_noop_before_write(self):
+    cfg = swap_config.GkeSwapConfig()
+    cfg.CleanupYaml()  # Must not raise.
+
+  def test_cleanup_yaml_noop_on_second_call(self):
+    cfg = swap_config.GkeSwapConfig()
+    cfg.WriteLinuxConfigYaml()
+    cfg.CleanupYaml()  # First call cleans up after the write above.
+    cfg.CleanupYaml()  # Second call must not raise.
+
+  # ── ValidHyperdiskThroughput ──────────────────────────────────────────────
+
+  def test_valid_hyperdisk_throughput_no_clamp_needed(self):
+    # min_throughput = ceil(160000 / 256) = 625; 2400 > 625 → unchanged.
+    cfg = swap_config.GkeSwapConfig(
+        boot_disk_iops=160000, boot_disk_throughput=2400
+    )
+    self.assertEqual(cfg.ValidHyperdiskThroughput(), 2400)
+
+  def test_valid_hyperdisk_throughput_clamps_up(self):
+    # min_throughput = ceil(160000 / 256) = 625; 100 < 625 → clamp to 625.
+    cfg = swap_config.GkeSwapConfig(
+        boot_disk_iops=160000, boot_disk_throughput=100
+    )
+    self.assertEqual(cfg.ValidHyperdiskThroughput(), 625)
+
+  def test_valid_hyperdisk_throughput_no_iops_returns_throughput(self):
+    # iops=0 means no constraint → return throughput unchanged.
+    cfg = swap_config.GkeSwapConfig(boot_disk_iops=0, boot_disk_throughput=500)
+    self.assertEqual(cfg.ValidHyperdiskThroughput(), 500)
+
+  def test_valid_hyperdisk_throughput_both_zero_returns_zero(self):
+    cfg = swap_config.GkeSwapConfig(boot_disk_iops=0, boot_disk_throughput=0)
+    self.assertEqual(cfg.ValidHyperdiskThroughput(), 0)
+
+  def test_valid_hyperdisk_throughput_exact_minimum_no_clamp(self):
+    # min_throughput = ceil(256 / 256) = 1; 1 == 1 → at boundary, unchanged.
+    cfg = swap_config.GkeSwapConfig(boot_disk_iops=256, boot_disk_throughput=1)
+    self.assertEqual(cfg.ValidHyperdiskThroughput(), 1)
+
+
+class EksSwapConfigTest(pkb_common_test_case.PkbCommonTestCase):
+  """Tests for EksSwapConfig: nodeadm YAML output and from_spec mapping."""
+
+  def _make_spec(self, **kwargs):
+    spec = mock.Mock()  # Other attributes resolve to auto-created Mocks.
+    spec.swappiness = kwargs.get('swappiness', 100)
+    spec.min_free_kbytes = kwargs.get('min_free_kbytes', 200)
+    spec.watermark_scale_factor = kwargs.get('watermark_scale_factor', 500)
+    return spec
+
+  # ── from_spec ─────────────────────────────────────────────────────────────
+
+  def test_from_spec_maps_sysctl_attrs(self):
+    spec = self._make_spec(
+        swappiness=60, min_free_kbytes=400, watermark_scale_factor=200
+    )
+    cfg = swap_config.EksSwapConfig.from_spec(spec)
+    self.assertEqual(cfg.swappiness, 60)
+    self.assertEqual(cfg.min_free_kbytes, 400)
+    self.assertEqual(cfg.watermark_scale_factor, 200)
+
+  def test_from_spec_eks_specific_attrs_use_defaults(self):
+    # from_spec must keep memory_swap_behavior / fail_swap_on at defaults.
+    cfg = swap_config.EksSwapConfig.from_spec(self._make_spec())
+    self.assertEqual(cfg.memory_swap_behavior, 'LimitedSwap')
+    self.assertFalse(cfg.fail_swap_on)
+
+  # ── GetNodeadmConfig ──────────────────────────────────────────────────────
+
+  def test_get_nodeadm_config_api_version(self):
+    cfg = swap_config.EksSwapConfig()
+    self.assertIn('apiVersion: node.eks.aws/v1alpha1', cfg.GetNodeadmConfig())
+
+  def test_get_nodeadm_config_memory_swap_behavior(self):
+    cfg = swap_config.EksSwapConfig()
+    self.assertIn('memorySwapBehavior: LimitedSwap', cfg.GetNodeadmConfig())
+
+  def test_get_nodeadm_config_fail_swap_on_false(self):
+    cfg = swap_config.EksSwapConfig(fail_swap_on=False)
+    self.assertIn('failSwapOn: false', cfg.GetNodeadmConfig())
+
+  def test_get_nodeadm_config_sysctl_keys_present(self):
+    cfg = swap_config.EksSwapConfig()
+    output = cfg.GetNodeadmConfig()
+    self.assertIn('vm.swappiness:', output)
+    self.assertIn('vm.min_free_kbytes:', output)
+    self.assertIn('vm.watermark_scale_factor:', output)
+
+  def test_get_nodeadm_config_reflects_custom_sysctl_values(self):
+    cfg = swap_config.EksSwapConfig(
+        swappiness=60, min_free_kbytes=400, watermark_scale_factor=200
+    )
+    output = cfg.GetNodeadmConfig()
+    self.assertIn('vm.swappiness: 60', output)
+    self.assertIn('vm.min_free_kbytes: 400', output)
+    self.assertIn('vm.watermark_scale_factor: 200', output)
+
+  # ── _Create stub ──────────────────────────────────────────────────────────
+
+  def test_create_logs_deferred_warning(self):
+    cfg = swap_config.EksSwapConfig()
+    with self.assertLogs(level='WARNING') as log_ctx:
+      cfg._Create()
+    combined = ' '.join(log_ctx.output).lower()  # Case-insensitive match.
+    self.assertTrue(
+        'stub' in combined or 'deferred' in combined,
+        msg=f'Expected "stub" or "deferred" in log output: {log_ctx.output}',
+    )
+
+
+if __name__ == '__main__':
+  unittest.main()  # Run this module's tests when executed directly.

From 7056b1a14e3267e7631e0c82e903f93e70c233ab Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Wed, 1 Jul 2026 12:56:11 +0530
Subject: [PATCH 16/17] fix(swap_config): quote sysctl values as strings in GKE
 linuxConfig YAML

---
 .../resources/container_service/swap_config.py              | 6 +++---
 tests/resources/container_service/swap_config_test.py       | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py
index ca36dbad8b..a38b7adccf 100644
--- a/perfkitbenchmarker/resources/container_service/swap_config.py
+++ b/perfkitbenchmarker/resources/container_service/swap_config.py
@@ -191,9 +191,9 @@ def WriteLinuxConfigYaml(self) -> str:
         'linuxConfig:\n'
         + swap_block
         + '  sysctl:\n'
-        + f'    vm.swappiness: {self.swappiness}\n'
-        + f'    vm.min_free_kbytes: {self.min_free_kbytes}\n'
-        + f'    vm.watermark_scale_factor: {self.watermark_scale_factor}\n'
+        + f'    vm.swappiness: "{self.swappiness}"\n'
+        + f'    vm.min_free_kbytes: "{self.min_free_kbytes}"\n'
+        + f'    vm.watermark_scale_factor: "{self.watermark_scale_factor}"\n'
     )
 
     tmp = tempfile.NamedTemporaryFile(
diff --git a/tests/resources/container_service/swap_config_test.py b/tests/resources/container_service/swap_config_test.py
index f71ba04d8e..0d965ed2ce 100644
--- a/tests/resources/container_service/swap_config_test.py
+++ b/tests/resources/container_service/swap_config_test.py
@@ -100,9 +100,9 @@ def test_write_linux_config_yaml_basic_content(self):
       self.assertIn('linuxConfig:', content)
       self.assertIn('swapConfig:', content)
       self.assertIn('enabled: true', content)
-      self.assertIn('vm.swappiness: 80', content)
-      self.assertIn('vm.min_free_kbytes: 300', content)
-      self.assertIn('vm.watermark_scale_factor: 400', content)
+      self.assertIn('vm.swappiness: "80"', content)
+      self.assertIn('vm.min_free_kbytes: "300"', content)
+      self.assertIn('vm.watermark_scale_factor: "400"', content)
     finally:
       cfg.CleanupYaml()
 

From c8a8dd91b022388348183435b34fa6ceb7f751e0 Mon Sep 17 00:00:00 2001
From: DevVegeta <shreepatil89@gmail.com>
Date: Wed, 1 Jul 2026 23:50:29 +0530
Subject: [PATCH 17/17] fix(swap_config): raise default min_free_kbytes from
 200 to 67584 (GKE minimum)

---
 .../resources/container_service/swap_config.py         | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/perfkitbenchmarker/resources/container_service/swap_config.py b/perfkitbenchmarker/resources/container_service/swap_config.py
index a38b7adccf..6184220520 100644
--- a/perfkitbenchmarker/resources/container_service/swap_config.py
+++ b/perfkitbenchmarker/resources/container_service/swap_config.py
@@ -35,7 +35,7 @@
         swap_config:
           enabled: true
           swappiness: 100
-          min_free_kbytes: 200
+          min_free_kbytes: 67584
           watermark_scale_factor: 500
           boot_disk_iops: 160000
           boot_disk_throughput: 2400
@@ -75,7 +75,7 @@ class BaseSwapConfig(resource.BaseResource):
   def __init__(
       self,
       swappiness: int = 100,
-      min_free_kbytes: int = 200,
+      min_free_kbytes: int = 67584,
       watermark_scale_factor: int = 500,
   ) -> None:
     super().__init__()
@@ -111,7 +111,7 @@ class GkeSwapConfig(BaseSwapConfig):
 
   Attributes:
     swappiness: vm.swappiness sysctl value (0-200, default 100).
-    min_free_kbytes: vm.min_free_kbytes sysctl (default 200).
+    min_free_kbytes: vm.min_free_kbytes sysctl (default 67584, the GKE minimum).
     watermark_scale_factor: vm.watermark_scale_factor sysctl (default 500).
     lssd: True if the nodepool uses local NVMe SSDs for swap device.
     lssd_count: Number of local NVMe SSDs (dedicatedLocalSsdProfile.diskCount).
@@ -125,7 +125,7 @@ class GkeSwapConfig(BaseSwapConfig):
   def __init__(
       self,
       swappiness: int = 100,
-      min_free_kbytes: int = 200,
+      min_free_kbytes: int = 67584,
       watermark_scale_factor: int = 500,
       lssd: bool = False,
       lssd_count: int = 0,
@@ -272,7 +272,7 @@ class EksSwapConfig(BaseSwapConfig):
   def __init__(
       self,
       swappiness: int = 100,
-      min_free_kbytes: int = 200,
+      min_free_kbytes: int = 67584,
       watermark_scale_factor: int = 500,
       memory_swap_behavior: str = 'LimitedSwap',
       fail_swap_on: bool = False,