diff --git a/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2 b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
new file mode 100644
index 0000000000..29cacfb3ce
--- /dev/null
+++ b/perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2
@@ -0,0 +1,120 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: "{{ ds_name }}"
+  namespace: "{{ ds_namespace }}"
+  labels:
+    app: "{{ ds_label }}"
+spec:
+  selector:
+    matchLabels:
+      app: "{{ ds_label }}"
+  template:
+    metadata:
+      labels:
+        app: "{{ ds_label }}"
+    spec:
+      hostPID: true
+      hostNetwork: true
+      # Pin to the benchmark nodepool — never schedule on the dummy default pool.
+      nodeSelector:
+        pkb_nodepool: "{{ benchmark_nodepool }}"
+      tolerations:
+      - operator: Exists
+      containers:
+      - name: benchmark
+        image: "{{ image }}"
+        command:
+        - bash
+        - -c
+        - |
+          echo "[pkb] Installing measurement tools..."
+          # Only the tools needed for Phase 1 (raw-device fio) and Phase 2
+          # (CPU/I/O overhead) are installed here.  Workload benchmarks
+          # (redis, opensearch, kernel-build) run in separate pods via
+          # existing PKB benchmark modules and are NOT installed here.
+          PKB_APT_OK=0
+          for _attempt in 1 2 3; do
+            apt-get update -qq 2>&1 || true
+            DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
+              fio \
+              cryptsetup \
+              mdadm \
+              sysstat \
+              nvme-cli \
+              2>&1 && PKB_APT_OK=1 && break
+            echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
+            sleep 15
+          done
+          if [ "$PKB_APT_OK" != "1" ] || ! command -v fio >/dev/null 2>&1; then
+            echo "[pkb] FATAL: fio not installed after 3 attempts" >&2
+            exit 1
+          fi
+          echo "[pkb] fio installed: $(fio --version 2>&1 | head -1)"
+          echo "[pkb] Verifying swap device is active..."
+          PKB_SWAP_FOUND=0
+          for _attempt in $(seq 1 30); do
+            if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then
+              PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps)
+              echo "[pkb] Swap device active: $PKB_SWAP_DEV"
+              PKB_SWAP_FOUND=1
+              break
+            fi
+            echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2
+            sleep 5
+          done
+          if [ "$PKB_SWAP_FOUND" != "1" ]; then
+            echo "[pkb] WARNING: no active swap device after 150s — " \
+                 "check linuxConfig.swapConfig / kubelet swap config." >&2
+          fi
+          echo "[pkb] Measurement tools ready. Writing ready sentinel."
+          touch /tmp/pkb_ready
+          sleep infinity
+        securityContext:
+          privileged: true
+          capabilities:
+            add: ["SYS_ADMIN", "IPC_LOCK"]
+        resources:
+          requests:
+            memory: "512Mi"
+        env:
+        - name: NODE_NAME
+          valueFrom:
+            fieldRef:
+              fieldPath: spec.nodeName
+        volumeMounts:
+        - name: dev
+          mountPath: /dev
+        - name: sys
+          mountPath: /sys
+        - name: run
+          mountPath: /run
+        - name: proc-host
+          mountPath: /proc-host
+          readOnly: true
+        - name: stateful-partition
+          mountPath: /mnt/stateful_partition
+        - name: lib-modules
+          mountPath: /lib/modules
+          readOnly: true
+      volumes:
+      - name: dev
+        hostPath:
+          path: /dev
+      - name: sys
+        hostPath:
+          path: /sys
+      - name: run
+        hostPath:
+          path: /run
+      - name: proc-host
+        hostPath:
+          path: /proc
+      - name: stateful-partition
+        hostPath:
+          path: /mnt/stateful_partition
+          type: DirectoryOrCreate
+      - name: lib-modules
+        hostPath:
+          path: /lib/modules
+          type: Directory
diff --git a/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
new file mode 100644
index 0000000000..2f93f57049
--- /dev/null
+++ b/perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py
@@ -0,0 +1,2050 @@
+# Copyright 2026 PerfKitBenchmarker Authors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GKE vs. AWS EKS Swap Encryption and LSSD Performance Benchmark.
+
+Methodology: go/swap-encryption-and-lssd-performance-comparison:gke-vs-aws
+
+== Architecture ==
+
+Provisions a real GKE (GCP) or EKS (AWS) Kubernetes cluster via PKB's
+container_cluster abstraction, then deploys a privileged DaemonSet whose
+pod has full host-device access (/dev, /sys, hostPID).  All benchmark
+phases execute inside this pod via kubectl exec, so measurements reflect
+actual cluster-node behaviour including Kubernetes overhead (kubelet,
+containerd cgroup hierarchy, etc.).
+
+  GKE nodes  ── dm-crypt with ephemeral key (go/node:swap-encryption)
+                 swap device: /dev/mapper/swap_encrypted (over dedicated
+                 hyperdisk or LSSD RAID-0 /dev/md0).
+                 Single-disk fallback: plain loop device on
+                 /mnt/stateful_partition — dm-crypt is blocked by COS
+                 kernel namespace restrictions from inside a pod.
+
+  EKS nodes  ── NVMe Instance Store, Nitro hardware-offloaded encryption
+                 swap device: /dev/nvme1n1 (or auto-detected)
+
+== Resource pattern ==
+
+Infrastructure lifecycle lives in two BaseResource subclasses:
+
+  SwapNodePool
+    _Create():  gcloud container node-pools create with linuxConfig.swapConfig
+                + sysctl via --system-config-from-file; waits for node Ready;
+                optionally creates and attaches a dedicated swap disk.
+    _Delete():  detach+delete disk; delete the nodepool.
+    DeleteDefaultPool(): separate step that removes the dummy e2-medium pool
+                once the pod is Running (avoids nodepool-op API contention).
+
+  SwapDaemonSet  (perfkitbenchmarker/resources/container_service/swap_daemonset.py)
+    _Create():  apply Jinja2 manifest; wait for Running + /tmp/pkb_ready.
+    _Delete():  in-pod swapoff / dmsetup / losetup teardown; kubectl delete.
+    PodExec():  kubectl exec wrapper with transient-reset retry, OOM-kill
+                detection (rc=137), and automatic pod recovery.
+
+Both resources are added to spec.resources in Prepare() and are auto-deleted
+by the PKB framework in Cleanup().
+
+== Benchmark Phases ==
+
+  Phase 1 – fio Microbenchmarks
+    Run fio directly on the swap block device (swapoff first) to measure
+    the hardware + encryption ceiling: random IOPS (4K), sequential
+    bandwidth (1M), and completion latency (iodepth=1).
+
+  Phase 2a – CPU Overhead  (PR2/PR4)
+  Phase 2b – I/O Interference  (PR4)
+  Phase 3a – Redis Latency  (PR5)
+  Phase 3b – Kernel Build  (PR5)
+  Phase 3c – OpenSearch  (PR5)
+"""
+
+import json
+import logging
+import textwrap
+import time
+from typing import Any
+
+from absl import flags
+from perfkitbenchmarker import benchmark_spec as bm_spec_lib
+from perfkitbenchmarker import configs
+from perfkitbenchmarker import errors
+from perfkitbenchmarker import sample
+from perfkitbenchmarker import vm_util
+from perfkitbenchmarker.resources.container_service import kubectl
+from perfkitbenchmarker.resources.container_service import swap_daemonset as _ds_mod
+
+FLAGS = flags.FLAGS
+
+_BenchmarkSpec = bm_spec_lib.BenchmarkSpec
+
+# ---------------------------------------------------------------------------
+# Benchmark identity
+# ---------------------------------------------------------------------------
+
+BENCHMARK_NAME = 'swap_encryption'
+
+
+BENCHMARK_CONFIG = """
+swap_encryption:
+  description: >
+    Verify dm-crypt encrypted swap on GKE/EKS nodes. Swap-enabled 'benchmark' nodepool declared in BENCHMARK_CONFIG;
+    GKE cluster creation applies --system-config-from-file (dm-crypt swapConfig)
+    automatically via swap_config field on NodepoolSpec.
+  container_cluster:
+    cloud: GCP
+    type: Kubernetes
+    vm_count: 1
+    vm_spec:
+      GCP:
+        machine_type: e2-medium
+        boot_disk_size: 20
+        zone: us-central1-a
+    nodepools:
+      benchmark:
+        vm_count: 1
+        vm_spec:
+          GCP:
+            machine_type: n4-highmem-32
+            boot_disk_type: hyperdisk-balanced
+            boot_disk_size: 500
+            zone: us-central1-a
+        swap_config:
+          enabled: true
+          swappiness: 100
+          min_free_kbytes: 200
+          watermark_scale_factor: 500
+          boot_disk_iops: 160000
+          boot_disk_throughput: 2400
+"""
+
+
+_SWAP_DEVICE = flags.DEFINE_string(
+    'swap_encryption_device',
+    '',
+    'Explicit swap block-device path on the cluster node, e.g. '
+    '/dev/nvme1n1 or /dev/dm-0.  When empty the benchmark auto-detects '
+    'via /proc/swaps after setup.',
+)
+
+
+_SWAP_SIZE_GB = flags.DEFINE_integer(
+    'swap_encryption_swap_size_gb',
+    32,
+    'Size in GB of the swap space to configure on the node. '
+    'Ignored when a ready swap device already exists.',
+)
+
+
+_SWAP_TYPE = flags.DEFINE_enum(
+    'swap_encryption_swap_type',
+    'auto',
+    ['auto', 'hyperdisk', 'lssd', 'boot_disk', 'instance_store', 'io2'],
+    'Swap backing storage target, one per methodology test-matrix row:\n'
+    '  GKE:  boot_disk (swap file on the OS boot disk — pd-balanced or '
+    'hyperdisk-balanced, chosen via --swap_encryption_boot_disk_type),\n'
+    '        hyperdisk (dedicated hyperdisk-balanced data disk),\n'
+    '        lssd (dedicated Local SSD RAID-0).\n'
+    '  AWS:  instance_store (NVMe Instance Store, Nitro-encrypted),\n'
+    '        io2 (EBS io2 data/root volume).\n'
+    'dm-crypt is applied on the GKE targets when '
+    '--swap_encryption_enable_dmcrypt is set; AWS targets are encrypted by '
+    'Nitro at the hardware level.  auto = detect from cloud + instance type.',
+)
+
+
+_ENABLE_ZSWAP = flags.DEFINE_boolean(
+    'swap_encryption_enable_zswap',
+    False,
+    'Enable zswap (lz4 compressor, 20%% max pool) before running tests.',
+)
+
+
+_MIN_FREE_KBYTES = flags.DEFINE_integer(
+    'swap_encryption_min_free_kbytes',
+    65536,
+    'Value written to /proc/sys/vm/min_free_kbytes to trigger earlier '
+    'swapping. Set 0 to leave the kernel default unchanged.',
+)
+
+
+_DAEMONSET_IMAGE = flags.DEFINE_string(
+    'swap_encryption_daemonset_image',
+    'ubuntu:22.04',
+    'Container image used for the privileged benchmark DaemonSet pod.',
+)
+
+
+_NODEPOOL = flags.DEFINE_string(
+    'swap_encryption_nodepool',
+    'benchmark',
+    'Name of the node pool to deploy the benchmark DaemonSet on.',
+)
+
+
+_INSTANCE_SIZE_LABEL = flags.DEFINE_string(
+    'swap_encryption_instance_size_label',
+    '',
+    'Human-readable label for the current instance size being tested, e.g. '
+    '"n4-highmem-32" or "i4i.4xlarge".  Stored in sample metadata so that '
+    'results from multiple PKB runs across different instance sizes can be '
+    'collated and compared.  Defaults to the value reported by the cloud '
+    'metadata endpoint inside the pod.',
+)
+
+
+_COLLECT_COST = flags.DEFINE_boolean(
+    'swap_encryption_collect_cost',
+    False,
+    'When True, emit a cost_estimate_usd sample using on-demand pricing '
+    'for the instance type detected at runtime.',
+)
+
+
+_IO2_ENCRYPTED = flags.DEFINE_boolean(
+    'swap_encryption_io2_encrypted',
+    True,
+    'When True (default), the dedicated io2 swap volume is created with EBS '
+    'encryption (Nitro/KMS) -> matrix row "io2 + hardware encryption". '
+    'Set False for the unencrypted io2 baseline row. Only applies when '
+    '--swap_encryption_swap_type=io2 on AWS/EKS.',
+)
+
+
+_IO2_KMS_KEY_ID = flags.DEFINE_string(
+    'swap_encryption_io2_kms_key_id',
+    '',
+    'Optional KMS key id/ARN for the encrypted io2 volume. Empty = the '
+    'account default aws/ebs key. Ignored unless io2_encrypted is True.',
+)
+
+
+_FAIL_ON_DEGRADED = flags.DEFINE_boolean(
+    'swap_encryption_fail_on_degraded',
+    True,
+    'When True (default), raise an error at the end of Run() if the run was '
+    'catastrophically degraded — e.g. the benchmark pod was OOM-evicted and '
+    'replaced mid-run, Gate 1 (fio) produced no samples, or the stress-ng '
+    'swap-pressure phase was OOM-killed before completing.  This prevents PKB '
+    'from reporting SUCCEEDED for a run whose post-eviction phases produced '
+    'empty or meaningless data.  Set False to keep the legacy behaviour of '
+    'always returning whatever partial samples were collected.',
+)
+
+
+_PHASES = flags.DEFINE_list(
+    'swap_encryption_phases',
+    ['all'],
+    'Which Run() phases to execute, for fast iteration against an '
+    'already-provisioned cluster (e.g. --run_stage=run --run_uri=...).  '
+    'Comma-separated subset of: fio (Tier 1 microbenchmarks), 2a (stress-ng '
+    'CPU overhead + swap pressure), 2b (I/O interference), 3a (redis), '
+    '3b (kernel build), 3c (opensearch).  Default "all" runs everything.  '
+    'Example: --swap_encryption_phases=2a runs only the swap-pressure phase. '
+    'Phases not listed are skipped and do not affect the degraded-run gate '
+    '(e.g. skipping fio will not be reported as "Gate 1 produced no samples").',
+)
+
+
+_BENCHMARK_MACHINE_TYPE = flags.DEFINE_string(
+    'swap_encryption_benchmark_machine_type',
+    'n4-highmem-32',
+    'Machine type for the benchmark nodepool created in Prepare(). '
+    'Use n4-highmem-32 (hyperdisk, default) or c4-standard-8-lssd '
+    '(LSSD RAID-0).  The matching swap setup is selected automatically.',
+)
+
+
+_BENCHMARK_LSSD = flags.DEFINE_boolean(
+    'swap_encryption_lssd',
+    False,
+    'Force LSSD RAID-0 swap path even when the machine type name does not '
+    'contain "lssd".  Auto-detected from machine type when False.',
+)
+
+
+_LSSD_COUNT = flags.DEFINE_integer(
+    'swap_encryption_lssd_count',
+    1,
+    'Number of local NVMe SSDs to attach as raw block devices '
+    '(--local-nvme-ssd-block count=N).  Must match the fixed local SSD '
+    'count for the chosen machine type: c4-standard-8-lssd=1, '
+    'c4-standard-16-lssd=2, i4i.4xlarge has NVMe Instance Store (AWS).  '
+    'Default 1 covers most single-lssd machine types.',
+)
+
+
+_ENABLE_DMCRYPT = flags.DEFINE_boolean(
+    'swap_encryption_enable_dmcrypt',
+    True,
+    'When True (default), configure dm-crypt on the swap device — the '
+    '"encryption enabled" column of the test matrix.  Set False to use '
+    'plain swap (encryption disabled column).',
+)
+
+
+_NODE_IMAGE_TYPE = flags.DEFINE_string(
+    'swap_encryption_node_image_type',
+    'UBUNTU_CONTAINERD',
+    'GKE node image type for the benchmark nodepool.  '
+    'UBUNTU_CONTAINERD is required for dm-crypt measurement: COS locks '
+    'down device-mapper at the kernel LSM level and cryptsetup hangs '
+    'indefinitely from any pod context (even privileged, even via nsenter '
+    'into the host mount namespace).  Ubuntu GKE nodes allow cryptsetup '
+    'from privileged pods without restriction.  '
+    'Use COS_CONTAINERD only when dm-crypt is disabled '
+    '(--noswap_encryption_enable_dmcrypt) to measure plain-swap overhead.  '
+    'AL2 on EKS.',
+)
+
+
+_BOOT_DISK_TYPE = flags.DEFINE_string(
+    'swap_encryption_boot_disk_type',
+    'hyperdisk-balanced',
+    'Disk type for the benchmark nodepool boot disk.  Use hyperdisk-balanced '
+    'for production machines (n4, c3, c4 families).  Use pd-ssd for n2/e2 '
+    'dev/test machines, which do not support hyperdisk-balanced.',
+)
+
+
+_BOOT_DISK_IOPS = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_iops',
+    80000,
+    'Provisioned IOPS for the boot disk (hyperdisk-balanced only).  '
+    '80 000 is the COS max-IOPS target.  Ignored for pd-ssd.',
+)
+
+
+_BOOT_DISK_THROUGHPUT = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_throughput',
+    1200,
+    'Provisioned throughput in MB/s for the boot disk (hyperdisk-balanced '
+    'only).  Must be set together with iops.  1200 MB/s pairs with 80 000 '
+    'IOPS for production; use 140 (minimum) for dev/test.  Ignored for '
+    'pd-ssd.',
+)
+
+
+_BOOT_DISK_SIZE_GB = flags.DEFINE_integer(
+    'swap_encryption_boot_disk_size_gb',
+    500,
+    'Boot disk size in GiB for the benchmark nodepool.  500 GiB is '
+    'required for the n4-highmem-32 + hyperdisk-balanced Config 2 run '
+    '(see Engineer Assignments table in execution-plan.md).  '
+    'For LSSD configs the boot disk is smaller; 100 GiB is fine.',
+)
+
+
+_ADD_SWAP_DISK = flags.DEFINE_boolean(
+    'swap_encryption_add_swap_disk',
+    False,
+    'Attach a dedicated second disk to the benchmark nodepool for use as '
+    'the swap device.  Required for dm-crypt measurement on single-boot-disk '
+    'machines (n4-highmem-32, n4-highmem-8) because COS blocks device-mapper '
+    'from pod namespaces.  The second disk is provisioned via '
+    '--additional-node-disk using the same type/IOPS/throughput as the boot '
+    'disk flags.',
+)
+
+
+_SWAP_DISK_SIZE_GB = flags.DEFINE_integer(
+    'swap_encryption_swap_disk_size_gb',
+    500,
+    'Size in GiB of the dedicated swap disk when '
+    '--swap_encryption_add_swap_disk is True.  Must satisfy the '
+    'hyperdisk-balanced IOPS constraint: provisioned_iops ≤ size_gb × 80.',
+)
+
+
+_FIO_RUNTIME_SEC = flags.DEFINE_integer(
+    'swap_encryption_fio_runtime_sec',
+    60,
+    'Wall-clock seconds each fio job runs in Phase 1 microbenchmarks.',
+)
+
+
+_STRESS_VM_BYTES = flags.DEFINE_string(
+    'swap_encryption_stress_vm_bytes',
+    '28G',
+    'stress-ng --vm-bytes value for Phase 2a swap-pressure stressor.  '
+    'Should exceed available node RAM to force sustained paging.',
+)
+
+
+_STRESS_VM_BYTES_LIST = flags.DEFINE_list(
+    'swap_encryption_stress_vm_bytes_list',
+    [],
+    'Comma-separated list of --vm-bytes values to sweep in Phase 2a, '
+    'e.g. "14G,28G,56G".  Overrides --swap_encryption_stress_vm_bytes.',
+)
+
+
+_STRESS_TIMEOUT_SEC = flags.DEFINE_integer(
+    'swap_encryption_stress_timeout_sec',
+    300,
+    'Maximum seconds to wait for the stress-ng swap-pressure phase.',
+)
+
+
+# DaemonSet constants used by both SwapDaemonSet construction and the EKS path.
+_DS_NAME = 'pkb-swap-benchmark'
+_DS_NAMESPACE = 'default'
+_DS_LABEL = 'pkb-swap-benchmark'
+_BENCHMARK_NODEPOOL = 'benchmark'
+
+# Module-level stash for the io2 volume id created by _ensure_io2_volume().
+_IO2_VOLUME_ID = ''
+
+
+def GetConfig(user_config: dict[str, Any]) -> dict[str, Any]:
+    """Build the benchmark config from BENCHMARK_CONFIG and user_config."""
+    return configs.LoadConfig(BENCHMARK_CONFIG, user_config, BENCHMARK_NAME)
+
+
+def Prepare(spec: _BenchmarkSpec) -> None:
+    """Deploy the benchmark DaemonSet, then remove the default nodepool.
+
+    The swap-enabled 'benchmark' nodepool is declared in BENCHMARK_CONFIG
+    (swap_config), so this function does not create it.  Steps:
+      1. Create the SwapDaemonSet on the --swap_encryption_nodepool pool
+         and register it in spec.resources for framework teardown.
+      2. Delete the default pool GKE needed at cluster creation (best-effort).
+      3. Wait for the benchmark pod again and log its post-deletion name.
+
+    Args:
+      spec: Benchmark spec providing container_cluster and resources.
+    """
+    cluster = spec.container_cluster
+
+    logging.info('[swap_encryption] Deploying privileged DaemonSet')
+    daemonset = _ds_mod.SwapDaemonSet(
+        name=_DS_NAME,
+        namespace=_DS_NAMESPACE,
+        label=_DS_LABEL,
+        # Honour the flag; an empty value falls back to the built-in name.
+        nodepool=_NODEPOOL.value or _BENCHMARK_NODEPOOL,
+        image=_DAEMONSET_IMAGE.value,
+    )
+    daemonset.Create()
+    spec.resources.append(daemonset)
+    logging.info('[swap_encryption] Benchmark pod ready: %s', daemonset.pod_name)
+    _delete_default_pool(cluster)
+    daemonset.WaitForPod()
+    logging.info(
+        '[swap_encryption] Benchmark pod (post-deletion): %s', daemonset.pod_name
+    )
+
+
+def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
+    """Set up swap via the benchmark pod, run the phases, and gate the run.
+
+    Sequence: wait for the pod and reset its oom_events / pod_lost lists;
+    apply sysctls, lift the pod's cgroup swap limits, optionally enable zswap;
+    run the gcp / aws / plain-swapfile swap setup; run Phase 1 fio if selected
+    by --swap_encryption_phases (the only phase wired in here; Tier 2/3 are
+    not executed by this function); optionally add a cost sample; append a
+    swap_encryption_run_status sample (1.0 = clean, 0.0 = degraded).
+
+    Returns:
+      All collected samples; the run-status sample is appended last.
+
+    Raises:
+      errors.Benchmarks.RunError: the pod never became ready, or the run was
+        degraded while --swap_encryption_fail_on_degraded is set.
+    """
+    daemonset = _get_daemonset(spec)
+
+    pod = daemonset.WaitForPod()
+    if pod is None:
+        raise errors.Benchmarks.RunError(
+            '[swap_encryption] Benchmark pod never became ready.'
+        )
+    # Reset per-run accumulators before starting phases.
+    daemonset.oom_events.clear()
+    daemonset.pod_lost.clear()
+    original_pod = pod
+    degraded_reasons: list[str] = []
+
+    # ── Swap setup (cloud-specific) ───────────────────────────────────────────
+    daemonset.PodExec('sysctl -w vm.swappiness=100', ignore_failure=True)
+    if _MIN_FREE_KBYTES.value > 0:
+        daemonset.PodExec(
+            f'sysctl -w vm.min_free_kbytes={_MIN_FREE_KBYTES.value}'
+        )
+    daemonset.PodExec(
+        textwrap.dedent("""
+    PKB_CG=$(awk -F: '/^0::/{print $3; exit}' /proc/self/cgroup 2>/dev/null)
+    if [ -n "$PKB_CG" ] && [ -f "/sys/fs/cgroup${PKB_CG}/memory.swap.max" ]; then
+      echo max > "/sys/fs/cgroup${PKB_CG}/memory.swap.max" 2>/dev/null || true
+    fi
+    PKB_CG1=$(awk -F: '/:memory:/{print $3; exit}' /proc/self/cgroup 2>/dev/null)
+    if [ -n "$PKB_CG1" ] && \
+       [ -f "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" ]; then
+      echo -1 > "/sys/fs/cgroup/memory${PKB_CG1}/memory.memsw.limit_in_bytes" \
+        2>/dev/null || true
+    fi
+  """),
+        ignore_failure=True,
+    )
+    if _ENABLE_ZSWAP.value:
+        _enable_zswap(daemonset)
+
+    cloud = _detect_cloud(daemonset)
+    logging.info('[swap_encryption] Detected cloud: %s', cloud)
+    if cloud == 'gcp':
+        _setup_gke_swap(daemonset)
+    elif cloud == 'aws':
+        _setup_eks_swap(daemonset)
+    else:
+        logging.warning(
+            '[swap_encryption] Unknown cloud – falling back to plain swapfile'
+        )
+        _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value)
+
+    swap_dev = _detect_swap_device(daemonset)
+    base_meta = _build_metadata(daemonset, swap_dev)
+    results: list[sample.Sample] = []
+    t_run_start = time.time()
+
+    logging.info('[swap_encryption] swap device: %s', swap_dev)
+
+    # ── Phase 1: fio microbenchmarks on raw swap device ───────────────────────
+    if _phase_selected('fio'):
+        logging.info(
+            '[swap_encryption] Phase 1: fio microbenchmarks on %s', swap_dev
+        )
+        try:
+            phase1_samples = _run_phase1_fio(daemonset, swap_dev, base_meta)
+            results += phase1_samples
+            if not phase1_samples:
+                degraded_reasons.append(
+                    'Phase 1 (fio) produced no samples — '
+                    'check fio install and swap device accessibility'
+                )
+                logging.error('[swap_encryption] Phase 1: no samples produced')
+        except Exception as e:  # pylint: disable=broad-except
+            degraded_reasons.append(f'Phase 1 fio failed: {e}')
+            logging.error('[swap_encryption] Phase 1 fio error: %s', e)
+
+    # ── Cost estimate ─────────────────────────────────────────────────────────
+    if _COLLECT_COST.value:
+        elapsed = time.time() - t_run_start
+        results += _collect_cost_sample(daemonset, elapsed, base_meta)
+
+    # ── Final degradation gate ────────────────────────────────────────────────
+    if daemonset.pod_name and daemonset.pod_name != original_pod:
+        degraded_reasons.append(
+            f'benchmark pod was replaced during the run ({original_pod} →'
+            f' {daemonset.pod_name}) — it was OOM-evicted under swap'
+            ' pressure; phases executed after the eviction ran against a'
+            ' freshly-initialised pod (empty /tmp, swap re-setup) and may'
+            ' be invalid'
+        )
+    if daemonset.pod_lost:
+        degraded_reasons.append(
+            'benchmark pod(s) went NotFound during the run'
+            f' ({", ".join(daemonset.pod_lost)}) — the pod died (node'
+            ' memory-pressure eviction or container exit) and any phase'
+            ' running at or after that point produced invalid data'
+        )
+    if daemonset.oom_events:
+        degraded_reasons.append(
+            'OOM kill(s) (rc=137) occurred during the run on pod(s) '
+            f'{", ".join(daemonset.oom_events)} — a phase exceeded memory'
+            ' and was killed by the OOM killer; the affected phase(s)'
+            ' produced no or partial data'
+        )
+
+    degraded = bool(degraded_reasons)
+    results.append(
+        sample.Sample(
+            'swap_encryption_run_status',
+            0.0 if degraded else 1.0,
+            'status',
+            dict(
+                base_meta,
+                degraded=degraded,
+                degraded_reasons='; '.join(degraded_reasons) or 'none',
+                num_samples=len(results) + 1,
+            ),
+        )
+    )
+
+    if degraded:
+        msg = '[swap_encryption] RUN DEGRADED — ' + '; '.join(degraded_reasons)
+        logging.error(msg)
+        if _FAIL_ON_DEGRADED.value:
+            raise errors.Benchmarks.RunError(msg)
+    else:
+        logging.info(
+            '[swap_encryption] Run completed cleanly (%d samples)',
+            len(results),
+        )
+
+    return results
+
+
+
+def _delete_default_pool(cluster) -> None:
+    """Delete the default nodepool, best-effort: failures are only logged.
+
+    Run after the pod is Running; concurrent nodepool ops can time out the API.
+    """
+    try:
+        cmd = cluster._GcloudCommand(  # pylint: disable=protected-access
+            'container', 'node-pools', 'delete', _DEFAULT_POOL,
+            '--cluster', cluster.name,
+        )
+        cmd.args.append('--quiet')
+        logging.info(
+            '[swap_encryption] Deleting default nodepool: %s', _DEFAULT_POOL
+        )
+        _, stderr, rc = cmd.Issue(timeout=300, raise_on_failure=False)
+        if rc == 0:
+            logging.info('[swap_encryption] Default nodepool deleted')
+            return
+        logging.warning(
+            '[swap_encryption] Could not delete default nodepool (rc=%d): %s',
+            rc, stderr,
+        )
+    except Exception as e:  # pylint: disable=broad-except
+        logging.warning('[swap_encryption] _delete_default_pool failed: %s', e)
+
+
+def Cleanup(spec: _BenchmarkSpec) -> None:
+    """No-op: the PKB framework deletes everything in spec.resources.
+
+    SwapDaemonSet._Delete() does the in-pod teardown (swapoff, dmsetup remove,
+    losetup cleanup, pkill fio/stress-ng) and then deletes the DaemonSet;
+    SwapNodePool._Delete() removes the swap disk (if any) and the nodepool.
+    """
+
+
+# ---------------------------------------------------------------------------
+# Internal helpers
+# ---------------------------------------------------------------------------
+
+
+def _get_daemonset(spec: _BenchmarkSpec) -> _ds_mod.SwapDaemonSet:
+    """Return the first SwapDaemonSet registered in spec.resources.
+
+    Raises errors.Benchmarks.RunError when none is registered.
+    """
+    for resource in spec.resources:
+        if isinstance(resource, _ds_mod.SwapDaemonSet):
+            return resource
+    raise errors.Benchmarks.RunError(
+        '[swap_encryption] SwapDaemonSet not found in spec.resources —'
+        ' was Prepare() called?'
+    )
+
+
+def _phase_selected(token: str) -> bool:
+    """Tell whether phase `token` is enabled by --swap_encryption_phases.
+
+    Flag entries are stripped and lower-cased, blanks dropped; an empty
+    result or an 'all' entry enables every phase.
+    """
+    wanted = {p.strip().lower() for p in _PHASES.value if p.strip()}
+    return not wanted or 'all' in wanted or token.lower() in wanted
+
+
+def _ensure_io2_volume() -> None:
+    """Create a dedicated io2 EBS volume and attach it to the benchmark node.
+
+    No-op unless --swap_encryption_swap_type=io2.  Best-effort: failures are
+    logged as warnings and nothing is raised.  The instance and AZ are parsed
+    from the first listed node's providerID (aws:///<az>/<instance-id>).
+    On success the volume id is stashed in _IO2_VOLUME_ID for serial-based
+    device detection in _setup_eks_io2_swap; if attach-volume fails the id is
+    not stashed and the unattached volume is deleted.
+    """
+    global _IO2_VOLUME_ID
+    if _SWAP_TYPE.value != 'io2':
+        return
+    out, _, rc = kubectl.RunKubectlCommand(
+        ['get', 'nodes', '-o', 'jsonpath={.items[0].spec.providerID}'],
+        raise_on_failure=False,
+    )
+    provider = (out or '').strip()  # aws:///us-east-1a/i-0abc...
+    parts = [p for p in provider.split('/') if p]  # ['aws:', az, instance-id]
+    if rc != 0 or 'aws://' not in provider or len(parts) < 3:
+        logging.warning(
+            '[swap_encryption] io2 attach skipped: could not resolve '
+            'EC2 instance from providerID=%r',
+            provider,
+        )
+        return
+    instance_id, az = parts[-1], parts[-2]
+    region = az[:-1]
+    base = ['aws', 'ec2', '--region', region]
+    try:
+        create_args = [
+            'create-volume', '--volume-type', 'io2', '--size', '500',
+            '--iops', '16000', '--availability-zone', az,
+            '--tag-specifications',
+            'ResourceType=volume,Tags=[{Key=pkb,Value=swap_encryption}]',
+        ]
+        if _IO2_ENCRYPTED.value:
+            create_args.append('--encrypted')
+            if _IO2_KMS_KEY_ID.value:
+                create_args += ['--kms-key-id', _IO2_KMS_KEY_ID.value]
+            logging.info(
+                '[swap_encryption] io2 volume will be EBS-encrypted '
+                '(row: hardware encryption)'
+            )
+        else:
+            logging.info(
+                '[swap_encryption] io2 volume UNENCRYPTED (baseline row)'
+            )
+        create_args += ['--query', 'VolumeId', '--output', 'text']
+        vol_id, _, vrc = vm_util.IssueCommand(
+            base + create_args, raise_on_failure=False
+        )
+        vol_id = (vol_id or '').strip()
+        if vrc != 0 or not vol_id.startswith('vol-'):
+            logging.warning(
+                '[swap_encryption] io2 create-volume failed: %r', vol_id
+            )
+            return
+        vm_util.IssueCommand(
+            base + ['wait', 'volume-available', '--volume-ids', vol_id],
+            raise_on_failure=False,
+        )
+        attach_args = [
+            'attach-volume', '--volume-id', vol_id,
+            '--instance-id', instance_id, '--device', '/dev/sdf',
+        ]
+        _, attach_err, attach_rc = vm_util.IssueCommand(
+            base + attach_args, raise_on_failure=False
+        )
+        if attach_rc != 0:
+            # Never report success for a volume that did not reach the node.
+            logging.warning(
+                '[swap_encryption] io2 attach-volume failed for %s (rc=%d): '
+                '%s; deleting the unattached volume',
+                vol_id, attach_rc, attach_err,
+            )
+            vm_util.IssueCommand(
+                base + ['delete-volume', '--volume-id', vol_id],
+                raise_on_failure=False,
+            )
+            return
+        vm_util.IssueCommand(
+            base + ['wait', 'volume-in-use', '--volume-ids', vol_id],
+            raise_on_failure=False,
+        )
+        _IO2_VOLUME_ID = vol_id
+        logging.info(
+            '[swap_encryption] Attached io2 volume %s to %s as /dev/sdf',
+            vol_id, instance_id,
+        )
+        time.sleep(15)  # allow the NVMe device node to appear
+    except Exception as e:  # pylint: disable=broad-except
+        logging.warning(
+            '[swap_encryption] io2 attach error (continuing): %s', e
+        )
+
+
+def _configure_eks_kubelet_swap(spec) -> None:
+    """Configure EKS kubelet for LimitedSwap via nodeadm bootstrap.
+
+    NOTE: Deferred — requires Ajay's PR #6780 (SwapConfigSpec + nodeadm
+    integration) to merge.  When that lands, EKS node pools should include
+    a preBootstrapCommands block writing nodeadm config with
+    memorySwapBehavior: LimitedSwap before kubelet starts::
+
+      apiVersion: node.eks.aws/v1alpha1
+      kind: NodeConfig
+      spec:
+        kubelet:
+          config:
+            memorySwapBehavior: LimitedSwap
+            failSwapOn: false
+
+    GKE equivalent: linuxConfig.swapConfig via --system-config-from-file
+    (swapConfig automatically enables memorySwapBehavior=LimitedSwap),
+    already implemented in SwapNodePool._CreateNodePool().
+
+    See: https://github.com/GoogleCloudPlatform/PerfKitBenchmarker/pull/6780
+    """
+    logging.warning(
+        '[swap_encryption] EKS kubelet LimitedSwap config via nodeadm is '
+        'deferred (blocked on PR #6780 — SwapConfigSpec). '
+        'EKS nodes will use default kubelet swap settings until that PR merges.'
+    )
+
+
+def _detect_cloud(daemonset: _ds_mod.SwapDaemonSet) -> str:
+    """Detect GCP vs AWS from DMI product info exposed via /sys hostPath mount.
+
+    DMI is the most reliable in-container detection method because it reads
+    directly from the host kernel's SMBIOS table via /sys (already mounted).
+    It avoids HTTP metadata endpoint quoting issues and network timeouts.
+
+    Falls back to metadata HTTP endpoints if DMI is inconclusive.
+    """
+    # Primary: DMI product name / vendor (available via /sys hostPath mount)
+    dmi_out, _ = daemonset.PodExec(
+        'cat /sys/class/dmi/id/sys_vendor /sys/class/dmi/id/product_name '
+        '/sys/class/dmi/id/bios_vendor 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    dmi = dmi_out.strip().lower()
+    if 'google' in dmi:
+        logging.info(
+            '[swap_encryption] Cloud detected via DMI: gcp (%s)',
+            dmi_out.strip(),
+        )
+        return 'gcp'
+    if any(k in dmi for k in ('amazon', 'ec2', 'aws')):
+        logging.info(
+            '[swap_encryption] Cloud detected via DMI: aws (%s)',
+            dmi_out.strip(),
+        )
+        return 'aws'
+
+    # Secondary: GCP metadata endpoint.
+    gcp_out, _ = daemonset.PodExec(
+        'curl -s -m 3 '
+        'http://metadata.google.internal/computeMetadata/v1/instance/zone '
+        '-H Metadata-Flavor:Google 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if gcp_out.strip():
+        logging.info('[swap_encryption] Cloud detected via metadata: gcp')
+        return 'gcp'
+
+    # Tertiary: AWS IMDS (IMDSv2 token-based; IMDSv1 is often disabled).
+    aws_out, _ = daemonset.PodExec(
+        'T=$(curl -s -m 3 -X PUT '
+        'http://169.254.169.254/latest/api/token '
+        '-H "X-aws-ec2-metadata-token-ttl-seconds: 60" 2>/dev/null); '
+        'curl -s -m 3 -H "X-aws-ec2-metadata-token: $T" '
+        'http://169.254.169.254/latest/meta-data/instance-id '
+        '2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if aws_out.strip():
+        logging.info('[swap_encryption] Cloud detected via IMDS: aws')
+        return 'aws'
+
+    logging.warning(
+        '[swap_encryption] Could not detect cloud from DMI or metadata'
+    )
+    return 'unknown'
+
+
+def _setup_gke_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Configure dm-crypt swap on the GKE node, mirroring go/node:swap-encryption.
+
+    GKE nodes use dm-crypt with an ephemeral random key so that swap contents
+    are encrypted at rest without requiring persistent key management.
+    We replicate this exactly using cryptsetup in plain mode (no LUKS header).
+    """
+    swap_type = _SWAP_TYPE.value
+    if swap_type == 'auto':
+        # Check whether Local SSDs are present
+        lssd_out, _ = daemonset.PodExec(
+            "lsblk -d -o NAME,MODEL | grep -i 'local\\|nvme' | "
+            "grep -v 'nvme0' | awk '{print $1}' | head -1",
+            ignore_failure=True,
+        )
+        swap_type = 'lssd' if lssd_out.strip() else 'hyperdisk'
+
+    if swap_type == 'lssd':
+        _setup_gke_lssd_swap(daemonset)
+    elif swap_type == 'boot_disk':
+        _setup_gke_bootdisk_swap(daemonset)
+    else:
+        _setup_gke_hyperdisk_swap(daemonset)
+
+
+def _setup_gke_hyperdisk_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Configure swap on a dedicated non-boot disk, optionally under dm-crypt.
+
+    Disk detection is split into two separate commands so that the boot-device
+    name is resolved first and then substituted as a literal string — nested
+    $() expansions inside a kubectl exec bash -c argument are unreliable.
+
+    If no non-boot disk is found (single-disk node), setup is delegated to
+    _setup_gke_loop_device_swap().  Otherwise the selected disk is wiped and
+    used directly: behind a dm-crypt mapping named swap_encrypted when
+    _ENABLE_DMCRYPT is set, or as plain swap when it is not.
+
+    Args:
+      daemonset: DaemonSet wrapper.  PodExec() runs a shell snippet in the pod
+        and returns a 2-tuple whose first element is used as command output.
+    """
+    logging.info('[swap_encryption] GKE: setting up dm-crypt on hyperdisk')
+
+    # Step 1: identify the boot device name (e.g. "nvme0n1", "sda")
+    boot_out, _ = daemonset.PodExec(
+        'lsblk -no pkname "$(findmnt -n -o SOURCE /)" 2>/dev/null | head -1',
+        ignore_failure=True,
+    )
+    # NOTE(review): when the probe yields nothing (e.g. the pod's "/" is an
+    # overlay rather than a block device) the boot disk is assumed to be
+    # nvme0n1.  On a node whose boot disk has another name, step 2 could then
+    # select the boot disk itself — confirm this is acceptable.
+    boot_base = boot_out.strip() or 'nvme0n1'
+    logging.info('[swap_encryption] GKE: boot device: %s', boot_base)
+
+    # Step 2: find a non-boot disk using the literal name from step 1
+    disk_out, _ = daemonset.PodExec(
+        "lsblk -d -o NAME,TYPE | awk '$2==\"disk\"{print $1}' "
+        f"| grep -v '^{boot_base}$' | head -1",
+        ignore_failure=True,
+    )
+    disk_name = disk_out.strip()
+
+    if not disk_name:
+        # Single-disk node: hand over to the loop-device path and stop here.
+        logging.info(
+            '[swap_encryption] No dedicated data disk found – '
+            'falling back to loop device on /mnt/stateful_partition '
+            '(direct-io=on, dm-crypt=%s)',
+            _ENABLE_DMCRYPT.value,
+        )
+        _setup_gke_loop_device_swap(daemonset)
+        return
+
+    disk = f'/dev/{disk_name}'
+    logging.info(
+        '[swap_encryption] GKE: swap target disk: %s  dmcrypt=%s',
+        disk,
+        _ENABLE_DMCRYPT.value,
+    )
+
+    # Clean up any stale mapping from a previous failed run.
+    # wipefs -a also erases every filesystem/RAID signature on the target
+    # disk; all three commands are best-effort (|| true, ignore_failure).
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+    dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    wipefs -a {disk} 2>/dev/null || true
+  """),
+        ignore_failure=True,
+    )
+
+    if _ENABLE_DMCRYPT.value:
+        # We cannot use cryptsetup open from inside a container because
+        # libdevmapper calls dm_udev_wait() after creating the target, which
+        # blocks on /run/udev/control.  That socket belongs to udevd which is
+        # not running inside the container — so cryptsetup hangs forever.
+        #
+        # Instead we drive dmsetup directly with --noudevrules --noudevsync,
+        # which skips all udev synchronisation, and call dmsetup mknodes to
+        # ensure /dev/mapper/swap_encrypted appears without udev.
+        #
+        # insmod (not modprobe) loads the kernel module: modprobe also talks to
+        # systemd-udevd and can deadlock from a container for the same reason.
+        #
+        # The key is 32 bytes from /dev/urandom, hex-encoded, held only in a
+        # shell variable and unset right after the mapping is created.  The
+        # table covers the whole disk (SIZE is blockdev --getsz, in sectors).
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      grep -q dm_crypt /proc/modules 2>/dev/null || {{
+        KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1)
+        [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true
+      }}
+      KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n')
+      SIZE=$(blockdev --getsz {disk})
+      printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{disk}" | \\
+        dmsetup create swap_encrypted --noudevrules --noudevsync
+      unset KEY
+      dmsetup mknodes swap_encrypted 2>/dev/null || true
+      mkswap /dev/mapper/swap_encrypted
+      swapon /dev/mapper/swap_encrypted
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: dm-crypt swap active on '
+            '/dev/mapper/swap_encrypted'
+        )
+    else:
+        # Encryption-disabled column of the test matrix
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      mkswap {disk} && \\
+      swapon {disk}
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: plain (unencrypted) swap active on %s', disk
+        )
+
+
+def _setup_gke_loop_device_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Plain loop-device swap for single-disk GKE nodes (no dedicated swap disk).
+
+    Used when _setup_gke_hyperdisk_swap finds no dedicated second disk (e.g.
+    n2-highmem-32 / n4-highmem-32 single-boot-disk nodes, regardless of image
+    type).
+
+    This path never layers dm-crypt on the loop device, whatever the value of
+    the dm-crypt flag.  The original author's stated reasons:
+    1. On COS (Container-Optimised OS): the device-mapper kernel subsystem is
+       inaccessible from inside a Kubernetes pod (even privileged).  Calls to
+       cryptsetup/dmsetup block indefinitely and are killed by the PKB timeout.
+       This is a deliberate COS security restriction, not a permissions issue.
+    2. On UBUNTU_CONTAINERD: the loop device is created in the container
+       namespace; its behaviour under nsenter is untested, so plain loop swap
+       is used for safety.
+
+    Phase 1 (fio) is skipped for plain loop devices — the goal is enc-on vs
+    enc-off comparison, and fio on a plain loop device measures the backing
+    filesystem rather than the swap stack.  Tiers 2–6 (stress-ng, Redis,
+    kernel build, OpenSearch) run normally.
+
+    For dm-crypt measurement on GCP use a machine type with local NVMe (LSSD)
+    or provision a dedicated hyperdisk on a second disk slot (n4-highmem-32+).
+
+    Backing-file placement:
+    - The file lives on /mnt/stateful_partition (ext4), not the container
+      overlayfs — avoids the overlayfs O_DIRECT limitation.
+    - losetup --direct-io=on passes I/O through to the host ext4, reducing
+      double-buffering for Tiers 2–6 workloads.
+
+    Args:
+      daemonset: DaemonSet wrapper.  PodExec() runs a shell snippet in the pod
+        and returns a 2-tuple whose first element is used as command output.
+
+    Raises:
+      RuntimeError: if the losetup step does not print a /dev/loop* path.
+    """
+    size_gb = _SWAP_SIZE_GB.value
+    backing = '/mnt/stateful_partition/pkb_swap_backing'
+
+    # ── Step 0: detach any stale loop device from a previous failed run ───────
+    # The single trailing backslash on the first script line is consumed by
+    # Python as a string line-continuation, so the shell receives
+    # "... | while read dev" on one line.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | \
+      while read dev
+      do
+        swapoff "$dev" 2>/dev/null || true
+        losetup -d "$dev" 2>/dev/null || true
+      done
+    rm -f {backing}
+  """),
+        ignore_failure=True,
+    )
+
+    # ── Step 1: allocate backing file on stateful partition (ext4) ───────────
+    logging.info(
+        '[swap_encryption] GKE: creating %dG backing file on'
+        ' stateful_partition',
+        size_gb,
+    )
+    # truncate is the fallback when fallocate fails.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    fallocate -l {size_gb}G {backing} 2>/dev/null || \\
+      truncate -s {size_gb}G {backing}
+  """),
+    )
+
+    # ── Step 2: loop device with direct-io passthrough ───────────────────────
+    # The script echoes the attached loop device path as its only output.
+    loop_out, _ = daemonset.PodExec(
+        textwrap.dedent(f"""
+    LOOP=$(losetup -f) && \\
+    losetup --direct-io=on "$LOOP" {backing} && \\
+    echo "$LOOP"
+  """),
+    )
+    loop_dev = loop_out.strip()
+    if not loop_dev.startswith('/dev/loop'):
+        raise RuntimeError(
+            f'[swap_encryption] losetup failed – output: {loop_out!r}'
+        )
+    logging.info(
+        '[swap_encryption] GKE: loop device: %s  direct-io=on', loop_dev
+    )
+
+    # ── Step 3: plain mkswap + swapon (dm-crypt skipped on loop devices) ────────
+    daemonset.PodExec(f'mkswap {loop_dev}')
+    daemonset.PodExec(f'swapon {loop_dev}')
+    # Logged at WARNING so the unencrypted fallback is visible in run logs.
+    logging.warning(
+        '[swap_encryption] GKE: plain loop swap active on %s '
+        '(dm-crypt unavailable from COS pod — device-mapper is blocked by '
+        'COS kernel namespace restrictions). '
+        'Phase 1 (fio) will be skipped. '
+        'Use a machine with LSSD (c4-*-lssd) or attach a dedicated second '
+        'hyperdisk for dm-crypt measurement.',
+        loop_dev,
+    )
+
+
+def _setup_gke_bootdisk_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Swap on the OS BOOT disk — methodology Table 0 rows 1-4.
+
+    Creates a loop-backed swap file on /mnt/stateful_partition (the node's boot
+    disk, whose type — pd-balanced or hyperdisk-balanced — is chosen at
+    nodepool-creation time via --swap_encryption_boot_disk_type).  dm-crypt is
+    layered on the loop device when --swap_encryption_enable_dmcrypt is set
+    (encryption-on rows 2/4); otherwise plain swap is used (encryption-off rows
+    1/3).
+
+    Reuses the same loop-creation and dmsetup patterns as the LSSD/hyperdisk
+    paths — no shared provider module is touched.  Requires an Ubuntu node image
+    (dm-crypt from a pod is blocked on COS).
+
+    Args:
+      daemonset: DaemonSet wrapper.  PodExec() runs a shell snippet in the pod
+        and returns a 2-tuple whose first element is used as command output.
+
+    Raises:
+      RuntimeError: if the losetup step does not print a /dev/loop* path.
+    """
+    size_gb = _SWAP_SIZE_GB.value
+    backing = '/mnt/stateful_partition/pkb_swap_backing'
+    logging.info(
+        '[swap_encryption] GKE: boot-disk swap (%dG backing, dmcrypt=%s)',
+        size_gb,
+        _ENABLE_DMCRYPT.value,
+    )
+
+    # Clean up any stale loop/mapping from a previous run.
+    # Order matters: the dm mapping is removed before the loop device beneath
+    # it is detached, and the backing file is deleted last.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+    dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    losetup -j {backing} 2>/dev/null | awk -F: '{{print $1}}' | while read d
+    do
+      swapoff "$d" 2>/dev/null || true
+      losetup -d "$d" 2>/dev/null || true
+    done
+    rm -f {backing}
+  """),
+        ignore_failure=True,
+    )
+
+    # Allocate the backing file on the boot-disk ext4 stateful partition.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    fallocate -l {size_gb}G {backing} 2>/dev/null || truncate -s {size_gb}G {backing}
+  """),
+    )
+
+    # Attach the file to the first free loop device and echo that device.
+    loop_out, _ = daemonset.PodExec(
+        textwrap.dedent(f"""
+    LOOP=$(losetup -f) && losetup --direct-io=on "$LOOP" {backing} && echo "$LOOP"
+  """),
+    )
+    # Only the last output line is taken as the device path, so any earlier
+    # lines in the captured output are ignored.
+    loop_dev = (
+        loop_out.strip().splitlines()[-1].strip() if loop_out.strip() else ''
+    )
+    if not loop_dev.startswith('/dev/loop'):
+        raise RuntimeError(
+            f'[swap_encryption] boot-disk losetup failed: {loop_out!r}'
+        )
+    logging.info('[swap_encryption] GKE: boot-disk loop device: %s', loop_dev)
+
+    if _ENABLE_DMCRYPT.value:
+        # Load dm-crypt with insmod if absent, then build the mapping with
+        # dmsetup (udev sync disabled) using a throwaway 32-byte random key
+        # that is unset as soon as the mapping exists.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      grep -q dm_crypt /proc/modules 2>/dev/null || {{
+        KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1)
+        [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true
+      }}
+      KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n')
+      SIZE=$(blockdev --getsz {loop_dev})
+      printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{loop_dev}" | \\
+        dmsetup create swap_encrypted --noudevrules --noudevsync
+      unset KEY
+      dmsetup mknodes swap_encrypted 2>/dev/null || true
+      mkswap /dev/mapper/swap_encrypted
+      swapon /dev/mapper/swap_encrypted
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: boot-disk dm-crypt swap active on '
+            '/dev/mapper/swap_encrypted'
+        )
+    else:
+        # Encryption-off rows: swap directly on the loop device.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      mkswap {loop_dev} && swapon {loop_dev}
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: boot-disk plain swap active on %s', loop_dev
+        )
+
+
+def _setup_gke_lssd_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Configure swap on local SSD(s): RAID-0 when several, optional dm-crypt.
+
+    Reference design: go/gke-swap-lssd.
+
+    Sequence:
+      1. Tear down any swap / swap_encrypted mapping left by an earlier run.
+      2. Log the block-device topology.
+      3. Select candidates: non-rotational whole disks that have neither
+         partitions nor mounted filesystems.  With no candidate, delegate to
+         _setup_gke_hyperdisk_swap() and return.
+      4. Best-effort cleanup of holders, mounts, md arrays and signatures on
+         the candidates.
+      5. Re-check that the candidates are unpartitioned.  If none is, delegate
+         to _setup_gke_lssd_stateful_loop_swap() and return.
+      6. With more than one device, stripe them into /dev/md0 via mdadm;
+         with exactly one, use it directly.
+      7. Enable swap on the result: behind a swap_encrypted dm-crypt mapping
+         when _ENABLE_DMCRYPT is set, plain otherwise.
+
+    Args:
+      daemonset: DaemonSet wrapper.  PodExec() runs a shell snippet in the pod
+        and returns a 2-tuple whose first element is used as command output.
+    """
+    logging.info('[swap_encryption] GKE: setting up LSSD RAID-0 swap')
+
+    # Reused-node hygiene: a previous run on this node may have left an ACTIVE
+    # dm-crypt swap (e.g. /dev/nvme0n1 └─swap_encrypted [SWAP]).  That makes the
+    # LSSD look "unclean/busy" to the device selector below, which then wrongly
+    # falls back to the hyperdisk path and tries the boot disk.  Tear down any
+    # prior PKB swap mapping FIRST so the underlying LSSD is freed and selectable.
+    daemonset.PodExec(
+        textwrap.dedent("""
+    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+    swapoff -a 2>/dev/null || true
+    dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+  """),
+        ignore_failure=True,
+    )
+
+    # Log the full block-device topology up front for diagnosis (every prior
+    # swap failure traced back to picking the wrong device).
+    topo, _ = daemonset.PodExec(
+        'lsblk -o NAME,TYPE,SIZE,ROTA,MOUNTPOINT 2>/dev/null',
+        ignore_failure=True,
+    )
+    logging.info(
+        '[swap_encryption] block device topology:\n%s', (topo or '').strip()
+    )
+
+    # Identify candidate swap devices = whole disks that are NOT the boot/OS
+    # disk.  We must NOT rely on a device name (boot disk enumerates as nvme0n1
+    # on some nodes, nvme1n1 on others) and we cannot use `findmnt /` because the
+    # container root is an overlay.  Instead we EXCLUDE any disk that:
+    #   * has partition children (boot disk has p1/p14/p15/p16), or
+    #   * has any mounted filesystem (itself or a child).
+    # A raw local SSD intended for swap has neither.  This robustly prevents the
+    # catastrophic bug where the 100 GB boot disk (root mounted) was RAIDed into
+    # the swap device, yielding a non-functional swap (fio empty + stress OOM).
+    #
+    # The ROTA==0 filter restricts the scan to non-rotational devices; the
+    # script prints one /dev/<name> per surviving device.
+    lssd_out, _ = daemonset.PodExec(
+        textwrap.dedent("""
+        for d in $(lsblk -dno NAME,ROTA | awk '$2==0{print $1}')
+        do
+          if lsblk -no TYPE "/dev/$d" 2>/dev/null | grep -q '^part$'; then
+            continue   # has partitions -> boot/OS disk
+          fi
+          if lsblk -no MOUNTPOINT "/dev/$d" 2>/dev/null | grep -q '[^[:space:]]'; then
+            continue   # mounted somewhere -> not a free swap device
+          fi
+          echo "/dev/$d"
+        done
+      """),
+        ignore_failure=True,
+    )
+    devices = [d.strip() for d in lssd_out.strip().splitlines() if d.strip()]
+    if not devices:
+        logging.warning(
+            '[swap_encryption] No clean (unpartitioned, unmounted) local SSD'
+            ' found — falling back to hyperdisk swap path'
+        )
+        _setup_gke_hyperdisk_swap(daemonset)
+        return
+
+    # Space-separated form, interpolated into the shell snippets below.
+    device_list = ' '.join(devices)
+    n = len(devices)
+    logging.info(
+        '[swap_encryption] GKE: LSSD RAID-0 across %d clean device(s): '
+        '%s  dmcrypt=%s',
+        n,
+        device_list,
+        _ENABLE_DMCRYPT.value,
+    )
+
+    # Clean up stale mappings, RAID arrays, and GKE-managed mounts.
+    #
+    # GKE UBUNTU nodes run google-ssd-startup.service at boot which formats
+    # local NVMe SSDs as ext4 and mounts them at /mnt/disks/ssd0 etc. even
+    # when --local-nvme-ssd-block is set.  The mount makes the block device
+    # busy so mdadm/wipefs fail silently (we had || true).  We must unmount
+    # those paths first.  /proc-host/mounts reflects the host mount table
+    # (hostPID:true + privileged gives us access).
+    #
+    # pkb_swap is the dm-crypt device created by the node startup script (for
+    # single-LSSD nodes it holds /dev/nvme1n1 directly without an md0 layer).
+    #
+    # The script first dumps diagnostics (mdstat, dmsetup, swaps, mounts,
+    # sysfs holders), then for each device removes its holders (md arrays via
+    # mdadm --stop, everything else via dmsetup remove) and unmounts it, and
+    # finally clears swap, mappings, md superblocks and signatures.  Every
+    # step is best-effort.
+    # NOTE(review): `swapoff -a`, `mdadm --stop --scan` and `losetup -D` are
+    # not limited to the selected devices — confirm that is intended.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    echo "[pkb-lssd-cleanup] /proc/mdstat:" >&2
+    cat /proc/mdstat 2>/dev/null || true
+    echo "[pkb-lssd-cleanup] dmsetup ls:" >&2
+    dmsetup ls 2>/dev/null || true
+    echo "[pkb-lssd-cleanup] /proc/swaps:" >&2
+    cat /proc/swaps 2>/dev/null || true
+    echo "[pkb-lssd-cleanup] host mounts on {device_list}:" >&2
+    grep -E '{('|'.join(devices))}' /proc-host/mounts 2>/dev/null || true
+    echo "[pkb-lssd-cleanup] sysfs holders:" >&2
+    for dev in {device_list}
+    do
+      devname=$(basename "$dev")
+      ls -1 /sys/block/$devname/holders/ 2>/dev/null | while read h
+      do
+        echo "[pkb-lssd-cleanup]   $dev held by $h" >&2
+      done
+    done
+    echo "[pkb-lssd-cleanup] --- begin teardown ---" >&2
+    for dev in {device_list}
+    do
+      test -b "$dev" || continue
+      devname=$(basename "$dev")
+      for holder in /sys/block/$devname/holders/*
+      do
+        test -e "$holder" || continue
+        h=$(basename "$holder")
+        echo "[pkb-lssd-cleanup] removing holder /dev/$h from $dev" >&2
+        if echo "$h" | grep -q "^md"
+        then
+          mdadm --stop /dev/$h 2>/dev/null || true
+        else
+          dmsetup remove --force --noudevrules --noudevsync /dev/$h 2>/dev/null || true
+        fi
+      done
+      mounts=$(awk -v d="$dev" '$1==d{{print $2}}' /proc-host/mounts 2>/dev/null || true)
+      for mp in $mounts
+      do
+        echo "[pkb-lssd-cleanup] unmounting $mp from $dev" >&2
+        umount -f "$mp" 2>/dev/null || true
+      done
+    done
+    swapoff -a 2>/dev/null || true
+    swapoff /dev/mapper/pkb_swap 2>/dev/null || true
+    swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
+    dmsetup remove --force --noudevrules --noudevsync pkb_swap 2>/dev/null || true
+    dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    mdadm --stop --scan 2>/dev/null || true
+    mdadm --zero-superblock {device_list} 2>/dev/null || true
+    wipefs -a {device_list} 2>/dev/null || true
+    echo "[pkb-lssd-cleanup] lsblk after wipefs:" >&2
+    lsblk {device_list} 2>/dev/null || true
+    partx -u {device_list} 2>/dev/null || true
+    losetup -D 2>/dev/null || true
+    rm -f /mnt/stateful_partition/pkb_swap.img 2>/dev/null || true
+    sleep 2
+  """),
+        ignore_failure=True,
+    )
+
+    # Step 3: verify the devices are truly raw (unpartitioned).  On GKE Ubuntu
+    # nodes the local NVMe device may be partitioned by node startup scripts
+    # even when --local-nvme-ssd-block is specified.  The kernel refuses a
+    # whole-disk exclusive open (DM_TABLE_LOAD → EBUSY) when any partition of
+    # the disk is open by another process (e.g. the container overlay FS is
+    # backed by nvme1n1p1).  Detect this and fall back to a loop device backed
+    # by a file on /mnt/stateful_partition (which IS the SSD partition).
+    #
+    # Raw devices go to stdout (captured); rejected ones are reported on
+    # stderr only.
+    raw_check_out, _ = daemonset.PodExec(
+        textwrap.dedent(f"""
+        for dev in {device_list}
+        do
+          if lsblk -ln -o TYPE "$dev" 2>/dev/null | grep -q '^part$'
+          then
+            echo "[pkb-lssd] $dev is partitioned — cannot use as raw block device" >&2
+          else
+            echo "$dev"
+          fi
+        done
+      """),
+        ignore_failure=True,
+    )
+    raw_devices = [
+        d.strip() for d in raw_check_out.strip().splitlines() if d.strip()
+    ]
+
+    if not raw_devices:
+        logging.info(
+            '[swap_encryption] GKE: all LSSD devices are partitioned — '
+            'falling back to loop device on /mnt/stateful_partition'
+        )
+        _setup_gke_lssd_stateful_loop_swap(daemonset)
+        return
+
+    # Use only raw (unpartitioned) devices going forward.
+    devices = raw_devices
+    device_list = ' '.join(devices)
+    n = len(devices)
+    logging.info(
+        '[swap_encryption] GKE: using %d raw LSSD device(s): %s  dmcrypt=%s',
+        n,
+        device_list,
+        _ENABLE_DMCRYPT.value,
+    )
+
+    # For N=1 LSSD, skip mdadm entirely and target the raw device directly.
+    # For N>1 we stripe across multiple NVMe devices.
+    if n > 1:
+        # No ignore_failure here: the script exits 1 when /dev/md0 is missing
+        # after mdadm --create.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      mdadm --create /dev/md0 --force \\
+        --level=0 --raid-devices={n} \\
+        {device_list}
+      test -b /dev/md0 || {{ echo "mdadm: /dev/md0 not created" >&2; exit 1; }}
+    """),
+        )
+        swap_block_dev = '/dev/md0'
+    else:
+        swap_block_dev = devices[0]
+        logging.info(
+            '[swap_encryption] GKE: single LSSD — skipping mdadm, '
+            'using %s directly',
+            swap_block_dev,
+        )
+
+    if _ENABLE_DMCRYPT.value:
+        # Same dmsetup --noudevrules --noudevsync approach as _setup_gke_hyperdisk_swap.
+        # Unlike that path, the udev exec queue is paused around dmsetup
+        # create (best-effort) and resumed right afterwards.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      grep -q dm_crypt /proc/modules 2>/dev/null || {{
+        KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1)
+        [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true
+      }}
+      udevadm control --stop-exec-queue 2>/dev/null || true
+      KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n')
+      SIZE=$(blockdev --getsz {swap_block_dev})
+      printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{swap_block_dev}" | \\
+        dmsetup create swap_encrypted --noudevrules --noudevsync
+      udevadm control --start-exec-queue 2>/dev/null || true
+      unset KEY
+      dmsetup mknodes swap_encrypted 2>/dev/null || true
+      mkswap /dev/mapper/swap_encrypted
+      swapon /dev/mapper/swap_encrypted
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: LSSD dm-crypt swap active on %s',
+            swap_block_dev,
+        )
+    else:
+        # Encryption-off variant: swap directly on the raw device / md array.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      mkswap {swap_block_dev}
+      swapon {swap_block_dev}
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: LSSD plain swap active on %s',
+            swap_block_dev,
+        )
+
+
+def _setup_gke_lssd_stateful_loop_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Set up swap on the LSSD partition via a loop device.
+
+    Used when the local NVMe device is partitioned by GKE startup scripts
+    and cannot be opened as a whole raw block device (DM_TABLE_LOAD EBUSY).
+    The DaemonSet mounts /mnt/stateful_partition (hostPath) from the host's
+    nvme1n1p1 — which is still local SSD storage.  We create a large file
+    there and layer loop → dm-crypt → swap on top of it (the dm-crypt layer
+    is only added when _ENABLE_DMCRYPT is set; otherwise swap sits directly
+    on the loop device).
+
+    Args:
+      daemonset: DaemonSet wrapper.  PodExec() runs a shell snippet in the pod
+        and returns a 2-tuple whose first element is used as command output.
+
+    Raises:
+      RuntimeError: if no /dev/loop* device is found for the image file.
+      ValueError: if the df probe returns non-empty, non-numeric output.
+    """
+    img_path = '/mnt/stateful_partition/pkb_swap.img'
+
+    # Clean up any previous run artifacts.
+    # NOTE(review): `swapoff -a` and `losetup -D` are not limited to this
+    # function's own image/loop device — confirm that is intended.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    swapoff -a 2>/dev/null || true
+    dmsetup remove --force --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
+    losetup -D 2>/dev/null || true
+    rm -f {img_path} 2>/dev/null || true
+  """),
+        ignore_failure=True,
+    )
+
+    # Determine file size: 80% of available space, at least 16 GB.
+    # df -P reports 1024-byte blocks in column 4 ("Available").
+    size_out, _ = daemonset.PodExec(
+        "df -P /mnt/stateful_partition | awk 'NR==2{print $4}'",
+        ignore_failure=True,
+    )
+    # Empty probe output is treated as 0 KB available.
+    avail_kb = int(size_out.strip() or '0')
+    # NOTE(review): the 16 GB floor applies even when less than that is free
+    # (or the probe yielded 0); allocation may then fail — confirm intended.
+    swap_gb = max(16, int(avail_kb * 0.8 / 1024 / 1024))
+    logging.info(
+        '[swap_encryption] GKE: LSSD stateful-loop: %d GB image at %s',
+        swap_gb,
+        img_path,
+    )
+
+    # Allocate file (fallocate is instant on ext4; dd fallback for others).
+    # The longer timeout leaves room for the dd fallback to write the file.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    fallocate -l {swap_gb}G {img_path} 2>/dev/null || \\
+      dd if=/dev/zero of={img_path} bs=1G count={swap_gb}
+    chmod 600 {img_path}
+    losetup --direct-io=on -f {img_path}
+  """),
+        timeout=300,
+    )
+
+    # Look up which loop device the image was attached to.
+    loop_out, _ = daemonset.PodExec(
+        f"losetup -j {img_path} | awk -F: '{{print $1}}' | head -1",
+        ignore_failure=True,
+    )
+    loop_dev = loop_out.strip()
+    if not loop_dev.startswith('/dev/loop'):
+        raise RuntimeError(
+            f'[swap_encryption] losetup failed for {img_path} — got:'
+            f' {loop_out!r}'
+        )
+    logging.info(
+        '[swap_encryption] GKE: LSSD stateful-loop device: %s', loop_dev
+    )
+
+    if _ENABLE_DMCRYPT.value:
+        # dm-crypt mapping built with dmsetup (udev sync disabled, udev exec
+        # queue paused around creation) and a throwaway 32-byte random key.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      grep -q dm_crypt /proc/modules 2>/dev/null || {{
+        KO=$(find /lib/modules/$(uname -r) -name 'dm-crypt.ko*' 2>/dev/null | head -1)
+        [ -n "$KO" ] && insmod "$KO" 2>/dev/null || true
+      }}
+      udevadm control --stop-exec-queue 2>/dev/null || true
+      KEY=$(dd if=/dev/urandom bs=32 count=1 2>/dev/null | od -A n -t x1 | tr -d ' \\n')
+      SIZE=$(blockdev --getsz {loop_dev})
+      printf "0 %s crypt aes-xts-plain64 %s 0 %s 0\\n" "$SIZE" "$KEY" "{loop_dev}" | \\
+        dmsetup create swap_encrypted --noudevrules --noudevsync
+      udevadm control --start-exec-queue 2>/dev/null || true
+      unset KEY
+      dmsetup mknodes swap_encrypted 2>/dev/null || true
+      mkswap /dev/mapper/swap_encrypted
+      swapon /dev/mapper/swap_encrypted
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: LSSD stateful-loop dm-crypt swap active '
+            'on %s → %s',
+            img_path,
+            loop_dev,
+        )
+    else:
+        # Encryption-off variant: swap directly on the loop device.
+        daemonset.PodExec(
+            textwrap.dedent(f"""
+      mkswap {loop_dev}
+      swapon {loop_dev}
+    """),
+        )
+        logging.info(
+            '[swap_encryption] GKE: LSSD stateful-loop plain swap active '
+            'on %s → %s',
+            img_path,
+            loop_dev,
+        )
+
+
+def _setup_eks_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Configure swap on EKS nodes — Instance Store OR io2 root disk.
+
+    Swap type is selected by --swap_encryption_swap_type:
+      instance_store (default) – NVMe SSD attached by Nitro (i4i, m6id, c6id).
+        Nitro encrypts all block-device writes at hardware level; no extra
+        cryptsetup needed.
+      io2 – EBS io2 volume provisioned as the node root/data disk.
+        Used for apples-to-apples comparison against GKE hyperdisk-balanced.
+    """
+    swap_type = _SWAP_TYPE.value
+    if swap_type in ('auto', 'instance_store'):
+        _setup_eks_instance_store_swap(daemonset)
+    elif swap_type == 'io2':
+        _setup_eks_io2_swap(daemonset)
+    else:
+        logging.warning(
+            '[swap_encryption] Unknown EKS swap type %s – fallback', swap_type
+        )
+        _setup_eks_instance_store_swap(daemonset)
+
+
+def _setup_eks_instance_store_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Swap on AWS NVMe Instance Store (Nitro hardware-offloaded encryption)."""
+    logging.info('[swap_encryption] EKS: setting up Instance Store swap')
+
+    # Find the Instance Store NVMe device (not the root EBS volume)
+    nvme_out, _ = daemonset.PodExec(
+        "nvme list 2>/dev/null | awk '/Instance Storage/{print $1}' | head -1"
+        " || lsblk -d -o NAME,MODEL | grep -i 'instance\\|nvme' | grep -v"
+        " 'nvme0' | awk '{print \"/dev/\"$1}' | head -1",
+        ignore_failure=True,
+    )
+    device = nvme_out.strip()
+    if not device:
+        # Common Instance Store device paths on AWS
+        for candidate in ['/dev/nvme1n1', '/dev/nvme2n1', '/dev/xvdb']:
+            exists_out, _ = daemonset.PodExec(
+                f'test -b {candidate} && echo yes || echo no',
+                ignore_failure=True,
+            )
+            if exists_out.strip() == 'yes':
+                device = candidate
+                break
+
+    if not device:
+        logging.warning(
+            '[swap_encryption] No Instance Store NVMe found – creating swapfile'
+        )
+        _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value)
+        return
+
+    logging.info('[swap_encryption] EKS: Instance Store device: %s', device)
+
+    # Nitro encrypts all Instance Store writes automatically.
+    # No additional cryptsetup required.
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    mkswap {device} && \\
+    swapon {device}
+  """),
+    )
+    logging.info(
+        '[swap_encryption] EKS: Instance Store swap active on %s', device
+    )
+
+
+def _setup_eks_io2_swap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Swap on AWS EBS io2 volume – apples-to-apples comparison vs GKE hyperdisk.
+
+    EBS io2 volumes on Nitro instances are encrypted at rest by AWS KMS (if
+    enabled on the volume) or via Nitro-level hardware encryption.  No additional
+    cryptsetup is needed here; we simply format the attached data disk as swap.
+
+    Device discovery order:
+      1. Match the io2 volume created by _ensure_io2_volume() by its NVMe serial
+         (serial == volume id without the dash).  This is unambiguous and never
+         picks the root disk or the instance store regardless of nvmeXn1
+         enumeration order on Nitro.
+      2. First non-root EBS ("Elastic Block Store") block device that is not
+         currently mounted.
+    """
+    logging.info('[swap_encryption] EKS: setting up io2 EBS swap')
+
+    # Identify root device so we can exclude it.
+    root_out, _ = daemonset.PodExec(
+        'lsblk -no pkname $(findmnt -n -o SOURCE /) 2>/dev/null || echo'
+        ' nvme0n1',
+        ignore_failure=True,
+    )
+    root_base = root_out.strip() or 'nvme0n1'
+
+    # Identify the io2 volume UNAMBIGUOUSLY by its NVMe serial == volume id.
+    # An EBS NVMe device's serial equals the volume id minus the dash
+    # (vol-0abc... -> serial vol0abc...).
+    device = ''
+    target = _IO2_VOLUME_ID.replace('-', '')
+    if target:
+        ser_out, _ = daemonset.PodExec(
+            'for d in /sys/block/nvme*n1; do '
+            '[ -e "$d" ] || continue; '
+            's=$(cat "$d/device/serial" 2>/dev/null | tr -d "-" | tr -d " "); '
+            f'[ "$s" = "{target}" ] && {{ echo "/dev/$(basename "$d")"; break;'
+            ' }; '
+            'done',
+            ignore_failure=True,
+        )
+        device = ser_out.strip()
+        if device:
+            logging.info(
+                '[swap_encryption] EKS: io2 matched by serial %s -> %s',
+                target,
+                device,
+            )
+
+    if not device:
+        # Fallback: first non-root EBS device, excluding any device that is
+        # currently mounted (root) or already active swap.
+        disk_out, _ = daemonset.PodExec(
+            'for d in /sys/block/nvme*n1 /sys/block/xvd[b-z]'
+            ' /sys/block/sd[b-z];'
+            ' do [ -e "$d" ] || continue; n=$(basename "$d"); [ "$n" ='
+            f' "{root_base}" ] && continue; m=$(cat "$d/device/model"'
+            ' 2>/dev/null);'
+            ' echo "$m" | grep -qi "Elastic Block Store" || continue;'
+            ' mnt=$(lsblk'
+            ' -no MOUNTPOINT "/dev/$n" 2>/dev/null | tr -d " "); [ -n "$mnt"'
+            ' ] &&'
+            ' continue; echo "/dev/$n"; break; done',
+            ignore_failure=True,
+        )
+        device = disk_out.strip()
+        if device:
+            logging.info(
+                '[swap_encryption] EKS: io2 fallback EBS device: %s', device
+            )
+
+    if not device:
+        logging.warning(
+            '[swap_encryption] No io2 EBS disk found – creating plain swapfile'
+        )
+        _setup_plain_swap_file(daemonset, _SWAP_SIZE_GB.value)
+        return
+
+    logging.info('[swap_encryption] EKS: io2 EBS device: %s', device)
+
+    # EBS io2 encryption is handled at the AWS level (Nitro / KMS).
+    out, _ = daemonset.PodExec(
+        textwrap.dedent(f"""
+    swapoff {device} 2>/dev/null || true
+    wipefs -a {device} 2>/dev/null || true
+    mkswap -f {device} && swapon {device}
+    swapon --show
+  """),
+        ignore_failure=True,
+    )
+    if device not in out:
+        raise RuntimeError(
+            f'[swap_encryption] io2 swap did not activate on {device}; '
+            f'swapon --show output: {out!r}. The device may be busy/mounted '
+            '(wrong device picked) or mkswap failed.'
+        )
+    logging.info('[swap_encryption] EKS: io2 EBS swap active on %s', device)
+
+
+def _setup_plain_swap_file(
+    daemonset: _ds_mod.SwapDaemonSet, size_gb: int
+) -> None:
+    """Fallback: create a loop-device-backed swapfile.
+
+    A plain file on overlayfs (the container root) cannot be used as swap —
+    the kernel rejects it with EINVAL.  Routing it through a loop device
+    presents a proper block device to the mm subsystem and succeeds.
+    """
+    logging.info('[swap_encryption] Creating %dGB loop-device swap', size_gb)
+    daemonset.PodExec(
+        textwrap.dedent(f"""
+    fallocate -l {size_gb}G /tmp/pkb_swapfile && \\
+    chmod 600 /tmp/pkb_swapfile && \\
+    LOOP=$(losetup -f) && \\
+    losetup "$LOOP" /tmp/pkb_swapfile && \\
+    mkswap "$LOOP" && \\
+    swapon "$LOOP" && \\
+    echo "swap loop device: $LOOP"
+  """),
+    )
+
+
+def _enable_zswap(daemonset: _ds_mod.SwapDaemonSet) -> None:
+    """Enable zswap with lz4 compressor and 20% pool limit inside the pod."""
+    logging.info('[swap_encryption] Enabling zswap (lz4, 20%% pool)')
+    for cmd in [
+        'echo 1      > /sys/module/zswap/parameters/enabled',
+        'echo lz4    > /sys/module/zswap/parameters/compressor',
+        'echo 20     > /sys/module/zswap/parameters/max_pool_percent',
+        'echo z3fold > /sys/module/zswap/parameters/zpool',
+    ]:
+        daemonset.PodExec(cmd, ignore_failure=True)
+
+
+def _detect_swap_device(
+    daemonset: _ds_mod.SwapDaemonSet,
+) -> str:
+    """Return the active swap device path on the cluster node."""
+    if _SWAP_DEVICE.value:
+        return _SWAP_DEVICE.value
+
+    # /proc/swaps is the source of truth — it lists the device ACTUALLY active.
+    # Do NOT just test -e /dev/mapper/swap_encrypted: a stale dm-crypt mapping
+    # from a previous run on a reused node can still appear as a /dev node while
+    # being non-functional (fio/swapoff fail with "No such device or address").
+    dm_out, _ = daemonset.PodExec(
+        textwrap.dedent("""
+            ACTIVE=$(awk 'NR==2{print $1}' /proc/swaps 2>/dev/null)
+            if [ -n "$ACTIVE" ]
+            then
+              echo "$ACTIVE"
+            elif test -e /dev/mapper/swap_encrypted
+            then
+              echo /dev/mapper/swap_encrypted
+            fi
+        """),
+        ignore_failure=True,
+    )
+    dev = dm_out.strip().splitlines()[-1].strip() if dm_out.strip() else ''
+    if dev:
+        return dev
+    raise ValueError(
+        'No active swap device found in the benchmark pod. '
+        'Use --swap_encryption_device to specify one.'
+    )
+
+
+def _build_metadata(
+    daemonset: _ds_mod.SwapDaemonSet, swap_dev: str
+) -> dict[str, Any]:
+    """Collect node environment, encryption type, and config into a dict."""
+    kernel_out, _ = daemonset.PodExec('uname -r', ignore_failure=True)
+    mem_out, _ = daemonset.PodExec(
+        "awk '/MemTotal/{print $2}' /proc/meminfo", ignore_failure=True
+    )
+    swap_out, _ = daemonset.PodExec(
+        "awk 'NR>1{sum+=$3} END{print sum+0}' /proc/swaps", ignore_failure=True
+    )
+
+    try:
+        mem_gb = round(int(mem_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        mem_gb = 0
+    try:
+        swap_gb = round(int(swap_out.strip()) / (1024 * 1024), 1)
+    except ValueError:
+        swap_gb = 0
+
+    # Encryption type — key off dm-crypt presence + swap target.
+    enc = 'unknown'
+    if '/dev/mapper/' in swap_dev:
+        table_out, _ = daemonset.PodExec(
+            f'dmsetup table {swap_dev.split("/")[-1]} 2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        enc = 'dm-crypt-plain' if 'crypt' in table_out.lower() else 'dm-other'
+    elif _SWAP_TYPE.value in ('instance_store', 'io2'):
+        enc = 'nitro_hardware_offload'
+    elif not _ENABLE_DMCRYPT.value:
+        enc = 'none'
+
+    cloud = _detect_cloud(daemonset)
+
+    instance_label = _INSTANCE_SIZE_LABEL.value
+    if not instance_label:
+        gcp_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail'
+            ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
+            ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        if gcp_type_out.strip():
+            instance_label = gcp_type_out.strip().split('/')[-1]
+    if not instance_label:
+        aws_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail '
+            'http://169.254.169.254/latest/meta-data/instance-type '
+            '2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        instance_label = aws_type_out.strip()
+
+    return {
+        'benchmark': BENCHMARK_NAME,
+        'execution_mode': 'kubernetes_privileged_pod',
+        'cloud': cloud,
+        'instance_size': instance_label,
+        'kernel_version': kernel_out.strip(),
+        'host_memory_gb': mem_gb,
+        'swap_device': swap_dev,
+        'swap_size_gb': swap_gb,
+        'swap_encryption': enc,
+        'storage_target': _SWAP_TYPE.value,
+        'boot_disk_type': _BOOT_DISK_TYPE.value,
+        'dmcrypt_enabled': _ENABLE_DMCRYPT.value,
+        'node_image_type': _NODE_IMAGE_TYPE.value,
+        'boot_disk_iops_target': _BOOT_DISK_IOPS.value,
+        'benchmark_machine_type': _BENCHMARK_MACHINE_TYPE.value,
+        'zswap_enabled': _ENABLE_ZSWAP.value,
+        'min_free_kbytes': _MIN_FREE_KBYTES.value,
+        'fio_runtime_sec': _FIO_RUNTIME_SEC.value,
+        'stress_vm_bytes_requested': _STRESS_VM_BYTES.value,
+        'stress_vm_bytes_list': _STRESS_VM_BYTES_LIST.value,
+        'stress_timeout_sec': _STRESS_TIMEOUT_SEC.value,
+        'nodepool': _NODEPOOL.value,
+    }
+
+
+def _run_phase1_fio(
+    daemonset: _ds_mod.SwapDaemonSet,
+    swap_dev: str,
+    base_meta: dict[str, Any],
+) -> list[sample.Sample]:
+    """Run fio microbenchmarks on the raw swap block device (Phase 1).
+
+    Calls swapoff before running fio so measurements reflect the raw
+    hardware + encryption ceiling with no swap-daemon overhead.  Re-enables
+    swap unconditionally after all jobs complete.
+
+    Jobs:
+      4k_randread   iodepth=32  → random read IOPS
+      4k_randwrite  iodepth=32  → random write IOPS
+      1m_seqread    iodepth=8   → sequential read bandwidth
+      1m_seqwrite   iodepth=8   → sequential write bandwidth
+      4k_lat_read   iodepth=1   → completion latency floor (read)
+
+    Args:
+      daemonset: Active SwapDaemonSet resource.
+      swap_dev: Block device path, e.g. /dev/mapper/swap_encrypted.
+      base_meta: Shared metadata dict from _build_metadata().
+
+    Returns:
+      List of Sample objects with IOPS, bandwidth and latency metrics.
+    """
+    samples: list[sample.Sample] = []
+
+    # swapoff before fio — running fio with --direct=1 on an active swap device
+    # races with kernel page-reclaim on the same dm-crypt target.
+    logging.info('[swap_encryption] Phase 1: swapoff %s', swap_dev)
+    daemonset.PodExec(
+        f'swapoff {swap_dev} 2>/dev/null || swapoff -a 2>/dev/null || true',
+        timeout=30,
+        ignore_failure=True,
+    )
+
+    # (name, rw_mode, block_size, iodepth)
+    fio_jobs = [
+        ('4k_randread', 'randread', '4k', 32),
+        ('4k_randwrite', 'randwrite', '4k', 32),
+        ('1m_seqread', 'read', '1m', 8),
+        ('1m_seqwrite', 'write', '1m', 8),
+        ('4k_lat_read', 'randread', '4k', 1),
+    ]
+
+    runtime = _FIO_RUNTIME_SEC.value
+    try:
+        for name, rw, bs, iodepth in fio_jobs:
+            cmd = (
+                f'fio --name={name} --filename={swap_dev}'
+                f' --rw={rw} --bs={bs} --iodepth={iodepth}'
+                ' --ioengine=libaio --direct=1'
+                f' --runtime={runtime} --time_based --group_reporting'
+                ' --output-format=json 2>/dev/null'
+            )
+            logging.info('[swap_encryption] Phase 1: fio job %s', name)
+            out, _ = daemonset.PodExec(cmd, timeout=runtime + 120)
+            samples += _parse_fio_json(out, name, base_meta)
+    finally:
+        # Always re-enable swap so subsequent phases can drive swap I/O.
+        logging.info('[swap_encryption] Phase 1: swapon %s', swap_dev)
+        daemonset.PodExec(
+            f'swapon {swap_dev} 2>/dev/null || true',
+            timeout=30,
+            ignore_failure=True,
+        )
+
+    logging.info(
+        '[swap_encryption] Phase 1 complete (%d samples)', len(samples)
+    )
+    return samples
+
+
+def _parse_fio_json(
+    fio_output: str, job_name: str, base_meta: dict[str, Any]
+) -> list[sample.Sample]:
+    """Parse fio --output-format=json output into PKB Sample objects.
+
+    Extracts per-direction (read/write) IOPS, bandwidth (MB/s) and completion
+    latency (mean + p50/p99/p999 percentiles).
+
+    Args:
+      fio_output: Raw stdout from fio with --output-format=json.
+      job_name: Short identifier embedded in metric names, e.g. '4k_randread'.
+      base_meta: Shared metadata dict copied into each sample.
+
+    Returns:
+      List of Sample objects; empty if output cannot be parsed or is zero.
+    """
+    # fio sometimes emits kernel warnings before the JSON object.
+    json_start = fio_output.find('{')
+    if json_start == -1:
+        logging.warning(
+            '[swap_encryption] Phase 1: no JSON in fio output for %s', job_name
+        )
+        return []
+
+    try:
+        data = json.loads(fio_output[json_start:])
+    except json.JSONDecodeError as e:
+        logging.warning(
+            '[swap_encryption] Phase 1: fio JSON parse error (%s): %s',
+            job_name,
+            e,
+        )
+        return []
+
+    jobs = data.get('jobs', [])
+    if not jobs:
+        return []
+
+    job = jobs[0]
+    samples: list[sample.Sample] = []
+    meta = dict(base_meta, fio_job=job_name)
+
+    for direction in ('read', 'write'):
+        d = job.get(direction, {})
+        iops = float(d.get('iops', 0))
+        bw_kbps = float(d.get('bw', 0))  # fio reports KiB/s
+        bw_mbps = bw_kbps / 1024.0
+
+        # Skip directions with near-zero throughput.
+        if iops < 1 and bw_kbps < 1:
+            continue
+
+        prefix = f'phase1_fio_{job_name}_{direction}'
+        samples.append(sample.Sample(f'{prefix}_iops', iops, 'IOPS', meta))
+        samples.append(
+            sample.Sample(f'{prefix}_bw_mbps', bw_mbps, 'MB/s', meta)
+        )
+
+        # Completion latency — fio reports nanoseconds; emit microseconds.
+        clat = d.get('clat_ns', d.get('lat_ns', {}))
+        lat_mean_ns = float(clat.get('mean', 0))
+        if lat_mean_ns > 0:
+            samples.append(
+                sample.Sample(
+                    f'{prefix}_lat_mean_us', lat_mean_ns / 1000.0, 'us', meta
+                )
+            )
+            for pct_key, label in (
+                ('50.000000', 'p50'),
+                ('99.000000', 'p99'),
+                ('99.900000', 'p999'),
+            ):
+                val_ns = clat.get('percentile', {}).get(pct_key, 0)
+                if val_ns:
+                    samples.append(
+                        sample.Sample(
+                            f'{prefix}_lat_{label}_us',
+                            val_ns / 1000.0,
+                            'us',
+                            meta,
+                        )
+                    )
+
+    return samples
+
+
+# Hourly price in USD per instance/machine type, keyed by the exact type
+# name.  _collect_cost_sample() looks the resolved instance type up here and
+# emits no cost sample for types that are missing, so add an entry for every
+# machine type that should be cost-tracked.
+# NOTE(review): these are hard-coded point-in-time values — confirm against
+# current provider pricing (and the intended region) before relying on them.
+_INSTANCE_PRICE_USD_PER_HR: dict[str, float] = {
+    # GCP  (on-demand, us-central1 unless noted)
+    'c4-standard-8-lssd': 0.5888,  # 8 vCPU, 32 GB RAM + 1×375 GB LSSD
+    'c4-standard-8': 0.5008,  # 8 vCPU, 32 GB RAM, no LSSD
+    'n4-highmem-32': 3.0256,  # 32 vCPU, 256 GB RAM
+    'n2-highmem-32': 2.5216,  # 32 vCPU, 256 GB RAM
+    'n2-standard-32': 1.5264,  # 32 vCPU, 120 GB RAM
+    'z3-highmem-8': 2.7248,  # 8 vCPU + 4× LSSD
+    # AWS
+    'i4i.4xlarge': 1.4960,  # 16 vCPU, 128 GB RAM, NVMe Instance Store
+    'i4i.2xlarge': 0.7480,
+    'm6id.4xlarge': 0.9072,  # 16 vCPU, 64 GB RAM, NVMe Instance Store
+    'm6i.4xlarge': 0.7680,  # 16 vCPU, 64 GB RAM, no Instance Store
+    'r6i.4xlarge': 1.0080,  # 16 vCPU, 128 GB RAM, no Instance Store
+}
+
+
+def _collect_cost_sample(
+    daemonset: _ds_mod.SwapDaemonSet,
+    elapsed_sec: float,
+    base_meta: dict,
+) -> list[sample.Sample]:
+    """Emit a cost_estimate_usd sample for the benchmark run."""
+    instance_type = ''
+
+    gcp_type_out, _ = daemonset.PodExec(
+        'curl -s -m 3 --fail'
+        ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
+        ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
+        ignore_failure=True,
+    )
+    if gcp_type_out.strip():
+        instance_type = gcp_type_out.strip().split('/')[-1]
+
+    if not instance_type:
+        aws_type_out, _ = daemonset.PodExec(
+            'curl -s -m 3 --fail '
+            'http://169.254.169.254/latest/meta-data/instance-type '
+            '2>/dev/null || echo ""',
+            ignore_failure=True,
+        )
+        instance_type = aws_type_out.strip()
+
+    if _INSTANCE_SIZE_LABEL.value:
+        instance_type = _INSTANCE_SIZE_LABEL.value
+
+    if not instance_type and _BENCHMARK_MACHINE_TYPE.value:
+        instance_type = _BENCHMARK_MACHINE_TYPE.value
+        logging.info(
+            '[swap_encryption] Instance type from metadata unavailable; using'
+            ' --swap_encryption_benchmark_machine_type=%s for cost tracking',
+            instance_type,
+        )
+
+    price = _INSTANCE_PRICE_USD_PER_HR.get(instance_type)
+    if price is None:
+        logging.warning(
+            '[swap_encryption] Unknown instance type "%s" — skipping cost'
+            ' sample. Add it to _INSTANCE_PRICE_USD_PER_HR to enable cost'
+            ' tracking.',
+            instance_type,
+        )
+        return []
+
+    hours = elapsed_sec / 3600.0
+    meta = dict(
+        base_meta,
+        instance_type=instance_type,
+        price_usd_per_hr=price,
+        benchmark_elapsed_sec=round(elapsed_sec, 1),
+    )
+    return [sample.Sample('cost_estimate_usd', hours * price, 'USD', meta)]