6464"""
6565
6666import logging
67+ import os
68+ import tempfile
6769import textwrap
6870import time
6971from typing import Any
282284 '(unencrypted) swap overhead as a baseline.' ,
283285)
284286
# Flag consumed by _create_benchmark_node_pool: a non-empty value is written
# into a temporary kubeletConfig YAML and passed to the nodepool-create command
# via --system-config-from-file; an empty value skips that argument entirely.
287+ _GKE_KUBELET_MEMORY_SWAP = flags .DEFINE_string (
288+ 'swap_encryption_gke_kubelet_memory_swap' ,
289+ 'LimitedSwap' ,
290+ 'Value for kubeletConfig.memorySwapBehavior injected via '
291+ '--system-config-from-file when creating the GKE benchmark nodepool. '
292+ 'LimitedSwap (default) — the kubelet allows pods to use swap up to their '
293+ 'memory limit; required for the DaemonSet pod to drive kernel swapping. '
294+ 'NoSwap — disables swap at the kubelet level (use for a baseline run that '
295+ 'confirms zero swap activity). Set empty string to omit the flag entirely '
296+ 'and rely on the cluster-level default.' ,
297+ )
298+
285299_SWAP_DEVICE = flags .DEFINE_string (
286300 'swap_encryption_device' ,
287301 '' ,
@@ -547,9 +561,10 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
547561 )
548562 if _pod_lost :
549563 _degraded_reasons .append (
550- f'pod(s) NotFound during run: { ", " .join (_pod_lost )} — pod died'
551- ' (eviction/exit); phases at/after that point (e.g.'
552- ' kernel-build, OpenSearch) produced invalid data'
564+ 'benchmark pod(s) went NotFound during the run'
565+ f' ({ ", " .join (_pod_lost )} ) — the pod died (node memory-pressure'
566+ ' eviction or container exit) and any phase running at or after that'
567+ ' point (e.g. kernel-build baseline, OpenSearch) produced invalid data'
553568 )
554569 if _oom_events :
555570 _degraded_reasons .append (
@@ -598,10 +613,9 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
598613 _pod_exec (
599614 pod ,
600615 textwrap .dedent ("""
601- swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
602- dmsetup remove --noudevrules --noudevsync \
603- swap_encrypted 2>/dev/null || true
604- """ ),
616+ swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
617+ dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
618+ """ ),
605619 ignore_failure = True ,
606620 )
607621 # Clean up loop device backing files (single-disk fallback path).
@@ -622,9 +636,7 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
622636 ignore_failure = True ,
623637 )
624638 _pod_exec (
625- pod ,
626- "pkill -9 'stress-ng|fio' 2>/dev/null || true" ,
627- ignore_failure = True ,
639+ pod , "pkill -9 'stress-ng|fio' 2>/dev/null || true" , ignore_failure = True
628640 )
629641
630642 _delete_daemonset ()
@@ -672,8 +684,10 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
672684 '-n' ,
673685 _DS_NAMESPACE ,
674686 '-o' ,
675- r'jsonpath={range .items[*]}{.metadata.name}'
676- r'{"\t"}{.status.phase}{"\n"}{end}' ,
687+ (
688+ r'jsonpath={range'
689+ r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
690+ ),
677691 ],
678692 raise_on_failure = False ,
679693 )
@@ -721,15 +735,15 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
721735 '[swap_encryption] Pod %s ready (tools installed)' , ready_pod
722736 )
723737 return ready_pod
724- # "container not found" means the container crashed (CrashLoopBackOff
725- # or exited) — hard reset: re-check pod phase on next iteration.
738+ # "container not found" means the container crashed (CrashLoopBackOff or
739+ # exited) — treat it as a hard reset: re-check pod phase on next iteration.
726740 if (
727741 'container not found' in sentinel_err
728742 or 'unable to upgrade connection' in sentinel_err
729743 ):
730744 logging .warning (
731- '[swap_encryption] Pod %s: container not running (%s)'
732- ' — will re-check pod state' ,
745+ '[swap_encryption] Pod %s: container not running (%s) '
746+ '— will re-check pod state' ,
733747 ready_pod ,
734748 sentinel_err .strip (),
735749 )
@@ -749,7 +763,7 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
749763
750764
751765def _log_pod_events (pod_name : str ) -> None :
752- """Dump recent Kubernetes events for the pod to diagnose startup hangs."""
766+ """Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
753767 events_out , _ , _ = kubectl .RunKubectlCommand (
754768 [
755769 'describe' ,
@@ -793,8 +807,9 @@ def _delete_daemonset() -> None:
793807 logging .info ('[swap_encryption] DaemonSet deleted' )
794808
795809
796- # GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput.
797- _HYPERDISK_MAX_IOPS_PER_MBPS = 256
810+ _HYPERDISK_MAX_IOPS_PER_MBPS = (
811+ 256 # GCP Hyperdisk Balanced: max IOPS = 256 x provisioned throughput in MiB/s
812+ )
798813
799814
800815def _valid_hyperdisk_throughput (iops : int , throughput : int ) -> int :
@@ -903,10 +918,36 @@ def _create_benchmark_node_pool(cluster) -> None:
903918 if is_lssd :
904919 cmd += ['--local-nvme-ssd-block' , f'count={ _LSSD_COUNT .value } ' ]
905920
921+ # ── GKE kubelet swap config ───────────────────────────────────────────────
922+ # Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
923+ # nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
924+ # so that the kubelet allocates swap to the DaemonSet pod. Without this flag
925+ # the Linux kernel swap device may exist but the kubelet blocks pod-level
926+ # swap usage and the benchmark pod cannot drive swap I/O.
927+ #
928+ # Passed as --system-config-from-file pointing to a temp YAML, which is the
929+ # same mechanism PKB's gke_node_system_config flag uses:
930+ # perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
931+ swap_behavior = _GKE_KUBELET_MEMORY_SWAP .value
932+ system_config_tmp = None
933+ if swap_behavior :
934+ kubelet_yaml = f'kubeletConfig:\n memorySwapBehavior: { swap_behavior } \n '
935+ system_config_tmp = tempfile .NamedTemporaryFile (
936+ mode = 'w' , suffix = '.yaml' , delete = False
937+ )
938+ system_config_tmp .write (kubelet_yaml )
939+ system_config_tmp .flush ()
940+ cmd += ['--system-config-from-file' , system_config_tmp .name ]
941+ logging .info (
942+ '[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)' ,
943+ swap_behavior ,
944+ system_config_tmp .name ,
945+ )
946+
906947 logging .info (
907948 '[swap_encryption] Creating benchmark nodepool: %s / %s / '
908949 'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
909- 'add_swap_disk=%s' ,
950+ 'add_swap_disk=%s / kubelet_swap=%s ' ,
910951 _BENCHMARK_NODEPOOL ,
911952 machine_type ,
912953 _NODE_IMAGE_TYPE .value ,
@@ -915,14 +956,22 @@ def _create_benchmark_node_pool(cluster) -> None:
915956 _ENABLE_DMCRYPT .value ,
916957 is_lssd ,
917958 _ADD_SWAP_DISK .value ,
959+ swap_behavior or 'unset' ,
918960 )
919961
920962 # LSSD nodepools take longer to provision than PD-only nodepools because
921963 # GKE must also initialise the local NVMe devices before marking nodes Ready.
922964 # 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
923- stdout , stderr , rc = vm_util .IssueCommand (
924- cmd , timeout = 1200 , raise_on_failure = False
925- )
965+ try :
966+ stdout , stderr , rc = vm_util .IssueCommand (
967+ cmd , timeout = 1200 , raise_on_failure = False
968+ )
969+ finally :
970+ if system_config_tmp is not None :
971+ try :
972+ os .unlink (system_config_tmp .name )
973+ except OSError :
974+ pass
926975
927976 if rc != 0 :
928977 # Idempotent prepare: if the nodepool already exists (e.g. re-running
@@ -1325,8 +1374,7 @@ def _pod_exec(
13251374 out , err , rc = kubectl .RunKubectlCommand (
13261375 ['exec' , active , '-n' , _DS_NAMESPACE , '--' , 'bash' , '-c' , cmd ],
13271376 raise_on_failure = False ,
1328- # Retry loop in _pod_exec handles transient resets.
1329- raise_on_timeout = False ,
1377+ raise_on_timeout = False , # let _pod_exec's own retry loop handle transient resets
13301378 timeout = timeout ,
13311379 )
13321380 is_transient = rc != 0 and any (e in err for e in _TRANSIENT_KUBECTL_ERRORS )
@@ -1366,13 +1414,15 @@ def _pod_exec(
13661414 pod_gone = _is_pod_gone (active )
13671415 if pod_gone :
13681416 logging .warning (
1369- '[swap_encryption] OOM-eviction (rc=137, pod gone) —'
1370- ' recovering pod name (cmd not retried)'
1417+ '[swap_encryption] OOM-eviction detected (rc=137, pod gone) —'
1418+ ' recovering pod name for subsequent commands (not retrying this'
1419+ ' cmd)'
13711420 )
13721421 else :
13731422 logging .warning (
1374- '[swap_encryption] OOM-kill (rc=137, pod exists) —'
1375- ' waiting for container restart before continuing'
1423+ '[swap_encryption] Container OOM-killed (rc=137, pod still exists)'
1424+ ' — waiting for container restart and tool re-install before'
1425+ ' continuing'
13761426 )
13771427 new_pod = _recover_pod (active )
13781428 if new_pod != active :
@@ -1595,12 +1645,10 @@ def _collect_cost_sample(
15951645 instance_type = ''
15961646
15971647 # GCP: machine type is the last segment of the metadata URL value
1598- _gcp_meta_url = (
1599- 'http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
1600- )
16011648 gcp_type_out , _ = _pod_exec (
16021649 pod ,
1603- f'curl -s -m 3 --fail { _gcp_meta_url } '
1650+ 'curl -s -m 3 --fail'
1651+ ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
16041652 ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""' ,
16051653 ignore_failure = True ,
16061654 )
@@ -1736,13 +1784,10 @@ def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
17361784 # cloud metadata so that the field is always populated.
17371785 instance_label = _INSTANCE_SIZE_LABEL .value
17381786 if not instance_label :
1739- _gcp_mt_url = (
1740- 'http://metadata.google.internal'
1741- '/computeMetadata/v1/instance/machine-type'
1742- )
17431787 gcp_type_out , _ = _pod_exec (
17441788 pod ,
1745- f'curl -s -m 3 --fail { _gcp_mt_url } '
1789+ 'curl -s -m 3 --fail'
1790+ ' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
17461791 ' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""' ,
17471792 ignore_failure = True ,
17481793 )
@@ -1815,4 +1860,4 @@ def _ensure_io2_volume() -> None:
18151860 """
18161861 if _SWAP_TYPE .value != 'io2' :
18171862 return
1818- logging .info ('[swap_encryption] io2 swap volume provisioning deferred to PR2' )
1863+ logging .info ('[swap_encryption] io2 swap volume provisioning deferred to PR2' )
0 commit comments