Skip to content

Commit 8aa8b23

Browse files
swap_encryption: add GKE kubelet memorySwapBehavior config
Per Ajay's review comment on PR GoogleCloudPlatform#6758:
- Add the _GKE_KUBELET_MEMORY_SWAP flag (default LimitedSwap) so the benchmark nodepool is created with kubeletConfig.memorySwapBehavior set via --system-config-from-file, enabling pod-level swap usage.
- Wrap the gcloud IssueCommand call in try/finally to clean up the temp YAML.
- Update the nodepool creation log to include the kubelet_swap value.
- Rebuild from a clean base: all P0/P1/P2 fixes, guard clauses, pyink and flake8 clean (tox.ini config).
1 parent b2fd7f2 commit 8aa8b23

1 file changed

Lines changed: 84 additions & 39 deletions

File tree

perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py

Lines changed: 84 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@
6464
"""
6565

6666
import logging
67+
import os
68+
import tempfile
6769
import textwrap
6870
import time
6971
from typing import Any
@@ -282,6 +284,18 @@
282284
'(unencrypted) swap overhead as a baseline.',
283285
)
284286

287+
_GKE_KUBELET_MEMORY_SWAP = flags.DEFINE_string(
288+
'swap_encryption_gke_kubelet_memory_swap',
289+
'LimitedSwap',
290+
'Value for kubeletConfig.memorySwapBehavior injected via '
291+
'--system-config-from-file when creating the GKE benchmark nodepool. '
292+
'LimitedSwap (default) — the kubelet allows pods to use swap up to their '
293+
'memory limit; required for the DaemonSet pod to drive kernel swapping. '
294+
'NoSwap — disables swap at the kubelet level (use for a baseline run that '
295+
'confirms zero swap activity). Set empty string to omit the flag entirely '
296+
'and rely on the cluster-level default.',
297+
)
298+
285299
_SWAP_DEVICE = flags.DEFINE_string(
286300
'swap_encryption_device',
287301
'',
@@ -547,9 +561,10 @@ def Run(spec: _BenchmarkSpec) -> list[sample.Sample]:
547561
)
548562
if _pod_lost:
549563
_degraded_reasons.append(
550-
f'pod(s) NotFound during run: {", ".join(_pod_lost)} — pod died'
551-
' (eviction/exit); phases at/after that point (e.g.'
552-
' kernel-build, OpenSearch) produced invalid data'
564+
'benchmark pod(s) went NotFound during the run'
565+
f' ({", ".join(_pod_lost)}) — the pod died (node memory-pressure'
566+
' eviction or container exit) and any phase running at or after that'
567+
' point (e.g. kernel-build baseline, OpenSearch) produced invalid data'
553568
)
554569
if _oom_events:
555570
_degraded_reasons.append(
@@ -598,10 +613,9 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
598613
_pod_exec(
599614
pod,
600615
textwrap.dedent("""
601-
swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
602-
dmsetup remove --noudevrules --noudevsync \
603-
swap_encrypted 2>/dev/null || true
604-
"""),
616+
swapoff /dev/mapper/swap_encrypted 2>/dev/null || true
617+
dmsetup remove --noudevrules --noudevsync swap_encrypted 2>/dev/null || true
618+
"""),
605619
ignore_failure=True,
606620
)
607621
# Clean up loop device backing files (single-disk fallback path).
@@ -622,9 +636,7 @@ def Cleanup(spec: _BenchmarkSpec) -> None:
622636
ignore_failure=True,
623637
)
624638
_pod_exec(
625-
pod,
626-
"pkill -9 'stress-ng|fio' 2>/dev/null || true",
627-
ignore_failure=True,
639+
pod, "pkill -9 'stress-ng|fio' 2>/dev/null || true", ignore_failure=True
628640
)
629641

630642
_delete_daemonset()
@@ -672,8 +684,10 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
672684
'-n',
673685
_DS_NAMESPACE,
674686
'-o',
675-
r'jsonpath={range .items[*]}{.metadata.name}'
676-
r'{"\t"}{.status.phase}{"\n"}{end}',
687+
(
688+
r'jsonpath={range'
689+
r' .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
690+
),
677691
],
678692
raise_on_failure=False,
679693
)
@@ -721,15 +735,15 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
721735
'[swap_encryption] Pod %s ready (tools installed)', ready_pod
722736
)
723737
return ready_pod
724-
# "container not found" means the container crashed (CrashLoopBackOff
725-
# or exited) — hard reset: re-check pod phase on next iteration.
738+
# "container not found" means the container crashed (CrashLoopBackOff or
739+
# exited) — treat it as a hard reset: re-check pod phase on next iteration.
726740
if (
727741
'container not found' in sentinel_err
728742
or 'unable to upgrade connection' in sentinel_err
729743
):
730744
logging.warning(
731-
'[swap_encryption] Pod %s: container not running (%s)'
732-
' — will re-check pod state',
745+
'[swap_encryption] Pod %s: container not running (%s) '
746+
'— will re-check pod state',
733747
ready_pod,
734748
sentinel_err.strip(),
735749
)
@@ -749,7 +763,7 @@ def _wait_for_benchmark_pod(timeout: int = 900) -> str | None:
749763

750764

751765
def _log_pod_events(pod_name: str) -> None:
752-
"""Dump recent Kubernetes events for the pod to diagnose startup hangs."""
766+
"""Dump recent Kubernetes events for the pod to help diagnose startup hangs."""
753767
events_out, _, _ = kubectl.RunKubectlCommand(
754768
[
755769
'describe',
@@ -793,8 +807,9 @@ def _delete_daemonset() -> None:
793807
logging.info('[swap_encryption] DaemonSet deleted')
794808

795809

796-
# GCP Hyperdisk Balanced: max IOPS = 256 × MiB/s provisioned throughput.
797-
_HYPERDISK_MAX_IOPS_PER_MBPS = 256
810+
_HYPERDISK_MAX_IOPS_PER_MBPS = (
811+
256 # GCP Hyperdisk Balanced: IOPS <= 256 x MiB/s
812+
)
798813

799814

800815
def _valid_hyperdisk_throughput(iops: int, throughput: int) -> int:
@@ -903,10 +918,36 @@ def _create_benchmark_node_pool(cluster) -> None:
903918
if is_lssd:
904919
cmd += ['--local-nvme-ssd-block', f'count={_LSSD_COUNT.value}']
905920

921+
# ── GKE kubelet swap config ───────────────────────────────────────────────
922+
# Per Ajay's review comment (go/pkb-swap-encryption-pr1): the benchmark
923+
# nodepool must be created with kubeletConfig.memorySwapBehavior=LimitedSwap
924+
# so that the kubelet allocates swap to the DaemonSet pod. Without this flag
925+
# the Linux kernel swap device may exist but the kubelet blocks pod-level
926+
# swap usage and the benchmark pod cannot drive swap I/O.
927+
#
928+
# Passed as --system-config-from-file pointing to a temp YAML, which is the
929+
# same mechanism PKB's gke_node_system_config flag uses:
930+
# perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py
931+
swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
932+
system_config_tmp = None
933+
if swap_behavior:
934+
kubelet_yaml = f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\n'
935+
system_config_tmp = tempfile.NamedTemporaryFile(
936+
mode='w', suffix='.yaml', delete=False
937+
)
938+
system_config_tmp.write(kubelet_yaml)
939+
system_config_tmp.flush()
940+
cmd += ['--system-config-from-file', system_config_tmp.name]
941+
logging.info(
942+
'[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)',
943+
swap_behavior,
944+
system_config_tmp.name,
945+
)
946+
906947
logging.info(
907948
'[swap_encryption] Creating benchmark nodepool: %s / %s / '
908949
'image=%s / disk=%dGiB / iops=%d / dmcrypt=%s / lssd=%s / '
909-
'add_swap_disk=%s',
950+
'add_swap_disk=%s / kubelet_swap=%s',
910951
_BENCHMARK_NODEPOOL,
911952
machine_type,
912953
_NODE_IMAGE_TYPE.value,
@@ -915,14 +956,22 @@ def _create_benchmark_node_pool(cluster) -> None:
915956
_ENABLE_DMCRYPT.value,
916957
is_lssd,
917958
_ADD_SWAP_DISK.value,
959+
swap_behavior or 'unset',
918960
)
919961

920962
# LSSD nodepools take longer to provision than PD-only nodepools because
921963
# GKE must also initialise the local NVMe devices before marking nodes Ready.
922964
# 1200 s (20 min) covers observed worst-case times on c4-lssd and n4 configs.
923-
stdout, stderr, rc = vm_util.IssueCommand(
924-
cmd, timeout=1200, raise_on_failure=False
925-
)
965+
try:
966+
stdout, stderr, rc = vm_util.IssueCommand(
967+
cmd, timeout=1200, raise_on_failure=False
968+
)
969+
finally:
970+
if system_config_tmp is not None:
971+
try:
972+
os.unlink(system_config_tmp.name)
973+
except OSError:
974+
pass
926975

927976
if rc != 0:
928977
# Idempotent prepare: if the nodepool already exists (e.g. re-running
@@ -1325,8 +1374,7 @@ def _pod_exec(
13251374
out, err, rc = kubectl.RunKubectlCommand(
13261375
['exec', active, '-n', _DS_NAMESPACE, '--', 'bash', '-c', cmd],
13271376
raise_on_failure=False,
1328-
# Retry loop in _pod_exec handles transient resets.
1329-
raise_on_timeout=False,
1377+
raise_on_timeout=False, # let _pod_exec's own retry loop handle transient resets
13301378
timeout=timeout,
13311379
)
13321380
is_transient = rc != 0 and any(e in err for e in _TRANSIENT_KUBECTL_ERRORS)
@@ -1366,13 +1414,15 @@ def _pod_exec(
13661414
pod_gone = _is_pod_gone(active)
13671415
if pod_gone:
13681416
logging.warning(
1369-
'[swap_encryption] OOM-eviction (rc=137, pod gone) —'
1370-
' recovering pod name (cmd not retried)'
1417+
'[swap_encryption] OOM-eviction detected (rc=137, pod gone) —'
1418+
' recovering pod name for subsequent commands (not retrying this'
1419+
' cmd)'
13711420
)
13721421
else:
13731422
logging.warning(
1374-
'[swap_encryption] OOM-kill (rc=137, pod exists) —'
1375-
' waiting for container restart before continuing'
1423+
'[swap_encryption] Container OOM-killed (rc=137, pod still exists)'
1424+
' — waiting for container restart and tool re-install before'
1425+
' continuing'
13761426
)
13771427
new_pod = _recover_pod(active)
13781428
if new_pod != active:
@@ -1595,12 +1645,10 @@ def _collect_cost_sample(
15951645
instance_type = ''
15961646

15971647
# GCP: machine type is the last segment of the metadata URL value
1598-
_gcp_meta_url = (
1599-
'http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
1600-
)
16011648
gcp_type_out, _ = _pod_exec(
16021649
pod,
1603-
f'curl -s -m 3 --fail {_gcp_meta_url}'
1650+
'curl -s -m 3 --fail'
1651+
' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
16041652
' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
16051653
ignore_failure=True,
16061654
)
@@ -1736,13 +1784,10 @@ def _build_metadata(pod: str, swap_dev: str) -> dict[str, Any]:
17361784
# cloud metadata so that the field is always populated.
17371785
instance_label = _INSTANCE_SIZE_LABEL.value
17381786
if not instance_label:
1739-
_gcp_mt_url = (
1740-
'http://metadata.google.internal'
1741-
'/computeMetadata/v1/instance/machine-type'
1742-
)
17431787
gcp_type_out, _ = _pod_exec(
17441788
pod,
1745-
f'curl -s -m 3 --fail {_gcp_mt_url}'
1789+
'curl -s -m 3 --fail'
1790+
' http://metadata.google.internal/computeMetadata/v1/instance/machine-type'
17461791
' -H "Metadata-Flavor: Google" 2>/dev/null || echo ""',
17471792
ignore_failure=True,
17481793
)
@@ -1815,4 +1860,4 @@ def _ensure_io2_volume() -> None:
18151860
"""
18161861
if _SWAP_TYPE.value != 'io2':
18171862
return
1818-
logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')
1863+
logging.info('[swap_encryption] io2 swap volume provisioning deferred to PR2')

0 commit comments

Comments
 (0)