Skip to content

Commit 2c7a677

Browse files
fix(swap_encryption): add linuxConfig.swapConfig to system-config and remove cgroup hack
Address Ajay's review comments on PR GoogleCloudPlatform#6776. Comment #r3457877984 (linuxConfig.swapConfig): extend the --system-config-from-file YAML with linuxConfig blocks: (1) linuxConfig.swapConfig.enabled: true -- GKE sets up node-level swap; (2) dedicatedLocalSsdProfile.diskCount: N -- on LSSD machines, use the local NVMe for swap; (3) linuxConfig.sysctl: vm.swappiness=100, vm.min_free_kbytes=200, vm.watermark_scale_factor=500. Ref: https://cloud.google.com/kubernetes-engine/docs/how-to/node-memory-swap Comment #r3457928855 (cgroup hack): remove the memory.swap.max=max loop from swap_encryption_daemonset.yaml.j2. With kubeletConfig.memorySwapBehavior=LimitedSwap the kubelet manages per-container swap allocation, so the cgroup hack is unnecessary.
1 parent bbdd15d commit 2c7a677

2 files changed

Lines changed: 42 additions & 49 deletions

File tree

perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2

Lines changed: 10 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -166,53 +166,15 @@ spec:
166166
tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
167167
echo "[pkb] WARNING: kernel source extraction failed" >&2
168168
fi
169-
echo "[pkb] Unlocking container cgroup swap limits..."
170-
# GKE cgroup v2 sets memory.swap.max=0 per-container, which
171-
# prevents swap usage even when the node has a swap device and
172-
# vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because
173-
# the kernel can't page out to swap for this cgroup.
174-
#
175-
# NOTE: the old approach derived the cgroup path from
176-
# /proc/self/cgroup, but inside a cgroup namespace that reports
177-
# "0::/" — so the write targeted the host ROOT cgroup, silently
178-
# no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
179-
# /sys is the host cgroup tree (hostPath mount) and this pod is
180-
# privileged, so instead unlock swap across the entire kubepods
181-
# hierarchy, which is guaranteed to contain our own container.
182-
if [ -d /sys/fs/cgroup/kubepods.slice ] || \
183-
[ -d /sys/fs/cgroup/kubepods ]; then
184-
# cgroup v2: write 'max' to every memory.swap.max under kubepods*.
185-
find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
186-
2>/dev/null | while read -r _f; do
187-
echo max > "$_f" 2>/dev/null || true
188-
done
189-
fi
190-
# Best-effort: our own namespaced path and the unified root.
191-
PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
192-
2>/dev/null)
193-
for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
194-
/sys/fs/cgroup/memory.swap.max; do
195-
[ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
196-
done
197-
# cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
198-
find /sys/fs/cgroup/memory -path '*kubepods*' \
199-
-name memory.memsw.limit_in_bytes 2>/dev/null \
200-
| while read -r _f; do
201-
echo -1 > "$_f" 2>/dev/null || true
202-
done
203-
# Verify and surface the result in the pod log. grep -L lists
204-
# files that do NOT contain 'max' on their first line, i.e. ones
205-
# still capping swap.
206-
PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
207-
-name memory.swap.max 2>/dev/null \
208-
| xargs -r grep -L '^max' 2>/dev/null | head -1)
209-
if [ -n "$PKB_STILL_CAPPED" ]; then
210-
echo "[pkb] WARNING: cgroup swap still capped at \
211-
$PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
212-
OOM-killed before swap is exercised" >&2
213-
else
214-
echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
215-
fi
169+
# Container cgroup swap limits are managed by the kubelet when
170+
# kubeletConfig.memorySwapBehavior=LimitedSwap is set via
171+
# --system-config-from-file (GKE) or kubelet-config.json (EKS).
172+
# Manually writing memory.swap.max=max across kubepods is not
173+
# required and is superseded by the kubelet swap config.
174+
# Reference: Ajay's review comment go/pkb-swap-encryption-pr1
175+
# #r3457928855 — https://github.com/GoogleCloudPlatform/
176+
# PerfKitBenchmarker/pull/6776#discussion_r3457928855
177+
echo "[pkb] Swap limits managed by kubelet (LimitedSwap config)."
216178
echo "[pkb] Tools installed. Writing ready sentinel."
217179
touch /tmp/pkb_ready
218180
sleep infinity
@@ -264,3 +226,4 @@ spec:
264226
hostPath:
265227
path: /lib/modules
266228
type: Directory
229+

perfkitbenchmarker/linux_benchmarks/swap_encryption_benchmark.py

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -923,17 +923,47 @@ def _create_benchmark_node_pool(cluster) -> None:
923923
swap_behavior = _GKE_KUBELET_MEMORY_SWAP.value
924924
system_config_tmp = None
925925
if swap_behavior:
926-
kubelet_yaml = f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\n'
926+
# Build system-config YAML for --system-config-from-file.
927+
# Per Ajay's review (go/pkb-swap-encryption-pr1 #r3457877984):
928+
# kubeletConfig.memorySwapBehavior: kubelet allocates swap to pods.
929+
# linuxConfig.swapConfig: GKE enables node-level swap device.
930+
# For LSSD machines, dedicatedLocalSsdProfile tells GKE to use
931+
# the local NVMe as the swap device (avoids boot-disk overhead).
932+
# linuxConfig.sysctl: swap aggressiveness tuning so the benchmark
933+
# workloads can drive sustained swap I/O.
934+
# Reference:
935+
# https://docs.cloud.google.com/kubernetes-engine/docs/how-to/
936+
# node-memory-swap#enable
937+
if is_lssd:
938+
swap_config_block = (
939+
' swapConfig:\n'
940+
' enabled: true\n'
941+
' dedicatedLocalSsdProfile:\n'
942+
f' diskCount: {_LSSD_COUNT.value}\n'
943+
)
944+
else:
945+
swap_config_block = ' swapConfig:\n enabled: true\n'
946+
kubelet_yaml = (
947+
f'kubeletConfig:\n memorySwapBehavior: {swap_behavior}\nlinuxConfig:\n'
948+
+ swap_config_block
949+
+ ' sysctl:\n'
950+
' vm.min_free_kbytes: 200\n'
951+
' vm.watermark_scale_factor: 500\n'
952+
' vm.swappiness: 100\n'
953+
)
927954
system_config_tmp = tempfile.NamedTemporaryFile(
928955
mode='w', suffix='.yaml', delete=False
929956
)
930957
system_config_tmp.write(kubelet_yaml)
931958
system_config_tmp.flush()
932959
cmd.flags['system-config-from-file'] = system_config_tmp.name
933960
logging.info(
934-
'[swap_encryption] kubeletConfig.memorySwapBehavior=%s (written to %s)',
961+
'[swap_encryption] system-config-from-file: '
962+
'kubelet_swap=%s lssd=%s (written to %s):\n%s',
935963
swap_behavior,
964+
is_lssd,
936965
system_config_tmp.name,
966+
kubelet_yaml,
937967
)
938968

939969
logging.info(

0 commit comments

Comments
 (0)