Skip to content

Commit 06486f2

Browse files
committed
fix(swap_encryption/pr5): EKS nodeadm stub, DaemonSet alignment, formatting
1 parent 6319522 commit 06486f2

2 files changed

Lines changed: 4049 additions & 3290 deletions

File tree

perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2

Lines changed: 48 additions & 162 deletions
Original file line number | Diff line number | Diff line change
@@ -28,127 +28,60 @@ spec:
2828
- bash
2929
- -c
3030
- |
31-
echo "[pkb] Installing benchmark tools..."
32-
# Retry apt-get up to 3 times — transient network failures are
33-
# common on a freshly-started GKE node. Critical tools (fio,
34-
# stress-ng) must be present before we write the ready sentinel;
35-
# a silent || true here would cause /tmp/pkb_ready to appear even
36-
# when tools are missing, breaking all subsequent phases.
31+
echo "[pkb] Installing benchmark measurement tools..."
32+
# Phase 1+2 tools: fio (raw-device I/O), stress-ng (CPU overhead),
33+
# cryptsetup/mdadm (dm-crypt inspection), sysstat (vmstat/pidstat),
34+
# nvme-cli (NVMe telemetry), cgroup-tools (cgroup v1 guard).
35+
# Phase 3b tools: gcc/make/etc. (kernel build inside memory cap).
36+
# Redis/memtier/esrally/opensearch are NOT installed here —
37+
# those workloads run in separate PKB benchmark pods (Phase 3a, 3c)
38+
# per Ajay review comment r3457826290.
3739
PKB_APT_OK=0
3840
for _attempt in 1 2 3; do
3941
apt-get update -qq 2>&1 || true
40-
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\
41-
fio \\
42-
stress-ng \\
43-
sysstat \\
44-
cryptsetup \\
45-
mdadm \\
46-
redis-server \\
47-
redis-tools \\
48-
git \\
49-
wget \\
50-
curl \\
51-
make \\
52-
gcc \\
53-
bc \\
54-
flex \\
55-
bison \\
56-
libelf-dev \\
57-
libssl-dev \\
58-
cgroup-tools \\
59-
nvme-cli \\
60-
util-linux \\
61-
python3-pip \\
62-
libevent-dev \\
63-
libssl-dev \\
64-
libpcre3-dev \\
65-
zlib1g-dev \\
66-
build-essential \\
67-
autoconf \\
68-
automake \\
69-
libtool \\
70-
libtool-bin \\
71-
pkg-config \\
72-
python3-dev \\
73-
default-jre-headless \\
42+
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
43+
fio \
44+
stress-ng \
45+
cryptsetup \
46+
mdadm \
47+
sysstat \
48+
nvme-cli \
49+
cgroup-tools \
50+
util-linux \
51+
gcc \
52+
make \
53+
bc \
54+
flex \
55+
bison \
56+
libelf-dev \
57+
libssl-dev \
7458
2>&1 && PKB_APT_OK=1 && break
7559
echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
7660
sleep 15
7761
done
78-
if [ "$PKB_APT_OK" != "1" ] || \\
79-
! command -v fio >/dev/null 2>&1 || \\
62+
if [ "$PKB_APT_OK" != "1" ] || \
63+
! command -v fio >/dev/null 2>&1 || \
8064
! command -v stress-ng >/dev/null 2>&1; then
81-
echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
65+
echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed" >&2
8266
exit 1
8367
fi
84-
echo "[pkb] Installing memtier_benchmark from source..."
85-
# Pin a stable release tag — building from the moving default
86-
# branch (HEAD) intermittently broke (memtier_benchmark not found
87-
# → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the
88-
# version PKB's memtier package (memtier.MemtierResult.Parse) is
89-
# validated against and builds cleanly with the apt deps above.
90-
# Fall back to HEAD only if the tagged clone fails.
91-
if ! command -v memtier_benchmark >/dev/null 2>&1; then
92-
(cd /tmp && \\
93-
rm -rf memtier_benchmark && \\
94-
( git clone --depth 1 --branch 2.2.1 \\
95-
https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\
96-
git clone --depth 1 \\
97-
https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\
98-
cd memtier_benchmark && \\
99-
autoreconf -ivf 2>&1 && \\
100-
./configure 2>&1 && \\
101-
make -j$(nproc) 2>&1 && \\
102-
make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\
103-
echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
104-
fi
105-
if command -v memtier_benchmark >/dev/null 2>&1; then
106-
echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
107-
fi
108-
echo "[pkb] Installing esrally (lightweight)..."
109-
python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
110-
pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
111-
pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
112-
echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
113-
if command -v esrally >/dev/null 2>&1; then
114-
echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
115-
else
116-
echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
117-
fi
118-
echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
119-
# Phase 3c needs a real search server on :9200. Nothing in apt
120-
# ships one and the pod has no systemd, so install the OpenSearch
121-
# bundle (ships its own JDK) and launch the binary directly in the
122-
# phase. All best-effort: if any step fails the phase probes the
123-
# endpoint and skips cleanly rather than recording fake timings.
124-
if [ ! -x /opt/opensearch/bin/opensearch ]; then
125-
OS_VER=2.15.0
126-
(cd /opt && \\
127-
wget -q --timeout=600 -O os.tgz \\
128-
"https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\
129-
tar -xzf os.tgz && rm -f os.tgz && \\
130-
mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\
131-
echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
132-
fi
133-
if [ -x /opt/opensearch/bin/opensearch ]; then
134-
# pkbos owns and runs OpenSearch (it refuses to run as root).
135-
# Give it a home so HOME/temp paths are writable.
136-
id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
137-
printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\
138-
> /opt/opensearch/config/opensearch.yml
139-
mkdir -p /opt/opensearch/config/jvm.options.d
140-
# 2 GB heap: 512 MB was too small and OpenSearch aborted early.
141-
# On a 252 GB node this still leaves plenty of page cache to
142-
# pressure into swap during the phase.
143-
printf -- '-Xms2g\\n-Xmx2g\\n' \\
144-
> /opt/opensearch/config/jvm.options.d/pkb-heap.options
145-
sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
146-
# CRITICAL: never run the binary as root here (it bails and
147-
# leaves root-owned files in logs/ that block the pkbos server).
148-
# Clear any stale logs and chown everything to pkbos LAST.
149-
rm -f /opt/opensearch/logs/* 2>/dev/null || true
150-
chown -R pkbos /opt/opensearch 2>/dev/null || true
151-
echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
68+
echo "[pkb] fio: $(fio --version 2>&1 | head -1)"
69+
echo "[pkb] stress-ng: $(stress-ng --version 2>&1 | head -1)"
70+
echo "[pkb] Verifying swap device is active..."
71+
PKB_SWAP_FOUND=0
72+
for _attempt in $(seq 1 30); do
73+
if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then
74+
PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps)
75+
echo "[pkb] Swap device active: $PKB_SWAP_DEV"
76+
PKB_SWAP_FOUND=1
77+
break
78+
fi
79+
echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2
80+
sleep 5
81+
done
82+
if [ "$PKB_SWAP_FOUND" != "1" ]; then
83+
echo "[pkb] WARNING: no active swap device after 150s — " \
84+
"check linuxConfig.swapConfig / kubelet swap config." >&2
15285
fi
15386
echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
15487
PKB_KVER="{{ kernel_version }}"
@@ -158,62 +91,15 @@ spec:
15891
PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
15992
mkdir -p "$PKB_KROOT"
16093
if [ ! -f "$PKB_KTARBALL" ]; then
161-
wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\
94+
wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \
16295
echo "[pkb] WARNING: kernel tarball download failed" >&2
16396
fi
16497
if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
165-
echo "[pkb] Extracting kernel source (xz)..."
166-
tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
98+
echo "[pkb] Extracting kernel source (xz, may take ~60 s)..."
99+
tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \
167100
echo "[pkb] WARNING: kernel source extraction failed" >&2
168101
fi
169-
echo "[pkb] Unlocking container cgroup swap limits..."
170-
# GKE cgroup v2 sets memory.swap.max=0 per-container, which
171-
# prevents swap usage even when the node has a swap device and
172-
# vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because
173-
# the kernel can't page out to swap for this cgroup.
174-
#
175-
# NOTE: the old approach derived the cgroup path from
176-
# /proc/self/cgroup, but inside a cgroup namespace that reports
177-
# "0::/" — so the write targeted the host ROOT cgroup, silently
178-
# no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
179-
# /sys is the host cgroup tree (hostPath mount) and this pod is
180-
# privileged, so instead unlock swap across the entire kubepods
181-
# hierarchy, which is guaranteed to contain our own container.
182-
if [ -d /sys/fs/cgroup/kubepods.slice ] || \
183-
[ -d /sys/fs/cgroup/kubepods ]; then
184-
# cgroup v2: write 'max' to every memory.swap.max under kubepods*.
185-
find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
186-
2>/dev/null | while read -r _f; do
187-
echo max > "$_f" 2>/dev/null || true
188-
done
189-
fi
190-
# Best-effort: our own namespaced path and the unified root.
191-
PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
192-
2>/dev/null)
193-
for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
194-
/sys/fs/cgroup/memory.swap.max; do
195-
[ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
196-
done
197-
# cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
198-
find /sys/fs/cgroup/memory -path '*kubepods*' \
199-
-name memory.memsw.limit_in_bytes 2>/dev/null \
200-
| while read -r _f; do
201-
echo -1 > "$_f" 2>/dev/null || true
202-
done
203-
# Verify and surface the result in the pod log. grep -L lists
204-
# files that do NOT contain 'max' on their first line, i.e. ones
205-
# still capping swap.
206-
PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
207-
-name memory.swap.max 2>/dev/null \
208-
| xargs -r grep -L '^max' 2>/dev/null | head -1)
209-
if [ -n "$PKB_STILL_CAPPED" ]; then
210-
echo "[pkb] WARNING: cgroup swap still capped at \
211-
$PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
212-
OOM-killed before swap is exercised" >&2
213-
else
214-
echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
215-
fi
216-
echo "[pkb] Tools installed. Writing ready sentinel."
102+
echo "[pkb] Benchmark tools ready. Writing ready sentinel."
217103
touch /tmp/pkb_ready
218104
sleep infinity
219105
securityContext:

0 commit comments

Comments
 (0)