Skip to content

Commit 805dfbe

Browse files
committed
fix(swap_encryption/pr2): remove duplicate FLAGS, cgroup hack; add EKS nodeadm stub
1 parent f52c931 commit 805dfbe

2 files changed

Lines changed: 2393 additions & 2020 deletions

File tree

perfkitbenchmarker/data/cluster/swap_encryption_daemonset.yaml.j2

Lines changed: 29 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -28,192 +28,46 @@ spec:
2828
- bash
2929
- -c
3030
- |
31-
echo "[pkb] Installing benchmark tools..."
32-
# Retry apt-get up to 3 times — transient network failures are
33-
# common on a freshly-started GKE node. Critical tools (fio,
34-
# stress-ng) must be present before we write the ready sentinel;
35-
# a silent || true here would cause /tmp/pkb_ready to appear even
36-
# when tools are missing, breaking all subsequent phases.
31+
echo "[pkb] Installing measurement tools..."
32+
# Only the tools needed for Phase 1 (raw-device fio) and Phase 2
33+
# (CPU/I/O overhead) are installed here. Workload benchmarks
34+
# (redis, opensearch, kernel-build) run in separate pods via
35+
# existing PKB benchmark modules and are NOT installed here.
3736
PKB_APT_OK=0
3837
for _attempt in 1 2 3; do
3938
apt-get update -qq 2>&1 || true
40-
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\
41-
fio \\
42-
stress-ng \\
43-
sysstat \\
44-
cryptsetup \\
45-
mdadm \\
46-
redis-server \\
47-
redis-tools \\
48-
git \\
49-
wget \\
50-
curl \\
51-
make \\
52-
gcc \\
53-
bc \\
54-
flex \\
55-
bison \\
56-
libelf-dev \\
57-
libssl-dev \\
58-
cgroup-tools \\
59-
nvme-cli \\
60-
util-linux \\
61-
python3-pip \\
62-
libevent-dev \\
63-
libssl-dev \\
64-
libpcre3-dev \\
65-
zlib1g-dev \\
66-
build-essential \\
67-
autoconf \\
68-
automake \\
69-
libtool \\
70-
libtool-bin \\
71-
pkg-config \\
72-
python3-dev \\
73-
default-jre-headless \\
39+
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
40+
fio \
41+
cryptsetup \
42+
mdadm \
43+
sysstat \
44+
nvme-cli \
7445
2>&1 && PKB_APT_OK=1 && break
7546
echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
7647
sleep 15
7748
done
78-
if [ "$PKB_APT_OK" != "1" ] || \\
79-
! command -v fio >/dev/null 2>&1 || \\
80-
! command -v stress-ng >/dev/null 2>&1; then
81-
echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
49+
if [ "$PKB_APT_OK" != "1" ] || ! command -v fio >/dev/null 2>&1; then
50+
echo "[pkb] FATAL: fio not installed after 3 attempts" >&2
8251
exit 1
8352
fi
84-
echo "[pkb] Installing memtier_benchmark from source..."
85-
# Pin a stable release tag — building from the moving default
86-
# branch (HEAD) intermittently broke (memtier_benchmark not found
87-
# → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the
88-
# version PKB's memtier package (memtier.MemtierResult.Parse) is
89-
# validated against and builds cleanly with the apt deps above.
90-
# Fall back to HEAD only if the tagged clone fails.
91-
if ! command -v memtier_benchmark >/dev/null 2>&1; then
92-
(cd /tmp && \\
93-
rm -rf memtier_benchmark && \\
94-
( git clone --depth 1 --branch 2.2.1 \\
95-
https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\
96-
git clone --depth 1 \\
97-
https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\
98-
cd memtier_benchmark && \\
99-
autoreconf -ivf 2>&1 && \\
100-
./configure 2>&1 && \\
101-
make -j$(nproc) 2>&1 && \\
102-
make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\
103-
echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
104-
fi
105-
if command -v memtier_benchmark >/dev/null 2>&1; then
106-
echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
107-
fi
108-
echo "[pkb] Installing esrally (lightweight)..."
109-
python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
110-
pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
111-
pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
112-
echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
113-
if command -v esrally >/dev/null 2>&1; then
114-
echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
115-
else
116-
echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
117-
fi
118-
echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
119-
# Phase 3c needs a real search server on :9200. Nothing in apt
120-
# ships one and the pod has no systemd, so install the OpenSearch
121-
# bundle (ships its own JDK) and launch the binary directly in the
122-
# phase. All best-effort: if any step fails the phase probes the
123-
# endpoint and skips cleanly rather than recording fake timings.
124-
if [ ! -x /opt/opensearch/bin/opensearch ]; then
125-
OS_VER=2.15.0
126-
(cd /opt && \\
127-
wget -q --timeout=600 -O os.tgz \\
128-
"https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\
129-
tar -xzf os.tgz && rm -f os.tgz && \\
130-
mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\
131-
echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
132-
fi
133-
if [ -x /opt/opensearch/bin/opensearch ]; then
134-
# pkbos owns and runs OpenSearch (it refuses to run as root).
135-
# Give it a home so HOME/temp paths are writable.
136-
id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
137-
printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\
138-
> /opt/opensearch/config/opensearch.yml
139-
mkdir -p /opt/opensearch/config/jvm.options.d
140-
# 2 GB heap: 512 MB was too small and OpenSearch aborted early.
141-
# On a 252 GB node this still leaves plenty of page cache to
142-
# pressure into swap during the phase.
143-
printf -- '-Xms2g\\n-Xmx2g\\n' \\
144-
> /opt/opensearch/config/jvm.options.d/pkb-heap.options
145-
sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
146-
# CRITICAL: never run the binary as root here (it bails and
147-
# leaves root-owned files in logs/ that block the pkbos server).
148-
# Clear any stale logs and chown everything to pkbos LAST.
149-
rm -f /opt/opensearch/logs/* 2>/dev/null || true
150-
chown -R pkbos /opt/opensearch 2>/dev/null || true
151-
echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
152-
fi
153-
echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
154-
PKB_KVER="{{ kernel_version }}"
155-
PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
156-
PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
157-
PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
158-
PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
159-
mkdir -p "$PKB_KROOT"
160-
if [ ! -f "$PKB_KTARBALL" ]; then
161-
wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\
162-
echo "[pkb] WARNING: kernel tarball download failed" >&2
163-
fi
164-
if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
165-
echo "[pkb] Extracting kernel source (xz)..."
166-
tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
167-
echo "[pkb] WARNING: kernel source extraction failed" >&2
168-
fi
169-
echo "[pkb] Unlocking container cgroup swap limits..."
170-
# GKE cgroup v2 sets memory.swap.max=0 per-container, which
171-
# prevents swap usage even when the node has a swap device and
172-
# vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because
173-
# the kernel can't page out to swap for this cgroup.
174-
#
175-
# NOTE: the old approach derived the cgroup path from
176-
# /proc/self/cgroup, but inside a cgroup namespace that reports
177-
# "0::/" — so the write targeted the host ROOT cgroup, silently
178-
# no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
179-
# /sys is the host cgroup tree (hostPath mount) and this pod is
180-
# privileged, so instead unlock swap across the entire kubepods
181-
# hierarchy, which is guaranteed to contain our own container.
182-
if [ -d /sys/fs/cgroup/kubepods.slice ] || \
183-
[ -d /sys/fs/cgroup/kubepods ]; then
184-
# cgroup v2: write 'max' to every memory.swap.max under kubepods*.
185-
find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
186-
2>/dev/null | while read -r _f; do
187-
echo max > "$_f" 2>/dev/null || true
188-
done
189-
fi
190-
# Best-effort: our own namespaced path and the unified root.
191-
PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
192-
2>/dev/null)
193-
for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
194-
/sys/fs/cgroup/memory.swap.max; do
195-
[ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
53+
echo "[pkb] fio installed: $(fio --version 2>&1 | head -1)"
54+
echo "[pkb] Verifying swap device is active..."
55+
PKB_SWAP_FOUND=0
56+
for _attempt in $(seq 1 30); do
57+
if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then
58+
PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps)
59+
echo "[pkb] Swap device active: $PKB_SWAP_DEV"
60+
PKB_SWAP_FOUND=1
61+
break
62+
fi
63+
echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2
64+
sleep 5
19665
done
197-
# cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
198-
find /sys/fs/cgroup/memory -path '*kubepods*' \
199-
-name memory.memsw.limit_in_bytes 2>/dev/null \
200-
| while read -r _f; do
201-
echo -1 > "$_f" 2>/dev/null || true
202-
done
203-
# Verify and surface the result in the pod log. grep -L lists
204-
# files that do NOT contain 'max' on their first line, i.e. ones
205-
# still capping swap.
206-
PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
207-
-name memory.swap.max 2>/dev/null \
208-
| xargs -r grep -L '^max' 2>/dev/null | head -1)
209-
if [ -n "$PKB_STILL_CAPPED" ]; then
210-
echo "[pkb] WARNING: cgroup swap still capped at \
211-
$PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
212-
OOM-killed before swap is exercised" >&2
213-
else
214-
echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
66+
if [ "$PKB_SWAP_FOUND" != "1" ]; then
67+
echo "[pkb] WARNING: no active swap device after 150s — " \
68+
"check linuxConfig.swapConfig / kubelet swap config." >&2
21569
fi
216-
echo "[pkb] Tools installed. Writing ready sentinel."
70+
echo "[pkb] Measurement tools ready. Writing ready sentinel."
21771
touch /tmp/pkb_ready
21872
sleep infinity
21973
securityContext:

0 commit comments

Comments
 (0)