@@ -28,192 +28,46 @@ spec:
2828 - bash
2929 - -c
3030 - |
31- echo "[pkb] Installing benchmark tools..."
32- # Retry apt-get up to 3 times — transient network failures are
33- # common on a freshly-started GKE node. Critical tools (fio,
34- # stress-ng) must be present before we write the ready sentinel;
35- # a silent || true here would cause /tmp/pkb_ready to appear even
36- # when tools are missing, breaking all subsequent phases.
31+ echo "[pkb] Installing measurement tools..."
32+ # Only the tools needed for Phase 1 (raw-device fio) and Phase 2
33+ # (CPU/I/O overhead) are installed here. Workload benchmarks
34+ # (redis, opensearch, kernel-build) run in separate pods via
35+ # existing PKB benchmark modules and are NOT installed here.
3736 PKB_APT_OK=0
3837 for _attempt in 1 2 3; do
3938 apt-get update -qq 2>&1 || true
40- DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \\
41- fio \\
42- stress-ng \\
43- sysstat \\
44- cryptsetup \\
45- mdadm \\
46- redis-server \\
47- redis-tools \\
48- git \\
49- wget \\
50- curl \\
51- make \\
52- gcc \\
53- bc \\
54- flex \\
55- bison \\
56- libelf-dev \\
57- libssl-dev \\
58- cgroup-tools \\
59- nvme-cli \\
60- util-linux \\
61- python3-pip \\
62- libevent-dev \\
63- libssl-dev \\
64- libpcre3-dev \\
65- zlib1g-dev \\
66- build-essential \\
67- autoconf \\
68- automake \\
69- libtool \\
70- libtool-bin \\
71- pkg-config \\
72- python3-dev \\
73- default-jre-headless \\
39+ DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
40+ fio \
41+ cryptsetup \
42+ mdadm \
43+ sysstat \
44+ nvme-cli \
7445 2>&1 && PKB_APT_OK=1 && break
7546 echo "[pkb] apt-get attempt $_attempt failed, retrying in 15s..." >&2
7647 sleep 15
7748 done
78- if [ "$PKB_APT_OK" != "1" ] || \\
79- ! command -v fio >/dev/null 2>&1 || \\
80- ! command -v stress-ng >/dev/null 2>&1 ; then
81- echo "[pkb] FATAL: critical tools (fio, stress-ng) not installed after 3 attempts" >&2
49+ if [ "$PKB_APT_OK" != "1" ] || ! command -v fio >/dev/null 2>&1 ; then
50+ echo "[pkb] FATAL: fio not installed after 3 attempts" >&2
8251 exit 1
8352 fi
84- echo "[pkb] Installing memtier_benchmark from source..."
85- # Pin a stable release tag — building from the moving default
86- # branch (HEAD) intermittently broke (memtier_benchmark not found
87- # → Phase 3a lost its P50/P90/P99 latency). 2.2.1 matches the
88- # version PKB's memtier package (memtier.MemtierResult.Parse) is
89- # validated against and builds cleanly with the apt deps above.
90- # Fall back to HEAD only if the tagged clone fails.
91- if ! command -v memtier_benchmark >/dev/null 2>&1 ; then
92- (cd /tmp && \\
93- rm -rf memtier_benchmark && \\
94- ( git clone --depth 1 --branch 2.2.1 \\
95- https://github.com/RedisLabs/memtier_benchmark.git 2>&1 || \\
96- git clone --depth 1 \\
97- https://github.com/RedisLabs/memtier_benchmark.git 2>&1 ) && \\
98- cd memtier_benchmark && \\
99- autoreconf -ivf 2>&1 && \\
100- ./configure 2>&1 && \\
101- make -j$(nproc) 2>&1 && \\
102- make install 2>&1) > /tmp/pkb_memtier_build.log 2>&1 || \\
103- echo "[pkb] WARNING: memtier_benchmark build failed (see /tmp/pkb_memtier_build.log); redis-benchmark fallback will be used"
104- fi
105- if command -v memtier_benchmark >/dev/null 2>&1 ; then
106- echo "[pkb] memtier_benchmark installed: $(memtier_benchmark --version 2>&1 | head -1)"
107- fi
108- echo "[pkb] Installing esrally (lightweight)..."
109- python3 -m pip install --upgrade --break-system-packages pip setuptools wheel > /tmp/pkb_esrally_build.log 2>&1 || true
110- pip3 install --break-system-packages elastic-transport esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
111- pip3 install --break-system-packages esrally >> /tmp/pkb_esrally_build.log 2>&1 || \\
112- echo "[pkb] WARNING: esrally install failed (see /tmp/pkb_esrally_build.log); opensearch curl fallback will be used"
113- if command -v esrally >/dev/null 2>&1 ; then
114- echo "[pkb] esrally installed: $(esrally --version 2>&1 | head -1)"
115- else
116- echo "[pkb] WARNING: esrally binary not on PATH after install; opensearch curl fallback will be used" >&2
117- fi
118- echo "[pkb] Installing OpenSearch (single-node, security off) for Phase 3c..."
119- # Phase 3c needs a real search server on :9200. Nothing in apt
120- # ships one and the pod has no systemd, so install the OpenSearch
121- # bundle (ships its own JDK) and launch the binary directly in the
122- # phase. All best-effort: if any step fails the phase probes the
123- # endpoint and skips cleanly rather than recording fake timings.
124- if [ ! -x /opt/opensearch/bin/opensearch ]; then
125- OS_VER=2.15.0
126- (cd /opt && \\
127- wget -q --timeout=600 -O os.tgz \\
128- "https://artifacts.opensearch.org/releases/bundle/opensearch/$OS_VER/opensearch-$OS_VER-linux-x64.tar.gz" && \\
129- tar -xzf os.tgz && rm -f os.tgz && \\
130- mv "opensearch-$OS_VER" opensearch) > /tmp/pkb_opensearch_build.log 2>&1 || \\
131- echo "[pkb] WARNING: OpenSearch download/extract failed (see /tmp/pkb_opensearch_build.log); Phase 3c will skip" >&2
132- fi
133- if [ -x /opt/opensearch/bin/opensearch ]; then
134- # pkbos owns and runs OpenSearch (it refuses to run as root).
135- # Give it a home so HOME/temp paths are writable.
136- id pkbos >/dev/null 2>&1 || useradd -r -d /opt/opensearch -s /bin/bash pkbos 2>/dev/null || true
137- printf 'discovery.type: single-node\\nnetwork.host: 127.0.0.1\\nplugins.security.disabled: true\\n' \\
138- > /opt/opensearch/config/opensearch.yml
139- mkdir -p /opt/opensearch/config/jvm.options.d
140- # 2 GB heap: 512 MB was too small and OpenSearch aborted early.
141- # On a 252 GB node this still leaves plenty of page cache to
142- # pressure into swap during the phase.
143- printf -- '-Xms2g\\n-Xmx2g\\n' \\
144- > /opt/opensearch/config/jvm.options.d/pkb-heap.options
145- sysctl -w vm.max_map_count=262144 >/dev/null 2>&1 || true
146- # CRITICAL: never run the binary as root here (it bails and
147- # leaves root-owned files in logs/ that block the pkbos server).
148- # Clear any stale logs and chown everything to pkbos LAST.
149- rm -f /opt/opensearch/logs/* 2>/dev/null || true
150- chown -R pkbos /opt/opensearch 2>/dev/null || true
151- echo "[pkb] OpenSearch installed at /opt/opensearch (heap 2g, runs as pkbos)"
152- fi
153- echo "[pkb] Pre-fetching kernel source for Phase 3b build workload..."
154- PKB_KVER="{{ kernel_version }}"
155- PKB_KROOT="/mnt/stateful_partition/pkb_kernel"
156- PKB_KTARBALL="$PKB_KROOT/linux-$PKB_KVER.tar.xz"
157- PKB_KSRC="$PKB_KROOT/linux-$PKB_KVER"
158- PKB_KURL="https://cdn.kernel.org/pub/linux/kernel/v${PKB_KVER%%.*}.x/linux-$PKB_KVER.tar.xz"
159- mkdir -p "$PKB_KROOT"
160- if [ ! -f "$PKB_KTARBALL" ]; then
161- wget -q --timeout=300 -O "$PKB_KTARBALL" "$PKB_KURL" 2>&1 || \\
162- echo "[pkb] WARNING: kernel tarball download failed" >&2
163- fi
164- if [ -f "$PKB_KTARBALL" ] && [ ! -d "$PKB_KSRC" ]; then
165- echo "[pkb] Extracting kernel source (xz)..."
166- tar -xf "$PKB_KTARBALL" -C "$PKB_KROOT" 2>&1 || \\
167- echo "[pkb] WARNING: kernel source extraction failed" >&2
168- fi
169- echo "[pkb] Unlocking container cgroup swap limits..."
170- # GKE cgroup v2 sets memory.swap.max=0 per-container, which
171- # prevents swap usage even when the node has a swap device and
172- # vm.swappiness>0. Stress-ng gets OOM-killed in ~15s because
173- # the kernel can't page out to swap for this cgroup.
174- #
175- # NOTE: the old approach derived the cgroup path from
176- # /proc/self/cgroup, but inside a cgroup namespace that reports
177- # "0::/" — so the write targeted the host ROOT cgroup, silently
178- # no-op'd, and swap stayed locked (the OOM-in-15s symptom above).
179- # /sys is the host cgroup tree (hostPath mount) and this pod is
180- # privileged, so instead unlock swap across the entire kubepods
181- # hierarchy, which is guaranteed to contain our own container.
182- if [ -d /sys/fs/cgroup/kubepods.slice ] || \
183- [ -d /sys/fs/cgroup/kubepods ]; then
184- # cgroup v2: write 'max' to every memory.swap.max under kubepods*.
185- find /sys/fs/cgroup -path '*kubepods*' -name memory.swap.max \
186- 2>/dev/null | while read -r _f; do
187- echo max > "$_f" 2>/dev/null || true
188- done
189- fi
190- # Best-effort: our own namespaced path and the unified root.
191- PKB_CG=$(awk -F: '$2==""{print $3; exit}' /proc/self/cgroup \
192- 2>/dev/null)
193- for _cgf in "/sys/fs/cgroup${PKB_CG}/memory.swap.max" \
194- /sys/fs/cgroup/memory.swap.max; do
195- [ -f "$_cgf" ] && { echo max > "$_cgf" 2>/dev/null || true; }
53+ echo "[pkb] fio installed: $(fio --version 2>&1 | head -1)"
54+ echo "[pkb] Verifying swap device is active..."
55+ PKB_SWAP_FOUND=0
56+ for _attempt in $(seq 1 30); do
57+ if awk 'NR>1{found=1} END{exit !found}' /proc/swaps 2>/dev/null; then
58+ PKB_SWAP_DEV=$(awk 'NR==2{print $1}' /proc/swaps)
59+ echo "[pkb] Swap device active: $PKB_SWAP_DEV"
60+ PKB_SWAP_FOUND=1
61+ break
62+ fi
63+ echo "[pkb] Waiting for swap device (attempt $_attempt/30)..." >&2
64+ sleep 5
19665 done
197- # cgroup v1 fallback: lift the combined RAM+swap hard ceiling.
198- find /sys/fs/cgroup/memory -path '*kubepods*' \
199- -name memory.memsw.limit_in_bytes 2>/dev/null \
200- | while read -r _f; do
201- echo -1 > "$_f" 2>/dev/null || true
202- done
203- # Verify and surface the result in the pod log. grep -L lists
204- # files that do NOT contain 'max' on their first line, i.e. ones
205- # still capping swap.
206- PKB_STILL_CAPPED=$(find /sys/fs/cgroup -path '*kubepods*' \
207- -name memory.swap.max 2>/dev/null \
208- | xargs -r grep -L '^max' 2>/dev/null | head -1)
209- if [ -n "$PKB_STILL_CAPPED" ]; then
210- echo "[pkb] WARNING: cgroup swap still capped at \
211- $PKB_STILL_CAPPED=$(cat "$PKB_STILL_CAPPED" 2>/dev/null) — stress-ng may be \
212- OOM-killed before swap is exercised" >&2
213- else
214- echo "[pkb] cgroup swap unlocked (memory.swap.max=max across kubepods)"
66+ if [ "$PKB_SWAP_FOUND" != "1" ]; then
67+ echo "[pkb] WARNING: no active swap device after 150s — " \
68+ "check linuxConfig.swapConfig / kubelet swap config." >&2
21569 fi
216- echo "[pkb] Tools installed . Writing ready sentinel."
70+ echo "[pkb] Measurement tools ready . Writing ready sentinel."
21771 touch /tmp/pkb_ready
21872 sleep infinity
21973 securityContext:
0 commit comments