|
| 1 | +#!/usr/bin/env bash |
| 2 | +# Mac M4 review aid for PR-E1c (kv_live_bytes reporting fix). |
| 3 | +# |
| 4 | +# This PR closes the GetSessionInfo.kv_live_bytes=0 reporting bug |
| 5 | +# PR-E1b's 4-hour bench surfaced. The Linux unit gate exercises the |
| 6 | +# coordinator-level slab-write-through against a deterministic |
| 7 | +# FakeVerifier. The Mac M4 review here adds two further checks: |
| 8 | +# |
| 9 | +# 1. The CPU verifier's kv_live_bytes accessor against real |
| 10 | +# Qwen3-0.6B numerics — non-zero, plateaus at sink+window |
| 11 | +# capacity, equals k_seq_length × per-token bytes. |
| 12 | +# 2. A short (5-min) gRPC bench run that confirms |
| 13 | +# GetSessionInfo.kv_live_bytes is no longer 0 over the wire. |
| 14 | +# |
| 15 | +# Produces 2 primary artifacts (a .junit.xml and a .server.log are written alongside them): |
| 16 | +# |
| 17 | +# results/platform-tests/pr-e1c-mac-verifier-tests-<unix>.json |
| 18 | +# pytest tests/core/test_verifier.py + tests/backends/mlx/test_verifier.py |
| 19 | +# (the kv_live_bytes-related tests + INV-1 baseline). |
| 20 | +# |
| 21 | +# results/platform-tests/pr-e1c-mac-bench-session-5min-<unix>.json |
| 22 | +# bench_session_long_run.py @ 300s. Purpose: visually confirm |
| 23 | +# kv_live_bytes goes 0 -> capped multi-MB once cache hits |
| 24 | +# sink+window. Expected: kv_bounded=True, prefill_bounded=True, |
| 25 | +# min/mean/max kv_live_bytes all > 0. |
| 26 | +# |
| 27 | +# Usage (from repo root, on Mac M4): |
| 28 | +# |
| 29 | +# bash scripts/review_pr_e1c_on_mac.sh |
| 30 | +# |
| 31 | +# Then commit: |
| 32 | +# |
| 33 | +# git add results/platform-tests/pr-e1c-mac-* |
| 34 | +# git commit -m "Mac M4 review evidence for PR-E1c" |
| 35 | +# git push |
| 36 | + |
# Strict mode: abort on a failing command, on use of an unset variable, and
# on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Work from the parent of this script's directory so the relative paths used
# below resolve regardless of the caller's working directory.
script_dir="$(dirname "$0")"
ROOT="$(cd "${script_dir}/.." && pwd)"
cd "$ROOT"

# The Unix timestamp is embedded in every artifact file name written below.
out_dir="results/platform-tests"
stamp="$(date +%s)"
mkdir -p "$out_dir"
| 45 | + |
# --- Part 1: verifier-level tests -----------------------------------------
# Both output paths share one stem; the JUnit XML is pytest's raw output and
# the JSON is the condensed summary derived from it afterwards.
verif_stem="$out_dir/pr-e1c-mac-verifier-tests-${stamp}"
verif_junit="${verif_stem}.junit.xml"
verif_report="${verif_stem}.json"

# Test files, the -k selection expression, and the reporting flags.
pytest_args=(
  tests/core/test_verifier.py
  tests/backends/mlx/test_verifier.py
  -k "kv_live_bytes or k_seq_length or cache_inspector"
  --junitxml="$verif_junit"
  -v
)

printf '%s\n' "==> CPU + MLX verifier tests covering kv_live_bytes (PR-E1c)"
# Under `set -e` a non-zero pytest exit status stops the script here.
PYTHONPATH=.:sdks/python python3 -m pytest "${pytest_args[@]}"
| 57 | + |
# Condense the JUnit XML into a small JSON summary: host details plus the
# test/failure/error/skip totals summed over every <testsuite> element.
PYTHONPATH=.:sdks/python python3 - "$verif_junit" "$verif_report" <<'PY'
# argv[1]: JUnit XML file to read; argv[2]: JSON report file to write.
import json
import platform
import sys
import xml.etree.ElementTree as ET

junit_path, out_path = sys.argv[1:3]
suites = list(ET.parse(junit_path).getroot().iter("testsuite"))


def total(attr):
    # Sum one integer attribute across all suites; a missing attribute is 0.
    return sum(int(suite.get(attr, "0")) for suite in suites)


host = {
    "platform": platform.platform(),
    "machine": platform.machine(),
    "python": platform.python_version(),
}
junit = {name: total(name) for name in ("tests", "failures", "errors", "skipped")}
report = {
    "schema_version": 1,
    "kind": "pr_e1c_mac_verifier_tests",
    "host": host,
    "junit": junit,
}

with open(out_path, "w", encoding="utf-8") as fh:
    json.dump(report, fh, indent=2)
print(f" -> {out_path}")
PY
| 87 | + |
# --- Part 2: 5-min gRPC bench ---------------------------------------------
# Part 2 depends on two scripts that arrive with PR-E1b:
# scripts/start_grpc_runtime_server.py and
# scripts/bench_agentic/bench_session_long_run.py. PR-E1c is meant to merge
# after PR-E1b, but when it is exercised on a tree where PR-E1b has not landed
# yet, skip the bench (exit 0) so the Part 1 evidence can still be committed.
part2_available=1
for required in \
  scripts/start_grpc_runtime_server.py \
  scripts/bench_agentic/bench_session_long_run.py; do
  [[ -f "$required" ]] || part2_available=0
done

if (( ! part2_available )); then
  printf '%s\n' \
    "" \
    "==> Part 2 skipped: PR-E1b artifacts not present on this tree." \
    " Re-run after PR-E1b lands to capture the bench evidence." \
    "" \
    "==> Done. Commit Part 1 evidence:" \
    " git add $out_dir/pr-e1c-mac-verifier-tests-${stamp}.*" \
    " git commit -m 'Mac M4 review evidence for PR-E1c (verifier tests)'" \
    " git push"
  exit 0
fi
| 107 | + |
# Output locations for the bench result and the captured server output.
bench_json="$out_dir/pr-e1c-mac-bench-session-5min-${stamp}.json"
server_log="$out_dir/pr-e1c-mac-bench-session-5min-${stamp}.server.log"

# PID of the background gRPC server; stays empty until the server is started.
server_pid=""
# Number of 0.1 s polls granted to the server after SIGTERM before cleanup
# escalates to SIGKILL (default 50, i.e. about 5 seconds). Overridable.
server_stop_polls="${server_stop_polls:-50}"

#######################################
# Stop the background gRPC server, if one is running.
# Sends SIGTERM first, waits a bounded time for the process to go away, then
# falls back to SIGKILL so the EXIT trap cannot block forever on a server
# that does not honour SIGTERM.
# Globals:   server_pid (read), server_stop_polls (read)
# Arguments: none
# Returns:   always 0; a missing or already-exited server is not an error
#######################################
cleanup() {
  local pid="$server_pid"
  local remaining="$server_stop_polls"

  # Nothing to do when no server was started or it is already gone.
  if [[ -z "$pid" ]] || ! kill -0 "$pid" 2>/dev/null; then
    return 0
  fi

  kill "$pid" 2>/dev/null || true
  # Fractional sleep is accepted by both the BSD (macOS) and GNU `sleep`.
  while (( remaining > 0 )) && kill -0 "$pid" 2>/dev/null; do
    sleep 0.1
    remaining=$(( remaining - 1 ))
  done

  # Still alive after the grace period: force termination.
  if kill -0 "$pid" 2>/dev/null; then
    kill -KILL "$pid" 2>/dev/null || true
  fi

  # Reap the child; its exit status is deliberately ignored.
  wait "$pid" 2>/dev/null || true
}
trap cleanup EXIT
| 119 | + |
# Command-line options passed to the gRPC runtime server.
server_args=(
  --backend cpu
  --verifier-id Qwen/Qwen3-0.6B
  --bind 127.0.0.1:50051
  --capacity 1
  --sink 4
  --window 64
)

printf '\n%s\n' "==> starting gRPC server (logs: $server_log)"
# Launch in the background with stdout and stderr captured in the log file,
# and remember the PID so the EXIT trap can stop the server later.
PYTHONPATH=.:sdks/python python3 scripts/start_grpc_runtime_server.py \
  "${server_args[@]}" >"$server_log" 2>&1 &
server_pid=$!
| 127 | + |
# Wait for the server to announce readiness: poll its log once per second, at
# most 60 times, for the "listening" line. Polling stops early when the server
# process has already exited, since the line can then never appear and waiting
# out the full timeout would only delay the failure report.
ready=0
for (( attempt = 0; attempt < 60; attempt++ )); do
  # The log may not exist yet on the first polls, hence the silenced stderr.
  if grep -qF -- "kakeya gRPC RuntimeService listening on" "$server_log" 2>/dev/null; then
    ready=1
    break
  fi
  if ! kill -0 "$server_pid" 2>/dev/null; then
    break
  fi
  sleep 1
done

if [[ "$ready" != "1" ]]; then
  # Diagnostics go to stderr; the log tail is best-effort.
  if kill -0 "$server_pid" 2>/dev/null; then
    echo "!!! gRPC server didn't become ready" >&2
  else
    echo "!!! gRPC server exited before becoming ready" >&2
  fi
  tail -n 20 "$server_log" >&2 || true
  exit 1
fi
| 142 | + |
# Options for the session bench: a 300 s run against the local server with
# one turn every 30 s, writing its result to $bench_json.
bench_args=(
  --grpc-address 127.0.0.1:50051
  --tokenizer-id Qwen/Qwen3-0.6B
  --duration-s 300
  --turn-spacing-s 30
  --max-tokens 64
  --output "$bench_json"
)

printf '%s\n' "==> running 5-min bench (validates kv_live_bytes is non-zero)"
PYTHONPATH=.:sdks/python python3 \
  scripts/bench_agentic/bench_session_long_run.py "${bench_args[@]}"
| 151 | + |
printf '\n%s\n' "==> Headline KPIs from $bench_json:"
# Print the aggregate KPIs from the bench JSON and fail (exit 1) when the
# maximum kv_live_bytes is missing, zero, or otherwise not positive.
PYTHONPATH=.:sdks/python python3 - "$bench_json" <<'PY'
# argv[1]: bench result JSON; its "agg" object holds the aggregate KPIs.
import json
import sys

with open(sys.argv[1], encoding="utf-8") as fh:
    agg = json.load(fh)["agg"]

# (label, keys) pairs; values of multi-key rows are joined with " / ".
rows = (
    ("n_turns", ("n_turns",)),
    ("n_errors", ("n_errors",)),
    ("p50_latency_s", ("p50_latency_s",)),
    ("kv min/mean/max",
     ("min_kv_live_bytes", "mean_kv_live_bytes", "max_kv_live_bytes")),
    ("kv_bounded", ("kv_bounded",)),
    ("prefill_bounded", ("prefill_bounded",)),
)
for label, keys in rows:
    print(f" {label} = " + " / ".join(str(agg[key]) for key in keys))

# Verdict: only the maximum is checked here.
peak = agg["max_kv_live_bytes"]
if not peak or not peak > 0:
    print(" -> kv_live_bytes is still 0; PR-E1c FAILED.")
    sys.exit(1)
print(" -> kv_live_bytes is non-zero; PR-E1c reporting fix VERIFIED.")
PY
| 176 | + |
# Closing hint: the git commands that commit the artifacts written above.
printf '%s\n' \
  "" \
  "==> Done. Commit:" \
  " git add $out_dir/pr-e1c-mac-*" \
  " git commit -m 'Mac M4 review evidence for PR-E1c'" \
  " git push"
0 commit comments