#!/usr/bin/env bash
# Cold-start sanity check: 27B turbo3-base short n=128 x3 (no NextN).
#
# Starts a fresh llama-server, issues one n=512 warmup completion, then RUNS
# completions of N tokens each, printing for every run the values the server
# reports in .timings as "  run <i>: <predicted_per_second>|<predicted_n>".
# The script only prints the numbers; compare them by hand against the
# historical baseline of ~18.4 TPS to tell thermal from code regression.
#
# Environment overrides:
#   PORT  server port                 (default 18080)
#   N     n_predict for measured runs (default 128)
#   RUNS  number of measured runs     (default 3)
#   MAIN  path to the model file
# Requires: curl, jq, and $ROOT/build/bin/llama-server.
# NOTE: every process matching "llama-server" is SIGKILLed before and after
# the measurement so the run starts cold and leaves nothing behind.
# Exit status: 0 if all measured runs produced a result, 1 otherwise.

# -e is intentionally not set: failures that matter are checked explicitly.
set -uo pipefail

# Print an error to stderr and abort (the EXIT trap, once armed, cleans up).
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

ROOT="$(cd "$(dirname "$0")/.." && pwd)" || die "cannot resolve repository root"
PORT="${PORT:-18080}"
HP="127.0.0.1:${PORT}"
N="${N:-128}"
RUNS="${RUNS:-3}"
MAIN="${MAIN:-$ROOT/.scratch/Qwen3.6-27B-UD-Q4_K_XL/Qwen3.6-27B-UD-Q4_K_XL.gguf}"
SERVER_BIN="$ROOT/build/bin/llama-server"

PROMPT='Write a detailed 300-word essay about the history of artificial intelligence, including early pioneers like Alan Turing and John McCarthy, key milestones such as the Dartmouth Conference and the development of expert systems, and future predictions about AGI and superintelligence.'

# Fail fast, before touching any running server, if a prerequisite is missing.
for tool in curl jq; do
  command -v "$tool" >/dev/null 2>&1 || die "required tool not found: $tool"
done
[[ -x "$SERVER_BIN" ]] || die "server binary not found or not executable: $SERVER_BIN"
[[ -e "$MAIN" ]] || die "model file not found: $MAIN"

# Emit the JSON body for a /completion request predicting $1 tokens.
completion_payload() {
  jq -n --arg p "$PROMPT" --argjson n "$1" \
    '{prompt:$p,n_predict:$n,temperature:0,cache_prompt:false}'
}

# POST a completion request for $1 tokens; the response body goes to stdout.
# Returns non-zero if the payload cannot be built or the HTTP request fails.
post_completion() {
  local payload
  payload=$(completion_payload "$1") || return
  curl -fsS -X POST "http://${HP}/completion" \
    -H 'Content-Type: application/json' -d "$payload"
}

SRV_PID=""

# Stop the server started by this script (TERM first, then KILL), then sweep
# any remaining llama-server processes by name. Safe to call more than once.
cleanup() {
  trap - EXIT
  if [[ -n "$SRV_PID" ]]; then
    kill "$SRV_PID" 2>/dev/null || true
    sleep 1
    kill -9 "$SRV_PID" 2>/dev/null || true
    SRV_PID=""
  fi
  pkill -9 -f llama-server 2>/dev/null || true
}

# Best-effort: there may be no stale server to kill, hence "|| true".
pkill -9 -f llama-server 2>/dev/null || true
sleep 1

SRV_LOG=$(mktemp -t sanity-srv.XXXX.log) || die "mktemp failed"

# From here on a server may be running; make sure it is stopped on every exit
# path, including Ctrl-C and SIGTERM.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

"$SERVER_BIN" \
  -m "$MAIN" -c 8192 -ngl 99 -ctk turbo3 -ctv turbo3 -fa on \
  --host 127.0.0.1 --port "$PORT" --parallel 1 -np 1 --cont-batching \
  --metrics --slots --no-warmup \
  >"$SRV_LOG" 2>&1 &
SRV_PID=$!

echo "info: server pid=$SRV_PID log=$SRV_LOG" >&2

# Poll /health once per second for up to 60 tries. Abort if the server
# process dies or never becomes healthy instead of benchmarking a dead port.
ready=0
for ((i = 1; i <= 60; i++)); do
  if ! kill -0 "$SRV_PID" 2>/dev/null; then
    die "server exited during startup; see $SRV_LOG"
  fi
  if curl -fsS "http://${HP}/health" >/dev/null 2>&1; then
    echo "info: server ready after ${i}s" >&2
    ready=1
    break
  fi
  sleep 1
done
((ready)) || die "server not ready after 60 attempts; see $SRV_LOG"

echo "info: warmup n=512..." >&2
# The warmup result is informational only, so a failure is a warning.
post_completion 512 \
  | jq -r '.timings | "warmup: \(.predicted_per_second // 0)|\(.predicted_n // 0)"' \
  || echo "warn: warmup request failed" >&2

echo "info: measuring short n=${N} x${RUNS}..." >&2
status=0
for ((i = 1; i <= RUNS; i++)); do
  if ! RESP=$(post_completion "$N"); then
    echo "  run $i: request failed" >&2
    status=1
    continue
  fi
  # One jq pass yields "<predicted_per_second>|<predicted_n>"; a missing
  # field falls back to 0.
  if ! RESULT=$(jq -r '.timings | "\(.predicted_per_second // 0)|\(.predicted_n // 0)"' <<<"$RESP"); then
    echo "  run $i: unparseable response" >&2
    status=1
    continue
  fi
  echo "  run $i: ${RESULT}"
done

cleanup
echo "info: done. server log: $SRV_LOG" >&2
exit "$status"