#!/usr/bin/env bash
#
# long_context_bench.sh — Long Context KV Cache Memory Benchmark
#
# Measures KV cache memory usage at various context lengths, comparing
# TurboQuant (compressed Q4 KV) vs theoretical FP16 KV (llama.cpp default).
#
# Usage:
#   bash bench/long_context_bench.sh [model.tqm] [kv_type]
#
# Arguments:
#   model.tqm   Path to TQM model file (default: gemma3-4b.tqm)
#   kv_type     KV cache type (default: uniform_4b)
#
# Output:
#   - Table printed to stdout
#   - CSV saved to bench/long_context_results.csv
set -euo pipefail

# Resolve the script/project layout relative to this file so the benchmark
# can be invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
BUILD_DIR="$PROJECT_DIR/build"
TQ_RUN="$BUILD_DIR/tq_run"
readonly SCRIPT_DIR PROJECT_DIR BUILD_DIR TQ_RUN

# CLI arguments with defaults.
# NOTE: MODEL is intentionally NOT readonly — it may be rewritten later when
# the path is resolved against common model locations.
MODEL="${1:-gemma3-4b.tqm}"
KV_TYPE="${2:-uniform_4b}"
CSV_OUT="$SCRIPT_DIR/long_context_results.csv"
readonly KV_TYPE CSV_OUT

# Context lengths (in tokens) measured in the main benchmark table.
readonly CONTEXT_LENGTHS=(512 1024 2048 4096)
| 33 | + |
# --------------------------------------------------------
# Ensure tq_run is built
# --------------------------------------------------------
# The build is quiet on success, but the log is preserved and dumped on
# failure. (Previously output went to /dev/null while `set -e` aborted on a
# failed cmake, so build failures were completely silent.)
if [ ! -f "$TQ_RUN" ]; then
  echo "Building tq_run..."
  BUILD_LOG="$(mktemp)"
  if ! cmake -B "$BUILD_DIR" -DCMAKE_BUILD_TYPE=Release "$PROJECT_DIR" > "$BUILD_LOG" 2>&1 \
     || ! cmake --build "$BUILD_DIR" --target tq_run \
            -j"$(sysctl -n hw.ncpu 2>/dev/null || nproc)" >> "$BUILD_LOG" 2>&1; then
    echo "ERROR: Failed to build tq_run; build log follows:" >&2
    cat "$BUILD_LOG" >&2
    rm -f -- "$BUILD_LOG"
    exit 1
  fi
  rm -f -- "$BUILD_LOG"
fi

# Sanity check: the binary must exist even if cmake reported success.
if [ ! -f "$TQ_RUN" ]; then
  echo "ERROR: Failed to build tq_run" >&2
  exit 1
fi
| 47 | + |
# --------------------------------------------------------
# Resolve model path
# --------------------------------------------------------
if [ ! -f "$MODEL" ]; then
  # Try common locations (project root, models/, cache dir, home).
  for candidate in \
      "$PROJECT_DIR/$MODEL" \
      "$PROJECT_DIR/models/$MODEL" \
      "$HOME/.cache/turboquant/$MODEL" \
      "$HOME/$MODEL"; do
    if [ -f "$candidate" ]; then
      MODEL="$candidate"
      break
    fi
  done
fi

if [ ! -f "$MODEL" ]; then
  # Diagnostics go to stderr so they do not pollute the table on stdout.
  echo "WARNING: Model file '$MODEL' not found." >&2
  echo "Running in estimation-only mode (no actual inference)." >&2
  echo "" >&2
  ESTIMATION_ONLY=1
else
  ESTIMATION_ONLY=0
  echo "Model: $MODEL"
fi

echo "KV type: $KV_TYPE"
echo ""
| 77 | + |
# --------------------------------------------------------
# Get model config (if model available)
# --------------------------------------------------------
# Zero means "unknown"; real values are parsed from `tq_run --info` when a
# model is present, and Gemma 3 4B defaults are substituted otherwise.
N_LAYERS=0
N_KV_HEADS=0
HEAD_DIM=0

if [ "$ESTIMATION_ONLY" -eq 0 ]; then
  # Expected --info line:
  # "Model: 34 layers, dim=2560, heads=32/4, head_dim=256, vocab=262144, inter=6912"
  INFO=$("$TQ_RUN" "$MODEL" --info 2>&1 || true)

  # sed-based extraction — portable to macOS (no grep -P); first match wins.
  N_LAYERS=$(sed -n 's/^.*Model: \([0-9]*\) layers.*/\1/p' <<<"$INFO" | head -1)
  N_KV_HEADS=$(sed -n 's/^.*heads=[0-9]*\/\([0-9]*\).*/\1/p' <<<"$INFO" | head -1)
  HEAD_DIM=$(sed -n 's/^.*head_dim=\([0-9]*\).*/\1/p' <<<"$INFO" | head -1)
fi

# Substitute Gemma 3 4B defaults wherever parsing came up empty or zero.
[ -n "$N_LAYERS" ] && [ "$N_LAYERS" -ne 0 ] || N_LAYERS=34
[ -n "$N_KV_HEADS" ] && [ "$N_KV_HEADS" -ne 0 ] || N_KV_HEADS=4
[ -n "$HEAD_DIM" ] && [ "$HEAD_DIM" -ne 0 ] || HEAD_DIM=256

echo "Model config: ${N_LAYERS} layers, ${N_KV_HEADS} kv_heads, head_dim=${HEAD_DIM}"
echo ""
| 110 | + |
# --------------------------------------------------------
# Calculate FP16 and compressed KV sizes
# --------------------------------------------------------

# FP16 baseline (llama.cpp default): K and V each stored at 2 bytes/element.
#   bytes/token = 2 (K+V) * n_layers * n_kv_heads * head_dim * 2
FP16_PER_TOKEN=$(( 2 * N_LAYERS * N_KV_HEADS * HEAD_DIM * 2 ))

# Quantized storage: head_dim is split into 128-element blocks, each block
# occupying TYPE_SIZE bytes (payload plus per-block metadata).
BLOCK_SIZE=128
case "$KV_TYPE" in
  uniform_4b)          TYPE_SIZE=68  ;;  # 4 + 128/2
  uniform_2b)          TYPE_SIZE=36  ;;  # 4 + 128/4
  polar_3b | polar_4b) TYPE_SIZE=72  ;;  # 8 + 128/2
  turbo_3b | turbo_4b) TYPE_SIZE=112 ;;  # polar(72) + qjl(40)
  mixed_4b8)           TYPE_SIZE=80  ;;  # 4 + 4 + 8 + 128/2
  fp32)                TYPE_SIZE=0   ;;  # uncompressed; handled below
  *)                   TYPE_SIZE=68  ;;  # unknown type -> assume uniform_4b
esac

# Ceiling division: number of quantized blocks covering one head.
BLOCKS_PER_HEAD=$(( (HEAD_DIM + BLOCK_SIZE - 1) / BLOCK_SIZE ))

if [ "$KV_TYPE" = "fp32" ]; then
  # fp32 KV: 4 bytes/element, no block metadata.
  Q4_PER_TOKEN=$(( 2 * N_LAYERS * N_KV_HEADS * HEAD_DIM * 4 ))
else
  # Both keys AND values quantized to the same block type.
  Q4_PER_TOKEN=$(( 2 * N_LAYERS * N_KV_HEADS * BLOCKS_PER_HEAD * TYPE_SIZE ))
fi

echo "Per-token KV (FP16 / llama.cpp): $(bc <<<"scale=2; $FP16_PER_TOKEN / 1024") KB"
echo "Per-token KV (Q4 / TurboQuant): $(bc <<<"scale=2; $Q4_PER_TOKEN / 1024") KB"
OVERALL_RATIO=$(bc <<<"scale=2; $FP16_PER_TOKEN / $Q4_PER_TOKEN")
echo "Compression ratio: ${OVERALL_RATIO}x"
echo ""
| 148 | + |
# --------------------------------------------------------
# Generate long prompts and run benchmarks
# --------------------------------------------------------

# Repeatable filler text (~100 tokens per repetition) used to synthesize
# prompts of arbitrary length.
FILLER="The quick brown fox jumps over the lazy dog. In the vast expanse of the universe, countless stars illuminate the darkness of space. Knowledge is the foundation upon which all great achievements are built. Every journey of a thousand miles begins with a single step forward. "

# generate_prompt TARGET_TOKENS
#   Prints a prompt of roughly TARGET_TOKENS tokens built by repeating
#   $FILLER. Heuristic: ~1.3 tokens/word and ~4 chars/word, so we target
#   (tokens * 3/4) words at ~5 chars each (4 chars + separating space).
generate_prompt() {
  local target_tokens=$1
  local target_words=$(( target_tokens * 3 / 4 ))
  # Hoisted loop invariant — previously recomputed on every iteration.
  local target_chars=$(( target_words * 5 ))
  local result=""
  while [ "${#result}" -lt "$target_chars" ]; do
    result="${result}${FILLER}"
  done
  # Truncate to the approximate target length.
  printf '%s\n' "${result:0:target_chars}"
}
| 167 | + |
# --------------------------------------------------------
# Write CSV header
# --------------------------------------------------------
echo "context_length,compressed_kv_bytes,fp16_kv_bytes,compressed_kv_mb,fp16_kv_mb,compression_ratio,memory_saved_mb" > "$CSV_OUT"

# --------------------------------------------------------
# Print table header
# --------------------------------------------------------
# (Dead COL_Q4/COL_FP16 variables removed — they were never referenced;
# the labels are inlined in the printf below.)
printf "\n%-15s %18s %18s %8s %15s\n" \
  "Context Length" "Q4 TurboQuant" "FP16 llama.cpp" "Ratio" "Memory Saved"
printf "%-15s %18s %18s %8s %15s\n" \
  "---------------" "------------------" "------------------" "--------" "---------------"
| 184 | + |
# --------------------------------------------------------
# Run benchmark at each context length
# --------------------------------------------------------
# All rows use the header's column widths (%-15s %18s %18s %8s %15s).
# The previous fallback branch used %15s/%12s widths, misaligning the table.
for CTX in "${CONTEXT_LENGTHS[@]}"; do
  TOTAL_Q4=$(( Q4_PER_TOKEN * CTX ))
  TOTAL_FP16=$(( FP16_PER_TOKEN * CTX ))
  Q4_MB=$(echo "scale=2; $TOTAL_Q4 / 1048576" | bc)
  FP16_MB=$(echo "scale=2; $TOTAL_FP16 / 1048576" | bc)
  RATIO=$(echo "scale=2; $TOTAL_FP16 / $TOTAL_Q4" | bc)
  SAVED_MB=$(echo "scale=2; ($TOTAL_FP16 - $TOTAL_Q4) / 1048576" | bc)

  # For small contexts, validate the estimate with a real inference run
  # (larger contexts take too long).
  ACTUAL_RATIO=""
  if [ "$ESTIMATION_ONLY" -eq 0 ] && [ "$CTX" -le 512 ]; then
    PROMPT=$(generate_prompt "$CTX")
    GEN_TOKENS=$(( CTX / 4 ))  # Generate 1/4 of context length
    # Capture stderr only; stdout carries generated text we don't need.
    STDERR_OUT=$("$TQ_RUN" "$MODEL" -p "$PROMPT" -n "$GEN_TOKENS" -k "$KV_TYPE" -M -q q4 2>&1 >/dev/null || true)
    # Extract the tool-reported MEMORY_CSV line, if present.
    ACTUAL_LINE=$(echo "$STDERR_OUT" | grep "^MEMORY_CSV:" || true)
    if [ -n "$ACTUAL_LINE" ]; then
      ACTUAL_RATIO=$(echo "$ACTUAL_LINE" | cut -d, -f4)
    fi
  fi

  if [ -n "$ACTUAL_RATIO" ]; then
    printf "%-15s %18s %18s %8s %15s (actual: %.2fx)\n" \
      "$CTX tokens" "${Q4_MB} MB" "${FP16_MB} MB" "${RATIO}x" "${SAVED_MB} MB" "$ACTUAL_RATIO"
  else
    printf "%-15s %18s %18s %8s %15s\n" \
      "$CTX tokens" "${Q4_MB} MB" "${FP16_MB} MB" "${RATIO}x" "${SAVED_MB} MB"
  fi

  # Write CSV row
  echo "$CTX,$TOTAL_Q4,$TOTAL_FP16,$Q4_MB,$FP16_MB,$RATIO,$SAVED_MB" >> "$CSV_OUT"
done
| 220 | + |
# --------------------------------------------------------
# Extended context lengths (estimation only)
# --------------------------------------------------------
EXTENDED_LENGTHS=(8192 16384 32768 65536 131072)

printf "\n%-15s %18s %18s %8s %15s\n" \
  "--- Extended ---" "" "" "" ""

for CTX in "${EXTENDED_LENGTHS[@]}"; do
  TOTAL_Q4=$(( Q4_PER_TOKEN * CTX ))
  TOTAL_FP16=$(( FP16_PER_TOKEN * CTX ))
  Q4_MB=$(echo "scale=2; $TOTAL_Q4 / 1048576" | bc)
  FP16_MB=$(echo "scale=2; $TOTAL_FP16 / 1048576" | bc)
  RATIO=$(echo "scale=2; $TOTAL_FP16 / $TOTAL_Q4" | bc)
  SAVED_MB=$(echo "scale=2; ($TOTAL_FP16 - $TOTAL_Q4) / 1048576" | bc)

  # Use the same column widths as the table header so rows line up
  # (the previous %15s/%12s widths broke alignment).
  printf "%-15s %18s %18s %8s %15s\n" \
    "$CTX tokens" "${Q4_MB} MB" "${FP16_MB} MB" "${RATIO}x" "${SAVED_MB} MB"

  echo "$CTX,$TOTAL_Q4,$TOTAL_FP16,$Q4_MB,$FP16_MB,$RATIO,$SAVED_MB" >> "$CSV_OUT"
done
| 242 | + |
# Closing summary: where the CSV landed and how to plot it.
printf '\n'
printf '%s\n' "CSV results saved to: $CSV_OUT"
printf '\n'
printf '%s\n' "To generate a chart:"
printf '%s\n' "  python3 bench/plot_memory.py bench/long_context_results.csv"