# Source: lmms-eval/examples/models/openrouter_molmo_throughput_compare.sh (nota-github/lmms-eval, branch main)
#!/usr/bin/env bash
#
# Throughput comparison across three modes: baseline, static concurrency,
# and adaptive concurrency.
#
# Usage: bash openrouter_molmo_throughput_compare.sh [LIMIT]
#   LIMIT defaults to 40. Use openrouter_molmo_adaptive.sh for production runs.
#
# Environment:
#   OPENROUTER_API_KEY  required; re-exported as OPENAI_API_KEY
#   HF_HOME             optional; falls back to /tmp/huggingface

# Strict mode: abort on errors, on unset variables, and on a failure in any
# stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Resolve the directory holding this script, then run everything from the
# directory two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "${SCRIPT_DIR}/../.."

# Fill in the default first, then export.
: "${HF_HOME:=/tmp/huggingface}"
export HF_HOME

# Stop right here with a clear message when the key is missing.
: "${OPENROUTER_API_KEY:?Error: OPENROUTER_API_KEY not set}"
export OPENAI_API_KEY="${OPENROUTER_API_KEY}"
export OPENAI_API_BASE="https://openrouter.ai/api/v1"

# --- Benchmark configuration -------------------------------------------------
# MODEL_VERSION, TIMEOUT and MAX_RETRIES are forwarded to the model through
# --model_args by run_benchmark; TASKS and LIMIT become --tasks / --limit.
MODEL_VERSION="bytedance-seed/seed-1.6-flash"
TASKS="mme"
LIMIT="${1:-40}"   # first positional argument; 40 when omitted or empty
TIMEOUT=10
MAX_RETRIES=1
OUTPUT_BASE="./logs/openrouter_molmo_throughput"
COMPARISON="${OUTPUT_BASE}/throughput_comparison.csv"

# The output directory must exist before the redirect below: on a fresh
# checkout nothing has created it yet, and a failed redirect would abort the
# whole script under `set -e`.
mkdir -p "$OUTPUT_BASE"

# Start a fresh CSV (truncates any previous comparison) with the header row.
echo "mode,concurrency,limit,wall_time_s,requests_per_sec" > "$COMPARISON"

#######################################
# Print the current time as integer nanoseconds since the epoch.
# `date +%s%N` only yields digits where date supports %N; other
# implementations leave non-digit text in the output (e.g. a trailing "N"),
# which awk would silently coerce to whole seconds. In that case fall back
# to second resolution, scaled to nanoseconds.
# Outputs: the timestamp on stdout
# Returns: non-zero if `date` itself fails
#######################################
_now_ns() {
  local stamp
  stamp=$(date +%s%N) || return
  if [[ ! "$stamp" =~ ^[0-9]+$ ]]; then
    stamp=$(date +%s) || return
    stamp="${stamp}000000000"
  fi
  printf '%s\n' "$stamp"
}

#######################################
# Run one lmms_eval benchmark, time it, and record the throughput.
# Globals:
#   OUTPUT_BASE, COMPARISON, MODEL_VERSION, TASKS, LIMIT, TIMEOUT,
#   MAX_RETRIES (all read)
# Arguments:
#   $1 - mode label, used in the run directory name and the CSV row
#   $2 - value passed as num_concurrent
#   $3 - optional extra model_args, appended verbatim (callers start it
#        with a comma)
# Outputs:
#   Streams the evaluation output to stdout and to <run_dir>/run.log,
#   appends "mode,concurrency,limit,wall_time_s,requests_per_sec" to
#   $COMPARISON, and prints a one-line summary.
#######################################
run_benchmark() {
  local mode="$1" concurrency="$2" extra_args="${3:-}"
  local run_dir="${OUTPUT_BASE}/${mode}_c${concurrency}"
  # Declared separately from the assignments below so a failing command
  # substitution is not masked by the exit status of `local`.
  local start_ns end_ns wall rps
  mkdir -p "$run_dir"

  start_ns=$(_now_ns) || return
  python3 -m lmms_eval \
      --model openai \
      --model_args "model_version=$MODEL_VERSION,num_concurrent=$concurrency,timeout=$TIMEOUT,max_retries=$MAX_RETRIES${extra_args}" \
      --tasks "$TASKS" \
      --batch_size 1 \
      --limit "$LIMIT" \
      --output_path "${run_dir}/results" \
      --verbosity INFO \
      --log_samples 2>&1 | tee "${run_dir}/run.log"
  end_ns=$(_now_ns) || return

  # Wall time in seconds; requests/sec is LIMIT divided by wall time
  # (reported as 0 when no time elapsed, to avoid dividing by zero).
  wall=$(awk -v s="$start_ns" -v e="$end_ns" 'BEGIN{printf "%.3f",(e-s)/1e9}')
  rps=$(awk -v l="$LIMIT" -v w="$wall" 'BEGIN{if(w>0) printf "%.3f",l/w; else print 0}')
  echo "$mode,$concurrency,$LIMIT,$wall,$rps" >> "$COMPARISON"
  printf "[%s] c=%s  wall=%.1fs  rps=%s\n" "$mode" "$concurrency" "$wall" "$rps"
}

# Every non-adaptive run shares this model_args suffix.
fixed_args=",adaptive_concurrency=false"

# 1. Baseline (sequential): a single request in flight.
run_benchmark "baseline" 1 "$fixed_args"

# 2. Static concurrency sweep: one run per fixed concurrency level.
static_levels=(2 4 8 16 24)
for level in "${static_levels[@]}"; do
  run_benchmark "static" "$level" "$fixed_args"
done

# 3. Adaptive concurrency: start at 16 with the adaptive settings below,
#    assembled piece by piece into one model_args suffix.
adaptive_args=",adaptive_concurrency=true"
adaptive_args+=",adaptive_min_concurrency=1"
adaptive_args+=",adaptive_max_concurrency=64"
adaptive_args+=",adaptive_target_latency_s=15.0"
adaptive_args+=",adaptive_increase_step=0.15"
adaptive_args+=",adaptive_decrease_factor=0.75"
adaptive_args+=",adaptive_failure_threshold=0.05"
run_benchmark "adaptive" 16 "$adaptive_args"

# Show the collected CSV as an aligned table.
printf '\n'
printf '%s\n' "=== Results ==="
column -t -s, "$COMPARISON"