Skip to content

Commit d4da8f9

Browse files
author
Zhuoming Chen
committed
fix glm4.7
1 parent 30adc16 commit d4da8f9

7 files changed

Lines changed: 1918 additions & 24 deletions

File tree

examples/algo2.sh

Lines changed: 72 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,44 @@
11
#!/usr/bin/env bash
22
# Experiment driver for GLM-4.7-Flash MLA sparse attention (AIME26, mean@16).
33
#
4-
# Matrix:
4+
# Matrix (11 runs total):
55
# 1. full_attention — dense baseline, run ONCE (topk is ignored). Uses the
66
# dense flashinfer MLA path because the vortex sparse backends require
77
# enable_vortex_sparsity=True (so full attention can't use cuda_mla).
8-
# 2. rope_aware_block_sparse_mla } cuda_mla decode + Triton tensor-core indexer,
8+
# 2. rope_aware_block_sparse_mla } triton decode + Triton tensor-core indexer,
99
# 3. lserve_centroid_mla } NO layer skip (all layers sparse), topk sweep.
1010
#
1111
# Each sparse run uses:
12-
# --attention-backend cuda_mla hand-CUDA block-table decode (geometry-agnostic)
12+
# --attention-backend triton sglang Triton attention backend (replaces cuda_mla's hand-CUDA, geometry-agnostic block-table decode)
1313
# --vortex-impl-backend triton tensor-core is a Triton-indexer feature ...
1414
# --vortex-use-tensor-core ... and requires impl-backend=triton
1515
# --vortex-layers-skip (no values) => skip none, all layers sparse
1616
#
17+
# Parallelism: up to 4 runs at a time, one per GPU, in sequential waves. The
18+
# full_attention baseline is job 0 so it always lands in the FIRST wave
19+
# (prioritised — its result is the reference point for every sparse run).
20+
#
1721
# Results land in $SUMMARY_DIR/*.json; collect_algo2_results.py renders them to
1822
# examples/algo2_results.md (with the raw summaries embedded).
1923
#
20-
# NOTE on GPUs: 0 and 2 are broken on this host; 6/7 are often taken by other
21-
# users. Override CUDA_VISIBLE_DEVICES to a free, working GPU before running.
22-
export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-1}
24+
# GPU selection (in priority order):
25+
# 1. $GPUS env var (space-separated indices), e.g. GPUS="1 3 4 5" ./algo2.sh
26+
# 2. algorithm_scientist/free_gpus.sh auto-detection, minus $EXCLUDE_GPUS.
27+
# If neither yields a usable GPU the script errors out (it never guesses).
28+
# No GPUs are excluded by default; set $EXCLUDE_GPUS (space-separated indices)
29+
# to skip known-bad/busy ones, e.g. EXCLUDE_GPUS="0 2" ./algo2.sh. Parallelism
30+
# is capped at 4 even if more GPUs are free.
31+
set -uo pipefail
2332
export HF_HOME=/raid/catalyst/models/
2433

2534
MODEL="zai-org/GLM-4.7-Flash"
2635
DATA="examples/aime26_glm.jsonl"
2736
SUMMARY_DIR="summary-glm4.7-flash"
2837
TRIALS=16
2938
TOPK_VAL=(61 93 125 157 253)
30-
SPARSE_MODULES=(rope_aware_block_sparse_mla lserve_centroid_mla)
39+
SPARSE_MODULES=(rope_unaware_block_sparse_mla)
40+
MAX_PARALLEL=4
41+
EXCLUDE_GPUS="${EXCLUDE_GPUS:-}"
3142

3243
COMMON=(
3344
--trials "$TRIALS"
@@ -37,27 +48,65 @@ COMMON=(
3748
--summary-dir "$SUMMARY_DIR" --skip-already-finished-check
3849
)
3950

40-
run() { # one config; do not abort the whole sweep if a single run fails
41-
echo ">>> $*"
42-
python examples/verify_algo.py "${COMMON[@]}" "$@" || echo "!!! FAILED: $*"
43-
}
51+
# --- resolve the GPU pool -----------------------------------------------------
# Fills GPU_POOL with the GPU indices this sweep may use:
#   * $GPUS non-empty -> split on whitespace and used as given ($EXCLUDE_GPUS
#     is not applied to an explicit list);
#   * otherwise       -> whatever algorithm_scientist/free_gpus.sh prints,
#     minus every index listed in $EXCLUDE_GPUS.
# Exits 1 when the pool is empty after EITHER branch: a whitespace-only $GPUS
# also yields an empty pool, and an empty pool means PARALLEL=0, with which the
# wave loop further down would never advance.
# HERE is the parent of the directory that holds this script.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
GPU_POOL=()
if [ -n "${GPUS:-}" ]; then
  read -r -a GPU_POOL <<< "$GPUS"
else
  DETECTED=()
  # The helper's stderr is discarded; a non-zero exit means "nothing detected".
  if FREE=$("$HERE/algorithm_scientist/free_gpus.sh" 2>/dev/null); then
    # -d '' makes read consume the whole output, so indices are picked up
    # whether the helper prints them on one line or one per line
    # (free_gpus.sh output format is not visible here — TODO confirm).
    read -r -d '' -a DETECTED <<< "$FREE"
  fi
  # ${arr[@]+...} keeps `set -u` quiet on an empty array in bash < 4.4.
  for g in ${DETECTED[@]+"${DETECTED[@]}"}; do
    skip=0
    for x in $EXCLUDE_GPUS; do
      [ "$g" = "$x" ] && skip=1
    done
    [ "$skip" -eq 0 ] && GPU_POOL+=("$g")
  done
fi

# no fallback: refuse to guess GPU indices when nothing usable was found
if [ "${#GPU_POOL[@]}" -eq 0 ]; then
  if [ -n "${GPUS:-}" ]; then
    echo "error: \$GPUS is set but lists no GPU indices" >&2
  else
    echo "error: no free GPUs detected — set \$GPUS to run on specific indices" >&2
  fi
  exit 1
fi
4473

45-
# --- 1. full-attention dense baseline (run once; topk ignored) ----------------
46-
# Use sglang's plain Triton backend for dense: the flashinfer dense-MLA path has
47-
# an illegal-memory-access / page-index bug on this pool (see marks/mla/progress.md).
48-
# block/page=32 (dense isn't compatible with block_size=16).
49-
run --vortex-module-name full_attention --attention-backend triton \
50-
--topk-val 253 --block-size 32 --page-size 32
74+
# Concurrency = number of GPUs in the pool, clamped to MAX_PARALLEL.
PARALLEL=${#GPU_POOL[@]}
if (( PARALLEL > MAX_PARALLEL )); then
  PARALLEL=$MAX_PARALLEL
fi
echo "=== GPU pool: ${GPU_POOL[*]} (parallel=$PARALLEL) ==="
5177

52-
# --- 2/3. sparse modules: cuda_mla + tensor-core indexer, all layers sparse ---
78+
# --- build the job list (full_attention first => prioritised, first wave) -----
79+
JOBS=()
80+
# 1. full-attention dense baseline (run once; topk ignored).
81+
# Use sglang's plain Triton backend for dense: the flashinfer dense-MLA path
82+
# has an illegal-memory-access / page-index bug on this pool. block/page=32
83+
# (dense isn't compatible with block_size=16).
84+
#JOBS+=("--vortex-module-name full_attention --attention-backend triton --topk-val 253 --block-size 32 --page-size 32")
85+
# 2/3. sparse modules: triton + tensor-core indexer, all layers sparse.
5386
for algo in "${SPARSE_MODULES[@]}"; do
5487
for k in "${TOPK_VAL[@]}"; do
55-
run --vortex-module-name "$algo" --topk-val "$k" \
56-
--attention-backend cuda_mla \
57-
--vortex-impl-backend triton \
58-
--vortex-use-tensor-core \
59-
--vortex-layers-skip # MUST be last: no values => skip none
88+
JOBS+=("--vortex-module-name $algo --topk-val $k --attention-backend triton --vortex-impl-backend triton --vortex-use-tensor-core --vortex-layers-skip")
89+
done
90+
done
91+
92+
# run_job GPU ARGS...
#   Runs examples/verify_algo.py once with the shared COMMON flags followed by
#   ARGS, exposing only GPU to it through CUDA_VISIBLE_DEVICES (set for that
#   single command, not for the rest of the script).
#   A failing run is reported on stdout and otherwise ignored, so one bad
#   config never aborts the sweep; the function itself returns 0 either way.
run_job() {
  local device="$1"
  shift
  echo ">>> [GPU $device] $*"
  if ! CUDA_VISIBLE_DEVICES="$device" python examples/verify_algo.py "${COMMON[@]}" "$@"; then
    echo "!!! FAILED [GPU $device]: $*"
  fi
}
98+
99+
# --- launch in waves of $PARALLEL, one job per GPU ----------------------------
# launch_waves
#   Runs every entry of JOBS through run_job, PARALLEL entries at a time.
#   Reads globals: JOBS, GPU_POOL, PARALLEL.
#   Slot i of a wave runs on GPU_POOL[i]. `wait` is a barrier: the next wave
#   starts only after every job of the current wave has exited.
#   Returns 1, launching nothing, when there are jobs but PARALLEL < 1: with a
#   zero step the outer loop would never advance and would spin forever.
launch_waves() {
  local njobs=${#JOBS[@]}
  local wave=1 start slot last

  if (( njobs > 0 && PARALLEL < 1 )); then
    echo "error: PARALLEL=$PARALLEL — need at least one GPU to run $njobs job(s)" >&2
    return 1
  fi

  for ((start = 0; start < njobs; start += PARALLEL)); do
    # index of the last job in this wave, clamped to the end of the list
    last=$((start + PARALLEL - 1))
    if (( last > njobs - 1 )); then
      last=$((njobs - 1))
    fi
    echo "=== wave $wave: jobs $start..$last ==="
    for ((slot = 0; slot < PARALLEL && start + slot < njobs; slot++)); do
      # shellcheck disable=SC2086 — intentional word-split of the job arg string
      run_job "${GPU_POOL[$slot]}" ${JOBS[$((start + slot))]} &
    done
    wait
    wave=$((wave + 1))
  done
}

NJOBS=${#JOBS[@]}
launch_waves || exit 1
62111

63112
# --- render results markdown --------------------------------------------------

examples/algo2_blocksize.sh

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
#!/usr/bin/env bash
2+
# Block-size study for GLM-4.7-Flash MLA sparse attention (AIME26, mean@16),
3+
# cuda_mla decode. Sibling of algo2_cuda_mla.sh; NO full_attention baseline.
4+
#
5+
# Question: at a FIXED attended-token budget, how do larger blocks trade off
6+
# accuracy vs throughput? We hold the effective budget constant across block
7+
# sizes via
8+
#
9+
# (topk_val + RESERVED) * block_size = budget RESERVED = 3
10+
#
11+
# where RESERVED = vortex_block_reserved_bos(1) + vortex_block_reserved_eos(2),
12+
# i.e. the always-attended BOS/EOS blocks. block_size == page_size throughout.
13+
#
14+
# Matrix (2 modules x 2 block sizes x 2 budgets = 8 runs):
15+
# modules : rope_aware_block_sparse_mla , lserve_centroid_mla
16+
# block : 32 , 64 (block_size == page_size)
17+
# budget : 2048 , 4096 => topk = budget/block - 3
18+
# block=32: budget 2048 -> topk 61 ; budget 4096 -> topk 125
19+
# block=64: budget 2048 -> topk 29 ; budget 4096 -> topk 61
20+
# block_size=16 is already covered in $SUMMARY_DIR (topk 125 / 253), so it is
21+
# not re-run here; the collector shows those rows alongside these for comparison.
22+
#
23+
# Each run uses cuda_mla decode + Triton tensor-core indexer, no layer skip
24+
# (every layer sparse) -- identical to algo2_cuda_mla.sh except block/page/topk.
25+
#
26+
# Parallelism: up to 4 runs at a time, one per GPU, in sequential waves.
27+
# Results -> examples/algo2_blocksize_results.md (raw summaries embedded).
28+
#
29+
# GPU selection (in priority order):
30+
# 1. $GPUS env var (space-separated indices), e.g. GPUS="1 5 6 7" ./algo2_blocksize.sh
31+
# 2. algorithm_scientist/free_gpus.sh auto-detection, minus $EXCLUDE_GPUS.
32+
# If neither yields a usable GPU the script errors out (it never guesses).
33+
# No GPUs are excluded by default; set $EXCLUDE_GPUS (space-separated indices)
34+
# to skip known-bad/busy ones. Parallelism is capped at 4 even if more are free.
35+
# Unset variables are errors and a pipeline fails if any stage fails.
# errexit (-e) is deliberately not enabled here.
set -u
set -o pipefail

# HF_HOME is exported so child processes inherit it.
HF_HOME=/raid/catalyst/models/
export HF_HOME

# --- what to evaluate ---------------------------------------------------------
MODEL="zai-org/GLM-4.7-Flash"
DATA="examples/aime26_glm.jsonl"
TRIALS=16
SPARSE_MODULES=(rope_aware_block_sparse_mla lserve_centroid_mla)
BLOCK_SIZES=(32 64)
BUDGETS=(2048 4096)
RESERVED=3 # vortex_block_reserved_bos(1) + eos(2); the +3 in the formula

# --- where results go ---------------------------------------------------------
SUMMARY_DIR="summary-glm4.7-flash"
RESULTS_MD="examples/algo2_blocksize_results.md"

# --- scheduling ---------------------------------------------------------------
MAX_PARALLEL=4
: "${EXCLUDE_GPUS:=}" # default to "exclude nothing" when unset or empty

# Flags shared by every run. block/page are set per job (they vary), so they
# are NOT in COMMON.
COMMON=(
  --trials "$TRIALS"
  --workload-chunk-size 64
  --topk-ratio 0.00
  --model-name "$MODEL"
  --data-path "$DATA"
  --mem 0.9
  --generation-max-new-tokens 32768
  --max-input-length 4096
  --tp-size 1
  --summary-dir "$SUMMARY_DIR"
  --skip-already-finished-check
)
58+
59+
# --- resolve the GPU pool -----------------------------------------------------
# Fills GPU_POOL with the GPU indices this study may use, then derives PARALLEL:
#   * $GPUS non-empty -> split on whitespace and used as given ($EXCLUDE_GPUS
#     is not applied to an explicit list);
#   * otherwise       -> whatever algorithm_scientist/free_gpus.sh prints,
#     minus every index listed in $EXCLUDE_GPUS.
# Exits 1 when the pool is empty after EITHER branch: a whitespace-only $GPUS
# also yields an empty pool, which would give PARALLEL=0 and a wave loop that
# never advances.
# HERE is the parent of the directory that holds this script.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
GPU_POOL=()
if [ -n "${GPUS:-}" ]; then
  read -r -a GPU_POOL <<< "$GPUS"
else
  DETECTED=()
  # The helper's stderr is discarded; a non-zero exit means "nothing detected".
  if FREE=$("$HERE/algorithm_scientist/free_gpus.sh" 2>/dev/null); then
    # -d '' makes read consume the whole output, so indices are picked up
    # whether the helper prints them on one line or one per line
    # (free_gpus.sh output format is not visible here — TODO confirm).
    read -r -d '' -a DETECTED <<< "$FREE"
  fi
  # ${arr[@]+...} keeps `set -u` quiet on an empty array in bash < 4.4.
  for g in ${DETECTED[@]+"${DETECTED[@]}"}; do
    skip=0
    for x in $EXCLUDE_GPUS; do
      [ "$g" = "$x" ] && skip=1
    done
    [ "$skip" -eq 0 ] && GPU_POOL+=("$g")
  done
fi

# no fallback: refuse to guess GPU indices when nothing usable was found
if [ "${#GPU_POOL[@]}" -eq 0 ]; then
  if [ -n "${GPUS:-}" ]; then
    echo "error: \$GPUS is set but lists no GPU indices" >&2
  else
    echo "error: no free GPUs detected — set \$GPUS to run on specific indices" >&2
  fi
  exit 1
fi

# Concurrency = pool size, clamped to MAX_PARALLEL (>= 1 thanks to the check above).
PARALLEL=${#GPU_POOL[@]}
[ "$PARALLEL" -gt "$MAX_PARALLEL" ] && PARALLEL="$MAX_PARALLEL"
echo "=== GPU pool: ${GPU_POOL[*]} (parallel=$PARALLEL) ==="
85+
86+
# --- build the job list: module x block_size x budget -------------------------
# Each JOBS entry is ONE space-separated string of verify_algo.py flags for a
# (module, block size, budget) combination. topk is derived so that
# (topk + RESERVED) * block == budget, i.e. the attended budget stays constant.
# Combinations where the budget is not a multiple of the block size, or where
# the derived topk would be < 1, are reported on stderr and left out.
JOBS=()
for module in "${SPARSE_MODULES[@]}"; do
  for block in "${BLOCK_SIZES[@]}"; do
    for budget in "${BUDGETS[@]}"; do
      if (( budget % block )); then
        echo "skip: budget $budget not divisible by block $block" >&2
      else
        k=$(( budget / block - RESERVED ))
        if (( k >= 1 )); then
          flags=(
            --vortex-module-name "$module"
            --topk-val "$k"
            --block-size "$block"
            --page-size "$block"
            --attention-backend cuda_mla
            --vortex-impl-backend triton
            --vortex-use-tensor-core
            --vortex-layers-skip
          )
          # "${flags[*]}" joins on the first IFS character (a space by default).
          JOBS+=("${flags[*]}")
        else
          echo "skip: block=$block budget=$budget -> topk=$k < 1" >&2
        fi
      fi
    done
  done
done

# Show the plan before anything is launched.
echo "=== plan (${#JOBS[@]} jobs) ==="
for ((n = 0; n < ${#JOBS[@]}; n++)); do
  echo " ${JOBS[n]}"
done
106+
107+
# run_job GPU ARGS...
#   Launches examples/verify_algo.py with the shared COMMON flags plus ARGS.
#   CUDA_VISIBLE_DEVICES is set to GPU for that one command only.
#   Failures are printed to stdout and swallowed (the function still returns
#   0), so a single failed config does not abort the sweep.
run_job() {
  local device="$1"
  shift
  echo ">>> [GPU $device] $*"
  if ! CUDA_VISIBLE_DEVICES="$device" python examples/verify_algo.py "${COMMON[@]}" "$@"; then
    echo "!!! FAILED [GPU $device]: $*"
  fi
}
113+
114+
# --- launch in waves of $PARALLEL, one job per GPU ----------------------------
# launch_waves
#   Runs every entry of JOBS through run_job, PARALLEL entries at a time.
#   Reads globals: JOBS, GPU_POOL, PARALLEL.
#   Slot i of a wave runs on GPU_POOL[i]. `wait` is a barrier: the next wave
#   starts only after every job of the current wave has exited.
#   Returns 1, launching nothing, when there are jobs but PARALLEL < 1: with a
#   zero step the outer loop would never advance and would spin forever.
launch_waves() {
  local njobs=${#JOBS[@]}
  local wave=1 start slot last

  if (( njobs > 0 && PARALLEL < 1 )); then
    echo "error: PARALLEL=$PARALLEL — need at least one GPU to run $njobs job(s)" >&2
    return 1
  fi

  for ((start = 0; start < njobs; start += PARALLEL)); do
    # index of the last job in this wave, clamped to the end of the list
    last=$((start + PARALLEL - 1))
    if (( last > njobs - 1 )); then
      last=$((njobs - 1))
    fi
    echo "=== wave $wave: jobs $start..$last ==="
    for ((slot = 0; slot < PARALLEL && start + slot < njobs; slot++)); do
      # shellcheck disable=SC2086 — intentional word-split of the job arg string
      run_job "${GPU_POOL[$slot]}" ${JOBS[$((start + slot))]} &
    done
    wait
    wave=$((wave + 1))
  done
}

NJOBS=${#JOBS[@]}
launch_waves || exit 1
126+
127+
# --- render results markdown --------------------------------------------------
# Hand the summary directory to the collector and tell it where to write the
# report; the closing banner is printed whether or not the collector succeeded.
collect_args=(--summary-dir "$SUMMARY_DIR" --out "$RESULTS_MD")
python examples/collect_algo2_results.py "${collect_args[@]}"
echo "=== done; see $RESULTS_MD ==="

0 commit comments

Comments
 (0)