11#! /usr/bin/env bash
22# Experiment driver for GLM-4.7-Flash MLA sparse attention (AIME26, mean@16).
33#
4- # Matrix:
4+ # Matrix (11 runs total) :
55# 1. full_attention — dense baseline, run ONCE (topk is ignored). Uses the
66# dense flashinfer MLA path because the vortex sparse backends require
77# enable_vortex_sparsity=True (so full attention can't use cuda_mla).
8- # 2. rope_aware_block_sparse_mla } cuda_mla decode + Triton tensor-core indexer,
8+ # 2. rope_aware_block_sparse_mla } triton decode + Triton tensor-core indexer,
99# 3. lserve_centroid_mla } NO layer skip (all layers sparse), topk sweep.
1010#
1111# Each sparse run uses:
12- # --attention-backend cuda_mla hand-CUDA block-table decode (geometry-agnostic)
12+ # --attention-backend triton hand-CUDA block-table decode (geometry-agnostic)
1313# --vortex-impl-backend triton tensor-core is a Triton-indexer feature ...
1414# --vortex-use-tensor-core ... and requires impl-backend=triton
1515# --vortex-layers-skip (no values) => skip none, all layers sparse
1616#
17+ # Parallelism: up to 4 runs at a time, one per GPU, in sequential waves. The
18+ # full_attention baseline is job 0 so it always lands in the FIRST wave
19+ # (prioritised — its result is the reference point for every sparse run).
20+ #
1721# Results land in $SUMMARY_DIR/*.json; collect_algo2_results.py renders them to
1822# examples/algo2_results.md (with the raw summaries embedded).
1923#
20- # NOTE on GPUs: 0 and 2 are broken on this host; 6/7 are often taken by other
21- # users. Override CUDA_VISIBLE_DEVICES to a free, working GPU before running.
22- export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:- 1}
24+ # GPU selection (in priority order):
25+ # 1. $GPUS env var (space-separated indices), e.g. GPUS="1 3 4 5" ./algo2.sh
26+ # 2. algorithm_scientist/free_gpus.sh auto-detection, minus $EXCLUDE_GPUS.
27+ # If neither yields a usable GPU the script errors out (it never guesses).
28+ # No GPUs are excluded by default; set $EXCLUDE_GPUS (space-separated indices)
29+ # to skip known-bad/busy ones, e.g. EXCLUDE_GPUS="0 2" ./algo2.sh. Parallelism
30+ # is capped at 4 even if more GPUs are free.
31+ set -uo pipefail
2332export HF_HOME=/raid/catalyst/models/
2433
2534MODEL=" zai-org/GLM-4.7-Flash"
2635DATA=" examples/aime26_glm.jsonl"
2736SUMMARY_DIR=" summary-glm4.7-flash"
2837TRIALS=16
2938TOPK_VAL=(61 93 125 157 253)
30- SPARSE_MODULES=(rope_aware_block_sparse_mla lserve_centroid_mla)
39+ SPARSE_MODULES=(rope_unaware_block_sparse_mla)
40+ MAX_PARALLEL=4
41+ EXCLUDE_GPUS=" ${EXCLUDE_GPUS:- } "
3142
3243COMMON=(
3344 --trials " $TRIALS "
@@ -37,27 +48,65 @@ COMMON=(
3748 --summary-dir " $SUMMARY_DIR " --skip-already-finished-check
3849)
3950
40- run () { # one config; do not abort the whole sweep if a single run fails
41- echo " >>> $* "
42- python examples/verify_algo.py " ${COMMON[@]} " " $@ " || echo " !!! FAILED: $* "
43- }
# --- resolve the GPU pool -----------------------------------------------------
# Fills GPU_POOL with the GPU indices that jobs will be scheduled on:
#   * $GPUS set and non-empty -> split on whitespace and used as-is
#     ($EXCLUDE_GPUS is NOT applied to an explicit list);
#   * otherwise               -> the first output line of
#     algorithm_scientist/free_gpus.sh, split on whitespace, minus every index
#     listed in $EXCLUDE_GPUS.
# Exits 1 when the pool ends up empty, whichever path produced it. The check
# must cover the $GPUS path too: a whitespace-only $GPUS yields an empty pool,
# and any later loop that steps by the pool size would then never advance.
HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
GPU_POOL=()
if [ -n "${GPUS:-}" ]; then
  read -r -a GPU_POOL <<< "$GPUS"
  if [ "${#GPU_POOL[@]}" -eq 0 ]; then
    echo "error: \$GPUS is set but names no GPU indices" >&2
    exit 1
  fi
else
  DETECTED=()
  # A failing or missing detector is treated the same as "nothing free".
  if FREE=$("$HERE/algorithm_scientist/free_gpus.sh" 2> /dev/null); then
    read -r -a DETECTED <<< "$FREE"
  fi
  # Guard on the length first: expanding an empty array under `set -u` is an
  # "unbound variable" error in bash releases before 4.4.
  if [ "${#DETECTED[@]}" -gt 0 ]; then
    for g in "${DETECTED[@]}"; do
      skip=0
      # Intentionally unquoted: EXCLUDE_GPUS is a space-separated index list.
      for x in ${EXCLUDE_GPUS:-}; do
        [ "$g" = "$x" ] && skip=1
      done
      [ "$skip" -eq 0 ] && GPU_POOL+=("$g")
    done
  fi
  # no fallback: refuse to guess GPU indices when detection finds nothing usable
  if [ "${#GPU_POOL[@]}" -eq 0 ]; then
    echo "error: no free GPUs detected — set \$GPUS to run on specific indices" >&2
    exit 1
  fi
fi
4473
45- # --- 1. full-attention dense baseline (run once; topk ignored) ----------------
46- # Use sglang's plain Triton backend for dense: the flashinfer dense-MLA path has
47- # an illegal-memory-access / page-index bug on this pool (see marks/mla/progress.md).
48- # block/page=32 (dense isn't compatible with block_size=16).
49- run --vortex-module-name full_attention --attention-backend triton \
50- --topk-val 253 --block-size 32 --page-size 32
74+ PARALLEL=${# GPU_POOL[@]}
75+ [ " $PARALLEL " -gt " $MAX_PARALLEL " ] && PARALLEL=" $MAX_PARALLEL "
76+ echo " === GPU pool: ${GPU_POOL[*]} (parallel=$PARALLEL ) ==="
5177
52- # --- 2/3. sparse modules: cuda_mla + tensor-core indexer, all layers sparse ---
78+ # --- build the job list (full_attention first => prioritised, first wave) -----
79+ JOBS=()
80+ # 1. full-attention dense baseline (run once; topk ignored).
81+ # Use sglang's plain Triton backend for dense: the flashinfer dense-MLA path
82+ # has an illegal-memory-access / page-index bug on this pool. block/page=32
83+ # (dense isn't compatible with block_size=16).
84+ # JOBS+=("--vortex-module-name full_attention --attention-backend triton --topk-val 253 --block-size 32 --page-size 32")
85+ # 2/3. sparse modules: triton + tensor-core indexer, all layers sparse.
5386for algo in " ${SPARSE_MODULES[@]} " ; do
5487 for k in " ${TOPK_VAL[@]} " ; do
55- run --vortex-module-name " $algo " --topk-val " $k " \
56- --attention-backend cuda_mla \
57- --vortex-impl-backend triton \
58- --vortex-use-tensor-core \
59- --vortex-layers-skip # MUST be last: no values => skip none
88+ JOBS+=(" --vortex-module-name $algo --topk-val $k --attention-backend triton --vortex-impl-backend triton --vortex-use-tensor-core --vortex-layers-skip" )
89+ done
90+ done
91+
#######################################
# Run one verify_algo.py configuration pinned to a single GPU.
# Globals:   COMMON (read) - arguments shared by every verify_algo.py run
# Arguments: $1  - GPU index, passed to the child as CUDA_VISIBLE_DEVICES
#            $2… - extra verify_algo.py arguments for this configuration
# Outputs:   a ">>>" banner before the run; a "!!! FAILED" line (including
#            the exit code) on stdout if the run fails
# Returns:   the exit status of verify_algo.py, so a caller that waits on the
#            job can tell success from failure. Callers that background the
#            job are unaffected and the sweep is not aborted.
#######################################
run_job () {
  local gpu="$1"; shift
  local status=0
  echo ">>> [GPU $gpu] $*"
  CUDA_VISIBLE_DEVICES="$gpu" python examples/verify_algo.py "${COMMON[@]}" "$@" \
    || status=$?
  if [ "$status" -ne 0 ]; then
    echo "!!! FAILED [GPU $gpu] (exit $status): $*"
  fi
  return "$status"
}
98+
# --- launch in waves of $PARALLEL, one job per GPU ----------------------------
NJOBS=${#JOBS[@]}

#######################################
# Launch every JOBS entry in sequential waves of up to $PARALLEL jobs.
# Within a wave, job i runs in the background on GPU_POOL[i]; the next wave
# starts only after every job of the current wave has finished.
# Globals:   JOBS (read)     - one whitespace-separated argument string per job
#            GPU_POOL (read) - GPU indices, at least $PARALLEL entries
#            PARALLEL (read) - jobs per wave; must be >= 1 when JOBS is non-empty
# Outputs:   a "=== wave N: jobs A..B ===" banner per wave
# Returns:   0 when all waves were launched and waited for; exits 1 if
#            PARALLEL < 1, because the outer loop steps by PARALLEL and a
#            zero step would never terminate.
#######################################
run_waves () {
  local njobs=${#JOBS[@]}
  local wave=1 start i last
  local -a job_args

  (( njobs > 0 )) || return 0
  if (( ${PARALLEL:-0} < 1 )); then
    echo "error: parallelism is ${PARALLEL:-unset}; need at least one GPU" >&2
    exit 1
  fi

  for (( start = 0; start < njobs; start += PARALLEL )); do
    last=$(( start + PARALLEL - 1 < njobs - 1 ? start + PARALLEL - 1 : njobs - 1 ))
    echo "=== wave $wave: jobs $start..$last ==="
    for (( i = 0; i < PARALLEL && start + i < njobs; i++ )); do
      # Split the job string into words with read -a rather than an unquoted
      # expansion: same whitespace splitting, but no pathname expansion.
      read -r -a job_args <<< "${JOBS[start + i]}"
      run_job "${GPU_POOL[i]}" "${job_args[@]}" &
    done
    wait
    wave=$(( wave + 1 ))
  done
}
run_waves
62111
63112# --- render results markdown --------------------------------------------------
0 commit comments