Skip to content

Commit f7bdb19

Browse files
committed
Add grouped-GEMM perf_test configs and launcher
Compares moe_grouped_gemm=false vs true on Qwen3-30BA3B (4n) and Qwen3-235B (16n). Launcher wraps the shared cluster_config.sh. Signed-off-by: sna <sna@nvidia.com>
1 parent 1713776 commit f7bdb19

6 files changed

Lines changed: 207 additions & 0 deletions

File tree

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
defaults: ../../recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml

# Grouped GEMM baseline for 235B: moe_grouped_gemm=false. Pairs with
# gemm_01_grouped to isolate the speedup from the grouped MoE expert GEMM.
# NOTE(review): nesting below was reconstructed from a flattened diff —
# verify key placement (esp. env_vars) against the base recipe.
checkpointing:
  enabled: false
grpo:
  max_num_steps: 10
logger:
  log_dir: logs/perf_test/qwen3_235b/gemm_00_baseline
  wandb_enabled: false
  tensorboard_enabled: false
policy:
  make_sequence_length_divisible_by: 128
  sequence_packing:
    enabled: true
  megatron_cfg:
    moe_grouped_gemm: false
env_vars:
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
  CUDA_HOME: /usr/local/cuda
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
defaults: ../../recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml

# Grouped GEMM enabled for 235B: single grouped GEMM across experts.
# Pairs with gemm_00_baseline to measure the speedup.
# NOTE(review): nesting below was reconstructed from a flattened diff —
# verify key placement (esp. env_vars) against the base recipe.
checkpointing:
  enabled: false
grpo:
  max_num_steps: 10
logger:
  log_dir: logs/perf_test/qwen3_235b/gemm_01_grouped
  wandb_enabled: false
  tensorboard_enabled: false
policy:
  make_sequence_length_divisible_by: 128
  sequence_packing:
    enabled: true
  megatron_cfg:
    moe_grouped_gemm: true
env_vars:
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
  CUDA_HOME: /usr/local/cuda
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
defaults: ../../recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml

# Grouped GEMM baseline: default moe_grouped_gemm=false path. Pairs with
# gemm_01_grouped to isolate the speedup from running MoE expert FCs as a
# single grouped GEMM instead of per-expert GEMMs.
# NOTE(review): nesting below was reconstructed from a flattened diff —
# verify key placement (esp. env_vars) against the base recipe.
checkpointing:
  enabled: false
grpo:
  max_num_steps: 10
logger:
  log_dir: logs/perf_test/qwen3_30ba3b/gemm_00_baseline
  wandb_enabled: false
  tensorboard_enabled: false
policy:
  make_sequence_length_divisible_by: 128
  sequence_packing:
    enabled: true
  megatron_cfg:
    moe_grouped_gemm: false
env_vars:
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
  CUDA_HOME: /usr/local/cuda
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
defaults: ../../recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml

# Grouped GEMM enabled: single grouped GEMM across experts instead of
# per-expert GEMMs. Pairs with gemm_00_baseline to measure the speedup.
# NOTE(review): nesting below was reconstructed from a flattened diff —
# verify key placement (esp. env_vars) against the base recipe.
checkpointing:
  enabled: false
grpo:
  max_num_steps: 10
logger:
  log_dir: logs/perf_test/qwen3_30ba3b/gemm_01_grouped
  wandb_enabled: false
  tensorboard_enabled: false
policy:
  make_sequence_length_divisible_by: 128
  sequence_packing:
    enabled: true
  megatron_cfg:
    moe_grouped_gemm: true
env_vars:
  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
  CUDA_HOME: /usr/local/cuda
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/bin/bash
# Cluster auto-detect and shared paths for perf_test submissions.
# Sourced by exp_*.sh scripts.
5+
# Detect the GPU count per node for a Slurm partition.
# Parses the gpu:N entry from the partition's GRES column; prints 4 as a
# fallback when sinfo is unavailable or reports no GPUs.
# Arguments: $1 - partition name (default: batch)
# Outputs:   GPU count to stdout
detect_gpus_per_node() {
  local part="${1:-batch}"
  local count
  count=$(sinfo -p "$part" -h -o "%G" 2>/dev/null | grep -oP 'gpu:\d+' | grep -oP '\d+' | head -1 || true)
  # Empty or zero means the probe failed; fall back to the common default.
  if [[ -z "$count" || "$count" -le 0 ]]; then
    count=4
  fi
  echo "$count"
}
15+
16+
# Populate the shared cluster globals (PARTITION, GPUS_PER_NODE,
# CLUSTER_TYPE, GRES_FLAG, container/paths), letting pre-set environment
# variables take precedence over detection and defaults.
# Arguments: $1 - fallback partition name (default: batch)
# Outputs:   [INFO] summary lines to stdout
setup_cluster_config() {
  local fallback_partition="${1:-batch}"
  PARTITION="${PARTITION:-$fallback_partition}"

  # Honor a caller-supplied GPU count; otherwise probe Slurm.
  [[ -n "${GPUS_PER_NODE:-}" ]] || GPUS_PER_NODE=$(detect_gpus_per_node "$fallback_partition")

  # 8 GPUs/node is treated as an H100 cluster; anything else as GB200.
  if (( GPUS_PER_NODE == 8 )); then
    CLUSTER_TYPE="H100"
  else
    CLUSTER_TYPE="GB200"
  fi

  # GRES flag: Lyris rejects --gres=gpu:N, so allow explicit opt-out via
  # GRES_FLAG= (set but empty). Only default when entirely unset.
  if [[ ! -v GRES_FLAG ]]; then
    GRES_FLAG="--gres=gpu:${GPUS_PER_NODE}"
  fi

  : "${BASE:=/lustre/fsw/portfolios/coreai/projects/coreai_dlalgo_nemorl/users/sna}"
  : "${CONTAINER:=${BASE}/HybridEP_test/nemo_rl.sqsh}"
  : "${MOUNTS:=/lustre:/lustre}"
  : "${ACCOUNT:=coreai_dlalgo_nemorl}"
  : "${HF_HOME:=${BASE}/HybridEP_test/hf_home}"
  : "${HF_DATASETS_CACHE:=${HF_HOME}/cache}"

  echo "[INFO] Cluster: ${CLUSTER_TYPE}, GPUs/node: ${GPUS_PER_NODE}, Partition: ${PARTITION}"
  echo "[INFO] Account: ${ACCOUNT}, GRES: ${GRES_FLAG:-<none>}"
  echo "[INFO] Container: ${CONTAINER}"
  echo "[INFO] HF_HOME: ${HF_HOME}"
}
45+
46+
# Export every cluster setting so child processes (sbatch, srun, the
# container entrypoint) inherit them.
export_cluster_config() {
  local name
  for name in GPUS_PER_NODE CONTAINER GRES_FLAG CLUSTER_TYPE PARTITION \
              BASE MOUNTS ACCOUNT HF_HOME HF_DATASETS_CACHE; do
    export "$name"
  done
}
50+
51+
# Submit a single perf_test variant.
52+
# Args: PROJECT_ROOT CONFIG_REL NUM_NODES JOB_NAME [EXTRA_ENV]
53+
# Example: submit_variant "$BASE/RL-selective-recompute" "perf_test/qwen3_30ba3b/recompute_00_no_ckpt" 4 "nrl-recompute-qwen-no-ckpt"
54+
submit_variant() {
55+
local project_root="$1"
56+
local config_rel="$2"
57+
local num_nodes="$3"
58+
local job_name="$4"
59+
local extra_env="${5:-}"
60+
61+
local log_dir="${project_root}/logs/${config_rel}"
62+
mkdir -p "$log_dir"
63+
64+
local uv_extra=""
65+
if [[ "$project_root" == *"moe-compute-opts"* ]] || [[ "$project_root" == *"high-priority-streams"* ]]; then
66+
uv_extra="--extra mcore"
67+
fi
68+
69+
# NRL_FORCE_REBUILD_VENVS=true forces uv to re-sync venvs against the
70+
# current pyproject.toml/uv.lock on every run. NEMO_RL_VENV_DIR pins the
71+
# venv location into the project dir so each worktree has its own venv.
72+
# Both must be exported inside the container, not just the login shell.
73+
local command="cd ${project_root} && export NRL_IGNORE_VERSION_MISMATCH=1 NRL_FORCE_REBUILD_VENVS=true NEMO_RL_VENV_DIR=${project_root}/venvs CUDA_HOME=/usr/local/cuda HF_HOME=${HF_HOME} HF_DATASETS_CACHE=${HF_DATASETS_CACHE} HF_HUB_OFFLINE=1 ${extra_env} && uv run ${uv_extra} examples/run_grpo.py --config examples/configs/${config_rel}.yaml"
74+
75+
# TIME_LIMIT override lets longer-running experiments use longer partitions
76+
# (Lyris gb200 has a 5h cap vs OCI-HSG's shorter slots). Default 1:30:00
77+
# covers the original short sweeps.
78+
local time_limit="${TIME_LIMIT:-1:30:00}"
79+
80+
echo "[SUBMIT] ${job_name} (nodes=${num_nodes}, time=${time_limit})"
81+
CONTAINER="$CONTAINER" MOUNTS="$MOUNTS" GPUS_PER_NODE="$GPUS_PER_NODE" \
82+
COMMAND="$command" BASE_LOG_DIR="$log_dir" \
83+
sbatch \
84+
--nodes="$num_nodes" \
85+
$GRES_FLAG \
86+
--time="$time_limit" \
87+
--segment="$num_nodes" \
88+
-A "$ACCOUNT" -p "$PARTITION" \
89+
--job-name="$job_name" \
90+
--output="${log_dir}/slurm-%j.out" \
91+
"${project_root}/ray.sub"
92+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#!/bin/bash
# Launch grouped-GEMM perf_test variants on Lyris (GB200 aarch64).
# Usage: bash exp_grouped_gemm.sh [variant_filter]

set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/cluster_config.sh"
setup_cluster_config "${PARTITION:-batch}"
export_cluster_config

PROJECT_ROOT="${BASE}/RL-grouped-gemm"

# Each entry: "<config_rel> <num_nodes> <job_suffix>".
declare -a JOBS=(
  "perf_test/qwen3_30ba3b/gemm_00_baseline 4 qwen-gemm-baseline"
  "perf_test/qwen3_30ba3b/gemm_01_grouped 4 qwen-gemm-grouped"
  "perf_test/qwen3_235b/gemm_00_baseline 16 qwen235b-gemm-baseline"
  "perf_test/qwen3_235b/gemm_01_grouped 16 qwen235b-gemm-grouped"
)

FILTER="${1:-}"

for entry in "${JOBS[@]}"; do
  read -r cfg nodes tag <<<"$entry"
  # Optional substring filter: keep a job when either its config path or
  # its job suffix matches; an empty filter keeps everything.
  if [[ -n "$FILTER" && "$cfg" != *"$FILTER"* && "$tag" != *"$FILTER"* ]]; then
    continue
  fi
  submit_variant "$PROJECT_ROOT" "$cfg" "$nodes" "nrl-gemm-${tag}"
done

echo ""
echo "[MONITOR] squeue -u \$USER -o '%.18i %.30j %.8T %.10M %R'"

0 commit comments

Comments
 (0)