
Commit 941d8ad: fix benchmark_fused_moe
1 parent: d75db86

1 file changed: benchmark/scripts/benchmark_fused_moe.py (3 additions, 359 deletions)
@@ -39,363 +39,7 @@
 
 device = infer_device()
 
-
-# ---------------------------------------------------------------------------
-# HuggingFace reference: Python loop per expert
-# Matches transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock
-# ---------------------------------------------------------------------------
-
-
-def _huggingface_moe_forward(x, gate_up_proj, down_proj, top_k_index, top_k_weights):
-    T, H = x.shape
-    E = gate_up_proj.shape[0]
-    final = torch.zeros_like(x)
-    with torch.no_grad():
-        expert_mask = torch.nn.functional.one_hot(top_k_index.long(), num_classes=E)
-        expert_mask = expert_mask.permute(2, 1, 0)
-        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-    for eh in expert_hit:
-        eidx = eh[0]
-        top_k_pos, token_idx = torch.where(expert_mask[eidx])
-        curr = x[token_idx]
-        gate, up = nn.functional.linear(curr, gate_up_proj[eidx]).chunk(2, dim=-1)
-        curr = nn.functional.silu(gate) * up
-        curr = nn.functional.linear(curr, down_proj[eidx])
-        curr = curr * top_k_weights[token_idx, top_k_pos, None]
-        final.index_add_(0, token_idx, curr.to(final.dtype))
-    return final
-
-
-# ---------------------------------------------------------------------------
-# MoE model configurations
-# EP-adjusted values are baked in: T_local = T*K/ep_size, E_local = E/ep_size
-# ---------------------------------------------------------------------------
-
-
-@dataclass
-class MoEBenchConfig:
-    T: int  # tokens per GPU (EP-adjusted)
-    E: int  # experts per GPU (EP-adjusted)
-    H: int  # hidden size
-    intermediate_dim: int  # expert intermediate size
-    K: int  # top-k
-
-
-MOE_MODEL_CONFIGS = {
-    # Mixtral 8x7B — EP=1 and EP=4
-    "mixtral-8x7b": MoEBenchConfig(T=2048, E=8, H=4096, intermediate_dim=14336, K=2),
-    "mixtral-8x7b-ep4": MoEBenchConfig(T=512, E=2, H=4096, intermediate_dim=14336, K=2),
-    # Mixtral 8x22B — EP=1 and EP=4
-    "mixtral-8x22b": MoEBenchConfig(T=2048, E=8, H=6144, intermediate_dim=16384, K=2),
-    "mixtral-8x22b-ep4": MoEBenchConfig(T=512, E=2, H=6144, intermediate_dim=16384, K=2),
-    # Qwen3-MoE-30B — EP=1 and EP=8 and EP=16
-    "qwen3-moe-30b": MoEBenchConfig(T=8192, E=128, H=2048, intermediate_dim=768, K=8),
-    "qwen3-moe-30b-ep8": MoEBenchConfig(T=1024, E=16, H=2048, intermediate_dim=768, K=8),
-    "qwen3-moe-30b-ep16": MoEBenchConfig(T=512, E=8, H=2048, intermediate_dim=768, K=8),
-    # Qwen3-MoE-235B — EP=16 and EP=32
-    "qwen3-moe-235b-ep16": MoEBenchConfig(T=256, E=8, H=7168, intermediate_dim=2560, K=8),
-    "qwen3-moe-235b-ep32": MoEBenchConfig(T=128, E=4, H=7168, intermediate_dim=2560, K=8),
-    # DeepSeek-V3/R1 — EP=32 and EP=64
-    "deepseek-v3-ep32": MoEBenchConfig(T=128, E=8, H=7168, intermediate_dim=2048, K=8),
-    "deepseek-v3-ep64": MoEBenchConfig(T=64, E=4, H=7168, intermediate_dim=2048, K=8),
-}
-
-DEFAULT_MOE_MODEL = "qwen3-moe-30b"
-
-# Expert counts used in the num_experts sweep (independent of model).
-EXPERT_SWEEP_VALUES = [8, 16, 32, 64, 128]
-
-
-# ---------------------------------------------------------------------------
-# Input generation
-# ---------------------------------------------------------------------------
-
-
-def _make_moe_inputs(T, E, H, intermediate_dim, K, dtype, requires_grad=True):
-    torch.manual_seed(42)
-    x = torch.randn(T, H, dtype=dtype, device=device, requires_grad=requires_grad)
-    gate_up_proj = (
-        torch.randn(E, 2 * intermediate_dim, H, dtype=dtype, device=device, requires_grad=requires_grad) * 0.02
-    )
-    down_proj = torch.randn(E, H, intermediate_dim, dtype=dtype, device=device, requires_grad=requires_grad) * 0.02
-    logits = torch.randn(T, E, device=device)
-    top_k_index = torch.topk(logits, K, dim=-1).indices.to(torch.int32)
-    top_k_weights = (
-        torch.softmax(torch.gather(logits, 1, top_k_index.long()), dim=-1).to(dtype).requires_grad_(requires_grad)
-    )
-    return x, gate_up_proj, down_proj, top_k_index, top_k_weights
-
-
-# ---------------------------------------------------------------------------
-# Framework-integrated benchmark functions
-# ---------------------------------------------------------------------------
-
-
-def _setup_fused_moe(input: SingleBenchmarkRunInput):
-    """Return (fwd_fn, grad_tensors) for the given provider and config.
-
-    extra_benchmark_config keys:
-        sweep_dim : "T" or "E" — which dim input.x varies
-        T, E : fixed values for the dimension not being swept (None when swept)
-        H, intermediate_dim, K : model dimensions
-        dtype : torch.dtype
-    """
-    cfg = input.extra_benchmark_config
-    T = int(input.x) if cfg["sweep_dim"] == "T" else cfg["T"]
-    E = int(input.x) if cfg["sweep_dim"] == "E" else cfg["E"]
-    H, intermediate_dim, K = cfg["H"], cfg["intermediate_dim"], cfg["K"]
-    dtype = cfg["dtype"]
-
-    x, gup, dn, idx, wts = _make_moe_inputs(T, E, H, intermediate_dim, K, dtype, requires_grad=True)
-
-    if input.kernel_provider == "liger":
-
-        def fwd_fn():
-            return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts)
-    elif input.kernel_provider == "huggingface":
-
-        def fwd_fn():
-            return _huggingface_moe_forward(x, gup, dn, idx, wts)
-    else:
-        raise ValueError(f"Unknown provider: {input.kernel_provider}")
-
-    return fwd_fn, [x, gup, dn, wts]
-
-
-def bench_speed_fused_moe(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
-    fwd_fn, grad_tensors = _setup_fused_moe(input)
-    return run_speed_benchmark(fwd_fn, input.kernel_operation_mode, grad_tensors)
-
-
-def bench_memory_fused_moe(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
-    fwd_fn, _ = _setup_fused_moe(input)
-    return run_memory_benchmark(fwd_fn, input.kernel_operation_mode)
-
-
-# ---------------------------------------------------------------------------
-# Autotune warmup
-# ---------------------------------------------------------------------------
-
-
-def _warmup_liger(T, E, H, intermediate_dim, K, dtype, sweep_dim):
-    """Run one full fwd+bwd to exhaust Triton autotune for (H, intermediate_dim).
-
-    Triton autotune key is (H_dim, I_dim), so a single call is sufficient to
-    cache the best config for all subsequent calls with the same H and intermediate_dim.
-    For the num_experts sweep we also call this once per E value to warm up
-    CUDA caches for each expert count before do_bench starts timing.
-    """
-    warmup_input = SingleBenchmarkRunInput(
-        x=T if sweep_dim == "T" else E,
-        kernel_provider="liger",
-        extra_benchmark_config={
-            "sweep_dim": sweep_dim,
-            "T": T,
-            "E": E,
-            "H": H,
-            "intermediate_dim": intermediate_dim,
-            "K": K,
-            "dtype": dtype,
-        },
-    )
-    warmup_fn, _ = _setup_fused_moe(warmup_input)
-    warmup_out = warmup_fn()
-    warmup_out.sum().backward()
-    del warmup_out
+if device == "cuda":
+    torch.cuda.synchronize()
+elif device == "npu":
     torch.npu.synchronize()
-
-
-# ---------------------------------------------------------------------------
-# Model-preset benchmark (standalone, stdout only, no CSV)
-# ---------------------------------------------------------------------------
-
-
-def _run_model_preset_benchmark(preset_name: str):
-    cfg = MOE_MODEL_CONFIGS[preset_name]
-    dtype = torch.bfloat16
-
-    print(f"\n=== Model preset: {preset_name} ===")
-    print(f"T={cfg.T} E={cfg.E} H={cfg.H} intermediate_dim={cfg.intermediate_dim} K={cfg.K} dtype=bfloat16")
-    print(f"{'provider':<14} {'mode':<10} {'ms (p50)':>9} {'ms (p20)':>9} {'ms (p80)':>9} {'mem_mb':>8}")
-    print("-" * 65)
-
-    for provider in ["liger", "huggingface"]:
-        for mode in ["forward", "backward", "full"]:
-            x, gup, dn, idx, wts = _make_moe_inputs(
-                cfg.T, cfg.E, cfg.H, cfg.intermediate_dim, cfg.K, dtype, requires_grad=True
-            )
-            if provider == "liger":
-
-                def fwd_fn():
-                    return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts)
-            else:
-
-                def fwd_fn():
-                    return _huggingface_moe_forward(x, gup, dn, idx, wts)
-
-            grad_tensors = [x, gup, dn, wts]
-            speed = run_speed_benchmark(fwd_fn, mode, grad_tensors)
-            mem = run_memory_benchmark(fwd_fn, mode)
-
-            mem_str = f"{mem.y_50:>8.1f}"
-            print(f"{provider:<14} {mode:<10} {speed.y_50:>9.3f} {speed.y_20:>9.3f} {speed.y_80:>9.3f} {mem_str}")
-    print()
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Benchmark LigerFusedMoEFunction")
-    parser.add_argument(
-        "--overwrite",
-        action="store_true",
-        help="Overwrite existing CSV benchmark data",
-    )
-    parser.add_argument(
-        "--model",
-        type=str,
-        default=DEFAULT_MOE_MODEL,
-        choices=list(MOE_MODEL_CONFIGS.keys()),
-        help=(
-            f"MoE model config for framework-mode sweep (default: {DEFAULT_MOE_MODEL} = Qwen3-MoE-30B). "
-            "Overrides all MoE params: E, H, intermediate_dim, K, and base token count T."
-        ),
-    )
-    parser.add_argument(
-        "--sweep-dim",
-        choices=["num_tokens", "num_experts"],
-        default="num_tokens",
-        help="Dimension to sweep in standard framework mode",
-    )
-    parser.add_argument(
-        "--model-preset",
-        choices=list(MOE_MODEL_CONFIGS.keys()),
-        default=None,
-        dest="model_preset",
-        help="Run standalone model-preset benchmark (stdout only, no CSV)",
-    )
-    args = parser.parse_args()
-
-    if args.model_preset is not None:
-        # Standalone model-preset mode: no CSV, prints a table per preset
-        _run_model_preset_benchmark(args.model_preset)
-    else:
-        # Standard framework-integrated mode.
-        # All MoE parameters are derived from the selected model config so that
-        # --model deepseek-v3-ep32 correctly overrides E, H, intermediate_dim, K, and T.
-        moe_cfg = MOE_MODEL_CONFIGS[args.model]
-        E = moe_cfg.E
-        H = moe_cfg.H
-        intermediate_dim = moe_cfg.intermediate_dim
-        K = moe_cfg.K
-        probe_T = moe_cfg.T  # representative token count for probing and warmup
-        dtype = torch.bfloat16
-
-        print(
-            f"Model: {args.model} — E={E}, H={H}, intermediate_dim={intermediate_dim}, K={K}, "
-            f"T_base={probe_T}, dtype={dtype}"
-        )
-
-        # Memory probe using huggingface (no Triton, higher footprint = safe upper bound).
-        def _probe():
-            probe_input = SingleBenchmarkRunInput(
-                x=probe_T,
-                kernel_provider="huggingface",
-                extra_benchmark_config={
-                    "sweep_dim": "T",
-                    "T": None,
-                    "E": E,
-                    "H": H,
-                    "intermediate_dim": intermediate_dim,
-                    "K": K,
-                    "dtype": dtype,
-                },
-            )
-            fwd_fn, _ = _setup_fused_moe(probe_input)
-            return fwd_fn()
-
-        peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe)
-        kernel_bpt = peak_bytes // probe_T
-
-        # Pre-warm Liger's Triton autotune before benchmarks start.
-        #
-        # Autotune key is (H_dim, I_dim) — one warmup per (H, intermediate_dim) pair is sufficient
-        # to cache the best config for the entire sweep.
-        #
-        # For num_tokens sweep: one pass with the model's base T is enough.
-        # For num_experts sweep: one pass per E value in EXPERT_SWEEP_VALUES to also
-        # warm up CUDA caches for each expert count, since weight tensor sizes differ.
-        print(f"Pre-warming Liger autotune (H={H}, intermediate_dim={intermediate_dim})...")
-
-        if args.sweep_dim == "num_tokens":
-            _warmup_liger(probe_T, E, H, intermediate_dim, K, dtype, sweep_dim="T")
-        else:  # num_experts
-            for e_val in EXPERT_SWEEP_VALUES:
-                print(f"  warmup E={e_val}...")
-                _warmup_liger(probe_T, e_val, H, intermediate_dim, K, dtype, sweep_dim="E")
-
-        torch.npu.synchronize()
-        print("Autotune warmup complete.\n")
-
-        if args.sweep_dim == "num_tokens":
-            # Derive a memory-safe upper bound for T from the probe measurement.
-            # Target 40% GPU memory utilisation to leave headroom for framework overhead.
-            usable_bytes = get_total_gpu_memory() * (1024**3) * 0.4
-            max_T = min(32768, max(256, int(usable_bytes / kernel_bpt)))
-            # Round down to nearest power-of-two for clean x-axis values.
-            max_T = 2 ** int(math.log2(max_T)) if max_T >= 256 else 256
-            x_values = [2**i for i in range(7, int(math.log2(max_T)) + 1)]
-            extra_configs = [
-                {
-                    "sweep_dim": "T",
-                    "T": None,  # varied by framework
-                    "E": E,
-                    "H": H,
-                    "intermediate_dim": intermediate_dim,
-                    "K": K,
-                    "dtype": dtype,
-                }
-            ]
-            x_name, x_label = "T", "num_tokens"
-        else:  # num_experts
-            x_values = EXPERT_SWEEP_VALUES
-            extra_configs = [
-                {
-                    "sweep_dim": "E",
-                    "T": probe_T,  # fixed at model's base token count
-                    "E": None,  # varied by framework
-                    "H": H,
-                    "intermediate_dim": intermediate_dim,
-                    "K": K,
-                    "dtype": dtype,
-                }
-            ]
-            x_name, x_label = "E", "num_experts"
-
-        common_configs = {
-            "kernel_name": "fused_moe",
-            "x_name": x_name,
-            "x_label": x_label,
-            "x_values": x_values,
-            "kernel_providers": ["liger", "huggingface"],
-            "extra_benchmark_configs": extra_configs,
-            "overwrite": args.overwrite,
-        }
-
-        run_benchmarks(
-            bench_test_fn=bench_speed_fused_moe,
-            kernel_operation_modes=["full", "forward", "backward"],
-            metric_name="speed",
-            metric_unit="ms",
-            **common_configs,
-        )
-        run_benchmarks(
-            bench_test_fn=bench_memory_fused_moe,
-            kernel_operation_modes=["full", "forward", "backward"],
-            metric_name="memory",
-            metric_unit="MB",
-            **common_configs,
-        )
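
Note on the three added lines: the removed script called torch.npu.synchronize() unconditionally, both inside _warmup_liger and after the warmup loop in the main block. On PyTorch builds without the Ascend torch_npu extension (which is what provides the torch.npu namespace), such a call typically raises AttributeError; the surviving code instead guards each backend-specific sync behind the result of infer_device(). A minimal standalone sketch of the same pattern; the helper name device_synchronize is hypothetical, not part of the repo:

import torch

def device_synchronize(device: str) -> None:
    # Only invoke a backend's synchronize when that backend is in use;
    # torch.npu typically exists only when torch_npu (Ascend) is installed.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "npu":
        torch.npu.synchronize()  # requires the torch_npu extension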

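The removed _huggingface_moe_forward mirrors the per-expert dispatch loop of transformers' MixtralSparseMoeBlock: one-hot the top-k routing indices, loop over the experts that received tokens, run each expert's fused gate/up projection and down projection on its token subset, scale by the routing weights, and scatter-add back into the output. A small CPU usage sketch, assuming the removed function above is in scope; the toy sizes are invented for illustration and follow the shapes produced by _make_moe_inputs:

import torch

# Hypothetical toy sizes: 8 tokens, 4 experts, hidden 16, intermediate 32, top-2
T, E, H, I_dim, K = 8, 4, 16, 32, 2
x = torch.randn(T, H)
gate_up_proj = torch.randn(E, 2 * I_dim, H) * 0.02  # fused gate+up weights per expert
down_proj = torch.randn(E, H, I_dim) * 0.02         # down projection per expert
logits = torch.randn(T, E)                          # router logits
top_k_index = torch.topk(logits, K, dim=-1).indices.to(torch.int32)
top_k_weights = torch.softmax(torch.gather(logits, 1, top_k_index.long()), dim=-1)

out = _huggingface_moe_forward(x, gate_up_proj, down_proj, top_k_index, top_k_weights)
assert out.shape == (T, H)  # one mixed output row per token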
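For the num_tokens sweep, the removed sizing logic capped T at the largest power of two that fits a 40% memory budget derived from the huggingface probe. A worked example with invented numbers, assuming get_total_gpu_memory() returns whole GiB and kernel_bpt is the probe's peak bytes per token:

import math

total_gib = 80                # hypothetical 80 GiB device
kernel_bpt = 2 * 1024**2      # hypothetical 2 MiB peak memory per token

usable_bytes = total_gib * (1024**3) * 0.4                    # 40% budget = 32 GiB
max_T = min(32768, max(256, int(usable_bytes / kernel_bpt)))  # 16384
max_T = 2 ** int(math.log2(max_T)) if max_T >= 256 else 256   # round down to 2**n
x_values = [2**i for i in range(7, int(math.log2(max_T)) + 1)]
print(x_values)  # [128, 256, 512, 1024, 2048, 4096, 8192, 16384]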