|
39 | 39 |
|
40 | 40 | device = infer_device() |
41 | 41 |
|
42 | | - |
43 | | -# --------------------------------------------------------------------------- |
44 | | -# HuggingFace reference: Python loop per expert |
45 | | -# Matches transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock |
46 | | -# --------------------------------------------------------------------------- |
47 | | - |
48 | | - |
49 | | -def _huggingface_moe_forward(x, gate_up_proj, down_proj, top_k_index, top_k_weights): |
50 | | - T, H = x.shape |
51 | | - E = gate_up_proj.shape[0] |
52 | | - final = torch.zeros_like(x) |
53 | | - with torch.no_grad(): |
54 | | - expert_mask = torch.nn.functional.one_hot(top_k_index.long(), num_classes=E) |
55 | | - expert_mask = expert_mask.permute(2, 1, 0) |
56 | | - expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() |
57 | | - for eh in expert_hit: |
58 | | - eidx = eh[0] |
59 | | - top_k_pos, token_idx = torch.where(expert_mask[eidx]) |
60 | | - curr = x[token_idx] |
61 | | - gate, up = nn.functional.linear(curr, gate_up_proj[eidx]).chunk(2, dim=-1) |
62 | | - curr = nn.functional.silu(gate) * up |
63 | | - curr = nn.functional.linear(curr, down_proj[eidx]) |
64 | | - curr = curr * top_k_weights[token_idx, top_k_pos, None] |
65 | | - final.index_add_(0, token_idx, curr.to(final.dtype)) |
66 | | - return final |
67 | | - |
68 | | - |
69 | | -# --------------------------------------------------------------------------- |
70 | | -# MoE model configurations |
71 | | -# EP-adjusted values are baked in: T_local = T*K/ep_size, E_local = E/ep_size |
72 | | -# --------------------------------------------------------------------------- |
73 | | - |
74 | | - |
@dataclass
class MoEBenchConfig:
    """Per-GPU MoE benchmark dimensions (EP-adjusted values baked in)."""

    T: int  # tokens processed on this GPU (already divided by EP size)
    E: int  # experts resident on this GPU (already divided by EP size)
    H: int  # model hidden size
    intermediate_dim: int  # expert MLP intermediate size
    K: int  # router top-k


# Model presets. Each entry stores the per-GPU (EP-adjusted) dimensions, so
# e.g. "-ep4" variants carry T and E already divided across expert-parallel ranks.
MOE_MODEL_CONFIGS = {
    # Mixtral 8x7B — EP=1 and EP=4
    "mixtral-8x7b": MoEBenchConfig(T=2048, E=8, H=4096, intermediate_dim=14336, K=2),
    "mixtral-8x7b-ep4": MoEBenchConfig(T=512, E=2, H=4096, intermediate_dim=14336, K=2),
    # Mixtral 8x22B — EP=1 and EP=4
    "mixtral-8x22b": MoEBenchConfig(T=2048, E=8, H=6144, intermediate_dim=16384, K=2),
    "mixtral-8x22b-ep4": MoEBenchConfig(T=512, E=2, H=6144, intermediate_dim=16384, K=2),
    # Qwen3-MoE-30B — EP=1, EP=8, EP=16
    "qwen3-moe-30b": MoEBenchConfig(T=8192, E=128, H=2048, intermediate_dim=768, K=8),
    "qwen3-moe-30b-ep8": MoEBenchConfig(T=1024, E=16, H=2048, intermediate_dim=768, K=8),
    "qwen3-moe-30b-ep16": MoEBenchConfig(T=512, E=8, H=2048, intermediate_dim=768, K=8),
    # Qwen3-MoE-235B — EP=16 and EP=32
    "qwen3-moe-235b-ep16": MoEBenchConfig(T=256, E=8, H=7168, intermediate_dim=2560, K=8),
    "qwen3-moe-235b-ep32": MoEBenchConfig(T=128, E=4, H=7168, intermediate_dim=2560, K=8),
    # DeepSeek-V3/R1 — EP=32 and EP=64
    "deepseek-v3-ep32": MoEBenchConfig(T=128, E=8, H=7168, intermediate_dim=2048, K=8),
    "deepseek-v3-ep64": MoEBenchConfig(T=64, E=4, H=7168, intermediate_dim=2048, K=8),
}

DEFAULT_MOE_MODEL = "qwen3-moe-30b"

# Expert counts used by the num_experts sweep (independent of the model preset).
EXPERT_SWEEP_VALUES = [8, 16, 32, 64, 128]
107 | | - |
108 | | - |
109 | | -# --------------------------------------------------------------------------- |
110 | | -# Input generation |
111 | | -# --------------------------------------------------------------------------- |
112 | | - |
113 | | - |
def _make_moe_inputs(T, E, H, intermediate_dim, K, dtype, requires_grad=True):
    """Create deterministic synthetic MoE inputs on the benchmark device.

    Returns (x, gate_up_proj, down_proj, top_k_index, top_k_weights).
    The seed is fixed so every provider benchmarks identical data; routing
    weights are a softmax over the gathered top-k router logits.
    """
    torch.manual_seed(42)
    hidden_states = torch.randn(T, H, dtype=dtype, device=device, requires_grad=requires_grad)
    w_gate_up = (
        torch.randn(E, 2 * intermediate_dim, H, dtype=dtype, device=device, requires_grad=requires_grad) * 0.02
    )
    w_down = torch.randn(E, H, intermediate_dim, dtype=dtype, device=device, requires_grad=requires_grad) * 0.02
    router_logits = torch.randn(T, E, device=device)
    chosen = torch.topk(router_logits, K, dim=-1).indices.to(torch.int32)
    gathered = torch.gather(router_logits, 1, chosen.long())
    routing_weights = torch.softmax(gathered, dim=-1).to(dtype).requires_grad_(requires_grad)
    return hidden_states, w_gate_up, w_down, chosen, routing_weights
127 | | - |
128 | | - |
129 | | -# --------------------------------------------------------------------------- |
130 | | -# Framework-integrated benchmark functions |
131 | | -# --------------------------------------------------------------------------- |
132 | | - |
133 | | - |
def _setup_fused_moe(input: SingleBenchmarkRunInput):
    """Return (fwd_fn, grad_tensors) for the given provider and config.

    extra_benchmark_config keys:
        sweep_dim : "T" or "E" — which dim input.x varies
        T, E : fixed values for the dimension not being swept (None when swept)
        H, intermediate_dim, K : model dimensions
        dtype : torch.dtype
    """
    cfg = input.extra_benchmark_config
    sweep = cfg["sweep_dim"]
    # The swept dimension comes from input.x; the other one is fixed in cfg.
    T = int(input.x) if sweep == "T" else cfg["T"]
    E = int(input.x) if sweep == "E" else cfg["E"]

    x, gup, dn, idx, wts = _make_moe_inputs(
        T, E, cfg["H"], cfg["intermediate_dim"], cfg["K"], cfg["dtype"], requires_grad=True
    )

    provider = input.kernel_provider
    if provider == "liger":

        def fwd_fn():
            return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts)

    elif provider == "huggingface":

        def fwd_fn():
            return _huggingface_moe_forward(x, gup, dn, idx, wts)

    else:
        raise ValueError(f"Unknown provider: {input.kernel_provider}")

    # top_k_index is integer-typed and gets no gradient; everything else does.
    return fwd_fn, [x, gup, dn, wts]
163 | | - |
164 | | - |
def bench_speed_fused_moe(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
    """Time the selected provider's MoE kernel for the requested operation mode."""
    forward, grads = _setup_fused_moe(input)
    return run_speed_benchmark(forward, input.kernel_operation_mode, grads)
168 | | - |
169 | | - |
def bench_memory_fused_moe(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
    """Measure peak memory of the selected provider's MoE kernel."""
    forward, _grads = _setup_fused_moe(input)
    return run_memory_benchmark(forward, input.kernel_operation_mode)
173 | | - |
174 | | - |
175 | | -# --------------------------------------------------------------------------- |
176 | | -# Autotune warmup |
177 | | -# --------------------------------------------------------------------------- |
178 | | - |
179 | | - |
def _warmup_liger(T, E, H, intermediate_dim, K, dtype, sweep_dim):
    """Run one full fwd+bwd to exhaust Triton autotune for (H, intermediate_dim).

    Triton autotune key is (H_dim, I_dim), so a single call is sufficient to
    cache the best config for all subsequent calls with the same H and
    intermediate_dim. For the num_experts sweep we also call this once per E
    value to warm up device caches for each expert count before do_bench
    starts timing.
    """
    warmup_input = SingleBenchmarkRunInput(
        x=T if sweep_dim == "T" else E,
        kernel_provider="liger",
        extra_benchmark_config={
            "sweep_dim": sweep_dim,
            "T": T,
            "E": E,
            "H": H,
            "intermediate_dim": intermediate_dim,
            "K": K,
            "dtype": dtype,
        },
    )
    warmup_fn, _ = _setup_fused_moe(warmup_input)
    warmup_out = warmup_fn()
    warmup_out.sum().backward()
    del warmup_out
    # Fix: synchronize on whichever backend is actually in use. The previous
    # unconditional torch.npu.synchronize() raised AttributeError on CUDA
    # builds where the torch_npu extension is not installed.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "npu":
        torch.npu.synchronize()
206 | | - |
207 | | - |
208 | | -# --------------------------------------------------------------------------- |
209 | | -# Model-preset benchmark (standalone, stdout only, no CSV) |
210 | | -# --------------------------------------------------------------------------- |
211 | | - |
212 | | - |
def _run_model_preset_benchmark(preset_name: str):
    """Benchmark one model preset for both providers and print a table.

    Standalone mode: results go to stdout only (no CSV). For each
    provider x mode pair, fresh inputs are generated and both speed
    percentiles and peak memory are reported.
    """
    cfg = MOE_MODEL_CONFIGS[preset_name]
    dtype = torch.bfloat16

    print(f"\n=== Model preset: {preset_name} ===")
    print(f"T={cfg.T} E={cfg.E} H={cfg.H} intermediate_dim={cfg.intermediate_dim} K={cfg.K} dtype=bfloat16")
    print(f"{'provider':<14} {'mode':<10} {'ms (p50)':>9} {'ms (p20)':>9} {'ms (p80)':>9} {'mem_mb':>8}")
    print("-" * 65)

    for provider in ["liger", "huggingface"]:
        for mode in ["forward", "backward", "full"]:
            x, gup, dn, idx, wts = _make_moe_inputs(
                cfg.T, cfg.E, cfg.H, cfg.intermediate_dim, cfg.K, dtype, requires_grad=True
            )
            # Bind a fresh forward closure per (provider, mode); it is consumed
            # immediately below, so loop-variable capture is safe here.
            if provider == "liger":

                def fwd_fn():
                    return LigerFusedMoEFunction.apply(x, gup, dn, idx, wts)
            else:

                def fwd_fn():
                    return _huggingface_moe_forward(x, gup, dn, idx, wts)

            speed = run_speed_benchmark(fwd_fn, mode, [x, gup, dn, wts])
            mem = run_memory_benchmark(fwd_fn, mode)

            mem_col = f"{mem.y_50:>8.1f}"
            print(f"{provider:<14} {mode:<10} {speed.y_50:>9.3f} {speed.y_20:>9.3f} {speed.y_80:>9.3f} {mem_col}")
    print()
243 | | - |
244 | | - |
245 | | -# --------------------------------------------------------------------------- |
246 | | -# Main |
247 | | -# --------------------------------------------------------------------------- |
248 | | - |
249 | | - |
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark LigerFusedMoEFunction")
    parser.add_argument(
        "--overwrite",
        action="store_true",
        help="Overwrite existing CSV benchmark data",
    )
    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MOE_MODEL,
        choices=list(MOE_MODEL_CONFIGS.keys()),
        help=(
            f"MoE model config for framework-mode sweep (default: {DEFAULT_MOE_MODEL} = Qwen3-MoE-30B). "
            "Overrides all MoE params: E, H, intermediate_dim, K, and base token count T."
        ),
    )
    parser.add_argument(
        "--sweep-dim",
        choices=["num_tokens", "num_experts"],
        default="num_tokens",
        help="Dimension to sweep in standard framework mode",
    )
    parser.add_argument(
        "--model-preset",
        choices=list(MOE_MODEL_CONFIGS.keys()),
        default=None,
        dest="model_preset",
        help="Run standalone model-preset benchmark (stdout only, no CSV)",
    )
    args = parser.parse_args()

    if args.model_preset is not None:
        # Standalone model-preset mode: no CSV, prints a table per preset
        _run_model_preset_benchmark(args.model_preset)
    else:
        # Standard framework-integrated mode.
        # All MoE parameters are derived from the selected model config so that
        # --model deepseek-v3-ep32 correctly overrides E, H, intermediate_dim, K, and T.
        moe_cfg = MOE_MODEL_CONFIGS[args.model]
        E = moe_cfg.E
        H = moe_cfg.H
        intermediate_dim = moe_cfg.intermediate_dim
        K = moe_cfg.K
        probe_T = moe_cfg.T  # representative token count for probing and warmup
        dtype = torch.bfloat16

        print(
            f"Model: {args.model} — E={E}, H={H}, intermediate_dim={intermediate_dim}, K={K}, "
            f"T_base={probe_T}, dtype={dtype}"
        )

        # Memory probe using huggingface (no Triton, higher footprint = safe upper bound).
        def _probe():
            probe_input = SingleBenchmarkRunInput(
                x=probe_T,
                kernel_provider="huggingface",
                extra_benchmark_config={
                    "sweep_dim": "T",
                    "T": None,
                    "E": E,
                    "H": H,
                    "intermediate_dim": intermediate_dim,
                    "K": K,
                    "dtype": dtype,
                },
            )
            fwd_fn, _ = _setup_fused_moe(probe_input)
            return fwd_fn()

        peak_bytes = estimate_kernel_peak_memory(probe_fn=_probe)
        kernel_bpt = peak_bytes // probe_T  # observed peak bytes per token

        # Pre-warm Liger's Triton autotune before benchmarks start.
        #
        # Autotune key is (H_dim, I_dim) — one warmup per (H, intermediate_dim) pair is sufficient
        # to cache the best config for the entire sweep.
        #
        # For num_tokens sweep: one pass with the model's base T is enough.
        # For num_experts sweep: one pass per E value in EXPERT_SWEEP_VALUES to also
        # warm up device caches for each expert count, since weight tensor sizes differ.
        print(f"Pre-warming Liger autotune (H={H}, intermediate_dim={intermediate_dim})...")

        if args.sweep_dim == "num_tokens":
            _warmup_liger(probe_T, E, H, intermediate_dim, K, dtype, sweep_dim="T")
        else:  # num_experts
            for e_val in EXPERT_SWEEP_VALUES:
                print(f"  warmup E={e_val}...")
                _warmup_liger(probe_T, e_val, H, intermediate_dim, K, dtype, sweep_dim="E")

        # Fix: synchronize on the backend actually in use. Unconditional
        # torch.npu.synchronize() fails on CUDA machines without torch_npu.
        if device == "cuda":
            torch.cuda.synchronize()
        elif device == "npu":
            torch.npu.synchronize()
        print("Autotune warmup complete.\n")

        if args.sweep_dim == "num_tokens":
            # Derive a memory-safe upper bound for T from the probe measurement.
            # Target 40% GPU memory utilisation to leave headroom for framework overhead.
            usable_bytes = get_total_gpu_memory() * (1024**3) * 0.4
            max_T = min(32768, max(256, int(usable_bytes / kernel_bpt)))
            # Round down to nearest power-of-two for clean x-axis values.
            max_T = 2 ** int(math.log2(max_T)) if max_T >= 256 else 256
            x_values = [2**i for i in range(7, int(math.log2(max_T)) + 1)]
            extra_configs = [
                {
                    "sweep_dim": "T",
                    "T": None,  # varied by framework
                    "E": E,
                    "H": H,
                    "intermediate_dim": intermediate_dim,
                    "K": K,
                    "dtype": dtype,
                }
            ]
            x_name, x_label = "T", "num_tokens"
        else:  # num_experts
            x_values = EXPERT_SWEEP_VALUES
            extra_configs = [
                {
                    "sweep_dim": "E",
                    "T": probe_T,  # fixed at model's base token count
                    "E": None,  # varied by framework
                    "H": H,
                    "intermediate_dim": intermediate_dim,
                    "K": K,
                    "dtype": dtype,
                }
            ]
            x_name, x_label = "E", "num_experts"

        common_configs = {
            "kernel_name": "fused_moe",
            "x_name": x_name,
            "x_label": x_label,
            "x_values": x_values,
            "kernel_providers": ["liger", "huggingface"],
            "extra_benchmark_configs": extra_configs,
            "overwrite": args.overwrite,
        }

        run_benchmarks(
            bench_test_fn=bench_speed_fused_moe,
            kernel_operation_modes=["full", "forward", "backward"],
            metric_name="speed",
            metric_unit="ms",
            **common_configs,
        )
        run_benchmarks(
            bench_test_fn=bench_memory_fused_moe,
            kernel_operation_modes=["full", "forward", "backward"],
            metric_name="memory",
            metric_unit="MB",
            **common_configs,
        )
0 commit comments