Skip to content

Commit d5209fc

Browse files
committed
Enable split-K decode SDPA by default with --no-splitk opt-out
Add a `use_splitk_decode` config flag to control whether FullAttention uses the split-K (flash-decoding) SDPA kernel or the tiled SDPA for decode (T=1). The split-K kernel partitions the KV sequence across CTAs, yielding ~20% higher decode throughput on H100:

| Variant      | Decode tok/s (avg across prompts) |
|--------------|-----------------------------------|
| Tiled SDPA   | 88.5                              |
| Split-K SDPA | 107.5 (+21%)                      |

The flag defaults to True (split-K on). Pass `--no-splitk` at export time to disable. Quality is verified identical at temperature=0. This PR was authored with the assistance of Claude.
1 parent ff207ea commit d5209fc

2 files changed

Lines changed: 23 additions & 4 deletions

File tree

examples/models/qwen3_5_moe/export.py

Lines changed: 20 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@ def _prepare_and_quantize_mlx(model, config, args):
6868
pack_all_switch_linears(model)
6969

7070

71-
def load_and_quantize(args):
71+
def load_and_quantize(args): # noqa: C901
7272
"""Load model from checkpoint, optionally quantize.
7373
7474
For CUDA: quantizes experts with packed INT4, then transformer layers on CUDA.
@@ -77,6 +77,7 @@ def load_and_quantize(args):
7777
Returns (model, config) ready for export.
7878
"""
7979
backend = getattr(args, "backend", "cuda")
80+
use_splitk = not getattr(args, "no_splitk", False)
8081

8182
if not args.prequantized:
8283
if getattr(args, "tiny_test", False):
@@ -111,6 +112,7 @@ def load_and_quantize(args):
111112
rms_norm_eps=1e-6,
112113
rope_theta=10_000.0,
113114
max_seq_len=64,
115+
use_splitk_decode=use_splitk,
114116
)
115117
print("Building tiny model with random weights...")
116118
torch.manual_seed(42)
@@ -133,6 +135,10 @@ def load_and_quantize(args):
133135
model, config = Qwen35MoE.from_hf_checkpoint(
134136
args.model_dir, max_seq_len=args.max_seq_len
135137
)
138+
config.use_splitk_decode = use_splitk
139+
for layer in model.layers:
140+
if hasattr(layer.attn, "use_splitk_decode"):
141+
layer.attn.use_splitk_decode = use_splitk
136142
model.eval()
137143
print(
138144
f"Model: {config.num_hidden_layers} layers, {config.hidden_size}d, "
@@ -148,7 +154,11 @@ def load_and_quantize(args):
148154

149155
elif backend == "cuda":
150156
if args.prequantized:
151-
return load_prequantized_model(args.prequantized, args.max_seq_len)
157+
return load_prequantized_model(
158+
args.prequantized,
159+
args.max_seq_len,
160+
use_splitk_decode=use_splitk,
161+
)
152162

153163
# CUDA: quantize experts with packed INT4 for Triton kernel
154164
if args.qlinear or args.qembedding:
@@ -162,12 +172,13 @@ def load_and_quantize(args):
162172
return model, config
163173

164174

165-
def load_prequantized_model(prequantized_dir, max_seq_len=4096):
175+
def load_prequantized_model(prequantized_dir, max_seq_len=4096, use_splitk_decode=True):
166176
"""Load a prequantized safetensors bundle into a model.
167177
168178
Args:
169179
prequantized_dir: Directory containing model.safetensors and config.json.
170180
max_seq_len: Maximum sequence length for KV cache.
181+
use_splitk_decode: Use split-K SDPA for decode instead of tiled SDPA.
171182
172183
Returns:
173184
(model, config) ready for export.
@@ -181,6 +192,7 @@ def load_prequantized_model(prequantized_dir, max_seq_len=4096):
181192

182193
config = Qwen35MoEConfig.from_hf_config(config_path)
183194
config.max_seq_len = max_seq_len
195+
config.use_splitk_decode = use_splitk_decode
184196

185197
print(f"Loading prequantized weights from {safetensors_path}...")
186198
state_dict = load_quantized_state_dict(safetensors_path)
@@ -789,6 +801,11 @@ def main():
789801
"No checkpoint download needed. Tests all architectural features "
790802
"(GQA, GDN head ratio, mixed attention, MoE routing) at small scale.",
791803
)
804+
parser.add_argument(
805+
"--no-splitk",
806+
action="store_true",
807+
help="Disable split-K (flash-decoding) SDPA for decode; use tiled SDPA instead.",
808+
)
792809
args = parser.parse_args()
793810

794811
if args.model_id:

examples/models/qwen3_5_moe/model.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@ class Qwen35MoEConfig:
5050
rms_norm_eps: float = 1e-6
5151
rope_theta: float = 10_000_000.0
5252
max_seq_len: int = 4096
53+
use_splitk_decode: bool = True
5354
layer_types: list = field(default_factory=list)
5455

5556
def __post_init__(self):
@@ -231,6 +232,7 @@ def __init__(self, config):
231232

232233
self.kv_cache = KVCache(self.n_kv_heads, self.head_dim, config.max_seq_len)
233234
self.turboquant = False
235+
self.use_splitk_decode = config.use_splitk_decode
234236

235237
self.register_buffer(
236238
"cache_positions",
@@ -289,7 +291,7 @@ def forward(self, x, input_pos):
289291
# The export produces two methods — decode (T=1, static) and
290292
# prefill (T>=2, dynamic). Each traces only one branch, so no
291293
# torch.cond is needed and we avoid GPU→CPU sync overhead.
292-
if T == 1:
294+
if T == 1 and self.use_splitk_decode:
293295
from executorch.backends.cuda.triton.kernels.sdpa import (
294296
sdpa_decode_splitk,
295297
)

0 commit comments

Comments (0)