update_with_shao-chun_comments

amd-ruitang3 · amd-ruitang3 · commit c651c8a658d4 · 2026-05-26T21:24:15.000-05:00
diff --git a/aiter/ops/triton/fusions/fused_routing_from_topk.py b/aiter/ops/triton/fusions/fused_routing_from_topk.py
@@ -42,13 +42,15 @@ def fused_routing_from_topk(
 
     Args:
         topk_weights: ``[n_tokens, n_expts_act]`` per-token routing weights.
+            Must be contiguous.
         topk_ids: ``[n_tokens, n_expts_act]`` selected expert ids; values
-            in ``[0, n_expts_tot)``.
+            in ``[0, n_expts_tot)``. Must be contiguous int32.
         n_expts_tot: Total number of routed experts (= ``E``).
         expert_map: Optional global→local expert map. When provided,
             ``topk_ids`` are treated as global ids and remapped inside fused
             kernels. Entries mapped to ``< 0`` are masked to zero weight and
-            redirected to local expert ``0`` for routing safety.
+            redirected to local expert ``0`` for routing safety. Must be
+            contiguous int32 when provided.
 
     Returns:
         Tuple ``(hist, topk_indx, gate_indx, gate_scal)``:
@@ -94,16 +96,20 @@ def fused_routing_from_topk(
     device = topk_weights.device
     weights_dtype = topk_weights.dtype
 
-    # Triton kernel needs flat int32 inputs. .reshape on a contiguous tensor
-    # is a view; .contiguous() / .to(int32) on already-canonical tensors
-    # are no-ops.
-    topk_ids_flat = topk_ids.contiguous().reshape(-1).to(torch.int32)
-    topk_weights_flat = topk_weights.contiguous().reshape(-1)
+    assert (
+        topk_ids.is_contiguous() and topk_ids.dtype == torch.int32
+    ), "topk_ids must be contiguous int32"
+    assert topk_weights.is_contiguous(), "topk_weights must be contiguous"
+    topk_ids_flat = topk_ids.reshape(-1)
+    topk_weights_flat = topk_weights.reshape(-1)
     expert_map_numel = 0
     expert_map_flat = topk_ids_flat
     has_expert_map = expert_map is not None
     if has_expert_map:
-        expert_map_flat = expert_map.contiguous().reshape(-1).to(torch.int32)
+        assert (
+            expert_map.is_contiguous() and expert_map.dtype == torch.int32
+        ), "expert_map must be contiguous int32"
+        expert_map_flat = expert_map.reshape(-1)
         expert_map_numel = int(expert_map_flat.numel())
 
     topk_indx = torch.empty(n_gates_pad, dtype=torch.int32, device=device)
diff --git a/op_tests/triton_tests/fusions/test_fused_routing_from_topk.py b/op_tests/triton_tests/fusions/test_fused_routing_from_topk.py
@@ -189,98 +189,48 @@ def _compare_buckets(ref_buckets, test_buckets, atol=1e-6):
 # tests
 # ---------------------------------------------------------------------------
 @pytest.mark.parametrize(
-    "n_tokens, n_expts_act, n_expts_tot",
+    "n_tokens, n_expts_act, n_expts_tot, n_expts_global",
     [
-        # V4-Flash decode shapes (E=256, K=6).
-        (1, 6, 256),
-        (16, 6, 256),
-        (64, 6, 256),
-        (256, 6, 256),
+        # V4-Flash decode shapes (E=256, K=6). n_expts_global ignored when
+        # has_expert_map=False.
+        (1, 6, 256, 256),
+        (16, 6, 256, 256),
+        (64, 6, 256, 256),
+        (256, 6, 256, 256),
         # Generic decode shapes used by other MoE configs.
-        (1, 8, 384),
-        (4, 8, 384),
-        (64, 8, 384),
-        (256, 8, 384),
+        (1, 8, 384, 384),
+        (4, 8, 384, 384),
+        (64, 8, 384, 384),
+        (256, 8, 384, 384),
         # Edge: small E.
-        (32, 4, 16),
+        (32, 4, 16, 16),
         # Boundary: NK at the kernel's MAX_NK = 4096.
-        (512, 8, 384),
-    ],
-)
-@pytest.mark.parametrize("dtype", [torch.float32])
-def test_fused_routing_from_topk(n_tokens, n_expts_act, n_expts_tot, dtype):
-    if not torch.cuda.is_available():
-        pytest.skip("CUDA not available")
-    torch.manual_seed(0)
-    topk_ids, topk_weights = _make_inputs(
-        n_tokens, n_expts_act, n_expts_tot, dtype, DEVICE, seed=0
-    )
-
-    ref_hist, ref_topk_indx, ref_gate_indx, ref_gate_scal = routing_from_topk_reference(
-        topk_weights, topk_ids, n_expts_tot
-    )
-    _check_routing_invariants(
-        ref_hist,
-        ref_topk_indx,
-        ref_gate_indx,
-        ref_gate_scal,
-        topk_ids,
-        n_expts_tot,
-        bucket_unsorted_layout=False,  # ref uses per-row-sorted layout
-    )
-    ground_buckets = _ground_truth_buckets(topk_ids, topk_weights)
-    ref_buckets = _per_expert_triples(
-        ref_hist, ref_topk_indx, ref_gate_scal, n_expts_act
-    )
-    _compare_buckets(ground_buckets, ref_buckets)
-
-    test_hist, test_topk_indx, test_gate_indx, test_gate_scal = fused_routing_from_topk(
-        topk_weights, topk_ids, n_expts_tot
-    )
-    _check_routing_invariants(
-        test_hist,
-        test_topk_indx,
-        test_gate_indx,
-        test_gate_scal,
-        topk_ids,
-        n_expts_tot,
-        bucket_unsorted_layout=True,  # fused uses unsorted topk_ids layout
-    )
-
-    # hist must match the reference exactly.
-    assert torch.equal(
-        ref_hist, test_hist
-    ), f"hist mismatch:\n  ref={ref_hist}\n  fused={test_hist}"
-
-    # Per-expert (token, weight) multisets match the reference.
-    test_buckets = _per_expert_triples(
-        test_hist, test_topk_indx, test_gate_scal, n_expts_act
-    )
-    _compare_buckets(ref_buckets, test_buckets)
-
-
-@pytest.mark.parametrize(
-    "n_tokens, n_expts_act, n_expts_tot,n_expts_global",
-    [
+        (512, 8, 384, 384),
+        # Expert-parallel: n_expts_global > n_expts_tot; global ids used only with map.
         (16, 6, 64, 256),
         (64, 6, 128, 256),
     ],
 )
+@pytest.mark.parametrize("has_expert_map", [False, True])
 @pytest.mark.parametrize("dtype", [torch.float32])
-def test_fused_routing_from_topk_with_expert_map(
-    n_tokens, n_expts_act, n_expts_tot, n_expts_global, dtype
+def test_fused_routing_from_topk(
+    n_tokens, n_expts_act, n_expts_tot, n_expts_global, has_expert_map, dtype
 ):
     if not torch.cuda.is_available():
         pytest.skip("CUDA not available")
     torch.manual_seed(0)
+
+    id_range = n_expts_global if has_expert_map else n_expts_tot
     topk_ids, topk_weights = _make_inputs(
-        n_tokens, n_expts_act, n_expts_global, dtype, DEVICE, seed=0
+        n_tokens, n_expts_act, id_range, dtype, DEVICE, seed=0
     )
 
-    expert_map = torch.full((n_expts_global,), -1, dtype=torch.int32, device=DEVICE)
-    expert_map[: n_expts_tot // 2] = torch.arange(
-        n_expts_tot // 2, dtype=torch.int32, device=DEVICE
-    )
+    expert_map = None
+    if has_expert_map:
+        expert_map = torch.full((n_expts_global,), -1, dtype=torch.int32, device=DEVICE)
+        expert_map[: n_expts_tot // 2] = torch.arange(
+            n_expts_tot // 2, dtype=torch.int32, device=DEVICE
+        )
 
     ref_hist, ref_topk_indx, ref_gate_indx, ref_gate_scal = routing_from_topk_reference(
         topk_weights, topk_ids, n_expts_tot, expert_map=expert_map
@@ -292,7 +242,7 @@ def test_fused_routing_from_topk_with_expert_map(
         ref_gate_scal,
         topk_ids,
         n_expts_tot,
-        bucket_unsorted_layout=False,
+        bucket_unsorted_layout=False,  # ref uses per-row-sorted layout
     )
 
     test_hist, test_topk_indx, test_gate_indx, test_gate_scal = fused_routing_from_topk(
@@ -305,18 +255,33 @@ def test_fused_routing_from_topk_with_expert_map(
         test_gate_scal,
         topk_ids,
         n_expts_tot,
-        bucket_unsorted_layout=False,
+        bucket_unsorted_layout=not has_expert_map,
     )
 
+    # hist must match the reference exactly.
     assert torch.equal(
         ref_hist, test_hist
     ), f"hist mismatch:\n  ref={ref_hist}\n  fused={test_hist}"
 
-    # Intra-expert ordering can differ between fused and reference,
-    # especially in expert-0 bucket where invalid experts are redirected.
-    # Compare zeroed-weight cardinality instead of elementwise positions.
-    ref_zero_count = int((ref_gate_scal == 0).sum().item())
-    test_zero_count = int((test_gate_scal == 0).sum().item())
-    assert (
-        ref_zero_count == test_zero_count
-    ), f"zero-masked count mismatch: ref={ref_zero_count}, fused={test_zero_count}"
+    if has_expert_map:
+        # Intra-expert ordering can differ between fused and reference,
+        # especially in expert-0 bucket where invalid experts are redirected.
+        # Compare zeroed-weight cardinality instead of elementwise positions.
+        ref_zero_count = int((ref_gate_scal == 0).sum().item())
+        test_zero_count = int((test_gate_scal == 0).sum().item())
+        assert ref_zero_count == test_zero_count, (
+            f"zero-masked count mismatch: "
+            f"ref={ref_zero_count}, fused={test_zero_count}"
+        )
+    else:
+        ground_buckets = _ground_truth_buckets(topk_ids, topk_weights)
+        ref_buckets = _per_expert_triples(
+            ref_hist, ref_topk_indx, ref_gate_scal, n_expts_act
+        )
+        _compare_buckets(ground_buckets, ref_buckets)
+
+        # Per-expert (token, weight) multisets match the reference.
+        test_buckets = _per_expert_triples(
+            test_hist, test_topk_indx, test_gate_scal, n_expts_act
+        )
+        _compare_buckets(ref_buckets, test_buckets)