Commit 694506b

Author: zhangyue
test(rotary_embedding): add pre_gathered=True coverage
Fold the deleted `test_apply_rotary_pos_emb` / `_3d` cases into a single `test_rotary_embedding_pre_gathered` that exercises the `pre_gathered` fast path directly on the `rotary_embedding` overload (no shim). Parametrize over 2D / 3D query-key layouts, impls 0 and 1 (impl 2 asserts `!pre_gathered_`), neox / GPT-J styles, fp16 / bf16. The new `_build_pre_gathered_cache` helper constructs the `[2*T, head_size]` wire format that `src/ascend/rotary_embedding/kernel.h` expects — cos rows 0..T-1, sin rows T..2T-1, both neox-expanded per token. Coverage: 12 new cases pass (4 skip for `impl=0 + not-neox`, same as the `test_rotary_embedding_full` skip — V2 only supports `rotaryMode="half"`). Full rotary suite: 88 passed, 8 skipped (was 80 passed, 4 skipped before this test was added).
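
For reference, a minimal sketch of the wire format described above, using illustrative toy values (T = 2 tokens, head_size = 4, neox style); it mirrors the `_build_pre_gathered_cache` helper in the diff below:

import torch

# Toy cache: each row holds [cos_0, cos_1, sin_0, sin_1] for one position.
cos_sin_cache = torch.tensor([[0.9, 0.8, 0.1, 0.2],   # position 0
                              [0.7, 0.6, 0.3, 0.4]])  # position 1
positions = torch.tensor([1, 0])  # per-token positions, T = 2

half = 2  # head_size // 2
cos_half = cos_sin_cache[:, :half].index_select(0, positions)  # [T, half]
sin_half = cos_sin_cache[:, half:].index_select(0, positions)

# neox expansion: duplicate the half-dim block out to full head_size.
wire = torch.cat(
    [torch.cat([cos_half, cos_half], dim=-1),   # rows 0..T-1: cos
     torch.cat([sin_half, sin_half], dim=-1)],  # rows T..2T-1: sin
    dim=0,
)
assert wire.shape == (2 * 2, 4)  # [2*T, head_size]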
Parent: fdeb779

1 file changed: tests/test_rotary_embedding.py (105 additions, 0 deletions)
@@ -619,3 +619,108 @@ def test_rotary_embedding_inplace(implementation_index, dtype, rtol, atol, device):
     _assert_close(query, ref_q, rtol, atol)
     _assert_close(key, ref_k, rtol, atol)
 
+
+def _build_pre_gathered_cache(cos_sin_cache, positions, head_size, is_neox_style):
+    """Build the `[2 * T, head_size]` pre-gathered cache the kernel expects.
+
+    Layout (see `src/ascend/rotary_embedding/kernel.h` pre-gathered branch):
+    - rows `0..T-1`: neox-expanded cos for each token (row `t` holds the
+      cos values for `positions[t]`, broadcast to full `head_size`).
+    - rows `T..2T-1`: neox-expanded sin, same indexing.
+    """
+    half = head_size // 2
+    cos_half = cos_sin_cache[:, :half].index_select(0, positions)
+    sin_half = cos_sin_cache[:, half:].index_select(0, positions)
+
+    if is_neox_style:
+        cos_full = torch.cat([cos_half, cos_half], dim=-1)
+        sin_full = torch.cat([sin_half, sin_half], dim=-1)
+    else:
+        # GPT-J interleave: pair-wise expansion `(x[0],x[0],x[1],x[1],…)`.
+        cos_full = cos_half.repeat_interleave(2, dim=-1)
+        sin_full = sin_half.repeat_interleave(2, dim=-1)
+
+    return torch.cat([cos_full, sin_full], dim=0)
+
+
+# Hardcoded `(0, 1)` — impl 2 (`aclnnRopeWithSinCosCache`) asserts
+# `!pre_gathered_` at construction. Cannot use conftest auto-injection.
+@pytest.mark.parametrize("implementation_index", (0, 1))
+@pytest.mark.parametrize("layout", ("2d", "3d"))
+@pytest.mark.parametrize("is_neox_style", (True, False))
+@pytest.mark.parametrize(
+    ("dtype", "rtol", "atol"),
+    (
+        (torch.float16, 1e-2, 5e-3),
+        (torch.bfloat16, 1e-2, 5e-3),
+    ),
+)
+@pytest.mark.parametrize("device", ("npu",))
+def test_rotary_embedding_pre_gathered(
+    implementation_index, layout, is_neox_style, dtype, rtol, atol, device
+):
+    """`pre_gathered=True` fast path: caller hands in `[2*T, head_size]` with
+    cos/sin already gathered and neox-expanded per token. Exercises both 2D
+    `[T, N*D]` and 3D `[T, N, D]` query/key layouts."""
+    if not (hasattr(torch, "npu") and torch.npu.is_available()):
+        pytest.skip("NPU not available")
+
+    if not is_neox_style and implementation_index == 0:
+        pytest.skip(
+            'Ascend `aclnnApplyRotaryPosEmbV2` only supports `rotaryMode="half"`'
+        )
+
+    num_tokens = 8
+    num_heads = 16
+    num_kv_heads = 4
+    head_size = 128
+    rotary_dim = head_size
+    max_seq_len = 64
+
+    positions = randint_strided(
+        0, max_seq_len, (num_tokens,), None, dtype=torch.int64, device=device
+    )
+    cos_sin_cache = randn_strided(
+        (max_seq_len, rotary_dim), None, dtype=dtype, device=device
+    )
+
+    if layout == "3d":
+        q_shape = (num_tokens, num_heads, head_size)
+        k_shape = (num_tokens, num_kv_heads, head_size)
+    else:
+        q_shape = (num_tokens, num_heads * head_size)
+        k_shape = (num_tokens, num_kv_heads * head_size)
+
+    query = randn_strided(q_shape, None, dtype=dtype, device=device)
+    key = randn_strided(k_shape, None, dtype=dtype, device=device)
+    query_out = torch.empty_like(query)
+    key_out = torch.empty_like(key)
+
+    pre_gathered_cache = _build_pre_gathered_cache(
+        cos_sin_cache, positions, head_size, is_neox_style
+    )
+    # Kernel reads `positions` as `0..T-1` in the pre-gathered path (the
+    # gather has already happened); the actual values are not indexed.
+    arange_positions = torch.arange(num_tokens, dtype=torch.int64, device=device)
+
+    infini.ops.rotary_embedding(
+        arange_positions,
+        query,
+        key,
+        head_size,
+        pre_gathered_cache,
+        is_neox_style,
+        rotary_dim,
+        query_out,
+        key_out,
+        True,  # pre_gathered
+        implementation_index=implementation_index,
+        stream=get_stream(query.device),
+    )
+
+    ref_q, ref_k = _ref_rotary_embedding(
+        positions, query, key, cos_sin_cache, head_size, rotary_dim, is_neox_style
+    )
+
+    _assert_close(query_out, ref_q, rtol, atol)
+    _assert_close(key_out, ref_k, rtol, atol)
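
For context, not part of this diff: `_ref_rotary_embedding` lives elsewhere in the test file. A plausible sketch of the neox-style rotation such a reference would compute, where the helper name and the `[T, N, head_size]` layout are assumptions:

import torch

def _ref_rotary_neox_sketch(positions, x, cos_sin_cache, head_size):
    """Hypothetical reference: standard neox rotation on a [T, N, head_size]
    tensor, assuming rotary_dim == head_size (as in the test above)."""
    half = head_size // 2
    cos = cos_sin_cache[:, :half].index_select(0, positions).unsqueeze(1)  # [T, 1, half]
    sin = cos_sin_cache[:, half:].index_select(0, positions).unsqueeze(1)
    x1, x2 = x[..., :half], x[..., half:]  # split into the two rotated halves
    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)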
