|
| 1 | +import pytest |
| 2 | +import torch |
| 3 | + |
| 4 | +from lightllm.models.deepseek3_2.triton_kernel.hadamard_transform import hadamard_transform |
| 5 | + |
| 6 | + |
# Parallel degree; reported as "tp" in the perf report header, and the head
# count is split evenly across it.
TP = 8
# Total head count before the per-rank split.
# NOTE(review): presumably the DeepSeek-V3.2 indexer head count -- confirm
# against the model config.
INDEX_N_HEADS = 64
# Size of the last dimension of the q and k test tensors.
INDEX_HEAD_DIM = 128
# Heads handled by one rank; the middle dimension of the q test tensor.
TP_INDEX_N_HEADS = INDEX_N_HEADS // TP
# Value passed as ``scale=`` to both hadamard_transform implementations.
SCALE = INDEX_HEAD_DIM ** -0.5
| 12 | + |
| 13 | + |
def _get_sgl_kernel_hadamard_transform():
    """Return ``sgl_kernel.hadamard_transform``, or skip the calling test.

    The calling test is skipped (via ``pytest.skip``) when CUDA is not
    available or when ``hadamard_transform`` cannot be imported from
    ``sgl_kernel``.

    Returns:
        The ``hadamard_transform`` callable imported from ``sgl_kernel``.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        pytest.skip("CUDA is required for hadamard_transform comparison")

    # Imported lazily so that a missing sgl_kernel only skips the tests that
    # need it instead of breaking collection of the whole module.
    try:
        from sgl_kernel import hadamard_transform as reference_impl
    except ImportError:
        pytest.skip("sgl_kernel.hadamard_transform is not available")
    else:
        return reference_impl
| 22 | + |
| 23 | + |
| 24 | +def _bench(fn, x, warmup=30, iters=300): |
| 25 | + for _ in range(warmup): |
| 26 | + fn(x, scale=SCALE) |
| 27 | + torch.cuda.synchronize() |
| 28 | + |
| 29 | + start = torch.cuda.Event(enable_timing=True) |
| 30 | + end = torch.cuda.Event(enable_timing=True) |
| 31 | + start.record() |
| 32 | + for _ in range(iters): |
| 33 | + y = fn(x, scale=SCALE) |
| 34 | + end.record() |
| 35 | + torch.cuda.synchronize() |
| 36 | + return start.elapsed_time(end) / iters, y |
| 37 | + |
| 38 | + |
@pytest.mark.parametrize("tokens", [1, 16, 128, 512, 1024, 2048, 4096, 8192, 16384])
def test_hadamard_transform_matches_sgl_kernel_deepseek_v32_shapes(tokens):
    """Check ``hadamard_transform`` is bitwise-equal to sgl_kernel's version.

    Both implementations are run with ``scale=SCALE`` on bfloat16 CUDA
    tensors of shape ``[tokens, TP_INDEX_N_HEADS, INDEX_HEAD_DIM]`` (q) and
    ``[tokens, INDEX_HEAD_DIM]`` (k), and the outputs must compare equal with
    ``torch.equal``.

    Args:
        tokens: Size of the leading dimension of the q and k inputs.
    """
    sgl_hadamard_transform = _get_sgl_kernel_hadamard_transform()

    # Seeded local generator: a bitwise mismatch can be reproduced with the
    # exact same inputs, without touching the global RNG state.
    gen = torch.Generator(device="cuda")
    gen.manual_seed(0)

    q = torch.randn(
        tokens,
        TP_INDEX_N_HEADS,
        INDEX_HEAD_DIM,
        dtype=torch.bfloat16,
        device="cuda",
        generator=gen,
    )
    k = torch.randn(tokens, INDEX_HEAD_DIM, dtype=torch.bfloat16, device="cuda", generator=gen)

    q_expected = sgl_hadamard_transform(q, scale=SCALE)
    q_actual = hadamard_transform(q, scale=SCALE)
    k_expected = sgl_hadamard_transform(k, scale=SCALE)
    k_actual = hadamard_transform(k, scale=SCALE)
    torch.cuda.synchronize()

    # The message is only evaluated on failure, so the extra reduction costs
    # nothing on the passing path.
    assert torch.equal(q_actual, q_expected), (
        f"q mismatch at tokens={tokens}: max abs diff "
        f"{(q_actual.float() - q_expected.float()).abs().max().item()}"
    )
    assert torch.equal(k_actual, k_expected), (
        f"k mismatch at tokens={tokens}: max abs diff "
        f"{(k_actual.float() - k_expected.float()).abs().max().item()}"
    )
| 54 | + |
| 55 | + |
def test_hadamard_transform_perf_report_deepseek_v32_shapes():
    """Print a timing table for both implementations and check exact equality.

    For each token count, q and k inputs are built with the same shapes as
    the correctness test, both implementations are run once to compute the
    maximum absolute difference, and each is then timed with ``_bench``. One
    table row is printed per token count, and the test fails as soon as any
    difference is non-zero.
    """
    sgl_hadamard_transform = _get_sgl_kernel_hadamard_transform()

    # The header is derived from the module constants so that it cannot drift
    # from the shapes actually benchmarked below.
    print(
        f"\nDeepSeek-V3.2 per-rank shapes with tp={TP}:"
        f"\n q: [tokens, {TP_INDEX_N_HEADS}, {INDEX_HEAD_DIM}]"
        f"\n k: [tokens, {INDEX_HEAD_DIM}]"
        "\n\ntokens | q_diff | k_diff | sgl_q ms | tri_q ms | sgl_k ms | tri_k ms | tri(q+k) ms | slowdown q+k"
    )

    # Seeded local generator: keeps the inputs reproducible across runs
    # without touching the global RNG state.
    gen = torch.Generator(device="cuda")
    gen.manual_seed(0)

    for tokens in [1, 16, 128, 512, 1024, 2048, 4096, 8192, 16384]:
        q = torch.randn(
            tokens,
            TP_INDEX_N_HEADS,
            INDEX_HEAD_DIM,
            dtype=torch.bfloat16,
            device="cuda",
            generator=gen,
        )
        k = torch.randn(tokens, INDEX_HEAD_DIM, dtype=torch.bfloat16, device="cuda", generator=gen)

        q_expected = sgl_hadamard_transform(q, scale=SCALE)
        q_actual = hadamard_transform(q, scale=SCALE)
        k_expected = sgl_hadamard_transform(k, scale=SCALE)
        k_actual = hadamard_transform(k, scale=SCALE)
        torch.cuda.synchronize()

        # Compare in float32 so the subtraction itself does not round in bf16.
        q_diff = (q_expected.float() - q_actual.float()).abs().max().item()
        k_diff = (k_expected.float() - k_actual.float()).abs().max().item()

        sgl_q_ms, _ = _bench(sgl_hadamard_transform, q)
        tri_q_ms, _ = _bench(hadamard_transform, q)
        sgl_k_ms, _ = _bench(sgl_hadamard_transform, k)
        tri_k_ms, _ = _bench(hadamard_transform, k)
        sgl_sum_ms = sgl_q_ms + sgl_k_ms
        tri_sum_ms = tri_q_ms + tri_k_ms

        print(
            f"{tokens:6d} | {q_diff:6.1g} | {k_diff:6.1g} | "
            f"{sgl_q_ms:8.4f} | {tri_q_ms:8.4f} | {sgl_k_ms:8.4f} | {tri_k_ms:8.4f} | "
            f"{tri_sum_ms:11.4f} | {tri_sum_ms / sgl_sum_ms:10.2f}x"
        )

        # Checked per token count, after the row is printed, so the report
        # still shows the offending row when the test fails.
        assert q_diff == 0, f"q mismatch at tokens={tokens}: max abs diff {q_diff}"
        assert k_diff == 0, f"k mismatch at tokens={tokens}: max abs diff {k_diff}"
0 commit comments