|
| 1 | +# Copyright 2025 Tencent Inc. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import torch |
| 16 | +import triton |
| 17 | +import triton.language as tl |
| 18 | + |
| 19 | +# modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py |
| 20 | +fp8_gemm_configs = [ |
| 21 | + triton.Config( |
| 22 | + {"BLOCK_SIZE_M": block_m, "BLOCK_SIZE_N": block_n, "BLOCK_SIZE_K": 128}, |
| 23 | + num_stages=num_stages, |
| 24 | + num_warps=8, |
| 25 | + ) |
| 26 | + for block_m in [16, 32, 64] |
| 27 | + for block_n in [32, 64, 128] |
| 28 | + for num_stages in [3, 4, 5, 6] |
| 29 | +] |
| 30 | + |
| 31 | + |
| 32 | +@triton.autotune(configs=fp8_gemm_configs, key=["N", "K"]) |
| 33 | +@triton.jit |
| 34 | +def _fp8_gemm_triton_block_kernel( |
| 35 | + a_ptr, |
| 36 | + b_ptr, |
| 37 | + c_ptr, |
| 38 | + a_s_ptr, |
| 39 | + b_s_ptr, |
| 40 | + M, |
| 41 | + N: tl.constexpr, |
| 42 | + K: tl.constexpr, |
| 43 | + BLOCK_SIZE_M: tl.constexpr, |
| 44 | + BLOCK_SIZE_N: tl.constexpr, |
| 45 | + BLOCK_SIZE_K: tl.constexpr, |
| 46 | +): |
| 47 | + """ |
| 48 | + Performs a matrix multiplication operation on FP8 matrices with scaling factors. |
| 49 | + """ |
| 50 | + pid_m = tl.program_id(axis=0) |
| 51 | + pid_n = tl.program_id(axis=1) |
| 52 | + k = tl.cdiv(K, BLOCK_SIZE_K) |
| 53 | + offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M |
| 54 | + offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N |
| 55 | + offs_k = tl.arange(0, BLOCK_SIZE_K) |
| 56 | + a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :] |
| 57 | + b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None] |
| 58 | + a_s_ptrs = a_s_ptr + offs_m * k |
| 59 | + b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k |
| 60 | + |
| 61 | + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) |
| 62 | + for i in range(k): |
| 63 | + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0) |
| 64 | + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0) |
| 65 | + a_s = tl.load(a_s_ptrs) |
| 66 | + b_s = tl.load(b_s_ptrs) |
| 67 | + |
| 68 | + accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :] |
| 69 | + a_ptrs += BLOCK_SIZE_K |
| 70 | + b_ptrs += BLOCK_SIZE_K |
| 71 | + a_s_ptrs += 1 |
| 72 | + b_s_ptrs += 1 |
| 73 | + c = accumulator.to(c_ptr.dtype.element_ty) |
| 74 | + offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) |
| 75 | + offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) |
| 76 | + c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :] |
| 77 | + mask = (offs_m[:, None] < M) & (offs_n[None, :] < N) |
| 78 | + tl.store(c_ptrs, c, mask=mask) |
| 79 | + |
| 80 | + |
| 81 | +# triton fp8 gemm for fp8 per-block weight & fp8 per-group activation |
| 82 | +# modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py |
| 83 | +def fp8_gemm_triton_block( |
| 84 | + a: torch.Tensor, |
| 85 | + a_s: torch.Tensor, |
| 86 | + b: torch.Tensor, |
| 87 | + b_s: torch.Tensor, |
| 88 | + out_dtype=torch.bfloat16, |
| 89 | + bias=None, |
| 90 | +) -> torch.Tensor: |
| 91 | + """ |
| 92 | + Perform a matrix multiplication using FP8 precision. |
| 93 | + """ |
| 94 | + assert a.is_contiguous() and b.is_contiguous() |
| 95 | + assert a_s.is_contiguous() and b_s.is_contiguous() |
| 96 | + K = a.size(-1) |
| 97 | + M = a.numel() // K |
| 98 | + N = b.size(0) |
| 99 | + c = a.new_empty(*a.size()[:-1], N, dtype=out_dtype) |
| 100 | + |
| 101 | + def grid(meta): |
| 102 | + return ( |
| 103 | + triton.cdiv(M, meta["BLOCK_SIZE_M"]), |
| 104 | + triton.cdiv(N, meta["BLOCK_SIZE_N"]), |
| 105 | + ) |
| 106 | + |
| 107 | + _fp8_gemm_triton_block_kernel[grid](a, b, c, a_s, b_s, M, N, K) |
| 108 | + |
| 109 | + if bias is not None: |
| 110 | + c += bias |
| 111 | + |
| 112 | + return c |
0 commit comments