#
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
#

"""
Custom MLX operator definitions.

This module defines custom operators that are supported by the MLX backend.
These ops are used during model export to represent operations that MLX
can execute efficiently but may not have direct PyTorch equivalents.

The ops are registered using torch.library and include:
- rms_norm: RMSNorm normalization
- apply_rope: Rotary Position Embedding application
"""

from typing import Optional, Tuple

import torch
from torch import Tensor
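
# Note: in a torch.export'd program, registered custom ops appear as opaque
# `torch.ops.mlx.*` call nodes that the backend can recognize and lower.
# A minimal sketch (MyModel and the input shape are illustrative assumptions,
# not part of this module):
#
#   ep = torch.export.export(MyModel(), (torch.randn(1, 8, 64),))
#   # the exported graph then contains e.g. torch.ops.mlx.rms_norm.default(...)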


# =============================================================================
# rms_norm: RMSNorm normalization
# =============================================================================


@torch.library.custom_op("mlx::rms_norm", mutates_args=())
def rms_norm(x: Tensor, weight: Tensor, eps: float = 1e-5) -> Tensor:
    """
    RMSNorm normalization.

    Args:
        x: Input tensor of shape (..., hidden_dim)
        weight: Weight tensor of shape (hidden_dim,)
        eps: Small constant for numerical stability

    Returns:
        Normalized tensor of the same shape as x
    """
    # Compute in float32 for numerical stability, then cast back to the
    # input dtype before applying the learned scale.
    x_f = x.to(torch.float32)
    var = x_f.pow(2).mean(dim=-1, keepdim=True)
    y = x_f * torch.rsqrt(var + eps)
    y = y.to(x.dtype)
    return y * weight.to(x.dtype)


@torch.library.register_fake("mlx::rms_norm")
def rms_norm_fake(x: Tensor, weight: Tensor, eps: float = 1e-5) -> Tensor:
    """Fake implementation for tracing."""
    return x.new_empty(x.shape)
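

# Parity sketch (an assumption-laden example, not part of the module: assumes
# PyTorch >= 2.4 for torch.nn.functional.rms_norm, and illustrative shapes):
#
#   x = torch.randn(2, 16, 64)
#   w = torch.randn(64)
#   y = torch.ops.mlx.rms_norm(x, w, 1e-5)
#   ref = torch.nn.functional.rms_norm(x, (64,), w, 1e-5)
#   torch.testing.assert_close(y, ref)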


# =============================================================================
# apply_rope: Rotary Position Embedding
# =============================================================================


@torch.library.custom_op("mlx::apply_rope", mutates_args=())
def apply_rope(
    q_in: Tensor,  # (B, Hq, T, D)
    k_in: Tensor,  # (B, Hk, T, D)
    head_dim: int,
    pos: int,  # int, not tensor
    traditional: bool = False,
    base: float = 500000.0,
    scale: float = 1.0,
    freqs: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """
    Apply Rotary Position Embedding to query and key tensors.

    Args:
        q_in: Query tensor of shape (B, Hq, T, D)
        k_in: Key tensor of shape (B, Hk, T, D)
        head_dim: Dimension of each attention head
        pos: Starting position index (int, not tensor)
        traditional: Whether to use the traditional (interleaved) RoPE
            formulation. Carried in the op signature; this eager reference
            implements only the non-traditional half-split rotation.
        base: Base for frequency computation
        scale: Scale factor applied to the rotation angles
        freqs: Optional precomputed per-position frequencies

    Returns:
        Tuple of (rotated_q, rotated_k)
    """
    Dh = int(head_dim)
    assert q_in.size(-1) == Dh and k_in.size(-1) == Dh, "head_dim mismatch"

    # Unpack as (B, H, T, D); q and k may have different head counts (GQA)
    # but must agree on batch and sequence length.
    B, Hq, T, _ = q_in.shape
    B2, Hk, T2, _ = k_in.shape
    assert B == B2 and T == T2, "RoPE expects q and k to have same B,T"
    half = Dh // 2

    if freqs is None:
        # Inverse frequencies, shaped [1, 1, 1, half] to broadcast over B,H,T:
        # inv_freq[i] = base^(-2i/D), the standard RoPE schedule.
        i = torch.arange(half, device=q_in.device, dtype=torch.float32)
        inv_freq = (base ** (-2.0 * i / Dh)).view(1, 1, 1, half)

        # Absolute positions pos .. pos+T-1, shaped [1, 1, T, 1].
        pos_range = torch.arange(
            pos, pos + T, device=q_in.device, dtype=torch.float32
        ).view(1, 1, T, 1)

        # Final rotation angles: [1, 1, T, half].
        angles = (pos_range * inv_freq) * float(scale)
    else:
        # Assume freqs is already per-position; just reshape to [1,1,T,half].
        angles = freqs.to(torch.float32).view(1, 1, T, half)

    cos = angles.cos().to(q_in.dtype)  # [1,1,T,half]
    sin = angles.sin().to(q_in.dtype)  # [1,1,T,half]

    def rot(x: Tensor) -> Tensor:
        # Half-split rotation: treat (x1[j], x2[j]) as a complex number and
        # multiply by (cos + i*sin), i.e. a 2D rotation per frequency j.
        # x: [B, H, T, D]
        x1, x2 = x[..., :half], x[..., half : 2 * half]
        xr = x1 * cos - x2 * sin
        xi = x1 * sin + x2 * cos
        if 2 * half != Dh:
            # Odd head_dim: pass the last (unrotated) channel through.
            return torch.cat([xr, xi, x[..., 2 * half :]], dim=-1)
        return torch.cat([xr, xi], dim=-1)

    q_out = rot(q_in)
    k_out = rot(k_in)
    return q_out, k_out


@torch.library.register_fake("mlx::apply_rope")
def apply_rope_fake(
    q_in: Tensor,
    k_in: Tensor,
    head_dim: int,
    pos: int,
    traditional: bool = False,
    base: float = 500000.0,
    scale: float = 1.0,
    freqs: Optional[Tensor] = None,
) -> Tuple[Tensor, Tensor]:
    """Fake implementation for tracing."""
    return (
        q_in.new_empty(q_in.shape),
        k_in.new_empty(k_in.shape),
    )
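

if __name__ == "__main__":
    # Minimal smoke test for apply_rope; the shapes are illustrative
    # assumptions (B=1, Hq=4 / Hk=2 as in GQA, T=8, D=32). Not run on
    # import or during export.
    q = torch.randn(1, 4, 8, 32)
    k = torch.randn(1, 2, 8, 32)
    q_r, k_r = torch.ops.mlx.apply_rope(q, k, 32, 0)
    assert q_r.shape == q.shape and k_r.shape == k.shape

    # RoPE applies per-pair 2D rotations, so per-position vector norms are
    # preserved (up to float error).
    torch.testing.assert_close(q_r.norm(dim=-1), q.norm(dim=-1))
    print("apply_rope smoke test passed")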