xlite-dev
diff --git a/‎quack/cross_entropy.py‎
Lines changed: 3 additions & 59 deletions b/‎quack/cross_entropy.py‎
Lines changed: 3 additions & 59 deletions
diff --git a/‎quack/dsl/__init__.py‎
Lines changed: 5 additions & 0 deletions b/‎quack/dsl/__init__.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎quack/dsl/torch_library_op.py‎
Lines changed: 79 additions & 0 deletions b/‎quack/dsl/torch_library_op.py‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎quack/hadamard.py‎
Lines changed: 2 additions & 11 deletions b/‎quack/hadamard.py‎
Lines changed: 2 additions & 11 deletions
diff --git a/‎quack/rms_final_reduce.py‎
Lines changed: 2 additions & 10 deletions b/‎quack/rms_final_reduce.py‎
Lines changed: 2 additions & 10 deletions
diff --git a/‎quack/rmsnorm.py‎
Lines changed: 3 additions & 93 deletions b/‎quack/rmsnorm.py‎
Lines changed: 3 additions & 93 deletions
@@ -17,6 +17,7 @@
 import quack.copy_utils as copy_utils
 import quack.layout_utils as layout_utils
 from quack.compile_utils import make_fake_tensor as fake_tensor
+from quack.dsl import cute_op
 from quack.reduce import row_reduce, online_softmax_reduce
 from quack.reduction_base import ReductionBase
 from quack.cache_utils import jit_cache
@@ -308,7 +309,7 @@ def _compile_cross_entropy_fwd(
     )
 
 
-@torch.library.custom_op("quack::cross_entropy_fwd_out", mutates_args={"loss", "lse", "dx"})
+@cute_op("quack::cross_entropy_fwd_out", mutates_args={"loss", "lse", "dx"})
 def cross_entropy_fwd_out(
     x: Tensor,
     target: Tensor,
@@ -363,42 +364,6 @@ def cross_entropy_fwd_out(
     )(x, target, target_logit, loss, lse, dx, weight, Int32(ignore_index))
 
 
-@cross_entropy_fwd_out.register_fake
-def _cross_entropy_fwd_out_fake(
-    x: Tensor,
-    target: Tensor,
-    target_logit: Optional[Tensor],
-    loss: Tensor,
-    lse: Optional[Tensor],
-    dx: Optional[Tensor],
-    weight: Optional[Tensor],
-    ignore_index: int = -100,
-) -> None:
-    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.size(1), torch.SymInt):
-        N = x.size(1)
-        dtype = torch2cute_dtype_map[x.dtype]
-        target_dtype = torch2cute_dtype_map[target.dtype]
-        target_logit_dtype = (
-            torch2cute_dtype_map[target_logit.dtype] if target_logit is not None else None
-        )
-        target_logit_ndim = target_logit.ndim if target_logit is not None else None
-        weight_dtype = torch2cute_dtype_map[weight.dtype] if weight is not None else None
-        _compile_cross_entropy_fwd(
-            dtype,
-            target_dtype,
-            target_logit_dtype,
-            N,
-            lse is not None,
-            dx is not None,
-            weight_dtype,
-            target_logit_ndim,
-        )
-        _compile_cross_entropy_backward(dtype, target_dtype, N, weight_dtype)
-
-
 def cross_entropy_fwd(
     x: torch.Tensor,
     target: torch.Tensor,
@@ -649,7 +614,7 @@ def _cross_entropy_backward(
     )
 
 
-@torch.library.custom_op("quack::cross_entropy_bwd_out", mutates_args={"dx"})
+@cute_op("quack::cross_entropy_bwd_out", mutates_args={"dx"})
 def cross_entropy_bwd_out(
     x: torch.Tensor,
     target: torch.Tensor,
@@ -662,27 +627,6 @@ def cross_entropy_bwd_out(
     _cross_entropy_backward(x, target, dloss, lse, dx, weight, ignore_index)
 
 
-@cross_entropy_bwd_out.register_fake
-def _cross_entropy_bwd_out_fake(
-    x: torch.Tensor,
-    target: torch.Tensor,
-    dloss: torch.Tensor,
-    lse: torch.Tensor,
-    dx: torch.Tensor,
-    weight: Optional[torch.Tensor] = None,
-    ignore_index: int = -100,
-) -> None:
-    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.size(1), torch.SymInt):
-        N = x.size(1)
-        dtype = torch2cute_dtype_map[x.dtype]
-        target_dtype = torch2cute_dtype_map[target.dtype]
-        weight_dtype = torch2cute_dtype_map[weight.dtype] if weight is not None else None
-        _compile_cross_entropy_backward(dtype, target_dtype, N, weight_dtype)
-
-
 def cross_entropy_bwd(
     x: torch.Tensor,
     target: torch.Tensor,
 
@@ -0,0 +1,5 @@
+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+
+from quack.dsl.torch_library_op import cute_op
+
+__all__ = ["cute_op"]
@@ -0,0 +1,79 @@
+# Copyright (c) 2025, Wentao Guo, Ted Zadouri, Tri Dao.
+"""``cute_op``: ``torch.library.custom_op`` for CuTe DSL kernels.
+
+Same trick as ``torch.library.triton_op`` (register the impl as the fake/meta
+kernel too), specialized for our setup:
+
+* Under ``torch.compile`` we stay a complete no-op (matches prior behavior;
+  also avoids moving compile latency into dynamo trace time).
+* Under ``FakeTensorMode`` with SymInt shapes (dynamic-shape tracing), skip:
+  ``@jit_cache`` is an ``lru_cache`` and SymInts are unhashable.
+* Otherwise (``FakeTensorMode`` with concrete shapes, e.g. the COMPILE_ONLY
+  worker) flip ``cache_utils.COMPILE_ONLY`` for the duration of the call so
+  ``@jit_cache`` returns ``_noop_kernel`` for every ``_compile_*(...)`` it
+  populates. The body runs end-to-end, the .o cache is filled, and no kernel
+  is actually launched.
+
+This removes the need for hand-written ``_*_fake`` twins on each op.
+"""
+
+from __future__ import annotations
+
+from typing import Any, Callable, Iterable, Optional, Union
+
+import torch
+
+from quack import cache_utils
+
+
+__all__ = ["cute_op"]
+
+
+def _has_symint_shape(args: Iterable[Any]) -> bool:
+    """Return True if any tensor in ``args`` has a ``torch.SymInt`` dimension."""
+    # Non-tensor arguments (None, ints, floats, ...) are ignored.
+    tensors = (a for a in args if isinstance(a, torch.Tensor))
+    return any(isinstance(dim, torch.SymInt) for t in tensors for dim in t.shape)
+
+
+def cute_op(
+    name: str,
+    *,
+    mutates_args: Union[str, Iterable[str]],
+    schema: Optional[str] = None,
+    device_types: Optional[Union[str, Iterable[str]]] = None,
+) -> Callable:
+    """Build a ``torch.library.custom_op`` decorator that also registers a fake impl.
+
+    The fake impl returns ``None``; it is a no-op while compiling or when a tensor
+    argument has a SymInt dim, else it runs ``fn`` with ``COMPILE_ONLY`` set.
+
+    Args:
+        name: ``"namespace::op_name"``.
+        mutates_args: Names of the arguments the op mutates.
+        schema: Optional explicit schema, forwarded only when not ``None``.
+            NOTE(review): said to be required for mutated ``Optional[Tensor]`` args.
+        device_types: Optional device-type restriction, forwarded when not ``None``.
+    """
+
+    def dec(fn: Callable) -> Any:
+        optional = {"schema": schema, "device_types": device_types}
+        extra = {key: val for key, val in optional.items() if val is not None}
+        op = torch.library.custom_op(name, fn, mutates_args=mutates_args, **extra)
+
+        def _fake(*args, **kw):
+            if torch.compiler.is_compiling():
+                return None
+            if any(_has_symint_shape(group) for group in (args, kw.values())):
+                return None
+            previous = cache_utils.COMPILE_ONLY
+            cache_utils.COMPILE_ONLY = True
+            try:
+                fn(*args, **kw)  # return value is discarded
+            finally:
+                cache_utils.COMPILE_ONLY = previous  # restore even if fn raises
+
+        op.register_fake(_fake)
+        return op
+
+    return dec
@@ -17,6 +17,7 @@
 from quack.cache_utils import jit_cache
 from quack.compile_utils import make_fake_tensor as fake_tensor
 from quack.cute_dsl_utils import torch2cute_dtype_map
+from quack.dsl import cute_op
 
 
 def _next_power_of_2(n: int) -> int:
@@ -280,7 +281,7 @@ def _compile_hadamard_transform_fwd(dtype, N):
     )
 
 
-@torch.library.custom_op(
+@cute_op(
     "quack::_hadamard_transform_fwd",
     mutates_args={"out"},
     device_types="cuda",
@@ -298,16 +299,6 @@ def _hadamard_transform_fwd(x: Tensor, out: Tensor, scale: float) -> None:
     _compile_hadamard_transform_fwd(dtype, N)(x, out, scale)
 
 
-@_hadamard_transform_fwd.register_fake
-def _hadamard_transform_fwd_fake(x: Tensor, out: Tensor, scale: float) -> None:
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.size(1), torch.SymInt):
-        N = x.size(1)
-        dtype = torch2cute_dtype_map[x.dtype]
-        _compile_hadamard_transform_fwd(dtype, N)
-
-
 def hadamard_transform_fwd(x: Tensor, scale: float = 1.0) -> Tensor:
     assert x.dim() >= 1, "Input must have at least one dimension"
     x = _ensure_last_dim_contiguous(x)
 
@@ -21,6 +21,7 @@
 from quack.reduction_base import ReductionBase
 from quack.cache_utils import jit_cache
 from quack.cute_dsl_utils import torch2cute_dtype_map
+from quack.dsl import cute_op
 
 
 class RmsFinalReduce(ReductionBase):
@@ -134,7 +135,7 @@ def _compile_rms_final_reduce(dtype, N):
     )
 
 
-@torch.library.custom_op(
+@cute_op(
     "quack::rms_final_reduce_out",
     mutates_args=("rstd",),
     device_types="cuda",
@@ -152,15 +153,6 @@ def _rms_final_reduce_out(
     compiled_fn(x, rstd, scale, eps)
 
 
-@_rms_final_reduce_out.register_fake
-def _rms_final_reduce_out_fake(x, rstd, scale, eps):
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.shape[0], torch.SymInt):
-        x_dtype = torch2cute_dtype_map[x.dtype]
-        _compile_rms_final_reduce(x_dtype, x.shape[1])
-
-
 def rms_final_reduce(
     x: Tensor,  # (M, N) partial squared sums
     scale: float,  # typically 1.0 / total_columns
 
@@ -17,6 +17,7 @@
 import quack.copy_utils as copy_utils
 import quack.layout_utils as layout_utils
 from quack.compile_utils import make_fake_tensor as fake_tensor
+from quack.dsl import cute_op
 from quack.reduce import row_reduce
 from quack.reduction_base import ReductionBase
 from quack.cache_utils import jit_cache
@@ -316,7 +317,7 @@ def kernel(
             copy(tXrO, tXgO)
 
 
-@torch.library.custom_op(
+@cute_op(
     "quack::_rmsnorm_fwd",
     mutates_args=("out", "rstd", "mean", "residual_out"),
     device_types="cuda",
@@ -375,58 +376,6 @@ def _rmsnorm_fwd(
     )(x, weight, bias, residual, out, residual_out, rstd, mean, eps)
 
 
-@_rmsnorm_fwd.register_fake
-def _rmsnorm_fwd_fake(
-    x: Tensor,
-    weight: Optional[Tensor],
-    out: Tensor,
-    bias: Optional[Tensor] = None,
-    rstd: Optional[Tensor] = None,
-    mean: Optional[Tensor] = None,
-    residual: Optional[Tensor] = None,
-    residual_out: Optional[Tensor] = None,
-    eps: float = 1e-6,
-    is_layernorm: bool = False,
-) -> None:
-    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.size(-1), torch.SymInt):
-        N = x.size(-1)
-        per_head = (weight is not None and weight.dim() == 2) or (
-            bias is not None and bias.dim() == 2
-        )
-        dtype, out_dtype, weight_dtype, bias_dtype, res_dtype, res_out_dtype = [
-            torch2cute_dtype_map[t.dtype] if t is not None else None
-            for t in [x, out, weight, bias, residual, residual_out]
-        ]
-        _compile_rmsnorm_fwd(
-            dtype,
-            out_dtype,
-            res_dtype,
-            weight_dtype,
-            bias_dtype,
-            res_out_dtype,
-            N,
-            rstd is not None,
-            mean is not None,
-            is_layernorm,
-            per_head,
-        )
-        _compile_rmsnorm_bwd(
-            N,
-            dtype,
-            dtype,
-            dtype,
-            weight_dtype,
-            bias is not None,
-            res_dtype,
-            res_out_dtype,
-            weight is not None,
-            per_head,
-        )
-
-
 @jit_cache
 def _compile_rmsnorm_fwd(
     dtype,
@@ -921,7 +870,7 @@ def _get_sm_count(N: int, device: torch.device) -> int:
     return sm_count
 
 
-@torch.library.custom_op(
+@cute_op(
     "quack::_rmsnorm_bwd",
     mutates_args={"dx", "dw_partial", "db_partial", "dresidual"},
     device_types="cuda",
@@ -991,45 +940,6 @@ def _rmsnorm_bwd(
     )(x, weight, dout, dresidual_out, rstd, dx, dw_partial, dresidual, db_partial, sm_count)
 
 
-@_rmsnorm_bwd.register_fake
-def _rmsnorm_bwd_fake(
-    x: Tensor,
-    weight: Optional[Tensor],
-    dout: Tensor,
-    rstd: Tensor,
-    dx: Tensor,
-    dw_partial: Optional[Tensor],
-    db_partial: Optional[Tensor] = None,
-    dresidual_out: Optional[Tensor] = None,
-    dresidual: Optional[Tensor] = None,
-    sm_count: Optional[int] = None,
-) -> None:
-    # See softmax.py _softmax_fwd_fake for why register_fake is needed.
-    from quack.cache_utils import COMPILE_ONLY
-
-    if COMPILE_ONLY and not isinstance(x.size(-1), torch.SymInt):
-        N = x.size(-1)
-        per_head = x.dim() == 3
-        if dw_partial is None and db_partial is None and sm_count is None:
-            return
-        dtype, dout_dtype, dx_dtype, weight_dtype, dres_dtype, dres_out_dtype = [
-            torch2cute_dtype_map[t.dtype] if t is not None else None
-            for t in [x, dout, dx, weight, dresidual, dresidual_out]
-        ]
-        _compile_rmsnorm_bwd(
-            N,
-            dtype,
-            dout_dtype,
-            dx_dtype,
-            weight_dtype,
-            db_partial is not None,
-            dres_dtype,
-            dres_out_dtype,
-            dw_partial is not None,
-            per_head,
-        )
-
-
 @jit_cache
 def _compile_rmsnorm_bwd(
     N,