NVIDIA
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
Lines changed: 1 addition & 0 deletions
diff --git a/modelopt/torch/kernels/common/attention/hf_triton_attention.py b/modelopt/torch/kernels/common/attention/hf_triton_attention.py
Lines changed: 23 additions & 77 deletions
diff --git a/modelopt/torch/kernels/common/attention/triton_fa.py b/modelopt/torch/kernels/common/attention/triton_fa.py
Lines changed: 96 additions & 81 deletions
@@ -43,6 +43,7 @@ Changelog
 - Add Nemotron-3-Super-120B-A12B PTQ recipes ``modelopt_recipes/models/Nemotron-3-Super-120B-A12B/super-nvfp4.yaml`` (MSE-mixed) and ``super-nvfp4-max-calib.yaml`` (max-calib mixed): NVFP4 W4A4 routed experts + FP8 per-tensor shared experts / Mamba in/out_proj + FP8 KV cache.
 - Add quantized ``nn.Embedding`` support. ``nn.Embedding`` is now registered in ``QuantModuleRegistry`` and exposes ``weight_quantizer`` (embedding table), ``output_quantizer`` (lookup activations), and a permanently disabled ``input_quantizer`` placeholder — embedding inputs are integer indices and cannot be fake-quantized, so direct ``enable*()`` calls raise. ``export_hf_checkpoint`` packs quantized embedding weights alongside Linear layers. Embedding quantizers are opt-in (``parent_class: nn.Embedding`` disabled by default).
 - Add post-training quantization (PTQ) example for the Megatron-Bridge framework: ``examples/megatron_bridge/quantize.py`` calibrates an HF model (via ``--quant_cfg`` alias / full config name or a ``--recipe`` YAML, with optional KV-cache quant, weight-only, compression, and MoE expert-ratio calibration) and saves a Megatron checkpoint (tensor / pipeline / expert parallelism supported), and ``examples/megatron_bridge/export.py`` converts that checkpoint to a deployable HuggingFace (unified) checkpoint for TensorRT-LLM / vLLM / SGLang. See `examples/megatron_bridge/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/megatron_bridge>`_ for details.
+- Add ``mtsa.config.SKIP_SOFTMAX_TRITON_CALIB`` for skip-softmax attention-sparsity calibration through the fused Triton ``attention_calibrate`` kernel (HF ``modelopt_triton`` backend), measuring multi-threshold tile-skip statistics the way the Triton inference kernel actually skips tiles for both prefill and decode. Exposed as ``--sparse_attn_cfg skip_softmax_triton_calib`` in ``examples/llm_sparsity/attention_sparsity/hf_sa.py`` (with a new ``--calib_data_dir`` flag for RULER calibration data).
 
 **Bug Fixes**
 
 
@@ -22,70 +22,14 @@
 
 from __future__ import annotations
 
-import threading
-
 import torch
 import torch.nn as nn
 
 from modelopt.torch.kernels.common.attention.triton_fa import attention
 
-# ---------------------------------------------------------------------------
-# Thread-local skip-softmax calibration config for the HF (modelopt_triton) backend
-# ---------------------------------------------------------------------------
-# Mirrors the diffusers/LTX backends: during calibration the Triton calibration
-# kernel measures multi-threshold tile-skip statistics without skipping any tiles.
-# Inference-time config (skip threshold / scale factor) is still read from the
-# module/method attributes in ``triton_attention_forward`` — only calibration
-# state lives here.
-_thread_local = threading.local()
-
-
-def set_hf_triton_skip_softmax_config(
-    threshold: float | None = None,
-    calibration_mode: bool = False,
-    threshold_trials: list[float] | None = None,
-    scale_factor: float | None = None,
-    measure_sparsity: bool = False,
-) -> None:
-    """Set thread-local skip-softmax calibration config for the next forward.
-
-    Accepts the same keyword arguments as the diffusers/LTX backends so the
-    shared :class:`TritonSkipSoftmaxMethod` can configure all backends uniformly.
-    Only the calibration fields are consumed by the HF backend; the inference
-    fields (``threshold``/``scale_factor``/``measure_sparsity``) are accepted for
-    signature compatibility but ignored here, since the HF inference path reads
-    its threshold from the module/method attributes.
-
-    Args:
-        threshold: Ignored by the HF backend (inference threshold comes from the module).
-        calibration_mode: If True, route prefill attention through the calibration kernel.
-        threshold_trials: Thresholds to measure sparsity for (used when calibration_mode=True).
-        scale_factor: Ignored by the HF backend.
-        measure_sparsity: Ignored by the HF backend.
-    """
-    _thread_local.calibration_mode = calibration_mode
-    _thread_local.threshold_trials = threshold_trials
-    # Counters accumulated across all attention calls in one forward pass.
-    _thread_local.calibration_counters = None
-    _thread_local.calibration_seq_k = None
-
-
-def clear_hf_triton_skip_softmax_config() -> None:
-    """Clear thread-local skip-softmax calibration config."""
-    _thread_local.calibration_mode = False
-    _thread_local.threshold_trials = None
-    _thread_local.calibration_counters = None
-    _thread_local.calibration_seq_k = None
-
-
-def get_calibration_counters() -> torch.Tensor | None:
-    """Return accumulated calibration counters ``[num_thresholds, 2]`` or None."""
-    return getattr(_thread_local, "calibration_counters", None)
-
-
-def get_calibration_seq_k() -> int | None:
-    """Return KV sequence length observed during calibration, or None."""
-    return getattr(_thread_local, "calibration_seq_k", None)
+# Skip-softmax calibration config and counters live on the module's
+# ``_sparse_method_instance`` (HF passes the owning module to
+# ``triton_attention_forward``), so no separate thread-local state is needed.
 
 
 def _seq_lens_from_mask(
@@ -165,29 +109,35 @@ def triton_attention_forward(
         kw["b_seq_len_k"] = torch.full((batch,), seq_k, device=device, dtype=torch.int32)
         kw["max_input_len_k"] = seq_k
 
-    # --- Calibration mode: collect multi-threshold tile-skip stats (prefill only) ---
-    # Run the calibration kernel, which computes full (non-skipped) attention while
-    # counting, per candidate threshold, how many KV tiles would be skipped. ``kw`` at
-    # this point holds only the base attention args that ``attention_calibrate`` accepts;
-    # the sparse-attention kwargs below are intentionally not added in this branch.
-    calib_mode = getattr(_thread_local, "calibration_mode", False)
-    if calib_mode and not is_decode:
-        trials = getattr(_thread_local, "threshold_trials", None)
+    # Sparse-attention method instance. It carries the inference threshold and,
+    # during calibration, both the calibration config and the accumulated
+    # tile-skip counters. Available here because HF passes the owning module.
+    method = getattr(module, "_sparse_method_instance", None)
+
+    # Calibration mode: run the calibration kernel, which computes full attention
+    # while counting, per candidate threshold, how many KV tiles would be skipped.
+    # The sparse-attention kwargs below are intentionally not added in this branch.
+    if method is not None and getattr(method, "_calibration_mode", False):
+        trials = getattr(method, "_threshold_trials", None)
+        # Deferred: the package __init__ imports this module, so importing
+        # attention_calibrate at module top would be circular.
         from modelopt.torch.kernels.common.attention import attention_calibrate
 
         if trials and attention_calibrate is not None:
             o, counters = attention_calibrate(q, k, v, **kw, threshold_trials=trials)
 
             # Accumulate counters across all attention calls in this forward pass.
-            prev = getattr(_thread_local, "calibration_counters", None)
-            _thread_local.calibration_counters = counters if prev is None else prev + counters
-            _thread_local.calibration_seq_k = seq_k
+            # The method instance is per-module so the accumulator stays on one
+            # device, but guard the add against a device mismatch just in case.
+            prev = getattr(method, "_hf_calibration_counters", None)
+            method._hf_calibration_counters = (
+                counters if prev is None else prev + counters.to(prev.device)
+            )
+            method._hf_calibration_seq_k = seq_k
+            method._hf_calibration_is_decode = is_decode
 
             return (o.view(batch, seq_len, num_heads, head_dim), None)
 
-    # Sparse attention params
-    method = getattr(module, "_sparse_method_instance", None)
-
     # N:M sparse softmax: prefill only (no perf benefit for decode)
     if method is not None and not is_decode and getattr(module, "_apply_sparse_nm", False):
         kw["sparsity_n"] = method.sparsity_n
@@ -233,10 +183,6 @@ def register_triton_attention() -> bool:
 
 
 __all__ = [
-    "clear_hf_triton_skip_softmax_config",
-    "get_calibration_counters",
-    "get_calibration_seq_k",
     "register_triton_attention",
-    "set_hf_triton_skip_softmax_config",
     "triton_attention_forward",
 ]
@@ -919,23 +919,29 @@ def forward(
         def grid(META):
             return (batch, num_q_heads, triton.cdiv(max_input_len, META["BLOCK_M"]))
 
-        if do_measure:
-            # Runtime counters mutate global tensors, so do not run them through
-            # autotune candidate trials. Use one stable config for measurement.
-            _attn_fwd.fn[grid](
-                *fwd_args,
-                **fwd_kwargs,
-                BLOCK_M=_MEASURE_BLOCK_M,
-                BLOCK_N=_MEASURE_BLOCK_N,
-                num_warps=_MEASURE_NUM_WARPS,
-                num_stages=_MEASURE_NUM_STAGES,
-            )
-        else:
-            _attn_fwd[grid](
-                *fwd_args,
-                **fwd_kwargs,
-                # BLOCK_M, BLOCK_N, num_warps, num_stages chosen by autotune
-            )
+        # Triton launches on torch.cuda.current_device(), which is not
+        # necessarily the device the tensors live on (e.g. under accelerate
+        # device_map="auto" sharding). Activate the tensor's device so the
+        # kernel dereferences the right pointers instead of triggering an
+        # illegal memory access.
+        with torch.cuda.device(q.device):
+            if do_measure:
+                # Runtime counters mutate global tensors, so do not run them through
+                # autotune candidate trials. Use one stable config for measurement.
+                _attn_fwd.fn[grid](
+                    *fwd_args,
+                    **fwd_kwargs,
+                    BLOCK_M=_MEASURE_BLOCK_M,
+                    BLOCK_N=_MEASURE_BLOCK_N,
+                    num_warps=_MEASURE_NUM_WARPS,
+                    num_stages=_MEASURE_NUM_STAGES,
+                )
+            else:
+                _attn_fwd[grid](
+                    *fwd_args,
+                    **fwd_kwargs,
+                    # BLOCK_M, BLOCK_N, num_warps, num_stages chosen by autotune
+                )
 
         # Store sparsity counters on the output tensor for retrieval by callers
         if do_measure:
@@ -970,23 +976,30 @@ def backward(ctx, grad_output):
         do = grad_output.contiguous()
         num_warps = 4
 
+        # Triton launches on torch.cuda.current_device(), which is not
+        # necessarily the device the tensors live on (e.g. under accelerate
+        # device_map="auto" sharding). Activate the tensor's device for each
+        # launch so the kernels dereference the right pointers instead of
+        # triggering an illegal memory access.
+
         # Phase 1: delta = rowsum(O * dO)
         delta = torch.empty_like(lse)
-        _attn_bwd_preprocess[(ctx.num_q_heads, triton.cdiv(q.shape[0], BLOCK))](
-            o,
-            do,
-            delta,
-            o.stride(0),
-            o.stride(1),
-            do.stride(0),
-            do.stride(1),
-            delta.stride(0),
-            delta.stride(1),
-            q.shape[0],
-            HEAD_DIM=HEAD_DIM,
-            BLOCK_D=BLOCK_D,
-            BLOCK_M=BLOCK,
-        )
+        with torch.cuda.device(q.device):
+            _attn_bwd_preprocess[(ctx.num_q_heads, triton.cdiv(q.shape[0], BLOCK))](
+                o,
+                do,
+                delta,
+                o.stride(0),
+                o.stride(1),
+                do.stride(0),
+                do.stride(1),
+                delta.stride(0),
+                delta.stride(1),
+                q.shape[0],
+                HEAD_DIM=HEAD_DIM,
+                BLOCK_D=BLOCK_D,
+                BLOCK_M=BLOCK,
+            )
 
         dq = torch.zeros_like(q)
         dk = torch.zeros_like(k)
@@ -1016,57 +1029,59 @@ def backward(ctx, grad_output):
         )
 
         # Phase 2: dK, dV
-        _attn_bwd_dkdv[(ctx.batch, ctx.num_kv_heads, triton.cdiv(ctx.max_input_len_k, BLOCK))](
-            *bwd_args[:4],
-            dk,
-            dv,
-            *bwd_args[4:],
-            dk.stride(0),
-            dk.stride(1),
-            dv.stride(0),
-            dv.stride(1),
-            lse.stride(0),
-            lse.stride(1),
-            kv_group_num=ctx.kv_group_num,
-            BLOCK_M=BLOCK,
-            BLOCK_D=BLOCK_D,
-            BLOCK_N=BLOCK,
-            IS_CAUSAL=ctx.is_causal,
-            HEAD_DIM=HEAD_DIM,
-            SPARSITY_N=ctx.sparsity_n,
-            SPARSITY_M=ctx.sparsity_m,
-            DENSE_SINK_TOKENS=ctx.dense_sink_tokens,
-            DENSE_RECENT_TOKENS=ctx.dense_recent_tokens,
-            APPLY_SKIP_SOFTMAX=ctx.apply_skip,
-            SKIP_THRESHOLD_LOG2=ctx.skip_threshold_log2,
-            num_warps=num_warps,
-            num_stages=1,
-        )
+        with torch.cuda.device(q.device):
+            _attn_bwd_dkdv[(ctx.batch, ctx.num_kv_heads, triton.cdiv(ctx.max_input_len_k, BLOCK))](
+                *bwd_args[:4],
+                dk,
+                dv,
+                *bwd_args[4:],
+                dk.stride(0),
+                dk.stride(1),
+                dv.stride(0),
+                dv.stride(1),
+                lse.stride(0),
+                lse.stride(1),
+                kv_group_num=ctx.kv_group_num,
+                BLOCK_M=BLOCK,
+                BLOCK_D=BLOCK_D,
+                BLOCK_N=BLOCK,
+                IS_CAUSAL=ctx.is_causal,
+                HEAD_DIM=HEAD_DIM,
+                SPARSITY_N=ctx.sparsity_n,
+                SPARSITY_M=ctx.sparsity_m,
+                DENSE_SINK_TOKENS=ctx.dense_sink_tokens,
+                DENSE_RECENT_TOKENS=ctx.dense_recent_tokens,
+                APPLY_SKIP_SOFTMAX=ctx.apply_skip,
+                SKIP_THRESHOLD_LOG2=ctx.skip_threshold_log2,
+                num_warps=num_warps,
+                num_stages=1,
+            )
 
         # Phase 3: dQ
-        _attn_bwd_dq[(ctx.batch, ctx.num_q_heads, triton.cdiv(ctx.max_input_len, BLOCK))](
-            *bwd_args[:4],
-            dq,
-            *bwd_args[4:],
-            dq.stride(0),
-            dq.stride(1),
-            lse.stride(0),
-            lse.stride(1),
-            kv_group_num=ctx.kv_group_num,
-            BLOCK_M=BLOCK,
-            BLOCK_D=BLOCK_D,
-            BLOCK_N=BLOCK,
-            IS_CAUSAL=ctx.is_causal,
-            HEAD_DIM=HEAD_DIM,
-            SPARSITY_N=ctx.sparsity_n,
-            SPARSITY_M=ctx.sparsity_m,
-            DENSE_SINK_TOKENS=ctx.dense_sink_tokens,
-            DENSE_RECENT_TOKENS=ctx.dense_recent_tokens,
-            APPLY_SKIP_SOFTMAX=ctx.apply_skip,
-            SKIP_THRESHOLD_LOG2=ctx.skip_threshold_log2,
-            num_warps=num_warps,
-            num_stages=1,
-        )
+        with torch.cuda.device(q.device):
+            _attn_bwd_dq[(ctx.batch, ctx.num_q_heads, triton.cdiv(ctx.max_input_len, BLOCK))](
+                *bwd_args[:4],
+                dq,
+                *bwd_args[4:],
+                dq.stride(0),
+                dq.stride(1),
+                lse.stride(0),
+                lse.stride(1),
+                kv_group_num=ctx.kv_group_num,
+                BLOCK_M=BLOCK,
+                BLOCK_D=BLOCK_D,
+                BLOCK_N=BLOCK,
+                IS_CAUSAL=ctx.is_causal,
+                HEAD_DIM=HEAD_DIM,
+                SPARSITY_N=ctx.sparsity_n,
+                SPARSITY_M=ctx.sparsity_m,
+                DENSE_SINK_TOKENS=ctx.dense_sink_tokens,
+                DENSE_RECENT_TOKENS=ctx.dense_recent_tokens,
+                APPLY_SKIP_SOFTMAX=ctx.apply_skip,
+                SKIP_THRESHOLD_LOG2=ctx.skip_threshold_log2,
+                num_warps=num_warps,
+                num_stages=1,
+            )
 
         return (
             dq,