NVIDIA
diff --git a/examples/llm_sparsity/attention_sparsity/README.md b/examples/llm_sparsity/attention_sparsity/README.md
Lines changed: 6 additions & 6 deletions
diff --git a/examples/llm_sparsity/attention_sparsity/hf_sa.py b/examples/llm_sparsity/attention_sparsity/hf_sa.py
Lines changed: 1 addition & 1 deletion
diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py
Lines changed: 16 additions & 78 deletions
diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py
Lines changed: 2 additions & 2 deletions
diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/dataset.py b/modelopt/torch/sparsity/attention_sparsity/calibration/dataset.py
Lines changed: 61 additions & 1 deletion
@@ -1,6 +1,6 @@
 # Attention Sparsity for HuggingFace Models
 
-In this tutorial, we demonstrate how to use NVIDIA TensorRT Model Optimizer to apply attention sparsity to HuggingFace models. Attention sparsity reduces computational cost by skipping near-zero attention scores during the softmax computation.
+In this tutorial, we demonstrate how to use NVIDIA Model Optimizer to apply attention sparsity to HuggingFace models. Attention sparsity reduces computational cost by skipping near-zero attention scores during the softmax computation.
 
 ## Getting Started
 
@@ -63,7 +63,7 @@ pip install nvidia-modelopt[hf]
 If using `SKIP_SOFTMAX_CALIB`, you need to download the RULER calibration dataset first:
 
 ```bash
-bash modelopt/torch/sparsity/attention_sparsity/calibration/download_ruler_data.sh
+bash ./download_ruler_data.sh
 ```
 
 This downloads the Paul Graham essays dataset used for generating calibration samples.
@@ -75,7 +75,7 @@ This downloads the Paul Graham essays dataset used for generating calibration sa
 Apply sparse attention with a fixed threshold:
 
 ```bash
-python examples/llm_sparsity/attention_sparsity/hf_sa.py \
+python hf_sa.py \
     --pyt_ckpt_path Qwen/Qwen3-8B \
     --sparse_attn skip_softmax
 ```
@@ -85,7 +85,7 @@ python examples/llm_sparsity/attention_sparsity/hf_sa.py \
 Apply sparse attention with calibrated thresholds for optimal sparsity:
 
 ```bash
-python examples/llm_sparsity/attention_sparsity/hf_sa.py \
+python hf_sa.py \
     --pyt_ckpt_path Qwen/Qwen3-8B \
     --sparse_attn skip_softmax_calib
 ```
@@ -121,7 +121,7 @@ The script automatically compares outputs before and after applying sparse atten
 Export the sparsified model to a HuggingFace checkpoint:
 
 ```bash
-python examples/llm_sparsity/attention_sparsity/hf_sa.py \
+python hf_sa.py \
     --pyt_ckpt_path Qwen/Qwen3-8B \
     --sparse_attn skip_softmax_calib \
     --export_dir ./exported_sparse_model
@@ -161,5 +161,5 @@ model = mtsa.sparsify(model, config=custom_config)
 
 ## References
 
-- [TensorRT Model Optimizer Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/)
+- [Model Optimizer Documentation](https://nvidia.github.io/Model-Optimizer/)
 - [RULER: What's the Real Context Size of Your Long-Context Language Models?](https://github.com/NVIDIA/RULER)
@@ -171,7 +171,7 @@ def main(args):
     print(f"\nApplying sparse attention: {args.sparse_attn}")
     sparse_config = SPARSE_ATTN_CFG_CHOICES[args.sparse_attn]
 
-    # Override target_sparse_ratio if provided via CLI
+    # Override calibration options if provided via CLI
     if args.target_sparse_ratio is not None:
         sparse_config = copy.deepcopy(sparse_config)
         sparse_cfg = sparse_config.get("sparse_cfg", {})
 
@@ -15,11 +15,8 @@
 
 """Calibration functions for sparse attention."""
 
-import hashlib
-import json
 import warnings
 from collections.abc import Callable
-from pathlib import Path
 from typing import Any
 
 import torch
@@ -28,59 +25,11 @@
 
 from ..config import CalibrationConfig
 from ..conversion import print_sparse_attention_summary
-from ..sparse_attention import SparseAttentionModule
+from ..utils import get_named_sparse_attention_modules
 from .calibrator import DynamicThresholdCalibrator
 from .dataset import RulerDatasetBuilder
 
 
-def _get_cache_path(
-    tokenizer_path: str, samples: int, max_seqlen: int, cache_dir: str | None = None
-) -> Path:
-    """Generate cache file path based on calibration parameters.
-
-    Args:
-        tokenizer_path: Path to tokenizer (used in hash)
-        samples: Number of calibration samples
-        max_seqlen: Maximum sequence length
-        cache_dir: Optional cache directory. If None, uses ~/.cache/modelopt/sparse_attention/
-    """
-    # Create a hash of the parameters for the cache filename
-    key = f"{tokenizer_path}_{samples}_{max_seqlen}"
-    hash_str = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()[:12]
-    filename = f"ruler_cache_{samples}s_{max_seqlen}l_{hash_str}.json"
-
-    if cache_dir:
-        base_dir = Path(cache_dir)
-    else:
-        base_dir = Path.home() / ".cache" / "modelopt" / "sparse_attention"
-
-    return base_dir / filename
-
-
-def _load_cached_data(cache_path: Path) -> list[dict[str, Any]] | None:
-    """Load calibration data from cache if it exists."""
-    if cache_path.exists():
-        try:
-            with open(cache_path) as f:
-                data = json.load(f)
-            print(f"Loaded {len(data)} cached calibration samples from {cache_path}")
-            return data
-        except Exception as e:
-            print(f"Warning: Failed to load cache: {e}")
-    return None
-
-
-def _save_cached_data(cache_path: Path, data: list[dict[str, Any]]) -> None:
-    """Save calibration data to cache."""
-    try:
-        cache_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(cache_path, "w") as f:
-            json.dump(data, f)
-        print(f"Saved calibration samples to cache: {cache_path}")
-    except Exception as e:
-        print(f"Warning: Failed to save cache: {e}")
-
-
 def _extract_tokenizer_from_model(model: nn.Module) -> str:
     """Extract tokenizer name/path from model config.
 
@@ -152,7 +101,9 @@ def create_calibration_forward_loop(
         tokenizer.pad_token = tokenizer.eos_token
 
     def forward_loop(model: nn.Module) -> None:
-        device = next(model.parameters()).device
+        from modelopt.torch.utils import get_module_device
+
+        device = get_module_device(model)
 
         for sample in calibration_data:
             inputs = tokenizer(
@@ -210,7 +161,9 @@ def create_decode_calibration_forward_loop(
         tokenizer.pad_token = tokenizer.eos_token
 
     def forward_loop(model: nn.Module) -> None:
-        device = next(model.parameters()).device
+        from modelopt.torch.utils import get_module_device
+
+        device = get_module_device(model)
 
         for sample in calibration_data:
             inputs = tokenizer(
@@ -291,9 +244,7 @@ def calibrate_sparse_attention(
         return {}
 
     # Get sparse attention modules
-    sparse_modules = [
-        (name, m) for name, m in model.named_modules() if isinstance(m, SparseAttentionModule)
-    ]
+    sparse_modules = get_named_sparse_attention_modules(model)
 
     if not sparse_modules:
         print("No sparse attention modules found for calibration")
@@ -306,29 +257,16 @@ def calibrate_sparse_attention(
     calibration_data = None
 
     if calibrate_prefill or calibrate_decode:
-        # Try to load from cache first
-        cache_path = _get_cache_path(
-            tokenizer,
-            calib_config.samples,
-            calib_config.max_seqlen,
+        builder = RulerDatasetBuilder(
+            samples=calib_config.samples,
+            max_seqlen=calib_config.max_seqlen,
+            tokenizer_name_or_path=tokenizer,
+            num_length_bins=calib_config.num_length_bins,
+            max_length_filter=int(calib_config.max_seqlen * 1.5),
             cache_dir=calib_config.cache_dir,
+            data_dir=calib_config.data_dir,
         )
-        calibration_data = _load_cached_data(cache_path)
-
-        # Generate if not cached
-        if calibration_data is None:
-            builder = RulerDatasetBuilder(
-                samples=calib_config.samples,
-                max_seqlen=calib_config.max_seqlen,
-                tokenizer_name_or_path=tokenizer,
-                num_length_bins=calib_config.num_length_bins,
-                max_length_filter=int(calib_config.max_seqlen * 1.5),
-            )
-            calibration_data = builder.build_calibration_dataset()
-            print(f"Generated {len(calibration_data)} calibration samples")
-
-            # Save to cache for future runs
-            _save_cached_data(cache_path, calibration_data)
+        calibration_data = builder.build_calibration_dataset()
 
     # Initialize results
     calibration_results: dict[str, Any] = {}
 
@@ -26,8 +26,8 @@
 from scipy.optimize import curve_fit
 from tqdm import tqdm
 
-from ..sparse_attention import SparseAttentionModule
 from ..stats_manager import SparseAttentionStatsManager
+from ..utils import get_sparse_attention_modules
 
 
 class DynamicThresholdCalibrator:
@@ -113,7 +113,7 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic
             Dict with calibration results including a, b, r_squared, and num_data_points
         """
         # Extract attention modules
-        attention_modules = [m for m in model.modules() if isinstance(m, SparseAttentionModule)]
+        attention_modules = get_sparse_attention_modules(model)
 
         if not attention_modules:
             raise ValueError("No sparse attention modules found for calibration")
 
@@ -15,9 +15,12 @@
 
 """RULER dataset builder for sparse attention calibration."""
 
+import hashlib
+import json
 import random
 import string
 from dataclasses import dataclass
+from pathlib import Path
 from typing import Any
 
 from tqdm import tqdm
@@ -125,7 +128,7 @@ class RulerTask:
         ),
         answer_prefix=(
             " Answer: According to the chain(s) of variable assignment in the text above, "
-            "{num_v} variables are assgined the value {query}, they are: "
+            "{num_v} variables are assigned the value {query}, they are: "
         ),
         args={"num_chains": 1, "num_hops": 4},
     ),
@@ -189,6 +192,8 @@ def __init__(
         num_length_bins: int = 4,
         max_length_filter: int = 65536,
         seed: int = 42,
+        cache_dir: str | None = None,
+        data_dir: str | Path | None = None,
     ):
         """Initialize RULER dataset builder.
 
@@ -199,6 +204,9 @@ def __init__(
             seed: Random seed for reproducibility
             num_length_bins: Number of length bins to generate (default: 4)
             max_length_filter: Maximum sequence length to keep (default: 65536)
+            cache_dir: Optional cache directory. If None, uses ~/.cache/modelopt/data/
+            data_dir: Optional path to RULER data directory (contains 'essays' subdir).
+                Required for NIAH tasks with essay haystack when not using pip default layout.
 
         Note:
             Length bins are auto-generated as descending powers of 2:
@@ -220,6 +228,8 @@ def __init__(
         self.tokenizer_name_or_path = tokenizer_name_or_path
         self.seed = seed
         self.max_length_filter = max_length_filter
+        self.cache_dir = cache_dir
+        self.data_dir = Path(data_dir) if data_dir is not None else None
 
         # Generate target lengths and validate
         self.target_lengths = _generate_target_lengths(max_seqlen, num_length_bins, min_seqlen=1024)
@@ -238,12 +248,58 @@ def __init__(
             self.tokenizer = tokenizer_name_or_path
         random.seed(seed)
 
+    def _get_cache_path(self) -> Path:
+        """Return the cache file path, hashed over every setting that shapes the samples.
+
+        The file lives in ``cache_dir`` when set, else in ``~/.cache/modelopt/data``.
+        """
+        # Seed, length bins, length filter and data dir all alter the generated samples;
+        # leaving them out of the key would silently reuse a stale cache after a change.
+        key = (
+            f"{self.tokenizer_name_or_path}_{self.total_samples}_{self.max_seqlen}_{self.seed}"
+            f"_{self.target_lengths}_{self.max_length_filter}_{self.data_dir}"
+        )
+        hash_str = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()[:12]
+        filename = f"ruler_cache_{self.total_samples}s_{self.max_seqlen}l_{hash_str}.json"
+        default_dir = Path.home() / ".cache" / "modelopt" / "data"
+        return (Path(self.cache_dir) if self.cache_dir else default_dir) / filename
+
+    def _load_cached_data(self, cache_path: Path) -> list[dict[str, Any]] | None:
+        """Return the samples stored at ``cache_path``, or None if absent or unreadable."""
+        if not cache_path.exists():
+            return None
+        try:
+            samples = json.loads(cache_path.read_text())
+            print(f"Loaded {len(samples)} cached calibration samples from {cache_path}")
+            return samples
+        except Exception as err:
+            print(f"Warning: Failed to load cache: {err}")
+            return None
+
+    def _save_cached_data(self, cache_path: Path, data: list[dict[str, Any]]) -> None:
+        """Write ``data`` as JSON to ``cache_path``; on failure print a warning, never raise."""
+        try:
+            target_dir = cache_path.parent
+            target_dir.mkdir(parents=True, exist_ok=True)
+            with cache_path.open("w") as cache_file:
+                json.dump(data, cache_file)
+            print(f"Saved calibration samples to cache: {cache_path}")
+        except Exception as err:
+            print(f"Warning: Failed to save cache: {err}")
+
     def build_calibration_dataset(self) -> list[dict[str, Any]]:
         """Build the complete calibration dataset.
 
+        Checks the cache first (cache_dir, or the default location) and returns a hit if present.
+        Otherwise generates the dataset, saves it to the cache, and returns it.
+
         Returns:
             List of calibration samples with 'input' and 'length' fields
         """
+        cache_path = self._get_cache_path()
+        cached = self._load_cached_data(cache_path)
+        if cached is not None:
+            return cached
+
         all_samples = []
 
         print(
@@ -265,6 +321,8 @@ def build_calibration_dataset(self) -> list[dict[str, Any]]:
 
         random.shuffle(all_samples)
         print(f"Generated {len(all_samples)} valid samples")
+
+        self._save_cached_data(cache_path, all_samples)
         return all_samples
 
     def _generate_sample(
@@ -312,6 +370,7 @@ def _generate_niah_sample(
             num_needle_k=args.get("num_needle_k", 1),
             num_needle_v=args.get("num_needle_v", 1),
             num_needle_q=args.get("num_needle_q", 1),
+            data_dir=self.data_dir,
         )
 
         # Generate sample using official RULER implementation
@@ -328,6 +387,7 @@ def _generate_niah_sample(
             num_needle_v=args.get("num_needle_v", 1),
             num_needle_q=args.get("num_needle_q", 1),
             random_seed=self.seed + sample_idx,
+            data_dir=self.data_dir,
         )
 
         # Add task metadata