From 816ddfaa063ffd355ae8369eebdf469ae098c8dc Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 28 Apr 2026 00:29:17 -0700
Subject: [PATCH 01/35] enabling runtime optimization

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/__init__.py |  15 +
 .../nas/subblock_stats/calc_runtime_stats.py  | 271 ++++++++++++++++++
 .../calc_subblock_params_and_memory.py        |   8 +-
 .../puzzletron/subblock_stats/__init__.py     |   1 -
 .../subblock_stats/calc_subblock_stats.py     | 131 ++-------
 5 files changed, 317 insertions(+), 109 deletions(-)
 create mode 100644 modelopt/torch/nas/subblock_stats/__init__.py
 create mode 100644 modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
 rename modelopt/torch/{puzzletron => nas}/subblock_stats/calc_subblock_params_and_memory.py (97%)

diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
new file mode 100644
index 00000000000..ff8d16685d6
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/__init__.py
@@ -0,0 +1,15 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
new file mode 100644
index 00000000000..da525c0bdcc
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -0,0 +1,271 @@
+import json
+import os
+import subprocess
+import tempfile
+from dataclasses import dataclass, replace
+from pathlib import Path
+
+import torch
+from omegaconf import DictConfig
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM
+
+from modelopt.torch.puzzletron.anymodel.converter import Converter
+from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
+from modelopt.torch.puzzletron.block_config import (
+    AttentionConfig,
+    BlockConfig,
+    FFNConfig,
+    SubblockConfig,
+)
+
+
+def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig:
+    return BlockConfig(
+        attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads),
+        ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None),
+        parallel_blocks=None,
+    )
+
+
+def create_benchmark_model(
+    vocab_size: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    prefill_seq_len: int,
+    generation_seq_len: int,
+    block_config: BlockConfig | None,
+    repeat_block_n_times: int = 10,
+) -> LlamaForCausalLM:
+
+    block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)]
+
+    if block_config:
+        block_configs.extend([block_config] * repeat_block_n_times)
+
+    model_config = LlamaConfig(
+        max_position_embeddings=prefill_seq_len + generation_seq_len,
+        vocab_size=vocab_size,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        num_hidden_layers=len(block_configs),
+        head_dim=None,  # Compute from hidden_size // num_attention_heads instead of using default 128
+        # this is required for trt-llm conversion to know which model classes to use for the checkpoint
+        auto_map={
+            "AutoConfig": "transformers.models.llama.configuration_llama.LlamaConfig",
+            "AutoModelForCausalLM": "transformers.models.llama.modeling_llama.LlamaForCausalLM",
+        },
+    )
+
+    for idx, block_config in enumerate(block_configs):
+        block_configs[idx] = block_config.to_dict()
+    model_config.block_configs = block_configs
+
+    with deci_x_patcher(LlamaModelDescriptor, block_configs):
+        model = AutoModelForCausalLM.from_config(model_config)
+
+    model.config.architectures = ["AnyModel"]
+    model.config.base_architecture = "LlamaForCausalLM"
+
+    return model
+
+
+def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
+
+    # Save standard model checkpoint (as safetensors, HF format)
+    model.save_pretrained(output_dir, safe_serialization=True)
+
+    # Convert/slice weights into AnyModel subblock_safetensors format
+    Converter.convert_model_weights(
+        input_dir=output_dir,
+        output_dir=output_dir,
+        descriptor=descriptor,
+        num_hidden_layers=num_hidden_layers,
+    )
+    # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
+
+    config_path = output_dir / "config.json"
+    if config_path.exists():
+        with open(config_path) as f:
+            config_data = json.load(f)
+        config_data["architectures"] = ["AnyModel"]
+        with open(config_path, "w") as f:
+            json.dump(config_data, f, indent=2)
+
+
+def save_model(
+    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
+) -> None:
+
+    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
+    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer.save_pretrained(output_path)
+
+
+@dataclass(frozen=True)
+class RuntimeConfig:
+    vocab_size: int
+    hidden_size: int
+    num_attention_heads: int
+    master_puzzle_dir: str
+    tokenizer_path: str
+    synth_dataset_num_requests: int
+    repeat_block_n_times: int
+    prefill_seq_len: int
+    generation_seq_len: int
+    batch_size: int
+    num_iters: int
+    num_warmup_iters: int
+
+
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
+
+    output_json_path = model_path / "vllm_latency_benchmark.json"
+
+    cmd = [
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        str(model_path),
+        "--input-len",
+        str(runtime_config.prefill_seq_len),
+        "--output-len",
+        str(runtime_config.generation_seq_len),
+        "--batch-size",
+        str(runtime_config.batch_size),
+        "--output-json",
+        str(output_json_path),
+        "--max-model-len",
+        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
+        "--num-iters-warmup",
+        str(runtime_config.num_warmup_iters),
+        "--num-iters",
+        str(runtime_config.num_iters),
+        "--max-num-seqs",
+        "1",
+        "--distributed-executor-backend",
+        "external_launcher",
+        "--tensor-parallel-size",
+        "1",
+        "--pipeline-parallel-size",
+        "1",
+    ]
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+    subprocess.run(cmd, check=True)  # raise if the benchmark process fails
+
+    with open(output_json_path) as f:
+        vllm_results = json.load(f)
+    print(vllm_results)
+    return vllm_results["avg_latency"] * 1000  # convert to milliseconds
+
+
+def calc_subblock_runtime(
+    runtime_config: RuntimeConfig,
+    subblock_config: SubblockConfig,
+) -> float:
+
+    block_config: BlockConfig | None = None
+
+    if subblock_config is not None:
+        if isinstance(subblock_config, BlockConfig):
+            block_config = subblock_config
+        elif isinstance(subblock_config, (AttentionConfig, FFNConfig)):
+            block_config = subblock_config.to_blockconfig()
+        else:
+            raise TypeError(f"Runtime stats: Not supported subblock type: {subblock_config}")
+
+    model = create_benchmark_model(
+        runtime_config.vocab_size,
+        runtime_config.hidden_size,
+        runtime_config.num_attention_heads,
+        runtime_config.prefill_seq_len,
+        runtime_config.generation_seq_len,
+        block_config=block_config,
+        repeat_block_n_times=runtime_config.repeat_block_n_times,
+    )
+    with tempfile.TemporaryDirectory() as model_tmpdir:
+        save_model(
+            model,
+            Path(runtime_config.tokenizer_path),
+            Path(model_tmpdir),
+            num_hidden_layers=model.config.num_hidden_layers,  # actual layer count
+        )
+        subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
+
+    return subblock_total_runtime_ms
+
+
+def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
+
+    runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
+    runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
+
+    block_config = _make_standard_block_config(
+        runtime_config.hidden_size, runtime_config.num_attention_heads
+    )
+
+    runtime_ms1 = calc_subblock_runtime(runtime_config1, None)
+    runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config)
+
+    no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9
+
+    return no_block_runtime_ms
+
+
+def calc_runtime_for_subblocks(
+    subblock_config_set: set[SubblockConfig],
+    runtime_stats_config: DictConfig,
+    vocab_size: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    master_puzzle_dir: str,
+    tokenizer_path: str,
+    synth_dataset_num_requests: int,
+    prefill_seq_len: int,
+    generation_seq_len: int,
+) -> tuple[dict[SubblockConfig, float], float]:
+
+    repeat_block_n_times = 10
+    runtime_config = RuntimeConfig(
+        vocab_size,
+        hidden_size,
+        num_attention_heads,
+        master_puzzle_dir,
+        tokenizer_path,
+        synth_dataset_num_requests,
+        repeat_block_n_times,
+        prefill_seq_len,
+        generation_seq_len,
+        runtime_stats_config.get("batch_size", 1),
+        runtime_stats_config.get("num_iters", 30),
+        runtime_stats_config.get("num_warmup_iters", 10),
+    )
+
+    runtime_by_subblock_dict = {}
+
+    baseline_runtime_ms = calc_subblock_runtime(runtime_config, None)
+
+    for subblock_config in tqdm(
+        sorted(subblock_config_set),
+        desc=(
+            f"Computing runtime_by_subblock_dict [hidden_size={hidden_size}, "
+            f"num_subblocks={len(subblock_config_set)}]"
+        ),
+    ):
+        if subblock_config.no_op:
+            total_runtime_ms = 0.0
+        else:
+            subblock_total_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config)
+            total_runtime_ms = (
+                subblock_total_runtime_ms - baseline_runtime_ms
+            ) / repeat_block_n_times
+
+        runtime_by_subblock_dict[subblock_config] = total_runtime_ms
+
+    no_block_runtime_ms = calc_no_block_runtime(runtime_config)
+
+    return runtime_by_subblock_dict, no_block_runtime_ms
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
similarity index 97%
rename from modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
rename to modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
index d893eb55bb3..3938bb55596 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
+++ b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
@@ -31,16 +31,16 @@
 import torch
 from transformers import PretrainedConfig
 
-from ..anymodel.model_descriptor import ModelDescriptor
-from ..block_config import (
+from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor
+from modelopt.torch.puzzletron.block_config import (
     AttentionConfig,
     BlockConfig,
     FFNConfig,
     MambaConfig,
     maybe_cast_block_configs,
 )
-from ..tools.checkpoint_utils_hf import init_model_from_config
-from ..utils.misc import (
+from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import init_model_from_config
+from modelopt.torch.puzzletron.utils.misc import (
     EmptyInitOnDevice,
     calculate_kv_dim,
     raise_unknown_subblock_config_error,
diff --git a/modelopt/torch/puzzletron/subblock_stats/__init__.py b/modelopt/torch/puzzletron/subblock_stats/__init__.py
index fbbeb3ff709..4964dba0cfa 100644
--- a/modelopt/torch/puzzletron/subblock_stats/__init__.py
+++ b/modelopt/torch/puzzletron/subblock_stats/__init__.py
@@ -15,5 +15,4 @@
 
 """Subblock statistics collection for Puzzletron."""
 
-from .calc_subblock_params_and_memory import *
 from .calc_subblock_stats import *
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index dc89a1f6450..f36a71710a3 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -24,7 +24,7 @@
 from functools import partial
 from itertools import product
 from pathlib import Path
-from typing import Iterable, Optional, Type, TypeVar
+from typing import Iterable, Type, TypeVar
 
 import pandas as pd
 import torch
@@ -41,7 +41,7 @@
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
 from ..utils.parsing import format_global_config
-from .calc_subblock_params_and_memory import (
+from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
     calc_subblock_active_params,
     calculate_non_block_memory,
     calculate_non_block_params,
@@ -52,7 +52,6 @@
 __all__ = [
     "calculate_subblock_stats",
     "launch_calc_subblock_stats",
-    "add_int8_runtime_estimates",
 ]
 
 # Type variable for dataclasses
@@ -60,10 +59,10 @@
 
 """
 Usage:
-python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ]
+python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --runtime_stats ]
 
---benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime,
-  only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker.
+--runtime_stats not set (the default) means that the code won't benchmark runtime,
+  only memory stats will be calculated. If you want to benchmark runtime, run inside a trtllm docker.
 
 """
 
@@ -82,7 +81,7 @@ def calculate_subblock_stats(
     n_embd: int,
     n_head: int,
     vocab_size: int,
-    benchmark_iterations: Optional[int],
+    runtime_stats_enabled: bool,
     use_cuda_graph: bool,
     weights_dtype: torch.dtype,
     activations_dtype: torch.dtype,
@@ -90,14 +89,12 @@ def calculate_subblock_stats(
     allocate_prefill_query: bool,
     moe_stats_file: str | Path | None = None,
 ) -> dict:
-    is_calc_runtime = benchmark_iterations is not None
-    if is_calc_runtime:
-        raise NotImplementedError("Runtime stats calculation is not implemented yet")
+    if runtime_stats_enabled:
+        from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks
 
     gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name()
     subblock_stats = {
         "args": dict(
-            is_calc_runtime=is_calc_runtime,
             gpu=gpu,
             batch_size=batch_size,
             prefill_seq_len=prefill_seq_len,
@@ -106,7 +103,7 @@ def calculate_subblock_stats(
             n_embd=n_embd,
             n_head=n_head,
             vocab_size=vocab_size,
-            benchmark_iterations=benchmark_iterations,
+            runtime_stats=runtime_stats_enabled,
             use_cuda_graph=use_cuda_graph,
             weights_dtype=str(weights_dtype),
             activations_dtype=str(activations_dtype),
@@ -116,8 +113,7 @@ def calculate_subblock_stats(
         "subblocks": list(),
     }
     # Compute runtime stats for unique subblocks only
-    if is_calc_runtime:
-        raise NotImplementedError("Runtime stats calculation is not implemented yet")
+    if runtime_stats_enabled:
         subblock_configs_nolayerindex = set(
             [subblock_config["subblock_config"] for subblock_config in subblock_configs]
         )
@@ -127,16 +123,19 @@ def calculate_subblock_stats(
         synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get(
             "synth_dataset_num_requests", 200
         )
-        backend = calc_subblock_stats_config.get("runtime_stats", {}).get("backend", "trt_torch")
-        runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_ms_for_subblocks(
+        runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {})
+
+        runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks(
             subblock_configs_nolayerindex,
+            runtime_stats_config,
             vocab_size,
             n_embd,
             n_head,
             master_puzzle_dir,
             teacher_dir,
             synth_dataset_num_requests,
-            backend,
+            prefill_seq_len,
+            generation_seq_len,
         )
 
     sorted_subblock_config = sorted(
@@ -144,7 +143,7 @@ def calculate_subblock_stats(
     )
     it = (
         tqdm(sorted_subblock_config, desc="Measuring subblock runtimes")
-        if is_calc_runtime
+        if runtime_stats_enabled
         else sorted_subblock_config
     )
     for subblock_config_indexed in it:
@@ -156,7 +155,7 @@ def calculate_subblock_stats(
             descriptor.get_language_model_config(layer_model_config), parent_layer_indices[0]
         )
 
-        if is_calc_runtime:
+        if runtime_stats_enabled:
             total_runtime_ms = runtime_by_subblock_dict[subblock_config]
             prefill_runtime_ms = None
             decode_runtime_ms = None
@@ -207,25 +206,13 @@ def calculate_subblock_stats(
             }
         )
 
-    if is_calc_runtime:
-        # TODO: fix
-        # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms
-        # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \
-        #     measure_non_block_runtime_ms(batch_size, prefill_seq_len, generation_seq_len, n_embd, vocab_size,
-        #                                  benchmark_iterations, use_cuda_graph)
-        embedding_runtime_ms, lm_head_runtime_ms = None, None
-    else:
-        non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = None, None, None
+    if not runtime_stats_enabled:
+        non_block_runtime_ms = None
     non_block_memory = calculate_non_block_memory(n_embd, vocab_size, weights_dtype)
     non_block_params = calculate_non_block_params(n_embd, vocab_size)
 
-    # TODO
-    # the semantics here is wrong why do we refer, prefill_runtime_ms as embedding_runtime_ms and lm_head_runtime_ms as decode_runtime_ms ?
-    # Prefill is the first the user prompt inference, and Decode refer to the next generation process. both processes use all the model layers.
     subblock_stats["non_block"] = {
         "runtime_ms": non_block_runtime_ms,
-        "prefill_runtime_ms": embedding_runtime_ms,
-        "decode_runtime_ms": lm_head_runtime_ms,
         "memory_mib": non_block_memory,
         "num_params": non_block_params,
     }
@@ -256,7 +243,9 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None:
         num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None),
         prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size,
         allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False),
-        benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None),
+        runtime_stats_enabled=cfg.calc_subblock_stats.get("runtime_stats", {}).get(
+            "enabled", False
+        ),
         merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats,
         subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename,
         moe_stats_filename=cfg.calc_subblock_stats.moe_stats_filename,
@@ -276,9 +265,7 @@ def calculate_subblock_stats_for_puzzle_dir(
     num_active_tokens_override: int | None = None,
     prefill_queue_size: int = 0,  # it's an infery-llm thing
     allocate_prefill_query: bool = False,
-    benchmark_iterations: (
-        int | None
-    ) = None,  # If set then compute runtime performance statistics. TODO: recommend default value, is 1000 good?
+    runtime_stats_enabled: bool = False,  # Compute runtime statistics.
     merge_with_existing_stats: bool = False,
     subblock_stats_filename: str = "subblock_stats.json",
     moe_stats_filename: str = "moe_stats.json",
@@ -344,8 +331,8 @@ def calculate_subblock_stats_for_puzzle_dir(
         if num_active_tokens_override is not None:
             prefill_seq_len = generation_seq_len = int(num_active_tokens_override / batch_size / 2)
 
-        curr_benchmark_iterations = (
-            benchmark_iterations if weights_dtype == torch.bfloat16 else None
+        curr_runtime_stats_enabled = (
+            runtime_stats_enabled if weights_dtype == torch.bfloat16 else False
         )
 
         curr_subblock_stats = calculate_subblock_stats(
@@ -362,7 +349,7 @@ def calculate_subblock_stats_for_puzzle_dir(
             n_embd=model_hidden_size,
             n_head=lm_config.num_attention_heads,
             vocab_size=lm_config.vocab_size,
-            benchmark_iterations=curr_benchmark_iterations,
+            runtime_stats_enabled=curr_runtime_stats_enabled,
             use_cuda_graph=True,
             weights_dtype=weights_dtype,
             activations_dtype=activations_dtype,
@@ -378,8 +365,6 @@ def calculate_subblock_stats_for_puzzle_dir(
 
         subblock_stats.append(curr_subblock_stats)
 
-    # TODO fix: add_int8_runtime_estimates(subblock_stats)
-
     json_dump(subblock_stats, subblock_stats_file)
 
     mprint(subblock_stats_file)
@@ -503,65 +488,3 @@ def _dataclass_from_dict(
     raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}")
 
 
-def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None:
-    for curr_subblock_stats in subblock_stats:
-        args = curr_subblock_stats["args"]
-        if args["weights_dtype"] == "torch.int8":
-            assert args["activations_dtype"] == "torch.int8"
-            ffn_factor = 0.5
-            attention_factor = 0.5 if args["kv_cache_dtype"] == "torch.int8" else 0.8
-
-            bf16_stats = _find_corresponding_bf16_stats(args, subblock_stats)
-            if bf16_stats is not None:
-                curr_subblocks = curr_subblock_stats["subblocks"] + [
-                    curr_subblock_stats["non_block"]
-                ]
-                bf16_subblocks = bf16_stats["subblocks"] + [bf16_stats["non_block"]]
-                for curr_subblock, bf16_subblock in zip(curr_subblocks, bf16_subblocks):
-                    assert curr_subblock.get("subblock_config", None) == bf16_subblock.get(
-                        "subblock_config", None
-                    )
-                    is_attention = False
-                    if (subblock_config := curr_subblock.get("subblock_config")) is not None:
-                        if hasattr(subblock_config, "__dataclass_fields__"):
-                            subblock_config = dataclasses.asdict(subblock_config)
-                        is_attention = subblock_config.get("num_key_value_heads", None) is not None
-                    runtime_factor = attention_factor if is_attention else ffn_factor
-                    for stat_name, stat_value in bf16_subblock.items():
-                        if "runtime" in stat_name:
-                            curr_subblock[stat_name] = stat_value * runtime_factor
-
-
-def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> dict | None:
-    scenario_keys = [
-        "batch_size",
-        "prefill_seq_len",
-        "generation_seq_len",
-        "prefill_queue_size",
-        "gpu",
-        "n_embd",
-        "n_head",
-        "vocab_size",
-    ]
-    corresponding_bf16_args = {
-        **{k: v for k, v in args.items() if k in scenario_keys},
-        "is_calc_runtime": True,
-        "weights_dtype": "torch.bfloat16",
-        "activations_dtype": "torch.bfloat16",
-        "kv_cache_dtype": "torch.bfloat16",
-    }
-    matching_bf16_stats = [
-        stats
-        for stats in subblock_stats
-        if all(
-            [
-                stats["args"][key] == corresponding_bf16_args[key]
-                for key in corresponding_bf16_args.keys()
-            ]
-        )
-    ]
-    if len(matching_bf16_stats) == 0:
-        return None
-    if len(matching_bf16_stats) == 1:
-        return matching_bf16_stats[0]
-    raise ValueError(f"Found more than 1 matching bf16 stats for {args=}")

From 3041dc2d63183d55de7c2e1fac014f1639b62883 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 28 Apr 2026 14:49:10 -0700
Subject: [PATCH 02/35] done ruff formatting and docstrings

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/__init__.py |   8 +
 .../nas/subblock_stats/calc_runtime_stats.py  |  38 +++-
 .../calc_subblock_params_and_memory.py        | 171 ++++++++++++++++--
 3 files changed, 190 insertions(+), 27 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
index ff8d16685d6..aeac903f8f4 100644
--- a/modelopt/torch/nas/subblock_stats/__init__.py
+++ b/modelopt/torch/nas/subblock_stats/__init__.py
@@ -12,4 +12,12 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Subblock runtime statistics API for ModelOpt NAS.
+
+This module provides utilities for measuring and calculating runtime statistics
+of subblocks (e.g., Attention, FFN) within transformer architectures.
+
+Primary API:
+    - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations
+"""
 from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index da525c0bdcc..d3b997f4525 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -1,3 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# mypy: ignore-errors
+
+"""Runtime statistics calculation for NAS subblock benchmarking via vLLM."""
+
 import json
 import os
 import subprocess
@@ -38,7 +56,7 @@ def create_benchmark_model(
     block_config: BlockConfig | None,
     repeat_block_n_times: int = 10,
 ) -> LlamaForCausalLM:
-
+    """Build a small Llama model with repeated subblocks for latency benchmarking."""
     block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)]
 
     if block_config:
@@ -58,8 +76,8 @@ def create_benchmark_model(
         },
     )
 
-    for idx, block_config in enumerate(block_configs):
-        block_configs[idx] = block_config.to_dict()
+    for idx, bc in enumerate(block_configs):
+        block_configs[idx] = bc.to_dict()
     model_config.block_configs = block_configs
 
     with deci_x_patcher(LlamaModelDescriptor, block_configs):
@@ -72,7 +90,7 @@ def create_benchmark_model(
 
 
 def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
-
+    """Save a model checkpoint in AnyModel subblock-safetensors format."""
     # Save standard model checkpoint (as safetensors, HF format)
     model.save_pretrained(output_dir, safe_serialization=True)
 
@@ -97,7 +115,7 @@ def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layer
 def save_model(
     model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
 ) -> None:
-
+    """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
     model.to(dtype=torch.bfloat16).save_pretrained(output_path)
     save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
 
@@ -107,6 +125,8 @@ def save_model(
 
 @dataclass(frozen=True)
 class RuntimeConfig:
+    """Configuration for a vLLM latency benchmark run."""
+
     vocab_size: int
     hidden_size: int
     num_attention_heads: int
@@ -122,7 +142,7 @@ class RuntimeConfig:
 
 
 def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
-
+    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
     output_json_path = model_path / "vllm_latency_benchmark.json"
 
     cmd = [
@@ -167,7 +187,7 @@ def calc_subblock_runtime(
     runtime_config: RuntimeConfig,
     subblock_config: SubblockConfig,
 ) -> float:
-
+    """Measure total runtime of a repeated subblock via vLLM latency benchmark."""
     block_config: BlockConfig | None = None
 
     if subblock_config is not None:
@@ -200,7 +220,7 @@ def calc_subblock_runtime(
 
 
 def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
-
+    """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
     runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
     runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
 
@@ -228,7 +248,7 @@ def calc_runtime_for_subblocks(
     prefill_seq_len: int,
     generation_seq_len: int,
 ) -> tuple[dict[SubblockConfig, float], float]:
-
+    """Benchmark each unique subblock and return per-subblock runtimes and no-block overhead."""
     repeat_block_n_times = 10
     runtime_config = RuntimeConfig(
         vocab_size,
diff --git a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
index 3938bb55596..abe7a1a3884 100644
--- a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
+++ b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
@@ -25,7 +25,6 @@
 import json
 import math
 from pathlib import Path
-from typing import Type
 
 import numpy as np
 import torch
@@ -48,16 +47,16 @@
 )
 
 __all__ = [
-    "calculate_subblock_memory",
-    "calculate_subblock_params",
     "calc_subblock_active_params",
-    "load_moe_stats",
-    "estimate_num_active_experts",
+    "calculate_ffn_memory",
     "calculate_mamba_memory",
     "calculate_mamba_state_size",
-    "calculate_ffn_memory",
     "calculate_non_block_memory",
     "calculate_non_block_params",
+    "calculate_subblock_memory",
+    "calculate_subblock_params",
+    "estimate_num_active_experts",
+    "load_moe_stats",
 ]
 
 
@@ -73,9 +72,29 @@ def calculate_subblock_memory(
     kv_cache_dtype: torch.dtype,
     allocate_prefill_query: bool,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
 ) -> float | dict[str, float]:
-    """``model_config`` / ``descriptor`` are required (puzzletron-style); FFN uses them for meta init."""
+    """Calculate the memory usage of a single subblock (FFN or Attention).
+
+    Given its configuration and runtime dimensions, returns a MiB total or a detailed dict.
+
+    Args:
+        subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass.
+        batch_size (int): Batch size for memory estimate.
+        prefill_seq_len (int): Sequence length for prefill phase.
+        generation_seq_len (int): Sequence length for generation phase (token-by-token).
+        prefill_queue_size (int): Token queue size for prefill attention memory allocation.
+        n_embd (int): Embedding (hidden) dimension.
+        n_head (int): Number of attention heads (used for non-FFN).
+        weights_dtype (torch.dtype): PyTorch dtype for model weights.
+        kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache.
+        allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens.
+        model_config (PretrainedConfig): HuggingFace-style config instance describing the model.
+        descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types).
+
+    Returns:
+        float | dict[str, float]: Memory usage in MiB (float), or a dictionary by memory type.
+    """
     if subblock_config.no_op:
         return 0
     if isinstance(subblock_config, FFNConfig):
@@ -116,7 +135,7 @@ def calculate_subblock_memory(
 def calculate_subblock_params(
     config: PretrainedConfig,
     layer_config: BlockConfig | FFNConfig | AttentionConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
 ) -> int:
     """Count parameters on one meta decoder layer.
 
@@ -124,9 +143,7 @@ def calculate_subblock_params(
     ``hybrid_override_pattern``) before passing ``config``; see
     ``ModelDescriptor.truncate_pattern_for_subblock``.
     """
-    if isinstance(layer_config, FFNConfig):
-        block_config = layer_config.to_blockconfig()
-    elif isinstance(layer_config, AttentionConfig):
+    if isinstance(layer_config, (FFNConfig, AttentionConfig)):
         block_config = layer_config.to_blockconfig()
     else:
         block_config = layer_config
@@ -189,12 +206,31 @@ def calculate_subblock_params(
 def calc_subblock_active_params(
     sublayer_config: FFNConfig | AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     n_embd: int,
     moe_stats_file: str,
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE).
+
+    For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters.
+    For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters
+    per batch by leveraging expert activation statistics (from a given stats file) and calculating
+    the expected number of active experts, then multiplies by the number of parameters per expert.
+
+    Args:
+        sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig).
+        model_config: The Hugging Face model configuration.
+        descriptor: The ModelDescriptor class corresponding to this model family.
+        n_embd: The embedding size (hidden dimension).
+        moe_stats_file: Path to file containing expert activation probabilities.
+        batch_size: The batch size used for the estimate.
+        block_idx: The index of the block/subblock within the network, used to index into the stats.
+
+    Returns:
+        int: The expected number of "active" parameters for the given subblock.
+    """
     if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe):
         return calculate_subblock_params(model_config, sublayer_config, descriptor)
     return estimate_moe_active_params(
@@ -203,14 +239,45 @@ def calc_subblock_active_params(
 
 
 def load_moe_stats(stats_file: str) -> dict:
+    """Load MoE (Mixture-of-Experts) routing statistics from a file.
+
+    This function reads a JSON file containing expert activation probabilities or counts for each MoE block.
+    It returns the normalized probability distributions over experts for each block, as a list of numpy arrays.
+
+    Args:
+        stats_file (str): Path to the JSON file containing expert routing statistics for each block.
+
+    Returns:
+        list[np.ndarray]: A list where each element is a numpy array containing the normalized probability
+            distribution over experts for the corresponding block. If a block's expert list is empty,
+            its entry is 0.
+    """
     with open(stats_file) as f:
         stats = json.load(f)
-    return [np.array(l) / np.sum(l) if len(l) > 0 else 0 for l in stats]
+    return [
+        np.array(expert_probs) / np.sum(expert_probs) if len(expert_probs) > 0 else 0
+        for expert_probs in stats
+    ]
 
 
 def estimate_num_active_experts(
     dist_over_experts: np.ndarray, batch_size: int, num_experts: int
 ) -> int:
+    """Estimate the expected number of active experts in a Mixture-of-Experts (MoE) layer.
+
+    This function computes the expected number of unique experts that are selected at least once when performing
+    inference with a given batch size. It assumes, for each input in the batch, an expert is chosen with probability
+    given by `dist_over_experts` (typically a vector of probabilities for each expert). For a batch of size B, the
+    expected number of active (i.e., selected at least once) experts is computed.
+
+    Args:
+        dist_over_experts (np.ndarray): A 1D array of probabilities for each expert.
+        batch_size (int): The number of samples in the batch.
+        num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter).
+
+    Returns:
+        int: The expected number of experts selected at least once across the batch.
+    """
     # cut the tail and renormalize
     dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts]
     dist_over_experts = dist_over_experts / (dist_over_experts.sum())
@@ -226,6 +293,18 @@ def estimate_moe_active_params(
     batch_size: int,
     block_idx: int,
 ) -> int:
+    """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock.
+
+    Args:
+        subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured).
+        n_embd (int): The embedding dimension (input and output size per expert).
+        moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts.
+        batch_size (int): Batch size to simulate/extrapolate expected expert use.
+        block_idx (int): The index of the block/layer whose expert routing statistics should be used.
+
+    Returns:
+        int: Estimated number of parameters actively used for the current batch and expert selection statistics.
+    """
     assert Path(moe_stats_file).exists()
     # if not Path(moe_stats_file).exists(): # if path is not provided, should we assume uniform distribution?
     #     return calculate_subblock_params(subblock_config, n_embd, n_head=None)
@@ -255,7 +334,7 @@ def estimate_moe_active_params(
 def calculate_attention_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     prefill_seq_len: int,
     generation_seq_len: int,
@@ -267,6 +346,7 @@ def calculate_attention_memory(
     allocate_prefill_query: bool,
 ) -> dict[str, float]:
     """allocate_prefill_query: infery-llm style.
+
     Infery used a unified Wqkv matrix, so before extracting the kv-cache,
     the query also had to be kept in-memory, once per layer.
     """
@@ -294,11 +374,25 @@ def calculate_attention_memory(
 def calculate_mamba_memory(
     attention_config: AttentionConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     batch_size: int,
     weights_dtype: torch.dtype,
     kv_cache_dtype: torch.dtype,
 ) -> int:
+    """Calculate memory usage (MiB) for a Mamba attention subblock.
+
+    Args:
+        attention_config (AttentionConfig): Mamba attention configuration,
+            including Mamba-specific settings.
+        model_config (PretrainedConfig): Model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        batch_size (int): Batch size for memory estimate.
+        weights_dtype (torch.dtype): Data type for model weights.
+        kv_cache_dtype (torch.dtype): Data type for state/kv-cache.
+
+    Returns:
+        int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock.
+    """
     assert attention_config.mamba is not None
     mamba_config = attention_config.mamba
     num_params = calculate_subblock_params(model_config, attention_config, descriptor)
@@ -312,7 +406,16 @@ def calculate_mamba_state_size(
     mamba_config: MambaConfig,
     batch_size: int,
 ) -> int:
-    d_inner, in_proj_dim, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
+    """Calculate the total state size for a Mamba attention subblock.
+
+    Args:
+        mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters.
+        batch_size (int): Batch size to estimate the memory/state requirements for.
+
+    Returns:
+        int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state.
+    """
+    _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
     conv_state_size = math.prod((batch_size, conv_dim, kernel_size))
     ssm_state_size = math.prod(
         (batch_size, mamba_config.num_heads, mamba_config.head_dim, mamba_config.state_dim)
@@ -333,10 +436,23 @@ def _calculate_mamba_intermediates(mamba_config: MambaConfig) -> tuple[int, ...]
 def calculate_ffn_memory(
     ffn_config: FFNConfig,
     model_config: PretrainedConfig,
-    descriptor: Type[ModelDescriptor],
+    descriptor: type[ModelDescriptor],
     weights_dtype: torch.dtype | str,
     experts_dtype: torch.dtype | str | None = None,
 ) -> float:
+    """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock.
+
+    Args:
+        ffn_config (FFNConfig): FFN configuration for the block.
+        model_config (PretrainedConfig): The parent model configuration.
+        descriptor (type[ModelDescriptor]): Model descriptor class.
+        weights_dtype (torch.dtype | str): Data type for FFN weights.
+        experts_dtype (torch.dtype | str | None, optional): Data type for expert weights
+            (for MoE layers, if present). Defaults to None.
+
+    Returns:
+        float: Estimated FFN memory usage in mebibytes (MiB).
+    """
     # TODO: How to separate between expert weights and the rest for any model (same as puzzletron).
     num_params = calculate_subblock_params(model_config, ffn_config, descriptor)
     return num_params * sizeof_dtype(weights_dtype) / 2**20
@@ -347,6 +463,16 @@ def calculate_non_block_memory(
     vocab_size: int,
     weight_dtype: torch.dtype,
 ) -> float:
+    """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+        weight_dtype (torch.dtype): Data type for model weights.
+
+    Returns:
+        float: Estimated non-subblock memory usage in mebibytes (MiB).
+    """
     return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20
 
 
@@ -354,4 +480,13 @@ def calculate_non_block_params(
     n_embd: int,
     vocab_size: int,
 ) -> int:
+    """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection).
+
+    Args:
+        n_embd (int): Embedding dimension (hidden size).
+        vocab_size (int): Vocabulary size.
+
+    Returns:
+        int: Estimated non-subblock parameter count.
+    """
     return vocab_size * n_embd * 2 + n_embd

From a36375017352cc8daf61e2dbb8f46eb96ec03cdb Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 4 May 2026 02:41:13 -0700
Subject: [PATCH 03/35] distributed timeout is configurable

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/main.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py
index 8ceed378318..ba1c19d12c5 100644
--- a/examples/puzzletron/main.py
+++ b/examples/puzzletron/main.py
@@ -68,7 +68,20 @@ def run_full_puzzletron(hydra_config_path: str):
         config_path: Path to the YAML configuration file
     """
     mtpz.tools.mprint("Puzzletron Progress 1/8: starting puzzletron pipeline")
-    dist.setup(timeout=timedelta(minutes=10))
+    # Read the raw Hydra YAML to pick up an optional dist_timeout_minutes override for dist.setup
+    from omegaconf import OmegaConf
+
+    # Resolve absolute path for Hydra config
+    hydra_config_path = Path(hydra_config_path).resolve()
+    hydra_config = OmegaConf.load(str(hydra_config_path))
+
+    # Default timeout: 10 minutes, or extended to dist_timeout_minutes if set in config
+    if hasattr(hydra_config, "dist_timeout_minutes"):
+        timeout_minutes = timedelta(minutes=hydra_config.dist_timeout_minutes)
+    else:
+        timeout_minutes = timedelta(minutes=10)
+    mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}")
+    dist.setup(timeout=timeout_minutes)
 
     # Register Hydra custom resolvers (needed for config resolution)
     mtpz.tools.register_hydra_resolvers()

From 53a2caf8215b698fde5115ba6f7ec10c6802ccc7 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 4 May 2026 14:08:43 -0700
Subject: [PATCH 04/35] added example config for attn pruning and runtime
 constraint

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../Llama-3_1-8B.yaml                         | 109 ++++++++++++++++++
 .../llama-3_1-8B_pruneattn_runtime.yaml       |  29 +++++
 .../pruning/attn_pruning.yaml                 |  23 ++++
 .../pruning/ffn_pruning.yaml                  |  19 +++
 .../pruning/hidden_dim_pruning.yaml           |  15 +++
 .../pruning/pruning_defaults.yaml             |  33 ++++++
 .../validate_model_defaults.yaml              |  17 +++
 .../validate_solutions_defaults.yaml          |  10 ++
 8 files changed, 255 insertions(+)
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml
 create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml
new file mode 100644
index 00000000000..7340938da25
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml
@@ -0,0 +1,109 @@
+defaults:
+  - pruning: ffn_pruning
+  - scoring: ../validate_solutions_defaults
+  - realize_model: ../validate_solutions_defaults
+  - bypass:
+  - override hydra/hydra_logging: disabled
+  - _self_
+
+puzzle_dir: ???
+descriptor: llama
+teacher_dir: ${puzzle_dir}/ckpts/teacher/
+replacement_library_path: ${puzzle_dir}/replacement_library.json
+dataset_path: ??? # path to Nemotron-Post-Training-Dataset-v2
+
+skip_realize_model: false
+
+build_replacement_library:
+  add_ffn_no_ops: true
+  add_attention_no_ops: true
+
+calc_subblock_stats:
+  batch_sizes: [64, 96, 128]
+  prefill_seq_len: 4096
+  generation_seq_len: 4096
+  num_active_tokens_override: # Optional override for sequence lengths
+  prefill_queue_size: 0
+  allocate_prefill_query: false
+  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
+  merge_with_existing_stats: true
+  subblock_stats_filename: "subblock_stats.json"
+  moe_stats_filename: "moe_stats.json"
+  runtime_stats:
+    backend: trt_torch
+
+scoring:
+  descriptor: ${descriptor}
+  solutions_to_validate:
+  skip_existing_solutions: true
+
+  replacement_library_path: ${replacement_library_path}
+  solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json}
+  teacher_dir: ${to_path:${teacher_dir}}
+  output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation
+
+  eval_samples: 8
+  micro_batch_size: 1
+  seed: 42
+  shuffle_seed: 444
+  dataset_path: ${dataset_path}
+
+mip:
+  single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}}
+  subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}}
+  output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions}
+  gathered_metrics_path:
+  puzzle_profile:
+
+  # puzzle_profile:
+  objective: metrics.cosine_embedding_loss_hidden_states
+  bigger_is_better: false
+
+  subblock_stats_args:
+    - batch_size: 96
+      weights_dtype: torch.bfloat16
+      activations_dtype: torch.bfloat16
+      kv_cache_dtype: torch.bfloat16
+
+  report_additional_costs:
+    - stats.memory_mib
+    - stats.num_params
+    - stats.num_kv_heads
+    - stats.has_attention
+    - stats.has_ffn
+    - stats.kv_cache_memory_mib
+    - stats.attention_memory_mib
+    - stats.ffn_memory_mib
+    - stats.ffn_num_params
+    - stats.attention_num_params
+
+  # human_constraints:
+  #   target_memory: 78_000
+  #   num_params: 7_000_000_000
+
+  mip_constraints:
+  metric_overrides:
+  max_seconds_per_solution: 60
+
+realize_model:
+  descriptor: ${descriptor}
+  teacher_dir: ${to_path:${teacher_dir}}
+  tokenizer_name: ${to_path:${teacher_dir}}
+  replacement_library_path: ${replacement_library_path}
+  save_models: true
+  solutions_path: # Filled dynamically
+
+  # Validate params
+  skip_validation: false # To validate the model solution, keep `skip_validation` set to false
+  eval_samples: 128
+  micro_batch_size: 1
+  seed: 42
+  shuffle_seed: 444
+  dataset_path: ${dataset_path}
+
+nccl_timeout_minutes: ${timedelta_minutes:10}
+
+# This section redirects Hydra outputs
+hydra:
+  run:
+    dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S}
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
new file mode 100644
index 00000000000..42e17d627a0
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
@@ -0,0 +1,29 @@
+defaults:
+  - Llama-3_1-8B
+  - override pruning: attn_pruning
+  - _self_
+
+# Input Hugging Face model to compress
+input_hf_model_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/puzzletron/checkpoints/meta-llama/Llama-3.1-8B-Instruct
+
+# Dataset path for pruning and NAS scoring
+dataset_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/datasets/nvidia/Nemotron-Post-Training-Dataset-v2
+
+# Working directory for puzzletron outputs
+puzzle_dir: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/Model-Optimizer/artifacts
+
+dist_timeout_minutes: 60
+
+calc_subblock_stats:
+  runtime_stats:
+    enabled: true
+    synth_dataset_num_requests: 32
+    backend: vllm
+    num_warmup_iters: 2
+    num_iters: 10
+    batch_size: 1
+
+# MIP latency constraint (target_latency, in ms)
+mip:
+  human_constraints:
+    target_latency: 21
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml
new file mode 100644
index 00000000000..53d7e4bd9c6
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml
@@ -0,0 +1,23 @@
+defaults:
+  - pruning_defaults
+
+hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IndependentKvHeadContributionHook}
+
+activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
+
+pruning_mixin:
+  _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn
+  layer_descriptor:
+    _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor
+
+activation_hooks_kwargs:
+  method: independent_kv_head_contribution
+  optimize_for: memory    # IndependentKvHeadContributionHook implementation that consumes less memory
+  target_layer: "self_attn.o_proj"
+  layer_input_descriptors_path:
+
+# n_heads_in_group: 4
+# num_attention_heads: 32       # num query heads
+# num_kv_heads: 32 / 4 = 8      # num_query_heads // n_heads_in_group
+n_heads_in_group_list: [8, 16, 32]      # num_kv_heads = [4, 2, 1]
+gqa_init_mode: "PruneKVHeads"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml
new file mode 100644
index 00000000000..da0b9720700
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml
@@ -0,0 +1,19 @@
+defaults:
+  - pruning_defaults
+
+pruning_mixin:
+  _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn
+  layer_descriptor:
+    _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor
+
+hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IterativeChannelContributionHook}
+
+activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
+
+activation_hooks_kwargs:
+  method: iterative
+  target_layer: "mlp.down_proj"
+  layer_input_descriptors_path:
+
+intermediate_size_list: [3072, 5888, 8704, 11520]  # teacher_intermediate_size is 14336
+mlp_init_mode: "PruneByActivationsLog"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml
new file mode 100644
index 00000000000..407c835d8c4
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml
@@ -0,0 +1,15 @@
+defaults:
+  - pruning_defaults
+
+activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
+
+activation_hooks_kwargs:
+  method: layer_norm_contribution
+  target_layer: "layernorm"
+
+# Hidden dimension pruning specific settings
+hidden_size_list: [3072, 2048]  # Target hidden sizes to prune to
+hidden_size_init_mode: "PruneByChannelRanking"
+mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher
+gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher
+linear_init_mode: "FromTeacher"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml
new file mode 100644
index 00000000000..e05e775bee3
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml
@@ -0,0 +1,33 @@
+defaults:
+  - /validate_model_defaults
+
+descriptor: ${descriptor}
+model_name_or_path: ${teacher_dir}
+experiment_id: ${pruning.eval_samples}samples_diverse_mini
+activations_log_dir: ???
+activation_hooks_kwargs: ???
+
+# Data:
+eval_samples: 1000 # default is 10000
+micro_batch_size: 4
+dataset_path: ${dataset_path}
+val_dataset_name: train
+
+# Prune ckpts
+pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id}
+
+## FFN pruning
+ffn_list:
+mlp_init_mode: "Truncate" # PruneByActivationsLog
+
+## KV-heads pruning
+n_heads_in_group_list:
+gqa_init_mode: "AverageKV"
+
+## Hidden dimension pruning
+hidden_size_list:
+hidden_size_init_mode: "PruneByChannelRanking"
+linear_init_mode: "FromTeacher"
+
+mlp_init_config_yaml:
+  activations_log_dir: ${pruning.activations_log_dir}
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml
new file mode 100644
index 00000000000..6b36142a3a8
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml
@@ -0,0 +1,17 @@
+model_dtype: torch.bfloat16 # dtype to cast the model for validate_model
+autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model
+block_size: 8192
+bos_rate: 0.5
+data_column: messages
+val_dataset_name: validation
+shuffle_seed: 81436
+seed: 42
+fim_rate: 0
+fim_spm_rate: 0
+source_datasets_to_discard:
+varlen: false
+write_results: false
+calc_losses_on_cpu: false
+activations_log_dir:
+model_name_or_path:
+load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn}
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml
new file mode 100644
index 00000000000..ec139023794
--- /dev/null
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml
@@ -0,0 +1,10 @@
+defaults:
+  - /validate_model_defaults
+  - _self_
+
+solutions_to_validate:
+skip_validation: false
+save_models: false
+bigger_is_better: false
+sort_solutions_by:
+calculate_full_score_ablations: false

From dfb905ca3768a53131bc5236b904c17e7d59064d Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 5 May 2026 08:34:32 -0700
Subject: [PATCH 05/35] renamed configs

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md                 | 29 ++++++++++++++++++-
 .../Llama-3_1-8B.yaml                         |  4 ---
 .../llama-3_1-8B_pruneattn_runtime.yaml       | 11 ++++---
 .../pruning/attn_pruning.yaml                 |  0
 .../pruning/ffn_pruning.yaml                  |  0
 .../pruning/hidden_dim_pruning.yaml           |  0
 .../pruning/pruning_defaults.yaml             |  0
 .../validate_model_defaults.yaml              |  0
 .../validate_solutions_defaults.yaml          |  0
 9 files changed, 35 insertions(+), 9 deletions(-)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/Llama-3_1-8B.yaml (97%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/llama-3_1-8B_pruneattn_runtime.yaml (50%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/attn_pruning.yaml (100%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/ffn_pruning.yaml (100%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/hidden_dim_pruning.yaml (100%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/pruning_defaults.yaml (100%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/validate_model_defaults.yaml (100%)
 rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/validate_solutions_defaults.yaml (100%)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 571b40ca499..aeec7fc94dd 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -11,7 +11,7 @@ To use the Puzzle algorithm effectively, we need to specify the target number of
 
 In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. Other supported models should be compressed in a similar way. For GptOss there is one [additional step to be performed](GPTOSS.md).
 
-> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md).
+> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For KV-head pruning see [`llama-3_1-8B_pruneattn_runtime`](./configs/llama-3_1-8B_pruneattn_runtime/) and the [Attention Pruning](#attention-pruning-kv-head-reduction) and [Runtime-Based Latency Optimization](#runtime-based-latency-optimization) sections below. For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md).
 
 ## Environment
 
@@ -343,6 +343,33 @@ See [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) fo
 
 For distillation results on Puzzletron-compressed models, see [examples/pruning/puzzletron/](../pruning/puzzletron/README.md).
 
+## Runtime-Based Latency Optimization
+
+By default, subblock statistics use the `trt_torch` backend with theoretical memory proxies. You can instead enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints:
+
+```yaml
+calc_subblock_stats:
+  runtime_stats:
+    enabled: true
+    synth_dataset_num_requests: 32
+    backend: vllm
+    num_warmup_iters: 2
+    num_iters: 10
+    batch_size: 1
+
+mip:
+  human_constraints:
+    target_latency: 20  # ms
+```
+
+Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly:
+
+```yaml
+dist_timeout_minutes: 60  # default is 10 if omitted
+```
+
+This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout.
+
 ## Advanced Usage
 
 Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios.
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
similarity index 97%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index 7340938da25..bb352598e10 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -77,10 +77,6 @@ mip:
     - stats.ffn_num_params
     - stats.attention_num_params
 
-  # human_constraints:
-  #   target_memory: 78_000
-  #   num_params: 7_000_000_000
-
   mip_constraints:
   metric_overrides:
   max_seconds_per_solution: 60
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
similarity index 50%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
index 42e17d627a0..6eaf5f508b8 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
@@ -1,16 +1,15 @@
 defaults:
   - Llama-3_1-8B
-  - override pruning: attn_pruning
   - _self_
 
 # Input Hugging Face model to compress
-input_hf_model_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/puzzletron/checkpoints/meta-llama/Llama-3.1-8B-Instruct
+input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct
 
 # Dataset path for pruning and NAS scoring
-dataset_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/datasets/nvidia/Nemotron-Post-Training-Dataset-v2
+dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
 
 # Working directory for puzzletron outputs
-puzzle_dir: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/Model-Optimizer/artifacts
+puzzle_dir: /workspace/puzzle_dir
 
 dist_timeout_minutes: 60
 
@@ -27,3 +26,7 @@ calc_subblock_stats:
 mip:
   human_constraints:
     target_latency: 21
+
+# FFN intermediate sizes to search over (heterogeneous architecture)
+pruning:
+  intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
similarity index 100%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml

From e165171e8ed445a6a95ed8a4493f510efac50172 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 6 May 2026 01:57:22 -0700
Subject: [PATCH 06/35] README: change target_latency unit comment from ms to seconds

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index aeec7fc94dd..134790bb011 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -359,7 +359,7 @@ calc_subblock_stats:
 
 mip:
   human_constraints:
-    target_latency: 20  # ms
+    target_latency: 20  # seconds
 ```
 
 Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly:

From d47b69c54964c1a5f62cc97f8f4c53f5ae4a848d Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 6 May 2026 05:25:46 -0700
Subject: [PATCH 07/35] refactor: move runtime helpers into runtime_utils and runtime_vllm

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../nas/subblock_stats/calc_runtime_stats.py  | 142 ++++--------------
 .../torch/nas/subblock_stats/runtime_utils.py |  61 ++++++++
 .../torch/nas/subblock_stats/runtime_vllm.py  |  48 ++++++
 3 files changed, 137 insertions(+), 114 deletions(-)
 create mode 100644 modelopt/torch/nas/subblock_stats/runtime_utils.py
 create mode 100644 modelopt/torch/nas/subblock_stats/runtime_vllm.py

diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index d3b997f4525..cadc7d8c9a6 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -16,19 +16,16 @@
 
 """Runtime statistics calculation for NAS subblock benchmarking via vLLM."""
 
-import json
-import os
-import subprocess
 import tempfile
-from dataclasses import dataclass, replace
+from dataclasses import replace
 from pathlib import Path
 
-import torch
 from omegaconf import DictConfig
 from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM
+from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
 
-from modelopt.torch.puzzletron.anymodel.converter import Converter
+from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig, save_model
+from modelopt.torch.nas.subblock_stats.runtime_vllm import run_vllm_latency_benchmark
 from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
 from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
 from modelopt.torch.puzzletron.block_config import (
@@ -89,98 +86,17 @@ def create_benchmark_model(
     return model
 
 
-def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
-    """Save a model checkpoint in AnyModel subblock-safetensors format."""
-    # Save standard model checkpoint (as safetensors, HF format)
-    model.save_pretrained(output_dir, safe_serialization=True)
-
-    # Convert/slice weights into AnyModel subblock_safetensors format
-    Converter.convert_model_weights(
-        input_dir=output_dir,
-        output_dir=output_dir,
-        descriptor=descriptor,
-        num_hidden_layers=num_hidden_layers,
-    )
-    # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
-
-    config_path = output_dir / "config.json"
-    if config_path.exists():
-        with open(config_path) as f:
-            config_data = json.load(f)
-        config_data["architectures"] = ["AnyModel"]
-        with open(config_path, "w") as f:
-            json.dump(config_data, f, indent=2)
-
-
-def save_model(
-    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
-) -> None:
-    """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
-    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
-    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
-
-    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
-    tokenizer.save_pretrained(output_path)
-
-
-@dataclass(frozen=True)
-class RuntimeConfig:
-    """Configuration for a vLLM latency benchmark run."""
-
-    vocab_size: int
-    hidden_size: int
-    num_attention_heads: int
-    master_puzzle_dir: str
-    tokenizer_path: str
-    synth_dataset_num_requests: int
-    repeat_block_n_times: int
-    prefill_seq_len: int
-    generation_seq_len: int
-    batch_size: int
-    num_iters: int
-    num_warmup_iters: int
-
-
-def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
-    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
-    output_json_path = model_path / "vllm_latency_benchmark.json"
-
-    cmd = [
-        "vllm",
-        "bench",
-        "latency",
-        "--model",
-        str(model_path),
-        "--input-len",
-        str(runtime_config.prefill_seq_len),
-        "--output-len",
-        str(runtime_config.generation_seq_len),
-        "--batch-size",
-        str(runtime_config.batch_size),
-        "--output-json",
-        str(output_json_path),
-        "--max-model-len",
-        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
-        "--num-iters-warmup",
-        str(runtime_config.num_warmup_iters),
-        "--num-iters",
-        str(runtime_config.num_iters),
-        "--max-num-seqs",
-        "1",
-        "--distributed-executor-backend",
-        "external_launcher",
-        "--tensor-parallel-size",
-        "1",
-        "--pipeline-parallel-size",
-        "1",
-    ]
-    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
-    subprocess.run(cmd)
-
-    with open(output_json_path) as f:
-        vllm_results = json.load(f)
-    print(vllm_results)
-    return vllm_results["avg_latency"] * 1000  # convert to milliseconds
+def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float:
+    """Measure total runtime of a model via vLLM latency benchmark."""
+    with tempfile.TemporaryDirectory() as model_tmpdir:
+        save_model(
+            model,
+            Path(runtime_config.tokenizer_path),
+            Path(model_tmpdir),
+            num_hidden_layers=runtime_config.repeat_block_n_times + 1,
+        )
+        model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
+    return model_total_runtime_ms
 
 
 def calc_subblock_runtime(
@@ -207,16 +123,7 @@ def calc_subblock_runtime(
         block_config=block_config,
         repeat_block_n_times=runtime_config.repeat_block_n_times,
     )
-    with tempfile.TemporaryDirectory() as model_tmpdir:
-        save_model(
-            model,
-            Path(runtime_config.tokenizer_path),
-            Path(model_tmpdir),
-            num_hidden_layers=runtime_config.repeat_block_n_times + 1,
-        )
-        subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
-
-    return subblock_total_runtime_ms
+    return calc_model_runtime(model, runtime_config)
 
 
 def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
@@ -267,15 +174,22 @@ def calc_runtime_for_subblocks(
 
     runtime_by_subblock_dict = {}
 
-    baseline_runtime_ms = calc_subblock_runtime(runtime_config, None)
+    baseline_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config=None)
 
     for subblock_config in tqdm(
         sorted(subblock_config_set),
-        desc=(
-            f"Computing runtime_by_subblock_dict [hidden_size={hidden_size}, "
-            f"num_subblocks={len(subblock_config_set)}]"
-        ),
+        desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"),
     ):
+        if isinstance(subblock_config, AttentionConfig):
+            num_key_value_heads = subblock_config.num_key_value_heads
+            desc = f"AttentionConfig(num_key_value_heads={num_key_value_heads})"
+        elif isinstance(subblock_config, FFNConfig):
+            intermediate_size = subblock_config.intermediate_size
+            desc = f"FFNConfig(intermediate_size={intermediate_size})"
+        else:
+            raise ValueError(f"Unsupported subblock type: {type(subblock_config)}")
+        print(f"Computing runtime for subblock: {desc} {subblock_config.no_op=}")
+
         if subblock_config.no_op:
             total_runtime_ms = 0.0
         else:
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
new file mode 100644
index 00000000000..e4eec38e033
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -0,0 +1,61 @@
+import json
+from dataclasses import dataclass
+from pathlib import Path
+
+import torch
+from transformers import AutoTokenizer, LlamaForCausalLM
+
+from modelopt.torch.puzzletron.anymodel.converter import Converter
+from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+
+
+@dataclass(frozen=True)
+class RuntimeConfig:
+    """Configuration for a vLLM latency benchmark run."""
+
+    vocab_size: int
+    hidden_size: int
+    num_attention_heads: int
+    master_puzzle_dir: str
+    tokenizer_path: str
+    synth_dataset_num_requests: int
+    repeat_block_n_times: int
+    prefill_seq_len: int
+    generation_seq_len: int
+    batch_size: int
+    num_iters: int
+    num_warmup_iters: int
+
+
+def save_model(
+    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
+) -> None:
+    """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
+    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
+    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
+
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
+    tokenizer.save_pretrained(output_path)
+
+
+def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
+    """Save a model checkpoint in AnyModel subblock-safetensors format."""
+    # Save standard model checkpoint (as safetensors, HF format)
+    model.save_pretrained(output_dir, safe_serialization=True)
+
+    # Convert/slice weights into AnyModel subblock_safetensors format
+    Converter.convert_model_weights(
+        input_dir=output_dir,
+        output_dir=output_dir,
+        descriptor=descriptor,
+        num_hidden_layers=num_hidden_layers,
+    )
+    # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
+
+    config_path = output_dir / "config.json"
+    if config_path.exists():
+        with open(config_path) as f:
+            config_data = json.load(f)
+        config_data["architectures"] = ["AnyModel"]
+        with open(config_path, "w") as f:
+            json.dump(config_data, f, indent=2)
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
new file mode 100644
index 00000000000..f1c7c99ed0b
--- /dev/null
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -0,0 +1,48 @@
+import json
+import os
+import subprocess
+from pathlib import Path
+
+from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
+
+
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
+    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
+    output_json_path = model_path / "vllm_latency_benchmark.json"
+
+    cmd = [
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        str(model_path),
+        "--input-len",
+        str(runtime_config.prefill_seq_len),
+        "--output-len",
+        str(runtime_config.generation_seq_len),
+        "--batch-size",
+        str(runtime_config.batch_size),
+        "--output-json",
+        str(output_json_path),
+        "--max-model-len",
+        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
+        "--num-iters-warmup",
+        str(runtime_config.num_warmup_iters),
+        "--num-iters",
+        str(runtime_config.num_iters),
+        "--max-num-seqs",
+        "1",
+        "--distributed-executor-backend",
+        "external_launcher",
+        "--tensor-parallel-size",
+        "1",
+        "--pipeline-parallel-size",
+        "1",
+    ]
+    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
+    subprocess.run(cmd)
+
+    with open(output_json_path) as f:
+        vllm_results = json.load(f)
+    print(vllm_results)
+    return vllm_results["avg_latency"] * 1000  # convert to milliseconds

From 12ed46ba2f71813ce9d61380edf1b147524ed461 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Sun, 17 May 2026 07:46:56 -0700
Subject: [PATCH 08/35] WIP: thread num_key_value_heads through runtime stats and cache benchmark runs

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../nas/subblock_stats/calc_runtime_stats.py  | 57 ++++++++++++++++---
 .../torch/nas/subblock_stats/runtime_utils.py |  1 +
 .../torch/nas/subblock_stats/runtime_vllm.py  |  4 +-
 .../subblock_stats/calc_subblock_stats.py     | 39 ++++++-------
 4 files changed, 73 insertions(+), 28 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index cadc7d8c9a6..40b3191dbc4 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -18,6 +18,7 @@
 
 import tempfile
 from dataclasses import replace
+from functools import cache
 from pathlib import Path
 
 from omegaconf import DictConfig
@@ -36,9 +37,9 @@
 )
 
 
-def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig:
+def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig:
     return BlockConfig(
-        attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads),
+        attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads),
         ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None),
         parallel_blocks=None,
     )
@@ -47,6 +48,7 @@ def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> B
 def create_benchmark_model(
     vocab_size: int,
     hidden_size: int,
+    num_key_value_heads: int,
     num_attention_heads: int,
     prefill_seq_len: int,
     generation_seq_len: int,
@@ -54,7 +56,7 @@ def create_benchmark_model(
     repeat_block_n_times: int = 10,
 ) -> LlamaForCausalLM:
     """Build a small Llama model with repeated subblocks for latency benchmarking."""
-    block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)]
+    block_configs = [_make_standard_block_config(hidden_size, num_key_value_heads)]
 
     if block_config:
         block_configs.extend([block_config] * repeat_block_n_times)
@@ -88,17 +90,20 @@ def create_benchmark_model(
 
 def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float:
     """Measure total runtime of a model via vLLM latency benchmark."""
-    with tempfile.TemporaryDirectory() as model_tmpdir:
+    with tempfile.TemporaryDirectory(delete=False) as model_tmpdir:  # delete=True after debugging
+        print(f"|||| Saving model to {model_tmpdir}")
         save_model(
             model,
             Path(runtime_config.tokenizer_path),
             Path(model_tmpdir),
             num_hidden_layers=runtime_config.repeat_block_n_times + 1,
         )
+        print(f"|||| Running vLLM latency benchmark on {model_tmpdir}")
         model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
     return model_total_runtime_ms
 
 
+@cache
 def calc_subblock_runtime(
     runtime_config: RuntimeConfig,
     subblock_config: SubblockConfig,
@@ -110,13 +115,22 @@ def calc_subblock_runtime(
         if isinstance(subblock_config, BlockConfig):
             block_config = subblock_config
         elif isinstance(subblock_config, (AttentionConfig, FFNConfig)):
-            block_config = subblock_config.to_blockconfig()
+            if isinstance(subblock_config, FFNConfig):
+                block_config = BlockConfig(
+                    attention=AttentionConfig(
+                        no_op=False, num_key_value_heads=runtime_config.num_key_value_heads
+                    ),
+                    ffn=subblock_config,
+                )
+            else:
+                block_config = subblock_config.to_blockconfig()
         else:
             raise Exception(f"Runtime stats: Not supported subblock type: {subblock_config}")
 
     model = create_benchmark_model(
         runtime_config.vocab_size,
         runtime_config.hidden_size,
+        runtime_config.num_key_value_heads,
         runtime_config.num_attention_heads,
         runtime_config.prefill_seq_len,
         runtime_config.generation_seq_len,
@@ -126,13 +140,14 @@ def calc_subblock_runtime(
     return calc_model_runtime(model, runtime_config)
 
 
+@cache
 def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
     """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
     runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
     runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
 
     block_config = _make_standard_block_config(
-        runtime_config.hidden_size, runtime_config.num_attention_heads
+        runtime_config.hidden_size, runtime_config.num_key_value_heads
     )
 
     runtime_ms1 = calc_subblock_runtime(runtime_config1, None)
@@ -143,12 +158,30 @@ def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
     return no_block_runtime_ms
 
 
+@cache
+def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float:
+    """Calculate the base runtime of a model with no subblocks."""
+    base_runtime_ms = 0.0
+    if isinstance(subblock_config, AttentionConfig):
+        base_runtime_ms = calc_subblock_runtime(runtime_config, None)
+    elif isinstance(subblock_config, FFNConfig):
+        attn_block_config = AttentionConfig(
+            no_op=False, num_key_value_heads=runtime_config.num_key_value_heads
+        ).to_blockconfig()
+        base_runtime_ms = calc_subblock_runtime(runtime_config, attn_block_config)
+    else:
+        raise ValueError(f"Unsupported subblock type: {type(subblock_config)}")
+
+    return base_runtime_ms
+
+
 def calc_runtime_for_subblocks(
     subblock_config_set: set[SubblockConfig],
     runtime_stats_config: DictConfig,
     vocab_size: int,
     hidden_size: int,
     num_attention_heads: int,
+    num_key_value_heads: int,
     master_puzzle_dir: str,
     tokenizer_path: str,
     synth_dataset_num_requests: int,
@@ -161,6 +194,7 @@ def calc_runtime_for_subblocks(
         vocab_size,
         hidden_size,
         num_attention_heads,
+        num_key_value_heads,
         master_puzzle_dir,
         tokenizer_path,
         synth_dataset_num_requests,
@@ -174,12 +208,15 @@ def calc_runtime_for_subblocks(
 
     runtime_by_subblock_dict = {}
 
-    baseline_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config=None)
-
     for subblock_config in tqdm(
         sorted(subblock_config_set),
         desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"),
     ):
+        print("|||| Calculating baseline runtime")
+        # runtime_config_baseline = replace(runtime_config, repeat_block_n_times=0)
+        baseline_runtime_ms = calc_base_runtime(runtime_config, subblock_config)
+        print(f"|||| {baseline_runtime_ms=}")
+
         if isinstance(subblock_config, AttentionConfig):
             num_key_value_heads = subblock_config.num_key_value_heads
             desc = f"AttentionConfig(num_key_value_heads={num_key_value_heads})"
@@ -193,13 +230,17 @@ def calc_runtime_for_subblocks(
         if subblock_config.no_op:
             total_runtime_ms = 0.0
         else:
+            print("|||| Calculating subblock runtime")
             subblock_total_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config)
+            print(f"|||| {subblock_total_runtime_ms=}")
             total_runtime_ms = (
                 subblock_total_runtime_ms - baseline_runtime_ms
             ) / repeat_block_n_times
 
         runtime_by_subblock_dict[subblock_config] = total_runtime_ms
 
+    print("|||| Calculating no-block runtime")
     no_block_runtime_ms = calc_no_block_runtime(runtime_config)
+    print(f"|||| {no_block_runtime_ms=}")
 
     return runtime_by_subblock_dict, no_block_runtime_ms
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index e4eec38e033..39baf2126f2 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -16,6 +16,7 @@ class RuntimeConfig:
     vocab_size: int
     hidden_size: int
     num_attention_heads: int
+    num_key_value_heads: int
     master_puzzle_dir: str
     tokenizer_path: str
     synth_dataset_num_requests: int
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index f1c7c99ed0b..80d541fb632 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -21,7 +21,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
         "--output-len",
         str(runtime_config.generation_seq_len),
         "--batch-size",
-        str(runtime_config.batch_size),
+        "1", #str(runtime_config.batch_size),
         "--output-json",
         str(output_json_path),
         "--max-model-len",
@@ -38,6 +38,8 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
         "1",
         "--pipeline-parallel-size",
         "1",
+        "--optimization-level",
+        "1",
     ]
     os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
     subprocess.run(cmd)
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index f36a71710a3..a1c0c6d6aa5 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -24,6 +24,7 @@
 from functools import partial
 from itertools import product
 from pathlib import Path
+from pdb import run
 from typing import Iterable, Type, TypeVar
 
 import pandas as pd
@@ -33,6 +34,13 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
+    calc_subblock_active_params,
+    calculate_non_block_memory,
+    calculate_non_block_params,
+    calculate_subblock_memory,
+    calculate_subblock_params,
+)
 from modelopt.torch.utils import json_dump
 
 from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory
@@ -41,13 +49,6 @@
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
 from ..utils.parsing import format_global_config
-from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
-    calc_subblock_active_params,
-    calculate_non_block_memory,
-    calculate_non_block_params,
-    calculate_subblock_memory,
-    calculate_subblock_params,
-)
 
 __all__ = [
     "calculate_subblock_stats",
@@ -124,18 +125,20 @@ def calculate_subblock_stats(
             "synth_dataset_num_requests", 200
         )
         runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {})
+        runtime_stats_config["batch_size"] = batch_size
 
         runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks(
-            subblock_configs_nolayerindex,
-            runtime_stats_config,
-            vocab_size,
-            n_embd,
-            n_head,
-            master_puzzle_dir,
-            teacher_dir,
-            synth_dataset_num_requests,
-            prefill_seq_len,
-            generation_seq_len,
+            subblock_config_set=subblock_configs_nolayerindex,
+            runtime_stats_config=runtime_stats_config,
+            vocab_size=vocab_size,
+            hidden_size=n_embd,
+            num_attention_heads=n_head,
+            num_key_value_heads=8,
+            master_puzzle_dir=master_puzzle_dir,
+            tokenizer_path=teacher_dir,
+            synth_dataset_num_requests=synth_dataset_num_requests,
+            prefill_seq_len=prefill_seq_len,
+            generation_seq_len=generation_seq_len,
         )
 
     sorted_subblock_config = sorted(
@@ -486,5 +489,3 @@ def _dataclass_from_dict(
     if pd.isna(d):
         return None
     raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}")
-
-

From ab925b90486d7f7d170c254c276bcfa9020c9211 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 18 May 2026 03:55:44 -0700
Subject: [PATCH 09/35] runtime accuracy improved

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../nas/subblock_stats/calc_runtime_stats.py  | 30 ++-----------------
 .../torch/nas/subblock_stats/runtime_utils.py | 10 +++----
 .../torch/nas/subblock_stats/runtime_vllm.py  |  4 +--
 3 files changed, 9 insertions(+), 35 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index 40b3191dbc4..b97214461e2 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -40,7 +40,7 @@
 def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig:
     return BlockConfig(
         attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads),
-        ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None),
+        ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None),
         parallel_blocks=None,
     )
 
@@ -90,15 +90,8 @@ def create_benchmark_model(
 
 def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float:
     """Measure total runtime of a model via vLLM latency benchmark."""
-    with tempfile.TemporaryDirectory(delete=False) as model_tmpdir:  # delete=True after debugging
-        print(f"|||| Saving model to {model_tmpdir}")
-        save_model(
-            model,
-            Path(runtime_config.tokenizer_path),
-            Path(model_tmpdir),
-            num_hidden_layers=runtime_config.repeat_block_n_times + 1,
-        )
-        print(f"|||| Running vLLM latency benchmark on {model_tmpdir}")
+    with tempfile.TemporaryDirectory() as model_tmpdir:
+        save_model(model, Path(runtime_config.tokenizer_path), Path(model_tmpdir))
         model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config)
     return model_total_runtime_ms
 
@@ -212,35 +205,18 @@ def calc_runtime_for_subblocks(
         sorted(subblock_config_set),
         desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"),
     ):
-        print("|||| Calculating baseline runtime")
-        # runtime_config_baseline = replace(runtime_config, repeat_block_n_times=0)
         baseline_runtime_ms = calc_base_runtime(runtime_config, subblock_config)
-        print(f"|||| {baseline_runtime_ms=}")
-
-        if isinstance(subblock_config, AttentionConfig):
-            num_key_value_heads = subblock_config.num_key_value_heads
-            desc = f"AttentionConfig(num_key_value_heads={num_key_value_heads})"
-        elif isinstance(subblock_config, FFNConfig):
-            intermediate_size = subblock_config.intermediate_size
-            desc = f"FFNConfig(intermediate_size={intermediate_size})"
-        else:
-            raise ValueError(f"Unsupported subblock type: {type(subblock_config)}")
-        print(f"Computing runtime for subblock: {desc} {subblock_config.no_op=}")
 
         if subblock_config.no_op:
             total_runtime_ms = 0.0
         else:
-            print("|||| Calculating subblock runtime")
             subblock_total_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config)
-            print(f"|||| {subblock_total_runtime_ms=}")
             total_runtime_ms = (
                 subblock_total_runtime_ms - baseline_runtime_ms
             ) / repeat_block_n_times
 
         runtime_by_subblock_dict[subblock_config] = total_runtime_ms
 
-    print("|||| Calculating no-block runtime")
     no_block_runtime_ms = calc_no_block_runtime(runtime_config)
-    print(f"|||| {no_block_runtime_ms=}")
 
     return runtime_by_subblock_dict, no_block_runtime_ms
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index 39baf2126f2..b3b5278fa68 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -28,18 +28,16 @@ class RuntimeConfig:
     num_warmup_iters: int
 
 
-def save_model(
-    model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int
-) -> None:
+def save_model(model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path) -> None:
     """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
     model.to(dtype=torch.bfloat16).save_pretrained(output_path)
-    save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers)
+    save_model_as_anymodel(model, output_path, LlamaModelDescriptor)
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
     tokenizer.save_pretrained(output_path)
 
 
-def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int):
+def save_model_as_anymodel(model, output_dir: Path, descriptor):
     """Save a model checkpoint in AnyModel subblock-safetensors format."""
     # Save standard model checkpoint (as safetensors, HF format)
     model.save_pretrained(output_dir, safe_serialization=True)
@@ -49,7 +47,7 @@ def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layer
         input_dir=output_dir,
         output_dir=output_dir,
         descriptor=descriptor,
-        num_hidden_layers=num_hidden_layers,
+        num_hidden_layers=model.config.num_hidden_layers,
     )
     # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk.
 
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 80d541fb632..378337487b2 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -21,7 +21,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
         "--output-len",
         str(runtime_config.generation_seq_len),
         "--batch-size",
-        "1", #str(runtime_config.batch_size),
+        "1",
         "--output-json",
         str(output_json_path),
         "--max-model-len",
@@ -39,7 +39,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
         "--pipeline-parallel-size",
         "1",
         "--optimization-level",
-        "1",
+        "0",
     ]
     os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
     subprocess.run(cmd)

From 58f17e48b94f0a0df6955a149865cbbdc5b7b4c0 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 18 May 2026 08:29:56 -0700
Subject: [PATCH 10/35] using vllm api instead of subprocess

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../torch/nas/subblock_stats/runtime_vllm.py  | 61 +++++++++----------
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 378337487b2..40f9f1f7239 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -1,8 +1,10 @@
+import argparse
 import json
 import os
-import subprocess
 from pathlib import Path
 
+from vllm.benchmarks.latency import main as vllm_latency_main
+
 from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
 
 
@@ -10,39 +12,32 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
     """Run ``vllm bench latency`` and return the average latency in milliseconds."""
     output_json_path = model_path / "vllm_latency_benchmark.json"
 
-    cmd = [
-        "vllm",
-        "bench",
-        "latency",
-        "--model",
-        str(model_path),
-        "--input-len",
-        str(runtime_config.prefill_seq_len),
-        "--output-len",
-        str(runtime_config.generation_seq_len),
-        "--batch-size",
-        "1",
-        "--output-json",
-        str(output_json_path),
-        "--max-model-len",
-        str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len),
-        "--num-iters-warmup",
-        str(runtime_config.num_warmup_iters),
-        "--num-iters",
-        str(runtime_config.num_iters),
-        "--max-num-seqs",
-        "1",
-        "--distributed-executor-backend",
-        "external_launcher",
-        "--tensor-parallel-size",
-        "1",
-        "--pipeline-parallel-size",
-        "1",
-        "--optimization-level",
-        "0",
-    ]
+    # Use vLLM latency benchmark as a library.
+
+    # Create a mock argparse.Namespace similar to what is parsed by vllm.benchmarks.latency.main
+    args_ns = argparse.Namespace()
+
+    # Populate the Namespace with all required attributes
+    args_ns.model = str(model_path)
+    args_ns.input_len = runtime_config.prefill_seq_len
+    args_ns.output_len = runtime_config.generation_seq_len
+    args_ns.batch_size = 1
+    args_ns.output_json = str(output_json_path)
+    args_ns.max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len
+    args_ns.num_iters_warmup = runtime_config.num_warmup_iters
+    args_ns.num_iters = runtime_config.num_iters
+    args_ns.max_num_seqs = 1
+    args_ns.distributed_executor_backend = (
+        "external_launcher"  # Running vLLM with torchrun so need to indicate that.
+    )
+    args_ns.tensor_parallel_size = 1
+    args_ns.pipeline_parallel_size = 1
+    args_ns.optimization_level = 0  # This is required to make the stats accurate.
+    args_ns.n = 1
+    args_ns.disable_detokenize = False
+
     os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
-    subprocess.run(cmd)
+    vllm_latency_main(args_ns)
 
     with open(output_json_path) as f:
         vllm_results = json.load(f)

From e8683039f07fdd97fcbe0812c9ea7bb10f4576fc Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 03:00:03 -0700
Subject: [PATCH 11/35] review feedback: use nccl_timeout_minutes, rename pruneffn config, read num_key_value_heads from model config

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml |  2 --
 ...time.yaml => llama-3_1-8B_pruneffn_runtime.yaml} |  4 ++--
 examples/puzzletron/main.py                         | 13 +++++++------
 .../subblock_stats/calc_subblock_stats.py           |  3 +--
 4 files changed, 10 insertions(+), 12 deletions(-)
 rename examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/{llama-3_1-8B_pruneattn_runtime.yaml => llama-3_1-8B_pruneffn_runtime.yaml} (94%)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index bb352598e10..0e270906151 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -97,8 +97,6 @@ realize_model:
   shuffle_seed: 444
   dataset_path: ${dataset_path}
 
-nccl_timeout_minutes: ${timedelta_minutes:10}
-
 # This section redirects Hydra outputs
 hydra:
   run:
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
similarity index 94%
rename from examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
index 6eaf5f508b8..036486df530 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
@@ -2,6 +2,8 @@ defaults:
   - Llama-3_1-8B
   - _self_
 
+nccl_timeout_minutes: ${timedelta_minutes:90}
+
 # Input Hugging Face model to compress
 input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct
 
@@ -11,8 +13,6 @@ dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2
 # Working directory for puzzletron outputs
 puzzle_dir: /workspace/puzzle_dir
 
-dist_timeout_minutes: 60
-
 calc_subblock_stats:
   runtime_stats:
     enabled: true
diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py
index ba1c19d12c5..82ace24b4da 100644
--- a/examples/puzzletron/main.py
+++ b/examples/puzzletron/main.py
@@ -75,16 +75,17 @@ def run_full_puzzletron(hydra_config_path: str):
     hydra_config_path = Path(hydra_config_path).resolve()
     hydra_config = OmegaConf.load(str(hydra_config_path))
 
-    # Default timeout: 10 minutes, or extended to dist_timeout_minutes if set in config
-    if hasattr(hydra_config, "dist_timeout_minutes"):
-        timeout_minutes = timedelta(minutes=hydra_config.dist_timeout_minutes)
+    # Register Hydra custom resolvers (needed for config resolution)
+    mtpz.tools.register_hydra_resolvers()
+
+    # Default timeout: 10 minutes, or extended to nccl_timeout_minutes if set in config
+    if hasattr(hydra_config, "nccl_timeout_minutes"):
+        timeout_minutes = hydra_config.nccl_timeout_minutes
     else:
         timeout_minutes = timedelta(minutes=10)
     mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}")
-    dist.setup(timeout=timeout_minutes)
 
-    # Register Hydra custom resolvers (needed for config resolution)
-    mtpz.tools.register_hydra_resolvers()
+    dist.setup(timeout=timeout_minutes)
 
     hydra_config_path = Path(hydra_config_path).resolve()
     hydra_config_dir = str(hydra_config_path.parent)
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index a1c0c6d6aa5..dc37c1bec26 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -24,7 +24,6 @@
 from functools import partial
 from itertools import product
 from pathlib import Path
-from pdb import run
 from typing import Iterable, Type, TypeVar
 
 import pandas as pd
@@ -133,7 +132,7 @@ def calculate_subblock_stats(
             vocab_size=vocab_size,
             hidden_size=n_embd,
             num_attention_heads=n_head,
-            num_key_value_heads=8,
+            num_key_value_heads=model_config.num_key_value_heads,
             master_puzzle_dir=master_puzzle_dir,
             tokenizer_path=teacher_dir,
             synth_dataset_num_requests=synth_dataset_num_requests,

From f7be643078b752f4ca8abfc6abbb7a9561eb3547 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 03:28:10 -0700
Subject: [PATCH 12/35] removed unused batch_size; cleaned up config loading

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/main.py                   | 23 +++++++------------
 .../nas/subblock_stats/calc_runtime_stats.py  |  1 -
 .../torch/nas/subblock_stats/runtime_utils.py |  1 -
 .../subblock_stats/calc_subblock_stats.py     |  1 -
 4 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py
index 82ace24b4da..f093e5b7e68 100644
--- a/examples/puzzletron/main.py
+++ b/examples/puzzletron/main.py
@@ -68,25 +68,10 @@ def run_full_puzzletron(hydra_config_path: str):
         config_path: Path to the YAML configuration file
     """
     mtpz.tools.mprint("Puzzletron Progress 1/8: starting puzzletron pipeline")
-    # Read the Hydra config to determine runtime_stats:enabled, and set the timeout accordingly
-    from omegaconf import OmegaConf
-
-    # Resolve absolute path for Hydra config
-    hydra_config_path = Path(hydra_config_path).resolve()
-    hydra_config = OmegaConf.load(str(hydra_config_path))
 
     # Register Hydra custom resolvers (needed for config resolution)
     mtpz.tools.register_hydra_resolvers()
 
-    # Default timeout: 10 minutes, or extended to nccl_timeout_minutes if set in config
-    if hasattr(hydra_config, "nccl_timeout_minutes"):
-        timeout_minutes = hydra_config.nccl_timeout_minutes
-    else:
-        timeout_minutes = timedelta(minutes=10)
-    mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}")
-
-    dist.setup(timeout=timeout_minutes)
-
     hydra_config_path = Path(hydra_config_path).resolve()
     hydra_config_dir = str(hydra_config_path.parent)
     hydra_config_name = hydra_config_path.stem
@@ -98,6 +83,14 @@ def run_full_puzzletron(hydra_config_path: str):
         overrides=[],
     )
 
+    # Default timeout: 10 minutes, or extended to nccl_timeout_minutes if set in config
+    if hasattr(hydra_cfg, "nccl_timeout_minutes"):
+        timeout_minutes = hydra_cfg.nccl_timeout_minutes
+    else:
+        timeout_minutes = timedelta(minutes=10)
+
+    dist.setup(timeout=timeout_minutes)
+
     # Convert model (convert from HF to DeciLM, score pruning activations,
     # prune the model and save pruned checkpoints)
     input_model = mtpz.puzzletron_nas_plugin.PuzzletronModel()
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index b97214461e2..50d4a7e40cb 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -194,7 +194,6 @@ def calc_runtime_for_subblocks(
         repeat_block_n_times,
         prefill_seq_len,
         generation_seq_len,
-        runtime_stats_config.get("batch_size", 1),
         runtime_stats_config.get("num_iters", 30),
         runtime_stats_config.get("num_warmup_iters", 10),
     )
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index b3b5278fa68..dce9bcdd36c 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -23,7 +23,6 @@ class RuntimeConfig:
     repeat_block_n_times: int
     prefill_seq_len: int
     generation_seq_len: int
-    batch_size: int
     num_iters: int
     num_warmup_iters: int
 
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index dc37c1bec26..e76212c573d 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -124,7 +124,6 @@ def calculate_subblock_stats(
             "synth_dataset_num_requests", 200
         )
         runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {})
-        runtime_stats_config["batch_size"] = batch_size
 
         runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks(
             subblock_config_set=subblock_configs_nolayerindex,

From 49235d19290d50e6bb81f0e9faab69783bbc5d02 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 04:55:19 -0700
Subject: [PATCH 13/35] cleanup based on pre-commit

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/__init__.py     |  1 +
 .../torch/nas/subblock_stats/runtime_utils.py     | 15 +++++++++++++++
 modelopt/torch/nas/subblock_stats/runtime_vllm.py | 15 +++++++++++++++
 3 files changed, 31 insertions(+)

diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
index aeac903f8f4..1976eb2f2e1 100644
--- a/modelopt/torch/nas/subblock_stats/__init__.py
+++ b/modelopt/torch/nas/subblock_stats/__init__.py
@@ -20,4 +20,5 @@
 Primary API:
     - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations
 """
+
 from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index dce9bcdd36c..ed49b644f2b 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import json
 from dataclasses import dataclass
 from pathlib import Path
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 40f9f1f7239..347233dcf9b 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import argparse
 import json
 import os

From 781d44d251c7e218397ffeabb96c06f67dcb0718 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 06:19:17 -0700
Subject: [PATCH 14/35] added docstrings

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../torch/nas/subblock_stats/runtime_utils.py    |  9 +++++++++
 .../torch/nas/subblock_stats/runtime_vllm.py     | 16 ++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index ed49b644f2b..09245c278cb 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -12,6 +12,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""Utilities for runtime benchmarking and model saving in ModelOpt NAS.
+
+This module provides classes and utility functions used for empirical runtime
+estimation of Transformer subblocks and for saving models and tokenizers in
+formats suitable for benchmarking (e.g., vLLM latency benchmark) or the
+AnyModel subblock-safetensors format. It defines the configuration dataclass
+used to parameterize runtime benchmarks, as well as model checkpointing helpers
+to ensure compatibility with downstream evaluation pipelines.
+"""
 
 import json
 from dataclasses import dataclass
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 347233dcf9b..eb1931addf6 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -12,6 +12,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""vLLM Runtime Benchmark Integration for ModelOpt NAS Subblocks.
+
+This module provides the integration logic to empirically benchmark subblock
+runtime statistics within transformer architectures using the vLLM latency
+benchmark. It defines helper functions and utilities to invoke the vLLM
+latency benchmark programmatically (as a library) and collect runtime
+statistics, given a prepared model directory and a benchmarking configuration.
+
+Usage:
+    - Call `run_vllm_latency_benchmark` with a model path and a
+      `RuntimeConfig` instance to run a latency benchmark and
+      return the average latency for the configuration (in milliseconds).
+
+This is used internally by ModelOpt NAS to benchmark different subblock
+configurations for search and scoring, enabling data-driven NAS for latency-optimized architectures.
+"""
 
 import argparse
 import json

From a1901c7bc12ee4c651c6883fa4836142ffe133b5 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 06:23:58 -0700
Subject: [PATCH 15/35] updated readme

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 134790bb011..a7af6f66959 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -11,7 +11,7 @@ To use the Puzzle algorithm effectively, we need to specify the target number of
 
 In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. Other supported models should be compressed in a similar way. For GptOss there is one [additional step to be performed](GPTOSS.md).
 
-> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For KV-head pruning see [`llama-3_1-8B_pruneattn_runtime`](./configs/llama-3_1-8B_pruneattn_runtime/) and the [Attention Pruning](#attention-pruning-kv-head-reduction) and [Runtime-Based Latency Optimization](#runtime-based-latency-optimization) sections below. For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md).
+> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md).
 
 ## Environment
 

From 0b755024f3f1040ef80a2bf7796cd06cd7ef34c7 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 08:12:34 -0700
Subject: [PATCH 16/35] further changes based on review

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../validate_solutions_defaults.yaml          |  4 ++--
 .../nas/subblock_stats/calc_runtime_stats.py  | 23 ++++++++-----------
 .../torch/nas/subblock_stats/runtime_utils.py |  2 --
 .../torch/nas/subblock_stats/runtime_vllm.py  |  4 ++--
 4 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
index ec139023794..f950566802a 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
@@ -2,9 +2,9 @@ defaults:
   - /validate_model_defaults
   - _self_
 
-solutions_to_validate:
+solutions_to_validate: []
 skip_validation: false
 save_models: false
 bigger_is_better: false
-sort_solutions_by:
+sort_solutions_by: null
 calculate_full_score_ablations: false
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index 50d4a7e40cb..7c212f8e3b3 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -37,7 +37,7 @@
 )
 
 
-def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig:
+def _make_standard_block_config(num_key_value_heads: int) -> BlockConfig:
     return BlockConfig(
         attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads),
         ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None),
@@ -56,7 +56,7 @@ def create_benchmark_model(
     repeat_block_n_times: int = 10,
 ) -> LlamaForCausalLM:
     """Build a small Llama model with repeated subblocks for latency benchmarking."""
-    block_configs = [_make_standard_block_config(hidden_size, num_key_value_heads)]
+    block_configs = [_make_standard_block_config(num_key_value_heads)]
 
     if block_config:
         block_configs.extend([block_config] * repeat_block_n_times)
@@ -136,17 +136,16 @@ def calc_subblock_runtime(
 @cache
 def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
     """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
-    runtime_config1 = replace(runtime_config, repeat_block_n_times=0)
-    runtime_config10 = replace(runtime_config, repeat_block_n_times=9)
+    runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9)
 
-    block_config = _make_standard_block_config(
-        runtime_config.hidden_size, runtime_config.num_key_value_heads
-    )
+    block_config = _make_standard_block_config(runtime_config.num_key_value_heads)
 
-    runtime_ms1 = calc_subblock_runtime(runtime_config1, None)
-    runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config)
+    runtime_ms_one_block = calc_subblock_runtime(runtime_config, None)  # only one base block
+    runtime_ms_ten_blocks = calc_subblock_runtime(
+        runtime_cfg_ten_blocks, block_config
+    )  # one base block + 9 repeated blocks
 
-    no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9
+    no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9
 
     return no_block_runtime_ms
 
@@ -154,7 +153,7 @@ def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
 @cache
 def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float:
     """Calculate the base runtime of a model with no subblocks."""
-    base_runtime_ms = 0.0
+    base_runtime_ms = None
     if isinstance(subblock_config, AttentionConfig):
         base_runtime_ms = calc_subblock_runtime(runtime_config, None)
     elif isinstance(subblock_config, FFNConfig):
@@ -188,9 +187,7 @@ def calc_runtime_for_subblocks(
         hidden_size,
         num_attention_heads,
         num_key_value_heads,
-        master_puzzle_dir,
         tokenizer_path,
-        synth_dataset_num_requests,
         repeat_block_n_times,
         prefill_seq_len,
         generation_seq_len,
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index 09245c278cb..00e0bc6a5f2 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -41,9 +41,7 @@ class RuntimeConfig:
     hidden_size: int
     num_attention_heads: int
     num_key_value_heads: int
-    master_puzzle_dir: str
     tokenizer_path: str
-    synth_dataset_num_requests: int
     repeat_block_n_times: int
     prefill_seq_len: int
     generation_seq_len: int
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index eb1931addf6..e7a4a90b69c 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -34,13 +34,13 @@
 import os
 from pathlib import Path
 
-from vllm.benchmarks.latency import main as vllm_latency_main
-
 from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
 
 
 def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
     """Run ``vllm bench latency`` and return the average latency in milliseconds."""
+    from vllm.benchmarks.latency import main as vllm_latency_main
+
     output_json_path = model_path / "vllm_latency_benchmark.json"
 
     # Use vLLM latency benchmark as a library.

From 7e2f995a1920f0ab3e3f9fa86e3eff3491accb01 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 08:13:20 -0700
Subject: [PATCH 17/35] restore empty sort_solutions_by in validate_solutions_defaults.yaml

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../validate_solutions_defaults.yaml                            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
index f950566802a..81218606ecd 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
@@ -6,5 +6,5 @@ solutions_to_validate: []
 skip_validation: false
 save_models: false
 bigger_is_better: false
-sort_solutions_by: null
+sort_solutions_by:
 calculate_full_score_ablations: false

From 2ca530629f16f1b0c84b755ff393be250b69bef9 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 15:31:57 -0700
Subject: [PATCH 18/35] removed synth_dataset_num_requests

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../nas/subblock_stats/calc_runtime_stats.py  | 36 +++++++++----------
 .../subblock_stats/calc_subblock_stats.py     |  7 ----
 2 files changed, 17 insertions(+), 26 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index 7c212f8e3b3..33b03b75531 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -133,23 +133,6 @@ def calc_subblock_runtime(
     return calc_model_runtime(model, runtime_config)
 
 
-@cache
-def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
-    """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
-    runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9)
-
-    block_config = _make_standard_block_config(runtime_config.num_key_value_heads)
-
-    runtime_ms_one_block = calc_subblock_runtime(runtime_config, None)  # only one base block
-    runtime_ms_ten_blocks = calc_subblock_runtime(
-        runtime_cfg_ten_blocks, block_config
-    )  # one base block + 9 repeated blocks
-
-    no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9
-
-    return no_block_runtime_ms
-
-
 @cache
 def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float:
     """Calculate the base runtime of a model with no subblocks."""
@@ -167,6 +150,23 @@ def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockCo
     return base_runtime_ms
 
 
+@cache
+def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float:
+    """Estimate the overhead runtime (embedding + LM head) with no decoder blocks."""
+    runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9)
+
+    block_config = _make_standard_block_config(runtime_config.num_key_value_heads)
+
+    runtime_ms_one_block = calc_subblock_runtime(runtime_config, None)  # only one base block
+    runtime_ms_ten_blocks = calc_subblock_runtime(
+        runtime_cfg_ten_blocks, block_config
+    )  # one base block + 9 repeated blocks
+
+    no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9
+
+    return no_block_runtime_ms
+
+
 def calc_runtime_for_subblocks(
     subblock_config_set: set[SubblockConfig],
     runtime_stats_config: DictConfig,
@@ -174,9 +174,7 @@ def calc_runtime_for_subblocks(
     hidden_size: int,
     num_attention_heads: int,
     num_key_value_heads: int,
-    master_puzzle_dir: str,
     tokenizer_path: str,
-    synth_dataset_num_requests: int,
     prefill_seq_len: int,
     generation_seq_len: int,
 ) -> tuple[dict[SubblockConfig, float], float]:
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index e76212c573d..0a678dc8c76 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -118,11 +118,6 @@ def calculate_subblock_stats(
             [subblock_config["subblock_config"] for subblock_config in subblock_configs]
         )
 
-        # dict[SubblockConfig, float], float
-        # TODO: Manage default values for calc_subblock_stats_config in one place, e.g. within a dataclass for hydra config.
-        synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get(
-            "synth_dataset_num_requests", 200
-        )
         runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {})
 
         runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks(
@@ -132,9 +127,7 @@ def calculate_subblock_stats(
             hidden_size=n_embd,
             num_attention_heads=n_head,
             num_key_value_heads=model_config.num_key_value_heads,
-            master_puzzle_dir=master_puzzle_dir,
             tokenizer_path=teacher_dir,
-            synth_dataset_num_requests=synth_dataset_num_requests,
             prefill_seq_len=prefill_seq_len,
             generation_seq_len=generation_seq_len,
         )

From ca2174843d7eb34be207e790c99c6c13c0a9a59e Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Tue, 19 May 2026 16:13:27 -0700
Subject: [PATCH 19/35] removed duplicate model saving

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/runtime_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py
index 00e0bc6a5f2..9adb0826278 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py
@@ -51,7 +51,7 @@ class RuntimeConfig:
 
 def save_model(model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path) -> None:
     """Save model weights as AnyModel and copy the tokenizer to ``output_path``."""
-    model.to(dtype=torch.bfloat16).save_pretrained(output_path)
+    model = model.to(dtype=torch.bfloat16)
     save_model_as_anymodel(model, output_path, LlamaModelDescriptor)
 
     tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

From 26ceb36aa4df96486be50eed1300c9f3f1d8e7c3 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 20 May 2026 07:39:06 -0700
Subject: [PATCH 20/35] added test

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../llama-3_1-8B_pruneffn_runtime.yaml        |  2 -
 .../torch/nas/subblock_stats/runtime_vllm.py  | 87 +++++++++++--------
 .../gpu/torch/nas/test_calc_runtime_stats.py  | 78 +++++++++++++++++
 3 files changed, 127 insertions(+), 40 deletions(-)
 create mode 100644 tests/gpu/torch/nas/test_calc_runtime_stats.py

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
index 036486df530..0c55b3b5c2d 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
@@ -16,11 +16,9 @@ puzzle_dir: /workspace/puzzle_dir
 calc_subblock_stats:
   runtime_stats:
     enabled: true
-    synth_dataset_num_requests: 32
     backend: vllm
     num_warmup_iters: 2
     num_iters: 10
-    batch_size: 1
 
 # MIP memory constraint (in MiB)
 mip:
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index e7a4a90b69c..96ac57b951b 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -16,61 +16,72 @@
 
 This module provides the integration logic to empirically benchmark subblock
 runtime statistics within transformer architectures using the vLLM latency
-benchmark. It defines helper functions and utilities to invoke the vLLM
-latency benchmark programmatically (as a library) and collect runtime
-statistics, given a prepared model directory and a benchmarking configuration.
+benchmark. Each invocation is launched in a dedicated subprocess so that GPU
+memory and CUDA state are fully reclaimed when the subprocess exits, allowing
+many sequential benchmarks to run in a single Python session without leaking.
 
 Usage:
     - Call `run_vllm_latency_benchmark` with a model path and a
       `RuntimeConfig` instance to run a latency benchmark and
       return the average latency for the configuration (in milliseconds).
-
-This is used internally by ModelOpt NAS to benchmark different subblock
-configurations for search and scoring, enabling data-driven NAS for latency-optimized architectures.
 """
 
-import argparse
 import json
-import os
+import subprocess
 from pathlib import Path
 
 from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
 
 
-def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig):
-    """Run ``vllm bench latency`` and return the average latency in milliseconds."""
-    from vllm.benchmarks.latency import main as vllm_latency_main
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float:
+    """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms.
 
+    Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA
+    context, and vLLM engine state are fully released on subprocess exit, so
+    many calls in one parent process do not accumulate.
+    """
     output_json_path = model_path / "vllm_latency_benchmark.json"
+    max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len
 
-    # Use vLLM latency benchmark as a library.
-
-    # Create a mock argparse.Namespace similar to what is parsed by vllm.benchmarks.latency.main
-    args_ns = argparse.Namespace()
-
-    # Populate the Namespace with all required attributes
-    args_ns.model = str(model_path)
-    args_ns.input_len = runtime_config.prefill_seq_len
-    args_ns.output_len = runtime_config.generation_seq_len
-    args_ns.batch_size = 1
-    args_ns.output_json = str(output_json_path)
-    args_ns.max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len
-    args_ns.num_iters_warmup = runtime_config.num_warmup_iters
-    args_ns.num_iters = runtime_config.num_iters
-    args_ns.max_num_seqs = 1
-    args_ns.distributed_executor_backend = (
-        "external_launcher"  # Running vLLM with torchrun so need to indicate that.
-    )
-    args_ns.tensor_parallel_size = 1
-    args_ns.pipeline_parallel_size = 1
-    args_ns.optimization_level = 0  # This is required to make the stats accurate.
-    args_ns.n = 1
-    args_ns.disable_detokenize = False
+    cmd = [
+        "vllm",
+        "bench",
+        "latency",
+        "--model",
+        str(model_path),
+        "--input-len",
+        str(runtime_config.prefill_seq_len),
+        "--output-len",
+        str(runtime_config.generation_seq_len),
+        "--batch-size",
+        "1",
+        "--output-json",
+        str(output_json_path),
+        "--max-model-len",
+        str(max_model_len),
+        "--num-iters-warmup",
+        str(runtime_config.num_warmup_iters),
+        "--num-iters",
+        str(runtime_config.num_iters),
+        "--max-num-seqs",
+        "1",
+        "--tensor-parallel-size",
+        "1",
+        "--pipeline-parallel-size",
+        "1",
+        "--distributed-executor-backend",
+        "external_launcher",
+        # vLLM defaults to 0.9; keep the per-run budget modest so the parent
+        # process always has headroom for the next benchmark.
+        "--gpu-memory-utilization",
+        "0.3",
+        # Required for accurate per-block runtime stats.
+        "--optimization-level",
+        "0",
+    ]
 
-    os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
-    vllm_latency_main(args_ns)
+    subprocess.run(cmd, check=True)
 
     with open(output_json_path) as f:
         vllm_results = json.load(f)
-    print(vllm_results)
-    return vllm_results["avg_latency"] * 1000  # convert to milliseconds
+    return vllm_results["avg_latency"] * 1000  # seconds -> milliseconds
diff --git a/tests/gpu/torch/nas/test_calc_runtime_stats.py b/tests/gpu/torch/nas/test_calc_runtime_stats.py
new file mode 100644
index 00000000000..0917f2df502
--- /dev/null
+++ b/tests/gpu/torch/nas/test_calc_runtime_stats.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""GPU test for ``calc_runtime_for_subblocks``.
+
+Exercises the end-to-end vLLM latency benchmarking pipeline on a tiny model:
+constructs a small subblock set, runs the benchmark for each candidate, and
+checks the returned per-subblock runtime dict and no-block overhead.
+"""
+
+from functools import partial
+from pathlib import Path
+
+import pytest
+from _test_utils.torch.distributed.utils import spawn_multiprocess_job
+from _test_utils.torch.transformers_models import get_tiny_tokenizer
+
+pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks")
+
+
+def test_calc_runtime_for_subblocks(tmp_path: Path):
+    """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead."""
+    spawn_multiprocess_job(size=1, job=partial(_run, tmp_path), backend="nccl")
+
+
+def _run(tmp_path: Path, rank: int, size: int):
+    import math
+
+    from omegaconf import OmegaConf
+
+    from modelopt.torch.nas.subblock_stats import calc_runtime_for_subblocks
+    from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig
+
+    tokenizer = get_tiny_tokenizer()
+    tokenizer_dir = tmp_path / "tokenizer"
+    tokenizer.save_pretrained(str(tokenizer_dir))
+
+    attn = AttentionConfig(no_op=False, num_key_value_heads=2)
+    ffn = FFNConfig(no_op=False, intermediate_size=256, moe=None)
+    attn_noop = AttentionConfig(no_op=True)
+    subblock_set = {attn, ffn, attn_noop}
+
+    # vLLM's bench latency samples input ids in [0, 10000) (see
+    # vllm/benchmarks/latency.py), and its input validator accepts an id when
+    # it fits in max(tokenizer.max_token_id, model_vocab_size - 1). The tiny
+    # tokenizer's vocab is ~200, so we size the model vocab past 10000 to
+    # cover the sampled range.
+    runtime_by_subblock, no_block_runtime_ms = calc_runtime_for_subblocks(
+        subblock_config_set=subblock_set,
+        runtime_stats_config=OmegaConf.create({"num_iters": 1, "num_warmup_iters": 1}),
+        vocab_size=10016,
+        hidden_size=256,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        tokenizer_path=str(tokenizer_dir),
+        prefill_seq_len=8,
+        generation_seq_len=4,
+    )
+
+    assert set(runtime_by_subblock) == subblock_set
+    assert runtime_by_subblock[attn_noop] == 0.0
+    assert math.isfinite(runtime_by_subblock[attn])
+    assert math.isfinite(runtime_by_subblock[ffn])
+    # Overhead = t(1 block) - (t(10 blocks) - t(1 block)) / 9. A 1-block run should
+    # take longer than one extra block, so the (embedding + LM-head) overhead is > 0.
+    assert no_block_runtime_ms > 0

From 4c5b133e621879fed9271d7d1623b7ef4b94e988 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 20 May 2026 11:47:54 -0700
Subject: [PATCH 21/35] suppressing bandit warnings B404 and B603; precedent
 found in repo

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/runtime_vllm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 96ac57b951b..6ba8671bf46 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -27,7 +27,7 @@
 """
 
 import json
-import subprocess
+import subprocess  # nosec B404
 from pathlib import Path
 
 from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
@@ -80,7 +80,8 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
         "0",
     ]
 
-    subprocess.run(cmd, check=True)
+    # cmd is a fixed list of strings (no shell, no untrusted input).
+    subprocess.run(cmd, text=True, check=True, capture_output=True)  # nosec B603
 
     with open(output_json_path) as f:
         vllm_results = json.load(f)

From 398808a81ae19c7332c48bd7950454e3d1990756 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 21 May 2026 02:08:54 -0700
Subject: [PATCH 22/35] removed gpu utilization param

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../validate_solutions_defaults.yaml                          | 2 +-
 modelopt/torch/nas/subblock_stats/runtime_vllm.py             | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
index 81218606ecd..ec139023794 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
@@ -2,7 +2,7 @@ defaults:
   - /validate_model_defaults
   - _self_
 
-solutions_to_validate: []
+solutions_to_validate:
 skip_validation: false
 save_models: false
 bigger_is_better: false
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 6ba8671bf46..a6850892798 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -71,10 +71,6 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
         "1",
         "--distributed-executor-backend",
         "external_launcher",
-        # vLLM defaults to 0.9; keep the per-run budget modest so the parent
-        # process always has headroom for the next benchmark.
-        "--gpu-memory-utilization",
-        "0.3",
         # Required for accurate per-block runtime stats.
         "--optimization-level",
         "0",

From e468f62819d5efcea6701de4319e1c530fc60b86 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 21 May 2026 07:10:16 -0700
Subject: [PATCH 23/35] wip

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../torch/nas/subblock_stats/runtime_vllm.py  | 25 +++++++++++++++----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index a6850892798..59b97a408ab 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -33,7 +33,7 @@
 from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
 
 
-def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float:
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None:
     """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms.
 
     Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA
@@ -77,8 +77,23 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
     ]
 
     # cmd is a fixed list of strings (no shell, no untrusted input).
-    subprocess.run(cmd, text=True, check=True, capture_output=True)  # nosec B603
+    vllm_results = None
+    try:
+        subprocess.run(
+            cmd,
+            check=True,
+            capture_output=True,
+            text=True,
+            timeout=1800,  # 30 minutes
+        )  # nosec B603
+    except subprocess.TimeoutExpired as exc:
+        raise TimeoutError("vLLM latency benchmark timed out") from exc
+    except subprocess.CalledProcessError as exc:
+        raise RuntimeError(exc.stderr or exc.stdout or "vLLM latency benchmark failed") from exc
 
-    with open(output_json_path) as f:
-        vllm_results = json.load(f)
-    return vllm_results["avg_latency"] * 1000  # seconds -> milliseconds
+    if output_json_path.exists():
+        with open(output_json_path) as f:
+            vllm_results = json.load(f)
+            vllm_results = vllm_results["avg_latency"] * 1000  # seconds -> milliseconds
+
+    return vllm_results

From 34dbe52c623a7d536dca01c3b7eac8a081e7ec89 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 21 May 2026 13:50:15 -0700
Subject: [PATCH 24/35] removed redundant configs; guards for vllm results

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../validate_model_defaults.yaml              |  2 +-
 .../Llama-3_1-8B.yaml                         | 13 +++++---
 .../llama-3_1-8B_pruneffn_runtime.yaml        |  6 ----
 .../pruning/attn_pruning.yaml                 | 23 -------------
 .../pruning/ffn_pruning.yaml                  | 19 -----------
 .../pruning/hidden_dim_pruning.yaml           | 15 ---------
 .../pruning/pruning_defaults.yaml             | 33 -------------------
 .../validate_model_defaults.yaml              | 17 ----------
 .../validate_solutions_defaults.yaml          | 10 ------
 .../torch/nas/subblock_stats/runtime_vllm.py  |  5 +--
 10 files changed, 12 insertions(+), 131 deletions(-)
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml
 delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml
index ce1749d9698..6b36142a3a8 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml
@@ -3,7 +3,7 @@ autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model
 block_size: 8192
 bos_rate: 0.5
 data_column: messages
-val_dataset_name: valid
+val_dataset_name: validation
 shuffle_seed: 81436
 seed: 42
 fim_rate: 0
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index 0e270906151..b70e1c367eb 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -1,7 +1,7 @@
 defaults:
-  - pruning: ffn_pruning
-  - scoring: ../validate_solutions_defaults
-  - realize_model: ../validate_solutions_defaults
+  - ../llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning@pruning
+  - ../llama-3_1-8B_pruneffn_memory/validate_solutions_defaults@scoring
+  - ../llama-3_1-8B_pruneffn_memory/validate_solutions_defaults@realize_model
   - bypass:
   - override hydra/hydra_logging: disabled
   - _self_
@@ -26,7 +26,7 @@ calc_subblock_stats:
   prefill_queue_size: 0
   allocate_prefill_query: false
   benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
-  merge_with_existing_stats: true
+  merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
   runtime_stats:
@@ -42,7 +42,7 @@ scoring:
   teacher_dir: ${to_path:${teacher_dir}}
   output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation
 
-  eval_samples: 8
+  eval_samples: 128
   micro_batch_size: 1
   seed: 42
   shuffle_seed: 444
@@ -77,6 +77,9 @@ mip:
     - stats.ffn_num_params
     - stats.attention_num_params
 
+  human_constraints:
+    target_latency: 21
+
   mip_constraints:
   metric_overrides:
   max_seconds_per_solution: 60
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
index 0c55b3b5c2d..701c31e7c10 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
@@ -16,15 +16,9 @@ puzzle_dir: /workspace/puzzle_dir
 calc_subblock_stats:
   runtime_stats:
     enabled: true
-    backend: vllm
     num_warmup_iters: 2
     num_iters: 10
 
-# MIP memory constraint (in MiB)
-mip:
-  human_constraints:
-    target_latency: 21
-
 # FFN intermediate sizes to search over (heterogeneous architecture)
 pruning:
   intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml
deleted file mode 100644
index 53d7e4bd9c6..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-defaults:
-  - pruning_defaults
-
-hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IndependentKvHeadContributionHook}
-
-activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
-
-pruning_mixin:
-  _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn
-  layer_descriptor:
-    _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor
-
-activation_hooks_kwargs:
-  method: independent_kv_head_contribution
-  optimize_for: memory    # IndependentKvHeadContributionHook implementation that consumes less memory
-  target_layer: "self_attn.o_proj"
-  layer_input_descriptors_path:
-
-# n_heads_in_group: 4
-# num_attention_heads: 32       # num query heads
-# num_kv_heads: 32 / 4 = 8      # num_query_heads // n_heads_in_group
-n_heads_in_group_list: [8, 16, 32]      # num_kv_heads = [4, 2, 1]
-gqa_init_mode: "PruneKVHeads"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml
deleted file mode 100644
index da0b9720700..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml
+++ /dev/null
@@ -1,19 +0,0 @@
-defaults:
-  - pruning_defaults
-
-pruning_mixin:
-  _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn
-  layer_descriptor:
-    _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor
-
-hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IterativeChannelContributionHook}
-
-activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
-
-activation_hooks_kwargs:
-  method: iterative
-  target_layer: "mlp.down_proj"
-  layer_input_descriptors_path:
-
-intermediate_size_list: [3072, 5888, 8704, 11520]  # teacher_intermediate_size is 14336
-mlp_init_mode: "PruneByActivationsLog"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml
deleted file mode 100644
index 407c835d8c4..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml
+++ /dev/null
@@ -1,15 +0,0 @@
-defaults:
-  - pruning_defaults
-
-activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id}
-
-activation_hooks_kwargs:
-  method: layer_norm_contribution
-  target_layer: "layernorm"
-
-# Hidden dimension pruning specific settings
-hidden_size_list: [3072, 2048]  # Target hidden sizes to prune to
-hidden_size_init_mode: "PruneByChannelRanking"
-mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher
-gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher
-linear_init_mode: "FromTeacher"
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml
deleted file mode 100644
index e05e775bee3..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-defaults:
-  - /validate_model_defaults
-
-descriptor: ${descriptor}
-model_name_or_path: ${teacher_dir}
-experiment_id: ${pruning.eval_samples}samples_diverse_mini
-activations_log_dir: ???
-activation_hooks_kwargs: ???
-
-# Data:
-eval_samples: 1000 # default is 10000
-micro_batch_size: 4
-dataset_path: ${dataset_path}
-val_dataset_name: train
-
-# Prune ckpts
-pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id}
-
-## FFN pruning
-ffn_list:
-mlp_init_mode: "Truncate" # PruneByActivationsLog
-
-## KV-heads pruning
-n_heads_in_group_list:
-gqa_init_mode: "AverageKV"
-
-## Hidden dimension pruning
-hidden_size_list:
-hidden_size_init_mode: "PruneByChannelRanking"
-linear_init_mode: "FromTeacher"
-
-mlp_init_config_yaml:
-  activations_log_dir: ${pruning.activations_log_dir}
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml
deleted file mode 100644
index 6b36142a3a8..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml
+++ /dev/null
@@ -1,17 +0,0 @@
-model_dtype: torch.bfloat16 # dtype to cast the model for validate_model
-autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model
-block_size: 8192
-bos_rate: 0.5
-data_column: messages
-val_dataset_name: validation
-shuffle_seed: 81436
-seed: 42
-fim_rate: 0
-fim_spm_rate: 0
-source_datasets_to_discard:
-varlen: false
-write_results: false
-calc_losses_on_cpu: false
-activations_log_dir:
-model_name_or_path:
-load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn}
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
deleted file mode 100644
index ec139023794..00000000000
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml
+++ /dev/null
@@ -1,10 +0,0 @@
-defaults:
-  - /validate_model_defaults
-  - _self_
-
-solutions_to_validate:
-skip_validation: false
-save_models: false
-bigger_is_better: false
-sort_solutions_by:
-calculate_full_score_ablations: false
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
index 59b97a408ab..21aa5e30bcd 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py
@@ -94,6 +94,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
     if output_json_path.exists():
         with open(output_json_path) as f:
             vllm_results = json.load(f)
-            vllm_results = vllm_results["avg_latency"] * 1000  # seconds -> milliseconds
+        if "avg_latency" in vllm_results:
+            return vllm_results["avg_latency"] * 1000  # seconds -> milliseconds
 
-    return vllm_results
+    raise RuntimeError(f"vLLM benchmark output not found at {output_json_path}")

From 24fa2d5a84e6d8bd30996373dd4d3c6e25e4d10c Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 21 May 2026 13:59:59 -0700
Subject: [PATCH 25/35] following annotation suggestion

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 modelopt/torch/nas/subblock_stats/calc_runtime_stats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
index 33b03b75531..16d62ed1d61 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
@@ -99,7 +99,7 @@ def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -
 @cache
 def calc_subblock_runtime(
     runtime_config: RuntimeConfig,
-    subblock_config: SubblockConfig,
+    subblock_config: SubblockConfig | None,
 ) -> float:
     """Measure total runtime of a repeated subblock via vLLM latency benchmark."""
     block_config: BlockConfig | None = None

From 4b824f143b28918048c09de5b40569608723672d Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Fri, 22 May 2026 03:57:04 -0700
Subject: [PATCH 26/35] updated readme

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index a7af6f66959..91d2bab1d3e 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -345,31 +345,43 @@ For distillation results on Puzzletron-compressed models, see [examples/pruning/
 
 ## Runtime-Based Latency Optimization
 
-By default, subblock statistics use the `trt_torch` backend with theoretical memory proxies. You can instead enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints:
+You can enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints.
+
+A ready-to-run example config is included at [`configs/llama-3_1-8B_pruneffn_runtime/`](./configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml). The following key fields enable and control execution of the runtime statistics in the `llama-3_1-8B_pruneffn_runtime.yaml` config file:
 
 ```yaml
 calc_subblock_stats:
   runtime_stats:
     enabled: true
-    synth_dataset_num_requests: 32
-    backend: vllm
     num_warmup_iters: 2
     num_iters: 10
-    batch_size: 1
+```
+
+The runtime constraint is specified in the `human_constraints` section of the config `Llama-3_1-8B.yaml`:
+```yaml
+human_constraints:
+  target_latency: 21
+```
 
-mip:
-  human_constraints:
-    target_latency: 20  # seconds
+Run the pipeline against this config the same way as the memory-constrained variant:
+
+```bash
+torchrun --nproc_per_node 2 examples/puzzletron/main.py \
+   --config examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml 2>&1 | tee ./log.txt | grep "Puzzletron Progress"
 ```
 
-Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly:
+The MIP solver will now search for a heterogeneous architecture whose measured end-to-end latency is at or below `target_latency`, instead of optimizing for a memory budget.
+
+Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly (already included in the example config):
 
 ```yaml
-dist_timeout_minutes: 60  # default is 10 if omitted
+nccl_timeout_minutes: 90  # default is 10 if omitted
 ```
 
 This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout.
 
+Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency` value of 21 seconds resulted in a final model latency of 22.3 seconds.
+
 ## Advanced Usage
 
 Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios.

From ae25ec7a9b9c48f0b6a5ad48465397ff4a0bd04f Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Fri, 22 May 2026 11:55:13 -0700
Subject: [PATCH 27/35] moved stats utils from nas to puzzletron

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md                 |  1 +
 .../Llama-3_1-8B.yaml                         |  2 +-
 .../Llama-3_1-8B.yaml                         |  2 +-
 modelopt/torch/nas/subblock_stats/__init__.py | 24 -------------------
 .../subblock_stats/calc_runtime_stats.py      |  4 ++--
 .../calc_subblock_params_and_memory.py        |  0
 .../subblock_stats/calc_subblock_stats.py     |  6 +++--
 .../subblock_stats/runtime_utils.py           |  0
 .../subblock_stats/runtime_vllm.py            |  2 +-
 .../test_calc_runtime_stats.py                | 18 ++++----------
 10 files changed, 15 insertions(+), 44 deletions(-)
 delete mode 100644 modelopt/torch/nas/subblock_stats/__init__.py
 rename modelopt/torch/{nas => puzzletron}/subblock_stats/calc_runtime_stats.py (97%)
 rename modelopt/torch/{nas => puzzletron}/subblock_stats/calc_subblock_params_and_memory.py (100%)
 rename modelopt/torch/{nas => puzzletron}/subblock_stats/runtime_utils.py (100%)
 rename modelopt/torch/{nas => puzzletron}/subblock_stats/runtime_vllm.py (97%)
 rename tests/gpu/torch/{nas => puzzletron}/test_calc_runtime_stats.py (86%)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 91d2bab1d3e..27918819c3f 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -358,6 +358,7 @@ calc_subblock_stats:
 ```
 
 The runtime constraint is specified in the `human_constraints` section of the config `Llama-3_1-8B.yaml`:
+
 ```yaml
 human_constraints:
   target_latency: 21
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
index 21903db1623..1c302fd4c30 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml
@@ -42,7 +42,7 @@ scoring:
   teacher_dir: ${to_path:${teacher_dir}}
   output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation
 
-  eval_samples: 128
+  eval_samples: 8
   micro_batch_size: 1
   seed: 42
   shuffle_seed: 444
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index b70e1c367eb..eb9e2398efb 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -42,7 +42,7 @@ scoring:
   teacher_dir: ${to_path:${teacher_dir}}
   output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation
 
-  eval_samples: 128
+  eval_samples: 8
   micro_batch_size: 1
   seed: 42
   shuffle_seed: 444
diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py
deleted file mode 100644
index 1976eb2f2e1..00000000000
--- a/modelopt/torch/nas/subblock_stats/__init__.py
+++ /dev/null
@@ -1,24 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Subblock runtime statistics API for ModelOpt NAS.
-
-This module provides utilities for measuring and calculating runtime statistics
-of subblocks (e.g., Attention, FFN) within transformer architectures.
-
-Primary API:
-    - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations
-"""
-
-from .calc_runtime_stats import calc_runtime_for_subblocks
diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
similarity index 97%
rename from modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
rename to modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
index 16d62ed1d61..1610b3d7397 100644
--- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
@@ -25,8 +25,8 @@
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
 
-from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig, save_model
-from modelopt.torch.nas.subblock_stats.runtime_vllm import run_vllm_latency_benchmark
+from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig, save_model
+from modelopt.torch.puzzletron.subblock_stats.runtime_vllm import run_vllm_latency_benchmark
 from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
 from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
 from modelopt.torch.puzzletron.block_config import (
diff --git a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
similarity index 100%
rename from modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py
rename to modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index 0a678dc8c76..f91ba397fc1 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -33,7 +33,7 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
-from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import (
+from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import (
     calc_subblock_active_params,
     calculate_non_block_memory,
     calculate_non_block_params,
@@ -90,7 +90,9 @@ def calculate_subblock_stats(
     moe_stats_file: str | Path | None = None,
 ) -> dict:
     if runtime_stats_enabled:
-        from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks
+        from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import (
+            calc_runtime_for_subblocks,
+        )
 
     gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name()
     subblock_stats = {
diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
similarity index 100%
rename from modelopt/torch/nas/subblock_stats/runtime_utils.py
rename to modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
similarity index 97%
rename from modelopt/torch/nas/subblock_stats/runtime_vllm.py
rename to modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
index 21aa5e30bcd..75e3953ac72 100644
--- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
@@ -30,7 +30,7 @@
 import subprocess  # nosec B404
 from pathlib import Path
 
-from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig
+from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig
 
 
 def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None:
diff --git a/tests/gpu/torch/nas/test_calc_runtime_stats.py b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
similarity index 86%
rename from tests/gpu/torch/nas/test_calc_runtime_stats.py
rename to tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
index 0917f2df502..fba8d563f44 100644
--- a/tests/gpu/torch/nas/test_calc_runtime_stats.py
+++ b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
@@ -20,29 +20,21 @@
 checks the returned per-subblock runtime dict and no-block overhead.
 """
 
-from functools import partial
+import math
 from pathlib import Path
 
 import pytest
-from _test_utils.torch.distributed.utils import spawn_multiprocess_job
 from _test_utils.torch.transformers_models import get_tiny_tokenizer
+from omegaconf import OmegaConf
+
+from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig
+from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks
 
 pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks")
 
 
 def test_calc_runtime_for_subblocks(tmp_path: Path):
     """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead."""
-    spawn_multiprocess_job(size=1, job=partial(_run, tmp_path), backend="nccl")
-
-
-def _run(tmp_path: Path, rank: int, size: int):
-    import math
-
-    from omegaconf import OmegaConf
-
-    from modelopt.torch.nas.subblock_stats import calc_runtime_for_subblocks
-    from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig
-
     tokenizer = get_tiny_tokenizer()
     tokenizer_dir = tmp_path / "tokenizer"
     tokenizer.save_pretrained(str(tokenizer_dir))

From f34d3a3f3529ae1fef138bba42aec65376f70e24 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 27 May 2026 04:40:49 -0700
Subject: [PATCH 28/35] responding to reviews

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md                   |  6 +++---
 .../Llama-3_1-8B.yaml                           |  8 +-------
 modelopt/torch/puzzletron/mip/run_puzzle.py     |  6 +++---
 .../subblock_stats/calc_runtime_stats.py        | 16 +++++-----------
 .../calc_subblock_params_and_memory.py          |  8 ++++----
 .../subblock_stats/calc_subblock_stats.py       | 17 ++++++++---------
 .../puzzletron/subblock_stats/runtime_utils.py  |  4 ++--
 .../puzzletron/subblock_stats/runtime_vllm.py   |  2 +-
 8 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index 27918819c3f..d6f5b17a554 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -361,7 +361,7 @@ The runtime constraint is specified in the `human_constraints` section of the co
 
 ```yaml
 human_constraints:
-  target_latency: 21
+  target_latency_seconds: 21
 ```
 
 Run the pipeline against this config the same way as the memory-constrained variant:
@@ -371,7 +371,7 @@ torchrun --nproc_per_node 2 examples/puzzletron/main.py \
    --config examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml 2>&1 | tee ./log.txt | grep "Puzzletron Progress"
 ```
 
-The MIP solver will now search for a heterogeneous architecture whose measured end-to-end latency is at or below `target_latency`, instead of optimizing for a memory budget.
+The MIP solver will now search for a heterogeneous architecture whose estimated end-to-end latency (computed from the measured per-subblock runtimes) is at or below `target_latency_seconds`, instead of optimizing for a memory budget.
 
 Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly (already included in the example config):
 
@@ -381,7 +381,7 @@ nccl_timeout_minutes: 90  # default is 10 if omitted
 
 This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout.
 
-Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency` value of 21 seconds resulted in a final model latency of 22.3 seconds.
+Because the measured runtimes of individual subblocks do not add up linearly to the total runtime of the model, `target_latency_seconds` should be set slightly lower than the desired latency. For example, in our experiments, a `target_latency_seconds` value of 21 resulted in a final model latency of 22.3 seconds.
 
 ## Advanced Usage
 
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index eb9e2398efb..437f7006e96 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -23,14 +23,10 @@ calc_subblock_stats:
   prefill_seq_len: 4096
   generation_seq_len: 4096
   num_active_tokens_override: # Optional override for sequence lengths
-  prefill_queue_size: 0
-  allocate_prefill_query: false
   benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
-  runtime_stats:
-    backend: trt_torch
 
 scoring:
   descriptor: ${descriptor}
@@ -62,8 +58,6 @@ mip:
   subblock_stats_args:
     - batch_size: 96
       weights_dtype: torch.bfloat16
-      activations_dtype: torch.bfloat16
-      kv_cache_dtype: torch.bfloat16
 
   report_additional_costs:
     - stats.memory_mib
@@ -78,7 +72,7 @@ mip:
     - stats.attention_num_params
 
   human_constraints:
-    target_latency: 21
+    target_latency_seconds: 21
 
   mip_constraints:
   metric_overrides:
diff --git a/modelopt/torch/puzzletron/mip/run_puzzle.py b/modelopt/torch/puzzletron/mip/run_puzzle.py
index 761534f6df9..22c8b471546 100644
--- a/modelopt/torch/puzzletron/mip/run_puzzle.py
+++ b/modelopt/torch/puzzletron/mip/run_puzzle.py
@@ -79,7 +79,7 @@ class Type(enum.Enum):
     _ALLOWED_HUMAN_CONSTRAINTS = {
         "target_memory",
         "target_throughput",
-        "target_latency",
+        "target_latency_seconds",
         "target_time_to_first_token",
         "num_params",
         "stats.has_attention",
@@ -175,8 +175,8 @@ def to_mip_constraints(self, subblock_stats_args) -> dict[str, Any]:
             throughput_constraints.append(
                 batch_size * generation_seq_len / self.constraints["target_throughput"]
             )
-        if "target_latency" in self.constraints:
-            throughput_constraints.append(self.constraints["target_latency"])
+        if "target_latency_seconds" in self.constraints:
+            throughput_constraints.append(self.constraints["target_latency_seconds"])
         if throughput_constraints:
             mip_constraints["stats.runtime_ms"] = 1000 * min(throughput_constraints)
 
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
index 1610b3d7397..641fc21c24f 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
@@ -25,23 +25,17 @@
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM
 
-from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig, save_model
-from modelopt.torch.puzzletron.subblock_stats.runtime_vllm import run_vllm_latency_benchmark
-from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
-from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher
-from modelopt.torch.puzzletron.block_config import (
-    AttentionConfig,
-    BlockConfig,
-    FFNConfig,
-    SubblockConfig,
-)
+from ..anymodel.models.llama import LlamaModelDescriptor
+from ..anymodel.puzzformer import deci_x_patcher
+from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig
+from .runtime_utils import RuntimeConfig, save_model
+from .runtime_vllm import run_vllm_latency_benchmark
 
 
 def _make_standard_block_config(num_key_value_heads: int) -> BlockConfig:
     return BlockConfig(
         attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads),
         ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None),
-        parallel_blocks=None,
     )
 
 
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
index abe7a1a3884..b0772fb839f 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
@@ -30,16 +30,16 @@
 import torch
 from transformers import PretrainedConfig
 
-from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor
-from modelopt.torch.puzzletron.block_config import (
+from ..anymodel.model_descriptor import ModelDescriptor
+from ..block_config import (
     AttentionConfig,
     BlockConfig,
     FFNConfig,
     MambaConfig,
     maybe_cast_block_configs,
 )
-from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import init_model_from_config
-from modelopt.torch.puzzletron.utils.misc import (
+from ..tools.checkpoint_utils_hf import init_model_from_config
+from ..utils.misc import (
     EmptyInitOnDevice,
     calculate_kv_dim,
     raise_unknown_subblock_config_error,
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index f91ba397fc1..b5f91c5efa9 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -33,21 +33,20 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
-from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import (
-    calc_subblock_active_params,
-    calculate_non_block_memory,
-    calculate_non_block_params,
-    calculate_subblock_memory,
-    calculate_subblock_params,
-)
-from modelopt.torch.utils import json_dump
-
 from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory
 from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig
 from ..replacement_library.replacement_utils import parse_layer_replacement
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
+from ..utils import json_dump
 from ..utils.parsing import format_global_config
+from .calc_subblock_params_and_memory import (
+    calc_subblock_active_params,
+    calculate_non_block_memory,
+    calculate_non_block_params,
+    calculate_subblock_memory,
+    calculate_subblock_params,
+)
 
 __all__ = [
     "calculate_subblock_stats",
diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
index 9adb0826278..5b073a8c351 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
@@ -29,8 +29,8 @@
 import torch
 from transformers import AutoTokenizer, LlamaForCausalLM
 
-from modelopt.torch.puzzletron.anymodel.converter import Converter
-from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor
+from ..anymodel.converter import Converter
+from ..anymodel.models.llama import LlamaModelDescriptor
 
 
 @dataclass(frozen=True)
diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
index 75e3953ac72..5f996535a0f 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
@@ -30,7 +30,7 @@
 import subprocess  # nosec B404
 from pathlib import Path
 
-from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig
+from .runtime_utils import RuntimeConfig
 
 
 def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None:

From 333214995c9ed9d9e6711eb6a326ceb077abcc1e Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Wed, 27 May 2026 08:33:05 -0700
Subject: [PATCH 29/35] reenabled some vars

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml   | 4 +++-
 .../torch/puzzletron/subblock_stats/calc_subblock_stats.py    | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index 437f7006e96..0a0cc015e28 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -23,6 +23,8 @@ calc_subblock_stats:
   prefill_seq_len: 4096
   generation_seq_len: 4096
   num_active_tokens_override: # Optional override for sequence lengths
+  prefill_queue_size: 0
+  allocate_prefill_query: false
   benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
@@ -38,7 +40,7 @@ scoring:
   teacher_dir: ${to_path:${teacher_dir}}
   output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation
 
-  eval_samples: 8
+  eval_samples: 128
   micro_batch_size: 1
   seed: 42
   shuffle_seed: 444
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index b5f91c5efa9..9597063600b 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -33,12 +33,13 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from modelopt.torch.utils import json_dump
+
 from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory
 from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig
 from ..replacement_library.replacement_utils import parse_layer_replacement
 from ..tools.checkpoint_utils import load_model_config
 from ..tools.logger import mprint
-from ..utils import json_dump
 from ..utils.parsing import format_global_config
 from .calc_subblock_params_and_memory import (
     calc_subblock_active_params,

From 88e16d7c1687eee4215d485681f157465d9fd514 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 28 May 2026 14:09:23 -0700
Subject: [PATCH 30/35] added support for batch_sizes

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml    | 10 ++++++----
 .../llama-3_1-8B_pruneffn_runtime.yaml                 |  2 --
 .../puzzletron/subblock_stats/calc_runtime_stats.py    |  3 +++
 .../puzzletron/subblock_stats/calc_subblock_stats.py   |  2 +-
 .../torch/puzzletron/subblock_stats/runtime_utils.py   |  1 +
 .../torch/puzzletron/subblock_stats/runtime_vllm.py    |  2 +-
 tests/gpu/torch/puzzletron/test_calc_runtime_stats.py  |  1 +
 7 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index 0a0cc015e28..4020497f800 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -19,9 +19,9 @@ build_replacement_library:
   add_attention_no_ops: true
 
 calc_subblock_stats:
-  batch_sizes: [64, 96, 128]
-  prefill_seq_len: 4096
-  generation_seq_len: 4096
+  batch_sizes: [1, 4]
+  prefill_seq_len: 1024
+  generation_seq_len: 1024
   num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
@@ -58,7 +58,7 @@ mip:
   bigger_is_better: false
 
   subblock_stats_args:
-    - batch_size: 96
+    - batch_size: 1
       weights_dtype: torch.bfloat16
 
   report_additional_costs:
@@ -96,6 +96,8 @@ realize_model:
   shuffle_seed: 444
   dataset_path: ${dataset_path}
 
+nccl_timeout_minutes: ${timedelta_minutes:120}
+
 # This section redirects Hydra outputs
 hydra:
   run:
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
index 701c31e7c10..588df25f27d 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml
@@ -2,8 +2,6 @@ defaults:
   - Llama-3_1-8B
   - _self_
 
-nccl_timeout_minutes: ${timedelta_minutes:90}
-
 # Input Hugging Face model to compress
 input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct
 
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
index 641fc21c24f..6e4821936e7 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py
@@ -171,9 +171,11 @@ def calc_runtime_for_subblocks(
     tokenizer_path: str,
     prefill_seq_len: int,
     generation_seq_len: int,
+    batch_size: int,
 ) -> tuple[dict[SubblockConfig, float], float]:
     """Benchmark each unique subblock and return per-subblock runtimes and no-block overhead."""
     repeat_block_n_times = 10
+
     runtime_config = RuntimeConfig(
         vocab_size,
         hidden_size,
@@ -183,6 +185,7 @@ def calc_runtime_for_subblocks(
         repeat_block_n_times,
         prefill_seq_len,
         generation_seq_len,
+        batch_size,
         runtime_stats_config.get("num_iters", 30),
         runtime_stats_config.get("num_warmup_iters", 10),
     )
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index 9597063600b..4284e70a3db 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -19,7 +19,6 @@
 import copy
 import dataclasses
 import json
-import os
 import warnings
 from functools import partial
 from itertools import product
@@ -132,6 +131,7 @@ def calculate_subblock_stats(
             tokenizer_path=teacher_dir,
             prefill_seq_len=prefill_seq_len,
             generation_seq_len=generation_seq_len,
+            batch_size=batch_size,
         )
 
     sorted_subblock_config = sorted(
diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
index 5b073a8c351..3259e706c73 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py
@@ -45,6 +45,7 @@ class RuntimeConfig:
     repeat_block_n_times: int
     prefill_seq_len: int
     generation_seq_len: int
+    batch_size: int
     num_iters: int
     num_warmup_iters: int
 
diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
index 5f996535a0f..cc88fc5fe20 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
@@ -54,7 +54,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
         "--output-len",
         str(runtime_config.generation_seq_len),
         "--batch-size",
-        "1",
+        str(runtime_config.batch_size),
         "--output-json",
         str(output_json_path),
         "--max-model-len",
diff --git a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
index fba8d563f44..d976292969d 100644
--- a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
+++ b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
@@ -59,6 +59,7 @@ def test_calc_runtime_for_subblocks(tmp_path: Path):
         tokenizer_path=str(tokenizer_dir),
         prefill_seq_len=8,
         generation_seq_len=4,
+        batch_size=1,
     )
 
     assert set(runtime_by_subblock) == subblock_set

From 3f69e555336fb977e9c2d878af3e3daf7abfb012 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Thu, 28 May 2026 14:26:09 -0700
Subject: [PATCH 31/35] further fixes

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml    | 1 -
 modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py       | 3 +--
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index 4020497f800..a813f6c8259 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -25,7 +25,6 @@ calc_subblock_stats:
   num_active_tokens_override: # Optional override for sequence lengths
   prefill_queue_size: 0
   allocate_prefill_query: false
-  benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking
   merge_with_existing_stats: false
   subblock_stats_filename: "subblock_stats.json"
   moe_stats_filename: "moe_stats.json"
diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
index cc88fc5fe20..b1f3cf2fb7e 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
@@ -33,7 +33,7 @@
 from .runtime_utils import RuntimeConfig
 
 
-def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None:
+def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float:
     """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms.
 
     Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA
@@ -77,7 +77,6 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
     ]
 
     # cmd is a fixed list of strings (no shell, no untrusted input).
-    vllm_results = None
     try:
         subprocess.run(
             cmd,

From 36f46855a690dfc14511e932162e729efe46a6a6 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Sun, 31 May 2026 01:38:03 -0700
Subject: [PATCH 32/35] using 5s latency target in the example

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 examples/puzzletron/README.md                                   | 2 +-
 .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md
index d6f5b17a554..dce76866d6d 100644
--- a/examples/puzzletron/README.md
+++ b/examples/puzzletron/README.md
@@ -381,7 +381,7 @@ nccl_timeout_minutes: 90  # default is 10 if omitted
 
 This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout.
 
-Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency_seconds` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency_seconds` value of 21 resulted in a final model latency of 22.3 seconds.
+Because the measured runtimes of individual subblocks do not add up linearly to the total runtime of the model, set `target_latency_seconds` slightly lower than the latency you actually want. For example, in our experiments, a `target_latency_seconds` value of 5 resulted in a final model latency of 5.4 seconds.
 
 ## Advanced Usage
 
diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
index a813f6c8259..b4adbb82add 100644
--- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
+++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml
@@ -73,7 +73,7 @@ mip:
     - stats.attention_num_params
 
   human_constraints:
-    target_latency_seconds: 21
+    target_latency_seconds: 5
 
   mip_constraints:
   metric_overrides:

From b1b810f16fa0b588b2546c516cd512bdd7cf5431 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 8 Jun 2026 03:57:49 -0700
Subject: [PATCH 33/35] added vllm adapter

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../puzzletron/subblock_stats/runtime_vllm.py |  12 ++
 .../torch/puzzletron/utils/vllm_adapter.py    | 203 ++++++++++++++++++
 2 files changed, 215 insertions(+)
 create mode 100644 modelopt/torch/puzzletron/utils/vllm_adapter.py

diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
index b1f3cf2fb7e..14eb337b707 100644
--- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
+++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py
@@ -30,6 +30,8 @@
 import subprocess  # nosec B404
 from pathlib import Path
 
+from ..tools.logger import mprint
+from ..utils.vllm_adapter import convert_block_configs_to_per_layer_config
 from .runtime_utils import RuntimeConfig
 
 
@@ -43,6 +45,16 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig)
     output_json_path = model_path / "vllm_latency_benchmark.json"
     max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len
 
+    with open(model_path / "config.json") as f:
+        config = json.load(f)
+
+    if convert_block_configs_to_per_layer_config(config):
+        mprint("Converted block configs to per-layer config")
+        with open(model_path / "config.json", "w") as f:
+            json.dump(config, f, indent=2)
+    else:
+        mprint("No block configs to convert")
+
     cmd = [
         "vllm",
         "bench",
diff --git a/modelopt/torch/puzzletron/utils/vllm_adapter.py b/modelopt/torch/puzzletron/utils/vllm_adapter.py
new file mode 100644
index 00000000000..ae8409a1de7
--- /dev/null
+++ b/modelopt/torch/puzzletron/utils/vllm_adapter.py
@@ -0,0 +1,203 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ModelOpt/AnyModel -> vLLM/AnyModel config adapter.
+
+ModelOpt/AnyModel checkpoints describe per-layer overrides via a dense
+``block_configs`` list with nested ``attention`` / ``ffn`` sub-sections.
+AnyModel in vLLM now consumes the HuggingFace heterogeneity schema: a sparse
+``per_layer_config`` dict mapping ``layer_idx -> {flat HF keys + optional
+"skip" list}``.
+
+This module rewrites the Puzzletron schema in-place so vLLM only
+ever sees ``per_layer_config``. It is invoked from
+``AnyModelConfig.verify_and_update_model_config`` before the arch
+convertor or layer-patching runs.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+# Base architecture -> (num_experts_field, moe_intermediate_size_field).
+# ModelOpt always writes ``moe.num_local_experts`` and ``moe.expert_intermediate_{size,dim}``;
+# the adapter rewrites them into the field names the base HF config actually reads.
+_MOE_FIELDS_BY_ARCH: dict[str, tuple[str, str]] = {
+    "Qwen2MoeForCausalLM": ("num_experts", "moe_intermediate_size"),
+    "Qwen3MoeForCausalLM": ("num_experts", "moe_intermediate_size"),
+    "MixtralForCausalLM": ("num_local_experts", "intermediate_size"),
+    "GptOssForCausalLM": ("num_local_experts", "intermediate_size"),
+    "NemotronHForCausalLM": ("n_routed_experts", "moe_intermediate_size"),
+    "DeepseekV3ForCausalLM": ("n_routed_experts", "moe_intermediate_size"),
+    "DeepseekV2ForCausalLM": ("n_routed_experts", "moe_intermediate_size"),
+}
+
+# Fallback field pair for base architectures missing from ``_MOE_FIELDS_BY_ARCH``.
+_DEFAULT_MOE_FIELDS: tuple[str, str] = ("num_local_experts", "intermediate_size")
+
+
+def _get(obj: Any, key: str, default: Any = None) -> Any:
+    """Get ``key`` from a dict item or object attribute; ``default`` if missing or obj is None."""
+    if isinstance(obj, dict):
+        return obj.get(key, default)
+    # Short-circuit ``None`` so NoneType's own attributes are never returned.
+    return default if obj is None else getattr(obj, key, default)
+
+
+def _convert_block_entry(
+    block: Any,
+    *,
+    global_kv: int | None,
+    global_isize: int | None,
+    global_hact: str | None,
+    global_moe_num: int | None,
+    global_moe_size: int | None,
+    moe_num_field: str,
+    moe_size_field: str,
+) -> dict[str, Any]:
+    """Build the flat per-layer override dict for one ``block_configs`` entry.
+
+    Values equal to the matching ``global_*`` argument are omitted. A
+    sub-module flagged ``no_op`` is listed under ``"skip"`` (``"attention"`` or
+    ``"mlp"``) and contributes no overrides; MoE values are stored under
+    ``moe_num_field`` / ``moe_size_field``.
+    """
+    attention_cfg = _get(block, "attention") or {}
+    ffn_cfg = _get(block, "ffn") or {}
+    skip_attention = bool(_get(attention_cfg, "no_op", False))
+    skip_ffn = bool(_get(ffn_cfg, "no_op", False))
+
+    overrides: dict[str, Any] = {}
+    skipped = [name for name, off in (("attention", skip_attention), ("mlp", skip_ffn)) if off]
+    if skipped:
+        overrides["skip"] = skipped
+
+    def _override(field: str, value: Any, global_value: Any) -> None:
+        # Record only values that are set and deviate from the global config.
+        if value is not None and value != global_value:
+            overrides[field] = value
+
+    if not skip_attention:
+        _override("num_key_value_heads", _get(attention_cfg, "num_key_value_heads"), global_kv)
+
+    # A skipped FFN contributes nothing beyond its "skip" entry.
+    if skip_ffn:
+        return overrides
+
+    _override("intermediate_size", _get(ffn_cfg, "intermediate_size"), global_isize)
+    _override("hidden_act", _get(ffn_cfg, "hidden_act"), global_hact)
+
+    moe_cfg = _get(ffn_cfg, "moe")
+    if not moe_cfg:
+        return overrides
+
+    # The first expert-count spelling holding a non-None value wins.
+    expert_count = None
+    for count_key in ("num_local_experts", "num_experts", "n_routed_experts"):
+        expert_count = _get(moe_cfg, count_key)
+        if expert_count is not None:
+            break
+    _override(moe_num_field, expert_count, global_moe_num)
+
+    # ``expert_intermediate_dim`` is the fallback when ``expert_intermediate_size`` is absent.
+    fallback_size = _get(moe_cfg, "expert_intermediate_dim")
+    expert_size = _get(moe_cfg, "expert_intermediate_size", fallback_size)
+    _override(moe_size_field, expert_size, global_moe_size)
+
+    return overrides
+
+
+def convert_block_configs_to_per_layer_config(hf_config: Any) -> bool:
+    """In-place: convert legacy ``block_configs`` on ``hf_config`` (object or
+    parsed ``config.json`` dict) to ``per_layer_config`` on its text config.
+
+    Returns ``True`` if a conversion happened, ``False`` if there was
+    nothing to convert. If ``per_layer_config`` is already set, the legacy
+    field is dropped and a warning emitted (the new schema wins).
+    """
+    block_configs = _get(hf_config, "block_configs")
+    if not block_configs:
+        return False
+
+    text_config = (
+        hf_config.get_text_config() if hasattr(hf_config, "get_text_config") else hf_config
+    )
+
+    def _drop_block_configs() -> None:
+        # A parsed-JSON dict stores the field as a key, a config object as an attribute.
+        if isinstance(hf_config, dict):
+            hf_config.pop("block_configs", None)
+            return
+        try:
+            delattr(hf_config, "block_configs")
+        except AttributeError:
+            pass
+
+    if _get(text_config, "per_layer_config"):
+        logger.warning_once(
+            "AnyModel config has both legacy 'block_configs' and new "
+            "'per_layer_config'; using per_layer_config and ignoring "
+            "block_configs."
+        )
+        _drop_block_configs()
+        return False
+
+    base_architecture = _get(hf_config, "base_architecture") or ""
+    moe_num_field, moe_size_field = _MOE_FIELDS_BY_ARCH.get(base_architecture, _DEFAULT_MOE_FIELDS)
+
+    # Global values are looked up once; only per-layer deviations are emitted.
+    shared_kwargs: dict[str, Any] = {
+        "global_kv": _get(text_config, "num_key_value_heads"),
+        "global_isize": _get(text_config, "intermediate_size"),
+        "global_hact": _get(text_config, "hidden_act"),
+        "global_moe_num": _get(text_config, moe_num_field),
+        "global_moe_size": _get(text_config, moe_size_field),
+        "moe_num_field": moe_num_field,
+        "moe_size_field": moe_size_field,
+    }
+    per_layer_config: dict[str, dict[str, Any]] = {}
+    for idx, block in enumerate(block_configs):
+        entry = _convert_block_entry(block, **shared_kwargs)
+        if entry:
+            per_layer_config[str(idx)] = entry
+
+    n_layers = _get(text_config, "num_hidden_layers")
+    if n_layers is not None and len(block_configs) != n_layers:
+        logger.warning(
+            "block_configs length (%d) does not match num_hidden_layers "
+            "(%d); converted entries beyond num_hidden_layers will fail "
+            "AnyModel validation.",
+            len(block_configs),
+            n_layers,
+        )
+
+    if isinstance(text_config, dict):
+        text_config["per_layer_config"] = per_layer_config
+    else:
+        setattr(text_config, "per_layer_config", per_layer_config)
+    _drop_block_configs()
+
+    logger.info(
+        "Converted ModelOpt block_configs (%d entries) to AnyModel "
+        "per_layer_config (%d non-empty entries) for base_architecture=%r.",
+        len(block_configs),
+        len(per_layer_config),
+        base_architecture or "<unknown>",
+    )
+    return True

From f49fbc976523b52346b7afd10ec1cbeb1212ec07 Mon Sep 17 00:00:00 2001
From: Grzegorz Karch <gkarch@nvidia.com>
Date: Mon, 8 Jun 2026 04:15:51 -0700
Subject: [PATCH 34/35] disabled vllm tests that depends on anymodel

Signed-off-by: Grzegorz Karch <gkarch@nvidia.com>
---
 .../torch/puzzletron/test_calc_runtime_stats.py                  | 1 +
 1 file changed, 1 insertion(+)
 rename tests/{gpu => gpu_vllm}/torch/puzzletron/test_calc_runtime_stats.py (97%)

diff --git a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
similarity index 97%
rename from tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
rename to tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
index d976292969d..bff5438db0a 100644
--- a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py
+++ b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
@@ -33,6 +33,7 @@
 pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks")
 
 
+@pytest.mark.skip(reason="AnyModel is not supported in vLLM yet")
 def test_calc_runtime_for_subblocks(tmp_path: Path):
     """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead."""
     tokenizer = get_tiny_tokenizer()

From d6e1c6bd14a7a7e3040f783873bb056f7898938c Mon Sep 17 00:00:00 2001
From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
Date: Mon, 8 Jun 2026 11:14:24 -0700
Subject: [PATCH 35/35] Fix CI failures

Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com>
---
 docs/source/conf.py                           |   2 +-
 .../kernels/sparsity/attention/calibrate.py   |  13 +-
 .../calc_subblock_params_and_memory.py        | 111 ++++++++----------
 .../subblock_stats/calc_subblock_stats.py     |   5 +-
 noxfile.py                                    |   2 +-
 .../puzzletron/test_calc_runtime_stats.py     |   2 -
 6 files changed, 56 insertions(+), 79 deletions(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 6fe7a860024..47f997a0113 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -124,7 +124,7 @@
 
 
 # Mock imports for autodoc
-autodoc_mock_imports = ["mpi4py", "tensorrt_llm", "triton"]
+autodoc_mock_imports = ["mpi4py", "tensorrt_llm", "triton", "vllm"]
 
 autosummary_generate = True
 autosummary_imported_members = False
diff --git a/modelopt/torch/kernels/sparsity/attention/calibrate.py b/modelopt/torch/kernels/sparsity/attention/calibrate.py
index 971c423f711..61707f63013 100644
--- a/modelopt/torch/kernels/sparsity/attention/calibrate.py
+++ b/modelopt/torch/kernels/sparsity/attention/calibrate.py
@@ -200,17 +200,18 @@ def attention_calibrate(
     measuring how many KV tiles would be skipped at each threshold in
     ``threshold_trials``. No autograd — forward only.
 
+    All arguments except ``threshold_trials`` match
+    :func:`modelopt.torch.kernels.common.attention.attention`.
+
     Args:
-        q, k, v, b_start_loc, b_seq_len, max_input_len, is_causal,
-        softmax_scale, b_start_loc_k, b_seq_len_k, max_input_len_k:
-            Same as :func:`modelopt.torch.kernels.common.attention.attention`.
         threshold_trials: List of threshold values to measure sparsity for.
             Each value is converted to log2-scaled space for the kernel.
 
     Returns:
-        Tuple of (output, sparsity_counters):
-        - output: ``[total_q_tokens, num_q_heads, head_dim]``
-        - sparsity_counters: ``[num_thresholds, 2]`` int64 tensor where
+        Tuple of ``(output, sparsity_counters)``:
+
+        - ``output``: ``[total_q_tokens, num_q_heads, head_dim]``
+        - ``sparsity_counters``: ``[num_thresholds, 2]`` int64 tensor where
           ``[:, 0]`` = total tile evaluations, ``[:, 1]`` = skipped tiles.
           Sparsity per threshold = ``counters[:, 1] / counters[:, 0]``.
     """
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
index b0772fb839f..531f7a3f0a1 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py
@@ -79,21 +79,21 @@ def calculate_subblock_memory(
     Given its configuration and runtime dimensions, returns bytes or a detailed dict.
 
     Args:
-        subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass.
-        batch_size (int): Batch size for memory estimate.
-        prefill_seq_len (int): Sequence length for prefill phase.
-        generation_seq_len (int): Sequence length for generation phase (token-by-token).
-        prefill_queue_size (int): Token queue size for prefill attention memory allocation.
-        n_embd (int): Embedding (hidden) dimension.
-        n_head (int): Number of attention heads (used for non-FFN).
-        weights_dtype (torch.dtype): PyTorch dtype for model weights.
-        kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache.
-        allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens.
-        model_config (PretrainedConfig): HuggingFace-style config instance describing the model.
-        descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types).
+        subblock_config: Subblock configuration dataclass.
+        batch_size: Batch size for memory estimate.
+        prefill_seq_len: Sequence length for prefill phase.
+        generation_seq_len: Sequence length for generation phase (token-by-token).
+        prefill_queue_size: Token queue size for prefill attention memory allocation.
+        n_embd: Embedding (hidden) dimension.
+        n_head: Number of attention heads (used for non-FFN).
+        weights_dtype: PyTorch dtype for model weights.
+        kv_cache_dtype: PyTorch dtype for KV cache.
+        allocate_prefill_query: Whether to allocate query cache for prefill tokens.
+        model_config: HuggingFace-style config instance describing the model.
+        descriptor: Model descriptor type (for puzzletron model types).
 
     Returns:
-        float | dict[str, float]: Memory usage in bytes (float), or a dictionary by memory type.
+        Memory usage in bytes (float), or a dictionary by memory type.
     """
     if subblock_config.no_op:
         return 0
@@ -229,7 +229,7 @@ def calc_subblock_active_params(
         block_idx: The index of the block/subblock within the network, used to index into the stats.
 
     Returns:
-        int: The expected number of "active" parameters for the given subblock.
+        The expected number of "active" parameters for the given subblock.
     """
     if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe):
         return calculate_subblock_params(model_config, sublayer_config, descriptor)
@@ -245,12 +245,12 @@ def load_moe_stats(stats_file: str) -> dict:
     It returns the normalized probability distributions over experts for each block, as a list of numpy arrays.
 
     Args:
-        stats_file (str): Path to the JSON file containing expert routing statistics for each block.
+        stats_file: Path to the JSON file containing expert routing statistics for each block.
 
     Returns:
-        list[np.ndarray]: A list where each element is a numpy array containing the normalized probability
-            distribution over experts for the corresponding block. If a block's expert list is empty,
-            its entry is 0.
+        A list where each element is a numpy array containing the normalized probability
+        distribution over experts for the corresponding block. If a block's expert list is empty,
+        its entry is 0.
     """
     with open(stats_file) as f:
         stats = json.load(f)
@@ -271,12 +271,12 @@ def estimate_num_active_experts(
     expected number of active (i.e., selected at least once) experts is computed.
 
     Args:
-        dist_over_experts (np.ndarray): A 1D array of probabilities for each expert.
-        batch_size (int): The number of samples in the batch.
-        num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter).
+        dist_over_experts: A 1D array of probabilities for each expert.
+        batch_size: The number of samples in the batch.
+        num_experts: The maximum number of experts to consider (fewer if `dist_over_experts` is shorter).
 
     Returns:
-        int: The expected number of experts selected at least once across the batch.
+        The expected number of experts selected at least once across the batch.
     """
     # cut the tail and renormalize
     dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts]
@@ -296,14 +296,14 @@ def estimate_moe_active_params(
     """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock.
 
     Args:
-        subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured).
-        n_embd (int): The embedding dimension (input and output size per expert).
-        moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts.
-        batch_size (int): Batch size to simulate/extrapolate expected expert use.
-        block_idx (int): The index of the block/layer whose expert routing statistics should be used.
+        subblock_config: The FFNConfig for the MoE subblock (with .moe field configured).
+        n_embd: The embedding dimension (input and output size per expert).
+        moe_stats_file: Path to the JSON file containing routing/selection probabilities for experts.
+        batch_size: Batch size to simulate/extrapolate expected expert use.
+        block_idx: The index of the block/layer whose expert routing statistics should be used.
 
     Returns:
-        int: Estimated number of parameters actively used for the current batch and expert selection statistics.
+        Estimated number of parameters actively used for the current batch and expert selection statistics.
     """
     assert Path(moe_stats_file).exists()
     # if not Path(moe_stats_file).exists(): # if path is not provided, should we assume uniform distribution?
@@ -382,16 +382,15 @@ def calculate_mamba_memory(
     """Calculate memory usage (MiB) for a Mamba attention subblock.
 
     Args:
-        attention_config (AttentionConfig): Mamba attention configuration,
-            including Mamba-specific settings.
-        model_config (PretrainedConfig): Model configuration.
-        descriptor (type[ModelDescriptor]): Model descriptor class.
-        batch_size (int): Batch size for memory estimate.
-        weights_dtype (torch.dtype): Data type for model weights.
-        kv_cache_dtype (torch.dtype): Data type for state/kv-cache.
+        attention_config: Mamba attention configuration, including Mamba-specific settings.
+        model_config: Model configuration.
+        descriptor: Model descriptor class.
+        batch_size: Batch size for memory estimate.
+        weights_dtype: Data type for model weights.
+        kv_cache_dtype: Data type for state/kv-cache.
 
     Returns:
-        int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock.
+        Estimated memory usage in mebibytes (MiB) for the Mamba subblock.
     """
     assert attention_config.mamba is not None
     mamba_config = attention_config.mamba
@@ -409,11 +408,11 @@ def calculate_mamba_state_size(
     """Calculate the total state size for a Mamba attention subblock.
 
     Args:
-        mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters.
-        batch_size (int): Batch size to estimate the memory/state requirements for.
+        mamba_config: Configuration object containing Mamba subblock parameters.
+        batch_size: Batch size to estimate the memory/state requirements for.
 
     Returns:
-        int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state.
+        Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state.
     """
     _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config)
     conv_state_size = math.prod((batch_size, conv_dim, kernel_size))
@@ -443,15 +442,14 @@ def calculate_ffn_memory(
     """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock.
 
     Args:
-        ffn_config (FFNConfig): FFN configuration for the block.
-        model_config (PretrainedConfig): The parent model configuration.
-        descriptor (type[ModelDescriptor]): Model descriptor class.
-        weights_dtype (torch.dtype | str): Data type for FFN weights.
-        experts_dtype (torch.dtype | str | None, optional): Data type for expert weights
-            (for MoE layers, if present). Defaults to None.
+        ffn_config: FFN configuration for the block.
+        model_config: The parent model configuration.
+        descriptor: Model descriptor class.
+        weights_dtype: Data type for FFN weights.
+        experts_dtype: Data type for expert weights (for MoE layers, if present).
 
     Returns:
-        float: Estimated FFN memory usage in mebibytes (MiB).
+        Estimated FFN memory usage in mebibytes (MiB).
     """
     # TODO: How to separate between expert weights and the rest for any model (same as puzzletron).
     num_params = calculate_subblock_params(model_config, ffn_config, descriptor)
@@ -463,16 +461,7 @@ def calculate_non_block_memory(
     vocab_size: int,
     weight_dtype: torch.dtype,
 ) -> float:
-    """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection).
-
-    Args:
-        n_embd (int): Embedding dimension (hidden size).
-        vocab_size (int): Vocabulary size.
-        weight_dtype (torch.dtype): Data type for model weights.
-
-    Returns:
-        float: Estimated non-subblock memory usage in mebibytes (MiB).
-    """
+    """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection)."""
     return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20
 
 
@@ -480,13 +469,5 @@ def calculate_non_block_params(
     n_embd: int,
     vocab_size: int,
 ) -> int:
-    """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection).
-
-    Args:
-        n_embd (int): Embedding dimension (hidden size).
-        vocab_size (int): Vocabulary size.
-
-    Returns:
-        int: Estimated non-subblock parameter count.
-    """
+    """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection)."""
     return vocab_size * n_embd * 2 + n_embd
diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
index 4284e70a3db..1d04cc01add 100644
--- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
+++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py
@@ -414,10 +414,7 @@ def _load_subblock_configs_from_replacement_library(
     4 intermediate_size + teacher_intermediate_size + ffn_noop + att_op (teacher) + att_noop.
 
     Args:
-        master_puzzle_dir (Path): Directory with "replacement_library.json" file
-
-    Returns:
-        list[SubblockConfig]:
+        master_puzzle_dir: Directory with "replacement_library.json" file
     """
     replacement_library = json.loads((master_puzzle_dir / "replacement_library.json").read_text())
     subblock_configs = set()
diff --git a/noxfile.py b/noxfile.py
index 059f351b7f9..1a28321bbd7 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -142,7 +142,7 @@ def gpu_trtllm(session):
 # Pin must stay in sync with examples/vllm_serve/Dockerfile.
 @nox.session(venv_backend="none")
 def gpu_vllm(session):
-    session.run("python3", "-m", "pip", "install", "-e", ".[hf,dev-test]")
+    session.run("python3", "-m", "pip", "install", "-e", ".[hf,puzzletron,dev-test]")
     session.run("python3", "-m", "pytest", "tests/gpu_vllm", *_cov_args())
 
 
diff --git a/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
index bff5438db0a..377a2ffed19 100644
--- a/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
+++ b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py
@@ -30,8 +30,6 @@
 from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig
 from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks
 
-pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks")
-
 
 @pytest.mark.skip(reason="AnyModel is not supported in vLLM yet")
 def test_calc_runtime_for_subblocks(tmp_path: Path):