From 816ddfaa063ffd355ae8369eebdf469ae098c8dc Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 28 Apr 2026 00:29:17 -0700 Subject: [PATCH 01/35] enabling runtime optimization Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/__init__.py | 15 + .../nas/subblock_stats/calc_runtime_stats.py | 271 ++++++++++++++++++ .../calc_subblock_params_and_memory.py | 8 +- .../puzzletron/subblock_stats/__init__.py | 1 - .../subblock_stats/calc_subblock_stats.py | 131 ++------- 5 files changed, 317 insertions(+), 109 deletions(-) create mode 100644 modelopt/torch/nas/subblock_stats/__init__.py create mode 100644 modelopt/torch/nas/subblock_stats/calc_runtime_stats.py rename modelopt/torch/{puzzletron => nas}/subblock_stats/calc_subblock_params_and_memory.py (97%) diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py new file mode 100644 index 00000000000..ff8d16685d6 --- /dev/null +++ b/modelopt/torch/nas/subblock_stats/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .calc_runtime_stats import calc_runtime_for_subblocks diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py new file mode 100644 index 00000000000..da525c0bdcc --- /dev/null +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -0,0 +1,271 @@ +import json +import os +import subprocess +import tempfile +from dataclasses import dataclass, replace +from pathlib import Path + +import torch +from omegaconf import DictConfig +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM + +from modelopt.torch.puzzletron.anymodel.converter import Converter +from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor +from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher +from modelopt.torch.puzzletron.block_config import ( + AttentionConfig, + BlockConfig, + FFNConfig, + SubblockConfig, +) + + +def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig: + return BlockConfig( + attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads), + ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None), + parallel_blocks=None, + ) + + +def create_benchmark_model( + vocab_size: int, + hidden_size: int, + num_attention_heads: int, + prefill_seq_len: int, + generation_seq_len: int, + block_config: BlockConfig | None, + repeat_block_n_times: int = 10, +) -> LlamaForCausalLM: + + block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)] + + if block_config: + block_configs.extend([block_config] * repeat_block_n_times) + + model_config = LlamaConfig( + max_position_embeddings=prefill_seq_len + generation_seq_len, + vocab_size=vocab_size, + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_hidden_layers=len(block_configs), + head_dim=None, # Compute from hidden_size // num_attention_heads instead of 
using default 128 + # this is required for trt-llm conversion to know which model classes to use for the checkpoint + auto_map={ + "AutoConfig": "transformers.models.llama.configuration_llama.LlamaConfig", + "AutoModelForCausalLM": "transformers.models.llama.modeling_llama.LlamaForCausalLM", + }, + ) + + for idx, block_config in enumerate(block_configs): + block_configs[idx] = block_config.to_dict() + model_config.block_configs = block_configs + + with deci_x_patcher(LlamaModelDescriptor, block_configs): + model = AutoModelForCausalLM.from_config(model_config) + + model.config.architectures = ["AnyModel"] + model.config.base_architecture = "LlamaForCausalLM" + + return model + + +def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int): + + # Save standard model checkpoint (as safetensors, HF format) + model.save_pretrained(output_dir, safe_serialization=True) + + # Convert/slice weights into AnyModel subblock_safetensors format + Converter.convert_model_weights( + input_dir=output_dir, + output_dir=output_dir, + descriptor=descriptor, + num_hidden_layers=num_hidden_layers, + ) + # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk. 
+ + config_path = output_dir / "config.json" + if config_path.exists(): + with open(config_path) as f: + config_data = json.load(f) + config_data["architectures"] = ["AnyModel"] + with open(config_path, "w") as f: + json.dump(config_data, f, indent=2) + + +def save_model( + model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int +) -> None: + + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer.save_pretrained(output_path) + + +@dataclass(frozen=True) +class RuntimeConfig: + vocab_size: int + hidden_size: int + num_attention_heads: int + master_puzzle_dir: str + tokenizer_path: str + synth_dataset_num_requests: int + repeat_block_n_times: int + prefill_seq_len: int + generation_seq_len: int + batch_size: int + num_iters: int + num_warmup_iters: int + + +def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): + + output_json_path = model_path / "vllm_latency_benchmark.json" + + cmd = [ + "vllm", + "bench", + "latency", + "--model", + str(model_path), + "--input-len", + str(runtime_config.prefill_seq_len), + "--output-len", + str(runtime_config.generation_seq_len), + "--batch-size", + str(runtime_config.batch_size), + "--output-json", + str(output_json_path), + "--max-model-len", + str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len), + "--num-iters-warmup", + str(runtime_config.num_warmup_iters), + "--num-iters", + str(runtime_config.num_iters), + "--max-num-seqs", + "1", + "--distributed-executor-backend", + "external_launcher", + "--tensor-parallel-size", + "1", + "--pipeline-parallel-size", + "1", + ] + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + subprocess.run(cmd) + + with open(output_json_path) as f: + vllm_results = json.load(f) + print(vllm_results) + return vllm_results["avg_latency"] * 1000 # convert to 
milliseconds + + +def calc_subblock_runtime( + runtime_config: RuntimeConfig, + subblock_config: SubblockConfig, +) -> float: + + block_config: BlockConfig | None = None + + if subblock_config is not None: + if isinstance(subblock_config, BlockConfig): + block_config = subblock_config + elif isinstance(subblock_config, (AttentionConfig, FFNConfig)): + block_config = subblock_config.to_blockconfig() + else: + raise Exception(f"Runtime stats: Not supported subblock type: {subblock_config}") + + model = create_benchmark_model( + runtime_config.vocab_size, + runtime_config.hidden_size, + runtime_config.num_attention_heads, + runtime_config.prefill_seq_len, + runtime_config.generation_seq_len, + block_config=block_config, + repeat_block_n_times=runtime_config.repeat_block_n_times, + ) + with tempfile.TemporaryDirectory() as model_tmpdir: + save_model( + model, + Path(runtime_config.tokenizer_path), + Path(model_tmpdir), + num_hidden_layers=runtime_config.repeat_block_n_times + 1, + ) + subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config) + + return subblock_total_runtime_ms + + +def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: + + runtime_config1 = replace(runtime_config, repeat_block_n_times=0) + runtime_config10 = replace(runtime_config, repeat_block_n_times=9) + + block_config = _make_standard_block_config( + runtime_config.hidden_size, runtime_config.num_attention_heads + ) + + runtime_ms1 = calc_subblock_runtime(runtime_config1, None) + runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config) + + no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9 + + return no_block_runtime_ms + + +def calc_runtime_for_subblocks( + subblock_config_set: set[SubblockConfig], + runtime_stats_config: DictConfig, + vocab_size: int, + hidden_size: int, + num_attention_heads: int, + master_puzzle_dir: str, + tokenizer_path: str, + synth_dataset_num_requests: int, + prefill_seq_len: int, + 
generation_seq_len: int, +) -> tuple[dict[SubblockConfig, float], float]: + + repeat_block_n_times = 10 + runtime_config = RuntimeConfig( + vocab_size, + hidden_size, + num_attention_heads, + master_puzzle_dir, + tokenizer_path, + synth_dataset_num_requests, + repeat_block_n_times, + prefill_seq_len, + generation_seq_len, + runtime_stats_config.get("batch_size", 1), + runtime_stats_config.get("num_iters", 30), + runtime_stats_config.get("num_warmup_iters", 10), + ) + + runtime_by_subblock_dict = {} + + baseline_runtime_ms = calc_subblock_runtime(runtime_config, None) + + for subblock_config in tqdm( + sorted(subblock_config_set), + desc=( + f"Computing runtime_by_subblock_dict [hidden_size={hidden_size}, " + f"num_subblocks={len(subblock_config_set)}]" + ), + ): + if subblock_config.no_op: + total_runtime_ms = 0.0 + else: + subblock_total_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config) + total_runtime_ms = ( + subblock_total_runtime_ms - baseline_runtime_ms + ) / repeat_block_n_times + + runtime_by_subblock_dict[subblock_config] = total_runtime_ms + + no_block_runtime_ms = calc_no_block_runtime(runtime_config) + + return runtime_by_subblock_dict, no_block_runtime_ms diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py similarity index 97% rename from modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py rename to modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py index d893eb55bb3..3938bb55596 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py @@ -31,16 +31,16 @@ import torch from transformers import PretrainedConfig -from ..anymodel.model_descriptor import ModelDescriptor -from ..block_config import ( +from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor +from 
modelopt.torch.puzzletron.block_config import ( AttentionConfig, BlockConfig, FFNConfig, MambaConfig, maybe_cast_block_configs, ) -from ..tools.checkpoint_utils_hf import init_model_from_config -from ..utils.misc import ( +from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import init_model_from_config +from modelopt.torch.puzzletron.utils.misc import ( EmptyInitOnDevice, calculate_kv_dim, raise_unknown_subblock_config_error, diff --git a/modelopt/torch/puzzletron/subblock_stats/__init__.py b/modelopt/torch/puzzletron/subblock_stats/__init__.py index fbbeb3ff709..4964dba0cfa 100644 --- a/modelopt/torch/puzzletron/subblock_stats/__init__.py +++ b/modelopt/torch/puzzletron/subblock_stats/__init__.py @@ -15,5 +15,4 @@ """Subblock statistics collection for Puzzletron.""" -from .calc_subblock_params_and_memory import * from .calc_subblock_stats import * diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index dc89a1f6450..f36a71710a3 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -24,7 +24,7 @@ from functools import partial from itertools import product from pathlib import Path -from typing import Iterable, Optional, Type, TypeVar +from typing import Iterable, Type, TypeVar import pandas as pd import torch @@ -41,7 +41,7 @@ from ..tools.checkpoint_utils import load_model_config from ..tools.logger import mprint from ..utils.parsing import format_global_config -from .calc_subblock_params_and_memory import ( +from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import ( calc_subblock_active_params, calculate_non_block_memory, calculate_non_block_params, @@ -52,7 +52,6 @@ __all__ = [ "calculate_subblock_stats", "launch_calc_subblock_stats", - "add_int8_runtime_estimates", ] # Type variable for dataclasses @@ -60,10 +59,10 @@ """ Usage: -python -m 
modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --benchmark_iterations 1000 ] +python -m modelopt.torch.puzzletron.subblock_stats.calc_subblock_stats PUZZLE_DIR [ --runtime_stats ] ---benchmark_iterations=None (the default) means that the code won't use infery to benchmark runtime, - only memory stats will be calculated. If you want to benchmark runtime, run inside an infery-llm docker. +--runtime_stats_enabled=False (the default) means that the code won't benchmark runtime, + only memory stats will be calculated. If you want to benchmark runtime, run inside a trtllm docker. """ @@ -82,7 +81,7 @@ def calculate_subblock_stats( n_embd: int, n_head: int, vocab_size: int, - benchmark_iterations: Optional[int], + runtime_stats_enabled: bool, use_cuda_graph: bool, weights_dtype: torch.dtype, activations_dtype: torch.dtype, @@ -90,14 +89,12 @@ def calculate_subblock_stats( allocate_prefill_query: bool, moe_stats_file: str | Path | None = None, ) -> dict: - is_calc_runtime = benchmark_iterations is not None - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + if runtime_stats_enabled: + from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name() subblock_stats = { "args": dict( - is_calc_runtime=is_calc_runtime, gpu=gpu, batch_size=batch_size, prefill_seq_len=prefill_seq_len, @@ -106,7 +103,7 @@ def calculate_subblock_stats( n_embd=n_embd, n_head=n_head, vocab_size=vocab_size, - benchmark_iterations=benchmark_iterations, + runtime_stats=runtime_stats_enabled, use_cuda_graph=use_cuda_graph, weights_dtype=str(weights_dtype), activations_dtype=str(activations_dtype), @@ -116,8 +113,7 @@ def calculate_subblock_stats( "subblocks": list(), } # Compute runtime stats for unique subblocks only - if is_calc_runtime: - raise NotImplementedError("Runtime stats calculation is not implemented yet") + 
if runtime_stats_enabled: subblock_configs_nolayerindex = set( [subblock_config["subblock_config"] for subblock_config in subblock_configs] ) @@ -127,16 +123,19 @@ def calculate_subblock_stats( synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get( "synth_dataset_num_requests", 200 ) - backend = calc_subblock_stats_config.get("runtime_stats", {}).get("backend", "trt_torch") - runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_ms_for_subblocks( + runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) + + runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( subblock_configs_nolayerindex, + runtime_stats_config, vocab_size, n_embd, n_head, master_puzzle_dir, teacher_dir, synth_dataset_num_requests, - backend, + prefill_seq_len, + generation_seq_len, ) sorted_subblock_config = sorted( @@ -144,7 +143,7 @@ def calculate_subblock_stats( ) it = ( tqdm(sorted_subblock_config, desc="Measuring subblock runtimes") - if is_calc_runtime + if runtime_stats_enabled else sorted_subblock_config ) for subblock_config_indexed in it: @@ -156,7 +155,7 @@ def calculate_subblock_stats( descriptor.get_language_model_config(layer_model_config), parent_layer_indices[0] ) - if is_calc_runtime: + if runtime_stats_enabled: total_runtime_ms = runtime_by_subblock_dict[subblock_config] prefill_runtime_ms = None decode_runtime_ms = None @@ -207,25 +206,13 @@ def calculate_subblock_stats( } ) - if is_calc_runtime: - # TODO: fix - # from puzzle_tools.calc_subblock_runtime import measure_non_block_runtime_ms - # non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = \ - # measure_non_block_runtime_ms(batch_size, prefill_seq_len, generation_seq_len, n_embd, vocab_size, - # benchmark_iterations, use_cuda_graph) - embedding_runtime_ms, lm_head_runtime_ms = None, None - else: - non_block_runtime_ms, embedding_runtime_ms, lm_head_runtime_ms = None, None, None + if not runtime_stats_enabled: + 
non_block_runtime_ms = None non_block_memory = calculate_non_block_memory(n_embd, vocab_size, weights_dtype) non_block_params = calculate_non_block_params(n_embd, vocab_size) - # TODO - # the semantics here is wrong why do we refer, prefill_runtime_ms as embedding_runtime_ms and lm_head_runtime_ms as decode_runtime_ms ? - # Prefill is the first the user prompt inference, and Decode refer to the next generation process. both processes use all the model layers. subblock_stats["non_block"] = { "runtime_ms": non_block_runtime_ms, - "prefill_runtime_ms": embedding_runtime_ms, - "decode_runtime_ms": lm_head_runtime_ms, "memory_mib": non_block_memory, "num_params": non_block_params, } @@ -256,7 +243,9 @@ def launch_calc_subblock_stats(cfg: DictConfig) -> None: num_active_tokens_override=cfg.calc_subblock_stats.get("num_active_tokens_override", None), prefill_queue_size=cfg.calc_subblock_stats.prefill_queue_size, allocate_prefill_query=cfg.calc_subblock_stats.get("allocate_prefill_query", False), - benchmark_iterations=cfg.calc_subblock_stats.get("benchmark_iterations", None), + runtime_stats_enabled=cfg.calc_subblock_stats.get("runtime_stats", {}).get( + "enabled", False + ), merge_with_existing_stats=cfg.calc_subblock_stats.merge_with_existing_stats, subblock_stats_filename=cfg.calc_subblock_stats.subblock_stats_filename, moe_stats_filename=cfg.calc_subblock_stats.moe_stats_filename, @@ -276,9 +265,7 @@ def calculate_subblock_stats_for_puzzle_dir( num_active_tokens_override: int | None = None, prefill_queue_size: int = 0, # it's an infery-llm thing allocate_prefill_query: bool = False, - benchmark_iterations: ( - int | None - ) = None, # If set then compute runtime performance statistics. TODO: recommend default value, is 1000 good? + runtime_stats_enabled: bool = False, # Compute runtime statistics. 
merge_with_existing_stats: bool = False, subblock_stats_filename: str = "subblock_stats.json", moe_stats_filename: str = "moe_stats.json", @@ -344,8 +331,8 @@ def calculate_subblock_stats_for_puzzle_dir( if num_active_tokens_override is not None: prefill_seq_len = generation_seq_len = int(num_active_tokens_override / batch_size / 2) - curr_benchmark_iterations = ( - benchmark_iterations if weights_dtype == torch.bfloat16 else None + curr_runtime_stats_enabled = ( + runtime_stats_enabled if weights_dtype == torch.bfloat16 else False ) curr_subblock_stats = calculate_subblock_stats( @@ -362,7 +349,7 @@ def calculate_subblock_stats_for_puzzle_dir( n_embd=model_hidden_size, n_head=lm_config.num_attention_heads, vocab_size=lm_config.vocab_size, - benchmark_iterations=curr_benchmark_iterations, + runtime_stats_enabled=curr_runtime_stats_enabled, use_cuda_graph=True, weights_dtype=weights_dtype, activations_dtype=activations_dtype, @@ -378,8 +365,6 @@ def calculate_subblock_stats_for_puzzle_dir( subblock_stats.append(curr_subblock_stats) - # TODO fix: add_int8_runtime_estimates(subblock_stats) - json_dump(subblock_stats, subblock_stats_file) mprint(subblock_stats_file) @@ -503,65 +488,3 @@ def _dataclass_from_dict( raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}") -def add_int8_runtime_estimates(subblock_stats: list[dict]) -> None: - for curr_subblock_stats in subblock_stats: - args = curr_subblock_stats["args"] - if args["weights_dtype"] == "torch.int8": - assert args["activations_dtype"] == "torch.int8" - ffn_factor = 0.5 - attention_factor = 0.5 if args["kv_cache_dtype"] == "torch.int8" else 0.8 - - bf16_stats = _find_corresponding_bf16_stats(args, subblock_stats) - if bf16_stats is not None: - curr_subblocks = curr_subblock_stats["subblocks"] + [ - curr_subblock_stats["non_block"] - ] - bf16_subblocks = bf16_stats["subblocks"] + [bf16_stats["non_block"]] - for curr_subblock, bf16_subblock in zip(curr_subblocks, bf16_subblocks): - assert 
curr_subblock.get("subblock_config", None) == bf16_subblock.get( - "subblock_config", None - ) - is_attention = False - if (subblock_config := curr_subblock.get("subblock_config")) is not None: - if hasattr(subblock_config, "__dataclass_fields__"): - subblock_config = dataclasses.asdict(subblock_config) - is_attention = subblock_config.get("num_key_value_heads", None) is not None - runtime_factor = attention_factor if is_attention else ffn_factor - for stat_name, stat_value in bf16_subblock.items(): - if "runtime" in stat_name: - curr_subblock[stat_name] = stat_value * runtime_factor - - -def _find_corresponding_bf16_stats(args: dict, subblock_stats: list[dict]) -> dict | None: - scenario_keys = [ - "batch_size", - "prefill_seq_len", - "generation_seq_len", - "prefill_queue_size", - "gpu", - "n_embd", - "n_head", - "vocab_size", - ] - corresponding_bf16_args = { - **{k: v for k, v in args.items() if k in scenario_keys}, - "is_calc_runtime": True, - "weights_dtype": "torch.bfloat16", - "activations_dtype": "torch.bfloat16", - "kv_cache_dtype": "torch.bfloat16", - } - matching_bf16_stats = [ - stats - for stats in subblock_stats - if all( - [ - stats["args"][key] == corresponding_bf16_args[key] - for key in corresponding_bf16_args.keys() - ] - ) - ] - if len(matching_bf16_stats) == 0: - return None - if len(matching_bf16_stats) == 1: - return matching_bf16_stats[0] - raise ValueError(f"Found more than 1 matching bf16 stats for {args=}") From 3041dc2d63183d55de7c2e1fac014f1639b62883 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 28 Apr 2026 14:49:10 -0700 Subject: [PATCH 02/35] done ruff formatting and docstrings Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/__init__.py | 8 + .../nas/subblock_stats/calc_runtime_stats.py | 38 +++- .../calc_subblock_params_and_memory.py | 171 ++++++++++++++++-- 3 files changed, 190 insertions(+), 27 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/__init__.py 
b/modelopt/torch/nas/subblock_stats/__init__.py index ff8d16685d6..aeac903f8f4 100644 --- a/modelopt/torch/nas/subblock_stats/__init__.py +++ b/modelopt/torch/nas/subblock_stats/__init__.py @@ -12,4 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Subblock runtime statistics API for ModelOpt NAS. + +This module provides utilities for measuring and calculating runtime statistics +of subblocks (e.g., Attention, FFN) within transformer architectures. + +Primary API: + - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations +""" from .calc_runtime_stats import calc_runtime_for_subblocks diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index da525c0bdcc..d3b997f4525 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -1,3 +1,21 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# mypy: ignore-errors + +"""Runtime statistics calculation for NAS subblock benchmarking via vLLM.""" + import json import os import subprocess @@ -38,7 +56,7 @@ def create_benchmark_model( block_config: BlockConfig | None, repeat_block_n_times: int = 10, ) -> LlamaForCausalLM: - + """Build a small Llama model with repeated subblocks for latency benchmarking.""" block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)] if block_config: @@ -58,8 +76,8 @@ def create_benchmark_model( }, ) - for idx, block_config in enumerate(block_configs): - block_configs[idx] = block_config.to_dict() + for idx, bc in enumerate(block_configs): + block_configs[idx] = bc.to_dict() model_config.block_configs = block_configs with deci_x_patcher(LlamaModelDescriptor, block_configs): @@ -72,7 +90,7 @@ def create_benchmark_model( def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int): - + """Save a model checkpoint in AnyModel subblock-safetensors format.""" # Save standard model checkpoint (as safetensors, HF format) model.save_pretrained(output_dir, safe_serialization=True) @@ -97,7 +115,7 @@ def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layer def save_model( model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int ) -> None: - + """Save model weights as AnyModel and copy the tokenizer to ``output_path``.""" model.to(dtype=torch.bfloat16).save_pretrained(output_path) save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers) @@ -107,6 +125,8 @@ def save_model( @dataclass(frozen=True) class RuntimeConfig: + """Configuration for a vLLM latency benchmark run.""" + vocab_size: int hidden_size: int num_attention_heads: int @@ -122,7 +142,7 @@ class RuntimeConfig: def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): - + """Run ``vllm bench latency`` and return the average latency in milliseconds.""" output_json_path = 
model_path / "vllm_latency_benchmark.json" cmd = [ @@ -167,7 +187,7 @@ def calc_subblock_runtime( runtime_config: RuntimeConfig, subblock_config: SubblockConfig, ) -> float: - + """Measure total runtime of a repeated subblock via vLLM latency benchmark.""" block_config: BlockConfig | None = None if subblock_config is not None: @@ -200,7 +220,7 @@ def calc_subblock_runtime( def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: - + """Estimate the overhead runtime (embedding + LM head) with no decoder blocks.""" runtime_config1 = replace(runtime_config, repeat_block_n_times=0) runtime_config10 = replace(runtime_config, repeat_block_n_times=9) @@ -228,7 +248,7 @@ def calc_runtime_for_subblocks( prefill_seq_len: int, generation_seq_len: int, ) -> tuple[dict[SubblockConfig, float], float]: - + """Benchmark each unique subblock and return per-subblock runtimes and no-block overhead.""" repeat_block_n_times = 10 runtime_config = RuntimeConfig( vocab_size, diff --git a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py index 3938bb55596..abe7a1a3884 100644 --- a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py @@ -25,7 +25,6 @@ import json import math from pathlib import Path -from typing import Type import numpy as np import torch @@ -48,16 +47,16 @@ ) __all__ = [ - "calculate_subblock_memory", - "calculate_subblock_params", "calc_subblock_active_params", - "load_moe_stats", - "estimate_num_active_experts", + "calculate_ffn_memory", "calculate_mamba_memory", "calculate_mamba_state_size", - "calculate_ffn_memory", "calculate_non_block_memory", "calculate_non_block_params", + "calculate_subblock_memory", + "calculate_subblock_params", + "estimate_num_active_experts", + "load_moe_stats", ] @@ -73,9 +72,29 @@ def calculate_subblock_memory( kv_cache_dtype: torch.dtype, 
allocate_prefill_query: bool, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> float | dict[str, float]: - """``model_config`` / ``descriptor`` are required (puzzletron-style); FFN uses them for meta init.""" + """Calculate the memory usage of a single subblock (FFN or Attention). + + Given its configuration and runtime dimensions, returns bytes or a detailed dict. + + Args: + subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass. + batch_size (int): Batch size for memory estimate. + prefill_seq_len (int): Sequence length for prefill phase. + generation_seq_len (int): Sequence length for generation phase (token-by-token). + prefill_queue_size (int): Token queue size for prefill attention memory allocation. + n_embd (int): Embedding (hidden) dimension. + n_head (int): Number of attention heads (used for non-FFN). + weights_dtype (torch.dtype): PyTorch dtype for model weights. + kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache. + allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens. + model_config (PretrainedConfig): HuggingFace-style config instance describing the model. + descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types). + + Returns: + float | dict[str, float]: Memory usage in bytes (float), or a dictionary by memory type. + """ if subblock_config.no_op: return 0 if isinstance(subblock_config, FFNConfig): @@ -116,7 +135,7 @@ def calculate_subblock_memory( def calculate_subblock_params( config: PretrainedConfig, layer_config: BlockConfig | FFNConfig | AttentionConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], ) -> int: """Count parameters on one meta decoder layer. @@ -124,9 +143,7 @@ def calculate_subblock_params( ``hybrid_override_pattern``) before passing ``config``; see ``ModelDescriptor.truncate_pattern_for_subblock``. 
""" - if isinstance(layer_config, FFNConfig): - block_config = layer_config.to_blockconfig() - elif isinstance(layer_config, AttentionConfig): + if isinstance(layer_config, (FFNConfig, AttentionConfig)): block_config = layer_config.to_blockconfig() else: block_config = layer_config @@ -189,12 +206,31 @@ def calculate_subblock_params( def calc_subblock_active_params( sublayer_config: FFNConfig | AttentionConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], n_embd: int, moe_stats_file: str, batch_size: int, block_idx: int, ) -> int: + """Calculate the number of "active" parameters for a subblock (FFN, Attention, or MoE). + + For non-MoE subblocks, simply calls `calculate_subblock_params` to count all parameters. + For MoE (Mixture-of-Experts) FFN subblocks, estimates the expected number of active parameters + per batch by leveraging expert activation statistics (from a given stats file) and calculating + the expected number of active experts, then multiplies by the number of parameters per expert. + + Args: + sublayer_config: The subblock configuration (either FFNConfig or AttentionConfig). + model_config: The Hugging Face model configuration. + descriptor: The ModelDescriptor class corresponding to this model family. + n_embd: The embedding size (hidden dimension). + moe_stats_file: Path to file containing expert activation probabilities. + batch_size: The batch size used for the estimate. + block_idx: The index of the block/subblock within the network, used to index into the stats. + + Returns: + int: The expected number of "active" parameters for the given subblock. 
+ """ if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe): return calculate_subblock_params(model_config, sublayer_config, descriptor) return estimate_moe_active_params( @@ -203,14 +239,45 @@ def calc_subblock_active_params( def load_moe_stats(stats_file: str) -> dict: + """Load MoE (Mixture-of-Experts) routing statistics from a file. + + This function reads a JSON file containing expert activation probabilities or counts for each MoE block. + It returns the normalized probability distributions over experts for each block, as a list of numpy arrays. + + Args: + stats_file (str): Path to the JSON file containing expert routing statistics for each block. + + Returns: + list[np.ndarray]: A list where each element is a numpy array containing the normalized probability + distribution over experts for the corresponding block. If a block's expert list is empty, + its entry is 0. + """ with open(stats_file) as f: stats = json.load(f) - return [np.array(l) / np.sum(l) if len(l) > 0 else 0 for l in stats] + return [ + np.array(expert_probs) / np.sum(expert_probs) if len(expert_probs) > 0 else 0 + for expert_probs in stats + ] def estimate_num_active_experts( dist_over_experts: np.ndarray, batch_size: int, num_experts: int ) -> int: + """Estimate the expected number of active experts in a Mixture-of-Experts (MoE) layer. + + This function computes the expected number of unique experts that are selected at least once when performing + inference with a given batch size. It assumes, for each input in the batch, an expert is chosen with probability + given by `dist_over_experts` (typically a vector of probabilities for each expert). For a batch of size B, the + expected number of active (i.e., selected at least once) experts is computed. + + Args: + dist_over_experts (np.ndarray): A 1D array of probabilities for each expert. + batch_size (int): The number of samples in the batch. 
+ num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter). + + Returns: + int: The expected number of experts selected at least once across the batch. + """ # cut the tail and renormalize dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts] dist_over_experts = dist_over_experts / (dist_over_experts.sum()) @@ -226,6 +293,18 @@ def estimate_moe_active_params( batch_size: int, block_idx: int, ) -> int: + """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock. + + Args: + subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured). + n_embd (int): The embedding dimension (input and output size per expert). + moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts. + batch_size (int): Batch size to simulate/extrapolate expected expert use. + block_idx (int): The index of the block/layer whose expert routing statistics should be used. + + Returns: + int: Estimated number of parameters actively used for the current batch and expert selection statistics. + """ assert Path(moe_stats_file).exists() # if not Path(moe_stats_file).exists(): # if path is not provided, should we assume uniform distribution? # return calculate_subblock_params(subblock_config, n_embd, n_head=None) @@ -255,7 +334,7 @@ def estimate_moe_active_params( def calculate_attention_memory( attention_config: AttentionConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], batch_size: int, prefill_seq_len: int, generation_seq_len: int, @@ -267,6 +346,7 @@ def calculate_attention_memory( allocate_prefill_query: bool, ) -> dict[str, float]: """allocate_prefill_query: infery-llm style. + Infery used a unified Wqkv matrix, so before extracting the kv-cache, the query also had to be kept in-memory, once per layer. 
""" @@ -294,11 +374,25 @@ def calculate_attention_memory( def calculate_mamba_memory( attention_config: AttentionConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], batch_size: int, weights_dtype: torch.dtype, kv_cache_dtype: torch.dtype, ) -> int: + """Calculate memory usage (MiB) for a Mamba attention subblock. + + Args: + attention_config (AttentionConfig): Mamba attention configuration, + including Mamba-specific settings. + model_config (PretrainedConfig): Model configuration. + descriptor (type[ModelDescriptor]): Model descriptor class. + batch_size (int): Batch size for memory estimate. + weights_dtype (torch.dtype): Data type for model weights. + kv_cache_dtype (torch.dtype): Data type for state/kv-cache. + + Returns: + int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock. + """ assert attention_config.mamba is not None mamba_config = attention_config.mamba num_params = calculate_subblock_params(model_config, attention_config, descriptor) @@ -312,7 +406,16 @@ def calculate_mamba_state_size( mamba_config: MambaConfig, batch_size: int, ) -> int: - d_inner, in_proj_dim, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config) + """Calculate the total state size for a Mamba attention subblock. + + Args: + mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters. + batch_size (int): Batch size to estimate the memory/state requirements for. + + Returns: + int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state. + """ + _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config) conv_state_size = math.prod((batch_size, conv_dim, kernel_size)) ssm_state_size = math.prod( (batch_size, mamba_config.num_heads, mamba_config.head_dim, mamba_config.state_dim) @@ -333,10 +436,23 @@ def _calculate_mamba_intermediates(mamba_config: MambaConfig) -> tuple[int, ...] 
def calculate_ffn_memory( ffn_config: FFNConfig, model_config: PretrainedConfig, - descriptor: Type[ModelDescriptor], + descriptor: type[ModelDescriptor], weights_dtype: torch.dtype | str, experts_dtype: torch.dtype | str | None = None, ) -> float: + """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock. + + Args: + ffn_config (FFNConfig): FFN configuration for the block. + model_config (PretrainedConfig): The parent model configuration. + descriptor (type[ModelDescriptor]): Model descriptor class. + weights_dtype (torch.dtype | str): Data type for FFN weights. + experts_dtype (torch.dtype | str | None, optional): Data type for expert weights + (for MoE layers, if present). Defaults to None. + + Returns: + float: Estimated FFN memory usage in mebibytes (MiB). + """ # TODO: How to separate between expert weights and the rest for any model (same as puzzletron). num_params = calculate_subblock_params(model_config, ffn_config, descriptor) return num_params * sizeof_dtype(weights_dtype) / 2**20 @@ -347,6 +463,16 @@ def calculate_non_block_memory( vocab_size: int, weight_dtype: torch.dtype, ) -> float: + """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection). + + Args: + n_embd (int): Embedding dimension (hidden size). + vocab_size (int): Vocabulary size. + weight_dtype (torch.dtype): Data type for model weights. + + Returns: + float: Estimated non-subblock memory usage in mebibytes (MiB). + """ return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20 @@ -354,4 +480,13 @@ def calculate_non_block_params( n_embd: int, vocab_size: int, ) -> int: + """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection). + + Args: + n_embd (int): Embedding dimension (hidden size). + vocab_size (int): Vocabulary size. + + Returns: + int: Estimated non-subblock parameter count. 
+ """ return vocab_size * n_embd * 2 + n_embd From a36375017352cc8daf61e2dbb8f46eb96ec03cdb Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 4 May 2026 02:41:13 -0700 Subject: [PATCH 03/35] distributed timeout is configurable Signed-off-by: Grzegorz Karch --- examples/puzzletron/main.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py index 8ceed378318..ba1c19d12c5 100644 --- a/examples/puzzletron/main.py +++ b/examples/puzzletron/main.py @@ -68,7 +68,20 @@ def run_full_puzzletron(hydra_config_path: str): config_path: Path to the YAML configuration file """ mtpz.tools.mprint("Puzzletron Progress 1/8: starting puzzletron pipeline") - dist.setup(timeout=timedelta(minutes=10)) + # Read the Hydra config to determine runtime_stats:enabled, and set the timeout accordingly + from omegaconf import OmegaConf + + # Resolve absolute path for Hydra config + hydra_config_path = Path(hydra_config_path).resolve() + hydra_config = OmegaConf.load(str(hydra_config_path)) + + # Default timeout: 10 minutes, or extended to dist_timeout_minutes if set in config + if hasattr(hydra_config, "dist_timeout_minutes"): + timeout_minutes = timedelta(minutes=hydra_config.dist_timeout_minutes) + else: + timeout_minutes = timedelta(minutes=10) + mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}") + dist.setup(timeout=timeout_minutes) # Register Hydra custom resolvers (needed for config resolution) mtpz.tools.register_hydra_resolvers() From 53a2caf8215b698fde5115ba6f7ec10c6802ccc7 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 4 May 2026 14:08:43 -0700 Subject: [PATCH 04/35] added example config for attn pruning and runtime constraint Signed-off-by: Grzegorz Karch --- .../Llama-3_1-8B.yaml | 109 ++++++++++++++++++ .../llama-3_1-8B_pruneattn_runtime.yaml | 29 +++++ .../pruning/attn_pruning.yaml | 23 ++++ .../pruning/ffn_pruning.yaml | 19 +++ 
.../pruning/hidden_dim_pruning.yaml | 15 +++ .../pruning/pruning_defaults.yaml | 33 ++++++ .../validate_model_defaults.yaml | 17 +++ .../validate_solutions_defaults.yaml | 10 ++ 8 files changed, 255 insertions(+) create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml create mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml new file mode 100644 index 00000000000..7340938da25 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml @@ -0,0 +1,109 @@ +defaults: + - pruning: ffn_pruning + - scoring: ../validate_solutions_defaults + - realize_model: ../validate_solutions_defaults + - bypass: + - override hydra/hydra_logging: disabled + - _self_ + +puzzle_dir: ??? +descriptor: llama +teacher_dir: ${puzzle_dir}/ckpts/teacher/ +replacement_library_path: ${puzzle_dir}/replacement_library.json +dataset_path: ??? 
# path to Nemotron-Post-Training-Dataset-v2 + +skip_realize_model: false + +build_replacement_library: + add_ffn_no_ops: true + add_attention_no_ops: true + +calc_subblock_stats: + batch_sizes: [64, 96, 128] + prefill_seq_len: 4096 + generation_seq_len: 4096 + num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false + benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking + merge_with_existing_stats: true + subblock_stats_filename: "subblock_stats.json" + moe_stats_filename: "moe_stats.json" + runtime_stats: + backend: trt_torch + +scoring: + descriptor: ${descriptor} + solutions_to_validate: + skip_existing_solutions: true + + replacement_library_path: ${replacement_library_path} + solutions_path: ${to_path:${puzzle_dir}/single_sequence_replacement_solutions.json} + teacher_dir: ${to_path:${teacher_dir}} + output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation + + eval_samples: 8 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path} + +mip: + single_block_replacement_validation_dir: ${to_path:${scoring.output_dir}} + subblock_stats_path: ${to_path:${puzzle_dir}/${calc_subblock_stats.subblock_stats_filename}} + output_path: ${to_path:${puzzle_dir}/mip/puzzle_solutions} + gathered_metrics_path: + puzzle_profile: + + # puzzle_profile: + objective: metrics.cosine_embedding_loss_hidden_states + bigger_is_better: false + + subblock_stats_args: + - batch_size: 96 + weights_dtype: torch.bfloat16 + activations_dtype: torch.bfloat16 + kv_cache_dtype: torch.bfloat16 + + report_additional_costs: + - stats.memory_mib + - stats.num_params + - stats.num_kv_heads + - stats.has_attention + - stats.has_ffn + - stats.kv_cache_memory_mib + - stats.attention_memory_mib + - stats.ffn_memory_mib + - stats.ffn_num_params + - stats.attention_num_params + + # human_constraints: + # target_memory: 78_000 + # num_params: 7_000_000_000 + + 
mip_constraints: + metric_overrides: + max_seconds_per_solution: 60 + +realize_model: + descriptor: ${descriptor} + teacher_dir: ${to_path:${teacher_dir}} + tokenizer_name: ${to_path:${teacher_dir}} + replacement_library_path: ${replacement_library_path} + save_models: true + solutions_path: # Filled dynamically + + # Validate params + skip_validation: false # To enable validation of the model solution set `skip_validation` as False + eval_samples: 128 + micro_batch_size: 1 + seed: 42 + shuffle_seed: 444 + dataset_path: ${dataset_path} + +nccl_timeout_minutes: ${timedelta_minutes:10} + +# This section redirects Hydra outputs +hydra: + run: + dir: ${puzzle_dir}/hydra_logs/${now:%Y-%m-%d}/${now:%H-%M-%S} diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml new file mode 100644 index 00000000000..42e17d627a0 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml @@ -0,0 +1,29 @@ +defaults: + - Llama-3_1-8B + - override pruning: attn_pruning + - _self_ + +# Input Hugging Face model to compress +input_hf_model_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/puzzletron/checkpoints/meta-llama/Llama-3.1-8B-Instruct + +# Dataset path for pruning and NAS scoring +dataset_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/datasets/nvidia/Nemotron-Post-Training-Dataset-v2 + +# Working directory for puzzletron outputs +puzzle_dir: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/Model-Optimizer/artifacts + +dist_timeout_minutes: 60 + +calc_subblock_stats: + runtime_stats: + enabled: true + synth_dataset_num_requests: 32 + backend: vllm + num_warmup_iters: 2 + num_iters: 10 + batch_size: 1 + +# MIP latency constraint (requires runtime stats) +mip: + human_constraints: + target_latency: 21 +diff
--git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml new file mode 100644 index 00000000000..53d7e4bd9c6 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml @@ -0,0 +1,23 @@ +defaults: + - pruning_defaults + +hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IndependentKvHeadContributionHook} + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor + +activation_hooks_kwargs: + method: independent_kv_head_contribution + optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory + target_layer: "self_attn.o_proj" + layer_input_descriptors_path: + +# n_heads_in_group: 4 +# num_attention_heads: 32 # num query heads +# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group +n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] +gqa_init_mode: "PruneKVHeads" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml new file mode 100644 index 00000000000..da0b9720700 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml @@ -0,0 +1,19 @@ +defaults: + - pruning_defaults + +pruning_mixin: + _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn + layer_descriptor: + _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor + 
+hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IterativeChannelContributionHook} + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: iterative + target_layer: "mlp.down_proj" + layer_input_descriptors_path: + +intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 +mlp_init_mode: "PruneByActivationsLog" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml new file mode 100644 index 00000000000..407c835d8c4 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml @@ -0,0 +1,15 @@ +defaults: + - pruning_defaults + +activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} + +activation_hooks_kwargs: + method: layer_norm_contribution + target_layer: "layernorm" + +# Hidden dimension pruning specific settings +hidden_size_list: [3072, 2048] # Target hidden sizes to prune to +hidden_size_init_mode: "PruneByChannelRanking" +mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher +gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher +linear_init_mode: "FromTeacher" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml new file mode 100644 index 00000000000..e05e775bee3 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml @@ -0,0 +1,33 @@ +defaults: + - /validate_model_defaults + +descriptor: ${descriptor} +model_name_or_path: ${teacher_dir} +experiment_id: 
${pruning.eval_samples}samples_diverse_mini +activations_log_dir: ??? +activation_hooks_kwargs: ??? + +# Data: +eval_samples: 1000 # default is 10000 +micro_batch_size: 4 +dataset_path: ${dataset_path} +val_dataset_name: train + +# Prune ckpts +pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} + +## FFN pruning +ffn_list: +mlp_init_mode: "Truncate" # PruneByActivationsLog + +## KV-heads pruning +n_heads_in_group_list: +gqa_init_mode: "AverageKV" + +## Hidden dimension pruning +hidden_size_list: +hidden_size_init_mode: "PruneByChannelRanking" +linear_init_mode: "FromTeacher" + +mlp_init_config_yaml: + activations_log_dir: ${pruning.activations_log_dir} diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml new file mode 100644 index 00000000000..6b36142a3a8 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml @@ -0,0 +1,17 @@ +model_dtype: torch.bfloat16 # dtype to cast the model for validate_model +autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model +block_size: 8192 +bos_rate: 0.5 +data_column: messages +val_dataset_name: validation +shuffle_seed: 81436 +seed: 42 +fim_rate: 0 +fim_spm_rate: 0 +source_datasets_to_discard: +varlen: false +write_results: false +calc_losses_on_cpu: false +activations_log_dir: +model_name_or_path: +load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml new file mode 100644 index 00000000000..ec139023794 --- /dev/null +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml @@ -0,0 +1,10 @@ +defaults: + - 
/validate_model_defaults + - _self_ + +solutions_to_validate: +skip_validation: false +save_models: false +bigger_is_better: false +sort_solutions_by: +calculate_full_score_ablations: false From dfb905ca3768a53131bc5236b904c17e7d59064d Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 5 May 2026 08:34:32 -0700 Subject: [PATCH 05/35] renamed configs Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 29 ++++++++++++++++++- .../Llama-3_1-8B.yaml | 4 --- .../llama-3_1-8B_pruneattn_runtime.yaml | 11 ++++--- .../pruning/attn_pruning.yaml | 0 .../pruning/ffn_pruning.yaml | 0 .../pruning/hidden_dim_pruning.yaml | 0 .../pruning/pruning_defaults.yaml | 0 .../validate_model_defaults.yaml | 0 .../validate_solutions_defaults.yaml | 0 9 files changed, 35 insertions(+), 9 deletions(-) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/Llama-3_1-8B.yaml (97%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/llama-3_1-8B_pruneattn_runtime.yaml (50%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/attn_pruning.yaml (100%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/ffn_pruning.yaml (100%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/hidden_dim_pruning.yaml (100%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/pruning/pruning_defaults.yaml (100%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/validate_model_defaults.yaml (100%) rename examples/puzzletron/configs/{llama-3_1-8B_pruneattn_runtime => llama-3_1-8B_pruneffn_runtime}/validate_solutions_defaults.yaml (100%) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 571b40ca499..aeec7fc94dd 
100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -11,7 +11,7 @@ To use the Puzzle algorithm effectively, we need to specify the target number of In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. Other supported models should be compressed in a similar way. For GptOss there is one [additional step to be performed](GPTOSS.md). -> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md). +> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For KV-head pruning see [`llama-3_1-8B_pruneattn_runtime`](./configs/llama-3_1-8B_pruneattn_runtime/) and the [Attention Pruning](#attention-pruning-kv-head-reduction) and [Runtime-Based Latency Optimization](#runtime-based-latency-optimization) sections below. For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md). ## Environment @@ -343,6 +343,33 @@ See [Megatron-Bridge distillation](../megatron_bridge/README.md#distillation) fo For distillation results on Puzzletron-compressed models, see [examples/pruning/puzzletron/](../pruning/puzzletron/README.md). 
+## Runtime-Based Latency Optimization + +By default, subblock statistics use the `trt_torch` backend with theoretical memory proxies. You can instead enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints: + +```yaml +calc_subblock_stats: + runtime_stats: + enabled: true + synth_dataset_num_requests: 32 + backend: vllm + num_warmup_iters: 2 + num_iters: 10 + batch_size: 1 + +mip: + human_constraints: + target_latency: 20 # ms +``` + +Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly: + +```yaml +dist_timeout_minutes: 60 # default is 10 if omitted +``` + +This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout. + ## Advanced Usage Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios. diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml similarity index 97% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index 7340938da25..bb352598e10 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -77,10 +77,6 @@ mip: - stats.ffn_num_params - stats.attention_num_params - # human_constraints: - # target_memory: 78_000 - # num_params: 7_000_000_000 - mip_constraints: metric_overrides: max_seconds_per_solution: 60 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml similarity index 50% rename from 
examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml index 42e17d627a0..6eaf5f508b8 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/llama-3_1-8B_pruneattn_runtime.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml @@ -1,16 +1,15 @@ defaults: - Llama-3_1-8B - - override pruning: attn_pruning - _self_ # Input Hugging Face model to compress -input_hf_model_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/puzzletron/checkpoints/meta-llama/Llama-3.1-8B-Instruct +input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct # Dataset path for pruning and NAS scoring -dataset_path: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/datasets/nvidia/Nemotron-Post-Training-Dataset-v2 +dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 # Working directory for puzzletron outputs -puzzle_dir: /lustre/fs1/portfolios/coreai/projects/coreai_dlalgo_llm/users/gkarch/modelopt/Model-Optimizer/artifacts +puzzle_dir: /workspace/puzzle_dir dist_timeout_minutes: 60 @@ -27,3 +26,7 @@ calc_subblock_stats: mip: human_constraints: target_latency: 21 + +# FFN intermediate sizes to search over (heterogeneous architecture) +pruning: + intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/attn_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml diff --git 
a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/ffn_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/hidden_dim_pruning.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/pruning/pruning_defaults.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_model_defaults.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml similarity index 100% rename from examples/puzzletron/configs/llama-3_1-8B_pruneattn_runtime/validate_solutions_defaults.yaml rename to 
examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml From e165171e8ed445a6a95ed8a4493f510efac50172 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 6 May 2026 01:57:22 -0700 Subject: [PATCH 06/35] working on readme Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index aeec7fc94dd..134790bb011 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -359,7 +359,7 @@ calc_subblock_stats: mip: human_constraints: - target_latency: 20 # ms + target_latency: 20 # seconds ``` Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly: From d47b69c54964c1a5f62cc97f8f4c53f5ae4a848d Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 6 May 2026 05:25:46 -0700 Subject: [PATCH 07/35] working on refactoring Signed-off-by: Grzegorz Karch --- .../nas/subblock_stats/calc_runtime_stats.py | 142 ++++-------------- .../torch/nas/subblock_stats/runtime_utils.py | 61 ++++++++ .../torch/nas/subblock_stats/runtime_vllm.py | 48 ++++++ 3 files changed, 137 insertions(+), 114 deletions(-) create mode 100644 modelopt/torch/nas/subblock_stats/runtime_utils.py create mode 100644 modelopt/torch/nas/subblock_stats/runtime_vllm.py diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index d3b997f4525..cadc7d8c9a6 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -16,19 +16,16 @@ """Runtime statistics calculation for NAS subblock benchmarking via vLLM.""" -import json -import os -import subprocess import tempfile -from dataclasses import dataclass, replace +from dataclasses import replace from pathlib import Path -import torch from omegaconf import 
DictConfig from tqdm import tqdm -from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaConfig, LlamaForCausalLM +from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM -from modelopt.torch.puzzletron.anymodel.converter import Converter +from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig, save_model +from modelopt.torch.nas.subblock_stats.runtime_vllm import run_vllm_latency_benchmark from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher from modelopt.torch.puzzletron.block_config import ( @@ -89,98 +86,17 @@ def create_benchmark_model( return model -def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int): - """Save a model checkpoint in AnyModel subblock-safetensors format.""" - # Save standard model checkpoint (as safetensors, HF format) - model.save_pretrained(output_dir, safe_serialization=True) - - # Convert/slice weights into AnyModel subblock_safetensors format - Converter.convert_model_weights( - input_dir=output_dir, - output_dir=output_dir, - descriptor=descriptor, - num_hidden_layers=num_hidden_layers, - ) - # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk. 
- - config_path = output_dir / "config.json" - if config_path.exists(): - with open(config_path) as f: - config_data = json.load(f) - config_data["architectures"] = ["AnyModel"] - with open(config_path, "w") as f: - json.dump(config_data, f, indent=2) - - -def save_model( - model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int -) -> None: - """Save model weights as AnyModel and copy the tokenizer to ``output_path``.""" - model.to(dtype=torch.bfloat16).save_pretrained(output_path) - save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers) - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) - tokenizer.save_pretrained(output_path) - - -@dataclass(frozen=True) -class RuntimeConfig: - """Configuration for a vLLM latency benchmark run.""" - - vocab_size: int - hidden_size: int - num_attention_heads: int - master_puzzle_dir: str - tokenizer_path: str - synth_dataset_num_requests: int - repeat_block_n_times: int - prefill_seq_len: int - generation_seq_len: int - batch_size: int - num_iters: int - num_warmup_iters: int - - -def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): - """Run ``vllm bench latency`` and return the average latency in milliseconds.""" - output_json_path = model_path / "vllm_latency_benchmark.json" - - cmd = [ - "vllm", - "bench", - "latency", - "--model", - str(model_path), - "--input-len", - str(runtime_config.prefill_seq_len), - "--output-len", - str(runtime_config.generation_seq_len), - "--batch-size", - str(runtime_config.batch_size), - "--output-json", - str(output_json_path), - "--max-model-len", - str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len), - "--num-iters-warmup", - str(runtime_config.num_warmup_iters), - "--num-iters", - str(runtime_config.num_iters), - "--max-num-seqs", - "1", - "--distributed-executor-backend", - "external_launcher", - "--tensor-parallel-size", - "1", - "--pipeline-parallel-size", - "1", - ] - 
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" - subprocess.run(cmd) - - with open(output_json_path) as f: - vllm_results = json.load(f) - print(vllm_results) - return vllm_results["avg_latency"] * 1000 # convert to milliseconds +def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float: + """Measure total runtime of a model via vLLM latency benchmark.""" + with tempfile.TemporaryDirectory() as model_tmpdir: + save_model( + model, + Path(runtime_config.tokenizer_path), + Path(model_tmpdir), + num_hidden_layers=runtime_config.repeat_block_n_times + 1, + ) + model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config) + return model_total_runtime_ms def calc_subblock_runtime( @@ -207,16 +123,7 @@ def calc_subblock_runtime( block_config=block_config, repeat_block_n_times=runtime_config.repeat_block_n_times, ) - with tempfile.TemporaryDirectory() as model_tmpdir: - save_model( - model, - Path(runtime_config.tokenizer_path), - Path(model_tmpdir), - num_hidden_layers=runtime_config.repeat_block_n_times + 1, - ) - subblock_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config) - - return subblock_total_runtime_ms + return calc_model_runtime(model, runtime_config) def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: @@ -267,15 +174,22 @@ def calc_runtime_for_subblocks( runtime_by_subblock_dict = {} - baseline_runtime_ms = calc_subblock_runtime(runtime_config, None) + baseline_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config=None) for subblock_config in tqdm( sorted(subblock_config_set), - desc=( - f"Computing runtime_by_subblock_dict [hidden_size={hidden_size}, " - f"num_subblocks={len(subblock_config_set)}]" - ), + desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"), ): + if isinstance(subblock_config, AttentionConfig): + num_key_value_heads = subblock_config.num_key_value_heads + desc = 
f"AttentionConfig(num_key_value_heads={num_key_value_heads})" + elif isinstance(subblock_config, FFNConfig): + intermediate_size = subblock_config.intermediate_size + desc = f"FFNConfig(intermediate_size={intermediate_size})" + else: + raise ValueError(f"Unsupported subblock type: {type(subblock_config)}") + print(f"Computing runtime for subblock: {desc} {subblock_config.no_op=}") + if subblock_config.no_op: total_runtime_ms = 0.0 else: diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py new file mode 100644 index 00000000000..e4eec38e033 --- /dev/null +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -0,0 +1,61 @@ +import json +from dataclasses import dataclass +from pathlib import Path + +import torch +from transformers import AutoTokenizer, LlamaForCausalLM + +from modelopt.torch.puzzletron.anymodel.converter import Converter +from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor + + +@dataclass(frozen=True) +class RuntimeConfig: + """Configuration for a vLLM latency benchmark run.""" + + vocab_size: int + hidden_size: int + num_attention_heads: int + master_puzzle_dir: str + tokenizer_path: str + synth_dataset_num_requests: int + repeat_block_n_times: int + prefill_seq_len: int + generation_seq_len: int + batch_size: int + num_iters: int + num_warmup_iters: int + + +def save_model( + model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int +) -> None: + """Save model weights as AnyModel and copy the tokenizer to ``output_path``.""" + model.to(dtype=torch.bfloat16).save_pretrained(output_path) + save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + tokenizer.save_pretrained(output_path) + + +def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int): + """Save a model checkpoint in AnyModel 
subblock-safetensors format.""" + # Save standard model checkpoint (as safetensors, HF format) + model.save_pretrained(output_dir, safe_serialization=True) + + # Convert/slice weights into AnyModel subblock_safetensors format + Converter.convert_model_weights( + input_dir=output_dir, + output_dir=output_dir, + descriptor=descriptor, + num_hidden_layers=num_hidden_layers, + ) + # Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk. + + config_path = output_dir / "config.json" + if config_path.exists(): + with open(config_path) as f: + config_data = json.load(f) + config_data["architectures"] = ["AnyModel"] + with open(config_path, "w") as f: + json.dump(config_data, f, indent=2) diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py new file mode 100644 index 00000000000..f1c7c99ed0b --- /dev/null +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -0,0 +1,48 @@ +import json +import os +import subprocess +from pathlib import Path + +from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig + + +def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): + """Run ``vllm bench latency`` and return the average latency in milliseconds.""" + output_json_path = model_path / "vllm_latency_benchmark.json" + + cmd = [ + "vllm", + "bench", + "latency", + "--model", + str(model_path), + "--input-len", + str(runtime_config.prefill_seq_len), + "--output-len", + str(runtime_config.generation_seq_len), + "--batch-size", + str(runtime_config.batch_size), + "--output-json", + str(output_json_path), + "--max-model-len", + str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len), + "--num-iters-warmup", + str(runtime_config.num_warmup_iters), + "--num-iters", + str(runtime_config.num_iters), + "--max-num-seqs", + "1", + "--distributed-executor-backend", + "external_launcher", + "--tensor-parallel-size", + "1", + 
"--pipeline-parallel-size", + "1", + ] + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" + subprocess.run(cmd) + + with open(output_json_path) as f: + vllm_results = json.load(f) + print(vllm_results) + return vllm_results["avg_latency"] * 1000 # convert to milliseconds From 12ed46ba2f71813ce9d61380edf1b147524ed461 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Sun, 17 May 2026 07:46:56 -0700 Subject: [PATCH 08/35] working on fix Signed-off-by: Grzegorz Karch --- .../nas/subblock_stats/calc_runtime_stats.py | 57 ++++++++++++++++--- .../torch/nas/subblock_stats/runtime_utils.py | 1 + .../torch/nas/subblock_stats/runtime_vllm.py | 4 +- .../subblock_stats/calc_subblock_stats.py | 39 ++++++------- 4 files changed, 73 insertions(+), 28 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index cadc7d8c9a6..40b3191dbc4 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -18,6 +18,7 @@ import tempfile from dataclasses import replace +from functools import cache from pathlib import Path from omegaconf import DictConfig @@ -36,9 +37,9 @@ ) -def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> BlockConfig: +def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig: return BlockConfig( - attention=AttentionConfig(no_op=False, num_key_value_heads=num_attention_heads), + attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads), ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None), parallel_blocks=None, ) @@ -47,6 +48,7 @@ def _make_standard_block_config(hidden_size: int, num_attention_heads: int) -> B def create_benchmark_model( vocab_size: int, hidden_size: int, + num_key_value_heads: int, num_attention_heads: int, prefill_seq_len: int, generation_seq_len: int, @@ -54,7 +56,7 @@ def create_benchmark_model( 
repeat_block_n_times: int = 10, ) -> LlamaForCausalLM: """Build a small Llama model with repeated subblocks for latency benchmarking.""" - block_configs = [_make_standard_block_config(hidden_size, num_attention_heads)] + block_configs = [_make_standard_block_config(hidden_size, num_key_value_heads)] if block_config: block_configs.extend([block_config] * repeat_block_n_times) @@ -88,17 +90,20 @@ def create_benchmark_model( def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float: """Measure total runtime of a model via vLLM latency benchmark.""" - with tempfile.TemporaryDirectory() as model_tmpdir: + with tempfile.TemporaryDirectory(delete=False) as model_tmpdir: # delete=True after debugging + print(f"|||| Saving model to {model_tmpdir}") save_model( model, Path(runtime_config.tokenizer_path), Path(model_tmpdir), num_hidden_layers=runtime_config.repeat_block_n_times + 1, ) + print(f"|||| Running vLLM latency benchmark on {model_tmpdir}") model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config) return model_total_runtime_ms +@cache def calc_subblock_runtime( runtime_config: RuntimeConfig, subblock_config: SubblockConfig, @@ -110,13 +115,22 @@ def calc_subblock_runtime( if isinstance(subblock_config, BlockConfig): block_config = subblock_config elif isinstance(subblock_config, (AttentionConfig, FFNConfig)): - block_config = subblock_config.to_blockconfig() + if isinstance(subblock_config, FFNConfig): + block_config = BlockConfig( + attention=AttentionConfig( + no_op=False, num_key_value_heads=runtime_config.num_key_value_heads + ), + ffn=subblock_config, + ) + else: + block_config = subblock_config.to_blockconfig() else: raise Exception(f"Runtime stats: Not supported subblock type: {subblock_config}") model = create_benchmark_model( runtime_config.vocab_size, runtime_config.hidden_size, + runtime_config.num_key_value_heads, runtime_config.num_attention_heads, runtime_config.prefill_seq_len, 
runtime_config.generation_seq_len, @@ -126,13 +140,14 @@ def calc_subblock_runtime( return calc_model_runtime(model, runtime_config) +@cache def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: """Estimate the overhead runtime (embedding + LM head) with no decoder blocks.""" runtime_config1 = replace(runtime_config, repeat_block_n_times=0) runtime_config10 = replace(runtime_config, repeat_block_n_times=9) block_config = _make_standard_block_config( - runtime_config.hidden_size, runtime_config.num_attention_heads + runtime_config.hidden_size, runtime_config.num_key_value_heads ) runtime_ms1 = calc_subblock_runtime(runtime_config1, None) @@ -143,12 +158,30 @@ def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: return no_block_runtime_ms +@cache +def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float: + """Calculate the base runtime of a model with no subblocks.""" + base_runtime_ms = 0.0 + if isinstance(subblock_config, AttentionConfig): + base_runtime_ms = calc_subblock_runtime(runtime_config, None) + elif isinstance(subblock_config, FFNConfig): + attn_block_config = AttentionConfig( + no_op=False, num_key_value_heads=runtime_config.num_key_value_heads + ).to_blockconfig() + base_runtime_ms = calc_subblock_runtime(runtime_config, attn_block_config) + else: + raise ValueError(f"Unsupported subblock type: {type(subblock_config)}") + + return base_runtime_ms + + def calc_runtime_for_subblocks( subblock_config_set: set[SubblockConfig], runtime_stats_config: DictConfig, vocab_size: int, hidden_size: int, num_attention_heads: int, + num_key_value_heads: int, master_puzzle_dir: str, tokenizer_path: str, synth_dataset_num_requests: int, @@ -161,6 +194,7 @@ def calc_runtime_for_subblocks( vocab_size, hidden_size, num_attention_heads, + num_key_value_heads, master_puzzle_dir, tokenizer_path, synth_dataset_num_requests, @@ -174,12 +208,15 @@ def calc_runtime_for_subblocks( runtime_by_subblock_dict = {} - 
baseline_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config=None) - for subblock_config in tqdm( sorted(subblock_config_set), desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"), ): + print("|||| Calculating baseline runtime") + # runtime_config_baseline = replace(runtime_config, repeat_block_n_times=0) + baseline_runtime_ms = calc_base_runtime(runtime_config, subblock_config) + print(f"|||| {baseline_runtime_ms=}") + if isinstance(subblock_config, AttentionConfig): num_key_value_heads = subblock_config.num_key_value_heads desc = f"AttentionConfig(num_key_value_heads={num_key_value_heads})" @@ -193,13 +230,17 @@ def calc_runtime_for_subblocks( if subblock_config.no_op: total_runtime_ms = 0.0 else: + print("|||| Calculating subblock runtime") subblock_total_runtime_ms = calc_subblock_runtime(runtime_config, subblock_config) + print(f"|||| {subblock_total_runtime_ms=}") total_runtime_ms = ( subblock_total_runtime_ms - baseline_runtime_ms ) / repeat_block_n_times runtime_by_subblock_dict[subblock_config] = total_runtime_ms + print("|||| Calculating no-block runtime") no_block_runtime_ms = calc_no_block_runtime(runtime_config) + print(f"|||| {no_block_runtime_ms=}") return runtime_by_subblock_dict, no_block_runtime_ms diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index e4eec38e033..39baf2126f2 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -16,6 +16,7 @@ class RuntimeConfig: vocab_size: int hidden_size: int num_attention_heads: int + num_key_value_heads: int master_puzzle_dir: str tokenizer_path: str synth_dataset_num_requests: int diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index f1c7c99ed0b..80d541fb632 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ 
b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -21,7 +21,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): "--output-len", str(runtime_config.generation_seq_len), "--batch-size", - str(runtime_config.batch_size), + "1", #str(runtime_config.batch_size), "--output-json", str(output_json_path), "--max-model-len", @@ -38,6 +38,8 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): "1", "--pipeline-parallel-size", "1", + "--optimization-level", + "1", ] os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" subprocess.run(cmd) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index f36a71710a3..a1c0c6d6aa5 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -24,6 +24,7 @@ from functools import partial from itertools import product from pathlib import Path +from pdb import run from typing import Iterable, Type, TypeVar import pandas as pd @@ -33,6 +34,13 @@ from tqdm import tqdm from transformers import PretrainedConfig +from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import ( + calc_subblock_active_params, + calculate_non_block_memory, + calculate_non_block_params, + calculate_subblock_memory, + calculate_subblock_params, +) from modelopt.torch.utils import json_dump from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory @@ -41,13 +49,6 @@ from ..tools.checkpoint_utils import load_model_config from ..tools.logger import mprint from ..utils.parsing import format_global_config -from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import ( - calc_subblock_active_params, - calculate_non_block_memory, - calculate_non_block_params, - calculate_subblock_memory, - calculate_subblock_params, -) __all__ = [ "calculate_subblock_stats", @@ -124,18 +125,20 @@ 
def calculate_subblock_stats( "synth_dataset_num_requests", 200 ) runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) + runtime_stats_config["batch_size"] = batch_size runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( - subblock_configs_nolayerindex, - runtime_stats_config, - vocab_size, - n_embd, - n_head, - master_puzzle_dir, - teacher_dir, - synth_dataset_num_requests, - prefill_seq_len, - generation_seq_len, + subblock_config_set=subblock_configs_nolayerindex, + runtime_stats_config=runtime_stats_config, + vocab_size=vocab_size, + hidden_size=n_embd, + num_attention_heads=n_head, + num_key_value_heads=8, + master_puzzle_dir=master_puzzle_dir, + tokenizer_path=teacher_dir, + synth_dataset_num_requests=synth_dataset_num_requests, + prefill_seq_len=prefill_seq_len, + generation_seq_len=generation_seq_len, ) sorted_subblock_config = sorted( @@ -486,5 +489,3 @@ def _dataclass_from_dict( if pd.isna(d): return None raise ValueError(f"_dataclass_from_dict: unrecognized {type(d)=} {d=}") - - From ab925b90486d7f7d170c254c276bcfa9020c9211 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 18 May 2026 03:55:44 -0700 Subject: [PATCH 09/35] runtime accuracy improved Signed-off-by: Grzegorz Karch --- .../nas/subblock_stats/calc_runtime_stats.py | 30 ++----------------- .../torch/nas/subblock_stats/runtime_utils.py | 10 +++---- .../torch/nas/subblock_stats/runtime_vllm.py | 4 +-- 3 files changed, 9 insertions(+), 35 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index 40b3191dbc4..b97214461e2 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -40,7 +40,7 @@ def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig: return BlockConfig( attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads), - 
ffn=FFNConfig(no_op=False, intermediate_size=hidden_size, moe=None), + ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None), parallel_blocks=None, ) @@ -90,15 +90,8 @@ def create_benchmark_model( def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) -> float: """Measure total runtime of a model via vLLM latency benchmark.""" - with tempfile.TemporaryDirectory(delete=False) as model_tmpdir: # delete=True after debugging - print(f"|||| Saving model to {model_tmpdir}") - save_model( - model, - Path(runtime_config.tokenizer_path), - Path(model_tmpdir), - num_hidden_layers=runtime_config.repeat_block_n_times + 1, - ) - print(f"|||| Running vLLM latency benchmark on {model_tmpdir}") + with tempfile.TemporaryDirectory() as model_tmpdir: + save_model(model, Path(runtime_config.tokenizer_path), Path(model_tmpdir)) model_total_runtime_ms = run_vllm_latency_benchmark(Path(model_tmpdir), runtime_config) return model_total_runtime_ms @@ -212,35 +205,18 @@ def calc_runtime_for_subblocks( sorted(subblock_config_set), desc=(f"Computing runtime for {len(subblock_config_set)} subblocks\n"), ): - print("|||| Calculating baseline runtime") - # runtime_config_baseline = replace(runtime_config, repeat_block_n_times=0) baseline_runtime_ms = calc_base_runtime(runtime_config, subblock_config) - print(f"|||| {baseline_runtime_ms=}") - - if isinstance(subblock_config, AttentionConfig): - num_key_value_heads = subblock_config.num_key_value_heads - desc = f"AttentionConfig(num_key_value_heads={num_key_value_heads})" - elif isinstance(subblock_config, FFNConfig): - intermediate_size = subblock_config.intermediate_size - desc = f"FFNConfig(intermediate_size={intermediate_size})" - else: - raise ValueError(f"Unsupported subblock type: {type(subblock_config)}") - print(f"Computing runtime for subblock: {desc} {subblock_config.no_op=}") if subblock_config.no_op: total_runtime_ms = 0.0 else: - print("|||| Calculating subblock runtime") subblock_total_runtime_ms = 
calc_subblock_runtime(runtime_config, subblock_config) - print(f"|||| {subblock_total_runtime_ms=}") total_runtime_ms = ( subblock_total_runtime_ms - baseline_runtime_ms ) / repeat_block_n_times runtime_by_subblock_dict[subblock_config] = total_runtime_ms - print("|||| Calculating no-block runtime") no_block_runtime_ms = calc_no_block_runtime(runtime_config) - print(f"|||| {no_block_runtime_ms=}") return runtime_by_subblock_dict, no_block_runtime_ms diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index 39baf2126f2..b3b5278fa68 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -28,18 +28,16 @@ class RuntimeConfig: num_warmup_iters: int -def save_model( - model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path, num_hidden_layers: int -) -> None: +def save_model(model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path) -> None: """Save model weights as AnyModel and copy the tokenizer to ``output_path``.""" model.to(dtype=torch.bfloat16).save_pretrained(output_path) - save_model_as_anymodel(model, output_path, LlamaModelDescriptor, num_hidden_layers) + save_model_as_anymodel(model, output_path, LlamaModelDescriptor) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) tokenizer.save_pretrained(output_path) -def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layers: int): +def save_model_as_anymodel(model, output_dir: Path, descriptor): """Save a model checkpoint in AnyModel subblock-safetensors format.""" # Save standard model checkpoint (as safetensors, HF format) model.save_pretrained(output_dir, safe_serialization=True) @@ -49,7 +47,7 @@ def save_model_as_anymodel(model, output_dir: Path, descriptor, num_hidden_layer input_dir=output_dir, output_dir=output_dir, descriptor=descriptor, - num_hidden_layers=num_hidden_layers, + num_hidden_layers=model.config.num_hidden_layers, ) # 
Load the model config.json, update "architectures" to ["AnyModel"], and write back to disk. diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 80d541fb632..378337487b2 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -21,7 +21,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): "--output-len", str(runtime_config.generation_seq_len), "--batch-size", - "1", #str(runtime_config.batch_size), + "1", "--output-json", str(output_json_path), "--max-model-len", @@ -39,7 +39,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): "--pipeline-parallel-size", "1", "--optimization-level", - "1", + "0", ] os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" subprocess.run(cmd) From 58f17e48b94f0a0df6955a149865cbbdc5b7b4c0 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 18 May 2026 08:29:56 -0700 Subject: [PATCH 10/35] using vllm api instead of subprocess Signed-off-by: Grzegorz Karch --- .../torch/nas/subblock_stats/runtime_vllm.py | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 378337487b2..40f9f1f7239 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -1,8 +1,10 @@ +import argparse import json import os -import subprocess from pathlib import Path +from vllm.benchmarks.latency import main as vllm_latency_main + from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig @@ -10,39 +12,32 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): """Run ``vllm bench latency`` and return the average latency in milliseconds.""" output_json_path = model_path / "vllm_latency_benchmark.json" - cmd = [ - "vllm", - "bench", 
- "latency", - "--model", - str(model_path), - "--input-len", - str(runtime_config.prefill_seq_len), - "--output-len", - str(runtime_config.generation_seq_len), - "--batch-size", - "1", - "--output-json", - str(output_json_path), - "--max-model-len", - str(runtime_config.prefill_seq_len + runtime_config.generation_seq_len), - "--num-iters-warmup", - str(runtime_config.num_warmup_iters), - "--num-iters", - str(runtime_config.num_iters), - "--max-num-seqs", - "1", - "--distributed-executor-backend", - "external_launcher", - "--tensor-parallel-size", - "1", - "--pipeline-parallel-size", - "1", - "--optimization-level", - "0", - ] + # Use vLLM latency benchmark as a library. + + # Create a mock argparse.Namespace similar to what is parsed by vllm.benchmarks.latency.main + args_ns = argparse.Namespace() + + # Populate the Namespace with all required attributes + args_ns.model = str(model_path) + args_ns.input_len = runtime_config.prefill_seq_len + args_ns.output_len = runtime_config.generation_seq_len + args_ns.batch_size = 1 + args_ns.output_json = str(output_json_path) + args_ns.max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len + args_ns.num_iters_warmup = runtime_config.num_warmup_iters + args_ns.num_iters = runtime_config.num_iters + args_ns.max_num_seqs = 1 + args_ns.distributed_executor_backend = ( + "external_launcher" # Running vLLM with torchrun so need to indicate that. + ) + args_ns.tensor_parallel_size = 1 + args_ns.pipeline_parallel_size = 1 + args_ns.optimization_level = 0 # This is required to make the stats accurate. 
+ args_ns.n = 1 + args_ns.disable_detokenize = False + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" - subprocess.run(cmd) + vllm_latency_main(args_ns) with open(output_json_path) as f: vllm_results = json.load(f) From e8683039f07fdd97fcbe0812c9ea7bb10f4576fc Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 03:00:03 -0700 Subject: [PATCH 11/35] working on review feedback Signed-off-by: Grzegorz Karch --- .../llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml | 2 -- ...time.yaml => llama-3_1-8B_pruneffn_runtime.yaml} | 4 ++-- examples/puzzletron/main.py | 13 +++++++------ .../subblock_stats/calc_subblock_stats.py | 3 +-- 4 files changed, 10 insertions(+), 12 deletions(-) rename examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/{llama-3_1-8B_pruneattn_runtime.yaml => llama-3_1-8B_pruneffn_runtime.yaml} (94%) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index bb352598e10..0e270906151 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -97,8 +97,6 @@ realize_model: shuffle_seed: 444 dataset_path: ${dataset_path} -nccl_timeout_minutes: ${timedelta_minutes:10} - # This section redirects Hydra outputs hydra: run: diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml similarity index 94% rename from examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml rename to examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml index 6eaf5f508b8..036486df530 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneattn_runtime.yaml +++ 
b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml @@ -2,6 +2,8 @@ defaults: - Llama-3_1-8B - _self_ +nccl_timeout_minutes: ${timedelta_minutes:90} + # Input Hugging Face model to compress input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct @@ -11,8 +13,6 @@ dataset_path: /workspace/datasets/Nemotron-Post-Training-Dataset-v2 # Working directory for puzzletron outputs puzzle_dir: /workspace/puzzle_dir -dist_timeout_minutes: 60 - calc_subblock_stats: runtime_stats: enabled: true diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py index ba1c19d12c5..82ace24b4da 100644 --- a/examples/puzzletron/main.py +++ b/examples/puzzletron/main.py @@ -75,16 +75,17 @@ def run_full_puzzletron(hydra_config_path: str): hydra_config_path = Path(hydra_config_path).resolve() hydra_config = OmegaConf.load(str(hydra_config_path)) - # Default timeout: 10 minutes, or extended to dist_timeout_minutes if set in config - if hasattr(hydra_config, "dist_timeout_minutes"): - timeout_minutes = timedelta(minutes=hydra_config.dist_timeout_minutes) + # Register Hydra custom resolvers (needed for config resolution) + mtpz.tools.register_hydra_resolvers() + + # Default timeout: 10 minutes, or extended to nccl_timeout_minutes if set in config + if hasattr(hydra_config, "nccl_timeout_minutes"): + timeout_minutes = hydra_config.nccl_timeout_minutes else: timeout_minutes = timedelta(minutes=10) mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}") - dist.setup(timeout=timeout_minutes) - # Register Hydra custom resolvers (needed for config resolution) - mtpz.tools.register_hydra_resolvers() + dist.setup(timeout=timeout_minutes) hydra_config_path = Path(hydra_config_path).resolve() hydra_config_dir = str(hydra_config_path.parent) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 
a1c0c6d6aa5..dc37c1bec26 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -24,7 +24,6 @@ from functools import partial from itertools import product from pathlib import Path -from pdb import run from typing import Iterable, Type, TypeVar import pandas as pd @@ -133,7 +132,7 @@ def calculate_subblock_stats( vocab_size=vocab_size, hidden_size=n_embd, num_attention_heads=n_head, - num_key_value_heads=8, + num_key_value_heads=model_config.num_key_value_heads, master_puzzle_dir=master_puzzle_dir, tokenizer_path=teacher_dir, synth_dataset_num_requests=synth_dataset_num_requests, From f7be643078b752f4ca8abfc6abbb7a9561eb3547 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 03:28:10 -0700 Subject: [PATCH 12/35] removed unused batch_size; cleaned up config loading Signed-off-by: Grzegorz Karch --- examples/puzzletron/main.py | 23 +++++++------------ .../nas/subblock_stats/calc_runtime_stats.py | 1 - .../torch/nas/subblock_stats/runtime_utils.py | 1 - .../subblock_stats/calc_subblock_stats.py | 1 - 4 files changed, 8 insertions(+), 18 deletions(-) diff --git a/examples/puzzletron/main.py b/examples/puzzletron/main.py index 82ace24b4da..f093e5b7e68 100644 --- a/examples/puzzletron/main.py +++ b/examples/puzzletron/main.py @@ -68,25 +68,10 @@ def run_full_puzzletron(hydra_config_path: str): config_path: Path to the YAML configuration file """ mtpz.tools.mprint("Puzzletron Progress 1/8: starting puzzletron pipeline") - # Read the Hydra config to determine runtime_stats:enabled, and set the timeout accordingly - from omegaconf import OmegaConf - - # Resolve absolute path for Hydra config - hydra_config_path = Path(hydra_config_path).resolve() - hydra_config = OmegaConf.load(str(hydra_config_path)) # Register Hydra custom resolvers (needed for config resolution) mtpz.tools.register_hydra_resolvers() - # Default timeout: 10 minutes, or extended to 
nccl_timeout_minutes if set in config - if hasattr(hydra_config, "nccl_timeout_minutes"): - timeout_minutes = hydra_config.nccl_timeout_minutes - else: - timeout_minutes = timedelta(minutes=10) - mtpz.tools.mprint(f"Puzzletron Progress 1/8: Timeout minutes: {timeout_minutes}") - - dist.setup(timeout=timeout_minutes) - hydra_config_path = Path(hydra_config_path).resolve() hydra_config_dir = str(hydra_config_path.parent) hydra_config_name = hydra_config_path.stem @@ -98,6 +83,14 @@ def run_full_puzzletron(hydra_config_path: str): overrides=[], ) + # Default timeout: 10 minutes, or extended to nccl_timeout_minutes if set in config + if hasattr(hydra_cfg, "nccl_timeout_minutes"): + timeout_minutes = hydra_cfg.nccl_timeout_minutes + else: + timeout_minutes = timedelta(minutes=10) + + dist.setup(timeout=timeout_minutes) + # Convert model (convert from HF to DeciLM, score pruning activations, # prune the model and save pruned checkpoints) input_model = mtpz.puzzletron_nas_plugin.PuzzletronModel() diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index b97214461e2..50d4a7e40cb 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -194,7 +194,6 @@ def calc_runtime_for_subblocks( repeat_block_n_times, prefill_seq_len, generation_seq_len, - runtime_stats_config.get("batch_size", 1), runtime_stats_config.get("num_iters", 30), runtime_stats_config.get("num_warmup_iters", 10), ) diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index b3b5278fa68..dce9bcdd36c 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -23,7 +23,6 @@ class RuntimeConfig: repeat_block_n_times: int prefill_seq_len: int generation_seq_len: int - batch_size: int num_iters: int num_warmup_iters: int diff --git 
a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index dc37c1bec26..e76212c573d 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -124,7 +124,6 @@ def calculate_subblock_stats( "synth_dataset_num_requests", 200 ) runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) - runtime_stats_config["batch_size"] = batch_size runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( subblock_config_set=subblock_configs_nolayerindex, From 49235d19290d50e6bb81f0e9faab69783bbc5d02 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 04:55:19 -0700 Subject: [PATCH 13/35] cleanup based on pre-commit Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/__init__.py | 1 + .../torch/nas/subblock_stats/runtime_utils.py | 15 +++++++++++++++ modelopt/torch/nas/subblock_stats/runtime_vllm.py | 15 +++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py index aeac903f8f4..1976eb2f2e1 100644 --- a/modelopt/torch/nas/subblock_stats/__init__.py +++ b/modelopt/torch/nas/subblock_stats/__init__.py @@ -20,4 +20,5 @@ Primary API: - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations """ + from .calc_runtime_stats import calc_runtime_for_subblocks diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index dce9bcdd36c..ed49b644f2b 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import json from dataclasses import dataclass from pathlib import Path diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 40f9f1f7239..347233dcf9b 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -1,3 +1,18 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import argparse import json import os From 781d44d251c7e218397ffeabb96c06f67dcb0718 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 06:19:17 -0700 Subject: [PATCH 14/35] added docstrings Signed-off-by: Grzegorz Karch --- .../torch/nas/subblock_stats/runtime_utils.py | 9 +++++++++ .../torch/nas/subblock_stats/runtime_vllm.py | 16 ++++++++++++++++ 2 files changed, 25 insertions(+) diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index ed49b644f2b..09245c278cb 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -12,6 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""Utilities for runtime benchmarking and model saving in ModelOpt NAS. + +This module provides classes and utility functions used for empirical runtime +estimation of Transformer subblocks and for saving models and tokenizers in +formats suitable for benchmarking (e.g., vLLM latency benchmark) or the +AnyModel subblock-safetensors format. It defines the configuration dataclass +used to parameterize runtime benchmarks, as well as model checkpointing helpers +to ensure compatibility with downstream evaluation pipelines. +""" import json from dataclasses import dataclass diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 347233dcf9b..eb1931addf6 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -12,6 +12,22 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +"""vLLM Runtime Benchmark Integration for ModelOpt NAS Subblocks. 
+ +This module provides the integration logic to empirically benchmark subblock +runtime statistics within transformer architectures using the vLLM latency +benchmark. It defines helper functions and utilities to invoke the vLLM +latency benchmark programmatically (as a library) and collect runtime +statistics, given a prepared model directory and a benchmarking configuration. + +Usage: + - Call `run_vllm_latency_benchmark` with a model path and a + `RuntimeConfig` instance to run a latency benchmark and + return the average latency for the configuration (in milliseconds). + +This is used internally by ModelOpt NAS to benchmark different subblock +configurations for search and scoring, enabling data-driven NAS for latency-optimized architectures. +""" import argparse import json From a1901c7bc12ee4c651c6883fa4836142ffe133b5 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 06:23:58 -0700 Subject: [PATCH 15/35] updated readme Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 134790bb011..a7af6f66959 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -11,7 +11,7 @@ To use the Puzzle algorithm effectively, we need to specify the target number of In this example, we compress the [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) model reducing GPU memory usage from 113 GiB to 96 GiB (15% reduction) with less than 1% regression in the token_accuracy_top_10 metric. Other supported models should be compressed in a similar way. For GptOss there is one [additional step to be performed](GPTOSS.md). -> **Note:** Other models are also supported. 
See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For KV-head pruning see [`llama-3_1-8B_pruneattn_runtime`](./configs/llama-3_1-8B_pruneattn_runtime/) and the [Attention Pruning](#attention-pruning-kv-head-reduction) and [Runtime-Based Latency Optimization](#runtime-based-latency-optimization) sections below. For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md). +> **Note:** Other models are also supported. See the [configs](./configs/) directory for additional model configurations (e.g., Llama-3.2-3B-Instruct on 1x H100, Qwen2.5-7B-Instruct on 1x H100, Qwen3-8B on 1x H100, Nemotron-Nano-12B-v2 on 1x H100, Mistral-Small-24B-Instruct-2501 on 4x H100). For information on adding support for new models, see the [AnyModel Guide](../../modelopt/torch/puzzletron/anymodel/README.md). 
## Environment From 0b755024f3f1040ef80a2bf7796cd06cd7ef34c7 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 08:12:34 -0700 Subject: [PATCH 16/35] further changes based on review Signed-off-by: Grzegorz Karch --- .../validate_solutions_defaults.yaml | 4 ++-- .../nas/subblock_stats/calc_runtime_stats.py | 23 ++++++++----------- .../torch/nas/subblock_stats/runtime_utils.py | 2 -- .../torch/nas/subblock_stats/runtime_vllm.py | 4 ++-- 4 files changed, 14 insertions(+), 19 deletions(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml index ec139023794..f950566802a 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml @@ -2,9 +2,9 @@ defaults: - /validate_model_defaults - _self_ -solutions_to_validate: +solutions_to_validate: [] skip_validation: false save_models: false bigger_is_better: false -sort_solutions_by: +sort_solutions_by: null calculate_full_score_ablations: false diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index 50d4a7e40cb..7c212f8e3b3 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -37,7 +37,7 @@ ) -def _make_standard_block_config(hidden_size: int, num_key_value_heads: int) -> BlockConfig: +def _make_standard_block_config(num_key_value_heads: int) -> BlockConfig: return BlockConfig( attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads), ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None), @@ -56,7 +56,7 @@ def create_benchmark_model( repeat_block_n_times: int = 10, ) -> LlamaForCausalLM: """Build a small Llama model with repeated subblocks for 
latency benchmarking.""" - block_configs = [_make_standard_block_config(hidden_size, num_key_value_heads)] + block_configs = [_make_standard_block_config(num_key_value_heads)] if block_config: block_configs.extend([block_config] * repeat_block_n_times) @@ -136,17 +136,16 @@ def calc_subblock_runtime( @cache def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: """Estimate the overhead runtime (embedding + LM head) with no decoder blocks.""" - runtime_config1 = replace(runtime_config, repeat_block_n_times=0) - runtime_config10 = replace(runtime_config, repeat_block_n_times=9) + runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9) - block_config = _make_standard_block_config( - runtime_config.hidden_size, runtime_config.num_key_value_heads - ) + block_config = _make_standard_block_config(runtime_config.num_key_value_heads) - runtime_ms1 = calc_subblock_runtime(runtime_config1, None) - runtime_ms10 = calc_subblock_runtime(runtime_config10, block_config) + runtime_ms_one_block = calc_subblock_runtime(runtime_config, None) # only one base block + runtime_ms_ten_blocks = calc_subblock_runtime( + runtime_cfg_ten_blocks, block_config + ) # one base block + 9 repeated blocks - no_block_runtime_ms = runtime_ms1 - (runtime_ms10 - runtime_ms1) / 9 + no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9 return no_block_runtime_ms @@ -154,7 +153,7 @@ def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: @cache def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float: """Calculate the base runtime of a model with no subblocks.""" - base_runtime_ms = 0.0 + base_runtime_ms = None if isinstance(subblock_config, AttentionConfig): base_runtime_ms = calc_subblock_runtime(runtime_config, None) elif isinstance(subblock_config, FFNConfig): @@ -188,9 +187,7 @@ def calc_runtime_for_subblocks( hidden_size, num_attention_heads, num_key_value_heads, - master_puzzle_dir, 
tokenizer_path, - synth_dataset_num_requests, repeat_block_n_times, prefill_seq_len, generation_seq_len, diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index 09245c278cb..00e0bc6a5f2 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -41,9 +41,7 @@ class RuntimeConfig: hidden_size: int num_attention_heads: int num_key_value_heads: int - master_puzzle_dir: str tokenizer_path: str - synth_dataset_num_requests: int repeat_block_n_times: int prefill_seq_len: int generation_seq_len: int diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index eb1931addf6..e7a4a90b69c 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -34,13 +34,13 @@ import os from pathlib import Path -from vllm.benchmarks.latency import main as vllm_latency_main - from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): """Run ``vllm bench latency`` and return the average latency in milliseconds.""" + from vllm.benchmarks.latency import main as vllm_latency_main + output_json_path = model_path / "vllm_latency_benchmark.json" # Use vLLM latency benchmark as a library. 
From 7e2f995a1920f0ab3e3f9fa86e3eff3491accb01 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 08:13:20 -0700 Subject: [PATCH 17/35] further changes based on review Signed-off-by: Grzegorz Karch --- .../validate_solutions_defaults.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml index f950566802a..81218606ecd 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml @@ -6,5 +6,5 @@ solutions_to_validate: [] skip_validation: false save_models: false bigger_is_better: false -sort_solutions_by: null +sort_solutions_by: calculate_full_score_ablations: false From 2ca530629f16f1b0c84b755ff393be250b69bef9 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 15:31:57 -0700 Subject: [PATCH 18/35] removed synth_dataset_num_requests Signed-off-by: Grzegorz Karch --- .../nas/subblock_stats/calc_runtime_stats.py | 36 +++++++++---------- .../subblock_stats/calc_subblock_stats.py | 7 ---- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index 7c212f8e3b3..33b03b75531 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -133,23 +133,6 @@ def calc_subblock_runtime( return calc_model_runtime(model, runtime_config) -@cache -def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: - """Estimate the overhead runtime (embedding + LM head) with no decoder blocks.""" - runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9) - - block_config = 
_make_standard_block_config(runtime_config.num_key_value_heads) - - runtime_ms_one_block = calc_subblock_runtime(runtime_config, None) # only one base block - runtime_ms_ten_blocks = calc_subblock_runtime( - runtime_cfg_ten_blocks, block_config - ) # one base block + 9 repeated blocks - - no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9 - - return no_block_runtime_ms - - @cache def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockConfig) -> float: """Calculate the base runtime of a model with no subblocks.""" @@ -167,6 +150,23 @@ def calc_base_runtime(runtime_config: RuntimeConfig, subblock_config: SubblockCo return base_runtime_ms +@cache +def calc_no_block_runtime(runtime_config: RuntimeConfig) -> float: + """Estimate the overhead runtime (embedding + LM head) with no decoder blocks.""" + runtime_cfg_ten_blocks = replace(runtime_config, repeat_block_n_times=9) + + block_config = _make_standard_block_config(runtime_config.num_key_value_heads) + + runtime_ms_one_block = calc_subblock_runtime(runtime_config, None) # only one base block + runtime_ms_ten_blocks = calc_subblock_runtime( + runtime_cfg_ten_blocks, block_config + ) # one base block + 9 repeated blocks + + no_block_runtime_ms = runtime_ms_one_block - (runtime_ms_ten_blocks - runtime_ms_one_block) / 9 + + return no_block_runtime_ms + + def calc_runtime_for_subblocks( subblock_config_set: set[SubblockConfig], runtime_stats_config: DictConfig, @@ -174,9 +174,7 @@ def calc_runtime_for_subblocks( hidden_size: int, num_attention_heads: int, num_key_value_heads: int, - master_puzzle_dir: str, tokenizer_path: str, - synth_dataset_num_requests: int, prefill_seq_len: int, generation_seq_len: int, ) -> tuple[dict[SubblockConfig, float], float]: diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index e76212c573d..0a678dc8c76 100644 --- 
a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -118,11 +118,6 @@ def calculate_subblock_stats( [subblock_config["subblock_config"] for subblock_config in subblock_configs] ) - # dict[SubblockConfig, float], float - # TODO: Manage default values for calc_subblock_stats_config in one place, e.g. within a dataclass for hydra config. - synth_dataset_num_requests = calc_subblock_stats_config.get("runtime_stats", {}).get( - "synth_dataset_num_requests", 200 - ) runtime_stats_config = calc_subblock_stats_config.get("runtime_stats", {}) runtime_by_subblock_dict, non_block_runtime_ms = calc_runtime_for_subblocks( @@ -132,9 +127,7 @@ def calculate_subblock_stats( hidden_size=n_embd, num_attention_heads=n_head, num_key_value_heads=model_config.num_key_value_heads, - master_puzzle_dir=master_puzzle_dir, tokenizer_path=teacher_dir, - synth_dataset_num_requests=synth_dataset_num_requests, prefill_seq_len=prefill_seq_len, generation_seq_len=generation_seq_len, ) From ca2174843d7eb34be207e790c99c6c13c0a9a59e Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Tue, 19 May 2026 16:13:27 -0700 Subject: [PATCH 19/35] removed duplicate model saving Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/runtime_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/nas/subblock_stats/runtime_utils.py index 00e0bc6a5f2..9adb0826278 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_utils.py +++ b/modelopt/torch/nas/subblock_stats/runtime_utils.py @@ -51,7 +51,7 @@ class RuntimeConfig: def save_model(model: LlamaForCausalLM, tokenizer_path: Path, output_path: Path) -> None: """Save model weights as AnyModel and copy the tokenizer to ``output_path``.""" - model.to(dtype=torch.bfloat16).save_pretrained(output_path) + model = model.to(dtype=torch.bfloat16) save_model_as_anymodel(model, output_path, 
LlamaModelDescriptor) tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) From 26ceb36aa4df96486be50eed1300c9f3f1d8e7c3 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 20 May 2026 07:39:06 -0700 Subject: [PATCH 20/35] added test Signed-off-by: Grzegorz Karch --- .../llama-3_1-8B_pruneffn_runtime.yaml | 2 - .../torch/nas/subblock_stats/runtime_vllm.py | 87 +++++++++++-------- .../gpu/torch/nas/test_calc_runtime_stats.py | 78 +++++++++++++++++ 3 files changed, 127 insertions(+), 40 deletions(-) create mode 100644 tests/gpu/torch/nas/test_calc_runtime_stats.py diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml index 036486df530..0c55b3b5c2d 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml @@ -16,11 +16,9 @@ puzzle_dir: /workspace/puzzle_dir calc_subblock_stats: runtime_stats: enabled: true - synth_dataset_num_requests: 32 backend: vllm num_warmup_iters: 2 num_iters: 10 - batch_size: 1 # MIP memory constraint (in MiB) mip: diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index e7a4a90b69c..96ac57b951b 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -16,61 +16,72 @@ This module provides the integration logic to empirically benchmark subblock runtime statistics within transformer architectures using the vLLM latency -benchmark. It defines helper functions and utilities to invoke the vLLM -latency benchmark programmatically (as a library) and collect runtime -statistics, given a prepared model directory and a benchmarking configuration. +benchmark. 
Each invocation is launched in a dedicated subprocess so that GPU +memory and CUDA state are fully reclaimed when the subprocess exits, allowing +many sequential benchmarks to run in a single Python session without leaking. Usage: - Call `run_vllm_latency_benchmark` with a model path and a `RuntimeConfig` instance to run a latency benchmark and return the average latency for the configuration (in milliseconds). - -This is used internally by ModelOpt NAS to benchmark different subblock -configurations for search and scoring, enabling data-driven NAS for latency-optimized architectures. """ -import argparse import json -import os +import subprocess from pathlib import Path from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig -def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig): - """Run ``vllm bench latency`` and return the average latency in milliseconds.""" - from vllm.benchmarks.latency import main as vllm_latency_main +def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float: + """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms. + Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA + context, and vLLM engine state are fully released on subprocess exit, so + many calls in one parent process do not accumulate. + """ output_json_path = model_path / "vllm_latency_benchmark.json" + max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len - # Use vLLM latency benchmark as a library. 
- - # Create a mock argparse.Namespace similar to what is parsed by vllm.benchmarks.latency.main - args_ns = argparse.Namespace() - - # Populate the Namespace with all required attributes - args_ns.model = str(model_path) - args_ns.input_len = runtime_config.prefill_seq_len - args_ns.output_len = runtime_config.generation_seq_len - args_ns.batch_size = 1 - args_ns.output_json = str(output_json_path) - args_ns.max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len - args_ns.num_iters_warmup = runtime_config.num_warmup_iters - args_ns.num_iters = runtime_config.num_iters - args_ns.max_num_seqs = 1 - args_ns.distributed_executor_backend = ( - "external_launcher" # Running vLLM with torchrun so need to indicate that. - ) - args_ns.tensor_parallel_size = 1 - args_ns.pipeline_parallel_size = 1 - args_ns.optimization_level = 0 # This is required to make the stats accurate. - args_ns.n = 1 - args_ns.disable_detokenize = False + cmd = [ + "vllm", + "bench", + "latency", + "--model", + str(model_path), + "--input-len", + str(runtime_config.prefill_seq_len), + "--output-len", + str(runtime_config.generation_seq_len), + "--batch-size", + "1", + "--output-json", + str(output_json_path), + "--max-model-len", + str(max_model_len), + "--num-iters-warmup", + str(runtime_config.num_warmup_iters), + "--num-iters", + str(runtime_config.num_iters), + "--max-num-seqs", + "1", + "--tensor-parallel-size", + "1", + "--pipeline-parallel-size", + "1", + "--distributed-executor-backend", + "external_launcher", + # vLLM defaults to 0.9; keep the per-run budget modest so the parent + # process always has headroom for the next benchmark. + "--gpu-memory-utilization", + "0.3", + # Required for accurate per-block runtime stats. 
+ "--optimization-level", + "0", + ] - os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" - vllm_latency_main(args_ns) + subprocess.run(cmd, check=True) with open(output_json_path) as f: vllm_results = json.load(f) - print(vllm_results) - return vllm_results["avg_latency"] * 1000 # convert to milliseconds + return vllm_results["avg_latency"] * 1000 # seconds -> milliseconds diff --git a/tests/gpu/torch/nas/test_calc_runtime_stats.py b/tests/gpu/torch/nas/test_calc_runtime_stats.py new file mode 100644 index 00000000000..0917f2df502 --- /dev/null +++ b/tests/gpu/torch/nas/test_calc_runtime_stats.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""GPU test for ``calc_runtime_for_subblocks``. + +Exercises the end-to-end vLLM latency benchmarking pipeline on a tiny model: +constructs a small subblock set, runs the benchmark for each candidate, and +checks the returned per-subblock runtime dict and no-block overhead. 
+""" + +from functools import partial +from pathlib import Path + +import pytest +from _test_utils.torch.distributed.utils import spawn_multiprocess_job +from _test_utils.torch.transformers_models import get_tiny_tokenizer + +pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks") + + +def test_calc_runtime_for_subblocks(tmp_path: Path): + """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead.""" + spawn_multiprocess_job(size=1, job=partial(_run, tmp_path), backend="nccl") + + +def _run(tmp_path: Path, rank: int, size: int): + import math + + from omegaconf import OmegaConf + + from modelopt.torch.nas.subblock_stats import calc_runtime_for_subblocks + from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig + + tokenizer = get_tiny_tokenizer() + tokenizer_dir = tmp_path / "tokenizer" + tokenizer.save_pretrained(str(tokenizer_dir)) + + attn = AttentionConfig(no_op=False, num_key_value_heads=2) + ffn = FFNConfig(no_op=False, intermediate_size=256, moe=None) + attn_noop = AttentionConfig(no_op=True) + subblock_set = {attn, ffn, attn_noop} + + # vLLM's bench latency samples input ids in [0, 10000) (see + # vllm/benchmarks/latency.py), and its input validator accepts an id when + # it fits in max(tokenizer.max_token_id, model_vocab_size - 1). The tiny + # tokenizer's vocab is ~200, so we size the model vocab past 10000 to + # cover the sampled range. 
+ runtime_by_subblock, no_block_runtime_ms = calc_runtime_for_subblocks( + subblock_config_set=subblock_set, + runtime_stats_config=OmegaConf.create({"num_iters": 1, "num_warmup_iters": 1}), + vocab_size=10016, + hidden_size=256, + num_attention_heads=4, + num_key_value_heads=2, + tokenizer_path=str(tokenizer_dir), + prefill_seq_len=8, + generation_seq_len=4, + ) + + assert set(runtime_by_subblock) == subblock_set + assert runtime_by_subblock[attn_noop] == 0.0 + assert math.isfinite(runtime_by_subblock[attn]) + assert math.isfinite(runtime_by_subblock[ffn]) + # The 1-block model is always slower than the per-block extrapolation from + # the 10-block model, so the (embedding + LM-head) overhead is positive. + assert no_block_runtime_ms > 0 From 4c5b133e621879fed9271d7d1623b7ef4b94e988 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 20 May 2026 11:47:54 -0700 Subject: [PATCH 21/35] suppressing bandit warnings B404 and B603; precedence found in repo Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/runtime_vllm.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 96ac57b951b..6ba8671bf46 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -27,7 +27,7 @@ """ import json -import subprocess +import subprocess # nosec B404 from pathlib import Path from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig @@ -80,7 +80,8 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) "0", ] - subprocess.run(cmd, check=True) + # cmd is a fixed list of strings (no shell, no untrusted input). 
+ subprocess.run(cmd, text=True, check=True, capture_output=True) # nosec B603 with open(output_json_path) as f: vllm_results = json.load(f) From 398808a81ae19c7332c48bd7950454e3d1990756 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 21 May 2026 02:08:54 -0700 Subject: [PATCH 22/35] removed gpu utilization param Signed-off-by: Grzegorz Karch --- .../validate_solutions_defaults.yaml | 2 +- modelopt/torch/nas/subblock_stats/runtime_vllm.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml index 81218606ecd..ec139023794 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml @@ -2,7 +2,7 @@ defaults: - /validate_model_defaults - _self_ -solutions_to_validate: [] +solutions_to_validate: skip_validation: false save_models: false bigger_is_better: false diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 6ba8671bf46..a6850892798 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -71,10 +71,6 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) "1", "--distributed-executor-backend", "external_launcher", - # vLLM defaults to 0.9; keep the per-run budget modest so the parent - # process always has headroom for the next benchmark. - "--gpu-memory-utilization", - "0.3", # Required for accurate per-block runtime stats. 
"--optimization-level", "0", From e468f62819d5efcea6701de4319e1c530fc60b86 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 21 May 2026 07:10:16 -0700 Subject: [PATCH 23/35] wip Signed-off-by: Grzegorz Karch --- .../torch/nas/subblock_stats/runtime_vllm.py | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index a6850892798..59b97a408ab 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -33,7 +33,7 @@ from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig -def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float: +def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None: """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms. Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA @@ -77,8 +77,23 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) ] # cmd is a fixed list of strings (no shell, no untrusted input). 
- subprocess.run(cmd, text=True, check=True, capture_output=True) # nosec B603 + vllm_results = None + try: + subprocess.run( + cmd, + check=True, + capture_output=True, + text=True, + timeout=1800, # 30 minutes + ) # nosec B603 + except subprocess.TimeoutExpired as exc: + raise TimeoutError("vLLM latency benchmark timed out") from exc + except subprocess.CalledProcessError as exc: + raise RuntimeError(exc.stderr or exc.stdout or "vLLM latency benchmark failed") from exc - with open(output_json_path) as f: - vllm_results = json.load(f) - return vllm_results["avg_latency"] * 1000 # seconds -> milliseconds + if output_json_path.exists(): + with open(output_json_path) as f: + vllm_results = json.load(f) + vllm_results = vllm_results["avg_latency"] * 1000 # seconds -> milliseconds + + return vllm_results From 34dbe52c623a7d536dca01c3b7eac8a081e7ec89 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 21 May 2026 13:50:15 -0700 Subject: [PATCH 24/35] removed redundant configs; guards for vllm results Signed-off-by: Grzegorz Karch --- .../validate_model_defaults.yaml | 2 +- .../Llama-3_1-8B.yaml | 13 +++++--- .../llama-3_1-8B_pruneffn_runtime.yaml | 6 ---- .../pruning/attn_pruning.yaml | 23 ------------- .../pruning/ffn_pruning.yaml | 19 ----------- .../pruning/hidden_dim_pruning.yaml | 15 --------- .../pruning/pruning_defaults.yaml | 33 ------------------- .../validate_model_defaults.yaml | 17 ---------- .../validate_solutions_defaults.yaml | 10 ------ .../torch/nas/subblock_stats/runtime_vllm.py | 5 +-- 10 files changed, 12 insertions(+), 131 deletions(-) delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml delete mode 100644 
examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml delete mode 100644 examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml index ce1749d9698..6b36142a3a8 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/validate_model_defaults.yaml @@ -3,7 +3,7 @@ autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model block_size: 8192 bos_rate: 0.5 data_column: messages -val_dataset_name: valid +val_dataset_name: validation shuffle_seed: 81436 seed: 42 fim_rate: 0 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index 0e270906151..b70e1c367eb 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -1,7 +1,7 @@ defaults: - - pruning: ffn_pruning - - scoring: ../validate_solutions_defaults - - realize_model: ../validate_solutions_defaults + - ../llama-3_1-8B_pruneffn_memory/pruning/ffn_pruning@pruning + - ../llama-3_1-8B_pruneffn_memory/validate_solutions_defaults@scoring + - ../llama-3_1-8B_pruneffn_memory/validate_solutions_defaults@realize_model - bypass: - override hydra/hydra_logging: disabled - _self_ @@ -26,7 +26,7 @@ calc_subblock_stats: prefill_queue_size: 0 allocate_prefill_query: false benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking - merge_with_existing_stats: true + merge_with_existing_stats: false 
subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" runtime_stats: @@ -42,7 +42,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 8 + eval_samples: 128 micro_batch_size: 1 seed: 42 shuffle_seed: 444 @@ -77,6 +77,9 @@ mip: - stats.ffn_num_params - stats.attention_num_params + human_constraints: + target_latency: 21 + mip_constraints: metric_overrides: max_seconds_per_solution: 60 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml index 0c55b3b5c2d..701c31e7c10 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml @@ -16,15 +16,9 @@ puzzle_dir: /workspace/puzzle_dir calc_subblock_stats: runtime_stats: enabled: true - backend: vllm num_warmup_iters: 2 num_iters: 10 -# MIP memory constraint (in MiB) -mip: - human_constraints: - target_latency: 21 - # FFN intermediate sizes to search over (heterogeneous architecture) pruning: intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml deleted file mode 100644 index 53d7e4bd9c6..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/attn_pruning.yaml +++ /dev/null @@ -1,23 +0,0 @@ -defaults: - - pruning_defaults - -hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IndependentKvHeadContributionHook} - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/attn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - 
-pruning_mixin: - _target_: modelopt.torch.puzzletron.pruning.kv_heads_pruning_mixin.KVHeadsPruningMixIn - layer_descriptor: - _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaKVHeadsLayerDescriptor - -activation_hooks_kwargs: - method: independent_kv_head_contribution - optimize_for: memory # IndependentKvHeadContributionHook implementation that consumes less memory - target_layer: "self_attn.o_proj" - layer_input_descriptors_path: - -# n_heads_in_group: 4 -# num_attention_heads: 32 # num query heads -# num_kv_heads: 32 / 4 = 8 # num_query_heads // n_heads_in_group -n_heads_in_group_list: [8, 16, 32] # num_kv_heads = [4, 2, 1] -gqa_init_mode: "PruneKVHeads" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml deleted file mode 100644 index da0b9720700..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/ffn_pruning.yaml +++ /dev/null @@ -1,19 +0,0 @@ -defaults: - - pruning_defaults - -pruning_mixin: - _target_: modelopt.torch.puzzletron.pruning.ffn_intermediate_pruning_mixin.FFNIntermediatePruningMixIn - layer_descriptor: - _target_: modelopt.torch.puzzletron.anymodel.models.llama.llama_model_descriptor.LlamaFFNIntermediateLayerDescriptor - -hook_class: ${get_object:modelopt.torch.prune.importance_hooks.base_hooks.IterativeChannelContributionHook} - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/ffn_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: iterative - target_layer: "mlp.down_proj" - layer_input_descriptors_path: - -intermediate_size_list: [3072, 5888, 8704, 11520] # teacher_intermediate_size is 14336 -mlp_init_mode: "PruneByActivationsLog" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml 
b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml deleted file mode 100644 index 407c835d8c4..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/hidden_dim_pruning.yaml +++ /dev/null @@ -1,15 +0,0 @@ -defaults: - - pruning_defaults - -activations_log_dir: ${puzzle_dir}/pruning/pruning_scores/hidden_dim_${pruning.activation_hooks_kwargs.method}/${pruning.experiment_id} - -activation_hooks_kwargs: - method: layer_norm_contribution - target_layer: "layernorm" - -# Hidden dimension pruning specific settings -hidden_size_list: [3072, 2048] # Target hidden sizes to prune to -hidden_size_init_mode: "PruneByChannelRanking" -mlp_init_mode: "Truncate" # TODO, make it work with CopyAsIs/FromTeacher -gqa_init_mode: "AverageKV" # TODO, make it work with CopyAsIs/FromTeacher -linear_init_mode: "FromTeacher" diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml deleted file mode 100644 index e05e775bee3..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/pruning/pruning_defaults.yaml +++ /dev/null @@ -1,33 +0,0 @@ -defaults: - - /validate_model_defaults - -descriptor: ${descriptor} -model_name_or_path: ${teacher_dir} -experiment_id: ${pruning.eval_samples}samples_diverse_mini -activations_log_dir: ??? -activation_hooks_kwargs: ??? 
- -# Data: -eval_samples: 1000 # default is 10000 -micro_batch_size: 4 -dataset_path: ${dataset_path} -val_dataset_name: train - -# Prune ckpts -pruned_ckpts_output_dir: ${puzzle_dir}/pruning/${pruning.experiment_id} - -## FFN pruning -ffn_list: -mlp_init_mode: "Truncate" # PruneByActivationsLog - -## KV-heads pruning -n_heads_in_group_list: -gqa_init_mode: "AverageKV" - -## Hidden dimension pruning -hidden_size_list: -hidden_size_init_mode: "PruneByChannelRanking" -linear_init_mode: "FromTeacher" - -mlp_init_config_yaml: - activations_log_dir: ${pruning.activations_log_dir} diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml deleted file mode 100644 index 6b36142a3a8..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_model_defaults.yaml +++ /dev/null @@ -1,17 +0,0 @@ -model_dtype: torch.bfloat16 # dtype to cast the model for validate_model -autocast_dtype: torch.bfloat16 # dtype for torch.autocast for validate_model -block_size: 8192 -bos_rate: 0.5 -data_column: messages -val_dataset_name: validation -shuffle_seed: 81436 -seed: 42 -fim_rate: 0 -fim_spm_rate: 0 -source_datasets_to_discard: -varlen: false -write_results: false -calc_losses_on_cpu: false -activations_log_dir: -model_name_or_path: -load_dataset_fn: ${get_object:modelopt.torch.puzzletron.utils.data.dataloaders.load_from_disk_fn} diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml deleted file mode 100644 index ec139023794..00000000000 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/validate_solutions_defaults.yaml +++ /dev/null @@ -1,10 +0,0 @@ -defaults: - - /validate_model_defaults - - _self_ - -solutions_to_validate: -skip_validation: false -save_models: false 
-bigger_is_better: false -sort_solutions_by: -calculate_full_score_ablations: false diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/nas/subblock_stats/runtime_vllm.py index 59b97a408ab..21aa5e30bcd 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/nas/subblock_stats/runtime_vllm.py @@ -94,6 +94,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) if output_json_path.exists(): with open(output_json_path) as f: vllm_results = json.load(f) - vllm_results = vllm_results["avg_latency"] * 1000 # seconds -> milliseconds + if "avg_latency" in vllm_results: + return vllm_results["avg_latency"] * 1000 # seconds -> milliseconds - return vllm_results + raise RuntimeError(f"vLLM benchmark output not found at {output_json_path}") From 24fa2d5a84e6d8bd30996373dd4d3c6e25e4d10c Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 21 May 2026 13:59:59 -0700 Subject: [PATCH 25/35] following annotation suggestion Signed-off-by: Grzegorz Karch --- modelopt/torch/nas/subblock_stats/calc_runtime_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py index 33b03b75531..16d62ed1d61 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py @@ -99,7 +99,7 @@ def calc_model_runtime(model: LlamaForCausalLM, runtime_config: RuntimeConfig) - @cache def calc_subblock_runtime( runtime_config: RuntimeConfig, - subblock_config: SubblockConfig, + subblock_config: SubblockConfig | None, ) -> float: """Measure total runtime of a repeated subblock via vLLM latency benchmark.""" block_config: BlockConfig | None = None From 4b824f143b28918048c09de5b40569608723672d Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Fri, 22 May 2026 03:57:04 -0700 Subject: [PATCH 26/35] updated readme Signed-off-by: 
Grzegorz Karch --- examples/puzzletron/README.md | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index a7af6f66959..91d2bab1d3e 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -345,31 +345,43 @@ For distillation results on Puzzletron-compressed models, see [examples/pruning/ ## Runtime-Based Latency Optimization -By default, subblock statistics use the `trt_torch` backend with theoretical memory proxies. You can instead enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints: +You can enable **runtime stats** to measure actual inference latency via vLLM, which unlocks latency-based MIP constraints. + +A ready-to-run example config is included at [`configs/llama-3_1-8B_pruneffn_runtime/`](./configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml). The following key fields enable and control execution of the runtime statistics in the `llama-3_1-8B_pruneffn_runtime.yaml` config file: ```yaml calc_subblock_stats: runtime_stats: enabled: true - synth_dataset_num_requests: 32 - backend: vllm num_warmup_iters: 2 num_iters: 10 - batch_size: 1 +``` + +The runtime constraint is specified in the `human_constraints` section of the config `Llama-3_1-8B.yaml`: +```yaml +human_constraints: + target_latency: 21 +``` -mip: - human_constraints: - target_latency: 20 # seconds +Run the pipeline against this config the same way as the memory-constrained variant: + +```bash +torchrun --nproc_per_node 2 examples/puzzletron/main.py \ + --config examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml 2>&1 | tee ./log.txt | grep "Puzzletron Progress" ``` -Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly: +The MIP solver will now search for a heterogeneous 
architecture whose measured end-to-end latency is at or below `target_latency`, instead of optimizing for a memory budget. + +Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly (already included in the example config): ```yaml -dist_timeout_minutes: 60 # default is 10 if omitted +nccl_timeout_minutes: 90 # default is 10 if omitted ``` This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout. +Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency` value of 21 seconds resulted in a final model latency of 22.3 seconds. + ## Advanced Usage Modify `llama-3_1-8B_pruneffn_memory.yaml` file for advanced compression scenarios. From ae25ec7a9b9c48f0b6a5ad48465397ff4a0bd04f Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Fri, 22 May 2026 11:55:13 -0700 Subject: [PATCH 27/35] moved stats utils from nas to puzzletron Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 1 + .../Llama-3_1-8B.yaml | 2 +- .../Llama-3_1-8B.yaml | 2 +- modelopt/torch/nas/subblock_stats/__init__.py | 24 ------------------- .../subblock_stats/calc_runtime_stats.py | 4 ++-- .../calc_subblock_params_and_memory.py | 0 .../subblock_stats/calc_subblock_stats.py | 6 +++-- .../subblock_stats/runtime_utils.py | 0 .../subblock_stats/runtime_vllm.py | 2 +- .../test_calc_runtime_stats.py | 18 ++++---------- 10 files changed, 15 insertions(+), 44 deletions(-) delete mode 100644 modelopt/torch/nas/subblock_stats/__init__.py rename modelopt/torch/{nas => puzzletron}/subblock_stats/calc_runtime_stats.py (97%) rename modelopt/torch/{nas => puzzletron}/subblock_stats/calc_subblock_params_and_memory.py (100%) rename modelopt/torch/{nas => 
puzzletron}/subblock_stats/runtime_utils.py (100%) rename modelopt/torch/{nas => puzzletron}/subblock_stats/runtime_vllm.py (97%) rename tests/gpu/torch/{nas => puzzletron}/test_calc_runtime_stats.py (86%) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 91d2bab1d3e..27918819c3f 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -358,6 +358,7 @@ calc_subblock_stats: ``` The runtime constraint is specified in the `human_constraints` section of the config `Llama-3_1-8B.yaml`: + ```yaml human_constraints: target_latency: 21 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml index 21903db1623..1c302fd4c30 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_memory/Llama-3_1-8B.yaml @@ -42,7 +42,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 128 + eval_samples: 8 micro_batch_size: 1 seed: 42 shuffle_seed: 444 diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index b70e1c367eb..eb9e2398efb 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -42,7 +42,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 128 + eval_samples: 8 micro_batch_size: 1 seed: 42 shuffle_seed: 444 diff --git a/modelopt/torch/nas/subblock_stats/__init__.py b/modelopt/torch/nas/subblock_stats/__init__.py deleted file mode 100644 index 1976eb2f2e1..00000000000 --- a/modelopt/torch/nas/subblock_stats/__init__.py +++ 
/dev/null @@ -1,24 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Subblock runtime statistics API for ModelOpt NAS. - -This module provides utilities for measuring and calculating runtime statistics -of subblocks (e.g., Attention, FFN) within transformer architectures. - -Primary API: - - calc_runtime_for_subblocks: Empirically measures runtime for candidate subblock configurations -""" - -from .calc_runtime_stats import calc_runtime_for_subblocks diff --git a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py similarity index 97% rename from modelopt/torch/nas/subblock_stats/calc_runtime_stats.py rename to modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py index 16d62ed1d61..1610b3d7397 100644 --- a/modelopt/torch/nas/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py @@ -25,8 +25,8 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM -from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig, save_model -from modelopt.torch.nas.subblock_stats.runtime_vllm import run_vllm_latency_benchmark +from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig, save_model +from 
modelopt.torch.puzzletron.subblock_stats.runtime_vllm import run_vllm_latency_benchmark from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher from modelopt.torch.puzzletron.block_config import ( diff --git a/modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py similarity index 100% rename from modelopt/torch/nas/subblock_stats/calc_subblock_params_and_memory.py rename to modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 0a678dc8c76..f91ba397fc1 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -33,7 +33,7 @@ from tqdm import tqdm from transformers import PretrainedConfig -from modelopt.torch.nas.subblock_stats.calc_subblock_params_and_memory import ( +from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import ( calc_subblock_active_params, calculate_non_block_memory, calculate_non_block_params, @@ -90,7 +90,9 @@ def calculate_subblock_stats( moe_stats_file: str | Path | None = None, ) -> dict: if runtime_stats_enabled: - from modelopt.torch.nas.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks + from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import ( + calc_runtime_for_subblocks, + ) gpu = None if not torch.cuda.is_available() else torch.cuda.get_device_name() subblock_stats = { diff --git a/modelopt/torch/nas/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py similarity index 100% rename from modelopt/torch/nas/subblock_stats/runtime_utils.py rename to 
modelopt/torch/puzzletron/subblock_stats/runtime_utils.py diff --git a/modelopt/torch/nas/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py similarity index 97% rename from modelopt/torch/nas/subblock_stats/runtime_vllm.py rename to modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py index 21aa5e30bcd..75e3953ac72 100644 --- a/modelopt/torch/nas/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py @@ -30,7 +30,7 @@ import subprocess # nosec B404 from pathlib import Path -from modelopt.torch.nas.subblock_stats.runtime_utils import RuntimeConfig +from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None: diff --git a/tests/gpu/torch/nas/test_calc_runtime_stats.py b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py similarity index 86% rename from tests/gpu/torch/nas/test_calc_runtime_stats.py rename to tests/gpu/torch/puzzletron/test_calc_runtime_stats.py index 0917f2df502..fba8d563f44 100644 --- a/tests/gpu/torch/nas/test_calc_runtime_stats.py +++ b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py @@ -20,29 +20,21 @@ checks the returned per-subblock runtime dict and no-block overhead. 
""" -from functools import partial +import math from pathlib import Path import pytest -from _test_utils.torch.distributed.utils import spawn_multiprocess_job from _test_utils.torch.transformers_models import get_tiny_tokenizer +from omegaconf import OmegaConf + +from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig +from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks") def test_calc_runtime_for_subblocks(tmp_path: Path): """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead.""" - spawn_multiprocess_job(size=1, job=partial(_run, tmp_path), backend="nccl") - - -def _run(tmp_path: Path, rank: int, size: int): - import math - - from omegaconf import OmegaConf - - from modelopt.torch.nas.subblock_stats import calc_runtime_for_subblocks - from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig - tokenizer = get_tiny_tokenizer() tokenizer_dir = tmp_path / "tokenizer" tokenizer.save_pretrained(str(tokenizer_dir)) From f34d3a3f3529ae1fef138bba42aec65376f70e24 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 27 May 2026 04:40:49 -0700 Subject: [PATCH 28/35] responding to reviews Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 6 +++--- .../Llama-3_1-8B.yaml | 8 +------- modelopt/torch/puzzletron/mip/run_puzzle.py | 6 +++--- .../subblock_stats/calc_runtime_stats.py | 16 +++++----------- .../calc_subblock_params_and_memory.py | 8 ++++---- .../subblock_stats/calc_subblock_stats.py | 17 ++++++++--------- .../puzzletron/subblock_stats/runtime_utils.py | 4 ++-- .../puzzletron/subblock_stats/runtime_vllm.py | 2 +- 8 files changed, 27 insertions(+), 40 deletions(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index 27918819c3f..d6f5b17a554 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ 
-361,7 +361,7 @@ The runtime constraint is specified in the `human_constraints` section of the co ```yaml human_constraints: - target_latency: 21 + target_latency_seconds: 21 ``` Run the pipeline against this config the same way as the memory-constrained variant: @@ -371,7 +371,7 @@ torchrun --nproc_per_node 2 examples/puzzletron/main.py \ --config examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml 2>&1 | tee ./log.txt | grep "Puzzletron Progress" ``` -The MIP solver will now search for a heterogeneous architecture whose measured end-to-end latency is at or below `target_latency`, instead of optimizing for a memory budget. +The MIP solver will now search for a heterogeneous architecture whose measured end-to-end latency is at or below `target_latency_seconds`, instead of optimizing for a memory budget. Because vLLM startup adds substantial overhead during stats collection, extend the distributed process group timeout accordingly (already included in the example config): @@ -381,7 +381,7 @@ nccl_timeout_minutes: 90 # default is 10 if omitted This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout. -Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency` value of 21 seconds resulted in a final model latency of 22.3 seconds. +Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency_seconds` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency_seconds` value of 21 resulted in a final model latency of 22.3 seconds. 
## Advanced Usage diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index eb9e2398efb..437f7006e96 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -23,14 +23,10 @@ calc_subblock_stats: prefill_seq_len: 4096 generation_seq_len: 4096 num_active_tokens_override: # Optional override for sequence lengths - prefill_queue_size: 0 - allocate_prefill_query: false benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" - runtime_stats: - backend: trt_torch scoring: descriptor: ${descriptor} @@ -62,8 +58,6 @@ mip: subblock_stats_args: - batch_size: 96 weights_dtype: torch.bfloat16 - activations_dtype: torch.bfloat16 - kv_cache_dtype: torch.bfloat16 report_additional_costs: - stats.memory_mib @@ -78,7 +72,7 @@ mip: - stats.attention_num_params human_constraints: - target_latency: 21 + target_latency_seconds: 21 mip_constraints: metric_overrides: diff --git a/modelopt/torch/puzzletron/mip/run_puzzle.py b/modelopt/torch/puzzletron/mip/run_puzzle.py index 761534f6df9..22c8b471546 100644 --- a/modelopt/torch/puzzletron/mip/run_puzzle.py +++ b/modelopt/torch/puzzletron/mip/run_puzzle.py @@ -79,7 +79,7 @@ class Type(enum.Enum): _ALLOWED_HUMAN_CONSTRAINTS = { "target_memory", "target_throughput", - "target_latency", + "target_latency_seconds", "target_time_to_first_token", "num_params", "stats.has_attention", @@ -175,8 +175,8 @@ def to_mip_constraints(self, subblock_stats_args) -> dict[str, Any]: throughput_constraints.append( batch_size * generation_seq_len / self.constraints["target_throughput"] ) - if "target_latency" in self.constraints: - 
throughput_constraints.append(self.constraints["target_latency"]) + if "target_latency_seconds" in self.constraints: + throughput_constraints.append(self.constraints["target_latency_seconds"]) if throughput_constraints: mip_constraints["stats.runtime_ms"] = 1000 * min(throughput_constraints) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py index 1610b3d7397..641fc21c24f 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py @@ -25,23 +25,17 @@ from tqdm import tqdm from transformers import AutoModelForCausalLM, LlamaConfig, LlamaForCausalLM -from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig, save_model -from modelopt.torch.puzzletron.subblock_stats.runtime_vllm import run_vllm_latency_benchmark -from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor -from modelopt.torch.puzzletron.anymodel.puzzformer import deci_x_patcher -from modelopt.torch.puzzletron.block_config import ( - AttentionConfig, - BlockConfig, - FFNConfig, - SubblockConfig, -) +from ..anymodel.models.llama import LlamaModelDescriptor +from ..anymodel.puzzformer import deci_x_patcher +from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig +from .runtime_utils import RuntimeConfig, save_model +from .runtime_vllm import run_vllm_latency_benchmark def _make_standard_block_config(num_key_value_heads: int) -> BlockConfig: return BlockConfig( attention=AttentionConfig(no_op=False, num_key_value_heads=num_key_value_heads), ffn=FFNConfig(no_op=False, intermediate_size=256, moe=None), - parallel_blocks=None, ) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py index abe7a1a3884..b0772fb839f 100644 --- 
a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py @@ -30,16 +30,16 @@ import torch from transformers import PretrainedConfig -from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor -from modelopt.torch.puzzletron.block_config import ( +from ..anymodel.model_descriptor import ModelDescriptor +from ..block_config import ( AttentionConfig, BlockConfig, FFNConfig, MambaConfig, maybe_cast_block_configs, ) -from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import init_model_from_config -from modelopt.torch.puzzletron.utils.misc import ( +from ..tools.checkpoint_utils_hf import init_model_from_config +from ..utils.misc import ( EmptyInitOnDevice, calculate_kv_dim, raise_unknown_subblock_config_error, diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index f91ba397fc1..b5f91c5efa9 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -33,21 +33,20 @@ from tqdm import tqdm from transformers import PretrainedConfig -from modelopt.torch.puzzletron.subblock_stats.calc_subblock_params_and_memory import ( - calc_subblock_active_params, - calculate_non_block_memory, - calculate_non_block_params, - calculate_subblock_memory, - calculate_subblock_params, -) -from modelopt.torch.utils import json_dump - from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig from ..replacement_library.replacement_utils import parse_layer_replacement from ..tools.checkpoint_utils import load_model_config from ..tools.logger import mprint +from ..utils import json_dump from ..utils.parsing import format_global_config +from .calc_subblock_params_and_memory import ( + 
calc_subblock_active_params, + calculate_non_block_memory, + calculate_non_block_params, + calculate_subblock_memory, + calculate_subblock_params, +) __all__ = [ "calculate_subblock_stats", diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py index 9adb0826278..5b073a8c351 100644 --- a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py @@ -29,8 +29,8 @@ import torch from transformers import AutoTokenizer, LlamaForCausalLM -from modelopt.torch.puzzletron.anymodel.converter import Converter -from modelopt.torch.puzzletron.anymodel.models.llama import LlamaModelDescriptor +from ..anymodel.converter import Converter +from ..anymodel.models.llama import LlamaModelDescriptor @dataclass(frozen=True) diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py index 75e3953ac72..5f996535a0f 100644 --- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py @@ -30,7 +30,7 @@ import subprocess # nosec B404 from pathlib import Path -from modelopt.torch.puzzletron.subblock_stats.runtime_utils import RuntimeConfig +from .runtime_utils import RuntimeConfig def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None: From 333214995c9ed9d9e6711eb6a326ceb077abcc1e Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Wed, 27 May 2026 08:33:05 -0700 Subject: [PATCH 29/35] reenabled some vars Signed-off-by: Grzegorz Karch --- .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml | 4 +++- .../torch/puzzletron/subblock_stats/calc_subblock_stats.py | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml 
index 437f7006e96..0a0cc015e28 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -23,6 +23,8 @@ calc_subblock_stats: prefill_seq_len: 4096 generation_seq_len: 4096 num_active_tokens_override: # Optional override for sequence lengths + prefill_queue_size: 0 + allocate_prefill_query: false benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" @@ -38,7 +40,7 @@ scoring: teacher_dir: ${to_path:${teacher_dir}} output_dir: ${puzzle_dir}/single_sequence_replacement_solutions--validation - eval_samples: 8 + eval_samples: 128 micro_batch_size: 1 seed: 42 shuffle_seed: 444 diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index b5f91c5efa9..9597063600b 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -33,12 +33,13 @@ from tqdm import tqdm from transformers import PretrainedConfig +from modelopt.torch.utils import json_dump + from ..anymodel.model_descriptor import ModelDescriptor, ModelDescriptorFactory from ..block_config import AttentionConfig, BlockConfig, FFNConfig, SubblockConfig from ..replacement_library.replacement_utils import parse_layer_replacement from ..tools.checkpoint_utils import load_model_config from ..tools.logger import mprint -from ..utils import json_dump from ..utils.parsing import format_global_config from .calc_subblock_params_and_memory import ( calc_subblock_active_params, From 88e16d7c1687eee4215d485681f157465d9fd514 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 28 May 2026 14:09:23 -0700 Subject: [PATCH 30/35] added support for batch_sizes Signed-off-by: Grzegorz Karch --- 
.../llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml | 10 ++++++---- .../llama-3_1-8B_pruneffn_runtime.yaml | 2 -- .../puzzletron/subblock_stats/calc_runtime_stats.py | 3 +++ .../puzzletron/subblock_stats/calc_subblock_stats.py | 2 +- .../torch/puzzletron/subblock_stats/runtime_utils.py | 1 + .../torch/puzzletron/subblock_stats/runtime_vllm.py | 2 +- tests/gpu/torch/puzzletron/test_calc_runtime_stats.py | 1 + 7 files changed, 13 insertions(+), 8 deletions(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index 0a0cc015e28..4020497f800 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -19,9 +19,9 @@ build_replacement_library: add_attention_no_ops: true calc_subblock_stats: - batch_sizes: [64, 96, 128] - prefill_seq_len: 4096 - generation_seq_len: 4096 + batch_sizes: [1, 4] + prefill_seq_len: 1024 + generation_seq_len: 1024 num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false @@ -58,7 +58,7 @@ mip: bigger_is_better: false subblock_stats_args: - - batch_size: 96 + - batch_size: 1 weights_dtype: torch.bfloat16 report_additional_costs: @@ -96,6 +96,8 @@ realize_model: shuffle_seed: 444 dataset_path: ${dataset_path} +nccl_timeout_minutes: ${timedelta_minutes:120} + # This section redirects Hydra outputs hydra: run: diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml index 701c31e7c10..588df25f27d 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/llama-3_1-8B_pruneffn_runtime.yaml @@ -2,8 +2,6 @@ 
defaults: - Llama-3_1-8B - _self_ -nccl_timeout_minutes: ${timedelta_minutes:90} - # Input Hugging Face model to compress input_hf_model_path: /workspace/hf_models/meta-llama/Llama-3.1-8B-Instruct diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py index 641fc21c24f..6e4821936e7 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_runtime_stats.py @@ -171,9 +171,11 @@ def calc_runtime_for_subblocks( tokenizer_path: str, prefill_seq_len: int, generation_seq_len: int, + batch_size: int, ) -> tuple[dict[SubblockConfig, float], float]: """Benchmark each unique subblock and return per-subblock runtimes and no-block overhead.""" repeat_block_n_times = 10 + runtime_config = RuntimeConfig( vocab_size, hidden_size, @@ -183,6 +185,7 @@ def calc_runtime_for_subblocks( repeat_block_n_times, prefill_seq_len, generation_seq_len, + batch_size, runtime_stats_config.get("num_iters", 30), runtime_stats_config.get("num_warmup_iters", 10), ) diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 9597063600b..4284e70a3db 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -19,7 +19,6 @@ import copy import dataclasses import json -import os import warnings from functools import partial from itertools import product @@ -132,6 +131,7 @@ def calculate_subblock_stats( tokenizer_path=teacher_dir, prefill_seq_len=prefill_seq_len, generation_seq_len=generation_seq_len, + batch_size=batch_size, ) sorted_subblock_config = sorted( diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py index 5b073a8c351..3259e706c73 100644 --- 
a/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_utils.py @@ -45,6 +45,7 @@ class RuntimeConfig: repeat_block_n_times: int prefill_seq_len: int generation_seq_len: int + batch_size: int num_iters: int num_warmup_iters: int diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py index 5f996535a0f..cc88fc5fe20 100644 --- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py @@ -54,7 +54,7 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) "--output-len", str(runtime_config.generation_seq_len), "--batch-size", - "1", + str(runtime_config.batch_size), "--output-json", str(output_json_path), "--max-model-len", diff --git a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py index fba8d563f44..d976292969d 100644 --- a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py +++ b/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py @@ -59,6 +59,7 @@ def test_calc_runtime_for_subblocks(tmp_path: Path): tokenizer_path=str(tokenizer_dir), prefill_seq_len=8, generation_seq_len=4, + batch_size=1, ) assert set(runtime_by_subblock) == subblock_set From 3f69e555336fb977e9c2d878af3e3daf7abfb012 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Thu, 28 May 2026 14:26:09 -0700 Subject: [PATCH 31/35] further fixes Signed-off-by: Grzegorz Karch --- .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml | 1 - modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index 4020497f800..a813f6c8259 100644 --- 
a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -25,7 +25,6 @@ calc_subblock_stats: num_active_tokens_override: # Optional override for sequence lengths prefill_queue_size: 0 allocate_prefill_query: false - benchmark_iterations: # Set to a number (e.g., 1000) to enable runtime benchmarking merge_with_existing_stats: false subblock_stats_filename: "subblock_stats.json" moe_stats_filename: "moe_stats.json" diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py index cc88fc5fe20..b1f3cf2fb7e 100644 --- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py @@ -33,7 +33,7 @@ from .runtime_utils import RuntimeConfig -def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float | None: +def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) -> float: """Run ``vllm bench latency`` in a fresh subprocess and return avg latency in ms. Spawning a subprocess per call gives OS-level isolation: GPU memory, CUDA @@ -77,7 +77,6 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) ] # cmd is a fixed list of strings (no shell, no untrusted input). 
- vllm_results = None try: subprocess.run( cmd, From 36f46855a690dfc14511e932162e729efe46a6a6 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Sun, 31 May 2026 01:38:03 -0700 Subject: [PATCH 32/35] using 5s latency target in the example Signed-off-by: Grzegorz Karch --- examples/puzzletron/README.md | 2 +- .../configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/puzzletron/README.md b/examples/puzzletron/README.md index d6f5b17a554..dce76866d6d 100644 --- a/examples/puzzletron/README.md +++ b/examples/puzzletron/README.md @@ -381,7 +381,7 @@ nccl_timeout_minutes: 90 # default is 10 if omitted This field is supported in any Puzzletron YAML config and overrides the default 10-minute distributed timeout. -Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency_seconds` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency_seconds` value of 21 resulted in a final model latency of 22.3 seconds. +Due to non-linear extension of the runtime stats of single subblocks to the total runtime of the model, the `target_latency_seconds` value should be set to a value that is slightly lower than the desired latency. For example, in our experiments, the `target_latency_seconds` value of 5 resulted in a final model latency of 5.4 seconds. 
## Advanced Usage diff --git a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml index a813f6c8259..b4adbb82add 100644 --- a/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml +++ b/examples/puzzletron/configs/llama-3_1-8B_pruneffn_runtime/Llama-3_1-8B.yaml @@ -73,7 +73,7 @@ mip: - stats.attention_num_params human_constraints: - target_latency_seconds: 21 + target_latency_seconds: 5 mip_constraints: metric_overrides: From b1b810f16fa0b588b2546c516cd512bdd7cf5431 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 8 Jun 2026 03:57:49 -0700 Subject: [PATCH 33/35] added vllm adapter Signed-off-by: Grzegorz Karch --- .../puzzletron/subblock_stats/runtime_vllm.py | 12 ++ .../torch/puzzletron/utils/vllm_adapter.py | 203 ++++++++++++++++++ 2 files changed, 215 insertions(+) create mode 100644 modelopt/torch/puzzletron/utils/vllm_adapter.py diff --git a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py index b1f3cf2fb7e..14eb337b707 100644 --- a/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py +++ b/modelopt/torch/puzzletron/subblock_stats/runtime_vllm.py @@ -30,6 +30,8 @@ import subprocess # nosec B404 from pathlib import Path +from ..tools.logger import mprint +from ..utils.vllm_adapter import convert_block_configs_to_per_layer_config from .runtime_utils import RuntimeConfig @@ -43,6 +45,16 @@ def run_vllm_latency_benchmark(model_path: Path, runtime_config: RuntimeConfig) output_json_path = model_path / "vllm_latency_benchmark.json" max_model_len = runtime_config.prefill_seq_len + runtime_config.generation_seq_len + with open(model_path / "config.json") as f: + config = json.load(f) + + if convert_block_configs_to_per_layer_config(config): + mprint("Converted block configs to per-layer config") + with open(model_path / "config.json", "w") as f: + json.dump(config, f, 
indent=2) + else: + mprint("No block configs to convert") + cmd = [ "vllm", "bench", diff --git a/modelopt/torch/puzzletron/utils/vllm_adapter.py b/modelopt/torch/puzzletron/utils/vllm_adapter.py new file mode 100644 index 00000000000..ae8409a1de7 --- /dev/null +++ b/modelopt/torch/puzzletron/utils/vllm_adapter.py @@ -0,0 +1,203 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ModelOpt/AnyModel -> vLLM/AnyModel config adapter. + +ModelOpt/AnyModel checkpoints describe per-layer overrides via a dense +``block_configs`` list with nested ``attention`` / ``ffn`` sub-sections. +AnyModel in vLLM now consumes the HuggingFace heterogeneity schema: a sparse +``per_layer_config`` dict mapping ``layer_idx -> {flat HF keys + optional +"skip" list}``. + +This module rewrites the Puzzletron schema in-place so vLLM only +ever sees ``per_layer_config``. It is invoked from +``AnyModelConfig.verify_and_update_model_config`` before the arch +convertor or layer-patching runs. +""" + +from __future__ import annotations + +from typing import Any + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +# (num_experts_field, moe_intermediate_size_field) per base architecture. 
+# ModelOpt always writes ``moe.num_local_experts`` and +# ``moe.expert_intermediate_{size,dim}``; the adapter rewrites them into the +# field names the base HF config actually reads. +_MOE_FIELDS_BY_ARCH: dict[str, tuple[str, str]] = { + "Qwen2MoeForCausalLM": ("num_experts", "moe_intermediate_size"), + "Qwen3MoeForCausalLM": ("num_experts", "moe_intermediate_size"), + "MixtralForCausalLM": ("num_local_experts", "intermediate_size"), + "GptOssForCausalLM": ("num_local_experts", "intermediate_size"), + "NemotronHForCausalLM": ("n_routed_experts", "moe_intermediate_size"), + "DeepseekV3ForCausalLM": ("n_routed_experts", "moe_intermediate_size"), + "DeepseekV2ForCausalLM": ("n_routed_experts", "moe_intermediate_size"), +} + +_DEFAULT_MOE_FIELDS: tuple[str, str] = ("num_local_experts", "intermediate_size") + + +def _get(obj: Any, key: str, default: Any = None) -> Any: + if obj is None: + return default + if isinstance(obj, dict): + return obj.get(key, default) + return getattr(obj, key, default) + + +def _convert_block_entry( + block: Any, + *, + global_kv: int | None, + global_isize: int | None, + global_hact: str | None, + global_moe_num: int | None, + global_moe_size: int | None, + moe_num_field: str, + moe_size_field: str, +) -> dict[str, Any]: + """Translate a single ModelOpt ``block_configs`` entry into a flat + per-layer override dict. 
Only attributes that differ from the global + config are emitted; sub-module no-ops become a ``"skip"`` list.""" + attn = _get(block, "attention") or {} + ffn = _get(block, "ffn") or {} + a_noop = bool(_get(attn, "no_op", False)) + f_noop = bool(_get(ffn, "no_op", False)) + + entry: dict[str, Any] = {} + skip: list[str] = [] + if a_noop: + skip.append("attention") + if f_noop: + skip.append("mlp") + if skip: + entry["skip"] = skip + + if not a_noop: + kv = _get(attn, "num_key_value_heads") + if kv is not None and kv != global_kv: + entry["num_key_value_heads"] = kv + + if not f_noop: + isize = _get(ffn, "intermediate_size") + if isize is not None and isize != global_isize: + entry["intermediate_size"] = isize + + hact = _get(ffn, "hidden_act") + if hact is not None and hact != global_hact: + entry["hidden_act"] = hact + + moe = _get(ffn, "moe") + if moe: + n_exp = _get(moe, "num_local_experts") + if n_exp is None: + n_exp = _get(moe, "num_experts") + if n_exp is None: + n_exp = _get(moe, "n_routed_experts") + if n_exp is not None and n_exp != global_moe_num: + entry[moe_num_field] = n_exp + + exp_size = _get( + moe, + "expert_intermediate_size", + _get(moe, "expert_intermediate_dim"), + ) + if exp_size is not None and exp_size != global_moe_size: + entry[moe_size_field] = exp_size + + return entry + + +def convert_block_configs_to_per_layer_config(hf_config: Any) -> bool: + """In-place: convert legacy ``block_configs`` on ``hf_config`` to + ``per_layer_config`` on its text config. + + Returns ``True`` if a conversion happened, ``False`` if there was + nothing to convert. If ``per_layer_config`` is already set, the legacy + field is dropped and a warning emitted (the new schema wins). 
+ """ + block_configs = getattr(hf_config, "block_configs", None) + if not block_configs: + return False + + text_config = ( + hf_config.get_text_config() if hasattr(hf_config, "get_text_config") else hf_config + ) + + existing = getattr(text_config, "per_layer_config", None) + if existing: + logger.warning_once( + "AnyModel config has both legacy 'block_configs' and new " + "'per_layer_config'; using per_layer_config and ignoring " + "block_configs." + ) + if hasattr(hf_config, "block_configs"): + try: + delattr(hf_config, "block_configs") + except AttributeError: + pass + return False + + base_architecture = getattr(hf_config, "base_architecture", None) or "" + moe_num_field, moe_size_field = _MOE_FIELDS_BY_ARCH.get(base_architecture, _DEFAULT_MOE_FIELDS) + + global_kv = getattr(text_config, "num_key_value_heads", None) + global_isize = getattr(text_config, "intermediate_size", None) + global_hact = getattr(text_config, "hidden_act", None) + global_moe_num = getattr(text_config, moe_num_field, None) + global_moe_size = getattr(text_config, moe_size_field, None) + + per_layer_config: dict[str, dict[str, Any]] = {} + for idx, block in enumerate(block_configs): + entry = _convert_block_entry( + block, + global_kv=global_kv, + global_isize=global_isize, + global_hact=global_hact, + global_moe_num=global_moe_num, + global_moe_size=global_moe_size, + moe_num_field=moe_num_field, + moe_size_field=moe_size_field, + ) + if entry: + per_layer_config[str(idx)] = entry + + n_layers = getattr(text_config, "num_hidden_layers", None) + if n_layers is not None and len(block_configs) != n_layers: + logger.warning( + "block_configs length (%d) does not match num_hidden_layers " + "(%d); converted entries beyond num_hidden_layers will fail " + "AnyModel validation.", + len(block_configs), + n_layers, + ) + + setattr(text_config, "per_layer_config", per_layer_config) + try: + delattr(hf_config, "block_configs") + except AttributeError: + pass + + logger.info( + "Converted ModelOpt 
block_configs (%d entries) to AnyModel " + "per_layer_config (%d non-empty entries) for base_architecture=%r.", + len(block_configs), + len(per_layer_config), + base_architecture or "", + ) + return True From f49fbc976523b52346b7afd10ec1cbeb1212ec07 Mon Sep 17 00:00:00 2001 From: Grzegorz Karch Date: Mon, 8 Jun 2026 04:15:51 -0700 Subject: [PATCH 34/35] disabled vllm tests that depends on anymodel Signed-off-by: Grzegorz Karch --- .../torch/puzzletron/test_calc_runtime_stats.py | 1 + 1 file changed, 1 insertion(+) rename tests/{gpu => gpu_vllm}/torch/puzzletron/test_calc_runtime_stats.py (97%) diff --git a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py similarity index 97% rename from tests/gpu/torch/puzzletron/test_calc_runtime_stats.py rename to tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py index d976292969d..bff5438db0a 100644 --- a/tests/gpu/torch/puzzletron/test_calc_runtime_stats.py +++ b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py @@ -33,6 +33,7 @@ pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks") +@pytest.mark.skip(reason="AnyModel is not supported in vLLM yet") def test_calc_runtime_for_subblocks(tmp_path: Path): """End-to-end: a tiny subblock set yields a runtime dict + positive no-block overhead.""" tokenizer = get_tiny_tokenizer() From d6e1c6bd14a7a7e3040f783873bb056f7898938c Mon Sep 17 00:00:00 2001 From: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> Date: Mon, 8 Jun 2026 11:14:24 -0700 Subject: [PATCH 35/35] Fix CI failures Signed-off-by: Keval Morabia <28916987+kevalmorabia97@users.noreply.github.com> --- docs/source/conf.py | 2 +- .../kernels/sparsity/attention/calibrate.py | 13 +- .../calc_subblock_params_and_memory.py | 111 ++++++++---------- .../subblock_stats/calc_subblock_stats.py | 5 +- noxfile.py | 2 +- .../puzzletron/test_calc_runtime_stats.py | 2 - 6 files changed, 56 insertions(+), 79 
deletions(-) diff --git a/docs/source/conf.py b/docs/source/conf.py index 6fe7a860024..47f997a0113 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -124,7 +124,7 @@ # Mock imports for autodoc -autodoc_mock_imports = ["mpi4py", "tensorrt_llm", "triton"] +autodoc_mock_imports = ["mpi4py", "tensorrt_llm", "triton", "vllm"] autosummary_generate = True autosummary_imported_members = False diff --git a/modelopt/torch/kernels/sparsity/attention/calibrate.py b/modelopt/torch/kernels/sparsity/attention/calibrate.py index 971c423f711..61707f63013 100644 --- a/modelopt/torch/kernels/sparsity/attention/calibrate.py +++ b/modelopt/torch/kernels/sparsity/attention/calibrate.py @@ -200,17 +200,18 @@ def attention_calibrate( measuring how many KV tiles would be skipped at each threshold in ``threshold_trials``. No autograd — forward only. + All arguments except ``threshold_trials`` match + :func:`modelopt.torch.kernels.common.attention.attention`. + Args: - q, k, v, b_start_loc, b_seq_len, max_input_len, is_causal, - softmax_scale, b_start_loc_k, b_seq_len_k, max_input_len_k: - Same as :func:`modelopt.torch.kernels.common.attention.attention`. threshold_trials: List of threshold values to measure sparsity for. Each value is converted to log2-scaled space for the kernel. Returns: - Tuple of (output, sparsity_counters): - - output: ``[total_q_tokens, num_q_heads, head_dim]`` - - sparsity_counters: ``[num_thresholds, 2]`` int64 tensor where + Tuple of ``(output, sparsity_counters)``: + + - ``output``: ``[total_q_tokens, num_q_heads, head_dim]`` + - ``sparsity_counters``: ``[num_thresholds, 2]`` int64 tensor where ``[:, 0]`` = total tile evaluations, ``[:, 1]`` = skipped tiles. Sparsity per threshold = ``counters[:, 1] / counters[:, 0]``. 
""" diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py index b0772fb839f..531f7a3f0a1 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_params_and_memory.py @@ -79,21 +79,21 @@ def calculate_subblock_memory( Given its configuration and runtime dimensions, returns bytes or a detailed dict. Args: - subblock_config (FFNConfig | AttentionConfig): Subblock configuration dataclass. - batch_size (int): Batch size for memory estimate. - prefill_seq_len (int): Sequence length for prefill phase. - generation_seq_len (int): Sequence length for generation phase (token-by-token). - prefill_queue_size (int): Token queue size for prefill attention memory allocation. - n_embd (int): Embedding (hidden) dimension. - n_head (int): Number of attention heads (used for non-FFN). - weights_dtype (torch.dtype): PyTorch dtype for model weights. - kv_cache_dtype (torch.dtype): PyTorch dtype for KV cache. - allocate_prefill_query (bool): Whether to allocate query cache for prefill tokens. - model_config (PretrainedConfig): HuggingFace-style config instance describing the model. - descriptor (type[ModelDescriptor]): Model descriptor type (for puzzletron model types). + subblock_config: Subblock configuration dataclass. + batch_size: Batch size for memory estimate. + prefill_seq_len: Sequence length for prefill phase. + generation_seq_len: Sequence length for generation phase (token-by-token). + prefill_queue_size: Token queue size for prefill attention memory allocation. + n_embd: Embedding (hidden) dimension. + n_head: Number of attention heads (used for non-FFN). + weights_dtype: PyTorch dtype for model weights. + kv_cache_dtype: PyTorch dtype for KV cache. + allocate_prefill_query: Whether to allocate query cache for prefill tokens. 
+ model_config: HuggingFace-style config instance describing the model. + descriptor: Model descriptor type (for puzzletron model types). Returns: - float | dict[str, float]: Memory usage in bytes (float), or a dictionary by memory type. + Memory usage in bytes (float), or a dictionary by memory type. """ if subblock_config.no_op: return 0 @@ -229,7 +229,7 @@ def calc_subblock_active_params( block_idx: The index of the block/subblock within the network, used to index into the stats. Returns: - int: The expected number of "active" parameters for the given subblock. + The expected number of "active" parameters for the given subblock. """ if not (isinstance(sublayer_config, FFNConfig) and sublayer_config.is_moe): return calculate_subblock_params(model_config, sublayer_config, descriptor) @@ -245,12 +245,12 @@ def load_moe_stats(stats_file: str) -> dict: It returns the normalized probability distributions over experts for each block, as a list of numpy arrays. Args: - stats_file (str): Path to the JSON file containing expert routing statistics for each block. + stats_file: Path to the JSON file containing expert routing statistics for each block. Returns: - list[np.ndarray]: A list where each element is a numpy array containing the normalized probability - distribution over experts for the corresponding block. If a block's expert list is empty, - its entry is 0. + A list where each element is a numpy array containing the normalized probability + distribution over experts for the corresponding block. If a block's expert list is empty, + its entry is 0. """ with open(stats_file) as f: stats = json.load(f) @@ -271,12 +271,12 @@ def estimate_num_active_experts( expected number of active (i.e., selected at least once) experts is computed. Args: - dist_over_experts (np.ndarray): A 1D array of probabilities for each expert. - batch_size (int): The number of samples in the batch. 
- num_experts (int): The maximum number of experts to consider (fewer if `dist_over_experts` is shorter). + dist_over_experts: A 1D array of probabilities for each expert. + batch_size: The number of samples in the batch. + num_experts: The maximum number of experts to consider (fewer if `dist_over_experts` is shorter). Returns: - int: The expected number of experts selected at least once across the batch. + The expected number of experts selected at least once across the batch. """ # cut the tail and renormalize dist_over_experts = np.sort(dist_over_experts)[::-1][:num_experts] @@ -296,14 +296,14 @@ def estimate_moe_active_params( """Estimate the expected number of active (used) parameters for a Mixture-of-Experts (MoE) FFN subblock. Args: - subblock_config (FFNConfig): The FFNConfig for the MoE subblock (with .moe field configured). - n_embd (int): The embedding dimension (input and output size per expert). - moe_stats_file (Path | str): Path to the JSON file containing routing/selection probabilities for experts. - batch_size (int): Batch size to simulate/extrapolate expected expert use. - block_idx (int): The index of the block/layer whose expert routing statistics should be used. + subblock_config: The FFNConfig for the MoE subblock (with .moe field configured). + n_embd: The embedding dimension (input and output size per expert). + moe_stats_file: Path to the JSON file containing routing/selection probabilities for experts. + batch_size: Batch size to simulate/extrapolate expected expert use. + block_idx: The index of the block/layer whose expert routing statistics should be used. Returns: - int: Estimated number of parameters actively used for the current batch and expert selection statistics. + Estimated number of parameters actively used for the current batch and expert selection statistics. """ assert Path(moe_stats_file).exists() # if not Path(moe_stats_file).exists(): # if path is not provided, should we assume uniform distribution? 
@@ -382,16 +382,15 @@ def calculate_mamba_memory( """Calculate memory usage (MiB) for a Mamba attention subblock. Args: - attention_config (AttentionConfig): Mamba attention configuration, - including Mamba-specific settings. - model_config (PretrainedConfig): Model configuration. - descriptor (type[ModelDescriptor]): Model descriptor class. - batch_size (int): Batch size for memory estimate. - weights_dtype (torch.dtype): Data type for model weights. - kv_cache_dtype (torch.dtype): Data type for state/kv-cache. + attention_config: Mamba attention configuration, including Mamba-specific settings. + model_config: Model configuration. + descriptor: Model descriptor class. + batch_size: Batch size for memory estimate. + weights_dtype: Data type for model weights. + kv_cache_dtype: Data type for state/kv-cache. Returns: - int: Estimated memory usage in mebibytes (MiB) for the Mamba subblock. + Estimated memory usage in mebibytes (MiB) for the Mamba subblock. """ assert attention_config.mamba is not None mamba_config = attention_config.mamba @@ -409,11 +408,11 @@ def calculate_mamba_state_size( """Calculate the total state size for a Mamba attention subblock. Args: - mamba_config (MambaConfig): Configuration object containing Mamba subblock parameters. - batch_size (int): Batch size to estimate the memory/state requirements for. + mamba_config: Configuration object containing Mamba subblock parameters. + batch_size: Batch size to estimate the memory/state requirements for. Returns: - int: Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state. + Total state size (number of elements) required for the Mamba subblock, including convolution and SSM state. """ _, _, conv_dim, kernel_size = _calculate_mamba_intermediates(mamba_config) conv_state_size = math.prod((batch_size, conv_dim, kernel_size)) @@ -443,15 +442,14 @@ def calculate_ffn_memory( """Estimate the memory usage in MiB of a feed-forward network (FFN) subblock. 
Args: - ffn_config (FFNConfig): FFN configuration for the block. - model_config (PretrainedConfig): The parent model configuration. - descriptor (type[ModelDescriptor]): Model descriptor class. - weights_dtype (torch.dtype | str): Data type for FFN weights. - experts_dtype (torch.dtype | str | None, optional): Data type for expert weights - (for MoE layers, if present). Defaults to None. + ffn_config: FFN configuration for the block. + model_config: The parent model configuration. + descriptor: Model descriptor class. + weights_dtype: Data type for FFN weights. + experts_dtype: Data type for expert weights (for MoE layers, if present). Returns: - float: Estimated FFN memory usage in mebibytes (MiB). + Estimated FFN memory usage in mebibytes (MiB). """ # TODO: How to separate between expert weights and the rest for any model (same as puzzletron). num_params = calculate_subblock_params(model_config, ffn_config, descriptor) @@ -463,16 +461,7 @@ def calculate_non_block_memory( vocab_size: int, weight_dtype: torch.dtype, ) -> float: - """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection). - - Args: - n_embd (int): Embedding dimension (hidden size). - vocab_size (int): Vocabulary size. - weight_dtype (torch.dtype): Data type for model weights. - - Returns: - float: Estimated non-subblock memory usage in mebibytes (MiB). - """ + """Estimate the memory usage in MiB of non-subblock components (e.g., embeddings, output projection).""" return calculate_non_block_params(n_embd, vocab_size) * sizeof_dtype(weight_dtype) / 2**20 @@ -480,13 +469,5 @@ def calculate_non_block_params( n_embd: int, vocab_size: int, ) -> int: - """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection). - - Args: - n_embd (int): Embedding dimension (hidden size). - vocab_size (int): Vocabulary size. - - Returns: - int: Estimated non-subblock parameter count. 
- """ + """Calculate the number of parameters for non-subblock components (e.g., embeddings, output projection).""" return vocab_size * n_embd * 2 + n_embd diff --git a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py index 4284e70a3db..1d04cc01add 100644 --- a/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py +++ b/modelopt/torch/puzzletron/subblock_stats/calc_subblock_stats.py @@ -414,10 +414,7 @@ def _load_subblock_configs_from_replacement_library( 4 intermediate_size + teacher_intermediate_size + ffn_noop + att_op (teacher) + att_noop. Args: - master_puzzle_dir (Path): Directory with "replacement_library.json" file - - Returns: - list[SubblockConfig]: + master_puzzle_dir: Directory with "replacement_library.json" file """ replacement_library = json.loads((master_puzzle_dir / "replacement_library.json").read_text()) subblock_configs = set() diff --git a/noxfile.py b/noxfile.py index 059f351b7f9..1a28321bbd7 100644 --- a/noxfile.py +++ b/noxfile.py @@ -142,7 +142,7 @@ def gpu_trtllm(session): # Pin must stay in sync with examples/vllm_serve/Dockerfile. 
@nox.session(venv_backend="none") def gpu_vllm(session): - session.run("python3", "-m", "pip", "install", "-e", ".[hf,dev-test]") + session.run("python3", "-m", "pip", "install", "-e", ".[hf,puzzletron,dev-test]") session.run("python3", "-m", "pytest", "tests/gpu_vllm", *_cov_args()) diff --git a/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py index bff5438db0a..377a2ffed19 100644 --- a/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py +++ b/tests/gpu_vllm/torch/puzzletron/test_calc_runtime_stats.py @@ -30,8 +30,6 @@ from modelopt.torch.puzzletron.block_config import AttentionConfig, FFNConfig from modelopt.torch.puzzletron.subblock_stats.calc_runtime_stats import calc_runtime_for_subblocks -pytest.importorskip("vllm", reason="vLLM is required for calc_runtime_for_subblocks") - @pytest.mark.skip(reason="AnyModel is not supported in vLLM yet") def test_calc_runtime_for_subblocks(tmp_path: Path):