Skip to content

Commit e82164f

Browse files
Add anymodel directories to feature/puzzletron
- Add converter, model_descriptor, puzzformer, and llama model support
- Selective merge of anymodel functionality

Signed-off-by: Daniel Korzekwa <dkorzekwa@nvidia.com>
1 parent 5812777 commit e82164f

13 files changed

Lines changed: 1161 additions & 0 deletions

File tree

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Converters for transforming HuggingFace models to AnyModel format."""
16+
17+
from .convert_any_model import *
18+
from .converter import *
19+
from .converter_factory import *
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# mypy: ignore-errors
16+
17+
"""Convert a HuggingFace model to AnyModel format."""
18+
19+
from pathlib import Path
20+
21+
from modelopt.torch.puzzletron.anymodel.converter.converter import Converter
22+
from modelopt.torch.puzzletron.anymodel.converter.converter_factory import ConverterFactory
23+
from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptorFactory
24+
25+
__all__ = ["convert_model"]
26+
27+
28+
def convert_model(
    input_dir: str,
    output_dir: str,
    converter: Converter | str,
):
    """Convert a HuggingFace model to AnyModel format.

    Transforms a HuggingFace checkpoint into the AnyModel layout used for
    compression by:

    1. Copying non-weight files (config, tokenizer, etc.)
    2. Creating block_configs for each layer
    3. Reorganizing weights into subblock checkpoints

    Args:
        input_dir: Path to the input HuggingFace checkpoint directory.
        output_dir: Path to the output AnyModel checkpoint directory.
        converter: Either a converter name (e.g., "llama") or a Converter class.

    Example:
        >>> convert_model(
        ...     input_dir="/path/to/Llama-3.1-8B-Instruct",
        ...     output_dir="/path/to/output/ckpts/teacher",
        ...     converter="llama",
        ... )
    """
    src = Path(input_dir)
    dst = Path(output_dir)
    dst.mkdir(parents=True, exist_ok=True)

    # Both factories are keyed by the same identifier, so a single
    # `converter` argument resolves the matching descriptor/converter pair.
    model_descriptor = ModelDescriptorFactory.get(converter)
    model_converter = ConverterFactory.get(converter)

    model_converter.convert(descriptor=model_descriptor, input_dir=src, output_dir=dst)
63+
64+
65+
if __name__ == "__main__":
    # CLI entry point: expose convert_model's signature via python-fire.
    import fire

    fire.Fire(convert_model)
Lines changed: 235 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,235 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
# mypy: ignore-errors
16+
17+
import copy
18+
import fnmatch
19+
import json
20+
import os
21+
import shutil
22+
from abc import ABC, abstractmethod
23+
from collections import defaultdict
24+
from pathlib import Path
25+
from typing import Dict, List
26+
27+
from safetensors.torch import load_file, save_file
28+
from tqdm import tqdm
29+
from transformers import PretrainedConfig
30+
from transformers.integrations.mxfp4 import convert_moe_packed_tensors
31+
32+
from modelopt.torch.puzzletron.anymodel.model_descriptor import ModelDescriptor
33+
from modelopt.torch.puzzletron.decilm.deci_lm_hf_code.block_config import BlockConfig
34+
from modelopt.torch.puzzletron.tools.checkpoint_utils_hf import load_model_config, save_model_config
35+
36+
__all__ = ["Converter"]
37+
38+
39+
class Converter(ABC):
    """Base class for converting HuggingFace models to Puzzletron/AnyModel format.

    Subclasses implement :meth:`create_block_configs_from_main_config` for a
    specific model family and may override :meth:`convert_weight_name` to
    rename weights during conversion. A subclass may also set a ``quantized``
    class attribute (e.g. ``"mxfp4"``) to trigger dequantization of packed
    MoE tensors during weight conversion.
    """

    @staticmethod
    def _get_weight_map(input_dir: Path) -> Dict[str, str]:
        """Load the weight map from a checkpoint directory.

        Supports both sharded (``model.safetensors.index.json``) and
        single-file (``model.safetensors``) checkpoints.

        Args:
            input_dir: Checkpoint directory to inspect.

        Returns:
            Dict mapping parameter names to their safetensors filenames.

        Raises:
            FileNotFoundError: If neither an index file nor a single-file
                checkpoint is present in ``input_dir``.
        """
        index_path = input_dir / "model.safetensors.index.json"
        single_file_path = input_dir / "model.safetensors"

        if index_path.exists():
            # Sharded model: the index file already contains the weight map.
            with open(index_path, "r") as f:
                index = json.load(f)
            return index["weight_map"]
        elif single_file_path.exists():
            # Single-file model: synthesize a weight map from the tensor names.
            # NOTE(review): this loads every tensor just to enumerate names;
            # safetensors' lazy API could list keys without loading weights.
            data = load_file(single_file_path)
            return {name: "model.safetensors" for name in data.keys()}
        else:
            raise FileNotFoundError(
                f"Neither {index_path} nor {single_file_path} found. Cannot determine model format."
            )

    @classmethod
    def convert_model_weights(
        cls, input_dir: Path, output_dir: Path, descriptor: ModelDescriptor, num_hidden_layers: int
    ):
        """Convert model weights to subblock format.

        Groups parameters by subblock (as defined by
        ``descriptor.get_weight_groups``), writes one safetensors file per
        subblock under ``output_dir / "subblocks_safetensors"``, and emits a
        new ``model.safetensors.index.json`` pointing at those files.

        Args:
            input_dir: Input HuggingFace checkpoint directory.
            output_dir: Output AnyModel checkpoint directory.
            descriptor: Model descriptor defining the weight grouping.
            num_hidden_layers: Number of hidden layers in the model.
        """
        param_to_file = Converter._get_weight_map(input_dir)
        all_param_names = list(param_to_file.keys())

        # Determine which subblock each parameter belongs to.
        subblocks = descriptor.get_weight_groups(
            all_param_names, num_hidden_layers=num_hidden_layers
        )

        # Output directory for the per-subblock weight files.
        out_dir = output_dir / "subblocks_safetensors"
        out_dir.mkdir(parents=True, exist_ok=True)

        # New weight index, written alongside the subblock files.
        new_index = {"metadata": {"format": "pt"}, "weight_map": {}}

        for subblock, param_names in tqdm(subblocks.items(), desc="Processing subblocks"):
            param_files = {param_to_file[name] for name in param_names}
            tensors = {}

            # Load only the shard files that contain weights for this subblock.
            for file in param_files:
                data = load_file(os.path.join(input_dir, file))
                for name in param_names:
                    if param_to_file[name] != file or name not in data:
                        continue
                    converted_name = cls.convert_weight_name(name)
                    # Dequantize MoE packed tensors when the checkpoint is
                    # mxfp4-quantized (e.g. gpt-oss-20b).
                    if getattr(cls, "quantized", None) == "mxfp4":
                        if name.endswith("_blocks"):
                            # BUGFIX: index `data` by the ORIGINAL parameter
                            # names — its keys are the pre-conversion names, so
                            # looking them up via the converted name breaks for
                            # converters that rename weights. Also use
                            # removesuffix instead of replace to avoid mangling
                            # mid-string "_blocks" occurrences.
                            scales_name = name.removesuffix("_blocks") + "_scales"
                            dequant_name = converted_name.removesuffix("_blocks")
                            tensors[dequant_name] = convert_moe_packed_tensors(
                                data[name],
                                data[scales_name],
                            )
                            continue
                        if name.endswith("_scales"):
                            # Consumed together with the matching "_blocks" entry.
                            continue
                    tensors[converted_name] = data[name]

            # Save this subblock
            print(f"\n✅ Group: {subblock} ({len(tensors)} layers)")
            for layer in tensors.keys():
                print(f" - {layer}")

            subblock_file = f"{subblock}.safetensors"
            save_file(tensors, os.path.join(out_dir, subblock_file))

            # Point every converted parameter at its subblock file.
            for new_name in tensors.keys():
                new_index["weight_map"][new_name] = f"subblocks_safetensors/{subblock_file}"

        # Save new index file
        with (output_dir / "model.safetensors.index.json").open("w") as f:
            json.dump(new_index, f, indent=2)

        print(f"✅ Finished saving subblocks and index to {output_dir}")

    @classmethod
    def convert_configs_in_dirs(
        cls,
        input_dir: Path,
        output_dir: Path,
    ):
        """Convert the model config and add per-layer block_configs.

        Args:
            input_dir: Input HuggingFace checkpoint directory.
            output_dir: Output AnyModel checkpoint directory.

        Returns:
            The updated config object that was written to ``output_dir``.
        """
        config = load_model_config(input_dir)

        block_configs = cls.create_block_configs_from_main_config(config)
        # Deep-copy so the loaded config object is left untouched.
        out_config = copy.deepcopy(config)
        out_config.block_configs = block_configs

        save_model_config(out_config, output_dir)
        return out_config

    @staticmethod
    def copy_checkpoint_files(input_dir: Path, output_dir: Path):
        """Copy checkpoint files except model weights (which will be converted)."""
        ignore_patterns = [
            "model-*.safetensors",
            "model.safetensors",
            "model.safetensors.index.json",
            "subblocks_safetensors",
        ]

        def ignore_func(directory, files):
            # shutil.copytree ignore callback: return the subset of `files`
            # matching any weight-file pattern. (`directory` is unused but
            # required by the callback signature.)
            ignored = set()
            for pattern in ignore_patterns:
                ignored.update(fnmatch.filter(files, pattern))
            return ignored

        shutil.copytree(str(input_dir), str(output_dir), ignore=ignore_func, dirs_exist_ok=True)

    @classmethod
    def convert(
        cls,
        descriptor: ModelDescriptor,
        input_dir: Path,
        output_dir: Path,
    ):
        """Convert a HuggingFace model to AnyModel format.

        Runs the full pipeline: copy non-weight files, rewrite the config with
        block_configs, then reorganize weights into subblock checkpoints.

        Args:
            descriptor: Model descriptor for the model type.
            input_dir: Path to the input HuggingFace checkpoint.
            output_dir: Path to the output AnyModel checkpoint.
        """
        cls.copy_checkpoint_files(input_dir, output_dir)
        config = cls.convert_configs_in_dirs(input_dir, output_dir)
        cls.convert_model_weights(
            input_dir, output_dir, descriptor=descriptor, num_hidden_layers=config.num_hidden_layers
        )

    @staticmethod
    @abstractmethod
    def create_block_configs_from_main_config(config: PretrainedConfig) -> List[BlockConfig]:
        """Create per-layer BlockConfig list from a HuggingFace model config.

        This method extracts layer-specific parameters (e.g., intermediate_size,
        num_key_value_heads) from the main model config and creates a BlockConfig
        for each layer. These BlockConfigs enable layer-specific pruning and
        modifications during the compression pipeline.

        Args:
            config: HuggingFace PretrainedConfig (e.g., LlamaConfig, Qwen2Config)

        Returns:
            List of BlockConfig, one per hidden layer. Each BlockConfig contains:
            - AttentionConfig: attention settings (no_op, num_key_value_heads)
            - FFNConfig: FFN settings (no_op, intermediate_size)

        Example:
            For a model with uniform layers (e.g., Llama):
                return [BlockConfig(...)] * config.num_hidden_layers

            For a model with heterogeneous layers (e.g., NemotronH with Mamba/Attention):
                return [BlockConfig(...) for layer_idx in range(num_layers)]
        """
        raise NotImplementedError

    @staticmethod
    def convert_weight_name(name: str) -> str:
        """
        Convert weight names during checkpoint conversion.

        This method can be overridden by subclasses to apply model-specific weight name
        transformations when converting checkpoints from HuggingFace format to Puzzletron format.

        Default implementation returns the name unchanged (identity function).

        Args:
            name: Original weight name from HuggingFace checkpoint

        Returns:
            Converted weight name for Puzzletron format

        Example:
            For Qwen2.5-VL, this converts:
            - visual.* → model.visual.*
            - model.* → model.language_model.*
        """
        return name

0 commit comments

Comments
 (0)