NVIDIA · hychiang-git · Jun 1, 2026 · Feb 14, 2026 · May 13, 2026 · May 13, 2026
@@ -32,6 +32,7 @@ Changelog
 - Add ``--cast_mxfp4_to_nvfp4`` flag to ``examples/llm_ptq/hf_ptq.py`` for closed-form, bit-exact MXFP4 → NVFP4 weight conversion. Supports the GPT-OSS family (``openai/gpt-oss-20b``, ``openai/gpt-oss-120b``). See `examples/llm_ptq/README.md <https://github.com/NVIDIA/Model-Optimizer/tree/main/examples/llm_ptq#mxfp4--nvfp4-cast-for-gpt-oss>`__ for usage.
 - DeepSeek PTQ (``examples/deepseek/ptq.py``) now defaults to native top-k calibration with post-hoc per-layer peer-max sync of expert ``input_quantizer.amax``; the all-experts path is preserved behind ``--calib_all_experts``.
 - Add NVFP4 W4A16 weight-only quantization (``w4a16_nvfp4``): FP4 weights with group_size=16, BF16 activations, no calibration forward pass required. Use ``mtq.W4A16_NVFP4_CFG`` or ``--qformat w4a16_nvfp4`` in ``hf_ptq.py``. vLLM deployment support is in progress.
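+- Add Megatron Core export/import mapping for Qwen3-VL (``Qwen3VLForConditionalGeneration``) vision-language models. The mapping handles the ``model.language_model.`` weight prefix used by Qwen3-VL and covers only the dense language-model decoder; on export the ``model.visual.*`` weights are copied unchanged from the source checkpoint. ``Qwen3VLMoeForConditionalGeneration`` is not supported.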
 - Add ``DATASET_COMBOS`` to ``modelopt.torch.utils.dataset_utils`` — single ``--dataset`` tokens that fan out to multiple registered datasets; per-entry ``num_samples`` is split evenly across the members. Initial combos: ``cnn_nemotron_v2_mix`` (``cnn_dailymail`` + ``nemotron-post-training-dataset-v2``, used by ``hf_ptq.py`` when no ``--dataset`` is provided) and ``nemotron-post-training-v3`` (the seven ``nvidia/Nemotron-*`` SFT datasets added in #1498, mirroring the `nemotron-post-training-v3 collection <https://huggingface.co/collections/nvidia/nemotron-post-training-v3>`_). Combo names are listed by ``get_supported_datasets()`` and surfaced in ``--dataset`` help. ``get_dataset_dataloader`` rejects inputs that mix a combo with one of its member datasets (e.g. ``cnn_dailymail,cnn_nemotron_v2_mix``) to avoid double-sampling, and ``get_dataset_samples`` rejects combo names so callers route through the dataloader. ``hf_ptq.py`` default ``--calib_size`` is bumped from ``512`` to ``1024`` so the total calibration sample count under the new default combo matches the previous two-dataset fallback.
 - The ``nemotron-sft-agentic-v2`` registered dataset (added in #1498) now uses only the ``search`` split. The previously configured ``interactive_agent`` and ``tool_calling`` splits contain content-level defects (heterogeneous schema and a malformed JSON row, respectively) that cause pyarrow's streaming JSON reader to fail deterministically.
 - Add shared Megatron-Core calibration forward loop: ``modelopt.torch.utils.plugins.megatron_calibration.get_megatron_calibration_forward_loop`` produces the ``forward_loop`` callable expected by ``mtq.quantize`` / ``mtp.prune``. Replaces the bespoke calibration loops in Megatron-LM and Megatron-Bridge for quantization and pruning with a single canonical implementation.

@@ -61,6 +61,7 @@ Models:
   * Llama 4, 3.x (FP8, NVFP4)
   * Qwen 3, 2.5 (FP8, NVFP4)
   * Qwen 3 MoE (FP8, NVFP4)
+  * Qwen 3-VL (FP8, NVFP4)
   * Deepseek R1/V3 (NVFP4)
   * Mixtral 8x7B (FP8, NVFP4)
   * Medusa (FP8)

@@ -88,11 +88,15 @@ def copy_hf_ckpt_remote_code(
 
 def load_multimodal_components(
     pretrained_model_path: str | os.PathLike,
+    prefixes: tuple[str, ...] = ("multi_modal_projector", "vision_model"),
 ) -> dict[str, torch.Tensor]:
     """Load multimodal components from safetensors file.
 
     Args:
         pretrained_model_path: Path to the pretrained model.
+        prefixes: Tensor key prefixes to select.  Defaults to the LLaVA-style
+            ``multi_modal_projector`` / ``vision_model`` prefixes.  Pass
+            ``("model.visual.",)`` for Qwen3-VL checkpoints.
 
     Returns:
         A dictionary of multimodal components.
@@ -114,7 +118,7 @@ def load_multimodal_components(
             multimodal_keys = [
                 key
                 for key in f.keys()  # noqa: SIM118
-                if key.startswith(("multi_modal_projector", "vision_model"))
+                if key.startswith(prefixes)
             ]
             for key in tqdm(multimodal_keys, desc="Loading multimodal tensors"):
                 multimodal_state_dict[key] = f.get_tensor(key)
@@ -124,28 +128,13 @@ def load_multimodal_components(
         with open(safetensors_index_file) as f:
             safetensors_index = json.load(f)
 
-        # For multimodal models, vision_model and multi_modal_projector are in the first shard
         all_shard_files = sorted(set(safetensors_index["weight_map"].values()))
-        first_shard_file = all_shard_files[0]  # e.g., "model-00001-of-00050.safetensors"
-
-        # Load multimodal components from the first shard file
-        safetensors_filepath = Path(hf_checkpoint_path) / first_shard_file
-        print(f"Loading multimodal components from {first_shard_file}")
-
-        with safe_open(safetensors_filepath, framework="pt") as f:
-            shard_keys = list(f.keys())
-            multimodal_keys_in_shard = [
-                k for k in shard_keys if k.startswith(("multi_modal_projector", "vision_model"))
-            ]
-
-            if multimodal_keys_in_shard:
-                print(
-                    f"Found {len(multimodal_keys_in_shard)} multimodal tensors in {first_shard_file}"
-                )
-                for key in tqdm(multimodal_keys_in_shard, desc="Loading multimodal tensors"):
-                    multimodal_state_dict[key] = f.get_tensor(key)
-            else:
-                print(f"No multimodal components found in {first_shard_file}")
+        for shard_file in all_shard_files:
+            safetensors_filepath = Path(hf_checkpoint_path) / shard_file
+            with safe_open(safetensors_filepath, framework="pt") as f:
+                for key in f.keys():  # noqa: SIM118
+                    if key.startswith(prefixes):
+                        multimodal_state_dict[key] = f.get_tensor(key)
 
     else:
         print(f"Warning: No safetensors files found in {hf_checkpoint_path}")

@@ -39,6 +39,7 @@
     qwen25_causal_lm_export,
     qwen25_causal_lm_import,
 )
+from .mcore_qwen3vl import qwen3vl_causal_lm_export, qwen3vl_causal_lm_import
 
 all_mcore_hf_export_mapping: dict[str, Any] = {
     "DeepseekV2ForCausalLM": deepseek_causal_lm_export,
@@ -54,6 +55,7 @@
     "Qwen3MoeForCausalLM": qwen3_causal_lm_export,
     "Qwen2ForCausalLM": qwen25_causal_lm_export,
     "GptOssForCausalLM": gptoss_causal_lm_export,
+    "Qwen3VLForConditionalGeneration": qwen3vl_causal_lm_export,
 }
 
 all_mcore_hf_import_mapping: dict[str, Any] = {
@@ -66,4 +68,5 @@
     "Qwen3MoeForCausalLM": qwen3_causal_lm_import,
     "Qwen2ForCausalLM": qwen25_causal_lm_import,
     "GptOssForCausalLM": gptoss_causal_lm_import,
+    "Qwen3VLForConditionalGeneration": qwen3vl_causal_lm_import,
 }
@@ -0,0 +1,62 @@
+# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Custom mapping from Qwen3-VL Hugging Face models to Megatron Core models.
+
+Qwen3-VL differs from Qwen3 in one structural way: language-model weights live
+under ``model.language_model.`` instead of ``model.``, while ``lm_head.weight``
+remains at the root level.  The mappings below are derived automatically from
+the Qwen3 mappings by inserting ``language_model.`` after ``model.`` for every
+prefix that starts with ``model.``.
+
+Note: the visual encoder (``model.visual.*``) is intentionally excluded — this
+mapping covers only the language-model decoder used for quantization and export.
+
+Note: ``Qwen3VLMoeForConditionalGeneration`` is **not** supported here.  The MoE
+variant stores expert weights as 3-D tensors (``mlp.experts.gate_up_proj``,
+``mlp.experts.down_proj``) that require a dedicated fused-expert mapping and
+cannot reuse the dense Qwen3 rules.
+
+Reference: https://huggingface.co/Qwen/Qwen3-VL-8B-Instruct/blob/main/model.safetensors.index.json
+"""
+
+import copy
+
+from .mcore_custom import CustomModuleMapping
+from .mcore_qwen import qwen3_causal_lm_export, qwen3_causal_lm_import
+
+
+def _with_language_model_prefix(
+    mapping: dict[str, CustomModuleMapping],
+) -> dict[str, CustomModuleMapping]:
+    """Build the Qwen3-VL variant of a base Qwen3 mapping table.
+
+    A ``target_name_or_prefix`` beginning with ``model.`` is moved under
+    ``model.language_model.``; any other prefix (e.g. ``lm_head.``) is kept
+    as is.  Entries are rebuilt, so the base mapping is left untouched.
+    """
+    vl_mapping = {}
+    for name, rule in mapping.items():
+        target = rule.target_name_or_prefix
+        if target.startswith("model."):
+            target = "model.language_model." + target.removeprefix("model.")
+        kwargs_copy = copy.deepcopy(rule.func_kwargs)
+        # Re-instantiate with the rule's own class so its mapping type is preserved.
+        vl_mapping[name] = type(rule)(target_name_or_prefix=target, func_kwargs=kwargs_copy)
+    return vl_mapping
+
+
+qwen3vl_causal_lm_import = _with_language_model_prefix(qwen3_causal_lm_import)
+qwen3vl_causal_lm_export = _with_language_model_prefix(qwen3_causal_lm_export)
@@ -382,9 +382,20 @@ def save_pretrained(
         # Add multimodal components to state_dict. Since only support decoder model quantization,
         # no changes will be made to the multimodal components. We copy the multimodal components
         # from the pretrained model directly to the state_dict to avoid implementing the export logic.
-        if is_first_stage_main_rank and self.is_multimodal:
-            multimodal_state_dict = load_multimodal_components(pretrained_model_name_or_path)
-            layer_state_dicts[0].update(multimodal_state_dict)
+        if is_first_stage_main_rank:
+            # layer_state_dicts is keyed by layer_number (1-indexed), so there is no key 0.
+            # next(iter(...)) returns the first-inserted key, which is the first decoder layer
+            # on this (first) PP stage provided layers were added in ascending order. Merging
+            # there puts the components in a shard the index builder scans (1..num_layers).
+            first_layer_key = next(iter(layer_state_dicts))
+            if self.is_multimodal:
+                multimodal_state_dict = load_multimodal_components(pretrained_model_name_or_path)
+                layer_state_dicts[first_layer_key].update(multimodal_state_dict)
+            elif self.arch == "Qwen3VLForConditionalGeneration":
+                vision_state_dict = load_multimodal_components(
+                    pretrained_model_name_or_path, prefixes=("model.visual.",)
+                )
+                layer_state_dicts[first_layer_key].update(vision_state_dict)
 
         # Barrier to ensure the export_dir has been created.
         torch.distributed.barrier()

@@ -29,6 +29,7 @@
     DeepseekV3Config,
     GptOssConfig,
     LlamaConfig,
+    NemotronConfig,
     PreTrainedModel,
     Qwen3Config,
     Qwen3MoeConfig,
@@ -121,6 +122,91 @@ def create_tiny_qwen3_moe_dir(
     return qwen3_moe_dir
 
 
+##### Qwen3-VL #####
+def get_tiny_qwen3vl(**config_kwargs) -> PreTrainedModel:
+    """Build a tiny Qwen3-VL model; ``config_kwargs`` override the text-config defaults."""
+    # Imported lazily: transformers.models.qwen3_vl may be absent from older transformers
+    # builds, and every test that uses transformers_models.py imports this module.
+    from transformers import Qwen3VLConfig
+    from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
+
+    set_seed(SEED)
+
+    # Text defaults satisfy hidden_size == num_attention_heads * head_dim (4*8=32). Multi-GPU
+    # tests override them, e.g. (kv) heads = num_gpus and hidden_size = num_gpus * head_dim.
+    text_config = {
+        "hidden_size": 32,
+        "intermediate_size": 32,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 4,
+        "num_key_value_heads": 2,
+        "head_dim": 8,
+        "max_position_embeddings": 32,
+        "vocab_size": 32,
+        **config_kwargs,
+    }
+    # Both sub-configs go in as plain dicts: Qwen3VLConfig.__init__ in transformers 5.3.0 only
+    # handles vision_config/text_config given as dicts or None, not as config instances.
+    vision_config = {
+        "depth": 1,
+        "hidden_size": 16,
+        "intermediate_size": 16,
+        "num_heads": 2,
+        "in_channels": 3,
+        "patch_size": 4,
+        "spatial_merge_size": 1,
+        "temporal_patch_size": 1,
+        "out_hidden_size": text_config["hidden_size"],  # must match text hidden_size
+    }
+    return Qwen3VLForConditionalGeneration(
+        Qwen3VLConfig(text_config=text_config, vision_config=vision_config)
+    )
+
+
+def create_tiny_qwen3vl_dir(
+    tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
+) -> Path:
+    save_dir = Path(tmp_path, "tiny_qwen3vl")  # holds the model and, optionally, the tokenizer
+    if with_tokenizer:
+        tiny_tokenizer = get_tiny_tokenizer()
+        tiny_tokenizer.save_pretrained(save_dir)
+        config_kwargs["vocab_size"] = tiny_tokenizer.vocab_size  # model vocab follows tokenizer
+    get_tiny_qwen3vl(**config_kwargs).save_pretrained(save_dir)
+    return save_dir
+
+
+##### NEMOTRON #####
+def get_tiny_nemotron(**config_kwargs) -> PreTrainedModel:
+    """Build a tiny Nemotron causal LM; ``config_kwargs`` override the config defaults."""
+    set_seed(SEED)
+    # hidden_size=64 / intermediate_size=128: the relu2 activation needs non-trivial dims to
+    # avoid all-zero activations (scaling factor 0) in NVFP4 quantization.
+    nemotron_kwargs = {
+        "dtype": torch.bfloat16,
+        "hidden_size": 64,
+        "intermediate_size": 128,
+        "num_hidden_layers": 2,
+        "num_attention_heads": 8,
+        "num_key_value_heads": 1,
+        "max_position_embeddings": 32,
+        "vocab_size": 32,
+        **config_kwargs,
+    }
+    return AutoModelForCausalLM.from_config(NemotronConfig(**nemotron_kwargs))
+
+
+def create_tiny_nemotron_dir(
+    tmp_path: Path | str, with_tokenizer: bool = False, **config_kwargs
+) -> Path:
+    save_dir = Path(tmp_path, "tiny_nemotron")  # holds the model and, optionally, the tokenizer
+    if with_tokenizer:
+        tiny_tokenizer = get_tiny_tokenizer()
+        tiny_tokenizer.save_pretrained(save_dir)
+        config_kwargs["vocab_size"] = tiny_tokenizer.vocab_size  # model vocab follows tokenizer
+    get_tiny_nemotron(**config_kwargs).save_pretrained(save_dir)
+    return save_dir
+
+
 ##### DeepSeek V3 #####
 def get_tiny_deepseek_v3(**config_kwargs) -> PreTrainedModel:
     set_seed(SEED)