1+ # SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+ # SPDX-License-Identifier: Apache-2.0
3+ #
4+ # Licensed under the Apache License, Version 2.0 (the "License");
5+ # you may not use this file except in compliance with the License.
6+ # You may obtain a copy of the License at
7+ #
8+ # http://www.apache.org/licenses/LICENSE-2.0
9+ #
10+ # Unless required by applicable law or agreed to in writing, software
11+ # distributed under the License is distributed on an "AS IS" BASIS,
12+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+ # See the License for the specific language governing permissions and
14+ # limitations under the License.
15+
16+ """Custom mapping from Qwen3-VL Hugging Face models to Megatron Core models.
17+
18+ Qwen3-VL model structure differs from Qwen3:
19+ - Language model weights are under `model.language_model.` prefix
20+ - Visual encoder weights are under `model.visual.` prefix
21+
22+ This module handles the language model conversion for PTQ/QAT workflows.
23+ Visual components are typically kept in full precision.
24+
25+ HuggingFace Qwen3-VL-8B structure:
26+ - model.language_model.embed_tokens.weight
27+ - model.language_model.layers.{L}.input_layernorm.weight
28+ - model.language_model.layers.{L}.self_attn.q_proj.weight
29+ - model.language_model.layers.{L}.self_attn.k_proj.weight
30+ - model.language_model.layers.{L}.self_attn.v_proj.weight
31+ - model.language_model.layers.{L}.self_attn.q_norm.weight
32+ - model.language_model.layers.{L}.self_attn.k_norm.weight
33+ - model.language_model.layers.{L}.self_attn.o_proj.weight
34+ - model.language_model.layers.{L}.post_attention_layernorm.weight
35+ - model.language_model.layers.{L}.mlp.gate_proj.weight
36+ - model.language_model.layers.{L}.mlp.up_proj.weight
37+ - model.language_model.layers.{L}.mlp.down_proj.weight
38+ - model.language_model.norm.weight
39+ - lm_head.weight
40+ """
41+
42+ from .mcore_custom import (
43+ COL_ETP ,
44+ COL_TP ,
45+ REPLICATE ,
46+ ROW_ETP ,
47+ ROW_TP ,
48+ CustomModuleMapping ,
49+ GatedMLPMerging ,
50+ GatedMLPSlicing ,
51+ NameRemapping ,
52+ QKVMerging ,
53+ QKVSlicing ,
54+ )
55+
# HuggingFace -> Megatron Core (import direction).
#
# Each key is a Megatron Core module name; each value names the HuggingFace weight prefix it is
# loaded from. Every language-model weight lives under "model.language_model."; only "lm_head."
# sits at the checkpoint root. A "{}" in a prefix is a positional placeholder (layer index and,
# for the "local_experts.*" entries, a second index after "experts.").
qwen3vl_causal_lm_import: dict[str, CustomModuleMapping] = {
    # ---- Embedding, final norm, output head ----
    "word_embeddings": NameRemapping("model.language_model.embed_tokens.", COL_TP),
    "final_layernorm": NameRemapping("model.language_model.norm.", REPLICATE),
    "output_layer": NameRemapping("lm_head.", COL_TP),
    # ---- Self-attention ----
    "input_layernorm": NameRemapping(
        "model.language_model.layers.{}.input_layernorm.", REPLICATE
    ),
    # Separate HF q/k/v projections are merged into the single linear_qkv module.
    "linear_qkv": QKVMerging("model.language_model.layers.{}.self_attn.", COL_TP),
    "linear_proj": NameRemapping("model.language_model.layers.{}.self_attn.o_proj.", ROW_TP),
    # Qwen3 applies RMSNorm to the query and key projections.
    "q_layernorm": NameRemapping("model.language_model.layers.{}.self_attn.q_norm.", REPLICATE),
    "k_layernorm": NameRemapping("model.language_model.layers.{}.self_attn.k_norm.", REPLICATE),
    # ---- Dense MLP ----
    # HF calls the norm that precedes the MLP "post_attention_layernorm".
    "pre_mlp_layernorm": NameRemapping(
        "model.language_model.layers.{}.post_attention_layernorm.", REPLICATE
    ),
    # HF gate_proj and up_proj are merged into linear_fc1; down_proj becomes linear_fc2.
    "linear_fc1": GatedMLPMerging("model.language_model.layers.{}.mlp.", COL_TP),
    "linear_fc2": NameRemapping("model.language_model.layers.{}.mlp.down_proj.", ROW_TP),
    # ---- Mixture-of-experts (Qwen3-VL MoE variants such as 30B-A3B) ----
    # NOTE(review): these entries assume per-expert weights stored under
    # "mlp.experts.{}.<proj>." -- confirm against the actual Qwen3-VL MoE checkpoint layout.
    "router": NameRemapping("model.language_model.layers.{}.mlp.gate.", REPLICATE),
    "local_experts.linear_fc1": GatedMLPMerging(
        "model.language_model.layers.{}.mlp.experts.{}.", COL_ETP
    ),
    "local_experts.linear_fc2": NameRemapping(
        "model.language_model.layers.{}.mlp.experts.{}.down_proj.", ROW_ETP
    ),
}
90+
# Megatron Core -> HuggingFace (export direction).
#
# Each key is a Megatron Core module name; each value names the HuggingFace weight prefix it is
# written back to. Every language-model weight goes under "model.language_model."; only
# "lm_head." sits at the checkpoint root. A "{}" in a prefix is a positional placeholder (layer
# index and, for the "local_experts.*" entries, a second index after "experts.").
qwen3vl_causal_lm_export: dict[str, CustomModuleMapping] = {
    # ---- Embedding, final norm, output head ----
    "word_embeddings": NameRemapping("model.language_model.embed_tokens."),
    "final_layernorm": NameRemapping("model.language_model.norm."),
    "output_layer": NameRemapping("lm_head."),
    # ---- Self-attention ----
    "input_layernorm": NameRemapping("model.language_model.layers.{}.input_layernorm."),
    # The merged linear_qkv module is sliced back into separate HF q/k/v projections.
    "linear_qkv": QKVSlicing("model.language_model.layers.{}.self_attn."),
    "linear_proj": NameRemapping("model.language_model.layers.{}.self_attn.o_proj."),
    "q_layernorm": NameRemapping("model.language_model.layers.{}.self_attn.q_norm."),
    "k_layernorm": NameRemapping("model.language_model.layers.{}.self_attn.k_norm."),
    # ---- Dense MLP ----
    # HF calls the norm that precedes the MLP "post_attention_layernorm".
    "pre_mlp_layernorm": NameRemapping(
        "model.language_model.layers.{}.post_attention_layernorm."
    ),
    # linear_fc1 is sliced back into HF gate_proj and up_proj; linear_fc2 becomes down_proj.
    "linear_fc1": GatedMLPSlicing("model.language_model.layers.{}.mlp."),
    "linear_fc2": NameRemapping("model.language_model.layers.{}.mlp.down_proj."),
    # ---- Mixture-of-experts ----
    # NOTE(review): these entries assume per-expert weights stored under
    # "mlp.experts.{}.<proj>." -- confirm against the actual Qwen3-VL MoE checkpoint layout.
    "router": NameRemapping("model.language_model.layers.{}.mlp.gate."),
    "local_experts.linear_fc1": GatedMLPSlicing(
        "model.language_model.layers.{}.mlp.experts.{}."
    ),
    "local_experts.linear_fc2": NameRemapping(
        "model.language_model.layers.{}.mlp.experts.{}.down_proj."
    ),
}
0 commit comments