Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
7543abb
ep1
taylor-yb-lee Mar 16, 2026
b65229b
moe sharding tp8
taylor-yb-lee Mar 16, 2026
9ead146
Exclude lm_head from cuda graph
taylor-yb-lee Mar 16, 2026
0ed7977
Turn on lm_head_sharding
taylor-yb-lee Mar 16, 2026
8bcf4dd
Fix tp8 sharding for fused moe checkpoint
taylor-yb-lee Mar 20, 2026
7e0172a
Added qwen3.5 config for long context length
taylor-yb-lee Mar 20, 2026
f4cb16f
Qwen3.5 configs
taylor-yb-lee Mar 22, 2026
693ed16
Added comment
taylor-yb-lee Mar 24, 2026
331af8a
Added moe sharding for tp8/ep1
taylor-yb-lee Mar 24, 2026
4a0cf8d
Added unittest for tp sharding for NVFP4 MoE
taylor-yb-lee Mar 24, 2026
93b48a2
- Revert freemem size
taylor-yb-lee Mar 25, 2026
3777753
Revert graphing of lm_head and add lm_head in the model graph
taylor-yb-lee Apr 1, 2026
e872bb8
The fix adds a text-only fast path at the top of forward() that:
taylor-yb-lee Mar 31, 2026
22e7154
Allow Qwen3.5 to use AutoModel
taylor-yb-lee Apr 1, 2026
ee96ad4
Fix piecewise CUDA graph for Qwen3.5 MoE
taylor-yb-lee Apr 2, 2026
0a0ccf8
config for text only case
taylor-yb-lee Apr 2, 2026
aaf33aa
removed unnecessary code
taylor-yb-lee Apr 2, 2026
22ed629
Rename variable
taylor-yb-lee Apr 2, 2026
f92e78e
Add assert
taylor-yb-lee Apr 2, 2026
5d589ae
Revised comment and added a method to clarify is_full_model
taylor-yb-lee Apr 2, 2026
7cc985d
Revert fast path (it does not affect performance)
taylor-yb-lee Apr 2, 2026
14c5351
Fixed set_output_embeddings() no longer updates module actually used …
taylor-yb-lee Apr 2, 2026
c26c050
remove tp8 config
taylor-yb-lee Apr 2, 2026
bac09ce
Extract text-model (graph module) in Qwen3.5 model for enabling piece…
taylor-yb-lee Apr 2, 2026
fa5f485
Remove piecewise CUDA graph workaround
taylor-yb-lee Apr 2, 2026
c44662d
Revert unnecessary change for using AutoModel for Qwen3.5 text model
taylor-yb-lee Apr 2, 2026
ce45322
Update Piecewise CG to support VLMs
nvchenghaoz Apr 3, 2026
0041583
address CR's reviews
nvchenghaoz Apr 6, 2026
d2e6fa4
Merge branch 'main' into chenghao/piecewise_update_0402
nvchenghaoz Apr 6, 2026
ee798a3
further address CR's review
nvchenghaoz Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ max_batch_size: 32
cuda_graph_config:
batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 512]
enable_chunked_prefill: true
# Use AutoModelForCausalLM for text only mode until issue #12699 is resolved
model_factory: Qwen3_5MoeForConditionalGeneration
kv_cache_config:
enable_block_reuse: false
Expand All @@ -15,13 +16,18 @@ kv_cache_config:
model_kwargs:
torch_dtype: bfloat16
transforms:
# disable for text only use case
initialize_mrope_delta_cache:
enabled: true
export_to_gm:
num_moe_experts_for_export: 2
fuse_gemms_mixed_children:
enabled: true
fuse_nvfp4_moe:
backend: trtllm_gen
detect_sharding:
# for long input, tp8ep1 gives better performance
# dist_mapping: {moe_tp: 8, moe_ep: 1}
allreduce_strategy: SYMM_MEM
shard_all_unprocessed: true
simple_shard_filter: "lm_head"
Expand All @@ -37,6 +43,9 @@ transforms:
"k_proj": "colwise"
"v_proj": "colwise"
"o_proj": "rowwise"
# lm_head: "gather" = column split + all_gather (not "colwise" which
# requires a LayerSubgraph and crashes for standalone unprocessed nodes)
"lm_head": "gather"
# replicating shared experts (keep them commented out)
# "shared_expert_gate_proj": "colwise"
# "shared_expert_up_proj": "colwise"
Expand Down
335 changes: 282 additions & 53 deletions tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,11 @@ class Qwen3_5MoeCausalLMOutput(ModelOutput):


class Qwen3_5MoeTextModel(Qwen3_5MoePreTrainedModel):
"""Qwen3.5 MoE text model (embed + decoder layers + final norm)."""
"""Qwen3.5 MoE text model (embed + decoder layers + final norm + lm_head).

lm_head is included so that the exported GraphModule contains it directly,
allowing sharding and gather_logits_before_lm_head transforms to see it.
"""

def __init__(self, config: Qwen3_5MoeTextConfig):
super().__init__(config)
Expand All @@ -746,10 +750,15 @@ def __init__(self, config: Qwen3_5MoeTextConfig):
)
self.norm = Qwen3_5MoeRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = Qwen3_5MoeTextRotaryEmbedding(config=config)
self.lm_head = None # set by parent model via set_lm_head()

# Initialize weights and apply final processing
self.post_init()

def set_lm_head(self, lm_head: nn.Module):
    """Set the lm_head from the parent model.

    The parent model owns the nn.Linear head; storing the same module
    reference here places lm_head inside this submodule so the exported
    GraphModule contains it directly (see the class docstring).
    """
    self.lm_head = lm_head

def get_input_embeddings(self):
    """Return the token embedding module (``embed_tokens``)."""
    return self.embed_tokens

Expand Down Expand Up @@ -801,7 +810,11 @@ def forward(
hidden_states = decoder_layer(hidden_states, position_embeddings=position_embeddings)

hidden_states = self.norm(hidden_states)
return Qwen3_5MoeOutput(last_hidden_state=hidden_states)
assert self.lm_head is not None, (
"lm_head not set — call set_lm_head() from the parent model before forward()"
)
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
return Qwen3_5MoeCausalLMOutput(logits=logits)


class Qwen3_5MoeForCausalLM(Qwen3_5MoePreTrainedModel, GenerationMixin):
Expand All @@ -814,6 +827,7 @@ def __init__(self, config: Qwen3_5MoeTextConfig, **kwargs):
self.model = Qwen3_5MoeTextModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
self.model.set_lm_head(self.lm_head)

# Initialize weights and apply final processing
self.post_init()
Expand All @@ -829,6 +843,7 @@ def get_output_embeddings(self):

def set_output_embeddings(self, new_embeddings):
    """Replace the output projection (lm_head).

    Also propagates the new module to the inner text model via
    set_lm_head(): the text model's forward() is what actually applies
    lm_head, so updating only ``self.lm_head`` would leave the old head
    in use.
    """
    self.lm_head = new_embeddings
    self.model.set_lm_head(new_embeddings)

def forward(
self,
Expand All @@ -848,8 +863,7 @@ def forward(
rope_cos=rope_cos,
rope_sin=rope_sin,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
logits = outputs.logits
return Qwen3_5MoeCausalLMOutput(logits=logits)


Expand Down Expand Up @@ -2565,10 +2579,19 @@ def __init__(self, config: Qwen3_5MoeConfig, **kwargs):
self.lm_head = nn.Linear(
config.text_config.hidden_size, config.text_config.vocab_size, bias=False
)
# Share lm_head with the text model so it's inside the exported graph
self.model.language_model.set_lm_head(self.lm_head)

# Initialize weights and apply final processing
self.post_init()

def get_input_embeddings(self):
    """Delegate to the inner language model's token embeddings."""
    return self.model.language_model.get_input_embeddings()

def set_output_embeddings(self, new_embeddings):
    """Replace the output projection (lm_head).

    Keeps the language model's shared lm_head reference in sync via
    set_lm_head(), since the text model's forward() applies the head.
    """
    self.lm_head = new_embeddings
    self.model.language_model.set_lm_head(new_embeddings)

def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
Expand All @@ -2590,8 +2613,7 @@ def forward(
video_grid_thw=video_grid_thw,
**kwargs,
)
hidden_states = outputs.last_hidden_state
logits = self.lm_head(hidden_states.to(self.lm_head.weight.dtype)).float()
logits = outputs.logits
return Qwen3_5MoeConditionalOutput(logits=logits)


Expand All @@ -2607,6 +2629,9 @@ class Qwen3_5MoeTextExportInfo(TextModelExportInfo):
(batch, sequence) are dynamic.
"""

def __init__(self, submodule_name: str):
    # NOTE(review): pure pass-through override — it adds nothing beyond
    # the parent __init__ and could be removed entirely.
    super().__init__(submodule_name)

def _init_dynamic_shape_lookup(self):
base = super()._init_dynamic_shape_lookup()
batch_size_dyn = Dim.DYNAMIC
Expand Down Expand Up @@ -2858,4 +2883,7 @@ def init_input_processor(self, base):
# Register the custom text config so transformers can resolve it by name.
AutoConfig.register("qwen3_5_moe_text", Qwen3_5MoeTextConfig)

# Map configs to model classes for both factories: the causal-LM factory is
# used for the text-only path, the Qwen3.5 factory for the full VLM.
AutoModelForCausalLMFactory.register_custom_model_cls("Qwen3_5MoeTextConfig", Qwen3_5MoeForCausalLM)
AutoModelForCausalLMFactory.register_custom_model_cls(
    "Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration
)
Qwen3_5MoeFactory.register_custom_model_cls("Qwen3_5MoeConfig", Qwen3_5MoeForConditionalGeneration)
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import torch.nn as nn
from pydantic import Field
from torch.fx import GraphModule

from ...compile import ArgsKwargs, CompileBackendRegistry
from ...models.factory import ModelFactory
Expand All @@ -16,6 +17,15 @@
)


def _set_submodule(model: nn.Module, key: str, new_module: nn.Module) -> None:
"""Replace a nested submodule given a dotted key path (e.g. 'model.language_model')."""
parts = key.split(".")
parent = model
for part in parts[:-1]:
parent = getattr(parent, part)
setattr(parent, parts[-1], new_module)


def _generate_default_piecewise_num_tokens(max_num_tokens: int) -> List[int]:
"""Generate default piecewise bucket sizes when none are specified.

Expand Down Expand Up @@ -138,13 +148,41 @@ def _get_args_kwargs(bs: int) -> ArgsKwargs:
config_dict = self.config.model_dump()
config_dict.update(config_overrides)

compiler_backend = CompileBackendRegistry.get(self.config.backend)(
mod,
get_args_kwargs_for_compile=_get_args_kwargs,
**extra_kwargs,
**config_dict,
)
mod_compiled = compiler_backend.compile()
# Walk the module tree and collect the top-level GraphModules to compile.
# Once a GM is found, its children are skipped (they're part of the GM).
compile_targets = []
seen = set()
if isinstance(mod, GraphModule):
compile_targets.append(("", mod))
seen.add("")
for name, submod in mod.named_modules():
if any(p == "" or name.startswith(p + ".") for p in seen):
continue
if isinstance(submod, GraphModule):
compile_targets.append((name, submod))
seen.add(name)

if compile_targets:
ad_logger.info(
f"CompileModel: compiling {len(compile_targets)} GraphModule(s): "
f"{[name or '(root)' for name, _ in compile_targets]}"
)

for gm_key, gm in compile_targets:
full_model = mod if gm_key else None
compiler_backend = CompileBackendRegistry.get(self.config.backend)(
gm,
get_args_kwargs_for_compile=_get_args_kwargs,
full_model=full_model,
**extra_kwargs,
**config_dict,
)
compiled_gm = compiler_backend.compile()
if gm_key:
_set_submodule(mod, gm_key, compiled_gm)
else:
mod = compiled_gm
mod_compiled = mod

# store info object about the transform
info = TransformInfo(skipped=False, num_matches=1, is_clean=True, has_valid_shapes=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,10 @@ def build_custom_args_for_linear(self, scales: Dict[str, Node]) -> Tuple:
return ([scales["input_scale"]], [scales["weight_scale"], scales["alpha"]], [], [])

def load_hook(self, state_dict, prefix, *args, weight_name):
# Prepend prefix so the hook works when the GraphModule is a submodule
# of the model on which load_state_dict is called (e.g., VLM models
# where the text model lives at model.language_model.*).
weight_name = prefix + weight_name
if weight_name in state_dict:
input_scale_name = weight_name.rsplit(".", 1)[0] + ".input_scale"
alpha_name = weight_name.rsplit(".", 1)[0] + ".alpha"
Expand Down
Loading
Loading