Skip to content

Commit a55686b

Browse files
committed
fix qwen3.5 pp moduledict layer extraction
Signed-off-by: HuiyingLi <willwin.lee@gmail.com>
1 parent 3d437a9 commit a55686b

3 files changed

Lines changed: 83 additions & 29 deletions

File tree

nemo_automodel/components/distributed/parallelizer.py

Lines changed: 32 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1195,46 +1195,55 @@ def validate_tp_mesh(model, tp_mesh):
11951195
)
11961196

11971197

1198-
def _find_largest_module_list(model: nn.Module) -> Optional[nn.ModuleList]:
1198+
def _find_largest_module_list(model: nn.Module) -> Optional[Union[nn.ModuleList, nn.ModuleDict]]:
11991199
"""
1200-
Heuristic function to find the largest nn.ModuleList in a model.
1200+
Heuristic function to find the largest layer container in a model.
12011201
1202-
This function recursively traverses the model to find all nn.ModuleList instances
1203-
and returns the one with the most modules. This is useful as a fallback when
1204-
the model architecture is unknown, since transformer layers are typically
1205-
organized in ModuleLists.
1202+
This function recursively traverses the model to find all nn.ModuleList and
1203+
pipeline-split nn.ModuleDict instances and returns the one with the most
1204+
modules. This is useful as a fallback when the model architecture is unknown,
1205+
since transformer layers are typically organized in ModuleLists. Pipeline
1206+
splitting converts ModuleLists to ModuleDicts keyed by original layer index.
12061207
12071208
Args:
12081209
model (nn.Module): The model to search through.
12091210
12101211
Returns:
1211-
Optional[nn.ModuleList]: The largest ModuleList found, or None if no ModuleList exists.
1212+
Optional[Union[nn.ModuleList, nn.ModuleDict]]: The largest layer container found, or None.
12121213
"""
1213-
largest_module_list = None
1214+
largest_module_list: Optional[Union[nn.ModuleList, nn.ModuleDict]] = None
12141215
largest_size = 0
12151216

1217+
def _is_pp_layer_module_dict(module: nn.ModuleDict) -> bool:
1218+
# functional.py converts split ModuleLists to ModuleDicts with stringified
1219+
# numeric indices. Avoid treating arbitrary named ModuleDicts (for example
1220+
# adapter registries) as transformer layer containers in the heuristic path.
1221+
return all(key.isdigit() for key in module.keys())
1222+
12161223
def _recursive_search(module: nn.Module, path: str = ""):
12171224
nonlocal largest_module_list, largest_size
12181225

12191226
for name, child in module.named_children():
12201227
current_path = f"{path}.{name}" if path else name
12211228

1222-
if isinstance(child, nn.ModuleList):
1229+
if isinstance(child, nn.ModuleList) or (
1230+
isinstance(child, nn.ModuleDict) and _is_pp_layer_module_dict(child)
1231+
):
12231232
current_size = len(child)
12241233
if current_size > largest_size:
12251234
largest_size = current_size
12261235
largest_module_list = child
1227-
logger.debug(f"Found ModuleList at {current_path} with {current_size} modules")
1236+
logger.debug(f"Found {type(child).__name__} at {current_path} with {current_size} modules")
12281237

12291238
# Continue recursive search
12301239
_recursive_search(child, current_path)
12311240

12321241
_recursive_search(model)
12331242

12341243
if largest_module_list is not None:
1235-
logger.info(f"Largest ModuleList found with {largest_size} modules")
1244+
logger.info(f"Largest layer container found with {largest_size} modules")
12361245
else:
1237-
logger.warning("No ModuleList found in the model")
1246+
logger.warning("No ModuleList or ModuleDict found in the model")
12381247

12391248
return largest_module_list
12401249

@@ -1320,6 +1329,8 @@ def _extend_layers(layers, modules):
13201329
for m in modules:
13211330
if isinstance(m, nn.ModuleList):
13221331
layers.extend(m)
1332+
elif isinstance(m, nn.ModuleDict):
1333+
layers.extend(m.values())
13231334
else:
13241335
layers.append(m)
13251336

@@ -1338,15 +1349,20 @@ def _extend_layers(layers, modules):
13381349
elif hasattr(model, "layers"):
13391350
layers.extend(model.layers)
13401351
else:
1341-
# Use heuristic to find the largest ModuleList in the model
1352+
# Use heuristic to find the largest layer container in the model.
13421353
logger.warning(f"Unknown model type: {model_cls}. Using heuristic to find transformer layers.")
13431354
largest_module_list = _find_largest_module_list(model)
13441355
if largest_module_list is None:
1345-
# If no ModuleList found, still raise an exception
1356+
# If no layer container is found, still raise an exception.
13461357
print(model)
1347-
raise ValueError(f"Unknown model type: {model_cls} and no ModuleList found in model structure")
1358+
raise ValueError(
1359+
f"Unknown model type: {model_cls} and no ModuleList or ModuleDict found in model structure"
1360+
)
13481361

1349-
layers.extend(largest_module_list)
1362+
if isinstance(largest_module_list, nn.ModuleDict):
1363+
layers.extend(largest_module_list.values())
1364+
else:
1365+
layers.extend(largest_module_list)
13501366
logger.info(f"Successfully extracted {len(largest_module_list)} layers using heuristic")
13511367

13521368
assert all(isinstance(m, nn.Module) for m in layers), "layers shoudl be nn.Module instances"

tests/unit_tests/distributed/test_parallelizer.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1525,8 +1525,9 @@ class TestExtractModelLayers:
15251525
15261526
Covers the PR that replaced ``layers.extend(_reduce_attrs(...))`` with a
15271527
helper that flattens ModuleList elements so each decoder layer ends up as
1528-
its own list entry (what AC wrapping expects), while leaving non-ModuleList
1529-
results (e.g. ModuleDict after PP split) appended as-is.
1528+
its own list entry (what AC wrapping expects). PP splitting represents kept
1529+
layer subsets as ModuleDicts, and those layer containers should be flattened
1530+
the same way.
15301531
"""
15311532

15321533
def _make_layers(self, n: int) -> nn.ModuleList:
@@ -1616,14 +1617,13 @@ def test_multi_fqn_flattens_each_modulelist(self):
16161617
assert [id(r) for r in result[5:]] == [id(item) for item in vis]
16171618
assert not any(isinstance(r, nn.ModuleList) for r in result)
16181619

1619-
def test_non_modulelist_element_appended_as_single_entry(self):
1620+
def test_moduledict_layer_container_flattens(self):
16201621
"""PP post-split: ``_reduce_attrs`` returns a ModuleDict.
16211622
1622-
A ModuleDict is NOT an nn.ModuleList, so ``_extend_layers`` must fall
1623-
through to ``layers.append(m)`` and keep it as a single element —
1624-
same behaviour as before the fix (the AC loop then skips it via
1625-
hasattr, which is the expected PP-path behaviour and handled
1626-
elsewhere for the happy PP case).
1623+
The pipeline splitter replaces a ModuleList with a numeric-key
1624+
ModuleDict. ``_extract_model_layers`` must still return individual
1625+
layers so AC, TP follow-up logic, and FSDP layer handling see the same
1626+
shape as the unsplit path.
16271627
"""
16281628
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
16291629

@@ -1635,9 +1635,8 @@ def test_non_modulelist_element_appended_as_single_entry(self):
16351635

16361636
result = _extract_model_layers(model)
16371637

1638-
# ModuleDict is not flattened — it stays as one element.
1639-
assert len(result) == 1
1640-
assert result[0] is layer_dict
1638+
assert len(result) == 2
1639+
assert [id(r) for r in result] == [id(v) for v in layer_dict.values()]
16411640

16421641
def test_fallback_branch_still_handles_modulelist(self):
16431642
"""Non-MODEL_CLS_TO_LAYERS models hit the ``hasattr(model.model, 'layers')``
@@ -1671,6 +1670,17 @@ def __init__(self, layer_dict):
16711670
assert len(result) == 3
16721671
assert [id(r) for r in result] == [id(v) for v in layer_dict.values()]
16731672

1673+
def test_heuristic_ignores_named_moduledict(self):
1674+
"""The unknown-model heuristic should not treat arbitrary ModuleDicts as layers."""
1675+
1676+
class UnknownWithAdapterRegistry(nn.Module):
1677+
def __init__(self):
1678+
super().__init__()
1679+
self.adapters = nn.ModuleDict({"default": nn.Linear(4, 4)})
1680+
1681+
with pytest.raises(ValueError, match="no ModuleList or ModuleDict found"):
1682+
_extract_model_layers(UnknownWithAdapterRegistry())
1683+
16741684
def test_string_keyed_mistral3_fp8_vlm(self):
16751685
"""The ``"Mistral3FP8VLMForConditionalGeneration"`` string-key entry
16761686
catches the runtime class produced by ``_get_mixin_wrapped_class``

tests/unit_tests/distributed/test_qwen3_5_tp_and_grad_sync.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ class TestExtractModelLayersStringFallbackAndNoneSafe:
110110
(which happen after PP stage split strips unused sub-modules).
111111
"""
112112

113-
def _make_fake_qwen35(self, visual_is_none: bool):
113+
def _make_fake_qwen35(self, visual_is_none: bool, layers_as_module_dict: bool = False):
114114
"""Build a stand-in object whose type().__name__ is
115115
'Qwen3_5ForConditionalGeneration' but is NOT the real class — this
116116
mimics the lazy-import / deepcopy class-identity drift case."""
@@ -121,7 +121,10 @@ class Qwen3_5ForConditionalGeneration(nn.Module): # noqa: N801 (name intention
121121
model = Qwen3_5ForConditionalGeneration()
122122
model.model = nn.Module()
123123
model.model.language_model = nn.Module()
124-
model.model.language_model.layers = nn.ModuleList([nn.Linear(4, 4)])
124+
if layers_as_module_dict:
125+
model.model.language_model.layers = nn.ModuleDict({"0": nn.Linear(4, 4)})
126+
else:
127+
model.model.language_model.layers = nn.ModuleList([nn.Linear(4, 4)])
125128
if not visual_is_none:
126129
model.model.visual = nn.Module()
127130
model.model.visual.blocks = nn.ModuleList([nn.Linear(4, 4)])
@@ -146,6 +149,31 @@ def test_none_intermediate_attribute_skipped_gracefully(self):
146149
assert len(layers) == 1
147150
assert isinstance(layers[0], nn.Linear)
148151

152+
def test_module_dict_pp_stage_layers_are_flattened(self):
153+
model = self._make_fake_qwen35(visual_is_none=True, layers_as_module_dict=True)
154+
# PP splitting replaces ModuleList with ModuleDict keyed by original layer ids.
155+
layers = parallelizer._extract_model_layers(model)
156+
assert len(layers) == 1
157+
assert isinstance(layers[0], nn.Linear)
158+
159+
def test_unknown_pp_stage_module_dict_heuristic(self):
160+
class UnknownPPSplitStage(nn.Module):
161+
pass
162+
163+
model = UnknownPPSplitStage()
164+
model.model = nn.Module()
165+
model.model.language_model = nn.Module()
166+
model.model.language_model.layers = nn.ModuleDict(
167+
{
168+
"0": nn.Linear(4, 4),
169+
"1": nn.Linear(4, 4),
170+
}
171+
)
172+
173+
layers = parallelizer._extract_model_layers(model)
174+
assert len(layers) == 2
175+
assert all(isinstance(x, nn.Linear) for x in layers)
176+
149177

150178
class TestAutoPipelineDeferFsdpGradSyncConversion:
151179
"""AutoPipeline's surface uses the existing FSDP2Config-style knob

0 commit comments

Comments (0)