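Address reviewer comments: support multiple images per example in VLM preprocessing, require batch_size=1 in the VLM collator, and clean up target-model handling.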

Mandy3311 · hukongyi · Mandy3311 · commit c8cf4790160d · 2026-03-28T23:16:37.000+08:00
Co-authored-by: hukongyi <hukongyi@cmbchina.com>
diff --git a/specforge/data/preprocessing.py b/specforge/data/preprocessing.py
@@ -202,8 +202,6 @@ def preprocess_vlm_conversations(
             - pixel_values: List of pixel values for images in the examples.
             - image_grid_thw: List of image grid tensors.
     """
-    system_prompt = chat_template.system_prompt
-
     # prepare result
     results = {
         "input_ids": [],
@@ -213,16 +211,15 @@ def preprocess_vlm_conversations(
         "image_grid_thw": [],
     }
 
-    # Note: currently, we assume that each example has only one image
-    for i, image in enumerate(examples["image"]):
+    for i, images in enumerate(examples["images"]):
         source = examples["conversations"][i]
         messages = []
         # messages = [{"role": "system", "content": system_prompt}]
         if not source:
             # if the source is None, skip it
             continue
 
-        if not image:
+        if not images:
             text_messages = []
             convroles = ["user", "assistant"]
             for j, sentence in enumerate(source):
@@ -267,26 +264,17 @@ def preprocess_vlm_conversations(
             source = source[1:]
 
         convroles = ["user", "assistant"]
-        has_added_image = False
+        has_added_images = False
         for j, sentence in enumerate(source):
             role = sentence["role"]
             assert role == convroles[j % 2], f"unexpected role {role}"
             if role == "user":
-                # if the message is from user and has image, process the image
-                if not has_added_image:
-                    messages.append(
-                        {
-                            "role": role,
-                            "content": [
-                                {
-                                    "type": "image",
-                                    "image": image,
-                                },
-                                {"type": "text", "text": sentence["content"]},
-                            ],
-                        }
-                    )
-                    has_added_image = True
+                # Insert all images into the first user message
+                if not has_added_images:
+                    content = [{"type": "image", "image": img} for img in images]
+                    content.append({"type": "text", "text": sentence["content"]})
+                    messages.append({"role": role, "content": content})
+                    has_added_images = True
                 else:
                     messages.append({"role": role, "content": sentence["content"]})
             else:
@@ -319,7 +307,7 @@ def preprocess_vlm_conversations(
         input_ids = encoding.input_ids[0]
         offsets = encoding.offset_mapping[0]
         pixel_values = encoding.pixel_values
-        image_grid_thw = encoding.image_grid_thw[0]
+        image_grid_thw = encoding.image_grid_thw  # shape: (num_images, 3)
 
         # get conversation with image info for loss mask generation
         decoded_conversation = processor.tokenizer.decode(
@@ -335,7 +323,7 @@ def preprocess_vlm_conversations(
         results["loss_mask"].append(loss_mask[None, :])
         results["attention_mask"].append(torch.ones_like(loss_mask)[None, :])
         results["pixel_values"].append(pixel_values)
-        results["image_grid_thw"].append(image_grid_thw[None, :])
+        results["image_grid_thw"].append(image_grid_thw)
     return results
 
 
diff --git a/specforge/data/utils.py b/specforge/data/utils.py
@@ -205,6 +205,10 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
                 - attention_mask: torch.Tensor of shape (B, N)
                 - loss_mask: torch.Tensor of shape (B, N)
         """
+        assert len(features) == 1, (
+            f"VlmDataCollatorWithPadding requires batch_size=1, got {len(features)}. "
+            "Set per_device_train_batch_size=1 in your training config."
+        )
         max_length = max(item["input_ids"].shape[1] for item in features)
         batch_input_ids = torch.cat(
             [self.paddingtensor2D(item["input_ids"], max_length) for item in features]
diff --git a/specforge/modeling/target/dflash_target_model.py b/specforge/modeling/target/dflash_target_model.py
@@ -110,12 +110,8 @@ def _init_vlm_attributes(self):
         self.spatial_merge_size = getattr(vision_config, "spatial_merge_size", 2)
         self.vlm_model_type = getattr(vision_config, "model_type", "")
 
-        text_config = getattr(hf_config, "text_config", hf_config)
         self.tokens_per_second = None
 
-        rope_params = getattr(text_config, "rope_parameters", {}) or {}
-        self.mrope_interleaved = rope_params.get("mrope_interleaved", False)
-
     @classmethod
     def from_pretrained(
         cls,
@@ -437,9 +433,9 @@ def generate_dflash_data(
             output_hidden_states=True,
             use_cache=False,
         )
-        if pixel_values:
+        if pixel_values is not None:
             model_kwargs["pixel_values"] = pixel_values
-        if image_grid_thw:
+        if image_grid_thw is not None:
             model_kwargs["image_grid_thw"] = image_grid_thw
         outputs = self.model(**model_kwargs)
 
diff --git a/specforge/modeling/target/sglang_backend/patch.py b/specforge/modeling/target/sglang_backend/patch.py
@@ -98,7 +98,7 @@ def initialize_model_parallel(
         4 tensor model-parallel groups:
             [g0, g1], [g2, g3], [g4, g5], [g6, g7]
         2 pipeline model-parallel groups:
-            [g0, g2, g4, g6], [b1, g3, g5, g7]
+            [g0, g2, g4, g6], [g1, g3, g5, g7]
     Note that for efficiency, the caller should make sure adjacent ranks
     are on the same DGX box. For example if we are using 2 DGX-1 boxes
     with a total of 16 GPUs, rank 0 to 7 belong to the first box and
diff --git a/specforge/modeling/target/target_utils.py b/specforge/modeling/target/target_utils.py
@@ -63,7 +63,10 @@ def from_pretrained(
         instance = cls(config)
 
         if embed_key is None:
-            embed_key = "model.embed_tokens.weight"
+            if hasattr(config, "text_config") and config.text_config is not None:
+                embed_key = "model.language_model.embed_tokens.weight"
+            else:
+                embed_key = "model.embed_tokens.weight"
         if lm_head_key is None:
             lm_head_key = "lm_head.weight"
 
diff --git a/tests/test_modeling/test_target/test_sglang_backend/test_sglang_backend.py b/tests/test_modeling/test_target/test_sglang_backend/test_sglang_backend.py
@@ -32,7 +32,6 @@ def test_dense(rank, world_size, port, tp_size):
         device="cuda",
         attention_backend="fa3",
         mem_fraction_static=0.4,
-        # enable_torch_compile=True,
         enable_nccl_nvls=True,
         # enable_symm_mem=True,
         enable_symm_mem=False,