feat: support multiple images per column in image context (#257)

nabinchha · web-flow · commit 3d86a38dad66 · 2026-01-28T16:45:44.000-07:00
* Allow an image context column to hold multiple images

* Pack multimodal context at the front of the user message, before the text prompt

* Fix edge case where the image column value is a numpy array
diff --git a/packages/data-designer-config/src/data_designer/config/models.py b/packages/data-designer-config/src/data_designer/config/models.py
@@ -3,6 +3,7 @@
 
 from __future__ import annotations
 
+import json
 import logging
 from abc import ABC, abstractmethod
 from enum import Enum
@@ -65,7 +66,7 @@ class ModalityContext(ABC, BaseModel):
     data_type: ModalityDataType
 
     @abstractmethod
-    def get_context(self, record: dict) -> dict[str, Any]: ...
+    def get_contexts(self, record: dict) -> list[dict[str, Any]]: ...
 
 
 class ImageContext(ModalityContext):
@@ -81,25 +82,53 @@ class ImageContext(ModalityContext):
     modality: Modality = Modality.IMAGE
     image_format: ImageFormat | None = None
 
-    def get_context(self, record: dict) -> dict[str, Any]:
-        """Get the context for the image modality.
+    def get_contexts(self, record: dict) -> list[dict[str, Any]]:
+        """Get the contexts for the image modality.
 
         Args:
-            record: The record containing the image data.
+            record: The record containing the image data. The data can be:
+                - A JSON serialized list of strings
+                - A list of strings
+                - A single string
 
         Returns:
-            The context for the image modality.
+            A list of image contexts.
         """
-        context = dict(type="image_url")
-        context_value = record[self.column_name]
-        if self.data_type == ModalityDataType.URL:
-            context["image_url"] = context_value
+        raw_value = record[self.column_name]
+
+        # Normalize to list of strings
+        if isinstance(raw_value, str):
+            # Try to parse as JSON first
+            try:
+                parsed_value = json.loads(raw_value)
+                if isinstance(parsed_value, list):
+                    context_values = parsed_value
+                else:
+                    context_values = [raw_value]
+            except (json.JSONDecodeError, TypeError):
+                context_values = [raw_value]
+        elif isinstance(raw_value, list):
+            context_values = raw_value
+        elif hasattr(raw_value, "__iter__") and not isinstance(raw_value, (str, bytes, dict)):
+            # Handle array-like objects (numpy arrays, pandas Series, etc.)
+            context_values = list(raw_value)
         else:
-            context["image_url"] = {
-                "url": f"data:image/{self.image_format.value};base64,{context_value}",
-                "format": self.image_format.value,
-            }
-        return context
+            context_values = [raw_value]
+
+        # Build context list
+        contexts = []
+        for context_value in context_values:
+            context = dict(type="image_url")
+            if self.data_type == ModalityDataType.URL:
+                context["image_url"] = context_value
+            else:
+                context["image_url"] = {
+                    "url": f"data:image/{self.image_format.value};base64,{context_value}",
+                    "format": self.image_format.value,
+                }
+            contexts.append(context)
+
+        return contexts
 
     @model_validator(mode="after")
     def _validate_image_format(self) -> Self:
diff --git a/packages/data-designer-config/tests/config/test_models.py b/packages/data-designer-config/tests/config/test_models.py
@@ -4,6 +4,7 @@
 import json
 import tempfile
 from collections import Counter
+from typing import TYPE_CHECKING
 
 import pytest
 import yaml
@@ -24,22 +25,159 @@
     UniformDistributionParams,
     load_model_configs,
 )
+from data_designer.lazy_heavy_imports import np
 
+if TYPE_CHECKING:
+    import numpy as np
 
-def test_image_context_get_context():
+
+def test_image_context_get_contexts_single_string():
+    """Test get_contexts with a single string value."""
     image_context = ImageContext(
         column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
     )
-    assert image_context.get_context({"image_base64": "somebase64encodedimagestring"}) == {
-        "type": "image_url",
-        "image_url": {"url": "data:image/png;base64,somebase64encodedimagestring", "format": "png"},
-    }
+    assert image_context.get_contexts({"image_base64": "somebase64encodedimagestring"}) == [
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,somebase64encodedimagestring", "format": "png"},
+        }
+    ]
 
     image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
-    assert image_context.get_context({"image_url": "https://example.com/examle_image.png"}) == {
-        "type": "image_url",
-        "image_url": "https://example.com/examle_image.png",
-    }
+    assert image_context.get_contexts({"image_url": "https://example.com/examle_image.png"}) == [
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/examle_image.png",
+        }
+    ]
+
+
+def test_image_context_get_contexts_list_of_strings():
+    """Test get_contexts with a list of strings."""
+    image_context = ImageContext(
+        column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
+    )
+    assert image_context.get_contexts({"image_base64": ["image1base64", "image2base64", "image3base64"]}) == [
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
+        },
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
+        },
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image3base64", "format": "png"},
+        },
+    ]
+
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    assert image_context.get_contexts(
+        {"image_url": ["https://example.com/image1.png", "https://example.com/image2.png"]}
+    ) == [
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image1.png",
+        },
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image2.png",
+        },
+    ]
+
+
+def test_image_context_get_contexts_numpy_array():
+    """Test get_contexts with numpy arrays (happens after parquet serialization)."""
+    image_context = ImageContext(
+        column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
+    )
+    numpy_array = np.array(["image1base64", "image2base64"])
+    assert image_context.get_contexts({"image_base64": numpy_array}) == [
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
+        },
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
+        },
+    ]
+
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    numpy_array = np.array(["https://example.com/image1.png", "https://example.com/image2.png"])
+    assert image_context.get_contexts({"image_url": numpy_array}) == [
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image1.png",
+        },
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image2.png",
+        },
+    ]
+
+
+def test_image_context_get_contexts_json_serialized_list():
+    """Test get_contexts with a JSON serialized list of strings."""
+    image_context = ImageContext(
+        column_name="image_base64", data_type=ModalityDataType.BASE64, image_format=ImageFormat.PNG
+    )
+    json_str = json.dumps(["image1base64", "image2base64"])
+    assert image_context.get_contexts({"image_base64": json_str}) == [
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image1base64", "format": "png"},
+        },
+        {
+            "type": "image_url",
+            "image_url": {"url": "data:image/png;base64,image2base64", "format": "png"},
+        },
+    ]
+
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    json_str = json.dumps(["https://example.com/image1.png", "https://example.com/image2.png"])
+    assert image_context.get_contexts({"image_url": json_str}) == [
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image1.png",
+        },
+        {
+            "type": "image_url",
+            "image_url": "https://example.com/image2.png",
+        },
+    ]
+
+
+def test_image_context_get_contexts_json_string_not_list():
+    """Test get_contexts with a JSON string that isn't a list (should treat as single string)."""
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    json_str = json.dumps({"nested": "object"})
+    # Should treat the entire JSON string as a single image URL
+    assert image_context.get_contexts({"image_url": json_str}) == [
+        {
+            "type": "image_url",
+            "image_url": json_str,
+        }
+    ]
+
+
+def test_image_context_get_contexts_invalid_json():
+    """Test get_contexts with invalid JSON string (should treat as single string)."""
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    invalid_json = "not a valid json string"
+    assert image_context.get_contexts({"image_url": invalid_json}) == [
+        {
+            "type": "image_url",
+            "image_url": invalid_json,
+        }
+    ]
+
+
+def test_image_context_get_contexts_empty_list():
+    """Test get_contexts with an empty list."""
+    image_context = ImageContext(column_name="image_url", data_type=ModalityDataType.URL)
+    assert image_context.get_contexts({"image_url": []}) == []
 
 
 def test_image_context_validate_image_format():
diff --git a/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/llm_completion.py b/packages/data-designer-engine/src/data_designer/engine/column_generators/generators/llm_completion.py
@@ -62,9 +62,9 @@ def generate(self, data: dict) -> dict:
 
         multi_modal_context = None
         if self.config.multi_modal_context is not None and len(self.config.multi_modal_context) > 0:
-            multi_modal_context = [
-                context.get_context(deserialized_record) for context in self.config.multi_modal_context
-            ]
+            multi_modal_context = []
+            for context in self.config.multi_modal_context:
+                multi_modal_context.extend(context.get_contexts(deserialized_record))
 
         response, reasoning_trace = self.model.generate(
             prompt=self.prompt_renderer.render(
diff --git a/packages/data-designer-engine/src/data_designer/engine/models/utils.py b/packages/data-designer-engine/src/data_designer/engine/models/utils.py
@@ -21,9 +21,9 @@ def prompt_to_messages(
     user_content = user_prompt
     if multi_modal_context and len(multi_modal_context) > 0:
         user_content = []
-        user_content.append({"type": "text", "text": user_prompt})
         for context in multi_modal_context:
             user_content.append(context)
+        user_content.append({"type": "text", "text": user_prompt})
     return (
         [
             str_to_message(content=system_prompt, role="system"),
diff --git a/packages/data-designer-engine/tests/engine/models/test_model_utils.py b/packages/data-designer-engine/tests/engine/models/test_model_utils.py
@@ -26,11 +26,11 @@ def test_prompt_to_messages():
         {"content": "hello", "role": "user"},
     ]
     assert prompt_to_messages(user_prompt="hello", multi_modal_context=[mult_modal_context]) == [
-        {"content": [{"type": "text", "text": "hello"}, mult_modal_context], "role": "user"}
+        {"content": [mult_modal_context, {"type": "text", "text": "hello"}], "role": "user"}
     ]
     assert prompt_to_messages(
         user_prompt="hello", system_prompt=stub_system_prompt, multi_modal_context=[mult_modal_context]
     ) == [
         {"content": stub_system_prompt, "role": "system"},
-        {"content": [{"type": "text", "text": "hello"}, mult_modal_context], "role": "user"},
+        {"content": [mult_modal_context, {"type": "text", "text": "hello"}], "role": "user"},
     ]

Original file line number	Diff line number	Diff line change
`@@ -26,11 +26,11 @@ def test_prompt_to_messages():`
`26`	`26`	`{"content": "hello", "role": "user"},`
`27`	`27`	`]`
`28`	`28`	`assert prompt_to_messages(user_prompt="hello", multi_modal_context=[mult_modal_context]) == [`
`29`		`- {"content": [{"type": "text", "text": "hello"}, mult_modal_context], "role": "user"}`
	`29`	`+ {"content": [mult_modal_context, {"type": "text", "text": "hello"}], "role": "user"}`
`30`	`30`	`]`
`31`	`31`	`assert prompt_to_messages(`
`32`	`32`	`user_prompt="hello", system_prompt=stub_system_prompt, multi_modal_context=[mult_modal_context]`
`33`	`33`	`) == [`
`34`	`34`	`{"content": stub_system_prompt, "role": "system"},`
`35`		`- {"content": [{"type": "text", "text": "hello"}, mult_modal_context], "role": "user"},`
	`35`	`+ {"content": [mult_modal_context, {"type": "text", "text": "hello"}], "role": "user"},`
`36`	`36`	`]`