Skip to content

Commit 3d4fc7e

Browse files
committed
feat: add visual search plugin
1 parent f4b9c52 commit 3d4fc7e

12 files changed

Lines changed: 1305 additions & 1 deletion

File tree

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@
88

99
# Plugins
1010
/plugins/data-designer-template/ @NVIDIA-NeMo/data_designer_reviewers
11+
/plugins/data-designer-visual-search/ eric.tramel@gmail.com

docs/catalog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,4 @@ Auto-generated from plugin metadata. Do not edit manually.
55
| Plugin | Version | Column Type | Description |
66
|--------|---------|-------------|-------------|
77
| data-designer-template | 0.1.0 | `text-transform` | Template Data Designer plugin — text transform column generator |
8+
| data-designer-visual-search | 0.1.0 | `visual-search` | Data Designer visual-search plugin |
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
# Owner(s) of this plugin — used to generate the root CODEOWNERS file.
2+
# GitHub accepts @username, @org/team, or email format.
3+
* eric.tramel@gmail.com
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# data-designer-visual-search
2+
3+
Data Designer plugin for visual search over image columns, driven by a vision-language model (VLM).
4+
5+
The `visual-search` column runs a vision-capable chat model with built-in
6+
image-operation tools:
7+
8+
- `open_image`
9+
- `get_image_info`
10+
- `list_images`
11+
- `crop_image`
12+
- `transform_image`
13+
- `edit_color`
14+
15+
Each operation returns an `image_id`. The column keeps intermediate images in
16+
memory and re-attaches tool-produced images to the following model turn, so the
17+
model can inspect a crop or transformed image before deciding what to do next.
18+
Because IDs remain addressable, the model can branch from an earlier image
19+
rather than being forced through a linear edit chain.
20+
21+
## Installation
22+
23+
```bash
24+
pip install data-designer-visual-search
25+
```
26+
27+
## Usage
28+
29+
Once installed, the `visual-search` column type is automatically discovered by
30+
[NeMo Data Designer](https://github.com/NVIDIA-NeMo/DataDesigner).
31+
32+
```python
33+
import pandas as pd
34+
from data_designer.config.config_builder import DataDesignerConfigBuilder
35+
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
36+
from data_designer.interface.data_designer import DataDesigner
37+
38+
seed_df = pd.DataFrame({"image_path": ["/path/to/scene.png"]})
39+
40+
builder = DataDesignerConfigBuilder()
41+
builder.with_seed_dataset(DataFrameSeedSource(df=seed_df))
42+
builder.add_column(
43+
name="visual_answer",
44+
column_type="visual-search",
45+
image_column="image_path",
46+
prompt="Find the red object. Crop or transform the image if that helps.",
47+
model_alias="nvidia-vision",
48+
# Optional: set a model-specific image token here if your endpoint requires
49+
# one in the text for every attached image.
50+
# image_placeholder="<image>",
51+
)
52+
53+
result = DataDesigner(artifact_path="artifacts").preview(builder, num_records=1)
54+
```
55+
56+
The main output column contains the model's final answer. By default the plugin
57+
also writes `{column_name}__image_history`, a compact tree of image IDs, parent
58+
IDs, operations, and dimensions.
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
[project]
5+
name = "data-designer-visual-search"
6+
version = "0.1.0"
7+
description = "Data Designer visual-search plugin"
8+
requires-python = ">=3.10"
9+
dependencies = [
10+
"data-designer>=0.5.7",
11+
"pillow",
12+
"requests",
13+
]
14+
license = "Apache-2.0"
15+
readme = "README.md"
16+
authors = [
17+
{name = "NVIDIA Corporation"},
18+
]
19+
classifiers = [
20+
"Development Status :: 3 - Alpha",
21+
"Programming Language :: Python :: 3",
22+
]
23+
24+
[project.entry-points."data_designer.plugins"]
25+
visual-search = "data_designer_visual_search.plugin:plugin"
26+
27+
[project.urls]
28+
Repository = "https://github.com/NVIDIA-NeMo/DataDesignerPlugins"
29+
30+
[build-system]
31+
requires = ["hatchling"]
32+
build-backend = "hatchling.build"
33+
34+
[tool.hatch.build.targets.wheel]
35+
packages = ["src/data_designer_visual_search"]
36+
37+
[tool.ruff]
38+
extend = "../../pyproject.toml"
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
from typing import Literal
5+
6+
from data_designer.config.base import SingleColumnConfig
7+
from data_designer.config.models import ModalityDataType
8+
from data_designer.config.utils.constants import REASONING_CONTENT_COLUMN_POSTFIX, TRACE_COLUMN_POSTFIX
9+
from data_designer.config.utils.image_helpers import ImageFormat
10+
from data_designer.config.utils.misc import assert_valid_jinja2_template, extract_keywords_from_jinja2_template
11+
from data_designer.config.utils.trace_type import TraceType
12+
from pydantic import Field, model_validator
13+
from typing_extensions import Self
14+
15+
# Closed set of built-in image-operation tool names; used as the element type
# of ``VisualSearchColumnConfig.allowed_tools``.
VisualSearchToolName = Literal[
    "open_image", "get_image_info", "list_images",
    "crop_image", "transform_image", "edit_color",
]
23+
24+
25+
class VisualSearchColumnConfig(SingleColumnConfig):
    """Configuration for image-grounded visual search with local image-operation tools.

    The column runs a vision-capable chat model with built-in image tools. Each tool
    returns an image ID, and subsequent calls may operate on any previous image ID,
    which lets the model branch from earlier points in the image history.
    """

    # Discriminator value identifying this column type.
    column_type: Literal["visual-search"] = "visual-search"

    # --- Required inputs -------------------------------------------------
    image_column: str = Field(description="Column containing a local image path, URL, base64 string, or data URI.")
    prompt: str = Field(description="Jinja2 prompt template for the visual search task.")
    model_alias: str = Field(description="Alias of the vision-capable chat model to use.")

    # --- Optional prompt / image-decoding settings -----------------------
    system_prompt: str | None = Field(default=None, description="Optional Jinja2 system prompt template.")
    image_data_type: ModalityDataType | None = Field(
        description="Optional explicit format for values in image_column. Leave unset for auto-detection.",
        default=None,
    )
    image_format: ImageFormat | None = Field(
        description="Required when image_data_type is base64 and the image format cannot be auto-detected.",
        default=None,
    )
    image_placeholder: str | None = Field(
        description="Optional model-specific image token to include in text for endpoints that require it.",
        default=None,
    )

    # --- Tool-calling behaviour ------------------------------------------
    max_tool_call_turns: int = Field(
        description="Maximum tool-calling turns allowed for each row before the model must answer.",
        default=6,
        ge=1,
    )
    allowed_tools: list[VisualSearchToolName] | None = Field(
        description="Optional allowlist of built-in visual tools. Defaults to all tools.",
        default=None,
    )
    attach_images_after_tool_calls: bool = Field(
        description="Attach resulting tool images back into the next model turn.",
        default=True,
    )

    # --- Output / side-effect column toggles -----------------------------
    include_image_history: bool = Field(
        description="Add a side-effect column with the tree of image operations and IDs.",
        default=True,
    )
    with_trace: TraceType = Field(default=TraceType.NONE, description="Optional chat trace capture mode.")
    extract_reasoning_content: bool = Field(
        description="If True, capture reasoning_content from the final assistant message.",
        default=False,
    )
    use_default_system_prompt: bool = Field(
        description="Prepend built-in instructions explaining image IDs and visual tools.",
        default=True,
    )

    @staticmethod
    def get_column_emoji() -> str:
        """Return the emoji used to represent this column type."""
        return "🔎"

    @property
    def required_columns(self) -> list[str]:
        """Columns this column depends on, de-duplicated in first-seen order.

        The image column always comes first, followed by the keywords extracted
        from the prompt template and, when a system prompt is set, from the
        system prompt template.
        """
        templates = [self.prompt]
        if self.system_prompt:
            templates.append(self.system_prompt)

        names = [self.image_column]
        for template in templates:
            names.extend(extract_keywords_from_jinja2_template(template))

        # dict.fromkeys drops repeats while keeping the first occurrence's position.
        return list(dict.fromkeys(names))

    @property
    def side_effect_columns(self) -> list[str]:
        """Names of the extra columns enabled by the current settings.

        Order is fixed: image history, then trace, then reasoning content.
        """
        extras: list[str] = []
        if self.include_image_history:
            extras.append(f"{self.name}__image_history")
        if self.with_trace != TraceType.NONE:
            extras.append(f"{self.name}{TRACE_COLUMN_POSTFIX}")
        if self.extract_reasoning_content:
            extras.append(f"{self.name}{REASONING_CONTENT_COLUMN_POSTFIX}")
        return extras

    @model_validator(mode="after")
    def validate_templates_and_image_format(self) -> Self:
        """Check both prompt templates and the base64/image_format pairing.

        Raises:
            ValueError: If ``image_data_type`` is base64 and ``image_format`` is unset.
        """
        # The prompt is always checked; the system prompt only when it is non-empty.
        assert_valid_jinja2_template(self.prompt)
        if self.system_prompt:
            assert_valid_jinja2_template(self.system_prompt)

        base64_without_format = self.image_data_type == ModalityDataType.BASE64 and self.image_format is None
        if base64_without_format:
            raise ValueError("image_format is required when image_data_type is base64")
        return self

0 commit comments

Comments
 (0)