NVIDIA-NeMo
diff --git a/‎.github/workflows/docs-preview.yml‎
Lines changed: 0 additions & 1 deletion b/‎.github/workflows/docs-preview.yml‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎README.md‎
Lines changed: 10 additions & 0 deletions b/‎README.md‎
Lines changed: 10 additions & 0 deletions
diff --git a/‎docs/concepts/agent-rollout-ingestion.md‎
Lines changed: 34 additions & 22 deletions b/‎docs/concepts/agent-rollout-ingestion.md‎
Lines changed: 34 additions & 22 deletions
diff --git a/‎packages/data-designer-config/src/data_designer/config/column_configs.py‎
Lines changed: 22 additions & 8 deletions b/‎packages/data-designer-config/src/data_designer/config/column_configs.py‎
Lines changed: 22 additions & 8 deletions
diff --git a/‎packages/data-designer-config/src/data_designer/config/seed_source.py‎
Lines changed: 10 additions & 2 deletions b/‎packages/data-designer-config/src/data_designer/config/seed_source.py‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎packages/data-designer-config/tests/config/test_columns.py‎
Lines changed: 32 additions & 0 deletions b/‎packages/data-designer-config/tests/config/test_columns.py‎
Lines changed: 32 additions & 0 deletions
diff --git a/‎packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/hermes_agent.py‎
Lines changed: 3 additions & 18 deletions b/‎packages/data-designer-engine/src/data_designer/engine/resources/agent_rollout/hermes_agent.py‎
Lines changed: 3 additions & 18 deletions
@@ -6,7 +6,6 @@ on:
     paths:
       - "docs/**"
       - "mkdocs.yml"
-      - "packages/*/src/data_designer/**"
       - ".github/workflows/docs-preview.yml"
 
 jobs:
 
@@ -22,6 +22,16 @@ Data Designer helps you create synthetic datasets that go beyond simple LLM prom
 
 ---
 
+### ⚠️ Security Notice: LiteLLM Supply-Chain Incident (2026-03-24)
+
+On March 24, 2026, malicious versions of `litellm` ([1.82.7 and 1.82.8](https://github.com/BerriAI/litellm/issues/24518)) were published to PyPI containing a credential stealer. The compromised packages were available for [approximately five hours](https://www.okta.com/blog/threat-intelligence/litellm-supply-chain-attack--an-explainer-for-identity-pros/) (10:39 – 16:00 UTC) before being removed.
+
+The only Data Designer releases that could resolve to these versions are **v0.2.2** (Dec 2025) and **v0.2.3** (Jan 2026), which carried a looser `litellm<2` upper bound. Both are nearly three months old, have been superseded by eight subsequent releases, and have been yanked from PyPI as a precaution. All other releases (v0.3.0 – v0.5.3) pinned `litellm` to `>=1.73.6,<1.80.12` and were never compatible with 1.82.x. Starting with v0.5.4, `litellm` is no longer a dependency.
+
+To have been impacted through Data Designer, you would need to have had one of these two old versions explicitly pinned *and* to have run a fresh `pip install` or dependency-cache update that resolved `litellm` during the five-hour window on March 24. If you believe you may be affected, see [BerriAI's incident report](https://github.com/BerriAI/litellm/issues/24518) for remediation steps.
+
+---
+
 ## Quick Start
 
 ### 1. Install
 
@@ -42,6 +42,18 @@ Use `AgentRolloutSeedSource` when you want to work from existing agent traces in
     )
     ```
 
+=== "Pi Coding Agent"
+
+    Uses `~/.pi/agent/sessions` and `*.jsonl` by default. Sessions are tree-structured JSONL files; the active conversation path is resolved automatically.
+
+    ```python
+    import data_designer.config as dd
+
+    seed_source = dd.AgentRolloutSeedSource(
+        format=dd.AgentRolloutFormat.PI_CODING_AGENT,
+    )
+    ```
+
 === "ATIF"
 
     ATIF requires an explicit `path`. See Harbor's [ATIF documentation](https://harborframework.com/docs/trajectory-format) for the format specification.
@@ -63,31 +75,31 @@ You can override `path` and `file_pattern` for any format when your rollout arti
 
 All supported rollout formats map into the same seeded row schema. In the table below, `None` means the source artifact does not expose that field directly, and `derived` means Data Designer computes it from normalized `messages`.
 
-| Normalized field | ATIF | Claude Code | Codex | Hermes Agent |
-|---|---|---|---|---|
-| `trace_id` | `session_id` | `sessionId[:agentId]` | `session_meta.id` or file stem | CLI `session_id` or file stem; gateway file stem |
-| `source_kind` | `"atif"` | `"claude_code"` | `"codex"` | `"hermes_agent"` |
-| `source_path` | Parsed `.json` path | Parsed `.jsonl` trace path | Parsed `rollout-*.jsonl` path | Parsed CLI `.json` or gateway `.jsonl` path |
-| `root_session_id` | `session_id` | `sessionId` or file stem | `trace_id` | `trace_id` |
-| `agent_id` | `None` | `agentId` | `None` | `None` |
-| `is_sidechain` | `False` | `isSidechain` | `False` | `False` |
-| `cwd` | `agent.extra.cwd` | First non-null record `cwd` | `session_meta.cwd` | `None` |
-| `project_path` | `extra.project_path` or `cwd` | `projectPath` or `cwd` | `cwd` | `None` |
-| `git_branch` | `agent.extra.git_branch` | First non-null record `gitBranch` | `session_meta.git_branch` | `None` |
-| `started_at` | Earliest step timestamp | Earliest row timestamp | `session_meta.timestamp` or earliest record timestamp | CLI `session_start`; gateway `created_at` |
-| `ended_at` | Latest step timestamp | Latest row timestamp | Latest record timestamp | CLI `last_updated`; gateway `updated_at` |
-| `messages` | Normalized steps | Normalized trace rows | Normalized response items | Normalized CLI or gateway rows |
-| `source_meta` | ATIF metadata | Claude metadata | Codex metadata | Hermes metadata |
-| `message_count` | `derived` | `derived` | `derived` | `derived` |
-| `tool_call_count` | `derived` | `derived` | `derived` | `derived` |
-| `final_assistant_message` | `derived` | `derived` | `derived` | `derived` |
+| Normalized field | ATIF | Claude Code | Codex | Hermes Agent | Pi Coding Agent |
+|---|---|---|---|---|---|
+| `trace_id` | `session_id` | `sessionId[:agentId]` | `session_meta.id` or file stem | CLI `session_id` or file stem; gateway file stem | Session header `id` |
+| `source_kind` | `"atif"` | `"claude_code"` | `"codex"` | `"hermes_agent"` | `"pi_coding_agent"` |
+| `source_path` | Parsed `.json` path | Parsed `.jsonl` trace path | Parsed `rollout-*.jsonl` path | Parsed CLI `.json` or gateway `.jsonl` path | Parsed `.jsonl` session path |
+| `root_session_id` | `session_id` | `sessionId` or file stem | `trace_id` | `trace_id` | Session header `id` |
+| `agent_id` | `None` | `agentId` | `None` | `None` | `None` |
+| `is_sidechain` | `False` | `isSidechain` | `False` | `False` | `False` |
+| `cwd` | `agent.extra.cwd` | First non-null record `cwd` | `session_meta.cwd` | `None` | Session header `cwd` |
+| `project_path` | `extra.project_path` or `cwd` | `projectPath` or `cwd` | `cwd` | `None` | Session header `cwd` |
+| `git_branch` | `agent.extra.git_branch` | First non-null record `gitBranch` | `session_meta.git_branch` | `None` | `None` |
+| `started_at` | Earliest step timestamp | Earliest row timestamp | `session_meta.timestamp` or earliest record timestamp | CLI `session_start`; gateway `created_at` | Earliest entry timestamp |
+| `ended_at` | Latest step timestamp | Latest row timestamp | Latest record timestamp | CLI `last_updated`; gateway `updated_at` | Latest entry timestamp |
+| `messages` | Normalized steps | Normalized trace rows | Normalized response items | Normalized CLI or gateway rows | Normalized active-path messages |
+| `source_meta` | ATIF metadata | Claude metadata | Codex metadata | Hermes metadata | Pi session metadata |
+| `message_count` | `derived` | `derived` | `derived` | `derived` | `derived` |
+| `tool_call_count` | `derived` | `derived` | `derived` | `derived` | `derived` |
+| `final_assistant_message` | `derived` | `derived` | `derived` | `derived` | `derived` |
 
 ### Notes
 
-- `trace_id`: Claude Code appends `agentId` when present. Hermes uses either the CLI session ID or the gateway transcript file stem.
-- `is_sidechain`: ATIF and Hermes currently normalize this to `False`. Claude Code preserves `isSidechain` directly.
-- `messages`: All formats normalize into the same chat-style message schema. See [Message Traces](traces.md) for the shared block structure.
-- `source_meta`: This is where format-specific details live, such as ATIF copied-context metadata, Claude summaries, Codex response-item types, or Hermes tool/session metadata.
+- `trace_id`: Claude Code appends `agentId` when present. Hermes uses either the CLI session ID or the gateway transcript file stem. Pi uses the session header `id`.
+- `is_sidechain`: ATIF, Hermes, and Pi currently normalize this to `False`. Claude Code preserves `isSidechain` directly.
+- `messages`: All formats normalize into the same chat-style message schema. See [Message Traces](traces.md) for the shared block structure. Pi sessions are tree-structured; only the active conversation path (from the last entry back to root) is included.
+- `source_meta`: This is where format-specific details live, such as ATIF copied-context metadata, Claude summaries, Codex response-item types, Hermes tool/session metadata, or Pi session version and branch information.
 
 ## Example: Summarize a Random Turn
 
 
@@ -44,8 +44,10 @@ class SamplerColumnConfig(SingleColumnConfig):
         conditional_params: Optional dictionary for conditional parameters. The dict keys
             are the conditions that must be met (e.g., "age > 21") for the conditional parameters
             to be used. The values of dict are the parameters to use when the condition is met.
-        convert_to: Optional type conversion to apply after sampling. Must be one of "float", "int", or "str".
-            Useful for converting numerical samples to strings or other types.
+        convert_to: Optional type conversion to apply after sampling. For numerical samplers,
+            must be one of "float", "int", or "str". For datetime and timedelta samplers, accepts
+            a strftime format string (e.g., ``"%Y-%m-%d"``, ``"%m/%d/%Y %H:%M"``). When omitted,
+            datetime/timedelta columns default to ISO-8601 format (e.g., ``2024-01-15T09:30:00``).
 
     Inherited Attributes:
         name (required): Unique name of the column to be generated.
@@ -70,7 +72,12 @@ class SamplerColumnConfig(SingleColumnConfig):
         description="Optional dictionary for conditional parameters; keys are conditions, values are params to use when met",
     )
     convert_to: str | None = Field(
-        default=None, description="Optional type conversion after sampling: 'float', 'int', or 'str'"
+        default=None,
+        description=(
+            "Optional type conversion after sampling: 'float', 'int', or 'str' for numerical samplers; "
+            "a strftime format string (e.g., '%Y-%m-%d') for datetime/timedelta samplers. "
+            "Datetime/timedelta columns default to ISO-8601 (e.g., 2024-01-15T09:30:00) when omitted."
+        ),
     )
     column_type: Literal["sampler"] = "sampler"
 
@@ -178,14 +185,17 @@ def get_column_emoji() -> str:
 
     @property
     def required_columns(self) -> list[str]:
-        """Get columns referenced in the prompt and system_prompt templates.
+        """Get columns referenced in prompt templates and multi-modal context.
 
         Returns:
-            List of unique column names referenced in Jinja2 templates.
+            List of unique column names referenced in Jinja2 templates
+            and multi-modal context configurations.
         """
         required_cols = list(extract_keywords_from_jinja2_template(self.prompt))
         if self.system_prompt:
             required_cols.extend(list(extract_keywords_from_jinja2_template(self.system_prompt)))
+        if self.multi_modal_context:
+            required_cols.extend(ctx.column_name for ctx in self.multi_modal_context)
         return list(set(required_cols))
 
     @property
@@ -586,12 +596,16 @@ def get_column_emoji() -> str:
 
     @property
     def required_columns(self) -> list[str]:
-        """Get columns referenced in the prompt template.
+        """Get columns referenced in the prompt template and multi-modal context.
 
         Returns:
-            List of unique column names referenced in Jinja2 templates.
+            List of unique column names referenced in Jinja2 templates
+            and multi-modal context configurations.
         """
-        return list(extract_keywords_from_jinja2_template(self.prompt))
+        required_cols = list(extract_keywords_from_jinja2_template(self.prompt))
+        if self.multi_modal_context:
+            required_cols.extend(ctx.column_name for ctx in self.multi_modal_context)
+        return list(set(required_cols))
 
     @model_validator(mode="after")
     def assert_prompt_valid_jinja(self) -> Self:
 
@@ -176,6 +176,10 @@ def get_hermes_agent_default_path() -> str:
     return str(Path("~/.hermes/sessions").expanduser())
 
 
+def get_pi_coding_agent_default_path() -> str:
+    return str(Path("~/.pi/agent/sessions").expanduser())
+
+
 def _validate_filesystem_seed_source_path(value: str | None) -> str | None:
     if value is None:
         return None
@@ -200,6 +204,7 @@ class AgentRolloutFormat(StrEnum):
     CLAUDE_CODE = "claude_code"
     CODEX = "codex"
     HERMES_AGENT = "hermes_agent"
+    PI_CODING_AGENT = "pi_coding_agent"
 
 
 def get_agent_rollout_format_defaults(fmt: AgentRolloutFormat) -> tuple[str | None, str]:
@@ -211,6 +216,8 @@ def get_agent_rollout_format_defaults(fmt: AgentRolloutFormat) -> tuple[str | No
         return (get_codex_default_path(), "*.jsonl")
     if fmt == AgentRolloutFormat.HERMES_AGENT:
         return (get_hermes_agent_default_path(), "*.json*")
+    if fmt == AgentRolloutFormat.PI_CODING_AGENT:
+        return (get_pi_coding_agent_default_path(), "*.jsonl")
     raise ValueError(f"🛑 Unknown agent rollout format: {fmt!r}")
 
 
@@ -228,7 +235,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
             "Directory containing agent rollout artifacts. This field is required for ATIF trajectories. "
             "When omitted, built-in defaults are used for formats that define one. "
             "Claude Code defaults to ~/.claude/projects, Codex defaults to ~/.codex/sessions, "
-            "and Hermes Agent defaults to ~/.hermes/sessions. "
+            "Hermes Agent defaults to ~/.hermes/sessions, "
+            "and Pi Coding Agent defaults to ~/.pi/agent/sessions. "
             "Relative paths are resolved from the current working directory when the config is loaded, "
             "not from the config file location."
         ),
@@ -238,7 +246,7 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
         None,
         description=(
             "Case-sensitive filename pattern used to match agent rollout files. When omitted, "
-            "ATIF defaults to '*.json', Claude Code and Codex default to '*.jsonl', "
+            "ATIF defaults to '*.json', Claude Code, Codex, and Pi Coding Agent default to '*.jsonl', "
             "and Hermes Agent defaults to '*.json*'."
         ),
     )
 
@@ -9,6 +9,7 @@
 from data_designer.config.column_configs import (
     EmbeddingColumnConfig,
     ExpressionColumnConfig,
+    ImageColumnConfig,
     LLMCodeColumnConfig,
     LLMJudgeColumnConfig,
     LLMStructuredColumnConfig,
@@ -26,6 +27,7 @@
     is_plugin_column_type,
 )
 from data_designer.config.errors import InvalidConfigError
+from data_designer.config.models import ImageContext
 from data_designer.config.sampler_params import (
     CategorySamplerParams,
     GaussianSamplerParams,
@@ -122,6 +124,36 @@ def test_llm_text_column_config():
         )
 
 
+def test_llm_text_column_config_required_columns_includes_multi_modal_context():
+    config = LLMTextColumnConfig(
+        name="test_llm_text",
+        prompt="Classify this image: {{ description }}",
+        model_alias=stub_model_alias,
+        multi_modal_context=[ImageContext(column_name="image_base64")],
+    )
+    assert set(config.required_columns) == {"description", "image_base64"}
+
+
+def test_llm_text_column_config_required_columns_deduplicates_multi_modal_and_prompt():
+    config = LLMTextColumnConfig(
+        name="test_llm_text",
+        prompt="Classify this: {{ image_col }}",
+        model_alias=stub_model_alias,
+        multi_modal_context=[ImageContext(column_name="image_col")],
+    )
+    assert config.required_columns == ["image_col"]
+
+
+def test_image_column_config_required_columns_includes_multi_modal_context():
+    config = ImageColumnConfig(
+        name="test_image",
+        prompt="Generate based on {{ style }}",
+        model_alias=stub_model_alias,
+        multi_modal_context=[ImageContext(column_name="reference_image")],
+    )
+    assert set(config.required_columns) == {"style", "reference_image"}
+
+
 def test_llm_text_column_config_with_trace_serialization() -> None:
     """Test that with_trace field serializes and deserializes correctly."""
     config = LLMTextColumnConfig(
 
@@ -16,6 +16,7 @@
     coerce_optional_str,
     load_json_object,
     load_jsonl_rows,
+    normalize_message_content,
     normalize_message_role,
     require_string,
     stringify_json_value,
@@ -244,7 +245,7 @@ def normalize_hermes_messages(
             normalized_messages.append(
                 build_message(
                     role="tool",
-                    content=_normalize_message_content(raw_message.get("content")),
+                    content=normalize_message_content(raw_message.get("content")),
                     tool_call_id=require_string(
                         raw_message.get("tool_call_id"),
                         f"Hermes tool message tool_call_id #{message_index} in {file_path}",
@@ -253,7 +254,7 @@ def normalize_hermes_messages(
             )
             continue
 
-        content = _normalize_message_content(raw_message.get("content"))
+        content = normalize_message_content(raw_message.get("content"))
         reasoning_content = coerce_optional_str(raw_message.get("reasoning"))
         tool_calls = normalize_hermes_tool_calls(
             raw_message.get("tool_calls"),
@@ -413,22 +414,6 @@ def _require_message_list(raw_messages: Any, *, file_path: Path, context: str) -
     return raw_messages
 
 
-def _normalize_message_content(content: Any) -> Any:
-    """Coerce Hermes message content into the normalized content shape.
-
-    Args:
-        content: Raw Hermes message content.
-
-    Returns:
-        A string or content-block list compatible with ``build_message``.
-    """
-    if content is None:
-        return ""
-    if isinstance(content, (str, list)):
-        return content
-    return stringify_json_value(content)
-
-
 def _extract_finish_reasons(raw_messages: list[dict[str, Any]]) -> list[str]:
     """Collect distinct assistant finish reasons in first-seen order.