Skip to content

Commit 58870bb

Browse files
authored
feat: add ATIF rollout ingestion (#495)
1 parent 0d80858 commit 58870bb

File tree

11 files changed

+872
-14
lines changed

11 files changed

+872
-14
lines changed

docs/assets/recipes/trace_ingestion/agent_rollout_distillation.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -411,8 +411,9 @@ def build_arg_parser() -> ArgumentParser:
411411
type=Path,
412412
default=None,
413413
help=(
414-
"Optional directory containing rollout JSONL files. When omitted, `claude_code` defaults to "
415-
"~/.claude/projects and `codex` defaults to ~/.codex/sessions."
414+
"Optional directory containing rollout trace files. `atif` expects standalone JSON trajectory files "
415+
"and requires `--trace-dir`. When omitted, `claude_code` defaults to ~/.claude/projects and `codex` "
416+
"defaults to ~/.codex/sessions."
416417
),
417418
)
418419
parser.add_argument("--model-alias", type=str, default="nvidia-super")
@@ -459,6 +460,8 @@ def build_seed_source(
459460
trace_dir: Path | None,
460461
rollout_format: dd.AgentRolloutFormat,
461462
) -> dd.AgentRolloutSeedSource:
463+
if rollout_format == dd.AgentRolloutFormat.ATIF and trace_dir is None:
464+
raise ValueError("--trace-dir is required when --format atif.")
462465
seed_source_kwargs: dict[str, str | dd.AgentRolloutFormat] = {"format": rollout_format}
463466
if trace_dir is not None:
464467
seed_source_kwargs["path"] = str(trace_dir)

docs/concepts/seed-datasets.md

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,7 @@ Path: {{ relative_path }}
171171

172172
### 🤖 AgentRolloutSeedSource
173173

174-
Parse agent rollout trace files (e.g. from Claude Code or Codex) into a structured seed dataset. Each trace becomes one seed row with normalized metadata and the full message history, ready for distillation or analysis pipelines.
174+
Parse agent rollout trace files (e.g. from ATIF, Claude Code, or Codex) into a structured seed dataset. Each trace becomes one seed row with normalized metadata and the full message history, ready for distillation or analysis pipelines.
175175

176176
```python
177177
seed_source = dd.AgentRolloutSeedSource(
@@ -181,11 +181,13 @@ seed_source = dd.AgentRolloutSeedSource(
181181
config_builder.with_seed_dataset(seed_source)
182182
```
183183

184-
When `path` is omitted, built-in defaults are used:
184+
When `path` is omitted, built-in defaults are used for the vendor-native formats:
185185

186186
- **Claude Code**: `~/.claude/projects`
187187
- **Codex**: `~/.codex/sessions`
188188

189+
ATIF rollouts use standalone `.json` trajectory files and require an explicit `path`.
190+
189191
You can override both the path and file pattern:
190192

191193
```python
@@ -196,10 +198,19 @@ seed_source = dd.AgentRolloutSeedSource(
196198
)
197199
```
198200

201+
For ATIF trajectories:
202+
203+
```python
204+
seed_source = dd.AgentRolloutSeedSource(
205+
format=dd.AgentRolloutFormat.ATIF,
206+
path="my_atif_traces/",
207+
)
208+
```
209+
199210
`AgentRolloutSeedSource` exposes a rich set of seeded columns:
200211

201212
- `trace_id` — unique identifier for the trace
202-
- `source_kind` — the rollout format (e.g. `"claude_code"`, `"codex"`)
213+
- `source_kind` — the rollout format (e.g. `"atif"`, `"claude_code"`, `"codex"`)
203214
- `source_path` — full path to the source file
204215
- `root_session_id` — top-level session identifier
205216
- `agent_id` — agent identifier (if present)

docs/recipes/cards.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ Each recipe is a self-contained example that can be run independently.
107107

108108
**Demonstrates:**
109109

110-
- `AgentRolloutSeedSource` across Claude Code and Codex rollout formats
110+
- `AgentRolloutSeedSource` across ATIF, Claude Code, and Codex rollout formats
111111
- Using normalized trace columns in generation prompts
112112
- Distilling agent traces into reusable structured records
113113

docs/recipes/trace_ingestion/agent_rollout_distillation.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
[Download Code :octicons-download-24:](../../assets/recipes/trace_ingestion/agent_rollout_distillation.py){ .md-button download="agent_rollout_distillation.py" }
22

33
This recipe ingests built-in agent rollout traces with `AgentRolloutSeedSource(...)`, selecting the format with
4-
`--format` and optionally overriding the input directory with `--trace-dir`. It works with `claude_code` and `codex`;
5-
both can use their default locations when `--trace-dir` is omitted. The pipeline turns each imported trace into a
4+
`--format` and optionally overriding the input directory with `--trace-dir`. It works with `atif`, `claude_code`,
5+
and `codex`; `atif` expects standalone `.json` trajectory files and requires `--trace-dir`, while `claude_code` and
6+
`codex` can use their default locations when `--trace-dir` is omitted. The pipeline turns each imported trace into a
67
compact task digest, a standalone instruction-response pair for coding-assistant SFT, and a judge-scored quality
78
signal you can use for downstream filtering. It supports both full dataset creation and in-memory preview mode via
89
`--preview`.

packages/data-designer-config/src/data_designer/config/seed_source.py

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from pathlib import Path
99
from typing import TYPE_CHECKING, Any, Literal
1010

11-
from pydantic import BaseModel, Field, PrivateAttr, field_validator
11+
from pydantic import BaseModel, Field, PrivateAttr, field_validator, model_validator
1212
from typing_extensions import Self
1313

1414
from data_designer.config.errors import InvalidFilePathError
@@ -192,11 +192,14 @@ def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | No
192192

193193

194194
class AgentRolloutFormat(StrEnum):
195+
ATIF = "atif"
195196
CLAUDE_CODE = "claude_code"
196197
CODEX = "codex"
197198

198199

199-
def get_agent_rollout_format_defaults(fmt: AgentRolloutFormat) -> tuple[str, str]:
200+
def get_agent_rollout_format_defaults(fmt: AgentRolloutFormat) -> tuple[str | None, str]:
201+
if fmt == AgentRolloutFormat.ATIF:
202+
return (None, "*.json")
200203
if fmt == AgentRolloutFormat.CLAUDE_CODE:
201204
return (get_claude_code_default_path(), "*.jsonl")
202205
if fmt == AgentRolloutFormat.CODEX:
@@ -215,8 +218,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
215218
path: str | None = Field(
216219
None,
217220
description=(
218-
"Directory containing agent rollout artifacts. When omitted, built-in defaults are used: "
219-
"Claude Code defaults to ~/.claude/projects and Codex defaults to ~/.codex/sessions. "
221+
"Directory containing agent rollout artifacts. This field is required for ATIF trajectories. "
222+
"When omitted, built-in defaults are used for formats that define one. "
220223
"Relative paths are resolved from the current working directory when the config is loaded, "
221224
"not from the config file location."
222225
),
@@ -225,16 +228,26 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
225228
file_pattern: str | None = Field(
226229
None,
227230
description=(
228-
"Case-sensitive filename pattern used to match agent rollout files. When omitted, defaults to '*.jsonl'."
231+
"Case-sensitive filename pattern used to match agent rollout files. When omitted, "
232+
"ATIF defaults to '*.json' while Claude Code and Codex default to '*.jsonl'."
229233
),
230234
)
231235

236+
@model_validator(mode="after")
237+
def validate_runtime_path_source(self) -> Self:
238+
default_path, _ = get_agent_rollout_format_defaults(self.format)
239+
if self.path is None and default_path is None:
240+
raise ValueError(f"🛑 AgentRolloutSeedSource.path is required for format {self.format.value!r}.")
241+
return self
242+
232243
@property
233244
def runtime_path(self) -> str:
234245
if self._runtime_path is not None:
235246
return self._runtime_path
236247
default_path, _ = get_agent_rollout_format_defaults(self.format)
237248
resolved_path = self.path if self.path is not None else default_path
249+
if resolved_path is None:
250+
raise ValueError(f"🛑 AgentRolloutSeedSource.path is required for format {self.format.value!r}.")
238251
self._runtime_path = _resolve_filesystem_runtime_path(resolved_path)
239252
return self._runtime_path
240253

packages/data-designer-config/tests/config/test_seed_source.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,12 @@ def test_filesystem_seed_sources_reject_path_like_file_patterns(
226226
@pytest.mark.parametrize(
227227
("rollout_format", "file_pattern", "error_message"),
228228
[
229+
pytest.param(
230+
AgentRolloutFormat.ATIF,
231+
"nested/trace.json",
232+
"match file names, not relative paths",
233+
id="atif-posix",
234+
),
229235
pytest.param(
230236
AgentRolloutFormat.CLAUDE_CODE,
231237
"",
@@ -250,6 +256,23 @@ def test_agent_rollout_seed_source_rejects_invalid_file_patterns(
250256
AgentRolloutSeedSource(path=str(tmp_path), file_pattern=file_pattern, format=rollout_format)
251257

252258

259+
def test_agent_rollout_seed_source_requires_explicit_atif_path() -> None:
260+
with pytest.raises(ValueError, match="path is required for format 'atif'"):
261+
AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF)
262+
263+
264+
def test_agent_rollout_seed_source_uses_default_atif_file_pattern(tmp_path: Path) -> None:
265+
trace_dir = tmp_path / "atif"
266+
trace_dir.mkdir()
267+
268+
source = AgentRolloutSeedSource(path=str(trace_dir), format=AgentRolloutFormat.ATIF)
269+
270+
assert source.seed_type == "agent_rollout"
271+
assert source.resolved_file_pattern == "*.json"
272+
assert source.recursive is True
273+
assert source.format == AgentRolloutFormat.ATIF
274+
275+
253276
def test_agent_rollout_seed_source_uses_default_claude_path(
254277
monkeypatch: pytest.MonkeyPatch,
255278
tmp_path: Path,

0 commit comments

Comments
 (0)