Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fern/versions/latest/pages/concepts/seed-datasets.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ Directory-backed seed datasets expose these columns:

<Note>
Filesystem matching
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off.
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off. Relative local `path` values are resolved by the active filesystem provider when the seed is validated or read, not when the config object is constructed.
</Note>

### 📄 FileContentsSeedSource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,21 @@ class FileSystemSeedSource(SeedSource, ABC):
``FileSystemSeedReader`` implementation.

Attributes:
path: Directory containing seed artifacts. Relative paths are resolved
from the current working directory when the config is loaded, not
from the config file location.
path: Directory containing seed artifacts. Relative local paths are
resolved by the active filesystem provider when the seed is
validated or read, not when the config object is constructed.
file_pattern: Case-sensitive filename pattern used to match files under
the provided directory. Patterns match basenames only, not relative
paths. Defaults to ``'*'``.
recursive: Whether to search nested subdirectories under the provided
directory for matching files. Defaults to ``True``.
"""

_runtime_path: str | None = PrivateAttr(default=None)

path: str = Field(
...,
description=(
"Directory containing seed artifacts. Relative paths are resolved from the current working "
"directory when the config is loaded, not from the config file location."
"Directory containing seed artifacts. Relative local paths are resolved by the active filesystem "
"provider when the seed is validated or read, not when the config object is constructed."
),
)
file_pattern: str = Field(
Expand All @@ -137,20 +135,17 @@ def validate_path(cls, value: str | None) -> str | None:
# and inherited validators fire for all subclasses.
return _validate_filesystem_seed_source_path(value)

def model_post_init(self, __context: Any) -> None:
# None guard is exercised by AgentRolloutSeedSource (path: str | None) via inheritance.
self._runtime_path = None if self.path is None else _resolve_filesystem_runtime_path(self.path)

@property
def runtime_path(self) -> str:
if self._runtime_path is None:
self._runtime_path = _resolve_filesystem_runtime_path(self.path)
return self._runtime_path

@field_validator("file_pattern", mode="after")
def validate_file_pattern(cls, value: str | None) -> str | None:
return _validate_filesystem_seed_source_file_pattern(value)

@property
def runtime_path(self) -> str:
# Path resolution and existence checks are the filesystem provider's job at read
# time, not the config object's. Keeping the raw value here preserves relative
# paths and avoids assuming a local filesystem.
return self.path


class DirectorySeedSource(FileSystemSeedSource):
seed_type: Literal["directory"] = "directory"
Expand Down Expand Up @@ -205,9 +200,8 @@ def get_pi_coding_agent_default_path() -> str:
def _validate_filesystem_seed_source_path(value: str | None) -> str | None:
if value is None:
return None
path = Path(value).expanduser().resolve()
if not path.is_dir():
raise InvalidFilePathError(f"πŸ›‘ Path {path} is not a directory.")
if not value.strip():
raise InvalidFilePathError("πŸ›‘ FileSystemSeedSource.path must be a non-empty string.")
return value


Expand Down Expand Up @@ -259,8 +253,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
"Claude Code defaults to ~/.claude/projects, Codex defaults to ~/.codex/sessions, "
"Hermes Agent defaults to ~/.hermes/sessions, "
"and Pi Coding Agent defaults to ~/.pi/agent/sessions. "
"Relative paths are resolved from the current working directory when the config is loaded, "
"not from the config file location."
"Relative local paths are resolved by the active filesystem provider when the seed is "
"validated or read, not when the config object is constructed."
),
)

Expand All @@ -282,14 +276,14 @@ def validate_runtime_path_source(self) -> Self:

@property
def runtime_path(self) -> str:
if self._runtime_path is not None:
return self._runtime_path
# Path resolution and existence checks happen in the filesystem provider at read
# time. When no explicit path is given, fall back to the format's default root.
if self.path is not None:
return self.path
default_path, _ = get_agent_rollout_format_defaults(self.format)
resolved_path = self.path if self.path is not None else default_path
if resolved_path is None:
if default_path is None:
raise ValueError(f"πŸ›‘ AgentRolloutSeedSource.path is required for format {self.format.value!r}.")
self._runtime_path = _resolve_filesystem_runtime_path(resolved_path)
return self._runtime_path
return default_path

@property
def resolved_file_pattern(self) -> str:
Expand Down
74 changes: 64 additions & 10 deletions packages/data-designer-config/tests/config/test_seed_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Literal

import pytest

Expand All @@ -15,6 +16,7 @@
AgentRolloutSeedSource,
DirectorySeedSource,
FileContentsSeedSource,
FileSystemSeedSource,
LocalFileSeedSource,
)
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
Expand Down Expand Up @@ -95,12 +97,14 @@ def test_dataframe_seed_source_serialization() -> None:
assert serialized == {"seed_type": "df"}


def test_directory_seed_source_requires_directory(tmp_path: Path) -> None:
def test_directory_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
file_path = tmp_path / "file.txt"
file_path.write_text("alpha", encoding="utf-8")

with pytest.raises(InvalidFilePathError, match="is not a directory"):
DirectorySeedSource(path=str(file_path))
source = DirectorySeedSource(path=str(file_path))

assert source.path == str(file_path)
assert source.runtime_path == str(file_path)


def test_directory_seed_source_preserves_relative_path_input(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
Expand Down Expand Up @@ -146,7 +150,7 @@ def test_file_contents_seed_source_preserves_relative_path_input(
pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"),
],
)
def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(
def test_filesystem_seed_sources_preserve_raw_runtime_path_across_cwd_changes(
source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
source_kwargs: dict[str, str],
tmp_path: Path,
Expand All @@ -160,12 +164,11 @@ def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(

monkeypatch.chdir(initial_root)
source = source_type(path="seed-dir", **source_kwargs)
expected_runtime_path = str(initial_seed_dir.resolve())

monkeypatch.chdir(later_root)

assert source.path == "seed-dir"
assert source.runtime_path == expected_runtime_path
assert source.runtime_path == "seed-dir"
assert source.model_dump(mode="json")["path"] == "seed-dir"


Expand All @@ -176,10 +179,10 @@ def test_seed_source_path_descriptions_document_cwd_resolution() -> None:

assert "current working directory" in local_path_description
assert "config file location" in local_path_description
assert "current working directory" in directory_path_description
assert "config file location" in directory_path_description
assert "current working directory" in file_contents_path_description
assert "config file location" in file_contents_path_description
assert "active filesystem provider" in directory_path_description
assert "config object is constructed" in directory_path_description
assert "active filesystem provider" in file_contents_path_description
assert "config object is constructed" in file_contents_path_description


def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None:
Expand Down Expand Up @@ -223,6 +226,17 @@ def test_filesystem_seed_sources_reject_path_like_file_patterns(
source_type(path=str(tmp_path), file_pattern=file_pattern)


def test_filesystem_seed_source_subclass_inherits_runtime_path(tmp_path: Path) -> None:
    """A bare ``FileSystemSeedSource`` subclass exposes ``runtime_path`` unchanged."""

    # Plugin authors subclass FileSystemSeedSource directly; readers rely on
    # `source.runtime_path`, so the base must provide it without an override.
    class PluginSeedSource(FileSystemSeedSource):
        seed_type: Literal["plugin-seed-source"] = "plugin-seed-source"

    raw_path = str(tmp_path)
    plugin_source = PluginSeedSource(path=raw_path)

    assert plugin_source.runtime_path == raw_path


@pytest.mark.parametrize(
("rollout_format", "file_pattern", "error_message"),
[
Expand Down Expand Up @@ -267,6 +281,46 @@ def test_agent_rollout_seed_source_requires_explicit_atif_path() -> None:
AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF)


def test_agent_rollout_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
    """Constructing a source for a directory that does not exist must not raise."""
    missing_dir_str = str(tmp_path / "does-not-exist")

    rollout_source = AgentRolloutSeedSource(path=missing_dir_str, format=AgentRolloutFormat.ATIF)

    # Both the stored value and the runtime value are the raw input string.
    assert rollout_source.path == missing_dir_str
    assert rollout_source.runtime_path == missing_dir_str


def test_agent_rollout_seed_source_preserves_raw_runtime_path_across_cwd_changes(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A relative ``path`` stays relative even after the working directory changes."""
    relative_seed_dir = "seed-dir"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    (first_cwd / "seed-dir").mkdir(parents=True)
    second_cwd.mkdir()

    # Build the source while the relative path points at a real directory...
    monkeypatch.chdir(first_cwd)
    rollout_source = AgentRolloutSeedSource(path=relative_seed_dir, format=AgentRolloutFormat.ATIF)

    # ...then move somewhere the relative path no longer exists.
    monkeypatch.chdir(second_cwd)

    assert rollout_source.path == "seed-dir"
    assert rollout_source.runtime_path == "seed-dir"
    assert rollout_source.model_dump(mode="json")["path"] == "seed-dir"


def test_agent_rollout_seed_source_runtime_path_falls_back_to_format_default(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
) -> None:
    """With no explicit ``path``, ``runtime_path`` is the format's default root under HOME."""
    # Point HOME at a scratch directory so the default root is predictable.
    monkeypatch.setenv("HOME", str(tmp_path))
    expected_default_root = str(tmp_path / ".claude" / "projects")

    rollout_source = AgentRolloutSeedSource(format=AgentRolloutFormat.CLAUDE_CODE)

    assert rollout_source.path is None
    assert rollout_source.runtime_path == expected_default_root


def test_agent_rollout_seed_source_uses_default_atif_file_pattern(tmp_path: Path) -> None:
trace_dir = tmp_path / "atif"
trace_dir.mkdir()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from data_designer.config.errors import InvalidConfigError
from data_designer.config.sampler_params import UUIDSamplerParams
from data_designer.engine.resources.resource_provider import ResourceProvider
from data_designer.engine.resources.seed_reader import SeedReader
from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderConfigError
from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config

logger = logging.getLogger(__name__)
Expand All @@ -31,7 +31,10 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR
if not seed_reader:
return

seed_col_names = seed_reader.get_column_names()
try:
seed_col_names = seed_reader.get_column_names()
except SeedReaderConfigError as error:
raise InvalidConfigError(str(error)) from error
existing_columns = {column.name for column in config.columns}
colliding_columns = {name for name in seed_col_names if name in existing_columns}
if colliding_columns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from copy import copy
from dataclasses import dataclass
from fnmatch import fnmatchcase
from pathlib import Path, PurePosixPath
from pathlib import Path, PurePath, PurePosixPath
from typing import TYPE_CHECKING, Any, ClassVar, Generic, Protocol, TypeVar, get_args, get_origin

from fsspec.implementations.dirfs import DirFileSystem
Expand Down Expand Up @@ -50,12 +50,37 @@
class SeedReaderError(DataDesignerError):
    """Base class for errors raised by seed readers."""


class SeedReaderConfigError(SeedReaderError):
    """Seed-reader error caused by the seed source configuration.

    Raised, for example, when the configured seed directory is not an existing
    directory on the local filesystem.
    """


@dataclass(frozen=True)
class SeedReaderFileSystemContext:
"""Filesystem and root path available to filesystem seed-reader plugins."""

fs: AbstractFileSystem
root_path: Path
root_path: PurePath
Comment thread
mikeknep marked this conversation as resolved.


class FileSystemProvider(Protocol):
    """Structural interface that turns a runtime path into a rooted fsspec filesystem.

    Any object exposing these two keyword-only methods satisfies the protocol.
    """

    def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext:
        """Return the filesystem context rooted at ``runtime_path``."""
        ...

    def ensure_root_exists(self, *, runtime_path: str) -> None:
        """Verify that ``runtime_path`` can be used as a seed root.

        Returns ``None`` on success; implementations signal a missing root by
        raising.
        """
        ...


class LocalFileSystemProvider:
    """Default filesystem provider backed by the local disk.

    Both public methods expand ``~`` and resolve ``runtime_path`` to an absolute
    path at call time, so a relative path is interpreted against the working
    directory in effect when the method runs.
    """

    @staticmethod
    def _resolve_root(runtime_path: str) -> Path:
        """Expand ``~`` and resolve ``runtime_path`` to an absolute local path."""
        return Path(runtime_path).expanduser().resolve()

    def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext:
        """Return a context whose filesystem is rooted at the resolved ``runtime_path``.

        This method does not check that the directory exists; call
        ``ensure_root_exists`` first when that guarantee is needed.
        """
        resolved_root_path = self._resolve_root(runtime_path)
        rooted_fs = DirFileSystem(path=str(resolved_root_path), fs=LocalFileSystem())
        return SeedReaderFileSystemContext(fs=rooted_fs, root_path=resolved_root_path)

    def ensure_root_exists(self, *, runtime_path: str) -> None:
        """Check that the resolved ``runtime_path`` is an existing directory.

        Raises:
            SeedReaderConfigError: If the resolved path is not a directory
                (missing, or present but not a directory).
        """
        resolved_root_path = self._resolve_root(runtime_path)
        if not resolved_root_path.is_dir():
            raise SeedReaderConfigError(f"🛑 Seed source directory '{resolved_root_path}' does not exist.")


class SeedReaderBatch(Protocol):
Expand Down Expand Up @@ -388,12 +413,23 @@ class FileSystemSeedReader(SeedReader[FileSystemSourceT], ABC):

output_columns: ClassVar[list[str] | None] = None

def __init__(self, fs_provider: FileSystemProvider | None = None) -> None:
    """Store the filesystem provider, falling back to the local-disk provider.

    Args:
        fs_provider: Provider used to build filesystem contexts. When omitted
            (or falsy), a ``LocalFileSystemProvider`` is created.
    """
    chosen_provider = fs_provider or LocalFileSystemProvider()
    self._fs_provider = chosen_provider

def _reset_attachment_state(self) -> None:
    """Reset the base attachment state, then clear this reader's cached values."""
    super()._reset_attachment_state()
    # Cached filesystem context and dataframes all go back to None together.
    self._filesystem_context = self._output_df = self._row_manifest_df = None

def create_filesystem_context(self, root_path: Path | str) -> SeedReaderFileSystemContext:
    """Build a rooted filesystem context for directory-backed seed readers.

    Kept as a hook for existing plugin readers. New host integrations should
    instead pass a ``FileSystemProvider`` to the reader constructor.

    Args:
        root_path: Root directory; converted to ``str`` before being handed
            to the provider.
    """
    provider = self._get_fs_provider()
    runtime_path = str(root_path)
    return provider.create_context(runtime_path=runtime_path)

def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection:
return self.create_dataframe_duckdb_connection(
table_name=self.get_dataset_uri(),
Expand Down Expand Up @@ -495,10 +531,18 @@ def _get_filesystem_context(self) -> SeedReaderFileSystemContext:
self._ensure_attached()
context = getattr(self, "_filesystem_context", None)
if context is None:
self._get_fs_provider().ensure_root_exists(runtime_path=self.source.runtime_path)
context = self.create_filesystem_context(self.source.runtime_path)
self._filesystem_context = context
return context

def _get_fs_provider(self) -> FileSystemProvider:
    """Return the reader's filesystem provider, creating the local default if unset.

    The attribute is read with ``getattr`` so a missing ``_fs_provider`` is
    treated the same as ``None``.
    NOTE(review): presumably this covers readers whose ``__init__`` never ran
    the base constructor; confirm against existing plugin readers.
    """
    existing_provider = getattr(self, "_fs_provider", None)
    if existing_provider is not None:
        return existing_provider

    # Nothing stored yet: create the local provider once and remember it.
    fallback_provider = LocalFileSystemProvider()
    self._fs_provider = fallback_provider
    return fallback_provider

def _get_manifest_dataset_uri(self) -> str:
    """Return the internal table name built for the ``"manifest"`` suffix."""
    manifest_table_name = self._build_internal_table_name("manifest")
    return manifest_table_name

Expand Down Expand Up @@ -653,8 +697,17 @@ def _get_parse_context(self, context: SeedReaderFileSystemContext) -> AgentRollo
if self._parse_context is not self._PARSE_CONTEXT_UNSET:
return self._parse_context

# Agent rollout handlers operate on the local filesystem directly (root_path.glob,
# root_path / relative_path), so they require a concrete Path rather than the
# PurePath the context type permits for remote providers.
root_path = context.root_path
if not isinstance(root_path, Path):
raise SeedReaderConfigError(
f"πŸ›‘ Agent rollout seed readers require a local filesystem, but got non-local root path "
f"{root_path!r} ({type(root_path).__name__})."
)
handler = self.get_format_handler()
self._parse_context = handler.build_parse_context(root_path=context.root_path, recursive=self.source.recursive)
self._parse_context = handler.build_parse_context(root_path=root_path, recursive=self.source.recursive)
return self._parse_context


Expand Down
Loading
Loading