Skip to content

Commit fa1a5f8

Browse files
committed
Add hybrid fs provider
Signed-off-by: Mike Knepper <mknepper@nvidia.com>
1 parent d1dca01 commit fa1a5f8

3 files changed

Lines changed: 104 additions & 6 deletions

File tree

packages/data_designer_nemo/src/data_designer_nemo/context.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
)
2424
from data_designer_nemo.errors import NDDError
2525
from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader
26-
from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider
26+
from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider
2727
from data_designer_nemo.model_provider import (
2828
make_local_first_model_provider_registry,
2929
make_model_provider_registry,
@@ -84,12 +84,17 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]:
8484
return errors
8585

8686
def get_seed_readers(self) -> list[SeedReader]:
    """Return the seed readers offered by this context.

    The directory and file-contents readers share one hybrid filesystem
    provider built from this context's SDK client and workspace; the other
    readers are constructed as before.
    """
    # A Directory- or FileContents-style seed may name a local directory or,
    # in local mode, a NeMo Platform fileset. Each reader accepts only a
    # single provider, so both get a hybrid one that tries local disk first
    # and falls back to a fileset.
    hybrid_fs = HybridFileSystemProvider(self._sdk, workspace=self._workspace)
    readers: list[SeedReader] = [
        HuggingFaceSeedReader(),
        LocalFileSeedReader(),
        DataFrameSeedReader(),
        DirectorySeedReader(fs_provider=hybrid_fs),
        FileContentsSeedReader(fs_provider=hybrid_fs),
        AgentRolloutSeedReader(),
        FilesetFileSeedReader(self._sdk),
    ]
    return readers

packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4-
from pathlib import PurePosixPath
4+
from pathlib import Path, PurePosixPath
55

66
from data_designer.engine.resources.seed_reader import (
7+
FileSystemProvider,
8+
LocalFileSystemProvider,
79
SeedReaderConfigError,
810
SeedReaderError,
911
SeedReaderFileSystemContext,
@@ -89,3 +91,42 @@ def _parse(self, runtime_path: str) -> tuple[str, str, str]:
8991
return parse_fileset_ref(runtime_path, workspace_fallback=self._workspace)
9092
except FilesetPathError as error:
9193
raise SeedReaderError(f"🛑 Invalid fileset seed source path {runtime_path!r}: {error}") from error
94+
95+
96+
class HybridFileSystemProvider:
    """Seed filesystem provider that prefers local disk and falls back to a fileset.

    A seed reader accepts only one provider, yet in local mode a
    directory-style seed source can name either a directory on the local
    filesystem or a NeMo Platform fileset. Every call is therefore dispatched
    on its own path: a path that is an existing local directory is served by
    the local provider, and any other path is treated as a fileset reference.
    This is the same local-first ordering used for model-provider resolution
    (locally-defined providers first, Inference Gateway as the fallback).
    """

    def __init__(
        self,
        sdk: NeMoPlatform | AsyncNeMoPlatform,
        *,
        workspace: str,
        validated_roots: set[str] | None = None,
    ) -> None:
        """Create the local and fileset delegates.

        Args:
            sdk: Platform client, forwarded to the fileset provider.
            workspace: Workspace name, forwarded to the fileset provider.
            validated_roots: Optional set, forwarded unchanged to the fileset
                provider.
        """
        self._local = LocalFileSystemProvider()
        self._fileset = FilesetFileSystemProvider(
            sdk,
            workspace=workspace,
            validated_roots=validated_roots,
        )

    def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext:
        """Build a filesystem context for ``runtime_path`` via the matching delegate."""
        delegate = self._route(runtime_path)
        return delegate.create_context(runtime_path=runtime_path)

    def ensure_root_exists(self, *, runtime_path: str) -> None:
        """Run the matching delegate's root-existence check for ``runtime_path``."""
        delegate = self._route(runtime_path)
        delegate.ensure_root_exists(runtime_path=runtime_path)

    def _route(self, runtime_path: str) -> FileSystemProvider:
        """Pick the delegate: local for an existing local directory, else fileset."""
        if _is_local_directory(runtime_path):
            return self._local
        return self._fileset
126+
127+
128+
def _is_local_directory(runtime_path: str) -> bool:
129+
try:
130+
return Path(runtime_path).expanduser().is_dir()
131+
except (OSError, ValueError, RuntimeError):
132+
return False

packages/data_designer_nemo/tests/unit/test_fileset_filesystem_provider.py

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,17 @@
11
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
# SPDX-License-Identifier: Apache-2.0
33

4+
from pathlib import Path
45
from unittest.mock import Mock, patch
56

67
import pytest
7-
from data_designer.engine.resources.seed_reader import SeedReaderConfigError
8-
from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider
8+
from data_designer.engine.resources.seed_reader import (
9+
DirectorySeedReader,
10+
FileContentsSeedReader,
11+
SeedReaderConfigError,
12+
)
13+
from data_designer_nemo.context import LocalDataDesignerContext
14+
from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider
915

1016

1117
def test_create_context_roots_reader_in_canonical_fileset_ref() -> None:
@@ -44,3 +50,49 @@ def test_ensure_root_exists_reports_missing_fileset_path() -> None:
4450
provider.ensure_root_exists(runtime_path="docs#corpus")
4551

4652
assert fs_class.return_value.exists.call_count == 2
53+
54+
55+
def test_hybrid_routes_existing_local_directory_to_disk(tmp_path: Path) -> None:
    """An existing local directory is served from disk without building a fileset filesystem."""
    hybrid = HybridFileSystemProvider(Mock(), workspace="default")
    local_dir = str(tmp_path)

    with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fileset_fs_cls:
        ctx = hybrid.create_context(runtime_path=local_dir)
        hybrid.ensure_root_exists(runtime_path=local_dir)

    assert ctx.root_path == tmp_path.resolve()
    fileset_fs_cls.assert_not_called()
65+
66+
67+
def test_hybrid_routes_non_local_path_to_fileset() -> None:
    """A path that is not a local directory is resolved as a fileset reference."""
    fake_sdk = Mock()
    hybrid = HybridFileSystemProvider(fake_sdk, workspace="default")

    with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fileset_fs_cls:
        fake_fs = fileset_fs_cls.return_value
        fake_fs.async_impl = True
        fake_fs.asynchronous = False
        ctx = hybrid.create_context(runtime_path="docs#corpus")

    fileset_fs_cls.assert_called_once_with(fake_sdk)
    assert str(ctx.root_path) == "default/docs#corpus"
78+
79+
80+
def test_hybrid_ensure_root_exists_validates_fileset_for_non_local_path() -> None:
    """For a non-local path, the root check is the fileset one and reports a missing path."""
    hybrid = HybridFileSystemProvider(Mock(), workspace="default")

    with patch("data_designer_nemo.fileset_filesystem_provider.FilesetFileSystem") as fileset_fs_cls:
        fake_fs = fileset_fs_cls.return_value
        fake_fs.exists.side_effect = [False, True]

        with pytest.raises(SeedReaderConfigError, match="Path 'corpus' not found in fileset 'default/docs'"):
            hybrid.ensure_root_exists(runtime_path="docs#corpus")

    assert fileset_fs_cls.return_value.exists.call_count == 2
91+
92+
93+
def test_local_context_wires_hybrid_provider_into_filesystem_readers() -> None:
    """The local context gives both filesystem-backed readers a hybrid provider."""
    context = LocalDataDesignerContext(Mock(), "default")

    fs_readers = [
        reader
        for reader in context.get_seed_readers()
        if isinstance(reader, DirectorySeedReader | FileContentsSeedReader)
    ]

    assert len(fs_readers) == 2
    for reader in fs_readers:
        assert isinstance(reader._fs_provider, HybridFileSystemProvider)

0 commit comments

Comments
 (0)