Skip to content

Commit 35e1253

Browse files
committed
Local validation plus some refactoring
Signed-off-by: Mike Knepper <mknepper@nvidia.com>
1 parent fa1a5f8 commit 35e1253

12 files changed

Lines changed: 238 additions & 219 deletions

File tree

packages/data_designer_nemo/src/data_designer_nemo/context.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323
)
2424
from data_designer_nemo.errors import NDDError
2525
from data_designer_nemo.fileset_file_seed_reader import FilesetFileSeedReader
26-
from data_designer_nemo.fileset_filesystem_provider import FilesetFileSystemProvider, HybridFileSystemProvider
26+
from data_designer_nemo.fileset_filesystem_provider import (
27+
FilesetFileSystemProvider,
28+
HybridFileSystemProvider,
29+
)
2730
from data_designer_nemo.model_provider import (
2831
make_local_first_model_provider_registry,
2932
make_model_provider_registry,
@@ -34,10 +37,7 @@
3437
from data_designer_nemo.sdk_translation import sync_to_async_sdk
3538
from data_designer_nemo.secret_resolver import NMPSecretResolver
3639
from data_designer_nemo.seed import validate_seed
37-
from data_designer_nemo.unsupported_features import (
38-
validate_no_tool_configs,
39-
validate_seed_config_for_execution_context,
40-
)
40+
from data_designer_nemo.tool_configs import validate_no_tool_configs
4141
from nemo_platform import AsyncNeMoPlatform, NeMoPlatform
4242

4343

@@ -65,6 +65,7 @@ class LocalDataDesignerContext:
6565
def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str):
6666
self._sdk = sdk
6767
self._workspace = workspace
68+
self._validated_filesystem_roots: set[str] = set()
6869

6970
def get_secret_resolver(self) -> SecretResolver:
7071
return CompositeResolver(
@@ -76,19 +77,25 @@ def get_secret_resolver(self) -> SecretResolver:
7677
)
7778

7879
async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]:
80+
sdk = self._async_sdk()
7981
errors: list[NDDError] = []
82+
8083
try:
81-
validate_seed_config_for_execution_context(config, is_local=True)
84+
if validated_root := await validate_seed(config, self._workspace, sdk, is_local=True):
85+
self._validated_filesystem_roots.add(validated_root)
8286
except NDDError as e:
8387
errors.append(e)
88+
8489
return errors
8590

8691
def get_seed_readers(self) -> list[SeedReader]:
8792
# Directory- and FileContents-style seeds may reference either a local
8893
# directory or a NeMo Platform fileset in local mode. The engine only
8994
# accepts one provider per reader, so we inject a hybrid provider that
9095
# resolves each seed path against local disk first, then a fileset.
91-
fs_provider = HybridFileSystemProvider(self._sdk, workspace=self._workspace)
96+
fs_provider = HybridFileSystemProvider(
97+
self._sdk, workspace=self._workspace, validated_roots=self._validated_filesystem_roots
98+
)
9299
return [
93100
HuggingFaceSeedReader(),
94101
LocalFileSeedReader(),
@@ -114,6 +121,11 @@ async def get_model_providers(self, model_configs: list[dd.ModelConfig]) -> list
114121

115122
return [make_noop_provider()]
116123

124+
def _async_sdk(self) -> AsyncNeMoPlatform:
    """Return an async SDK client for the SDK this context was built with.

    A synchronous ``NeMoPlatform`` client is converted with
    ``sync_to_async_sdk``; any other client is returned unchanged.
    """
    client = self._sdk
    return sync_to_async_sdk(client) if isinstance(client, NeMoPlatform) else client
128+
117129

118130
class RemoteDataDesignerContext:
119131
def __init__(self, sdk: AsyncNeMoPlatform | NeMoPlatform, workspace: str):
@@ -132,15 +144,13 @@ async def validate(self, config: dd.DataDesignerConfig) -> list[NDDError]:
132144
validate_no_tool_configs(config)
133145
except NDDError as e:
134146
errors.append(e)
147+
135148
try:
136-
validate_seed_config_for_execution_context(config, is_local=False)
137-
except NDDError as e:
138-
errors.append(e)
139-
try:
140-
if validated_root := await validate_seed(config, self._workspace, sdk):
149+
if validated_root := await validate_seed(config, self._workspace, sdk, is_local=False):
141150
self._validated_filesystem_roots.add(validated_root)
142151
except NDDError as e:
143152
errors.append(e)
153+
144154
try:
145155
await ensure_nemotron_personas_filesets(config, sdk)
146156
except NDDError as e:

packages/data_designer_nemo/src/data_designer_nemo/fileset_filesystem_provider.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class _FilesetDirFileSystem(DirFileSystem):
3030
this remains a complete AbstractFileSystem implementation.
3131
"""
3232

33-
def _relpath(self, path: str | list) -> str | list: # type: ignore[override]
33+
def _relpath(self, path: str | list) -> str | list:
3434
if isinstance(path, list):
3535
return [self._relpath(p) for p in path]
3636
if not self.path:
@@ -122,10 +122,16 @@ def ensure_root_exists(self, *, runtime_path: str) -> None:
122122
self._route(runtime_path).ensure_root_exists(runtime_path=runtime_path)
123123

124124
def _route(self, runtime_path: str) -> FileSystemProvider:
125-
return self._local if _is_local_directory(runtime_path) else self._fileset
125+
return self._local if is_local_directory(runtime_path) else self._fileset
126126

127127

128-
def _is_local_directory(runtime_path: str) -> bool:
128+
def is_local_directory(runtime_path: str) -> bool:
129+
"""Whether a seed path resolves to an existing directory on the local filesystem.
130+
131+
Shared by ``HybridFileSystemProvider`` routing and local-mode seed validation so
132+
that eager validation and read-time routing always agree on which backend serves
133+
a given path.
134+
"""
129135
try:
130136
return Path(runtime_path).expanduser().is_dir()
131137
except (OSError, ValueError, RuntimeError):

packages/data_designer_nemo/src/data_designer_nemo/seed.py

Lines changed: 95 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,30 +2,58 @@
22
# SPDX-License-Identifier: Apache-2.0
33

44
import logging
5+
from typing import Any
56

67
import data_designer.config as dd
78
from data_designer.config.seed_source import SeedSource
89
from data_designer_nemo.errors import NDDInternalError, NDDInvalidConfigError
910
from data_designer_nemo.fileset_file_seed_source import FilesetFileSeedSource
11+
from data_designer_nemo.fileset_filesystem_provider import is_local_directory
1012
from data_designer_nemo.secret_resolver import validate_secret
1113
from nemo_platform import AsyncNeMoPlatform, NotFoundError, PermissionDeniedError
1214
from nemo_platform.filesets import FilesetPathError, build_fileset_ref, parse_fileset_ref
1315

1416
logger = logging.getLogger(__name__)
1517

16-
17-
def get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None:
18-
return dd_config.seed_config.source if dd_config.seed_config else None
18+
_SUPPORTED_SEED_TYPES = {"directory", "file_contents", "hf", "nmp"}
19+
_UNSUPPORTED_SEED_TYPES_MESSAGE = (
20+
"The NeMo Platform Data Designer service only supports seed data from HuggingFace "
21+
"or the NeMo Platform Files service (FilesetFile, Directory, or FileContents seed sources "
22+
"referencing fileset paths). Upload your data to the Files service, adjust your config, and try again."
23+
)
24+
_DATAFRAME_SEED_TYPE = "df"
25+
_DATAFRAME_SEED_TYPE_MESSAGE = (
26+
"Dataframe seed sources (seed_type=df) are not supported on the NeMo Platform. TODO: more detail here!"
27+
)
1928

2029

21-
async def validate_seed(dd_config: dd.DataDesignerConfig, workspace: str, sdk: AsyncNeMoPlatform) -> str | None:
22-
if (seed_source := get_seed_source(dd_config)) is None:
30+
async def validate_seed(
31+
dd_config: dd.DataDesignerConfig,
32+
workspace: str,
33+
sdk: AsyncNeMoPlatform,
34+
is_local: bool,
35+
) -> str | None:
36+
if (seed_source := _get_seed_source(dd_config)) is None:
2337
return None
2438

25-
if isinstance(seed_source, dd.HuggingFaceSeedSource) and (token := seed_source.token) is not None:
26-
await validate_secret(sdk, token, workspace)
39+
_validate_seed_type_for_execution_context(
40+
seed_source.seed_type,
41+
is_local=is_local,
42+
)
43+
44+
if isinstance(seed_source, dd.HuggingFaceSeedSource):
45+
# In local execution context, a HF seed source token will always "resolve"
46+
# because the composite secret resolver includes a plaintext resolver.
47+
# In remote execution context, a HF seed source token must be a reference
48+
# to a NeMo Platform secret (if provided).
49+
if not is_local and (token := seed_source.token) is not None:
50+
await validate_secret(sdk, token, workspace)
2751
return None
2852

53+
if is_local and isinstance(seed_source, dd.DirectorySeedSource | dd.FileContentsSeedSource):
54+
if is_local_directory(seed_source.path):
55+
return None
56+
2957
if isinstance(seed_source, FilesetFileSeedSource | dd.DirectorySeedSource | dd.FileContentsSeedSource):
3058
return await _validate_seed_from_files_service(seed_source, workspace, sdk)
3159

@@ -78,3 +106,63 @@ async def _validate_seed_from_files_service(
78106
raise NDDInvalidConfigError(f"Path {fragment!r} not found in fileset {fully_qualified_fileset_name!r}")
79107

80108
return canonical_root
109+
110+
111+
def validate_seed_source_for_execution_context(data: Any, *, is_local: bool) -> None:
    """Reject a raw request whose seed source type is unsupported in this execution context.

    Intended for "before"-mode Pydantic validators on the preview and job request
    models, both of which carry a ``config: dd.DataDesignerConfig`` field. In that
    mode ``data`` is untyped (``Any``). Running before the standard validation lets
    us preempt less helpful errors from the DD library, such as:
      - a missing dataframe field (dataframes are not serialized over the wire)
      - a file that does not exist (the client's local fs != the service's local fs)

    Only the seed type is inspected here. Every other check is still applied by the
    normal Pydantic validation that FastAPI runs when parsing the request, so nothing
    is bypassed. For that reason any failure while digging out the deeply nested
    ``seed_type`` field (most commonly a ``KeyError`` on requests without a
    ``seed_config``) is ignored and treated as "nothing to validate".

    Per the Pydantic v2 contract, "before"-mode validators may raise ``ValueError``,
    ``AssertionError``, or ``PydanticCustomError``. Anything else, including our
    ``NDDInvalidConfigError``, propagates raw out of ``model_validate`` instead of
    being wrapped in ``pydantic.ValidationError``, which breaks ``except
    ValidationError`` clauses in CLI / framework code that turn validation problems
    into clean user-facing messages. To keep those paths working while leaving
    ``NDDInvalidConfigError`` as the canonical error for non-Pydantic callers, the
    error is translated at this boundary into a ``ValueError`` with the same message.

    Args:
        data: Raw, unvalidated request payload.
        is_local: Whether validation is for the local execution context.

    Raises:
        ValueError: If the seed type is not supported in the given context.
    """
    raw_seed_type = _get_raw_seed_type(data)
    if raw_seed_type is not None:
        try:
            _validate_seed_type_for_execution_context(raw_seed_type, is_local=is_local)
        except NDDInvalidConfigError as err:
            # Re-raise as ValueError so Pydantic wraps it in a ValidationError.
            raise ValueError(str(err)) from err
145+
146+
147+
def _validate_seed_type_for_execution_context(seed_type: str, *, is_local: bool) -> None:
    """Raise ``NDDInvalidConfigError`` when ``seed_type`` is unusable in this context.

    Remote execution accepts only the types in ``_SUPPORTED_SEED_TYPES``; local
    execution accepts everything except the dataframe seed type.
    """
    if not is_local:
        if seed_type not in _SUPPORTED_SEED_TYPES:
            raise NDDInvalidConfigError(_UNSUPPORTED_SEED_TYPES_MESSAGE)
        return

    if seed_type == _DATAFRAME_SEED_TYPE:
        raise NDDInvalidConfigError(_DATAFRAME_SEED_TYPE_MESSAGE)
156+
157+
158+
def _get_seed_source(dd_config: dd.DataDesignerConfig) -> SeedSource | None:
159+
return dd_config.seed_config.source if dd_config.seed_config else None
160+
161+
162+
def _get_raw_seed_type(data: Any) -> str | None:
163+
try:
164+
seed_type = data["config"]["seed_config"]["source"]["seed_type"]
165+
except Exception:
166+
return None
167+
168+
return seed_type if isinstance(seed_type, str) else None
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
4+
import data_designer.config as dd
5+
from data_designer_nemo.errors import NDDInvalidConfigError
6+
7+
8+
def validate_no_tool_configs(config: dd.DataDesignerConfig) -> None:
    """Raise ``NDDInvalidConfigError`` if the config declares any tool configs.

    A missing (``None``) or empty ``tool_configs`` is accepted.
    """
    # Truthiness already covers both None and empty collections; no len() needed.
    if not config.tool_configs:
        return
    raise NDDInvalidConfigError("Tool configs are not supported in the NeMo Platform Data Designer service.")

packages/data_designer_nemo/src/data_designer_nemo/unsupported_features.py

Lines changed: 0 additions & 99 deletions
This file was deleted.

0 commit comments

Comments
 (0)