Skip to content

Commit 0522ec1

Browse files
wip
1 parent 6c0eaae commit 0522ec1

2 files changed

Lines changed: 16 additions & 26 deletions

File tree

src/plaid/cli/plaidcheck.py

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -331,8 +331,14 @@ def _check_problem_definition_sample_features(
331331
"""Instantiate and validate one problem-definition sample view.
332332
333333
The sample is instantiated with the exact feature subset requested by the
334-
problem definition, then each requested feature is read back and checked for
335-
invalid content (None, NaN, Inf, empty arrays, object arrays containing None).
334+
problem definition, then each requested feature is read back to validate
335+
that the requested feature paths can actually be resolved.
336+
337+
Numeric content (NaN, Inf, None, empty arrays, ...) is intentionally not
338+
re-checked here: the per-split loop in :func:`check_dataset` already walks
339+
every sample's globals and fields and reports such issues with the
340+
``INVALID_DATA_VALUE`` code. Re-checking them in this loop would only
341+
produce duplicate warnings under a different code/location.
336342
337343
Args:
338344
pb_name: Problem-definition name.
@@ -358,24 +364,14 @@ def _check_problem_definition_sample_features(
358364

359365
for feature in features:
360366
try:
361-
value = sample.get_feature_by_path(feature)
367+
sample.get_feature_by_path(feature)
362368
except Exception as exc:
363369
report.add(
364370
"error",
365371
"PB_DEF_FEATURE_READ_ERROR",
366372
f"{location} {feature}",
367373
str(exc),
368374
)
369-
continue
370-
371-
issue = _check_numeric_content(value)
372-
if issue is not None:
373-
report.add(
374-
"warning",
375-
"PB_DEF_INVALID_FEATURE_VALUE",
376-
f"{location} {feature}",
377-
issue,
378-
)
379375

380376

381377
def compute_checksum(sample: Any) -> str:

tests/cli/test_plaidcheck.py

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -480,12 +480,10 @@ def test_check_problem_definition_sample_reports_feature_read_error_and_continue
480480
and msg.message == "cannot read Unreadable"
481481
for msg in report.messages
482482
)
483-
assert any(
484-
msg.severity == "warning"
485-
and msg.code == "PB_DEF_INVALID_FEATURE_VALUE"
486-
and msg.location == "problem_definitions/pb/test_split/test[0] BadValue"
487-
and msg.message == "contains NaN"
488-
for msg in report.messages
483+
# Numeric content is intentionally not re-checked here: it is already
484+
# validated by the per-split loop in `check_dataset`.
485+
assert not any(
486+
msg.code == "PB_DEF_INVALID_FEATURE_VALUE" for msg in report.messages
489487
)
490488

491489

@@ -847,14 +845,10 @@ class _PBDef:
847845

848846
assert train_converter.feature_requests[-1] == ["Input", "Output"]
849847
assert test_converter.feature_requests[-1] == ["Input"]
850-
invalid = [
851-
msg for msg in report.messages if msg.code == "PB_DEF_INVALID_FEATURE_VALUE"
852-
]
853-
assert any(
854-
"train_split" in msg.location and "Output" in msg.location for msg in invalid
855-
)
848+
# The pb-def loop no longer re-checks numeric content; it only verifies that
849+
# the requested feature subset can be converted and read back.
856850
assert not any(
857-
"test_split" in msg.location and "Output" in msg.location for msg in invalid
851+
msg.code == "PB_DEF_INVALID_FEATURE_VALUE" for msg in report.messages
858852
)
859853

860854

0 commit comments

Comments
 (0)