Skip to content

Commit 2eb982c

Browse files
authored
Merge pull request #61 from datakind/Validation-Errors
fix: output specific validation errors
2 parents ea5dc8d + 4914a19 commit 2eb982c

2 files changed

Lines changed: 71 additions & 29 deletions

File tree

src/webapp/validation.py

Lines changed: 51 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import csv
66

77
from collections import Counter
8-
from typing import Final, Any
8+
from typing import Final, Any, NamedTuple
99

1010
from .utilities import SchemaType
1111

@@ -260,6 +260,12 @@
260260
}
261261

262262

263+
class ColumnValidationResult(NamedTuple):
264+
is_valid: bool
265+
unexpected_columns: list[str]
266+
missing_required_columns: list[str]
267+
268+
263269
def validate_file(filename: str, allowed_types: set[SchemaType]) -> set[SchemaType]:
264270
"""Validates given a filename."""
265271
with open(filename) as f:
@@ -299,26 +305,55 @@ def get_col_names(f: Any) -> Any:
299305

300306
def detect_file_type(col_names: list[str]) -> set[SchemaType]:
301307
"""Returns all schemas that match for a list of col names."""
302-
res = set()
303-
for schema, schema_cols in SCHEMA_TYPE_TO_COLS.items():
308+
matches = set()
309+
errors = {}
310+
311+
for schema, expected_cols in SCHEMA_TYPE_TO_COLS.items():
304312
optional_cols = SCHEMA_TYPE_TO_OPTIONAL_COLS[schema]
305-
if valid_subset_lists(schema_cols, col_names, optional_cols):
306-
res.add(schema)
307-
if not res:
308-
# If it doesn't match any, it will match unknown.
309-
res.add(SchemaType.UNKNOWN)
310-
return res
313+
result = valid_subset_lists(expected_cols, col_names, optional_cols)
314+
315+
if result.is_valid:
316+
matches.add(schema)
317+
else:
318+
errors[schema.name] = result
319+
320+
if matches:
321+
return matches
322+
323+
error_msgs = []
324+
for schema_name, res in errors.items():
325+
msg = f"\nSchema: {schema_name}"
326+
if res.unexpected_columns:
327+
msg += f"\n Unexpected columns: {res.unexpected_columns}"
328+
if res.missing_required_columns:
329+
msg += f"\n Missing required columns: {res.missing_required_columns}"
330+
error_msgs.append(msg)
331+
332+
raise ValueError(
333+
"No valid schema matched. Details of mismatches:\n" + "\n".join(error_msgs)
334+
)
311335

312336

313337
def valid_subset_lists(
314-
superset_list: list[str], subset_list: list[str], optional_list: list[str]
315-
) -> bool:
338+
expected: list[str], actual: list[str], optional_list: list[str]
339+
) -> ColumnValidationResult:
316340
"""Checks if `actual` is a subset of or equivalent to `expected`. And if so,
317341
whether the missing values are all present in the optional list. This method disregards order
318342
but cares about duplicates."""
319343
# Collects any value in `actual` that is NOT present in `expected`.
320-
if Counter(subset_list) - Counter(superset_list):
321-
# This is not a valid state, users should not be passing in unrecognized columns.
322-
return False
323-
missing_vals = Counter(superset_list) - Counter(subset_list)
324-
return not Counter(missing_vals) - Counter(optional_list)
344+
expected_counter = Counter(expected)
345+
actual_counter = Counter(actual)
346+
347+
unexpected = list((actual_counter - expected_counter).elements())
348+
349+
# Columns expected but missing (excluding optional)
350+
missing_total = list((expected_counter - actual_counter).elements())
351+
missing_required = [col for col in missing_total if col not in optional_list]
352+
353+
is_valid = not unexpected and not missing_required
354+
355+
return ColumnValidationResult(
356+
is_valid=is_valid,
357+
unexpected_columns=unexpected,
358+
missing_required_columns=missing_required,
359+
)

src/webapp/validation_test.py

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,16 +19,25 @@ def test_get_col_names():
1919

2020

2121
def test_valid_subset_lists():
22-
"""Testing valid subset checking."""
22+
"""Testing valid subset checking with detailed results."""
2323
list_a = [1, 2, 3, 4, 5, 6]
2424
list_b = [1, 2, 3, 4, 6]
2525
list_c = [5]
2626
list_d = [3, 4]
27-
assert valid_subset_lists(list_a, list_b, list_c)
28-
# Missing value is not in the optional list.
29-
assert not valid_subset_lists(list_a, list_b, list_d)
30-
# Subset has an additional element not found in superset.
31-
assert not valid_subset_lists(list_b, list_a, list_c)
27+
28+
result1 = valid_subset_lists(list_a, list_b, list_c)
29+
assert result1.is_valid
30+
assert result1.unexpected_columns == []
31+
assert result1.missing_required_columns == []
32+
33+
result2 = valid_subset_lists(list_a, list_b, list_d)
34+
assert not result2.is_valid
35+
assert result2.unexpected_columns == []
36+
assert result2.missing_required_columns == [5]
37+
38+
result3 = valid_subset_lists(list_b, list_a, list_c)
39+
assert not result3.is_valid
40+
assert result3.unexpected_columns == [5]
3241

3342

3443
def test_detect_file_type():
@@ -45,7 +54,9 @@ def test_detect_file_type():
4554
with open("src/webapp/test_files/cohort_pdp.csv", encoding="utf-8") as f:
4655
assert detect_file_type(get_col_names(f)) == {SchemaType.PDP_COHORT}
4756
with open("src/webapp/test_files/test_upload.csv", encoding="utf-8") as f:
48-
assert detect_file_type(get_col_names(f)) == {SchemaType.UNKNOWN}
57+
with pytest.raises(ValueError) as err:
58+
detect_file_type(get_col_names(f))
59+
assert "No valid schema matched" in str(err.value)
4960
with open("src/webapp/test_files/malformed.csv", encoding="utf-8") as f:
5061
with pytest.raises(ValueError) as err:
5162
detect_file_type(get_col_names(f))
@@ -75,9 +86,5 @@ def test_validate_file():
7586
validate_file(
7687
"src/webapp/test_files/test_upload.csv", [SchemaType.SST_PDP_FINANCE]
7788
)
78-
assert str(err.value) == "Some file schema/columns are not recognized"
79-
with pytest.raises(ValueError) as err:
80-
validate_file(
81-
"src/webapp/test_files/malformed.csv", [SchemaType.SST_PDP_FINANCE]
82-
)
83-
assert str(err.value) == "CSV file malformed: Could not determine delimiter"
89+
assert "No valid schema matched." in str(err.value)
90+
assert "Unexpected columns" in str(err.value)

0 commit comments

Comments
 (0)