Skip to content

Commit 6b44fd7

Browse files
authored
Merge pull request #45 from datakind/sftp-ingestion
fix: reordered function definitions
2 parents 7132dac + ae03159 commit 6b44fd7

2 files changed

Lines changed: 39 additions & 39 deletions

File tree

src/webapp/routers/data.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -872,7 +872,7 @@ def validation_helper(
872872
allowed_schemas = set()
873873
if inst_query_result[0][0].schemas:
874874
allowed_schemas = set(inst_query_result[0][0].schemas)
875-
print(f"Allowed schemas: {allowed_schemas}")
875+
876876
inferred_schemas = set()
877877
try:
878878
inferred_schemas = storage_control.validate_file(

src/webapp/validation.py

Lines changed: 38 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -260,39 +260,26 @@
260260
}
261261

262262

263-
def valid_subset_lists(
264-
superset_list: list[str], subset_list: list[str], optional_list: list[str]
265-
) -> bool:
266-
"""Checks if the subset_list is a subset of or equivalent to superset_list. And if so,
267-
whether the missing values are all present in the optional list. This method disregards order
268-
but cares about duplicates."""
269-
# Checks if any value in subset list is NOT present in superset list.
270-
if Counter(subset_list) - Counter(superset_list):
271-
# This is not a valid state, users should not be passing in unrecognized columns.
272-
return False
273-
missing_vals = Counter(superset_list) - Counter(subset_list)
274-
return not Counter(missing_vals) - Counter(optional_list)
275-
276-
277-
def detect_file_type(col_names: list[str]) -> set[SchemaType]:
278-
"""Returns all schemas that match for a list of col names."""
279-
res = set()
280-
for schema, schema_cols in SCHEMA_TYPE_TO_COLS.items():
281-
optional_cols = SCHEMA_TYPE_TO_OPTIONAL_COLS[schema]
282-
if valid_subset_lists(schema_cols, col_names, optional_cols):
283-
res.add(schema)
284-
if not res:
285-
# If it doesn't match any, it will match unknown.
286-
res.add(SchemaType.UNKNOWN)
287-
return res
288-
289-
290263
def validate_file(filename: str, allowed_types: set[SchemaType]) -> set[SchemaType]:
291264
"""Validates given a filename."""
292265
with open(filename) as f:
293266
return validate_file_reader(f, allowed_types)
294267

295268

269+
def validate_file_reader(
270+
reader: Any, allowed_types: set[SchemaType]
271+
) -> set[SchemaType]:
272+
"""Validates given a reader. Returns only if a valid format was found, otherwise raises error"""
273+
if not allowed_types:
274+
raise ValueError("CSV file schema not recognized")
275+
276+
file_columns = get_col_names(reader)
277+
res = detect_file_type(file_columns)
278+
if any(i in allowed_types for i in res):
279+
return res
280+
raise ValueError("Some file schema/columns are not recognized")
281+
282+
296283
def get_col_names(f: Any) -> Any:
297284
"""Get column names."""
298285
try:
@@ -310,15 +297,28 @@ def get_col_names(f: Any) -> Any:
310297
return col_names
311298

312299

313-
def validate_file_reader(
314-
reader: Any, allowed_types: set[SchemaType]
315-
) -> set[SchemaType]:
316-
"""Validates given a reader. Returns only if a valid format was found, otherwise raises error"""
317-
if not allowed_types:
318-
raise ValueError("CSV file schema not recognized")
300+
def detect_file_type(col_names: list[str]) -> set[SchemaType]:
301+
"""Returns all schemas that match for a list of col names."""
302+
res = set()
303+
for schema, schema_cols in SCHEMA_TYPE_TO_COLS.items():
304+
optional_cols = SCHEMA_TYPE_TO_OPTIONAL_COLS[schema]
305+
if valid_subset_lists(schema_cols, col_names, optional_cols):
306+
res.add(schema)
307+
if not res:
308+
# If it doesn't match any, it will match unknown.
309+
res.add(SchemaType.UNKNOWN)
310+
return res
319311

320-
file_columns = get_col_names(reader)
321-
res = detect_file_type(file_columns)
322-
if any(i in allowed_types for i in res):
323-
return res
324-
raise ValueError("Some file schema/columns are not recognized")
312+
313+
def valid_subset_lists(
314+
superset_list: list[str], subset_list: list[str], optional_list: list[str]
315+
) -> bool:
316+
"""Checks if the subset_list is a subset of or equivalent to superset_list. And if so,
317+
whether the missing values are all present in the optional list. This method disregards order
318+
but cares about duplicates."""
319+
# Checks if any value in subset list is NOT present in superset list.
320+
if Counter(subset_list) - Counter(superset_list):
321+
# This is not a valid state, users should not be passing in unrecognized columns.
322+
return False
323+
missing_vals = Counter(superset_list) - Counter(subset_list)
324+
return not Counter(missing_vals) - Counter(optional_list)

0 commit comments

Comments
 (0)