260260}
261261
262262
263- def valid_subset_lists (
264- superset_list : list [str ], subset_list : list [str ], optional_list : list [str ]
265- ) -> bool :
266- """Checks if the subset_list is a subset of or equivalent to superset_list. And if so,
267- whether the missing values are all present in the optional list. This method disregards order
268- but cares about duplicates."""
269- # Checks if any value in subset list is NOT present in superset list.
270- if Counter (subset_list ) - Counter (superset_list ):
271- # This is not a valid state, users should not be passing in unrecognized columns.
272- return False
273- missing_vals = Counter (superset_list ) - Counter (subset_list )
274- return not Counter (missing_vals ) - Counter (optional_list )
275-
276-
277- def detect_file_type (col_names : list [str ]) -> set [SchemaType ]:
278- """Returns all schemas that match for a list of col names."""
279- res = set ()
280- for schema , schema_cols in SCHEMA_TYPE_TO_COLS .items ():
281- optional_cols = SCHEMA_TYPE_TO_OPTIONAL_COLS [schema ]
282- if valid_subset_lists (schema_cols , col_names , optional_cols ):
283- res .add (schema )
284- if not res :
285- # If it doesn't match any, it will match unknown.
286- res .add (SchemaType .UNKNOWN )
287- return res
288-
289-
def validate_file(filename: str, allowed_types: set[SchemaType]) -> set[SchemaType]:
    """Validate the file at ``filename`` against the allowed schema types.

    The file is opened for reading with ``open``'s defaults and the open
    handle is passed to ``validate_file_reader``, whose result is returned
    unchanged. Any error raised by ``validate_file_reader`` propagates to
    the caller.
    """
    with open(filename) as handle:
        detected_types = validate_file_reader(handle, allowed_types)
    return detected_types
294267
295268
def validate_file_reader(
    reader: Any, allowed_types: set[SchemaType]
) -> set[SchemaType]:
    """Validate the content behind ``reader`` against the allowed schema types.

    The column names are read with ``get_col_names`` and matched to schema
    types with ``detect_file_type``.

    Returns:
        The complete set of detected schema types, provided at least one of
        them is in ``allowed_types``. The returned set is not filtered, so it
        can also contain types that are not in ``allowed_types``.

    Raises:
        ValueError: If ``allowed_types`` is empty, or if none of the detected
            schema types is in ``allowed_types``.
    """
    # With nothing allowed, no file can validate; fail before reading anything.
    if not allowed_types:
        raise ValueError("CSV file schema not recognized")

    detected_types = detect_file_type(get_col_names(reader))
    for schema_type in detected_types:
        if schema_type in allowed_types:
            # One allowed match is enough to accept the whole detected set.
            return detected_types
    raise ValueError("Some file schema/columns are not recognized")
281+
282+
296283def get_col_names (f : Any ) -> Any :
297284 """Get column names."""
298285 try :
@@ -310,15 +297,28 @@ def get_col_names(f: Any) -> Any:
310297 return col_names
311298
312299
313- def validate_file_reader (
314- reader : Any , allowed_types : set [SchemaType ]
315- ) -> set [SchemaType ]:
316- """Validates given a reader. Returns only if a valid format was found, otherwise raises error"""
317- if not allowed_types :
318- raise ValueError ("CSV file schema not recognized" )
def detect_file_type(col_names: list[str]) -> set[SchemaType]:
    """Return every schema type whose column definition matches ``col_names``.

    A schema type matches when ``valid_subset_lists`` accepts ``col_names``
    against that schema's columns from ``SCHEMA_TYPE_TO_COLS`` and its
    optional columns from ``SCHEMA_TYPE_TO_OPTIONAL_COLS``. When no schema
    type matches, the result is the single-element set
    ``{SchemaType.UNKNOWN}``.
    """
    matching_types = {
        schema_type
        for schema_type, schema_cols in SCHEMA_TYPE_TO_COLS.items()
        if valid_subset_lists(
            schema_cols, col_names, SCHEMA_TYPE_TO_OPTIONAL_COLS[schema_type]
        )
    }
    # An empty match set means the columns fit no known schema.
    return matching_types or {SchemaType.UNKNOWN}
319311
320- file_columns = get_col_names (reader )
321- res = detect_file_type (file_columns )
322- if any (i in allowed_types for i in res ):
323- return res
324- raise ValueError ("Some file schema/columns are not recognized" )
312+
def valid_subset_lists(
    superset_list: list[str], subset_list: list[str], optional_list: list[str]
) -> bool:
    """Check that ``subset_list`` fits inside ``superset_list`` up to optional values.

    Order is ignored but duplicates count: every comparison is made on
    per-value occurrence counts.

    Args:
        superset_list: The full collection of expected values.
        subset_list: The values actually supplied.
        optional_list: Values of ``superset_list`` that may be left out.

    Returns:
        True when ``subset_list`` contains nothing beyond ``superset_list``
        and every value of ``superset_list`` that it leaves out is covered
        by ``optional_list``; False otherwise.
    """
    # Count each argument exactly once. Besides avoiding repeated work, this
    # keeps the result correct if a caller passes a one-shot iterable, which
    # would already be exhausted on a second counting pass.
    superset_counts = Counter(superset_list)
    subset_counts = Counter(subset_list)

    # Counter subtraction keeps only positive counts, so a non-empty result
    # means subset_list holds a value (or a duplicate) that superset_list
    # does not allow.
    if subset_counts - superset_counts:
        return False

    # Whatever superset_list has that subset_list lacks must be optional.
    missing_counts = superset_counts - subset_counts
    return not missing_counts - Counter(optional_list)
0 commit comments