3838from pandera .errors import SchemaError , SchemaErrors
3939
4040from edvise .dataio .read import read_raw_pdp_cohort_data , read_raw_pdp_course_data
41- from edvise .dataio .pdp_cohort_converters import converter_func_cohort
4241from edvise .utils .data_cleaning import handling_duplicates
4342
4443from . import validation_pdp_edvise as pdp_edvise
@@ -80,7 +79,11 @@ def validate_file_reader(
8079 pdp_cohort_converter_func : PDPConverterFunc = None ,
8180 pdp_course_converter_func : PDPConverterFunc = None ,
8281) -> dict [str , Any ]:
83- """Validates a dataset given a filename and schema selection.
82+ """
83+ Validate a CSV from a path or file-like handle against schema selection.
84+
85+ Thin wrapper around :func:`validate_dataset` with the same arguments
86+ reordered for call sites that pass ``allowed_schema`` first.
8487
8588 Args:
8689 filename: Path or file-like object for the CSV.
@@ -90,16 +93,17 @@ def validate_file_reader(
9093 institution_id: Key into inst_schema["institutions"]: "edvise", "pdp",
9194 "legacy" (any-format), or institution UUID for custom. Default "pdp".
9295 institution_identifier: Optional institution identifier (e.g. UUID) for display/context.
93- pdp_cohort_converter_func: Optional custom PDP cohort converter (school-specific).
94- pdp_course_converter_func: Optional custom PDP course converter (school-specific).
96+ pdp_cohort_converter_func: Optional cohort row transform before Pandera; default
97+ None. Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
98+ pdp_course_converter_func: Optional course converter; default duplicate handling only.
9599
96100 Returns:
97- Dict with validation_status, schemas, missing_optional, unknown_extra_columns.
98- On success also contains " normalized_df" (DataFrame, or None if nothing was validated).
101+ Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
102+ and on success normalized_df (DataFrame, or None if nothing was validated).
99103
100104 Raises:
101- HardValidationError: When required columns are missing or schema validation fails.
102- UnicodeError: When file encoding is not UTF-8/UTF-16/UTF-32 .
105+ HardValidationError: When required columns are missing, schema validation fails,
106+ or encoding cannot be resolved (decode failures use failure_cases, not UnicodeError) .
103107 """
104108 return validate_dataset (
105109 filename ,
@@ -751,7 +755,8 @@ def _compute_model_list_and_merged_specs(
751755
752756
753757# --------------------------------------------------------------------------- #
754- # PDP: use edvise read + validate (single source of truth)
758+ # PDP single-model path: edvise read + Pandera validate. Cohort converter defaults
759+ # to None so validated row sets can differ from batch jobs that use dataio converters.
755760# --------------------------------------------------------------------------- #
756761
757762# Datetime formats to try for PDP course (same order as pdp_data_audit)
@@ -798,7 +803,7 @@ def _convert_pdp_schema_errors_to_hard(
798803def _read_pdp_validated_dataframe (
799804 path : str ,
800805 model_set : set [str ],
801- cohort_converter : Callable [[ pd . DataFrame ], pd . DataFrame ] ,
806+ cohort_converter : PDPConverterFunc ,
802807 course_converter_func : PDPConverterFunc ,
803808) -> pd .DataFrame :
804809 """Read and validate PDP cohort or course data; return validated DataFrame or raise."""
@@ -876,23 +881,25 @@ def _read_pdp_course_edvise(
876881 course_converter_func : PDPConverterFunc = None ,
877882) -> pd .DataFrame :
878883 """
879- Read and validate PDP course data via edvise (same as pipeline) .
884+ Read and validate a PDP course CSV using edvise helpers .
880885
881- Tries each datetime format with each converter. If a custom
882- course_converter_func is provided (e.g. from a school), it is tried first;
883- then :func:`_default_pdp_course_duplicate_converter` (``handling_duplicates``
884- with PDP settings). Raises HardValidationError if all attempts fail.
886+ Tries each value in ``PDP_COURSE_DTTM_FORMATS`` with each converter: optional
887+ ``course_converter_func`` first, then :func:`_default_pdp_course_duplicate_converter`.
888+
889+ Batch PDP jobs may also try school-specific converters from ``dataio``; this
890+ path only runs converters passed in here, so results may differ from those jobs.
885891
886892 Args:
887893 path: Path to course CSV.
888- course_converter_func: Optional custom converter (e.g. converter_func_course)
889- that schools can provide; if None, only default converters are used.
894+ course_converter_func: Optional school-specific converter; if None, only the
895+ default duplicate-handling converter is used.
890896
891897 Returns:
892- Validated DataFrame (same as pipeline output).
898+ Validated DataFrame from ``read_raw_pdp_course_data`` for the first successful
899+ converter and datetime format.
893900
894901 Raises:
895- HardValidationError: If no ( converter, format) pair succeeded .
902+ HardValidationError: If every converter and format combination fails .
896903 """
897904 default_converters = (_default_pdp_course_duplicate_converter ,)
898905 converters = (
@@ -944,38 +951,38 @@ def _validate_pdp_with_edvise_read(
944951 pdp_course_converter_func : PDPConverterFunc = None ,
945952) -> Dict [str , Any ]:
946953 """
947- Validate PDP cohort or course via edvise read + schema (same as pipeline) .
954+ Validate a single-model PDP cohort or course file via edvise read and Pandera .
948955
949- Resolves filename to a path ( temp file if file-like) , then calls
950- read_raw_pdp_cohort_data or read_raw_pdp_course_data. Uses the same
951- converter functions as the edvise repo: cohort converter filters dual
952- enrollment students (DE/DS/SE); course converter handles duplicates.
953- Schools can provide custom converters via the optional func args .
956+ Writes file-like inputs to a temp path , then calls ``read_raw_pdp_cohort_data``
957+ (STUDENT) or ``_read_pdp_course_edvise`` (COURSE). Cohort rows are only
958+ transformed when ``pdp_cohort_converter_func`` is set; batch jobs may still
959+ filter cohort rows via ``dataio``, so API output rows are not guaranteed to
960+ match pipeline output for the same file .
954961
955962 Args:
956- filename: Path or file-like to CSV.
957- enc: Encoding (from sniff_encoding) for file-like decode.
958- model_list: Single model, e.g. ["STUDENT"] or ["COURSE"].
959- institution_id: Institution schema key (e.g. "pdp").
960- pdp_cohort_converter_func: Optional custom cohort converter; if None,
961- uses converter_func_cohort from edvise (filters DE/DS/SE).
962- pdp_course_converter_func: Optional custom course converter (e.g.
963- converter_func_course); if None, uses default handling_duplicates.
963+ filename: Path or file-like CSV source.
964+ enc: Encoding from :func:`sniff_encoding` (used when materializing file-like input).
965+ model_list: Exactly one model, e.g. ``["STUDENT"]`` or ``["COURSE"]``.
966+ institution_id: Schema namespace (e.g. ``"pdp"``); reserved for callers and logging.
967+ pdp_cohort_converter_func: Optional ``DataFrame -> DataFrame`` step before cohort
968+ schema validation; ``None`` means validate rows as read.
969+ pdp_course_converter_func: Optional course converter before default duplicate handling.
964970
965971 Returns:
966- Dict with validation_status, schemas, missing_optional,
967- unknown_extra_columns, normalized_df.
972+ Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
973+ and normalized_df on success .
968974
969975 Raises:
970- HardValidationError: If read/schema fails (or SchemaErrors converted).
976+ HardValidationError: If converters are non-callable, read fails, or Pandera
977+ validation fails (including converted SchemaErrors).
971978 """
972979 _reset_to_start_if_possible (filename )
973980 model_set = {str (m ).strip ().upper () for m in model_list if m }
974981
975982 _validate_pdp_converter_callables (
976983 pdp_cohort_converter_func , pdp_course_converter_func
977984 )
978- cohort_converter = pdp_cohort_converter_func or converter_func_cohort
985+ cohort_converter = pdp_cohort_converter_func
979986
980987 with _path_for_edvise_read (filename , enc ) as path :
981988 try :
@@ -1104,17 +1111,30 @@ def validate_dataset(
11041111 pdp_course_converter_func : PDPConverterFunc = None ,
11051112) -> Dict [str , Any ]:
11061113 """
1107- Validate a dataset against merged base/extension schemas.
1114+ Validate a dataset against merged base and optional extension schemas.
1115+
1116+ Detects encoding, merges institution column specs, then routes to legacy
1117+ any-format handling, PDP edvise read (single-model STUDENT/COURSE), or
1118+ JSON Pandera validation. ``institution_id == "legacy"`` skips column schema checks.
11081119
1109- Steps: encoding, merge specs, header pass, typed read, then PDP repo schema
1110- (if applicable) or JSON-based validation. Returns dict with validation_status,
1111- schemas, normalized_df (or None if empty merged_specs). Raises HardValidationError
1112- on failure; UnicodeError if encoding is not UTF-8/UTF-16/UTF-32.
1113- Legacy institutions (institution_id=="legacy"): accept any format (encoding + CSV read only).
1120+ Args:
1121+ filename: CSV path or file-like object.
1122+ base_schema: Base schema dict (e.g. base.data_models).
1123+ ext_schema: Optional extension schema with institutions.* blocks.
1124+ models: Model name(s) to validate; ``None`` follows merged_specs resolution.
1125+ institution_id: Institutions key, or ``"legacy"`` for encoding-only validation.
1126+ institution_identifier: Optional UUID string for caller context (e.g. Edvise).
1127+ pdp_cohort_converter_func: Optional cohort transform before Pandera; default ``None``.
1128+ Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
1129+ pdp_course_converter_func: Optional course converter before default duplicate handling.
1130+
1131+ Returns:
1132+ Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
1133+ and normalized_df (``None`` when merged_specs is empty).
11141134
1115- For PDP uploads, optional pdp_cohort_converter_func and pdp_course_converter_func
1116- allow schools to supply custom converters (e.g. from config); if None, edvise
1117- defaults are used (cohort: filter DE/DS/SE; course: handling_duplicates ).
1135+ Raises:
1136+ HardValidationError: On decode failure, missing columns, schema errors, or
1137+ other validation failures (including Unicode decode issues from sniff_encoding ).
11181138 """
11191139 try :
11201140 enc = sniff_encoding (filename )
@@ -1137,7 +1157,7 @@ def validate_dataset(
11371157 "normalized_df" : None ,
11381158 }
11391159
1140- # PDP single-model: use edvise read + validate (same as pipeline)
1160+ # Route PDP STUDENT/COURSE to edvise read path (cohort converter optional; see section above).
11411161 if pdp_edvise .get_edvise_schema_for_upload (institution_id , model_list ) is not None :
11421162 return _validate_pdp_with_edvise_read (
11431163 filename ,
0 commit comments