Merge pull request #227 from datakind/fix/remove-cohort-converter-validation

vishpillai123 · web-flow · commit 7dbc3622d837 · 2026-04-08T14:21:03.000-04:00
fix: default cohort converter to none
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
@@ -38,7 +38,6 @@
 from pandera.errors import SchemaError, SchemaErrors
 
 from edvise.dataio.read import read_raw_pdp_cohort_data, read_raw_pdp_course_data
-from edvise.dataio.pdp_cohort_converters import converter_func_cohort
 from edvise.utils.data_cleaning import handling_duplicates
 
 from . import validation_pdp_edvise as pdp_edvise
@@ -80,7 +79,11 @@ def validate_file_reader(
     pdp_cohort_converter_func: PDPConverterFunc = None,
     pdp_course_converter_func: PDPConverterFunc = None,
 ) -> dict[str, Any]:
-    """Validates a dataset given a filename and schema selection.
+    """
+    Validate a CSV from a path or file-like handle against schema selection.
+
+    Thin wrapper around :func:`validate_dataset` with the same arguments
+    reordered for call sites that pass ``allowed_schema`` first.
 
     Args:
         filename: Path or file-like object for the CSV.
@@ -90,16 +93,17 @@ def validate_file_reader(
         institution_id: Key into inst_schema["institutions"]: "edvise", "pdp",
             "legacy" (any-format), or institution UUID for custom. Default "pdp".
         institution_identifier: Optional institution identifier (e.g. UUID) for display/context.
-        pdp_cohort_converter_func: Optional custom PDP cohort converter (school-specific).
-        pdp_course_converter_func: Optional custom PDP course converter (school-specific).
+        pdp_cohort_converter_func: Optional cohort row transform before Pandera; default
+            None. Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
+        pdp_course_converter_func: Optional course converter; default duplicate handling only.
 
     Returns:
-        Dict with validation_status, schemas, missing_optional, unknown_extra_columns.
-        On success also contains "normalized_df" (DataFrame, or None if nothing was validated).
+        Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
+        and on success normalized_df (DataFrame, or None if nothing was validated).
 
     Raises:
-        HardValidationError: When required columns are missing or schema validation fails.
-        UnicodeError: When file encoding is not UTF-8/UTF-16/UTF-32.
+        HardValidationError: When required columns are missing, schema validation fails,
+            or encoding cannot be resolved (decode failures use failure_cases, not UnicodeError).
     """
     return validate_dataset(
         filename,
@@ -751,7 +755,8 @@ def _compute_model_list_and_merged_specs(
 
 
 # --------------------------------------------------------------------------- #
-# PDP: use edvise read + validate (single source of truth)
+# PDP single-model path: edvise read + Pandera validate. Cohort converter defaults
+# to None so validated row sets can differ from batch jobs that use dataio converters.
 # --------------------------------------------------------------------------- #
 
 # Datetime formats to try for PDP course (same order as pdp_data_audit)
@@ -798,7 +803,7 @@ def _convert_pdp_schema_errors_to_hard(
 def _read_pdp_validated_dataframe(
     path: str,
     model_set: set[str],
-    cohort_converter: Callable[[pd.DataFrame], pd.DataFrame],
+    cohort_converter: PDPConverterFunc,
     course_converter_func: PDPConverterFunc,
 ) -> pd.DataFrame:
     """Read and validate PDP cohort or course data; return validated DataFrame or raise."""
@@ -876,23 +881,25 @@ def _read_pdp_course_edvise(
     course_converter_func: PDPConverterFunc = None,
 ) -> pd.DataFrame:
     """
-    Read and validate PDP course data via edvise (same as pipeline).
+    Read and validate a PDP course CSV using edvise helpers.
 
-    Tries each datetime format with each converter. If a custom
-    course_converter_func is provided (e.g. from a school), it is tried first;
-    then :func:`_default_pdp_course_duplicate_converter` (``handling_duplicates``
-    with PDP settings). Raises HardValidationError if all attempts fail.
+    Tries each value in ``PDP_COURSE_DTTM_FORMATS`` with each converter: optional
+    ``course_converter_func`` first, then :func:`_default_pdp_course_duplicate_converter`.
+
+    Batch PDP jobs may also try school-specific converters from ``dataio``; this
+    path only runs converters passed in here, so results may differ from those jobs.
 
     Args:
         path: Path to course CSV.
-        course_converter_func: Optional custom converter (e.g. converter_func_course)
-            that schools can provide; if None, only default converters are used.
+        course_converter_func: Optional school-specific converter; if None, only the
+            default duplicate-handling converter is used.
 
     Returns:
-        Validated DataFrame (same as pipeline output).
+        Validated DataFrame from ``read_raw_pdp_course_data`` for the first successful
+        converter and datetime format.
 
     Raises:
-        HardValidationError: If no (converter, format) pair succeeded.
+        HardValidationError: If every converter and format combination fails.
     """
     default_converters = (_default_pdp_course_duplicate_converter,)
     converters = (
@@ -944,38 +951,38 @@ def _validate_pdp_with_edvise_read(
     pdp_course_converter_func: PDPConverterFunc = None,
 ) -> Dict[str, Any]:
     """
-    Validate PDP cohort or course via edvise read + schema (same as pipeline).
+    Validate a single-model PDP cohort or course file via edvise read and Pandera.
 
-    Resolves filename to a path (temp file if file-like), then calls
-    read_raw_pdp_cohort_data or read_raw_pdp_course_data. Uses the same
-    converter functions as the edvise repo: cohort converter filters dual
-    enrollment students (DE/DS/SE); course converter handles duplicates.
-    Schools can provide custom converters via the optional func args.
+    Writes file-like inputs to a temp path, then calls ``read_raw_pdp_cohort_data``
+    (STUDENT) or ``_read_pdp_course_edvise`` (COURSE). Cohort rows are only
+    transformed when ``pdp_cohort_converter_func`` is set; batch jobs may still
+    filter cohort rows via ``dataio``, so API output rows are not guaranteed to
+    match pipeline output for the same file.
 
     Args:
-        filename: Path or file-like to CSV.
-        enc: Encoding (from sniff_encoding) for file-like decode.
-        model_list: Single model, e.g. ["STUDENT"] or ["COURSE"].
-        institution_id: Institution schema key (e.g. "pdp").
-        pdp_cohort_converter_func: Optional custom cohort converter; if None,
-            uses converter_func_cohort from edvise (filters DE/DS/SE).
-        pdp_course_converter_func: Optional custom course converter (e.g.
-            converter_func_course); if None, uses default handling_duplicates.
+        filename: Path or file-like CSV source.
+        enc: Encoding from :func:`sniff_encoding` (used when materializing file-like input).
+        model_list: Exactly one model, e.g. ``["STUDENT"]`` or ``["COURSE"]``.
+        institution_id: Schema namespace (e.g. ``"pdp"``); reserved for callers and logging.
+        pdp_cohort_converter_func: Optional ``DataFrame -> DataFrame`` step before cohort
+            schema validation; ``None`` means validate rows as read.
+        pdp_course_converter_func: Optional course converter before default duplicate handling.
 
     Returns:
-        Dict with validation_status, schemas, missing_optional,
-        unknown_extra_columns, normalized_df.
+        Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
+        and normalized_df on success.
 
     Raises:
-        HardValidationError: If read/schema fails (or SchemaErrors converted).
+        HardValidationError: If converters are non-callable, read fails, or Pandera
+            validation fails (including converted SchemaErrors).
     """
     _reset_to_start_if_possible(filename)
     model_set = {str(m).strip().upper() for m in model_list if m}
 
     _validate_pdp_converter_callables(
         pdp_cohort_converter_func, pdp_course_converter_func
     )
-    cohort_converter = pdp_cohort_converter_func or converter_func_cohort
+    cohort_converter = pdp_cohort_converter_func
 
     with _path_for_edvise_read(filename, enc) as path:
         try:
@@ -1104,17 +1111,30 @@ def validate_dataset(
     pdp_course_converter_func: PDPConverterFunc = None,
 ) -> Dict[str, Any]:
     """
-    Validate a dataset against merged base/extension schemas.
+    Validate a dataset against merged base and optional extension schemas.
+
+    Detects encoding, merges institution column specs, then routes to legacy
+    any-format handling, PDP edvise read (single-model STUDENT/COURSE), or
+    JSON Pandera validation. ``institution_id == "legacy"`` skips column schema checks.
 
-    Steps: encoding, merge specs, header pass, typed read, then PDP repo schema
-    (if applicable) or JSON-based validation. Returns dict with validation_status,
-    schemas, normalized_df (or None if empty merged_specs). Raises HardValidationError
-    on failure; UnicodeError if encoding is not UTF-8/UTF-16/UTF-32.
-    Legacy institutions (institution_id=="legacy"): accept any format (encoding + CSV read only).
+    Args:
+        filename: CSV path or file-like object.
+        base_schema: Base schema dict (e.g. base.data_models).
+        ext_schema: Optional extension schema with institutions.* blocks.
+        models: Model name(s) to validate; ``None`` follows merged_specs resolution.
+        institution_id: Institutions key, or ``"legacy"`` for encoding-only validation.
+        institution_identifier: Optional UUID string for caller context (e.g. Edvise).
+        pdp_cohort_converter_func: Optional cohort transform before Pandera; default ``None``.
+            Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
+        pdp_course_converter_func: Optional course converter before default duplicate handling.
+
+    Returns:
+        Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
+        and normalized_df (``None`` when merged_specs is empty).
 
-    For PDP uploads, optional pdp_cohort_converter_func and pdp_course_converter_func
-    allow schools to supply custom converters (e.g. from config); if None, edvise
-    defaults are used (cohort: filter DE/DS/SE; course: handling_duplicates).
+    Raises:
+        HardValidationError: On decode failure, missing columns, schema errors, or
+            other validation failures (including Unicode decode issues from sniff_encoding).
     """
     try:
         enc = sniff_encoding(filename)
@@ -1137,7 +1157,7 @@ def validate_dataset(
             "normalized_df": None,
         }
 
-    # PDP single-model: use edvise read + validate (same as pipeline)
+    # Route PDP STUDENT/COURSE to edvise read path (cohort converter optional; see section above).
     if pdp_edvise.get_edvise_schema_for_upload(institution_id, model_list) is not None:
         return _validate_pdp_with_edvise_read(
             filename,
diff --git a/src/webapp/validation_pdp_edvise.py b/src/webapp/validation_pdp_edvise.py
@@ -1,12 +1,13 @@
-"""PDP schema validation using canonical schemas from the edvise package.
+"""PDP Pandera schemas re-exported from edvise for upload validation.
 
-This module runs the same validation as the edvise repo (RawPDPCohortDataSchema,
-RawPDPCourseDataSchema) for PDP uploads only, so PDP validation rules match pipelines
-and audits. The edvise extension/institution uses JSON-based validation only (different
-columns and setup). All logic is in edvise-api; the edvise package is consumed read-only.
+Imports ``RawPDPCohortDataSchema`` and ``RawPDPCourseDataSchema`` so PDP uploads use
+the same column and type rules as edvise pipeline audits. Cohort row transforms run
+in ``validation.py`` (optional converter) and can differ from batch ``dataio`` hooks;
+this module only supplies schema classes and helpers.
 
-The edvise package is required for PDP validation: it must be installed (e.g. in
-pyproject.toml) so that PDP uploads are validated with the same schemas as the repo.
+Non-PDP Edvise institutions use JSON-based validation elsewhere (different columns).
+
+Requires the ``edvise`` package (see pyproject.toml).
 """
 
 from __future__ import annotations
diff --git a/src/webapp/validation_pdp_read_path_test.py b/src/webapp/validation_pdp_read_path_test.py
@@ -1,4 +1,8 @@
-"""Tests for PDP validation path that uses edvise read_raw_pdp_* (single source of truth)."""
+"""
+Tests for the PDP branch of validation (edvise ``read_raw_pdp_*`` integration).
+
+Covers routing from ``validate_file_reader``, cohort/course converter wiring, and errors.
+"""
 
 import io
 from pathlib import Path
@@ -9,7 +13,6 @@
 import pytest
 from pandera.errors import SchemaErrors
 
-from edvise.dataio.pdp_cohort_converters import converter_func_cohort
 
 from src.webapp.validation import (
     HardValidationError,
@@ -272,8 +275,8 @@ def test_validate_pdp_with_edvise_read_accepts_file_like() -> None:
     # Edvise read was given a path (temp file when file-like); keyword is file_path
     assert "file_path" in mock_read.call_args[1]
     assert isinstance(mock_read.call_args[1]["file_path"], str)
-    # Cohort validation uses converter_func_cohort by default (filters DE/DS/SE)
-    assert mock_read.call_args[1]["converter_func"] is converter_func_cohort
+    # Cohort validation uses no converter unless pdp_cohort_converter_func is passed
+    assert mock_read.call_args[1]["converter_func"] is None
 
 
 def test_validate_pdp_with_edvise_read_student_uses_custom_cohort_converter_when_provided(