Skip to content

Commit 7dbc362

Browse files
Merge pull request #227 from datakind/fix/remove-cohort-converter-validation
fix: default cohort converter to none
2 parents 482c2eb + a16ec3c commit 7dbc362

3 files changed

Lines changed: 82 additions & 58 deletions

File tree

src/webapp/validation.py

Lines changed: 67 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@
3838
from pandera.errors import SchemaError, SchemaErrors
3939

4040
from edvise.dataio.read import read_raw_pdp_cohort_data, read_raw_pdp_course_data
41-
from edvise.dataio.pdp_cohort_converters import converter_func_cohort
4241
from edvise.utils.data_cleaning import handling_duplicates
4342

4443
from . import validation_pdp_edvise as pdp_edvise
@@ -80,7 +79,11 @@ def validate_file_reader(
8079
pdp_cohort_converter_func: PDPConverterFunc = None,
8180
pdp_course_converter_func: PDPConverterFunc = None,
8281
) -> dict[str, Any]:
83-
"""Validates a dataset given a filename and schema selection.
82+
"""
83+
Validate a CSV from a path or file-like handle against schema selection.
84+
85+
Thin wrapper around :func:`validate_dataset` with the same arguments
86+
reordered for call sites that pass ``allowed_schema`` first.
8487
8588
Args:
8689
filename: Path or file-like object for the CSV.
@@ -90,16 +93,17 @@ def validate_file_reader(
9093
institution_id: Key into inst_schema["institutions"]: "edvise", "pdp",
9194
"legacy" (any-format), or institution UUID for custom. Default "pdp".
9295
institution_identifier: Optional institution identifier (e.g. UUID) for display/context.
93-
pdp_cohort_converter_func: Optional custom PDP cohort converter (school-specific).
94-
pdp_course_converter_func: Optional custom PDP course converter (school-specific).
96+
pdp_cohort_converter_func: Optional cohort row transform before Pandera; default
97+
None. Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
98+
pdp_course_converter_func: Optional course converter; default duplicate handling only.
9599
96100
Returns:
97-
Dict with validation_status, schemas, missing_optional, unknown_extra_columns.
98-
On success also contains "normalized_df" (DataFrame, or None if nothing was validated).
101+
Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
102+
and on success normalized_df (DataFrame, or None if nothing was validated).
99103
100104
Raises:
101-
HardValidationError: When required columns are missing or schema validation fails.
102-
UnicodeError: When file encoding is not UTF-8/UTF-16/UTF-32.
105+
HardValidationError: When required columns are missing, schema validation fails,
106+
or encoding cannot be resolved (decode failures use failure_cases, not UnicodeError).
103107
"""
104108
return validate_dataset(
105109
filename,
@@ -751,7 +755,8 @@ def _compute_model_list_and_merged_specs(
751755

752756

753757
# --------------------------------------------------------------------------- #
754-
# PDP: use edvise read + validate (single source of truth)
758+
# PDP single-model path: edvise read + Pandera validate. Cohort converter defaults
759+
# to None so validated row sets can differ from batch jobs that use dataio converters.
755760
# --------------------------------------------------------------------------- #
756761

757762
# Datetime formats to try for PDP course (same order as pdp_data_audit)
@@ -798,7 +803,7 @@ def _convert_pdp_schema_errors_to_hard(
798803
def _read_pdp_validated_dataframe(
799804
path: str,
800805
model_set: set[str],
801-
cohort_converter: Callable[[pd.DataFrame], pd.DataFrame],
806+
cohort_converter: PDPConverterFunc,
802807
course_converter_func: PDPConverterFunc,
803808
) -> pd.DataFrame:
804809
"""Read and validate PDP cohort or course data; return validated DataFrame or raise."""
@@ -876,23 +881,25 @@ def _read_pdp_course_edvise(
876881
course_converter_func: PDPConverterFunc = None,
877882
) -> pd.DataFrame:
878883
"""
879-
Read and validate PDP course data via edvise (same as pipeline).
884+
Read and validate a PDP course CSV using edvise helpers.
880885
881-
Tries each datetime format with each converter. If a custom
882-
course_converter_func is provided (e.g. from a school), it is tried first;
883-
then :func:`_default_pdp_course_duplicate_converter` (``handling_duplicates``
884-
with PDP settings). Raises HardValidationError if all attempts fail.
886+
Tries each value in ``PDP_COURSE_DTTM_FORMATS`` with each converter: optional
887+
``course_converter_func`` first, then :func:`_default_pdp_course_duplicate_converter`.
888+
889+
Batch PDP jobs may also try school-specific converters from ``dataio``; this
890+
path only runs converters passed in here, so results may differ from those jobs.
885891
886892
Args:
887893
path: Path to course CSV.
888-
course_converter_func: Optional custom converter (e.g. converter_func_course)
889-
that schools can provide; if None, only default converters are used.
894+
course_converter_func: Optional school-specific converter; if None, only the
895+
default duplicate-handling converter is used.
890896
891897
Returns:
892-
Validated DataFrame (same as pipeline output).
898+
Validated DataFrame from ``read_raw_pdp_course_data`` for the first successful
899+
converter and datetime format.
893900
894901
Raises:
895-
HardValidationError: If no (converter, format) pair succeeded.
902+
HardValidationError: If every converter and format combination fails.
896903
"""
897904
default_converters = (_default_pdp_course_duplicate_converter,)
898905
converters = (
@@ -944,38 +951,38 @@ def _validate_pdp_with_edvise_read(
944951
pdp_course_converter_func: PDPConverterFunc = None,
945952
) -> Dict[str, Any]:
946953
"""
947-
Validate PDP cohort or course via edvise read + schema (same as pipeline).
954+
Validate a single-model PDP cohort or course file via edvise read and Pandera.
948955
949-
Resolves filename to a path (temp file if file-like), then calls
950-
read_raw_pdp_cohort_data or read_raw_pdp_course_data. Uses the same
951-
converter functions as the edvise repo: cohort converter filters dual
952-
enrollment students (DE/DS/SE); course converter handles duplicates.
953-
Schools can provide custom converters via the optional func args.
956+
Writes file-like inputs to a temp path, then calls ``read_raw_pdp_cohort_data``
957+
(STUDENT) or ``_read_pdp_course_edvise`` (COURSE). Cohort rows are only
958+
transformed when ``pdp_cohort_converter_func`` is set; batch jobs may still
959+
filter cohort rows via ``dataio``, so API output rows are not guaranteed to
960+
match pipeline output for the same file.
954961
955962
Args:
956-
filename: Path or file-like to CSV.
957-
enc: Encoding (from sniff_encoding) for file-like decode.
958-
model_list: Single model, e.g. ["STUDENT"] or ["COURSE"].
959-
institution_id: Institution schema key (e.g. "pdp").
960-
pdp_cohort_converter_func: Optional custom cohort converter; if None,
961-
uses converter_func_cohort from edvise (filters DE/DS/SE).
962-
pdp_course_converter_func: Optional custom course converter (e.g.
963-
converter_func_course); if None, uses default handling_duplicates.
963+
filename: Path or file-like CSV source.
964+
enc: Encoding from :func:`sniff_encoding` (used when materializing file-like input).
965+
model_list: Exactly one model, e.g. ``["STUDENT"]`` or ``["COURSE"]``.
966+
institution_id: Schema namespace (e.g. ``"pdp"``); reserved for callers and logging.
967+
pdp_cohort_converter_func: Optional ``DataFrame -> DataFrame`` step before cohort
968+
schema validation; ``None`` means validate rows as read.
969+
pdp_course_converter_func: Optional course converter before default duplicate handling.
964970
965971
Returns:
966-
Dict with validation_status, schemas, missing_optional,
967-
unknown_extra_columns, normalized_df.
972+
Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
973+
and normalized_df on success.
968974
969975
Raises:
970-
HardValidationError: If read/schema fails (or SchemaErrors converted).
976+
HardValidationError: If converters are non-callable, read fails, or Pandera
977+
validation fails (including converted SchemaErrors).
971978
"""
972979
_reset_to_start_if_possible(filename)
973980
model_set = {str(m).strip().upper() for m in model_list if m}
974981

975982
_validate_pdp_converter_callables(
976983
pdp_cohort_converter_func, pdp_course_converter_func
977984
)
978-
cohort_converter = pdp_cohort_converter_func or converter_func_cohort
985+
cohort_converter = pdp_cohort_converter_func
979986

980987
with _path_for_edvise_read(filename, enc) as path:
981988
try:
@@ -1104,17 +1111,30 @@ def validate_dataset(
11041111
pdp_course_converter_func: PDPConverterFunc = None,
11051112
) -> Dict[str, Any]:
11061113
"""
1107-
Validate a dataset against merged base/extension schemas.
1114+
Validate a dataset against merged base and optional extension schemas.
1115+
1116+
Detects encoding, merges institution column specs, then routes to legacy
1117+
any-format handling, PDP edvise read (single-model STUDENT/COURSE), or
1118+
JSON Pandera validation. ``institution_id == "legacy"`` skips column schema checks.
11081119
1109-
Steps: encoding, merge specs, header pass, typed read, then PDP repo schema
1110-
(if applicable) or JSON-based validation. Returns dict with validation_status,
1111-
schemas, normalized_df (or None if empty merged_specs). Raises HardValidationError
1112-
on failure; UnicodeError if encoding is not UTF-8/UTF-16/UTF-32.
1113-
Legacy institutions (institution_id=="legacy"): accept any format (encoding + CSV read only).
1120+
Args:
1121+
filename: CSV path or file-like object.
1122+
base_schema: Base schema dict (e.g. base.data_models).
1123+
ext_schema: Optional extension schema with institutions.* blocks.
1124+
models: Model name(s) to validate; ``None`` follows merged_specs resolution.
1125+
institution_id: Institutions key, or ``"legacy"`` for encoding-only validation.
1126+
institution_identifier: Optional UUID string for caller context (e.g. Edvise).
1127+
pdp_cohort_converter_func: Optional cohort transform before Pandera; default ``None``.
1128+
Batch PDP jobs may still apply school-specific cohort converters via ``dataio``.
1129+
pdp_course_converter_func: Optional course converter before default duplicate handling.
1130+
1131+
Returns:
1132+
Dict with validation_status, schemas, missing_optional, unknown_extra_columns,
1133+
and normalized_df (``None`` when merged_specs is empty).
11141134
1115-
For PDP uploads, optional pdp_cohort_converter_func and pdp_course_converter_func
1116-
allow schools to supply custom converters (e.g. from config); if None, edvise
1117-
defaults are used (cohort: filter DE/DS/SE; course: handling_duplicates).
1135+
Raises:
1136+
HardValidationError: On decode failure, missing columns, schema errors, or
1137+
other validation failures (including Unicode decode issues from sniff_encoding).
11181138
"""
11191139
try:
11201140
enc = sniff_encoding(filename)
@@ -1137,7 +1157,7 @@ def validate_dataset(
11371157
"normalized_df": None,
11381158
}
11391159

1140-
# PDP single-model: use edvise read + validate (same as pipeline)
1160+
# Route PDP STUDENT/COURSE to edvise read path (cohort converter optional; see section above).
11411161
if pdp_edvise.get_edvise_schema_for_upload(institution_id, model_list) is not None:
11421162
return _validate_pdp_with_edvise_read(
11431163
filename,

src/webapp/validation_pdp_edvise.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
1-
"""PDP schema validation using canonical schemas from the edvise package.
1+
"""PDP Pandera schemas re-exported from edvise for upload validation.
22
3-
This module runs the same validation as the edvise repo (RawPDPCohortDataSchema,
4-
RawPDPCourseDataSchema) for PDP uploads only, so PDP validation rules match pipelines
5-
and audits. The edvise extension/institution uses JSON-based validation only (different
6-
columns and setup). All logic is in edvise-api; the edvise package is consumed read-only.
3+
Imports ``RawPDPCohortDataSchema`` and ``RawPDPCourseDataSchema`` so PDP uploads use
4+
the same column and type rules as edvise pipeline audits. Cohort row transforms run
5+
in ``validation.py`` (optional converter) and can differ from batch ``dataio`` hooks;
6+
this module only supplies schema classes and helpers.
77
8-
The edvise package is required for PDP validation: it must be installed (e.g. in
9-
pyproject.toml) so that PDP uploads are validated with the same schemas as the repo.
8+
Non-PDP Edvise institutions use JSON-based validation elsewhere (different columns).
9+
10+
Requires the ``edvise`` package (see pyproject.toml).
1011
"""
1112

1213
from __future__ import annotations

src/webapp/validation_pdp_read_path_test.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
"""Tests for PDP validation path that uses edvise read_raw_pdp_* (single source of truth)."""
1+
"""
2+
Tests for the PDP branch of validation (edvise ``read_raw_pdp_*`` integration).
3+
4+
Covers routing from ``validate_file_reader``, cohort/course converter wiring, and errors.
5+
"""
26

37
import io
48
from pathlib import Path
@@ -9,7 +13,6 @@
913
import pytest
1014
from pandera.errors import SchemaErrors
1115

12-
from edvise.dataio.pdp_cohort_converters import converter_func_cohort
1316

1417
from src.webapp.validation import (
1518
HardValidationError,
@@ -272,8 +275,8 @@ def test_validate_pdp_with_edvise_read_accepts_file_like() -> None:
272275
# Edvise read was given a path (temp file when file-like); keyword is file_path
273276
assert "file_path" in mock_read.call_args[1]
274277
assert isinstance(mock_read.call_args[1]["file_path"], str)
275-
# Cohort validation uses converter_func_cohort by default (filters DE/DS/SE)
276-
assert mock_read.call_args[1]["converter_func"] is converter_func_cohort
278+
# Cohort validation uses no converter unless pdp_cohort_converter_func is passed
279+
assert mock_read.call_args[1]["converter_func"] is None
277280

278281

279282
def test_validate_pdp_with_edvise_read_student_uses_custom_cohort_converter_when_provided(

0 commit comments

Comments
 (0)