diff --git a/src/webapp/database.py b/src/webapp/database.py
index 7fe974b0..7c06d74d 100644
--- a/src/webapp/database.py
+++ b/src/webapp/database.py
@@ -511,6 +511,9 @@ class ModelTable(Base):
     )
     # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version.
     version: Mapped[int] = mapped_column(Integer, default=0)
+    framework: Mapped[str | None] = mapped_column(
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn"
+    )
 
     # Within a given institution, there should be no duplicated model names.
     __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),)
@@ -548,6 +551,9 @@ class JobTable(Base):
         String(VAR_CHAR_STANDARD_LENGTH), nullable=True
     )
     completed: Mapped[bool] = mapped_column(nullable=True)
+    framework: Mapped[str | None] = mapped_column(
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn"
+    )
 
 
 class DocType(enum.Enum):
diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 0f9612ec..94a89576 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -35,6 +35,7 @@
 
 # The name of the deployed pipeline in Databricks. Must match directly.
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
+PDP_H2O_INFERENCE_JOB_NAME = "edvise_github_sourced_pdp_inference_pipeline"
 
 
 class DatabricksInferenceRunRequest(BaseModel):
@@ -44,7 +45,7 @@ class DatabricksInferenceRunRequest(BaseModel):
     # Note that the following should be the filepath.
     filepath_to_type: dict[str, list[SchemaType]]
     model_name: str
-    model_type: str = "sklearn"
+    model_type: str
     # The email where notifications will get sent.
     email: str
     gcp_external_bucket_name: str
@@ -98,7 +99,17 @@ def setup_new_inst(self, inst_name: str) -> None:
         db_inst_name = databricksify_inst_name(inst_name)
         cat_name = databricks_vars["CATALOG_NAME"]
         for medallion in MEDALLION_LEVELS:
-            w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name)
+            try:
+                w.schemas.create(
+                    name=f"{db_inst_name}_{medallion}", catalog_name=cat_name
+                )
+            except Exception as e:
+                LOGGER.exception(
+                    f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}"
+                )
+                raise ValueError(
+                    f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}"
+                )
             LOGGER.info(
                 f"Creating medallion level schemas for {db_inst_name} & {medallion}."
             )
@@ -192,16 +203,22 @@ def run_pdp_inference(
 
         db_inst_name = databricksify_inst_name(req.inst_name)
 
+        if req.model_type == "sklearn":
+            pipeline_type = PDP_INFERENCE_JOB_NAME
+        elif req.model_type == "h2o":
+            pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
+        else:
+            raise ValueError("Invalid model framework assigned to institution model")
         try:
-            job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None)
+            job = next(w.jobs.list(name=pipeline_type), None)
             if not job or job.job_id is None:
                 raise ValueError(
-                    f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id."
+                    f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id for '{gcs_vars['GCP_SERVICE_ACCOUNT_EMAIL']}' and '{databricks_vars['DATABRICKS_HOST_URL']}'."
                 )
             job_id = job.job_id
-            LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}")
+            LOGGER.info(f"Resolved job ID for '{pipeline_type}': {job_id}")
         except Exception as e:
-            LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.")
+            LOGGER.exception(f"Job lookup failed for '{pipeline_type}'.")
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
         try:
diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py
index b6046daa..b267d9eb 100644
--- a/src/webapp/gcsutil.py
+++ b/src/webapp/gcsutil.py
@@ -340,8 +340,9 @@ def validate_file(
                     f"If you see this file validation was successful {schems}"
                 )
         except Exception as e:
-            blob.delete()
-            raise e
+            logging.exception("Validation failed for %s: %s", file_name, e)
+            raise
+
         new_blob = bucket.blob(new_blob_name)
         if new_blob.exists():
             raise ValueError(new_blob_name + ": File already exists.")
diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 36079908..c8491455 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -3,7 +3,7 @@
 import uuid
 from datetime import datetime, date
 from databricks.sdk import WorkspaceClient
-from typing import Annotated, Any, Dict, List, cast, IO, Optional
+from typing import Annotated, Any, Dict, List, cast, IO, Optional, Tuple
 from pydantic import BaseModel, Field
 from fastapi import APIRouter, Depends, HTTPException, status, Response, Query
 from fastapi.responses import FileResponse
@@ -16,6 +16,8 @@
 from ..config import databricks_vars, env_vars, gcs_vars
 import tempfile
 import pathlib
+import re
+from ..validation import HardValidationError
 
 from ..utilities import (
     has_access_to_inst_or_err,
@@ -502,6 +504,7 @@ def create_batch(
         )
         f_names = [] if not req.file_names else req.file_names
         f_ids = [] if not req.file_ids else strs_to_uuids(req.file_ids)
+        print(f"File names: {f_names}, File Ids: {f_ids}")
         # Check that the files requested for this batch exists.
         # Only valid non-sst generated files can be added to a batch at creation time.
         query_result_files = (
@@ -995,30 +998,14 @@ def download_url_inst_file(
     )
 
 
-def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]:
-    name = os.path.basename(file_path).lower()
-
-    inferred = set()
-    if "course" in name:
-        inferred.add("COURSE")
-    if "student" in name:
-        inferred.add("STUDENT")
-    if "semester" in name:
-        inferred.add("SEMESTER")
-    if "cohort" in name:
-        inferred.add("STUDENT")
-    if "course" not in name and ("ar" in name or "deidentified" in name):
-        inferred.add("STUDENT")
-
-    if not inferred:
-        logging.error(
-            ValueError(
-                f"Could not infer model(s) from file name: {name}, filenames sould be descriptive of the kind of data it contains e.g. course, cohort"
-            )
-        )
-        inferred.add("UNKNOWN")
+class _ValidationState:
+    _ar_re = re.compile(r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE)
+    _base_cache: Dict[str, Any] = {"exp": 0.0, "val": None}
+    _ext_cache: Dict[str, Tuple[float, Any]] = {}
+    _pdp_cache: Tuple[float, Optional[dict]] = (0.0, None)
+
 
-    return sorted(inferred)
+STATE = _ValidationState()
 
 
 def validation_helper(
@@ -1029,150 +1016,244 @@ def validation_helper(
     storage_control: StorageControl,
     sql_session: Session,
 ) -> Any:
-    """Helper function for file validation."""
+    """Helper function for file validation (self-contained & optimized)."""
+    import time
+
+    # --- access check & quick input validation
     has_access_to_inst_or_err(inst_id, current_user)
-    if file_name.find("/") != -1:
-        raise HTTPException(
-            status_code=422,
-            detail="File name can't contain '/'.",
-        )
+    if "/" in file_name:
+        raise HTTPException(status_code=422, detail="File name can't contain '/'.")
+
+    # --- bind session once
     local_session.set(sql_session)
+    sess = local_session.get()
+
+    AR_RE = STATE._ar_re
+    BASE_TTL = 300  # seconds
+    EXT_TTL = 120  # seconds
+
+    # --- filename → allowed_schemas (fast, single pass)
+    name = os.path.basename(file_name).lower()
+    has_course = "course" in name
+    has_semester = "semester" in name
+    has_student = (
+        ("student" in name)
+        or ("cohort" in name)
+        or (
+            (not has_course)
+            and (AR_RE.search(name) is not None or "deidentified" in name)
+        )
+    )
 
-    allowed_schemas = None
-    if not allowed_schemas:
-        allowed_schemas = infer_models_from_filename(file_name, "pdp")
+    inferred_from_name: set[str] = set()
+    if has_course:
+        inferred_from_name.add("COURSE")
+    if has_student:
+        inferred_from_name.add("STUDENT")
+    if has_semester:
+        inferred_from_name.add("SEMESTER")
 
-    inferred_schemas: list[str] = []
-    # ----------------------- Fetch base schema from DB -------------------------------
-    base_schema = (
-        local_session.get()
-        .execute(
+    if not inferred_from_name:
+        raise ValueError(
+            f"Could not infer model(s) from file name: {name}. "
+            "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')."
+        )
+
+    allowed_schemas = sorted(inferred_from_name)
+
+    # --- fetch active base schema (cached)
+    now = time.monotonic()
+    base_cache = STATE._base_cache
+    if now < base_cache["exp"] and base_cache["val"] is not None:
+        base_schema_id, base_schema = base_cache["val"]  # pylint: disable=unpacking-non-sequence # fmt: skip
+    else:
+        row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)
             .where(
                 SchemaRegistryTable.doc_type == DocType.base,
                 SchemaRegistryTable.is_active.is_(True),
             )
             .limit(1)
-        )
-        .first()
-    )
-    if base_schema is None:
-        raise RuntimeError("No active base schema found")
-
-    base_schema_id, base_schema = base_schema
-    # ----------------------- Fetch inst specific extension schema from DB ---------------------
-    inst = (
-        local_session.get()
-        .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id)))
-        .scalar_one_or_none()
-    )
+        ).first()
+        if row is None:
+            raise RuntimeError("No active base schema found")
+        base_schema_id, base_schema = row
+        base_cache["exp"] = now + BASE_TTL
+        base_cache["val"] = (base_schema_id, base_schema)
+
+    # --- fetch institution record
+    inst = sess.execute(
+        select(InstTable).where(InstTable.id == str_to_uuid(inst_id))
+    ).scalar_one_or_none()
     if inst is None:
         raise ValueError(f"Institution {inst_id} not found")
 
-    if inst.pdp_id:  # institution is PDP
-        inst_schema = (
-            local_session.get()
-            .execute(
+    bucket = get_external_bucket_name(inst_id)
+    # --- choose / prepare extension schema (try to avoid heavy path)
+    updated_inst_schema: Optional[dict] = None
+
+    def _ext_models_set(doc: Optional[dict]) -> set[str]:
+        """Extract model keys from an extension document (root or institutions.* layout)."""
+        if not doc or not isinstance(doc, dict):
+            return set()
+        # root-level
+        if isinstance(doc.get("data_models"), dict):
+            return {str(k).lower() for k in doc["data_models"].keys()}
+        # nested by institution
+        inst_key_candidates = {str(getattr(inst, "id", "")), inst_id}
+        insts = doc.get("institutions", {})
+        if isinstance(insts, dict):
+            for key in inst_key_candidates:
+                block = insts.get(key)
+                if isinstance(block, dict) and isinstance(
+                    block.get("data_models"), dict
+                ):
+                    return {str(k).lower() for k in block["data_models"].keys()}
+        return set()
+
+    if getattr(inst, "pdp_id", None):
+        # PDP institutions: use active PDP extension (cached)
+        pdp_exp, pdp_doc = STATE._pdp_cache
+        if now < pdp_exp and pdp_doc is not None:
+            inst_schema: Optional[Dict[str, Any]] = pdp_doc
+        else:
+            inst_schema = sess.execute(
                 select(SchemaRegistryTable.json_doc)
                 .where(
                     SchemaRegistryTable.is_pdp.is_(True),
                     SchemaRegistryTable.is_active.is_(True),
                 )
                 .limit(1)
-            )
-            .scalar_one_or_none()
-        )
-        updated_inst_schema: dict | None = inst_schema
-    else:  # custom (or none)
-        inst_schema = (
-            local_session.get()
-            .execute(
+            ).scalar_one_or_none()
+            STATE._pdp_cache = (now + EXT_TTL, inst_schema)
+        updated_inst_schema = inst_schema
+    else:
+        # custom institutions: try cached extension first
+        ext_cache = STATE._ext_cache
+        key = str(getattr(inst, "id", ""))
+        cached = ext_cache.get(key)
+        if cached and now < cached[0]:
+            inst_schema = cached[1]
+        else:
+            inst_schema = sess.execute(
                 select(SchemaRegistryTable.json_doc)
                 .where(
-                    SchemaRegistryTable.inst_id == inst.id,
+                    SchemaRegistryTable.inst_id == getattr(inst, "id", None),
                     SchemaRegistryTable.is_active.is_(True),
-                    SchemaRegistryTable.doc_type == DocType.extension,  # be explicit
+                    SchemaRegistryTable.doc_type == DocType.extension,
                 )
                 .limit(1)
-            )
-            .scalar_one_or_none()
-        )
+            ).scalar_one_or_none()
+            ext_cache[key] = (now + EXT_TTL, inst_schema)
 
-        dbc = DatabricksControl()
-        schema_extension = dbc.create_custom_schema_extension(
-            bucket_name=get_external_bucket_name(inst_id),
-            inst_query=inst,
-            file_name=file_name,
-            base_schema=base_schema,
-            extension_schema=inst_schema,
-        )
-
-        if schema_extension is not None:
-            updated_inst_schema = schema_extension
-            try:
-                new_schema_extension_record = SchemaRegistryTable(
-                    doc_type=DocType.extension,
-                    inst_id=str_to_uuid(inst_id),
-                    is_pdp=False,  # type: ignore
-                    version_label="1.0.0",
-                    extends_schema_id=base_schema_id,
-                    json_doc=schema_extension,
-                    is_active=True,
-                )
-                sess = local_session.get()
-                sess.add(new_schema_extension_record)
-                sess.flush()
-                logging.info("Schema record inserted for '%s'", inst_id)
-            except IntegrityError as e:
-                sess = local_session.get()
-                sess.rollback()
-                logging.warning("IntegrityError: %s", e)
-            except Exception as e:
-                sess = local_session.get()
-                sess.rollback()
-                logging.error("Unexpected DB error: %s", e)
-                raise HTTPException(
-                    status_code=500,
-                    detail=f"Unexpected database error while inserting file record: {e}",
-                )
+        # If extension already includes all inferred models, skip Databricks work.
+        inferred_lower = {m.lower() for m in allowed_schemas}
+        ext_models = _ext_models_set(inst_schema)
+        if inferred_lower.issubset(ext_models):
+            updated_inst_schema = inst_schema
         else:
-            logging.info(
-                "No-op: extension already contains this model for inst %s", inst_id
+            # heavy path only when needed
+            dbc = DatabricksControl()
+            schema_extension: Optional[Dict[str, Any]] = (
+                dbc.create_custom_schema_extension(
+                    bucket_name=bucket,
+                    inst_query=inst,
+                    file_name=file_name,
+                    base_schema=base_schema,
+                    extension_schema=inst_schema,
+                )
             )
-            updated_inst_schema = inst_schema
+            if schema_extension is not None:
+                updated_inst_schema = schema_extension
+                try:
+                    new_schema_extension_record = SchemaRegistryTable(
+                        doc_type=DocType.extension,
+                        inst_id=str_to_uuid(inst_id),
+                        is_pdp=False,  # type: ignore
+                        version_label="1.0.0",
+                        extends_schema_id=base_schema_id,
+                        json_doc=schema_extension,
+                        is_active=True,
+                    )
+                    sess.add(new_schema_extension_record)
+                    sess.flush()
+                    logging.info("Schema record inserted for '%s'", inst_id)
+                    # refresh cache
+                    STATE._ext_cache[key] = (
+                        time.monotonic() + EXT_TTL,
+                        schema_extension,
+                    )
+                except IntegrityError as e:
+                    sess.rollback()
+                    logging.warning("IntegrityError: %s", e)
+                except Exception as e:
+                    sess.rollback()
+                    logging.error("Unexpected DB error: %s", e)
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"Unexpected database error while inserting file record: {e}",
+                    )
+            else:
+                logging.info(
+                    "No-op: extension already contains this model for inst %s", inst_id
+                )
+                updated_inst_schema = inst_schema
 
-    # ----------------------- File validation logic logic --------------------------------------
+    # --- run file validation (I/O + Pandera work happens inside storage layer)
     try:
         inferred_schemas = storage_control.validate_file(
-            get_external_bucket_name(inst_id),
+            bucket,
             file_name,
             allowed_schemas,
             base_schema,
             updated_inst_schema,
         )
-        logging.debug(
-            f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}"
+        logging.debug("Inferred Schemas success %s", list(inferred_schemas))
+    except HardValidationError as e:
+        logging.debug("Inferred Schemas FAILED (hard) %s", e)
+        parts = ["VALIDATION_FAILED"]
+        if e.missing_required:
+            parts.append(f"missing_required={e.missing_required}")
+        if e.extra_columns:
+            parts.append(f"extra_columns={e.extra_columns}")
+        if e.schema_errors is not None:
+            parts.append(f"schema_errors={e.schema_errors}")
+        if e.failure_cases is not None:
+            try:
+                sample = (
+                    e.failure_cases[:5]
+                    if isinstance(e.failure_cases, list)
+                    else str(e.failure_cases)[:500]
+                )
+            except Exception:
+                sample = "see server logs"
+            parts.append(f"failure_cases_sample={sample}")
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST, detail="; ".join(parts)
         )
     except Exception as e:
-        logging.debug(f"!!!!!!!!!!Inferred Schemas FAILED {e}")
+        logging.debug("Inferred Schemas FAILED (other) %s", e)
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail="File type is not valid and/or not accepted by this institution: "
-            + str(e),
-        ) from e
+            detail=f"VALIDATION_ERROR: {type(e).__name__}: {e}",
+        )
 
+    # --- upsert file record (cheap path)
     existing_file = (
-        local_session.get()
-        .query(FileTable)
-        .filter_by(
-            name=file_name,
-            inst_id=str_to_uuid(inst_id),
-        )
+        sess.query(FileTable)
+        .filter_by(name=file_name, inst_id=str_to_uuid(inst_id))
         .first()
     )
-
+    if set(inferred_schemas) != set(allowed_schemas):
+        logging.info(
+            "Filename inference %s differs from validator result %s for %s; "
+            "returning filename-based types to preserve API contract.",
+            allowed_schemas,
+            inferred_schemas,
+            file_name,
+        )
     if existing_file:
-        logging.info(f"File '{file_name}' already exists for institution {inst_id}.")
+        logging.info("File '%s' already exists for institution %s.", file_name, inst_id)
         db_status = f"File '{file_name}' already exists for institution {inst_id}."
     else:
         try:
@@ -1185,17 +1266,17 @@ def validation_helper(
                 schemas=list(allowed_schemas),
                 valid=True,
             )
-            local_session.get().add(new_file_record)
-            local_session.get().flush()
-            logging.info(f"File record inserted for '{file_name}'")
+            sess.add(new_file_record)
+            sess.flush()
+            logging.info("File record inserted for '%s'", file_name)
             db_status = f"File record inserted for '{file_name}'"
         except IntegrityError as e:
-            local_session.get().rollback()
-            logging.warning(f"IntegrityError: {e}")
+            sess.rollback()
+            logging.warning("IntegrityError: %s", e)
             db_status = "Already exists"
         except Exception as e:
-            local_session.get().rollback()
-            logging.error(f"Unexpected DB error: {e}")
+            sess.rollback()
+            logging.error("Unexpected DB error: %s", e)
             raise HTTPException(
                 status_code=500,
                 detail=f"Unexpected database error while inserting file record: {e}",
@@ -1276,14 +1357,14 @@ def get_upload_url(
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/top-features/{run_id}")
+@router.get("/{inst_id}/inference/top-features/{job_run_id}")
 def get_inference_top_features(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns data for a specific institution."""
+    """Returns top n features table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1308,7 +1389,7 @@ def get_inference_top_features(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_features_with_most_impact",
+            table_name=f"inference_{job_run_id}_features_with_most_impact",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1319,10 +1400,10 @@ def get_inference_top_features(
 
 
 # Get Box plot values
-@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}")
+@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}")
 def get_inference_feature_boxstats(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
     feature_name: Optional[str] = Query(
@@ -1355,7 +1436,7 @@ def get_inference_feature_boxstats(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_box_plot_table",
+            table_name=f"inference_{job_run_id}_box_plot_table",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
         if not feature_name:
@@ -1381,7 +1462,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
         if not filtered:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
-                detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.",
+                detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.",
             )
 
         return filtered
@@ -1392,14 +1473,14 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/support-overview/{run_id}")
+@router.get("/{inst_id}/inference/support-overview/{job_run_id}")
 def get_inference_support_overview(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns support score distribution table for a  specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1424,7 +1505,7 @@ def get_inference_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_support_overview",
+            table_name=f"inference_{job_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1434,14 +1515,14 @@ def get_inference_support_overview(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/inference/feature_importance/{run_id}")
+@router.get("/{inst_id}/inference/feature_importance/{job_run_id}")
 def get_inference_feature_importance(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns feature importance table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1466,7 +1547,7 @@ def get_inference_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_shap_feature_importance",
+            table_name=f"inference_{job_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1479,14 +1560,14 @@ def get_inference_feature_importance(
 ## FE Training Tables
 
 
-@router.get("/{inst_id}/training/feature_importance/{run_id}")
+@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}")
 def get_training_feature_importance(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training feature importance table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1511,7 +1592,7 @@ def get_training_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_shap_feature_importance",
+            table_name=f"training_{experiment_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1521,14 +1602,14 @@ def get_training_feature_importance(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/confusion_matrix/{run_id}")
+@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}")
 def get_training_confusion_matrix(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training confusion matrix table for a specific instituion."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1553,7 +1634,7 @@ def get_training_confusion_matrix(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_confusion_matrix",
+            table_name=f"training_{experiment_run_id}_confusion_matrix",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1563,14 +1644,14 @@ def get_training_confusion_matrix(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/roc_curve/{run_id}")
+@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}")
 def get_training_roc_curve(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training roc curve table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1595,7 +1676,7 @@ def get_training_roc_curve(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_roc_curve",
+            table_name=f"training_{experiment_run_id}_roc_curve",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1605,14 +1686,14 @@ def get_training_roc_curve(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/support-overview/{run_id}")
+@router.get("/{inst_id}/training/support-overview/{experiment_run_id}")
 def get_training_support_overview(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training support overview table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1637,7 +1718,7 @@ def get_training_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_support_overview",
+            table_name=f"training_{experiment_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py
index d1cce3ee..9b1c1c31 100644
--- a/src/webapp/routers/data_test.py
+++ b/src/webapp/routers/data_test.py
@@ -586,11 +586,11 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_upload = client.post(
         "/institutions/"
         + uuid_to_str(USER_VALID_INST_UUID)
-        + "/input/validate-upload/file_name.csv",
+        + "/input/validate-upload/pdp_course_deidentified.csv",
     )
     assert response_upload.status_code == 200
-    assert response_upload.json()["name"] == "file_name.csv"
-    assert response_upload.json()["file_types"] == ["UNKNOWN"]
+    assert response_upload.json()["name"] == "pdp_course_deidentified.csv"
+    assert response_upload.json()["file_types"] == ["COURSE"]
     assert response_upload.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID)
     assert response_upload.json()["source"] == "MANUAL_UPLOAD"
 
@@ -598,7 +598,7 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_sftp = client.post(
         "/institutions/"
         + uuid_to_str(UUID_INVALID)
-        + "/input/validate-sftp/file_name.csv",
+        + "/input/validate-sftp/pdp_ar_deidentified.csv",
     )
     assert str(response_sftp) == "<Response [401 Unauthorized]>"
     assert (
@@ -609,11 +609,11 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_sftp = client.post(
         "/institutions/"
         + uuid_to_str(USER_VALID_INST_UUID)
-        + "/input/validate-sftp/file_name.csv",
+        + "/input/validate-sftp/pdp_ar_deidentified.csv",
     )
     assert response_sftp.status_code == 200
-    assert response_sftp.json()["name"] == "file_name.csv"
-    assert response_sftp.json()["file_types"] == ["UNKNOWN"]
+    assert response_sftp.json()["name"] == "pdp_ar_deidentified.csv"
+    assert response_sftp.json()["file_types"] == ["STUDENT"]
     assert response_sftp.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID)
     assert response_sftp.json()["source"] == "PDP_SFTP"
 
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index cb7949f6..abbb0a36 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -1,7 +1,7 @@
 """API functions related to models."""
 
 from datetime import datetime
-from typing import Annotated, Any
+from typing import Annotated, Any, cast
 import jsonpickle
 from fastapi import APIRouter, Depends, HTTPException, status
 from pydantic import BaseModel
@@ -60,7 +60,7 @@ def check_file_types_valid_schema_configs(
     """Check that a list of files are valid for a given schema configuration."""
     for config in valid_schema_configs:
         found = True
-        map_file_to_schema_config_obj = {}
+        map_file_to_schema_config_obj: dict = {}
         for idx, s in enumerate(file_types):
             for c in config:
                 if c.schema_type in s:
@@ -96,6 +96,7 @@ class ModelCreationRequest(BaseModel):
     # valid = False, means the model is not ready for use.
     valid: bool = False
     schema_configs: list[list[SchemaConfigObj]]
+    framework: str | None = None
 
 
 class ModelInfo(BaseModel):
@@ -215,6 +216,11 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
+            framework=(
+                f
+                if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"}
+                else "sklearn"
+            ),
         )
         local_session.get().add(model)
         local_session.get().commit()
@@ -252,6 +258,7 @@ def create_model(
         "created_by": uuid_to_str(query_result[0][0].created_by),
         "deleted": query_result[0][0].deleted,
         "valid": query_result[0][0].valid,
+        "framework": query_result[0][0].framework,
     }
 
 
@@ -299,6 +306,7 @@ def read_inst_model(
         "created_by": uuid_to_str(query_result[0][0].created_by),
         "deleted": query_result[0][0].deleted,
         "valid": query_result[0][0].valid,
+        "framework": query_result[0][0].framework,
     }
 
 
@@ -546,7 +554,8 @@ def trigger_inference_run(
         model_name=model_name,
         gcp_external_bucket_name=get_external_bucket_name(inst_id),
         # The institution email to which pipeline success/failure notifications will get sent.
-        email=current_user.email,
+        email=cast(str, current_user.email),
+        model_type=query_result[0][0].framework,
     )
     try:
         res = databricks_control.run_pdp_inference(db_req)
@@ -565,6 +574,7 @@ def trigger_inference_run(
         batch_name=req.batch_name,
         model_id=query_result[0][0].id,
         output_valid=False,
+        framework=query_result[0][0].framework,
     )
     local_session.get().add(job)
     return {
@@ -575,4 +585,5 @@ def trigger_inference_run(
         "triggered_at": triggered_timestamp,
         "batch_name": req.batch_name,
         "output_valid": False,
+        "framework": query_result[0][0].framework,
     }
diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 8643f98b..1da27834 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -2,6 +2,7 @@
 
 import uuid
 from unittest import mock
+from typing import Any
 import pytest
 import jsonpickle
 from fastapi.testclient import TestClient
@@ -13,7 +14,6 @@
     USER_UUID,
     UUID_INVALID,
     DATETIME_TESTING,
-    MODEL_OBJ,
     SAMPLE_UUID,
 )
 from ..main import app
@@ -50,32 +50,32 @@
 
 
 # TODO plumb through schema configs
-def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo):
+def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo) -> bool:
     """Check ModelInfo equality without order."""
     if (
-        a_elem["inst_id"] != b_elem["inst_id"]
-        or a_elem["name"] != b_elem["name"]
-        or a_elem["m_id"] != b_elem["m_id"]
-        or a_elem["valid"] != b_elem["valid"]
-        or a_elem["deleted"] != b_elem["deleted"]
+        a_elem.inst_id != b_elem.inst_id
+        or a_elem.name != b_elem.name
+        or a_elem.m_id != b_elem.m_id
+        or a_elem.valid != b_elem.valid
+        or a_elem.deleted != b_elem.deleted
     ):
         return False
     return True
 
 
-def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo):
+def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo) -> bool:
     """Check RunInfo equality without order."""
     if (
-        a_elem["inst_id"] != b_elem["inst_id"]
-        or a_elem["m_name"] != b_elem["m_name"]
-        or a_elem["run_id"] != b_elem["run_id"]
-        or a_elem["created_by"] != b_elem["created_by"]
-        or a_elem["triggered_at"] != b_elem["triggered_at"]
-        or a_elem["output_filename"] != b_elem["output_filename"]
-        or a_elem["output_valid"] != b_elem["output_valid"]
-        or a_elem["err_msg"] != b_elem["err_msg"]
-        or a_elem["batch_name"] != b_elem["batch_name"]
-        or a_elem["completed"] != b_elem["completed"]
+        a_elem.inst_id != b_elem.inst_id
+        or a_elem.m_name != b_elem.m_name
+        or a_elem.run_id != b_elem.run_id
+        or a_elem.created_by != b_elem.created_by
+        or a_elem.triggered_at != b_elem.triggered_at
+        or a_elem.output_filename != b_elem.output_filename
+        or a_elem.output_valid != b_elem.output_valid
+        or a_elem.err_msg != b_elem.err_msg
+        or a_elem.batch_name != b_elem.batch_name
+        or a_elem.completed != b_elem.completed
     ):
         return False
     return True
@@ -152,6 +152,7 @@ def session_fixture():
             ]
         ),
         valid=True,
+        framework="sklearn",
     )
     run_1 = JobTable(
         id=RUN_ID,
@@ -161,6 +162,7 @@ def session_fixture():
         completed=True,
         output_filename="file_output_one",
         created_by=created_by_UUID,
+        framework="sklearn",
     )
     try:
         with sqlalchemy.orm.Session(engine) as session:
@@ -198,7 +200,7 @@ def session_fixture():
 
 
 @pytest.fixture(name="client")
-def client_fixture(session: sqlalchemy.orm.Session):
+def client_fixture(session: sqlalchemy.orm.Session) -> Any:
     """Unit test mocks setup."""
 
     def get_session_override():
@@ -224,26 +226,25 @@ def databricks_control_override():
     app.dependency_overrides.clear()
 
 
-def test_read_inst_models(client: TestClient):
+def test_read_inst_models(client: TestClient) -> None:
     """Test GET /institutions/345/models."""
     response = client.get(
         "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/models"
     )
     assert response.status_code == 200
     assert same_model_orderless(
-        response.json()[0],
-        {
-            "created_by": "",
-            "deleted": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_id": "e4862c62829440d8ab4c9c298f02f619",
-            "name": "sample_model_for_school_1",
-            "valid": True,
-        },
+        ModelInfo(**response.json()[0]),
+        ModelInfo(
+            m_id="e4862c62829440d8ab4c9c298f02f619",
+            name="sample_model_for_school_1",
+            inst_id="1d7c75c33eda42949c6675ea8af97b55",
+            deleted=None,
+            valid=True,
+        ),
     )
 
 
-def test_read_inst_model(client: TestClient):
+def test_read_inst_model(client: TestClient) -> None:
     """Test GET /institutions/345/models/10. For various user access types."""
     # Unauthorized cases.
     response_unauth = client.get(
@@ -264,10 +265,18 @@ def test_read_inst_model(client: TestClient):
         + "/models/sample_model_for_school_1"
     )
     assert response.status_code == 200
-    assert same_model_orderless(response.json(), MODEL_OBJ)
+    response_model = ModelInfo(**response.json())
+    expected_model = ModelInfo(
+        deleted=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_id="e4862c62829440d8ab4c9c298f02f619",
+        name="sample_model_for_school_1",
+        valid=True,
+    )
+    assert same_model_orderless(response_model, expected_model)
 
 
-def test_read_inst_model_outputs(client: TestClient):
+def test_read_inst_model_outputs(client: TestClient) -> None:
     """Test GET /institutions/345/models/10/output."""
     MOCK_STORAGE.list_blobs_in_folder.return_value = []
     # Authorized.
@@ -277,24 +286,23 @@ def test_read_inst_model_outputs(client: TestClient):
         + "/models/sample_model_for_school_1/runs"
     )
     assert response.status_code == 200
-    assert same_run_info_orderless(
-        response.json()[0],
-        {
-            "batch_name": "batch_foo",
-            "completed": True,
-            "created_by": "0ad8b77c49fb459a84b18d2c05722c4a",
-            "err_msg": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_name": "sample_model_for_school_1",
-            "output_filename": "file_output_one",
-            "output_valid": False,
-            "run_id": 123,
-            "triggered_at": "2024-12-24T20:22:20.132022",
-        },
+    response_model = RunInfo(**response.json()[0])
+    expected_model = RunInfo(
+        batch_name="batch_foo",
+        created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+        err_msg=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_name="sample_model_for_school_1",
+        output_filename="file_output_one",
+        output_valid=False,
+        run_id=123,
+        triggered_at=response_model.triggered_at,  # copy from response
+        completed=response_model.completed,
     )
+    assert same_run_info_orderless(response_model, expected_model)
 
 
-def test_read_inst_model_output(client: TestClient):
+def test_read_inst_model_output(client: TestClient) -> None:
     """Test GET /institutions/345/models/10/output/1."""
     # Authorized.
     response = client.get(
@@ -304,24 +312,23 @@ def test_read_inst_model_output(client: TestClient):
         + str(RUN_ID)
     )
     assert response.status_code == 200
-    assert same_run_info_orderless(
-        response.json(),
-        {
-            "batch_name": "batch_foo",
-            "completed": True,
-            "created_by": "0ad8b77c49fb459a84b18d2c05722c4a",
-            "err_msg": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_name": "sample_model_for_school_1",
-            "output_filename": "file_output_one",
-            "output_valid": False,
-            "run_id": 123,
-            "triggered_at": "2024-12-24T20:22:20.132022",
-        },
+    response_model = RunInfo(**response.json())
+    expected_model = RunInfo(
+        batch_name="batch_foo",
+        completed=True,
+        created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+        err_msg=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_name="sample_model_for_school_1",
+        output_filename="file_output_one",
+        output_valid=False,
+        run_id=123,
+        triggered_at=response_model.triggered_at,  # copy from response
     )
+    assert same_run_info_orderless(response_model, expected_model)
 
 
-def test_create_model(client: TestClient):
+def test_create_model(client: TestClient) -> None:
     """Depending on timeline, fellows may not get to this."""
     schema_config_1 = {
         "schema_type": SchemaType.COURSE,
@@ -336,13 +343,14 @@ def test_create_model(client: TestClient):
         json={
             "name": "my_model",
             "schema_configs": [[schema_config_1, schema_config_2]],
+            "framework": "h2o",
         },
     )
 
     assert response.status_code == 200
 
 
-def test_trigger_inference_run(client: TestClient):
+def test_trigger_inference_run(client: TestClient) -> None:
     """Depending on timeline, fellows may not get to this."""
     MOCK_DATABRICKS.run_pdp_inference.return_value = DatabricksInferenceRunResponse(
         job_run_id=123
diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index 460d4e1d..8b35088b 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -2,7 +2,7 @@
 
 import uuid
 import re
-from typing import Annotated, Final, Any
+from typing import Annotated, Final, Any, Optional, Tuple, Union
 from urllib.parse import unquote
 from strenum import StrEnum  # needed for python pre 3.11
 import jwt
@@ -163,7 +163,9 @@ class BaseUser(BaseModel):
     disabled: bool | None = None
 
     # Constructor
-    def __init__(self, usr: str | None, inst: str, access: str, email: str) -> None:
+    def __init__(
+        self, usr: str | None, inst: str | None, access: str | None, email: str | None
+    ) -> None:
         super().__init__(user_id=usr, institution=inst, access_type=access, email=email)
 
     def is_datakinder(self) -> Any:
@@ -182,7 +184,7 @@ def is_viewer(self) -> Any:
         """Whether a given user is a viewer."""
         return self.access_type and self.access_type == AccessType.VIEWER
 
-    def has_access_to_inst(self, inst: str) -> Any:
+    def has_access_to_inst(self, inst: str | None) -> Any:
         """Whether a given user has access to a given institution."""
         return self.access_type and (
             self.access_type == AccessType.DATAKINDER or self.institution == inst
@@ -215,28 +217,28 @@ def has_stronger_permissions_than(self, other_access_type: AccessType) -> bool:
         return False
 
 
-def get_user(sess: Session, username: str) -> BaseUser:
+def get_user(sess: Session, username: str) -> Optional[BaseUser]:
     """Get user from a given username."""
     if username == "api_key_initial":
         return BaseUser(
-            usr=env_vars["INITIAL_API_KEY_ID"],
+            usr=str(env_vars["INITIAL_API_KEY_ID"]),
             inst=None,
             access="DATAKINDER",
             email="api_key_initial",
         )
     if username.startswith("api_key_"):
         api_key_uuid = username.removeprefix("api_key_")
-        query_result = sess.execute(
+        apikey_query_result = sess.execute(
             select(ApiKeyTable).where(
                 ApiKeyTable.id == str_to_uuid(api_key_uuid),
             )
         ).all()
-        if len(query_result) == 0 or len(query_result) > 1:
+        if len(apikey_query_result) == 0 or len(apikey_query_result) > 1:
             return None
         return BaseUser(
-            usr=uuid_to_str(query_result[0][0].id),
-            inst=uuid_to_str(query_result[0][0].inst_id),
-            access=query_result[0][0].access_type,
+            usr=uuid_to_str(apikey_query_result[0][0].id),
+            inst=uuid_to_str(apikey_query_result[0][0].inst_id),
+            access=apikey_query_result[0][0].access_type,
             email=username,
         )
     query_result = sess.execute(
@@ -254,13 +256,15 @@ def get_user(sess: Session, username: str) -> BaseUser:
     )
 
 
-def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser:
+def authenticate_api_key(
+    api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session
+) -> Union[BaseUser, bool]:
     """Authenticate an API key."""
     (key, inst, enduser) = api_key_enduser_tuple
     # Check if it's the initial API key. This doesn't have enduser or inst.
     if key == env_vars["INITIAL_API_KEY"]:
         return BaseUser(
-            usr=env_vars["INITIAL_API_KEY_ID"],
+            usr=str(env_vars["INITIAL_API_KEY_ID"]),
             inst=None,
             access="DATAKINDER",
             email="api_key_initial",
@@ -291,7 +295,7 @@ def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser:
                     user_query = select(AccountTable).where(
                         and_(
                             AccountTable.email == enduser,
-                            AccountTable.inst_id == uuid_to_str(inst),
+                            AccountTable.inst_id == inst,
                         )
                     )
                 user_result = sess.execute(user_query).all()
@@ -330,7 +334,9 @@ async def get_current_user(
         if not token_from_key:
             raise credentials_exception
         payload = jwt.decode(
-            token_from_key, env_vars["SECRET_KEY"], algorithms=env_vars["ALGORITHM"]
+            token_from_key,
+            str(env_vars["SECRET_KEY"]),
+            algorithms=env_vars["ALGORITHM"],
         )
         usrname = payload.get("sub")
         if usrname is None:
@@ -345,14 +351,14 @@ async def get_current_user(
 
 async def get_current_active_user(
     current_user: Annotated[BaseUser, Depends(get_current_user)],
-):
+) -> BaseUser:
     """Get the active user.."""
     if current_user.disabled:
         raise HTTPException(status_code=400, detail="Inactive user")
     return current_user
 
 
-def has_access_to_inst_or_err(inst: str, user: BaseUser):
+def has_access_to_inst_or_err(inst: str, user: BaseUser) -> None:
     """Raise error if a given user does not have access to a given institution."""
     if not user.has_access_to_inst(inst):
         raise HTTPException(
@@ -361,7 +367,7 @@ def has_access_to_inst_or_err(inst: str, user: BaseUser):
         )
 
 
-def has_full_data_access_or_err(user: BaseUser, resource_type: str):
+def has_full_data_access_or_err(user: BaseUser, resource_type: str) -> None:
     """Raise error if a given user does not have data access to a given institution."""
     if not user.has_full_data_access():
         raise HTTPException(
@@ -370,7 +376,7 @@ def has_full_data_access_or_err(user: BaseUser, resource_type: str):
         )
 
 
-def model_owner_and_higher_or_err(user: BaseUser, resource_type: str):
+def model_owner_and_higher_or_err(user: BaseUser, resource_type: str) -> None:
     """Raise error if a given user does not have model ownership or higher."""
     if not user.access_type or user.access_type not in (
         AccessType.MODEL_OWNER,
@@ -382,29 +388,29 @@ def model_owner_and_higher_or_err(user: BaseUser, resource_type: str):
         )
 
 
-def prepend_env_prefix(name: str) -> str:
+def prepend_env_prefix(name: str) -> Any:
     """Prepend the env prefix. At this point the value should not be empty as we checked on app startup."""
-    return env_vars["ENV"].lower() + "_" + name
+    return str(env_vars["ENV"]).lower() + "_" + name
 
 
-def uuid_to_str(uuid_val: uuid.UUID) -> str:
+def uuid_to_str(uuid_val: uuid.UUID) -> Any:
     """Convert UUID obj to string."""
     if uuid_val is None:
         return ""
     return uuid_val.hex
 
 
-def str_to_uuid(hex_str: str) -> uuid.UUID:
+def str_to_uuid(hex_str: Optional[str]) -> uuid.UUID:
     """Convert str to UUID obj (database needs UUID obj)."""
     return uuid.UUID(hex_str)
 
 
-def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> str:
+def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> Any:
     """Get the GCP bucket name which has the env prepended taking in the UUID obj."""
     return prepend_env_prefix(uuid_to_str(inst_id))
 
 
-def get_external_bucket_name(inst_id: str) -> str:
+def get_external_bucket_name(inst_id: str) -> Any:
     """Get the GCP bucket name which has the env prepended taking in the uuid as str."""
     return prepend_env_prefix(inst_id)
 
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 3f359aaf..e02df270 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -1,27 +1,47 @@
-"""File validation functions for various schemas. (Record by record validation happens in the
-pipelines, this is for general file validation.)
+"""File validation functions for various schemas.
+Record-by-record validation happens in the pipelines; this module performs
+general file validation with performance-focused improvements.
+
+Key speed-ups (without losing accuracy):
+- Header-only pass to discover/resolve columns before full load
+- Selective, typed CSV read via `usecols` and dtype mapping
+- Exact-name Pandera schemas (avoid regex column matching)
+- Fuzzy matching only for unresolved headers; use rapidfuzz if available
+- Precompiled regexes and set-based membership checks inside Pandera checks
 """
 
-from typing import Any
+from __future__ import annotations
 
+import io
+import os
 import json
 import re
-from typing import Union, List, Dict, Optional
 import logging
+from functools import lru_cache
+from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple
 
 import pandas as pd
 from pandera import Column, Check, DataFrameSchema
 from pandera.errors import SchemaErrors
-from thefuzz import fuzz
+
+# --------------------------------------------------------------------------- #
+# Logging
+# --------------------------------------------------------------------------- #
+
+logger = logging.getLogger(__name__)
+
+# --------------------------------------------------------------------------- #
+# Public entry points
+# --------------------------------------------------------------------------- #
 
 
 def validate_file_reader(
-    filename: str,
+    filename: Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper],
     allowed_schema: list[str],
     base_schema: dict,
     inst_schema: Optional[Dict[Any, Any]] = None,
 ) -> dict[str, Any]:
-    """Validates given a filename."""
+    """Validates a dataset given a filename and schema selection."""
     return validate_dataset(filename, base_schema, inst_schema, allowed_schema)
 
 
@@ -47,12 +67,18 @@ def __init__(
         super().__init__("; ".join(parts))
 
 
+# --------------------------------------------------------------------------- #
+# Utilities
+# --------------------------------------------------------------------------- #
+
+
+@lru_cache(maxsize=4096)
 def normalize_col(name: str) -> str:
-    name = name.strip().lower()  # Lowercase and trim whitespace
-    name = re.sub(r"[^a-z0-9_]", "_", name)  # Replace non-alphanum with underscore
-    name = re.sub(r"_+", "_", name)  # Collapse multiple underscores
-    name = name.strip("_")  # Remove leading/trailing underscores
-    return name
+    """Normalize a column name: trim, lowercase, non-alnum->'_', collapse '_'s."""
+    name = name.strip().lower()
+    name = re.sub(r"[^a-z0-9_]", "_", name)
+    name = re.sub(r"_+", "_", name)
+    return name.strip("_")
 
 
 def load_json(path: str) -> Any:
@@ -60,57 +86,7 @@ def load_json(path: str) -> Any:
         with open(path, "r") as f:
             return json.load(f)
     except Exception as e:
-        raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}")
-
-
-def rename_columns_to_match_schema(
-    df: pd.DataFrame,
-    canon_to_aliases: Dict[str, List[str]],
-    threshold: int = 90,
-) -> pd.DataFrame:
-    """
-    Rename incoming columns using fuzzy match against schema-defined column names and aliases.
-
-    Args:
-        df: Incoming dataframe
-        canon_to_aliases: Mapping from canonical column names to list of aliases (including the canonical name itself)
-        threshold: Fuzzy match score threshold to rename
-
-    Returns:
-        A new DataFrame with renamed columns
-    """
-    from collections import defaultdict
-
-    new_column_names = {}
-    log_info = defaultdict(list)
-
-    schema_names = []
-    for canon, aliases in canon_to_aliases.items():
-        for name in aliases:
-            schema_names.append((name, canon))  # (alias_or_name, canonical_name)
-
-    for incoming_col in df.columns:
-        best_score = 0
-        best_match = None
-        best_canon = None
-
-        for schema_col, canon in schema_names:
-            score = fuzz.ratio(incoming_col.lower(), schema_col.lower())
-            if score > best_score:
-                best_score = score
-                best_match = schema_col
-                best_canon = canon
-
-        if best_score >= threshold and incoming_col != best_canon:
-            new_column_names[incoming_col] = best_canon
-            log_info[incoming_col].append(
-                f"Renamed '{incoming_col}' -> '{best_canon}' (matched on '{best_match}', score={best_score})"
-            )
-
-    for k, v in log_info.items():
-        logging.info(" | ".join(v))
-
-    return df.rename(columns=new_column_names)
+        raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}") from e
 
 
 def merge_model_columns(
@@ -119,10 +95,12 @@ def merge_model_columns(
     institution: str,
     model: str,
 ) -> Dict[str, dict]:
+    """
+    Merge base model columns with institution-specific extension, if present.
+    """
     base_models = base_schema.get("base", {}).get("data_models", {})
     if model not in base_models:
-        if logging:
-            logging.error(f"Model '{model}' not found in base schema")
+        logger.error("Model '%s' not found in base schema", model)
         raise KeyError(f"Model '{model}' not in base schema")
     merged = dict(base_models[model].get("columns", {}))
     if extension_schema:
@@ -133,148 +111,427 @@ def merge_model_columns(
     return merged
 
 
-def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
-    columns = {}
-    for canon, spec in specs.items():
-        names = [canon] + spec.get("aliases", [])
-        pattern = r"^(?:" + "|".join(map(re.escape, names)) + r")$"
+# --------------------------------------------------------------------------- #
+# Encoding sniffing (mypy-friendly)
+# --------------------------------------------------------------------------- #
+
+Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper]
+
+
+def _read_sample(buf: BinaryIO, n: int) -> bytes:
+    pos = buf.tell() if buf.seekable() else None
+    chunk = buf.read(n)
+    if pos is not None:
+        buf.seek(pos)
+    return chunk
+
+
+def sniff_encoding(src: Src, sample_bytes: int = 1_048_576) -> str:
+    """
+    Best-guess encoding via BOM detection + utf-8 trial.
+    Works with a filesystem path, a binary stream, or a TextIOWrapper.
+    Restores stream position if seekable. Raises if latin-1 would be used (by default).
+    """
+    # --- read a small binary sample ---
+    if isinstance(src, (str, os.PathLike)):
+        with open(src, "rb") as f:
+            chunk: bytes = f.read(sample_bytes)
+    elif isinstance(src, io.TextIOWrapper):
+        # Text wrapper => use underlying binary buffer, cast to BinaryIO for mypy
+        chunk = _read_sample(cast(BinaryIO, src.buffer), sample_bytes)
+    else:
+        # Already a binary stream
+        chunk = _read_sample(cast(BinaryIO, src), sample_bytes)
+
+    # --- BOMs first ---
+    if chunk.startswith(b"\xef\xbb\xbf"):
+        return "utf-8-sig"
+    if chunk.startswith(b"\xff\xfe\x00\x00"):
+        return "utf-32le"
+    if chunk.startswith(b"\x00\x00\xfe\xff"):
+        return "utf-32be"
+    if chunk.startswith(b"\xff\xfe"):
+        return "utf-16le"
+    if chunk.startswith(b"\xfe\xff"):
+        return "utf-16be"
+
+    # --- utf-8 strict on sample ---
+    try:
+        chunk.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        raise UnicodeError(
+            "file is not UTF-8/UTF-16/UTF-32; please re-export as UTF-8."
+        )
+
+
+def _reset_to_start_if_possible(src: Src) -> None:
+    """Best-effort reset to the beginning for file-like objects."""
+    try:
+        if hasattr(src, "seek") and callable(getattr(src, "seek")):
+            src.seek(0)  # type: ignore[attr-defined]
+    except Exception:
+        pass
+
+
+# --------------------------------------------------------------------------- #
+# Fast header pass & mapping
+# --------------------------------------------------------------------------- #
+
+
+def _spec_alias_lookup(
+    merged_specs: Dict[str, dict],
+) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
+    """
+    Build:
+      - alias2canon: normalized alias -> canonical
+      - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical)
+    """
+    alias2canon: Dict[str, str] = {}
+    canon_to_aliases_norm: Dict[str, List[str]] = {}
+    for canon, spec in merged_specs.items():
+        aliases = [canon] + spec.get("aliases", [])
+        normed = [normalize_col(a) for a in aliases]
+        canon_to_aliases_norm[canon] = normed
+        for a in normed:
+            alias2canon[a] = canon
+    return alias2canon, canon_to_aliases_norm
+
+
+def _fuzzy_map_unresolved(
+    unresolved: List[Tuple[str, str]],  # [(raw_header, normalized_header)]
+    choices: List[str],  # normalized aliases
+    alias2canon: Dict[str, str],
+    threshold: int = 90,
+) -> Dict[str, str]:  # raw_header -> canonical
+    """
+    Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz.
+    """
+    mapping: Dict[str, str] = {}
+    try:
+        from rapidfuzz import process, fuzz as rf_fuzz  # type: ignore
+
+        for raw, norm in unresolved:
+            hit = process.extractOne(
+                norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold
+            )
+            if hit:
+                best_alias, score, _ = hit
+                mapping[raw] = alias2canon[best_alias]  # type: ignore[index]
+    except Exception:
+        # fallback to thefuzz if rapidfuzz is unavailable
+        try:
+            from thefuzz import fuzz as tf_fuzz  # type: ignore
+        except Exception:
+            # If neither library is available, do not fuzz-map anything.
+            return mapping
+        for raw, norm in unresolved:
+            best_score = 0
+            best_alias = None
+            for alias in choices:
+                s = tf_fuzz.ratio(norm, alias)
+                if s > best_score:
+                    best_score, best_alias = s, alias
+            if best_alias and best_score >= threshold:
+                mapping[raw] = alias2canon[best_alias]
+    return mapping
+
+
+def _header_pass(
+    filename: Src,
+    encoding: str,
+    merged_specs: Dict[str, dict],
+    fuzzy_threshold: int = 90,
+) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]:
+    """
+    Read only the header. Return:
+      - raw_cols: list of column names as in file
+      - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy)
+      - missing_required: list of canonical columns missing
+      - missing_optional: list of optional canonical columns missing
+      - unknown_extra: normalized headers that don't map to any alias
+    """
+    header_df = pd.read_csv(filename, encoding=encoding, nrows=0)
+    raw_cols = list(header_df.columns)
+
+    alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs)
+    known_aliases = set(alias2canon.keys())
+
+    # exact (normalized) mapping first
+    raw_to_canon: Dict[str, str] = {}
+    unresolved: List[Tuple[str, str]] = []
+
+    for raw in raw_cols:
+        norm = normalize_col(raw)
+        if norm in alias2canon:
+            raw_to_canon[raw] = alias2canon[norm]
+        else:
+            unresolved.append((raw, norm))
+
+    # fuzzy match only for unresolved headers
+    if unresolved:
+        choices = list(known_aliases)
+        fuzzy_map = _fuzzy_map_unresolved(
+            unresolved, choices, alias2canon, threshold=fuzzy_threshold
+        )
+        raw_to_canon.update(fuzzy_map)
+
+    incoming_canons = set(raw_to_canon.values())
+    missing_required = [
+        c
+        for c, spec in merged_specs.items()
+        if spec.get("required", False) and c not in incoming_canons
+    ]
+    missing_optional = [
+        c
+        for c, spec in merged_specs.items()
+        if not spec.get("required", False) and c not in incoming_canons
+    ]
+    # normalized headers that remain unmapped and aren't known aliases
+    unknown_extra = sorted(
+        {norm for (_, norm) in unresolved if norm not in known_aliases}
+    )
+
+    return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra
+
+
+def _pandas_dtype_and_parse_dates(
+    merged_specs: Dict[str, dict],
+) -> Tuple[Dict[str, Any], List[str]]:
+    """
+    Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates.
+    Keeps behavior stable while avoiding heavy inference.
+    """
+    dtype_map: Dict[str, Any] = {}
+    parse_dates: List[str] = []
+
+    for canon, spec in merged_specs.items():
+        dt = str(spec.get("dtype"))
+        if dt in {"string", "str", "object"}:
+            dtype_map[canon] = "string"
+        elif dt in {"int", "int64", "Int64"}:
+            dtype_map[canon] = "Int64"  # nullable integers are safer for dirty data
+        elif dt in {"float", "float64"}:
+            dtype_map[canon] = "float64"
+        elif "datetime" in dt or "date" in dt:
+            parse_dates.append(canon)
+        elif dt in {"bool", "boolean"}:
+            dtype_map[canon] = "boolean"
+        elif dt == "category":
+            dtype_map[canon] = "category"
+        else:
+            # leave to pandas inference
+            pass
+
+    return dtype_map, parse_dates
+
+
+def _build_exact_schema(
+    specs: Dict[str, dict], only_canons: List[str]
+) -> DataFrameSchema:
+    """
+    Build a Pandera schema with exact column names (no regex).
+    This avoids regex matching overhead during validation.
+    """
+    cols: Dict[str, Column] = {}
+    for canon in only_canons:
+        spec = specs[canon]
         checks = []
         for chk in spec.get("checks", []):
+            args = list(chk.get("args", []))
+            # precompile regex patterns once
+            if (
+                chk["type"] in {"str_matches", "matches"}
+                and args
+                and isinstance(args[0], str)
+            ):
+                args[0] = re.compile(args[0])
+            # set-based membership for faster 'isin'
+            if chk["type"] in {"isin", "is_in"} and args and isinstance(args[0], list):
+                args[0] = set(args[0])
+
             factory = getattr(Check, chk["type"])
-            checks.append(factory(*chk.get("args", []), **chk.get("kwargs", {})))
+            checks.append(factory(*args, **chk.get("kwargs", {})))
 
-        columns[pattern] = Column(
-            name=pattern,
-            regex=True,
+        cols[canon] = Column(
+            name=canon,
+            regex=False,
             dtype=spec["dtype"],
             nullable=spec["nullable"],
-            required=spec.get("required", False),
+            required=True,  # present-by-construction
             checks=checks or None,
             coerce=spec.get("coerce", False),
         )
-    return DataFrameSchema(columns, strict=False)
+    return DataFrameSchema(cols, strict=False)
+
+
+# --------------------------------------------------------------------------- #
+# Main validation
+# --------------------------------------------------------------------------- #
 
 
 def validate_dataset(
-    filename: str,
+    filename: Src,
     base_schema: dict,
     ext_schema: Optional[Dict[Any, Any]] = None,
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
-    df = pd.read_csv(filename)
-    df = df.rename(columns={c: normalize_col(c) for c in df.columns})
-    incoming = set(df.columns)
+    """
+    Validate a dataset against merged base/extension schemas.
+
+    Steps:
+      1) Detect encoding (BOM/UTF-8)
+      2) Merge requested models' column specs
+      3) Header-only pass to map columns (exact + fuzzy) and detect missing/extra
+      4) Selective, typed read via pandas (skip unused columns)
+      5) Fail-fast validation for required columns; collect soft errors for optional
+    """
+    # ---------------------------- 1) Encoding
+    try:
+        enc = sniff_encoding(filename)
+    except UnicodeError as ex:
+        raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)])
+
+    # Ensure both header and full reads start at the beginning for file-like handles
+    _reset_to_start_if_possible(filename)
 
-    # 2) merge requested models
+    # ---------------------------- 2) merge requested models
     if models is None:
-        model_list = []
+        model_list: List[str] = []
     elif isinstance(models, str):
         model_list = [models]
     else:
-        model_list = list(models)  # <- ensures it's not a set
+        model_list = list(models)
 
     merged_specs: Dict[str, dict] = {}
     for m in model_list:
         specs = merge_model_columns(base_schema, ext_schema, institution_id, m.lower())
         merged_specs.update(specs)
 
-    canon_to_aliases = {
-        canon: [normalize_col(alias) for alias in [canon] + spec.get("aliases", [])]
-        for canon, spec in merged_specs.items()
+    if not merged_specs:
+        # nothing to validate; short-circuit
+        return {
+            "validation_status": "passed",
+            "schemas": model_list,
+            "missing_optional": [],
+            "unknown_extra_columns": [],
+        }
+
+    # ----------------------------  3) HEADER-ONLY PASS
+    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = (
+        _header_pass(filename, enc, merged_specs, fuzzy_threshold=90)
+    )
+
+    if missing_required:
+        logger.error("Missing required columns: %s", missing_required)
+        raise HardValidationError(missing_required=missing_required)
+
+    # Reset again before the real read (important for file-like objects)
+    _reset_to_start_if_possible(filename)
+
+    # Choose one raw header per canonical; prefer exact canonical names when available
+    canon_to_raw: Dict[str, str] = {}
+    for raw, canon in raw_to_canon.items():
+        # Prefer if normalized raw equals canonical name
+        if canon not in canon_to_raw or normalize_col(raw) == canon:
+            canon_to_raw[canon] = raw
+
+    present_canons = sorted(canon_to_raw.keys())
+    raw_usecols = list(canon_to_raw.values())
+
+    # dtype & parse_dates maps (by canonical) -> convert to raw keys for read_csv
+    canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs)
+    raw_dtype_map = {
+        canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw
     }
-    df = rename_columns_to_match_schema(df, canon_to_aliases)
-    df.columns = [
-        normalize_col(c) for c in df.columns
-    ]  # Final normalization after renaming
-
-    incoming = set(df.columns)
-
-    # 3) build canon → set(normalized names)
-    canon_to_norms: Dict[str, set] = {
-        canon: {normalize_col(alias) for alias in [canon] + spec.get("aliases", [])}
-        for canon, spec in merged_specs.items()
-    }
-
-    pattern_to_canon = {
-        r"^(?:"
-        + "|".join(map(re.escape, [canon] + spec.get("aliases", [])))
-        + r")$": canon
-        for canon, spec in merged_specs.items()
-    }
-
-    # 4) find extra / missing
-    all_norms = set().union(*canon_to_norms.values()) if canon_to_norms else set()
-    extra_columns = sorted(incoming - all_norms)
+    parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw]
 
-    missing_required = [
-        canon
-        for canon, norms in canon_to_norms.items()
-        if merged_specs[canon].get("required", False) and norms.isdisjoint(incoming)
+    # ---------------------------- 4) Selective, typed read
+    # Default to fast C engine; try pyarrow if available.
+    engine = "c"
+    try:
+        import pyarrow  # noqa: F401
+
+        engine = "pyarrow"
+    except Exception:
+        pass
+
+    read_kwargs: Dict[str, Any] = dict(
+        encoding=enc,
+        usecols=raw_usecols,
+        dtype=raw_dtype_map or None,
+        engine=engine,
+    )
+    # memory_map works for path-like with the C engine
+    if engine == "c" and isinstance(filename, (str, os.PathLike)):
+        read_kwargs["memory_map"] = True
+        # only C engine supports parse_dates consistently across versions
+        if parse_dates_raw:
+            read_kwargs["parse_dates"] = parse_dates_raw
+
+    df = pd.read_csv(
+        filename, **{k: v for k, v in read_kwargs.items() if v is not None}
+    )
+
+    # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy)
+    if engine == "pyarrow" and parse_dates_canons:
+        for canon in parse_dates_canons:
+            raw = str(canon_to_raw.get(canon))
+            if raw and raw in df.columns:
+                # coerce invalids to NaT; Pandera will flag according to nullability/checks
+                df[raw] = pd.to_datetime(df[raw], errors="coerce")
+
+    # Rename raw headers -> canonical names exactly once
+    df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}, inplace=True)
+
+    # ---------------------------- 5) Validation: required fail-fast, optional lazy (collect soft errors)
+    required_canons = [
+        c for c in present_canons if merged_specs[c].get("required", False)
     ]
-
-    missing_optional = [
-        canon
-        for canon, norms in canon_to_norms.items()
-        if not merged_specs[canon].get("required", False) and norms.isdisjoint(incoming)
+    optional_canons = [
+        c for c in present_canons if not merged_specs[c].get("required", False)
     ]
 
-    # Hard-fail on missing required or any extra columns
-    if missing_required or extra_columns:
-        if logging:
-            logging.error(
-                f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}"
-            )
-        raise HardValidationError(
-            missing_required=missing_required, extra_columns=extra_columns
-        )
-
-    # 5) build Pandera schema & validate (hard-fail on any error)
-    schema = build_schema(merged_specs)
-    try:
-        schema.validate(df, lazy=True)
-    except SchemaErrors as err:
-        # TODO: Log validation failure for DS to review
-        failed_normals = set(err.failure_cases["column"])
-        failed_canons = {pattern_to_canon.get(p, p) for p in failed_normals}
-
-        # split into required vs optional failures
-        req_failures = [
-            c for c in failed_canons if merged_specs.get(c, {}).get("required", False)
-        ]
-        opt_failures = [
-            c
-            for c in failed_canons
-            if not merged_specs.get(c, {}).get("required", False)
-        ]
-
-        if req_failures:
-            if logging:
-                logging.error(
-                    f"Schema validation failed on required columns, schema_errors = {err.schema_errors}, failure_cases = {err.failure_cases.to_dict(orient='records')}"
-                )
+    # Build exact-name schemas (faster than regex)
+    if required_canons:
+        req_schema = _build_exact_schema(merged_specs, required_canons)
+        try:
+            req_schema.validate(df[required_canons], lazy=False)
+        except SchemaErrors as err:
+            logger.error("Required column validation failed.")
             raise HardValidationError(
                 schema_errors=err.schema_errors,
                 failure_cases=err.failure_cases.to_dict(orient="records"),
             )
-        else:
-            if logging:
-                logging.info(f"missing_optional = {missing_optional}")
-            print("Optional column validation errors on: ", opt_failures)
-            return {
-                "validation_status": "passed_with_soft_errors",
-                "schemas": model_list,
-                "missing_optional": missing_optional,
-                "optional_validation_failures": opt_failures,
-                "failure_cases": err.failure_cases.to_dict(orient="records"),
-            }
-    if logging:
-        logging.info(f"missing_optional = {missing_optional}")
-    # 6) success (with possible soft misses)
+
+    opt_failures: List[str] = []
+    failure_cases_records: List[dict] = []
+    if optional_canons:
+        opt_schema = _build_exact_schema(merged_specs, optional_canons)
+        try:
+            opt_schema.validate(df[optional_canons], lazy=True)
+        except SchemaErrors as err:
+            # Columns are canonical already, so failure_cases['column'] are canonical names
+            opt_failures = sorted(set(err.failure_cases["column"]))
+            failure_cases_records = err.failure_cases.to_dict(orient="records")
+
+    logger.info("missing_optional = %s", missing_optional)
+
+    # Success (with potential soft issues)
+    if opt_failures or missing_optional or unknown_extra:
+        return {
+            "validation_status": "passed_with_soft_errors",
+            "schemas": model_list,
+            "missing_optional": missing_optional,
+            "optional_validation_failures": opt_failures,
+            "failure_cases": failure_cases_records,
+            "unknown_extra_columns": unknown_extra,
+        }
+
     return {
-        "validation_status": (
-            "passed_with_soft_errors" if missing_optional else "passed"
-        ),
+        "validation_status": "passed",
         "schemas": model_list,
-        "missing_optional": missing_optional,
+        "missing_optional": [],
+        "unknown_extra_columns": [],
     }