From 34d412d212ce7eea1f1e318d24a2f0e756ae0f93 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Tue, 2 Sep 2025 17:36:37 -0400
Subject: [PATCH 01/92] testing h2o pipeline on synthetic 2

---
 src/webapp/databricks.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 0f9612ec..40643c53 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -35,6 +35,7 @@
 
 # The name of the deployed pipeline in Databricks. Must match directly.
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
+PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline"
 
 
 class DatabricksInferenceRunRequest(BaseModel):
@@ -192,16 +193,21 @@ def run_pdp_inference(
 
         db_inst_name = databricksify_inst_name(req.inst_name)
 
+        if db_inst_name in ["synthetic_2", "synthetic_uni_2"]:
+            db_job_name = PDP_H2O_INFERENCE_JOB_NAME
+        else:
+            db_job_name = PDP_INFERENCE_JOB_NAME
+
         try:
-            job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None)
+            job = next(w.jobs.list(name=db_job_name), None)
             if not job or job.job_id is None:
                 raise ValueError(
-                    f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id."
+                    f"run_pdp_inference(): Job '{db_job_name}' was not found or has no job_id."
                 )
             job_id = job.job_id
-            LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}")
+            LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}")
         except Exception as e:
-            LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.")
+            LOGGER.exception(f"Job lookup failed for '{db_job_name}' and '{db_inst_name}.")
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
         try:

From 45888ae47c15a84740417873264d030f2d3162f4 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Tue, 2 Sep 2025 17:42:08 -0400
Subject: [PATCH 02/92] style

---
 src/webapp/databricks.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 40643c53..eaf7b679 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -207,7 +207,9 @@ def run_pdp_inference(
             job_id = job.job_id
             LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}")
         except Exception as e:
-            LOGGER.exception(f"Job lookup failed for '{db_job_name}' and '{db_inst_name}.")
+            LOGGER.exception(
+                f"Job lookup failed for '{db_job_name}' and '{db_inst_name}."
+            )
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
         try:

From 4963fa391b4b53578bb6f45b934e84e9029d34ee Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Tue, 2 Sep 2025 20:02:36 -0400
Subject: [PATCH 03/92] bcrypt dep issue

---
 pyproject.toml |  6 +++---
 uv.lock        | 11 ++++++++---
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 81c867a0..b6e45d94 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,8 +13,8 @@ dependencies = [
     "cloud-sql-python-connector[pymysql]~=1.14.0",
     "sqlalchemy~=2.0.36",
     "pyjwt~=2.10.1",
-    "passlib~=1.7.4",
-    "bcrypt~=4.2.0",
+    "passlib[bcrypt]>=1.7.4,<1.8",
+    "bcrypt>=4.0.1,<5",
     "pycryptodome~=3.20.0",
     "python-dotenv~=1.0.1",
     "strenum~=0.4.15",
@@ -28,7 +28,7 @@ dependencies = [
     "thefuzz[speedup]~=0.22.1",
     "databricks-sql-connector~=3.5.0",
     "pandera~=0.13",
-    "mlflow~=2.15.0"
+    "mlflow~=2.15.0",
 ]
 
 [project.urls]
diff --git a/uv.lock b/uv.lock
index 4c589d69..e4134cc3 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2464,6 +2464,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" },
 ]
 
+[package.optional-dependencies]
+bcrypt = [
+    { name = "bcrypt" },
+]
+
 [[package]]
 name = "pathspec"
 version = "0.12.1"
@@ -3718,7 +3723,7 @@ dependencies = [
     { name = "pandas" },
     { name = "pandera" },
     { name = "paramiko" },
-    { name = "passlib" },
+    { name = "passlib", extra = ["bcrypt"] },
     { name = "pycryptodome" },
     { name = "pydantic" },
     { name = "pyjwt" },
@@ -3747,7 +3752,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "bcrypt", specifier = "~=4.2.0" },
+    { name = "bcrypt", specifier = ">=4.0.1,<5" },
     { name = "cloud-sql-python-connector", extras = ["pymysql"], specifier = "~=1.14.0" },
     { name = "databricks-sdk", specifier = "~=0.38.0" },
     { name = "databricks-sql-connector", specifier = "~=3.5.0" },
@@ -3758,7 +3763,7 @@ requires-dist = [
     { name = "pandas", specifier = "~=2.0" },
     { name = "pandera", specifier = "~=0.13" },
     { name = "paramiko", specifier = "~=3.5.0" },
-    { name = "passlib", specifier = "~=1.7.4" },
+    { name = "passlib", extras = ["bcrypt"], specifier = ">=1.7.4,<1.8" },
     { name = "pycryptodome", specifier = "~=3.20.0" },
     { name = "pydantic", specifier = "~=2.10" },
     { name = "pyjwt", specifier = "~=2.10.1" },

From 767a3fdafda8e42017f7fad2f8d8e4c22537cab7 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Tue, 2 Sep 2025 20:33:34 -0400
Subject: [PATCH 04/92] reverting deps for now

---
 pyproject.toml |  4 ++--
 uv.lock        | 11 +++--------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index b6e45d94..ca8f7b0b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,8 +13,8 @@ dependencies = [
     "cloud-sql-python-connector[pymysql]~=1.14.0",
     "sqlalchemy~=2.0.36",
     "pyjwt~=2.10.1",
-    "passlib[bcrypt]>=1.7.4,<1.8",
-    "bcrypt>=4.0.1,<5",
+    "passlib~=1.7.4",
+    "bcrypt~=4.2.0",
     "pycryptodome~=3.20.0",
     "python-dotenv~=1.0.1",
     "strenum~=0.4.15",
diff --git a/uv.lock b/uv.lock
index e4134cc3..4c589d69 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2464,11 +2464,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" },
 ]
 
-[package.optional-dependencies]
-bcrypt = [
-    { name = "bcrypt" },
-]
-
 [[package]]
 name = "pathspec"
 version = "0.12.1"
@@ -3723,7 +3718,7 @@ dependencies = [
     { name = "pandas" },
     { name = "pandera" },
     { name = "paramiko" },
-    { name = "passlib", extra = ["bcrypt"] },
+    { name = "passlib" },
     { name = "pycryptodome" },
     { name = "pydantic" },
     { name = "pyjwt" },
@@ -3752,7 +3747,7 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "bcrypt", specifier = ">=4.0.1,<5" },
+    { name = "bcrypt", specifier = "~=4.2.0" },
     { name = "cloud-sql-python-connector", extras = ["pymysql"], specifier = "~=1.14.0" },
     { name = "databricks-sdk", specifier = "~=0.38.0" },
     { name = "databricks-sql-connector", specifier = "~=3.5.0" },
@@ -3763,7 +3758,7 @@ requires-dist = [
     { name = "pandas", specifier = "~=2.0" },
     { name = "pandera", specifier = "~=0.13" },
     { name = "paramiko", specifier = "~=3.5.0" },
-    { name = "passlib", extras = ["bcrypt"], specifier = ">=1.7.4,<1.8" },
+    { name = "passlib", specifier = "~=1.7.4" },
     { name = "pycryptodome", specifier = "~=3.20.0" },
     { name = "pydantic", specifier = "~=2.10" },
     { name = "pyjwt", specifier = "~=2.10.1" },

From cabb5c361e087da8c2c1f3fc861a241ac7f97189 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Wed, 3 Sep 2025 12:26:30 -0400
Subject: [PATCH 05/92] reverting changes with pyproject

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index ca8f7b0b..81c867a0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dependencies = [
     "thefuzz[speedup]~=0.22.1",
     "databricks-sql-connector~=3.5.0",
     "pandera~=0.13",
-    "mlflow~=2.15.0",
+    "mlflow~=2.15.0"
 ]
 
 [project.urls]

From 9fc7d85e748fc9ee55e93ea6557f7ab7e83fd493 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 11:47:09 -0500
Subject: [PATCH 06/92] changed FE inference and training endpoint args for
 better understanding

---
 src/webapp/routers/data.py | 56 +++++++++++++++++++-------------------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 36079908..c76db03e 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1276,14 +1276,14 @@ def get_upload_url(
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/top-features/{run_id}")
+@router.get("/{inst_id}/inference/top-features/{job_run_id}")
 def get_inference_top_features(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns data for a specific institution."""
+    """Returns top n features table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1308,7 +1308,7 @@ def get_inference_top_features(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_features_with_most_impact",
+            table_name=f"inference_{job_run_id}_features_with_most_impact",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1319,10 +1319,10 @@ def get_inference_top_features(
 
 
 # Get Box plot values
-@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}")
+@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}")
 def get_inference_feature_boxstats(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
     feature_name: Optional[str] = Query(
@@ -1355,7 +1355,7 @@ def get_inference_feature_boxstats(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_box_plot_table",
+            table_name=f"inference_{job_run_id}_box_plot_table",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
         if not feature_name:
@@ -1381,7 +1381,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
         if not filtered:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
-                detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.",
+                detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.",
             )
 
         return filtered
@@ -1392,14 +1392,14 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/support-overview/{run_id}")
+@router.get("/{inst_id}/inference/support-overview/{job_run_id}")
 def get_inference_support_overview(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns support score distribution table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1424,7 +1424,7 @@ def get_inference_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_support_overview",
+            table_name=f"inference_{job_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1434,14 +1434,14 @@ def get_inference_support_overview(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/inference/feature_importance/{run_id}")
+@router.get("/{inst_id}/inference/feature_importance/{job_run_id}")
 def get_inference_feature_importance(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns feature importance table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1466,7 +1466,7 @@ def get_inference_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_shap_feature_importance",
+            table_name=f"inference_{job_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1479,10 +1479,10 @@ def get_inference_feature_importance(
 ## FE Training Tables
 
 
-@router.get("/{inst_id}/training/feature_importance/{run_id}")
+@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}")
 def get_training_feature_importance(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1511,7 +1511,7 @@ def get_training_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_shap_feature_importance",
+            table_name=f"training_{experiment_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1521,10 +1521,10 @@ def get_training_feature_importance(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/confusion_matrix/{run_id}")
+@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}")
 def get_training_confusion_matrix(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1553,7 +1553,7 @@ def get_training_confusion_matrix(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_confusion_matrix",
+            table_name=f"training_{experiment_run_id}_confusion_matrix",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1563,10 +1563,10 @@ def get_training_confusion_matrix(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/roc_curve/{run_id}")
+@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}")
 def get_training_roc_curve(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1595,7 +1595,7 @@ def get_training_roc_curve(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_roc_curve",
+            table_name=f"training_{experiment_run_id}_roc_curve",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1605,10 +1605,10 @@ def get_training_roc_curve(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/support-overview/{run_id}")
+@router.get("/{inst_id}/training/support-overview/{experiment_run_id}")
 def get_training_support_overview(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1637,7 +1637,7 @@ def get_training_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_support_overview",
+            table_name=f"training_{experiment_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 

From 5b3fd3cdc34c4d26733f33c658864f3c7c011f18 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 11:48:52 -0500
Subject: [PATCH 07/92] changed FE inference and training endpoint args for
 better understanding

---
 src/webapp/routers/data.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index c76db03e..7bd34f2b 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1486,7 +1486,7 @@ def get_training_feature_importance(
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training feature importance table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1528,7 +1528,7 @@ def get_training_confusion_matrix(
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training confusion matrix table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1570,7 +1570,7 @@ def get_training_roc_curve(
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training roc curve table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)
@@ -1612,7 +1612,7 @@ def get_training_support_overview(
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
-    """Returns a signed URL for uploading data to a specific institution."""
+    """Returns training support overview table for a specific institution."""
     # raise error at this level instead bc otherwise it's getting wrapped as a 200
     has_access_to_inst_or_err(inst_id, current_user)
     local_session.set(sql_session)

From 99e6dc8efdc66ed519f803423e0ef828740c8e46 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:00:06 -0500
Subject: [PATCH 08/92] fixed course flags in filename inference

---
 src/webapp/routers/data.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 7bd34f2b..02a52819 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -16,6 +16,7 @@
 from ..config import databricks_vars, env_vars, gcs_vars
 import tempfile
 import pathlib
+import re
 
 from ..utilities import (
     has_access_to_inst_or_err,
@@ -995,6 +996,7 @@ def download_url_inst_file(
     )
 
 
+_AR_WORD = re.compile(r'(?<![A-Za-z0-9])ar(?![A-Za-z0-9])', re.IGNORECASE)
 def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]:
     name = os.path.basename(file_path).lower()
 
@@ -1007,16 +1009,14 @@ def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]
         inferred.add("SEMESTER")
     if "cohort" in name:
         inferred.add("STUDENT")
-    if "course" not in name and ("ar" in name or "deidentified" in name):
+    if "course" not in name and (_AR_WORD.search(name) or "deidentified" in name):
         inferred.add("STUDENT")
 
     if not inferred:
-        logging.error(
-            ValueError(
-                f"Could not infer model(s) from file name: {name}, filenames sould be descriptive of the kind of data it contains e.g. course, cohort"
-            )
+        raise ValueError(
+            f"Could not infer model(s) from file name: {name}. "
+            "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')."
         )
-        inferred.add("UNKNOWN")
 
     return sorted(inferred)
 

From 73ef358e0f543a995325b553a92bf450f55c170a Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:03:21 -0500
Subject: [PATCH 09/92] fixed course flags in filename inference

---
 src/webapp/routers/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 02a52819..0471c44f 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -997,7 +997,7 @@ def download_url_inst_file(
 
 
 _AR_WORD = re.compile(r'(?<![A-Za-z0-9])ar(?![A-Za-z0-9])', re.IGNORECASE)
-def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]:
+def infer_models_from_filename(file_path: str) -> List[str]:
     name = os.path.basename(file_path).lower()
 
     inferred = set()
@@ -1040,7 +1040,7 @@ def validation_helper(
 
     allowed_schemas = None
     if not allowed_schemas:
-        allowed_schemas = infer_models_from_filename(file_name, "pdp")
+        allowed_schemas = infer_models_from_filename(file_name)
 
     inferred_schemas: list[str] = []
     # ----------------------- Fetch base schema from DB -------------------------------

From ba7c04faa3cb9841cf2f5df29f70330354ec3818 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:06:09 -0500
Subject: [PATCH 10/92] fixed course flags in filename inference

---
 src/webapp/routers/data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 0471c44f..52b85536 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -996,7 +996,9 @@ def download_url_inst_file(
     )
 
 
-_AR_WORD = re.compile(r'(?<![A-Za-z0-9])ar(?![A-Za-z0-9])', re.IGNORECASE)
+_AR_WORD = re.compile(r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE)
+
+
 def infer_models_from_filename(file_path: str) -> List[str]:
     name = os.path.basename(file_path).lower()
 

From c37030f9f3fd553e7b210864db2c0d3d2e767088 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:35:54 -0500
Subject: [PATCH 11/92] fixed course flags in filename inference

---
 src/webapp/routers/data_test.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py
index d1cce3ee..9b1c1c31 100644
--- a/src/webapp/routers/data_test.py
+++ b/src/webapp/routers/data_test.py
@@ -586,11 +586,11 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_upload = client.post(
         "/institutions/"
         + uuid_to_str(USER_VALID_INST_UUID)
-        + "/input/validate-upload/file_name.csv",
+        + "/input/validate-upload/pdp_course_deidentified.csv",
     )
     assert response_upload.status_code == 200
-    assert response_upload.json()["name"] == "file_name.csv"
-    assert response_upload.json()["file_types"] == ["UNKNOWN"]
+    assert response_upload.json()["name"] == "pdp_course_deidentified.csv"
+    assert response_upload.json()["file_types"] == ["COURSE"]
     assert response_upload.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID)
     assert response_upload.json()["source"] == "MANUAL_UPLOAD"
 
@@ -598,7 +598,7 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_sftp = client.post(
         "/institutions/"
         + uuid_to_str(UUID_INVALID)
-        + "/input/validate-sftp/file_name.csv",
+        + "/input/validate-sftp/pdp_ar_deidentified.csv",
     )
     assert str(response_sftp) == "<Response [401 Unauthorized]>"
     assert (
@@ -609,11 +609,11 @@ def test_validate_success_batch(client: TestClient) -> None:
     response_sftp = client.post(
         "/institutions/"
         + uuid_to_str(USER_VALID_INST_UUID)
-        + "/input/validate-sftp/file_name.csv",
+        + "/input/validate-sftp/pdp_ar_deidentified.csv",
     )
     assert response_sftp.status_code == 200
-    assert response_sftp.json()["name"] == "file_name.csv"
-    assert response_sftp.json()["file_types"] == ["UNKNOWN"]
+    assert response_sftp.json()["name"] == "pdp_ar_deidentified.csv"
+    assert response_sftp.json()["file_types"] == ["STUDENT"]
     assert response_sftp.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID)
     assert response_sftp.json()["source"] == "PDP_SFTP"
 

From 44a73e67d591792a8399be75bbcf6c24155354b9 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:45:38 -0500
Subject: [PATCH 12/92] fixed course flags in filename inference

---
 src/webapp/routers/data.py | 50 +++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 52b85536..41e3a1ae 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1278,10 +1278,10 @@ def get_upload_url(
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/top-features/{job_run_id}")
+@router.get("/{inst_id}/inference/top-features/{run_id}")
 def get_inference_top_features(
     inst_id: str,
-    job_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1310,7 +1310,7 @@ def get_inference_top_features(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{job_run_id}_features_with_most_impact",
+            table_name=f"inference_{run_id}_features_with_most_impact",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1321,10 +1321,10 @@ def get_inference_top_features(
 
 
 # Get Box plot values
-@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}")
+@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}")
 def get_inference_feature_boxstats(
     inst_id: str,
-    job_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
     feature_name: Optional[str] = Query(
@@ -1357,7 +1357,7 @@ def get_inference_feature_boxstats(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{job_run_id}_box_plot_table",
+            table_name=f"inference_{run_id}_box_plot_table",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
         if not feature_name:
@@ -1383,7 +1383,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
         if not filtered:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
-                detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.",
+                detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.",
             )
 
         return filtered
@@ -1394,10 +1394,10 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/support-overview/{job_run_id}")
+@router.get("/{inst_id}/inference/support-overview/{run_id}")
 def get_inference_support_overview(
     inst_id: str,
-    job_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1426,7 +1426,7 @@ def get_inference_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{job_run_id}_support_overview",
+            table_name=f"inference_{run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1436,10 +1436,10 @@ def get_inference_support_overview(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/inference/feature_importance/{job_run_id}")
+@router.get("/{inst_id}/inference/feature_importance/{run_id}")
 def get_inference_feature_importance(
     inst_id: str,
-    job_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1468,7 +1468,7 @@ def get_inference_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{job_run_id}_shap_feature_importance",
+            table_name=f"inference_{run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1481,10 +1481,10 @@ def get_inference_feature_importance(
 ## FE Training Tables
 
 
-@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}")
+@router.get("/{inst_id}/training/feature_importance/{run_id}")
 def get_training_feature_importance(
     inst_id: str,
-    experiment_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1513,7 +1513,7 @@ def get_training_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{experiment_run_id}_shap_feature_importance",
+            table_name=f"training_{run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1523,10 +1523,10 @@ def get_training_feature_importance(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}")
+@router.get("/{inst_id}/training/confusion_matrix/{run_id}")
 def get_training_confusion_matrix(
     inst_id: str,
-    experiment_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1555,7 +1555,7 @@ def get_training_confusion_matrix(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{experiment_run_id}_confusion_matrix",
+            table_name=f"training_{run_id}_confusion_matrix",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1565,10 +1565,10 @@ def get_training_confusion_matrix(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}")
+@router.get("/{inst_id}/training/roc_curve/{run_id}")
 def get_training_roc_curve(
     inst_id: str,
-    experiment_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1597,7 +1597,7 @@ def get_training_roc_curve(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{experiment_run_id}_roc_curve",
+            table_name=f"training_{run_id}_roc_curve",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1607,10 +1607,10 @@ def get_training_roc_curve(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/support-overview/{experiment_run_id}")
+@router.get("/{inst_id}/training/support-overview/{run_id}")
 def get_training_support_overview(
     inst_id: str,
-    experiment_run_id: str,
+    run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1639,7 +1639,7 @@ def get_training_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{experiment_run_id}_support_overview",
+            table_name=f"training_{run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 

From 51118f1885cd2d65795feac8f3d3259ce67909df Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 14:51:50 -0500
Subject: [PATCH 13/92] changed FE inference and training endpoint args for
 better understanding

---
 src/webapp/routers/data.py | 50 +++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 41e3a1ae..52b85536 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1278,10 +1278,10 @@ def get_upload_url(
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/top-features/{run_id}")
+@router.get("/{inst_id}/inference/top-features/{job_run_id}")
 def get_inference_top_features(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1310,7 +1310,7 @@ def get_inference_top_features(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_features_with_most_impact",
+            table_name=f"inference_{job_run_id}_features_with_most_impact",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1321,10 +1321,10 @@ def get_inference_top_features(
 
 
 # Get Box plot values
-@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}")
+@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}")
 def get_inference_feature_boxstats(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
     feature_name: Optional[str] = Query(
@@ -1357,7 +1357,7 @@ def get_inference_feature_boxstats(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_box_plot_table",
+            table_name=f"inference_{job_run_id}_box_plot_table",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
         if not feature_name:
@@ -1383,7 +1383,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
         if not filtered:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
-                detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.",
+                detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.",
             )
 
         return filtered
@@ -1394,10 +1394,10 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]:
 
 
 # Get SHAP Values for Inference
-@router.get("/{inst_id}/inference/support-overview/{run_id}")
+@router.get("/{inst_id}/inference/support-overview/{job_run_id}")
 def get_inference_support_overview(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1426,7 +1426,7 @@ def get_inference_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_support_overview",
+            table_name=f"inference_{job_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1436,10 +1436,10 @@ def get_inference_support_overview(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/inference/feature_importance/{run_id}")
+@router.get("/{inst_id}/inference/feature_importance/{job_run_id}")
 def get_inference_feature_importance(
     inst_id: str,
-    run_id: str,
+    job_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1468,7 +1468,7 @@ def get_inference_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"inference_{run_id}_shap_feature_importance",
+            table_name=f"inference_{job_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1481,10 +1481,10 @@ def get_inference_feature_importance(
 ## FE Training Tables
 
 
-@router.get("/{inst_id}/training/feature_importance/{run_id}")
+@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}")
 def get_training_feature_importance(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1513,7 +1513,7 @@ def get_training_feature_importance(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_shap_feature_importance",
+            table_name=f"training_{experiment_run_id}_shap_feature_importance",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1523,10 +1523,10 @@ def get_training_feature_importance(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/confusion_matrix/{run_id}")
+@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}")
 def get_training_confusion_matrix(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1555,7 +1555,7 @@ def get_training_confusion_matrix(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_confusion_matrix",
+            table_name=f"training_{experiment_run_id}_confusion_matrix",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1565,10 +1565,10 @@ def get_training_confusion_matrix(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/roc_curve/{run_id}")
+@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}")
 def get_training_roc_curve(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1597,7 +1597,7 @@ def get_training_roc_curve(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_roc_curve",
+            table_name=f"training_{experiment_run_id}_roc_curve",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 
@@ -1607,10 +1607,10 @@ def get_training_roc_curve(
         raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve))
 
 
-@router.get("/{inst_id}/training/support-overview/{run_id}")
+@router.get("/{inst_id}/training/support-overview/{experiment_run_id}")
 def get_training_support_overview(
     inst_id: str,
-    run_id: str,
+    experiment_run_id: str,
     current_user: Annotated[BaseUser, Depends(get_current_active_user)],
     sql_session: Annotated[Session, Depends(get_session)],
 ) -> List[dict[str, Any]]:
@@ -1639,7 +1639,7 @@ def get_training_support_overview(
         rows = dbc.fetch_table_data(
             catalog_name=env_vars["CATALOG_NAME"],  # type: ignore
             inst_name=f"{query_result[0][0].name}",
-            table_name=f"training_{run_id}_support_overview",
+            table_name=f"training_{experiment_run_id}_support_overview",
             warehouse_id=env_vars["SQL_WAREHOUSE_ID"],  # type: ignore
         )
 

From 5e824f183b878b08f75a49f45a73ca64e95939da Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 17:17:42 -0500
Subject: [PATCH 14/92] patching validation.py

---
 src/webapp/gcsutil.py      |  4 +++-
 src/webapp/routers/data.py | 22 +++++++++++++++++++---
 src/webapp/validation.py   |  8 +++++---
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py
index b6046daa..44bef984 100644
--- a/src/webapp/gcsutil.py
+++ b/src/webapp/gcsutil.py
@@ -340,8 +340,10 @@ def validate_file(
                     f"If you see this file validation was successful {schems}"
                 )
         except Exception as e:
+            logging.exception("Validation failed for %s: %s", file_name, e)
             blob.delete()
-            raise e
+            raise
+        
         new_blob = bucket.blob(new_blob_name)
         if new_blob.exists():
             raise ValueError(new_blob_name + ": File already exists.")
diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 52b85536..7560bb9d 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -17,6 +17,7 @@
 import tempfile
 import pathlib
 import re
+from validation import HardValidationError
 
 from ..utilities import (
     has_access_to_inst_or_err,
@@ -1155,13 +1156,28 @@ def validation_helper(
         logging.debug(
             f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}"
         )
+    except HardValidationError as e:
+        logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e)
+        raise HTTPException(
+            status_code=status.HTTP_400_BAD_REQUEST,
+            detail={
+                "code": "VALIDATION_FAILED",
+                "message": "Schema validation failed.",
+                "missing_required": e.missing_required,
+                "extra_columns": e.extra_columns,
+                "schema_errors": e.schema_errors,
+                "failure_cases": e.failure_cases,
+            },
+        )
     except Exception as e:
         logging.debug(f"!!!!!!!!!!Inferred Schemas FAILED {e}")
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail="File type is not valid and/or not accepted by this institution: "
-            + str(e),
-        ) from e
+            detail={
+                "code": "VALIDATION_ERROR",
+                "message": str(e),
+            },
+        )
 
     existing_file = (
         local_session.get()
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 3f359aaf..dc9f3c82 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -220,14 +220,15 @@ def validate_dataset(
     ]
 
     # Hard-fail on missing required or any extra columns
-    if missing_required or extra_columns:
+    if missing_required:
         if logging:
             logging.error(
                 f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}"
             )
         raise HardValidationError(
-            missing_required=missing_required, extra_columns=extra_columns
+            missing_required=missing_required
         )
+    unknown_extra = extra_columns
 
     # 5) build Pandera schema & validate (hard-fail on any error)
     schema = build_schema(merged_specs)
@@ -273,8 +274,9 @@ def validate_dataset(
     # 6) success (with possible soft misses)
     return {
         "validation_status": (
-            "passed_with_soft_errors" if missing_optional else "passed"
+            "passed_with_soft_errors" if (missing_optional or unknown_extra) else "passed"
         ),
         "schemas": model_list,
         "missing_optional": missing_optional,
+        "unknown_extra_columns": unknown_extra,
     }

From 92ca1eb80140846496757998203f10ffba446638 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 17:21:08 -0500
Subject: [PATCH 15/92] fix import ish

---
 src/webapp/routers/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 7560bb9d..74d1dabd 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -17,7 +17,7 @@
 import tempfile
 import pathlib
 import re
-from validation import HardValidationError
+from ..validation import HardValidationError
 
 from ..utilities import (
     has_access_to_inst_or_err,

From 6f63a0ee25eddeafb967563dc8dde641aa5aa5c6 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 17:22:52 -0500
Subject: [PATCH 16/92] fix import ish

---
 src/webapp/gcsutil.py    | 2 +-
 src/webapp/validation.py | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py
index 44bef984..5e955ea5 100644
--- a/src/webapp/gcsutil.py
+++ b/src/webapp/gcsutil.py
@@ -343,7 +343,7 @@ def validate_file(
             logging.exception("Validation failed for %s: %s", file_name, e)
             blob.delete()
             raise
-        
+
         new_blob = bucket.blob(new_blob_name)
         if new_blob.exists():
             raise ValueError(new_blob_name + ": File already exists.")
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index dc9f3c82..452ae678 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -225,9 +225,7 @@ def validate_dataset(
             logging.error(
                 f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}"
             )
-        raise HardValidationError(
-            missing_required=missing_required
-        )
+        raise HardValidationError(missing_required=missing_required)
     unknown_extra = extra_columns
 
     # 5) build Pandera schema & validate (hard-fail on any error)
@@ -274,7 +272,9 @@ def validate_dataset(
     # 6) success (with possible soft misses)
     return {
         "validation_status": (
-            "passed_with_soft_errors" if (missing_optional or unknown_extra) else "passed"
+            "passed_with_soft_errors"
+            if (missing_optional or unknown_extra)
+            else "passed"
         ),
         "schemas": model_list,
         "missing_optional": missing_optional,

From 410b8ea0c93c96627802069328a30c72f68c0e85 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 17:46:13 -0500
Subject: [PATCH 17/92] fixed table read

---
 src/webapp/routers/data.py |  5 ++---
 src/webapp/validation.py   | 11 ++++++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 74d1dabd..399a375b 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1153,9 +1153,8 @@ def validation_helper(
             base_schema,
             updated_inst_schema,
         )
-        logging.debug(
-            f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}"
-        )
+        logging.debug("!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas))
+
     except HardValidationError as e:
         logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e)
         raise HTTPException(
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 452ae678..be111b08 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -162,7 +162,16 @@ def validate_dataset(
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
-    df = pd.read_csv(filename)
+    read_errs = []
+    for enc in ("utf-8", "utf-8-sig", "latin1"):
+        try:
+            df = pd.read_csv(filename, encoding=enc)
+            break
+        except UnicodeDecodeError as ex:
+            read_errs.append(f"{enc}: {ex}")
+    else:
+        raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs)
+    
     df = df.rename(columns={c: normalize_col(c) for c in df.columns})
     incoming = set(df.columns)
 

From 51cd2525a42459b9c3f2a654d471deb0cba17311 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 17:49:22 -0500
Subject: [PATCH 18/92] fixed table read

---
 src/webapp/routers/data.py | 4 +++-
 src/webapp/validation.py   | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 399a375b..8d481e59 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1153,7 +1153,9 @@ def validation_helper(
             base_schema,
             updated_inst_schema,
         )
-        logging.debug("!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas))
+        logging.debug(
+            "!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas)
+        )
 
     except HardValidationError as e:
         logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e)
diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index be111b08..b04dad58 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -171,7 +171,7 @@ def validate_dataset(
             read_errs.append(f"{enc}: {ex}")
     else:
         raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs)
-    
+
     df = df.rename(columns={c: normalize_col(c) for c in df.columns})
     incoming = set(df.columns)
 

From 4d1de4c5472317419c9a55c6b7cc39e99b8b4115 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 3 Sep 2025 18:04:00 -0500
Subject: [PATCH 19/92] fixed table read

---
 src/webapp/routers/data.py | 36 +++++++++++++++++++++++-------------
 1 file changed, 23 insertions(+), 13 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 8d481e59..69777abf 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1159,25 +1159,35 @@ def validation_helper(
 
     except HardValidationError as e:
         logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e)
+        # Build a single string - frontend can render this reliably
+        msg_parts = ["VALIDATION_FAILED"]
+        if e.missing_required:
+            msg_parts.append(f"missing_required={e.missing_required}")
+        if e.extra_columns:
+            msg_parts.append(f"extra_columns={e.extra_columns}")
+        if e.schema_errors is not None:
+            msg_parts.append(f"schema_errors={e.schema_errors}")
+        if e.failure_cases is not None:
+            # keep short; avoid dumping huge tables
+            try:
+                sample = (
+                    e.failure_cases[:5]
+                    if isinstance(e.failure_cases, list)
+                    else str(e.failure_cases)[:500]
+                )
+            except Exception:
+                sample = "see server logs"
+            msg_parts.append(f"failure_cases_sample={sample}")
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail={
-                "code": "VALIDATION_FAILED",
-                "message": "Schema validation failed.",
-                "missing_required": e.missing_required,
-                "extra_columns": e.extra_columns,
-                "schema_errors": e.schema_errors,
-                "failure_cases": e.failure_cases,
-            },
+            detail="; ".join(msg_parts),
         )
+
     except Exception as e:
-        logging.debug(f"!!!!!!!!!!Inferred Schemas FAILED {e}")
+        logging.debug("!!!!!!!!!!Inferred Schemas FAILED (other) %s", e)
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
-            detail={
-                "code": "VALIDATION_ERROR",
-                "message": str(e),
-            },
+            detail=f"VALIDATION_ERROR: {type(e).__name__}: {e}",
         )
 
     existing_file = (

From f786e05fcc071ab34aa8438bb80353abd28e226d Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 4 Sep 2025 11:23:37 -0500
Subject: [PATCH 20/92] fixed table read

---
 src/webapp/gcsutil.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py
index 5e955ea5..b267d9eb 100644
--- a/src/webapp/gcsutil.py
+++ b/src/webapp/gcsutil.py
@@ -341,7 +341,6 @@ def validate_file(
                 )
         except Exception as e:
             logging.exception("Validation failed for %s: %s", file_name, e)
-            blob.delete()
             raise
 
         new_blob = bucket.blob(new_blob_name)

From 182ef289afac8ce7f5441f40ebb5c984307d6ad9 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Thu, 4 Sep 2025 15:10:03 -0400
Subject: [PATCH 21/92] trying to test why pipeline isn't being found

---
 src/webapp/databricks.py | 72 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 64 insertions(+), 8 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index eaf7b679..7060f085 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -198,20 +198,76 @@ def run_pdp_inference(
         else:
             db_job_name = PDP_INFERENCE_JOB_NAME
 
+        # --- Resolve the Databricks Job by name, with diagnostics ---
         try:
-            job = next(w.jobs.list(name=db_job_name), None)
-            if not job or job.job_id is None:
+            # Helpful diagnostics about where we are and who we are
+            try:
+                me = w.current_user.me()
+                LOGGER.info("Databricks caller: user_name=%s, user_id=%s", getattr(me, "user_name", None), getattr(me, "id", None))
+            except Exception:
+                LOGGER.info("Could not resolve current user; continuing.")
+
+            host_url = databricks_vars["DATABRICKS_HOST_URL"]
+            LOGGER.info("Databricks host: %s", host_url)
+
+            # Gather visible jobs and log a small sample for troubleshooting
+            visible_jobs = list(w.jobs.list())  # materialize generator
+            LOGGER.info("Visible jobs count: %d", len(visible_jobs))
+
+            log_preview = []
+            for j in visible_jobs[:25]:
+                # In SDK, name commonly lives under settings.name
+                jname = getattr(getattr(j, "settings", None), "name", None)
+                jid = getattr(j, "job_id", None)
+                log_preview.append(f"{jid}:{jname}")
+            LOGGER.info("First up-to-25 visible jobs (id:name): %s", "; ".join(log_preview) if log_preview else "(none)")
+
+            # Try to find by name (exact, then case-insensitive, then prefix/close match)
+            def job_name(j) -> str:
+                return (getattr(getattr(j, "settings", None), "name", None) or "").strip()
+
+            target = db_job_name.strip()
+            candidates = [j for j in visible_jobs if job_name(j) == target]
+
+            if not candidates:
+                # Case-insensitive exact
+                candidates = [j for j in visible_jobs if job_name(j).lower() == target.lower()]
+
+            if not candidates:
+                # Prefix or contains
+                lowered = target.lower()
+                candidates = [j for j in visible_jobs if job_name(j).lower().startswith(lowered)]
+                if not candidates:
+                    candidates = [j for j in visible_jobs if lowered in job_name(j).lower()]
+
+            # If multiple, prefer exact case-insensitive match first; else first candidate
+            job_obj = candidates[0] if candidates else None
+
+            # If still not found, compute close matches to guide debugging
+            if not job_obj:
+                import difflib
+                names = [job_name(j) for j in visible_jobs]
+                close = difflib.get_close_matches(target, names, n=5, cutoff=0.6)
                 raise ValueError(
-                    f"run_pdp_inference(): Job '{db_job_name}' was not found or has no job_id."
+                    f"run_pdp_inference(): Job named '{db_job_name}' not found in workspace {host_url}. "
+                    f"Service principal may lack permissions, or the job name differs. "
+                    f"Close matches: {close}"
                 )
-            job_id = job.job_id
-            LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}")
+
+            job_id = getattr(job_obj, "job_id", None)
+            if not job_id:
+                raise ValueError(
+                    f"run_pdp_inference(): Found job '{job_name(job_obj)}' but it has no job_id. "
+                    "Check job visibility/permissions and that the SDK is returning full job metadata."
+                )
+
+            LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj))
+
         except Exception as e:
-            LOGGER.exception(
-                f"Job lookup failed for '{db_job_name}' and '{db_inst_name}."
-            )
+            LOGGER.exception("Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name)
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
+
         try:
             run_job: Any = w.jobs.run_now(
                 job_id,

From fe99c8dd3659b39a3510306a6bd42afba2338cb9 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Thu, 4 Sep 2025 15:13:19 -0400
Subject: [PATCH 22/92] black

---
 src/webapp/databricks.py | 33 +++++++++++++++++++++++++--------
 1 file changed, 25 insertions(+), 8 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 7060f085..bece8120 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -203,7 +203,11 @@ def run_pdp_inference(
             # Helpful diagnostics about where we are and who we are
             try:
                 me = w.current_user.me()
-                LOGGER.info("Databricks caller: user_name=%s, user_id=%s", getattr(me, "user_name", None), getattr(me, "id", None))
+                LOGGER.info(
+                    "Databricks caller: user_name=%s, user_id=%s",
+                    getattr(me, "user_name", None),
+                    getattr(me, "id", None),
+                )
             except Exception:
                 LOGGER.info("Could not resolve current user; continuing.")
 
@@ -220,25 +224,36 @@ def run_pdp_inference(
                 jname = getattr(getattr(j, "settings", None), "name", None)
                 jid = getattr(j, "job_id", None)
                 log_preview.append(f"{jid}:{jname}")
-            LOGGER.info("First up-to-25 visible jobs (id:name): %s", "; ".join(log_preview) if log_preview else "(none)")
+            LOGGER.info(
+                "First up-to-25 visible jobs (id:name): %s",
+                "; ".join(log_preview) if log_preview else "(none)",
+            )
 
             # Try to find by name (exact, then case-insensitive, then prefix/close match)
             def job_name(j) -> str:
-                return (getattr(getattr(j, "settings", None), "name", None) or "").strip()
+                return (
+                    getattr(getattr(j, "settings", None), "name", None) or ""
+                ).strip()
 
             target = db_job_name.strip()
             candidates = [j for j in visible_jobs if job_name(j) == target]
 
             if not candidates:
                 # Case-insensitive exact
-                candidates = [j for j in visible_jobs if job_name(j).lower() == target.lower()]
+                candidates = [
+                    j for j in visible_jobs if job_name(j).lower() == target.lower()
+                ]
 
             if not candidates:
                 # Prefix or contains
                 lowered = target.lower()
-                candidates = [j for j in visible_jobs if job_name(j).lower().startswith(lowered)]
+                candidates = [
+                    j for j in visible_jobs if job_name(j).lower().startswith(lowered)
+                ]
                 if not candidates:
-                    candidates = [j for j in visible_jobs if lowered in job_name(j).lower()]
+                    candidates = [
+                        j for j in visible_jobs if lowered in job_name(j).lower()
+                    ]
 
             # If multiple, prefer exact case-insensitive match first; else first candidate
             job_obj = candidates[0] if candidates else None
@@ -246,6 +261,7 @@ def job_name(j) -> str:
             # If still not found, compute close matches to guide debugging
             if not job_obj:
                 import difflib
+
                 names = [job_name(j) for j in visible_jobs]
                 close = difflib.get_close_matches(target, names, n=5, cutoff=0.6)
                 raise ValueError(
@@ -264,10 +280,11 @@ def job_name(j) -> str:
             LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj))
 
         except Exception as e:
-            LOGGER.exception("Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name)
+            LOGGER.exception(
+                "Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name
+            )
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
-
         try:
             run_job: Any = w.jobs.run_now(
                 job_id,

From 23e5fb0ecf8f44923c28083a314519b7a140c602 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Thu, 4 Sep 2025 15:21:40 -0400
Subject: [PATCH 23/92] type check

---
 src/webapp/databricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index bece8120..cd89cf2c 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -230,7 +230,7 @@ def run_pdp_inference(
             )
 
             # Try to find by name (exact, then case-insensitive, then prefix/close match)
-            def job_name(j) -> str:
+            def job_name(j: Any) -> str:
                 return (
                     getattr(getattr(j, "settings", None), "name", None) or ""
                 ).strip()

From 7d2dc9130a234b100ba2e3d37a12f220b2d10db6 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 01:42:53 -0500
Subject: [PATCH 24/92] added framework column to cloud sql with default
 sklearn

---
 src/webapp/database.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/webapp/database.py b/src/webapp/database.py
index 7fe974b0..da1fc9a5 100644
--- a/src/webapp/database.py
+++ b/src/webapp/database.py
@@ -511,6 +511,9 @@ class ModelTable(Base):
     )
     # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version.
     version: Mapped[int] = mapped_column(Integer, default=0)
+    framework: Mapped[str | None] = mapped_column(
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn'
+    )
 
     # Within a given institution, there should be no duplicated model names.
     __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),)

From 705dbaf1537b670eea4b6dfb684ec2fb797338ac Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 01:47:53 -0500
Subject: [PATCH 25/92] defined acceptance criteria from FE

---
 src/webapp/routers/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index cb7949f6..88ee733c 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -215,6 +215,7 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
+            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn"
         )
         local_session.get().add(model)
         local_session.get().commit()

From e510cacab4af06a2ba33e67cdb92b116ca24646b Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 07:34:57 -0500
Subject: [PATCH 26/92] reverted databricks to original file

---
 src/webapp/databricks.py     | 95 +++---------------------------------
 src/webapp/routers/models.py |  3 +-
 2 files changed, 9 insertions(+), 89 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index cd89cf2c..592bc852 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -35,7 +35,6 @@
 
 # The name of the deployed pipeline in Databricks. Must match directly.
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
-PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline"
 
 
 class DatabricksInferenceRunRequest(BaseModel):
@@ -193,96 +192,16 @@ def run_pdp_inference(
 
         db_inst_name = databricksify_inst_name(req.inst_name)
 
-        if db_inst_name in ["synthetic_2", "synthetic_uni_2"]:
-            db_job_name = PDP_H2O_INFERENCE_JOB_NAME
-        else:
-            db_job_name = PDP_INFERENCE_JOB_NAME
-
-        # --- Resolve the Databricks Job by name, with diagnostics ---
         try:
-            # Helpful diagnostics about where we are and who we are
-            try:
-                me = w.current_user.me()
-                LOGGER.info(
-                    "Databricks caller: user_name=%s, user_id=%s",
-                    getattr(me, "user_name", None),
-                    getattr(me, "id", None),
-                )
-            except Exception:
-                LOGGER.info("Could not resolve current user; continuing.")
-
-            host_url = databricks_vars["DATABRICKS_HOST_URL"]
-            LOGGER.info("Databricks host: %s", host_url)
-
-            # Gather visible jobs and log a small sample for troubleshooting
-            visible_jobs = list(w.jobs.list())  # materialize generator
-            LOGGER.info("Visible jobs count: %d", len(visible_jobs))
-
-            log_preview = []
-            for j in visible_jobs[:25]:
-                # In SDK, name commonly lives under settings.name
-                jname = getattr(getattr(j, "settings", None), "name", None)
-                jid = getattr(j, "job_id", None)
-                log_preview.append(f"{jid}:{jname}")
-            LOGGER.info(
-                "First up-to-25 visible jobs (id:name): %s",
-                "; ".join(log_preview) if log_preview else "(none)",
-            )
-
-            # Try to find by name (exact, then case-insensitive, then prefix/close match)
-            def job_name(j: Any) -> str:
-                return (
-                    getattr(getattr(j, "settings", None), "name", None) or ""
-                ).strip()
-
-            target = db_job_name.strip()
-            candidates = [j for j in visible_jobs if job_name(j) == target]
-
-            if not candidates:
-                # Case-insensitive exact
-                candidates = [
-                    j for j in visible_jobs if job_name(j).lower() == target.lower()
-                ]
-
-            if not candidates:
-                # Prefix or contains
-                lowered = target.lower()
-                candidates = [
-                    j for j in visible_jobs if job_name(j).lower().startswith(lowered)
-                ]
-                if not candidates:
-                    candidates = [
-                        j for j in visible_jobs if lowered in job_name(j).lower()
-                    ]
-
-            # If multiple, prefer exact case-insensitive match first; else first candidate
-            job_obj = candidates[0] if candidates else None
-
-            # If still not found, compute close matches to guide debugging
-            if not job_obj:
-                import difflib
-
-                names = [job_name(j) for j in visible_jobs]
-                close = difflib.get_close_matches(target, names, n=5, cutoff=0.6)
+            job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None)
+            if not job or job.job_id is None:
                 raise ValueError(
-                    f"run_pdp_inference(): Job named '{db_job_name}' not found in workspace {host_url}. "
-                    f"Service principal may lack permissions, or the job name differs. "
-                    f"Close matches: {close}"
+                    f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id."
                 )
-
-            job_id = getattr(job_obj, "job_id", None)
-            if not job_id:
-                raise ValueError(
-                    f"run_pdp_inference(): Found job '{job_name(job_obj)}' but it has no job_id. "
-                    "Check job visibility/permissions and that the SDK is returning full job metadata."
-                )
-
-            LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj))
-
+            job_id = job.job_id
+            LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}")
         except Exception as e:
-            LOGGER.exception(
-                "Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name
-            )
+            LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.")
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
         try:
@@ -633,4 +552,4 @@ def create_custom_schema_extension(
             existing_extension=extension_schema,  # may be None
         )
 
-        return updated_extension
+        return updated_extension
\ No newline at end of file
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 88ee733c..3ff8df4a 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -215,7 +215,7 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
-            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn"
+            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn",
         )
         local_session.get().add(model)
         local_session.get().commit()
@@ -253,6 +253,7 @@ def create_model(
         "created_by": uuid_to_str(query_result[0][0].created_by),
         "deleted": query_result[0][0].deleted,
         "valid": query_result[0][0].valid,
+        "framework": query_result[0][0].framework,
     }
 
 

From a2275925f94f92c5928e5244f07f73214c4cdae8 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 07:49:17 -0500
Subject: [PATCH 27/92] feat: added databricks framework layer

---
 src/webapp/databricks.py     | 4 ++--
 src/webapp/routers/models.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 592bc852..9a39634f 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -44,10 +44,10 @@ class DatabricksInferenceRunRequest(BaseModel):
     # Note that the following should be the filepath.
     filepath_to_type: dict[str, list[SchemaType]]
     model_name: str
-    model_type: str = "sklearn"
     # The email where notifications will get sent.
     email: str
     gcp_external_bucket_name: str
+    framework: str
 
 
 class DatabricksInferenceRunResponse(BaseModel):
@@ -220,8 +220,8 @@ def run_pdp_inference(
                     ],  # is this value the same PER environ? dev/staging/prod
                     "gcp_bucket_name": req.gcp_external_bucket_name,
                     "model_name": req.model_name,
-                    "model_type": req.model_type,
                     "notification_email": req.email,
+                    "framework": req.framework,
                 },
             )
             LOGGER.info(
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 3ff8df4a..308b9108 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -301,6 +301,7 @@ def read_inst_model(
         "created_by": uuid_to_str(query_result[0][0].created_by),
         "deleted": query_result[0][0].deleted,
         "valid": query_result[0][0].valid,
+        "framework": query_result[0][0].framework,
     }
 
 
@@ -549,6 +550,7 @@ def trigger_inference_run(
         gcp_external_bucket_name=get_external_bucket_name(inst_id),
         # The institution email to which pipeline success/failure notifications will get sent.
         email=current_user.email,
+        framework=query_result[0][0].framework,
     )
     try:
         res = databricks_control.run_pdp_inference(db_req)

From 27a5eee78ea408b742cd1a9ebdbdb75a1224bb97 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 09:54:58 -0500
Subject: [PATCH 28/92] added framework param to job

---
 src/webapp/database.py       | 4 +++-
 src/webapp/routers/models.py | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/webapp/database.py b/src/webapp/database.py
index da1fc9a5..2862e6c6 100644
--- a/src/webapp/database.py
+++ b/src/webapp/database.py
@@ -551,7 +551,9 @@ class JobTable(Base):
         String(VAR_CHAR_STANDARD_LENGTH), nullable=True
     )
     completed: Mapped[bool] = mapped_column(nullable=True)
-
+    framework: Mapped[str | None] = mapped_column(
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn'
+    )
 
 class DocType(enum.Enum):
     base = "base"
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 308b9108..40f28135 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -569,6 +569,7 @@ def trigger_inference_run(
         batch_name=req.batch_name,
         model_id=query_result[0][0].id,
         output_valid=False,
+        framework=query_result[0][0].framework,
     )
     local_session.get().add(job)
     return {
@@ -579,4 +580,5 @@ def trigger_inference_run(
         "triggered_at": triggered_timestamp,
         "batch_name": req.batch_name,
         "output_valid": False,
+        "framework": query_result[0][0].framework,
     }

From 166b32926178008dd124641a760573ab29e5db37 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 09:59:56 -0500
Subject: [PATCH 29/92] added case block to job run

---
 src/webapp/databricks.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 9a39634f..f52fd3a2 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -35,7 +35,7 @@
 
 # The name of the deployed pipeline in Databricks. Must match directly.
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
-
+PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline"
 
 class DatabricksInferenceRunRequest(BaseModel):
     """Databricks parameters for an inference run."""
@@ -192,8 +192,16 @@ def run_pdp_inference(
 
         db_inst_name = databricksify_inst_name(req.inst_name)
 
+        if req.framework == "sklearn":
+            pipeline_type = PDP_INFERENCE_JOB_NAME
+        elif req.framework == "h20":
+            pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
+        else:
+            raise ValueError(
+                f"Invalid model framework assigned to institution model"
+            )
         try:
-            job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None)
+            job = next(w.jobs.list(name=pipeline_type), None)
             if not job or job.job_id is None:
                 raise ValueError(
                     f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id."

From f45b00cbd83c1a152db90783a3b5b06b7224be35 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 10:08:47 -0500
Subject: [PATCH 30/92] added case block to job run

---
 src/webapp/databricks.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index f52fd3a2..b2d94361 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -44,10 +44,10 @@ class DatabricksInferenceRunRequest(BaseModel):
     # Note that the following should be the filepath.
     filepath_to_type: dict[str, list[SchemaType]]
     model_name: str
+    model_type: str
     # The email where notifications will get sent.
     email: str
     gcp_external_bucket_name: str
-    framework: str
 
 
 class DatabricksInferenceRunResponse(BaseModel):
@@ -204,12 +204,12 @@ def run_pdp_inference(
             job = next(w.jobs.list(name=pipeline_type), None)
             if not job or job.job_id is None:
                 raise ValueError(
-                    f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id."
+                    f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id."
                 )
             job_id = job.job_id
-            LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}")
+            LOGGER.info(f"Resolved job ID for '{pipeline_type}': {job_id}")
         except Exception as e:
-            LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.")
+            LOGGER.exception(f"Job lookup failed for '{pipeline_type}'.")
             raise ValueError(f"run_pdp_inference(): Failed to find job: {e}")
 
         try:
@@ -228,8 +228,8 @@ def run_pdp_inference(
                     ],  # is this value the same PER environ? dev/staging/prod
                     "gcp_bucket_name": req.gcp_external_bucket_name,
                     "model_name": req.model_name,
+                    "model_type": req.framework,
                     "notification_email": req.email,
-                    "framework": req.framework,
                 },
             )
             LOGGER.info(

From 4d4bf6843699943cb57c3d6ae1593c6ea6ddd08c Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 10:13:25 -0500
Subject: [PATCH 31/92] added case block to job run

---
 src/webapp/databricks.py     | 6 +++---
 src/webapp/routers/models.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index b2d94361..252ffdb8 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -192,9 +192,9 @@ def run_pdp_inference(
 
         db_inst_name = databricksify_inst_name(req.inst_name)
 
-        if req.framework == "sklearn":
+        if req.model_type == "sklearn":
             pipeline_type = PDP_INFERENCE_JOB_NAME
-        elif req.framework == "h20":
+        elif req.model_type == "h20":
             pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
         else:
             raise ValueError(
@@ -228,7 +228,7 @@ def run_pdp_inference(
                     ],  # is this value the same PER environ? dev/staging/prod
                     "gcp_bucket_name": req.gcp_external_bucket_name,
                     "model_name": req.model_name,
-                    "model_type": req.framework,
+                    "model_type": req.model_type,
                     "notification_email": req.email,
                 },
             )
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 40f28135..9b464f67 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -569,7 +569,7 @@ def trigger_inference_run(
         batch_name=req.batch_name,
         model_id=query_result[0][0].id,
         output_valid=False,
-        framework=query_result[0][0].framework,
+        model_type=query_result[0][0].framework,
     )
     local_session.get().add(job)
     return {

From ae912c21b9da127dd42a4ab8e4410ab1fc8644e1 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 10:17:43 -0500
Subject: [PATCH 32/92] fix linting and test

---
 src/webapp/databricks.py          | 2 +-
 src/webapp/routers/models_test.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 252ffdb8..6cfb0bda 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -198,7 +198,7 @@ def run_pdp_inference(
             pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
         else:
             raise ValueError(
-                f"Invalid model framework assigned to institution model"
+                "Invalid model framework assigned to institution model"
             )
         try:
             job = next(w.jobs.list(name=pipeline_type), None)
diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 8643f98b..8d39a925 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -152,6 +152,7 @@ def session_fixture():
             ]
         ),
         valid=True,
+        framework="sklearn",
     )
     run_1 = JobTable(
         id=RUN_ID,
@@ -161,6 +162,7 @@ def session_fixture():
         completed=True,
         output_filename="file_output_one",
         created_by=created_by_UUID,
+        framework="sklearn",
     )
     try:
         with sqlalchemy.orm.Session(engine) as session:

From 18708f2c5b7771016c53ed858200205336795e53 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 10:20:32 -0500
Subject: [PATCH 33/92] fix linting and test

---
 src/webapp/databricks.py     | 2 +-
 src/webapp/routers/models.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 6cfb0bda..2b553953 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -194,7 +194,7 @@ def run_pdp_inference(
 
         if req.model_type == "sklearn":
             pipeline_type = PDP_INFERENCE_JOB_NAME
-        elif req.model_type == "h20":
+        elif req.model_type == "h2o":
             pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
         else:
             raise ValueError(
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 9b464f67..fd863e4c 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -215,7 +215,7 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
-            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn",
+            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h2o"} else "sklearn",
         )
         local_session.get().add(model)
         local_session.get().commit()

From cafcb107964394e035ce4aefefb4b14e496745c5 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 10:21:24 -0500
Subject: [PATCH 34/92] fix linting and test

---
 src/webapp/routers/models.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index fd863e4c..1d7c7422 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -96,6 +96,7 @@ class ModelCreationRequest(BaseModel):
     # valid = False, means the model is not ready for use.
     valid: bool = False
     schema_configs: list[list[SchemaConfigObj]]
+    framework: str
 
 
 class ModelInfo(BaseModel):

From be4cbdae803f716f0f51c91f2131f1047366730b Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:08:58 -0500
Subject: [PATCH 35/92] fix linting and test

---
 src/webapp/routers/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 1d7c7422..69905cd9 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -551,7 +551,7 @@ def trigger_inference_run(
         gcp_external_bucket_name=get_external_bucket_name(inst_id),
         # The institution email to which pipeline success/failure notifications will get sent.
         email=current_user.email,
-        framework=query_result[0][0].framework,
+        model_type=query_result[0][0].framework,
     )
     try:
         res = databricks_control.run_pdp_inference(db_req)

From abcd80149e58436b2c13218c758a24c5bfd6148f Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:10:20 -0500
Subject: [PATCH 36/92] fix linting and test

---
 src/webapp/routers/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 69905cd9..b97cab56 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -570,7 +570,7 @@ def trigger_inference_run(
         batch_name=req.batch_name,
         model_id=query_result[0][0].id,
         output_valid=False,
-        model_type=query_result[0][0].framework,
+        framework=query_result[0][0].framework,
     )
     local_session.get().add(job)
     return {

From f14297615deed824124b5dd1351cd9f6ae2042a2 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:12:59 -0500
Subject: [PATCH 37/92] fix linting and test

---
 src/webapp/routers/models_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 8d39a925..8c828b8e 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -338,6 +338,7 @@ def test_create_model(client: TestClient):
         json={
             "name": "my_model",
             "schema_configs": [[schema_config_1, schema_config_2]],
+            "framework": "h2o",
         },
     )
 

From 9c4944311ea8ccb5c8a3b26886b5cb9cdb026dee Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:16:54 -0500
Subject: [PATCH 38/92] fix linting and test

---
 src/webapp/database.py       | 5 +++--
 src/webapp/databricks.py     | 7 +++----
 src/webapp/routers/models.py | 4 +++-
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/webapp/database.py b/src/webapp/database.py
index 2862e6c6..7c06d74d 100644
--- a/src/webapp/database.py
+++ b/src/webapp/database.py
@@ -512,7 +512,7 @@ class ModelTable(Base):
     # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version.
     version: Mapped[int] = mapped_column(Integer, default=0)
     framework: Mapped[str | None] = mapped_column(
-        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn'
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn"
     )
 
     # Within a given institution, there should be no duplicated model names.
@@ -552,9 +552,10 @@ class JobTable(Base):
     )
     completed: Mapped[bool] = mapped_column(nullable=True)
     framework: Mapped[str | None] = mapped_column(
-        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn'
+        String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn"
     )
 
+
 class DocType(enum.Enum):
     base = "base"
     extension = "extension"
diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 2b553953..80ab290e 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -37,6 +37,7 @@
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
 PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline"
 
+
 class DatabricksInferenceRunRequest(BaseModel):
     """Databricks parameters for an inference run."""
 
@@ -197,9 +198,7 @@ def run_pdp_inference(
         elif req.model_type == "h2o":
             pipeline_type = PDP_H2O_INFERENCE_JOB_NAME
         else:
-            raise ValueError(
-                "Invalid model framework assigned to institution model"
-            )
+            raise ValueError("Invalid model framework assigned to institution model")
         try:
             job = next(w.jobs.list(name=pipeline_type), None)
             if not job or job.job_id is None:
@@ -560,4 +559,4 @@ def create_custom_schema_extension(
             existing_extension=extension_schema,  # may be None
         )
 
-        return updated_extension
\ No newline at end of file
+        return updated_extension
diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index b97cab56..876f557b 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -216,7 +216,9 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
-            framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h2o"} else "sklearn",
+            framework=f
+            if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"}
+            else "sklearn",
         )
         local_session.get().add(model)
         local_session.get().commit()

From 00065c76e37352972eb1c5de917dfae4cef7e55a Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:23:06 -0500
Subject: [PATCH 39/92] fix TYPECHECK

---
 src/webapp/utilities.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index 460d4e1d..c3350a22 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -2,7 +2,7 @@
 
 import uuid
 import re
-from typing import Annotated, Final, Any
+from typing import Annotated, Final, Any, Optional
 from urllib.parse import unquote
 from strenum import StrEnum  # needed for python pre 3.11
 import jwt
@@ -394,7 +394,7 @@ def uuid_to_str(uuid_val: uuid.UUID) -> str:
     return uuid_val.hex
 
 
-def str_to_uuid(hex_str: str) -> uuid.UUID:
+def str_to_uuid(hex_str: Optional[str]) -> uuid.UUID:
     """Convert str to UUID obj (database needs UUID obj)."""
     return uuid.UUID(hex_str)
 

From 0badfd351568043412e8a531a1847e84f5e23be4 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 11:29:48 -0500
Subject: [PATCH 40/92] fix TYPECHECK

---
 src/webapp/utilities.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index c3350a22..ee4617c6 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -163,7 +163,7 @@ class BaseUser(BaseModel):
     disabled: bool | None = None
 
     # Constructor
-    def __init__(self, usr: str | None, inst: str, access: str, email: str) -> None:
+    def __init__(self, usr: str | None, inst: str | None, access: str | None, email: str | None) -> None:
         super().__init__(user_id=usr, institution=inst, access_type=access, email=email)
 
     def is_datakinder(self) -> Any:
@@ -182,7 +182,7 @@ def is_viewer(self) -> Any:
         """Whether a given user is a viewer."""
         return self.access_type and self.access_type == AccessType.VIEWER
 
-    def has_access_to_inst(self, inst: str) -> Any:
+    def has_access_to_inst(self, inst: str | None) -> Any:
         """Whether a given user has access to a given institution."""
         return self.access_type and (
             self.access_type == AccessType.DATAKINDER or self.institution == inst
@@ -219,7 +219,7 @@ def get_user(sess: Session, username: str) -> BaseUser:
     """Get user from a given username."""
     if username == "api_key_initial":
         return BaseUser(
-            usr=env_vars["INITIAL_API_KEY_ID"],
+            usr=str(env_vars["INITIAL_API_KEY_ID"]),
             inst=None,
             access="DATAKINDER",
             email="api_key_initial",
@@ -260,7 +260,7 @@ def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser:
     # Check if it's the initial API key. This doesn't have enduser or inst.
     if key == env_vars["INITIAL_API_KEY"]:
         return BaseUser(
-            usr=env_vars["INITIAL_API_KEY_ID"],
+            usr=str(env_vars["INITIAL_API_KEY_ID"]),
             inst=None,
             access="DATAKINDER",
             email="api_key_initial",

From fecef5aaba6c8280726c235872a062853e27ce1a Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 12:46:16 -0500
Subject: [PATCH 41/92] fix: alllllllllll the typecheck issues

---
 src/webapp/routers/models.py      |   6 +-
 src/webapp/routers/models_test.py | 120 +++++++++++++++---------------
 src/webapp/utilities.py           |  38 +++++-----
 3 files changed, 84 insertions(+), 80 deletions(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 876f557b..5b13003d 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -1,7 +1,7 @@
 """API functions related to models."""
 
 from datetime import datetime
-from typing import Annotated, Any
+from typing import Annotated, Any, cast
 import jsonpickle
 from fastapi import APIRouter, Depends, HTTPException, status
 from pydantic import BaseModel
@@ -60,7 +60,7 @@ def check_file_types_valid_schema_configs(
     """Check that a list of files are valid for a given schema configuration."""
     for config in valid_schema_configs:
         found = True
-        map_file_to_schema_config_obj = {}
+        map_file_to_schema_config_obj: dict= {}
         for idx, s in enumerate(file_types):
             for c in config:
                 if c.schema_type in s:
@@ -552,7 +552,7 @@ def trigger_inference_run(
         model_name=model_name,
         gcp_external_bucket_name=get_external_bucket_name(inst_id),
         # The institution email to which pipeline success/failure notifications will get sent.
-        email=current_user.email,
+        email=cast(str, current_user.email),
         model_type=query_result[0][0].framework,
     )
     try:
diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 8c828b8e..cf565b90 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -2,6 +2,7 @@
 
 import uuid
 from unittest import mock
+from typing import Any
 import pytest
 import jsonpickle
 from fastapi.testclient import TestClient
@@ -50,32 +51,32 @@
 
 
 # TODO plumb through schema configs
-def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo):
+def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo) -> bool:
     """Check ModelInfo equality without order."""
     if (
-        a_elem["inst_id"] != b_elem["inst_id"]
-        or a_elem["name"] != b_elem["name"]
-        or a_elem["m_id"] != b_elem["m_id"]
-        or a_elem["valid"] != b_elem["valid"]
-        or a_elem["deleted"] != b_elem["deleted"]
+        a_elem.inst_id != b_elem.inst_id
+        or a_elem.name != b_elem.name
+        or a_elem.m_id != b_elem.m_id
+        or a_elem.valid != b_elem.valid
+        or a_elem.deleted != b_elem.deleted
     ):
         return False
     return True
 
 
-def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo):
+def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo) -> bool:
     """Check RunInfo equality without order."""
     if (
-        a_elem["inst_id"] != b_elem["inst_id"]
-        or a_elem["m_name"] != b_elem["m_name"]
-        or a_elem["run_id"] != b_elem["run_id"]
-        or a_elem["created_by"] != b_elem["created_by"]
-        or a_elem["triggered_at"] != b_elem["triggered_at"]
-        or a_elem["output_filename"] != b_elem["output_filename"]
-        or a_elem["output_valid"] != b_elem["output_valid"]
-        or a_elem["err_msg"] != b_elem["err_msg"]
-        or a_elem["batch_name"] != b_elem["batch_name"]
-        or a_elem["completed"] != b_elem["completed"]
+        a_elem.inst_id != b_elem.inst_id
+        or a_elem.m_name != b_elem.m_name
+        or a_elem.run_id != b_elem.run_id
+        or a_elem.created_by != b_elem.created_by
+        or a_elem.triggered_at != b_elem.triggered_at
+        or a_elem.output_filename != b_elem.output_filename
+        or a_elem.output_valid != b_elem.output_valid
+        or a_elem.err_msg != b_elem.err_msg
+        or a_elem.batch_name != b_elem.batch_name
+        or a_elem.completed != b_elem.completed
     ):
         return False
     return True
@@ -200,7 +201,7 @@ def session_fixture():
 
 
 @pytest.fixture(name="client")
-def client_fixture(session: sqlalchemy.orm.Session):
+def client_fixture(session: sqlalchemy.orm.Session) -> Any:
     """Unit test mocks setup."""
 
     def get_session_override():
@@ -226,26 +227,25 @@ def databricks_control_override():
     app.dependency_overrides.clear()
 
 
-def test_read_inst_models(client: TestClient):
+def test_read_inst_models(client: TestClient) -> None:
     """Test GET /institutions/345/models."""
     response = client.get(
         "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/models"
     )
     assert response.status_code == 200
     assert same_model_orderless(
-        response.json()[0],
-        {
-            "created_by": "",
-            "deleted": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_id": "e4862c62829440d8ab4c9c298f02f619",
-            "name": "sample_model_for_school_1",
-            "valid": True,
-        },
+        ModelInfo(**response.json()[0]),
+        ModelInfo(
+            m_id="e4862c62829440d8ab4c9c298f02f619",
+            name= "sample_model_for_school_1",
+            inst_id= "1d7c75c33eda42949c6675ea8af97b55",
+            deleted= None,
+            valid= True,
+        ),
     )
 
 
-def test_read_inst_model(client: TestClient):
+def test_read_inst_model(client: TestClient) -> None:
     """Test GET /institutions/345/models/10. For various user access types."""
     # Unauthorized cases.
     response_unauth = client.get(
@@ -266,10 +266,17 @@ def test_read_inst_model(client: TestClient):
         + "/models/sample_model_for_school_1"
     )
     assert response.status_code == 200
-    assert same_model_orderless(response.json(), MODEL_OBJ)
+    assert same_model_orderless(response.json(),
+        ModelInfo(
+            deleted= None,
+            inst_id= "1d7c75c33eda42949c6675ea8af97b55",
+            m_id="e4862c62829440d8ab4c9c298f02f619",
+            name="sample_model_for_school_1",
+            valid=True,
+        ))
 
 
-def test_read_inst_model_outputs(client: TestClient):
+def test_read_inst_model_outputs(client: TestClient) -> None:
     """Test GET /institutions/345/models/10/output."""
     MOCK_STORAGE.list_blobs_in_folder.return_value = []
     # Authorized.
@@ -281,22 +288,20 @@ def test_read_inst_model_outputs(client: TestClient):
     assert response.status_code == 200
     assert same_run_info_orderless(
         response.json()[0],
-        {
-            "batch_name": "batch_foo",
-            "completed": True,
-            "created_by": "0ad8b77c49fb459a84b18d2c05722c4a",
-            "err_msg": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_name": "sample_model_for_school_1",
-            "output_filename": "file_output_one",
-            "output_valid": False,
-            "run_id": 123,
-            "triggered_at": "2024-12-24T20:22:20.132022",
-        },
+        RunInfo(
+            batch_name="batch_foo",
+            created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+            err_msg=None,
+            inst_id="1d7c75c33eda42949c6675ea8af97b55",
+            m_name="sample_model_for_school_1",
+            output_filename="file_output_one",
+            output_valid=False,
+            run_id=123,
+        ),
     )
 
 
-def test_read_inst_model_output(client: TestClient):
+def test_read_inst_model_output(client: TestClient) -> None:
     """Test GET /institutions/345/models/10/output/1."""
     # Authorized.
     response = client.get(
@@ -308,22 +313,21 @@ def test_read_inst_model_output(client: TestClient):
     assert response.status_code == 200
     assert same_run_info_orderless(
         response.json(),
-        {
-            "batch_name": "batch_foo",
-            "completed": True,
-            "created_by": "0ad8b77c49fb459a84b18d2c05722c4a",
-            "err_msg": None,
-            "inst_id": "1d7c75c33eda42949c6675ea8af97b55",
-            "m_name": "sample_model_for_school_1",
-            "output_filename": "file_output_one",
-            "output_valid": False,
-            "run_id": 123,
-            "triggered_at": "2024-12-24T20:22:20.132022",
-        },
+        RunInfo(
+            batch_name="batch_foo",
+            completed=True,
+            created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+            err_msg=None,
+            inst_id="1d7c75c33eda42949c6675ea8af97b55",
+            m_name="sample_model_for_school_1",
+            output_filename="file_output_one",
+            output_valid=False,
+            run_id=123,
+        ),
     )
 
 
-def test_create_model(client: TestClient):
+def test_create_model(client: TestClient) -> None:
     """Depending on timeline, fellows may not get to this."""
     schema_config_1 = {
         "schema_type": SchemaType.COURSE,
@@ -345,7 +349,7 @@ def test_create_model(client: TestClient):
     assert response.status_code == 200
 
 
-def test_trigger_inference_run(client: TestClient):
+def test_trigger_inference_run(client: TestClient) -> None:
     """Depending on timeline, fellows may not get to this."""
     MOCK_DATABRICKS.run_pdp_inference.return_value = DatabricksInferenceRunResponse(
         job_run_id=123
diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index ee4617c6..392da4d3 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -2,7 +2,7 @@
 
 import uuid
 import re
-from typing import Annotated, Final, Any, Optional
+from typing import Annotated, Final, Any, Optional, Tuple, Union
 from urllib.parse import unquote
 from strenum import StrEnum  # needed for python pre 3.11
 import jwt
@@ -215,7 +215,7 @@ def has_stronger_permissions_than(self, other_access_type: AccessType) -> bool:
         return False
 
 
-def get_user(sess: Session, username: str) -> BaseUser:
+def get_user(sess: Session, username: str) -> Optional[BaseUser]:
     """Get user from a given username."""
     if username == "api_key_initial":
         return BaseUser(
@@ -226,17 +226,17 @@ def get_user(sess: Session, username: str) -> BaseUser:
         )
     if username.startswith("api_key_"):
         api_key_uuid = username.removeprefix("api_key_")
-        query_result = sess.execute(
+        apikey_query_result = sess.execute(
             select(ApiKeyTable).where(
                 ApiKeyTable.id == str_to_uuid(api_key_uuid),
             )
         ).all()
-        if len(query_result) == 0 or len(query_result) > 1:
+        if len(apikey_query_result) == 0 or len(apikey_query_result) > 1:
             return None
         return BaseUser(
-            usr=uuid_to_str(query_result[0][0].id),
-            inst=uuid_to_str(query_result[0][0].inst_id),
-            access=query_result[0][0].access_type,
+            usr=uuid_to_str(apikey_query_result[0][0].id),
+            inst=uuid_to_str(apikey_query_result[0][0].inst_id),
+            access=apikey_query_result[0][0].access_type,
             email=username,
         )
     query_result = sess.execute(
@@ -254,7 +254,7 @@ def get_user(sess: Session, username: str) -> BaseUser:
     )
 
 
-def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser:
+def authenticate_api_key(api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session) -> Union[BaseUser, bool]:
     """Authenticate an API key."""
     (key, inst, enduser) = api_key_enduser_tuple
     # Check if it's the initial API key. This doesn't have enduser or inst.
@@ -291,7 +291,7 @@ def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser:
                     user_query = select(AccountTable).where(
                         and_(
                             AccountTable.email == enduser,
-                            AccountTable.inst_id == uuid_to_str(inst),
+                            AccountTable.inst_id == inst,
                         )
                     )
                 user_result = sess.execute(user_query).all()
@@ -330,7 +330,7 @@ async def get_current_user(
         if not token_from_key:
             raise credentials_exception
         payload = jwt.decode(
-            token_from_key, env_vars["SECRET_KEY"], algorithms=env_vars["ALGORITHM"]
+            token_from_key, str(env_vars["SECRET_KEY"]), algorithms=env_vars["ALGORITHM"]
         )
         usrname = payload.get("sub")
         if usrname is None:
@@ -345,14 +345,14 @@ async def get_current_user(
 
 async def get_current_active_user(
     current_user: Annotated[BaseUser, Depends(get_current_user)],
-):
+) -> BaseUser:
     """Get the active user.."""
     if current_user.disabled:
         raise HTTPException(status_code=400, detail="Inactive user")
     return current_user
 
 
-def has_access_to_inst_or_err(inst: str, user: BaseUser):
+def has_access_to_inst_or_err(inst: str, user: BaseUser) -> None:
     """Raise error if a given user does not have access to a given institution."""
     if not user.has_access_to_inst(inst):
         raise HTTPException(
@@ -361,7 +361,7 @@ def has_access_to_inst_or_err(inst: str, user: BaseUser):
         )
 
 
-def has_full_data_access_or_err(user: BaseUser, resource_type: str):
+def has_full_data_access_or_err(user: BaseUser, resource_type: str) -> None:
     """Raise error if a given user does not have data access to a given institution."""
     if not user.has_full_data_access():
         raise HTTPException(
@@ -370,7 +370,7 @@ def has_full_data_access_or_err(user: BaseUser, resource_type: str):
         )
 
 
-def model_owner_and_higher_or_err(user: BaseUser, resource_type: str):
+def model_owner_and_higher_or_err(user: BaseUser, resource_type: str) -> None:
     """Raise error if a given user does not have model ownership or higher."""
     if not user.access_type or user.access_type not in (
         AccessType.MODEL_OWNER,
@@ -382,12 +382,12 @@ def model_owner_and_higher_or_err(user: BaseUser, resource_type: str):
         )
 
 
-def prepend_env_prefix(name: str) -> str:
+def prepend_env_prefix(name: str) -> Any:
     """Prepend the env prefix. At this point the value should not be empty as we checked on app startup."""
-    return env_vars["ENV"].lower() + "_" + name
+    return str(env_vars["ENV"]).lower() + "_" + name
 
 
-def uuid_to_str(uuid_val: uuid.UUID) -> str:
+def uuid_to_str(uuid_val: uuid.UUID) -> Any:
     """Convert UUID obj to string."""
     if uuid_val is None:
         return ""
@@ -399,12 +399,12 @@ def str_to_uuid(hex_str: Optional[str]) -> uuid.UUID:
     return uuid.UUID(hex_str)
 
 
-def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> str:
+def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> Any:
     """Get the GCP bucket name which has the env prepended taking in the UUID obj."""
     return prepend_env_prefix(uuid_to_str(inst_id))
 
 
-def get_external_bucket_name(inst_id: str) -> str:
+def get_external_bucket_name(inst_id: str) -> Any:
     """Get the GCP bucket name which has the env prepended taking in the uuid as str."""
     return prepend_env_prefix(inst_id)
 

From 457fd14821f19451226eee9cbeea5bb774dd5af1 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 12:47:36 -0500
Subject: [PATCH 42/92] fix: alllllllllll the typecheck issues

---
 src/webapp/routers/models_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index cf565b90..45f57327 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -14,7 +14,6 @@
     USER_UUID,
     UUID_INVALID,
     DATETIME_TESTING,
-    MODEL_OBJ,
     SAMPLE_UUID,
 )
 from ..main import app

From ee85f6b200c9c609bdbcd5ebd67d347a4d66d20b Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 12:53:16 -0500
Subject: [PATCH 43/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 45f57327..0459d3da 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -265,14 +265,15 @@ def test_read_inst_model(client: TestClient) -> None:
         + "/models/sample_model_for_school_1"
     )
     assert response.status_code == 200
-    assert same_model_orderless(response.json(),
-        ModelInfo(
-            deleted= None,
-            inst_id= "1d7c75c33eda42949c6675ea8af97b55",
-            m_id="e4862c62829440d8ab4c9c298f02f619",
-            name="sample_model_for_school_1",
-            valid=True,
-        ))
+    response_model = ModelInfo(**response.json())
+    expected_model = ModelInfo(
+        deleted=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_id="e4862c62829440d8ab4c9c298f02f619",
+        name="sample_model_for_school_1",
+        valid=True,
+    )
+    assert same_model_orderless(response_model, expected_model)
 
 
 def test_read_inst_model_outputs(client: TestClient) -> None:

From 2cc5937f14fe0872656e50bf6604ad9e17aeb748 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 12:56:04 -0500
Subject: [PATCH 44/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 0459d3da..4ae17ff2 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -286,19 +286,18 @@ def test_read_inst_model_outputs(client: TestClient) -> None:
         + "/models/sample_model_for_school_1/runs"
     )
     assert response.status_code == 200
-    assert same_run_info_orderless(
-        response.json()[0],
-        RunInfo(
-            batch_name="batch_foo",
-            created_by="0ad8b77c49fb459a84b18d2c05722c4a",
-            err_msg=None,
-            inst_id="1d7c75c33eda42949c6675ea8af97b55",
-            m_name="sample_model_for_school_1",
-            output_filename="file_output_one",
-            output_valid=False,
-            run_id=123,
-        ),
+    response_model = RunInfo(**response.json()[0])
+    expected_model = RunInfo(
+        batch_name="batch_foo",
+        created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+        err_msg=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_name="sample_model_for_school_1",
+        output_filename="file_output_one",
+        output_valid=False,
+        run_id=123,
     )
+    assert same_model_orderless(response_model, expected_model)
 
 
 def test_read_inst_model_output(client: TestClient) -> None:

From 75006261311e0081feacb7e597158c9ed12bef35 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 12:59:34 -0500
Subject: [PATCH 45/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 4ae17ff2..5b0f6a15 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -297,7 +297,8 @@ def test_read_inst_model_outputs(client: TestClient) -> None:
         output_valid=False,
         run_id=123,
     )
-    assert same_model_orderless(response_model, expected_model)
+    assert same_run_info_orderless(response_model, expected_model)
+
 
 
 def test_read_inst_model_output(client: TestClient) -> None:

From 7aeefabc0fa380ad45bbe9601a7460dfae502fd6 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:03:23 -0500
Subject: [PATCH 46/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 5b0f6a15..277da913 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -296,6 +296,8 @@ def test_read_inst_model_outputs(client: TestClient) -> None:
         output_filename="file_output_one",
         output_valid=False,
         run_id=123,
+        triggered_at=response_model.triggered_at,  # copy from response
+        completed=response_model.completed
     )
     assert same_run_info_orderless(response_model, expected_model)
 
@@ -311,9 +313,8 @@ def test_read_inst_model_output(client: TestClient) -> None:
         + str(RUN_ID)
     )
     assert response.status_code == 200
-    assert same_run_info_orderless(
-        response.json(),
-        RunInfo(
+    response_model = RunInfo(response.json())
+    expected_model = RunInfo(
             batch_name="batch_foo",
             completed=True,
             created_by="0ad8b77c49fb459a84b18d2c05722c4a",
@@ -323,8 +324,8 @@ def test_read_inst_model_output(client: TestClient) -> None:
             output_filename="file_output_one",
             output_valid=False,
             run_id=123,
-        ),
-    )
+        )
+    assert same_run_info_orderless(response_model, expected_model)
 
 
 def test_create_model(client: TestClient) -> None:

From baee11d2521b6ae041e2c22b879f8f791940da26 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:05:02 -0500
Subject: [PATCH 47/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 277da913..af59c66b 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -313,7 +313,7 @@ def test_read_inst_model_output(client: TestClient) -> None:
         + str(RUN_ID)
     )
     assert response.status_code == 200
-    response_model = RunInfo(response.json())
+    response_model = RunInfo(**response.json())
     expected_model = RunInfo(
             batch_name="batch_foo",
             completed=True,

From 2ede593afdf4d4330346820b3e6e465f5e6330e3 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:06:02 -0500
Subject: [PATCH 48/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index af59c66b..ed600522 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -324,6 +324,8 @@ def test_read_inst_model_output(client: TestClient) -> None:
             output_filename="file_output_one",
             output_valid=False,
             run_id=123,
+            triggered_at=response_model.triggered_at,  # copy from response
+            completed=response_model.completed
         )
     assert same_run_info_orderless(response_model, expected_model)
 

From 83380111e3732f658f85b2d12acabc5a7669ab2b Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:15:16 -0500
Subject: [PATCH 49/92] fix: typecheck issues

---
 src/webapp/routers/models_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index ed600522..6138498c 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -325,7 +325,6 @@ def test_read_inst_model_output(client: TestClient) -> None:
             output_valid=False,
             run_id=123,
             triggered_at=response_model.triggered_at,  # copy from response
-            completed=response_model.completed
         )
     assert same_run_info_orderless(response_model, expected_model)
 

From 5ba9886d7b99e7cc1b457388d776156621de3d09 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:16:24 -0500
Subject: [PATCH 50/92] fix: typecheck issues

---
 src/webapp/routers/models.py      |  2 +-
 src/webapp/routers/models_test.py | 33 +++++++++++++++----------------
 src/webapp/utilities.py           | 12 ++++++++---
 3 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index 5b13003d..f7737f36 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -60,7 +60,7 @@ def check_file_types_valid_schema_configs(
     """Check that a list of files are valid for a given schema configuration."""
     for config in valid_schema_configs:
         found = True
-        map_file_to_schema_config_obj: dict= {}
+        map_file_to_schema_config_obj: dict = {}
         for idx, s in enumerate(file_types):
             for c in config:
                 if c.schema_type in s:
diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py
index 6138498c..1da27834 100644
--- a/src/webapp/routers/models_test.py
+++ b/src/webapp/routers/models_test.py
@@ -236,10 +236,10 @@ def test_read_inst_models(client: TestClient) -> None:
         ModelInfo(**response.json()[0]),
         ModelInfo(
             m_id="e4862c62829440d8ab4c9c298f02f619",
-            name= "sample_model_for_school_1",
-            inst_id= "1d7c75c33eda42949c6675ea8af97b55",
-            deleted= None,
-            valid= True,
+            name="sample_model_for_school_1",
+            inst_id="1d7c75c33eda42949c6675ea8af97b55",
+            deleted=None,
+            valid=True,
         ),
     )
 
@@ -297,12 +297,11 @@ def test_read_inst_model_outputs(client: TestClient) -> None:
         output_valid=False,
         run_id=123,
         triggered_at=response_model.triggered_at,  # copy from response
-        completed=response_model.completed
+        completed=response_model.completed,
     )
     assert same_run_info_orderless(response_model, expected_model)
 
 
-
 def test_read_inst_model_output(client: TestClient) -> None:
     """Test GET /institutions/345/models/10/output/1."""
     # Authorized.
@@ -315,17 +314,17 @@ def test_read_inst_model_output(client: TestClient) -> None:
     assert response.status_code == 200
     response_model = RunInfo(**response.json())
     expected_model = RunInfo(
-            batch_name="batch_foo",
-            completed=True,
-            created_by="0ad8b77c49fb459a84b18d2c05722c4a",
-            err_msg=None,
-            inst_id="1d7c75c33eda42949c6675ea8af97b55",
-            m_name="sample_model_for_school_1",
-            output_filename="file_output_one",
-            output_valid=False,
-            run_id=123,
-            triggered_at=response_model.triggered_at,  # copy from response
-        )
+        batch_name="batch_foo",
+        completed=True,
+        created_by="0ad8b77c49fb459a84b18d2c05722c4a",
+        err_msg=None,
+        inst_id="1d7c75c33eda42949c6675ea8af97b55",
+        m_name="sample_model_for_school_1",
+        output_filename="file_output_one",
+        output_valid=False,
+        run_id=123,
+        triggered_at=response_model.triggered_at,  # copy from response
+    )
     assert same_run_info_orderless(response_model, expected_model)
 
 
diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index 392da4d3..8b35088b 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -163,7 +163,9 @@ class BaseUser(BaseModel):
     disabled: bool | None = None
 
     # Constructor
-    def __init__(self, usr: str | None, inst: str | None, access: str | None, email: str | None) -> None:
+    def __init__(
+        self, usr: str | None, inst: str | None, access: str | None, email: str | None
+    ) -> None:
         super().__init__(user_id=usr, institution=inst, access_type=access, email=email)
 
     def is_datakinder(self) -> Any:
@@ -254,7 +256,9 @@ def get_user(sess: Session, username: str) -> Optional[BaseUser]:
     )
 
 
-def authenticate_api_key(api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session) -> Union[BaseUser, bool]:
+def authenticate_api_key(
+    api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session
+) -> Union[BaseUser, bool]:
     """Authenticate an API key."""
     (key, inst, enduser) = api_key_enduser_tuple
     # Check if it's the initial API key. This doesn't have enduser or inst.
@@ -330,7 +334,9 @@ async def get_current_user(
         if not token_from_key:
             raise credentials_exception
         payload = jwt.decode(
-            token_from_key, str(env_vars["SECRET_KEY"]), algorithms=env_vars["ALGORITHM"]
+            token_from_key,
+            str(env_vars["SECRET_KEY"]),
+            algorithms=env_vars["ALGORITHM"],
         )
         usrname = payload.get("sub")
         if usrname is None:

From 34c7093a96652e899f94a8db9ebd94db7b059aec Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Wed, 10 Sep 2025 13:47:37 -0500
Subject: [PATCH 51/92] fix: typecheck issues

---
 src/webapp/routers/models.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index f7737f36..c838004d 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -216,9 +216,11 @@ def create_model(
             created_by=str_to_uuid(current_user.user_id),
             valid=req.valid,
             schema_configs=jsonpickle.encode(req.schema_configs),
-            framework=f
-            if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"}
-            else "sklearn",
+            framework=(
+                f
+                if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"}
+                else "sklearn"
+            ),
         )
         local_session.get().add(model)
         local_session.get().commit()

From d234f1c957550389f13bea25acb477d71e815622 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 11 Sep 2025 15:08:22 -0500
Subject: [PATCH 52/92] fix added logging

---
 src/webapp/databricks.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 80ab290e..85a16e02 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -99,7 +99,15 @@ def setup_new_inst(self, inst_name: str) -> None:
         db_inst_name = databricksify_inst_name(inst_name)
         cat_name = databricks_vars["CATALOG_NAME"]
         for medallion in MEDALLION_LEVELS:
-            w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name)
+            try:
+                w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name)
+            except Exception as e:
+                LOGGER.exception(
+                    f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}",
+                    databricks_vars["DATABRICKS_HOST_URL"],
+                    gcs_vars["GCP_SERVICE_ACCOUNT_EMAIL"],
+                )
+                raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}")
             LOGGER.info(
                 f"Creating medallion level schemas for {db_inst_name} & {medallion}."
             )

From 85d59c9c0f2f9dee635e8805f7e3f0cbb444a191 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 11 Sep 2025 15:08:46 -0500
Subject: [PATCH 53/92] fix added logging

---
 src/webapp/databricks.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 85a16e02..59434bb6 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -103,9 +103,7 @@ def setup_new_inst(self, inst_name: str) -> None:
                 w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name)
             except Exception as e:
                 LOGGER.exception(
-                    f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}",
-                    databricks_vars["DATABRICKS_HOST_URL"],
-                    gcs_vars["GCP_SERVICE_ACCOUNT_EMAIL"],
+                    f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}"
                 )
                 raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}")
             LOGGER.info(

From 8741a8349c1b34bb25650b1733da5bd482726746 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 11 Sep 2025 15:09:06 -0500
Subject: [PATCH 54/92] fix added logging

---
 src/webapp/databricks.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 59434bb6..80c31657 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -100,12 +100,16 @@ def setup_new_inst(self, inst_name: str) -> None:
         cat_name = databricks_vars["CATALOG_NAME"]
         for medallion in MEDALLION_LEVELS:
             try:
-                w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name)
+                w.schemas.create(
+                    name=f"{db_inst_name}_{medallion}", catalog_name=cat_name
+                )
             except Exception as e:
                 LOGGER.exception(
                     f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}"
                 )
-                raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}")
+                raise ValueError(
+                    f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}"
+                )
             LOGGER.info(
                 f"Creating medallion level schemas for {db_inst_name} & {medallion}."
             )

From 11f2aceaed6d7f5a49b6670282ac15a017d88544 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 11 Sep 2025 16:32:34 -0500
Subject: [PATCH 55/92] fix added logging

---
 src/webapp/routers/data.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 69777abf..29b8587e 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -504,6 +504,7 @@ def create_batch(
         )
         f_names = [] if not req.file_names else req.file_names
         f_ids = [] if not req.file_ids else strs_to_uuids(req.file_ids)
+        print(f"File names: {f_names}, File Ids: {f_ids}")
         # Check that the files requested for this batch exists.
         # Only valid non-sst generated files can be added to a batch at creation time.
         query_result_files = (

From 13e1b30421f52a830367f5c7841f448ed59df9b4 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Thu, 11 Sep 2025 16:48:31 -0500
Subject: [PATCH 56/92] fix added logging

---
 src/webapp/routers/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py
index c838004d..abbb0a36 100644
--- a/src/webapp/routers/models.py
+++ b/src/webapp/routers/models.py
@@ -96,7 +96,7 @@ class ModelCreationRequest(BaseModel):
     # valid = False, means the model is not ready for use.
     valid: bool = False
     schema_configs: list[list[SchemaConfigObj]]
-    framework: str
+    framework: str | None = None
 
 
 class ModelInfo(BaseModel):

From 39396f572db7d5ac3ea0f96d8239ce9bca39da72 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 17:02:55 -0500
Subject: [PATCH 57/92] fix databricks h2o job name

---
 src/webapp/databricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 80c31657..7c0cbc29 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -35,7 +35,7 @@
 
 # The name of the deployed pipeline in Databricks. Must match directly.
 PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline"
-PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline"
+PDP_H2O_INFERENCE_JOB_NAME = "edvise_github_sourced_pdp_inference_pipeline"
 
 
 class DatabricksInferenceRunRequest(BaseModel):

From c2d657672e01173392bb897d01422972025fdcef Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 17:29:10 -0500
Subject: [PATCH 58/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 46 ++++++++++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 9 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index b04dad58..82623d6b 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -155,6 +155,37 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
     return DataFrameSchema(columns, strict=False)
 
 
+def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str:
+    """
+    Return a best-guess encoding using BOM detection + trial decode on a small sample.
+    Prefers utf-8-sig for BOMmed utf-8 to avoid \ufeff in headers.
+    """
+    with open(path, "rb") as f:
+        chunk = f.read(sample_bytes)
+
+    # BOM checks first
+    if chunk.startswith(b"\xef\xbb\xbf"):
+        return "utf-8-sig"
+    if chunk.startswith(b"\xff\xfe\x00\x00"):
+        return "utf-32le"
+    if chunk.startswith(b"\x00\x00\xfe\xff"):
+        return "utf-32be"
+    if chunk.startswith(b"\xff\xfe"):
+        return "utf-16le"
+    if chunk.startswith(b"\xfe\xff"):
+        return "utf-16be"
+
+    # Try utf-8 (strict) on sample; if it works, it will work for the file
+    try:
+        chunk.decode("utf-8")
+        return "utf-8"
+    except UnicodeDecodeError:
+        pass
+
+    # Last resort: latin-1 (will not fail, but log later if you want)
+    return "latin1"
+
+
 def validate_dataset(
     filename: str,
     base_schema: dict,
@@ -162,15 +193,12 @@ def validate_dataset(
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
-    read_errs = []
-    for enc in ("utf-8", "utf-8-sig", "latin1"):
-        try:
-            df = pd.read_csv(filename, encoding=enc)
-            break
-        except UnicodeDecodeError as ex:
-            read_errs.append(f"{enc}: {ex}")
-    else:
-        raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs)
+    enc = sniff_encoding(filename)
+    try:
+        df = pd.read_csv(filename, encoding=enc)
+    except UnicodeDecodeError as ex:
+        # extremely rare: sample passed but full file fails
+        raise HardValidationError(schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"])
 
     df = df.rename(columns={c: normalize_col(c) for c in df.columns})
     incoming = set(df.columns)

From 91890625eb6d7d325ddbe74100ab36f8556d2f18 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 17:44:03 -0500
Subject: [PATCH 59/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 82623d6b..f63c4f2a 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -198,7 +198,9 @@ def validate_dataset(
         df = pd.read_csv(filename, encoding=enc)
     except UnicodeDecodeError as ex:
         # extremely rare: sample passed but full file fails
-        raise HardValidationError(schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"])
+        raise HardValidationError(
+            schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"]
+        )
 
     df = df.rename(columns={c: normalize_col(c) for c in df.columns})
     incoming = set(df.columns)

From 2a81f1d3eb90808bf4f6263880849813af49909e Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 17:51:45 -0500
Subject: [PATCH 60/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index f63c4f2a..d461bec1 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -182,7 +182,6 @@ def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str:
     except UnicodeDecodeError:
         pass
 
-    # Last resort: latin-1 (will not fail, but log later if you want)
     return "latin1"
 
 

From 04ed7804ea9927267a8ccfc95c85b993bb4c5212 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:15:21 -0500
Subject: [PATCH 61/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 47 ++++++++++++++++++++++++++++------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index d461bec1..4a6fc1c0 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -2,11 +2,10 @@
 pipelines, this is for general file validation.)
 """
 
-from typing import Any
-
+import io, os
 import json
 import re
-from typing import Union, List, Dict, Optional
+from typing import Union, List, Dict, Optional, Any
 import logging
 
 import pandas as pd
@@ -155,15 +154,34 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
     return DataFrameSchema(columns, strict=False)
 
 
-def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str:
+def sniff_encoding(
+    src: Union[str, os.PathLike, io.IOBase], sample_bytes: int = 1_048_576
+) -> str:
     """
-    Return a best-guess encoding using BOM detection + trial decode on a small sample.
-    Prefers utf-8-sig for BOMmed utf-8 to avoid \ufeff in headers.
+    Return best-guess encoding using BOM detection + utf-8 trial decode.
+    Accepts path or file-like. Restores stream position if seekable.
+    If utf-8 fails, raises UnicodeError.
     """
-    with open(path, "rb") as f:
-        chunk = f.read(sample_bytes)
-
-    # BOM checks first
+    # --- read small binary sample ---
+    if isinstance(src, (str, os.PathLike)):
+        with open(src, "rb") as f:
+            chunk = f.read(sample_bytes)
+    else:
+        buf = src.buffer if isinstance(src, io.TextIOBase) else src
+        pos = None
+        try:
+            if buf.seekable():
+                pos = buf.tell()
+        except Exception:
+            pass
+        chunk = buf.read(sample_bytes)
+        if pos is not None:
+            try:
+                buf.seek(pos)
+            except Exception:
+                pass
+
+    # --- BOMs first ---
     if chunk.startswith(b"\xef\xbb\xbf"):
         return "utf-8-sig"
     if chunk.startswith(b"\xff\xfe\x00\x00"):
@@ -175,14 +193,15 @@ def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str:
     if chunk.startswith(b"\xfe\xff"):
         return "utf-16be"
 
-    # Try utf-8 (strict) on sample; if it works, it will work for the file
+    # --- utf-8 strict on sample ---
     try:
         chunk.decode("utf-8")
         return "utf-8"
     except UnicodeDecodeError:
-        pass
-
-    return "latin1"
+        raise UnicodeError(
+            "file is not UTF-8/UTF-16/UTF-32; "
+            "re-export as UTF-8 (with or without BOM)."
+        )
 
 
 def validate_dataset(

From 84c03f8f795893a533cb6a37dbea685722a3c856 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:17:14 -0500
Subject: [PATCH 62/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 4a6fc1c0..ad530682 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -211,14 +211,19 @@ def validate_dataset(
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
-    enc = sniff_encoding(filename)
     try:
-        df = pd.read_csv(filename, encoding=enc)
-    except UnicodeDecodeError as ex:
-        # extremely rare: sample passed but full file fails
-        raise HardValidationError(
-            schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"]
-        )
+        enc = sniff_encoding(filename)  # latin-1 is NOT allowed by default
+    except UnicodeError as ex:
+        raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)])
+
+    # ensure a file-like starts at beginning, then one real read
+    if hasattr(filename, "seek"):
+        try:
+            filename.seek(0)
+        except Exception:
+            pass
+
+    df = pd.read_csv(filename, encoding=enc)
 
     df = df.rename(columns={c: normalize_col(c) for c in df.columns})
     incoming = set(df.columns)

From 7e9d2095466f680e06638602120e73857b1a5695 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:18:58 -0500
Subject: [PATCH 63/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index ad530682..b59128e3 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -2,7 +2,8 @@
 pipelines, this is for general file validation.)
 """
 
-import io, os
+import io
+import os
 import json
 import re
 from typing import Union, List, Dict, Optional, Any

From f8c3b20df86efbd9a4f1eb0b9919cc93ae1105c5 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:20:57 -0500
Subject: [PATCH 64/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index b59128e3..6673dc86 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -200,8 +200,7 @@ def sniff_encoding(
         return "utf-8"
     except UnicodeDecodeError:
         raise UnicodeError(
-            "file is not UTF-8/UTF-16/UTF-32; "
-            "re-export as UTF-8 (with or without BOM)."
+            "file is not UTF-8/UTF-16/UTF-32; re-export as UTF-8 (with or without BOM)."
         )
 
 

From 666d4557fa1049f74d44a4aca4e6093e3c12751c Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:23:52 -0500
Subject: [PATCH 65/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 44 ++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 6673dc86..80f341b4 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -155,32 +155,40 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
     return DataFrameSchema(columns, strict=False)
 
 
+Src = Union[str, os.PathLike, io.BufferedIOBase, io.TextIOWrapper]
+
+
 def sniff_encoding(
-    src: Union[str, os.PathLike, io.IOBase], sample_bytes: int = 1_048_576
+    src: Src,
+    sample_bytes: int = 1_048_576,
 ) -> str:
     """
-    Return best-guess encoding using BOM detection + utf-8 trial decode.
-    Accepts path or file-like. Restores stream position if seekable.
-    If utf-8 fails, raises UnicodeError.
+    Best-guess encoding via BOM detection + utf-8 trial.
+    Works with a filesystem path, a binary stream, or a TextIOWrapper.
+    Restores stream position if seekable. Raises if latin-1 would be used (by default).
     """
-    # --- read small binary sample ---
+    # --- read a small binary sample ---
     if isinstance(src, (str, os.PathLike)):
         with open(src, "rb") as f:
             chunk = f.read(sample_bytes)
-    else:
-        buf = src.buffer if isinstance(src, io.TextIOBase) else src
-        pos = None
-        try:
-            if buf.seekable():
-                pos = buf.tell()
-        except Exception:
-            pass
+    elif isinstance(src, io.TextIOWrapper):
+        # Text wrapper => use underlying binary buffer (mypy-safe)
+        buf = src.buffer
+        pos = buf.tell() if buf.seekable() else None
+        chunk = buf.read(sample_bytes)
+        if pos is not None:
+            buf.seek(pos)
+    elif isinstance(src, io.BufferedIOBase):
+        # Already binary
+        buf = src
+        pos = buf.tell() if buf.seekable() else None
         chunk = buf.read(sample_bytes)
         if pos is not None:
-            try:
-                buf.seek(pos)
-            except Exception:
-                pass
+            buf.seek(pos)
+    else:
+        raise TypeError(
+            "sniff_encoding expects path, io.TextIOWrapper, or binary buffer"
+        )
 
     # --- BOMs first ---
     if chunk.startswith(b"\xef\xbb\xbf"):
@@ -200,7 +208,7 @@ def sniff_encoding(
         return "utf-8"
     except UnicodeDecodeError:
         raise UnicodeError(
-            "file is not UTF-8/UTF-16/UTF-32; re-export as UTF-8 (with or without BOM)."
+            "file is not UTF-8/UTF-16/UTF-32; please re-export as UTF-8."
         )
 
 

From af4053aea4bba676a2299c4f138fc8a9ce6ae8e5 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 18:39:48 -0500
Subject: [PATCH 66/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 80f341b4..9c0fcb40 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -6,7 +6,7 @@
 import os
 import json
 import re
-from typing import Union, List, Dict, Optional, Any
+from typing import Union, List, Dict, Optional, Any, BinaryIO, cast
 import logging
 
 import pandas as pd
@@ -155,13 +155,18 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
     return DataFrameSchema(columns, strict=False)
 
 
-Src = Union[str, os.PathLike, io.BufferedIOBase, io.TextIOWrapper]
+Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper]
 
 
-def sniff_encoding(
-    src: Src,
-    sample_bytes: int = 1_048_576,
-) -> str:
+def _read_sample(buf: BinaryIO, n: int) -> bytes:
+    pos = buf.tell() if buf.seekable() else None
+    chunk = buf.read(n)  # -> bytes for BinaryIO
+    if pos is not None:
+        buf.seek(pos)
+    return chunk
+
+
+def sniff_encoding(src: Src, sample_bytes: int = 1_048_576) -> str:
     """
     Best-guess encoding via BOM detection + utf-8 trial.
     Works with a filesystem path, a binary stream, or a TextIOWrapper.
@@ -170,25 +175,13 @@ def sniff_encoding(
     # --- read a small binary sample ---
     if isinstance(src, (str, os.PathLike)):
         with open(src, "rb") as f:
-            chunk = f.read(sample_bytes)
+            chunk: bytes = f.read(sample_bytes)
     elif isinstance(src, io.TextIOWrapper):
-        # Text wrapper => use underlying binary buffer (mypy-safe)
-        buf = src.buffer
-        pos = buf.tell() if buf.seekable() else None
-        chunk = buf.read(sample_bytes)
-        if pos is not None:
-            buf.seek(pos)
-    elif isinstance(src, io.BufferedIOBase):
-        # Already binary
-        buf = src
-        pos = buf.tell() if buf.seekable() else None
-        chunk = buf.read(sample_bytes)
-        if pos is not None:
-            buf.seek(pos)
+        # Text wrapper => use underlying binary buffer, cast to BinaryIO for mypy
+        chunk = _read_sample(cast(BinaryIO, src.buffer), sample_bytes)
     else:
-        raise TypeError(
-            "sniff_encoding expects path, io.TextIOWrapper, or binary buffer"
-        )
+        # Already a binary stream
+        chunk = _read_sample(cast(BinaryIO, src), sample_bytes)
 
     # --- BOMs first ---
     if chunk.startswith(b"\xef\xbb\xbf"):

From ca6801d8d9df379af5e31609fabbb04826c495c9 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:26:41 -0500
Subject: [PATCH 67/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py        | 215 +++++++++++++++++---------------
 src/webapp/validation_helper.py | 204 ++++++++++++++++++++++++++++++
 2 files changed, 316 insertions(+), 103 deletions(-)
 create mode 100644 src/webapp/validation_helper.py

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 9c0fcb40..d6e1f169 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -13,6 +13,13 @@
 from pandera import Column, Check, DataFrameSchema
 from pandera.errors import SchemaErrors
 from thefuzz import fuzz
+from validation_helper import (
+    _header_pass,
+    _pandas_dtype_and_parse_dates,
+    _build_exact_schema,
+)
+
+logger = logging.getLogger(__name__)
 
 
 def validate_file_reader(
@@ -51,8 +58,7 @@ def normalize_col(name: str) -> str:
     name = name.strip().lower()  # Lowercase and trim whitespace
     name = re.sub(r"[^a-z0-9_]", "_", name)  # Replace non-alphanum with underscore
     name = re.sub(r"_+", "_", name)  # Collapse multiple underscores
-    name = name.strip("_")  # Remove leading/trailing underscores
-    return name
+    return name.strip("_")  # Remove leading/trailing underscores
 
 
 def load_json(path: str) -> Any:
@@ -155,6 +161,8 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
     return DataFrameSchema(columns, strict=False)
 
 
+# --------------------- Actual Validation Layer ------------------------------
+
 Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper]
 
 
@@ -212,134 +220,135 @@ def validate_dataset(
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
+    # 0) encoding
     try:
-        enc = sniff_encoding(filename)  # latin-1 is NOT allowed by default
+        enc = sniff_encoding(filename)  # latin-1 NOT allowed by default
     except UnicodeError as ex:
         raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)])
 
-    # ensure a file-like starts at beginning, then one real read
-    if hasattr(filename, "seek"):
-        try:
-            filename.seek(0)
-        except Exception:
-            pass
-
-    df = pd.read_csv(filename, encoding=enc)
-
-    df = df.rename(columns={c: normalize_col(c) for c in df.columns})
-    incoming = set(df.columns)
-
-    # 2) merge requested models
+    # 1) merge requested models
     if models is None:
-        model_list = []
+        model_list: List[str] = []
     elif isinstance(models, str):
         model_list = [models]
     else:
-        model_list = list(models)  # <- ensures it's not a set
+        model_list = list(models)
 
     merged_specs: Dict[str, dict] = {}
     for m in model_list:
         specs = merge_model_columns(base_schema, ext_schema, institution_id, m.lower())
         merged_specs.update(specs)
 
-    canon_to_aliases = {
-        canon: [normalize_col(alias) for alias in [canon] + spec.get("aliases", [])]
-        for canon, spec in merged_specs.items()
-    }
-    df = rename_columns_to_match_schema(df, canon_to_aliases)
-    df.columns = [
-        normalize_col(c) for c in df.columns
-    ]  # Final normalization after renaming
+    if not merged_specs:
+        # nothing to validate; short-circuit
+        return {
+            "validation_status": "passed",
+            "schemas": model_list,
+            "missing_optional": [],
+            "unknown_extra_columns": [],
+        }
 
-    incoming = set(df.columns)
+    # 2) HEADER-ONLY PASS: map columns & find missing/extras cheaply
+    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = (
+        _header_pass(filename, enc, merged_specs, fuzzy_threshold=90)
+    )
 
-    # 3) build canon → set(normalized names)
-    canon_to_norms: Dict[str, set] = {
-        canon: {normalize_col(alias) for alias in [canon] + spec.get("aliases", [])}
-        for canon, spec in merged_specs.items()
-    }
+    if missing_required:
+        logger.error("Missing required columns: %s", missing_required)
+        raise HardValidationError(missing_required=missing_required)
 
-    pattern_to_canon = {
-        r"^(?:"
-        + "|".join(map(re.escape, [canon] + spec.get("aliases", [])))
-        + r")$": canon
-        for canon, spec in merged_specs.items()
+    # 3) selective typed load
+    present_canons = sorted(set(raw_to_canon.values()))
+    # choose one raw column per present canonical
+    canon_to_raw: Dict[str, str] = {}
+    for raw, canon in raw_to_canon.items():
+        # prefer the raw header that's already exactly canonical if present
+        if canon not in canon_to_raw or normalize_col(raw) == canon:
+            canon_to_raw[canon] = raw
+
+    raw_usecols = list(canon_to_raw.values())
+
+    # dtype & parse_dates maps (by canonical); convert to raw keys for read_csv
+    canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs)
+    raw_dtype_map = {
+        canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw
     }
+    parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw]
+
+    read_kwargs = dict(
+        encoding=enc,
+        usecols=raw_usecols,
+        dtype=raw_dtype_map or None,
+        parse_dates=parse_dates_raw or None,
+        memory_map=True,  # often helps on local/posix filesystems
+        engine="c",  # default fast path; keep behavior stable
+    )
+    # optional speed-up if pyarrow is available; behavior stays correct
+    try:
+        import pyarrow  # noqa: F401
 
-    # 4) find extra / missing
-    all_norms = set().union(*canon_to_norms.values()) if canon_to_norms else set()
-    extra_columns = sorted(incoming - all_norms)
+        read_kwargs["engine"] = "pyarrow"
+        # pandas raises ValueError for memory_map with the pyarrow engine,
+        # so drop it whenever that engine is selected.
+        read_kwargs.pop("memory_map", None)
+        # pandas>=2: dtype_backend speeds strings/ints
+        read_kwargs["dtype_backend"] = "pyarrow"
+    except ImportError:
+        pass  # pyarrow not installed; keep the C-engine settings above
 
-    missing_required = [
-        canon
-        for canon, norms in canon_to_norms.items()
-        if merged_specs[canon].get("required", False) and norms.isdisjoint(incoming)
-    ]
+    df = pd.read_csv(
+        filename, **{k: v for k, v in read_kwargs.items() if v is not None}
+    )
 
-    missing_optional = [
-        canon
-        for canon, norms in canon_to_norms.items()
-        if not merged_specs[canon].get("required", False) and norms.isdisjoint(incoming)
-    ]
+    # 4) rename raw headers -> canon once (no DataFrame-wide fuzzy work).
+    # canon_to_raw is keyed by canonical name, while DataFrame.rename expects
+    # {current_name: new_name}, so invert the mapping before renaming.
+    df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()})
 
-    # Hard-fail on missing required or any extra columns
-    if missing_required:
-        if logging:
-            logging.error(
-                f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}"
-            )
-        raise HardValidationError(missing_required=missing_required)
-    unknown_extra = extra_columns
+    # 5) REQUIRED FIRST (fail-fast), then OPTIONALS (collect soft errors)
+    required_canons = [
+        c for c in present_canons if merged_specs[c].get("required", False)
+    ]
+    optional_canons = [
+        c for c in present_canons if not merged_specs[c].get("required", False)
+    ]
 
-    # 5) build Pandera schema & validate (hard-fail on any error)
-    schema = build_schema(merged_specs)
-    try:
-        schema.validate(df, lazy=True)
-    except SchemaErrors as err:
-        # TODO: Log validation failure for DS to review
-        failed_normals = set(err.failure_cases["column"])
-        failed_canons = {pattern_to_canon.get(p, p) for p in failed_normals}
-
-        # split into required vs optional failures
-        req_failures = [
-            c for c in failed_canons if merged_specs.get(c, {}).get("required", False)
-        ]
-        opt_failures = [
-            c
-            for c in failed_canons
-            if not merged_specs.get(c, {}).get("required", False)
-        ]
-
-        if req_failures:
-            if logging:
-                logging.error(
-                    f"Schema validation failed on required columns, schema_errors = {err.schema_errors}, failure_cases = {err.failure_cases.to_dict(orient='records')}"
-                )
+    # Exact-name schemas (no regex). lazy=True: only then pandera raises SchemaErrors.
+    if required_canons:
+        req_schema = _build_exact_schema(merged_specs, required_canons)
+        try:
+            req_schema.validate(df[required_canons], lazy=True)
+        except SchemaErrors as err:
+            logger.error("Required column validation failed.")
             raise HardValidationError(
                 schema_errors=err.schema_errors,
                 failure_cases=err.failure_cases.to_dict(orient="records"),
             )
-        else:
-            if logging:
-                logging.info(f"missing_optional = {missing_optional}")
-            print("Optional column validation errors on: ", opt_failures)
-            return {
-                "validation_status": "passed_with_soft_errors",
-                "schemas": model_list,
-                "missing_optional": missing_optional,
-                "optional_validation_failures": opt_failures,
-                "failure_cases": err.failure_cases.to_dict(orient="records"),
-            }
-    if logging:
-        logging.info(f"missing_optional = {missing_optional}")
-    # 6) success (with possible soft misses)
+
+    opt_failures: List[str] = []
+    failure_cases_records: List[dict] = []
+    if optional_canons:
+        opt_schema = _build_exact_schema(merged_specs, optional_canons)
+        try:
+            opt_schema.validate(df[optional_canons], lazy=True)
+        except SchemaErrors as err:
+            opt_failures = sorted(set(err.failure_cases["column"]))
+            failure_cases_records = err.failure_cases.to_dict(orient="records")
+
+    # 6) return — status depends on soft errors / extras
+    if opt_failures or missing_optional or unknown_extra:
+        return {
+            "validation_status": "passed_with_soft_errors",
+            "schemas": model_list,
+            "missing_optional": missing_optional,
+            "optional_validation_failures": opt_failures,
+            "failure_cases": failure_cases_records,
+            "unknown_extra_columns": unknown_extra,
+        }
+
     return {
-        "validation_status": (
-            "passed_with_soft_errors"
-            if (missing_optional or unknown_extra)
-            else "passed"
-        ),
+        "validation_status": "passed",
         "schemas": model_list,
-        "missing_optional": missing_optional,
-        "unknown_extra_columns": unknown_extra,
+        "missing_optional": [],
+        "unknown_extra_columns": [],
     }
diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py
new file mode 100644
index 00000000..451f7839
--- /dev/null
+++ b/src/webapp/validation_helper.py
@@ -0,0 +1,204 @@
+import io
+import os
+import json
+import re
+import logging
+from functools import lru_cache
+from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple
+
+import pandas as pd
+from pandera import Column, Check, DataFrameSchema
+from pandera.errors import SchemaErrors
+
+logger = logging.getLogger(__name__)
+
+
+# ---------- normalization is pure; cache it ----------
+@lru_cache(maxsize=4096)
+def normalize_col(name: str) -> str:
+    name = name.strip().lower()
+    name = re.sub(r"[^a-z0-9_]", "_", name)
+    name = re.sub(r"_+", "_", name)
+    return name.strip("_")
+
+
+def _spec_alias_lookup(
+    merged_specs: Dict[str, dict]
+) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
+    """
+    Build fast lookups:
+      - alias2canon: normalized alias -> canonical
+      - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical)
+    """
+    alias2canon: Dict[str, str] = {}
+    canon_to_aliases_norm: Dict[str, List[str]] = {}
+    for canon, spec in merged_specs.items():
+        aliases = [canon] + spec.get("aliases", [])
+        normed = [normalize_col(a) for a in aliases]
+        canon_to_aliases_norm[canon] = normed
+        for a in normed:
+            alias2canon[a] = canon
+    return alias2canon, canon_to_aliases_norm
+
+
+def _fuzzy_map_unresolved(
+    unresolved: List[Tuple[str, str]],  # [(raw_header, normalized_header)]
+    choices: List[str],  # normalized aliases
+    alias2canon: Dict[str, str],
+    threshold: int = 90,
+) -> Dict[str, str]:  # raw_header -> canonical
+    """
+    Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz.
+    """
+    mapping: Dict[str, str] = {}
+    try:
+        from rapidfuzz import process, fuzz as rf_fuzz  # much faster
+
+        for raw, norm in unresolved:
+            hit = process.extractOne(
+                norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold
+            )
+            if hit:
+                best_alias, score, _ = hit
+                mapping[raw] = alias2canon[best_alias]
+    except Exception:
+        # fallback to thefuzz if rapidfuzz is unavailable
+        from thefuzz import fuzz as tf_fuzz
+
+        for raw, norm in unresolved:
+            best_score = 0
+            best_alias = None
+            for alias in choices:
+                s = tf_fuzz.ratio(norm, alias)
+                if s > best_score:
+                    best_score, best_alias = s, alias
+            if best_alias and best_score >= threshold:
+                mapping[raw] = alias2canon[best_alias]
+    return mapping
+
+
+def _header_pass(
+    filename: str,
+    encoding: str,
+    merged_specs: Dict[str, dict],
+    fuzzy_threshold: int = 90,
+) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]:
+    """
+    Read only the header. Return:
+      - raw_cols: list of column names as in file
+      - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy)
+      - missing_required: list of canonical columns missing
+      - missing_optional: list of optional canonical columns missing
+      - unknown_extra: normalized headers that don't map to any alias
+    """
+    header_df = pd.read_csv(filename, encoding=encoding, nrows=0)
+    raw_cols = list(header_df.columns)
+
+    alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs)
+    known_aliases = set(alias2canon.keys())
+
+    # exact (normalized) mapping first
+    raw_to_canon: Dict[str, str] = {}
+    unresolved: List[Tuple[str, str]] = []
+    incoming_norms: List[str] = []
+
+    for raw in raw_cols:
+        norm = normalize_col(raw)
+        incoming_norms.append(norm)
+        if norm in alias2canon:
+            raw_to_canon[raw] = alias2canon[norm]
+        else:
+            unresolved.append((raw, norm))
+
+    # fuzzy match only for unresolved headers
+    if unresolved:
+        choices = list(known_aliases)
+        fuzzy_map = _fuzzy_map_unresolved(
+            unresolved, choices, alias2canon, threshold=fuzzy_threshold
+        )
+        raw_to_canon.update(fuzzy_map)
+
+    # derive presence/missing/extras from header only
+    incoming_canons = set(raw_to_canon.values())
+    missing_required = [
+        c
+        for c, spec in merged_specs.items()
+        if spec.get("required", False) and c not in incoming_canons
+    ]
+    missing_optional = [
+        c
+        for c, spec in merged_specs.items()
+        if not spec.get("required", False) and c not in incoming_canons
+    ]
+    unknown_extra = sorted({n for (_, n) in unresolved if n not in known_aliases})
+
+    return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra
+
+
+def _pandas_dtype_and_parse_dates(
+    merged_specs: Dict[str, dict]
+) -> Tuple[Dict[str, Any], List[str]]:
+    """
+    Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates.
+    We keep it conservative to avoid accuracy loss.
+    """
+    dtype_map: Dict[str, Any] = {}
+    parse_dates: List[str] = []
+
+    for canon, spec in merged_specs.items():
+        dt = str(spec.get("dtype"))
+        # conservative mappings
+        if dt in {"string", "str", "object"}:
+            dtype_map[canon] = "string"
+        elif dt in {"int", "int64", "Int64"}:
+            # nullable integers are much safer for dirty data
+            dtype_map[canon] = "Int64"
+        elif dt in {"float", "float64"}:
+            dtype_map[canon] = "float64"
+        elif "datetime" in dt or "date" in dt:  # pandera often uses datetime64[ns]
+            parse_dates.append(canon)  # let pandas parse as datetime
+        elif dt in {"bool", "boolean"}:
+            dtype_map[canon] = "boolean"
+        elif dt == "category":
+            dtype_map[canon] = "category"
+        else:
+            # leave unmapped types to pandas inference (keeps behavior)
+            pass
+
+    return dtype_map, parse_dates
+
+
+def _build_exact_schema(
+    specs: Dict[str, dict], only_canons: List[str]
+) -> DataFrameSchema:
+    """
+    Build a Pandera schema with exact column names (no regex).
+    This avoids regex matching overhead during validation.
+    """
+    cols: Dict[str, Column] = {}
+    for canon in only_canons:
+        spec = specs[canon]
+        checks = []
+        for chk in spec.get("checks", []):
+            # small speedup opportunities:
+            #  - precompile regex patterns for str_matches
+            args = list(chk.get("args", []))
+            if (
+                chk["type"] in {"str_matches", "matches"}
+                and args
+                and isinstance(args[0], str)
+            ):
+                args[0] = re.compile(args[0])
+            factory = getattr(Check, chk["type"])
+            checks.append(factory(*args, **chk.get("kwargs", {})))
+
+        cols[canon] = Column(
+            name=canon,
+            regex=False,
+            dtype=spec["dtype"],
+            nullable=spec["nullable"],
+            required=True,  # present-by-construction here
+            checks=checks or None,
+            coerce=spec.get("coerce", False),
+        )
+    return DataFrameSchema(cols, strict=False)

From cbcc1b90510ef7e97521d591abc00527e42a44e0 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:27:37 -0500
Subject: [PATCH 68/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation_helper.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py
index 451f7839..a3e54732 100644
--- a/src/webapp/validation_helper.py
+++ b/src/webapp/validation_helper.py
@@ -1,14 +1,10 @@
-import io
-import os
-import json
 import re
 import logging
 from functools import lru_cache
-from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple
+from typing import List, Dict, Any, Tuple
 
 import pandas as pd
 from pandera import Column, Check, DataFrameSchema
-from pandera.errors import SchemaErrors
 
 logger = logging.getLogger(__name__)
 

From 624beaf625047106e3c206ca42db76091172db4a Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:30:59 -0500
Subject: [PATCH 69/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py        | 2 +-
 src/webapp/validation_helper.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index d6e1f169..88b97dcc 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -13,7 +13,7 @@
 from pandera import Column, Check, DataFrameSchema
 from pandera.errors import SchemaErrors
 from thefuzz import fuzz
-from validation_helper import (
+from .validation_helper import (
     _header_pass,
     _pandas_dtype_and_parse_dates,
     _build_exact_schema,
diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py
index a3e54732..3bc4cdc2 100644
--- a/src/webapp/validation_helper.py
+++ b/src/webapp/validation_helper.py
@@ -19,7 +19,7 @@ def normalize_col(name: str) -> str:
 
 
 def _spec_alias_lookup(
-    merged_specs: Dict[str, dict]
+    merged_specs: Dict[str, dict],
 ) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
     """
     Build fast lookups:
@@ -132,7 +132,7 @@ def _header_pass(
 
 
 def _pandas_dtype_and_parse_dates(
-    merged_specs: Dict[str, dict]
+    merged_specs: Dict[str, dict],
 ) -> Tuple[Dict[str, Any], List[str]]:
     """
     Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates.

From a92862c96bc51463ea9108578241b69bff169f49 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:37:37 -0500
Subject: [PATCH 70/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 88b97dcc..20d93e06 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -283,18 +283,6 @@ def validate_dataset(
         memory_map=True,  # often helps on local/posix filesystems
         engine="c",  # default fast path; keep behavior stable
     )
-    # optional speed-up if pyarrow is available; behavior stays correct
-    try:
-        import pyarrow  # noqa: F401
-
-        read_kwargs["engine"] = "pyarrow"
-        # pandas>=2: dtype_backend speeds strings/ints; ignore if not supported
-        try:
-            read_kwargs["dtype_backend"] = "pyarrow"
-        except TypeError:
-            pass
-    except Exception:
-        pass
 
     df = pd.read_csv(
         filename, **{k: v for k, v in read_kwargs.items() if v is not None}

From 680796ba06e81aa7a1b5a7e4262949f9e9ce3199 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:45:22 -0500
Subject: [PATCH 71/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py        | 423 ++++++++++++++++++++++----------
 src/webapp/validation_helper.py | 200 ---------------
 2 files changed, 296 insertions(+), 327 deletions(-)
 delete mode 100644 src/webapp/validation_helper.py

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 20d93e06..8efc0d3d 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -1,34 +1,47 @@
-"""File validation functions for various schemas. (Record by record validation happens in the
-pipelines, this is for general file validation.)
+"""File validation functions for various schemas.
+Record-by-record validation happens in the pipelines; this module performs
+general file validation with performance-focused improvements.
+
+Key speed-ups (without losing accuracy):
+- Header-only pass to discover/resolve columns before full load
+- Selective, typed CSV read via `usecols` and dtype mapping
+- Exact-name Pandera schemas (avoid regex column matching)
+- Fuzzy matching only for unresolved headers; use rapidfuzz if available
+- Precompiled regexes and set-based membership checks inside Pandera checks
 """
 
+from __future__ import annotations
+
 import io
 import os
 import json
 import re
-from typing import Union, List, Dict, Optional, Any, BinaryIO, cast
 import logging
+from functools import lru_cache
+from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple
 
 import pandas as pd
 from pandera import Column, Check, DataFrameSchema
 from pandera.errors import SchemaErrors
-from thefuzz import fuzz
-from .validation_helper import (
-    _header_pass,
-    _pandas_dtype_and_parse_dates,
-    _build_exact_schema,
-)
+
+# --------------------------------------------------------------------------- #
+# Logging
+# --------------------------------------------------------------------------- #
 
 logger = logging.getLogger(__name__)
 
+# --------------------------------------------------------------------------- #
+# Public entry points
+# --------------------------------------------------------------------------- #
+
 
 def validate_file_reader(
-    filename: str,
+    filename: Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper],
     allowed_schema: list[str],
     base_schema: dict,
     inst_schema: Optional[Dict[Any, Any]] = None,
 ) -> dict[str, Any]:
-    """Validates given a filename."""
+    """Validates a dataset given a filename and schema selection."""
     return validate_dataset(filename, base_schema, inst_schema, allowed_schema)
 
 
@@ -54,11 +67,18 @@ def __init__(
         super().__init__("; ".join(parts))
 
 
+# --------------------------------------------------------------------------- #
+# Utilities
+# --------------------------------------------------------------------------- #
+
+
+@lru_cache(maxsize=4096)
 def normalize_col(name: str) -> str:
-    name = name.strip().lower()  # Lowercase and trim whitespace
-    name = re.sub(r"[^a-z0-9_]", "_", name)  # Replace non-alphanum with underscore
-    name = re.sub(r"_+", "_", name)  # Collapse multiple underscores
-    return name.strip("_")  # Remove leading/trailing underscores
+    """Normalize a column name: trim, lowercase, non-alnum->'_', collapse '_'s."""
+    name = name.strip().lower()
+    name = re.sub(r"[^a-z0-9_]", "_", name)
+    name = re.sub(r"_+", "_", name)
+    return name.strip("_")
 
 
 def load_json(path: str) -> Any:
@@ -66,57 +86,7 @@ def load_json(path: str) -> Any:
         with open(path, "r") as f:
             return json.load(f)
     except Exception as e:
-        raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}")
-
-
-def rename_columns_to_match_schema(
-    df: pd.DataFrame,
-    canon_to_aliases: Dict[str, List[str]],
-    threshold: int = 90,
-) -> pd.DataFrame:
-    """
-    Rename incoming columns using fuzzy match against schema-defined column names and aliases.
-
-    Args:
-        df: Incoming dataframe
-        canon_to_aliases: Mapping from canonical column names to list of aliases (including the canonical name itself)
-        threshold: Fuzzy match score threshold to rename
-
-    Returns:
-        A new DataFrame with renamed columns
-    """
-    from collections import defaultdict
-
-    new_column_names = {}
-    log_info = defaultdict(list)
-
-    schema_names = []
-    for canon, aliases in canon_to_aliases.items():
-        for name in aliases:
-            schema_names.append((name, canon))  # (alias_or_name, canonical_name)
-
-    for incoming_col in df.columns:
-        best_score = 0
-        best_match = None
-        best_canon = None
-
-        for schema_col, canon in schema_names:
-            score = fuzz.ratio(incoming_col.lower(), schema_col.lower())
-            if score > best_score:
-                best_score = score
-                best_match = schema_col
-                best_canon = canon
-
-        if best_score >= threshold and incoming_col != best_canon:
-            new_column_names[incoming_col] = best_canon
-            log_info[incoming_col].append(
-                f"Renamed '{incoming_col}' -> '{best_canon}' (matched on '{best_match}', score={best_score})"
-            )
-
-    for k, v in log_info.items():
-        logging.info(" | ".join(v))
-
-    return df.rename(columns=new_column_names)
+        raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}") from e
 
 
 def merge_model_columns(
@@ -125,10 +95,12 @@ def merge_model_columns(
     institution: str,
     model: str,
 ) -> Dict[str, dict]:
+    """
+    Merge base model columns with institution-specific extension, if present.
+    """
     base_models = base_schema.get("base", {}).get("data_models", {})
     if model not in base_models:
-        if logging:
-            logging.error(f"Model '{model}' not found in base schema")
+        logger.error("Model '%s' not found in base schema", model)
         raise KeyError(f"Model '{model}' not in base schema")
     merged = dict(base_models[model].get("columns", {}))
     if extension_schema:
@@ -139,36 +111,16 @@ def merge_model_columns(
     return merged
 
 
-def build_schema(specs: Dict[str, dict]) -> DataFrameSchema:
-    columns = {}
-    for canon, spec in specs.items():
-        names = [canon] + spec.get("aliases", [])
-        pattern = r"^(?:" + "|".join(map(re.escape, names)) + r")$"
-        checks = []
-        for chk in spec.get("checks", []):
-            factory = getattr(Check, chk["type"])
-            checks.append(factory(*chk.get("args", []), **chk.get("kwargs", {})))
-
-        columns[pattern] = Column(
-            name=pattern,
-            regex=True,
-            dtype=spec["dtype"],
-            nullable=spec["nullable"],
-            required=spec.get("required", False),
-            checks=checks or None,
-            coerce=spec.get("coerce", False),
-        )
-    return DataFrameSchema(columns, strict=False)
-
-
-# --------------------- Actual Validation Layer ------------------------------
+# --------------------------------------------------------------------------- #
+# Encoding sniffing (mypy-friendly)
+# --------------------------------------------------------------------------- #
 
 Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper]
 
 
 def _read_sample(buf: BinaryIO, n: int) -> bytes:
     pos = buf.tell() if buf.seekable() else None
-    chunk = buf.read(n)  # -> bytes for BinaryIO
+    chunk = buf.read(n)
     if pos is not None:
         buf.seek(pos)
     return chunk
@@ -213,20 +165,221 @@ def sniff_encoding(src: Src, sample_bytes: int = 1_048_576) -> str:
         )
 
 
+def _reset_to_start_if_possible(src: Src) -> None:
+    """Best-effort reset to the beginning for file-like objects."""
+    try:
+        if hasattr(src, "seek") and callable(getattr(src, "seek")):
+            src.seek(0)  # type: ignore[attr-defined]
+    except Exception:
+        pass
+
+
+# --------------------------------------------------------------------------- #
+# Fast header pass & mapping
+# --------------------------------------------------------------------------- #
+
+
+def _spec_alias_lookup(
+    merged_specs: Dict[str, dict]
+) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
+    """
+    Build:
+      - alias2canon: normalized alias -> canonical
+      - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical)
+    """
+    alias2canon: Dict[str, str] = {}
+    canon_to_aliases_norm: Dict[str, List[str]] = {}
+    for canon, spec in merged_specs.items():
+        aliases = [canon] + spec.get("aliases", [])
+        normed = [normalize_col(a) for a in aliases]
+        canon_to_aliases_norm[canon] = normed
+        for a in normed:
+            alias2canon[a] = canon
+    return alias2canon, canon_to_aliases_norm
+
+
+def _fuzzy_map_unresolved(
+    unresolved: List[Tuple[str, str]],  # [(raw_header, normalized_header)]
+    choices: List[str],                 # normalized aliases
+    alias2canon: Dict[str, str],
+    threshold: int = 90,
+) -> Dict[str, str]:                    # raw_header -> canonical
+    """
+    Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz.
+    """
+    mapping: Dict[str, str] = {}
+    try:
+        from rapidfuzz import process, fuzz as rf_fuzz  # type: ignore
+        for raw, norm in unresolved:
+            hit = process.extractOne(norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold)
+            if hit:
+                best_alias, score, _ = hit
+                mapping[raw] = alias2canon[best_alias]  # type: ignore[index]
+    except Exception:
+        # fallback to thefuzz if rapidfuzz is unavailable
+        try:
+            from thefuzz import fuzz as tf_fuzz  # type: ignore
+        except Exception:
+            # If neither library is available, do not fuzz-map anything.
+            return mapping
+        for raw, norm in unresolved:
+            best_score = 0
+            best_alias = None
+            for alias in choices:
+                s = tf_fuzz.ratio(norm, alias)
+                if s > best_score:
+                    best_score, best_alias = s, alias
+            if best_alias and best_score >= threshold:
+                mapping[raw] = alias2canon[best_alias]
+    return mapping
+
+
+def _header_pass(
+    filename: Src,
+    encoding: str,
+    merged_specs: Dict[str, dict],
+    fuzzy_threshold: int = 90,
+) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]:
+    """
+    Read only the header. Return:
+      - raw_cols: list of column names as in file
+      - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy)
+      - missing_required: list of canonical columns missing
+      - missing_optional: list of optional canonical columns missing
+      - unknown_extra: normalized headers that don't map to any alias
+    """
+    header_df = pd.read_csv(filename, encoding=encoding, nrows=0)
+    raw_cols = list(header_df.columns)
+
+    alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs)
+    known_aliases = set(alias2canon.keys())
+
+    # exact (normalized) mapping first
+    raw_to_canon: Dict[str, str] = {}
+    unresolved: List[Tuple[str, str]] = []
+
+    for raw in raw_cols:
+        norm = normalize_col(raw)
+        if norm in alias2canon:
+            raw_to_canon[raw] = alias2canon[norm]
+        else:
+            unresolved.append((raw, norm))
+
+    # fuzzy match only for unresolved headers
+    if unresolved:
+        choices = list(known_aliases)
+        fuzzy_map = _fuzzy_map_unresolved(unresolved, choices, alias2canon, threshold=fuzzy_threshold)
+        raw_to_canon.update(fuzzy_map)
+
+    incoming_canons = set(raw_to_canon.values())
+    missing_required = [
+        c for c, spec in merged_specs.items()
+        if spec.get("required", False) and c not in incoming_canons
+    ]
+    missing_optional = [
+        c for c, spec in merged_specs.items()
+        if not spec.get("required", False) and c not in incoming_canons
+    ]
+    # normalized headers that remain unmapped and aren't known aliases
+    unknown_extra = sorted({norm for (_, norm) in unresolved if norm not in known_aliases})
+
+    return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra
+
+
+def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[str, Any], List[str]]:
+    """
+    Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates.
+    Keeps behavior stable while avoiding heavy inference.
+    """
+    dtype_map: Dict[str, Any] = {}
+    parse_dates: List[str] = []
+
+    for canon, spec in merged_specs.items():
+        dt = str(spec.get("dtype"))
+        if dt in {"string", "str", "object"}:
+            dtype_map[canon] = "string"
+        elif dt in {"int", "int64", "Int64"}:
+            dtype_map[canon] = "Int64"  # nullable integers are safer for dirty data
+        elif dt in {"float", "float64"}:
+            dtype_map[canon] = "float64"
+        elif "datetime" in dt or "date" in dt:
+            parse_dates.append(canon)
+        elif dt in {"bool", "boolean"}:
+            dtype_map[canon] = "boolean"
+        elif dt == "category":
+            dtype_map[canon] = "category"
+        else:
+            # leave to pandas inference
+            pass
+
+    return dtype_map, parse_dates
+
+
+def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataFrameSchema:
+    """
+    Build a Pandera schema with exact column names (no regex).
+    This avoids regex matching overhead during validation.
+    """
+    cols: Dict[str, Column] = {}
+    for canon in only_canons:
+        spec = specs[canon]
+        checks = []
+        for chk in spec.get("checks", []):
+            args = list(chk.get("args", []))
+            # precompile regex patterns once
+            if chk["type"] in {"str_matches", "matches"} and args and isinstance(args[0], str):
+                args[0] = re.compile(args[0])
+            # set-based membership for faster 'isin'
+            if chk["type"] in {"isin", "is_in"} and args and isinstance(args[0], list):
+                args[0] = set(args[0])
+
+            factory = getattr(Check, chk["type"])
+            checks.append(factory(*args, **chk.get("kwargs", {})))
+
+        cols[canon] = Column(
+            name=canon,
+            regex=False,
+            dtype=spec["dtype"],
+            nullable=spec["nullable"],
+            required=True,  # present-by-construction
+            checks=checks or None,
+            coerce=spec.get("coerce", False),
+        )
+    return DataFrameSchema(cols, strict=False)
+
+
+# --------------------------------------------------------------------------- #
+# Main validation
+# --------------------------------------------------------------------------- #
+
+
 def validate_dataset(
-    filename: str,
+    filename: Src,
     base_schema: dict,
     ext_schema: Optional[Dict[Any, Any]] = None,
     models: Union[str, List[str], None] = None,
     institution_id: str = "pdp",
 ) -> Dict[str, Any]:
-    # 0) encoding
+    """
+    Validate a dataset against merged base/extension schemas.
+
+    Steps:
+      1) Detect encoding (BOM/UTF-8)
+      2) Merge requested models' column specs
+      3) Header-only pass to map columns (exact + fuzzy) and detect missing/extra
+      4) Selective, typed read via pandas (skip unused columns)
+      5) Fail-fast validation for required columns; collect soft errors for optional
+    """
+    # --- 1) encoding ---
     try:
         enc = sniff_encoding(filename)  # latin-1 NOT allowed by default
     except UnicodeError as ex:
         raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)])
 
-    # 1) merge requested models
+    # Ensure both header and full reads start at the beginning for file-like handles
+    _reset_to_start_if_possible(filename)
+
+    # --- 2) merge requested models ---
     if models is None:
         model_list: List[str] = []
     elif isinstance(models, str):
@@ -248,60 +401,73 @@ def validate_dataset(
             "unknown_extra_columns": [],
         }
 
-    # 2) HEADER-ONLY PASS: map columns & find missing/extras cheaply
-    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = (
-        _header_pass(filename, enc, merged_specs, fuzzy_threshold=90)
+    # --- 3) HEADER-ONLY PASS ---
+    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = _header_pass(
+        filename, enc, merged_specs, fuzzy_threshold=90
     )
 
     if missing_required:
         logger.error("Missing required columns: %s", missing_required)
         raise HardValidationError(missing_required=missing_required)
 
-    # 3) selective typed load
-    present_canons = sorted(set(raw_to_canon.values()))
-    # choose one raw column per present canonical
+    # Reset again before the real read (important for file-like objects)
+    _reset_to_start_if_possible(filename)
+
+    # Choose one raw header per canonical; prefer exact canonical names when available
     canon_to_raw: Dict[str, str] = {}
     for raw, canon in raw_to_canon.items():
-        # prefer the raw header that's already exactly canonical if present
+        # Prefer if normalized raw equals canonical name
         if canon not in canon_to_raw or normalize_col(raw) == canon:
             canon_to_raw[canon] = raw
 
+    present_canons = sorted(canon_to_raw.keys())
     raw_usecols = list(canon_to_raw.values())
 
-    # dtype & parse_dates maps (by canonical); convert to raw keys for read_csv
+    # dtype & parse_dates maps (by canonical) -> convert to raw keys for read_csv
     canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs)
-    raw_dtype_map = {
-        canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw
-    }
+    raw_dtype_map = {canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw}
     parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw]
 
-    read_kwargs = dict(
+    # --- 4) Selective, typed read ---
+    # Default to fast C engine; try pyarrow if available.
+    engine = "c"
+    try:
+        import pyarrow  # noqa: F401
+        engine = "pyarrow"
+    except Exception:
+        pass
+
+    read_kwargs: Dict[str, Any] = dict(
         encoding=enc,
         usecols=raw_usecols,
         dtype=raw_dtype_map or None,
-        parse_dates=parse_dates_raw or None,
-        memory_map=True,  # often helps on local/posix filesystems
-        engine="c",  # default fast path; keep behavior stable
-    )
-
-    df = pd.read_csv(
-        filename, **{k: v for k, v in read_kwargs.items() if v is not None}
+        engine=engine,
     )
-
-    # 4) rename raw headers -> canon once (no DataFrame-wide fuzzy work)
-    df = df.rename(columns=canon_to_raw)  # temporarily raw->canon? Not quite.
-    # The above renames raw names to canonical because keys are canonical? Fix:
+    # memory_map works for path-like with the C engine
+    if engine == "c" and isinstance(filename, (str, os.PathLike)):
+        read_kwargs["memory_map"] = True
+        # only C engine supports parse_dates consistently across versions
+        if parse_dates_raw:
+            read_kwargs["parse_dates"] = parse_dates_raw
+
+    df = pd.read_csv(filename, **{k: v for k, v in read_kwargs.items() if v is not None})
+
+    # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy)
+    if engine == "pyarrow" and parse_dates_canons:
+        for canon in parse_dates_canons:
+            raw = canon_to_raw.get(canon)
+            if raw and raw in df.columns:
+                # coerce invalids to NaT; Pandera will flag according to nullability/checks
+                df[raw] = pd.to_datetime(df[raw], errors="coerce")
+
+    # Rename raw headers -> canonical names exactly once
     df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()})
 
-    # 5) REQUIRED FIRST (fail-fast), then OPTIONALS (collect soft errors)
-    required_canons = [
-        c for c in present_canons if merged_specs[c].get("required", False)
-    ]
-    optional_canons = [
-        c for c in present_canons if not merged_specs[c].get("required", False)
-    ]
+    # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) ---
+    required_canons = [c for c in present_canons if merged_specs[c].get("required", False)]
+    optional_canons = [c for c in present_canons if not merged_specs[c].get("required", False)]
 
-    # Build schemas with exact names only (faster than regex patterns)
+    # Build exact-name schemas (faster than regex)
     if required_canons:
         req_schema = _build_exact_schema(merged_specs, required_canons)
         try:
@@ -320,10 +486,13 @@ def validate_dataset(
         try:
             opt_schema.validate(df[optional_canons], lazy=True)
         except SchemaErrors as err:
+            # Columns are canonical already, so failure_cases['column'] are canonical names
             opt_failures = sorted(set(err.failure_cases["column"]))
             failure_cases_records = err.failure_cases.to_dict(orient="records")
 
-    # 6) return — status depends on soft errors / extras
+    logger.info("missing_optional = %s", missing_optional)
+
+    # Success (with potential soft issues)
     if opt_failures or missing_optional or unknown_extra:
         return {
             "validation_status": "passed_with_soft_errors",
diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py
deleted file mode 100644
index 3bc4cdc2..00000000
--- a/src/webapp/validation_helper.py
+++ /dev/null
@@ -1,200 +0,0 @@
-import re
-import logging
-from functools import lru_cache
-from typing import List, Dict, Any, Tuple
-
-import pandas as pd
-from pandera import Column, Check, DataFrameSchema
-
-logger = logging.getLogger(__name__)
-
-
-# ---------- normalization is pure; cache it ----------
-@lru_cache(maxsize=4096)
-def normalize_col(name: str) -> str:
-    name = name.strip().lower()
-    name = re.sub(r"[^a-z0-9_]", "_", name)
-    name = re.sub(r"_+", "_", name)
-    return name.strip("_")
-
-
-def _spec_alias_lookup(
-    merged_specs: Dict[str, dict],
-) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
-    """
-    Build fast lookups:
-      - alias2canon: normalized alias -> canonical
-      - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical)
-    """
-    alias2canon: Dict[str, str] = {}
-    canon_to_aliases_norm: Dict[str, List[str]] = {}
-    for canon, spec in merged_specs.items():
-        aliases = [canon] + spec.get("aliases", [])
-        normed = [normalize_col(a) for a in aliases]
-        canon_to_aliases_norm[canon] = normed
-        for a in normed:
-            alias2canon[a] = canon
-    return alias2canon, canon_to_aliases_norm
-
-
-def _fuzzy_map_unresolved(
-    unresolved: List[Tuple[str, str]],  # [(raw_header, normalized_header)]
-    choices: List[str],  # normalized aliases
-    alias2canon: Dict[str, str],
-    threshold: int = 90,
-) -> Dict[str, str]:  # raw_header -> canonical
-    """
-    Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz.
-    """
-    mapping: Dict[str, str] = {}
-    try:
-        from rapidfuzz import process, fuzz as rf_fuzz  # much faster
-
-        for raw, norm in unresolved:
-            hit = process.extractOne(
-                norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold
-            )
-            if hit:
-                best_alias, score, _ = hit
-                mapping[raw] = alias2canon[best_alias]
-    except Exception:
-        # fallback to thefuzz if rapidfuzz is unavailable
-        from thefuzz import fuzz as tf_fuzz
-
-        for raw, norm in unresolved:
-            best_score = 0
-            best_alias = None
-            for alias in choices:
-                s = tf_fuzz.ratio(norm, alias)
-                if s > best_score:
-                    best_score, best_alias = s, alias
-            if best_alias and best_score >= threshold:
-                mapping[raw] = alias2canon[best_alias]
-    return mapping
-
-
-def _header_pass(
-    filename: str,
-    encoding: str,
-    merged_specs: Dict[str, dict],
-    fuzzy_threshold: int = 90,
-) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]:
-    """
-    Read only the header. Return:
-      - raw_cols: list of column names as in file
-      - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy)
-      - missing_required: list of canonical columns missing
-      - missing_optional: list of optional canonical columns missing
-      - unknown_extra: normalized headers that don't map to any alias
-    """
-    header_df = pd.read_csv(filename, encoding=encoding, nrows=0)
-    raw_cols = list(header_df.columns)
-
-    alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs)
-    known_aliases = set(alias2canon.keys())
-
-    # exact (normalized) mapping first
-    raw_to_canon: Dict[str, str] = {}
-    unresolved: List[Tuple[str, str]] = []
-    incoming_norms: List[str] = []
-
-    for raw in raw_cols:
-        norm = normalize_col(raw)
-        incoming_norms.append(norm)
-        if norm in alias2canon:
-            raw_to_canon[raw] = alias2canon[norm]
-        else:
-            unresolved.append((raw, norm))
-
-    # fuzzy match only for unresolved headers
-    if unresolved:
-        choices = list(known_aliases)
-        fuzzy_map = _fuzzy_map_unresolved(
-            unresolved, choices, alias2canon, threshold=fuzzy_threshold
-        )
-        raw_to_canon.update(fuzzy_map)
-
-    # derive presence/missing/extras from header only
-    incoming_canons = set(raw_to_canon.values())
-    missing_required = [
-        c
-        for c, spec in merged_specs.items()
-        if spec.get("required", False) and c not in incoming_canons
-    ]
-    missing_optional = [
-        c
-        for c, spec in merged_specs.items()
-        if not spec.get("required", False) and c not in incoming_canons
-    ]
-    unknown_extra = sorted({n for (_, n) in unresolved if n not in known_aliases})
-
-    return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra
-
-
-def _pandas_dtype_and_parse_dates(
-    merged_specs: Dict[str, dict],
-) -> Tuple[Dict[str, Any], List[str]]:
-    """
-    Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates.
-    We keep it conservative to avoid accuracy loss.
-    """
-    dtype_map: Dict[str, Any] = {}
-    parse_dates: List[str] = []
-
-    for canon, spec in merged_specs.items():
-        dt = str(spec.get("dtype"))
-        # conservative mappings
-        if dt in {"string", "str", "object"}:
-            dtype_map[canon] = "string"
-        elif dt in {"int", "int64", "Int64"}:
-            # nullable integers are much safer for dirty data
-            dtype_map[canon] = "Int64"
-        elif dt in {"float", "float64"}:
-            dtype_map[canon] = "float64"
-        elif "datetime" in dt or "date" in dt:  # pandera often uses datetime64[ns]
-            parse_dates.append(canon)  # let pandas parse as datetime
-        elif dt in {"bool", "boolean"}:
-            dtype_map[canon] = "boolean"
-        elif dt == "category":
-            dtype_map[canon] = "category"
-        else:
-            # leave unmapped types to pandas inference (keeps behavior)
-            pass
-
-    return dtype_map, parse_dates
-
-
-def _build_exact_schema(
-    specs: Dict[str, dict], only_canons: List[str]
-) -> DataFrameSchema:
-    """
-    Build a Pandera schema with exact column names (no regex).
-    This avoids regex matching overhead during validation.
-    """
-    cols: Dict[str, Column] = {}
-    for canon in only_canons:
-        spec = specs[canon]
-        checks = []
-        for chk in spec.get("checks", []):
-            # small speedup opportunities:
-            #  - precompile regex patterns for str_matches
-            args = list(chk.get("args", []))
-            if (
-                chk["type"] in {"str_matches", "matches"}
-                and args
-                and isinstance(args[0], str)
-            ):
-                args[0] = re.compile(args[0])
-            factory = getattr(Check, chk["type"])
-            checks.append(factory(*args, **chk.get("kwargs", {})))
-
-        cols[canon] = Column(
-            name=canon,
-            regex=False,
-            dtype=spec["dtype"],
-            nullable=spec["nullable"],
-            required=True,  # present-by-construction here
-            checks=checks or None,
-            coerce=spec.get("coerce", False),
-        )
-    return DataFrameSchema(cols, strict=False)

From 1b5452eca2578b110e2cbe4c7eeec45fe744c647 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:45:44 -0500
Subject: [PATCH 72/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 58 +++++++++++++++++++++++++++++-----------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 8efc0d3d..7ff21755 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -200,18 +200,21 @@ def _spec_alias_lookup(
 
 def _fuzzy_map_unresolved(
     unresolved: List[Tuple[str, str]],  # [(raw_header, normalized_header)]
-    choices: List[str],                 # normalized aliases
+    choices: List[str],  # normalized aliases
     alias2canon: Dict[str, str],
     threshold: int = 90,
-) -> Dict[str, str]:                    # raw_header -> canonical
+) -> Dict[str, str]:  # raw_header -> canonical
     """
     Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz.
     """
     mapping: Dict[str, str] = {}
     try:
         from rapidfuzz import process, fuzz as rf_fuzz  # type: ignore
+
         for raw, norm in unresolved:
-            hit = process.extractOne(norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold)
+            hit = process.extractOne(
+                norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold
+            )
             if hit:
                 best_alias, score, _ = hit
                 mapping[raw] = alias2canon[best_alias]  # type: ignore[index]
@@ -268,25 +271,33 @@ def _header_pass(
     # fuzzy match only for unresolved headers
     if unresolved:
         choices = list(known_aliases)
-        fuzzy_map = _fuzzy_map_unresolved(unresolved, choices, alias2canon, threshold=fuzzy_threshold)
+        fuzzy_map = _fuzzy_map_unresolved(
+            unresolved, choices, alias2canon, threshold=fuzzy_threshold
+        )
         raw_to_canon.update(fuzzy_map)
 
     incoming_canons = set(raw_to_canon.values())
     missing_required = [
-        c for c, spec in merged_specs.items()
+        c
+        for c, spec in merged_specs.items()
         if spec.get("required", False) and c not in incoming_canons
     ]
     missing_optional = [
-        c for c, spec in merged_specs.items()
+        c
+        for c, spec in merged_specs.items()
         if not spec.get("required", False) and c not in incoming_canons
     ]
     # normalized headers that remain unmapped and aren't known aliases
-    unknown_extra = sorted({norm for (_, norm) in unresolved if norm not in known_aliases})
+    unknown_extra = sorted(
+        {norm for (_, norm) in unresolved if norm not in known_aliases}
+    )
 
     return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra
 
 
-def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[str, Any], List[str]]:
+def _pandas_dtype_and_parse_dates(
+    merged_specs: Dict[str, dict]
+) -> Tuple[Dict[str, Any], List[str]]:
     """
     Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates.
     Keeps behavior stable while avoiding heavy inference.
@@ -315,7 +326,9 @@ def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[s
     return dtype_map, parse_dates
 
 
-def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataFrameSchema:
+def _build_exact_schema(
+    specs: Dict[str, dict], only_canons: List[str]
+) -> DataFrameSchema:
     """
     Build a Pandera schema with exact column names (no regex).
     This avoids regex matching overhead during validation.
@@ -327,7 +340,11 @@ def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataF
         for chk in spec.get("checks", []):
             args = list(chk.get("args", []))
             # precompile regex patterns once
-            if chk["type"] in {"str_matches", "matches"} and args and isinstance(args[0], str):
+            if (
+                chk["type"] in {"str_matches", "matches"}
+                and args
+                and isinstance(args[0], str)
+            ):
                 args[0] = re.compile(args[0])
             # set-based membership for faster 'isin'
             if chk["type"] in {"isin", "is_in"} and args and isinstance(args[0], list):
@@ -402,8 +419,8 @@ def validate_dataset(
         }
 
     # --- 3) HEADER-ONLY PASS ---
-    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = _header_pass(
-        filename, enc, merged_specs, fuzzy_threshold=90
+    raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = (
+        _header_pass(filename, enc, merged_specs, fuzzy_threshold=90)
     )
 
     if missing_required:
@@ -425,7 +442,9 @@ def validate_dataset(
 
     # dtype & parse_dates maps (by canonical) -> convert to raw keys for read_csv
     canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs)
-    raw_dtype_map = {canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw}
+    raw_dtype_map = {
+        canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw
+    }
     parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw]
 
     # --- 4) Selective, typed read ---
@@ -433,6 +452,7 @@ def validate_dataset(
     engine = "c"
     try:
         import pyarrow  # noqa: F401
+
         engine = "pyarrow"
     except Exception:
         pass
@@ -450,7 +470,9 @@ def validate_dataset(
         if parse_dates_raw:
             read_kwargs["parse_dates"] = parse_dates_raw
 
-    df = pd.read_csv(filename, **{k: v for k, v in read_kwargs.items() if v is not None})
+    df = pd.read_csv(
+        filename, **{k: v for k, v in read_kwargs.items() if v is not None}
+    )
 
     # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy)
     if engine == "pyarrow" and parse_dates_canons:
@@ -464,8 +486,12 @@ def validate_dataset(
     df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()})
 
     # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) ---
-    required_canons = [c for c in present_canons if merged_specs[c].get("required", False)]
-    optional_canons = [c for c in present_canons if not merged_specs[c].get("required", False)]
+    required_canons = [
+        c for c in present_canons if merged_specs[c].get("required", False)
+    ]
+    optional_canons = [
+        c for c in present_canons if not merged_specs[c].get("required", False)
+    ]
 
     # Build exact-name schemas (faster than regex)
     if required_canons:

From c8a58720de47bc74f35118232f5c7fa61e8e1ee3 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:51:35 -0500
Subject: [PATCH 73/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 7ff21755..58922734 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -180,7 +180,7 @@ def _reset_to_start_if_possible(src: Src) -> None:
 
 
 def _spec_alias_lookup(
-    merged_specs: Dict[str, dict]
+    merged_specs: Dict[str, dict],
 ) -> Tuple[Dict[str, str], Dict[str, List[str]]]:
     """
     Build:
@@ -296,7 +296,7 @@ def _header_pass(
 
 
 def _pandas_dtype_and_parse_dates(
-    merged_specs: Dict[str, dict]
+    merged_specs: Dict[str, dict],
 ) -> Tuple[Dict[str, Any], List[str]]:
     """
     Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates.

From e9d2067114e0b31edd78a2377ca61fe4a5e0d489 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Fri, 12 Sep 2025 19:54:18 -0500
Subject: [PATCH 74/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 58922734..fd7abd13 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -477,7 +477,7 @@ def validate_dataset(
     # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy)
     if engine == "pyarrow" and parse_dates_canons:
         for canon in parse_dates_canons:
-            raw = canon_to_raw.get(canon)
+            raw = str(canon_to_raw.get(canon))
             if raw and raw in df.columns:
                 # coerce invalids to NaT; Pandera will flag according to nullability/checks
                 df[raw] = pd.to_datetime(df[raw], errors="coerce")

From 1d8fd3fb67c57f0223b07837cf66fe28160ec917 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Sat, 13 Sep 2025 08:36:20 -0500
Subject: [PATCH 75/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index fd7abd13..2b5ced20 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -452,7 +452,6 @@ def validate_dataset(
     engine = "c"
     try:
         import pyarrow  # noqa: F401
-
         engine = "pyarrow"
     except Exception:
         pass

From 1085628302842ae41dc2aaa4556c06c1d6b0fffd Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Sat, 13 Sep 2025 08:38:47 -0500
Subject: [PATCH 76/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index 2b5ced20..fd7abd13 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -452,6 +452,7 @@ def validate_dataset(
     engine = "c"
     try:
         import pyarrow  # noqa: F401
+
         engine = "pyarrow"
     except Exception:
         pass

From a5fd596075c7264b05cd10b5a6a271ad810324f2 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Sat, 13 Sep 2025 09:30:12 -0500
Subject: [PATCH 77/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index fd7abd13..dec777a7 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -387,16 +387,16 @@ def validate_dataset(
       4) Selective, typed read via pandas (skip unused columns)
       5) Fail-fast validation for required columns; collect soft errors for optional
     """
-    # --- 1) encoding ---
+    # ---------------------------- 1) Encoding
     try:
-        enc = sniff_encoding(filename)  # latin-1 NOT allowed by default
+        enc = sniff_encoding(filename)
     except UnicodeError as ex:
         raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)])
 
     # Ensure both header and full reads start at the beginning for file-like handles
     _reset_to_start_if_possible(filename)
 
-    # --- 2) merge requested models ---
+    # ---------------------------- 2) merge requested models
     if models is None:
         model_list: List[str] = []
     elif isinstance(models, str):
@@ -418,7 +418,7 @@ def validate_dataset(
             "unknown_extra_columns": [],
         }
 
-    # --- 3) HEADER-ONLY PASS ---
+    # ----------------------------  3) HEADER-ONLY PASS
     raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = (
         _header_pass(filename, enc, merged_specs, fuzzy_threshold=90)
     )
@@ -447,13 +447,22 @@ def validate_dataset(
     }
     parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw]
 
-    # --- 4) Selective, typed read ---
+    # ---------------------------- 4) Selective, typed read
     # Default to fast C engine; try pyarrow if available.
     engine = "c"
+    use_threads = None  # only meaningful for pyarrow engine
+    dtype_backend = None
+
     try:
         import pyarrow  # noqa: F401
 
         engine = "pyarrow"
+        use_threads = True  # multi-threaded CSV parsing
+        # pandas>=2.0 can store DataFrame blocks as Arrow arrays (often faster)
+        try:
+            dtype_backend = "pyarrow"
+        except TypeError:
+            dtype_backend = None
     except Exception:
         pass
 
@@ -462,11 +471,12 @@ def validate_dataset(
         usecols=raw_usecols,
         dtype=raw_dtype_map or None,
         engine=engine,
+        dtype_backend=dtype_backend,  # ignored if None / not supported
+        use_threads=use_threads,  # ignored by C engine
     )
     # memory_map works for path-like with the C engine
     if engine == "c" and isinstance(filename, (str, os.PathLike)):
         read_kwargs["memory_map"] = True
-        # only C engine supports parse_dates consistently across versions
         if parse_dates_raw:
             read_kwargs["parse_dates"] = parse_dates_raw
 
@@ -485,7 +495,7 @@ def validate_dataset(
     # Rename raw headers -> canonical names exactly once
     df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()})
 
-    # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) ---
+    # ---------------------------- 5) Validation: required fail-fast, optional lazy (collect soft errors)
     required_canons = [
         c for c in present_canons if merged_specs[c].get("required", False)
     ]

From e0f104aa66bd656f81d2e00de5a24291484d3742 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Sat, 13 Sep 2025 09:39:29 -0500
Subject: [PATCH 78/92] fixing validation issues with problematic MSUD file:
 Optimizing encoding check

---
 src/webapp/validation.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/webapp/validation.py b/src/webapp/validation.py
index dec777a7..e02df270 100644
--- a/src/webapp/validation.py
+++ b/src/webapp/validation.py
@@ -450,19 +450,10 @@ def validate_dataset(
     # ---------------------------- 4) Selective, typed read
     # Default to fast C engine; try pyarrow if available.
     engine = "c"
-    use_threads = None  # only meaningful for pyarrow engine
-    dtype_backend = None
-
     try:
         import pyarrow  # noqa: F401
 
         engine = "pyarrow"
-        use_threads = True  # multi-threaded CSV parsing
-        # pandas>=2.0 can store DataFrame blocks as Arrow arrays (often faster)
-        try:
-            dtype_backend = "pyarrow"
-        except TypeError:
-            dtype_backend = None
     except Exception:
         pass
 
@@ -471,12 +462,11 @@ def validate_dataset(
         usecols=raw_usecols,
         dtype=raw_dtype_map or None,
         engine=engine,
-        dtype_backend=dtype_backend,  # ignored if None / not supported
-        use_threads=use_threads,  # ignored by C engine
     )
     # memory_map works for path-like with the C engine
     if engine == "c" and isinstance(filename, (str, os.PathLike)):
         read_kwargs["memory_map"] = True
+        # only C engine supports parse_dates consistently across versions
         if parse_dates_raw:
             read_kwargs["parse_dates"] = parse_dates_raw
 
@@ -493,7 +483,7 @@ def validate_dataset(
                 df[raw] = pd.to_datetime(df[raw], errors="coerce")
 
     # Rename raw headers -> canonical names exactly once
-    df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()})
+    df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}, inplace=True)
 
     # ---------------------------- 5) Validation: required fail-fast, optional lazy (collect soft errors)
     required_canons = [

From edb2a34b240ce3eafa93bd2175dace851464d8b9 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 08:49:32 -0500
Subject: [PATCH 79/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 314 +++++++++++++++++++++++--------------
 1 file changed, 196 insertions(+), 118 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 29b8587e..70c679d8 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1033,143 +1033,225 @@ def validation_helper(
     storage_control: StorageControl,
     sql_session: Session,
 ) -> Any:
-    """Helper function for file validation."""
+    """Helper function for file validation (self-contained & optimized)."""
+    import time
+    import re
+    import os
+
+    # --- access check & quick input validation
     has_access_to_inst_or_err(inst_id, current_user)
-    if file_name.find("/") != -1:
-        raise HTTPException(
-            status_code=422,
-            detail="File name can't contain '/'.",
-        )
+    if "/" in file_name:
+        raise HTTPException(status_code=422, detail="File name can't contain '/'.")
+
+    # --- bind session once
     local_session.set(sql_session)
+    sess = local_session.get()
 
-    allowed_schemas = None
-    if not allowed_schemas:
-        allowed_schemas = infer_models_from_filename(file_name)
+    # --- one-time initialization on the function object (kept in-process)
+    if not hasattr(validation_helper, "_ar_re"):
+        validation_helper._ar_re = re.compile(
+            r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE
+        )
+    if not hasattr(validation_helper, "_base_cache"):
+        # {"exp": <monotonic expiry>, "val": (<schema_id>, <json_doc>)}
+        validation_helper._base_cache = {"exp": 0.0, "val": None}
+    if not hasattr(validation_helper, "_ext_cache"):
+        # { str(inst_uuid): (exp, extension_json_doc) }
+        validation_helper._ext_cache = {}
+    if not hasattr(validation_helper, "_pdp_cache"):
+        # PDP-wide extension (active), cached: (exp, doc)
+        validation_helper._pdp_cache = (0.0, None)
+
+    AR_RE = validation_helper._ar_re
+    BASE_TTL = 300  # seconds
+    EXT_TTL = 120  # seconds
+
+    # --- filename → allowed_schemas (fast, single pass)
+    name = os.path.basename(file_name).lower()
+    has_course = "course" in name
+    has_semester = "semester" in name
+    has_student = (
+        ("student" in name)
+        or ("cohort" in name)
+        or (
+            (not has_course)
+            and (AR_RE.search(name) is not None or "deidentified" in name)
+        )
+    )
 
-    inferred_schemas: list[str] = []
-    # ----------------------- Fetch base schema from DB -------------------------------
-    base_schema = (
-        local_session.get()
-        .execute(
+    inferred_from_name: set[str] = set()
+    if has_course:
+        inferred_from_name.add("COURSE")
+    if has_student:
+        inferred_from_name.add("STUDENT")
+    if has_semester:
+        inferred_from_name.add("SEMESTER")
+
+    if not inferred_from_name:
+        raise ValueError(
+            f"Could not infer model(s) from file name: {name}. "
+            "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')."
+        )
+
+    allowed_schemas = sorted(inferred_from_name)
+
+    # --- fetch active base schema (cached)
+    now = time.monotonic()
+    base_cache = validation_helper._base_cache
+    if now < base_cache["exp"] and base_cache["val"] is not None:
+        base_schema_id, base_schema = base_cache["val"]
+    else:
+        row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)
             .where(
                 SchemaRegistryTable.doc_type == DocType.base,
                 SchemaRegistryTable.is_active.is_(True),
             )
             .limit(1)
-        )
-        .first()
-    )
-    if base_schema is None:
-        raise RuntimeError("No active base schema found")
-
-    base_schema_id, base_schema = base_schema
-    # ----------------------- Fetch inst specific extension schema from DB ---------------------
-    inst = (
-        local_session.get()
-        .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id)))
-        .scalar_one_or_none()
-    )
+        ).first()
+        if row is None:
+            raise RuntimeError("No active base schema found")
+        base_schema_id, base_schema = row
+        base_cache["exp"] = now + BASE_TTL
+        base_cache["val"] = (base_schema_id, base_schema)
+
+    # --- fetch institution record
+    inst = sess.execute(
+        select(InstTable).where(InstTable.id == str_to_uuid(inst_id))
+    ).scalar_one_or_none()
     if inst is None:
         raise ValueError(f"Institution {inst_id} not found")
 
-    if inst.pdp_id:  # institution is PDP
-        inst_schema = (
-            local_session.get()
-            .execute(
+    bucket = get_external_bucket_name(inst_id)
+
+    # --- choose / prepare extension schema (try to avoid heavy path)
+    updated_inst_schema: Optional[dict] = None
+
+    def _ext_models_set(doc: Optional[dict]) -> set[str]:
+        """Extract model keys from an extension document (root or institutions.* layout)."""
+        if not doc or not isinstance(doc, dict):
+            return set()
+        # root-level
+        if isinstance(doc.get("data_models"), dict):
+            return {str(k).lower() for k in doc["data_models"].keys()}
+        # nested by institution
+        inst_key_candidates = {str(getattr(inst, "id", "")), inst_id}
+        insts = doc.get("institutions", {})
+        if isinstance(insts, dict):
+            for key in inst_key_candidates:
+                block = insts.get(key)
+                if isinstance(block, dict) and isinstance(
+                    block.get("data_models"), dict
+                ):
+                    return {str(k).lower() for k in block["data_models"].keys()}
+        return set()
+
+    if getattr(inst, "pdp_id", None):
+        # PDP institutions: use active PDP extension (cached)
+        pdp_exp, pdp_doc = validation_helper._pdp_cache
+        if now < pdp_exp and pdp_doc is not None:
+            inst_schema = pdp_doc
+        else:
+            inst_schema = sess.execute(
                 select(SchemaRegistryTable.json_doc)
                 .where(
                     SchemaRegistryTable.is_pdp.is_(True),
                     SchemaRegistryTable.is_active.is_(True),
                 )
                 .limit(1)
-            )
-            .scalar_one_or_none()
-        )
-        updated_inst_schema: dict | None = inst_schema
-    else:  # custom (or none)
-        inst_schema = (
-            local_session.get()
-            .execute(
+            ).scalar_one_or_none()
+            validation_helper._pdp_cache = (now + EXT_TTL, inst_schema)
+        updated_inst_schema = inst_schema
+    else:
+        # custom institutions: try cached extension first
+        ext_cache = validation_helper._ext_cache
+        key = str(getattr(inst, "id", ""))
+        cached = ext_cache.get(key)
+        if cached and now < cached[0]:
+            inst_schema = cached[1]
+        else:
+            inst_schema = sess.execute(
                 select(SchemaRegistryTable.json_doc)
                 .where(
-                    SchemaRegistryTable.inst_id == inst.id,
+                    SchemaRegistryTable.inst_id == getattr(inst, "id", None),
                     SchemaRegistryTable.is_active.is_(True),
-                    SchemaRegistryTable.doc_type == DocType.extension,  # be explicit
+                    SchemaRegistryTable.doc_type == DocType.extension,
                 )
                 .limit(1)
-            )
-            .scalar_one_or_none()
-        )
-
-        dbc = DatabricksControl()
-        schema_extension = dbc.create_custom_schema_extension(
-            bucket_name=get_external_bucket_name(inst_id),
-            inst_query=inst,
-            file_name=file_name,
-            base_schema=base_schema,
-            extension_schema=inst_schema,
-        )
+            ).scalar_one_or_none()
+            ext_cache[key] = (now + EXT_TTL, inst_schema)
 
-        if schema_extension is not None:
-            updated_inst_schema = schema_extension
-            try:
-                new_schema_extension_record = SchemaRegistryTable(
-                    doc_type=DocType.extension,
-                    inst_id=str_to_uuid(inst_id),
-                    is_pdp=False,  # type: ignore
-                    version_label="1.0.0",
-                    extends_schema_id=base_schema_id,
-                    json_doc=schema_extension,
-                    is_active=True,
-                )
-                sess = local_session.get()
-                sess.add(new_schema_extension_record)
-                sess.flush()
-                logging.info("Schema record inserted for '%s'", inst_id)
-            except IntegrityError as e:
-                sess = local_session.get()
-                sess.rollback()
-                logging.warning("IntegrityError: %s", e)
-            except Exception as e:
-                sess = local_session.get()
-                sess.rollback()
-                logging.error("Unexpected DB error: %s", e)
-                raise HTTPException(
-                    status_code=500,
-                    detail=f"Unexpected database error while inserting file record: {e}",
-                )
+        # If extension already includes all inferred models, skip Databricks work.
+        inferred_lower = {m.lower() for m in allowed_schemas}
+        ext_models = _ext_models_set(inst_schema)
+        if inferred_lower.issubset(ext_models):
+            updated_inst_schema = inst_schema
         else:
-            logging.info(
-                "No-op: extension already contains this model for inst %s", inst_id
+            # heavy path only when needed
+            dbc = DatabricksControl()
+            schema_extension = dbc.create_custom_schema_extension(
+                bucket_name=bucket,
+                inst_query=inst,
+                file_name=file_name,
+                base_schema=base_schema,
+                extension_schema=inst_schema,
             )
-            updated_inst_schema = inst_schema
+            if schema_extension is not None:
+                updated_inst_schema = schema_extension
+                try:
+                    new_schema_extension_record = SchemaRegistryTable(
+                        doc_type=DocType.extension,
+                        inst_id=str_to_uuid(inst_id),
+                        is_pdp=False,  # type: ignore
+                        version_label="1.0.0",
+                        extends_schema_id=base_schema_id,
+                        json_doc=schema_extension,
+                        is_active=True,
+                    )
+                    sess.add(new_schema_extension_record)
+                    sess.flush()
+                    logging.info("Schema record inserted for '%s'", inst_id)
+                    # refresh cache
+                    validation_helper._ext_cache[key] = (
+                        time.monotonic() + EXT_TTL,
+                        schema_extension,
+                    )
+                except IntegrityError as e:
+                    sess.rollback()
+                    logging.warning("IntegrityError: %s", e)
+                except Exception as e:
+                    sess.rollback()
+                    logging.error("Unexpected DB error: %s", e)
+                    raise HTTPException(
+                        status_code=500,
+                        detail=f"Unexpected database error while inserting file record: {e}",
+                    )
+            else:
+                logging.info(
+                    "No-op: extension already contains this model for inst %s", inst_id
+                )
+                updated_inst_schema = inst_schema
 
-    # ----------------------- File validation logic logic --------------------------------------
+    # --- run file validation (I/O + Pandera work happens inside storage layer)
     try:
         inferred_schemas = storage_control.validate_file(
-            get_external_bucket_name(inst_id),
+            bucket,
             file_name,
             allowed_schemas,
             base_schema,
             updated_inst_schema,
         )
-        logging.debug(
-            "!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas)
-        )
-
+        logging.debug("Inferred Schemas success %s", list(inferred_schemas))
     except HardValidationError as e:
-        logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e)
-        # Build a single string - frontend can render this reliably
-        msg_parts = ["VALIDATION_FAILED"]
+        logging.debug("Inferred Schemas FAILED (hard) %s", e)
+        parts = ["VALIDATION_FAILED"]
         if e.missing_required:
-            msg_parts.append(f"missing_required={e.missing_required}")
+            parts.append(f"missing_required={e.missing_required}")
         if e.extra_columns:
-            msg_parts.append(f"extra_columns={e.extra_columns}")
+            parts.append(f"extra_columns={e.extra_columns}")
         if e.schema_errors is not None:
-            msg_parts.append(f"schema_errors={e.schema_errors}")
+            parts.append(f"schema_errors={e.schema_errors}")
         if e.failure_cases is not None:
-            # keep short; avoid dumping huge tables
             try:
                 sample = (
                     e.failure_cases[:5]
@@ -1178,31 +1260,26 @@ def validation_helper(
                 )
             except Exception:
                 sample = "see server logs"
-            msg_parts.append(f"failure_cases_sample={sample}")
+            parts.append(f"failure_cases_sample={sample}")
         raise HTTPException(
-            status_code=status.HTTP_400_BAD_REQUEST,
-            detail="; ".join(msg_parts),
+            status_code=status.HTTP_400_BAD_REQUEST, detail="; ".join(parts)
         )
-
     except Exception as e:
-        logging.debug("!!!!!!!!!!Inferred Schemas FAILED (other) %s", e)
+        logging.debug("Inferred Schemas FAILED (other) %s", e)
         raise HTTPException(
             status_code=status.HTTP_400_BAD_REQUEST,
             detail=f"VALIDATION_ERROR: {type(e).__name__}: {e}",
         )
 
+    # --- upsert file record (cheap path)
     existing_file = (
-        local_session.get()
-        .query(FileTable)
-        .filter_by(
-            name=file_name,
-            inst_id=str_to_uuid(inst_id),
-        )
+        sess.query(FileTable)
+        .filter_by(name=file_name, inst_id=str_to_uuid(inst_id))
         .first()
     )
 
     if existing_file:
-        logging.info(f"File '{file_name}' already exists for institution {inst_id}.")
+        logging.info("File '%s' already exists for institution %s.", file_name, inst_id)
         db_status = f"File '{file_name}' already exists for institution {inst_id}."
     else:
         try:
@@ -1212,20 +1289,21 @@ def validation_helper(
                 uploader=str_to_uuid(current_user.user_id),  # type: ignore
                 source=source_str,
                 sst_generated=False,
-                schemas=list(allowed_schemas),
+                # Store what validation actually inferred (not only filename guess)
+                schemas=list(inferred_schemas),
                 valid=True,
             )
-            local_session.get().add(new_file_record)
-            local_session.get().flush()
-            logging.info(f"File record inserted for '{file_name}'")
+            sess.add(new_file_record)
+            sess.flush()
+            logging.info("File record inserted for '%s'", file_name)
             db_status = f"File record inserted for '{file_name}'"
         except IntegrityError as e:
-            local_session.get().rollback()
-            logging.warning(f"IntegrityError: {e}")
+            sess.rollback()
+            logging.warning("IntegrityError: %s", e)
             db_status = "Already exists"
         except Exception as e:
-            local_session.get().rollback()
-            logging.error(f"Unexpected DB error: {e}")
+            sess.rollback()
+            logging.error("Unexpected DB error: %s", e)
             raise HTTPException(
                 status_code=500,
                 detail=f"Unexpected database error while inserting file record: {e}",
@@ -1234,7 +1312,7 @@ def validation_helper(
     return {
         "name": file_name,
         "inst_id": inst_id,
-        "file_types": list(allowed_schemas),
+        "file_types": list(inferred_schemas),
         "source": source_str,
         "status": db_status,
     }

From 00d939de0d79980f4ac1526a33a98265f0227f54 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 08:53:26 -0500
Subject: [PATCH 80/92] revamped entire validation helper script

---
 src/webapp/routers/data_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py
index 9b1c1c31..d0aaf9e0 100644
--- a/src/webapp/routers/data_test.py
+++ b/src/webapp/routers/data_test.py
@@ -569,7 +569,7 @@ def test_update_batch(client: TestClient) -> None:
 
 def test_validate_success_batch(client: TestClient) -> None:
     """Test PATCH /institutions/<uuid>/batch."""
-    MOCK_STORAGE.validate_file.return_value = ["UNKNOWN"]
+    MOCK_STORAGE.validate_file.return_value = ["COURSE"]
 
     # Use validate for manual upload
     response_upload = client.post(

From 5a5cd32641d9c46534777bb8c5df10dd0d4825fc Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:10:29 -0500
Subject: [PATCH 81/92] revamped entire validation helper script

---
 src/webapp/routers/data.py      | 14 ++++++++++----
 src/webapp/routers/data_test.py |  2 +-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 70c679d8..de47f693 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1277,7 +1277,14 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
         .filter_by(name=file_name, inst_id=str_to_uuid(inst_id))
         .first()
     )
-
+    if set(inferred_schemas) != set(allowed_schemas):
+        logging.info(
+            "Filename inference %s differs from validator result %s for %s; "
+            "returning filename-based types to preserve API contract.",
+            allowed_schemas,
+            inferred_schemas,
+            file_name,
+        )
     if existing_file:
         logging.info("File '%s' already exists for institution %s.", file_name, inst_id)
         db_status = f"File '{file_name}' already exists for institution {inst_id}."
@@ -1289,8 +1296,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
                 uploader=str_to_uuid(current_user.user_id),  # type: ignore
                 source=source_str,
                 sst_generated=False,
-                # Store what validation actually inferred (not only filename guess)
-                schemas=list(inferred_schemas),
+                schemas=list(allowed_schemas),
                 valid=True,
             )
             sess.add(new_file_record)
@@ -1312,7 +1318,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
     return {
         "name": file_name,
         "inst_id": inst_id,
-        "file_types": list(inferred_schemas),
+        "file_types": list(allowed_schemas),
         "source": source_str,
         "status": db_status,
     }
diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py
index d0aaf9e0..9b1c1c31 100644
--- a/src/webapp/routers/data_test.py
+++ b/src/webapp/routers/data_test.py
@@ -569,7 +569,7 @@ def test_update_batch(client: TestClient) -> None:
 
 def test_validate_success_batch(client: TestClient) -> None:
     """Test PATCH /institutions/<uuid>/batch."""
-    MOCK_STORAGE.validate_file.return_value = ["COURSE"]
+    MOCK_STORAGE.validate_file.return_value = ["UNKNOWN"]
 
     # Use validate for manual upload
     response_upload = client.post(

From a261e8e119ff2cd7600f1f26ee7d23a3fcce6ffd Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:24:02 -0500
Subject: [PATCH 82/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 40 ++++++++++++++++----------------------
 1 file changed, 17 insertions(+), 23 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index de47f693..3d8b914b 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -3,7 +3,7 @@
 import uuid
 from datetime import datetime, date
 from databricks.sdk import WorkspaceClient
-from typing import Annotated, Any, Dict, List, cast, IO, Optional
+from typing import Annotated, Any, Dict, List, cast, IO, Optional, Tuple
 from pydantic import BaseModel, Field
 from fastapi import APIRouter, Depends, HTTPException, status, Response, Query
 from fastapi.responses import FileResponse
@@ -1025,6 +1025,16 @@ def infer_models_from_filename(file_path: str) -> List[str]:
     return sorted(inferred)
 
 
+class _ValidationState:
+    _ar_re = re.compile(r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE)
+    _base_cache: Dict[str, Any] = {"exp": 0.0, "val": None}
+    _ext_cache: Dict[str, Tuple[float, Any]] = {}
+    _pdp_cache: Tuple[float, Optional[dict]] = (0.0, None)
+
+
+STATE = _ValidationState()
+
+
 def validation_helper(
     source_str: str,
     inst_id: str,
@@ -1047,22 +1057,7 @@ def validation_helper(
     local_session.set(sql_session)
     sess = local_session.get()
 
-    # --- one-time initialization on the function object (kept in-process)
-    if not hasattr(validation_helper, "_ar_re"):
-        validation_helper._ar_re = re.compile(
-            r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE
-        )
-    if not hasattr(validation_helper, "_base_cache"):
-        # {"exp": <monotonic expiry>, "val": (<schema_id>, <json_doc>)}
-        validation_helper._base_cache = {"exp": 0.0, "val": None}
-    if not hasattr(validation_helper, "_ext_cache"):
-        # { str(inst_uuid): (exp, extension_json_doc) }
-        validation_helper._ext_cache = {}
-    if not hasattr(validation_helper, "_pdp_cache"):
-        # PDP-wide extension (active), cached: (exp, doc)
-        validation_helper._pdp_cache = (0.0, None)
-
-    AR_RE = validation_helper._ar_re
+    AR_RE = STATE._ar_re
     BASE_TTL = 300  # seconds
     EXT_TTL = 120  # seconds
 
@@ -1097,7 +1092,7 @@ def validation_helper(
 
     # --- fetch active base schema (cached)
     now = time.monotonic()
-    base_cache = validation_helper._base_cache
+    base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
         base_schema_id, base_schema = base_cache["val"]
     else:
@@ -1123,7 +1118,6 @@ def validation_helper(
         raise ValueError(f"Institution {inst_id} not found")
 
     bucket = get_external_bucket_name(inst_id)
-
     # --- choose / prepare extension schema (try to avoid heavy path)
     updated_inst_schema: Optional[dict] = None
 
@@ -1148,7 +1142,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
 
     if getattr(inst, "pdp_id", None):
         # PDP institutions: use active PDP extension (cached)
-        pdp_exp, pdp_doc = validation_helper._pdp_cache
+        pdp_exp, pdp_doc = STATE._pdp_cache
         if now < pdp_exp and pdp_doc is not None:
             inst_schema = pdp_doc
         else:
@@ -1160,11 +1154,11 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
                 )
                 .limit(1)
             ).scalar_one_or_none()
-            validation_helper._pdp_cache = (now + EXT_TTL, inst_schema)
+            STATE._pdp_cache = (now + EXT_TTL, inst_schema)
         updated_inst_schema = inst_schema
     else:
         # custom institutions: try cached extension first
-        ext_cache = validation_helper._ext_cache
+        ext_cache = STATE._ext_cache
         key = str(getattr(inst, "id", ""))
         cached = ext_cache.get(key)
         if cached and now < cached[0]:
@@ -1212,7 +1206,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
                     sess.flush()
                     logging.info("Schema record inserted for '%s'", inst_id)
                     # refresh cache
-                    validation_helper._ext_cache[key] = (
+                    STATE._ext_cache[key] = (
                         time.monotonic() + EXT_TTL,
                         schema_extension,
                     )

From e332c3740c71dc5d4e403e3b190e2c829e9d5fb7 Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Mon, 15 Sep 2025 10:27:41 -0400
Subject: [PATCH 83/92] debugging not being able to find h2o pipeline

---
 src/webapp/databricks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py
index 7c0cbc29..94a89576 100644
--- a/src/webapp/databricks.py
+++ b/src/webapp/databricks.py
@@ -213,7 +213,7 @@ def run_pdp_inference(
             job = next(w.jobs.list(name=pipeline_type), None)
             if not job or job.job_id is None:
                 raise ValueError(
-                    f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id."
+                    f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id for '{gcs_vars['GCP_SERVICE_ACCOUNT_EMAIL']}' and '{databricks_vars['DATABRICKS_HOST_URL']}'."
                 )
             job_id = job.job_id
             LOGGER.info(f"Resolved job ID for '{pipeline_type}': {job_id}")

From 12715738e03462bb77cf69f45680269f3c45578e Mon Sep 17 00:00:00 2001
From: Vishakh Pillai <vishpillai97@gmail.com>
Date: Mon, 15 Sep 2025 10:31:49 -0400
Subject: [PATCH 84/92] style

---
 src/webapp/routers/data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 3d8b914b..a0987681 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1045,7 +1045,6 @@ def validation_helper(
 ) -> Any:
     """Helper function for file validation (self-contained & optimized)."""
     import time
-    import re
     import os
 
     # --- access check & quick input validation

From 955809afc8daedebec2089b963e0e6d04fdf32ef Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:35:34 -0500
Subject: [PATCH 85/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 3d8b914b..469561cf 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1094,7 +1094,9 @@ def validation_helper(
     now = time.monotonic()
     base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
-        base_schema_id, base_schema = base_cache["val"]
+        base_schema_id, base_schema = base_cache[
+            "val"
+        ]  # pylint: disable=unpacking-non-sequence
     else:
         row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)

From 86df88f8ac8c46e8123335ef34c74680aadb653d Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:39:36 -0500
Subject: [PATCH 86/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index e01478cb..0e6d2f57 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1093,7 +1093,7 @@ def validation_helper(
     now = time.monotonic()
     base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
-        base_schema_id, base_schema = base_cache[
+        base_schema_id, base_schema = base_cache[ # pylint: disable=unpacking-non-sequence
             "val"
         ]  # pylint: disable=unpacking-non-sequence
     else:

From 5cdcb99c4a25f225cc31e2e0c2a2b8526453180d Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:41:41 -0500
Subject: [PATCH 87/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 0e6d2f57..755b088b 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1093,9 +1093,9 @@ def validation_helper(
     now = time.monotonic()
     base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
-        base_schema_id, base_schema = base_cache[ # pylint: disable=unpacking-non-sequence
+        base_schema_id, base_schema = base_cache[
             "val"
-        ]  # pylint: disable=unpacking-non-sequence
+        ]  # pylint: disable=unpacking-non-sequence  # pylint: disable=unpacking-non-sequence
     else:
         row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)

From 3a1819aab25de899c29523f0ced274403ca9fc96 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:42:59 -0500
Subject: [PATCH 88/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 755b088b..af82ccc2 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1093,9 +1093,7 @@ def validation_helper(
     now = time.monotonic()
     base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
-        base_schema_id, base_schema = base_cache[
-            "val"
-        ]  # pylint: disable=unpacking-non-sequence  # pylint: disable=unpacking-non-sequence
+        base_schema_id, base_schema = base_cache["val"]  # pylint: disable=unpacking-non-sequence  # pylint: disable=unpacking-non-sequence
     else:
         row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)

From f7cae1db5f9b0c5629e75848713d002c05a3da41 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:47:00 -0500
Subject: [PATCH 89/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index af82ccc2..a08f8d2e 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1093,7 +1093,7 @@ def validation_helper(
     now = time.monotonic()
     base_cache = STATE._base_cache
     if now < base_cache["exp"] and base_cache["val"] is not None:
-        base_schema_id, base_schema = base_cache["val"]  # pylint: disable=unpacking-non-sequence  # pylint: disable=unpacking-non-sequence
+        base_schema_id, base_schema = base_cache["val"]  # pylint: disable=unpacking-non-sequence # fmt: skip
     else:
         row = sess.execute(
             select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc)

From 0f94774a79684859781068d4f1b1d6e9000c9d62 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 09:59:10 -0500
Subject: [PATCH 90/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index a08f8d2e..b09d2d23 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1143,7 +1143,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
         # PDP institutions: use active PDP extension (cached)
         pdp_exp, pdp_doc = STATE._pdp_cache
         if now < pdp_exp and pdp_doc is not None:
-            inst_schema = pdp_doc
+            inst_schema: Optional[Dict[str, Any]] = pdp_doc
         else:
             inst_schema = sess.execute(
                 select(SchemaRegistryTable.json_doc)
@@ -1182,12 +1182,14 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]:
         else:
             # heavy path only when needed
             dbc = DatabricksControl()
-            schema_extension = dbc.create_custom_schema_extension(
-                bucket_name=bucket,
-                inst_query=inst,
-                file_name=file_name,
-                base_schema=base_schema,
-                extension_schema=inst_schema,
+            schema_extension: Optional[Dict[str, Any]] = (
+                dbc.create_custom_schema_extension(
+                    bucket_name=bucket,
+                    inst_query=inst,
+                    file_name=file_name,
+                    base_schema=base_schema,
+                    extension_schema=inst_schema,
+                )
             )
             if schema_extension is not None:
                 updated_inst_schema = schema_extension

From b9a669509ac028b4a11df722db41c73023d7f7e3 Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 10:23:36 -0500
Subject: [PATCH 91/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index b09d2d23..3aafabe8 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -998,33 +998,6 @@ def download_url_inst_file(
     )
 
 
-_AR_WORD = re.compile(r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE)
-
-
-def infer_models_from_filename(file_path: str) -> List[str]:
-    name = os.path.basename(file_path).lower()
-
-    inferred = set()
-    if "course" in name:
-        inferred.add("COURSE")
-    if "student" in name:
-        inferred.add("STUDENT")
-    if "semester" in name:
-        inferred.add("SEMESTER")
-    if "cohort" in name:
-        inferred.add("STUDENT")
-    if "course" not in name and (_AR_WORD.search(name) or "deidentified" in name):
-        inferred.add("STUDENT")
-
-    if not inferred:
-        raise ValueError(
-            f"Could not infer model(s) from file name: {name}. "
-            "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')."
-        )
-
-    return sorted(inferred)
-
-
 class _ValidationState:
     _ar_re = re.compile(r"(?<![A-Za-z0-9])ar(?![A-Za-z0-9])", re.IGNORECASE)
     _base_cache: Dict[str, Any] = {"exp": 0.0, "val": None}

From 4f915e0d3f323ec8416035ecf82a6be5ac65173b Mon Sep 17 00:00:00 2001
From: Mesh <meshach.ogunmodede@datakind.org>
Date: Mon, 15 Sep 2025 10:25:14 -0500
Subject: [PATCH 92/92] revamped entire validation helper script

---
 src/webapp/routers/data.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 3aafabe8..c8491455 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -1018,7 +1018,6 @@ def validation_helper(
 ) -> Any:
     """Helper function for file validation (self-contained & optimized)."""
     import time
-    import os
 
     # --- access check & quick input validation
     has_access_to_inst_or_err(inst_id, current_user)