From 34d412d212ce7eea1f1e318d24a2f0e756ae0f93 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Tue, 2 Sep 2025 17:36:37 -0400 Subject: [PATCH 01/92] testing h2o pipeline on synthetic 2 --- src/webapp/databricks.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 0f9612ec..40643c53 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -35,6 +35,7 @@ # The name of the deployed pipeline in Databricks. Must match directly. PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline" +PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline" class DatabricksInferenceRunRequest(BaseModel): @@ -192,16 +193,21 @@ def run_pdp_inference( db_inst_name = databricksify_inst_name(req.inst_name) + if db_inst_name in ["synthetic_2", "synthetic_uni_2"]: + db_job_name = PDP_H2O_INFERENCE_JOB_NAME + else: + db_job_name = PDP_INFERENCE_JOB_NAME + try: - job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None) + job = next(w.jobs.list(name=db_job_name), None) if not job or job.job_id is None: raise ValueError( - f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id." + f"run_pdp_inference(): Job '{db_job_name}' was not found or has no job_id." ) job_id = job.job_id - LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}") + LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}") except Exception as e: - LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.") + LOGGER.exception(f"Job lookup failed for '{db_job_name}' and '{db_inst_name}.") raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") try: From 45888ae47c15a84740417873264d030f2d3162f4 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Tue, 2 Sep 2025 17:42:08 -0400 Subject: [PATCH 02/92] style --- src/webapp/databricks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 40643c53..eaf7b679 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -207,7 +207,9 @@ def run_pdp_inference( job_id = job.job_id LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}") except Exception as e: - LOGGER.exception(f"Job lookup failed for '{db_job_name}' and '{db_inst_name}.") + LOGGER.exception( + f"Job lookup failed for '{db_job_name}' and '{db_inst_name}." + ) raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") try: From 4963fa391b4b53578bb6f45b934e84e9029d34ee Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Tue, 2 Sep 2025 20:02:36 -0400 Subject: [PATCH 03/92] bcrypt dep issue --- pyproject.toml | 6 +++--- uv.lock | 11 ++++++++--- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 81c867a0..b6e45d94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,8 +13,8 @@ dependencies = [ "cloud-sql-python-connector[pymysql]~=1.14.0", "sqlalchemy~=2.0.36", "pyjwt~=2.10.1", - "passlib~=1.7.4", - "bcrypt~=4.2.0", + "passlib[bcrypt]>=1.7.4,<1.8", + "bcrypt>=4.0.1,<5", "pycryptodome~=3.20.0", "python-dotenv~=1.0.1", "strenum~=0.4.15", @@ -28,7 +28,7 @@ dependencies = [ "thefuzz[speedup]~=0.22.1", "databricks-sql-connector~=3.5.0", "pandera~=0.13", - "mlflow~=2.15.0" + "mlflow~=2.15.0", ] [project.urls] diff --git a/uv.lock b/uv.lock index 4c589d69..e4134cc3 100644 --- a/uv.lock +++ b/uv.lock @@ -2464,6 +2464,11 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" }, ] +[package.optional-dependencies] +bcrypt = [ + { name = "bcrypt" }, +] + [[package]] name = "pathspec" version = "0.12.1" @@ -3718,7 +3723,7 @@ dependencies = [ { name = "pandas" }, { name = "pandera" }, { name = "paramiko" }, - { name = "passlib" }, + { name = "passlib", extra = ["bcrypt"] }, { name = "pycryptodome" }, { name = "pydantic" }, { name = "pyjwt" }, @@ -3747,7 +3752,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "bcrypt", specifier = "~=4.2.0" }, + { name = "bcrypt", specifier = ">=4.0.1,<5" }, { name = "cloud-sql-python-connector", extras = ["pymysql"], specifier = "~=1.14.0" }, { name = "databricks-sdk", specifier = "~=0.38.0" }, { name = "databricks-sql-connector", specifier = "~=3.5.0" }, @@ -3758,7 +3763,7 @@ requires-dist = [ { name = "pandas", specifier = "~=2.0" }, { name = "pandera", specifier = "~=0.13" }, { name = "paramiko", specifier = "~=3.5.0" }, - { name = "passlib", specifier = "~=1.7.4" }, + { name = "passlib", extras = ["bcrypt"], specifier = ">=1.7.4,<1.8" }, { name = "pycryptodome", specifier = "~=3.20.0" }, { name = "pydantic", specifier = "~=2.10" }, { name = "pyjwt", specifier = "~=2.10.1" }, From 767a3fdafda8e42017f7fad2f8d8e4c22537cab7 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Tue, 2 Sep 2025 20:33:34 -0400 Subject: [PATCH 04/92] reverting deps for now --- pyproject.toml | 4 ++-- uv.lock | 11 +++-------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b6e45d94..ca8f7b0b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,8 +13,8 @@ dependencies = [ "cloud-sql-python-connector[pymysql]~=1.14.0", "sqlalchemy~=2.0.36", "pyjwt~=2.10.1", - "passlib[bcrypt]>=1.7.4,<1.8", - "bcrypt>=4.0.1,<5", + "passlib~=1.7.4", + "bcrypt~=4.2.0", "pycryptodome~=3.20.0", "python-dotenv~=1.0.1", "strenum~=0.4.15", diff --git a/uv.lock b/uv.lock index e4134cc3..4c589d69 100644 --- a/uv.lock +++ b/uv.lock @@ -2464,11 +2464,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/a4/ab6b7589382ca3df236e03faa71deac88cae040af60c071a78d254a62172/passlib-1.7.4-py2.py3-none-any.whl", hash = "sha256:aa6bca462b8d8bda89c70b382f0c298a20b5560af6cbfa2dce410c0a2fb669f1", size = 525554, upload-time = "2020-10-08T19:00:49.856Z" }, ] -[package.optional-dependencies] -bcrypt = [ - { name = "bcrypt" }, -] - [[package]] name = "pathspec" version = "0.12.1" @@ -3723,7 +3718,7 @@ dependencies = [ { name = "pandas" }, { name = "pandera" }, { name = "paramiko" }, - { name = "passlib", extra = ["bcrypt"] }, + { name = "passlib" }, { name = "pycryptodome" }, { name = "pydantic" }, { name = "pyjwt" }, @@ -3752,7 +3747,7 @@ dev = [ [package.metadata] requires-dist = [ - { name = "bcrypt", specifier = ">=4.0.1,<5" }, + { name = "bcrypt", specifier = "~=4.2.0" }, { name = "cloud-sql-python-connector", extras = ["pymysql"], specifier = "~=1.14.0" }, { name = "databricks-sdk", specifier = "~=0.38.0" }, { name = "databricks-sql-connector", specifier = "~=3.5.0" }, @@ -3763,7 +3758,7 @@ requires-dist = [ { name = "pandas", specifier = "~=2.0" }, { name = "pandera", specifier = "~=0.13" }, { name = "paramiko", specifier = "~=3.5.0" }, - { name = "passlib", extras = ["bcrypt"], specifier = ">=1.7.4,<1.8" }, + { name = "passlib", specifier = "~=1.7.4" }, { name = "pycryptodome", specifier = "~=3.20.0" }, { name = "pydantic", specifier = "~=2.10" }, { name = "pyjwt", specifier = "~=2.10.1" }, From cabb5c361e087da8c2c1f3fc861a241ac7f97189 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Wed, 3 Sep 2025 12:26:30 -0400 Subject: [PATCH 05/92] reverting changes with pyproject --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ca8f7b0b..81c867a0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dependencies = [ "thefuzz[speedup]~=0.22.1", "databricks-sql-connector~=3.5.0", "pandera~=0.13", - "mlflow~=2.15.0", + "mlflow~=2.15.0" ] [project.urls] From 9fc7d85e748fc9ee55e93ea6557f7ab7e83fd493 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 11:47:09 -0500 Subject: [PATCH 06/92] changed FE inference and training endpoint args for better understanding --- src/webapp/routers/data.py | 56 +++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 36079908..c76db03e 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1276,14 +1276,14 @@ def get_upload_url( # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/top-features/{run_id}") +@router.get("/{inst_id}/inference/top-features/{job_run_id}") def get_inference_top_features( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns data for a specific institution.""" + """Returns top n features table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1308,7 +1308,7 @@ def get_inference_top_features( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_features_with_most_impact", + table_name=f"inference_{job_run_id}_features_with_most_impact", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1319,10 +1319,10 @@ def get_inference_top_features( # Get Box plot values -@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}") +@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}") def get_inference_feature_boxstats( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], feature_name: Optional[str] = Query( @@ -1355,7 +1355,7 @@ def get_inference_feature_boxstats( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_box_plot_table", + table_name=f"inference_{job_run_id}_box_plot_table", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) if not feature_name: @@ -1381,7 +1381,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: if not filtered: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.", + detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.", ) return filtered @@ -1392,14 +1392,14 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/support-overview/{run_id}") +@router.get("/{inst_id}/inference/support-overview/{job_run_id}") def get_inference_support_overview( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns support score distribution table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1424,7 +1424,7 @@ def get_inference_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_support_overview", + table_name=f"inference_{job_run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1434,14 +1434,14 @@ def get_inference_support_overview( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/inference/feature_importance/{run_id}") +@router.get("/{inst_id}/inference/feature_importance/{job_run_id}") def get_inference_feature_importance( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns feature importance table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1466,7 +1466,7 @@ def get_inference_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_shap_feature_importance", + table_name=f"inference_{job_run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1479,10 +1479,10 @@ def get_inference_feature_importance( ## FE Training Tables -@router.get("/{inst_id}/training/feature_importance/{run_id}") +@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}") def get_training_feature_importance( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1511,7 +1511,7 @@ def get_training_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_shap_feature_importance", + table_name=f"training_{experiment_run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1521,10 +1521,10 @@ def get_training_feature_importance( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/confusion_matrix/{run_id}") +@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}") def get_training_confusion_matrix( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1553,7 +1553,7 @@ def get_training_confusion_matrix( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_confusion_matrix", + table_name=f"training_{experiment_run_id}_confusion_matrix", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1563,10 +1563,10 @@ def get_training_confusion_matrix( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/roc_curve/{run_id}") +@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}") def get_training_roc_curve( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1595,7 +1595,7 @@ def get_training_roc_curve( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_roc_curve", + table_name=f"training_{experiment_run_id}_roc_curve", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1605,10 +1605,10 @@ def get_training_roc_curve( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/support-overview/{run_id}") +@router.get("/{inst_id}/training/support-overview/{experiment_run_id}") def get_training_support_overview( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1637,7 +1637,7 @@ def get_training_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_support_overview", + table_name=f"training_{experiment_run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) From 5b3fd3cdc34c4d26733f33c658864f3c7c011f18 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 11:48:52 -0500 Subject: [PATCH 07/92] changed FE inference and training endpoint args for better understanding --- src/webapp/routers/data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index c76db03e..7bd34f2b 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1486,7 +1486,7 @@ def get_training_feature_importance( current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns training feature importance table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1528,7 +1528,7 @@ def get_training_confusion_matrix( current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns training confusion matrix table for a specific instituion.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1570,7 +1570,7 @@ def get_training_roc_curve( current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns training roc curve table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) @@ -1612,7 +1612,7 @@ def get_training_support_overview( current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: - """Returns a signed URL for uploading data to a specific institution.""" + """Returns training support overview table for a specific institution.""" # raise error at this level instead bc otherwise it's getting wrapped as a 200 has_access_to_inst_or_err(inst_id, current_user) local_session.set(sql_session) From 99e6dc8efdc66ed519f803423e0ef828740c8e46 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:00:06 -0500 Subject: [PATCH 08/92] fixed course flags in filename inference --- src/webapp/routers/data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 7bd34f2b..02a52819 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -16,6 +16,7 @@ from ..config import databricks_vars, env_vars, gcs_vars import tempfile import pathlib +import re from ..utilities import ( has_access_to_inst_or_err, @@ -995,6 +996,7 @@ def download_url_inst_file( ) +_AR_WORD = re.compile(r'(? List[str]: name = os.path.basename(file_path).lower() @@ -1007,16 +1009,14 @@ def infer_models_from_filename(file_path: str, institution_id: str) -> List[str] inferred.add("SEMESTER") if "cohort" in name: inferred.add("STUDENT") - if "course" not in name and ("ar" in name or "deidentified" in name): + if "course" not in name and (_AR_WORD.search(name) or "deidentified" in name): inferred.add("STUDENT") if not inferred: - logging.error( - ValueError( - f"Could not infer model(s) from file name: {name}, filenames sould be descriptive of the kind of data it contains e.g. course, cohort" - ) + raise ValueError( + f"Could not infer model(s) from file name: {name}. " + "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')." ) - inferred.add("UNKNOWN") return sorted(inferred) From 73ef358e0f543a995325b553a92bf450f55c170a Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:03:21 -0500 Subject: [PATCH 09/92] fixed course flags in filename inference --- src/webapp/routers/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 02a52819..0471c44f 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -997,7 +997,7 @@ def download_url_inst_file( _AR_WORD = re.compile(r'(? List[str]: +def infer_models_from_filename(file_path: str) -> List[str]: name = os.path.basename(file_path).lower() inferred = set() @@ -1040,7 +1040,7 @@ def validation_helper( allowed_schemas = None if not allowed_schemas: - allowed_schemas = infer_models_from_filename(file_name, "pdp") + allowed_schemas = infer_models_from_filename(file_name) inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- From ba7c04faa3cb9841cf2f5df29f70330354ec3818 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:06:09 -0500 Subject: [PATCH 10/92] fixed course flags in filename inference --- src/webapp/routers/data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 0471c44f..52b85536 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -996,7 +996,9 @@ def download_url_inst_file( ) -_AR_WORD = re.compile(r'(? List[str]: name = os.path.basename(file_path).lower() From c37030f9f3fd553e7b210864db2c0d3d2e767088 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:35:54 -0500 Subject: [PATCH 11/92] fixed course flags in filename inference --- src/webapp/routers/data_test.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index d1cce3ee..9b1c1c31 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -586,11 +586,11 @@ def test_validate_success_batch(client: TestClient) -> None: response_upload = client.post( "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) - + "/input/validate-upload/file_name.csv", + + "/input/validate-upload/pdp_course_deidentified.csv", ) assert response_upload.status_code == 200 - assert response_upload.json()["name"] == "file_name.csv" - assert response_upload.json()["file_types"] == ["UNKNOWN"] + assert response_upload.json()["name"] == "pdp_course_deidentified.csv" + assert response_upload.json()["file_types"] == ["COURSE"] assert response_upload.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID) assert response_upload.json()["source"] == "MANUAL_UPLOAD" @@ -598,7 +598,7 @@ def test_validate_success_batch(client: TestClient) -> None: response_sftp = client.post( "/institutions/" + uuid_to_str(UUID_INVALID) - + "/input/validate-sftp/file_name.csv", + + "/input/validate-sftp/pdp_ar_deidentified.csv", ) assert str(response_sftp) == "" assert ( @@ -609,11 +609,11 @@ def test_validate_success_batch(client: TestClient) -> None: response_sftp = client.post( "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) - + "/input/validate-sftp/file_name.csv", + + "/input/validate-sftp/pdp_ar_deidentified.csv", ) assert response_sftp.status_code == 200 - assert response_sftp.json()["name"] == "file_name.csv" - assert response_sftp.json()["file_types"] == ["UNKNOWN"] + assert response_sftp.json()["name"] == "pdp_ar_deidentified.csv" + assert response_sftp.json()["file_types"] == ["STUDENT"] assert response_sftp.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID) assert response_sftp.json()["source"] == "PDP_SFTP" From 44a73e67d591792a8399be75bbcf6c24155354b9 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:45:38 -0500 Subject: [PATCH 12/92] fixed course flags in filename inference --- src/webapp/routers/data.py | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 52b85536..41e3a1ae 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1278,10 +1278,10 @@ def get_upload_url( # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/top-features/{job_run_id}") +@router.get("/{inst_id}/inference/top-features/{run_id}") def get_inference_top_features( inst_id: str, - job_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1310,7 +1310,7 @@ def get_inference_top_features( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{job_run_id}_features_with_most_impact", + table_name=f"inference_{run_id}_features_with_most_impact", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1321,10 +1321,10 @@ def get_inference_top_features( # Get Box plot values -@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}") +@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}") def get_inference_feature_boxstats( inst_id: str, - job_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], feature_name: Optional[str] = Query( @@ -1357,7 +1357,7 @@ def get_inference_feature_boxstats( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{job_run_id}_box_plot_table", + table_name=f"inference_{run_id}_box_plot_table", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) if not feature_name: @@ -1383,7 +1383,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: if not filtered: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.", + detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.", ) return filtered @@ -1394,10 +1394,10 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/support-overview/{job_run_id}") +@router.get("/{inst_id}/inference/support-overview/{run_id}") def get_inference_support_overview( inst_id: str, - job_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1426,7 +1426,7 @@ def get_inference_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{job_run_id}_support_overview", + table_name=f"inference_{run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1436,10 +1436,10 @@ def get_inference_support_overview( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/inference/feature_importance/{job_run_id}") +@router.get("/{inst_id}/inference/feature_importance/{run_id}") def get_inference_feature_importance( inst_id: str, - job_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1468,7 +1468,7 @@ def get_inference_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{job_run_id}_shap_feature_importance", + table_name=f"inference_{run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1481,10 +1481,10 @@ def get_inference_feature_importance( ## FE Training Tables -@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}") +@router.get("/{inst_id}/training/feature_importance/{run_id}") def get_training_feature_importance( inst_id: str, - experiment_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1513,7 +1513,7 @@ def get_training_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{experiment_run_id}_shap_feature_importance", + table_name=f"training_{run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1523,10 +1523,10 @@ def get_training_feature_importance( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}") +@router.get("/{inst_id}/training/confusion_matrix/{run_id}") def get_training_confusion_matrix( inst_id: str, - experiment_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1555,7 +1555,7 @@ def get_training_confusion_matrix( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{experiment_run_id}_confusion_matrix", + table_name=f"training_{run_id}_confusion_matrix", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1565,10 +1565,10 @@ def get_training_confusion_matrix( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}") +@router.get("/{inst_id}/training/roc_curve/{run_id}") def get_training_roc_curve( inst_id: str, - experiment_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1597,7 +1597,7 @@ def get_training_roc_curve( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{experiment_run_id}_roc_curve", + table_name=f"training_{run_id}_roc_curve", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1607,10 +1607,10 @@ def get_training_roc_curve( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/support-overview/{experiment_run_id}") +@router.get("/{inst_id}/training/support-overview/{run_id}") def get_training_support_overview( inst_id: str, - experiment_run_id: str, + run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1639,7 +1639,7 @@ def get_training_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{experiment_run_id}_support_overview", + table_name=f"training_{run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) From 51118f1885cd2d65795feac8f3d3259ce67909df Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 14:51:50 -0500 Subject: [PATCH 13/92] changed FE inference and training endpoint args for better understanding --- src/webapp/routers/data.py | 50 +++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 41e3a1ae..52b85536 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1278,10 +1278,10 @@ def get_upload_url( # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/top-features/{run_id}") +@router.get("/{inst_id}/inference/top-features/{job_run_id}") def get_inference_top_features( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1310,7 +1310,7 @@ def get_inference_top_features( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_features_with_most_impact", + table_name=f"inference_{job_run_id}_features_with_most_impact", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1321,10 +1321,10 @@ def get_inference_top_features( # Get Box plot values -@router.get("/{inst_id}/inference/features-boxplot-stat/{run_id}") +@router.get("/{inst_id}/inference/features-boxplot-stat/{job_run_id}") def get_inference_feature_boxstats( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], feature_name: Optional[str] = Query( @@ -1357,7 +1357,7 @@ def get_inference_feature_boxstats( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_box_plot_table", + table_name=f"inference_{job_run_id}_box_plot_table", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) if not feature_name: @@ -1383,7 +1383,7 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: if not filtered: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, - detail=f"Feature '{feature_name}' not found for run_id '{run_id}'.", + detail=f"Feature '{feature_name}' not found for run_id '{job_run_id}'.", ) return filtered @@ -1394,10 +1394,10 @@ def row_feature_name(row: dict[str, Any]) -> Optional[str]: # Get SHAP Values for Inference -@router.get("/{inst_id}/inference/support-overview/{run_id}") +@router.get("/{inst_id}/inference/support-overview/{job_run_id}") def get_inference_support_overview( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1426,7 +1426,7 @@ def get_inference_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_support_overview", + table_name=f"inference_{job_run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1436,10 +1436,10 @@ def get_inference_support_overview( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/inference/feature_importance/{run_id}") +@router.get("/{inst_id}/inference/feature_importance/{job_run_id}") def get_inference_feature_importance( inst_id: str, - run_id: str, + job_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1468,7 +1468,7 @@ def get_inference_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"inference_{run_id}_shap_feature_importance", + table_name=f"inference_{job_run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1481,10 +1481,10 @@ def get_inference_feature_importance( ## FE Training Tables -@router.get("/{inst_id}/training/feature_importance/{run_id}") +@router.get("/{inst_id}/training/feature_importance/{experiment_run_id}") def get_training_feature_importance( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1513,7 +1513,7 @@ def get_training_feature_importance( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_shap_feature_importance", + table_name=f"training_{experiment_run_id}_shap_feature_importance", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1523,10 +1523,10 @@ def get_training_feature_importance( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/confusion_matrix/{run_id}") +@router.get("/{inst_id}/training/confusion_matrix/{experiment_run_id}") def get_training_confusion_matrix( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1555,7 +1555,7 @@ def get_training_confusion_matrix( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_confusion_matrix", + table_name=f"training_{experiment_run_id}_confusion_matrix", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1565,10 +1565,10 @@ def get_training_confusion_matrix( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/roc_curve/{run_id}") +@router.get("/{inst_id}/training/roc_curve/{experiment_run_id}") def get_training_roc_curve( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1597,7 +1597,7 @@ def get_training_roc_curve( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_roc_curve", + table_name=f"training_{experiment_run_id}_roc_curve", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) @@ -1607,10 +1607,10 @@ def get_training_roc_curve( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/support-overview/{run_id}") +@router.get("/{inst_id}/training/support-overview/{experiment_run_id}") def get_training_support_overview( inst_id: str, - run_id: str, + experiment_run_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], sql_session: Annotated[Session, Depends(get_session)], ) -> List[dict[str, Any]]: @@ -1639,7 +1639,7 @@ def get_training_support_overview( rows = dbc.fetch_table_data( catalog_name=env_vars["CATALOG_NAME"], # type: ignore inst_name=f"{query_result[0][0].name}", - table_name=f"training_{run_id}_support_overview", + table_name=f"training_{experiment_run_id}_support_overview", warehouse_id=env_vars["SQL_WAREHOUSE_ID"], # type: ignore ) From 5e824f183b878b08f75a49f45a73ca64e95939da Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 17:17:42 -0500 Subject: [PATCH 14/92] patching validation.py --- src/webapp/gcsutil.py | 4 +++- src/webapp/routers/data.py | 22 +++++++++++++++++++--- src/webapp/validation.py | 8 +++++--- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index b6046daa..44bef984 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -340,8 +340,10 @@ def validate_file( f"If you see this file validation was successful {schems}" ) except Exception as e: + logging.exception("Validation failed for %s: %s", file_name, e) blob.delete() - raise e + raise + new_blob = bucket.blob(new_blob_name) if new_blob.exists(): raise ValueError(new_blob_name + ": File already exists.") diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 52b85536..7560bb9d 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -17,6 +17,7 @@ import tempfile import pathlib import re +from validation import HardValidationError from ..utilities import ( has_access_to_inst_or_err, @@ -1155,13 +1156,28 @@ def validation_helper( logging.debug( f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}" ) + except HardValidationError as e: + logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e) + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail={ + "code": "VALIDATION_FAILED", + "message": "Schema validation failed.", + "missing_required": e.missing_required, + "extra_columns": e.extra_columns, + "schema_errors": e.schema_errors, + "failure_cases": e.failure_cases, + }, + ) except Exception as e: logging.debug(f"!!!!!!!!!!Inferred Schemas FAILED {e}") raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail="File type is not valid and/or not accepted by this institution: " - + str(e), - ) from e + detail={ + "code": "VALIDATION_ERROR", + "message": str(e), + }, + ) existing_file = ( local_session.get() diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 3f359aaf..dc9f3c82 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -220,14 +220,15 @@ def validate_dataset( ] # Hard-fail on missing required or any extra columns - if missing_required or extra_columns: + if missing_required: if logging: logging.error( f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}" ) raise HardValidationError( - missing_required=missing_required, extra_columns=extra_columns + missing_required=missing_required ) + unknown_extra = extra_columns # 5) build Pandera schema & validate (hard-fail on any error) schema = build_schema(merged_specs) @@ -273,8 +274,9 @@ def validate_dataset( # 6) success (with possible soft misses) return { "validation_status": ( - "passed_with_soft_errors" if missing_optional else "passed" + "passed_with_soft_errors" if (missing_optional or unknown_extra) else "passed" ), "schemas": model_list, "missing_optional": missing_optional, + "unknown_extra_columns": unknown_extra, } From 92ca1eb80140846496757998203f10ffba446638 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 17:21:08 -0500 Subject: [PATCH 15/92] fix import ish --- src/webapp/routers/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 7560bb9d..74d1dabd 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -17,7 +17,7 @@ import tempfile import pathlib import re -from validation import HardValidationError +from ..validation import HardValidationError from ..utilities import ( has_access_to_inst_or_err, From 6f63a0ee25eddeafb967563dc8dde641aa5aa5c6 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 17:22:52 -0500 Subject: [PATCH 16/92] fix import ish --- src/webapp/gcsutil.py | 2 +- src/webapp/validation.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index 44bef984..5e955ea5 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -343,7 +343,7 @@ def validate_file( logging.exception("Validation failed for %s: %s", file_name, e) blob.delete() raise - + new_blob = bucket.blob(new_blob_name) if new_blob.exists(): raise ValueError(new_blob_name + ": File already exists.") diff --git a/src/webapp/validation.py b/src/webapp/validation.py index dc9f3c82..452ae678 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -225,9 +225,7 @@ def validate_dataset( logging.error( f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}" ) - raise HardValidationError( - missing_required=missing_required - ) + raise HardValidationError(missing_required=missing_required) unknown_extra = extra_columns # 5) build Pandera schema & validate (hard-fail on any error) @@ -274,7 +272,9 @@ def validate_dataset( # 6) success (with possible soft misses) return { "validation_status": ( - "passed_with_soft_errors" if (missing_optional or unknown_extra) else "passed" + "passed_with_soft_errors" + if (missing_optional or unknown_extra) + else "passed" ), "schemas": model_list, "missing_optional": missing_optional, From 410b8ea0c93c96627802069328a30c72f68c0e85 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 17:46:13 -0500 Subject: [PATCH 17/92] fixed table read --- src/webapp/routers/data.py | 5 ++--- src/webapp/validation.py | 11 ++++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 74d1dabd..399a375b 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1153,9 +1153,8 @@ def validation_helper( base_schema, updated_inst_schema, ) - logging.debug( - f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}" - ) + logging.debug("!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas)) + except HardValidationError as e: logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e) raise HTTPException( diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 452ae678..be111b08 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -162,7 +162,16 @@ def validate_dataset( models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: - df = pd.read_csv(filename) + read_errs = [] + for enc in ("utf-8", "utf-8-sig", "latin1"): + try: + df = pd.read_csv(filename, encoding=enc) + break + except UnicodeDecodeError as ex: + read_errs.append(f"{enc}: {ex}") + else: + raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs) + df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) From 51cd2525a42459b9c3f2a654d471deb0cba17311 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 17:49:22 -0500 Subject: [PATCH 18/92] fixed table read --- src/webapp/routers/data.py | 4 +++- src/webapp/validation.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 399a375b..8d481e59 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1153,7 +1153,9 @@ def validation_helper( base_schema, updated_inst_schema, ) - logging.debug("!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas)) + logging.debug( + "!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas) + ) except HardValidationError as e: logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index be111b08..b04dad58 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -171,7 +171,7 @@ def validate_dataset( read_errs.append(f"{enc}: {ex}") else: raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs) - + df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) From 4d1de4c5472317419c9a55c6b7cc39e99b8b4115 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 3 Sep 2025 18:04:00 -0500 Subject: [PATCH 19/92] fixed table read --- src/webapp/routers/data.py | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 8d481e59..69777abf 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1159,25 +1159,35 @@ def validation_helper( except HardValidationError as e: logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e) + # Build a single string - frontend can render this reliably + msg_parts = ["VALIDATION_FAILED"] + if e.missing_required: + msg_parts.append(f"missing_required={e.missing_required}") + if e.extra_columns: + msg_parts.append(f"extra_columns={e.extra_columns}") + if e.schema_errors is not None: + msg_parts.append(f"schema_errors={e.schema_errors}") + if e.failure_cases is not None: + # keep short; avoid dumping huge tables + try: + sample = ( + e.failure_cases[:5] + if isinstance(e.failure_cases, list) + else str(e.failure_cases)[:500] + ) + except Exception: + sample = "see server logs" + msg_parts.append(f"failure_cases_sample={sample}") raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail={ - "code": "VALIDATION_FAILED", - "message": "Schema validation failed.", - "missing_required": e.missing_required, - "extra_columns": e.extra_columns, - "schema_errors": e.schema_errors, - "failure_cases": e.failure_cases, - }, + detail="; ".join(msg_parts), ) + except Exception as e: - logging.debug(f"!!!!!!!!!!Inferred Schemas FAILED {e}") + logging.debug("!!!!!!!!!!Inferred Schemas FAILED (other) %s", e) raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, - detail={ - "code": "VALIDATION_ERROR", - "message": str(e), - }, + detail=f"VALIDATION_ERROR: {type(e).__name__}: {e}", ) existing_file = ( From f786e05fcc071ab34aa8438bb80353abd28e226d Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 4 Sep 2025 11:23:37 -0500 Subject: [PATCH 20/92] fixed table read --- src/webapp/gcsutil.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index 5e955ea5..b267d9eb 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -341,7 +341,6 @@ def validate_file( ) except Exception as e: logging.exception("Validation failed for %s: %s", file_name, e) - blob.delete() raise new_blob = bucket.blob(new_blob_name) From 182ef289afac8ce7f5441f40ebb5c984307d6ad9 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 4 Sep 2025 15:10:03 -0400 Subject: [PATCH 21/92] trying to test why pipeline isn't being found --- src/webapp/databricks.py | 72 +++++++++++++++++++++++++++++++++++----- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index eaf7b679..7060f085 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -198,20 +198,76 @@ def run_pdp_inference( else: db_job_name = PDP_INFERENCE_JOB_NAME + # --- Resolve the Databricks Job by name, with diagnostics --- try: - job = next(w.jobs.list(name=db_job_name), None) - if not job or job.job_id is None: + # Helpful diagnostics about where we are and who we are + try: + me = w.current_user.me() + LOGGER.info("Databricks caller: user_name=%s, user_id=%s", getattr(me, "user_name", None), getattr(me, "id", None)) + except Exception: + LOGGER.info("Could not resolve current user; continuing.") + + host_url = databricks_vars["DATABRICKS_HOST_URL"] + LOGGER.info("Databricks host: %s", host_url) + + # Gather visible jobs and log a small sample for troubleshooting + visible_jobs = list(w.jobs.list()) # materialize generator + LOGGER.info("Visible jobs count: %d", len(visible_jobs)) + + log_preview = [] + for j in visible_jobs[:25]: + # In SDK, name commonly lives under settings.name + jname = getattr(getattr(j, "settings", None), "name", None) + jid = getattr(j, "job_id", None) + log_preview.append(f"{jid}:{jname}") + LOGGER.info("First up-to-25 visible jobs (id:name): %s", "; ".join(log_preview) if log_preview else "(none)") + + # Try to find by name (exact, then case-insensitive, then prefix/close match) + def job_name(j) -> str: + return (getattr(getattr(j, "settings", None), "name", None) or "").strip() + + target = db_job_name.strip() + candidates = [j for j in visible_jobs if job_name(j) == target] + + if not candidates: + # Case-insensitive exact + candidates = [j for j in visible_jobs if job_name(j).lower() == target.lower()] + + if not candidates: + # Prefix or contains + lowered = target.lower() + candidates = [j for j in visible_jobs if job_name(j).lower().startswith(lowered)] + if not candidates: + candidates = [j for j in visible_jobs if lowered in job_name(j).lower()] + + # If multiple, prefer exact case-insensitive match first; else first candidate + job_obj = candidates[0] if candidates else None + + # If still not found, compute close matches to guide debugging + if not job_obj: + import difflib + names = [job_name(j) for j in visible_jobs] + close = difflib.get_close_matches(target, names, n=5, cutoff=0.6) raise ValueError( - f"run_pdp_inference(): Job '{db_job_name}' was not found or has no job_id." + f"run_pdp_inference(): Job named '{db_job_name}' not found in workspace {host_url}. " + f"Service principal may lack permissions, or the job name differs. " + f"Close matches: {close}" ) - job_id = job.job_id - LOGGER.info(f"Resolved job ID for '{db_job_name}': {job_id}") + + job_id = getattr(job_obj, "job_id", None) + if not job_id: + raise ValueError( + f"run_pdp_inference(): Found job '{job_name(job_obj)}' but it has no job_id. " + "Check job visibility/permissions and that the SDK is returning full job metadata." + ) + + LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj)) + except Exception as e: - LOGGER.exception( - f"Job lookup failed for '{db_job_name}' and '{db_inst_name}." - ) + LOGGER.exception("Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name) raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") + try: run_job: Any = w.jobs.run_now( job_id, From fe99c8dd3659b39a3510306a6bd42afba2338cb9 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 4 Sep 2025 15:13:19 -0400 Subject: [PATCH 22/92] black --- src/webapp/databricks.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 7060f085..bece8120 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -203,7 +203,11 @@ def run_pdp_inference( # Helpful diagnostics about where we are and who we are try: me = w.current_user.me() - LOGGER.info("Databricks caller: user_name=%s, user_id=%s", getattr(me, "user_name", None), getattr(me, "id", None)) + LOGGER.info( + "Databricks caller: user_name=%s, user_id=%s", + getattr(me, "user_name", None), + getattr(me, "id", None), + ) except Exception: LOGGER.info("Could not resolve current user; continuing.") @@ -220,25 +224,36 @@ def run_pdp_inference( jname = getattr(getattr(j, "settings", None), "name", None) jid = getattr(j, "job_id", None) log_preview.append(f"{jid}:{jname}") - LOGGER.info("First up-to-25 visible jobs (id:name): %s", "; ".join(log_preview) if log_preview else "(none)") + LOGGER.info( + "First up-to-25 visible jobs (id:name): %s", + "; ".join(log_preview) if log_preview else "(none)", + ) # Try to find by name (exact, then case-insensitive, then prefix/close match) def job_name(j) -> str: - return (getattr(getattr(j, "settings", None), "name", None) or "").strip() + return ( + getattr(getattr(j, "settings", None), "name", None) or "" + ).strip() target = db_job_name.strip() candidates = [j for j in visible_jobs if job_name(j) == target] if not candidates: # Case-insensitive exact - candidates = [j for j in visible_jobs if job_name(j).lower() == target.lower()] + candidates = [ + j for j in visible_jobs if job_name(j).lower() == target.lower() + ] if not candidates: # Prefix or contains lowered = target.lower() - candidates = [j for j in visible_jobs if job_name(j).lower().startswith(lowered)] + candidates = [ + j for j in visible_jobs if job_name(j).lower().startswith(lowered) + ] if not candidates: - candidates = [j for j in visible_jobs if lowered in job_name(j).lower()] + candidates = [ + j for j in visible_jobs if lowered in job_name(j).lower() + ] # If multiple, prefer exact case-insensitive match first; else first candidate job_obj = candidates[0] if candidates else None @@ -246,6 +261,7 @@ def job_name(j) -> str: # If still not found, compute close matches to guide debugging if not job_obj: import difflib + names = [job_name(j) for j in visible_jobs] close = difflib.get_close_matches(target, names, n=5, cutoff=0.6) raise ValueError( @@ -264,10 +280,11 @@ def job_name(j) -> str: LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj)) except Exception as e: - LOGGER.exception("Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name) + LOGGER.exception( + "Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name + ) raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") - try: run_job: Any = w.jobs.run_now( job_id, From 23e5fb0ecf8f44923c28083a314519b7a140c602 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 4 Sep 2025 15:21:40 -0400 Subject: [PATCH 23/92] type check --- src/webapp/databricks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index bece8120..cd89cf2c 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -230,7 +230,7 @@ def run_pdp_inference( ) # Try to find by name (exact, then case-insensitive, then prefix/close match) - def job_name(j) -> str: + def job_name(j: Any) -> str: return ( getattr(getattr(j, "settings", None), "name", None) or "" ).strip() From 7d2dc9130a234b100ba2e3d37a12f220b2d10db6 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 01:42:53 -0500 Subject: [PATCH 24/92] added framework column to cloud sql with default sklearn --- src/webapp/database.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/webapp/database.py b/src/webapp/database.py index 7fe974b0..da1fc9a5 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -511,6 +511,9 @@ class ModelTable(Base): ) # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. version: Mapped[int] = mapped_column(Integer, default=0) + framework: Mapped[str | None] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn' + ) # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) From 705dbaf1537b670eea4b6dfb684ec2fb797338ac Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 01:47:53 -0500 Subject: [PATCH 25/92] defined acceptance criteria from FE --- src/webapp/routers/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index cb7949f6..88ee733c 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -215,6 +215,7 @@ def create_model( created_by=str_to_uuid(current_user.user_id), valid=req.valid, schema_configs=jsonpickle.encode(req.schema_configs), + framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn" ) local_session.get().add(model) local_session.get().commit() From e510cacab4af06a2ba33e67cdb92b116ca24646b Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 07:34:57 -0500 Subject: [PATCH 26/92] reverted databricks to original file --- src/webapp/databricks.py | 95 +++--------------------------------- src/webapp/routers/models.py | 3 +- 2 files changed, 9 insertions(+), 89 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index cd89cf2c..592bc852 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -35,7 +35,6 @@ # The name of the deployed pipeline in Databricks. Must match directly. PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline" -PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline" class DatabricksInferenceRunRequest(BaseModel): @@ -193,96 +192,16 @@ def run_pdp_inference( db_inst_name = databricksify_inst_name(req.inst_name) - if db_inst_name in ["synthetic_2", "synthetic_uni_2"]: - db_job_name = PDP_H2O_INFERENCE_JOB_NAME - else: - db_job_name = PDP_INFERENCE_JOB_NAME - - # --- Resolve the Databricks Job by name, with diagnostics --- try: - # Helpful diagnostics about where we are and who we are - try: - me = w.current_user.me() - LOGGER.info( - "Databricks caller: user_name=%s, user_id=%s", - getattr(me, "user_name", None), - getattr(me, "id", None), - ) - except Exception: - LOGGER.info("Could not resolve current user; continuing.") - - host_url = databricks_vars["DATABRICKS_HOST_URL"] - LOGGER.info("Databricks host: %s", host_url) - - # Gather visible jobs and log a small sample for troubleshooting - visible_jobs = list(w.jobs.list()) # materialize generator - LOGGER.info("Visible jobs count: %d", len(visible_jobs)) - - log_preview = [] - for j in visible_jobs[:25]: - # In SDK, name commonly lives under settings.name - jname = getattr(getattr(j, "settings", None), "name", None) - jid = getattr(j, "job_id", None) - log_preview.append(f"{jid}:{jname}") - LOGGER.info( - "First up-to-25 visible jobs (id:name): %s", - "; ".join(log_preview) if log_preview else "(none)", - ) - - # Try to find by name (exact, then case-insensitive, then prefix/close match) - def job_name(j: Any) -> str: - return ( - getattr(getattr(j, "settings", None), "name", None) or "" - ).strip() - - target = db_job_name.strip() - candidates = [j for j in visible_jobs if job_name(j) == target] - - if not candidates: - # Case-insensitive exact - candidates = [ - j for j in visible_jobs if job_name(j).lower() == target.lower() - ] - - if not candidates: - # Prefix or contains - lowered = target.lower() - candidates = [ - j for j in visible_jobs if job_name(j).lower().startswith(lowered) - ] - if not candidates: - candidates = [ - j for j in visible_jobs if lowered in job_name(j).lower() - ] - - # If multiple, prefer exact case-insensitive match first; else first candidate - job_obj = candidates[0] if candidates else None - - # If still not found, compute close matches to guide debugging - if not job_obj: - import difflib - - names = [job_name(j) for j in visible_jobs] - close = difflib.get_close_matches(target, names, n=5, cutoff=0.6) + job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None) + if not job or job.job_id is None: raise ValueError( - f"run_pdp_inference(): Job named '{db_job_name}' not found in workspace {host_url}. " - f"Service principal may lack permissions, or the job name differs. " - f"Close matches: {close}" + f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id." ) - - job_id = getattr(job_obj, "job_id", None) - if not job_id: - raise ValueError( - f"run_pdp_inference(): Found job '{job_name(job_obj)}' but it has no job_id. " - "Check job visibility/permissions and that the SDK is returning full job metadata." - ) - - LOGGER.info("Resolved job: id=%s, name=%s", job_id, job_name(job_obj)) - + job_id = job.job_id + LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}") except Exception as e: - LOGGER.exception( - "Job lookup failed for '%s' in '%s'.", db_job_name, db_inst_name - ) + LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.") raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") try: @@ -633,4 +552,4 @@ def create_custom_schema_extension( existing_extension=extension_schema, # may be None ) - return updated_extension + return updated_extension \ No newline at end of file diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 88ee733c..3ff8df4a 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -215,7 +215,7 @@ def create_model( created_by=str_to_uuid(current_user.user_id), valid=req.valid, schema_configs=jsonpickle.encode(req.schema_configs), - framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn" + framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn", ) local_session.get().add(model) local_session.get().commit() @@ -253,6 +253,7 @@ def create_model( "created_by": uuid_to_str(query_result[0][0].created_by), "deleted": query_result[0][0].deleted, "valid": query_result[0][0].valid, + "framework": query_result[0][0].framework, } From a2275925f94f92c5928e5244f07f73214c4cdae8 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 07:49:17 -0500 Subject: [PATCH 27/92] feat: added databricks framework layer --- src/webapp/databricks.py | 4 ++-- src/webapp/routers/models.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 592bc852..9a39634f 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -44,10 +44,10 @@ class DatabricksInferenceRunRequest(BaseModel): # Note that the following should be the filepath. filepath_to_type: dict[str, list[SchemaType]] model_name: str - model_type: str = "sklearn" # The email where notifications will get sent. email: str gcp_external_bucket_name: str + framework: str class DatabricksInferenceRunResponse(BaseModel): @@ -220,8 +220,8 @@ def run_pdp_inference( ], # is this value the same PER environ? dev/staging/prod "gcp_bucket_name": req.gcp_external_bucket_name, "model_name": req.model_name, - "model_type": req.model_type, "notification_email": req.email, + "framework": req.framework, }, ) LOGGER.info( diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 3ff8df4a..308b9108 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -301,6 +301,7 @@ def read_inst_model( "created_by": uuid_to_str(query_result[0][0].created_by), "deleted": query_result[0][0].deleted, "valid": query_result[0][0].valid, + "framework": query_result[0][0].framework, } @@ -549,6 +550,7 @@ def trigger_inference_run( gcp_external_bucket_name=get_external_bucket_name(inst_id), # The institution email to which pipeline success/failure notifications will get sent. email=current_user.email, + framework=query_result[0][0].framework, ) try: res = databricks_control.run_pdp_inference(db_req) From 27a5eee78ea408b742cd1a9ebdbdb75a1224bb97 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 09:54:58 -0500 Subject: [PATCH 28/92] added framework param to job --- src/webapp/database.py | 4 +++- src/webapp/routers/models.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index da1fc9a5..2862e6c6 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -551,7 +551,9 @@ class JobTable(Base): String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) completed: Mapped[bool] = mapped_column(nullable=True) - + framework: Mapped[str | None] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn' + ) class DocType(enum.Enum): base = "base" diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 308b9108..40f28135 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -569,6 +569,7 @@ def trigger_inference_run( batch_name=req.batch_name, model_id=query_result[0][0].id, output_valid=False, + framework=query_result[0][0].framework, ) local_session.get().add(job) return { @@ -579,4 +580,5 @@ def trigger_inference_run( "triggered_at": triggered_timestamp, "batch_name": req.batch_name, "output_valid": False, + "framework": query_result[0][0].framework, } From 166b32926178008dd124641a760573ab29e5db37 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 09:59:56 -0500 Subject: [PATCH 29/92] added case block to job run --- src/webapp/databricks.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 9a39634f..f52fd3a2 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -35,7 +35,7 @@ # The name of the deployed pipeline in Databricks. Must match directly. PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline" - +PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline" class DatabricksInferenceRunRequest(BaseModel): """Databricks parameters for an inference run.""" @@ -192,8 +192,16 @@ def run_pdp_inference( db_inst_name = databricksify_inst_name(req.inst_name) + if req.framework == "sklearn": + pipeline_type = PDP_INFERENCE_JOB_NAME + elif req.framework == "h20": + pipeline_type = PDP_H2O_INFERENCE_JOB_NAME + else: + raise ValueError( + f"Invalid model framework assigned to institution model" + ) try: - job = next(w.jobs.list(name=PDP_INFERENCE_JOB_NAME), None) + job = next(w.jobs.list(name=pipeline_type), None) if not job or job.job_id is None: raise ValueError( f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id." From f45b00cbd83c1a152db90783a3b5b06b7224be35 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 10:08:47 -0500 Subject: [PATCH 30/92] added case block to job run --- src/webapp/databricks.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index f52fd3a2..b2d94361 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -44,10 +44,10 @@ class DatabricksInferenceRunRequest(BaseModel): # Note that the following should be the filepath. filepath_to_type: dict[str, list[SchemaType]] model_name: str + model_type: str # The email where notifications will get sent. email: str gcp_external_bucket_name: str - framework: str class DatabricksInferenceRunResponse(BaseModel): @@ -204,12 +204,12 @@ def run_pdp_inference( job = next(w.jobs.list(name=pipeline_type), None) if not job or job.job_id is None: raise ValueError( - f"run_pdp_inference(): Job '{PDP_INFERENCE_JOB_NAME}' was not found or has no job_id." + f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id." ) job_id = job.job_id - LOGGER.info(f"Resolved job ID for '{PDP_INFERENCE_JOB_NAME}': {job_id}") + LOGGER.info(f"Resolved job ID for '{pipeline_type}': {job_id}") except Exception as e: - LOGGER.exception(f"Job lookup failed for '{PDP_INFERENCE_JOB_NAME}'.") + LOGGER.exception(f"Job lookup failed for '{pipeline_type}'.") raise ValueError(f"run_pdp_inference(): Failed to find job: {e}") try: @@ -228,8 +228,8 @@ def run_pdp_inference( ], # is this value the same PER environ? dev/staging/prod "gcp_bucket_name": req.gcp_external_bucket_name, "model_name": req.model_name, + "model_type": req.framework, "notification_email": req.email, - "framework": req.framework, }, ) LOGGER.info( From 4d4bf6843699943cb57c3d6ae1593c6ea6ddd08c Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 10:13:25 -0500 Subject: [PATCH 31/92] added case block to job run --- src/webapp/databricks.py | 6 +++--- src/webapp/routers/models.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index b2d94361..252ffdb8 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -192,9 +192,9 @@ def run_pdp_inference( db_inst_name = databricksify_inst_name(req.inst_name) - if req.framework == "sklearn": + if req.model_type == "sklearn": pipeline_type = PDP_INFERENCE_JOB_NAME - elif req.framework == "h20": + elif req.model_type == "h20": pipeline_type = PDP_H2O_INFERENCE_JOB_NAME else: raise ValueError( @@ -228,7 +228,7 @@ def run_pdp_inference( ], # is this value the same PER environ? dev/staging/prod "gcp_bucket_name": req.gcp_external_bucket_name, "model_name": req.model_name, - "model_type": req.framework, + "model_type": req.model_type, "notification_email": req.email, }, ) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 40f28135..9b464f67 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -569,7 +569,7 @@ def trigger_inference_run( batch_name=req.batch_name, model_id=query_result[0][0].id, output_valid=False, - framework=query_result[0][0].framework, + model_type=query_result[0][0].framework, ) local_session.get().add(job) return { From ae912c21b9da127dd42a4ab8e4410ab1fc8644e1 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 10:17:43 -0500 Subject: [PATCH 32/92] fix linting and test --- src/webapp/databricks.py | 2 +- src/webapp/routers/models_test.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 252ffdb8..6cfb0bda 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -198,7 +198,7 @@ def run_pdp_inference( pipeline_type = PDP_H2O_INFERENCE_JOB_NAME else: raise ValueError( - f"Invalid model framework assigned to institution model" + "Invalid model framework assigned to institution model" ) try: job = next(w.jobs.list(name=pipeline_type), None) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 8643f98b..8d39a925 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -152,6 +152,7 @@ def session_fixture(): ] ), valid=True, + framework="sklearn", ) run_1 = JobTable( id=RUN_ID, @@ -161,6 +162,7 @@ def session_fixture(): completed=True, output_filename="file_output_one", created_by=created_by_UUID, + framework="sklearn", ) try: with sqlalchemy.orm.Session(engine) as session: From 18708f2c5b7771016c53ed858200205336795e53 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 10:20:32 -0500 Subject: [PATCH 33/92] fix linting and test --- src/webapp/databricks.py | 2 +- src/webapp/routers/models.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 6cfb0bda..2b553953 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -194,7 +194,7 @@ def run_pdp_inference( if req.model_type == "sklearn": pipeline_type = PDP_INFERENCE_JOB_NAME - elif req.model_type == "h20": + elif req.model_type == "h2o": pipeline_type = PDP_H2O_INFERENCE_JOB_NAME else: raise ValueError( diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 9b464f67..fd863e4c 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -215,7 +215,7 @@ def create_model( created_by=str_to_uuid(current_user.user_id), valid=req.valid, schema_configs=jsonpickle.encode(req.schema_configs), - framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h20"} else "sklearn", + framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h2o"} else "sklearn", ) local_session.get().add(model) local_session.get().commit() From cafcb107964394e035ce4aefefb4b14e496745c5 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 10:21:24 -0500 Subject: [PATCH 34/92] fix linting and test --- src/webapp/routers/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index fd863e4c..1d7c7422 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -96,6 +96,7 @@ class ModelCreationRequest(BaseModel): # valid = False, means the model is not ready for use. valid: bool = False schema_configs: list[list[SchemaConfigObj]] + framework: str class ModelInfo(BaseModel): From be4cbdae803f716f0f51c91f2131f1047366730b Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:08:58 -0500 Subject: [PATCH 35/92] fix linting and test --- src/webapp/routers/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 1d7c7422..69905cd9 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -551,7 +551,7 @@ def trigger_inference_run( gcp_external_bucket_name=get_external_bucket_name(inst_id), # The institution email to which pipeline success/failure notifications will get sent. email=current_user.email, - framework=query_result[0][0].framework, + model_type=query_result[0][0].framework, ) try: res = databricks_control.run_pdp_inference(db_req) From abcd80149e58436b2c13218c758a24c5bfd6148f Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:10:20 -0500 Subject: [PATCH 36/92] fix linting and test --- src/webapp/routers/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 69905cd9..b97cab56 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -570,7 +570,7 @@ def trigger_inference_run( batch_name=req.batch_name, model_id=query_result[0][0].id, output_valid=False, - model_type=query_result[0][0].framework, + framework=query_result[0][0].framework, ) local_session.get().add(job) return { From f14297615deed824124b5dd1351cd9f6ae2042a2 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:12:59 -0500 Subject: [PATCH 37/92] fix linting and test --- src/webapp/routers/models_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 8d39a925..8c828b8e 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -338,6 +338,7 @@ def test_create_model(client: TestClient): json={ "name": "my_model", "schema_configs": [[schema_config_1, schema_config_2]], + "framework": "h2o", }, ) From 9c4944311ea8ccb5c8a3b26886b5cb9cdb026dee Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:16:54 -0500 Subject: [PATCH 38/92] fix linting and test --- src/webapp/database.py | 5 +++-- src/webapp/databricks.py | 7 +++---- src/webapp/routers/models.py | 4 +++- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 2862e6c6..7c06d74d 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -512,7 +512,7 @@ class ModelTable(Base): # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. version: Mapped[int] = mapped_column(Integer, default=0) framework: Mapped[str | None] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn' + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn" ) # Within a given institution, there should be no duplicated model names. @@ -552,9 +552,10 @@ class JobTable(Base): ) completed: Mapped[bool] = mapped_column(nullable=True) framework: Mapped[str | None] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default='sklearn' + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, default="sklearn" ) + class DocType(enum.Enum): base = "base" extension = "extension" diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 2b553953..80ab290e 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -37,6 +37,7 @@ PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline" PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline" + class DatabricksInferenceRunRequest(BaseModel): """Databricks parameters for an inference run.""" @@ -197,9 +198,7 @@ def run_pdp_inference( elif req.model_type == "h2o": pipeline_type = PDP_H2O_INFERENCE_JOB_NAME else: - raise ValueError( - "Invalid model framework assigned to institution model" - ) + raise ValueError("Invalid model framework assigned to institution model") try: job = next(w.jobs.list(name=pipeline_type), None) if not job or job.job_id is None: @@ -560,4 +559,4 @@ def create_custom_schema_extension( existing_extension=extension_schema, # may be None ) - return updated_extension \ No newline at end of file + return updated_extension diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index b97cab56..876f557b 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -216,7 +216,9 @@ def create_model( created_by=str_to_uuid(current_user.user_id), valid=req.valid, schema_configs=jsonpickle.encode(req.schema_configs), - framework=f if (f := (req.framework or "").strip().lower()) in {"sklearn","h2o"} else "sklearn", + framework=f + if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"} + else "sklearn", ) local_session.get().add(model) local_session.get().commit() From 00065c76e37352972eb1c5de917dfae4cef7e55a Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:23:06 -0500 Subject: [PATCH 39/92] fix TYPECHECK --- src/webapp/utilities.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py index 460d4e1d..c3350a22 100644 --- a/src/webapp/utilities.py +++ b/src/webapp/utilities.py @@ -2,7 +2,7 @@ import uuid import re -from typing import Annotated, Final, Any +from typing import Annotated, Final, Any, Optional from urllib.parse import unquote from strenum import StrEnum # needed for python pre 3.11 import jwt @@ -394,7 +394,7 @@ def uuid_to_str(uuid_val: uuid.UUID) -> str: return uuid_val.hex -def str_to_uuid(hex_str: str) -> uuid.UUID: +def str_to_uuid(hex_str: Optional[str]) -> uuid.UUID: """Convert str to UUID obj (database needs UUID obj).""" return uuid.UUID(hex_str) From 0badfd351568043412e8a531a1847e84f5e23be4 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 11:29:48 -0500 Subject: [PATCH 40/92] fix TYPECHECK --- src/webapp/utilities.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py index c3350a22..ee4617c6 100644 --- a/src/webapp/utilities.py +++ b/src/webapp/utilities.py @@ -163,7 +163,7 @@ class BaseUser(BaseModel): disabled: bool | None = None # Constructor - def __init__(self, usr: str | None, inst: str, access: str, email: str) -> None: + def __init__(self, usr: str | None, inst: str | None, access: str | None, email: str | None) -> None: super().__init__(user_id=usr, institution=inst, access_type=access, email=email) def is_datakinder(self) -> Any: @@ -182,7 +182,7 @@ def is_viewer(self) -> Any: """Whether a given user is a viewer.""" return self.access_type and self.access_type == AccessType.VIEWER - def has_access_to_inst(self, inst: str) -> Any: + def has_access_to_inst(self, inst: str | None) -> Any: """Whether a given user has access to a given institution.""" return self.access_type and ( self.access_type == AccessType.DATAKINDER or self.institution == inst @@ -219,7 +219,7 @@ def get_user(sess: Session, username: str) -> BaseUser: """Get user from a given username.""" if username == "api_key_initial": return BaseUser( - usr=env_vars["INITIAL_API_KEY_ID"], + usr=str(env_vars["INITIAL_API_KEY_ID"]), inst=None, access="DATAKINDER", email="api_key_initial", @@ -260,7 +260,7 @@ def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser: # Check if it's the initial API key. This doesn't have enduser or inst. if key == env_vars["INITIAL_API_KEY"]: return BaseUser( - usr=env_vars["INITIAL_API_KEY_ID"], + usr=str(env_vars["INITIAL_API_KEY_ID"]), inst=None, access="DATAKINDER", email="api_key_initial", From fecef5aaba6c8280726c235872a062853e27ce1a Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 12:46:16 -0500 Subject: [PATCH 41/92] fix: alllllllllll the typecheck issues --- src/webapp/routers/models.py | 6 +- src/webapp/routers/models_test.py | 120 +++++++++++++++--------------- src/webapp/utilities.py | 38 +++++----- 3 files changed, 84 insertions(+), 80 deletions(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 876f557b..5b13003d 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -1,7 +1,7 @@ """API functions related to models.""" from datetime import datetime -from typing import Annotated, Any +from typing import Annotated, Any, cast import jsonpickle from fastapi import APIRouter, Depends, HTTPException, status from pydantic import BaseModel @@ -60,7 +60,7 @@ def check_file_types_valid_schema_configs( """Check that a list of files are valid for a given schema configuration.""" for config in valid_schema_configs: found = True - map_file_to_schema_config_obj = {} + map_file_to_schema_config_obj: dict= {} for idx, s in enumerate(file_types): for c in config: if c.schema_type in s: @@ -552,7 +552,7 @@ def trigger_inference_run( model_name=model_name, gcp_external_bucket_name=get_external_bucket_name(inst_id), # The institution email to which pipeline success/failure notifications will get sent. - email=current_user.email, + email=cast(str, current_user.email), model_type=query_result[0][0].framework, ) try: diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 8c828b8e..cf565b90 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -2,6 +2,7 @@ import uuid from unittest import mock +from typing import Any import pytest import jsonpickle from fastapi.testclient import TestClient @@ -50,32 +51,32 @@ # TODO plumb through schema configs -def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo): +def same_model_orderless(a_elem: ModelInfo, b_elem: ModelInfo) -> bool: """Check ModelInfo equality without order.""" if ( - a_elem["inst_id"] != b_elem["inst_id"] - or a_elem["name"] != b_elem["name"] - or a_elem["m_id"] != b_elem["m_id"] - or a_elem["valid"] != b_elem["valid"] - or a_elem["deleted"] != b_elem["deleted"] + a_elem.inst_id != b_elem.inst_id + or a_elem.name != b_elem.name + or a_elem.m_id != b_elem.m_id + or a_elem.valid != b_elem.valid + or a_elem.deleted != b_elem.deleted ): return False return True -def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo): +def same_run_info_orderless(a_elem: RunInfo, b_elem: RunInfo) -> bool: """Check RunInfo equality without order.""" if ( - a_elem["inst_id"] != b_elem["inst_id"] - or a_elem["m_name"] != b_elem["m_name"] - or a_elem["run_id"] != b_elem["run_id"] - or a_elem["created_by"] != b_elem["created_by"] - or a_elem["triggered_at"] != b_elem["triggered_at"] - or a_elem["output_filename"] != b_elem["output_filename"] - or a_elem["output_valid"] != b_elem["output_valid"] - or a_elem["err_msg"] != b_elem["err_msg"] - or a_elem["batch_name"] != b_elem["batch_name"] - or a_elem["completed"] != b_elem["completed"] + a_elem.inst_id != b_elem.inst_id + or a_elem.m_name != b_elem.m_name + or a_elem.run_id != b_elem.run_id + or a_elem.created_by != b_elem.created_by + or a_elem.triggered_at != b_elem.triggered_at + or a_elem.output_filename != b_elem.output_filename + or a_elem.output_valid != b_elem.output_valid + or a_elem.err_msg != b_elem.err_msg + or a_elem.batch_name != b_elem.batch_name + or a_elem.completed != b_elem.completed ): return False return True @@ -200,7 +201,7 @@ def session_fixture(): @pytest.fixture(name="client") -def client_fixture(session: sqlalchemy.orm.Session): +def client_fixture(session: sqlalchemy.orm.Session) -> Any: """Unit test mocks setup.""" def get_session_override(): @@ -226,26 +227,25 @@ def databricks_control_override(): app.dependency_overrides.clear() -def test_read_inst_models(client: TestClient): +def test_read_inst_models(client: TestClient) -> None: """Test GET /institutions/345/models.""" response = client.get( "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/models" ) assert response.status_code == 200 assert same_model_orderless( - response.json()[0], - { - "created_by": "", - "deleted": None, - "inst_id": "1d7c75c33eda42949c6675ea8af97b55", - "m_id": "e4862c62829440d8ab4c9c298f02f619", - "name": "sample_model_for_school_1", - "valid": True, - }, + ModelInfo(**response.json()[0]), + ModelInfo( + m_id="e4862c62829440d8ab4c9c298f02f619", + name= "sample_model_for_school_1", + inst_id= "1d7c75c33eda42949c6675ea8af97b55", + deleted= None, + valid= True, + ), ) -def test_read_inst_model(client: TestClient): +def test_read_inst_model(client: TestClient) -> None: """Test GET /institutions/345/models/10. For various user access types.""" # Unauthorized cases. response_unauth = client.get( @@ -266,10 +266,17 @@ def test_read_inst_model(client: TestClient): + "/models/sample_model_for_school_1" ) assert response.status_code == 200 - assert same_model_orderless(response.json(), MODEL_OBJ) + assert same_model_orderless(response.json(), + ModelInfo( + deleted= None, + inst_id= "1d7c75c33eda42949c6675ea8af97b55", + m_id="e4862c62829440d8ab4c9c298f02f619", + name="sample_model_for_school_1", + valid=True, + )) -def test_read_inst_model_outputs(client: TestClient): +def test_read_inst_model_outputs(client: TestClient) -> None: """Test GET /institutions/345/models/10/output.""" MOCK_STORAGE.list_blobs_in_folder.return_value = [] # Authorized. @@ -281,22 +288,20 @@ def test_read_inst_model_outputs(client: TestClient): assert response.status_code == 200 assert same_run_info_orderless( response.json()[0], - { - "batch_name": "batch_foo", - "completed": True, - "created_by": "0ad8b77c49fb459a84b18d2c05722c4a", - "err_msg": None, - "inst_id": "1d7c75c33eda42949c6675ea8af97b55", - "m_name": "sample_model_for_school_1", - "output_filename": "file_output_one", - "output_valid": False, - "run_id": 123, - "triggered_at": "2024-12-24T20:22:20.132022", - }, + RunInfo( + batch_name="batch_foo", + created_by="0ad8b77c49fb459a84b18d2c05722c4a", + err_msg=None, + inst_id="1d7c75c33eda42949c6675ea8af97b55", + m_name="sample_model_for_school_1", + output_filename="file_output_one", + output_valid=False, + run_id=123, + ), ) -def test_read_inst_model_output(client: TestClient): +def test_read_inst_model_output(client: TestClient) -> None: """Test GET /institutions/345/models/10/output/1.""" # Authorized. response = client.get( @@ -308,22 +313,21 @@ def test_read_inst_model_output(client: TestClient): assert response.status_code == 200 assert same_run_info_orderless( response.json(), - { - "batch_name": "batch_foo", - "completed": True, - "created_by": "0ad8b77c49fb459a84b18d2c05722c4a", - "err_msg": None, - "inst_id": "1d7c75c33eda42949c6675ea8af97b55", - "m_name": "sample_model_for_school_1", - "output_filename": "file_output_one", - "output_valid": False, - "run_id": 123, - "triggered_at": "2024-12-24T20:22:20.132022", - }, + RunInfo( + batch_name="batch_foo", + completed=True, + created_by="0ad8b77c49fb459a84b18d2c05722c4a", + err_msg=None, + inst_id="1d7c75c33eda42949c6675ea8af97b55", + m_name="sample_model_for_school_1", + output_filename="file_output_one", + output_valid=False, + run_id=123, + ), ) -def test_create_model(client: TestClient): +def test_create_model(client: TestClient) -> None: """Depending on timeline, fellows may not get to this.""" schema_config_1 = { "schema_type": SchemaType.COURSE, @@ -345,7 +349,7 @@ def test_create_model(client: TestClient): assert response.status_code == 200 -def test_trigger_inference_run(client: TestClient): +def test_trigger_inference_run(client: TestClient) -> None: """Depending on timeline, fellows may not get to this.""" MOCK_DATABRICKS.run_pdp_inference.return_value = DatabricksInferenceRunResponse( job_run_id=123 diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py index ee4617c6..392da4d3 100644 --- a/src/webapp/utilities.py +++ b/src/webapp/utilities.py @@ -2,7 +2,7 @@ import uuid import re -from typing import Annotated, Final, Any, Optional +from typing import Annotated, Final, Any, Optional, Tuple, Union from urllib.parse import unquote from strenum import StrEnum # needed for python pre 3.11 import jwt @@ -215,7 +215,7 @@ def has_stronger_permissions_than(self, other_access_type: AccessType) -> bool: return False -def get_user(sess: Session, username: str) -> BaseUser: +def get_user(sess: Session, username: str) -> Optional[BaseUser]: """Get user from a given username.""" if username == "api_key_initial": return BaseUser( @@ -226,17 +226,17 @@ def get_user(sess: Session, username: str) -> BaseUser: ) if username.startswith("api_key_"): api_key_uuid = username.removeprefix("api_key_") - query_result = sess.execute( + apikey_query_result = sess.execute( select(ApiKeyTable).where( ApiKeyTable.id == str_to_uuid(api_key_uuid), ) ).all() - if len(query_result) == 0 or len(query_result) > 1: + if len(apikey_query_result) == 0 or len(apikey_query_result) > 1: return None return BaseUser( - usr=uuid_to_str(query_result[0][0].id), - inst=uuid_to_str(query_result[0][0].inst_id), - access=query_result[0][0].access_type, + usr=uuid_to_str(apikey_query_result[0][0].id), + inst=uuid_to_str(apikey_query_result[0][0].inst_id), + access=apikey_query_result[0][0].access_type, email=username, ) query_result = sess.execute( @@ -254,7 +254,7 @@ def get_user(sess: Session, username: str) -> BaseUser: ) -def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser: +def authenticate_api_key(api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session) -> Union[BaseUser, bool]: """Authenticate an API key.""" (key, inst, enduser) = api_key_enduser_tuple # Check if it's the initial API key. This doesn't have enduser or inst. @@ -291,7 +291,7 @@ def authenticate_api_key(api_key_enduser_tuple: str, sess: Session) -> BaseUser: user_query = select(AccountTable).where( and_( AccountTable.email == enduser, - AccountTable.inst_id == uuid_to_str(inst), + AccountTable.inst_id == inst, ) ) user_result = sess.execute(user_query).all() @@ -330,7 +330,7 @@ async def get_current_user( if not token_from_key: raise credentials_exception payload = jwt.decode( - token_from_key, env_vars["SECRET_KEY"], algorithms=env_vars["ALGORITHM"] + token_from_key, str(env_vars["SECRET_KEY"]), algorithms=env_vars["ALGORITHM"] ) usrname = payload.get("sub") if usrname is None: @@ -345,14 +345,14 @@ async def get_current_user( async def get_current_active_user( current_user: Annotated[BaseUser, Depends(get_current_user)], -): +) -> BaseUser: """Get the active user..""" if current_user.disabled: raise HTTPException(status_code=400, detail="Inactive user") return current_user -def has_access_to_inst_or_err(inst: str, user: BaseUser): +def has_access_to_inst_or_err(inst: str, user: BaseUser) -> None: """Raise error if a given user does not have access to a given institution.""" if not user.has_access_to_inst(inst): raise HTTPException( @@ -361,7 +361,7 @@ def has_access_to_inst_or_err(inst: str, user: BaseUser): ) -def has_full_data_access_or_err(user: BaseUser, resource_type: str): +def has_full_data_access_or_err(user: BaseUser, resource_type: str) -> None: """Raise error if a given user does not have data access to a given institution.""" if not user.has_full_data_access(): raise HTTPException( @@ -370,7 +370,7 @@ def has_full_data_access_or_err(user: BaseUser, resource_type: str): ) -def model_owner_and_higher_or_err(user: BaseUser, resource_type: str): +def model_owner_and_higher_or_err(user: BaseUser, resource_type: str) -> None: """Raise error if a given user does not have model ownership or higher.""" if not user.access_type or user.access_type not in ( AccessType.MODEL_OWNER, @@ -382,12 +382,12 @@ def model_owner_and_higher_or_err(user: BaseUser, resource_type: str): ) -def prepend_env_prefix(name: str) -> str: +def prepend_env_prefix(name: str) -> Any: """Prepend the env prefix. At this point the value should not be empty as we checked on app startup.""" - return env_vars["ENV"].lower() + "_" + name + return str(env_vars["ENV"]).lower() + "_" + name -def uuid_to_str(uuid_val: uuid.UUID) -> str: +def uuid_to_str(uuid_val: uuid.UUID) -> Any: """Convert UUID obj to string.""" if uuid_val is None: return "" @@ -399,12 +399,12 @@ def str_to_uuid(hex_str: Optional[str]) -> uuid.UUID: return uuid.UUID(hex_str) -def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> str: +def get_external_bucket_name_from_uuid(inst_id: uuid.UUID) -> Any: """Get the GCP bucket name which has the env prepended taking in the UUID obj.""" return prepend_env_prefix(uuid_to_str(inst_id)) -def get_external_bucket_name(inst_id: str) -> str: +def get_external_bucket_name(inst_id: str) -> Any: """Get the GCP bucket name which has the env prepended taking in the uuid as str.""" return prepend_env_prefix(inst_id) From 457fd14821f19451226eee9cbeea5bb774dd5af1 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 12:47:36 -0500 Subject: [PATCH 42/92] fix: alllllllllll the typecheck issues --- src/webapp/routers/models_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index cf565b90..45f57327 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -14,7 +14,6 @@ USER_UUID, UUID_INVALID, DATETIME_TESTING, - MODEL_OBJ, SAMPLE_UUID, ) from ..main import app From ee85f6b200c9c609bdbcd5ebd67d347a4d66d20b Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 12:53:16 -0500 Subject: [PATCH 43/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 45f57327..0459d3da 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -265,14 +265,15 @@ def test_read_inst_model(client: TestClient) -> None: + "/models/sample_model_for_school_1" ) assert response.status_code == 200 - assert same_model_orderless(response.json(), - ModelInfo( - deleted= None, - inst_id= "1d7c75c33eda42949c6675ea8af97b55", - m_id="e4862c62829440d8ab4c9c298f02f619", - name="sample_model_for_school_1", - valid=True, - )) + response_model = ModelInfo(**response.json()) + expected_model = ModelInfo( + deleted=None, + inst_id="1d7c75c33eda42949c6675ea8af97b55", + m_id="e4862c62829440d8ab4c9c298f02f619", + name="sample_model_for_school_1", + valid=True, + ) + assert same_model_orderless(response_model, expected_model) def test_read_inst_model_outputs(client: TestClient) -> None: From 2cc5937f14fe0872656e50bf6604ad9e17aeb748 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 12:56:04 -0500 Subject: [PATCH 44/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 0459d3da..4ae17ff2 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -286,19 +286,18 @@ def test_read_inst_model_outputs(client: TestClient) -> None: + "/models/sample_model_for_school_1/runs" ) assert response.status_code == 200 - assert same_run_info_orderless( - response.json()[0], - RunInfo( - batch_name="batch_foo", - created_by="0ad8b77c49fb459a84b18d2c05722c4a", - err_msg=None, - inst_id="1d7c75c33eda42949c6675ea8af97b55", - m_name="sample_model_for_school_1", - output_filename="file_output_one", - output_valid=False, - run_id=123, - ), + response_model = RunInfo(**response.json()[0]) + expected_model = RunInfo( + batch_name="batch_foo", + created_by="0ad8b77c49fb459a84b18d2c05722c4a", + err_msg=None, + inst_id="1d7c75c33eda42949c6675ea8af97b55", + m_name="sample_model_for_school_1", + output_filename="file_output_one", + output_valid=False, + run_id=123, ) + assert same_model_orderless(response_model, expected_model) def test_read_inst_model_output(client: TestClient) -> None: From 75006261311e0081feacb7e597158c9ed12bef35 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 12:59:34 -0500 Subject: [PATCH 45/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 4ae17ff2..5b0f6a15 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -297,7 +297,8 @@ def test_read_inst_model_outputs(client: TestClient) -> None: output_valid=False, run_id=123, ) - assert same_model_orderless(response_model, expected_model) + assert same_run_info_orderless(response_model, expected_model) + def test_read_inst_model_output(client: TestClient) -> None: From 7aeefabc0fa380ad45bbe9601a7460dfae502fd6 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:03:23 -0500 Subject: [PATCH 46/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 5b0f6a15..277da913 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -296,6 +296,8 @@ def test_read_inst_model_outputs(client: TestClient) -> None: output_filename="file_output_one", output_valid=False, run_id=123, + triggered_at=response_model.triggered_at, # copy from response + completed=response_model.completed ) assert same_run_info_orderless(response_model, expected_model) @@ -311,9 +313,8 @@ def test_read_inst_model_output(client: TestClient) -> None: + str(RUN_ID) ) assert response.status_code == 200 - assert same_run_info_orderless( - response.json(), - RunInfo( + response_model = RunInfo(response.json()) + expected_model = RunInfo( batch_name="batch_foo", completed=True, created_by="0ad8b77c49fb459a84b18d2c05722c4a", @@ -323,8 +324,8 @@ def test_read_inst_model_output(client: TestClient) -> None: output_filename="file_output_one", output_valid=False, run_id=123, - ), - ) + ) + assert same_run_info_orderless(response_model, expected_model) def test_create_model(client: TestClient) -> None: From baee11d2521b6ae041e2c22b879f8f791940da26 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:05:02 -0500 Subject: [PATCH 47/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 277da913..af59c66b 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -313,7 +313,7 @@ def test_read_inst_model_output(client: TestClient) -> None: + str(RUN_ID) ) assert response.status_code == 200 - response_model = RunInfo(response.json()) + response_model = RunInfo(**response.json()) expected_model = RunInfo( batch_name="batch_foo", completed=True, From 2ede593afdf4d4330346820b3e6e465f5e6330e3 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:06:02 -0500 Subject: [PATCH 48/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index af59c66b..ed600522 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -324,6 +324,8 @@ def test_read_inst_model_output(client: TestClient) -> None: output_filename="file_output_one", output_valid=False, run_id=123, + triggered_at=response_model.triggered_at, # copy from response + completed=response_model.completed ) assert same_run_info_orderless(response_model, expected_model) From 83380111e3732f658f85b2d12acabc5a7669ab2b Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:15:16 -0500 Subject: [PATCH 49/92] fix: typecheck issues --- src/webapp/routers/models_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index ed600522..6138498c 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -325,7 +325,6 @@ def test_read_inst_model_output(client: TestClient) -> None: output_valid=False, run_id=123, triggered_at=response_model.triggered_at, # copy from response - completed=response_model.completed ) assert same_run_info_orderless(response_model, expected_model) From 5ba9886d7b99e7cc1b457388d776156621de3d09 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:16:24 -0500 Subject: [PATCH 50/92] fix: typecheck issues --- src/webapp/routers/models.py | 2 +- src/webapp/routers/models_test.py | 33 +++++++++++++++---------------- src/webapp/utilities.py | 12 ++++++++--- 3 files changed, 26 insertions(+), 21 deletions(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index 5b13003d..f7737f36 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -60,7 +60,7 @@ def check_file_types_valid_schema_configs( """Check that a list of files are valid for a given schema configuration.""" for config in valid_schema_configs: found = True - map_file_to_schema_config_obj: dict= {} + map_file_to_schema_config_obj: dict = {} for idx, s in enumerate(file_types): for c in config: if c.schema_type in s: diff --git a/src/webapp/routers/models_test.py b/src/webapp/routers/models_test.py index 6138498c..1da27834 100644 --- a/src/webapp/routers/models_test.py +++ b/src/webapp/routers/models_test.py @@ -236,10 +236,10 @@ def test_read_inst_models(client: TestClient) -> None: ModelInfo(**response.json()[0]), ModelInfo( m_id="e4862c62829440d8ab4c9c298f02f619", - name= "sample_model_for_school_1", - inst_id= "1d7c75c33eda42949c6675ea8af97b55", - deleted= None, - valid= True, + name="sample_model_for_school_1", + inst_id="1d7c75c33eda42949c6675ea8af97b55", + deleted=None, + valid=True, ), ) @@ -297,12 +297,11 @@ def test_read_inst_model_outputs(client: TestClient) -> None: output_valid=False, run_id=123, triggered_at=response_model.triggered_at, # copy from response - completed=response_model.completed + completed=response_model.completed, ) assert same_run_info_orderless(response_model, expected_model) - def test_read_inst_model_output(client: TestClient) -> None: """Test GET /institutions/345/models/10/output/1.""" # Authorized. @@ -315,17 +314,17 @@ def test_read_inst_model_output(client: TestClient) -> None: assert response.status_code == 200 response_model = RunInfo(**response.json()) expected_model = RunInfo( - batch_name="batch_foo", - completed=True, - created_by="0ad8b77c49fb459a84b18d2c05722c4a", - err_msg=None, - inst_id="1d7c75c33eda42949c6675ea8af97b55", - m_name="sample_model_for_school_1", - output_filename="file_output_one", - output_valid=False, - run_id=123, - triggered_at=response_model.triggered_at, # copy from response - ) + batch_name="batch_foo", + completed=True, + created_by="0ad8b77c49fb459a84b18d2c05722c4a", + err_msg=None, + inst_id="1d7c75c33eda42949c6675ea8af97b55", + m_name="sample_model_for_school_1", + output_filename="file_output_one", + output_valid=False, + run_id=123, + triggered_at=response_model.triggered_at, # copy from response + ) assert same_run_info_orderless(response_model, expected_model) diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py index 392da4d3..8b35088b 100644 --- a/src/webapp/utilities.py +++ b/src/webapp/utilities.py @@ -163,7 +163,9 @@ class BaseUser(BaseModel): disabled: bool | None = None # Constructor - def __init__(self, usr: str | None, inst: str | None, access: str | None, email: str | None) -> None: + def __init__( + self, usr: str | None, inst: str | None, access: str | None, email: str | None + ) -> None: super().__init__(user_id=usr, institution=inst, access_type=access, email=email) def is_datakinder(self) -> Any: @@ -254,7 +256,9 @@ def get_user(sess: Session, username: str) -> Optional[BaseUser]: ) -def authenticate_api_key(api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session) -> Union[BaseUser, bool]: +def authenticate_api_key( + api_key_enduser_tuple: Tuple[str, Optional[str], Optional[str]], sess: Session +) -> Union[BaseUser, bool]: """Authenticate an API key.""" (key, inst, enduser) = api_key_enduser_tuple # Check if it's the initial API key. This doesn't have enduser or inst. @@ -330,7 +334,9 @@ async def get_current_user( if not token_from_key: raise credentials_exception payload = jwt.decode( - token_from_key, str(env_vars["SECRET_KEY"]), algorithms=env_vars["ALGORITHM"] + token_from_key, + str(env_vars["SECRET_KEY"]), + algorithms=env_vars["ALGORITHM"], ) usrname = payload.get("sub") if usrname is None: From 34c7093a96652e899f94a8db9ebd94db7b059aec Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 10 Sep 2025 13:47:37 -0500 Subject: [PATCH 51/92] fix: typecheck issues --- src/webapp/routers/models.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index f7737f36..c838004d 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -216,9 +216,11 @@ def create_model( created_by=str_to_uuid(current_user.user_id), valid=req.valid, schema_configs=jsonpickle.encode(req.schema_configs), - framework=f - if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"} - else "sklearn", + framework=( + f + if (f := (req.framework or "").strip().lower()) in {"sklearn", "h2o"} + else "sklearn" + ), ) local_session.get().add(model) local_session.get().commit() From d234f1c957550389f13bea25acb477d71e815622 Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 11 Sep 2025 15:08:22 -0500 Subject: [PATCH 52/92] fix added logging --- src/webapp/databricks.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 80ab290e..85a16e02 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -99,7 +99,15 @@ def setup_new_inst(self, inst_name: str) -> None: db_inst_name = databricksify_inst_name(inst_name) cat_name = databricks_vars["CATALOG_NAME"] for medallion in MEDALLION_LEVELS: - w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name) + try: + w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name) + except Exception as e: + LOGGER.exception( + f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}", + databricks_vars["DATABRICKS_HOST_URL"], + gcs_vars["GCP_SERVICE_ACCOUNT_EMAIL"], + ) + raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}") LOGGER.info( f"Creating medallion level schemas for {db_inst_name} & {medallion}." ) From 85d59c9c0f2f9dee635e8805f7e3f0cbb444a191 Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 11 Sep 2025 15:08:46 -0500 Subject: [PATCH 53/92] fix added logging --- src/webapp/databricks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 85a16e02..59434bb6 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -103,9 +103,7 @@ def setup_new_inst(self, inst_name: str) -> None: w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name) except Exception as e: LOGGER.exception( - f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}", - databricks_vars["DATABRICKS_HOST_URL"], - gcs_vars["GCP_SERVICE_ACCOUNT_EMAIL"], + f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}" ) raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}") LOGGER.info( From 8741a8349c1b34bb25650b1733da5bd482726746 Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 11 Sep 2025 15:09:06 -0500 Subject: [PATCH 54/92] fix added logging --- src/webapp/databricks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 59434bb6..80c31657 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -100,12 +100,16 @@ def setup_new_inst(self, inst_name: str) -> None: cat_name = databricks_vars["CATALOG_NAME"] for medallion in MEDALLION_LEVELS: try: - w.schemas.create(name=f"{db_inst_name}_{medallion}", catalog_name=cat_name) + w.schemas.create( + name=f"{db_inst_name}_{medallion}", catalog_name=cat_name + ) except Exception as e: LOGGER.exception( f"Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}" ) - raise ValueError(f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}") + raise ValueError( + f"setup_new_inst(): Failed to provision schemas in databricks for {db_inst_name}_{medallion}: {e}" + ) LOGGER.info( f"Creating medallion level schemas for {db_inst_name} & {medallion}." ) From 11f2aceaed6d7f5a49b6670282ac15a017d88544 Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 11 Sep 2025 16:32:34 -0500 Subject: [PATCH 55/92] fix added logging --- src/webapp/routers/data.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 69777abf..29b8587e 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -504,6 +504,7 @@ def create_batch( ) f_names = [] if not req.file_names else req.file_names f_ids = [] if not req.file_ids else strs_to_uuids(req.file_ids) + print(f"File names: {f_names}, File Ids: {f_ids}") # Check that the files requested for this batch exists. # Only valid non-sst generated files can be added to a batch at creation time. query_result_files = ( From 13e1b30421f52a830367f5c7841f448ed59df9b4 Mon Sep 17 00:00:00 2001 From: Mesh Date: Thu, 11 Sep 2025 16:48:31 -0500 Subject: [PATCH 56/92] fix added logging --- src/webapp/routers/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/models.py b/src/webapp/routers/models.py index c838004d..abbb0a36 100644 --- a/src/webapp/routers/models.py +++ b/src/webapp/routers/models.py @@ -96,7 +96,7 @@ class ModelCreationRequest(BaseModel): # valid = False, means the model is not ready for use. valid: bool = False schema_configs: list[list[SchemaConfigObj]] - framework: str + framework: str | None = None class ModelInfo(BaseModel): From 39396f572db7d5ac3ea0f96d8239ce9bca39da72 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 17:02:55 -0500 Subject: [PATCH 57/92] fix databricks h2o job name --- src/webapp/databricks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 80c31657..7c0cbc29 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -35,7 +35,7 @@ # The name of the deployed pipeline in Databricks. Must match directly. PDP_INFERENCE_JOB_NAME = "github_sourced_pdp_inference_pipeline" -PDP_H2O_INFERENCE_JOB_NAME = "github_sourced_pdp_h2o_inference_pipeline" +PDP_H2O_INFERENCE_JOB_NAME = "edvise_github_sourced_pdp_inference_pipeline" class DatabricksInferenceRunRequest(BaseModel): From c2d657672e01173392bb897d01422972025fdcef Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 17:29:10 -0500 Subject: [PATCH 58/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 46 ++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index b04dad58..82623d6b 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -155,6 +155,37 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: return DataFrameSchema(columns, strict=False) +def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str: + """ + Return a best-guess encoding using BOM detection + trial decode on a small sample. + Prefers utf-8-sig for BOMmed utf-8 to avoid \ufeff in headers. + """ + with open(path, "rb") as f: + chunk = f.read(sample_bytes) + + # BOM checks first + if chunk.startswith(b"\xef\xbb\xbf"): + return "utf-8-sig" + if chunk.startswith(b"\xff\xfe\x00\x00"): + return "utf-32le" + if chunk.startswith(b"\x00\x00\xfe\xff"): + return "utf-32be" + if chunk.startswith(b"\xff\xfe"): + return "utf-16le" + if chunk.startswith(b"\xfe\xff"): + return "utf-16be" + + # Try utf-8 (strict) on sample; if it works, it will work for the file + try: + chunk.decode("utf-8") + return "utf-8" + except UnicodeDecodeError: + pass + + # Last resort: latin-1 (will not fail, but log later if you want) + return "latin1" + + def validate_dataset( filename: str, base_schema: dict, @@ -162,15 +193,12 @@ def validate_dataset( models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: - read_errs = [] - for enc in ("utf-8", "utf-8-sig", "latin1"): - try: - df = pd.read_csv(filename, encoding=enc) - break - except UnicodeDecodeError as ex: - read_errs.append(f"{enc}: {ex}") - else: - raise HardValidationError(schema_errors="decode_error", failure_cases=read_errs) + enc = sniff_encoding(filename) + try: + df = pd.read_csv(filename, encoding=enc) + except UnicodeDecodeError as ex: + # extremely rare: sample passed but full file fails + raise HardValidationError(schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"]) df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) From 91890625eb6d7d325ddbe74100ab36f8556d2f18 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 17:44:03 -0500 Subject: [PATCH 59/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 82623d6b..f63c4f2a 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -198,7 +198,9 @@ def validate_dataset( df = pd.read_csv(filename, encoding=enc) except UnicodeDecodeError as ex: # extremely rare: sample passed but full file fails - raise HardValidationError(schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"]) + raise HardValidationError( + schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"] + ) df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) From 2a81f1d3eb90808bf4f6263880849813af49909e Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 17:51:45 -0500 Subject: [PATCH 60/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index f63c4f2a..d461bec1 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -182,7 +182,6 @@ def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str: except UnicodeDecodeError: pass - # Last resort: latin-1 (will not fail, but log later if you want) return "latin1" From 04ed7804ea9927267a8ccfc95c85b993bb4c5212 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:15:21 -0500 Subject: [PATCH 61/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 47 ++++++++++++++++++++++++++++------------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index d461bec1..4a6fc1c0 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -2,11 +2,10 @@ pipelines, this is for general file validation.) """ -from typing import Any - +import io, os import json import re -from typing import Union, List, Dict, Optional +from typing import Union, List, Dict, Optional, Any import logging import pandas as pd @@ -155,15 +154,34 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: return DataFrameSchema(columns, strict=False) -def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str: +def sniff_encoding( + src: Union[str, os.PathLike, io.IOBase], sample_bytes: int = 1_048_576 +) -> str: """ - Return a best-guess encoding using BOM detection + trial decode on a small sample. - Prefers utf-8-sig for BOMmed utf-8 to avoid \ufeff in headers. + Return best-guess encoding using BOM detection + utf-8 trial decode. + Accepts path or file-like. Restores stream position if seekable. + If utf-8 fails, raises UnicodeError. """ - with open(path, "rb") as f: - chunk = f.read(sample_bytes) - - # BOM checks first + # --- read small binary sample --- + if isinstance(src, (str, os.PathLike)): + with open(src, "rb") as f: + chunk = f.read(sample_bytes) + else: + buf = src.buffer if isinstance(src, io.TextIOBase) else src + pos = None + try: + if buf.seekable(): + pos = buf.tell() + except Exception: + pass + chunk = buf.read(sample_bytes) + if pos is not None: + try: + buf.seek(pos) + except Exception: + pass + + # --- BOMs first --- if chunk.startswith(b"\xef\xbb\xbf"): return "utf-8-sig" if chunk.startswith(b"\xff\xfe\x00\x00"): @@ -175,14 +193,15 @@ def sniff_encoding(path: str, sample_bytes: int = 1_048_576) -> str: if chunk.startswith(b"\xfe\xff"): return "utf-16be" - # Try utf-8 (strict) on sample; if it works, it will work for the file + # --- utf-8 strict on sample --- try: chunk.decode("utf-8") return "utf-8" except UnicodeDecodeError: - pass - - return "latin1" + raise UnicodeError( + "file is not UTF-8/UTF-16/UTF-32; " + "re-export as UTF-8 (with or without BOM)." + ) def validate_dataset( From 84c03f8f795893a533cb6a37dbea685722a3c856 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:17:14 -0500 Subject: [PATCH 62/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 4a6fc1c0..ad530682 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -211,14 +211,19 @@ def validate_dataset( models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: - enc = sniff_encoding(filename) try: - df = pd.read_csv(filename, encoding=enc) - except UnicodeDecodeError as ex: - # extremely rare: sample passed but full file fails - raise HardValidationError( - schema_errors="decode_error", failure_cases=[f"{enc}: {ex}"] - ) + enc = sniff_encoding(filename) # latin-1 is NOT allowed by default + except UnicodeError as ex: + raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)]) + + # ensure a file-like starts at beginning, then one real read + if hasattr(filename, "seek"): + try: + filename.seek(0) + except Exception: + pass + + df = pd.read_csv(filename, encoding=enc) df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) From 7e9d2095466f680e06638602120e73857b1a5695 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:18:58 -0500 Subject: [PATCH 63/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index ad530682..b59128e3 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -2,7 +2,8 @@ pipelines, this is for general file validation.) """ -import io, os +import io +import os import json import re from typing import Union, List, Dict, Optional, Any From f8c3b20df86efbd9a4f1eb0b9919cc93ae1105c5 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:20:57 -0500 Subject: [PATCH 64/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index b59128e3..6673dc86 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -200,8 +200,7 @@ def sniff_encoding( return "utf-8" except UnicodeDecodeError: raise UnicodeError( - "file is not UTF-8/UTF-16/UTF-32; " - "re-export as UTF-8 (with or without BOM)." + "file is not UTF-8/UTF-16/UTF-32; re-export as UTF-8 (with or without BOM)." ) From 666d4557fa1049f74d44a4aca4e6093e3c12751c Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:23:52 -0500 Subject: [PATCH 65/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 44 ++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 6673dc86..80f341b4 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -155,32 +155,40 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: return DataFrameSchema(columns, strict=False) +Src = Union[str, os.PathLike, io.BufferedIOBase, io.TextIOWrapper] + + def sniff_encoding( - src: Union[str, os.PathLike, io.IOBase], sample_bytes: int = 1_048_576 + src: Src, + sample_bytes: int = 1_048_576, ) -> str: """ - Return best-guess encoding using BOM detection + utf-8 trial decode. - Accepts path or file-like. Restores stream position if seekable. - If utf-8 fails, raises UnicodeError. + Best-guess encoding via BOM detection + utf-8 trial. + Works with a filesystem path, a binary stream, or a TextIOWrapper. + Restores stream position if seekable. Raises if latin-1 would be used (by default). """ - # --- read small binary sample --- + # --- read a small binary sample --- if isinstance(src, (str, os.PathLike)): with open(src, "rb") as f: chunk = f.read(sample_bytes) - else: - buf = src.buffer if isinstance(src, io.TextIOBase) else src - pos = None - try: - if buf.seekable(): - pos = buf.tell() - except Exception: - pass + elif isinstance(src, io.TextIOWrapper): + # Text wrapper => use underlying binary buffer (mypy-safe) + buf = src.buffer + pos = buf.tell() if buf.seekable() else None + chunk = buf.read(sample_bytes) + if pos is not None: + buf.seek(pos) + elif isinstance(src, io.BufferedIOBase): + # Already binary + buf = src + pos = buf.tell() if buf.seekable() else None chunk = buf.read(sample_bytes) if pos is not None: - try: - buf.seek(pos) - except Exception: - pass + buf.seek(pos) + else: + raise TypeError( + "sniff_encoding expects path, io.TextIOWrapper, or binary buffer" + ) # --- BOMs first --- if chunk.startswith(b"\xef\xbb\xbf"): @@ -200,7 +208,7 @@ def sniff_encoding( return "utf-8" except UnicodeDecodeError: raise UnicodeError( - "file is not UTF-8/UTF-16/UTF-32; re-export as UTF-8 (with or without BOM)." + "file is not UTF-8/UTF-16/UTF-32; please re-export as UTF-8." ) From af4053aea4bba676a2299c4f138fc8a9ce6ae8e5 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 18:39:48 -0500 Subject: [PATCH 66/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 80f341b4..9c0fcb40 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -6,7 +6,7 @@ import os import json import re -from typing import Union, List, Dict, Optional, Any +from typing import Union, List, Dict, Optional, Any, BinaryIO, cast import logging import pandas as pd @@ -155,13 +155,18 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: return DataFrameSchema(columns, strict=False) -Src = Union[str, os.PathLike, io.BufferedIOBase, io.TextIOWrapper] +Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper] -def sniff_encoding( - src: Src, - sample_bytes: int = 1_048_576, -) -> str: +def _read_sample(buf: BinaryIO, n: int) -> bytes: + pos = buf.tell() if buf.seekable() else None + chunk = buf.read(n) # -> bytes for BinaryIO + if pos is not None: + buf.seek(pos) + return chunk + + +def sniff_encoding(src: Src, sample_bytes: int = 1_048_576) -> str: """ Best-guess encoding via BOM detection + utf-8 trial. Works with a filesystem path, a binary stream, or a TextIOWrapper. @@ -170,25 +175,13 @@ def sniff_encoding( # --- read a small binary sample --- if isinstance(src, (str, os.PathLike)): with open(src, "rb") as f: - chunk = f.read(sample_bytes) + chunk: bytes = f.read(sample_bytes) elif isinstance(src, io.TextIOWrapper): - # Text wrapper => use underlying binary buffer (mypy-safe) - buf = src.buffer - pos = buf.tell() if buf.seekable() else None - chunk = buf.read(sample_bytes) - if pos is not None: - buf.seek(pos) - elif isinstance(src, io.BufferedIOBase): - # Already binary - buf = src - pos = buf.tell() if buf.seekable() else None - chunk = buf.read(sample_bytes) - if pos is not None: - buf.seek(pos) + # Text wrapper => use underlying binary buffer, cast to BinaryIO for mypy + chunk = _read_sample(cast(BinaryIO, src.buffer), sample_bytes) else: - raise TypeError( - "sniff_encoding expects path, io.TextIOWrapper, or binary buffer" - ) + # Already a binary stream + chunk = _read_sample(cast(BinaryIO, src), sample_bytes) # --- BOMs first --- if chunk.startswith(b"\xef\xbb\xbf"): From ca6801d8d9df379af5e31609fabbb04826c495c9 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:26:41 -0500 Subject: [PATCH 67/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 215 +++++++++++++++++--------------- src/webapp/validation_helper.py | 204 ++++++++++++++++++++++++++++++ 2 files changed, 316 insertions(+), 103 deletions(-) create mode 100644 src/webapp/validation_helper.py diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 9c0fcb40..d6e1f169 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -13,6 +13,13 @@ from pandera import Column, Check, DataFrameSchema from pandera.errors import SchemaErrors from thefuzz import fuzz +from validation_helper import ( + _header_pass, + _pandas_dtype_and_parse_dates, + _build_exact_schema, +) + +logger = logging.getLogger(__name__) def validate_file_reader( @@ -51,8 +58,7 @@ def normalize_col(name: str) -> str: name = name.strip().lower() # Lowercase and trim whitespace name = re.sub(r"[^a-z0-9_]", "_", name) # Replace non-alphanum with underscore name = re.sub(r"_+", "_", name) # Collapse multiple underscores - name = name.strip("_") # Remove leading/trailing underscores - return name + return name.strip("_") # Remove leading/trailing underscores def load_json(path: str) -> Any: @@ -155,6 +161,8 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: return DataFrameSchema(columns, strict=False) +# --------------------- Actual Validation Layer ------------------------------ + Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper] @@ -212,134 +220,135 @@ def validate_dataset( models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: + # 0) encoding try: - enc = sniff_encoding(filename) # latin-1 is NOT allowed by default + enc = sniff_encoding(filename) # latin-1 NOT allowed by default except UnicodeError as ex: raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)]) - # ensure a file-like starts at beginning, then one real read - if hasattr(filename, "seek"): - try: - filename.seek(0) - except Exception: - pass - - df = pd.read_csv(filename, encoding=enc) - - df = df.rename(columns={c: normalize_col(c) for c in df.columns}) - incoming = set(df.columns) - - # 2) merge requested models + # 1) merge requested models if models is None: - model_list = [] + model_list: List[str] = [] elif isinstance(models, str): model_list = [models] else: - model_list = list(models) # <- ensures it's not a set + model_list = list(models) merged_specs: Dict[str, dict] = {} for m in model_list: specs = merge_model_columns(base_schema, ext_schema, institution_id, m.lower()) merged_specs.update(specs) - canon_to_aliases = { - canon: [normalize_col(alias) for alias in [canon] + spec.get("aliases", [])] - for canon, spec in merged_specs.items() - } - df = rename_columns_to_match_schema(df, canon_to_aliases) - df.columns = [ - normalize_col(c) for c in df.columns - ] # Final normalization after renaming + if not merged_specs: + # nothing to validate; short-circuit + return { + "validation_status": "passed", + "schemas": model_list, + "missing_optional": [], + "unknown_extra_columns": [], + } - incoming = set(df.columns) + # 2) HEADER-ONLY PASS: map columns & find missing/extras cheaply + raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = ( + _header_pass(filename, enc, merged_specs, fuzzy_threshold=90) + ) - # 3) build canon → set(normalized names) - canon_to_norms: Dict[str, set] = { - canon: {normalize_col(alias) for alias in [canon] + spec.get("aliases", [])} - for canon, spec in merged_specs.items() - } + if missing_required: + logger.error("Missing required columns: %s", missing_required) + raise HardValidationError(missing_required=missing_required) - pattern_to_canon = { - r"^(?:" - + "|".join(map(re.escape, [canon] + spec.get("aliases", []))) - + r")$": canon - for canon, spec in merged_specs.items() + # 3) selective typed load + present_canons = sorted(set(raw_to_canon.values())) + # choose one raw column per present canonical + canon_to_raw: Dict[str, str] = {} + for raw, canon in raw_to_canon.items(): + # prefer the raw header that's already exactly canonical if present + if canon not in canon_to_raw or normalize_col(raw) == canon: + canon_to_raw[canon] = raw + + raw_usecols = list(canon_to_raw.values()) + + # dtype & parse_dates maps (by canonical); convert to raw keys for read_csv + canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs) + raw_dtype_map = { + canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw } + parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw] + + read_kwargs = dict( + encoding=enc, + usecols=raw_usecols, + dtype=raw_dtype_map or None, + parse_dates=parse_dates_raw or None, + memory_map=True, # often helps on local/posix filesystems + engine="c", # default fast path; keep behavior stable + ) + # optional speed-up if pyarrow is available; behavior stays correct + try: + import pyarrow # noqa: F401 - # 4) find extra / missing - all_norms = set().union(*canon_to_norms.values()) if canon_to_norms else set() - extra_columns = sorted(incoming - all_norms) + read_kwargs["engine"] = "pyarrow" + # pandas>=2: dtype_backend speeds strings/ints; ignore if not supported + try: + read_kwargs["dtype_backend"] = "pyarrow" + except TypeError: + pass + except Exception: + pass - missing_required = [ - canon - for canon, norms in canon_to_norms.items() - if merged_specs[canon].get("required", False) and norms.isdisjoint(incoming) - ] + df = pd.read_csv( + filename, **{k: v for k, v in read_kwargs.items() if v is not None} + ) - missing_optional = [ - canon - for canon, norms in canon_to_norms.items() - if not merged_specs[canon].get("required", False) and norms.isdisjoint(incoming) - ] + # 4) rename raw headers -> canon once (no DataFrame-wide fuzzy work) + df = df.rename(columns=canon_to_raw) # temporarily raw->canon? Not quite. + # The above renames raw names to canonical because keys are canonical? Fix: + df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}) - # Hard-fail on missing required or any extra columns - if missing_required: - if logging: - logging.error( - f"Missing required or extra columns detected, missing_required = {missing_required}, extra_columns = {extra_columns}" - ) - raise HardValidationError(missing_required=missing_required) - unknown_extra = extra_columns + # 5) REQUIRED FIRST (fail-fast), then OPTIONALS (collect soft errors) + required_canons = [ + c for c in present_canons if merged_specs[c].get("required", False) + ] + optional_canons = [ + c for c in present_canons if not merged_specs[c].get("required", False) + ] - # 5) build Pandera schema & validate (hard-fail on any error) - schema = build_schema(merged_specs) - try: - schema.validate(df, lazy=True) - except SchemaErrors as err: - # TODO: Log validation failure for DS to review - failed_normals = set(err.failure_cases["column"]) - failed_canons = {pattern_to_canon.get(p, p) for p in failed_normals} - - # split into required vs optional failures - req_failures = [ - c for c in failed_canons if merged_specs.get(c, {}).get("required", False) - ] - opt_failures = [ - c - for c in failed_canons - if not merged_specs.get(c, {}).get("required", False) - ] - - if req_failures: - if logging: - logging.error( - f"Schema validation failed on required columns, schema_errors = {err.schema_errors}, failure_cases = {err.failure_cases.to_dict(orient='records')}" - ) + # Build schemas with exact names only (faster than regex patterns) + if required_canons: + req_schema = _build_exact_schema(merged_specs, required_canons) + try: + req_schema.validate(df[required_canons], lazy=False) + except SchemaErrors as err: + logger.error("Required column validation failed.") raise HardValidationError( schema_errors=err.schema_errors, failure_cases=err.failure_cases.to_dict(orient="records"), ) - else: - if logging: - logging.info(f"missing_optional = {missing_optional}") - print("Optional column validation errors on: ", opt_failures) - return { - "validation_status": "passed_with_soft_errors", - "schemas": model_list, - "missing_optional": missing_optional, - "optional_validation_failures": opt_failures, - "failure_cases": err.failure_cases.to_dict(orient="records"), - } - if logging: - logging.info(f"missing_optional = {missing_optional}") - # 6) success (with possible soft misses) + + opt_failures: List[str] = [] + failure_cases_records: List[dict] = [] + if optional_canons: + opt_schema = _build_exact_schema(merged_specs, optional_canons) + try: + opt_schema.validate(df[optional_canons], lazy=True) + except SchemaErrors as err: + opt_failures = sorted(set(err.failure_cases["column"])) + failure_cases_records = err.failure_cases.to_dict(orient="records") + + # 6) return — status depends on soft errors / extras + if opt_failures or missing_optional or unknown_extra: + return { + "validation_status": "passed_with_soft_errors", + "schemas": model_list, + "missing_optional": missing_optional, + "optional_validation_failures": opt_failures, + "failure_cases": failure_cases_records, + "unknown_extra_columns": unknown_extra, + } + return { - "validation_status": ( - "passed_with_soft_errors" - if (missing_optional or unknown_extra) - else "passed" - ), + "validation_status": "passed", "schemas": model_list, - "missing_optional": missing_optional, - "unknown_extra_columns": unknown_extra, + "missing_optional": [], + "unknown_extra_columns": [], } diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py new file mode 100644 index 00000000..451f7839 --- /dev/null +++ b/src/webapp/validation_helper.py @@ -0,0 +1,204 @@ +import io +import os +import json +import re +import logging +from functools import lru_cache +from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple + +import pandas as pd +from pandera import Column, Check, DataFrameSchema +from pandera.errors import SchemaErrors + +logger = logging.getLogger(__name__) + + +# ---------- normalization is pure; cache it ---------- +@lru_cache(maxsize=4096) +def normalize_col(name: str) -> str: + name = name.strip().lower() + name = re.sub(r"[^a-z0-9_]", "_", name) + name = re.sub(r"_+", "_", name) + return name.strip("_") + + +def _spec_alias_lookup( + merged_specs: Dict[str, dict] +) -> Tuple[Dict[str, str], Dict[str, List[str]]]: + """ + Build fast lookups: + - alias2canon: normalized alias -> canonical + - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical) + """ + alias2canon: Dict[str, str] = {} + canon_to_aliases_norm: Dict[str, List[str]] = {} + for canon, spec in merged_specs.items(): + aliases = [canon] + spec.get("aliases", []) + normed = [normalize_col(a) for a in aliases] + canon_to_aliases_norm[canon] = normed + for a in normed: + alias2canon[a] = canon + return alias2canon, canon_to_aliases_norm + + +def _fuzzy_map_unresolved( + unresolved: List[Tuple[str, str]], # [(raw_header, normalized_header)] + choices: List[str], # normalized aliases + alias2canon: Dict[str, str], + threshold: int = 90, +) -> Dict[str, str]: # raw_header -> canonical + """ + Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz. + """ + mapping: Dict[str, str] = {} + try: + from rapidfuzz import process, fuzz as rf_fuzz # much faster + + for raw, norm in unresolved: + hit = process.extractOne( + norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold + ) + if hit: + best_alias, score, _ = hit + mapping[raw] = alias2canon[best_alias] + except Exception: + # fallback to thefuzz if rapidfuzz is unavailable + from thefuzz import fuzz as tf_fuzz + + for raw, norm in unresolved: + best_score = 0 + best_alias = None + for alias in choices: + s = tf_fuzz.ratio(norm, alias) + if s > best_score: + best_score, best_alias = s, alias + if best_alias and best_score >= threshold: + mapping[raw] = alias2canon[best_alias] + return mapping + + +def _header_pass( + filename: str, + encoding: str, + merged_specs: Dict[str, dict], + fuzzy_threshold: int = 90, +) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]: + """ + Read only the header. Return: + - raw_cols: list of column names as in file + - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy) + - missing_required: list of canonical columns missing + - missing_optional: list of optional canonical columns missing + - unknown_extra: normalized headers that don't map to any alias + """ + header_df = pd.read_csv(filename, encoding=encoding, nrows=0) + raw_cols = list(header_df.columns) + + alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs) + known_aliases = set(alias2canon.keys()) + + # exact (normalized) mapping first + raw_to_canon: Dict[str, str] = {} + unresolved: List[Tuple[str, str]] = [] + incoming_norms: List[str] = [] + + for raw in raw_cols: + norm = normalize_col(raw) + incoming_norms.append(norm) + if norm in alias2canon: + raw_to_canon[raw] = alias2canon[norm] + else: + unresolved.append((raw, norm)) + + # fuzzy match only for unresolved headers + if unresolved: + choices = list(known_aliases) + fuzzy_map = _fuzzy_map_unresolved( + unresolved, choices, alias2canon, threshold=fuzzy_threshold + ) + raw_to_canon.update(fuzzy_map) + + # derive presence/missing/extras from header only + incoming_canons = set(raw_to_canon.values()) + missing_required = [ + c + for c, spec in merged_specs.items() + if spec.get("required", False) and c not in incoming_canons + ] + missing_optional = [ + c + for c, spec in merged_specs.items() + if not spec.get("required", False) and c not in incoming_canons + ] + unknown_extra = sorted({n for (_, n) in unresolved if n not in known_aliases}) + + return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra + + +def _pandas_dtype_and_parse_dates( + merged_specs: Dict[str, dict] +) -> Tuple[Dict[str, Any], List[str]]: + """ + Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates. + We keep it conservative to avoid accuracy loss. + """ + dtype_map: Dict[str, Any] = {} + parse_dates: List[str] = [] + + for canon, spec in merged_specs.items(): + dt = str(spec.get("dtype")) + # conservative mappings + if dt in {"string", "str", "object"}: + dtype_map[canon] = "string" + elif dt in {"int", "int64", "Int64"}: + # nullable integers are much safer for dirty data + dtype_map[canon] = "Int64" + elif dt in {"float", "float64"}: + dtype_map[canon] = "float64" + elif "datetime" in dt or "date" in dt: # pandera often uses datetime64[ns] + parse_dates.append(canon) # let pandas parse as datetime + elif dt in {"bool", "boolean"}: + dtype_map[canon] = "boolean" + elif dt == "category": + dtype_map[canon] = "category" + else: + # leave unmapped types to pandas inference (keeps behavior) + pass + + return dtype_map, parse_dates + + +def _build_exact_schema( + specs: Dict[str, dict], only_canons: List[str] +) -> DataFrameSchema: + """ + Build a Pandera schema with exact column names (no regex). + This avoids regex matching overhead during validation. + """ + cols: Dict[str, Column] = {} + for canon in only_canons: + spec = specs[canon] + checks = [] + for chk in spec.get("checks", []): + # small speedup opportunities: + # - precompile regex patterns for str_matches + args = list(chk.get("args", [])) + if ( + chk["type"] in {"str_matches", "matches"} + and args + and isinstance(args[0], str) + ): + args[0] = re.compile(args[0]) + factory = getattr(Check, chk["type"]) + checks.append(factory(*args, **chk.get("kwargs", {}))) + + cols[canon] = Column( + name=canon, + regex=False, + dtype=spec["dtype"], + nullable=spec["nullable"], + required=True, # present-by-construction here + checks=checks or None, + coerce=spec.get("coerce", False), + ) + return DataFrameSchema(cols, strict=False) From cbcc1b90510ef7e97521d591abc00527e42a44e0 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:27:37 -0500 Subject: [PATCH 68/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation_helper.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py index 451f7839..a3e54732 100644 --- a/src/webapp/validation_helper.py +++ b/src/webapp/validation_helper.py @@ -1,14 +1,10 @@ -import io -import os -import json import re import logging from functools import lru_cache -from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple +from typing import List, Dict, Any, Tuple import pandas as pd from pandera import Column, Check, DataFrameSchema -from pandera.errors import SchemaErrors logger = logging.getLogger(__name__) From 624beaf625047106e3c206ca42db76091172db4a Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:30:59 -0500 Subject: [PATCH 69/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 2 +- src/webapp/validation_helper.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index d6e1f169..88b97dcc 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -13,7 +13,7 @@ from pandera import Column, Check, DataFrameSchema from pandera.errors import SchemaErrors from thefuzz import fuzz -from validation_helper import ( +from .validation_helper import ( _header_pass, _pandas_dtype_and_parse_dates, _build_exact_schema, diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py index a3e54732..3bc4cdc2 100644 --- a/src/webapp/validation_helper.py +++ b/src/webapp/validation_helper.py @@ -19,7 +19,7 @@ def normalize_col(name: str) -> str: def _spec_alias_lookup( - merged_specs: Dict[str, dict] + merged_specs: Dict[str, dict], ) -> Tuple[Dict[str, str], Dict[str, List[str]]]: """ Build fast lookups: @@ -132,7 +132,7 @@ def _header_pass( def _pandas_dtype_and_parse_dates( - merged_specs: Dict[str, dict] + merged_specs: Dict[str, dict], ) -> Tuple[Dict[str, Any], List[str]]: """ Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates. From a92862c96bc51463ea9108578241b69bff169f49 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:37:37 -0500 Subject: [PATCH 70/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 88b97dcc..20d93e06 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -283,18 +283,6 @@ def validate_dataset( memory_map=True, # often helps on local/posix filesystems engine="c", # default fast path; keep behavior stable ) - # optional speed-up if pyarrow is available; behavior stays correct - try: - import pyarrow # noqa: F401 - - read_kwargs["engine"] = "pyarrow" - # pandas>=2: dtype_backend speeds strings/ints; ignore if not supported - try: - read_kwargs["dtype_backend"] = "pyarrow" - except TypeError: - pass - except Exception: - pass df = pd.read_csv( filename, **{k: v for k, v in read_kwargs.items() if v is not None} From 680796ba06e81aa7a1b5a7e4262949f9e9ce3199 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:45:22 -0500 Subject: [PATCH 71/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 423 ++++++++++++++++++++++---------- src/webapp/validation_helper.py | 200 --------------- 2 files changed, 296 insertions(+), 327 deletions(-) delete mode 100644 src/webapp/validation_helper.py diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 20d93e06..8efc0d3d 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -1,34 +1,47 @@ -"""File validation functions for various schemas. (Record by record validation happens in the -pipelines, this is for general file validation.) +"""File validation functions for various schemas. +Record-by-record validation happens in the pipelines; this module performs +general file validation with performance-focused improvements. + +Key speed-ups (without losing accuracy): +- Header-only pass to discover/resolve columns before full load +- Selective, typed CSV read via `usecols` and dtype mapping +- Exact-name Pandera schemas (avoid regex column matching) +- Fuzzy matching only for unresolved headers; use rapidfuzz if available +- Precompiled regexes and set-based membership checks inside Pandera checks """ +from __future__ import annotations + import io import os import json import re -from typing import Union, List, Dict, Optional, Any, BinaryIO, cast import logging +from functools import lru_cache +from typing import Union, List, Dict, Optional, Any, BinaryIO, cast, Tuple import pandas as pd from pandera import Column, Check, DataFrameSchema from pandera.errors import SchemaErrors -from thefuzz import fuzz -from .validation_helper import ( - _header_pass, - _pandas_dtype_and_parse_dates, - _build_exact_schema, -) + +# --------------------------------------------------------------------------- # +# Logging +# --------------------------------------------------------------------------- # logger = logging.getLogger(__name__) +# --------------------------------------------------------------------------- # +# Public entry points +# --------------------------------------------------------------------------- # + def validate_file_reader( - filename: str, + filename: Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper], allowed_schema: list[str], base_schema: dict, inst_schema: Optional[Dict[Any, Any]] = None, ) -> dict[str, Any]: - """Validates given a filename.""" + """Validates a dataset given a filename and schema selection.""" return validate_dataset(filename, base_schema, inst_schema, allowed_schema) @@ -54,11 +67,18 @@ def __init__( super().__init__("; ".join(parts)) +# --------------------------------------------------------------------------- # +# Utilities +# --------------------------------------------------------------------------- # + + +@lru_cache(maxsize=4096) def normalize_col(name: str) -> str: - name = name.strip().lower() # Lowercase and trim whitespace - name = re.sub(r"[^a-z0-9_]", "_", name) # Replace non-alphanum with underscore - name = re.sub(r"_+", "_", name) # Collapse multiple underscores - return name.strip("_") # Remove leading/trailing underscores + """Normalize a column name: trim, lowercase, non-alnum->'_', collapse '_'s.""" + name = name.strip().lower() + name = re.sub(r"[^a-z0-9_]", "_", name) + name = re.sub(r"_+", "_", name) + return name.strip("_") def load_json(path: str) -> Any: @@ -66,57 +86,7 @@ def load_json(path: str) -> Any: with open(path, "r") as f: return json.load(f) except Exception as e: - raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}") - - -def rename_columns_to_match_schema( - df: pd.DataFrame, - canon_to_aliases: Dict[str, List[str]], - threshold: int = 90, -) -> pd.DataFrame: - """ - Rename incoming columns using fuzzy match against schema-defined column names and aliases. - - Args: - df: Incoming dataframe - canon_to_aliases: Mapping from canonical column names to list of aliases (including the canonical name itself) - threshold: Fuzzy match score threshold to rename - - Returns: - A new DataFrame with renamed columns - """ - from collections import defaultdict - - new_column_names = {} - log_info = defaultdict(list) - - schema_names = [] - for canon, aliases in canon_to_aliases.items(): - for name in aliases: - schema_names.append((name, canon)) # (alias_or_name, canonical_name) - - for incoming_col in df.columns: - best_score = 0 - best_match = None - best_canon = None - - for schema_col, canon in schema_names: - score = fuzz.ratio(incoming_col.lower(), schema_col.lower()) - if score > best_score: - best_score = score - best_match = schema_col - best_canon = canon - - if best_score >= threshold and incoming_col != best_canon: - new_column_names[incoming_col] = best_canon - log_info[incoming_col].append( - f"Renamed '{incoming_col}' -> '{best_canon}' (matched on '{best_match}', score={best_score})" - ) - - for k, v in log_info.items(): - logging.info(" | ".join(v)) - - return df.rename(columns=new_column_names) + raise FileNotFoundError(f"Failed to load JSON schema at {path}: {e}") from e def merge_model_columns( @@ -125,10 +95,12 @@ def merge_model_columns( institution: str, model: str, ) -> Dict[str, dict]: + """ + Merge base model columns with institution-specific extension, if present. + """ base_models = base_schema.get("base", {}).get("data_models", {}) if model not in base_models: - if logging: - logging.error(f"Model '{model}' not found in base schema") + logger.error("Model '%s' not found in base schema", model) raise KeyError(f"Model '{model}' not in base schema") merged = dict(base_models[model].get("columns", {})) if extension_schema: @@ -139,36 +111,16 @@ def merge_model_columns( return merged -def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: - columns = {} - for canon, spec in specs.items(): - names = [canon] + spec.get("aliases", []) - pattern = r"^(?:" + "|".join(map(re.escape, names)) + r")$" - checks = [] - for chk in spec.get("checks", []): - factory = getattr(Check, chk["type"]) - checks.append(factory(*chk.get("args", []), **chk.get("kwargs", {}))) - - columns[pattern] = Column( - name=pattern, - regex=True, - dtype=spec["dtype"], - nullable=spec["nullable"], - required=spec.get("required", False), - checks=checks or None, - coerce=spec.get("coerce", False), - ) - return DataFrameSchema(columns, strict=False) - - -# --------------------- Actual Validation Layer ------------------------------ +# --------------------------------------------------------------------------- # +# Encoding sniffing (mypy-friendly) +# --------------------------------------------------------------------------- # Src = Union[str, os.PathLike[str], BinaryIO, io.TextIOWrapper] def _read_sample(buf: BinaryIO, n: int) -> bytes: pos = buf.tell() if buf.seekable() else None - chunk = buf.read(n) # -> bytes for BinaryIO + chunk = buf.read(n) if pos is not None: buf.seek(pos) return chunk @@ -213,20 +165,221 @@ def sniff_encoding(src: Src, sample_bytes: int = 1_048_576) -> str: ) +def _reset_to_start_if_possible(src: Src) -> None: + """Best-effort reset to the beginning for file-like objects.""" + try: + if hasattr(src, "seek") and callable(getattr(src, "seek")): + src.seek(0) # type: ignore[attr-defined] + except Exception: + pass + + +# --------------------------------------------------------------------------- # +# Fast header pass & mapping +# --------------------------------------------------------------------------- # + + +def _spec_alias_lookup( + merged_specs: Dict[str, dict] +) -> Tuple[Dict[str, str], Dict[str, List[str]]]: + """ + Build: + - alias2canon: normalized alias -> canonical + - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical) + """ + alias2canon: Dict[str, str] = {} + canon_to_aliases_norm: Dict[str, List[str]] = {} + for canon, spec in merged_specs.items(): + aliases = [canon] + spec.get("aliases", []) + normed = [normalize_col(a) for a in aliases] + canon_to_aliases_norm[canon] = normed + for a in normed: + alias2canon[a] = canon + return alias2canon, canon_to_aliases_norm + + +def _fuzzy_map_unresolved( + unresolved: List[Tuple[str, str]], # [(raw_header, normalized_header)] + choices: List[str], # normalized aliases + alias2canon: Dict[str, str], + threshold: int = 90, +) -> Dict[str, str]: # raw_header -> canonical + """ + Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz. + """ + mapping: Dict[str, str] = {} + try: + from rapidfuzz import process, fuzz as rf_fuzz # type: ignore + for raw, norm in unresolved: + hit = process.extractOne(norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold) + if hit: + best_alias, score, _ = hit + mapping[raw] = alias2canon[best_alias] # type: ignore[index] + except Exception: + # fallback to thefuzz if rapidfuzz is unavailable + try: + from thefuzz import fuzz as tf_fuzz # type: ignore + except Exception: + # If neither library is available, do not fuzz-map anything. + return mapping + for raw, norm in unresolved: + best_score = 0 + best_alias = None + for alias in choices: + s = tf_fuzz.ratio(norm, alias) + if s > best_score: + best_score, best_alias = s, alias + if best_alias and best_score >= threshold: + mapping[raw] = alias2canon[best_alias] + return mapping + + +def _header_pass( + filename: Src, + encoding: str, + merged_specs: Dict[str, dict], + fuzzy_threshold: int = 90, +) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]: + """ + Read only the header. Return: + - raw_cols: list of column names as in file + - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy) + - missing_required: list of canonical columns missing + - missing_optional: list of optional canonical columns missing + - unknown_extra: normalized headers that don't map to any alias + """ + header_df = pd.read_csv(filename, encoding=encoding, nrows=0) + raw_cols = list(header_df.columns) + + alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs) + known_aliases = set(alias2canon.keys()) + + # exact (normalized) mapping first + raw_to_canon: Dict[str, str] = {} + unresolved: List[Tuple[str, str]] = [] + + for raw in raw_cols: + norm = normalize_col(raw) + if norm in alias2canon: + raw_to_canon[raw] = alias2canon[norm] + else: + unresolved.append((raw, norm)) + + # fuzzy match only for unresolved headers + if unresolved: + choices = list(known_aliases) + fuzzy_map = _fuzzy_map_unresolved(unresolved, choices, alias2canon, threshold=fuzzy_threshold) + raw_to_canon.update(fuzzy_map) + + incoming_canons = set(raw_to_canon.values()) + missing_required = [ + c for c, spec in merged_specs.items() + if spec.get("required", False) and c not in incoming_canons + ] + missing_optional = [ + c for c, spec in merged_specs.items() + if not spec.get("required", False) and c not in incoming_canons + ] + # normalized headers that remain unmapped and aren't known aliases + unknown_extra = sorted({norm for (_, norm) in unresolved if norm not in known_aliases}) + + return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra + + +def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[str, Any], List[str]]: + """ + Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates. + Keeps behavior stable while avoiding heavy inference. + """ + dtype_map: Dict[str, Any] = {} + parse_dates: List[str] = [] + + for canon, spec in merged_specs.items(): + dt = str(spec.get("dtype")) + if dt in {"string", "str", "object"}: + dtype_map[canon] = "string" + elif dt in {"int", "int64", "Int64"}: + dtype_map[canon] = "Int64" # nullable integers are safer for dirty data + elif dt in {"float", "float64"}: + dtype_map[canon] = "float64" + elif "datetime" in dt or "date" in dt: + parse_dates.append(canon) + elif dt in {"bool", "boolean"}: + dtype_map[canon] = "boolean" + elif dt == "category": + dtype_map[canon] = "category" + else: + # leave to pandas inference + pass + + return dtype_map, parse_dates + + +def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataFrameSchema: + """ + Build a Pandera schema with exact column names (no regex). + This avoids regex matching overhead during validation. + """ + cols: Dict[str, Column] = {} + for canon in only_canons: + spec = specs[canon] + checks = [] + for chk in spec.get("checks", []): + args = list(chk.get("args", [])) + # precompile regex patterns once + if chk["type"] in {"str_matches", "matches"} and args and isinstance(args[0], str): + args[0] = re.compile(args[0]) + # set-based membership for faster 'isin' + if chk["type"] in {"isin", "is_in"} and args and isinstance(args[0], list): + args[0] = set(args[0]) + + factory = getattr(Check, chk["type"]) + checks.append(factory(*args, **chk.get("kwargs", {}))) + + cols[canon] = Column( + name=canon, + regex=False, + dtype=spec["dtype"], + nullable=spec["nullable"], + required=True, # present-by-construction + checks=checks or None, + coerce=spec.get("coerce", False), + ) + return DataFrameSchema(cols, strict=False) + + +# --------------------------------------------------------------------------- # +# Main validation +# --------------------------------------------------------------------------- # + + def validate_dataset( - filename: str, + filename: Src, base_schema: dict, ext_schema: Optional[Dict[Any, Any]] = None, models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: - # 0) encoding + """ + Validate a dataset against merged base/extension schemas. + + Steps: + 1) Detect encoding (BOM/UTF-8) + 2) Merge requested models' column specs + 3) Header-only pass to map columns (exact + fuzzy) and detect missing/extra + 4) Selective, typed read via pandas (skip unused columns) + 5) Fail-fast validation for required columns; collect soft errors for optional + """ + # --- 1) encoding --- try: enc = sniff_encoding(filename) # latin-1 NOT allowed by default except UnicodeError as ex: raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)]) - # 1) merge requested models + # Ensure both header and full reads start at the beginning for file-like handles + _reset_to_start_if_possible(filename) + + # --- 2) merge requested models --- if models is None: model_list: List[str] = [] elif isinstance(models, str): @@ -248,60 +401,73 @@ def validate_dataset( "unknown_extra_columns": [], } - # 2) HEADER-ONLY PASS: map columns & find missing/extras cheaply - raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = ( - _header_pass(filename, enc, merged_specs, fuzzy_threshold=90) + # --- 3) HEADER-ONLY PASS --- + raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = _header_pass( + filename, enc, merged_specs, fuzzy_threshold=90 ) if missing_required: logger.error("Missing required columns: %s", missing_required) raise HardValidationError(missing_required=missing_required) - # 3) selective typed load - present_canons = sorted(set(raw_to_canon.values())) - # choose one raw column per present canonical + # Reset again before the real read (important for file-like objects) + _reset_to_start_if_possible(filename) + + # Choose one raw header per canonical; prefer exact canonical names when available canon_to_raw: Dict[str, str] = {} for raw, canon in raw_to_canon.items(): - # prefer the raw header that's already exactly canonical if present + # Prefer if normalized raw equals canonical name if canon not in canon_to_raw or normalize_col(raw) == canon: canon_to_raw[canon] = raw + present_canons = sorted(canon_to_raw.keys()) raw_usecols = list(canon_to_raw.values()) - # dtype & parse_dates maps (by canonical); convert to raw keys for read_csv + # dtype & parse_dates maps (by canonical) -> convert to raw keys for read_csv canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs) - raw_dtype_map = { - canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw - } + raw_dtype_map = {canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw} parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw] - read_kwargs = dict( + # --- 4) Selective, typed read --- + # Default to fast C engine; try pyarrow if available. + engine = "c" + try: + import pyarrow # noqa: F401 + engine = "pyarrow" + except Exception: + pass + + read_kwargs: Dict[str, Any] = dict( encoding=enc, usecols=raw_usecols, dtype=raw_dtype_map or None, - parse_dates=parse_dates_raw or None, - memory_map=True, # often helps on local/posix filesystems - engine="c", # default fast path; keep behavior stable - ) - - df = pd.read_csv( - filename, **{k: v for k, v in read_kwargs.items() if v is not None} + engine=engine, ) - - # 4) rename raw headers -> canon once (no DataFrame-wide fuzzy work) - df = df.rename(columns=canon_to_raw) # temporarily raw->canon? Not quite. - # The above renames raw names to canonical because keys are canonical? Fix: + # memory_map works for path-like with the C engine + if engine == "c" and isinstance(filename, (str, os.PathLike)): + read_kwargs["memory_map"] = True + # only C engine supports parse_dates consistently across versions + if parse_dates_raw: + read_kwargs["parse_dates"] = parse_dates_raw + + df = pd.read_csv(filename, **{k: v for k, v in read_kwargs.items() if v is not None}) + + # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy) + if engine == "pyarrow" and parse_dates_canons: + for canon in parse_dates_canons: + raw = canon_to_raw.get(canon) + if raw and raw in df.columns: + # coerce invalids to NaT; Pandera will flag according to nullability/checks + df[raw] = pd.to_datetime(df[raw], errors="coerce") + + # Rename raw headers -> canonical names exactly once df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}) - # 5) REQUIRED FIRST (fail-fast), then OPTIONALS (collect soft errors) - required_canons = [ - c for c in present_canons if merged_specs[c].get("required", False) - ] - optional_canons = [ - c for c in present_canons if not merged_specs[c].get("required", False) - ] + # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) --- + required_canons = [c for c in present_canons if merged_specs[c].get("required", False)] + optional_canons = [c for c in present_canons if not merged_specs[c].get("required", False)] - # Build schemas with exact names only (faster than regex patterns) + # Build exact-name schemas (faster than regex) if required_canons: req_schema = _build_exact_schema(merged_specs, required_canons) try: @@ -320,10 +486,13 @@ def validate_dataset( try: opt_schema.validate(df[optional_canons], lazy=True) except SchemaErrors as err: + # Columns are canonical already, so failure_cases['column'] are canonical names opt_failures = sorted(set(err.failure_cases["column"])) failure_cases_records = err.failure_cases.to_dict(orient="records") - # 6) return — status depends on soft errors / extras + logger.info("missing_optional = %s", missing_optional) + + # Success (with potential soft issues) if opt_failures or missing_optional or unknown_extra: return { "validation_status": "passed_with_soft_errors", diff --git a/src/webapp/validation_helper.py b/src/webapp/validation_helper.py deleted file mode 100644 index 3bc4cdc2..00000000 --- a/src/webapp/validation_helper.py +++ /dev/null @@ -1,200 +0,0 @@ -import re -import logging -from functools import lru_cache -from typing import List, Dict, Any, Tuple - -import pandas as pd -from pandera import Column, Check, DataFrameSchema - -logger = logging.getLogger(__name__) - - -# ---------- normalization is pure; cache it ---------- -@lru_cache(maxsize=4096) -def normalize_col(name: str) -> str: - name = name.strip().lower() - name = re.sub(r"[^a-z0-9_]", "_", name) - name = re.sub(r"_+", "_", name) - return name.strip("_") - - -def _spec_alias_lookup( - merged_specs: Dict[str, dict], -) -> Tuple[Dict[str, str], Dict[str, List[str]]]: - """ - Build fast lookups: - - alias2canon: normalized alias -> canonical - - canon_to_aliases_norm: canonical -> list of normalized aliases (incl. canonical) - """ - alias2canon: Dict[str, str] = {} - canon_to_aliases_norm: Dict[str, List[str]] = {} - for canon, spec in merged_specs.items(): - aliases = [canon] + spec.get("aliases", []) - normed = [normalize_col(a) for a in aliases] - canon_to_aliases_norm[canon] = normed - for a in normed: - alias2canon[a] = canon - return alias2canon, canon_to_aliases_norm - - -def _fuzzy_map_unresolved( - unresolved: List[Tuple[str, str]], # [(raw_header, normalized_header)] - choices: List[str], # normalized aliases - alias2canon: Dict[str, str], - threshold: int = 90, -) -> Dict[str, str]: # raw_header -> canonical - """ - Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz. - """ - mapping: Dict[str, str] = {} - try: - from rapidfuzz import process, fuzz as rf_fuzz # much faster - - for raw, norm in unresolved: - hit = process.extractOne( - norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold - ) - if hit: - best_alias, score, _ = hit - mapping[raw] = alias2canon[best_alias] - except Exception: - # fallback to thefuzz if rapidfuzz is unavailable - from thefuzz import fuzz as tf_fuzz - - for raw, norm in unresolved: - best_score = 0 - best_alias = None - for alias in choices: - s = tf_fuzz.ratio(norm, alias) - if s > best_score: - best_score, best_alias = s, alias - if best_alias and best_score >= threshold: - mapping[raw] = alias2canon[best_alias] - return mapping - - -def _header_pass( - filename: str, - encoding: str, - merged_specs: Dict[str, dict], - fuzzy_threshold: int = 90, -) -> Tuple[List[str], Dict[str, str], List[str], List[str], List[str]]: - """ - Read only the header. Return: - - raw_cols: list of column names as in file - - raw_to_canon: mapping raw header -> canonical (after exact+fuzzy) - - missing_required: list of canonical columns missing - - missing_optional: list of optional canonical columns missing - - unknown_extra: normalized headers that don't map to any alias - """ - header_df = pd.read_csv(filename, encoding=encoding, nrows=0) - raw_cols = list(header_df.columns) - - alias2canon, canon_to_aliases_norm = _spec_alias_lookup(merged_specs) - known_aliases = set(alias2canon.keys()) - - # exact (normalized) mapping first - raw_to_canon: Dict[str, str] = {} - unresolved: List[Tuple[str, str]] = [] - incoming_norms: List[str] = [] - - for raw in raw_cols: - norm = normalize_col(raw) - incoming_norms.append(norm) - if norm in alias2canon: - raw_to_canon[raw] = alias2canon[norm] - else: - unresolved.append((raw, norm)) - - # fuzzy match only for unresolved headers - if unresolved: - choices = list(known_aliases) - fuzzy_map = _fuzzy_map_unresolved( - unresolved, choices, alias2canon, threshold=fuzzy_threshold - ) - raw_to_canon.update(fuzzy_map) - - # derive presence/missing/extras from header only - incoming_canons = set(raw_to_canon.values()) - missing_required = [ - c - for c, spec in merged_specs.items() - if spec.get("required", False) and c not in incoming_canons - ] - missing_optional = [ - c - for c, spec in merged_specs.items() - if not spec.get("required", False) and c not in incoming_canons - ] - unknown_extra = sorted({n for (_, n) in unresolved if n not in known_aliases}) - - return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra - - -def _pandas_dtype_and_parse_dates( - merged_specs: Dict[str, dict], -) -> Tuple[Dict[str, Any], List[str]]: - """ - Best-effort mapping from your spec dtype -> pandas read_csv dtype/parse_dates. - We keep it conservative to avoid accuracy loss. - """ - dtype_map: Dict[str, Any] = {} - parse_dates: List[str] = [] - - for canon, spec in merged_specs.items(): - dt = str(spec.get("dtype")) - # conservative mappings - if dt in {"string", "str", "object"}: - dtype_map[canon] = "string" - elif dt in {"int", "int64", "Int64"}: - # nullable integers are much safer for dirty data - dtype_map[canon] = "Int64" - elif dt in {"float", "float64"}: - dtype_map[canon] = "float64" - elif "datetime" in dt or "date" in dt: # pandera often uses datetime64[ns] - parse_dates.append(canon) # let pandas parse as datetime - elif dt in {"bool", "boolean"}: - dtype_map[canon] = "boolean" - elif dt == "category": - dtype_map[canon] = "category" - else: - # leave unmapped types to pandas inference (keeps behavior) - pass - - return dtype_map, parse_dates - - -def _build_exact_schema( - specs: Dict[str, dict], only_canons: List[str] -) -> DataFrameSchema: - """ - Build a Pandera schema with exact column names (no regex). - This avoids regex matching overhead during validation. - """ - cols: Dict[str, Column] = {} - for canon in only_canons: - spec = specs[canon] - checks = [] - for chk in spec.get("checks", []): - # small speedup opportunities: - # - precompile regex patterns for str_matches - args = list(chk.get("args", [])) - if ( - chk["type"] in {"str_matches", "matches"} - and args - and isinstance(args[0], str) - ): - args[0] = re.compile(args[0]) - factory = getattr(Check, chk["type"]) - checks.append(factory(*args, **chk.get("kwargs", {}))) - - cols[canon] = Column( - name=canon, - regex=False, - dtype=spec["dtype"], - nullable=spec["nullable"], - required=True, # present-by-construction here - checks=checks or None, - coerce=spec.get("coerce", False), - ) - return DataFrameSchema(cols, strict=False) From 1b5452eca2578b110e2cbe4c7eeec45fe744c647 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:45:44 -0500 Subject: [PATCH 72/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 58 +++++++++++++++++++++++++++++----------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 8efc0d3d..7ff21755 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -200,18 +200,21 @@ def _spec_alias_lookup( def _fuzzy_map_unresolved( unresolved: List[Tuple[str, str]], # [(raw_header, normalized_header)] - choices: List[str], # normalized aliases + choices: List[str], # normalized aliases alias2canon: Dict[str, str], threshold: int = 90, -) -> Dict[str, str]: # raw_header -> canonical +) -> Dict[str, str]: # raw_header -> canonical """ Fuzzy-match only the unresolved headers, using RapidFuzz if available, otherwise thefuzz. """ mapping: Dict[str, str] = {} try: from rapidfuzz import process, fuzz as rf_fuzz # type: ignore + for raw, norm in unresolved: - hit = process.extractOne(norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold) + hit = process.extractOne( + norm, choices, scorer=rf_fuzz.ratio, score_cutoff=threshold + ) if hit: best_alias, score, _ = hit mapping[raw] = alias2canon[best_alias] # type: ignore[index] @@ -268,25 +271,33 @@ def _header_pass( # fuzzy match only for unresolved headers if unresolved: choices = list(known_aliases) - fuzzy_map = _fuzzy_map_unresolved(unresolved, choices, alias2canon, threshold=fuzzy_threshold) + fuzzy_map = _fuzzy_map_unresolved( + unresolved, choices, alias2canon, threshold=fuzzy_threshold + ) raw_to_canon.update(fuzzy_map) incoming_canons = set(raw_to_canon.values()) missing_required = [ - c for c, spec in merged_specs.items() + c + for c, spec in merged_specs.items() if spec.get("required", False) and c not in incoming_canons ] missing_optional = [ - c for c, spec in merged_specs.items() + c + for c, spec in merged_specs.items() if not spec.get("required", False) and c not in incoming_canons ] # normalized headers that remain unmapped and aren't known aliases - unknown_extra = sorted({norm for (_, norm) in unresolved if norm not in known_aliases}) + unknown_extra = sorted( + {norm for (_, norm) in unresolved if norm not in known_aliases} + ) return raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra -def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[str, Any], List[str]]: +def _pandas_dtype_and_parse_dates( + merged_specs: Dict[str, dict] +) -> Tuple[Dict[str, Any], List[str]]: """ Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates. Keeps behavior stable while avoiding heavy inference. @@ -315,7 +326,9 @@ def _pandas_dtype_and_parse_dates(merged_specs: Dict[str, dict]) -> Tuple[Dict[s return dtype_map, parse_dates -def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataFrameSchema: +def _build_exact_schema( + specs: Dict[str, dict], only_canons: List[str] +) -> DataFrameSchema: """ Build a Pandera schema with exact column names (no regex). This avoids regex matching overhead during validation. @@ -327,7 +340,11 @@ def _build_exact_schema(specs: Dict[str, dict], only_canons: List[str]) -> DataF for chk in spec.get("checks", []): args = list(chk.get("args", [])) # precompile regex patterns once - if chk["type"] in {"str_matches", "matches"} and args and isinstance(args[0], str): + if ( + chk["type"] in {"str_matches", "matches"} + and args + and isinstance(args[0], str) + ): args[0] = re.compile(args[0]) # set-based membership for faster 'isin' if chk["type"] in {"isin", "is_in"} and args and isinstance(args[0], list): @@ -402,8 +419,8 @@ def validate_dataset( } # --- 3) HEADER-ONLY PASS --- - raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = _header_pass( - filename, enc, merged_specs, fuzzy_threshold=90 + raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = ( + _header_pass(filename, enc, merged_specs, fuzzy_threshold=90) ) if missing_required: @@ -425,7 +442,9 @@ def validate_dataset( # dtype & parse_dates maps (by canonical) -> convert to raw keys for read_csv canon_dtype_map, parse_dates_canons = _pandas_dtype_and_parse_dates(merged_specs) - raw_dtype_map = {canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw} + raw_dtype_map = { + canon_to_raw[c]: dt for c, dt in canon_dtype_map.items() if c in canon_to_raw + } parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw] # --- 4) Selective, typed read --- @@ -433,6 +452,7 @@ def validate_dataset( engine = "c" try: import pyarrow # noqa: F401 + engine = "pyarrow" except Exception: pass @@ -450,7 +470,9 @@ def validate_dataset( if parse_dates_raw: read_kwargs["parse_dates"] = parse_dates_raw - df = pd.read_csv(filename, **{k: v for k, v in read_kwargs.items() if v is not None}) + df = pd.read_csv( + filename, **{k: v for k, v in read_kwargs.items() if v is not None} + ) # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy) if engine == "pyarrow" and parse_dates_canons: @@ -464,8 +486,12 @@ def validate_dataset( df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}) # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) --- - required_canons = [c for c in present_canons if merged_specs[c].get("required", False)] - optional_canons = [c for c in present_canons if not merged_specs[c].get("required", False)] + required_canons = [ + c for c in present_canons if merged_specs[c].get("required", False) + ] + optional_canons = [ + c for c in present_canons if not merged_specs[c].get("required", False) + ] # Build exact-name schemas (faster than regex) if required_canons: From c8a58720de47bc74f35118232f5c7fa61e8e1ee3 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:51:35 -0500 Subject: [PATCH 73/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 7ff21755..58922734 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -180,7 +180,7 @@ def _reset_to_start_if_possible(src: Src) -> None: def _spec_alias_lookup( - merged_specs: Dict[str, dict] + merged_specs: Dict[str, dict], ) -> Tuple[Dict[str, str], Dict[str, List[str]]]: """ Build: @@ -296,7 +296,7 @@ def _header_pass( def _pandas_dtype_and_parse_dates( - merged_specs: Dict[str, dict] + merged_specs: Dict[str, dict], ) -> Tuple[Dict[str, Any], List[str]]: """ Conservative mapping from spec dtype -> pandas read_csv dtype/parse_dates. From e9d2067114e0b31edd78a2377ca61fe4a5e0d489 Mon Sep 17 00:00:00 2001 From: Mesh Date: Fri, 12 Sep 2025 19:54:18 -0500 Subject: [PATCH 74/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 58922734..fd7abd13 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -477,7 +477,7 @@ def validate_dataset( # If we used the pyarrow engine, perform datetime parsing post-read (keeps accuracy) if engine == "pyarrow" and parse_dates_canons: for canon in parse_dates_canons: - raw = canon_to_raw.get(canon) + raw = str(canon_to_raw.get(canon)) if raw and raw in df.columns: # coerce invalids to NaT; Pandera will flag according to nullability/checks df[raw] = pd.to_datetime(df[raw], errors="coerce") From 1d8fd3fb67c57f0223b07837cf66fe28160ec917 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sat, 13 Sep 2025 08:36:20 -0500 Subject: [PATCH 75/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index fd7abd13..2b5ced20 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -452,7 +452,6 @@ def validate_dataset( engine = "c" try: import pyarrow # noqa: F401 - engine = "pyarrow" except Exception: pass From 1085628302842ae41dc2aaa4556c06c1d6b0fffd Mon Sep 17 00:00:00 2001 From: Mesh Date: Sat, 13 Sep 2025 08:38:47 -0500 Subject: [PATCH 76/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 2b5ced20..fd7abd13 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -452,6 +452,7 @@ def validate_dataset( engine = "c" try: import pyarrow # noqa: F401 + engine = "pyarrow" except Exception: pass From a5fd596075c7264b05cd10b5a6a271ad810324f2 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sat, 13 Sep 2025 09:30:12 -0500 Subject: [PATCH 77/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index fd7abd13..dec777a7 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -387,16 +387,16 @@ def validate_dataset( 4) Selective, typed read via pandas (skip unused columns) 5) Fail-fast validation for required columns; collect soft errors for optional """ - # --- 1) encoding --- + # ---------------------------- 1) Encoding try: - enc = sniff_encoding(filename) # latin-1 NOT allowed by default + enc = sniff_encoding(filename) except UnicodeError as ex: raise HardValidationError(schema_errors="decode_error", failure_cases=[str(ex)]) # Ensure both header and full reads start at the beginning for file-like handles _reset_to_start_if_possible(filename) - # --- 2) merge requested models --- + # ---------------------------- 2) merge requested models if models is None: model_list: List[str] = [] elif isinstance(models, str): @@ -418,7 +418,7 @@ def validate_dataset( "unknown_extra_columns": [], } - # --- 3) HEADER-ONLY PASS --- + # ---------------------------- 3) HEADER-ONLY PASS raw_cols, raw_to_canon, missing_required, missing_optional, unknown_extra = ( _header_pass(filename, enc, merged_specs, fuzzy_threshold=90) ) @@ -447,13 +447,22 @@ def validate_dataset( } parse_dates_raw = [canon_to_raw[c] for c in parse_dates_canons if c in canon_to_raw] - # --- 4) Selective, typed read --- + # ---------------------------- 4) Selective, typed read # Default to fast C engine; try pyarrow if available. engine = "c" + use_threads = None # only meaningful for pyarrow engine + dtype_backend = None + try: import pyarrow # noqa: F401 engine = "pyarrow" + use_threads = True # multi-threaded CSV parsing + # pandas>=2.0 can store DataFrame blocks as Arrow arrays (often faster) + try: + dtype_backend = "pyarrow" + except TypeError: + dtype_backend = None except Exception: pass @@ -462,11 +471,12 @@ def validate_dataset( usecols=raw_usecols, dtype=raw_dtype_map or None, engine=engine, + dtype_backend=dtype_backend, # ignored if None / not supported + use_threads=use_threads, # ignored by C engine ) # memory_map works for path-like with the C engine if engine == "c" and isinstance(filename, (str, os.PathLike)): read_kwargs["memory_map"] = True - # only C engine supports parse_dates consistently across versions if parse_dates_raw: read_kwargs["parse_dates"] = parse_dates_raw @@ -485,7 +495,7 @@ def validate_dataset( # Rename raw headers -> canonical names exactly once df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}) - # --- 5) Validation: required fail-fast, optional lazy (collect soft errors) --- + # ---------------------------- 5) Validation: required fail-fast, optional lazy (collect soft errors) required_canons = [ c for c in present_canons if merged_specs[c].get("required", False) ] From e0f104aa66bd656f81d2e00de5a24291484d3742 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sat, 13 Sep 2025 09:39:29 -0500 Subject: [PATCH 78/92] fixing validation issues with problematic MSUD file: Optimizing encoding check --- src/webapp/validation.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index dec777a7..e02df270 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -450,19 +450,10 @@ def validate_dataset( # ---------------------------- 4) Selective, typed read # Default to fast C engine; try pyarrow if available. engine = "c" - use_threads = None # only meaningful for pyarrow engine - dtype_backend = None - try: import pyarrow # noqa: F401 engine = "pyarrow" - use_threads = True # multi-threaded CSV parsing - # pandas>=2.0 can store DataFrame blocks as Arrow arrays (often faster) - try: - dtype_backend = "pyarrow" - except TypeError: - dtype_backend = None except Exception: pass @@ -471,12 +462,11 @@ def validate_dataset( usecols=raw_usecols, dtype=raw_dtype_map or None, engine=engine, - dtype_backend=dtype_backend, # ignored if None / not supported - use_threads=use_threads, # ignored by C engine ) # memory_map works for path-like with the C engine if engine == "c" and isinstance(filename, (str, os.PathLike)): read_kwargs["memory_map"] = True + # only C engine supports parse_dates consistently across versions if parse_dates_raw: read_kwargs["parse_dates"] = parse_dates_raw @@ -493,7 +483,7 @@ def validate_dataset( df[raw] = pd.to_datetime(df[raw], errors="coerce") # Rename raw headers -> canonical names exactly once - df = df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}) + df.rename(columns={raw: canon for canon, raw in canon_to_raw.items()}, inplace=True) # ---------------------------- 5) Validation: required fail-fast, optional lazy (collect soft errors) required_canons = [ From edb2a34b240ce3eafa93bd2175dace851464d8b9 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 08:49:32 -0500 Subject: [PATCH 79/92] revamped entire validation helper script --- src/webapp/routers/data.py | 314 +++++++++++++++++++++++-------------- 1 file changed, 196 insertions(+), 118 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 29b8587e..70c679d8 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1033,143 +1033,225 @@ def validation_helper( storage_control: StorageControl, sql_session: Session, ) -> Any: - """Helper function for file validation.""" + """Helper function for file validation (self-contained & optimized).""" + import time + import re + import os + + # --- access check & quick input validation has_access_to_inst_or_err(inst_id, current_user) - if file_name.find("/") != -1: - raise HTTPException( - status_code=422, - detail="File name can't contain '/'.", - ) + if "/" in file_name: + raise HTTPException(status_code=422, detail="File name can't contain '/'.") + + # --- bind session once local_session.set(sql_session) + sess = local_session.get() - allowed_schemas = None - if not allowed_schemas: - allowed_schemas = infer_models_from_filename(file_name) + # --- one-time initialization on the function object (kept in-process) + if not hasattr(validation_helper, "_ar_re"): + validation_helper._ar_re = re.compile( + r"(?, "val": (, )} + validation_helper._base_cache = {"exp": 0.0, "val": None} + if not hasattr(validation_helper, "_ext_cache"): + # { str(inst_uuid): (exp, extension_json_doc) } + validation_helper._ext_cache = {} + if not hasattr(validation_helper, "_pdp_cache"): + # PDP-wide extension (active), cached: (exp, doc) + validation_helper._pdp_cache = (0.0, None) + + AR_RE = validation_helper._ar_re + BASE_TTL = 300 # seconds + EXT_TTL = 120 # seconds + + # --- filename → allowed_schemas (fast, single pass) + name = os.path.basename(file_name).lower() + has_course = "course" in name + has_semester = "semester" in name + has_student = ( + ("student" in name) + or ("cohort" in name) + or ( + (not has_course) + and (AR_RE.search(name) is not None or "deidentified" in name) + ) + ) - inferred_schemas: list[str] = [] - # ----------------------- Fetch base schema from DB ------------------------------- - base_schema = ( - local_session.get() - .execute( + inferred_from_name: set[str] = set() + if has_course: + inferred_from_name.add("COURSE") + if has_student: + inferred_from_name.add("STUDENT") + if has_semester: + inferred_from_name.add("SEMESTER") + + if not inferred_from_name: + raise ValueError( + f"Could not infer model(s) from file name: {name}. " + "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')." + ) + + allowed_schemas = sorted(inferred_from_name) + + # --- fetch active base schema (cached) + now = time.monotonic() + base_cache = validation_helper._base_cache + if now < base_cache["exp"] and base_cache["val"] is not None: + base_schema_id, base_schema = base_cache["val"] + else: + row = sess.execute( select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc) .where( SchemaRegistryTable.doc_type == DocType.base, SchemaRegistryTable.is_active.is_(True), ) .limit(1) - ) - .first() - ) - if base_schema is None: - raise RuntimeError("No active base schema found") - - base_schema_id, base_schema = base_schema - # ----------------------- Fetch inst specific extension schema from DB --------------------- - inst = ( - local_session.get() - .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id))) - .scalar_one_or_none() - ) + ).first() + if row is None: + raise RuntimeError("No active base schema found") + base_schema_id, base_schema = row + base_cache["exp"] = now + BASE_TTL + base_cache["val"] = (base_schema_id, base_schema) + + # --- fetch institution record + inst = sess.execute( + select(InstTable).where(InstTable.id == str_to_uuid(inst_id)) + ).scalar_one_or_none() if inst is None: raise ValueError(f"Institution {inst_id} not found") - if inst.pdp_id: # institution is PDP - inst_schema = ( - local_session.get() - .execute( + bucket = get_external_bucket_name(inst_id) + + # --- choose / prepare extension schema (try to avoid heavy path) + updated_inst_schema: Optional[dict] = None + + def _ext_models_set(doc: Optional[dict]) -> set[str]: + """Extract model keys from an extension document (root or institutions.* layout).""" + if not doc or not isinstance(doc, dict): + return set() + # root-level + if isinstance(doc.get("data_models"), dict): + return {str(k).lower() for k in doc["data_models"].keys()} + # nested by institution + inst_key_candidates = {str(getattr(inst, "id", "")), inst_id} + insts = doc.get("institutions", {}) + if isinstance(insts, dict): + for key in inst_key_candidates: + block = insts.get(key) + if isinstance(block, dict) and isinstance( + block.get("data_models"), dict + ): + return {str(k).lower() for k in block["data_models"].keys()} + return set() + + if getattr(inst, "pdp_id", None): + # PDP institutions: use active PDP extension (cached) + pdp_exp, pdp_doc = validation_helper._pdp_cache + if now < pdp_exp and pdp_doc is not None: + inst_schema = pdp_doc + else: + inst_schema = sess.execute( select(SchemaRegistryTable.json_doc) .where( SchemaRegistryTable.is_pdp.is_(True), SchemaRegistryTable.is_active.is_(True), ) .limit(1) - ) - .scalar_one_or_none() - ) - updated_inst_schema: dict | None = inst_schema - else: # custom (or none) - inst_schema = ( - local_session.get() - .execute( + ).scalar_one_or_none() + validation_helper._pdp_cache = (now + EXT_TTL, inst_schema) + updated_inst_schema = inst_schema + else: + # custom institutions: try cached extension first + ext_cache = validation_helper._ext_cache + key = str(getattr(inst, "id", "")) + cached = ext_cache.get(key) + if cached and now < cached[0]: + inst_schema = cached[1] + else: + inst_schema = sess.execute( select(SchemaRegistryTable.json_doc) .where( - SchemaRegistryTable.inst_id == inst.id, + SchemaRegistryTable.inst_id == getattr(inst, "id", None), SchemaRegistryTable.is_active.is_(True), - SchemaRegistryTable.doc_type == DocType.extension, # be explicit + SchemaRegistryTable.doc_type == DocType.extension, ) .limit(1) - ) - .scalar_one_or_none() - ) - - dbc = DatabricksControl() - schema_extension = dbc.create_custom_schema_extension( - bucket_name=get_external_bucket_name(inst_id), - inst_query=inst, - file_name=file_name, - base_schema=base_schema, - extension_schema=inst_schema, - ) + ).scalar_one_or_none() + ext_cache[key] = (now + EXT_TTL, inst_schema) - if schema_extension is not None: - updated_inst_schema = schema_extension - try: - new_schema_extension_record = SchemaRegistryTable( - doc_type=DocType.extension, - inst_id=str_to_uuid(inst_id), - is_pdp=False, # type: ignore - version_label="1.0.0", - extends_schema_id=base_schema_id, - json_doc=schema_extension, - is_active=True, - ) - sess = local_session.get() - sess.add(new_schema_extension_record) - sess.flush() - logging.info("Schema record inserted for '%s'", inst_id) - except IntegrityError as e: - sess = local_session.get() - sess.rollback() - logging.warning("IntegrityError: %s", e) - except Exception as e: - sess = local_session.get() - sess.rollback() - logging.error("Unexpected DB error: %s", e) - raise HTTPException( - status_code=500, - detail=f"Unexpected database error while inserting file record: {e}", - ) + # If extension already includes all inferred models, skip Databricks work. + inferred_lower = {m.lower() for m in allowed_schemas} + ext_models = _ext_models_set(inst_schema) + if inferred_lower.issubset(ext_models): + updated_inst_schema = inst_schema else: - logging.info( - "No-op: extension already contains this model for inst %s", inst_id + # heavy path only when needed + dbc = DatabricksControl() + schema_extension = dbc.create_custom_schema_extension( + bucket_name=bucket, + inst_query=inst, + file_name=file_name, + base_schema=base_schema, + extension_schema=inst_schema, ) - updated_inst_schema = inst_schema + if schema_extension is not None: + updated_inst_schema = schema_extension + try: + new_schema_extension_record = SchemaRegistryTable( + doc_type=DocType.extension, + inst_id=str_to_uuid(inst_id), + is_pdp=False, # type: ignore + version_label="1.0.0", + extends_schema_id=base_schema_id, + json_doc=schema_extension, + is_active=True, + ) + sess.add(new_schema_extension_record) + sess.flush() + logging.info("Schema record inserted for '%s'", inst_id) + # refresh cache + validation_helper._ext_cache[key] = ( + time.monotonic() + EXT_TTL, + schema_extension, + ) + except IntegrityError as e: + sess.rollback() + logging.warning("IntegrityError: %s", e) + except Exception as e: + sess.rollback() + logging.error("Unexpected DB error: %s", e) + raise HTTPException( + status_code=500, + detail=f"Unexpected database error while inserting file record: {e}", + ) + else: + logging.info( + "No-op: extension already contains this model for inst %s", inst_id + ) + updated_inst_schema = inst_schema - # ----------------------- File validation logic logic -------------------------------------- + # --- run file validation (I/O + Pandera work happens inside storage layer) try: inferred_schemas = storage_control.validate_file( - get_external_bucket_name(inst_id), + bucket, file_name, allowed_schemas, base_schema, updated_inst_schema, ) - logging.debug( - "!!!!!!!!!!Inferred Schemas was successful %s", list(inferred_schemas) - ) - + logging.debug("Inferred Schemas success %s", list(inferred_schemas)) except HardValidationError as e: - logging.debug("!!!!!!!!!!Inferred Schemas FAILED (hard) %s", e) - # Build a single string - frontend can render this reliably - msg_parts = ["VALIDATION_FAILED"] + logging.debug("Inferred Schemas FAILED (hard) %s", e) + parts = ["VALIDATION_FAILED"] if e.missing_required: - msg_parts.append(f"missing_required={e.missing_required}") + parts.append(f"missing_required={e.missing_required}") if e.extra_columns: - msg_parts.append(f"extra_columns={e.extra_columns}") + parts.append(f"extra_columns={e.extra_columns}") if e.schema_errors is not None: - msg_parts.append(f"schema_errors={e.schema_errors}") + parts.append(f"schema_errors={e.schema_errors}") if e.failure_cases is not None: - # keep short; avoid dumping huge tables try: sample = ( e.failure_cases[:5] @@ -1178,31 +1260,26 @@ def validation_helper( ) except Exception: sample = "see server logs" - msg_parts.append(f"failure_cases_sample={sample}") + parts.append(f"failure_cases_sample={sample}") raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="; ".join(msg_parts), + status_code=status.HTTP_400_BAD_REQUEST, detail="; ".join(parts) ) - except Exception as e: - logging.debug("!!!!!!!!!!Inferred Schemas FAILED (other) %s", e) + logging.debug("Inferred Schemas FAILED (other) %s", e) raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, detail=f"VALIDATION_ERROR: {type(e).__name__}: {e}", ) + # --- upsert file record (cheap path) existing_file = ( - local_session.get() - .query(FileTable) - .filter_by( - name=file_name, - inst_id=str_to_uuid(inst_id), - ) + sess.query(FileTable) + .filter_by(name=file_name, inst_id=str_to_uuid(inst_id)) .first() ) if existing_file: - logging.info(f"File '{file_name}' already exists for institution {inst_id}.") + logging.info("File '%s' already exists for institution %s.", file_name, inst_id) db_status = f"File '{file_name}' already exists for institution {inst_id}." else: try: @@ -1212,20 +1289,21 @@ def validation_helper( uploader=str_to_uuid(current_user.user_id), # type: ignore source=source_str, sst_generated=False, - schemas=list(allowed_schemas), + # Store what validation actually inferred (not only filename guess) + schemas=list(inferred_schemas), valid=True, ) - local_session.get().add(new_file_record) - local_session.get().flush() - logging.info(f"File record inserted for '{file_name}'") + sess.add(new_file_record) + sess.flush() + logging.info("File record inserted for '%s'", file_name) db_status = f"File record inserted for '{file_name}'" except IntegrityError as e: - local_session.get().rollback() - logging.warning(f"IntegrityError: {e}") + sess.rollback() + logging.warning("IntegrityError: %s", e) db_status = "Already exists" except Exception as e: - local_session.get().rollback() - logging.error(f"Unexpected DB error: {e}") + sess.rollback() + logging.error("Unexpected DB error: %s", e) raise HTTPException( status_code=500, detail=f"Unexpected database error while inserting file record: {e}", @@ -1234,7 +1312,7 @@ def validation_helper( return { "name": file_name, "inst_id": inst_id, - "file_types": list(allowed_schemas), + "file_types": list(inferred_schemas), "source": source_str, "status": db_status, } From 00d939de0d79980f4ac1526a33a98265f0227f54 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 08:53:26 -0500 Subject: [PATCH 80/92] revamped entire validation helper script --- src/webapp/routers/data_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 9b1c1c31..d0aaf9e0 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -569,7 +569,7 @@ def test_update_batch(client: TestClient) -> None: def test_validate_success_batch(client: TestClient) -> None: """Test PATCH /institutions//batch.""" - MOCK_STORAGE.validate_file.return_value = ["UNKNOWN"] + MOCK_STORAGE.validate_file.return_value = ["COURSE"] # Use validate for manual upload response_upload = client.post( From 5a5cd32641d9c46534777bb8c5df10dd0d4825fc Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:10:29 -0500 Subject: [PATCH 81/92] revamped entire validation helper script --- src/webapp/routers/data.py | 14 ++++++++++---- src/webapp/routers/data_test.py | 2 +- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 70c679d8..de47f693 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1277,7 +1277,14 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: .filter_by(name=file_name, inst_id=str_to_uuid(inst_id)) .first() ) - + if set(inferred_schemas) != set(allowed_schemas): + logging.info( + "Filename inference %s differs from validator result %s for %s; " + "returning filename-based types to preserve API contract.", + allowed_schemas, + inferred_schemas, + file_name, + ) if existing_file: logging.info("File '%s' already exists for institution %s.", file_name, inst_id) db_status = f"File '{file_name}' already exists for institution {inst_id}." @@ -1289,8 +1296,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: uploader=str_to_uuid(current_user.user_id), # type: ignore source=source_str, sst_generated=False, - # Store what validation actually inferred (not only filename guess) - schemas=list(inferred_schemas), + schemas=list(allowed_schemas), valid=True, ) sess.add(new_file_record) @@ -1312,7 +1318,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: return { "name": file_name, "inst_id": inst_id, - "file_types": list(inferred_schemas), + "file_types": list(allowed_schemas), "source": source_str, "status": db_status, } diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index d0aaf9e0..9b1c1c31 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -569,7 +569,7 @@ def test_update_batch(client: TestClient) -> None: def test_validate_success_batch(client: TestClient) -> None: """Test PATCH /institutions//batch.""" - MOCK_STORAGE.validate_file.return_value = ["COURSE"] + MOCK_STORAGE.validate_file.return_value = ["UNKNOWN"] # Use validate for manual upload response_upload = client.post( From a261e8e119ff2cd7600f1f26ee7d23a3fcce6ffd Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:24:02 -0500 Subject: [PATCH 82/92] revamped entire validation helper script --- src/webapp/routers/data.py | 40 ++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index de47f693..3d8b914b 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -3,7 +3,7 @@ import uuid from datetime import datetime, date from databricks.sdk import WorkspaceClient -from typing import Annotated, Any, Dict, List, cast, IO, Optional +from typing import Annotated, Any, Dict, List, cast, IO, Optional, Tuple from pydantic import BaseModel, Field from fastapi import APIRouter, Depends, HTTPException, status, Response, Query from fastapi.responses import FileResponse @@ -1025,6 +1025,16 @@ def infer_models_from_filename(file_path: str) -> List[str]: return sorted(inferred) +class _ValidationState: + _ar_re = re.compile(r"(?, "val": (, )} - validation_helper._base_cache = {"exp": 0.0, "val": None} - if not hasattr(validation_helper, "_ext_cache"): - # { str(inst_uuid): (exp, extension_json_doc) } - validation_helper._ext_cache = {} - if not hasattr(validation_helper, "_pdp_cache"): - # PDP-wide extension (active), cached: (exp, doc) - validation_helper._pdp_cache = (0.0, None) - - AR_RE = validation_helper._ar_re + AR_RE = STATE._ar_re BASE_TTL = 300 # seconds EXT_TTL = 120 # seconds @@ -1097,7 +1092,7 @@ def validation_helper( # --- fetch active base schema (cached) now = time.monotonic() - base_cache = validation_helper._base_cache + base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: base_schema_id, base_schema = base_cache["val"] else: @@ -1123,7 +1118,6 @@ def validation_helper( raise ValueError(f"Institution {inst_id} not found") bucket = get_external_bucket_name(inst_id) - # --- choose / prepare extension schema (try to avoid heavy path) updated_inst_schema: Optional[dict] = None @@ -1148,7 +1142,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: if getattr(inst, "pdp_id", None): # PDP institutions: use active PDP extension (cached) - pdp_exp, pdp_doc = validation_helper._pdp_cache + pdp_exp, pdp_doc = STATE._pdp_cache if now < pdp_exp and pdp_doc is not None: inst_schema = pdp_doc else: @@ -1160,11 +1154,11 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: ) .limit(1) ).scalar_one_or_none() - validation_helper._pdp_cache = (now + EXT_TTL, inst_schema) + STATE._pdp_cache = (now + EXT_TTL, inst_schema) updated_inst_schema = inst_schema else: # custom institutions: try cached extension first - ext_cache = validation_helper._ext_cache + ext_cache = STATE._ext_cache key = str(getattr(inst, "id", "")) cached = ext_cache.get(key) if cached and now < cached[0]: @@ -1212,7 +1206,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: sess.flush() logging.info("Schema record inserted for '%s'", inst_id) # refresh cache - validation_helper._ext_cache[key] = ( + STATE._ext_cache[key] = ( time.monotonic() + EXT_TTL, schema_extension, ) From e332c3740c71dc5d4e403e3b190e2c829e9d5fb7 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Mon, 15 Sep 2025 10:27:41 -0400 Subject: [PATCH 83/92] debugging not being able to find h2o pipeline --- src/webapp/databricks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/databricks.py b/src/webapp/databricks.py index 7c0cbc29..94a89576 100644 --- a/src/webapp/databricks.py +++ b/src/webapp/databricks.py @@ -213,7 +213,7 @@ def run_pdp_inference( job = next(w.jobs.list(name=pipeline_type), None) if not job or job.job_id is None: raise ValueError( - f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id." + f"run_pdp_inference(): Job '{pipeline_type}' was not found or has no job_id for '{gcs_vars['GCP_SERVICE_ACCOUNT_EMAIL']}' and '{databricks_vars['DATABRICKS_HOST_URL']}'." ) job_id = job.job_id LOGGER.info(f"Resolved job ID for '{pipeline_type}': {job_id}") From 12715738e03462bb77cf69f45680269f3c45578e Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Mon, 15 Sep 2025 10:31:49 -0400 Subject: [PATCH 84/92] style --- src/webapp/routers/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 3d8b914b..a0987681 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1045,7 +1045,6 @@ def validation_helper( ) -> Any: """Helper function for file validation (self-contained & optimized).""" import time - import re import os # --- access check & quick input validation From 955809afc8daedebec2089b963e0e6d04fdf32ef Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:35:34 -0500 Subject: [PATCH 85/92] revamped entire validation helper script --- src/webapp/routers/data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 3d8b914b..469561cf 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1094,7 +1094,9 @@ def validation_helper( now = time.monotonic() base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: - base_schema_id, base_schema = base_cache["val"] + base_schema_id, base_schema = base_cache[ + "val" + ] # pylint: disable=unpacking-non-sequence else: row = sess.execute( select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc) From 86df88f8ac8c46e8123335ef34c74680aadb653d Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:39:36 -0500 Subject: [PATCH 86/92] revamped entire validation helper script --- src/webapp/routers/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index e01478cb..0e6d2f57 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1093,7 +1093,7 @@ def validation_helper( now = time.monotonic() base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: - base_schema_id, base_schema = base_cache[ + base_schema_id, base_schema = base_cache[ # pylint: disable=unpacking-non-sequence "val" ] # pylint: disable=unpacking-non-sequence else: From 5cdcb99c4a25f225cc31e2e0c2a2b8526453180d Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:41:41 -0500 Subject: [PATCH 87/92] revamped entire validation helper script --- src/webapp/routers/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 0e6d2f57..755b088b 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1093,9 +1093,9 @@ def validation_helper( now = time.monotonic() base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: - base_schema_id, base_schema = base_cache[ # pylint: disable=unpacking-non-sequence + base_schema_id, base_schema = base_cache[ "val" - ] # pylint: disable=unpacking-non-sequence + ] # pylint: disable=unpacking-non-sequence # pylint: disable=unpacking-non-sequence else: row = sess.execute( select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc) From 3a1819aab25de899c29523f0ced274403ca9fc96 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:42:59 -0500 Subject: [PATCH 88/92] revamped entire validation helper script --- src/webapp/routers/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 755b088b..af82ccc2 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1093,9 +1093,7 @@ def validation_helper( now = time.monotonic() base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: - base_schema_id, base_schema = base_cache[ - "val" - ] # pylint: disable=unpacking-non-sequence # pylint: disable=unpacking-non-sequence + base_schema_id, base_schema = base_cache["val"] # pylint: disable=unpacking-non-sequence # pylint: disable=unpacking-non-sequence else: row = sess.execute( select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc) From f7cae1db5f9b0c5629e75848713d002c05a3da41 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:47:00 -0500 Subject: [PATCH 89/92] revamped entire validation helper script --- src/webapp/routers/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index af82ccc2..a08f8d2e 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1093,7 +1093,7 @@ def validation_helper( now = time.monotonic() base_cache = STATE._base_cache if now < base_cache["exp"] and base_cache["val"] is not None: - base_schema_id, base_schema = base_cache["val"] # pylint: disable=unpacking-non-sequence # pylint: disable=unpacking-non-sequence + base_schema_id, base_schema = base_cache["val"] # pylint: disable=unpacking-non-sequence # fmt: skip else: row = sess.execute( select(SchemaRegistryTable.schema_id, SchemaRegistryTable.json_doc) From 0f94774a79684859781068d4f1b1d6e9000c9d62 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 09:59:10 -0500 Subject: [PATCH 90/92] revamped entire validation helper script --- src/webapp/routers/data.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index a08f8d2e..b09d2d23 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1143,7 +1143,7 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: # PDP institutions: use active PDP extension (cached) pdp_exp, pdp_doc = STATE._pdp_cache if now < pdp_exp and pdp_doc is not None: - inst_schema = pdp_doc + inst_schema: Optional[Dict[str, Any]] = pdp_doc else: inst_schema = sess.execute( select(SchemaRegistryTable.json_doc) @@ -1182,12 +1182,14 @@ def _ext_models_set(doc: Optional[dict]) -> set[str]: else: # heavy path only when needed dbc = DatabricksControl() - schema_extension = dbc.create_custom_schema_extension( - bucket_name=bucket, - inst_query=inst, - file_name=file_name, - base_schema=base_schema, - extension_schema=inst_schema, + schema_extension: Optional[Dict[str, Any]] = ( + dbc.create_custom_schema_extension( + bucket_name=bucket, + inst_query=inst, + file_name=file_name, + base_schema=base_schema, + extension_schema=inst_schema, + ) ) if schema_extension is not None: updated_inst_schema = schema_extension From b9a669509ac028b4a11df722db41c73023d7f7e3 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 15 Sep 2025 10:23:36 -0500 Subject: [PATCH 91/92] revamped entire validation helper script --- src/webapp/routers/data.py | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index b09d2d23..3aafabe8 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -998,33 +998,6 @@ def download_url_inst_file( ) -_AR_WORD = re.compile(r"(? List[str]: - name = os.path.basename(file_path).lower() - - inferred = set() - if "course" in name: - inferred.add("COURSE") - if "student" in name: - inferred.add("STUDENT") - if "semester" in name: - inferred.add("SEMESTER") - if "cohort" in name: - inferred.add("STUDENT") - if "course" not in name and (_AR_WORD.search(name) or "deidentified" in name): - inferred.add("STUDENT") - - if not inferred: - raise ValueError( - f"Could not infer model(s) from file name: {name}. " - "Filenames should be descriptive (e.g., include 'course', 'cohort', 'student', or 'semester')." - ) - - return sorted(inferred) - - class _ValidationState: _ar_re = re.compile(r"(? Date: Mon, 15 Sep 2025 10:25:14 -0500 Subject: [PATCH 92/92] revamped entire validation helper script --- src/webapp/routers/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 3aafabe8..c8491455 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1018,7 +1018,6 @@ def validation_helper( ) -> Any: """Helper function for file validation (self-contained & optimized).""" import time - import os # --- access check & quick input validation has_access_to_inst_or_err(inst_id, current_user)