datakind
diff --git a/‎.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 0 additions & 11 deletions b/‎README.md‎
Lines changed: 0 additions & 11 deletions
diff --git a/‎config/local_inst_data.example.json‎
Lines changed: 60 additions & 0 deletions b/‎config/local_inst_data.example.json‎
Lines changed: 60 additions & 0 deletions
diff --git a/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/webapp/README.md‎
Lines changed: 22 additions & 0 deletions b/‎src/webapp/README.md‎
Lines changed: 22 additions & 0 deletions
diff --git a/‎src/webapp/database.py‎
Lines changed: 46 additions & 0 deletions b/‎src/webapp/database.py‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎src/webapp/databricks.py‎
Lines changed: 0 additions & 78 deletions b/‎src/webapp/databricks.py‎
Lines changed: 0 additions & 78 deletions
diff --git a/‎src/webapp/gcsutil.py‎
Lines changed: 1 addition & 1 deletion b/‎src/webapp/gcsutil.py‎
Lines changed: 1 addition & 1 deletion
@@ -97,6 +97,9 @@ ENV/
 env.bak/
 venv.bak/
 
+# Local config / dev data
+config/local_inst_data.json
+
 # mkdocs documentation
 /site
 
 
@@ -13,14 +13,3 @@ This repo contains:
 
 
 NOTE: this repo was forked from the https://github.com/datakind/student-success-tool repo, which means some of the static files (e.g. CONTRIBUTING.md) may be outdated or may include irrelevant information from that repo. Please update those as you see fit. For information about the specific items listed above, defer to the specific readmes in the relevant directory.
-
-## Local edvise development override
-
-Production uses a pinned Git reference for `edvise`. For local development, use an
-editable install after syncing the environment.
-
-1. Clone `edvise` alongside `edvise-api` (so `../edvise` exists).
-2. Run `uv sync`.
-3. Override locally: `uv pip install -e ../edvise`
-
-To revert back to the pinned Git dependency, run `uv sync --reinstall-package edvise`.
@@ -0,0 +1,60 @@
+[
+  {
+    "inst_id": "inst-uuid-here",
+    "name": "Example institution",
+    "state": "XX",
+    "retention_days": null,
+    "pdp_id": "",
+    "edvise_id": null,
+    "batches": [
+      {
+        "batch_id": "batch-uuid-here",
+        "inst_id": "inst-uuid-here",
+        "file_names_to_ids": {
+          "example_course.csv": "file-id-course",
+          "example_student.csv": "file-id-student"
+        },
+        "name": "example_batch_1",
+        "created_by": "uploader-uuid-here",
+        "deleted": false,
+        "completed": true,
+        "deletion_request_time": null,
+        "created_at": "2025-01-15T12:00:00",
+        "updated_at": "2025-01-15T12:00:00",
+        "updated_by": ""
+      }
+    ],
+    "files": [
+      {
+        "name": "example_course.csv",
+        "data_id": "file-id-course",
+        "batch_ids": ["batch-uuid-here"],
+        "inst_id": "inst-uuid-here",
+        "uploader": "uploader-uuid-here",
+        "source": "MANUAL_UPLOAD",
+        "schemas": ["COURSE"],
+        "deleted": false,
+        "deletion_request_time": null,
+        "retention_days": null,
+        "sst_generated": false,
+        "valid": true,
+        "uploaded_date": "2025-01-15T11:58:00"
+      },
+      {
+        "name": "example_student.csv",
+        "data_id": "file-id-student",
+        "batch_ids": ["batch-uuid-here"],
+        "inst_id": "inst-uuid-here",
+        "uploader": "uploader-uuid-here",
+        "source": "MANUAL_UPLOAD",
+        "schemas": ["STUDENT"],
+        "deleted": false,
+        "deletion_request_time": null,
+        "retention_days": null,
+        "sst_generated": false,
+        "valid": true,
+        "uploaded_date": "2025-01-15T11:57:00"
+      }
+    ]
+  }
+]
@@ -31,7 +31,7 @@ dependencies = [
     "mlflow~=2.22",
     "cachetools",
     "types-cachetools",
-    "edvise~=0.2.0",
+    "edvise~=0.2.1",
 ]
 
 [project.urls]
 
@@ -168,3 +168,25 @@ The process to upload a file involves three API calls:
 ## Local VSCode Debugging
 
 From the Run & Debug panel (⇧⌘D on 🍎) you can run the [debug launch config](../../.vscode/launch.json) for the webapp or worker modules. This will allow you to set breakpoints within the source code while the applications are running.
+
+## Local edvise development override
+
+Production uses a pinned Git reference for `edvise`. For local development, use an
+editable install after syncing the environment.
+
+1. Clone `edvise` alongside `edvise-api` (so `../edvise` exists).
+2. Run `uv sync`.
+3. Override locally: `uv pip install -e ../edvise`
+
+To revert back to the pinned Git dependency, run `uv sync --reinstall-package edvise`.
+
+## Local institutions (optional)
+
+You can seed the local database with institution, batch, and file metadata that matches dev or staging (names, UUIDs, batch membership) without checking secrets into Git.
+
+1. Copy `config/local_inst_data.example.json` to `config/local_inst_data.json`. The latter is gitignored.
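+2. Edit `config/local_inst_data.json` to match your needs. Use the example file as the schema: one array element per institution, with `inst_id`, `name`, and optionally `state`, `pdp_id`, `batches`, and `files`. File rows are created from each batch's `file_names_to_ids`; the `files` entries only supply the `schemas` for those file IDs, so a file that is not referenced by any batch is not seeded.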
+
+If the file is missing, startup skips this step and the default local seed in code still applies.
+
+**Limitation:** Endpoints that read uploaded CSV files (for example, EDA) load blobs from GCS under the bucket name `dev_<institution_uuid_hex>`, not from this JSON. To exercise those flows locally, you still need GCP credentials and the corresponding objects in that bucket; otherwise, rely on tests/mocks instead.
@@ -1,7 +1,9 @@
 """Database configuration."""
 
+import json
 import uuid
 import datetime
+from pathlib import Path
 from typing import Set, List, Any
 from contextvars import ContextVar
 import enum
@@ -61,6 +63,49 @@ class Base(DeclarativeBase):
 DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357)
 
 
+def _setup_test_institutions(session: Session) -> None:
+    """Load optional local institution display data from config/local_inst_data.json (gitignored)."""
+    file = Path("config/local_inst_data.json")
+    if file.exists():
+        with open(file) as f:
+            for inst in json.load(f):
+                session.merge(
+                    InstTable(
+                        id=uuid.UUID(inst["inst_id"]),
+                        name=inst["name"],
+                        state=inst.get("state"),
+                        pdp_id=inst.get("pdp_id"),
+                        created_at=DATETIME_TESTING,
+                        updated_at=DATETIME_TESTING,
+                        created_by=LOCAL_USER_UUID,
+                    )
+                )
+                schemas_by_file_id = {
+                    f["data_id"]: f.get("schemas", []) for f in inst.get("files", [])
+                }
+                for batch in inst.get("batches", []):
+                    batch_table = BatchTable(
+                        id=uuid.UUID(batch["batch_id"]),
+                        inst_id=uuid.UUID(inst["inst_id"]),
+                        name=batch["name"],
+                        created_at=DATETIME_TESTING,
+                        updated_at=DATETIME_TESTING,
+                        created_by=LOCAL_USER_UUID,
+                    )
+                    for file_name, file_id in batch["file_names_to_ids"].items():
+                        batch_table.files.add(
+                            session.merge(
+                                FileTable(
+                                    id=uuid.UUID(file_id),
+                                    inst_id=uuid.UUID(inst["inst_id"]),
+                                    name=file_name,
+                                    schemas=schemas_by_file_id.get(file_id, []),
+                                )
+                            )  # type: ignore
+                        )
+                    session.merge(batch_table)
+
+
 @event.listens_for(Mapper, "before_insert")
 @event.listens_for(Mapper, "before_update")
 def validate_string_lengths(mapper, connection, target):
@@ -121,6 +166,7 @@ def init_db(env: str) -> None:
             )
             # Create test files and batches for LOCAL environment
             if env == "LOCAL":
+                _setup_test_institutions(session)
                 # Create test files
                 test_file_1 = FileTable(
                     id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562a"),
 
@@ -13,19 +13,16 @@
 )
 from google.cloud import storage
 from google.api_core import exceptions as gcs_errors
-from .validation_extension import generate_extension_schema
 from .config import databricks_vars, gcs_vars
 from .utilities import databricksify_inst_name, SchemaType
 from typing import List, Any, Dict, Optional
-from fastapi import HTTPException
 import requests
 import hashlib
 import json
 import gzip
 from cachetools import TTLCache
 import threading
 import re
-import pandas as pd
 
 # Setting up logger
 LOGGER = logging.getLogger(__name__)
@@ -623,78 +620,3 @@ def matches_one(pat: Any) -> bool:
                     return key
 
         return None
-
-    def create_custom_schema_extension(
-        self,
-        bucket_name: str,
-        inst_query: Any,
-        file_name: str,
-        base_schema: Dict[str, Any],  # pass base schema dict in
-        extension_schema: Optional[dict] = None,  # existing extension or None
-    ) -> Any:
-        if (
-            os.getenv("SST_SKIP_EXT_GEN") == "1"
-        ):  # skip using workspace client for tests
-            LOGGER.info("SST_SKIP_EXT_GEN=1; skipping Databricks extension generation.")
-            return None
-
-        inst_name = inst_query.name
-        inst_id = str(inst_query.id)
-
-        mapping = {
-            "course": [
-                "course.csv",
-                "courses.csv",
-                r"^(?=.*AR_DEIDENTIFIED)(?=.*COURSE).*\.csv$",
-            ],
-            "student": ["student.csv", r"^(?=.*AR_DEIDENTIFIED)(?!.*COURSE).*\.csv$"],
-            "semester": ["semester.csv"],
-        }
-
-        key = self.get_key_for_file(mapping, file_name)  # e.g., "student"
-        if key is None:
-            raise HTTPException(
-                404, detail=f"{file_name} not found in {inst_name} validation_mapping"
-            )
-
-        key_lc = key.lower()
-
-        # 4) If this model already exists in the provided extension for this institution, skip
-        if extension_schema is not None:
-            if not isinstance(extension_schema, dict):
-                raise HTTPException(
-                    400, detail="extension_schema must be a dict if provided"
-                )
-
-            inst_block = extension_schema.get("institutions", {}).get(inst_id, {})
-            data_models = inst_block.get("data_models", {})
-            existing_keys_lc = {str(k).lower() for k in data_models.keys()}
-
-            if key_lc in existing_keys_lc:
-                LOGGER.info(
-                    "Model '%s' already present for institution '%s' — skipping (return None).",
-                    key,
-                    inst_id,
-                )
-                return None  # <-- sentinel: do not write
-
-        # 5) Read the unvalidated CSV from GCS
-        try:
-            client = storage.Client()
-            bucket = client.bucket(bucket_name)
-            blob = bucket.blob(f"unvalidated/{file_name}")
-            with blob.open("r") as fh:
-                df = pd.read_csv(fh)
-        except Exception as e:
-            LOGGER.exception("Failed to read %s from GCS", file_name)
-            raise HTTPException(500, detail=f"Failed to read {file_name} from GCS: {e}")
-
-        updated_extension = generate_extension_schema(
-            df=df,
-            models=key,  # exactly one model
-            institution_id=inst_id,
-            base_schema=base_schema,  # reference only, not mutated
-            existing_extension=extension_schema,  # may be None
-        )
-
-        return updated_extension
@@ -378,7 +378,7 @@ def validate_file(
             base_schema: Base schema dict.
             inst_schema: Optional extension schema with institutions.* blocks.
             institution_id: Key into inst_schema["institutions"]: "edvise", "pdp",
-                "legacy" (any-format uploads), or institution UUID for custom. Default "pdp".
+                or "legacy" (any-format uploads). Default "pdp".
             institution_identifier: Optional institution ID (e.g. UUID). Reserved for
                 future use; Edvise uses JSON-based validation only (different shape).
Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,7 @@ dependencies = [`
`31`	`31`	`"mlflow~=2.22",`
`32`	`32`	`"cachetools",`
`33`	`33`	`"types-cachetools",`
`34`		`- "edvise~=0.2.0",`
	`34`	`+ "edvise~=0.2.1",`
`35`	`35`	`]`
`36`	`36`
`37`	`37`	`[project.urls]`