Skip to content

Commit 00288f2

Browse files
Merge pull request #231 from datakind/develop
Merge develop into staging for edvise-api release
2 parents 4ac0977 + ca41320 commit 00288f2

19 files changed

Lines changed: 859 additions & 655 deletions

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@ ENV/
9797
env.bak/
9898
venv.bak/
9999

100+
# Local config / dev data
101+
config/local_inst_data.json
102+
100103
# mkdocs documentation
101104
/site
102105

README.md

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,3 @@ This repo contains:
1313

1414

1515
NOTE: This repo was forked from https://github.com/datakind/student-success-tool, so some of the static files (e.g. CONTRIBUTING.md) may be outdated or may include information that is only relevant to that repo. Please update them as you see fit. For information about the specific items listed above, refer to the README in the relevant directory.
16-
17-
## Local edvise development override
18-
19-
Production uses a pinned Git reference for `edvise`. For local development, use an
20-
editable install after syncing the environment.
21-
22-
1. Clone `edvise` alongside `edvise-api` (so `../edvise` exists).
23-
2. Run `uv sync`.
24-
3. Override locally: `uv pip install -e ../edvise`
25-
26-
To revert back to the pinned Git dependency, run `uv sync --reinstall-package edvise`.
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
[
2+
{
3+
"inst_id": "inst-uuid-here",
4+
"name": "Example institution",
5+
"state": "XX",
6+
"retention_days": null,
7+
"pdp_id": "",
8+
"edvise_id": null,
9+
"batches": [
10+
{
11+
"batch_id": "batch-uuid-here",
12+
"inst_id": "inst-uuid-here",
13+
"file_names_to_ids": {
14+
"example_course.csv": "file-id-course",
15+
"example_student.csv": "file-id-student"
16+
},
17+
"name": "example_batch_1",
18+
"created_by": "uploader-uuid-here",
19+
"deleted": false,
20+
"completed": true,
21+
"deletion_request_time": null,
22+
"created_at": "2025-01-15T12:00:00",
23+
"updated_at": "2025-01-15T12:00:00",
24+
"updated_by": ""
25+
}
26+
],
27+
"files": [
28+
{
29+
"name": "example_course.csv",
30+
"data_id": "file-id-course",
31+
"batch_ids": ["batch-uuid-here"],
32+
"inst_id": "inst-uuid-here",
33+
"uploader": "uploader-uuid-here",
34+
"source": "MANUAL_UPLOAD",
35+
"schemas": ["COURSE"],
36+
"deleted": false,
37+
"deletion_request_time": null,
38+
"retention_days": null,
39+
"sst_generated": false,
40+
"valid": true,
41+
"uploaded_date": "2025-01-15T11:58:00"
42+
},
43+
{
44+
"name": "example_student.csv",
45+
"data_id": "file-id-student",
46+
"batch_ids": ["batch-uuid-here"],
47+
"inst_id": "inst-uuid-here",
48+
"uploader": "uploader-uuid-here",
49+
"source": "MANUAL_UPLOAD",
50+
"schemas": ["STUDENT"],
51+
"deleted": false,
52+
"deletion_request_time": null,
53+
"retention_days": null,
54+
"sst_generated": false,
55+
"valid": true,
56+
"uploaded_date": "2025-01-15T11:57:00"
57+
}
58+
]
59+
}
60+
]

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ dependencies = [
3131
"mlflow~=2.22",
3232
"cachetools",
3333
"types-cachetools",
34-
"edvise~=0.2.0",
34+
"edvise~=0.2.1",
3535
]
3636

3737
[project.urls]

src/webapp/README.md

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,25 @@ The process to upload a file involves three API calls:
168168
## Local VSCode Debugging
169169

170170
From the Run & Debug panel (⇧⌘D on 🍎) you can run the [debug launch config](../../.vscode/launch.json) for the webapp or worker modules. This will allow you to set breakpoints within the source code while the applications are running.
171+
172+
## Local edvise development override
173+
174+
Production uses a pinned Git reference for `edvise`. For local development, use an
175+
editable install after syncing the environment.
176+
177+
1. Clone `edvise` alongside `edvise-api` (so `../edvise` exists).
178+
2. Run `uv sync`.
179+
3. Override locally: `uv pip install -e ../edvise`
180+
181+
To revert to the pinned Git dependency, run `uv sync --reinstall-package edvise`.
182+
183+
## Local institutions (optional)
184+
185+
You can seed the local database with institution, batch, and file metadata that matches dev or staging (names, UUIDs, batch membership) without checking secrets into Git.
186+
187+
1. Copy `config/local_inst_data.example.json` to `config/local_inst_data.json`. The latter is gitignored.
188+
2. Edit `local_inst_data.json` to match your needs. Use the example file as the schema: one array element per institution, with `inst_id`, `name`, and optionally `state`, `pdp_id`, `batches`, and `files`.
189+
190+
If the file is missing, startup skips this step and the default local seed in code still applies.
191+
192+
**Limitation:** Endpoints that read uploaded CSV files (for example, EDA) load blobs from GCS under the bucket name `dev_<institution_uuid_hex>`, not from this JSON file. To exercise those flows locally, you still need GCP credentials and the corresponding objects in that bucket; otherwise, rely on tests or mocks.

src/webapp/database.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
"""Database configuration."""
22

3+
import json
34
import uuid
45
import datetime
6+
from pathlib import Path
57
from typing import Set, List, Any
68
from contextvars import ContextVar
79
import enum
@@ -61,6 +63,49 @@ class Base(DeclarativeBase):
6163
DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357)
6264

6365

66+
def _setup_test_institutions(session: Session) -> None:
    """Seed optional local institutions, batches, and files from a JSON config.

    Reads ``config/local_inst_data.json`` (gitignored), resolved relative to the
    current working directory. If the file does not exist, nothing is done.
    The file holds a JSON array with one element per institution; the
    ``batches`` and ``files`` keys of an institution, and the
    ``file_names_to_ids`` key of a batch, may be absent or ``null``.

    Every row is written with ``session.merge``. This function does not commit.

    Args:
        session: Open database session the rows are merged into.

    Raises:
        KeyError: If a required key (``inst_id``, ``name``, ``batch_id``,
            a batch ``name``, or a file ``data_id``) is missing.
        ValueError: If an id is not a valid UUID string or the file is not
            valid JSON.
    """
    config_path = Path("config/local_inst_data.json")
    if not config_path.exists():
        return

    # Parse the whole file up front so the handle is closed before any DB work.
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(config_path, encoding="utf-8") as config_file:
        institutions = json.load(config_file)

    for inst in institutions:
        inst_uuid = uuid.UUID(inst["inst_id"])
        session.merge(
            InstTable(
                id=inst_uuid,
                name=inst["name"],
                state=inst.get("state"),
                pdp_id=inst.get("pdp_id"),
                created_at=DATETIME_TESTING,
                updated_at=DATETIME_TESTING,
                created_by=LOCAL_USER_UUID,
            )
        )
        # Batches only carry file names and ids; the schemas for each file id
        # are looked up from the institution's "files" entries.
        schemas_by_file_id = {
            file_entry["data_id"]: file_entry.get("schemas", [])
            for file_entry in (inst.get("files") or [])
        }
        for batch in inst.get("batches") or []:
            batch_table = BatchTable(
                id=uuid.UUID(batch["batch_id"]),
                inst_id=inst_uuid,
                name=batch["name"],
                created_at=DATETIME_TESTING,
                updated_at=DATETIME_TESTING,
                created_by=LOCAL_USER_UUID,
            )
            # A batch without "file_names_to_ids" is seeded with no files.
            file_names_to_ids = batch.get("file_names_to_ids") or {}
            for file_name, file_id in file_names_to_ids.items():
                batch_table.files.add(
                    session.merge(
                        FileTable(
                            id=uuid.UUID(file_id),
                            inst_id=inst_uuid,
                            name=file_name,
                            schemas=schemas_by_file_id.get(file_id, []),
                        )
                    )  # type: ignore
                )
            session.merge(batch_table)
107+
108+
64109
@event.listens_for(Mapper, "before_insert")
65110
@event.listens_for(Mapper, "before_update")
66111
def validate_string_lengths(mapper, connection, target):
@@ -121,6 +166,7 @@ def init_db(env: str) -> None:
121166
)
122167
# Create test files and batches for LOCAL environment
123168
if env == "LOCAL":
169+
_setup_test_institutions(session)
124170
# Create test files
125171
test_file_1 = FileTable(
126172
id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562a"),

src/webapp/databricks.py

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,16 @@
1313
)
1414
from google.cloud import storage
1515
from google.api_core import exceptions as gcs_errors
16-
from .validation_extension import generate_extension_schema
1716
from .config import databricks_vars, gcs_vars
1817
from .utilities import databricksify_inst_name, SchemaType
1918
from typing import List, Any, Dict, Optional
20-
from fastapi import HTTPException
2119
import requests
2220
import hashlib
2321
import json
2422
import gzip
2523
from cachetools import TTLCache
2624
import threading
2725
import re
28-
import pandas as pd
2926

3027
# Setting up logger
3128
LOGGER = logging.getLogger(__name__)
@@ -623,78 +620,3 @@ def matches_one(pat: Any) -> bool:
623620
return key
624621

625622
return None
626-
627-
def create_custom_schema_extension(
    self,
    bucket_name: str,
    inst_query: Any,
    file_name: str,
    base_schema: Dict[str, Any],  # pass base schema dict in
    extension_schema: Optional[dict] = None,  # existing extension or None
) -> Any:
    """Generate an extension schema for the data model matching ``file_name``.

    The file name is matched against a fixed course/student/semester mapping
    via ``self.get_key_for_file``. Unless that model is already present for
    the institution in ``extension_schema``, the CSV is read from
    ``unvalidated/<file_name>`` in the given GCS bucket and passed to
    ``generate_extension_schema``.

    Args:
        bucket_name: Name of the GCS bucket holding the unvalidated upload.
        inst_query: Institution record; only its ``name`` and ``id``
            attributes are read.
        file_name: Name of the uploaded file.
        base_schema: Base schema dict, forwarded to
            ``generate_extension_schema``.
        extension_schema: Existing extension schema dict, or ``None``.

    Returns:
        ``None`` when ``SST_SKIP_EXT_GEN=1`` is set or when the model already
        exists for this institution in ``extension_schema``; otherwise the
        value returned by ``generate_extension_schema``.

    Raises:
        HTTPException: 404 if ``file_name`` matches no model in the mapping,
            400 if ``extension_schema`` is provided but is not a dict,
            500 if the CSV cannot be read from GCS.
    """
    if (
        os.getenv("SST_SKIP_EXT_GEN") == "1"
    ):  # skip using workspace client for tests
        LOGGER.info("SST_SKIP_EXT_GEN=1; skipping Databricks extension generation.")
        return None

    inst_name = inst_query.name
    inst_id = str(inst_query.id)

    # Model name -> accepted file names / regex patterns.
    mapping = {
        "course": [
            "course.csv",
            "courses.csv",
            r"^(?=.*AR_DEIDENTIFIED)(?=.*COURSE).*\.csv$",
        ],
        "student": ["student.csv", r"^(?=.*AR_DEIDENTIFIED)(?!.*COURSE).*\.csv$"],
        "semester": ["semester.csv"],
    }

    key = self.get_key_for_file(mapping, file_name)  # e.g., "student"
    if key is None:
        raise HTTPException(
            404, detail=f"{file_name} not found in {inst_name} validation_mapping"
        )

    key_lc = key.lower()

    # If this model already exists in the provided extension for this
    # institution, skip generation.
    if extension_schema is not None:
        if not isinstance(extension_schema, dict):
            raise HTTPException(
                400, detail="extension_schema must be a dict if provided"
            )

        inst_block = extension_schema.get("institutions", {}).get(inst_id, {})
        data_models = inst_block.get("data_models", {})
        existing_keys_lc = {str(k).lower() for k in data_models}

        if key_lc in existing_keys_lc:
            LOGGER.info(
                "Model '%s' already present for institution '%s' — skipping (return None).",
                key,
                inst_id,
            )
            return None  # <-- sentinel: do not write

    # Read the unvalidated CSV from GCS.
    try:
        client = storage.Client()
        bucket = client.bucket(bucket_name)
        blob = bucket.blob(f"unvalidated/{file_name}")
        with blob.open("r") as fh:
            df = pd.read_csv(fh)
    except Exception as e:
        LOGGER.exception("Failed to read %s from GCS", file_name)
        # Chain the original error so the root cause stays on the traceback.
        raise HTTPException(
            500, detail=f"Failed to read {file_name} from GCS: {e}"
        ) from e

    return generate_extension_schema(
        df=df,
        models=key,  # exactly one model
        institution_id=inst_id,
        base_schema=base_schema,  # reference only, not mutated
        existing_extension=extension_schema,  # may be None
    )

src/webapp/gcsutil.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -378,7 +378,7 @@ def validate_file(
378378
base_schema: Base schema dict.
379379
inst_schema: Optional extension schema with institutions.* blocks.
380380
institution_id: Key into inst_schema["institutions"]: "edvise", "pdp",
381-
"legacy" (any-format uploads), or institution UUID for custom. Default "pdp".
381+
or "legacy" (any-format uploads). Default "pdp".
382382
institution_identifier: Optional institution ID (e.g. UUID). Reserved for
383383
future use; Edvise uses JSON-based validation only (different shape).
384384

0 commit comments

Comments
 (0)