Skip to content

Commit 34b0f8b

Browse files
authored
Merge pull request #146 from datakind/develop
Merging latest changes to staging
2 parents 93fb931 + 0741bcd commit 34b0f8b

9 files changed

Lines changed: 3008 additions & 1951 deletions

File tree

pyproject.toml

Lines changed: 23 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -7,43 +7,41 @@ requires-python = ">=3.10,<3.13"
77
dependencies = [
88
"databricks-sdk~=0.38.0",
99
"pydantic~=2.10",
10-
"fastapi[standard]>=0.115.4",
11-
"google-cloud>=0.34.0",
12-
"google-cloud-storage>=2.18.2",
13-
"paramiko>=3.5.0",
14-
"cloud-sql-python-connector[pymysql]>=1.14.0",
15-
"sqlalchemy>=2.0.36",
16-
"pyjwt>=2.10.1",
17-
"passlib>=1.7.4",
18-
"bcrypt>=4.2.0",
19-
"crypto>=1.4.1",
20-
"python-dotenv>=1.0.1",
21-
"strenum>=0.4.15",
10+
"fastapi[standard]~=0.115.4",
11+
"google-cloud-storage~=2.18.2",
12+
"paramiko~=3.5.0",
13+
"cloud-sql-python-connector[pymysql]~=1.14.0",
14+
"sqlalchemy~=2.0.36",
15+
"pyjwt~=2.10.1",
16+
"passlib~=1.7.4",
17+
"bcrypt~=4.2.0",
18+
"pycryptodome~=3.20.0",
19+
"python-dotenv~=1.0.1",
20+
"strenum~=0.4.15",
2221
"tomli~=2.0; python_version<'3.11'",
23-
"jsonpickle>=4.0.1",
24-
"requests>=2.0.0",
25-
"types-requests",
26-
"types-paramiko",
27-
"pandas",
28-
"six",
29-
"types-six",
30-
"fuzzywuzzy",
31-
"databricks-sql-connector",
22+
"jsonpickle~=4.0.1",
23+
"requests~=2.32.0",
24+
"types-requests~=2.32.0.0",
25+
"types-paramiko~=3.5.0.0",
26+
"pandas~=2.0",
27+
"six~=1.16.0",
28+
"thefuzz[speedup]~=0.22.1",
29+
"databricks-sql-connector~=3.5.0",
3230
"pandera~=0.13",
33-
"mlflow"
31+
"mlflow~=2.15.0"
3432
]
3533

3634
[project.urls]
3735
Repository = "https://github.com/datakind/sst-app-api"
3836

3937
[dependency-groups]
4038
dev = [
41-
"black>=25.1.0",
42-
"coverage>=7.6.9",
39+
"black~=25.1.0",
40+
"coverage~=7.6.9",
4341
"ipykernel~=6.29",
4442
"jupyterlab~=4.2",
4543
"mypy~=1.11",
46-
"pylint>=3.3.2",
44+
"pylint~=3.3.2",
4745
"pytest~=8.3",
4846
"ruff~=0.8",
4947
]

src/webapp/databricks.py

Lines changed: 188 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,20 @@
1111
Disposition,
1212
StatementState,
1313
)
14+
from google.cloud import storage
15+
from .validation_extension import generate_extension_schema
1416
from .config import databricks_vars, gcs_vars
1517
from .utilities import databricksify_inst_name, SchemaType
16-
from typing import List, Any, Dict
18+
from typing import List, Any, Dict, IO, cast, Optional
1719
from databricks.sdk.errors import DatabricksError
20+
from fastapi import HTTPException
21+
22+
try:
23+
import tomllib as _toml # Py 3.11+
24+
except ModuleNotFoundError:
25+
import tomli as _toml # Py ≤ 3.10
26+
import pandas as pd
27+
import re
1828

1929
# Setting up logger
2030
LOGGER = logging.getLogger(__name__)
@@ -366,3 +376,180 @@ def fetch_table_data(
366376

367377
# Combine column names with corresponding row values
368378
return [dict(zip(column_names, row)) for row in data_rows]
379+
380+
def get_key_for_file(
    self, mapping: Dict[str, Any], file_name: str
) -> Optional[str]:
    r"""
    Return the first key in mapping whose value matches file_name, else None.

    Matching is case-insensitive and is performed on the base name of
    file_name (backslashes are treated as path separators, directories are
    dropped, surrounding whitespace is stripped). Keys are tried in the
    mapping's iteration order. Values may be:
      - str literal (e.g., "student.csv") → exact match, or the same base
        followed by a "."/"_"/"-" suffix before the extension.
      - str regex (e.g., r"^course_.*\.csv$") → re.IGNORECASE fullmatch;
        a string that fails to compile never matches.
      - compiled regex (re.Pattern) → fullmatch, adding IGNORECASE if missing.
      - list of any of the above.

    Empty or whitespace-only string patterns never match, and values of any
    other type are ignored.
    """
    # normalize filename (handles windows paths + stray whitespace)
    name = os.path.basename(file_name.replace("\\", "/")).strip()

    # Any of these characters makes a string value be treated as a regex.
    regex_meta = re.compile(r"[()\[\]\{\}\|\?\+\*\\]")

    def looks_like_regex(s: str) -> bool:
        s = s.strip()
        return (
            s.startswith("^") or s.endswith("$") or regex_meta.search(s) is not None
        )

    def matches_one(pat: Any) -> bool:
        # compiled regex
        if isinstance(pat, re.Pattern):
            # ensure case-insensitive
            flags = pat.flags | re.IGNORECASE
            return re.fullmatch(re.compile(pat.pattern, flags), name) is not None

        # string literal / regex
        if isinstance(pat, str):
            p = pat.strip()

            # An empty pattern would otherwise reach the suffix-tolerant
            # branch below and compile to ^(?:[._-].+)?(?:\..+)?$, which
            # matches every name starting with ".", "_" or "-".
            if not p:
                return False

            # exact literal (case-insensitive)
            if name.casefold() == p.casefold():
                return True

            if looks_like_regex(p):
                try:
                    return re.fullmatch(p, name, flags=re.IGNORECASE) is not None
                except re.error:
                    # not a valid regex: treat as "no match" rather than fail
                    return False

            # literal with suffix tolerance
            p_base, p_ext = os.path.splitext(p)
            if p_ext:
                # ^base(?:[._-].+)?ext$
                rx = re.compile(
                    rf"^{re.escape(p_base)}(?:[._-].+)?{re.escape(p_ext)}$",
                    re.IGNORECASE,
                )
            else:
                # ^literal(?:[._-].+)?(?:\..+)?$
                rx = re.compile(
                    rf"^{re.escape(p)}(?:[._-].+)?(?:\..+)?$",
                    re.IGNORECASE,
                )
            return rx.fullmatch(name) is not None

        # unsupported type
        return False

    for key, val in mapping.items():
        items = val if isinstance(val, list) else [val]
        if any(matches_one(pat) for pat in items):
            return key

    return None
449+
450+
def create_custom_schema_extension(
451+
self,
452+
bucket_name: str,
453+
inst_query: Any,
454+
file_name: str,
455+
base_schema: Dict[str, Any], # pass base schema dict in
456+
extension_schema: Optional[dict] = None, # existing extension or None
457+
) -> Any:
458+
if (
459+
os.getenv("SST_SKIP_EXT_GEN") == "1"
460+
): # skip using workspace client for tests
461+
LOGGER.info("SST_SKIP_EXT_GEN=1; skipping Databricks extension generation.")
462+
return None
463+
464+
# 1) Databricks client
465+
try:
466+
w = WorkspaceClient(
467+
host=databricks_vars["DATABRICKS_HOST_URL"],
468+
google_service_account=gcs_vars["GCP_SERVICE_ACCOUNT_EMAIL"],
469+
)
470+
LOGGER.info("Successfully created Databricks WorkspaceClient.")
471+
except Exception as e:
472+
LOGGER.exception("WorkspaceClient init failed")
473+
raise ValueError(f"Workspace client initialization failed: {e}")
474+
475+
# 2) Fetch & parse config.toml to get validation_mapping
476+
try:
477+
inst_name = inst_query[0][0].name
478+
inst_id_raw = inst_query[0][0].id
479+
inst_id = str(inst_id_raw) # be robust if id is not a string
480+
config_volume_path = (
481+
f"/Volumes/staging_sst_01/"
482+
f"{databricksify_inst_name(inst_name)}_bronze/bronze_volume/config.toml"
483+
)
484+
LOGGER.info("Attempting to download from %s", config_volume_path)
485+
response = w.files.download(config_volume_path)
486+
stream = cast(IO[bytes], response.contents)
487+
file_bytes = stream.read()
488+
LOGGER.info("Download successful, received %d bytes", len(file_bytes))
489+
except Exception as e:
490+
LOGGER.exception("Failed to fetch config.toml")
491+
raise HTTPException(500, detail=f"Failed to fetch config: {e}")
492+
493+
try:
494+
cfg = _toml.loads(file_bytes.decode("utf-8"))
495+
mapping = cfg["webapp"]["validation_mapping"]
496+
except KeyError:
497+
raise HTTPException(
498+
404, detail="Missing [webapp].validation_mapping in config.toml"
499+
)
500+
except Exception as e:
501+
LOGGER.exception("Invalid TOML")
502+
raise HTTPException(400, detail=f"Invalid TOML in {file_name}: {e}")
503+
504+
if not isinstance(mapping, dict):
505+
raise HTTPException(
506+
400, detail="validation_mapping must be a TOML table (dictionary)"
507+
)
508+
509+
key = self.get_key_for_file(mapping, file_name) # e.g., "student"
510+
if key is None:
511+
raise HTTPException(
512+
404, detail=f"{file_name} not found in {inst_name} validation_mapping"
513+
)
514+
515+
key_lc = key.lower()
516+
517+
# 4) If this model already exists in the provided extension for this institution, skip
518+
if extension_schema is not None:
519+
if not isinstance(extension_schema, dict):
520+
raise HTTPException(
521+
400, detail="extension_schema must be a dict if provided"
522+
)
523+
524+
inst_block = extension_schema.get("institutions", {}).get(inst_id, {})
525+
data_models = inst_block.get("data_models", {})
526+
existing_keys_lc = {str(k).lower() for k in data_models.keys()}
527+
528+
if key_lc in existing_keys_lc:
529+
LOGGER.info(
530+
"Model '%s' already present for institution '%s' — skipping (return None).",
531+
key,
532+
inst_id,
533+
)
534+
return None # <-- sentinel: do not write
535+
536+
# 5) Read the unvalidated CSV from GCS
537+
try:
538+
client = storage.Client()
539+
bucket = client.bucket(bucket_name)
540+
blob = bucket.blob(f"unvalidated/{file_name}")
541+
with blob.open("r") as fh:
542+
df = pd.read_csv(fh)
543+
except Exception as e:
544+
LOGGER.exception("Failed to read %s from GCS", file_name)
545+
raise HTTPException(500, detail=f"Failed to read {file_name} from GCS: {e}")
546+
547+
updated_extension = generate_extension_schema(
548+
df=df,
549+
models=key, # exactly one model
550+
institution_id=inst_id,
551+
base_schema=base_schema, # reference only, not mutated
552+
existing_extension=extension_schema, # may be None
553+
)
554+
555+
return updated_extension

src/webapp/databricks_test.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
import pytest
2+
3+
from .databricks import DatabricksControl
4+
5+
6+
@pytest.fixture
def ctrl():
    """Provide a DatabricksControl instance to each test."""
    controller = DatabricksControl()
    return controller


def test_exact_literal_case_insensitive(ctrl):
    """A literal pattern matches a file name that differs only in case."""
    patterns = {"student": "student.csv"}
    matched = ctrl.get_key_for_file(patterns, "Student.csv")
    assert matched == "student"


def test_literal_with_suffix_and_same_ext(ctrl):
    """A literal with an extension tolerates a suffix before that extension."""
    patterns = {"student": "student.csv"}
    for candidate in ("student_20240101.csv", "student-final.csv"):
        assert ctrl.get_key_for_file(patterns, candidate) == "student"
    # should not match a different extension
    assert ctrl.get_key_for_file(patterns, "student_20240101.tsv") is None


def test_literal_without_ext_allows_suffix_and_optional_ext(ctrl):
    """A literal without an extension accepts a suffix and any extension."""
    patterns = {"student": "student"}
    for candidate in ("student", "student_v2", "student_v2.csv"):
        assert ctrl.get_key_for_file(patterns, candidate) == "student"


def test_regex_fullmatch_ignorecase(ctrl):
    """String regexes are matched in full and without regard to case."""
    patterns = {"course": r"^course(?:[._-].+)?\.csv$"}
    for candidate in ("Course_20240101.CSV", "COURSE.csv"):
        assert ctrl.get_key_for_file(patterns, candidate) == "course"
    # ensure fullmatch (not substring)
    assert ctrl.get_key_for_file(patterns, "my_course_20240101.csv") is None


def test_list_values_mixed_literal_and_regex(ctrl):
    """A list value may mix literal and regex patterns."""
    patterns = {"student": ["students.csv", r"^stud\d+\.csv$"]}
    for candidate in ("STUD123.csv", "students_2024.csv"):
        assert ctrl.get_key_for_file(patterns, candidate) == "student"


def test_invalid_regex_is_ignored(ctrl):
    """A pattern that fails to compile does not block the later entries."""
    patterns = {"bad": ["(unclosed", "ok.csv"]}
    # bad regex should be skipped; literal should match
    matched = ctrl.get_key_for_file(patterns, "OK.csv")
    assert matched == "bad"


def test_returns_none_when_no_match(ctrl):
    """None is returned when no mapping value matches the file name."""
    patterns = {"student": "student.csv"}
    matched = ctrl.get_key_for_file(patterns, "unknown.csv")
    assert matched is None

src/webapp/gcsutil.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -267,6 +267,55 @@ def delete_file(self, bucket_name: str, file_name: str) -> None:
267267
raise ValueError(file_name + ": File not found.")
268268
blob.delete()
269269

270+
def delete_batch_files(
271+
self,
272+
bucket_name: str,
273+
batch_files: list[str],
274+
) -> Any:
275+
prefix = "validated/"
276+
277+
now_iso = datetime.datetime.now()
278+
deleted: List[Dict[str, str]] = []
279+
not_found: List[str] = []
280+
errors: List[Dict[str, str]] = []
281+
282+
for fname in batch_files:
283+
if not isinstance(fname, str) or not fname.strip():
284+
errors.append(
285+
{
286+
"file": str(fname),
287+
"path": f"{prefix}{fname}",
288+
"error": "invalid filename",
289+
}
290+
)
291+
continue
292+
293+
blob_path = f"{prefix}{fname}"
294+
try:
295+
logger.info("Attempting to delete gs://%s/%s", bucket_name, blob_path)
296+
# One-liner delete; raises NotFound if missing
297+
self.delete_file(bucket_name=bucket_name, file_name=blob_path)
298+
logger.info("Delete successful: gs://%s/%s", bucket_name, blob_path)
299+
deleted.append(
300+
{"file": fname, "path": blob_path, "deleted_at": str(now_iso)}
301+
)
302+
except ValueError:
303+
logger.warning(
304+
"Blob or bucket not found: gs://%s/%s", bucket_name, blob_path
305+
)
306+
not_found.append(fname)
307+
except Exception as e: # network/other unexpected errors
308+
logger.exception(
309+
"Unexpected error deleting gs://%s/%s", bucket_name, blob_path
310+
)
311+
errors.append({"file": fname, "path": blob_path, "error": str(e)})
312+
313+
return {
314+
"deleted": deleted,
315+
"not_found": not_found,
316+
"errors": errors,
317+
}
318+
270319
def validate_file(
271320
self,
272321
bucket_name: str,

0 commit comments

Comments
 (0)