From d1ec68ebcb9abfcf1d03345465504a1980fe08ac Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 14 Jul 2025 07:41:56 -0500 Subject: [PATCH 01/36] fixed endpoint issues --- src/webapp/routers/data.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index bc2b762d..e294c034 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1330,9 +1330,8 @@ def get_training_support_overview( raise HTTPException(status_code=status.HTTP_400_BAD_REQUEST, detail=str(ve)) -@router.get("/{inst_id}/training/model-cards/{run_id}/{model_name}") +@router.get("/{inst_id}/training/model-cards/{model_name}") def get_model_cards( - run_id: str, model_name: str, inst_id: str, current_user: Annotated[BaseUser, Depends(get_current_active_user)], From d638514a28b95b17281f7241159a1fefef876d0b Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 15:14:12 -0500 Subject: [PATCH 02/36] feat: published schema_registry table in cloud sql --- src/webapp/database.py | 75 ++++++++++++++++++++++++++++++++++++++ src/webapp/routers/data.py | 61 +++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/src/webapp/database.py b/src/webapp/database.py index 28dcc139..ca276cf6 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -4,6 +4,7 @@ import datetime from typing import Set, List from contextvars import ContextVar +import enum import sqlalchemy from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.ext.mutable import MutableDict, MutableList @@ -16,9 +17,12 @@ String, UniqueConstraint, Text, + Enum, + Boolean, JSON, Integer, BigInteger, + Index, ) from sqlalchemy.orm import sessionmaker, Session, relationship, mapped_column, Mapped from sqlalchemy.sql import func @@ -394,6 +398,77 @@ class JobTable(Base): err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) completed: Mapped[bool] = mapped_column(nullable=True) +class DocType(enum.Enum): + base = "base" + extension = "extension" + +class SchemaRegistry(Base): + """ + Stores versioned schema documents: + - Base schema (doc_type=base, is_pdp=False, inst_id NULL) + - PDP shared extension (doc_type=extension, is_pdp=True, inst_id NULL) + - Custom institution extension (doc_type=extension, is_pdp=False, inst_id=) + Layers can reference a parent (extends_schema_id) that they extend. + """ + + __tablename__ = "schema_registry" + + schema_id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True) + doc_type: Mapped[DocType] = mapped_column(Enum(DocType), nullable=False) + # Nullable: NULL for base and PDP shared extension + inst_id: Mapped[uuid.UUID | None] = mapped_column( + ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), + nullable=True + ) + is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + version_label: Mapped[str] = mapped_column(String(32), nullable=False) + extends_schema_id: Mapped[int | None] = mapped_column( + BigInteger, + ForeignKey("schema_registry.schema_id", ondelete="SET NULL", onupdate="CASCADE"), + nullable=True + ) + json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) + is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) + created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + + # ---------------- Relationships ---------------- + inst: Mapped["InstTable | None"] = relationship( + "InstTable", + back_populates="schemas_registry", # we'll add this new relationship on InstTable (see below) + ) + + parent_schema: Mapped["SchemaRegistry | None"] = relationship( + "SchemaRegistry", + remote_side="SchemaRegistry.schema_id", + foreign_keys=[extends_schema_id], + back_populates="child_schemas" + ) + + child_schemas: Mapped[List["SchemaRegistry"]] = relationship( + "SchemaRegistry", + back_populates="parent_schema", + cascade="all, delete-orphan" + ) + + __table_args__ = ( + UniqueConstraint("doc_type", "version_label", name="uq_base_version"), + UniqueConstraint("is_pdp", "version_label", name="uq_pdp_version"), + UniqueConstraint("inst_id", "version_label", name="uq_inst_version"), + Index("idx_schema_active_base", "doc_type", "is_active"), + Index("idx_schema_active_pdp", "is_pdp", "is_active"), + Index("idx_schema_active_inst", "inst_id", "is_active"), + ) + + # Convenience: identify logical namespace + @property + def namespace(self) -> str: + if self.doc_type == DocType.base: + return "base" + if self.is_pdp: + return "pdp" + if self.inst_id: + return f"inst:{self.inst_id}" + return "unknown" def get_session(): """Get the session.""" diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index e294c034..7a2ce1f0 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -16,6 +16,7 @@ from ..config import databricks_vars, env_vars, gcs_vars import tempfile import pathlib +import json from ..utilities import ( has_access_to_inst_or_err, @@ -37,6 +38,7 @@ BatchTable, FileTable, InstTable, + SchemaRegistry, ) from ..databricks import DatabricksControl @@ -901,7 +903,66 @@ def validation_helper( allowed_schemas = infer_models_from_filename(file_name, "pdp") inferred_schemas: list[str] = [] + # ----------------------- Fetch base schema from DB ------------------------------- + base_json_val = ( + local_session.execute( + select(SchemaRegistry.json_doc) + .where( + SchemaRegistry.doc_type == DocType.base, + SchemaRegistry.is_active.is_(True) + ) + .limit(1) + ).scalar_one_or_none() + ) + if base_json_val is None: + raise RuntimeError("No active base schema found") + + # ----------------------- Fetch inst specific extension schema from DB --------------------- + inst = ( + local_session.get().execute( + select(InstTable).where(InstTable.id == str_to_uuid(inst_id)) + ) + .scalar_one_or_none() + ) + if inst is None: + raise ValueError(f"Institution {inst_id} not found") + + if inst.pdp_id: # institution is PDP + inst_layer_val = ( + local_session.get().execute( + select(SchemaRegistry.json_doc) + .where( + SchemaRegistry.is_pdp.is_(True), + SchemaRegistry.is_active.is_(True) + ) + .limit(1) + ).scalar_one_or_none() + ) + else: # custom (or none) + inst_layer_val = ( + local_session.get().execute( + select(SchemaRegistry.json_doc) + .where( + SchemaRegistry.inst_id == inst.id, + SchemaRegistry.is_active.is_(True) + ) + .limit(1) + ).scalar_one_or_none() + ) + + if isinstance(base_json_val, (dict, list)): + base_json_str = json.dumps(base_json_val) + else: + base_json_str = str(base_json_val) + + if inst_layer_val is None: + inst_schema_json_str = None + elif isinstance(inst_layer_val, (dict, list)): + inst_schema_json_str = json.dumps(inst_layer_val) + else: + inst_schema_json_str = str(inst_layer_val) + # ----------------------- File validation logic logic -------------------------------------- try: inferred_schemas = storage_control.validate_file( get_external_bucket_name(inst_id), From 6ee04b9c7cc1cb2263aea812ac8d5e2e568d8b6a Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:16:13 -0500 Subject: [PATCH 03/36] feat: adjusted validation logic --- src/webapp/gcsutil.py | 4 ++-- src/webapp/routers/data.py | 21 ++++++--------------- src/webapp/validation.py | 18 ++++-------------- 3 files changed, 12 insertions(+), 31 deletions(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index f871e272..5d247f7a 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -268,7 +268,7 @@ def delete_file(self, bucket_name: str, file_name: str): blob.delete() def validate_file( - self, bucket_name: str, file_name: str, allowed_schemas: list[str] + self, bucket_name: str, file_name: str, allowed_schemas: list[str], base_schema: dict, inst_schema: dict ) -> List[str]: """Validate that a file is one of the allowed schemas.""" client = storage.Client() @@ -278,7 +278,7 @@ def validate_file( schems: List[str] = [] try: with blob.open("r") as file: - schemas = validate_file_reader(file, allowed_schemas) + schemas = validate_file_reader(file, allowed_schemas, base_schema, inst_schema) schems = [str(s) for s in schemas.get("schemas", [])] logging.debug( f"If you see this file validation was successful {schems}" diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 7a2ce1f0..cf6c90d5 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -904,7 +904,7 @@ def validation_helper( inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- - base_json_val = ( + base_schema = ( local_session.execute( select(SchemaRegistry.json_doc) .where( @@ -914,7 +914,7 @@ def validation_helper( .limit(1) ).scalar_one_or_none() ) - if base_json_val is None: + if base_schema is None: raise RuntimeError("No active base schema found") # ----------------------- Fetch inst specific extension schema from DB --------------------- @@ -928,7 +928,7 @@ def validation_helper( raise ValueError(f"Institution {inst_id} not found") if inst.pdp_id: # institution is PDP - inst_layer_val = ( + inst_schema = ( local_session.get().execute( select(SchemaRegistry.json_doc) .where( @@ -939,7 +939,7 @@ def validation_helper( ).scalar_one_or_none() ) else: # custom (or none) - inst_layer_val = ( + inst_schema = ( local_session.get().execute( select(SchemaRegistry.json_doc) .where( @@ -950,17 +950,8 @@ def validation_helper( ).scalar_one_or_none() ) - if isinstance(base_json_val, (dict, list)): - base_json_str = json.dumps(base_json_val) - else: - base_json_str = str(base_json_val) - - if inst_layer_val is None: - inst_schema_json_str = None - elif isinstance(inst_layer_val, (dict, list)): - inst_schema_json_str = json.dumps(inst_layer_val) - else: - inst_schema_json_str = str(inst_layer_val) + base_json = json.dumps(base_schema) + inst_schema_json = json.dumps(inst_schema) if inst_schema is not None else None # ----------------------- File validation logic logic -------------------------------------- try: diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 2c13e047..ce7877ff 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -16,9 +16,9 @@ from fuzzywuzzy import fuzz -def validate_file_reader(filename: str, allowed_schema: list[str]) -> dict[str, Any]: +def validate_file_reader(filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: dict) -> dict[str, Any]: """Validates given a filename.""" - return validate_dataset(filename, allowed_schema) + return validate_dataset(filename, base_schema, inst_schema, allowed_schema) class HardValidationError(Exception): @@ -153,6 +153,8 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: def validate_dataset( filename: str, + base_schema: dict, + ext_schema: dict = None, models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: @@ -160,18 +162,6 @@ def validate_dataset( df = df.rename(columns={c: normalize_col(c) for c in df.columns}) incoming = set(df.columns) - # 1) load schemas - BASE_DIR = os.path.dirname(os.path.abspath(__file__)) - base_schema_path = os.path.join(BASE_DIR, "validation_schemas/base_schema.json") - base_schema = load_json(base_schema_path) - ext_schema = None - - extension_schema_path = os.path.join( - BASE_DIR, f"validation_schemas/{institution_id}_schema_extension.json" - ) - if extension_schema_path and os.path.exists(extension_schema_path): - ext_schema = load_json(extension_schema_path) - # 2) merge requested models if models is None: model_list = [] From cf1d775faea9c1aef6a3d82c98291840e20ffdb5 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:36:59 -0500 Subject: [PATCH 04/36] feat: adjusted validation logic --- src/webapp/routers/data.py | 5 ++++- src/webapp/validation.py | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index cf6c90d5..35cb720f 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -39,6 +39,7 @@ FileTable, InstTable, SchemaRegistry, + DocType ) from ..databricks import DatabricksControl @@ -951,7 +952,7 @@ def validation_helper( ) base_json = json.dumps(base_schema) - inst_schema_json = json.dumps(inst_schema) if inst_schema is not None else None + inst_json = json.dumps(inst_schema) if inst_schema is not None else None # ----------------------- File validation logic logic -------------------------------------- try: @@ -959,6 +960,8 @@ def validation_helper( get_external_bucket_name(inst_id), file_name, allowed_schemas, + base_json, + inst_json, ) logging.debug( f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}" diff --git a/src/webapp/validation.py b/src/webapp/validation.py index ce7877ff..036c2d97 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -5,7 +5,6 @@ from typing import Any import json -import os import re from typing import Union, List, Dict, Optional import logging From 1235047dc06a9c616e7ca57b080deb80191dae58 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:38:15 -0500 Subject: [PATCH 05/36] feat: adjusted validation logic --- src/webapp/database.py | 26 ++++++++++++++--------- src/webapp/gcsutil.py | 11 ++++++++-- src/webapp/routers/data.py | 43 +++++++++++++++++++------------------- src/webapp/validation.py | 4 +++- 4 files changed, 49 insertions(+), 35 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index ca276cf6..389e698d 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -398,10 +398,12 @@ class JobTable(Base): err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) completed: Mapped[bool] = mapped_column(nullable=True) + class DocType(enum.Enum): base = "base" extension = "extension" + class SchemaRegistry(Base): """ Stores versioned schema documents: @@ -413,23 +415,28 @@ class SchemaRegistry(Base): __tablename__ = "schema_registry" - schema_id: Mapped[int] = mapped_column(BigInteger, primary_key=True, autoincrement=True) + schema_id: Mapped[int] = mapped_column( + BigInteger, primary_key=True, autoincrement=True + ) doc_type: Mapped[DocType] = mapped_column(Enum(DocType), nullable=False) # Nullable: NULL for base and PDP shared extension inst_id: Mapped[uuid.UUID | None] = mapped_column( - ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), - nullable=True + ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True ) is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) version_label: Mapped[str] = mapped_column(String(32), nullable=False) extends_schema_id: Mapped[int | None] = mapped_column( BigInteger, - ForeignKey("schema_registry.schema_id", ondelete="SET NULL", onupdate="CASCADE"), - nullable=True + ForeignKey( + "schema_registry.schema_id", ondelete="SET NULL", onupdate="CASCADE" + ), + nullable=True, ) json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - created_at = Column(DateTime(timezone=True), server_default=func.now(), nullable=False) + created_at = Column( + DateTime(timezone=True), server_default=func.now(), nullable=False + ) # ---------------- Relationships ---------------- inst: Mapped["InstTable | None"] = relationship( @@ -441,13 +448,11 @@ class SchemaRegistry(Base): "SchemaRegistry", remote_side="SchemaRegistry.schema_id", foreign_keys=[extends_schema_id], - back_populates="child_schemas" + back_populates="child_schemas", ) child_schemas: Mapped[List["SchemaRegistry"]] = relationship( - "SchemaRegistry", - back_populates="parent_schema", - cascade="all, delete-orphan" + "SchemaRegistry", back_populates="parent_schema", cascade="all, delete-orphan" ) __table_args__ = ( @@ -470,6 +475,7 @@ def namespace(self) -> str: return f"inst:{self.inst_id}" return "unknown" + def get_session(): """Get the session.""" sess: Session = LocalSession() diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index 5d247f7a..c3c7accc 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -268,7 +268,12 @@ def delete_file(self, bucket_name: str, file_name: str): blob.delete() def validate_file( - self, bucket_name: str, file_name: str, allowed_schemas: list[str], base_schema: dict, inst_schema: dict + self, + bucket_name: str, + file_name: str, + allowed_schemas: list[str], + base_schema: dict, + inst_schema: dict, ) -> List[str]: """Validate that a file is one of the allowed schemas.""" client = storage.Client() @@ -278,7 +283,9 @@ def validate_file( schems: List[str] = [] try: with blob.open("r") as file: - schemas = validate_file_reader(file, allowed_schemas, base_schema, inst_schema) + schemas = validate_file_reader( + file, allowed_schemas, base_schema, inst_schema + ) schems = [str(s) for s in schemas.get("schemas", [])] logging.debug( f"If you see this file validation was successful {schems}" diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 35cb720f..e6117ca9 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -39,7 +39,7 @@ FileTable, InstTable, SchemaRegistry, - DocType + DocType, ) from ..databricks import DatabricksControl @@ -905,24 +905,20 @@ def validation_helper( inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- - base_schema = ( - local_session.execute( - select(SchemaRegistry.json_doc) - .where( - SchemaRegistry.doc_type == DocType.base, - SchemaRegistry.is_active.is_(True) - ) - .limit(1) - ).scalar_one_or_none() - ) + base_schema = local_session.execute( + select(SchemaRegistry.json_doc) + .where( + SchemaRegistry.doc_type == DocType.base, SchemaRegistry.is_active.is_(True) + ) + .limit(1) + ).scalar_one_or_none() if base_schema is None: raise RuntimeError("No active base schema found") # ----------------------- Fetch inst specific extension schema from DB --------------------- inst = ( - local_session.get().execute( - select(InstTable).where(InstTable.id == str_to_uuid(inst_id)) - ) + local_session.get() + .execute(select(InstTable).where(InstTable.id == str_to_uuid(inst_id))) .scalar_one_or_none() ) if inst is None: @@ -930,27 +926,30 @@ def validation_helper( if inst.pdp_id: # institution is PDP inst_schema = ( - local_session.get().execute( + local_session.get() + .execute( select(SchemaRegistry.json_doc) .where( - SchemaRegistry.is_pdp.is_(True), - SchemaRegistry.is_active.is_(True) + SchemaRegistry.is_pdp.is_(True), SchemaRegistry.is_active.is_(True) ) .limit(1) - ).scalar_one_or_none() + ) + .scalar_one_or_none() ) else: # custom (or none) inst_schema = ( - local_session.get().execute( + local_session.get() + .execute( select(SchemaRegistry.json_doc) .where( SchemaRegistry.inst_id == inst.id, - SchemaRegistry.is_active.is_(True) + SchemaRegistry.is_active.is_(True), ) .limit(1) - ).scalar_one_or_none() + ) + .scalar_one_or_none() ) - + base_json = json.dumps(base_schema) inst_json = json.dumps(inst_schema) if inst_schema is not None else None diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 036c2d97..8988ce92 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -15,7 +15,9 @@ from fuzzywuzzy import fuzz -def validate_file_reader(filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: dict) -> dict[str, Any]: +def validate_file_reader( + filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: dict +) -> dict[str, Any]: """Validates given a filename.""" return validate_dataset(filename, base_schema, inst_schema, allowed_schema) From 80c3854ae04d7dd0ca003e1ff2d1e0f3d7504e2f Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:49:04 -0500 Subject: [PATCH 06/36] feat: adjusted database logic --- src/webapp/database.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 389e698d..0bbe805f 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -116,6 +116,11 @@ class InstTable(Base): back_populates="inst" ) models: Mapped[Set["ModelTable"]] = relationship(back_populates="inst") + schemas_registry: Mapped[List["SchemaRegistryTable"]] = relationship( + "SchemaRegistryTable", + back_populates="inst", + cascade="all, delete-orphan" + ) name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # If retention unset, the Datakind default is used. File-level retentions overrides @@ -404,7 +409,7 @@ class DocType(enum.Enum): extension = "extension" -class SchemaRegistry(Base): +class SchemaRegistryTable(Base): """ Stores versioned schema documents: - Base schema (doc_type=base, is_pdp=False, inst_id NULL) @@ -444,14 +449,14 @@ class SchemaRegistry(Base): back_populates="schemas_registry", # we'll add this new relationship on InstTable (see below) ) - parent_schema: Mapped["SchemaRegistry | None"] = relationship( + parent_schema: Mapped["SchemaRegistryTable | None"] = relationship( "SchemaRegistry", remote_side="SchemaRegistry.schema_id", foreign_keys=[extends_schema_id], back_populates="child_schemas", ) - child_schemas: Mapped[List["SchemaRegistry"]] = relationship( + child_schemas: Mapped[List["SchemaRegistryTable"]] = relationship( "SchemaRegistry", back_populates="parent_schema", cascade="all, delete-orphan" ) From b27bcbd8c8995a6edad25390753f4b9ac052a356 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:49:19 -0500 Subject: [PATCH 07/36] feat: adjusted database logic --- src/webapp/database.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 0bbe805f..56abcb09 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -117,9 +117,7 @@ class InstTable(Base): ) models: Mapped[Set["ModelTable"]] = relationship(back_populates="inst") schemas_registry: Mapped[List["SchemaRegistryTable"]] = relationship( - "SchemaRegistryTable", - back_populates="inst", - cascade="all, delete-orphan" + "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) From 109d9c7b9e38d8b7bc800de07b3b764efc5b6971 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:51:54 -0500 Subject: [PATCH 08/36] feat: adjusted database logic --- src/webapp/routers/data.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index e6117ca9..2ffafc28 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -38,7 +38,7 @@ BatchTable, FileTable, InstTable, - SchemaRegistry, + SchemaRegistryTable, DocType, ) @@ -906,9 +906,10 @@ def validation_helper( inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- base_schema = local_session.execute( - select(SchemaRegistry.json_doc) + select(SchemaRegistryTable.json_doc) .where( - SchemaRegistry.doc_type == DocType.base, SchemaRegistry.is_active.is_(True) + SchemaRegistryTable.doc_type == DocType.base, + SchemaRegistryTable.is_active.is_(True), ) .limit(1) ).scalar_one_or_none() @@ -928,9 +929,10 @@ def validation_helper( inst_schema = ( local_session.get() .execute( - select(SchemaRegistry.json_doc) + select(SchemaRegistryTable.json_doc) .where( - SchemaRegistry.is_pdp.is_(True), SchemaRegistry.is_active.is_(True) + SchemaRegistryTable.is_pdp.is_(True), + SchemaRegistryTable.is_active.is_(True), ) .limit(1) ) @@ -940,10 +942,10 @@ def validation_helper( inst_schema = ( local_session.get() .execute( - select(SchemaRegistry.json_doc) + select(SchemaRegistryTable.json_doc) .where( - SchemaRegistry.inst_id == inst.id, - SchemaRegistry.is_active.is_(True), + SchemaRegistryTable.inst_id == inst.id, + SchemaRegistryTable.is_active.is_(True), ) .limit(1) ) From e9d4170e0f35f34fcfbf1997d42ed43a9f0c533f Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:56:11 -0500 Subject: [PATCH 09/36] feat: adjusted database logic --- src/webapp/database.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 56abcb09..fd5ac694 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -448,14 +448,14 @@ class SchemaRegistryTable(Base): ) parent_schema: Mapped["SchemaRegistryTable | None"] = relationship( - "SchemaRegistry", - remote_side="SchemaRegistry.schema_id", + "SchemaRegistryTable", + remote_side="SchemaRegistryTable.schema_id", foreign_keys=[extends_schema_id], back_populates="child_schemas", ) child_schemas: Mapped[List["SchemaRegistryTable"]] = relationship( - "SchemaRegistry", back_populates="parent_schema", cascade="all, delete-orphan" + "SchemaRegistryTable", back_populates="parent_schema", cascade="all, delete-orphan" ) __table_args__ = ( From c538f946f3730364efafbeba5480592c09fb844c Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 16:59:49 -0500 Subject: [PATCH 10/36] feat: adjusted database logic --- src/webapp/routers/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 2ffafc28..015366c7 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -905,7 +905,7 @@ def validation_helper( inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- - base_schema = local_session.execute( + base_schema = local_session.get().execute( select(SchemaRegistryTable.json_doc) .where( SchemaRegistryTable.doc_type == DocType.base, From cb13b884cfd578952c162c2c37e7f15d071763b5 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:18:38 -0500 Subject: [PATCH 11/36] feat: adjusted database logic --- src/webapp/routers/data_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 90bd6030..1d6632c9 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -20,6 +20,7 @@ FileTable, BatchTable, InstTable, + SchemaRegistryTable, Base, get_session, ) @@ -157,6 +158,15 @@ def session_fixture(): created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ), + SchemaRegistryTable( + doc_type="base", + is_pdp=False, + inst_id=None, + version_label="1.0.0", + extends_schema_id=None, + json_doc={"version": "1.0.0", "base": {"data_models": {}}}, + is_active=True, + ), batch_1, file_1, FileTable( From 0e4a8dc134b5fcb6e553d886d1d8024289449978 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:31:25 -0500 Subject: [PATCH 12/36] feat: adjusted database logic --- src/webapp/routers/data_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 1d6632c9..bdeb8859 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -161,11 +161,11 @@ def session_fixture(): SchemaRegistryTable( doc_type="base", is_pdp=False, - inst_id=None, version_label="1.0.0", extends_schema_id=None, json_doc={"version": "1.0.0", "base": {"data_models": {}}}, is_active=True, + created_at=DATETIME_TESTING, ), batch_1, file_1, From a0350ca345dc33419ec5ad1b35397ad8f459449c Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:32:00 -0500 Subject: [PATCH 13/36] feat: adjusted database logic --- src/webapp/routers/data_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index bdeb8859..da67c3ea 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -162,7 +162,6 @@ def session_fixture(): doc_type="base", is_pdp=False, version_label="1.0.0", - extends_schema_id=None, json_doc={"version": "1.0.0", "base": {"data_models": {}}}, is_active=True, created_at=DATETIME_TESTING, From a3a41e92f7cb1f89a1d5235ff3c9855d26ec1bba Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:34:55 -0500 Subject: [PATCH 14/36] feat: adjusted database logic --- src/webapp/routers/data_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index da67c3ea..23fc0c61 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -21,6 +21,7 @@ BatchTable, InstTable, SchemaRegistryTable, + DocType, Base, get_session, ) @@ -159,7 +160,7 @@ def session_fixture(): updated_at=DATETIME_TESTING, ), SchemaRegistryTable( - doc_type="base", + doc_type=DocType.base, # ✅ fix this is_pdp=False, version_label="1.0.0", json_doc={"version": "1.0.0", "base": {"data_models": {}}}, From 68216a5879e873cd1d1af06ed1cb5f7f49a1cbdc Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:37:10 -0500 Subject: [PATCH 15/36] feat: adjusted database logic --- src/webapp/database.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index fd5ac694..4f6f8c51 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -417,9 +417,8 @@ class SchemaRegistryTable(Base): """ __tablename__ = "schema_registry" - schema_id: Mapped[int] = mapped_column( - BigInteger, primary_key=True, autoincrement=True + Integer, primary_key=True, autoincrement=True ) doc_type: Mapped[DocType] = mapped_column(Enum(DocType), nullable=False) # Nullable: NULL for base and PDP shared extension From 1b1d9968d340283bd0fad0664a0986b6ddd4bb9a Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:41:37 -0500 Subject: [PATCH 16/36] feat: adjusted database logic --- src/webapp/database.py | 4 +++- src/webapp/routers/data.py | 20 ++++++++++++-------- src/webapp/validation_test.py | 4 ++-- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 4f6f8c51..758c2a57 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -454,7 +454,9 @@ class SchemaRegistryTable(Base): ) child_schemas: Mapped[List["SchemaRegistryTable"]] = relationship( - "SchemaRegistryTable", back_populates="parent_schema", cascade="all, delete-orphan" + "SchemaRegistryTable", + back_populates="parent_schema", + cascade="all, delete-orphan", ) __table_args__ = ( diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 015366c7..4bee64dd 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -905,14 +905,18 @@ def validation_helper( inferred_schemas: list[str] = [] # ----------------------- Fetch base schema from DB ------------------------------- - base_schema = local_session.get().execute( - select(SchemaRegistryTable.json_doc) - .where( - SchemaRegistryTable.doc_type == DocType.base, - SchemaRegistryTable.is_active.is_(True), - ) - .limit(1) - ).scalar_one_or_none() + base_schema = ( + local_session.get() + .execute( + select(SchemaRegistryTable.json_doc) + .where( + SchemaRegistryTable.doc_type == DocType.base, + SchemaRegistryTable.is_active.is_(True), + ) + .limit(1) + ) + .scalar_one_or_none() + ) if base_schema is None: raise RuntimeError("No active base schema found") diff --git a/src/webapp/validation_test.py b/src/webapp/validation_test.py index aa69bb65..0db46278 100644 --- a/src/webapp/validation_test.py +++ b/src/webapp/validation_test.py @@ -47,7 +47,7 @@ def test_validate_file_reader_passes(tmp_csv_file): mock_load.side_effect = lambda path: ( MOCK_BASE_SCHEMA if "base" in path else MOCK_EXT_SCHEMA ) - result = validate_file_reader(tmp_csv_file, ["test_model"]) + result = validate_file_reader(tmp_csv_file, ["test_model"], base_schema=MOCK_BASE_SCHEMA, inst_schema=MOCK_EXT_SCHEMA,) assert result["validation_status"] == "passed" assert result["schemas"] == ["test_model"] @@ -65,5 +65,5 @@ def test_validate_file_reader_fails_missing_required(tmp_path): MOCK_BASE_SCHEMA if "base" in path else MOCK_EXT_SCHEMA ) with pytest.raises(HardValidationError) as exc_info: - validate_file_reader(str(file_path), ["test_model"]) + validate_file_reader(str(file_path), ["test_model"], base_schema=MOCK_BASE_SCHEMA, inst_schema=MOCK_EXT_SCHEMA,) assert "Missing required columns" in str(exc_info.value) From f40071c52a6d2cdb7a48f64e9cb56446f136818d Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 18:59:32 -0500 Subject: [PATCH 17/36] feat: adjusted database logic --- src/webapp/database.py | 30 +++++++++++++++--------------- src/webapp/gcsutil.py | 6 +++--- src/webapp/validation.py | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 758c2a57..de7a69f3 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -2,7 +2,7 @@ import uuid import datetime -from typing import Set, List +from typing import Set, List, Any, Optional, Dict from contextvars import ContextVar import enum import sqlalchemy @@ -48,7 +48,7 @@ DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) -def init_db(env: str): +def init_db(env: str) -> Any: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. if env not in ("LOCAL", "DEV"): @@ -100,7 +100,7 @@ def init_db(env: str): session.close() -class InstTable(Base): +class InstTable(Base): # type: ignore[misc] """The institution overview table that maps ids to names. The parent table to all other tables except for AccountHistory and JobTable.""" @@ -126,9 +126,9 @@ class InstTable(Base): retention_days: Mapped[int] = mapped_column(nullable=True) # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} - allowed_emails = Column(MutableDict.as_mutable(JSON)) + allowed_emails: Mapped[Optional[Dict[str, str]]] = Column(MutableDict.as_mutable(JSON)) # Schemas that are allowed for validation. - schemas = Column(MutableList.as_mutable(JSON)) + schemas: Mapped[Optional[List[str]]] = Column(MutableList.as_mutable(JSON)) state = Column(String(VAR_CHAR_LENGTH), nullable=True) # Only populated for PDP schools. pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) @@ -140,7 +140,7 @@ class InstTable(Base): __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) -class ApiKeyTable(Base): +class ApiKeyTable(Base): # type: ignore[misc] """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" @@ -175,7 +175,7 @@ class ApiKeyTable(Base): ) -class AccountTable(Base): +class AccountTable(Base): # type: ignore[misc] """ NOTE: only users created by the frontend are accessible through the fronted. Users created by API calls can only directly call API calls. Frontend will not work. The user accounts table""" @@ -216,7 +216,7 @@ class AccountTable(Base): updated_at = Column(DateTime(timezone=True), onupdate=func.now()) -class AccountHistoryTable(Base): +class AccountHistoryTable(Base): # type: ignore[misc] """The user history table""" __tablename__ = "account_history" @@ -264,8 +264,8 @@ class AccountHistoryTable(Base): ) -class FileTable(Base): - """The file table""" +class FileTable(Base): # type: ignore[misc] + """The file table""" __tablename__ = "file" name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) @@ -307,7 +307,7 @@ class FileTable(Base): __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) -class BatchTable(Base): +class BatchTable(Base): # type: ignore[misc] """The batch table""" __tablename__ = "batch" @@ -341,7 +341,7 @@ class BatchTable(Base): __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) -class ModelTable(Base): +class ModelTable(Base): # type: ignore[misc] """The model table""" __tablename__ = "model" @@ -376,7 +376,7 @@ class ModelTable(Base): __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) -class JobTable(Base): +class JobTable(Base): # type: ignore[misc] """The job table""" __tablename__ = "job" @@ -402,12 +402,12 @@ class JobTable(Base): completed: Mapped[bool] = mapped_column(nullable=True) -class DocType(enum.Enum): +class DocType(enum.Enum): # type: ignore[misc] base = "base" extension = "extension" -class SchemaRegistryTable(Base): +class SchemaRegistryTable(Base): # type: ignore[misc] """ Stores versioned schema documents: - Base schema (doc_type=base, is_pdp=False, inst_id NULL) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index c3c7accc..caec4a7a 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -241,7 +241,7 @@ def download_file( raise ValueError(file_name + ": File not found.") blob.download_to_filename(destination_file_name) - def move_file(self, bucket_name: str, prev_name: str, new_name: str): + def move_file(self, bucket_name: str, prev_name: str, new_name: str) -> None: """Rename a file.""" storage_client = storage.Client() bucket = storage_client.bucket(bucket_name) @@ -256,7 +256,7 @@ def move_file(self, bucket_name: str, prev_name: str, new_name: str): bucket.copy_blob(blob, bucket, new_name) blob.delete() - def delete_file(self, bucket_name: str, file_name: str): + def delete_file(self, bucket_name: str, file_name: str) -> None: """Delete a file.""" storage_client = storage.Client() bucket = storage_client.bucket(bucket_name) @@ -301,7 +301,7 @@ def validate_file( logging.debug("If you see this file validation was complete") return schems - def get_file_contents(self, bucket_name: str, file_name: str): + def get_file_contents(self, bucket_name: str, file_name: str) -> Any: """Returns a file as a bytes object.""" storage_client = storage.Client() bucket = storage_client.get_bucket(bucket_name) diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 8988ce92..f2449eea 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -155,7 +155,7 @@ def build_schema(specs: Dict[str, dict]) -> DataFrameSchema: def validate_dataset( filename: str, base_schema: dict, - ext_schema: dict = None, + ext_schema: Optional[Dict[Any, Any]] = None, models: Union[str, List[str], None] = None, institution_id: str = "pdp", ) -> Dict[str, Any]: From 7757ca318616558072986bde05903e400310c806 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:04:12 -0500 Subject: [PATCH 18/36] feat: adjusted database logic --- src/webapp/database.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index de7a69f3..0d481efa 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -100,12 +100,12 @@ def init_db(env: str) -> Any: session.close() -class InstTable(Base): # type: ignore[misc] +class InstTable(Base): # type: ignore """The institution overview table that maps ids to names. The parent table to all other tables except for AccountHistory and JobTable.""" __tablename__ = "inst" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore # Linked children tables. accounts: Mapped[Set["AccountTable"]] = relationship(back_populates="inst") @@ -120,27 +120,27 @@ class InstTable(Base): # type: ignore[misc] "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # type: ignore # If retention unset, the Datakind default is used. File-level retentions overrides # this value. retention_days: Mapped[int] = mapped_column(nullable=True) # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} - allowed_emails: Mapped[Optional[Dict[str, str]]] = Column(MutableDict.as_mutable(JSON)) + allowed_emails = Column(MutableDict.as_mutable(JSON)) # type: ignore # Schemas that are allowed for validation. - schemas: Mapped[Optional[List[str]]] = Column(MutableList.as_mutable(JSON)) - state = Column(String(VAR_CHAR_LENGTH), nullable=True) + schemas = Column(MutableList.as_mutable(JSON)) # type: ignore + state = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore # Only populated for PDP schools. - pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) - created_at = Column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) - created_by = Column(Uuid(as_uuid=True), nullable=True) + pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Within the institutions, the set of name + state should be unique __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) -class ApiKeyTable(Base): # type: ignore[misc] +class ApiKeyTable(Base): # type: ignore """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" @@ -175,7 +175,7 @@ class ApiKeyTable(Base): # type: ignore[misc] ) -class AccountTable(Base): # type: ignore[misc] +class AccountTable(Base): # type: ignore """ NOTE: only users created by the frontend are accessible through the fronted. Users created by API calls can only directly call API calls. Frontend will not work. The user accounts table""" @@ -216,7 +216,7 @@ class AccountTable(Base): # type: ignore[misc] updated_at = Column(DateTime(timezone=True), onupdate=func.now()) -class AccountHistoryTable(Base): # type: ignore[misc] +class AccountHistoryTable(Base): # type: ignore """The user history table""" __tablename__ = "account_history" @@ -264,7 +264,7 @@ class AccountHistoryTable(Base): # type: ignore[misc] ) -class FileTable(Base): # type: ignore[misc] +class FileTable(Base): # type: ignore """The file table""" __tablename__ = "file" @@ -307,7 +307,7 @@ class FileTable(Base): # type: ignore[misc] __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) -class BatchTable(Base): # type: ignore[misc] +class BatchTable(Base): # type: ignore """The batch table""" __tablename__ = "batch" @@ -341,7 +341,7 @@ class BatchTable(Base): # type: ignore[misc] __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) -class ModelTable(Base): # type: ignore[misc] +class ModelTable(Base): # type: ignore """The model table""" __tablename__ = "model" @@ -376,7 +376,7 @@ class ModelTable(Base): # type: ignore[misc] __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) -class JobTable(Base): # type: ignore[misc] +class JobTable(Base): # type: ignore """The job table""" __tablename__ = "job" @@ -402,12 +402,12 @@ class JobTable(Base): # type: ignore[misc] completed: Mapped[bool] = mapped_column(nullable=True) -class DocType(enum.Enum): # type: ignore[misc] +class DocType(enum.Enum): # type: ignore base = "base" extension = "extension" -class SchemaRegistryTable(Base): # type: ignore[misc] +class SchemaRegistryTable(Base): # type: ignore """ Stores versioned schema documents: - Base schema (doc_type=base, is_pdp=False, inst_id NULL) From 39aabd3ccd447cc221481f199f4c5a87ea682154 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:08:46 -0500 Subject: [PATCH 19/36] feat: adjusted database logic --- src/webapp/database.py | 66 +++++++++++++++++------------------ src/webapp/validation_test.py | 4 +-- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 0d481efa..943a084b 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -268,8 +268,8 @@ class FileTable(Base): # type: ignore """The file table""" __tablename__ = "file" - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore batches: Mapped[Set["BatchTable"]] = relationship( secondary=association_table, back_populates="files" ) @@ -283,25 +283,25 @@ class FileTable(Base): # type: ignore # The size to the nearest mb. # size_mb: Mapped[int] = mapped_column(nullable=False) # Who uploaded the file. For SST generated files, this field would be null. - uploader = Column(Uuid(as_uuid=True), nullable=True) + uploader = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Can be PDP_SFTP, MANUAL_UPLOAD etc. May be empty for generated files. - source = Column(String(VAR_CHAR_LENGTH), nullable=True) + source = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore # The schema type(s) of this file. - schemas = Column(MutableList.as_mutable(JSON), nullable=False) + schemas = Column(MutableList.as_mutable(JSON), nullable=False) # type: ignore # If null, the following is non-deleted. # The deleted field indicates whether there is a pending deletion request on the data. # The data may stil be available to Datakind debug role in a soft-delete state but for all # intents and purposes is no longer accessible by the app. deleted: Mapped[bool] = mapped_column(nullable=True) # When the deletion request was made - deleted_at = Column(DateTime(timezone=True), nullable=True) + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore retention_days: Mapped[int] = mapped_column(nullable=True) # Whether the file was generated by SST. (e.g. was it input or output) sst_generated: Mapped[bool] = mapped_column(nullable=False) # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False) - created_at = Column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # Within a given institution, there should be no duplicated file names. __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) @@ -310,33 +310,33 @@ class FileTable(Base): # type: ignore class BatchTable(Base): # type: ignore """The batch table""" - __tablename__ = "batch" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + __tablename__ = "batch" + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore # Set the parent foreign key to link to the institution table. inst_id = Column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) + ) # type: ignore inst: Mapped["InstTable"] = relationship(back_populates="batches") files: Mapped[Set["FileTable"]] = relationship( secondary=association_table, back_populates="batches" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - created_by = Column(Uuid(as_uuid=True)) + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + created_by = Column(Uuid(as_uuid=True)) # type: ignore # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the batch is ready for use. completed: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) - created_at = Column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. - updated_by = Column(Uuid(as_uuid=True), nullable=True) + updated_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Within a given institution, there should be no duplicated batch names. __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) @@ -352,25 +352,25 @@ class ModelTable(Base): # type: ignore Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) + ) # type: ignore inst: Mapped["InstTable"] = relationship(back_populates="models") jobs: Mapped[Set["JobTable"]] = relationship(back_populates="model") - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore # What configuration of schemas are allowed (list of maps e.g. [PDP Course : 1 + PDP Cohort : 1, X_schema :1 + Y_schema: 2]) - schema_configs = Column(JSON, nullable=True) - created_by = Column(Uuid(as_uuid=True), nullable=True) + schema_configs = Column(JSON, nullable=True) # type: ignore + created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the model has been approved and is ready for use. - valid: Mapped[bool] = mapped_column(nullable=True) + valid: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) - created_at = Column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. - version = Column(Integer, default=0) + version = Column(Integer, default=0) # type: ignore # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) @@ -387,18 +387,18 @@ class JobTable(Base): # type: ignore Uuid(as_uuid=True), ForeignKey("model.id", ondelete="CASCADE"), nullable=False, - ) + ) # type: ignore model: Mapped["ModelTable"] = relationship(back_populates="jobs") - created_by = Column(Uuid(as_uuid=True), nullable=False) + created_by = Column(Uuid(as_uuid=True), nullable=False) # type: ignore # The time the deletion request was set. - triggered_at = Column(DateTime(timezone=True), nullable=False) - batch_name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + triggered_at = Column(DateTime(timezone=True), nullable=False) # type: ignore + batch_name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore # The following will be empty if not completed or if job errored out. Getting additional details will require a call to the Databricks table. - output_filename = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + output_filename = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore # Whether the file was approved. output_valid: Mapped[bool] = mapped_column(nullable=True, default=False) - err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore completed: Mapped[bool] = mapped_column(nullable=True) @@ -438,7 +438,7 @@ class SchemaRegistryTable(Base): # type: ignore is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) created_at = Column( DateTime(timezone=True), server_default=func.now(), nullable=False - ) + ) # type: ignore # ---------------- Relationships ---------------- inst: Mapped["InstTable | None"] = relationship( diff --git a/src/webapp/validation_test.py b/src/webapp/validation_test.py index 0db46278..7f5a15da 100644 --- a/src/webapp/validation_test.py +++ b/src/webapp/validation_test.py @@ -28,11 +28,11 @@ } } -MOCK_EXT_SCHEMA = {"institutions": {"pdp": {"data_models": {}}}} +MOCK_EXT_SCHEMA: dict = {"institutions": {"pdp": {"data_models": {}}}} @pytest.fixture -def tmp_csv_file(tmp_path: Path): +def tmp_csv_file(tmp_path: Path) -> str: df = pd.DataFrame({"foo_col": [1, 2], "bar_col": ["a", "b"]}) file_path = tmp_path / "test.csv" df.to_csv(file_path, index=False) From 28bf01cf0f0b988b0b9c2de600c6d73c62b77c2c Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:20:09 -0500 Subject: [PATCH 20/36] fix: type checks --- src/webapp/database.py | 6 +++--- src/webapp/routers/data.py | 7 ++----- src/webapp/routers/data_test.py | 2 +- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 943a084b..3e5b178a 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -434,7 +434,7 @@ class SchemaRegistryTable(Base): # type: ignore ), nullable=True, ) - json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) + json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) # type: ignore is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) created_at = Column( DateTime(timezone=True), server_default=func.now(), nullable=False @@ -520,7 +520,7 @@ def connect_tcp_socket( username=engine_args["DB_USER"], password=engine_args["DB_PASS"], host=engine_args["INSTANCE_HOST"], - port=engine_args["DB_PORT"], + port=int(engine_args["DB_PORT"]), database=engine_args["DB_NAME"], ), connect_args=connect_args, @@ -556,7 +556,7 @@ def init_connection_pool() -> sqlalchemy.engine.Engine: return connect_tcp_socket(engine_vars, ssl_args) -def setup_db(env: str): +def setup_db(env: str) -> Any: """Setup Database. Called by all environments.""" # initialize connection pool global db_engine diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 4bee64dd..47fe55ad 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -956,17 +956,14 @@ def validation_helper( .scalar_one_or_none() ) - base_json = json.dumps(base_schema) - inst_json = json.dumps(inst_schema) if inst_schema is not None else None - # ----------------------- File validation logic logic -------------------------------------- try: inferred_schemas = storage_control.validate_file( get_external_bucket_name(inst_id), file_name, allowed_schemas, - base_json, - inst_json, + base_schema, + inst_schema, ) logging.debug( f"!!!!!!!!!!Inferred Schemas was successful {list(inferred_schemas)}" diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 23fc0c61..3d53ae21 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -44,7 +44,7 @@ def counter_repr(x): return {frozenset(Counter(item).items()) for item in x} -def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): +def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): # type: ignore """Compares two DataInfo objects.""" if ( a_elem["inst_id"] != b_elem["inst_id"] From 114ef9f17096caccf43e6ad13420315d1c2b85b3 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:33:36 -0500 Subject: [PATCH 21/36] fix: type checks --- src/webapp/gcsutil.py | 4 +-- src/webapp/routers/data_test.py | 61 +++++++++++++++++---------------- src/webapp/validation.py | 2 +- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index caec4a7a..91a988db 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -8,7 +8,7 @@ from .config import gcs_vars, databricks_vars from .validation import validate_file_reader -from typing import Any, List +from typing import Any, List, Optional, Dict import logging # Set the logging @@ -273,7 +273,7 @@ def validate_file( file_name: str, allowed_schemas: list[str], base_schema: dict, - inst_schema: dict, + inst_schema: Optional[Dict[Any, Any]] = None, ) -> List[str]: """Validate that a file is one of the allowed schemas.""" client = storage.Client() diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 3d53ae21..c4a5d0de 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -4,6 +4,7 @@ from unittest import mock from collections import Counter from fastapi.testclient import TestClient +from typing import Any import pytest import sqlalchemy from sqlalchemy.pool import StaticPool @@ -47,27 +48,27 @@ def counter_repr(x): def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): # type: ignore """Compares two DataInfo objects.""" if ( - a_elem["inst_id"] != b_elem["inst_id"] - or counter_repr(a_elem["batch_ids"]) != counter_repr(b_elem["batch_ids"]) - or a_elem["name"] != b_elem["name"] - or a_elem["uploader"] != b_elem["uploader"] - or a_elem["deleted"] != b_elem["deleted"] - or a_elem["source"] != b_elem["source"] - or a_elem["deletion_request_time"] != b_elem["deletion_request_time"] - or a_elem["retention_days"] != b_elem["retention_days"] - or a_elem["sst_generated"] != b_elem["sst_generated"] - or a_elem["valid"] != b_elem["valid"] - or a_elem["uploaded_date"] != b_elem["uploaded_date"] + a_elem["inst_id"] != b_elem["inst_id"] # type: ignore + or counter_repr(a_elem["batch_ids"]) != counter_repr(b_elem["batch_ids"]) # type: ignore + or a_elem["name"] != b_elem["name"] # type: ignore + or a_elem["uploader"] != b_elem["uploader"] # type: ignore + or a_elem["deleted"] != b_elem["deleted"] # type: ignore + or a_elem["source"] != b_elem["source"] # type: ignore + or a_elem["deletion_request_time"] != b_elem["deletion_request_time"] # type: ignore + or a_elem["retention_days"] != b_elem["retention_days"] # type: ignore + or a_elem["sst_generated"] != b_elem["sst_generated"] # type: ignore + or a_elem["valid"] != b_elem["valid"] # type: ignore + or a_elem["uploaded_date"] != b_elem["uploaded_date"] # type: ignore ): return False return True -def same_orderless(a: DataOverview, b: DataOverview): +def same_orderless(a: DataOverview, b: DataOverview) -> bool: """Compares two DataOverview objects.""" - for a_elem in a["batches"]: + for a_elem in a["batches"]: # type: ignore found = False - for b_elem in b["batches"]: + for b_elem in b["batches"]: # type: ignore if a_elem["batch_id"] != b_elem["batch_id"]: continue found = True @@ -84,9 +85,9 @@ def same_orderless(a: DataOverview, b: DataOverview): return False if not found: return False - for a_elem in a["files"]: + for a_elem in a["files"]: # type: ignore found = False - for b_elem in b["files"]: + for b_elem in b["files"]: # type: ignore if a_elem["data_id"] != b_elem["data_id"]: continue found = True @@ -191,7 +192,7 @@ def session_fixture(): @pytest.fixture(name="client") -def client_fixture(session: sqlalchemy.orm.Session): +def client_fixture(session: sqlalchemy.orm.Session) -> Any: """Unit test mocks setup.""" def get_session_override(): @@ -213,7 +214,7 @@ def storage_control_override(): app.dependency_overrides.clear() -def test_read_inst_all_input_files(client: TestClient): +def test_read_inst_all_input_files(client: TestClient) -> Any: """Test GET /institutions//input.""" response = client.get("/institutions/" + uuid_to_str(UUID_INVALID) + "/input") @@ -227,7 +228,7 @@ def test_read_inst_all_input_files(client: TestClient): "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/input" ) assert response.status_code == 200 - assert same_orderless( + assert same_orderless( # type: ignore response.json(), { "batches": [ @@ -280,7 +281,7 @@ def test_read_inst_all_input_files(client: TestClient): ) -def test_read_inst_all_output_files(client: TestClient): +def test_read_inst_all_output_files(client: TestClient) -> Any: """Test GET /institutions//output.""" MOCK_STORAGE.list_blobs_in_folder.return_value = [] response = client.get("/institutions/" + uuid_to_str(UUID_INVALID) + "/output") @@ -295,7 +296,7 @@ def test_read_inst_all_output_files(client: TestClient): "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/output" ) assert response.status_code == 200 - assert same_orderless( + assert same_orderless( # type: ignore response.json(), { "batches": [ @@ -348,7 +349,7 @@ def test_read_inst_all_output_files(client: TestClient): ) -def test_read_batch_info(client: TestClient): +def test_read_batch_info(client: TestClient) -> Any: """Test GET /institutions//batch/.""" response = client.get( "/institutions/" @@ -370,7 +371,7 @@ def test_read_batch_info(client: TestClient): + uuid_to_str(BATCH_UUID) ) assert response.status_code == 200 - assert same_orderless( + assert same_orderless( # type: ignore response.json(), { "batches": [ @@ -423,7 +424,7 @@ def test_read_batch_info(client: TestClient): ) -def test_read_file_id_info(client: TestClient): +def test_read_file_id_info(client: TestClient) -> Any: """Test GET /institutions//file-id/.""" response = client.get( "/institutions/" @@ -445,7 +446,7 @@ def test_read_file_id_info(client: TestClient): + uuid_to_str(FILE_UUID_1) ) assert response.status_code == 200 - assert same_file_orderless( + assert same_file_orderless( # type: ignore response.json(), { "name": "file_input_one", @@ -464,7 +465,7 @@ def test_read_file_id_info(client: TestClient): ) -def test_retrieve_file_as_bytes(client: TestClient): +def test_retrieve_file_as_bytes(client: TestClient) -> Any: """Test GET /institutions//output-file-contents/.""" response = client.get( "/institutions/" @@ -489,7 +490,7 @@ def test_retrieve_file_as_bytes(client: TestClient): assert response.text == '{"detail":"No such output file exists."}' -def test_create_batch(client: TestClient): +def test_create_batch(client: TestClient) -> None: """Test POST /institutions//batch.""" response = client.post( "/institutions/" + uuid_to_str(UUID_INVALID) + "/batch", @@ -527,7 +528,7 @@ def test_create_batch(client: TestClient): assert len(response.json()["file_names_to_ids"]) == 1 -def test_update_batch(client: TestClient): +def test_update_batch(client: TestClient) -> None: """Test PATCH /institutions//batch.""" response = client.patch( "/institutions/" @@ -565,7 +566,7 @@ def test_update_batch(client: TestClient): } -def test_validate_success_batch(client: TestClient): +def test_validate_success_batch(client: TestClient) -> None: """Test PATCH /institutions//batch.""" MOCK_STORAGE.validate_file.return_value = ["UNKNOWN"] @@ -616,7 +617,7 @@ def test_validate_success_batch(client: TestClient): assert response_sftp.json()["source"] == "PDP_SFTP" -def test_validate_failure_batch(client: TestClient): +def test_validate_failure_batch(client: TestClient) -> None: """Test PATCH /institutions//batch.""" MOCK_STORAGE.validate_file.return_value = ["COURSE"] # Authorized. diff --git a/src/webapp/validation.py b/src/webapp/validation.py index f2449eea..7bfdcf70 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -16,7 +16,7 @@ def validate_file_reader( - filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: dict + filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: Optional[Dict[Any, Any]] = None ) -> dict[str, Any]: """Validates given a filename.""" return validate_dataset(filename, base_schema, inst_schema, allowed_schema) From 1a2edc070d1cbd119f31a1b24bcb8acb2597205e Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:35:40 -0500 Subject: [PATCH 22/36] fix: type checks --- src/webapp/routers/data_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index c4a5d0de..d356d1c7 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -230,7 +230,7 @@ def test_read_inst_all_input_files(client: TestClient) -> Any: assert response.status_code == 200 assert same_orderless( # type: ignore response.json(), - { + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -298,7 +298,7 @@ def test_read_inst_all_output_files(client: TestClient) -> Any: assert response.status_code == 200 assert same_orderless( # type: ignore response.json(), - { + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -373,7 +373,7 @@ def test_read_batch_info(client: TestClient) -> Any: assert response.status_code == 200 assert same_orderless( # type: ignore response.json(), - { + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -448,7 +448,7 @@ def test_read_file_id_info(client: TestClient) -> Any: assert response.status_code == 200 assert same_file_orderless( # type: ignore response.json(), - { + { # type: ignore "name": "file_input_one", "data_id": "f0bb3a206d924254afed6a72f43c562a", "batch_ids": ["5b2420f3103546ab90eb74d5df97de43"], From 3b2d18fc2d9ea6c9d4adeb6a9cb3036be05f8c01 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:36:56 -0500 Subject: [PATCH 23/36] fix: type checks --- src/webapp/database.py | 2 +- src/webapp/routers/data.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 3e5b178a..1d2b26b8 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -2,7 +2,7 @@ import uuid import datetime -from typing import Set, List, Any, Optional, Dict +from typing import Set, List, Any from contextvars import ContextVar import enum import sqlalchemy diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 47fe55ad..932e75d2 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -16,7 +16,6 @@ from ..config import databricks_vars, env_vars, gcs_vars import tempfile import pathlib -import json from ..utilities import ( has_access_to_inst_or_err, From 7336f4e1a5174cc8c1584ff7c098ccc173920f95 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:38:04 -0500 Subject: [PATCH 24/36] fix: type checks --- src/webapp/database.py | 108 ++++++++++++++++---------------- src/webapp/routers/data_test.py | 48 +++++++------- src/webapp/validation.py | 5 +- src/webapp/validation_test.py | 14 ++++- 4 files changed, 94 insertions(+), 81 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 1d2b26b8..7020cbc1 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -100,12 +100,12 @@ def init_db(env: str) -> Any: session.close() -class InstTable(Base): # type: ignore +class InstTable(Base): # type: ignore """The institution overview table that maps ids to names. The parent table to all other tables except for AccountHistory and JobTable.""" __tablename__ = "inst" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore # Linked children tables. accounts: Mapped[Set["AccountTable"]] = relationship(back_populates="inst") @@ -120,27 +120,27 @@ class InstTable(Base): # type: ignore "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # type: ignore + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # type: ignore # If retention unset, the Datakind default is used. File-level retentions overrides # this value. retention_days: Mapped[int] = mapped_column(nullable=True) # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} - allowed_emails = Column(MutableDict.as_mutable(JSON)) # type: ignore + allowed_emails = Column(MutableDict.as_mutable(JSON)) # type: ignore # Schemas that are allowed for validation. - schemas = Column(MutableList.as_mutable(JSON)) # type: ignore - state = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + schemas = Column(MutableList.as_mutable(JSON)) # type: ignore + state = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore # Only populated for PDP schools. - pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore - created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Within the institutions, the set of name + state should be unique __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) -class ApiKeyTable(Base): # type: ignore +class ApiKeyTable(Base): # type: ignore """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" @@ -175,7 +175,7 @@ class ApiKeyTable(Base): # type: ignore ) -class AccountTable(Base): # type: ignore +class AccountTable(Base): # type: ignore """ NOTE: only users created by the frontend are accessible through the fronted. Users created by API calls can only directly call API calls. Frontend will not work. The user accounts table""" @@ -216,7 +216,7 @@ class AccountTable(Base): # type: ignore updated_at = Column(DateTime(timezone=True), onupdate=func.now()) -class AccountHistoryTable(Base): # type: ignore +class AccountHistoryTable(Base): # type: ignore """The user history table""" __tablename__ = "account_history" @@ -264,12 +264,12 @@ class AccountHistoryTable(Base): # type: ignore ) -class FileTable(Base): # type: ignore - """The file table""" +class FileTable(Base): # type: ignore + """The file table""" __tablename__ = "file" - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore batches: Mapped[Set["BatchTable"]] = relationship( secondary=association_table, back_populates="files" ) @@ -283,65 +283,65 @@ class FileTable(Base): # type: ignore # The size to the nearest mb. # size_mb: Mapped[int] = mapped_column(nullable=False) # Who uploaded the file. For SST generated files, this field would be null. - uploader = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + uploader = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Can be PDP_SFTP, MANUAL_UPLOAD etc. May be empty for generated files. - source = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + source = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore # The schema type(s) of this file. - schemas = Column(MutableList.as_mutable(JSON), nullable=False) # type: ignore + schemas = Column(MutableList.as_mutable(JSON), nullable=False) # type: ignore # If null, the following is non-deleted. # The deleted field indicates whether there is a pending deletion request on the data. # The data may stil be available to Datakind debug role in a soft-delete state but for all # intents and purposes is no longer accessible by the app. deleted: Mapped[bool] = mapped_column(nullable=True) # When the deletion request was made - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore retention_days: Mapped[int] = mapped_column(nullable=True) # Whether the file was generated by SST. (e.g. was it input or output) sst_generated: Mapped[bool] = mapped_column(nullable=False) # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False) - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # Within a given institution, there should be no duplicated file names. __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) -class BatchTable(Base): # type: ignore +class BatchTable(Base): # type: ignore """The batch table""" - __tablename__ = "batch" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + __tablename__ = "batch" + id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore # Set the parent foreign key to link to the institution table. inst_id = Column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) # type: ignore inst: Mapped["InstTable"] = relationship(back_populates="batches") files: Mapped[Set["FileTable"]] = relationship( secondary=association_table, back_populates="batches" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore - created_by = Column(Uuid(as_uuid=True)) # type: ignore + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + created_by = Column(Uuid(as_uuid=True)) # type: ignore # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the batch is ready for use. completed: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. - updated_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + updated_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # Within a given institution, there should be no duplicated batch names. __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) -class ModelTable(Base): # type: ignore +class ModelTable(Base): # type: ignore """The model table""" __tablename__ = "model" @@ -352,31 +352,31 @@ class ModelTable(Base): # type: ignore Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) # type: ignore inst: Mapped["InstTable"] = relationship(back_populates="models") jobs: Mapped[Set["JobTable"]] = relationship(back_populates="model") - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore # What configuration of schemas are allowed (list of maps e.g. [PDP Course : 1 + PDP Cohort : 1, X_schema :1 + Y_schema: 2]) - schema_configs = Column(JSON, nullable=True) # type: ignore - created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + schema_configs = Column(JSON, nullable=True) # type: ignore + created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the model has been approved and is ready for use. - valid: Mapped[bool] = mapped_column(nullable=True) + valid: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore + updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. - version = Column(Integer, default=0) # type: ignore + version = Column(Integer, default=0) # type: ignore # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) -class JobTable(Base): # type: ignore +class JobTable(Base): # type: ignore """The job table""" __tablename__ = "job" @@ -387,27 +387,27 @@ class JobTable(Base): # type: ignore Uuid(as_uuid=True), ForeignKey("model.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) # type: ignore model: Mapped["ModelTable"] = relationship(back_populates="jobs") - created_by = Column(Uuid(as_uuid=True), nullable=False) # type: ignore + created_by = Column(Uuid(as_uuid=True), nullable=False) # type: ignore # The time the deletion request was set. - triggered_at = Column(DateTime(timezone=True), nullable=False) # type: ignore - batch_name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + triggered_at = Column(DateTime(timezone=True), nullable=False) # type: ignore + batch_name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore # The following will be empty if not completed or if job errored out. Getting additional details will require a call to the Databricks table. - output_filename = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore + output_filename = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore # Whether the file was approved. output_valid: Mapped[bool] = mapped_column(nullable=True, default=False) - err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore + err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore completed: Mapped[bool] = mapped_column(nullable=True) -class DocType(enum.Enum): # type: ignore +class DocType(enum.Enum): # type: ignore base = "base" extension = "extension" -class SchemaRegistryTable(Base): # type: ignore +class SchemaRegistryTable(Base): # type: ignore """ Stores versioned schema documents: - Base schema (doc_type=base, is_pdp=False, inst_id NULL) @@ -434,11 +434,11 @@ class SchemaRegistryTable(Base): # type: ignore ), nullable=True, ) - json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) # type: ignore + json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) # type: ignore is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) created_at = Column( DateTime(timezone=True), server_default=func.now(), nullable=False - ) # type: ignore + ) # type: ignore # ---------------- Relationships ---------------- inst: Mapped["InstTable | None"] = relationship( diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index d356d1c7..cc529934 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -45,20 +45,20 @@ def counter_repr(x): return {frozenset(Counter(item).items()) for item in x} -def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): # type: ignore +def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): # type: ignore """Compares two DataInfo objects.""" if ( - a_elem["inst_id"] != b_elem["inst_id"] # type: ignore - or counter_repr(a_elem["batch_ids"]) != counter_repr(b_elem["batch_ids"]) # type: ignore - or a_elem["name"] != b_elem["name"] # type: ignore - or a_elem["uploader"] != b_elem["uploader"] # type: ignore - or a_elem["deleted"] != b_elem["deleted"] # type: ignore - or a_elem["source"] != b_elem["source"] # type: ignore - or a_elem["deletion_request_time"] != b_elem["deletion_request_time"] # type: ignore - or a_elem["retention_days"] != b_elem["retention_days"] # type: ignore - or a_elem["sst_generated"] != b_elem["sst_generated"] # type: ignore - or a_elem["valid"] != b_elem["valid"] # type: ignore - or a_elem["uploaded_date"] != b_elem["uploaded_date"] # type: ignore + a_elem["inst_id"] != b_elem["inst_id"] # type: ignore + or counter_repr(a_elem["batch_ids"]) != counter_repr(b_elem["batch_ids"]) # type: ignore + or a_elem["name"] != b_elem["name"] # type: ignore + or a_elem["uploader"] != b_elem["uploader"] # type: ignore + or a_elem["deleted"] != b_elem["deleted"] # type: ignore + or a_elem["source"] != b_elem["source"] # type: ignore + or a_elem["deletion_request_time"] != b_elem["deletion_request_time"] # type: ignore + or a_elem["retention_days"] != b_elem["retention_days"] # type: ignore + or a_elem["sst_generated"] != b_elem["sst_generated"] # type: ignore + or a_elem["valid"] != b_elem["valid"] # type: ignore + or a_elem["uploaded_date"] != b_elem["uploaded_date"] # type: ignore ): return False return True @@ -66,9 +66,9 @@ def same_file_orderless(a_elem: DataInfo, b_elem: DataInfo): # type: ignore def same_orderless(a: DataOverview, b: DataOverview) -> bool: """Compares two DataOverview objects.""" - for a_elem in a["batches"]: # type: ignore + for a_elem in a["batches"]: # type: ignore found = False - for b_elem in b["batches"]: # type: ignore + for b_elem in b["batches"]: # type: ignore if a_elem["batch_id"] != b_elem["batch_id"]: continue found = True @@ -85,9 +85,9 @@ def same_orderless(a: DataOverview, b: DataOverview) -> bool: return False if not found: return False - for a_elem in a["files"]: # type: ignore + for a_elem in a["files"]: # type: ignore found = False - for b_elem in b["files"]: # type: ignore + for b_elem in b["files"]: # type: ignore if a_elem["data_id"] != b_elem["data_id"]: continue found = True @@ -228,9 +228,9 @@ def test_read_inst_all_input_files(client: TestClient) -> Any: "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/input" ) assert response.status_code == 200 - assert same_orderless( # type: ignore + assert same_orderless( # type: ignore response.json(), - { # type: ignore + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -296,9 +296,9 @@ def test_read_inst_all_output_files(client: TestClient) -> Any: "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) + "/output" ) assert response.status_code == 200 - assert same_orderless( # type: ignore + assert same_orderless( # type: ignore response.json(), - { # type: ignore + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -371,9 +371,9 @@ def test_read_batch_info(client: TestClient) -> Any: + uuid_to_str(BATCH_UUID) ) assert response.status_code == 200 - assert same_orderless( # type: ignore + assert same_orderless( # type: ignore response.json(), - { # type: ignore + { # type: ignore "batches": [ { "batch_id": "5b2420f3103546ab90eb74d5df97de43", @@ -446,9 +446,9 @@ def test_read_file_id_info(client: TestClient) -> Any: + uuid_to_str(FILE_UUID_1) ) assert response.status_code == 200 - assert same_file_orderless( # type: ignore + assert same_file_orderless( # type: ignore response.json(), - { # type: ignore + { # type: ignore "name": "file_input_one", "data_id": "f0bb3a206d924254afed6a72f43c562a", "batch_ids": ["5b2420f3103546ab90eb74d5df97de43"], diff --git a/src/webapp/validation.py b/src/webapp/validation.py index 7bfdcf70..90583fb6 100644 --- a/src/webapp/validation.py +++ b/src/webapp/validation.py @@ -16,7 +16,10 @@ def validate_file_reader( - filename: str, allowed_schema: list[str], base_schema: dict, inst_schema: Optional[Dict[Any, Any]] = None + filename: str, + allowed_schema: list[str], + base_schema: dict, + inst_schema: Optional[Dict[Any, Any]] = None, ) -> dict[str, Any]: """Validates given a filename.""" return validate_dataset(filename, base_schema, inst_schema, allowed_schema) diff --git a/src/webapp/validation_test.py b/src/webapp/validation_test.py index 7f5a15da..92bc1f48 100644 --- a/src/webapp/validation_test.py +++ b/src/webapp/validation_test.py @@ -47,7 +47,12 @@ def test_validate_file_reader_passes(tmp_csv_file): mock_load.side_effect = lambda path: ( MOCK_BASE_SCHEMA if "base" in path else MOCK_EXT_SCHEMA ) - result = validate_file_reader(tmp_csv_file, ["test_model"], base_schema=MOCK_BASE_SCHEMA, inst_schema=MOCK_EXT_SCHEMA,) + result = validate_file_reader( + tmp_csv_file, + ["test_model"], + base_schema=MOCK_BASE_SCHEMA, + inst_schema=MOCK_EXT_SCHEMA, + ) assert result["validation_status"] == "passed" assert result["schemas"] == ["test_model"] @@ -65,5 +70,10 @@ def test_validate_file_reader_fails_missing_required(tmp_path): MOCK_BASE_SCHEMA if "base" in path else MOCK_EXT_SCHEMA ) with pytest.raises(HardValidationError) as exc_info: - validate_file_reader(str(file_path), ["test_model"], base_schema=MOCK_BASE_SCHEMA, inst_schema=MOCK_EXT_SCHEMA,) + validate_file_reader( + str(file_path), + ["test_model"], + base_schema=MOCK_BASE_SCHEMA, + inst_schema=MOCK_EXT_SCHEMA, + ) assert "Missing required columns" in str(exc_info.value) From 952705e6f3cd257463af825e0e4df0e32c755096 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:52:33 -0500 Subject: [PATCH 25/36] feat: added length guards for all varchar variables --- src/webapp/database.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 7020cbc1..e895d885 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -23,8 +23,9 @@ Integer, BigInteger, Index, + event, ) -from sqlalchemy.orm import sessionmaker, Session, relationship, mapped_column, Mapped +from sqlalchemy.orm import sessionmaker, Session, relationship, mapped_column, Mapped, Mapper from sqlalchemy.sql import func from sqlalchemy.pool import StaticPool from .config import engine_vars, ssl_env_vars, setup_database_vars @@ -47,7 +48,19 @@ LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) - +@event.listens_for(Mapper, "before_insert") +@event.listens_for(Mapper, "before_update") +def validate_string_lengths(mapper, connection, target): + for column in mapper.columns: + col_type = column.type + if isinstance(col_type, String) and col_type.length: + value = getattr(target, column.name, None) + if value is not None and len(value) > col_type.length: + raise ValueError( + f"Value for '{column.name}' exceeds max length " + f"{col_type.length}: {len(value)} characters provided" + ) + def init_db(env: str) -> Any: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. @@ -426,7 +439,7 @@ class SchemaRegistryTable(Base): # type: ignore ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True ) is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - version_label: Mapped[str] = mapped_column(String(32), nullable=False) + version_label = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) extends_schema_id: Mapped[int | None] = mapped_column( BigInteger, ForeignKey( From 50bd248945575e252f348c8d14c2e4eae86e7fc7 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:52:53 -0500 Subject: [PATCH 26/36] fix: type checks --- src/webapp/database.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index e895d885..00340da7 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -25,7 +25,14 @@ Index, event, ) -from sqlalchemy.orm import sessionmaker, Session, relationship, mapped_column, Mapped, Mapper +from sqlalchemy.orm import ( + sessionmaker, + Session, + relationship, + mapped_column, + Mapped, + Mapper, +) from sqlalchemy.sql import func from sqlalchemy.pool import StaticPool from .config import engine_vars, ssl_env_vars, setup_database_vars @@ -48,6 +55,7 @@ LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) + @event.listens_for(Mapper, "before_insert") @event.listens_for(Mapper, "before_update") def validate_string_lengths(mapper, connection, target): @@ -60,7 +68,8 @@ def validate_string_lengths(mapper, connection, target): f"Value for '{column.name}' exceeds max length " f"{col_type.length}: {len(value)} characters provided" ) - + + def init_db(env: str) -> Any: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. From 3f5a4eefa6c4effe6c97afe45976717b2305a2d8 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 19:57:38 -0500 Subject: [PATCH 27/36] fix: type checks --- src/webapp/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 00340da7..a9aec7ef 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -442,7 +442,7 @@ class SchemaRegistryTable(Base): # type: ignore schema_id: Mapped[int] = mapped_column( Integer, primary_key=True, autoincrement=True ) - doc_type: Mapped[DocType] = mapped_column(Enum(DocType), nullable=False) + doc_type: Mapped[DocType] = mapped_column(Enum(DocType, native_enum=False), nullable=False) # Nullable: NULL for base and PDP shared extension inst_id: Mapped[uuid.UUID | None] = mapped_column( ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True From 03ec835b002a57d975a3c9487c6e4b1fe92e04e0 Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 20:00:08 -0500 Subject: [PATCH 28/36] fix: type checks --- src/webapp/database.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index a9aec7ef..fb03d5a3 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -55,21 +55,19 @@ LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) - @event.listens_for(Mapper, "before_insert") @event.listens_for(Mapper, "before_update") def validate_string_lengths(mapper, connection, target): for column in mapper.columns: col_type = column.type if isinstance(col_type, String) and col_type.length: - value = getattr(target, column.name, None) - if value is not None and len(value) > col_type.length: + val = getattr(target, column.name, None) + if isinstance(val, str) and len(val) > col_type.length: raise ValueError( f"Value for '{column.name}' exceeds max length " - f"{col_type.length}: {len(value)} characters provided" + f"{col_type.length}: {len(val)} characters provided" ) - def init_db(env: str) -> Any: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. From 4b89e1818110ec109104c4504936b66bd6feeaea Mon Sep 17 00:00:00 2001 From: Mesh Date: Sun, 20 Jul 2025 20:01:34 -0500 Subject: [PATCH 29/36] fix: type checks --- src/webapp/database.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index fb03d5a3..533b6246 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -55,6 +55,7 @@ LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) + @event.listens_for(Mapper, "before_insert") @event.listens_for(Mapper, "before_update") def validate_string_lengths(mapper, connection, target): @@ -68,6 +69,7 @@ def validate_string_lengths(mapper, connection, target): f"{col_type.length}: {len(val)} characters provided" ) + def init_db(env: str) -> Any: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. @@ -440,7 +442,9 @@ class SchemaRegistryTable(Base): # type: ignore schema_id: Mapped[int] = mapped_column( Integer, primary_key=True, autoincrement=True ) - doc_type: Mapped[DocType] = mapped_column(Enum(DocType, native_enum=False), nullable=False) + doc_type: Mapped[DocType] = mapped_column( + Enum(DocType, native_enum=False), nullable=False + ) # Nullable: NULL for base and PDP shared extension inst_id: Mapped[uuid.UUID | None] = mapped_column( ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True From 42311af3dd65d2088788ccad8e7cda3a2793cf30 Mon Sep 17 00:00:00 2001 From: Mesh Date: Mon, 21 Jul 2025 13:50:07 -0500 Subject: [PATCH 30/36] fix: type checks --- src/webapp/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 533b6246..5e42c145 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -452,7 +452,7 @@ class SchemaRegistryTable(Base): # type: ignore is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) version_label = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) extends_schema_id: Mapped[int | None] = mapped_column( - BigInteger, + Integer, ForeignKey( "schema_registry.schema_id", ondelete="SET NULL", onupdate="CASCADE" ), From 6ce08bbcf4da47d1232fce683bae16b9b0b4757a Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 19:36:20 -0400 Subject: [PATCH 31/36] trying to fix type check --- src/webapp/database.py | 184 +++++++++++++++++++++-------------------- 1 file changed, 93 insertions(+), 91 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 5e42c145..ef419a10 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -6,7 +6,7 @@ from contextvars import ContextVar import enum import sqlalchemy -from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped from sqlalchemy.ext.mutable import MutableDict, MutableList from sqlalchemy import ( Column, @@ -38,7 +38,9 @@ from .config import engine_vars, ssl_env_vars, setup_database_vars from .authn import get_password_hash, get_api_key_hash -Base = declarative_base() +class Base(DeclarativeBase): + pass + LocalSession = None local_session: ContextVar[Session] = ContextVar("local_session") db_engine = None @@ -70,7 +72,7 @@ def validate_string_lengths(mapper, connection, target): ) -def init_db(env: str) -> Any: +def init_db(env: str) -> None: """Initialize the database for LOCAL and DEV environemtns for ease of use.""" # add some sample users to the database for development utility. if env not in ("LOCAL", "DEV"): @@ -122,12 +124,12 @@ def init_db(env: str) -> Any: session.close() -class InstTable(Base): # type: ignore +class InstTable(Base): """The institution overview table that maps ids to names. The parent table to all other tables except for AccountHistory and JobTable.""" __tablename__ = "inst" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # Linked children tables. accounts: Mapped[Set["AccountTable"]] = relationship(back_populates="inst") @@ -142,51 +144,51 @@ class InstTable(Base): # type: ignore "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # type: ignore + name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) # If retention unset, the Datakind default is used. File-level retentions overrides # this value. retention_days: Mapped[int] = mapped_column(nullable=True) # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} - allowed_emails = Column(MutableDict.as_mutable(JSON)) # type: ignore + allowed_emails: Mapped[dict[str, str]] = mapped_column(MutableDict.as_mutable(JSON())) # Schemas that are allowed for validation. - schemas = Column(MutableList.as_mutable(JSON)) # type: ignore - state = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + schemas: Mapped[dict[str, str]] = mapped_column(MutableList.as_mutable(JSON())) + state: Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # Only populated for PDP schools. - pdp_id = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore - created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + pdp_id: Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + created_at: Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) + updated_at: Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) # Within the institutions, the set of name + state should be unique __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) -class ApiKeyTable(Base): # type: ignore +class ApiKeyTable(Base): """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # A hash of the key_value, so the user must store the generated key_value secretly. - hashed_key_value = Column( + hashed_key_value : Mapped[dict[str, str]] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True ) # Set the foreign key to link to the institution table. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="apikeys") - created_by = Column(Uuid(as_uuid=True), nullable=False) - notes = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) + notes : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # Whether this key allows changing the enduser. ONLY SET FOR THE FRONTEND KEY. Can only be set when the API key has DATAKINDER access type as this allows Datakinder level endusers. allows_enduser: Mapped[bool] = mapped_column(nullable=True) - access_type = Column(String(VAR_CHAR_LENGTH), nullable=False) + access_type : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=False) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) + updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) # API key must be valid and not deleted. deleted: Mapped[bool] = mapped_column(nullable=True) valid: Mapped[bool] = mapped_column(nullable=True) @@ -197,13 +199,13 @@ class ApiKeyTable(Base): # type: ignore ) -class AccountTable(Base): # type: ignore +class AccountTable(Base): """ NOTE: only users created by the frontend are accessible through the fronted. Users created by API calls can only directly call API calls. Frontend will not work. The user accounts table""" __tablename__ = "users" # Name to be compliant with Laravel. - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # Set account histories to be children account_histories: Mapped[List["AccountHistoryTable"]] = relationship( @@ -211,44 +213,44 @@ class AccountTable(Base): # type: ignore ) # Set the foreign key to link to the institution table. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="accounts") - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - email = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) - google_id = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) - azure_id = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + email : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) + google_id : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + azure_id : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) - email_verified_at = Column(DateTime(timezone=True), nullable=True) - password = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - two_factor_secret = Column(Text, nullable=True) - two_factor_recovery_codes = Column(Text, nullable=True) - two_factor_confirmed_at = Column(DateTime(timezone=True), nullable=True) + email_verified_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) + password : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + two_factor_secret : Mapped[dict[str, str]] = mapped_column(Text, nullable=True) + two_factor_recovery_codes : Mapped[dict[str, str]] = mapped_column(Text, nullable=True) + two_factor_confirmed_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) - remember_token = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + remember_token : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # Required for team integration with laravel - current_team_id = Column(Uuid(as_uuid=True), nullable=True) - access_type = Column(String(VAR_CHAR_LENGTH), nullable=True) - # profile_photo_path = Column(String(VAR_CHAR_LENGTH), nullable=True) + current_team_id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + access_type : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + # profile_photo_path : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) + updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) -class AccountHistoryTable(Base): # type: ignore +class AccountHistoryTable(Base): """The user history table""" __tablename__ = "account_history" - id = Column(Integer, primary_key=True) # Auto-increment should be default - timestamp = Column( + id : Mapped[dict[str, str]] = mapped_column(Integer, primary_key=True) # Auto-increment should be default + timestamp : Mapped[dict[str, str]] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) # Set the parent foreign key to link to the users table. - account_id = Column( + account_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, @@ -256,15 +258,15 @@ class AccountHistoryTable(Base): # type: ignore account: Mapped["AccountTable"] = relationship(back_populates="account_histories") # This field is nullable if the action was taken by a Datakinder. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="account_histories") - action = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - resource_id = Column(Uuid(as_uuid=True), nullable=False) + action : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + resource_id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) # An intermediary association table allows bi-directional many-to-many between files and batches. @@ -286,17 +288,17 @@ class AccountHistoryTable(Base): # type: ignore ) -class FileTable(Base): # type: ignore +class FileTable(Base): """The file table""" __tablename__ = "file" - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) batches: Mapped[Set["BatchTable"]] = relationship( secondary=association_table, back_populates="files" ) # Set the parent foreign key to link to the institution table. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, @@ -305,131 +307,131 @@ class FileTable(Base): # type: ignore # The size to the nearest mb. # size_mb: Mapped[int] = mapped_column(nullable=False) # Who uploaded the file. For SST generated files, this field would be null. - uploader = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + uploader : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) # Can be PDP_SFTP, MANUAL_UPLOAD etc. May be empty for generated files. - source = Column(String(VAR_CHAR_LENGTH), nullable=True) # type: ignore + source : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # The schema type(s) of this file. - schemas = Column(MutableList.as_mutable(JSON), nullable=False) # type: ignore + schemas : Mapped[dict[str, str]] = mapped_column(MutableList.as_mutable((JSON())), nullable=False) # If null, the following is non-deleted. # The deleted field indicates whether there is a pending deletion request on the data. # The data may stil be available to Datakind debug role in a soft-delete state but for all # intents and purposes is no longer accessible by the app. deleted: Mapped[bool] = mapped_column(nullable=True) # When the deletion request was made - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore + deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) retention_days: Mapped[int] = mapped_column(nullable=True) # Whether the file was generated by SST. (e.g. was it input or output) sst_generated: Mapped[bool] = mapped_column(nullable=False) # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False) - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) + updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) # Within a given institution, there should be no duplicated file names. __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) -class BatchTable(Base): # type: ignore +class BatchTable(Base): """The batch table""" __tablename__ = "batch" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # type: ignore + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # Set the parent foreign key to link to the institution table. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) inst: Mapped["InstTable"] = relationship(back_populates="batches") files: Mapped[Set["FileTable"]] = relationship( secondary=association_table, back_populates="batches" ) - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore - created_by = Column(Uuid(as_uuid=True)) # type: ignore + name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True)) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the batch is ready for use. completed: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) + created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) + updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. - updated_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + updated_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) # Within a given institution, there should be no duplicated batch names. __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) -class ModelTable(Base): # type: ignore +class ModelTable(Base): """The model table""" __tablename__ = "model" - id = Column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) # Set the parent foreign key to link to the institution table. - inst_id = Column( + inst_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) inst: Mapped["InstTable"] = relationship(back_populates="models") jobs: Mapped[Set["JobTable"]] = relationship(back_populates="model") - name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # What configuration of schemas are allowed (list of maps e.g. [PDP Course : 1 + PDP Cohort : 1, X_schema :1 + Y_schema: 2]) - schema_configs = Column(JSON, nullable=True) # type: ignore - created_by = Column(Uuid(as_uuid=True), nullable=True) # type: ignore + schema_configs : Mapped[dict[str, str]] = mapped_column(JSON(), nullable=True) + created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the model has been approved and is ready for use. valid: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at = Column(DateTime(timezone=True), nullable=True) # type: ignore - created_at = Column(DateTime(timezone=True), server_default=func.now()) # type: ignore - updated_at = Column(DateTime(timezone=True), onupdate=func.now()) # type: ignore + deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) + created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) + updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. - version = Column(Integer, default=0) # type: ignore + version : Mapped[dict[str, str]] = mapped_column(Integer, default=0) # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) -class JobTable(Base): # type: ignore +class JobTable(Base): """The job table""" __tablename__ = "job" - id = Column(BigInteger, primary_key=True) + id : Mapped[dict[str, str]] = mapped_column(BigInteger, primary_key=True) # Set the parent foreign key to link to the institution table. - model_id = Column( + model_id : Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("model.id", ondelete="CASCADE"), nullable=False, - ) # type: ignore + ) model: Mapped["ModelTable"] = relationship(back_populates="jobs") - created_by = Column(Uuid(as_uuid=True), nullable=False) # type: ignore + created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) # The time the deletion request was set. - triggered_at = Column(DateTime(timezone=True), nullable=False) # type: ignore - batch_name = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # type: ignore + triggered_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=False) + batch_name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # The following will be empty if not completed or if job errored out. Getting additional details will require a call to the Databricks table. - output_filename = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore + output_filename : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # Whether the file was approved. output_valid: Mapped[bool] = mapped_column(nullable=True, default=False) - err_msg = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) # type: ignore + err_msg : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) completed: Mapped[bool] = mapped_column(nullable=True) -class DocType(enum.Enum): # type: ignore +class DocType(enum.Enum): base = "base" extension = "extension" -class SchemaRegistryTable(Base): # type: ignore +class SchemaRegistryTable(Base): """ Stores versioned schema documents: - Base schema (doc_type=base, is_pdp=False, inst_id NULL) @@ -450,7 +452,7 @@ class SchemaRegistryTable(Base): # type: ignore ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True ) is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - version_label = Column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + version_label : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) extends_schema_id: Mapped[int | None] = mapped_column( Integer, ForeignKey( @@ -458,11 +460,11 @@ class SchemaRegistryTable(Base): # type: ignore ), nullable=True, ) - json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON), nullable=False) # type: ignore + json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON()), nullable=False) is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - created_at = Column( + created_at : Mapped[dict[str, str]] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False - ) # type: ignore + ) # ---------------- Relationships ---------------- inst: Mapped["InstTable | None"] = relationship( From 7e34c9e61a827b6f97c06e2940c4ea421579b13e Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 19:39:07 -0400 Subject: [PATCH 32/36] running black --- src/webapp/database.py | 266 +++++++++++++++++++++++++++++------------ 1 file changed, 189 insertions(+), 77 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index ef419a10..31a825ee 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -38,9 +38,11 @@ from .config import engine_vars, ssl_env_vars, setup_database_vars from .authn import get_password_hash, get_api_key_hash + class Base(DeclarativeBase): pass + LocalSession = None local_session: ContextVar[Session] = ContextVar("local_session") db_engine = None @@ -129,7 +131,9 @@ class InstTable(Base): all other tables except for AccountHistory and JobTable.""" __tablename__ = "inst" - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # Linked children tables. accounts: Mapped[Set["AccountTable"]] = relationship(back_populates="inst") @@ -144,21 +148,35 @@ class InstTable(Base): "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) - name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) + name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True + ) # If retention unset, the Datakind default is used. File-level retentions overrides # this value. retention_days: Mapped[int] = mapped_column(nullable=True) # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} - allowed_emails: Mapped[dict[str, str]] = mapped_column(MutableDict.as_mutable(JSON())) + allowed_emails: Mapped[dict[str, str]] = mapped_column( + MutableDict.as_mutable(JSON()) + ) # Schemas that are allowed for validation. schemas: Mapped[dict[str, str]] = mapped_column(MutableList.as_mutable(JSON())) - state: Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + state: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_LENGTH), nullable=True + ) # Only populated for PDP schools. - pdp_id: Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) - created_at: Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at: Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) - created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + pdp_id: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_LENGTH), nullable=True + ) + created_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) + created_by: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=True + ) # Within the institutions, the set of name + state should be unique __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) @@ -168,27 +186,37 @@ class ApiKeyTable(Base): """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # A hash of the key_value, so the user must store the generated key_value secretly. - hashed_key_value : Mapped[dict[str, str]] = mapped_column( + hashed_key_value: Mapped[dict[str, str]] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True ) # Set the foreign key to link to the institution table. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="apikeys") - created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) - notes : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + created_by: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=False + ) + notes: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) # Whether this key allows changing the enduser. ONLY SET FOR THE FRONTEND KEY. Can only be set when the API key has DATAKINDER access type as this allows Datakinder level endusers. allows_enduser: Mapped[bool] = mapped_column(nullable=True) - access_type : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=False) + access_type: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_LENGTH), nullable=False + ) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) # API key must be valid and not deleted. deleted: Mapped[bool] = mapped_column(nullable=True) valid: Mapped[bool] = mapped_column(nullable=True) @@ -205,7 +233,9 @@ class AccountTable(Base): The user accounts table""" __tablename__ = "users" # Name to be compliant with Laravel. - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # Set account histories to be children account_histories: Mapped[List["AccountHistoryTable"]] = relationship( @@ -213,44 +243,70 @@ class AccountTable(Base): ) # Set the foreign key to link to the institution table. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="accounts") - name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - email : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True) - google_id : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) - azure_id : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) + email: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True + ) + google_id: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) + azure_id: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) - email_verified_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) - password : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - two_factor_secret : Mapped[dict[str, str]] = mapped_column(Text, nullable=True) - two_factor_recovery_codes : Mapped[dict[str, str]] = mapped_column(Text, nullable=True) - two_factor_confirmed_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) + email_verified_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=True + ) + password: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) + two_factor_secret: Mapped[dict[str, str]] = mapped_column(Text, nullable=True) + two_factor_recovery_codes: Mapped[dict[str, str]] = mapped_column( + Text, nullable=True + ) + two_factor_confirmed_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=True + ) - remember_token : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + remember_token: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) # Required for team integration with laravel - current_team_id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) - access_type : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + current_team_id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=True + ) + access_type: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_LENGTH), nullable=True + ) # profile_photo_path : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) class AccountHistoryTable(Base): """The user history table""" __tablename__ = "account_history" - id : Mapped[dict[str, str]] = mapped_column(Integer, primary_key=True) # Auto-increment should be default - timestamp : Mapped[dict[str, str]] = mapped_column( + id: Mapped[dict[str, str]] = mapped_column( + Integer, primary_key=True + ) # Auto-increment should be default + timestamp: Mapped[dict[str, str]] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) # Set the parent foreign key to link to the users table. - account_id : Mapped[dict[str, str]] = mapped_column( + account_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, @@ -258,15 +314,19 @@ class AccountHistoryTable(Base): account: Mapped["AccountTable"] = relationship(back_populates="account_histories") # This field is nullable if the action was taken by a Datakinder. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="account_histories") - action : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - resource_id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) + action: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) + resource_id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=False + ) # An intermediary association table allows bi-directional many-to-many between files and batches. @@ -292,13 +352,17 @@ class FileTable(Base): """The file table""" __tablename__ = "file" - name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) batches: Mapped[Set["BatchTable"]] = relationship( secondary=association_table, back_populates="files" ) # Set the parent foreign key to link to the institution table. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, @@ -307,25 +371,35 @@ class FileTable(Base): # The size to the nearest mb. # size_mb: Mapped[int] = mapped_column(nullable=False) # Who uploaded the file. For SST generated files, this field would be null. - uploader : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + uploader: Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) # Can be PDP_SFTP, MANUAL_UPLOAD etc. May be empty for generated files. - source : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + source: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_LENGTH), nullable=True + ) # The schema type(s) of this file. - schemas : Mapped[dict[str, str]] = mapped_column(MutableList.as_mutable((JSON())), nullable=False) + schemas: Mapped[dict[str, str]] = mapped_column( + MutableList.as_mutable((JSON())), nullable=False + ) # If null, the following is non-deleted. # The deleted field indicates whether there is a pending deletion request on the data. # The data may stil be available to Datakind debug role in a soft-delete state but for all # intents and purposes is no longer accessible by the app. deleted: Mapped[bool] = mapped_column(nullable=True) # When the deletion request was made - deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) + deleted_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=True + ) retention_days: Mapped[int] = mapped_column(nullable=True) # Whether the file was generated by SST. (e.g. was it input or output) sst_generated: Mapped[bool] = mapped_column(nullable=False) # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False) - created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + created_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) # Within a given institution, there should be no duplicated file names. __table_args__ = (UniqueConstraint("name", "inst_id", name="file_name_inst_uc"),) @@ -335,32 +409,44 @@ class BatchTable(Base): """The batch table""" __tablename__ = "batch" - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # Set the parent foreign key to link to the institution table. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) + ) inst: Mapped["InstTable"] = relationship(back_populates="batches") files: Mapped[Set["FileTable"]] = relationship( secondary=association_table, back_populates="batches" ) - name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) - created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True)) + name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) + created_by: Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True)) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the batch is ready for use. completed: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) - created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + deleted_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=True + ) + created_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. - updated_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + updated_by: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=True + ) # Within a given institution, there should be no duplicated batch names. __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) @@ -369,32 +455,44 @@ class ModelTable(Base): """The model table""" __tablename__ = "model" - id : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4) + id: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 + ) # Set the parent foreign key to link to the institution table. - inst_id : Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, - ) + ) inst: Mapped["InstTable"] = relationship(back_populates="models") jobs: Mapped[Set["JobTable"]] = relationship(back_populates="model") - name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) # What configuration of schemas are allowed (list of maps e.g. [PDP Course : 1 + PDP Cohort : 1, X_schema :1 + Y_schema: 2]) - schema_configs : Mapped[dict[str, str]] = mapped_column(JSON(), nullable=True) - created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + schema_configs: Mapped[dict[str, str]] = mapped_column(JSON(), nullable=True) + created_by: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=True + ) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the model has been approved and is ready for use. valid: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=True) - created_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), onupdate=func.now()) + deleted_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=True + ) + created_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), server_default=func.now() + ) + updated_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), onupdate=func.now() + ) # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. - version : Mapped[dict[str, str]] = mapped_column(Integer, default=0) + version: Mapped[dict[str, str]] = mapped_column(Integer, default=0) # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) @@ -404,29 +502,39 @@ class JobTable(Base): """The job table""" __tablename__ = "job" - id : Mapped[dict[str, str]] = mapped_column(BigInteger, primary_key=True) + id: Mapped[dict[str, str]] = mapped_column(BigInteger, primary_key=True) # Set the parent foreign key to link to the institution table. - model_id : Mapped[dict[str, str]] = mapped_column( + model_id: Mapped[dict[str, str]] = mapped_column( Uuid(as_uuid=True), ForeignKey("model.id", ondelete="CASCADE"), nullable=False, - ) + ) model: Mapped["ModelTable"] = relationship(back_populates="jobs") - created_by : Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=False) + created_by: Mapped[dict[str, str]] = mapped_column( + Uuid(as_uuid=True), nullable=False + ) # The time the deletion request was set. - triggered_at : Mapped[dict[str, str]] = mapped_column(DateTime(timezone=True), nullable=False) - batch_name : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + triggered_at: Mapped[dict[str, str]] = mapped_column( + DateTime(timezone=True), nullable=False + ) + batch_name: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) # The following will be empty if not completed or if job errored out. Getting additional details will require a call to the Databricks table. - output_filename : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + output_filename: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) # Whether the file was approved. output_valid: Mapped[bool] = mapped_column(nullable=True, default=False) - err_msg : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=True) + err_msg: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=True + ) completed: Mapped[bool] = mapped_column(nullable=True) -class DocType(enum.Enum): +class DocType(enum.Enum): base = "base" extension = "extension" @@ -452,7 +560,9 @@ class SchemaRegistryTable(Base): ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True ) is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - version_label : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + version_label: Mapped[dict[str, str]] = mapped_column( + String(VAR_CHAR_STANDARD_LENGTH), nullable=False + ) extends_schema_id: Mapped[int | None] = mapped_column( Integer, ForeignKey( @@ -460,11 +570,13 @@ class SchemaRegistryTable(Base): ), nullable=True, ) - json_doc: Mapped[dict] = mapped_column(MutableDict.as_mutable(JSON()), nullable=False) + json_doc: Mapped[dict] = mapped_column( + MutableDict.as_mutable(JSON()), nullable=False + ) is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - created_at : Mapped[dict[str, str]] = mapped_column( + created_at: Mapped[dict[str, str]] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False - ) + ) # ---------------- Relationships ---------------- inst: Mapped["InstTable | None"] = relationship( From 13deb12ed3d3e41256e8c8c680ac268b31f1e054 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 19:59:58 -0400 Subject: [PATCH 33/36] trying to fix type check --- src/webapp/database.py | 170 +++++++++++++++++------------------------ 1 file changed, 70 insertions(+), 100 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 31a825ee..ed9bc49a 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -131,7 +131,7 @@ class InstTable(Base): all other tables except for AccountHistory and JobTable.""" __tablename__ = "inst" - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) @@ -148,7 +148,7 @@ class InstTable(Base): "SchemaRegistryTable", back_populates="inst", cascade="all, delete-orphan" ) - name: Mapped[dict[str, str]] = mapped_column( + name: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True ) # If retention unset, the Datakind default is used. File-level retentions overrides @@ -160,23 +160,17 @@ class InstTable(Base): MutableDict.as_mutable(JSON()) ) # Schemas that are allowed for validation. - schemas: Mapped[dict[str, str]] = mapped_column(MutableList.as_mutable(JSON())) - state: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_LENGTH), nullable=True - ) + schemas: Mapped[list[str]] = mapped_column(MutableList.as_mutable(JSON())) + state: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # Only populated for PDP schools. - pdp_id: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_LENGTH), nullable=True - ) - created_at: Mapped[dict[str, str]] = mapped_column( + pdp_id: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) + created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() ) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) - created_by: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=True - ) + created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) # Within the institutions, the set of name + state should be unique __table_args__ = (UniqueConstraint("name", "state", name="inst_name_state_uc"),) @@ -186,35 +180,31 @@ class ApiKeyTable(Base): """API KEYS should match the format generated by `openssl rand -hex 32`""" __tablename__ = "apikey" - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) # A hash of the key_value, so the user must store the generated key_value secretly. - hashed_key_value: Mapped[dict[str, str]] = mapped_column( + hashed_key_value: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True ) # Set the foreign key to link to the institution table. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="apikeys") - created_by: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=False - ) - notes: Mapped[dict[str, str]] = mapped_column( + created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=False) + notes: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) # Whether this key allows changing the enduser. ONLY SET FOR THE FRONTEND KEY. Can only be set when the API key has DATAKINDER access type as this allows Datakinder level endusers. allows_enduser: Mapped[bool] = mapped_column(nullable=True) - access_type: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_LENGTH), nullable=False - ) + access_type: Mapped[str] = mapped_column(String(VAR_CHAR_LENGTH), nullable=False) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) # API key must be valid and not deleted. @@ -233,7 +223,7 @@ class AccountTable(Base): The user accounts table""" __tablename__ = "users" # Name to be compliant with Laravel. - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) @@ -243,53 +233,49 @@ class AccountTable(Base): ) # Set the foreign key to link to the institution table. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="accounts") - name: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False - ) - email: Mapped[dict[str, str]] = mapped_column( + name: Mapped[str] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + email: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False, unique=True ) - google_id: Mapped[dict[str, str]] = mapped_column( + google_id: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) - azure_id: Mapped[dict[str, str]] = mapped_column( + azure_id: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) - email_verified_at: Mapped[dict[str, str]] = mapped_column( + email_verified_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=True ) - password: Mapped[dict[str, str]] = mapped_column( + password: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False ) - two_factor_secret: Mapped[dict[str, str]] = mapped_column(Text, nullable=True) - two_factor_recovery_codes: Mapped[dict[str, str]] = mapped_column( - Text, nullable=True - ) - two_factor_confirmed_at: Mapped[dict[str, str]] = mapped_column( + two_factor_secret: Mapped[str | None] = mapped_column(Text, nullable=True) + two_factor_recovery_codes: Mapped[str | None] = mapped_column(Text, nullable=True) + two_factor_confirmed_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=True ) - remember_token: Mapped[dict[str, str]] = mapped_column( + remember_token: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) # Required for team integration with laravel - current_team_id: Mapped[dict[str, str]] = mapped_column( + current_team_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), nullable=True ) - access_type: Mapped[dict[str, str]] = mapped_column( + access_type: Mapped[str | None] = mapped_column( String(VAR_CHAR_LENGTH), nullable=True ) # profile_photo_path : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) @@ -298,15 +284,15 @@ class AccountHistoryTable(Base): """The user history table""" __tablename__ = "account_history" - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[int] = mapped_column( Integer, primary_key=True ) # Auto-increment should be default - timestamp: Mapped[dict[str, str]] = mapped_column( + timestamp: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) # Set the parent foreign key to link to the users table. - account_id: Mapped[dict[str, str]] = mapped_column( + account_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("users.id", ondelete="CASCADE"), nullable=False, @@ -314,19 +300,17 @@ class AccountHistoryTable(Base): account: Mapped["AccountTable"] = relationship(back_populates="account_histories") # This field is nullable if the action was taken by a Datakinder. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=True, ) inst: Mapped["InstTable"] = relationship(back_populates="account_histories") - action: Mapped[dict[str, str]] = mapped_column( + action: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False ) - resource_id: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=False - ) + resource_id: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=False) # An intermediary association table allows bi-directional many-to-many between files and batches. @@ -352,17 +336,15 @@ class FileTable(Base): """The file table""" __tablename__ = "file" - name: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False - ) - id: Mapped[dict[str, str]] = mapped_column( + name: Mapped[str] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) batches: Mapped[Set["BatchTable"]] = relationship( secondary=association_table, back_populates="files" ) # Set the parent foreign key to link to the institution table. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, @@ -371,13 +353,11 @@ class FileTable(Base): # The size to the nearest mb. # size_mb: Mapped[int] = mapped_column(nullable=False) # Who uploaded the file. For SST generated files, this field would be null. - uploader: Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True), nullable=True) + uploader: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) # Can be PDP_SFTP, MANUAL_UPLOAD etc. May be empty for generated files. - source: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_LENGTH), nullable=True - ) + source: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # The schema type(s) of this file. - schemas: Mapped[dict[str, str]] = mapped_column( + schemas: Mapped[list[str]] = mapped_column( MutableList.as_mutable((JSON())), nullable=False ) # If null, the following is non-deleted. @@ -386,7 +366,7 @@ class FileTable(Base): # intents and purposes is no longer accessible by the app. deleted: Mapped[bool] = mapped_column(nullable=True) # When the deletion request was made - deleted_at: Mapped[dict[str, str]] = mapped_column( + deleted_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=True ) retention_days: Mapped[int] = mapped_column(nullable=True) @@ -394,10 +374,10 @@ class FileTable(Base): sst_generated: Mapped[bool] = mapped_column(nullable=False) # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False) - created_at: Mapped[dict[str, str]] = mapped_column( + created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() ) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) @@ -409,12 +389,12 @@ class BatchTable(Base): """The batch table""" __tablename__ = "batch" - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) # Set the parent foreign key to link to the institution table. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, @@ -425,28 +405,24 @@ class BatchTable(Base): secondary=association_table, back_populates="batches" ) - name: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False - ) - created_by: Mapped[dict[str, str]] = mapped_column(Uuid(as_uuid=True)) + name: Mapped[str] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) + created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True)) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the batch is ready for use. completed: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at: Mapped[dict[str, str]] = mapped_column( + deleted_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=True ) - created_at: Mapped[dict[str, str]] = mapped_column( + created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() ) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. - updated_by: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=True - ) + updated_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) # Within a given institution, there should be no duplicated batch names. __table_args__ = (UniqueConstraint("name", "inst_id", name="batch_name_inst_uc"),) @@ -455,12 +431,12 @@ class ModelTable(Base): """The model table""" __tablename__ = "model" - id: Mapped[dict[str, str]] = mapped_column( + id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), primary_key=True, default=uuid.uuid4 ) # Set the parent foreign key to link to the institution table. - inst_id: Mapped[dict[str, str]] = mapped_column( + inst_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("inst.id", ondelete="CASCADE"), nullable=False, @@ -469,30 +445,26 @@ class ModelTable(Base): jobs: Mapped[Set["JobTable"]] = relationship(back_populates="model") - name: Mapped[dict[str, str]] = mapped_column( - String(VAR_CHAR_STANDARD_LENGTH), nullable=False - ) + name: Mapped[str] = mapped_column(String(VAR_CHAR_STANDARD_LENGTH), nullable=False) # What configuration of schemas are allowed (list of maps e.g. [PDP Course : 1 + PDP Cohort : 1, X_schema :1 + Y_schema: 2]) schema_configs: Mapped[dict[str, str]] = mapped_column(JSON(), nullable=True) - created_by: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=True - ) + created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) # If null, the following is non-deleted. deleted: Mapped[bool] = mapped_column(nullable=True) # If true, the model has been approved and is ready for use. valid: Mapped[bool] = mapped_column(nullable=True) # The time the deletion request was set. - deleted_at: Mapped[dict[str, str]] = mapped_column( + deleted_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=True ) - created_at: Mapped[dict[str, str]] = mapped_column( + created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() ) - updated_at: Mapped[dict[str, str]] = mapped_column( + updated_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), onupdate=func.now() ) # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. - version: Mapped[dict[str, str]] = mapped_column(Integer, default=0) + version: Mapped[int] = mapped_column(Integer, default=0) # Within a given institution, there should be no duplicated model names. __table_args__ = (UniqueConstraint("name", "inst_id", name="model_name_inst_uc"),) @@ -502,33 +474,31 @@ class JobTable(Base): """The job table""" __tablename__ = "job" - id: Mapped[dict[str, str]] = mapped_column(BigInteger, primary_key=True) + id: Mapped[int] = mapped_column(BigInteger, primary_key=True) # Set the parent foreign key to link to the institution table. - model_id: Mapped[dict[str, str]] = mapped_column( + model_id: Mapped[uuid.UUID] = mapped_column( Uuid(as_uuid=True), ForeignKey("model.id", ondelete="CASCADE"), nullable=False, ) model: Mapped["ModelTable"] = relationship(back_populates="jobs") - created_by: Mapped[dict[str, str]] = mapped_column( - Uuid(as_uuid=True), nullable=False - ) + created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=False) # The time the deletion request was set. - triggered_at: Mapped[dict[str, str]] = mapped_column( + triggered_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), nullable=False ) - batch_name: Mapped[dict[str, str]] = mapped_column( + batch_name: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False ) # The following will be empty if not completed or if job errored out. Getting additional details will require a call to the Databricks table. - output_filename: Mapped[dict[str, str]] = mapped_column( + output_filename: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) # Whether the file was approved. output_valid: Mapped[bool] = mapped_column(nullable=True, default=False) - err_msg: Mapped[dict[str, str]] = mapped_column( + err_msg: Mapped[str | None] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=True ) completed: Mapped[bool] = mapped_column(nullable=True) @@ -560,7 +530,7 @@ class SchemaRegistryTable(Base): ForeignKey("inst.id", ondelete="RESTRICT", onupdate="CASCADE"), nullable=True ) is_pdp: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - version_label: Mapped[dict[str, str]] = mapped_column( + version_label: Mapped[str] = mapped_column( String(VAR_CHAR_STANDARD_LENGTH), nullable=False ) extends_schema_id: Mapped[int | None] = mapped_column( @@ -574,7 +544,7 @@ class SchemaRegistryTable(Base): MutableDict.as_mutable(JSON()), nullable=False ) is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) - created_at: Mapped[dict[str, str]] = mapped_column( + created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now(), nullable=False ) From 0780b142a5cb933dca05b9a1757f3ff2db8ce4f6 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 20:06:00 -0400 Subject: [PATCH 34/36] correcting nullable=False which is causing unit test issues --- src/webapp/database.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index ed9bc49a..ce9e3597 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -358,7 +358,9 @@ class FileTable(Base): source: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # The schema type(s) of this file. schemas: Mapped[list[str]] = mapped_column( - MutableList.as_mutable((JSON())), nullable=False + MutableList.as_mutable((JSON())), + nullable=False, + default=list, ) # If null, the following is non-deleted. # The deleted field indicates whether there is a pending deletion request on the data. @@ -371,9 +373,9 @@ class FileTable(Base): ) retention_days: Mapped[int] = mapped_column(nullable=True) # Whether the file was generated by SST. (e.g. was it input or output) - sst_generated: Mapped[bool] = mapped_column(nullable=False) + sst_generated: Mapped[bool] = mapped_column(nullable=False, default=False) # Whether the file was approved (in the case of output) or valid for input. - valid: Mapped[bool] = mapped_column(nullable=False) + valid: Mapped[bool] = mapped_column(nullable=False, default=False) created_at: Mapped[datetime.datetime] = mapped_column( DateTime(timezone=True), server_default=func.now() ) From 047e14ae02a82f2c253419afa040c184b8ec13df Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 20:15:10 -0400 Subject: [PATCH 35/36] fixing nullable=False and default for unit tests --- src/webapp/database.py | 77 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 15 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index ce9e3597..f460841d 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -157,18 +157,28 @@ class InstTable(Base): # The emails for which self sign up will be allowed for this institution and will automatically be assigned to this institution. # The dict structure is {email: AccessType string} allowed_emails: Mapped[dict[str, str]] = mapped_column( - MutableDict.as_mutable(JSON()) + MutableDict.as_mutable(JSON()), + nullable=False, + default=dict, ) # Schemas that are allowed for validation. - schemas: Mapped[list[str]] = mapped_column(MutableList.as_mutable(JSON())) + schemas: Mapped[list[str]] = mapped_column( + MutableList.as_mutable(JSON()), nullable=False, default=list + ) state: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) # Only populated for PDP schools. pdp_id: Mapped[str | None] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) created_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now() + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) created_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) @@ -203,9 +213,17 @@ class ApiKeyTable(Base): allows_enduser: Mapped[bool] = mapped_column(nullable=True) access_type: Mapped[str] = mapped_column(String(VAR_CHAR_LENGTH), nullable=False) - created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) + created_at = mapped_column( + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), + ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) # API key must be valid and not deleted. deleted: Mapped[bool] = mapped_column(nullable=True) @@ -274,9 +292,17 @@ class AccountTable(Base): String(VAR_CHAR_LENGTH), nullable=True ) # profile_photo_path : Mapped[dict[str, str]] = mapped_column(String(VAR_CHAR_LENGTH), nullable=True) - created_at = mapped_column(DateTime(timezone=True), server_default=func.now()) + created_at = mapped_column( + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), + ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) @@ -377,10 +403,16 @@ class FileTable(Base): # Whether the file was approved (in the case of output) or valid for input. valid: Mapped[bool] = mapped_column(nullable=False, default=False) created_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now() + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) # Within a given institution, there should be no duplicated file names. @@ -418,10 +450,16 @@ class BatchTable(Base): DateTime(timezone=True), nullable=True ) created_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now() + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) # If a batch is deleted, the uuid of the user in the updated_by section is the deleter. updated_by: Mapped[uuid.UUID] = mapped_column(Uuid(as_uuid=True), nullable=True) @@ -460,10 +498,16 @@ class ModelTable(Base): DateTime(timezone=True), nullable=True ) created_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now() + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), ) updated_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), onupdate=func.now() + DateTime(timezone=True), + onupdate=func.now(), + nullable=False, + default=func.now(), ) # version is unused. version is not currently supported. The webapp only knows about the name of the model and any usages of a model will only use the live version. version: Mapped[int] = mapped_column(Integer, default=0) @@ -547,7 +591,10 @@ class SchemaRegistryTable(Base): ) is_active: Mapped[bool] = mapped_column(Boolean, nullable=False, default=False) created_at: Mapped[datetime.datetime] = mapped_column( - DateTime(timezone=True), server_default=func.now(), nullable=False + DateTime(timezone=True), + server_default=func.now(), + nullable=False, + default=func.now(), ) # ---------------- Relationships ---------------- From a03adffad0f3bee9173bd6ce411de05ddac1e694 Mon Sep 17 00:00:00 2001 From: Vishakh Pillai Date: Thu, 24 Jul 2025 20:17:43 -0400 Subject: [PATCH 36/36] lint --- src/webapp/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index f460841d..7fe974b0 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -6,7 +6,6 @@ from contextvars import ContextVar import enum import sqlalchemy -from sqlalchemy.orm import DeclarativeBase, mapped_column, Mapped from sqlalchemy.ext.mutable import MutableDict, MutableList from sqlalchemy import ( Column, @@ -26,6 +25,7 @@ event, ) from sqlalchemy.orm import ( + DeclarativeBase, sessionmaker, Session, relationship,