33import uuid
44from datetime import datetime , date
55
6- from typing import Annotated , Any , Dict
6+ from typing import Annotated , Any , Dict , List
77from pydantic import BaseModel
88from fastapi import APIRouter , Depends , HTTPException , status , Response
99from sqlalchemy import and_ , or_
1010from sqlalchemy .orm import Session
1111from sqlalchemy .future import select
12+ import os
13+ import logging
14+ from sqlalchemy .exc import IntegrityError
1215
1316from ..utilities import (
1417 has_access_to_inst_or_err ,
2023 get_current_active_user ,
2124 DataSource ,
2225 get_external_bucket_name ,
23- SchemaType ,
2426 decode_url_piece ,
2527)
2628
2931 local_session ,
3032 BatchTable ,
3133 FileTable ,
32- InstTable ,
3334)
3435
3536from ..gcsdbutils import update_db_from_bucket
3637
3738from ..gcsutil import StorageControl
3839
40+ # Configure the root log record format and create a DEBUG-level logger for this module.
41+ logging .basicConfig (format = "%(asctime)s [%(levelname)s]: %(message)s" )
42+ logger = logging .getLogger (__name__ )
43+ logger .setLevel (logging .DEBUG )
44+
3945router = APIRouter (
4046 prefix = "/institutions" ,
4147 tags = ["data" ],
@@ -91,7 +97,7 @@ class DataInfo(BaseModel):
9197 name : str
9298 data_id : str
9399 # The batch(es) that this data is present in.
94- batch_ids : set [str ] = {}
100+ batch_ids : set [str ] = set ()
95101 inst_id : str
96102 # Size to the nearest MB.
97103 # size_mb: int
@@ -123,7 +129,7 @@ class ValidationResult(BaseModel):
123129 # Must be unique within an institution to avoid confusion.
124130 name : str
125131 inst_id : str
126- file_types : set [ SchemaType ]
132+ file_types : List [ str ]
127133 source : str
128134
129135
@@ -838,6 +844,33 @@ def download_url_inst_file(
838844 )
839845
840846
def infer_models_from_filename(file_path: str, institution_id: str) -> List[str]:
    """Guess which data models a file contains from keywords in its name.

    Only the final path component is inspected, and matching is
    case-insensitive.

    Args:
        file_path: Path (or bare name) of the uploaded file.
        institution_id: Institution identifier. The literal value "pdp"
            makes a "student" file also count as SEMESTER data.

    Returns:
        The matched model names (COURSE, STUDENT, SEMESTER) in sorted order,
        or ["UNKNOWN"] when no keyword matches. The no-match case is logged
        as an error rather than raised, so callers always get a list back.
    """
    name = os.path.basename(file_path).lower()

    inferred: set[str] = set()
    if "course" in name:
        inferred.add("COURSE")
    if "student" in name:
        inferred.add("STUDENT")
        # NOTE(review): this nesting was reconstructed from a diff whose
        # indentation was flattened -- confirm the "pdp" rule is meant to
        # apply only to "student" files and not to every file.
        if institution_id == "pdp":
            inferred.add("SEMESTER")
    if "semester" in name:
        inferred.add("SEMESTER")
    if "cohort" in name:
        # A cohort file is treated as carrying both student and semester data.
        inferred.update(("STUDENT", "SEMESTER"))

    if not inferred:
        # Log a plain message with lazy %-args instead of constructing an
        # exception object that is never raised.
        logging.getLogger(__name__).error(
            "Could not infer model(s) from file name: %s, filenames should be "
            "descriptive of the kind of data it contains e.g. course, cohort",
            name,
        )
        inferred.add("UNKNOWN")

    return sorted(inferred)
872+
873+
841874def validation_helper (
842875 source_str : str ,
843876 inst_id : str ,
@@ -854,51 +887,76 @@ def validation_helper(
854887 detail = "File name can't contain '/'." ,
855888 )
856889 local_session .set (sql_session )
857- inst_query_result = (
858- local_session .get ()
859- .execute (select (InstTable ).where (InstTable .id == str_to_uuid (inst_id )))
860- .all ()
861- )
862- if len (inst_query_result ) == 0 :
863- raise HTTPException (
864- status_code = status .HTTP_404_NOT_FOUND ,
865- detail = "Institution not found." ,
866- )
867- if len (inst_query_result ) > 1 :
868- raise HTTPException (
869- status_code = status .HTTP_500_INTERNAL_SERVER_ERROR ,
870- detail = "Institution duplicates found." ,
871- )
872- allowed_schemas = set ()
873- if inst_query_result [0 ][0 ].schemas :
874- allowed_schemas = set (inst_query_result [0 ][0 ].schemas )
875890
876- inferred_schemas = set ()
891+ allowed_schemas = None
892+ if not allowed_schemas :
893+ allowed_schemas = infer_models_from_filename (file_name , "pdp" )
894+
895+ inferred_schemas : list [str ] = []
896+
877897 try :
878898 inferred_schemas = storage_control .validate_file (
879- get_external_bucket_name (inst_id ), file_name , allowed_schemas
899+ get_external_bucket_name (inst_id ),
900+ file_name ,
901+ allowed_schemas ,
902+ )
903+ logging .debug (
904+ f"!!!!!!!!!!Inferred Schemas was successful { list (inferred_schemas )} "
880905 )
881906 except Exception as e :
907+ logging .debug (f"!!!!!!!!!!Inferred Schemas FAILED { e } " )
882908 raise HTTPException (
883909 status_code = status .HTTP_400_BAD_REQUEST ,
884910 detail = "File type is not valid and/or not accepted by this institution: "
885911 + str (e ),
886912 ) from e
887- new_file_record = FileTable (
888- name = file_name ,
889- inst_id = str_to_uuid (inst_id ),
890- uploader = str_to_uuid (current_user .user_id ),
891- source = source_str ,
892- sst_generated = False ,
893- schemas = list (inferred_schemas ),
894- valid = True ,
913+
914+ existing_file = (
915+ local_session .get ()
916+ .query (FileTable )
917+ .filter_by (
918+ name = file_name ,
919+ inst_id = str_to_uuid (inst_id ),
920+ )
921+ .first ()
895922 )
896- local_session .get ().add (new_file_record )
923+
924+ if existing_file :
925+ logging .info (f"File '{ file_name } ' already exists for institution { inst_id } ." )
926+ db_status = f"File '{ file_name } ' already exists for institution { inst_id } ."
927+ else :
928+ try :
929+ new_file_record = FileTable (
930+ name = file_name ,
931+ inst_id = str_to_uuid (inst_id ),
932+ uploader = str_to_uuid (current_user .user_id ),
933+ source = source_str ,
934+ sst_generated = False ,
935+ schemas = list (inferred_schemas ),
936+ valid = True ,
937+ )
938+ local_session .get ().add (new_file_record )
939+ local_session .get ().flush ()
940+ logging .info (f"File record inserted for '{ file_name } '" )
941+ db_status = f"File record inserted for '{ file_name } '"
942+ except IntegrityError as e :
943+ local_session .get ().rollback ()
944+ logging .warning (f"IntegrityError: { e } " )
945+ db_status = "Already exists"
946+ except Exception as e :
947+ local_session .get ().rollback ()
948+ logging .error (f"Unexpected DB error: { e } " )
949+ raise HTTPException (
950+ status_code = 500 ,
951+ detail = f"Unexpected database error while inserting file record: { e } " ,
952+ )
953+
897954 return {
898955 "name" : file_name ,
899956 "inst_id" : inst_id ,
900- "file_types" : inferred_schemas ,
957+ "file_types" : list ( inferred_schemas ) ,
901958 "source" : source_str ,
959+ "status" : db_status ,
902960 }
903961
904962
0 commit comments