diff --git a/src/webapp/database.py b/src/webapp/database.py index 9365cc52..f9c56fea 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -59,6 +59,10 @@ class Base(DeclarativeBase): LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) +# Test institution - same ID as DEV USC Beaufort for testing +TEST_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") +TEST_BATCH_UUID = uuid.UUID("3182f472-e079-4678-a0a1-9ca5ead6c49a") + @event.listens_for(Mapper, "before_insert") @event.listens_for(Mapper, "before_update") @@ -106,6 +110,19 @@ def init_db(env: str) -> None: updated_at=DATETIME_TESTING, ) ) + # USC Beaufort - matches DEV for testing + session.merge( + InstTable( + id=TEST_INST_UUID, + name="University of South Carolina - Beaufort", + state="SC", + pdp_id="345000", + schemas=["COURSE", "STUDENT"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + created_by=LOCAL_USER_UUID, + ) + ) session.merge( ApiKeyTable( id=LOCAL_APIKEY_UUID, @@ -118,6 +135,94 @@ def init_db(env: str) -> None: valid=True, ) ) + # Create test files and batches for LOCAL environment + if env == "LOCAL": + # Create test files + test_file_1 = FileTable( + id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562a"), + inst_id=LOCAL_INST_UUID, + name="test_course_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["COURSE"], # Using string literal to avoid circular import + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + test_file_2 = FileTable( + id=uuid.UUID("cb02d06c-2a59-486a-9bdd-d394a4fcb833"), + inst_id=LOCAL_INST_UUID, + name="test_cohort_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=[ + "STUDENT" + ], # Using string literal to avoid circular import + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Create test batch for LOCAL_INST_UUID (using a 
different ID) + test_batch = BatchTable( + id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562b"), + inst_id=LOCAL_INST_UUID, + name="test_batch_1", + created_by=LOCAL_USER_UUID, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Associate files with batch + test_batch.files.add(test_file_1) + test_batch.files.add(test_file_2) + session.merge(test_file_1) + session.merge(test_file_2) + session.merge(test_batch) + + # Create test files for EDA test institution (TEST_INST_UUID) + # Real files from DEV batch 3182f472e0794678a0a19ca5ead6c49a + test_file_student = FileTable( + id=uuid.UUID("f1d7c0a4-5211-459f-a79a-a1c2752f45c5"), + inst_id=TEST_INST_UUID, + name="1762967705679_AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv", + source="MANUAL_UPLOAD", + uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), + sst_generated=False, + valid=True, + schemas=["STUDENT"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + test_file_course = FileTable( + id=uuid.UUID("d19d0129-96de-464c-98e9-694996965c7b"), + inst_id=TEST_INST_UUID, + name="1762967705683_AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv", + source="MANUAL_UPLOAD", + uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), + sst_generated=False, + valid=True, + schemas=["COURSE"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + + # Test batch - matches DEV USC Beaufort + test_batch = BatchTable( + id=TEST_BATCH_UUID, + inst_id=TEST_INST_UUID, + name="Batch_2025-11-12_1762967767400", + completed=True, + created_by=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Associate files with batch + test_batch.files.add(test_file_student) + test_batch.files.add(test_file_course) + session.merge(test_file_student) + session.merge(test_file_course) + session.merge(test_batch) session.commit() except Exception as e: session.rollback() diff --git 
def read_csv_as_dataframe(self, bucket_name: str, file_name: str) -> Any:
    """Read a CSV object from GCS and return it as a pandas DataFrame.

    Args:
        bucket_name: GCS bucket name.
        file_name: Full blob path (e.g., 'validated/filename.csv').

    Returns:
        pandas DataFrame parsed from the blob's text content.

    Raises:
        ValueError: If the bucket or the blob does not exist.
    """
    # Kept function-local, as in the original implementation.
    import pandas as pd

    storage_client = storage.Client()

    # lookup_bucket() returns None for a missing bucket; get_bucket() would
    # raise the GCS NotFound error instead, which breaks the ValueError
    # contract documented above.
    bucket = storage_client.lookup_bucket(bucket_name)
    if bucket is None:
        raise ValueError(f"Bucket not found: {bucket_name}")

    blob = bucket.blob(file_name)
    if not blob.exists():
        raise ValueError(f"File not found: {file_name}")

    # Text mode: pandas receives decoded str lines from the blob reader.
    with blob.open("r") as fh:
        return pd.read_csv(fh)
+class GpaSeriesData(BaseModel): + """GPA data series for a chart.""" + + name: str + data: List[float] + + +class GpaChartData(BaseModel): + """GPA chart data with cohort years and series.""" + + cohort_years: List[str] + series: List[GpaSeriesData] + + +class TermData(BaseModel): + """Term-based data (fall, winter, spring, summer).""" + + fall: List[int] + winter: List[int] + spring: List[int] + summer: List[int] + + +class DegreeTypeData(BaseModel): + """Degree type data for donut chart.""" + + value: int + name: str + color: str + + +class StackedBarSeries(BaseModel): + """Series data for stacked bar charts.""" + + name: str + type: str = "bar" + stack: str + data: List[int] + color: str + + +class EdaDataResponse(BaseModel): + """Complete EDA data response matching frontend expectations.""" + + summary_stats: Optional[SummaryStats] = None + gpa_by_enrollment_type: Optional[GpaChartData] = None + gpa_by_enrollment_intensity: Optional[GpaChartData] = None + students_by_cohort_term: Optional[TermData] = None + course_enrollments: Optional[TermData] = None + degree_types: Optional[List[DegreeTypeData]] = None + enrollment_type_by_intensity: Dict[str, Any] # Categories and series + pell_recipient_by_first_gen: Dict[str, Any] # Categories and series + student_age_by_gender: Dict[str, Any] # Categories and series + race_by_pell_status: Dict[str, Any] # Categories and series + + +def read_batch_files_as_dataframes( + inst_id: str, + batch_files: Any, # Set[FileTable] + storage_control: StorageControl, +) -> Dict[str, pd.DataFrame]: + """Read CSV files from a batch and return as DataFrames. 
+ + Args: + inst_id: Institution ID + batch_files: Set of FileTable objects from the batch + storage_control: StorageControl instance for GCS access + + Returns: + Dictionary mapping schema_type -> pandas.DataFrame + + Raises: + HTTPException: If no valid files found + """ + bucket_name = get_external_bucket_name(inst_id) + + # Temporary storage: file_record -> DataFrame + loaded_files: Dict[Any, pd.DataFrame] = {} + missing_files: List[str] = [] + + for file_record in batch_files: + file_name = file_record.name + + # Skip SST-generated output files (only process input files) + if file_record.sst_generated: + logger.debug(f"Skipping SST-generated file: {file_name}") + continue + + df = None + + # Read from GCS + try: + blob_path = f"validated/{file_name}" + df = storage_control.read_csv_as_dataframe(bucket_name, blob_path) + logger.info(f"Loaded {file_name} from GCS ({len(df)} rows)") + except ValueError as e: + logger.warning(f"File not found in GCS: {e}") + missing_files.append(file_name) + except Exception as e: + logger.error(f"Failed to read from GCS: {e}") + missing_files.append(file_name) + + if df is not None: + loaded_files[file_record] = df + + if not loaded_files: + error_msg = f"No valid input files found in batch (checked GCS: {bucket_name}/validated/)" + if missing_files: + error_msg += f". Expected files not found: {', '.join(missing_files[:5])}" + if len(missing_files) > 5: + error_msg += f" (and {len(missing_files) - 5} more)" + error_msg += ( + ". Files must be uploaded and validated before they can be used for EDA." 
def calculate_gpa_series(
    df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str
) -> List[float]:
    """Calculate the mean year-1 GPA of one category for each cohort year.

    Args:
        df: DataFrame (cohort data) with "cohort" and "gpa_group_year_1"
            columns plus the column named by grouping_col.
        cohort_years: Cohort labels, in the order the values should be returned.
        grouping_col: Column to filter by (e.g., 'enrollment_type').
        category_value: Value of grouping_col to keep (e.g., 'First-Time').

    Returns:
        One GPA per entry of cohort_years, rounded to one decimal. A cohort
        with no rows for the category, or with no numeric GPA at all, is
        reported as 0.0.
    """
    in_category = df[df[grouping_col] == category_value]

    # Non-numeric GPA cells become NaN and are ignored by mean().
    gpa = pd.to_numeric(in_category["gpa_group_year_1"], errors="coerce")
    mean_by_cohort = gpa.groupby(in_category["cohort"]).mean()

    # reindex() yields NaN for cohorts absent from this category, and mean()
    # yields NaN for cohorts whose GPAs were all non-numeric. Both collapse to
    # 0.0 so the chart series never carries NaN.
    aligned = mean_by_cohort.reindex(cohort_years).fillna(0.0).round(1)
    return [float(value) for value in aligned.tolist()]
def get_term_counts(
    df: pd.DataFrame, cohort_years: List[str], term_name: str
) -> List[int]:
    """Count rows for one term in each cohort year.

    Args:
        df: DataFrame (cohort or course data) with "cohort" and
            "cohort_term" columns.
        cohort_years: Cohort labels, in the order the counts should be returned.
        term_name: Value of "cohort_term" to count (e.g., 'FALL', 'WINTER').

    Returns:
        One count per entry of cohort_years; 0 where the term has no rows.
    """
    rows_in_term = df[df["cohort_term"] == term_name]
    per_cohort = rows_in_term.groupby("cohort").size().reindex(cohort_years, fill_value=0)
    return [int(count) for count in per_cohort.tolist()]


# Hex colors used by the EDA charts; each chart cycles through a prefix of it.
_EDA_PALETTE = ["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"]
# Raw flag values and the labels they are shown as.
_YES_NO_LABELS = {"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}
_AGE_GROUPS = ["20 or younger", "20 - 24", "Older than 24"]
_YOUNGEST_AGE_TERMS = ("20 or younger", "20 or under", "under 20", "<=20")
_MIDDLE_AGE_TERMS = ("20-24", "20 to 24", "20 - 24")


def _age_bucket(raw_age: Any) -> str:
    """Map a raw age cell (number or descriptive text) to one of _AGE_GROUPS."""
    text = str(raw_age).lower()
    is_number = isinstance(raw_age, (int, float))
    # NOTE(review): a missing age is counted as "20 or younger", exactly as the
    # original expression did — confirm that this is the intended bucket.
    if (
        pd.isna(raw_age)
        or any(term in text for term in _YOUNGEST_AGE_TERMS)
        or (is_number and raw_age <= 20)
    ):
        return "20 or younger"
    if any(term in text for term in _MIDDLE_AGE_TERMS) or (
        is_number and 20 < raw_age <= 24
    ):
        return "20 - 24"
    return "Older than 24"


def _stacked_bar_series(
    name: Any, stack: str, values: pd.Series, categories: List[Any], color: str
) -> Dict[str, Any]:
    """Build one stacked-bar series: occurrences of each category in values."""
    return {
        "name": name,
        "type": "bar",
        "stack": stack,
        # value_counts() ignores NaN, like the groupby(...).size() it replaces.
        "data": values.value_counts().reindex(categories, fill_value=0).tolist(),
        "color": color,
    }


def _term_data(df: pd.DataFrame, cohort_years: List[str]) -> Any:
    """Build the per-term counts (fall/winter/spring/summer) for one frame."""
    return TermData(
        fall=get_term_counts(df, cohort_years, "FALL"),
        winter=get_term_counts(df, cohort_years, "WINTER"),
        spring=get_term_counts(df, cohort_years, "SPRING"),
        summer=get_term_counts(df, cohort_years, "SUMMER"),
    )


@router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse)
def get_eda_data(
    inst_id: str,
    batch_id: str,
    current_user: Annotated[BaseUser, Depends(get_current_active_user)],
    sql_session: Annotated[Session, Depends(get_session)],
    storage_control: Annotated[StorageControl, Depends(StorageControl)],
) -> Any:
    """Returns EDA (Exploratory Data Analysis) data for a specific batch.

    Builds everything the EDA dashboard needs from the batch's files: summary
    statistics, GPA charts, per-term counts and demographic breakdowns. All
    STUDENT files of the batch are analysed together; COURSE files feed only
    the course-enrollment counts.

    Args:
        inst_id: Institution ID from the path.
        batch_id: Batch ID from the path.
        current_user: Authenticated user, checked for institution and data access.
        sql_session: Database session used to look up the batch.
        storage_control: Storage accessor used to read the batch's CSV files.

    Returns:
        EdaDataResponse. Results are memoised in EDA_CACHE per
        "{inst_id}:{batch_id}"; course_enrollments is None when the batch has
        no COURSE data.

    Raises:
        HTTPException: 404 when the batch does not exist for the institution
            or has no STUDENT data, 500 when the batch lookup is ambiguous.
            Access checks and file loading may raise their own errors.
    """
    has_access_to_inst_or_err(inst_id, current_user)
    has_full_data_access_or_err(current_user, "EDA data")
    local_session.set(sql_session)

    # Verify batch exists and belongs to institution
    batch_result = (
        local_session.get()
        .execute(
            select(BatchTable).where(
                and_(
                    BatchTable.id == str_to_uuid(batch_id),
                    BatchTable.inst_id == str_to_uuid(inst_id),
                )
            )
        )
        .all()
    )
    if not batch_result:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="Batch not found.",
        )
    if len(batch_result) > 1:
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="Batch duplicates found.",
        )
    batch_files = batch_result[0][0].files

    # The cache is consulted only after the access and ownership checks above.
    cache_key = f"{inst_id}:{batch_id}"
    cached_result = EDA_CACHE.get(cache_key)
    if cached_result is not None:
        logger.debug(f"EDA cache hit for {cache_key}")
        return cached_result
    logger.debug(f"EDA cache miss for {cache_key}, computing...")

    file_dataframes = read_batch_files_as_dataframes(
        inst_id, batch_files, storage_control
    )
    df_cohort = file_dataframes.get("STUDENT")
    df_course = file_dataframes.get("COURSE")
    if df_cohort is None:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail="No STUDENT schema files found in batch for EDA.",
        )

    # NaN labels are dropped: sorting NaN together with strings raises TypeError.
    cohort_years = sorted(df_cohort["cohort"].dropna().unique().tolist())
    enrollment_categories = sorted(
        df_cohort["enrollment_type"].dropna().unique().tolist()
    )
    gender_categories = sorted(df_cohort["gender"].dropna().unique().tolist())
    race_categories = sorted(df_cohort["race"].dropna().unique().tolist())

    # Normalised flag columns, computed once and filtered with boolean masks
    # rather than by interpolating data values into DataFrame.query() strings.
    pell = df_cohort["pell_status_first_year"].replace(_YES_NO_LABELS)
    pell_known = pell.isin(["Yes", "No"])
    pell_categories = sorted(pell[pell_known].unique().tolist())
    first_gen = df_cohort["first_gen"].fillna("Nan").replace(_YES_NO_LABELS)
    first_gen_values = sorted(first_gen.unique().tolist())
    intensity = df_cohort["enrollment_intensity_first_term"]

    if "student_age" in df_cohort.columns:
        raw_age = df_cohort["student_age"]
    elif "age" in df_cohort.columns:
        raw_age = df_cohort["age"]
    else:
        # Share the frame's index so the mask below lines up row for row.
        raw_age = pd.Series([None] * len(df_cohort), index=df_cohort.index)
    age_group = raw_age.apply(_age_bucket)

    credential = df_cohort["credential_type_sought_year_1"]
    credential_total = credential.count()

    result = EdaDataResponse(
        summary_stats=SummaryStats(
            total_students=f"{df_cohort['study_id'].nunique():,}",
            transfer_students=f"{(df_cohort['enrollment_type'] == 'Transfer-In').sum():,}",
            avg_year1_gpa_all_students=f"{pd.to_numeric(df_cohort['gpa_group_year_1'], errors='coerce').mean():.2f}",
        ),
        gpa_by_enrollment_type=GpaChartData(
            cohort_years=cohort_years,
            series=[
                GpaSeriesData(
                    name=label,
                    data=calculate_gpa_series(
                        df_cohort, cohort_years, "enrollment_type", value
                    ),
                )
                for label, value in (
                    ("First Time Student", "First-Time"),
                    ("Transfer Student", "Transfer-In"),
                )
            ],
        ),
        gpa_by_enrollment_intensity=GpaChartData(
            cohort_years=cohort_years,
            series=[
                GpaSeriesData(
                    name=label,
                    data=calculate_gpa_series(
                        df_cohort,
                        cohort_years,
                        "enrollment_intensity_first_term",
                        value,
                    ),
                )
                for label, value in (
                    ("Full Time Student", "Full-Time"),
                    ("Part Time Student", "Part-Time"),
                )
            ],
        ),
        students_by_cohort_term=_term_data(df_cohort, cohort_years),
        # A batch without COURSE files previously crashed here with a 500.
        course_enrollments=(
            _term_data(df_course, cohort_years) if df_course is not None else None
        ),
        degree_types=[
            DegreeTypeData(
                value=int(round(count / credential_total * 100)),
                name=str(degree_type),
                color=_EDA_PALETTE[i % 5],
            )
            for i, (degree_type, count) in enumerate(credential.value_counts().items())
        ],
        enrollment_type_by_intensity={
            "categories": enrollment_categories,
            "series": [
                _stacked_bar_series(
                    "Full Time",
                    "intensity",
                    df_cohort.loc[intensity == "Full-Time", "enrollment_type"],
                    enrollment_categories,
                    _EDA_PALETTE[0],
                ),
                _stacked_bar_series(
                    "Part Time",
                    "intensity",
                    df_cohort.loc[intensity == "Part-Time", "enrollment_type"],
                    enrollment_categories,
                    _EDA_PALETTE[1],
                ),
            ],
        },
        pell_recipient_by_first_gen={
            "categories": pell_categories,
            "series": [
                _stacked_bar_series(
                    first_gen_value,
                    "firstGen",
                    pell[(first_gen == first_gen_value) & pell_known],
                    pell_categories,
                    _EDA_PALETTE[i % 3],
                )
                for i, first_gen_value in enumerate(first_gen_values)
            ],
        },
        student_age_by_gender={
            "categories": gender_categories,
            "series": [
                _stacked_bar_series(
                    group,
                    "age",
                    df_cohort.loc[age_group == group, "gender"],
                    gender_categories,
                    _EDA_PALETTE[i % 3],
                )
                for i, group in enumerate(_AGE_GROUPS)
            ],
        },
        race_by_pell_status={
            "categories": race_categories,
            "series": [
                _stacked_bar_series(
                    pell_value,
                    "pell",
                    df_cohort.loc[pell == pell_value, "race"],
                    race_categories,
                    _EDA_PALETTE[i % 2],
                )
                for i, pell_value in enumerate(pell_categories)
            ],
        },
    )

    # Cache the result before returning
    EDA_CACHE[cache_key] = result
    logger.debug(f"EDA result cached for {cache_key}")

    return result
def test_get_eda_data_unauthorized(client: TestClient) -> None:
    """Test GET /institutions/{inst_id}/batch/{batch_id}/eda with unauthorized access."""
    response = client.get(
        "/institutions/"
        + uuid_to_str(UUID_INVALID)
        + "/batch/"
        + uuid_to_str(BATCH_UUID)
        + "/eda"
    )
    # str(response) is never the empty string, so the previous comparison
    # could not pass; check the status code instead.
    # NOTE(review): the exact code raised by the access check is not visible
    # here — narrow this to the single expected value once confirmed.
    assert response.status_code in (401, 403)
    assert (
        response.text
        == '{"detail":"Not authorized to read this institution\'s resources."}'
    )


def test_get_eda_data_batch_not_found(client: TestClient) -> None:
    """Test GET /institutions/{inst_id}/batch/{batch_id}/eda with non-existent batch."""
    fake_batch_uuid = uuid.UUID("00000000-0000-0000-0000-000000000000")
    response = client.get(
        "/institutions/"
        + uuid_to_str(USER_VALID_INST_UUID)
        + "/batch/"
        + uuid_to_str(fake_batch_uuid)
        + "/eda"
    )
    assert response.status_code == 404
    assert response.json()["detail"] == "Batch not found."
def test_get_eda_data_no_student_files(
    client: TestClient, session: sqlalchemy.orm.Session
) -> None:
    """Test GET /institutions/{inst_id}/batch/{batch_id}/eda when no batch file can be read.

    The batch holds a single COURSE file and storage reports it as missing, so
    the endpoint fails with "No valid input files found" before it reaches the
    "No STUDENT schema files found" check.
    """
    # Create a batch with only COURSE files
    batch_with_course = BatchTable(
        id=uuid.UUID("11111111-1111-1111-1111-111111111111"),
        inst_id=USER_VALID_INST_UUID,
        name="batch_course_only",
        created_by=CREATOR_UUID,
        created_at=DATETIME_TESTING,
        updated_at=DATETIME_TESTING,
    )
    course_file = FileTable(
        id=uuid.UUID("22222222-2222-2222-2222-222222222222"),
        inst_id=USER_VALID_INST_UUID,
        name="course_file.csv",
        source="MANUAL_UPLOAD",
        batches={batch_with_course},
        created_at=DATETIME_TESTING,
        updated_at=DATETIME_TESTING,
        sst_generated=False,
        valid=True,
        schemas=[SchemaType.COURSE],
    )
    session.add_all([batch_with_course, course_file])
    session.commit()

    # MOCK_STORAGE is shared by the module's tests: restore whatever
    # side_effect was installed so this failure mode cannot leak elsewhere.
    read_csv_mock = MOCK_STORAGE.read_csv_as_dataframe
    previous_side_effect = read_csv_mock.side_effect
    read_csv_mock.side_effect = ValueError("File not found")
    try:
        response = client.get(
            "/institutions/"
            + uuid_to_str(USER_VALID_INST_UUID)
            + "/batch/"
            + uuid_to_str(batch_with_course.id)
            + "/eda"
        )
    finally:
        read_csv_mock.side_effect = previous_side_effect

    assert response.status_code == 404
    assert "No valid input files found" in response.json()["detail"]
inst_id=USER_VALID_INST_UUID, + name="student_file.csv", + source="MANUAL_UPLOAD", + batches={eda_batch}, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + sst_generated=False, + valid=True, + schemas=[SchemaType.STUDENT], + ) + course_file = FileTable( + id=uuid.UUID("55555555-5555-5555-5555-555555555555"), + inst_id=USER_VALID_INST_UUID, + name="course_file.csv", + source="MANUAL_UPLOAD", + batches={eda_batch}, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + sst_generated=False, + valid=True, + schemas=[SchemaType.COURSE], + ) + session.add_all([eda_batch, student_file, course_file]) + session.commit() + + # Create mock DataFrames + df_student = pd.DataFrame( + { + "study_id": ["S001", "S002", "S003", "S001"], # S001 appears twice + "cohort": ["2020", "2020", "2021", "2021"], + "cohort_term": ["FALL", "FALL", "SPRING", "SPRING"], + "enrollment_type": [ + "First-Time", + "Transfer-In", + "First-Time", + "Transfer-In", + ], + "enrollment_intensity_first_term": [ + "Full-Time", + "Part-Time", + "Full-Time", + "Part-Time", + ], + "gpa_group_year_1": [3.5, 3.2, 3.8, 2.9], + "credential_type_sought_year_1": [ + "Bachelor", + "Associate", + "Bachelor", + "Associate", + ], + "pell_status_first_year": ["Y", "N", "Y", "N"], + "first_gen": ["Y", "N", "Y", "N"], + "gender": ["Female", "Male", "Female", "Male"], + "race": ["White", "Black or African American", "Asian", "White"], + "student_age": ["20 - 24", "20 or younger", "Older than 24", "20 - 24"], + } + ) + + df_course = pd.DataFrame( + { + "study_id": ["S001", "S002", "S003"], + "cohort": ["2020", "2020", "2021"], + "cohort_term": ["FALL", "FALL", "SPRING"], + } + ) + + # Mock storage to return our test DataFrames + def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame: + if "student" in blob_path.lower(): + return df_student + elif "course" in blob_path.lower(): + return df_course + else: + raise ValueError(f"File not found: {blob_path}") + + 
MOCK_STORAGE.read_csv_as_dataframe.side_effect = mock_read_csv + + response = client.get( + "/institutions/" + + uuid_to_str(USER_VALID_INST_UUID) + + "/batch/" + + uuid_to_str(eda_batch.id) + + "/eda" + ) + + assert response.status_code == 200 + data = response.json() + + # Check response structure + assert "summary_stats" in data + assert "gpa_by_enrollment_type" in data + assert "gpa_by_enrollment_intensity" in data + assert "students_by_cohort_term" in data + assert "course_enrollments" in data + assert "degree_types" in data + assert "enrollment_type_by_intensity" in data + assert "pell_recipient_by_first_gen" in data + assert "student_age_by_gender" in data + assert "race_by_pell_status" in data + + # Check summary stats + assert data["summary_stats"]["total_students"] == "3" # 3 unique study_ids + assert data["summary_stats"]["transfer_students"] == "2" # 2 Transfer-In + + # Check GPA charts have cohort years + assert "cohort_years" in data["gpa_by_enrollment_type"] + assert len(data["gpa_by_enrollment_type"]["cohort_years"]) == 2 # 2020, 2021 + assert "2020" in data["gpa_by_enrollment_type"]["cohort_years"] + assert "2021" in data["gpa_by_enrollment_type"]["cohort_years"] + + # Check term data structure + assert "fall" in data["students_by_cohort_term"] + assert "spring" in data["students_by_cohort_term"] + assert len(data["students_by_cohort_term"]["fall"]) == 2 # One per cohort year + + # Check enrollment type by intensity has categories and series + assert "categories" in data["enrollment_type_by_intensity"] + assert "series" in data["enrollment_type_by_intensity"] + assert len(data["enrollment_type_by_intensity"]["series"]) > 0 + + # Check pell recipient chart structure + assert "categories" in data["pell_recipient_by_first_gen"] + assert "series" in data["pell_recipient_by_first_gen"] + + # Check student age by gender structure + assert "categories" in data["student_age_by_gender"] + assert "series" in data["student_age_by_gender"] + + # Check race by 
def prepend_env_prefix(name: str) -> Any:
    """Return name prefixed with the lower-cased environment and an underscore.

    The environment is read from env_vars["ENV"]; the LOCAL environment uses
    the "dev" prefix. The value is expected to be set by the time this runs,
    since it is checked on app startup.
    """
    env_name = str(env_vars["ENV"]).lower()
    prefix = "dev" if env_name == "local" else env_name
    return prefix + "_" + name