From c617008f3301b7c343cf19fec3324da4531e6ffb Mon Sep 17 00:00:00 2001 From: William Carr Date: Mon, 24 Nov 2025 10:49:21 -0500 Subject: [PATCH 01/25] test batch and file data --- src/webapp/database.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/webapp/database.py b/src/webapp/database.py index 9365cc52..effb9304 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -118,6 +118,48 @@ def init_db(env: str) -> None: valid=True, ) ) + # Create test files and batches for LOCAL environment + if env == "LOCAL": + # Create test files + test_file_1 = FileTable( + id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562a"), + inst_id=LOCAL_INST_UUID, + name="test_course_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["COURSE"], # Using string literal to avoid circular import + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + test_file_2 = FileTable( + id=uuid.UUID("cb02d06c-2a59-486a-9bdd-d394a4fcb833"), + inst_id=LOCAL_INST_UUID, + name="test_cohort_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["STUDENT"], # Using string literal to avoid circular import + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Create test batch + test_batch = BatchTable( + id=uuid.UUID("5b2420f3-1035-46ab-90eb-74d5df97de43"), + inst_id=LOCAL_INST_UUID, + name="test_batch_1", + created_by=LOCAL_USER_UUID, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Associate files with batch + test_batch.files.add(test_file_1) + test_batch.files.add(test_file_2) + session.merge(test_file_1) + session.merge(test_file_2) + session.merge(test_batch) session.commit() except Exception as e: session.rollback() From 1c9ee92bf66a0ef153f2af430d1e172a0d175a76 Mon Sep 17 00:00:00 2001 From: William Carr Date: Mon, 24 Nov 2025 10:49:39 -0500 Subject: [PATCH 02/25] eda endpoints --- src/webapp/routers/data.py | 190 +++++++++++++++++++++++++++++++++++++ 1 file changed, 190 insertions(+) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index d062f84e..784a74c2 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -473,6 +473,196 @@ def read_batch_info( return {"batches": [batch_info], "files": data_infos} +## EDA (Exploratory Data Analysis) Endpoints + + +class SummaryStats(BaseModel): + """Summary statistics for the EDA dashboard.""" + total_students: str + transfer_students: str + avg_year1_gpa_all_students: str + + +class GpaSeriesData(BaseModel): + """GPA data series for a chart.""" + name: str + data: List[float] + + +class GpaChartData(BaseModel): + """GPA chart data with cohort years and series.""" + cohort_years: List[str] + series: List[GpaSeriesData] + + +class TermData(BaseModel): + """Term-based data (fall, winter, spring, summer).""" + fall: List[int] + winter: List[int] + spring: List[int] + summer: List[int] + + +class DegreeTypeData(BaseModel): + """Degree type data for donut chart.""" + value: int + name: str + color: str + + +class StackedBarSeries(BaseModel): + """Series data for stacked bar charts.""" + name: str + type: str = "bar" + stack: str + data: List[int] + color: str + + +class EdaDataResponse(BaseModel): + """Complete EDA data response matching frontend expectations.""" + summary_stats: SummaryStats + gpa_by_enrollment_type: GpaChartData + gpa_by_enrollment_intensity: GpaChartData + students_by_cohort_term: TermData + course_enrollments: TermData + degree_types: List[DegreeTypeData] + enrollment_type_by_intensity: Dict[str, Any] # Categories and series + pell_recipient_by_first_gen: Dict[str, Any] # Categories and series + student_age_by_gender: Dict[str, Any] # Categories and series + race_by_pell_status: Dict[str, Any] # Categories and series + + +@router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse) +def get_eda_data( + inst_id: str, + batch_id: str, + current_user: Annotated[BaseUser, Depends(get_current_active_user)], + sql_session: Annotated[Session, Depends(get_session)], + storage_control: Annotated[StorageControl, Depends(StorageControl)], +) -> Any: + """Returns EDA (Exploratory Data Analysis) data for a specific batch. + + This endpoint provides all the data needed to populate the EDA dashboard, + including summary statistics, GPA charts, enrollment data, and demographic breakdowns. + Analyzes all files in the batch together to provide comprehensive insights. + """ + has_access_to_inst_or_err(inst_id, current_user) + has_full_data_access_or_err(current_user, "EDA data") + local_session.set(sql_session) + + # Verify batch exists and belongs to institution + batch_result = ( + local_session.get() + .execute( + select(BatchTable).where( + and_( + BatchTable.id == str_to_uuid(batch_id), + BatchTable.inst_id == str_to_uuid(inst_id), + ) + ) + ) + .all() + ) + + if not batch_result or len(batch_result) == 0: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="Batch not found.", + ) + + if len(batch_result) > 1: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail="Batch duplicates found.", + ) + + batch_record = batch_result[0][0] + batch_files = batch_record.files + + if not batch_files or len(batch_files) == 0: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="Batch contains no files.", + ) + + # TODO: Implement actual data fetching from GCS/Databricks + # For now, return mock data matching the frontend structure + # This should be replaced with actual data analysis from all files in the batch + # Files can be accessed via batch_files (Set[FileTable]) + # Each file has: name, id, schemas, inst_id, etc. + + return EdaDataResponse( + summary_stats=SummaryStats( + total_students="15,203", + transfer_students="806", + avg_year1_gpa_all_students="3.1", + ), + gpa_by_enrollment_type=GpaChartData( + cohort_years=['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24'], + series=[ + GpaSeriesData(name="First Time Student", data=[2.6, 2.7, 2.5, 2.7, 2.7, 2.7, 2.8]), + GpaSeriesData(name="Transfer Student", data=[3.3, 3.6, 3.1, 3.4, 3.1, 3.5, 3.6]), + ], + ), + gpa_by_enrollment_intensity=GpaChartData( + cohort_years=['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24'], + series=[ + GpaSeriesData(name="Full Time Student", data=[3.25, 3.15, 3.0, 3.4, 3.25, 3.4, 3.5]), + GpaSeriesData(name="Part Time Student", data=[2.55, 2.9, 2.75, 3.15, 3.0, 3.15, 3.25]), + ], + ), + students_by_cohort_term=TermData( + fall=[180, 200, 220, 250, 270, 290, 320], + winter=[60, 65, 70, 75, 80, 85, 90], + spring=[50, 55, 60, 65, 70, 75, 80], + summer=[10, 12, 15, 18, 20, 22, 25], + ), + course_enrollments=TermData( + fall=[3000, 3200, 3400, 3600, 3800, 4000, 4200], + winter=[2000, 2100, 2200, 2300, 2400, 2500, 2600], + spring=[1000, 1100, 1200, 1300, 1400, 1500, 1600], + summer=[500, 550, 600, 650, 700, 750, 800], + ), + degree_types=[ + DegreeTypeData(value=67, name="Associate's Degree", color="#F79222"), + DegreeTypeData(value=15, name="1 - 2 year certificate", color="#00CFEA"), + DegreeTypeData(value=8, name="2 - 4 year certificate", color="#25A95A"), + DegreeTypeData(value=7, name="Degree seeking", color="#A92532"), + DegreeTypeData(value=3, name="Unknown", color="#385981"), + ], + enrollment_type_by_intensity={ + "categories": ['First-Time', 'Re-Admit', 'Transfer-In'], + "series": [ + {"name": "Full Time", "type": "bar", "stack": "intensity", "data": [9800, 600, 8500], "color": "#F79222"}, + {"name": "Part Time", "type": "bar", "stack": "intensity", "data": [200, 1100, 1200], "color": "#00CFEA"}, + ], + }, + pell_recipient_by_first_gen={ + "categories": ['Yes', 'No'], + "series": [ + {"name": "Yes", "type": "bar", "stack": "firstGen", "data": [3000, 3700], "color": "#F79222"}, + {"name": "No", "type": "bar", "stack": "firstGen", "data": [4200, 3000], "color": "#00CFEA"}, + {"name": "Nan", "type": "bar", "stack": "firstGen", "data": [1800, 1800], "color": "#25A95A"}, + ], + }, + student_age_by_gender={ + "categories": ['Female', 'Male', 'Nonbinary, intersex, and gender-nonconforming', 'Prefer not to specify', 'Unknown'], + "series": [ + {"name": "20 or younger", "type": "bar", "stack": "age", "data": [5000, 5000, 800, 2000, 1500], "color": "#F79222"}, + {"name": "20 - 24", "type": "bar", "stack": "age", "data": [2500, 2500, 100, 1000, 1000], "color": "#00CFEA"}, + {"name": "Older than 24", "type": "bar", "stack": "age", "data": [2000, 1300, 100, 500, 1000], "color": "#25A95A"}, + ], + }, + race_by_pell_status={ + "categories": ['American Indian or Alaska Native', 'Asian', 'Black or African American', 'Native Hawaiian or other Pacific Islander', 'Nonresident Alien', 'Two or More Races', 'Unknown', 'White'], + "series": [ + {"name": "Yes", "type": "bar", "stack": "pell", "data": [30, 250, 400, 20, 50, 100, 150, 2000], "color": "#F79222"}, + {"name": "No", "type": "bar", "stack": "pell", "data": [20, 50, 200, 10, 25, 50, 50, 250], "color": "#00CFEA"}, + ], + }, + ) + @router.post("/{inst_id}/batch", response_model=BatchInfo) def create_batch( inst_id: str, From dddc0a52859416576225e85d666a8e1d1918d2db Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 25 Nov 2025 15:24:22 -0500 Subject: [PATCH 03/25] test data --- src/webapp/database.py | 74 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index effb9304..dbea8ca4 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -59,6 +59,10 @@ class Base(DeclarativeBase): LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) +# Test institution ID for EDA dashboard +TEST_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") +TEST_BATCH_UUID = uuid.UUID("5b2420f3-1035-46ab-90eb-74d5df97de43") + @event.listens_for(Mapper, "before_insert") @event.listens_for(Mapper, "before_update") @@ -106,6 +110,15 @@ def init_db(env: str) -> None: updated_at=DATETIME_TESTING, ) ) + # Create test institution for EDA dashboard + session.merge( + InstTable( + id=TEST_INST_UUID, + name="Test Institution for EDA", + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + ) session.merge( ApiKeyTable( id=LOCAL_APIKEY_UUID, @@ -145,9 +158,9 @@ def init_db(env: str) -> None: created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) - # Create test batch + # Create test batch for LOCAL_INST_UUID (using a different ID) test_batch = BatchTable( - id=uuid.UUID("5b2420f3-1035-46ab-90eb-74d5df97de43"), + id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562b"), inst_id=LOCAL_INST_UUID, name="test_batch_1", created_by=LOCAL_USER_UUID, @@ -160,6 +173,63 @@ def init_db(env: str) -> None: session.merge(test_file_1) session.merge(test_file_2) session.merge(test_batch) + + # Create test files for EDA test institution (TEST_INST_UUID) + eda_test_file_1 = FileTable( + id=uuid.UUID("a1b2c3d4-5e6f-7890-abcd-ef1234567890"), + inst_id=TEST_INST_UUID, + name="eda_test_course_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["COURSE"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + eda_test_file_2 = FileTable( + id=uuid.UUID("b2c3d4e5-6f78-9012-bcde-f23456789012"), + inst_id=TEST_INST_UUID, + name="eda_test_cohort_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["STUDENT"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + eda_test_file_3 = FileTable( + id=uuid.UUID("c3d4e5f6-7890-1234-cdef-345678901234"), + inst_id=TEST_INST_UUID, + name="eda_test_financial_file.csv", + source="MANUAL_UPLOAD", + uploader=LOCAL_USER_UUID, + sst_generated=False, + valid=True, + schemas=["FINANCIAL"], + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + + # Create test batch for EDA dashboard + eda_test_batch = BatchTable( + id=TEST_BATCH_UUID, + inst_id=TEST_INST_UUID, + name="eda_test_batch", + completed=True, + created_by=LOCAL_USER_UUID, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + # Associate files with EDA test batch + eda_test_batch.files.add(eda_test_file_1) + eda_test_batch.files.add(eda_test_file_2) + eda_test_batch.files.add(eda_test_file_3) + session.merge(eda_test_file_1) + session.merge(eda_test_file_2) + session.merge(eda_test_file_3) + session.merge(eda_test_batch) session.commit() except Exception as e: session.rollback() From 225427c045a2328d1743e9b75f5439cfea1d493d Mon Sep 17 00:00:00 2001 From: William Carr Date: Wed, 3 Dec 2025 10:59:30 -0500 Subject: [PATCH 04/25] eda calculations --- src/webapp/gcsutil.py | 25 +++++++ src/webapp/routers/data.py | 140 ++++++++++++++++++++++++++++++++----- 2 files changed, 146 insertions(+), 19 deletions(-) diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index b267d9eb..79a50d19 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -358,3 +358,28 @@ def get_file_contents(self, bucket_name: str, file_name: str) -> Any: blob = bucket.blob(file_name) res = blob.download_as_bytes() return res + + def read_csv_as_dataframe(self, bucket_name: str, file_name: str) -> Any: + """Read a CSV file from GCS and return as pandas DataFrame. + + Args: + bucket_name: GCS bucket name + file_name: Full blob path (e.g., 'validated/filename.csv') + + Returns: + pandas DataFrame + + Raises: + ValueError: If bucket or file not found + """ + import pandas as pd + + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name) + blob = bucket.blob(file_name) + + if not blob.exists(): + raise ValueError(f"File not found: {file_name}") + + with blob.open("r") as fh: + return pd.read_csv(fh) \ No newline at end of file diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 784a74c2..5ceca924 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -18,6 +18,7 @@ import pathlib import re from ..validation import HardValidationError +import pandas as pd from ..utilities import ( has_access_to_inst_or_err, @@ -533,6 +534,110 @@ class EdaDataResponse(BaseModel): race_by_pell_status: Dict[str, Any] # Categories and series +def read_batch_files_as_dataframes( + inst_id: str, + batch_files: Any, # Set[FileTable] + storage_control: StorageControl, +) -> Dict[str, pd.DataFrame]: + """Read CSV files from a batch and return as DataFrames. + + In LOCAL mode, checks ../test_cloud_storage/validated/ first, then falls back to GCS. + In deployed environments (DEV/STAGING/PROD), only reads from GCS. + + Args: + inst_id: Institution ID + batch_files: Set of FileTable objects from the batch + storage_control: StorageControl instance for GCS access + + Returns: + Dictionary mapping file_name -> pandas.DataFrame + + Raises: + HTTPException: If no valid files found + """ + file_dataframes: Dict[str, pd.DataFrame] = {} + is_local = env_vars.get("ENV", "").upper() == "LOCAL" + bucket_name = get_external_bucket_name(inst_id) + + # For LOCAL mode, set up local test storage path + local_test_storage_path = None + if is_local: + project_root = pathlib.Path(__file__).parent.parent.parent.parent + local_test_storage_path = project_root.parent / "test_cloud_storage" / "validated" + logger.info(f"LOCAL mode: Will check local storage at {local_test_storage_path}") + + for file_record in batch_files: + file_name = file_record.name + + # Skip SST-generated output files (only process input files) + if file_record.sst_generated: + logger.debug(f"Skipping SST-generated file: {file_name}") + continue + + df = None + + # Try local filesystem first in LOCAL mode + if is_local and local_test_storage_path: + local_file_path = local_test_storage_path / file_name + if local_file_path.exists(): + try: + df = pd.read_csv(local_file_path) + logger.info(f"Loaded {file_name} from local filesystem ({len(df)} rows)") + except Exception as e: + logger.warning(f"Failed to read local file {local_file_path}: {e}") + + # Fall back to GCS using StorageControl + if df is None: + try: + blob_path = f"validated/{file_name}" + df = storage_control.read_csv_as_dataframe(bucket_name, blob_path) + logger.info(f"Loaded {file_name} from GCS ({len(df)} rows)") + except ValueError as e: + logger.warning(f"File not found in GCS: {e}") + except Exception as e: + logger.error(f"Failed to read from GCS: {e}") + + if df is not None: + file_dataframes[file_name] = df + + if not file_dataframes: + error_msg = f"No valid input files found in batch" + if is_local and local_test_storage_path: + error_msg += f" (checked local: {local_test_storage_path} and GCS: {bucket_name}/validated/)" + else: + error_msg += f" (checked GCS: {bucket_name}/validated/)" + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=error_msg, + ) + + return file_dataframes + + +def calculate_gpa_series(df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str) -> List[float]: + """Calculate GPA data for one category across cohort years. + + Args: + df: DataFrame (cohort data) + cohort_years: List of cohort years + grouping_col: Column to filter by (e.g., 'Enrollment Type') + category_value: Specific value to filter for (e.g., 'First-Time') + + Returns: + List of GPA values, one per cohort year + """ + # Filter by category + filtered = df[df[grouping_col] == category_value] + + # Group by cohort and calculate mean GPA + gpa_by_cohort = filtered.groupby('Cohort')['GPA Group Year 1'].mean() + + # Convert to list aligned with cohort_years + data = [round(gpa_by_cohort.get(year, 0), 1) for year in cohort_years] + + return data + + @router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse) def get_eda_data( inst_id: str, @@ -580,36 +685,33 @@ def get_eda_data( batch_record = batch_result[0][0] batch_files = batch_record.files - if not batch_files or len(batch_files) == 0: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail="Batch contains no files.", - ) + # Read files from batch using helper function + file_dataframes = read_batch_files_as_dataframes( + inst_id, batch_files, storage_control + ) + df_cohort = file_dataframes['eda_test_cohort_file.csv'] - # TODO: Implement actual data fetching from GCS/Databricks - # For now, return mock data matching the frontend structure - # This should be replaced with actual data analysis from all files in the batch - # Files can be accessed via batch_files (Set[FileTable]) - # Each file has: name, id, schemas, inst_id, etc. + # Calculate cohort years + cohort_years = sorted(df_cohort['Cohort'].unique().tolist()) return EdaDataResponse( summary_stats=SummaryStats( - total_students="15,203", - transfer_students="806", - avg_year1_gpa_all_students="3.1", + total_students=f"{df_cohort['Student GUID'].nunique():,}", + transfer_students=f"{(df_cohort['Enrollment Type'] == 'Transfer-In').sum():,}", + avg_year1_gpa_all_students=f"{df_cohort['GPA Group Year 1'].mean():.2f}", ), gpa_by_enrollment_type=GpaChartData( - cohort_years=['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24'], + cohort_years=cohort_years, series=[ - GpaSeriesData(name="First Time Student", data=[2.6, 2.7, 2.5, 2.7, 2.7, 2.7, 2.8]), - GpaSeriesData(name="Transfer Student", data=[3.3, 3.6, 3.1, 3.4, 3.1, 3.5, 3.6]), + GpaSeriesData(name="First Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Type', 'First-Time')), + GpaSeriesData(name="Transfer Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Type', 'Transfer-In')), ], ), gpa_by_enrollment_intensity=GpaChartData( - cohort_years=['2017-18', '2018-19', '2019-20', '2020-21', '2021-22', '2022-23', '2023-24'], + cohort_years=cohort_years, series=[ - GpaSeriesData(name="Full Time Student", data=[3.25, 3.15, 3.0, 3.4, 3.25, 3.4, 3.5]), - GpaSeriesData(name="Part Time Student", data=[2.55, 2.9, 2.75, 3.15, 3.0, 3.15, 3.25]), + GpaSeriesData(name="Full Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Intensity First Term', 'Full-Time')), + GpaSeriesData(name="Part Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Intensity First Term', 'Part-Time')), ], ), students_by_cohort_term=TermData( From abac3773bff7df9526498e56a5e93b53b9a0d348 Mon Sep 17 00:00:00 2001 From: William Carr Date: Wed, 3 Dec 2025 13:23:24 -0500 Subject: [PATCH 05/25] eda year and term, course enrollemnts --- src/webapp/routers/data.py | 36 ++++++++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 5ceca924..1820a850 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -638,6 +638,23 @@ def calculate_gpa_series(df: pd.DataFrame, cohort_years: List[str], grouping_col return data +def get_term_counts(df: pd.DataFrame, cohort_years: List[str], term_name: str) -> List[int]: + """Get student counts for a specific term across cohort years. + + Args: + df: DataFrame (cohort or course data) + cohort_years: List of cohort years + term_name: Term name to filter for (e.g., 'FALL', 'WINTER') + + Returns: + List of student counts, one per cohort year + """ + return (df[df['Cohort Term'] == term_name] + .groupby('Cohort').size() + .reindex(cohort_years, fill_value=0) + .astype(int).tolist()) + + @router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse) def get_eda_data( inst_id: str, @@ -690,8 +707,7 @@ def get_eda_data( inst_id, batch_files, storage_control ) df_cohort = file_dataframes['eda_test_cohort_file.csv'] - - # Calculate cohort years + df_course = file_dataframes['eda_test_course_file.csv'] cohort_years = sorted(df_cohort['Cohort'].unique().tolist()) return EdaDataResponse( @@ -715,16 +731,16 @@ def get_eda_data( ], ), students_by_cohort_term=TermData( - fall=[180, 200, 220, 250, 270, 290, 320], - winter=[60, 65, 70, 75, 80, 85, 90], - spring=[50, 55, 60, 65, 70, 75, 80], - summer=[10, 12, 15, 18, 20, 22, 25], + fall=get_term_counts(df_cohort, cohort_years, 'FALL'), + winter=get_term_counts(df_cohort, cohort_years, 'WINTER'), + spring=get_term_counts(df_cohort, cohort_years, 'SPRING'), + summer=get_term_counts(df_cohort, cohort_years, 'SUMMER'), ), course_enrollments=TermData( - fall=[3000, 3200, 3400, 3600, 3800, 4000, 4200], - winter=[2000, 2100, 2200, 2300, 2400, 2500, 2600], - spring=[1000, 1100, 1200, 1300, 1400, 1500, 1600], - summer=[500, 550, 600, 650, 700, 750, 800], + fall=get_term_counts(df_course, cohort_years, 'FALL'), + winter=get_term_counts(df_course, cohort_years, 'WINTER'), + spring=get_term_counts(df_course, cohort_years, 'SPRING'), + summer=get_term_counts(df_course, cohort_years, 'SUMMER'), ), degree_types=[ DegreeTypeData(value=67, name="Associate's Degree", color="#F79222"), From ab432f6649c55225f1ff5ef5414c90c956c743e2 Mon Sep 17 00:00:00 2001 From: William Carr Date: Wed, 3 Dec 2025 14:10:54 -0500 Subject: [PATCH 06/25] eda degree types --- src/webapp/routers/data.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 1820a850..e3692298 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -743,11 +743,12 @@ def get_eda_data( summer=get_term_counts(df_course, cohort_years, 'SUMMER'), ), degree_types=[ - DegreeTypeData(value=67, name="Associate's Degree", color="#F79222"), - DegreeTypeData(value=15, name="1 - 2 year certificate", color="#00CFEA"), - DegreeTypeData(value=8, name="2 - 4 year certificate", color="#25A95A"), - DegreeTypeData(value=7, name="Degree seeking", color="#A92532"), - DegreeTypeData(value=3, name="Unknown", color="#385981"), + DegreeTypeData( + value=int(round(count / df_cohort['Credential Type Sought Year 1'].count() * 100)), + name=str(degree_type), + color=["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"][i % 5] + ) + for i, (degree_type, count) in enumerate(df_cohort['Credential Type Sought Year 1'].value_counts().items()) ], enrollment_type_by_intensity={ "categories": ['First-Time', 'Re-Admit', 'Transfer-In'], From 1a37ab16ade4f3e6f74e02564b95cc7c51e05009 Mon Sep 17 00:00:00 2001 From: William Carr Date: Mon, 8 Dec 2025 11:46:32 -0500 Subject: [PATCH 07/25] eda test institution data --- src/webapp/database.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index dbea8ca4..bbe5561d 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -59,9 +59,9 @@ class Base(DeclarativeBase): LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) -# Test institution ID for EDA dashboard -TEST_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") -TEST_BATCH_UUID = uuid.UUID("5b2420f3-1035-46ab-90eb-74d5df97de43") +# USC Beaufort - same ID as DEV for testing +USCB_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") +USCB_BATCH_UUID = uuid.UUID("3182f472-e079-4678-a0a1-9ca5ead6c49a") @event.listens_for(Mapper, "before_insert") @@ -110,13 +110,17 @@ def init_db(env: str) -> None: updated_at=DATETIME_TESTING, ) ) - # Create test institution for EDA dashboard + # USC Beaufort - matches DEV for testing session.merge( InstTable( - id=TEST_INST_UUID, - name="Test Institution for EDA", + id=USCB_INST_UUID, + name="University of South Carolina - Beaufort", + state="SC", + pdp_id="345000", + schemas=["COURSE", "STUDENT"], created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, + created_by=LOCAL_USER_UUID, ) ) session.merge( @@ -174,10 +178,10 @@ def init_db(env: str) -> None: session.merge(test_file_2) session.merge(test_batch) - # Create test files for EDA test institution (TEST_INST_UUID) + # Create test files for EDA test institution (USCB_INST_UUID) eda_test_file_1 = FileTable( id=uuid.UUID("a1b2c3d4-5e6f-7890-abcd-ef1234567890"), - inst_id=TEST_INST_UUID, + inst_id=USCB_INST_UUID, name="eda_test_course_file.csv", source="MANUAL_UPLOAD", uploader=LOCAL_USER_UUID, @@ -189,7 +193,7 @@ def init_db(env: str) -> None: ) eda_test_file_2 = FileTable( id=uuid.UUID("b2c3d4e5-6f78-9012-bcde-f23456789012"), - inst_id=TEST_INST_UUID, + inst_id=USCB_INST_UUID, name="eda_test_cohort_file.csv", source="MANUAL_UPLOAD", uploader=LOCAL_USER_UUID, @@ -201,7 +205,7 @@ def init_db(env: str) -> None: ) eda_test_file_3 = FileTable( id=uuid.UUID("c3d4e5f6-7890-1234-cdef-345678901234"), - inst_id=TEST_INST_UUID, + inst_id=USCB_INST_UUID, name="eda_test_financial_file.csv", source="MANUAL_UPLOAD", uploader=LOCAL_USER_UUID, @@ -214,8 +218,8 @@ def init_db(env: str) -> None: # Create test batch for EDA dashboard eda_test_batch = BatchTable( - id=TEST_BATCH_UUID, - inst_id=TEST_INST_UUID, + id=USCB_BATCH_UUID, + inst_id=USCB_INST_UUID, name="eda_test_batch", completed=True, created_by=LOCAL_USER_UUID, From 8ffea4a8b9b23ee8f805e6d1a8c44f71d4fe5063 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 10:22:30 -0500 Subject: [PATCH 08/25] eda test institution --- src/webapp/database.py | 73 +++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 43 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index bbe5561d..e2676e0e 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -59,9 +59,9 @@ class Base(DeclarativeBase): LOCAL_PASSWORD = "tester_password" DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357) -# USC Beaufort - same ID as DEV for testing -USCB_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") -USCB_BATCH_UUID = uuid.UUID("3182f472-e079-4678-a0a1-9ca5ead6c49a") +# Test institution - same ID as DEV USC Beaufort for testing +TEST_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c") +TEST_BATCH_UUID = uuid.UUID("3182f472-e079-4678-a0a1-9ca5ead6c49a") @event.listens_for(Mapper, "before_insert") @@ -113,7 +113,7 @@ def init_db(env: str) -> None: # USC Beaufort - matches DEV for testing session.merge( InstTable( - id=USCB_INST_UUID, + id=TEST_INST_UUID, name="University of South Carolina - Beaufort", state="SC", pdp_id="345000", @@ -178,62 +178,49 @@ def init_db(env: str) -> None: session.merge(test_file_2) session.merge(test_batch) - # Create test files for EDA test institution (USCB_INST_UUID) - eda_test_file_1 = FileTable( - id=uuid.UUID("a1b2c3d4-5e6f-7890-abcd-ef1234567890"), - inst_id=USCB_INST_UUID, - name="eda_test_course_file.csv", + # Create test files for EDA test institution (TEST_INST_UUID) + # Real files from DEV batch 3182f472e0794678a0a19ca5ead6c49a + test_file_student = FileTable( + id=uuid.UUID("f1d7c0a4-5211-459f-a79a-a1c2752f45c5"), + inst_id=TEST_INST_UUID, + name="1762967705679_AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv", source="MANUAL_UPLOAD", - uploader=LOCAL_USER_UUID, - sst_generated=False, - valid=True, - schemas=["COURSE"], - created_at=DATETIME_TESTING, - updated_at=DATETIME_TESTING, - ) - eda_test_file_2 = FileTable( - id=uuid.UUID("b2c3d4e5-6f78-9012-bcde-f23456789012"), - inst_id=USCB_INST_UUID, - name="eda_test_cohort_file.csv", - source="MANUAL_UPLOAD", - uploader=LOCAL_USER_UUID, + uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), sst_generated=False, valid=True, schemas=["STUDENT"], created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) - eda_test_file_3 = FileTable( - id=uuid.UUID("c3d4e5f6-7890-1234-cdef-345678901234"), - inst_id=USCB_INST_UUID, - name="eda_test_financial_file.csv", + test_file_course = FileTable( + id=uuid.UUID("d19d0129-96de-464c-98e9-694996965c7b"), + inst_id=TEST_INST_UUID, + name="1762967705683_AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv", source="MANUAL_UPLOAD", - uploader=LOCAL_USER_UUID, + uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), sst_generated=False, valid=True, - schemas=["FINANCIAL"], + schemas=["COURSE"], created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) - # Create test batch for EDA dashboard - eda_test_batch = BatchTable( - id=USCB_BATCH_UUID, - inst_id=USCB_INST_UUID, - name="eda_test_batch", - completed=True, - created_by=LOCAL_USER_UUID, + # Test batch - matches DEV USC Beaufort + test_batch = BatchTable( + id=TEST_BATCH_UUID, + inst_id=TEST_INST_UUID, + name="Batch_2025-11-12_1762967767400", + completed=False, + created_by=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) - # Associate files with EDA test batch - eda_test_batch.files.add(eda_test_file_1) - eda_test_batch.files.add(eda_test_file_2) - eda_test_batch.files.add(eda_test_file_3) - session.merge(eda_test_file_1) - session.merge(eda_test_file_2) - session.merge(eda_test_file_3) - session.merge(eda_test_batch) + # Associate files with batch + test_batch.files.add(test_file_student) + test_batch.files.add(test_file_course) + session.merge(test_file_student) + session.merge(test_file_course) + session.merge(test_batch) session.commit() except Exception as e: session.rollback() From 2807313474782de0cb5e44e8add40b57dfbe50c7 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 10:28:38 -0500 Subject: [PATCH 09/25] eda data --- src/webapp/routers/data.py | 77 ++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 36 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 30f187f8..6b0c6c4f 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -554,14 +554,11 @@ def read_batch_files_as_dataframes( """ file_dataframes: Dict[str, pd.DataFrame] = {} is_local = env_vars.get("ENV", "").upper() == "LOCAL" - bucket_name = get_external_bucket_name(inst_id) + # For LOCAL development, use DEV GCS buckets for file storage + bucket_name = f"dev_{inst_id}" if is_local else get_external_bucket_name(inst_id) - # For LOCAL mode, set up local test storage path - local_test_storage_path = None - if is_local: - project_root = pathlib.Path(__file__).parent.parent.parent.parent - local_test_storage_path = project_root.parent / "test_cloud_storage" / "validated" - logger.info(f"LOCAL mode: Will check local storage at {local_test_storage_path}") + # Temporary storage: file_record -> DataFrame + loaded_files: Dict[Any, pd.DataFrame] = {} for file_record in batch_files: file_name = file_record.name @@ -573,42 +570,43 @@ def read_batch_files_as_dataframes( df = None - # Try local filesystem first in LOCAL mode - if is_local and local_test_storage_path: - local_file_path = local_test_storage_path / file_name - if local_file_path.exists(): - try: - df = pd.read_csv(local_file_path) - logger.info(f"Loaded {file_name} from local filesystem ({len(df)} rows)") - except Exception as e: - logger.warning(f"Failed to read local file {local_file_path}: {e}") - # Fall back to GCS using StorageControl - if df is None: - try: - blob_path = f"validated/{file_name}" - df = storage_control.read_csv_as_dataframe(bucket_name, blob_path) - logger.info(f"Loaded {file_name} from GCS ({len(df)} rows)") - except ValueError as e: - logger.warning(f"File not found in GCS: {e}") - except Exception as e: - logger.error(f"Failed to read from GCS: {e}") + try: + blob_path = f"validated/{file_name}" + df = storage_control.read_csv_as_dataframe(bucket_name, blob_path) + logger.info(f"Loaded {file_name} from GCS ({len(df)} rows)") + except ValueError as e: + logger.warning(f"File not found in GCS: {e}") + except Exception as e: + logger.error(f"Failed to read from GCS: {e}") if df is not None: - file_dataframes[file_name] = df + loaded_files[file_record] = df - if not file_dataframes: - error_msg = f"No valid input files found in batch" - if is_local and local_test_storage_path: - error_msg += f" (checked local: {local_test_storage_path} and GCS: {bucket_name}/validated/)" - else: - error_msg += f" (checked GCS: {bucket_name}/validated/)" + if not loaded_files: + error_msg = f"No valid input files found in batch (checked GCS: {bucket_name}/validated/)" raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=error_msg, ) - return file_dataframes + # Group by schema type and combine DataFrames + schema_dataframes: Dict[str, List[pd.DataFrame]] = {} + for file_record, df in loaded_files.items(): + for schema in file_record.schemas: + if schema not in schema_dataframes: + schema_dataframes[schema] = [] + schema_dataframes[schema].append(df) + + result = {} + for schema, dfs in schema_dataframes.items(): + if len(dfs) == 1: + result[schema] = dfs[0] + else: + result[schema] = pd.concat(dfs, ignore_index=True) + logger.info(f"Combined {len(dfs)} files for schema {schema} ({len(result[schema])} total rows)") + + return result def calculate_gpa_series(df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str) -> List[float]: @@ -703,8 +701,15 @@ def get_eda_data( file_dataframes = read_batch_files_as_dataframes( inst_id, batch_files, storage_control ) - df_cohort = file_dataframes['eda_test_cohort_file.csv'] - df_course = file_dataframes['eda_test_course_file.csv'] + df_cohort = file_dataframes.get('STUDENT') + df_course = file_dataframes.get('COURSE') + + if df_cohort is None: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail="No STUDENT schema files found in batch for EDA.", + ) + cohort_years = sorted(df_cohort['Cohort'].unique().tolist()) return EdaDataResponse( From 94108d62f71a35d89783c08348f0a3f5a136f4fa Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 13:32:16 -0500 Subject: [PATCH 10/25] eda test data --- src/webapp/database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index e2676e0e..7035f369 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -210,7 +210,7 @@ def init_db(env: str) -> None: id=TEST_BATCH_UUID, inst_id=TEST_INST_UUID, name="Batch_2025-11-12_1762967767400", - completed=False, + completed=True, created_by=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"), created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, From f814620ac52b4abdca128cdac44c3d1a33dc43d7 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 14:39:46 -0500 Subject: [PATCH 11/25] allow missing eda data --- src/webapp/routers/data.py | 41 +++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 6b0c6c4f..2d74f581 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -519,12 +519,12 @@ class StackedBarSeries(BaseModel): class EdaDataResponse(BaseModel): """Complete EDA data response matching frontend expectations.""" - summary_stats: SummaryStats - gpa_by_enrollment_type: GpaChartData - gpa_by_enrollment_intensity: GpaChartData - students_by_cohort_term: TermData - course_enrollments: TermData - degree_types: List[DegreeTypeData] + summary_stats: Optional[SummaryStats] = None + gpa_by_enrollment_type: Optional[GpaChartData] = None + gpa_by_enrollment_intensity: Optional[GpaChartData] = None + students_by_cohort_term: Optional[TermData] = None + course_enrollments: Optional[TermData] = None + degree_types: Optional[List[DegreeTypeData]] = None enrollment_type_by_intensity: Dict[str, Any] # Categories and series pell_recipient_by_first_gen: Dict[str, Any] # Categories and series student_age_by_gender: Dict[str, Any] # Categories and series @@ -615,17 +615,18 @@ def calculate_gpa_series(df: pd.DataFrame, cohort_years: List[str], grouping_col Args: df: DataFrame (cohort data) cohort_years: List of cohort years - grouping_col: Column to filter by (e.g., 'Enrollment Type') + grouping_col: Column to filter by (e.g., 'enrollment_type') category_value: Specific value to filter for (e.g., 'First-Time') Returns: List of GPA values, one per cohort year """ + # Filter by category filtered = df[df[grouping_col] == category_value] # Group by cohort and calculate mean GPA - gpa_by_cohort = filtered.groupby('Cohort')['GPA Group Year 1'].mean() + gpa_by_cohort = pd.to_numeric(filtered['gpa_group_year_1'], errors='coerce').groupby(filtered['cohort']).mean() # Convert to list aligned with cohort_years data = [round(gpa_by_cohort.get(year, 0), 1) for year in cohort_years] @@ -644,8 +645,8 @@ def get_term_counts(df: pd.DataFrame, cohort_years: List[str], term_name: str) - Returns: List of student counts, one per cohort year """ - return (df[df['Cohort Term'] == term_name] - .groupby('Cohort').size() + return (df[df['cohort_term'] == term_name] + .groupby('cohort').size() .reindex(cohort_years, fill_value=0) .astype(int).tolist()) @@ -710,26 +711,26 @@ def get_eda_data( detail="No STUDENT schema files found in batch for EDA.", ) - cohort_years = sorted(df_cohort['Cohort'].unique().tolist()) + cohort_years = sorted(df_cohort['cohort'].unique().tolist()) return EdaDataResponse( summary_stats=SummaryStats( - total_students=f"{df_cohort['Student GUID'].nunique():,}", - transfer_students=f"{(df_cohort['Enrollment Type'] == 'Transfer-In').sum():,}", - avg_year1_gpa_all_students=f"{df_cohort['GPA Group Year 1'].mean():.2f}", + total_students=f"{df_cohort['study_id'].nunique():,}", + transfer_students=f"{(df_cohort['enrollment_type'] == 'Transfer-In').sum():,}", + avg_year1_gpa_all_students=f"{pd.to_numeric(df_cohort['gpa_group_year_1'], errors='coerce').mean():.2f}", ), gpa_by_enrollment_type=GpaChartData( cohort_years=cohort_years, series=[ - GpaSeriesData(name="First Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Type', 'First-Time')), - GpaSeriesData(name="Transfer Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Type', 'Transfer-In')), + GpaSeriesData(name="First Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_type', 'First-Time')), + GpaSeriesData(name="Transfer Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_type', 'Transfer-In')), ], ), gpa_by_enrollment_intensity=GpaChartData( cohort_years=cohort_years, series=[ - GpaSeriesData(name="Full Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Intensity First Term', 'Full-Time')), - GpaSeriesData(name="Part Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'Enrollment Intensity First Term', 'Part-Time')), + GpaSeriesData(name="Full Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_intensity_first_term', 'Full-Time')), + GpaSeriesData(name="Part Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_intensity_first_term', 'Part-Time')), ], ), students_by_cohort_term=TermData( @@ -746,11 +747,11 @@ def get_eda_data( ), degree_types=[ DegreeTypeData( - value=int(round(count / df_cohort['Credential Type Sought Year 1'].count() * 100)), + value=int(round(count / df_cohort['credential_type_sought_year_1'].count() * 100)), name=str(degree_type), color=["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"][i % 5] ) - for i, (degree_type, count) in enumerate(df_cohort['Credential Type Sought Year 1'].value_counts().items()) + for i, (degree_type, count) in enumerate(df_cohort['credential_type_sought_year_1'].value_counts().items()) ], enrollment_type_by_intensity={ "categories": ['First-Time', 'Re-Admit', 'Transfer-In'], From 1650d64353b74fdb1f66efce95e3ae153afb92a9 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 14:44:32 -0500 Subject: [PATCH 12/25] eda enrollment type by intensity --- src/webapp/routers/data.py | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 2d74f581..e4cca9ae 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -754,10 +754,34 @@ def get_eda_data( for i, (degree_type, count) in enumerate(df_cohort['credential_type_sought_year_1'].value_counts().items()) ], enrollment_type_by_intensity={ - "categories": ['First-Time', 'Re-Admit', 'Transfer-In'], + "categories": (categories := sorted(df_cohort['enrollment_type'].unique().tolist())), "series": [ - {"name": "Full Time", "type": "bar", "stack": "intensity", "data": [9800, 600, 8500], "color": "#F79222"}, - {"name": "Part Time", "type": "bar", "stack": "intensity", "data": [200, 1100, 1200], "color": "#00CFEA"}, + { + "name": "Full Time", + "type": "bar", + "stack": "intensity", + "data": ( + df_cohort[df_cohort['enrollment_intensity_first_term'] == 'Full-Time'] + .groupby('enrollment_type') + .size() + .reindex(categories, fill_value=0) + .tolist() + ), + "color": "#F79222" + }, + { + "name": "Part Time", + "type": "bar", + "stack": "intensity", + "data": ( + df_cohort[df_cohort['enrollment_intensity_first_term'] == 'Part-Time'] + .groupby('enrollment_type') + .size() + .reindex(categories, fill_value=0) + .tolist() + ), + "color": "#00CFEA" + }, ], }, pell_recipient_by_first_gen={ From 12d3b4acc65439bcc29e75b6ff30112a1ab26325 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 15:11:33 -0500 Subject: [PATCH 13/25] eda pell recipient by 1st gen --- src/webapp/routers/data.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index e4cca9ae..06d527d9 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -785,11 +785,35 @@ def get_eda_data( ], }, pell_recipient_by_first_gen={ - "categories": ['Yes', 'No'], + "categories": (pell_categories := sorted( + df_cohort['pell_status_first_year'] + .dropna() + .replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + .loc[lambda x: x.isin(['Yes', 'No'])] + .unique() + .tolist() + )), "series": [ - {"name": "Yes", "type": "bar", "stack": "firstGen", "data": [3000, 3700], "color": "#F79222"}, - {"name": "No", "type": "bar", "stack": "firstGen", "data": [4200, 3000], "color": "#00CFEA"}, - {"name": "Nan", "type": "bar", "stack": "firstGen", "data": [1800, 1800], "color": "#25A95A"}, + { + "name": first_gen_normalized, + "type": "bar", + "stack": "firstGen", + "data": ( + df_cohort.assign( + _pell=df_cohort['pell_status_first_year'].replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}), + _first_gen=df_cohort['first_gen'].fillna("Nan").replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + ) + .query(f"_first_gen == '{first_gen_normalized}' and _pell in ['Yes', 'No']") + .groupby('_pell') + .size() + .reindex(pell_categories, fill_value=0) + .tolist() + ), + "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3] + } + for i, first_gen_normalized in enumerate(sorted( + df_cohort['first_gen'].fillna("Nan").replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}).unique().tolist() + )) ], }, student_age_by_gender={ From fcfe9cc7ad24690c3e4899d5b0a18696153be8f9 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 15:15:56 -0500 Subject: [PATCH 14/25] eda student age by gender --- src/webapp/routers/data.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 06d527d9..28409c04 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -817,11 +817,36 @@ def get_eda_data( ], }, student_age_by_gender={ - "categories": ['Female', 'Male', 'Nonbinary, intersex, and gender-nonconforming', 'Prefer not to specify', 'Unknown'], + "categories": (gender_categories := sorted(df_cohort['gender'].dropna().unique().tolist())), "series": [ - {"name": "20 or younger", "type": "bar", "stack": "age", "data": [5000, 5000, 800, 2000, 1500], "color": "#F79222"}, - {"name": "20 - 24", "type": "bar", "stack": "age", "data": [2500, 2500, 100, 1000, 1000], "color": "#00CFEA"}, - {"name": "Older than 24", "type": "bar", "stack": "age", "data": [2000, 1300, 100, 500, 1000], "color": "#25A95A"}, + { + "name": age_group, + "type": "bar", + "stack": "age", + "data": ( + df_cohort.assign( + _age_group=( + (df_cohort['student_age'] if 'student_age' in df_cohort.columns + else df_cohort['age'] if 'age' in df_cohort.columns + else pd.Series([None] * len(df_cohort))) + .apply( + lambda x: ( + "20 or younger" if pd.isna(x) or any(term in str(x).lower() for term in ['20 or younger', '20 or under', 'under 20', '<=20']) or (isinstance(x, (int, float)) and x <= 20) + else "20 - 24" if any(term in str(x).lower() for term in ['20-24', '20 to 24', '20 - 24']) or (isinstance(x, (int, float)) and 20 < x <= 24) + else "Older than 24" + ) + ) + ) + ) + .query(f"_age_group == '{age_group}'") + .groupby('gender') + .size() + .reindex(gender_categories, fill_value=0) + .tolist() + ), + "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3] + } + for i, age_group in enumerate(["20 or younger", "20 - 24", "Older than 24"]) ], }, race_by_pell_status={ From bf8b288fc34cfc4341b274638494fedabbc751c8 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 15:24:47 -0500 Subject: [PATCH 15/25] eda pell status by race --- src/webapp/routers/data.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 28409c04..5be43db0 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -850,10 +850,32 @@ def get_eda_data( ], }, race_by_pell_status={ - "categories": ['American Indian or Alaska Native', 'Asian', 'Black or African American', 'Native Hawaiian or other Pacific Islander', 'Nonresident Alien', 'Two or More Races', 'Unknown', 'White'], + "categories": (race_categories := sorted(df_cohort['race'].dropna().unique().tolist())), "series": [ - {"name": "Yes", "type": "bar", "stack": "pell", "data": [30, 250, 400, 20, 50, 100, 150, 2000], "color": "#F79222"}, - {"name": "No", "type": "bar", "stack": "pell", "data": [20, 50, 200, 10, 25, 50, 50, 250], "color": "#00CFEA"}, + { + "name": pell_status_normalized, + "type": "bar", + "stack": "pell", + "data": ( + df_cohort.assign( + _pell=df_cohort['pell_status_first_year'].replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + ) + .query(f"_pell == '{pell_status_normalized}' and _pell in ['Yes', 'No']") + .groupby('race') + .size() + .reindex(race_categories, fill_value=0) + .tolist() + ), + "color": ["#F79222", "#00CFEA"][i % 2] + } + for i, pell_status_normalized in enumerate(sorted( + df_cohort['pell_status_first_year'] + .dropna() + .replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + .loc[lambda x: x.isin(['Yes', 'No'])] + .unique() + .tolist() + )) ], }, ) From 34288878a8c52a665b516454e2f1f7848ce0fda8 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 15:47:02 -0500 Subject: [PATCH 16/25] eda tests --- src/webapp/routers/data_test.py | 204 ++++++++++++++++++++++++++++++++ 1 file changed, 204 insertions(+) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 9b1c1c31..3bb82f6b 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -645,3 +645,207 @@ def test_validate_failure_batch(client: TestClient) -> None: assert response_sftp.json()["file_types"] == ["COURSE"] assert response_sftp.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID) assert response_sftp.json()["source"] == "MANUAL_UPLOAD" + + +def test_get_eda_data_unauthorized(client: TestClient) -> None: + """Test GET /institutions//batch//eda with unauthorized access.""" + response = client.get( + "/institutions/" + + uuid_to_str(UUID_INVALID) + + "/batch/" + + uuid_to_str(BATCH_UUID) + + "/eda" + ) + assert str(response) == "" + assert ( + response.text + == '{"detail":"Not authorized to read this institution\'s resources."}' + ) + + +def test_get_eda_data_batch_not_found(client: TestClient) -> None: + """Test GET /institutions//batch//eda with non-existent batch.""" + fake_batch_uuid = uuid.UUID("00000000-0000-0000-0000-000000000000") + response = client.get( + "/institutions/" + + uuid_to_str(USER_VALID_INST_UUID) + + "/batch/" + + uuid_to_str(fake_batch_uuid) + + "/eda" + ) + assert response.status_code == 404 + assert response.json()["detail"] == "Batch not found." + + +def test_get_eda_data_no_student_files(client: TestClient, session: sqlalchemy.orm.Session) -> None: + """Test GET /institutions//batch//eda with batch containing no STUDENT files.""" + import pandas as pd + + # Create a batch with only COURSE files + batch_with_course = BatchTable( + id=uuid.UUID("11111111-1111-1111-1111-111111111111"), + inst_id=USER_VALID_INST_UUID, + name="batch_course_only", + created_by=CREATOR_UUID, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + ) + course_file = FileTable( + id=uuid.UUID("22222222-2222-2222-2222-222222222222"), + inst_id=USER_VALID_INST_UUID, + name="course_file.csv", + source="MANUAL_UPLOAD", + batches={batch_with_course}, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + sst_generated=False, + valid=True, + schemas=[SchemaType.COURSE], + ) + session.add_all([batch_with_course, course_file]) + session.commit() + + # Mock storage to return empty (no files found) + MOCK_STORAGE.read_csv_as_dataframe.side_effect = ValueError("File not found") + + response = client.get( + "/institutions/" + + uuid_to_str(USER_VALID_INST_UUID) + + "/batch/" + + uuid_to_str(batch_with_course.id) + + "/eda" + ) + assert response.status_code == 404 + # When files can't be loaded from GCS, we get "No valid input files found" + # The "No STUDENT schema files found" error only occurs after files are loaded + assert "No valid input files found" in response.json()["detail"] + + +def test_get_eda_data_success(client: TestClient, session: sqlalchemy.orm.Session) -> None: + """Test GET /institutions//batch//eda with valid data.""" + import pandas as pd + + # Create a batch with STUDENT and COURSE files + eda_batch = BatchTable( + id=uuid.UUID("33333333-3333-3333-3333-333333333333"), + inst_id=USER_VALID_INST_UUID, + name="batch_eda_test", + created_by=CREATOR_UUID, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + completed=True, + ) + student_file = FileTable( + id=uuid.UUID("44444444-4444-4444-4444-444444444444"), + inst_id=USER_VALID_INST_UUID, + name="student_file.csv", + source="MANUAL_UPLOAD", + batches={eda_batch}, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + sst_generated=False, + valid=True, + schemas=[SchemaType.STUDENT], + ) + course_file = FileTable( + id=uuid.UUID("55555555-5555-5555-5555-555555555555"), + inst_id=USER_VALID_INST_UUID, + name="course_file.csv", + source="MANUAL_UPLOAD", + batches={eda_batch}, + created_at=DATETIME_TESTING, + updated_at=DATETIME_TESTING, + sst_generated=False, + valid=True, + schemas=[SchemaType.COURSE], + ) + session.add_all([eda_batch, student_file, course_file]) + session.commit() + + # Create mock DataFrames + df_student = pd.DataFrame({ + 'study_id': ['S001', 'S002', 'S003', 'S001'], # S001 appears twice + 'cohort': ['2020', '2020', '2021', '2021'], + 'cohort_term': ['FALL', 'FALL', 'SPRING', 'SPRING'], + 'enrollment_type': ['First-Time', 'Transfer-In', 'First-Time', 'Transfer-In'], + 'enrollment_intensity_first_term': ['Full-Time', 'Part-Time', 'Full-Time', 'Part-Time'], + 'gpa_group_year_1': [3.5, 3.2, 3.8, 2.9], + 'credential_type_sought_year_1': ['Bachelor', 'Associate', 'Bachelor', 'Associate'], + 'pell_status_first_year': ['Y', 'N', 'Y', 'N'], + 'first_gen': ['Y', 'N', 'Y', 'N'], + 'gender': ['Female', 'Male', 'Female', 'Male'], + 'race': ['White', 'Black or African American', 'Asian', 'White'], + 'student_age': ['20 - 24', '20 or younger', 'Older than 24', '20 - 24'], + }) + + df_course = pd.DataFrame({ + 'study_id': ['S001', 'S002', 'S003'], + 'cohort': ['2020', '2020', '2021'], + 'cohort_term': ['FALL', 'FALL', 'SPRING'], + }) + + # Mock storage to return our test DataFrames + def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame: + if 'student' in blob_path.lower(): + return df_student + elif 'course' in blob_path.lower(): + return df_course + else: + raise ValueError(f"File not found: {blob_path}") + + MOCK_STORAGE.read_csv_as_dataframe.side_effect = mock_read_csv + + response = client.get( + "/institutions/" + + uuid_to_str(USER_VALID_INST_UUID) + + "/batch/" + + uuid_to_str(eda_batch.id) + + "/eda" + ) + + assert response.status_code == 200 + data = response.json() + + # Check response structure + assert "summary_stats" in data + assert "gpa_by_enrollment_type" in data + assert "gpa_by_enrollment_intensity" in data + assert "students_by_cohort_term" in data + assert "course_enrollments" in data + assert "degree_types" in data + assert "enrollment_type_by_intensity" in data + assert "pell_recipient_by_first_gen" in data + assert "student_age_by_gender" in data + assert "race_by_pell_status" in data + + # Check summary stats + assert data["summary_stats"]["total_students"] == "3" # 3 unique study_ids + assert data["summary_stats"]["transfer_students"] == "2" # 2 Transfer-In + + # Check GPA charts have cohort years + assert "cohort_years" in data["gpa_by_enrollment_type"] + assert len(data["gpa_by_enrollment_type"]["cohort_years"]) == 2 # 2020, 2021 + assert "2020" in data["gpa_by_enrollment_type"]["cohort_years"] + assert "2021" in data["gpa_by_enrollment_type"]["cohort_years"] + + # Check term data structure + assert "fall" in data["students_by_cohort_term"] + assert "spring" in data["students_by_cohort_term"] + assert len(data["students_by_cohort_term"]["fall"]) == 2 # One per cohort year + + # Check enrollment type by intensity has categories and series + assert "categories" in data["enrollment_type_by_intensity"] + assert "series" in data["enrollment_type_by_intensity"] + assert len(data["enrollment_type_by_intensity"]["series"]) > 0 + + # Check pell recipient chart structure + assert "categories" in data["pell_recipient_by_first_gen"] + assert "series" in data["pell_recipient_by_first_gen"] + + # Check student age by gender structure + assert "categories" in data["student_age_by_gender"] + assert "series" in data["student_age_by_gender"] + + # Check race by pell status structure + assert "categories" in data["race_by_pell_status"] + assert "series" in data["race_by_pell_status"] From aef0b96dc664ce9e5da40a9d8135dd19b7ea25cd Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 16:22:05 -0500 Subject: [PATCH 17/25] cache eda --- src/webapp/routers/data.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 5be43db0..9f02ea75 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -14,6 +14,7 @@ import re from ..validation import HardValidationError import pandas as pd +from cachetools import TTLCache from ..utilities import ( has_access_to_inst_or_err, @@ -51,6 +52,11 @@ logger = logging.getLogger(__name__) logger.setLevel(logging.DEBUG) +# Cache for EDA data - TTL of 10 minutes (600 seconds) +# Cache key format: f"{inst_id}:{batch_id}" +EDA_CACHE_TTL = int(os.getenv("EDA_CACHE_TTL", "600")) # Default 10 minutes +EDA_CACHE: Any = TTLCache(maxsize=64, ttl=EDA_CACHE_TTL) + router = APIRouter( prefix="/institutions", tags=["data"], @@ -698,6 +704,15 @@ def get_eda_data( batch_record = batch_result[0][0] batch_files = batch_record.files + # Check cache first + cache_key = f"{inst_id}:{batch_id}" + cached_result = EDA_CACHE.get(cache_key) + if cached_result is not None: + logger.debug(f"EDA cache hit for {cache_key}") + return cached_result + + logger.debug(f"EDA cache miss for {cache_key}, computing...") + # Read files from batch using helper function file_dataframes = read_batch_files_as_dataframes( inst_id, batch_files, storage_control @@ -713,7 +728,7 @@ def get_eda_data( cohort_years = sorted(df_cohort['cohort'].unique().tolist()) - return EdaDataResponse( + result = EdaDataResponse( summary_stats=SummaryStats( total_students=f"{df_cohort['study_id'].nunique():,}", transfer_students=f"{(df_cohort['enrollment_type'] == 'Transfer-In').sum():,}", @@ -879,6 +894,12 @@ def get_eda_data( ], }, ) + + # Cache the result before returning + EDA_CACHE[cache_key] = result + logger.debug(f"EDA result cached for {cache_key}") + + return result @router.post("/{inst_id}/batch", response_model=BatchInfo) def create_batch( From fd176255ba681e789a166a2ec763dcc80967131a Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 16:53:15 -0500 Subject: [PATCH 18/25] tidy up --- src/webapp/routers/data.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 9f02ea75..c019829f 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -544,9 +544,6 @@ def read_batch_files_as_dataframes( ) -> Dict[str, pd.DataFrame]: """Read CSV files from a batch and return as DataFrames. - In LOCAL mode, checks ../test_cloud_storage/validated/ first, then falls back to GCS. - In deployed environments (DEV/STAGING/PROD), only reads from GCS. - Args: inst_id: Institution ID batch_files: Set of FileTable objects from the batch From 1ca1a36280cfe5c990cf5268f8d69723dd8bba3e Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 18:04:27 -0500 Subject: [PATCH 19/25] remove LOCAL test bucket setup --- src/webapp/routers/data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index c019829f..c14a2f6a 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -556,9 +556,7 @@ def read_batch_files_as_dataframes( HTTPException: If no valid files found """ file_dataframes: Dict[str, pd.DataFrame] = {} - is_local = env_vars.get("ENV", "").upper() == "LOCAL" - # For LOCAL development, use DEV GCS buckets for file storage - bucket_name = f"dev_{inst_id}" if is_local else get_external_bucket_name(inst_id) + bucket_name = get_external_bucket_name(inst_id) # Temporary storage: file_record -> DataFrame loaded_files: Dict[Any, pd.DataFrame] = {} From 2560c9c3332ca740cc43086679ae443820afca76 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 18:06:50 -0500 Subject: [PATCH 20/25] return List from get_term_counts --- src/webapp/routers/data.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index c14a2f6a..facffc4b 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -646,10 +646,11 @@ def get_term_counts(df: pd.DataFrame, cohort_years: List[str], term_name: str) - Returns: List of student counts, one per cohort year """ - return (df[df['cohort_term'] == term_name] - .groupby('cohort').size() - .reindex(cohort_years, fill_value=0) - .astype(int).tolist()) + result_series = (df[df['cohort_term'] == term_name] + .groupby('cohort').size() + .reindex(cohort_years, fill_value=0) + .astype(int)) + return [int(x) for x in result_series.tolist()] # Explicitly convert to List[int] @router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse) From 22c427c2af6a54f9badb7e64847b60ddd52b7361 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 18:13:39 -0500 Subject: [PATCH 21/25] import pandas --- src/webapp/routers/data_test.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 3bb82f6b..3f5dafc7 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -679,8 +679,6 @@ def test_get_eda_data_batch_not_found(client: TestClient) -> None: def test_get_eda_data_no_student_files(client: TestClient, session: sqlalchemy.orm.Session) -> None: """Test GET /institutions//batch//eda with batch containing no STUDENT files.""" - import pandas as pd - # Create a batch with only COURSE files batch_with_course = BatchTable( id=uuid.UUID("11111111-1111-1111-1111-111111111111"), From decd8698a4200623f3f9ec03a70b3dc9e368ae61 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 18:14:32 -0500 Subject: [PATCH 22/25] remove unused variable --- src/webapp/routers/data.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index facffc4b..4e2f0130 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -555,7 +555,6 @@ def read_batch_files_as_dataframes( Raises: HTTPException: If no valid files found """ - file_dataframes: Dict[str, pd.DataFrame] = {} bucket_name = get_external_bucket_name(inst_id) # Temporary storage: file_record -> DataFrame From 4844bd14eb64ae4fec83187550a709f99ce400b8 Mon Sep 17 00:00:00 2001 From: William Carr Date: Tue, 9 Dec 2025 18:16:59 -0500 Subject: [PATCH 23/25] tidy up --- src/webapp/database.py | 8 +- src/webapp/gcsutil.py | 16 +- src/webapp/routers/data.py | 318 +++++++++++++++++++++----------- src/webapp/routers/data_test.py | 103 +++++++---- 4 files changed, 289 insertions(+), 156 deletions(-) diff --git a/src/webapp/database.py b/src/webapp/database.py index 7035f369..f9c56fea 100644 --- a/src/webapp/database.py +++ b/src/webapp/database.py @@ -158,7 +158,9 @@ def init_db(env: str) -> None: uploader=LOCAL_USER_UUID, sst_generated=False, valid=True, - schemas=["STUDENT"], # Using string literal to avoid circular import + schemas=[ + "STUDENT" + ], # Using string literal to avoid circular import created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) @@ -177,7 +179,7 @@ def init_db(env: str) -> None: session.merge(test_file_1) session.merge(test_file_2) session.merge(test_batch) - + # Create test files for EDA test institution (TEST_INST_UUID) # Real files from DEV batch 3182f472e0794678a0a19ca5ead6c49a test_file_student = FileTable( @@ -204,7 +206,7 @@ def init_db(env: str) -> None: created_at=DATETIME_TESTING, updated_at=DATETIME_TESTING, ) - + # Test batch - matches DEV USC Beaufort test_batch = BatchTable( id=TEST_BATCH_UUID, diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py index 79a50d19..b74d14c9 100644 --- a/src/webapp/gcsutil.py +++ b/src/webapp/gcsutil.py @@ -358,28 +358,28 @@ def get_file_contents(self, bucket_name: str, file_name: str) -> Any: blob = bucket.blob(file_name) res = blob.download_as_bytes() return res - + def read_csv_as_dataframe(self, bucket_name: str, file_name: str) -> Any: """Read a CSV file from GCS and return as pandas DataFrame. - + Args: bucket_name: GCS bucket name file_name: Full blob path (e.g., 'validated/filename.csv') - + Returns: pandas DataFrame - + Raises: ValueError: If bucket or file not found """ import pandas as pd - + storage_client = storage.Client() bucket = storage_client.get_bucket(bucket_name) blob = bucket.blob(file_name) - + if not blob.exists(): raise ValueError(f"File not found: {file_name}") - + with blob.open("r") as fh: - return pd.read_csv(fh) \ No newline at end of file + return pd.read_csv(fh) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 4e2f0130..28bd60fd 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -482,6 +482,7 @@ def read_batch_info( class SummaryStats(BaseModel): """Summary statistics for the EDA dashboard.""" + total_students: str transfer_students: str avg_year1_gpa_all_students: str @@ -489,18 +490,21 @@ class SummaryStats(BaseModel): class GpaSeriesData(BaseModel): """GPA data series for a chart.""" + name: str data: List[float] class GpaChartData(BaseModel): """GPA chart data with cohort years and series.""" + cohort_years: List[str] series: List[GpaSeriesData] class TermData(BaseModel): """Term-based data (fall, winter, spring, summer).""" + fall: List[int] winter: List[int] spring: List[int] @@ -509,6 +513,7 @@ class TermData(BaseModel): class DegreeTypeData(BaseModel): """Degree type data for donut chart.""" + value: int name: str color: str @@ -516,6 +521,7 @@ class DegreeTypeData(BaseModel): class StackedBarSeries(BaseModel): """Series data for stacked bar charts.""" + name: str type: str = "bar" stack: str @@ -525,6 +531,7 @@ class StackedBarSeries(BaseModel): class EdaDataResponse(BaseModel): """Complete EDA data response matching frontend expectations.""" + summary_stats: Optional[SummaryStats] = None gpa_by_enrollment_type: Optional[GpaChartData] = None gpa_by_enrollment_intensity: Optional[GpaChartData] = None @@ -543,33 +550,33 @@ def read_batch_files_as_dataframes( storage_control: StorageControl, ) -> Dict[str, pd.DataFrame]: """Read CSV files from a batch and return as DataFrames. - + Args: inst_id: Institution ID batch_files: Set of FileTable objects from the batch storage_control: StorageControl instance for GCS access - + Returns: Dictionary mapping file_name -> pandas.DataFrame - + Raises: HTTPException: If no valid files found """ bucket_name = get_external_bucket_name(inst_id) - + # Temporary storage: file_record -> DataFrame loaded_files: Dict[Any, pd.DataFrame] = {} - + for file_record in batch_files: file_name = file_record.name - + # Skip SST-generated output files (only process input files) if file_record.sst_generated: logger.debug(f"Skipping SST-generated file: {file_name}") continue - + df = None - + # Fall back to GCS using StorageControl try: blob_path = f"validated/{file_name}" @@ -579,17 +586,17 @@ def read_batch_files_as_dataframes( logger.warning(f"File not found in GCS: {e}") except Exception as e: logger.error(f"Failed to read from GCS: {e}") - + if df is not None: loaded_files[file_record] = df - + if not loaded_files: error_msg = f"No valid input files found in batch (checked GCS: {bucket_name}/validated/)" raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=error_msg, ) - + # Group by schema type and combine DataFrames schema_dataframes: Dict[str, List[pd.DataFrame]] = {} for file_record, df in loaded_files.items(): @@ -597,58 +604,71 @@ def read_batch_files_as_dataframes( if schema not in schema_dataframes: schema_dataframes[schema] = [] schema_dataframes[schema].append(df) - + result = {} for schema, dfs in schema_dataframes.items(): if len(dfs) == 1: result[schema] = dfs[0] else: result[schema] = pd.concat(dfs, ignore_index=True) - logger.info(f"Combined {len(dfs)} files for schema {schema} ({len(result[schema])} total rows)") - + logger.info( + f"Combined {len(dfs)} files for schema {schema} ({len(result[schema])} total rows)" + ) + return result -def calculate_gpa_series(df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str) -> List[float]: +def calculate_gpa_series( + df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str +) -> List[float]: """Calculate GPA data for one category across cohort years. - + Args: df: DataFrame (cohort data) cohort_years: List of cohort years grouping_col: Column to filter by (e.g., 'enrollment_type') category_value: Specific value to filter for (e.g., 'First-Time') - + Returns: List of GPA values, one per cohort year """ # Filter by category filtered = df[df[grouping_col] == category_value] - + # Group by cohort and calculate mean GPA - gpa_by_cohort = pd.to_numeric(filtered['gpa_group_year_1'], errors='coerce').groupby(filtered['cohort']).mean() - + gpa_by_cohort = ( + pd.to_numeric(filtered["gpa_group_year_1"], errors="coerce") + .groupby(filtered["cohort"]) + .mean() + ) + # Convert to list aligned with cohort_years data = [round(gpa_by_cohort.get(year, 0), 1) for year in cohort_years] - + return data -def get_term_counts(df: pd.DataFrame, cohort_years: List[str], term_name: str) -> List[int]: +def get_term_counts( + df: pd.DataFrame, cohort_years: List[str], term_name: str +) -> List[int]: """Get student counts for a specific term across cohort years. - + Args: df: DataFrame (cohort or course data) cohort_years: List of cohort years term_name: Term name to filter for (e.g., 'FALL', 'WINTER') - + Returns: List of student counts, one per cohort year """ - result_series = (df[df['cohort_term'] == term_name] - .groupby('cohort').size() - .reindex(cohort_years, fill_value=0) - .astype(int)) + result_series = ( + df[df["cohort_term"] == term_name] + .groupby("cohort") + .size() + .reindex(cohort_years, fill_value=0) + .astype(int) + ) return [int(x) for x in result_series.tolist()] # Explicitly convert to List[int] @@ -661,7 +681,7 @@ def get_eda_data( storage_control: Annotated[StorageControl, Depends(StorageControl)], ) -> Any: """Returns EDA (Exploratory Data Analysis) data for a specific batch. - + This endpoint provides all the data needed to populate the EDA dashboard, including summary statistics, GPA charts, enrollment data, and demographic breakdowns. Analyzes all files in the batch together to provide comprehensive insights. @@ -669,7 +689,7 @@ def get_eda_data( has_access_to_inst_or_err(inst_id, current_user) has_full_data_access_or_err(current_user, "EDA data") local_session.set(sql_session) - + # Verify batch exists and belongs to institution batch_result = ( local_session.get() @@ -683,46 +703,46 @@ def get_eda_data( ) .all() ) - + if not batch_result or len(batch_result) == 0: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail="Batch not found.", ) - + if len(batch_result) > 1: raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail="Batch duplicates found.", ) - + batch_record = batch_result[0][0] batch_files = batch_record.files - + # Check cache first cache_key = f"{inst_id}:{batch_id}" cached_result = EDA_CACHE.get(cache_key) if cached_result is not None: logger.debug(f"EDA cache hit for {cache_key}") return cached_result - + logger.debug(f"EDA cache miss for {cache_key}, computing...") - + # Read files from batch using helper function file_dataframes = read_batch_files_as_dataframes( inst_id, batch_files, storage_control ) - df_cohort = file_dataframes.get('STUDENT') - df_course = file_dataframes.get('COURSE') - + df_cohort = file_dataframes.get("STUDENT") + df_course = file_dataframes.get("COURSE") + if df_cohort is None: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail="No STUDENT schema files found in batch for EDA.", ) - - cohort_years = sorted(df_cohort['cohort'].unique().tolist()) - + + cohort_years = sorted(df_cohort["cohort"].unique().tolist()) + result = EdaDataResponse( summary_stats=SummaryStats( total_students=f"{df_cohort['study_id'].nunique():,}", @@ -732,77 +752,117 @@ def get_eda_data( gpa_by_enrollment_type=GpaChartData( cohort_years=cohort_years, series=[ - GpaSeriesData(name="First Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_type', 'First-Time')), - GpaSeriesData(name="Transfer Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_type', 'Transfer-In')), + GpaSeriesData( + name="First Time Student", + data=calculate_gpa_series( + df_cohort, cohort_years, "enrollment_type", "First-Time" + ), + ), + GpaSeriesData( + name="Transfer Student", + data=calculate_gpa_series( + df_cohort, cohort_years, "enrollment_type", "Transfer-In" + ), + ), ], ), gpa_by_enrollment_intensity=GpaChartData( cohort_years=cohort_years, series=[ - GpaSeriesData(name="Full Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_intensity_first_term', 'Full-Time')), - GpaSeriesData(name="Part Time Student", data=calculate_gpa_series(df_cohort, cohort_years, 'enrollment_intensity_first_term', 'Part-Time')), + GpaSeriesData( + name="Full Time Student", + data=calculate_gpa_series( + df_cohort, + cohort_years, + "enrollment_intensity_first_term", + "Full-Time", + ), + ), + GpaSeriesData( + name="Part Time Student", + data=calculate_gpa_series( + df_cohort, + cohort_years, + "enrollment_intensity_first_term", + "Part-Time", + ), + ), ], ), students_by_cohort_term=TermData( - fall=get_term_counts(df_cohort, cohort_years, 'FALL'), - winter=get_term_counts(df_cohort, cohort_years, 'WINTER'), - spring=get_term_counts(df_cohort, cohort_years, 'SPRING'), - summer=get_term_counts(df_cohort, cohort_years, 'SUMMER'), + fall=get_term_counts(df_cohort, cohort_years, "FALL"), + winter=get_term_counts(df_cohort, cohort_years, "WINTER"), + spring=get_term_counts(df_cohort, cohort_years, "SPRING"), + summer=get_term_counts(df_cohort, cohort_years, "SUMMER"), ), course_enrollments=TermData( - fall=get_term_counts(df_course, cohort_years, 'FALL'), - winter=get_term_counts(df_course, cohort_years, 'WINTER'), - spring=get_term_counts(df_course, cohort_years, 'SPRING'), - summer=get_term_counts(df_course, cohort_years, 'SUMMER'), + fall=get_term_counts(df_course, cohort_years, "FALL"), + winter=get_term_counts(df_course, cohort_years, "WINTER"), + spring=get_term_counts(df_course, cohort_years, "SPRING"), + summer=get_term_counts(df_course, cohort_years, "SUMMER"), ), degree_types=[ DegreeTypeData( - value=int(round(count / df_cohort['credential_type_sought_year_1'].count() * 100)), + value=int( + round( + count / df_cohort["credential_type_sought_year_1"].count() * 100 + ) + ), name=str(degree_type), - color=["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"][i % 5] + color=["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"][i % 5], + ) + for i, (degree_type, count) in enumerate( + df_cohort["credential_type_sought_year_1"].value_counts().items() ) - for i, (degree_type, count) in enumerate(df_cohort['credential_type_sought_year_1'].value_counts().items()) ], enrollment_type_by_intensity={ - "categories": (categories := sorted(df_cohort['enrollment_type'].unique().tolist())), + "categories": ( + categories := sorted(df_cohort["enrollment_type"].unique().tolist()) + ), "series": [ { "name": "Full Time", "type": "bar", "stack": "intensity", "data": ( - df_cohort[df_cohort['enrollment_intensity_first_term'] == 'Full-Time'] - .groupby('enrollment_type') + df_cohort[ + df_cohort["enrollment_intensity_first_term"] == "Full-Time" + ] + .groupby("enrollment_type") .size() .reindex(categories, fill_value=0) .tolist() ), - "color": "#F79222" + "color": "#F79222", }, { "name": "Part Time", "type": "bar", "stack": "intensity", "data": ( - df_cohort[df_cohort['enrollment_intensity_first_term'] == 'Part-Time'] - .groupby('enrollment_type') + df_cohort[ + df_cohort["enrollment_intensity_first_term"] == "Part-Time" + ] + .groupby("enrollment_type") .size() .reindex(categories, fill_value=0) .tolist() ), - "color": "#00CFEA" + "color": "#00CFEA", }, ], }, pell_recipient_by_first_gen={ - "categories": (pell_categories := sorted( - df_cohort['pell_status_first_year'] - .dropna() - .replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) - .loc[lambda x: x.isin(['Yes', 'No'])] - .unique() - .tolist() - )), + "categories": ( + pell_categories := sorted( + df_cohort["pell_status_first_year"] + .dropna() + .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}) + .loc[lambda x: x.isin(["Yes", "No"])] + .unique() + .tolist() + ) + ), "series": [ { "name": first_gen_normalized, @@ -810,24 +870,40 @@ def get_eda_data( "stack": "firstGen", "data": ( df_cohort.assign( - _pell=df_cohort['pell_status_first_year'].replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}), - _first_gen=df_cohort['first_gen'].fillna("Nan").replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + _pell=df_cohort["pell_status_first_year"].replace( + {"Y": "Yes", "N": "No", "y": "Yes", "n": "No"} + ), + _first_gen=df_cohort["first_gen"] + .fillna("Nan") + .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}), ) - .query(f"_first_gen == '{first_gen_normalized}' and _pell in ['Yes', 'No']") - .groupby('_pell') + .query( + f"_first_gen == '{first_gen_normalized}' and _pell in ['Yes', 'No']" + ) + .groupby("_pell") .size() .reindex(pell_categories, fill_value=0) .tolist() ), - "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3] + "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3], } - for i, first_gen_normalized in enumerate(sorted( - df_cohort['first_gen'].fillna("Nan").replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}).unique().tolist() - )) + for i, first_gen_normalized in enumerate( + sorted( + df_cohort["first_gen"] + .fillna("Nan") + .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}) + .unique() + .tolist() + ) + ) ], }, student_age_by_gender={ - "categories": (gender_categories := sorted(df_cohort['gender'].dropna().unique().tolist())), + "categories": ( + gender_categories := sorted( + df_cohort["gender"].dropna().unique().tolist() + ) + ), "series": [ { "name": age_group, @@ -836,31 +912,56 @@ def get_eda_data( "data": ( df_cohort.assign( _age_group=( - (df_cohort['student_age'] if 'student_age' in df_cohort.columns - else df_cohort['age'] if 'age' in df_cohort.columns - else pd.Series([None] * len(df_cohort))) - .apply( + ( + df_cohort["student_age"] + if "student_age" in df_cohort.columns + else df_cohort["age"] + if "age" in df_cohort.columns + else pd.Series([None] * len(df_cohort)) + ).apply( lambda x: ( - "20 or younger" if pd.isna(x) or any(term in str(x).lower() for term in ['20 or younger', '20 or under', 'under 20', '<=20']) or (isinstance(x, (int, float)) and x <= 20) - else "20 - 24" if any(term in str(x).lower() for term in ['20-24', '20 to 24', '20 - 24']) or (isinstance(x, (int, float)) and 20 < x <= 24) + "20 or younger" + if pd.isna(x) + or any( + term in str(x).lower() + for term in [ + "20 or younger", + "20 or under", + "under 20", + "<=20", + ] + ) + or (isinstance(x, (int, float)) and x <= 20) + else "20 - 24" + if any( + term in str(x).lower() + for term in ["20-24", "20 to 24", "20 - 24"] + ) + or ( + isinstance(x, (int, float)) and 20 < x <= 24 + ) else "Older than 24" ) ) ) ) .query(f"_age_group == '{age_group}'") - .groupby('gender') + .groupby("gender") .size() .reindex(gender_categories, fill_value=0) .tolist() ), - "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3] + "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3], } - for i, age_group in enumerate(["20 or younger", "20 - 24", "Older than 24"]) + for i, age_group in enumerate( + ["20 or younger", "20 - 24", "Older than 24"] + ) ], }, race_by_pell_status={ - "categories": (race_categories := sorted(df_cohort['race'].dropna().unique().tolist())), + "categories": ( + race_categories := sorted(df_cohort["race"].dropna().unique().tolist()) + ), "series": [ { "name": pell_status_normalized, @@ -868,34 +969,41 @@ def get_eda_data( "stack": "pell", "data": ( df_cohort.assign( - _pell=df_cohort['pell_status_first_year'].replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) + _pell=df_cohort["pell_status_first_year"].replace( + {"Y": "Yes", "N": "No", "y": "Yes", "n": "No"} + ) ) - .query(f"_pell == '{pell_status_normalized}' and _pell in ['Yes', 'No']") - .groupby('race') + .query( + f"_pell == '{pell_status_normalized}' and _pell in ['Yes', 'No']" + ) + .groupby("race") .size() .reindex(race_categories, fill_value=0) .tolist() ), - "color": ["#F79222", "#00CFEA"][i % 2] + "color": ["#F79222", "#00CFEA"][i % 2], } - for i, pell_status_normalized in enumerate(sorted( - df_cohort['pell_status_first_year'] - .dropna() - .replace({'Y': 'Yes', 'N': 'No', 'y': 'Yes', 'n': 'No'}) - .loc[lambda x: x.isin(['Yes', 'No'])] - .unique() - .tolist() - )) + for i, pell_status_normalized in enumerate( + sorted( + df_cohort["pell_status_first_year"] + .dropna() + .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}) + .loc[lambda x: x.isin(["Yes", "No"])] + .unique() + .tolist() + ) + ) ], }, ) - + # Cache the result before returning EDA_CACHE[cache_key] = result logger.debug(f"EDA result cached for {cache_key}") - + return result + @router.post("/{inst_id}/batch", response_model=BatchInfo) def create_batch( inst_id: str, diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py index 3f5dafc7..789ca72d 100644 --- a/src/webapp/routers/data_test.py +++ b/src/webapp/routers/data_test.py @@ -677,7 +677,9 @@ def test_get_eda_data_batch_not_found(client: TestClient) -> None: assert response.json()["detail"] == "Batch not found." -def test_get_eda_data_no_student_files(client: TestClient, session: sqlalchemy.orm.Session) -> None: +def test_get_eda_data_no_student_files( + client: TestClient, session: sqlalchemy.orm.Session +) -> None: """Test GET /institutions//batch//eda with batch containing no STUDENT files.""" # Create a batch with only COURSE files batch_with_course = BatchTable( @@ -702,10 +704,10 @@ def test_get_eda_data_no_student_files(client: TestClient, session: sqlalchemy.o ) session.add_all([batch_with_course, course_file]) session.commit() - + # Mock storage to return empty (no files found) MOCK_STORAGE.read_csv_as_dataframe.side_effect = ValueError("File not found") - + response = client.get( "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) @@ -719,10 +721,12 @@ def test_get_eda_data_no_student_files(client: TestClient, session: sqlalchemy.o assert "No valid input files found" in response.json()["detail"] -def test_get_eda_data_success(client: TestClient, session: sqlalchemy.orm.Session) -> None: +def test_get_eda_data_success( + client: TestClient, session: sqlalchemy.orm.Session +) -> None: """Test GET /institutions//batch//eda with valid data.""" import pandas as pd - + # Create a batch with STUDENT and COURSE files eda_batch = BatchTable( id=uuid.UUID("33333333-3333-3333-3333-333333333333"), @@ -759,40 +763,59 @@ def test_get_eda_data_success(client: TestClient, session: sqlalchemy.orm.Sessio ) session.add_all([eda_batch, student_file, course_file]) session.commit() - + # Create mock DataFrames - df_student = pd.DataFrame({ - 'study_id': ['S001', 'S002', 'S003', 'S001'], # S001 appears twice - 'cohort': ['2020', '2020', '2021', '2021'], - 'cohort_term': ['FALL', 'FALL', 'SPRING', 'SPRING'], - 'enrollment_type': ['First-Time', 'Transfer-In', 'First-Time', 'Transfer-In'], - 'enrollment_intensity_first_term': ['Full-Time', 'Part-Time', 'Full-Time', 'Part-Time'], - 'gpa_group_year_1': [3.5, 3.2, 3.8, 2.9], - 'credential_type_sought_year_1': ['Bachelor', 'Associate', 'Bachelor', 'Associate'], - 'pell_status_first_year': ['Y', 'N', 'Y', 'N'], - 'first_gen': ['Y', 'N', 'Y', 'N'], - 'gender': ['Female', 'Male', 'Female', 'Male'], - 'race': ['White', 'Black or African American', 'Asian', 'White'], - 'student_age': ['20 - 24', '20 or younger', 'Older than 24', '20 - 24'], - }) - - df_course = pd.DataFrame({ - 'study_id': ['S001', 'S002', 'S003'], - 'cohort': ['2020', '2020', '2021'], - 'cohort_term': ['FALL', 'FALL', 'SPRING'], - }) - + df_student = pd.DataFrame( + { + "study_id": ["S001", "S002", "S003", "S001"], # S001 appears twice + "cohort": ["2020", "2020", "2021", "2021"], + "cohort_term": ["FALL", "FALL", "SPRING", "SPRING"], + "enrollment_type": [ + "First-Time", + "Transfer-In", + "First-Time", + "Transfer-In", + ], + "enrollment_intensity_first_term": [ + "Full-Time", + "Part-Time", + "Full-Time", + "Part-Time", + ], + "gpa_group_year_1": [3.5, 3.2, 3.8, 2.9], + "credential_type_sought_year_1": [ + "Bachelor", + "Associate", + "Bachelor", + "Associate", + ], + "pell_status_first_year": ["Y", "N", "Y", "N"], + "first_gen": ["Y", "N", "Y", "N"], + "gender": ["Female", "Male", "Female", "Male"], + "race": ["White", "Black or African American", "Asian", "White"], + "student_age": ["20 - 24", "20 or younger", "Older than 24", "20 - 24"], + } + ) + + df_course = pd.DataFrame( + { + "study_id": ["S001", "S002", "S003"], + "cohort": ["2020", "2020", "2021"], + "cohort_term": ["FALL", "FALL", "SPRING"], + } + ) + # Mock storage to return our test DataFrames def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame: - if 'student' in blob_path.lower(): + if "student" in blob_path.lower(): return df_student - elif 'course' in blob_path.lower(): + elif "course" in blob_path.lower(): return df_course else: raise ValueError(f"File not found: {blob_path}") - + MOCK_STORAGE.read_csv_as_dataframe.side_effect = mock_read_csv - + response = client.get( "/institutions/" + uuid_to_str(USER_VALID_INST_UUID) @@ -800,10 +823,10 @@ def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame: + uuid_to_str(eda_batch.id) + "/eda" ) - + assert response.status_code == 200 data = response.json() - + # Check response structure assert "summary_stats" in data assert "gpa_by_enrollment_type" in data @@ -815,35 +838,35 @@ def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame: assert "pell_recipient_by_first_gen" in data assert "student_age_by_gender" in data assert "race_by_pell_status" in data - + # Check summary stats assert data["summary_stats"]["total_students"] == "3" # 3 unique study_ids assert data["summary_stats"]["transfer_students"] == "2" # 2 Transfer-In - + # Check GPA charts have cohort years assert "cohort_years" in data["gpa_by_enrollment_type"] assert len(data["gpa_by_enrollment_type"]["cohort_years"]) == 2 # 2020, 2021 assert "2020" in data["gpa_by_enrollment_type"]["cohort_years"] assert "2021" in data["gpa_by_enrollment_type"]["cohort_years"] - + # Check term data structure assert "fall" in data["students_by_cohort_term"] assert "spring" in data["students_by_cohort_term"] assert len(data["students_by_cohort_term"]["fall"]) == 2 # One per cohort year - + # Check enrollment type by intensity has categories and series assert "categories" in data["enrollment_type_by_intensity"] assert "series" in data["enrollment_type_by_intensity"] assert len(data["enrollment_type_by_intensity"]["series"]) > 0 - + # Check pell recipient chart structure assert "categories" in data["pell_recipient_by_first_gen"] assert "series" in data["pell_recipient_by_first_gen"] - + # Check student age by gender structure assert "categories" in data["student_age_by_gender"] assert "series" in data["student_age_by_gender"] - + # Check race by pell status structure assert "categories" in data["race_by_pell_status"] assert "series" in data["race_by_pell_status"] From 6b1d4fbf3c6d6566ca5ec464156826959986dcf8 Mon Sep 17 00:00:00 2001 From: William Carr Date: Wed, 10 Dec 2025 13:37:13 -0500 Subject: [PATCH 24/25] eda bucket names --- src/webapp/routers/data.py | 14 ++++++++++++-- src/webapp/utilities.py | 6 +++++- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 28bd60fd..1d6bf5ae 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -557,7 +557,7 @@ def read_batch_files_as_dataframes( storage_control: StorageControl instance for GCS access Returns: - Dictionary mapping file_name -> pandas.DataFrame + Dictionary mapping schema_type -> pandas.DataFrame Raises: HTTPException: If no valid files found @@ -566,6 +566,7 @@ def read_batch_files_as_dataframes( # Temporary storage: file_record -> DataFrame loaded_files: Dict[Any, pd.DataFrame] = {} + missing_files: List[str] = [] for file_record in batch_files: file_name = file_record.name @@ -577,21 +578,30 @@ def read_batch_files_as_dataframes( df = None - # Fall back to GCS using StorageControl + # Read from GCS try: blob_path = f"validated/{file_name}" df = storage_control.read_csv_as_dataframe(bucket_name, blob_path) logger.info(f"Loaded {file_name} from GCS ({len(df)} rows)") except ValueError as e: logger.warning(f"File not found in GCS: {e}") + missing_files.append(file_name) except Exception as e: logger.error(f"Failed to read from GCS: {e}") + missing_files.append(file_name) if df is not None: loaded_files[file_record] = df if not loaded_files: error_msg = f"No valid input files found in batch (checked GCS: {bucket_name}/validated/)" + if missing_files: + error_msg += f". Expected files not found: {', '.join(missing_files[:5])}" + if len(missing_files) > 5: + error_msg += f" (and {len(missing_files) - 5} more)" + error_msg += ( + ". Files must be uploaded and validated before they can be used for EDA." + ) raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=error_msg, diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py index 8b35088b..c1bc240a 100644 --- a/src/webapp/utilities.py +++ b/src/webapp/utilities.py @@ -390,7 +390,11 @@ def model_owner_and_higher_or_err(user: BaseUser, resource_type: str) -> None: def prepend_env_prefix(name: str) -> Any: """Prepend the env prefix. At this point the value should not be empty as we checked on app startup.""" - return str(env_vars["ENV"]).lower() + "_" + name + env = str(env_vars["ENV"]).lower() + # Use dev_ prefix for LOCAL environment + if env == "local": + env = "dev" + return env + "_" + name def uuid_to_str(uuid_val: uuid.UUID) -> Any: From 4a789cf7ac377cdc67c71fa79430e0caf0cd1e10 Mon Sep 17 00:00:00 2001 From: Mesh Date: Wed, 17 Dec 2025 08:08:20 -0600 Subject: [PATCH 25/25] fix: changed output_valid to true --- src/webapp/routers/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py index 1bacf660..3fa90a6f 100644 --- a/src/webapp/routers/data.py +++ b/src/webapp/routers/data.py @@ -1412,7 +1412,7 @@ def add_custom_school_job( batch_name=f"{model_name}_{triggered_timestamp}", # update later when we figure out how to add batches to custom jobs output_filename=f"{job_run_id}/inference_output.csv", model_id=query_result[0][0].id, - output_valid=False, + output_valid=True, completed=True, model_version=latest_model_version.version, model_run_id=latest_model_version.run_id,