Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion src/webapp/routers/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from datetime import datetime, date
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union, cast
from pydantic import BaseModel, Field
from fastapi import APIRouter, Depends, HTTPException, status, Response
from fastapi import APIRouter, Depends, HTTPException, status, Response, Query
from sqlalchemy import and_, or_
from sqlalchemy.orm import Session
from sqlalchemy.future import select
Expand Down Expand Up @@ -664,12 +664,14 @@ def get_eda_data(
current_user: Annotated[BaseUser, Depends(get_current_active_user)],
sql_session: Annotated[Session, Depends(get_session)],
storage_control: Annotated[StorageControl, Depends(StorageControl)],
clear_cache: Annotated[Optional[str], Query(alias="clear-cache")] = None,
) -> Any:
"""Returns EDA (Exploratory Data Analysis) data for a specific batch.

This endpoint provides all the data needed to populate the EDA dashboard,
including summary statistics, GPA charts, enrollment data, and demographic breakdowns.
Analyzes all files in the batch together to provide comprehensive insights.
Pass query ``clear-cache=1`` to drop any cached EDA result for this batch before serving.
"""
has_access_to_inst_or_err(inst_id, current_user)
has_full_data_access_or_err(current_user, "EDA data")
Expand All @@ -694,6 +696,9 @@ def get_eda_data(
)

cache_key = f"{inst_id}:{batch_id}"
if clear_cache == "1":
EDA_CACHE.pop(cache_key, None)

cached_result = EDA_CACHE.get(cache_key)
if cached_result is not None:
logger.debug(f"EDA cache hit for {cache_key}")
Expand Down
87 changes: 87 additions & 0 deletions src/webapp/routers/data_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
get_session,
)
from ..utilities import uuid_to_str, get_current_active_user, SchemaType
from . import data as data_router
from .data import (
router,
DataOverview,
Expand Down Expand Up @@ -892,6 +893,92 @@ def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame:
assert "series" in data["race_by_pell_status"]


def test_get_eda_data_clear_cache(
client: TestClient, session: sqlalchemy.orm.Session
) -> None:
"""clear-cache=1 evicts the TTL entry so batch files are read from storage again."""
import pandas as pd

data_router.EDA_CACHE.clear()

eda_batch = BatchTable(
id=uuid.UUID("66666666-6666-6666-6666-666666666666"),
inst_id=USER_VALID_INST_UUID,
name="batch_eda_clear_cache",
created_by=CREATOR_UUID,
created_at=DATETIME_TESTING,
updated_at=DATETIME_TESTING,
completed=True,
)
student_file = FileTable(
id=uuid.UUID("77777777-7777-7777-7777-777777777777"),
inst_id=USER_VALID_INST_UUID,
name="student_clear_cache.csv",
source="MANUAL_UPLOAD",
batches={eda_batch},
created_at=DATETIME_TESTING,
updated_at=DATETIME_TESTING,
sst_generated=False,
valid=True,
schemas=[SchemaType.STUDENT],
)
session.add_all([eda_batch, student_file])
session.commit()

df_cohort = pd.DataFrame(
{
"student_id": ["S001"],
"cohort": ["2020"],
"cohort_term": ["FALL"],
"enrollment_type": ["FIRST-TIME"],
"enrollment_intensity_first_term": ["Full-Time"],
"gpa_group_year_1": [3.5],
"credential_type_sought_year_1": ["Bachelor"],
"pell_status_first_year": ["N"],
"first_gen": ["N"],
"gender": ["Female"],
"race": ["White"],
"student_age": ["20 - 24"],
}
)

def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame:
if "student" in blob_path.lower():
return df_cohort
raise ValueError(f"File not found: {blob_path}")

MOCK_STORAGE.read_csv_as_dataframe.side_effect = mock_read_csv

base_path = (
"/institutions/"
+ uuid_to_str(USER_VALID_INST_UUID)
+ "/batch/"
+ uuid_to_str(eda_batch.id)
+ "/eda"
)

try:
with mock.patch.object(
data_router,
"read_batch_files_as_dataframes",
wraps=data_router.read_batch_files_as_dataframes,
) as read_dfs:
r1 = client.get(base_path)
assert r1.status_code == 200
assert read_dfs.call_count == 1
r2 = client.get(base_path)
assert r2.status_code == 200
assert read_dfs.call_count == 1
r3 = client.get(base_path + "?clear-cache=1")
assert r3.status_code == 200
assert read_dfs.call_count == 2
r4 = client.get(base_path)
assert r4.status_code == 200
assert read_dfs.call_count == 2
finally:
MOCK_STORAGE.reset_mock()


# ==================== EDVISE VALIDATION TESTS ====================

EDVISE_INST_UUID = uuid.UUID("aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa")
Expand Down
Loading