diff --git a/src/webapp/database.py b/src/webapp/database.py
index 9365cc52..f9c56fea 100644
--- a/src/webapp/database.py
+++ b/src/webapp/database.py
@@ -59,6 +59,10 @@ class Base(DeclarativeBase):
 LOCAL_PASSWORD = "tester_password"
 DATETIME_TESTING = datetime.datetime(2024, 12, 26, 19, 37, 59, 753357)
 
+# IDs copied from DEV (USC Beaufort institution and one of its batches) to seed local EDA test data
+TEST_INST_UUID = uuid.UUID("942d4b0e-12e7-4d2a-9187-9508ae3cef7c")
+TEST_BATCH_UUID = uuid.UUID("3182f472-e079-4678-a0a1-9ca5ead6c49a")
+
 
 @event.listens_for(Mapper, "before_insert")
 @event.listens_for(Mapper, "before_update")
@@ -106,6 +110,19 @@ def init_db(env: str) -> None:
                     updated_at=DATETIME_TESTING,
                 )
             )
+            # USC Beaufort - matches DEV for testing
+            session.merge(
+                InstTable(
+                    id=TEST_INST_UUID,
+                    name="University of South Carolina - Beaufort",
+                    state="SC",
+                    pdp_id="345000",
+                    schemas=["COURSE", "STUDENT"],
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                    created_by=LOCAL_USER_UUID,
+                )
+            )
             session.merge(
                 ApiKeyTable(
                     id=LOCAL_APIKEY_UUID,
@@ -118,6 +135,94 @@ def init_db(env: str) -> None:
                     valid=True,
                 )
             )
+            # Create test files and batches for LOCAL environment
+            if env == "LOCAL":
+                # Create test files
+                test_file_1 = FileTable(
+                    id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562a"),
+                    inst_id=LOCAL_INST_UUID,
+                    name="test_course_file.csv",
+                    source="MANUAL_UPLOAD",
+                    uploader=LOCAL_USER_UUID,
+                    sst_generated=False,
+                    valid=True,
+                    schemas=["COURSE"],  # Using string literal to avoid circular import
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+                test_file_2 = FileTable(
+                    id=uuid.UUID("cb02d06c-2a59-486a-9bdd-d394a4fcb833"),
+                    inst_id=LOCAL_INST_UUID,
+                    name="test_cohort_file.csv",
+                    source="MANUAL_UPLOAD",
+                    uploader=LOCAL_USER_UUID,
+                    sst_generated=False,
+                    valid=True,
+                    schemas=[
+                        "STUDENT"
+                    ],  # Using string literal to avoid circular import
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+                # Create test batch for LOCAL_INST_UUID (using a different ID)
+                test_batch = BatchTable(
+                    id=uuid.UUID("f0bb3a20-6d92-4254-afed-6a72f43c562b"),
+                    inst_id=LOCAL_INST_UUID,
+                    name="test_batch_1",
+                    created_by=LOCAL_USER_UUID,
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+                # Associate files with batch
+                test_batch.files.add(test_file_1)
+                test_batch.files.add(test_file_2)
+                session.merge(test_file_1)
+                session.merge(test_file_2)
+                session.merge(test_batch)
+
+                # Create test files for EDA test institution (TEST_INST_UUID)
+                # Real files from DEV batch 3182f472e0794678a0a19ca5ead6c49a
+                test_file_student = FileTable(
+                    id=uuid.UUID("f1d7c0a4-5211-459f-a79a-a1c2752f45c5"),
+                    inst_id=TEST_INST_UUID,
+                    name="1762967705679_AO1600pdp_AO1600_AR_DEIDENTIFIED_STUDYID_20250522120554.csv",
+                    source="MANUAL_UPLOAD",
+                    uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"),
+                    sst_generated=False,
+                    valid=True,
+                    schemas=["STUDENT"],
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+                test_file_course = FileTable(
+                    id=uuid.UUID("d19d0129-96de-464c-98e9-694996965c7b"),
+                    inst_id=TEST_INST_UUID,
+                    name="1762967705683_AO1600pdp_AO1600_COURSE_LEVEL_AR_DEIDENTIFIED_STUDYID_20250522120554.csv",
+                    source="MANUAL_UPLOAD",
+                    uploader=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"),
+                    sst_generated=False,
+                    valid=True,
+                    schemas=["COURSE"],
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+
+                # Test batch - matches DEV USC Beaufort
+                test_batch = BatchTable(
+                    id=TEST_BATCH_UUID,
+                    inst_id=TEST_INST_UUID,
+                    name="Batch_2025-11-12_1762967767400",
+                    completed=True,
+                    created_by=uuid.UUID("c8b57138-2529-4e1f-9e89-07399d165f85"),
+                    created_at=DATETIME_TESTING,
+                    updated_at=DATETIME_TESTING,
+                )
+                # Associate files with batch
+                test_batch.files.add(test_file_student)
+                test_batch.files.add(test_file_course)
+                session.merge(test_file_student)
+                session.merge(test_file_course)
+                session.merge(test_batch)
             session.commit()
     except Exception as e:
         session.rollback()
diff --git a/src/webapp/gcsutil.py b/src/webapp/gcsutil.py
index b267d9eb..b74d14c9 100644
--- a/src/webapp/gcsutil.py
+++ b/src/webapp/gcsutil.py
@@ -358,3 +358,28 @@ def get_file_contents(self, bucket_name: str, file_name: str) -> Any:
         blob = bucket.blob(file_name)
         res = blob.download_as_bytes()
         return res
+
+    def read_csv_as_dataframe(self, bucket_name: str, file_name: str) -> Any:
+        """Read a CSV object from GCS and return it as a pandas DataFrame.
+
+        Args:
+            bucket_name: GCS bucket name
+            file_name: Full blob path (e.g., 'validated/filename.csv')
+
+        Returns:
+            pandas DataFrame parsed with ``pd.read_csv`` defaults
+
+        Raises:
+            ValueError: If the bucket or the blob does not exist
+        """
+        import pandas as pd
+        # lookup_bucket() returns None for a missing bucket; get_bucket() would
+        # raise NotFound, contradicting the ValueError contract documented above.
+        bucket = storage.Client().lookup_bucket(bucket_name)
+        if bucket is None:
+            raise ValueError(f"Bucket not found: {bucket_name}")
+        blob = bucket.blob(file_name)
+        if not blob.exists():
+            raise ValueError(f"File not found: {file_name}")
+        with blob.open("r") as fh:
+            return pd.read_csv(fh)
diff --git a/src/webapp/routers/data.py b/src/webapp/routers/data.py
index 1bacf660..bc7c05a9 100644
--- a/src/webapp/routers/data.py
+++ b/src/webapp/routers/data.py
@@ -13,6 +13,8 @@
 from sqlalchemy.exc import IntegrityError
 import re
 from ..validation import HardValidationError
+import pandas as pd
+from cachetools import TTLCache
 
 from ..utilities import (
     has_access_to_inst_or_err,
@@ -50,6 +52,11 @@
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.DEBUG)
 
+# In-process cache of computed EDA responses; entries expire after EDA_CACHE_TTL seconds.
+# Cache key format: f"{inst_id}:{batch_id}"
+EDA_CACHE_TTL = int(os.getenv("EDA_CACHE_TTL", "600"))  # Default 10 minutes
+EDA_CACHE: Any = TTLCache(maxsize=64, ttl=EDA_CACHE_TTL)  # NOTE(review): shared and unsynchronized — confirm thread-safety
+
 router = APIRouter(
     prefix="/institutions",
     tags=["data"],
@@ -470,6 +477,543 @@ def read_batch_info(
     return {"batches": [batch_info], "files": data_infos}
 
 
+## EDA (Exploratory Data Analysis) Endpoints
+
+
+class SummaryStats(BaseModel):
+    """Headline figures for the EDA dashboard, carried as display strings."""
+
+    total_students: str
+    transfer_students: str
+    avg_year1_gpa_all_students: str
+
+
+class GpaSeriesData(BaseModel):
+    """One named GPA series for a chart; `data` holds the plotted values."""
+
+    name: str
+    data: List[float]
+
+
+class GpaChartData(BaseModel):
+    """GPA chart payload: cohort-year labels plus the series plotted against them."""
+
+    cohort_years: List[str]
+    series: List[GpaSeriesData]
+
+
+class TermData(BaseModel):
+    """Integer counts split by term (fall, winter, spring, summer)."""
+
+    fall: List[int]
+    winter: List[int]
+    spring: List[int]
+    summer: List[int]
+
+
+class DegreeTypeData(BaseModel):
+    """One donut-chart slice: a degree type with its value and display color."""
+
+    value: int  # get_eda_data fills this with a rounded whole-number percentage
+    name: str
+    color: str
+
+
+class StackedBarSeries(BaseModel):
+    """One series of a stacked bar chart: name, chart type, stack group, counts and color."""
+
+    name: str
+    type: str = "bar"
+    stack: str
+    data: List[int]
+    color: str
+
+
+class EdaDataResponse(BaseModel):
+    """Complete EDA payload; the typed chart fields are optional, the four dict fields are required."""
+
+    summary_stats: Optional[SummaryStats] = None
+    gpa_by_enrollment_type: Optional[GpaChartData] = None
+    gpa_by_enrollment_intensity: Optional[GpaChartData] = None
+    students_by_cohort_term: Optional[TermData] = None
+    course_enrollments: Optional[TermData] = None
+    degree_types: Optional[List[DegreeTypeData]] = None
+    enrollment_type_by_intensity: Dict[str, Any]  # {"categories": [...], "series": [...]}
+    pell_recipient_by_first_gen: Dict[str, Any]  # {"categories": [...], "series": [...]}
+    student_age_by_gender: Dict[str, Any]  # {"categories": [...], "series": [...]}
+    race_by_pell_status: Dict[str, Any]  # {"categories": [...], "series": [...]}
+
+
+def read_batch_files_as_dataframes(
+    inst_id: str,
+    batch_files: Any,  # Set[FileTable]
+    storage_control: StorageControl,
+) -> Dict[str, pd.DataFrame]:
+    """Load a batch's input CSVs from GCS and group them by schema.
+
+    SST-generated files are skipped. Every other file is read from
+    ``validated/<file name>`` in the bucket given by ``get_external_bucket_name``;
+    files that cannot be read are logged and left out. A file tagged with
+    several schemas contributes to each of them, and multiple files sharing
+    a schema are concatenated row-wise.
+
+    Args:
+        inst_id: Institution ID
+        batch_files: Set of FileTable objects from the batch
+        storage_control: StorageControl instance for GCS access
+
+    Returns:
+        Dictionary mapping schema_type -> pandas.DataFrame
+
+    Raises:
+        HTTPException: 404 if none of the files could be loaded
+    """
+    bucket = get_external_bucket_name(inst_id)
+    frames_by_file: Dict[Any, pd.DataFrame] = {}
+    unreadable: List[str] = []
+
+    for record in batch_files:
+        name = record.name
+        if record.sst_generated:
+            # Only input files feed EDA; SST-generated outputs are ignored.
+            logger.debug(f"Skipping SST-generated file: {name}")
+            continue
+
+        # Best effort: a file that fails to load is recorded, not fatal.
+        frame = None
+        try:
+            frame = storage_control.read_csv_as_dataframe(bucket, f"validated/{name}")
+            logger.info(f"Loaded {name} from GCS ({len(frame)} rows)")
+        except ValueError as e:
+            logger.warning(f"File not found in GCS: {e}")
+            unreadable.append(name)
+        except Exception as e:
+            logger.error(f"Failed to read from GCS: {e}")
+            unreadable.append(name)
+
+        if frame is not None:
+            frames_by_file[record] = frame
+
+    if not frames_by_file:
+        parts = [f"No valid input files found in batch (checked GCS: {bucket}/validated/)"]
+        if unreadable:
+            parts.append(f". Expected files not found: {', '.join(unreadable[:5])}")
+            overflow = len(unreadable) - 5
+            if overflow > 0:
+                parts.append(f" (and {overflow} more)")
+        parts.append(
+            ". Files must be uploaded and validated before they can be used for EDA."
+        )
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="".join(parts),
+        )
+
+    # Bucket the frames under every schema their file declares.
+    grouped: Dict[str, List[pd.DataFrame]] = {}
+    for record, frame in frames_by_file.items():
+        for schema in record.schemas:
+            grouped.setdefault(schema, []).append(frame)
+
+    combined = {}
+    for schema, frames in grouped.items():
+        if len(frames) > 1:
+            combined[schema] = pd.concat(frames, ignore_index=True)
+            logger.info(
+                f"Combined {len(frames)} files for schema {schema} ({len(combined[schema])} total rows)"
+            )
+        else:
+            combined[schema] = frames[0]
+
+    return combined
+
+
+def calculate_gpa_series(
+    df: pd.DataFrame, cohort_years: List[str], grouping_col: str, category_value: str
+) -> List[float]:
+    """Calculate the mean year-1 GPA of one category for each cohort year.
+
+    Non-numeric ``gpa_group_year_1`` values are ignored. A cohort year with no
+    usable GPA for the category (no rows, or only non-numeric values) yields
+    0.0 rather than NaN, which is not valid JSON.
+
+    Args:
+        df: DataFrame (cohort data)
+        cohort_years: List of cohort years
+        grouping_col: Column to filter by (e.g., 'enrollment_type')
+        category_value: Specific value to filter for (e.g., 'First-Time')
+
+    Returns:
+        List of GPA values rounded to one decimal, one per cohort year
+    """
+
+    # Filter by category
+    filtered = df[df[grouping_col] == category_value]
+
+    # Mean GPA per cohort; dropna() discards cohorts whose mean is NaN so the
+    # lookup below falls back to 0.0 for them as well.
+    gpa = pd.to_numeric(filtered["gpa_group_year_1"], errors="coerce")
+    gpa_by_cohort = gpa.groupby(filtered["cohort"]).mean().dropna()
+
+    # Convert to list aligned with cohort_years
+    return [round(float(gpa_by_cohort.get(year, 0.0)), 1) for year in cohort_years]
+
+
+def get_term_counts(
+    df: pd.DataFrame, cohort_years: List[str], term_name: str
+) -> List[int]:
+    """Count the rows of one cohort term for each cohort year.
+
+    Rows are selected on the ``cohort_term`` column and counted per
+    ``cohort`` value.
+
+    Args:
+        df: DataFrame (cohort or course data)
+        cohort_years: List of cohort years
+        term_name: Term name to filter for (e.g., 'FALL', 'WINTER')
+
+    Returns:
+        List of row counts aligned with cohort_years; 0 where a year has none
+    """
+    in_term = df.loc[df["cohort_term"] == term_name]
+    per_cohort = in_term.groupby("cohort").size()
+    # Align to the requested years, filling years that have no rows with zero
+    aligned = per_cohort.reindex(cohort_years, fill_value=0).astype(int)
+    return list(map(int, aligned.tolist()))  # plain Python ints
+
+
+@router.get("/{inst_id}/batch/{batch_id}/eda", response_model=EdaDataResponse)
+def get_eda_data(
+    inst_id: str,
+    batch_id: str,
+    current_user: Annotated[BaseUser, Depends(get_current_active_user)],
+    sql_session: Annotated[Session, Depends(get_session)],
+    storage_control: Annotated[StorageControl, Depends(StorageControl)],
+) -> Any:
+    """Return the EDA (Exploratory Data Analysis) dashboard payload for one batch.
+
+    Summary stats, GPA charts, term counts and demographic breakdowns are built from
+    the batch's STUDENT files and cached in EDA_CACHE; course enrollments are None
+    when the batch has no COURSE file. Raises HTTPException 404 if the batch or its
+    STUDENT data is missing.
+    """
+    has_access_to_inst_or_err(inst_id, current_user)
+    has_full_data_access_or_err(current_user, "EDA data")
+    local_session.set(sql_session)
+
+    # Verify batch exists and belongs to institution
+    batch_result = (
+        local_session.get()
+        .execute(
+            select(BatchTable).where(
+                and_(
+                    BatchTable.id == str_to_uuid(batch_id),
+                    BatchTable.inst_id == str_to_uuid(inst_id),
+                )
+            )
+        )
+        .all()
+    )
+
+    if not batch_result:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="Batch not found.",
+        )
+
+    if len(batch_result) > 1:
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Batch duplicates found.",
+        )
+
+    batch_record = batch_result[0][0]
+
+    # Consult the cache only after the access and batch checks above have passed
+    cache_key = f"{inst_id}:{batch_id}"
+    cached_result = EDA_CACHE.get(cache_key)
+    if cached_result is not None:
+        logger.debug(f"EDA cache hit for {cache_key}")
+        return cached_result
+
+    logger.debug(f"EDA cache miss for {cache_key}, computing...")
+    file_dataframes = read_batch_files_as_dataframes(
+        inst_id, batch_record.files, storage_control
+    )
+    df_cohort = file_dataframes.get("STUDENT")
+    df_course = file_dataframes.get("COURSE")
+
+    if df_cohort is None:
+        raise HTTPException(
+            status_code=status.HTTP_404_NOT_FOUND,
+            detail="No STUDENT schema files found in batch for EDA.",
+        )
+
+    cohort_years = sorted(df_cohort["cohort"].dropna().unique().tolist())  # NaN cannot be sorted with strings
+
+    result = EdaDataResponse(
+        summary_stats=SummaryStats(
+            total_students=f"{df_cohort['study_id'].nunique():,}",
+            transfer_students=f"{(df_cohort['enrollment_type'] == 'Transfer-In').sum():,}",
+            avg_year1_gpa_all_students=f"{pd.to_numeric(df_cohort['gpa_group_year_1'], errors='coerce').mean():.2f}",
+        ),
+        gpa_by_enrollment_type=GpaChartData(
+            cohort_years=cohort_years,
+            series=[
+                GpaSeriesData(
+                    name="First Time Student",
+                    data=calculate_gpa_series(
+                        df_cohort, cohort_years, "enrollment_type", "First-Time"
+                    ),
+                ),
+                GpaSeriesData(
+                    name="Transfer Student",
+                    data=calculate_gpa_series(
+                        df_cohort, cohort_years, "enrollment_type", "Transfer-In"
+                    ),
+                ),
+            ],
+        ),
+        gpa_by_enrollment_intensity=GpaChartData(
+            cohort_years=cohort_years,
+            series=[
+                GpaSeriesData(
+                    name="Full Time Student",
+                    data=calculate_gpa_series(
+                        df_cohort,
+                        cohort_years,
+                        "enrollment_intensity_first_term",
+                        "Full-Time",
+                    ),
+                ),
+                GpaSeriesData(
+                    name="Part Time Student",
+                    data=calculate_gpa_series(
+                        df_cohort,
+                        cohort_years,
+                        "enrollment_intensity_first_term",
+                        "Part-Time",
+                    ),
+                ),
+            ],
+        ),
+        students_by_cohort_term=TermData(
+            fall=get_term_counts(df_cohort, cohort_years, "FALL"),
+            winter=get_term_counts(df_cohort, cohort_years, "WINTER"),
+            spring=get_term_counts(df_cohort, cohort_years, "SPRING"),
+            summer=get_term_counts(df_cohort, cohort_years, "SUMMER"),
+        ),
+        course_enrollments=TermData(
+            fall=get_term_counts(df_course, cohort_years, "FALL"),
+            winter=get_term_counts(df_course, cohort_years, "WINTER"),
+            spring=get_term_counts(df_course, cohort_years, "SPRING"),
+            summer=get_term_counts(df_course, cohort_years, "SUMMER"),
+        )
+        if df_course is not None  # a batch may hold no COURSE file; avoid indexing None
+        else None,
+        degree_types=[  # each type's share of the non-null values, as a whole percent
+            DegreeTypeData(
+                value=int(
+                    round(
+                        count / df_cohort["credential_type_sought_year_1"].count() * 100
+                    )
+                ),
+                name=str(degree_type),
+                color=["#F79222", "#00CFEA", "#25A95A", "#A92532", "#385981"][i % 5],
+            )
+            for i, (degree_type, count) in enumerate(
+                df_cohort["credential_type_sought_year_1"].value_counts().items()
+            )
+        ],
+        enrollment_type_by_intensity={
+            "categories": (
+                categories := sorted(df_cohort["enrollment_type"].unique().tolist())
+            ),
+            "series": [
+                {
+                    "name": "Full Time",
+                    "type": "bar",
+                    "stack": "intensity",
+                    "data": (
+                        df_cohort[
+                            df_cohort["enrollment_intensity_first_term"] == "Full-Time"
+                        ]
+                        .groupby("enrollment_type")
+                        .size()
+                        .reindex(categories, fill_value=0)
+                        .tolist()
+                    ),
+                    "color": "#F79222",
+                },
+                {
+                    "name": "Part Time",
+                    "type": "bar",
+                    "stack": "intensity",
+                    "data": (
+                        df_cohort[
+                            df_cohort["enrollment_intensity_first_term"] == "Part-Time"
+                        ]
+                        .groupby("enrollment_type")
+                        .size()
+                        .reindex(categories, fill_value=0)
+                        .tolist()
+                    ),
+                    "color": "#00CFEA",
+                },
+            ],
+        },
+        pell_recipient_by_first_gen={
+            "categories": (
+                pell_categories := sorted(
+                    df_cohort["pell_status_first_year"]
+                    .dropna()
+                    .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"})
+                    .loc[lambda x: x.isin(["Yes", "No"])]
+                    .unique()
+                    .tolist()
+                )
+            ),
+            "series": [
+                {
+                    "name": first_gen_normalized,
+                    "type": "bar",
+                    "stack": "firstGen",
+                    "data": (
+                        df_cohort.assign(
+                            _pell=df_cohort["pell_status_first_year"].replace(
+                                {"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}
+                            ),
+                            _first_gen=df_cohort["first_gen"]
+                            .fillna("Nan")
+                            .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}),
+                        )
+                        .loc[  # boolean mask, not query(): the label is raw CSV text
+                            lambda d: d["_first_gen"].eq(first_gen_normalized)
+                            & d["_pell"].isin(["Yes", "No"])
+                        ]
+                        .groupby("_pell")
+                        .size()
+                        .reindex(pell_categories, fill_value=0)
+                        .tolist()
+                    ),
+                    "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3],
+                }
+                for i, first_gen_normalized in enumerate(
+                    sorted(
+                        df_cohort["first_gen"]
+                        .fillna("Nan")
+                        .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"})
+                        .unique()
+                        .tolist()
+                    )
+                )
+            ],
+        },
+        student_age_by_gender={
+            "categories": (
+                gender_categories := sorted(
+                    df_cohort["gender"].dropna().unique().tolist()
+                )
+            ),
+            "series": [
+                {
+                    "name": age_group,
+                    "type": "bar",
+                    "stack": "age",
+                    "data": (
+                        df_cohort.assign(
+                            _age_group=(
+                                (
+                                    df_cohort["student_age"]
+                                    if "student_age" in df_cohort.columns
+                                    else df_cohort["age"]
+                                    if "age" in df_cohort.columns
+                                    else pd.Series([None] * len(df_cohort))
+                                ).apply(
+                                    lambda x: (
+                                        "20 or younger"
+                                        if pd.isna(x)  # NOTE(review): missing ages land in this bucket — confirm intended
+                                        or any(
+                                            term in str(x).lower()
+                                            for term in [
+                                                "20 or younger",
+                                                "20 or under",
+                                                "under 20",
+                                                "<=20",
+                                            ]
+                                        )
+                                        or (isinstance(x, (int, float)) and x <= 20)
+                                        else "20 - 24"
+                                        if any(
+                                            term in str(x).lower()
+                                            for term in ["20-24", "20 to 24", "20 - 24"]
+                                        )
+                                        or (
+                                            isinstance(x, (int, float)) and 20 < x <= 24
+                                        )
+                                        else "Older than 24"
+                                    )
+                                )
+                            )
+                        )
+                        .query(f"_age_group == '{age_group}'")
+                        .groupby("gender")
+                        .size()
+                        .reindex(gender_categories, fill_value=0)
+                        .tolist()
+                    ),
+                    "color": ["#F79222", "#00CFEA", "#25A95A"][i % 3],
+                }
+                for i, age_group in enumerate(
+                    ["20 or younger", "20 - 24", "Older than 24"]
+                )
+            ],
+        },
+        race_by_pell_status={
+            "categories": (
+                race_categories := sorted(df_cohort["race"].dropna().unique().tolist())
+            ),
+            "series": [
+                {
+                    "name": pell_status_normalized,
+                    "type": "bar",
+                    "stack": "pell",
+                    "data": (
+                        df_cohort.assign(
+                            _pell=df_cohort["pell_status_first_year"].replace(
+                                {"Y": "Yes", "N": "No", "y": "Yes", "n": "No"}
+                            )
+                        )
+                        .query(
+                            f"_pell == '{pell_status_normalized}' and _pell in ['Yes', 'No']"
+                        )
+                        .groupby("race")
+                        .size()
+                        .reindex(race_categories, fill_value=0)
+                        .tolist()
+                    ),
+                    "color": ["#F79222", "#00CFEA"][i % 2],
+                }
+                for i, pell_status_normalized in enumerate(
+                    sorted(
+                        df_cohort["pell_status_first_year"]
+                        .dropna()
+                        .replace({"Y": "Yes", "N": "No", "y": "Yes", "n": "No"})
+                        .loc[lambda x: x.isin(["Yes", "No"])]
+                        .unique()
+                        .tolist()
+                    )
+                )
+            ],
+        },
+    )
+
+    EDA_CACHE[cache_key] = result
+    logger.debug(f"EDA result cached for {cache_key}")
+
+    return result
+
+
 @router.post("/{inst_id}/batch", response_model=BatchInfo)
 def create_batch(
     inst_id: str,
@@ -1412,7 +1956,7 @@ def add_custom_school_job(
             batch_name=f"{model_name}_{triggered_timestamp}",  # update later when we figure out how to add batches to custom jobs
             output_filename=f"{job_run_id}/inference_output.csv",
             model_id=query_result[0][0].id,
-            output_valid=False,
+            output_valid=True,
             completed=True,
             model_version=latest_model_version.version,
             model_run_id=latest_model_version.run_id,
diff --git a/src/webapp/routers/data_test.py b/src/webapp/routers/data_test.py
index 9b1c1c31..789ca72d 100644
--- a/src/webapp/routers/data_test.py
+++ b/src/webapp/routers/data_test.py
@@ -645,3 +645,228 @@ def test_validate_failure_batch(client: TestClient) -> None:
     assert response_sftp.json()["file_types"] == ["COURSE"]
     assert response_sftp.json()["inst_id"] == uuid_to_str(USER_VALID_INST_UUID)
     assert response_sftp.json()["source"] == "MANUAL_UPLOAD"
+
+
+def test_get_eda_data_unauthorized(client: TestClient) -> None:
+    """Test GET /institutions/<uuid>/batch/<uuid>/eda under UUID_INVALID: expects 401 and the not-authorized detail."""
+    response = client.get(
+        "/institutions/"
+        + uuid_to_str(UUID_INVALID)
+        + "/batch/"
+        + uuid_to_str(BATCH_UUID)
+        + "/eda"
+    )
+    assert str(response) == "<Response [401 Unauthorized]>"  # checks status code and reason phrase together
+    assert (  # raw body text is compared verbatim rather than as parsed JSON
+        response.text
+        == '{"detail":"Not authorized to read this institution\'s resources."}'
+    )
+
+
+def test_get_eda_data_batch_not_found(client: TestClient) -> None:
+    """Test GET /institutions/<uuid>/batch/<uuid>/eda with an all-zero batch id: expects 404 with detail 'Batch not found.'"""
+    fake_batch_uuid = uuid.UUID("00000000-0000-0000-0000-000000000000")  # assumes no fixture seeds a batch with this id — confirm
+    response = client.get(
+        "/institutions/"
+        + uuid_to_str(USER_VALID_INST_UUID)
+        + "/batch/"
+        + uuid_to_str(fake_batch_uuid)
+        + "/eda"
+    )
+    assert response.status_code == 404
+    assert response.json()["detail"] == "Batch not found."
+
+
+def test_get_eda_data_no_student_files(
+    client: TestClient, session: sqlalchemy.orm.Session
+) -> None:
+    """Test GET /institutions/<uuid>/batch/<uuid>/eda for a COURSE-only batch whose storage reads all raise: expects 404 'No valid input files found'."""
+    # Create a batch with only COURSE files
+    batch_with_course = BatchTable(
+        id=uuid.UUID("11111111-1111-1111-1111-111111111111"),
+        inst_id=USER_VALID_INST_UUID,
+        name="batch_course_only",
+        created_by=CREATOR_UUID,
+        created_at=DATETIME_TESTING,
+        updated_at=DATETIME_TESTING,
+    )
+    course_file = FileTable(
+        id=uuid.UUID("22222222-2222-2222-2222-222222222222"),
+        inst_id=USER_VALID_INST_UUID,
+        name="course_file.csv",
+        source="MANUAL_UPLOAD",
+        batches={batch_with_course},
+        created_at=DATETIME_TESTING,
+        updated_at=DATETIME_TESTING,
+        sst_generated=False,
+        valid=True,
+        schemas=[SchemaType.COURSE],
+    )
+    session.add_all([batch_with_course, course_file])
+    session.commit()
+
+    # Make every storage read raise ValueError. NOTE(review): side_effect is not reset in this test — confirm a fixture clears it.
+    MOCK_STORAGE.read_csv_as_dataframe.side_effect = ValueError("File not found")
+
+    response = client.get(
+        "/institutions/"
+        + uuid_to_str(USER_VALID_INST_UUID)
+        + "/batch/"
+        + uuid_to_str(batch_with_course.id)
+        + "/eda"
+    )
+    assert response.status_code == 404
+    # When files can't be loaded from GCS, we get "No valid input files found".
+    # The "No STUDENT schema files found" error only occurs after files are loaded, so it is not exercised here.
+    assert "No valid input files found" in response.json()["detail"]
+
+
+def test_get_eda_data_success(
+    client: TestClient, session: sqlalchemy.orm.Session
+) -> None:
+    """Test GET /institutions/<uuid>/batch/<uuid>/eda with mocked STUDENT and COURSE frames: expects 200 and every chart key in the payload."""
+    import pandas as pd
+
+    # Create a batch with STUDENT and COURSE files
+    eda_batch = BatchTable(
+        id=uuid.UUID("33333333-3333-3333-3333-333333333333"),
+        inst_id=USER_VALID_INST_UUID,
+        name="batch_eda_test",
+        created_by=CREATOR_UUID,
+        created_at=DATETIME_TESTING,
+        updated_at=DATETIME_TESTING,
+        completed=True,
+    )
+    student_file = FileTable(
+        id=uuid.UUID("44444444-4444-4444-4444-444444444444"),
+        inst_id=USER_VALID_INST_UUID,
+        name="student_file.csv",
+        source="MANUAL_UPLOAD",
+        batches={eda_batch},
+        created_at=DATETIME_TESTING,
+        updated_at=DATETIME_TESTING,
+        sst_generated=False,
+        valid=True,
+        schemas=[SchemaType.STUDENT],
+    )
+    course_file = FileTable(
+        id=uuid.UUID("55555555-5555-5555-5555-555555555555"),
+        inst_id=USER_VALID_INST_UUID,
+        name="course_file.csv",
+        source="MANUAL_UPLOAD",
+        batches={eda_batch},
+        created_at=DATETIME_TESTING,
+        updated_at=DATETIME_TESTING,
+        sst_generated=False,
+        valid=True,
+        schemas=[SchemaType.COURSE],
+    )
+    session.add_all([eda_batch, student_file, course_file])
+    session.commit()
+
+    # Create mock DataFrames: four student rows (three distinct study_ids) and three course rows
+    df_student = pd.DataFrame(
+        {
+            "study_id": ["S001", "S002", "S003", "S001"],  # S001 appears twice
+            "cohort": ["2020", "2020", "2021", "2021"],
+            "cohort_term": ["FALL", "FALL", "SPRING", "SPRING"],
+            "enrollment_type": [
+                "First-Time",
+                "Transfer-In",
+                "First-Time",
+                "Transfer-In",
+            ],
+            "enrollment_intensity_first_term": [
+                "Full-Time",
+                "Part-Time",
+                "Full-Time",
+                "Part-Time",
+            ],
+            "gpa_group_year_1": [3.5, 3.2, 3.8, 2.9],
+            "credential_type_sought_year_1": [
+                "Bachelor",
+                "Associate",
+                "Bachelor",
+                "Associate",
+            ],
+            "pell_status_first_year": ["Y", "N", "Y", "N"],
+            "first_gen": ["Y", "N", "Y", "N"],
+            "gender": ["Female", "Male", "Female", "Male"],
+            "race": ["White", "Black or African American", "Asian", "White"],
+            "student_age": ["20 - 24", "20 or younger", "Older than 24", "20 - 24"],
+        }
+    )
+
+    df_course = pd.DataFrame(
+        {
+            "study_id": ["S001", "S002", "S003"],
+            "cohort": ["2020", "2020", "2021"],
+            "cohort_term": ["FALL", "FALL", "SPRING"],
+        }
+    )
+
+    # Route storage reads by blob path: "student" or "course" in the path selects the matching frame, anything else raises
+    def mock_read_csv(bucket_name: str, blob_path: str) -> pd.DataFrame:
+        if "student" in blob_path.lower():
+            return df_student
+        elif "course" in blob_path.lower():
+            return df_course
+        else:
+            raise ValueError(f"File not found: {blob_path}")
+
+    MOCK_STORAGE.read_csv_as_dataframe.side_effect = mock_read_csv  # NOTE(review): not reset in this test — confirm a fixture clears it
+
+    response = client.get(
+        "/institutions/"
+        + uuid_to_str(USER_VALID_INST_UUID)
+        + "/batch/"
+        + uuid_to_str(eda_batch.id)
+        + "/eda"
+    )
+
+    assert response.status_code == 200
+    data = response.json()
+
+    # Check response structure: one top-level key per chart plus the summary block
+    assert "summary_stats" in data
+    assert "gpa_by_enrollment_type" in data
+    assert "gpa_by_enrollment_intensity" in data
+    assert "students_by_cohort_term" in data
+    assert "course_enrollments" in data
+    assert "degree_types" in data
+    assert "enrollment_type_by_intensity" in data
+    assert "pell_recipient_by_first_gen" in data
+    assert "student_age_by_gender" in data
+    assert "race_by_pell_status" in data
+
+    # Check summary stats (the counts are compared as strings, not integers)
+    assert data["summary_stats"]["total_students"] == "3"  # 3 unique study_ids
+    assert data["summary_stats"]["transfer_students"] == "2"  # 2 Transfer-In
+
+    # Check GPA charts have cohort years
+    assert "cohort_years" in data["gpa_by_enrollment_type"]
+    assert len(data["gpa_by_enrollment_type"]["cohort_years"]) == 2  # 2020, 2021
+    assert "2020" in data["gpa_by_enrollment_type"]["cohort_years"]
+    assert "2021" in data["gpa_by_enrollment_type"]["cohort_years"]
+
+    # Check term data structure (keys are the lower-cased term names)
+    assert "fall" in data["students_by_cohort_term"]
+    assert "spring" in data["students_by_cohort_term"]
+    assert len(data["students_by_cohort_term"]["fall"]) == 2  # One per cohort year
+
+    # Check enrollment type by intensity has categories and series
+    assert "categories" in data["enrollment_type_by_intensity"]
+    assert "series" in data["enrollment_type_by_intensity"]
+    assert len(data["enrollment_type_by_intensity"]["series"]) > 0
+
+    # Check pell recipient chart structure
+    assert "categories" in data["pell_recipient_by_first_gen"]
+    assert "series" in data["pell_recipient_by_first_gen"]
+
+    # Check student age by gender structure
+    assert "categories" in data["student_age_by_gender"]
+    assert "series" in data["student_age_by_gender"]
+
+    # Check race by pell status structure
+    assert "categories" in data["race_by_pell_status"]
+    assert "series" in data["race_by_pell_status"]
diff --git a/src/webapp/utilities.py b/src/webapp/utilities.py
index 8b35088b..c1bc240a 100644
--- a/src/webapp/utilities.py
+++ b/src/webapp/utilities.py
@@ -390,7 +390,11 @@ def model_owner_and_higher_or_err(user: BaseUser, resource_type: str) -> None:
 
 def prepend_env_prefix(name: str) -> Any:
     """Prepend the env prefix. At this point the value should not be empty as we checked on app startup."""
-    return str(env_vars["ENV"]).lower() + "_" + name
+    env = str(env_vars["ENV"]).lower()
+    # LOCAL is remapped to "dev", so a "local_" prefix is never produced. NOTE(review): presumably so local runs address DEV-named resources — confirm.
+    if env == "local":
+        env = "dev"
+    return env + "_" + name
 
 
 def uuid_to_str(uuid_val: uuid.UUID) -> Any: