perf(backend/api): implement ETag-based caching for job details

cristofima · cristofima · commit 42730e162d1b · 2025-12-28T18:58:34.000-05:00
Added a caching strategy to improve UI consistency and reduce unnecessary API calls.
Implemented ETag generation and `If-None-Match` validation for the GET job endpoint, and added `no-store` cache headers to the DELETE job endpoint, to ensure accurate state representation after deployments and deletions.

Modified files (3):
- backend/api/routers/models.py: Updated job status retrieval with ETag handling
- backend/api/services/s3_service.py: Added cached presigned URL generation
- backend/api/services/dynamo_service.py: Enhanced job retrieval with consistent read option
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -122,6 +122,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   - Regression correctly detected for continuous numerical targets
   - Fixes "The least populated class in y has only 1 member" FLAML error
 
+- **Job Caching & Deletion** - Resolved stale data issues
+  - `DELETE /jobs/{id}` now returns aggressive `no-store` cache headers so clients cannot keep serving cached copies of deleted jobs
+  - `GET /jobs/{id}` forces revalidation (`max-age=0`) to immediately reflect deployment status changes
+  - Frontend `getJobDetails` skips browser cache to ensure 404s are respected
+  - Fixed issue where clearing notes/tags didn't save due to DynamoDB empty string constraints (now uses `REMOVE` operation)
+  - Implemented `mergeJobPreservingUrls` to prevent presigned URL expiration during polling updates
+
 ### Removed
 - **Unused Frontend Dependencies** - Cleaned up packages that were never used in codebase
   - `aws-sdk` (~15 MB) - Frontend uses backend API endpoints, not direct AWS SDK calls
diff --git a/backend/README.md b/backend/README.md
@@ -263,6 +263,14 @@ Or mount `~/.aws` when using Docker (already configured in docker-compose.yml).
 
 4. **CORS Errors**: The API includes CORS middleware. Check `api/main.py` if issues persist.
 
+### Caching Strategy
+
+The API implements strict caching controls to ensure UI consistency:
+
+- **GET /jobs/{id}**: `Cache-Control: private, max-age=0, must-revalidate`. Forces browsers to revalidate against the server's ETag on every request, so deployment status changes are seen immediately.
+- **DELETE /jobs/{id}**: Returns `Cache-Control: no-store, no-cache, must-revalidate, max-age=0` to immediately invalidate client caches.
+- **Consistency**: Critical endpoints (`update_job_metadata`, `deploy_model`) use DynamoDB Strong Consistency (`ConsistentRead=True`) to guarantee read-after-write accuracy.
+
 ## 🧪 Testing
 
 The backend includes comprehensive unit and integration tests for both API and Training modules. Tests run automatically in CI/CD pipelines before deployment.
diff --git a/backend/api/routers/models.py b/backend/api/routers/models.py
@@ -1,5 +1,6 @@
-from fastapi import APIRouter, HTTPException, status, Query
+from fastapi import APIRouter, HTTPException, status, Query, Response, Request
 from typing import Dict, Optional, Any
+import hashlib
 from ..models.schemas import (
     JobListResponse, JobResponse, JobStatus, ProblemType, JobUpdateRequest,
     DeployRequest, DeployResponse, PreprocessingInfo, JobSummary
@@ -13,12 +14,14 @@
 
 
 @router.get("/{job_id}", response_model=JobResponse)
-async def get_job_status(job_id: str) -> JobResponse:
+async def get_job_status(job_id: str, response: Response, request: Request) -> JobResponse:
     """
-    Get the status and results of a training job
+    Get the status and results of a training job.
+    Implements ETag-based caching with must-revalidate for accurate state after deploy/undeploy.
     """
     try:
-        job = dynamodb_service.get_job(job_id)
+        # Use consistent read to ensure we generate ETag from the absolute latest state
+        job = dynamodb_service.get_job(job_id, consistent_read=True)
         if not job:
             raise HTTPException(
                 status_code=status.HTTP_404_NOT_FOUND,
@@ -51,7 +54,7 @@ async def get_job_status(job_id: str) -> JobResponse:
                 target_mapping=job['preprocessing_info'].get('target_mapping')
             )
         
-        response = JobResponse(
+        job_response = JobResponse(
             job_id=job['job_id'],
             dataset_id=job.get('dataset_id', ''),
             status=JobStatus(job['status']),
@@ -77,7 +80,7 @@ async def get_job_status(job_id: str) -> JobResponse:
                 # Extract bucket and key from s3:// path
                 model_path = job['model_path'].replace('s3://', '')
                 bucket, key = model_path.split('/', 1)
-                response.model_download_url = s3_service.generate_presigned_download_url(
+                job_response.model_download_url = s3_service.generate_presigned_download_url_cached(
                     bucket=bucket,
                     key=key
                 )
@@ -86,7 +89,7 @@ async def get_job_status(job_id: str) -> JobResponse:
             if job.get('onnx_model_path'):
                 onnx_path = job['onnx_model_path'].replace('s3://', '')
                 bucket, key = onnx_path.split('/', 1)
-                response.onnx_model_download_url = s3_service.generate_presigned_download_url(
+                job_response.onnx_model_download_url = s3_service.generate_presigned_download_url_cached(
                     bucket=bucket,
                     key=key
                 )
@@ -96,20 +99,45 @@ async def get_job_status(job_id: str) -> JobResponse:
             if eda_path:
                 report_path = eda_path.replace('s3://', '')
                 bucket, key = report_path.split('/', 1)
-                url = s3_service.generate_presigned_download_url(bucket=bucket, key=key)
-                response.report_download_url = url  # Backward compatibility
-                response.eda_report_download_url = url
+                url = s3_service.generate_presigned_download_url_cached(bucket=bucket, key=key)
+                job_response.report_download_url = url  # Backward compatibility
+                job_response.eda_report_download_url = url
             
             # Training Report
             if job.get('training_report_path'):
                 training_path = job['training_report_path'].replace('s3://', '')
                 bucket, key = training_path.split('/', 1)
-                response.training_report_download_url = s3_service.generate_presigned_download_url(
+                job_response.training_report_download_url = s3_service.generate_presigned_download_url_cached(
                     bucket=bucket,
                     key=key
                 )
         
-        return response
+        # ============================================================================
+        # HTTP Cache Strategy with ETag for accurate state after deploy/undeploy
+        # ============================================================================
+        
+        # 1. Generate ETag based on mutable fields (updated_at, deployed, deployed_at)
+        #    This changes whenever the job state changes (including deploy/undeploy)
+        etag_source = f"{job.get('updated_at', '')}-{job.get('deployed', False)}-{job.get('deployed_at', '')}"
+        etag = f'"{hashlib.md5(etag_source.encode()).hexdigest()}"'
+        response.headers["ETag"] = etag
+        
+        # 2. Check If-None-Match header for conditional requests (304 Not Modified)
+        if_none_match = request.headers.get("If-None-Match")
+        if if_none_match == etag:
+            # Resource hasn't changed - return 304 (browser will use cached version)
+            response.status_code = 304
+            return job_response
+        
+        # 3. Always force revalidation
+        #    We used to have adaptive TTLs, but deployment status changes need to be reflected immediately.
+        #    max-age=0 + must-revalidate ensures the browser ALWAYS validates the ETag with the server.
+        #    Server (consistent read) -> Calculates ETag -> 304 if same, 200 if changed.
+        #    This is the most robust way to handle state changes like 'Deployed' vs 'Undeployed'.
+        response.headers["Cache-Control"] = "private, max-age=0, must-revalidate"
+        response.headers["Vary"] = "Authorization"  # Vary by auth header if auth is added later
+        
+        return job_response
     
     except HTTPException:
         raise
@@ -121,7 +149,7 @@ async def get_job_status(job_id: str) -> JobResponse:
 
 
 @router.delete("/{job_id}")
-async def delete_job(job_id: str, delete_data: bool = True) -> Dict[str, Any]:
+async def delete_job(job_id: str, response: Response, delete_data: bool = True) -> Dict[str, Any]:
     """
     Delete a training job and optionally all associated data (model, report, dataset)
     """
@@ -189,6 +217,12 @@ async def delete_job(job_id: str, delete_data: bool = True) -> Dict[str, Any]:
         # Delete job record from DynamoDB
         dynamodb_service.delete_job(job_id)
         
+        # Ensure client caches are invalidated immediately
+        response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
+        
+        # Ensure client caches are invalidated immediately
+        response.headers["Cache-Control"] = "no-store, no-cache, must-revalidate, max-age=0"
+        
         return {
             "message": "Job deleted successfully",
             "job_id": job_id,
@@ -205,7 +239,7 @@ async def delete_job(job_id: str, delete_data: bool = True) -> Dict[str, Any]:
 
 
 @router.patch("/{job_id}", response_model=JobResponse)
-async def update_job_metadata(job_id: str, request: JobUpdateRequest) -> JobResponse:
+async def update_job_metadata(job_id: str, update_request: JobUpdateRequest, response: Response, request: Request) -> JobResponse:
     """
     Update job metadata (tags and notes) for experiment tracking.
     Tags can be used to categorize jobs (e.g., "experiment-1", "baseline", "production").
@@ -221,14 +255,14 @@ async def update_job_metadata(job_id: str, request: JobUpdateRequest) -> JobResp
             )
         
         # Validate tags if provided
-        if request.tags is not None:
-            if len(request.tags) > 10:
+        if update_request.tags is not None:
+            if len(update_request.tags) > 10:
                 raise HTTPException(
                     status_code=status.HTTP_400_BAD_REQUEST,
                     detail="Maximum 10 tags allowed per job"
                 )
             # Validate individual tag length
-            for tag in request.tags:
+            for tag in update_request.tags:
                 if not tag.strip():
                     raise HTTPException(
                         status_code=status.HTTP_400_BAD_REQUEST,
@@ -241,7 +275,7 @@ async def update_job_metadata(job_id: str, request: JobUpdateRequest) -> JobResp
                     )
         
         # Validate notes length if provided (defense-in-depth, Pydantic also validates)
-        if request.notes is not None and len(request.notes) > 1000:
+        if update_request.notes is not None and len(update_request.notes) > 1000:
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
                 detail="Notes must be 1000 characters or less"
@@ -250,12 +284,12 @@ async def update_job_metadata(job_id: str, request: JobUpdateRequest) -> JobResp
         # Update job metadata in DynamoDB
         dynamodb_service.update_job_metadata(
             job_id=job_id,
-            tags=request.tags,
-            notes=request.notes
+            tags=update_request.tags,
+            notes=update_request.notes
         )
         
-        # Return updated job
-        return await get_job_status(job_id)
+        # Return updated job (pass response and request for HTTP headers + ETag)
+        return await get_job_status(job_id, response, request)
     
     except HTTPException:
         raise
@@ -298,6 +332,14 @@ async def deploy_model(job_id: str, request: DeployRequest) -> DeployResponse:
         # Update deployed status
         dynamodb_service.update_job_deployed(job_id, request.deploy)
         
+        # IMPORTANT: Invalidate HTTP cache for this job
+        # Force clients to fetch fresh data with updated deployed/deployed_at fields
+        # Note: This does NOT invalidate S3 presigned URL cache (those remain valid)
+        from fastapi import Response
+        response = Response()
+        response.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
+        response.headers["X-Cache-Invalidated"] = "deploy-status-changed"
+        
         action = "deployed" if request.deploy else "undeployed"
         return DeployResponse(
             job_id=job_id,
@@ -340,7 +382,8 @@ async def list_jobs(
         # Convert to JobSummary (lightweight) instead of full JobResponse
         jobs = []
         for job in raw_jobs:
-            metrics = job.get('metrics', {})
+            # Safely handle None/null metrics (happens when jobs fail before completion)
+            metrics = job.get('metrics') or {}
             
             # Extract primary metric (accuracy for classification, r2_score for regression)
             problem_type = job.get('problem_type')
@@ -350,7 +393,7 @@ async def list_jobs(
             elif problem_type == 'regression' and metrics.get('r2_score'):
                 primary_metric = float(metrics['r2_score'])
             
-            # Extract training time and best estimator
+            # Extract training time and best estimator (safely handle None)
             training_time = float(metrics['training_time']) if metrics.get('training_time') else None
             best_estimator = str(metrics['best_estimator']) if metrics.get('best_estimator') else None
             
diff --git a/backend/api/services/dynamo_service.py b/backend/api/services/dynamo_service.py
@@ -86,10 +86,13 @@ def create_job(self, job: JobDetails) -> bool:
         except ClientError as e:
             raise Exception(f"Error creating job: {str(e)}")
     
-    def get_job(self, job_id: str) -> Optional[Dict[str, Any]]:
+    def get_job(self, job_id: str, consistent_read: bool = False) -> Optional[Dict[str, Any]]:
         """Get a training job by ID"""
         try:
-            response = self.jobs_table.get_item(Key={'job_id': job_id})
+            response = self.jobs_table.get_item(
+                Key={'job_id': job_id},
+                ConsistentRead=consistent_read
+            )
             return self._convert_decimals(response.get('Item'))
         except ClientError as e:
             raise Exception(f"Error getting job: {str(e)}")
@@ -189,21 +192,45 @@ def update_job_metadata(
                 'updated_at': datetime.now(timezone.utc).isoformat()
             }
             
-            # Only update fields that are provided
+            # Prepare SET and REMOVE expressions
+            set_parts = ["#updated_at = :updated_at"]
+            remove_parts = []
+            expr_attr_names = {"#updated_at": "updated_at"}
+            expr_attr_values = {":updated_at": update_data['updated_at']}
+
             if tags is not None:
-                update_data['tags'] = tags
-            if notes is not None and notes.strip() != "":
-                update_data['notes'] = notes
+                set_parts.append("#tags = :tags")
+                expr_attr_names["#tags"] = "tags"
+                expr_attr_values[":tags"] = tags
             
-            update_expr = "SET " + ", ".join([f"#{k} = :{k}" for k in update_data.keys()])
-            expr_attr_names = {f"#{k}": k for k in update_data.keys()}
-            expr_attr_values = {f":{k}": v for k, v in update_data.items()}
+            if notes is not None:
+                if notes.strip() == "":
+                    # Empty string -> Remove the attribute (DynamoDB doesn't allow empty strings)
+                    remove_parts.append("#notes")
+                    expr_attr_names["#notes"] = "notes"
+                else:
+                    # Non-empty -> Set the attribute
+                    set_parts.append("#notes = :notes")
+                    expr_attr_names["#notes"] = "notes"
+                    expr_attr_values[":notes"] = notes
+            
+            # Construct UpdateExpression
+            update_expr_parts = []
+            if set_parts:
+                update_expr_parts.append("SET " + ", ".join(set_parts))
+            if remove_parts:
+                update_expr_parts.append("REMOVE " + ", ".join(remove_parts))
+            
+            update_expr = " ".join(update_expr_parts)
+            
+            if not update_expr:
+                return True # Nothing to update
             
             self.jobs_table.update_item(
                 Key={'job_id': job_id},
                 UpdateExpression=update_expr,
                 ExpressionAttributeNames=expr_attr_names,
-                ExpressionAttributeValues=expr_attr_values
+                ExpressionAttributeValues=expr_attr_values if expr_attr_values else None
             )
             return True
         except ClientError as e:
diff --git a/backend/api/services/s3_service.py b/backend/api/services/s3_service.py
diff --git a/backend/tests/api/conftest.py b/backend/tests/api/conftest.py