Merge pull request #14 from cristofima/dev - refactor: enhance security and improve error handling in training and frontend components

cristofima · web-flow · commit 7009df2fe051 · 2025-12-09T13:16:24.000-05:00
- Replace wildcard CORS origins (*) with specific allowed origins based on Amplify domain and environment
- Improve error handling by replacing silent `except Exception: pass` blocks with specific exception types that print a warning
- Add guards against empty targets in training EDA/preprocessing and against NaN medians when imputing prediction input
diff --git a/.github/SETUP_CICD.md b/.github/SETUP_CICD.md
@@ -127,13 +127,41 @@ cat > github-actions-permissions.json <<EOF
     {
       "Sid": "APIGatewayManagement",
       "Effect": "Allow",
-      "Action": "apigateway:*",
-      "Resource": "*"
+      "Action": [
+        "apigateway:GET",
+        "apigateway:POST",
+        "apigateway:PUT",
+        "apigateway:PATCH",
+        "apigateway:DELETE",
+        "apigateway:UpdateRestApiPolicy"
+      ],
+      "Resource": [
+        "arn:aws:apigateway:*::/restapis",
+        "arn:aws:apigateway:*::/restapis/*"
+      ]
     },
     {
       "Sid": "BatchManagement",
       "Effect": "Allow",
-      "Action": "batch:*",
+      "Action": [
+        "batch:CreateComputeEnvironment",
+        "batch:UpdateComputeEnvironment",
+        "batch:DeleteComputeEnvironment",
+        "batch:DescribeComputeEnvironments",
+        "batch:CreateJobQueue",
+        "batch:UpdateJobQueue",
+        "batch:DeleteJobQueue",
+        "batch:DescribeJobQueues",
+        "batch:RegisterJobDefinition",
+        "batch:DeregisterJobDefinition",
+        "batch:DescribeJobDefinitions",
+        "batch:SubmitJob",
+        "batch:DescribeJobs",
+        "batch:ListJobs",
+        "batch:TerminateJob",
+        "batch:TagResource",
+        "batch:UntagResource"
+      ],
       "Resource": "*"
     },
     {
diff --git a/backend/training/eda.py b/backend/training/eda.py
@@ -1,6 +1,6 @@
 import pandas as pd
 import numpy as np
-from typing import Dict, List, Tuple, Any
+from typing import List, Tuple
 import re
 
 
@@ -54,6 +54,10 @@ def __init__(self, df: pd.DataFrame, target_column: str):
     
     def _detect_problem_type(self) -> str:
         """Detect if classification or regression"""
+        # Guard against empty target
+        if len(self.target) == 0:
+            return 'classification'  # Default fallback
+        
         if pd.api.types.is_numeric_dtype(self.target):
             unique_ratio = self.target.nunique() / len(self.target)
             if unique_ratio < 0.05 or self.target.nunique() < 20:
@@ -108,9 +112,10 @@ def _analyze_columns(self):
         
         if self.problem_type == 'classification':
             class_counts = self.target.value_counts()
-            imbalance_ratio = class_counts.max() / class_counts.min()
-            if imbalance_ratio > 3:
-                self.warnings.append(f"Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")
+            if len(class_counts) > 0 and class_counts.min() > 0:
+                imbalance_ratio = class_counts.max() / class_counts.min()
+                if imbalance_ratio > 3:
+                    self.warnings.append(f"Class imbalance detected (ratio: {imbalance_ratio:.1f}:1)")
     
     def _get_css(self) -> str:
         """Return CSS styles"""
diff --git a/backend/training/model_trainer.py b/backend/training/model_trainer.py
@@ -170,8 +170,8 @@ def get_feature_importance(model: AutoML, feature_names) -> Dict[str, float]:
                         print(f"  {i+1}. {feature}: {importance:.4f}")
                     
                     return feature_importance
-            except Exception:
-                pass
+            except (AttributeError, TypeError) as e:
+                print(f"Could not extract feature importances from model: {e}")
         
         # Fallback: Create equal importance for all features
         print("\nCould not extract feature importances, using equal weights")
diff --git a/backend/training/preprocessor.py b/backend/training/preprocessor.py
@@ -184,6 +184,10 @@ def detect_useless_columns(self, df: pd.DataFrame) -> List[str]:
     
     def detect_problem_type(self, y: pd.Series) -> str:
         """Detect if problem is classification or regression"""
+        # Guard against empty target
+        if len(y) == 0:
+            return 'classification'  # Default fallback
+        
         # Check if target is numeric
         if pd.api.types.is_numeric_dtype(y):
             # If numeric, check unique values ratio
diff --git a/backend/training/training_report.py b/backend/training/training_report.py
@@ -1,5 +1,5 @@
 from typing import Dict, Any
-from datetime import datetime
+from datetime import datetime, timezone
 
 
 def generate_training_report(
@@ -382,7 +382,7 @@ def _generate_config_info(self) -> str:
     
     def generate(self) -> str:
         """Generate complete HTML report"""
-        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC")
+        timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
         
         html = f"""
         <!DOCTYPE html>
diff --git a/frontend/app/results/[jobId]/page.tsx b/frontend/app/results/[jobId]/page.tsx
@@ -30,6 +30,25 @@ export default function ResultsPage() {
     setTimeout(() => setCopiedPython(false), 2000);
   };
 
+  // Generate Docker commands for model prediction (extracted to avoid duplication)
+  const getDockerCommands = (jobId: string) => {
+    const modelFile = `model_${jobId.slice(0, 8)}.pkl`;
+    return `# Build prediction container (one time)
+docker build -f scripts/Dockerfile.predict -t automl-predict .
+
+# Show model info and required features
+docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} --info
+
+# Generate sample input JSON (auto-detects features)
+docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} -g /data/sample_input.json
+
+# Edit sample_input.json with your values, then predict
+docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} --json /data/sample_input.json
+
+# Batch predictions from CSV
+docker run --rm -v \${PWD}:/data automl-predict /data/${modelFile} -i /data/test.csv -o /data/predictions.csv`;
+  };
+
   useEffect(() => {
     const fetchResults = async () => {
       try {
@@ -276,39 +295,10 @@ export default function ResultsPage() {
             </div>
             <div className="relative">
               <pre className="bg-gray-900 text-gray-100 rounded-lg p-4 overflow-x-auto text-sm font-mono">
-                <code>{`# Build prediction container (one time)
-docker build -f scripts/Dockerfile.predict -t automl-predict .
-
-# Show model info and required features
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --info
-
-# Generate sample input JSON (auto-detects features)
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -g /data/sample_input.json
-
-# Edit sample_input.json with your values, then predict
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --json /data/sample_input.json
-
-# Batch predictions from CSV
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -i /data/test.csv -o /data/predictions.csv`}</code>
+                <code>{getDockerCommands(job.job_id)}</code>
               </pre>
               <button
-                onClick={() => {
-                  const code = `# Build prediction container (one time)
-docker build -f scripts/Dockerfile.predict -t automl-predict .
-
-# Show model info and required features
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --info
-
-# Generate sample input JSON (auto-detects features)
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -g /data/sample_input.json
-
-# Edit sample_input.json with your values, then predict
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl --json /data/sample_input.json
-
-# Batch predictions from CSV
-docker run --rm -v \${PWD}:/data automl-predict /data/model_${job.job_id.slice(0, 8)}.pkl -i /data/test.csv -o /data/predictions.csv`;
-                  handleCopyDocker(code);
-                }}
+                onClick={() => handleCopyDocker(getDockerCommands(job.job_id))}
                 className={`absolute top-2 right-2 px-3 py-1 text-xs rounded transition-all cursor-pointer ${
                   copiedDocker 
                     ? 'bg-green-600 text-white' 
diff --git a/frontend/next.config.ts b/frontend/next.config.ts
@@ -6,7 +6,9 @@ const nextConfig: NextConfig = {
     unoptimized: true,
   },
   
-  // Trailing slashes for better compatibility
+  // Trailing slashes ensure consistent URL handling across:
+  // - AWS Amplify SSR deployments (prevents 404 on refresh)
+  // - Static file serving and client-side navigation
   trailingSlash: true,
 };
 
diff --git a/infrastructure/terraform/s3.tf b/infrastructure/terraform/s3.tf
@@ -1,3 +1,20 @@
+# =============================================================================
+# CORS Origins - Computed once and validated
+# =============================================================================
+# The cors_origins local resolves the allowed origins as follows:
+# 1. If var.cors_allowed_origins is non-empty, it is used as-is (manual override)
+# 2. Otherwise, the Amplify domain (if enabled via github_repository + github_token),
+# 3. combined with http://localhost:3000 (added only in the dev environment)
+#
+# IMPORTANT: In production, either enable Amplify OR set cors_allowed_origins
+# =============================================================================
+locals {
+  cors_origins = length(var.cors_allowed_origins) > 0 ? var.cors_allowed_origins : concat(
+    local.amplify_enabled ? ["https://${aws_amplify_app.frontend[0].default_domain}"] : [],
+    var.environment == "dev" ? ["http://localhost:3000"] : []
+  )
+}
+
 # S3 Bucket for Datasets
 resource "aws_s3_bucket" "datasets" {
   bucket = "${local.name_prefix}-datasets-${local.account_id}"
@@ -35,9 +52,16 @@ resource "aws_s3_bucket_cors_configuration" "datasets" {
   cors_rule {
     allowed_headers = ["*"]
     allowed_methods = ["PUT", "GET"]
-    allowed_origins = ["*"]
+    allowed_origins = local.cors_origins
     max_age_seconds = 3600
   }
+
+  lifecycle {
+    precondition {
+      condition     = length(local.cors_origins) > 0
+      error_message = "CORS allowed_origins cannot be empty. Either enable Amplify (set github_repository and github_token), use dev environment, or set cors_allowed_origins manually."
+    }
+  }
 }
 
 # S3 Bucket for Models
@@ -77,10 +101,17 @@ resource "aws_s3_bucket_cors_configuration" "models" {
   cors_rule {
     allowed_headers = ["*"]
     allowed_methods = ["GET"]
-    allowed_origins = ["*"]
+    allowed_origins = local.cors_origins
     expose_headers  = ["Content-Disposition"]
     max_age_seconds = 3600
   }
+
+  lifecycle {
+    precondition {
+      condition     = length(local.cors_origins) > 0
+      error_message = "CORS allowed_origins cannot be empty. Either enable Amplify (set github_repository and github_token), use dev environment, or set cors_allowed_origins manually."
+    }
+  }
 }
 
 # S3 Bucket for Reports
@@ -120,8 +151,15 @@ resource "aws_s3_bucket_cors_configuration" "reports" {
   cors_rule {
     allowed_headers = ["*"]
     allowed_methods = ["GET"]
-    allowed_origins = ["*"]
+    allowed_origins = local.cors_origins
     expose_headers  = ["Content-Disposition"]
     max_age_seconds = 3600
   }
+
+  lifecycle {
+    precondition {
+      condition     = length(local.cors_origins) > 0
+      error_message = "CORS allowed_origins cannot be empty. Either enable Amplify (set github_repository and github_token), use dev environment, or set cors_allowed_origins manually."
+    }
+  }
 }
diff --git a/infrastructure/terraform/variables.tf b/infrastructure/terraform/variables.tf
@@ -124,3 +124,11 @@ variable "github_token" {
   sensitive   = true
   default     = ""
 }
+
+variable "cors_allowed_origins" {
+  description = "List of allowed origins for S3 CORS configuration. Use specific domains for security."
+  type        = list(string)
+  default     = []
+  # When empty, falls back to the Amplify domain (if enabled) plus http://localhost:3000 in dev
+  # For production, enable Amplify or list the exact frontend URLs here
+}
diff --git a/scripts/predict.py b/scripts/predict.py
@@ -176,7 +176,11 @@ def prepare_input(data: pd.DataFrame, preprocessor) -> pd.DataFrame:
     numeric_cols = df.select_dtypes(include=[np.number]).columns
     for col in numeric_cols:
         if df[col].isnull().any():
-            df[col].fillna(df[col].median(), inplace=True)
+            median_val = df[col].median()
+            # Fallback to 0 if median is NaN (empty column or all NaN)
+            if pd.isna(median_val):
+                median_val = 0
+            df[col].fillna(median_val, inplace=True)
     
     categorical_cols = df.select_dtypes(include=['object']).columns
     for col in categorical_cols:
@@ -243,8 +247,9 @@ def predict_single(model_package: dict, input_data: dict) -> dict:
             
             result['probabilities'] = {str(label): float(p) for label, p in zip(class_labels, probas)}
             result['confidence'] = float(max(probas))
-        except Exception:
-            pass
+        except (AttributeError, ValueError, IndexError) as e:
+            # Log warning but continue - probabilities are optional
+            print(f"⚠️  Could not compute class probabilities: {e}")
     
     return result
 
@@ -283,8 +288,9 @@ def predict_batch(model_package: dict, input_path: str, output_path: str) -> Non
         try:
             probas = model.predict_proba(X)
             df['confidence'] = probas.max(axis=1)
-        except Exception:
-            pass
+        except (AttributeError, ValueError) as e:
+            # Log warning but continue - confidence scores are optional
+            print(f"⚠️  Could not compute confidence scores: {e}")
     
     # Save results
     df.to_csv(output_path, index=False)