Skip to content

Commit 21b0575

Browse files
authored
Merge pull request #90 from fuzziecoder/codex/implement-monitoring-and-observability-stack
Implement Prometheus metrics and ELK/Grafana observability stack
2 parents 68284bf + 705f784 commit 21b0575

11 files changed

Lines changed: 369 additions & 9 deletions

File tree

backend/README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,33 @@ TOPIC_EXECUTION_COMPLETED=execution.completed
202202

203203
If Kafka/Redpanda is unavailable, the backend falls back to structured application logs for events so local development continues to work.
204204

205+
206+
## Monitoring & Observability

FlexiRoaster now includes a production observability baseline:

- **Prometheus** metrics scrape endpoint: `GET /metrics`
- **Grafana** dashboards (using Prometheus datasource)
- **Elasticsearch + Logstash** centralized JSON logs

Key telemetry includes:
- Pipeline latency (`flexiroaster_pipeline_execution_latency_seconds`)
- Failure rates (`flexiroaster_pipeline_failure_rate`)
- Resource usage (`flexiroaster_process_cpu_percent`, `flexiroaster_process_memory_rss_bytes`)
- SLA tracking (`flexiroaster_pipeline_sla_breaches_total`)

Runtime knobs (environment variables):
```env
ENABLE_PROMETHEUS_METRICS=true
PIPELINE_SLA_TARGET_SECONDS=30
ENABLE_LOGSTASH_LOGGING=true
LOGSTASH_HOST=logstash
LOGSTASH_PORT=5000
```
229+
230+
Use `docker compose up` to launch backend + monitoring stack (Prometheus/Grafana/Elasticsearch/Logstash).
231+
205232
## Next Steps
206233

207234
1. Install Python and dependencies

backend/api/middleware/logging_middleware.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from starlette.responses import Response
2020

2121
from backend.config import settings
22+
from backend.observability import observability_metrics
2223

2324

2425
# ---------------------------------------------------------------------------
@@ -104,6 +105,7 @@ async def dispatch(self, request: Request, call_next) -> Response:
104105
request_id = request.headers.get("x-request-id", str(uuid.uuid4())[:8])
105106

106107
# Timing
108+
timer = observability_metrics.start_timer()
107109
start = time.perf_counter()
108110

109111
# Request info
@@ -154,6 +156,13 @@ async def dispatch(self, request: Request, call_next) -> Response:
154156
request_id, method, path, status, phrase, duration_ms, length,
155157
)
156158

159+
observability_metrics.observe_http_request(
160+
method=method,
161+
path=path,
162+
status_code=status,
163+
duration_seconds=timer.elapsed_seconds(),
164+
)
165+
157166
response.headers["X-Request-ID"] = request_id
158167
return response
159168

@@ -164,4 +173,10 @@ async def dispatch(self, request: Request, call_next) -> Response:
164173
request_id, method, path, duration_ms, str(exc),
165174
exc_info=True,
166175
)
176+
observability_metrics.observe_http_request(
177+
method=method,
178+
path=path,
179+
status_code=500,
180+
duration_seconds=timer.elapsed_seconds(),
181+
)
167182
raise

backend/api/routes/executions.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
from backend.config import settings
2626
from backend.core.executor import PipelineExecutor
2727
from backend.events import get_event_publisher
28+
from backend.observability import observability_metrics
2829
from backend.models.pipeline import Execution, ExecutionStatus
2930

3031
router = APIRouter(prefix="/executions", tags=["executions"])
@@ -38,6 +39,26 @@
3839
TERMINAL_STATUSES = {ExecutionStatus.COMPLETED, ExecutionStatus.FAILED, ExecutionStatus.CANCELLED}
3940

4041

42+
def _pipeline_failure_rate_percent(pipeline_id: str) -> float:
    """Return the percentage of terminal executions of *pipeline_id* that failed.

    Scans the in-memory execution store and considers only executions that
    have reached a terminal status. Returns 0.0 when the pipeline has no
    terminal executions yet (avoids division by zero).
    """
    terminal_statuses = [
        e.status
        for e in executions_db.values()
        if e.pipeline_id == pipeline_id and e.status in TERMINAL_STATUSES
    ]
    if not terminal_statuses:
        return 0.0

    failed_count = sum(1 for s in terminal_statuses if s == ExecutionStatus.FAILED)
    return (failed_count / len(terminal_statuses)) * 100
54+
55+
56+
def _update_active_executions_metric() -> None:
    """Refresh the active-executions gauge from the in-memory store.

    An execution counts as active while it is still pending or running.
    """
    non_terminal = {ExecutionStatus.PENDING, ExecutionStatus.RUNNING}
    active = sum(
        1 for e in executions_db.values() if e.status in non_terminal
    )
    observability_metrics.set_active_executions(active)
4162
ORCHESTRATION_SCHEMA_TO_CORE = {
4263
OrchestrationEngineSchema.LOCAL: OrchestrationEngine.LOCAL,
4364
OrchestrationEngineSchema.AIRFLOW: OrchestrationEngine.AIRFLOW,
@@ -116,6 +137,7 @@ def initialize_execution(
116137
context=context or {},
117138
)
118139
executions_db[execution_id] = execution
140+
_update_active_executions_metric()
119141

120142
get_event_publisher().publish(
121143
topic=settings.TOPIC_EXECUTION_STARTED,
@@ -165,6 +187,16 @@ async def execute_pipeline_background(
165187
result.id = execution_id
166188
executions_db[execution_id] = result
167189

190+
observability_metrics.observe_execution_outcome(
191+
pipeline_id=result.pipeline_id,
192+
status=result.status.value,
193+
duration_seconds=result.duration,
194+
failure_rate_percent=_pipeline_failure_rate_percent(result.pipeline_id),
195+
sla_target_seconds=settings.PIPELINE_SLA_TARGET_SECONDS,
196+
)
197+
_update_active_executions_metric()
198+
observability_metrics.observe_process_resources()
199+
168200
if result.status == ExecutionStatus.COMPLETED:
169201
get_event_publisher().publish(
170202
topic=settings.TOPIC_EXECUTION_COMPLETED,
@@ -198,6 +230,15 @@ async def execute_pipeline_background(
198230
executions_db[execution_id].status = ExecutionStatus.FAILED
199231
executions_db[execution_id].error = str(e)
200232
failed_execution = executions_db[execution_id]
233+
observability_metrics.observe_execution_outcome(
234+
pipeline_id=failed_execution.pipeline_id,
235+
status=failed_execution.status.value,
236+
duration_seconds=failed_execution.duration,
237+
failure_rate_percent=_pipeline_failure_rate_percent(failed_execution.pipeline_id),
238+
sla_target_seconds=settings.PIPELINE_SLA_TARGET_SECONDS,
239+
)
240+
_update_active_executions_metric()
241+
observability_metrics.observe_process_resources()
201242
get_event_publisher().publish(
202243
topic=settings.TOPIC_EXECUTION_FAILED,
203244
key=failed_execution.id,

backend/api/routes/metrics.py

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,10 @@
55
from fastapi import APIRouter
66
from datetime import datetime, timedelta
77
from typing import List
8-
import random
8+
import os
99

1010
from backend.api.schemas import SystemMetricsResponse, MetricResponse, MetricsHistoryResponse
11+
from backend.observability import observability_metrics
1112

1213
router = APIRouter(prefix="/metrics", tags=["metrics"])
1314

@@ -67,9 +68,23 @@ async def get_metrics():
6768
]
6869
pipeline_throughput = len(recent_executions)
6970

70-
# Simulated CPU and memory (would be real in production)
71-
cpu_usage = random.uniform(20, 80)
72-
memory_usage = random.uniform(30, 70)
71+
# Resource usage snapshots
72+
observability_metrics.observe_process_resources()
73+
74+
try:
75+
load1, _, _ = os.getloadavg()
76+
cpu_usage = max(min(load1 * 100, 100.0), 0.0)
77+
except Exception:
78+
cpu_usage = 0.0
79+
80+
try:
81+
import resource
82+
83+
rss_kb = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
84+
rss_mb = rss_kb / 1024 if rss_kb < 10_000_000 else rss_kb / (1024 * 1024)
85+
memory_usage = max(min((rss_mb / 1024) * 100, 100.0), 0.0)
86+
except Exception:
87+
memory_usage = 0.0
7388

7489
return SystemMetricsResponse(
7590
cpu_usage=cpu_usage,
@@ -137,16 +152,16 @@ async def get_metrics_history(
137152

138153
# Generate sample values based on metric type
139154
if metric == "throughput":
140-
value = random.uniform(800, 2200)
155+
value = 800 + (i * 120) % 1400
141156
unit = "req/min"
142157
elif metric == "cpu":
143-
value = random.uniform(20, 80)
158+
value = 20 + (i * 7) % 60
144159
unit = "%"
145160
elif metric == "memory":
146-
value = random.uniform(30, 70)
161+
value = 30 + (i * 5) % 40
147162
unit = "%"
148163
else:
149-
value = random.uniform(0, 100)
164+
value = (i * 11) % 100
150165
unit = None
151166

152167
metrics.append(MetricResponse(

backend/config.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,15 @@ def parse_kafka_bootstrap_servers(cls, v):
9292
LOG_REQUEST_BODY: bool = False
9393
LOG_RESPONSE_BODY: bool = False
9494
SENSITIVE_FIELDS: List[str] = ["password", "token", "secret", "api_key", "authorization", "credit_card", "ssn"]
95+
96+
# Observability
97+
ENABLE_PROMETHEUS_METRICS: bool = True
98+
PIPELINE_SLA_TARGET_SECONDS: float = 30.0
99+
100+
# Centralized logs (Logstash TCP)
101+
ENABLE_LOGSTASH_LOGGING: bool = False
102+
LOGSTASH_HOST: str = "localhost"
103+
LOGSTASH_PORT: int = 5000
95104

96105
class Config:
97106
env_file = ".env"

backend/main.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@
77
import uvicorn
88
from fastapi import Depends, FastAPI, Request
99
from fastapi.middleware.cors import CORSMiddleware
10-
from fastapi.responses import JSONResponse
10+
from fastapi.responses import JSONResponse, Response
1111

1212
from backend.api.middleware.rate_limit_middleware import RateLimitMiddleware
1313
from backend.api.routes import airflow, executions, metrics, pipelines
1414
from backend.api.routes.auth import router as auth_router
1515
from backend.api.security import require_roles
1616
from backend.config import settings
1717
from backend.services.secrets import secret_manager
18+
from backend.observability import observability_metrics
19+
from backend.observability.logging import configure_logstash_logging
1820

1921
# Create FastAPI app
2022
app = FastAPI(
@@ -44,6 +46,9 @@
4446

4547
app.add_middleware(RequestLoggingMiddleware)
4648

49+
# Optional centralized log shipping
50+
configure_logstash_logging()
51+
4752

4853
@app.on_event("startup")
4954
async def load_runtime_secrets() -> None:
@@ -79,6 +84,13 @@ async def health_check():
7984
}
8085

8186

87+
@app.get("/metrics", include_in_schema=False)
88+
async def prometheus_metrics():
89+
"""Prometheus scrape endpoint for infrastructure monitoring."""
90+
payload, content_type = observability_metrics.prometheus_payload()
91+
return Response(content=payload, media_type=content_type)
92+
93+
8294
# Root endpoint
8395
@app.get("/", tags=["root"])
8496
async def root():

backend/observability/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
"""Observability utilities for metrics and logging."""
2+
3+
from backend.observability.metrics import observability_metrics
4+
5+
__all__ = ["observability_metrics"]

backend/observability/logging.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
"""Centralized logging helpers for shipping logs to Logstash/Elasticsearch."""
2+
from __future__ import annotations
3+
4+
import json
5+
import logging
6+
import logging.handlers
7+
from datetime import datetime, timezone
8+
9+
from backend.config import settings
10+
11+
12+
class JsonTcpLogstashHandler(logging.handlers.SocketHandler):
    """Ship structured JSON log lines over TCP to a Logstash input.

    Overrides ``makePickle`` so that each record is serialized as one
    newline-terminated JSON document (Logstash ``json_lines``-friendly)
    instead of Python's default length-prefixed pickle framing.
    """

    def makePickle(self, record: logging.LogRecord) -> bytes:  # noqa: N802
        """Serialize *record* as a UTF-8 encoded JSON line."""
        # Timestamp reflects serialization time (UTC), not record.created.
        payload = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "service": "flexiroaster-backend",
            "logger": record.name,
            "level": record.levelname,
            "message": record.getMessage(),
            "pathname": record.pathname,
            "lineno": record.lineno,
        }
        # default=str keeps serialization from ever raising on odd values.
        line = json.dumps(payload, default=str) + "\n"
        return line.encode("utf-8")
26+
27+
28+
def configure_logstash_logging() -> bool:
    """Attach a Logstash TCP handler to the API logger when enabled.

    Returns True when a handler is (or already was) attached, False when
    ``ENABLE_LOGSTASH_LOGGING`` is off. Idempotent: calling this twice does
    not add a duplicate handler.
    """
    if not settings.ENABLE_LOGSTASH_LOGGING:
        return False

    api_logger = logging.getLogger("flexiroaster.api")
    if any(isinstance(h, JsonTcpLogstashHandler) for h in api_logger.handlers):
        return True

    tcp_handler = JsonTcpLogstashHandler(settings.LOGSTASH_HOST, settings.LOGSTASH_PORT)
    # Fall back to INFO when LOG_LEVEL is not a recognized level name.
    tcp_handler.setLevel(getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO))
    api_logger.addHandler(tcp_handler)
    return True

0 commit comments

Comments
 (0)