|
25 | 25 | from backend.config import settings |
26 | 26 | from backend.core.executor import PipelineExecutor |
27 | 27 | from backend.events import get_event_publisher |
| 28 | +from backend.observability import observability_metrics |
28 | 29 | from backend.models.pipeline import Execution, ExecutionStatus |
29 | 30 |
|
30 | 31 | router = APIRouter(prefix="/executions", tags=["executions"]) |
|
38 | 39 | TERMINAL_STATUSES = {ExecutionStatus.COMPLETED, ExecutionStatus.FAILED, ExecutionStatus.CANCELLED} |
39 | 40 |
|
40 | 41 |
|
def _pipeline_failure_rate_percent(pipeline_id: str) -> float:
    """Return the percentage of a pipeline's finished executions that failed.

    Only executions in ``executions_db`` whose ``pipeline_id`` matches and whose
    status is a member of ``TERMINAL_STATUSES`` are counted, so every terminal
    outcome (not just failures) contributes to the denominator.

    Args:
        pipeline_id: Identifier of the pipeline to compute the rate for.

    Returns:
        The failure rate on a 0-100 scale, or ``0.0`` when the pipeline has no
        executions in a terminal status.
    """
    finished_total = 0
    failed_total = 0
    for record in executions_db.values():
        # Skip other pipelines and anything that has not reached a terminal status.
        if record.pipeline_id != pipeline_id or record.status not in TERMINAL_STATUSES:
            continue
        finished_total += 1
        if record.status == ExecutionStatus.FAILED:
            failed_total += 1

    # Guard against dividing by zero when nothing has finished yet.
    if finished_total == 0:
        return 0.0
    return (failed_total / finished_total) * 100
| 54 | + |
| 55 | + |
def _update_active_executions_metric() -> None:
    """Report the current number of active executions to ``observability_metrics``.

    An execution is counted as active while its status is ``PENDING`` or
    ``RUNNING``; the resulting count is passed to ``set_active_executions``.
    """
    in_flight_statuses = {ExecutionStatus.PENDING, ExecutionStatus.RUNNING}
    in_flight = sum(
        1 for record in executions_db.values() if record.status in in_flight_statuses
    )
    observability_metrics.set_active_executions(in_flight)
41 | 62 | ORCHESTRATION_SCHEMA_TO_CORE = { |
42 | 63 | OrchestrationEngineSchema.LOCAL: OrchestrationEngine.LOCAL, |
43 | 64 | OrchestrationEngineSchema.AIRFLOW: OrchestrationEngine.AIRFLOW, |
@@ -116,6 +137,7 @@ def initialize_execution( |
116 | 137 | context=context or {}, |
117 | 138 | ) |
118 | 139 | executions_db[execution_id] = execution |
| 140 | + _update_active_executions_metric() |
119 | 141 |
|
120 | 142 | get_event_publisher().publish( |
121 | 143 | topic=settings.TOPIC_EXECUTION_STARTED, |
@@ -165,6 +187,16 @@ async def execute_pipeline_background( |
165 | 187 | result.id = execution_id |
166 | 188 | executions_db[execution_id] = result |
167 | 189 |
|
| 190 | + observability_metrics.observe_execution_outcome( |
| 191 | + pipeline_id=result.pipeline_id, |
| 192 | + status=result.status.value, |
| 193 | + duration_seconds=result.duration, |
| 194 | + failure_rate_percent=_pipeline_failure_rate_percent(result.pipeline_id), |
| 195 | + sla_target_seconds=settings.PIPELINE_SLA_TARGET_SECONDS, |
| 196 | + ) |
| 197 | + _update_active_executions_metric() |
| 198 | + observability_metrics.observe_process_resources() |
| 199 | + |
168 | 200 | if result.status == ExecutionStatus.COMPLETED: |
169 | 201 | get_event_publisher().publish( |
170 | 202 | topic=settings.TOPIC_EXECUTION_COMPLETED, |
@@ -198,6 +230,15 @@ async def execute_pipeline_background( |
198 | 230 | executions_db[execution_id].status = ExecutionStatus.FAILED |
199 | 231 | executions_db[execution_id].error = str(e) |
200 | 232 | failed_execution = executions_db[execution_id] |
| 233 | + observability_metrics.observe_execution_outcome( |
| 234 | + pipeline_id=failed_execution.pipeline_id, |
| 235 | + status=failed_execution.status.value, |
| 236 | + duration_seconds=failed_execution.duration, |
| 237 | + failure_rate_percent=_pipeline_failure_rate_percent(failed_execution.pipeline_id), |
| 238 | + sla_target_seconds=settings.PIPELINE_SLA_TARGET_SECONDS, |
| 239 | + ) |
| 240 | + _update_active_executions_metric() |
| 241 | + observability_metrics.observe_process_resources() |
201 | 242 | get_event_publisher().publish( |
202 | 243 | topic=settings.TOPIC_EXECUTION_FAILED, |
203 | 244 | key=failed_execution.id, |
|
0 commit comments