Skip to content

Commit af1a409

Browse files
committed
Add distributed execution support with Celery and Ray backends
1 parent 3ac4719 commit af1a409

6 files changed

Lines changed: 245 additions & 8 deletions

File tree

backend/README.md

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,33 @@ curl -X POST http://localhost:8000/api/executions \
103103
```
104104

105105

106+
### Distributed Task Execution (Celery / Ray)
107+
108+
FlexiRoaster supports selectable execution backends for asynchronous and distributed workloads:
109+
110+
- `local`: default in-process execution
111+
- `celery`: async jobs, retries, and scheduling support through Celery workers
112+
- `ray`: distributed Python execution, optimized for ML/AI-heavy pipelines
113+
114+
Use the optional `execution_backend` field when creating an execution:
115+
116+
```bash
117+
curl -X POST http://localhost:8000/api/executions -H "Content-Type: application/json" -d '{"pipeline_id": "your-pipeline-id", "execution_backend": "ray"}'
118+
```
119+
120+
Or set a default backend via environment variables in `backend/.env`:
121+
122+
```env
123+
DISTRIBUTED_EXECUTION_BACKEND=local
124+
CELERY_BROKER_URL=redis://localhost:6379/0
125+
CELERY_RESULT_BACKEND=redis://localhost:6379/1
126+
CELERY_EXECUTION_TASK=flexiroaster.execute_pipeline
127+
RAY_ADDRESS=auto
128+
RAY_NAMESPACE=flexiroaster
129+
```
130+
131+
If Celery or Ray is unavailable, FlexiRoaster automatically falls back to local execution and records the fallback reason in the execution context.
132+
106133
## Authentication & Security
107134

108135
- JWT authentication endpoint: `POST /api/auth/token`

backend/api/routes/executions.py

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
SuccessResponse
1515
)
1616
from backend.models.pipeline import Execution, ExecutionStatus
17-
from backend.core.executor import PipelineExecutor
17+
from backend.core.distributed_executor import DistributedExecutionDispatcher
1818
from backend.config import settings
1919
from backend.events import get_event_publisher
2020

@@ -57,19 +57,27 @@ def _merge_execution_logs(result_logs: List, existing_logs: List) -> List:
5757
return merged_logs
5858

5959

60-
def initialize_execution(pipeline_id: str, context: Optional[Dict[str, Any]] = None) -> Execution:
60+
def initialize_execution(
61+
pipeline_id: str,
62+
context: Optional[Dict[str, Any]] = None,
63+
execution_backend: Optional[str] = None,
64+
) -> Execution:
6165
"""Create and store a pending execution record."""
6266
from datetime import datetime
6367

6468
execution_id = f"exec-{uuid.uuid4()}"
6569
pipeline = pipelines_db[pipeline_id]
70+
merged_context = context.copy() if context else {}
71+
if execution_backend:
72+
merged_context["requested_execution_backend"] = execution_backend
73+
6674
execution = Execution(
6775
id=execution_id,
6876
pipeline_id=pipeline_id,
6977
status=ExecutionStatus.PENDING,
7078
started_at=datetime.now(),
7179
total_stages=len(pipeline.stages),
72-
context=context or {}
80+
context=merged_context
7381
)
7482
executions_db[execution_id] = execution
7583

@@ -86,12 +94,19 @@ def initialize_execution(pipeline_id: str, context: Optional[Dict[str, Any]] = N
8694
return execution
8795

8896

89-
async def execute_pipeline_background(pipeline_id: str, execution_id: str):
97+
async def execute_pipeline_background(
98+
pipeline_id: str,
99+
execution_id: str,
100+
execution_backend: Optional[str] = None,
101+
):
90102
"""Background task to execute pipeline."""
91103
try:
92104
pipeline = pipelines_db[pipeline_id]
93-
executor = PipelineExecutor()
94-
result = executor.execute(pipeline)
105+
dispatcher = DistributedExecutionDispatcher()
106+
dispatch_result = dispatcher.run(pipeline, backend_override=execution_backend)
107+
result = dispatch_result.execution
108+
result.context.setdefault("distributed_execution", {})
109+
result.context["distributed_execution"]["backend_used"] = dispatch_result.backend_used
95110

96111
existing = executions_db.get(execution_id)
97112
if existing:
@@ -183,13 +198,17 @@ async def create_execution(
183198
)
184199

185200
# Create execution record
186-
execution = initialize_execution(execution_data.pipeline_id)
201+
execution = initialize_execution(
202+
execution_data.pipeline_id,
203+
execution_backend=execution_data.execution_backend
204+
)
187205

188206
# Start execution in background
189207
background_tasks.add_task(
190208
execute_pipeline_background,
191209
execution_data.pipeline_id,
192-
execution.id
210+
execution.id,
211+
execution_data.execution_backend
193212
)
194213

195214
return execution

backend/api/schemas.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ class PipelineListResponse(BaseModel):
103103
class ExecutionCreate(BaseModel):
104104
"""Schema for creating an execution"""
105105
pipeline_id: str
106+
execution_backend: Optional[str] = Field(
107+
default=None,
108+
description="Optional override for distributed backend: local, celery, or ray",
109+
)
106110

107111

108112
class AirflowTriggerRequest(BaseModel):

backend/config.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,18 @@ class Settings(BaseSettings):
5050
TOPIC_EXECUTION_FAILED: str = "execution.failed"
5151
TOPIC_EXECUTION_COMPLETED: str = "execution.completed"
5252

53+
# Distributed execution backends: local|celery|ray
54+
DISTRIBUTED_EXECUTION_BACKEND: str = "local"
55+
56+
# Celery settings
57+
CELERY_BROKER_URL: str = "redis://localhost:6379/0"
58+
CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
59+
CELERY_EXECUTION_TASK: str = "flexiroaster.execute_pipeline"
60+
61+
# Ray settings
62+
RAY_ADDRESS: str = "auto"
63+
RAY_NAMESPACE: str = "flexiroaster"
64+
5365
@field_validator("CORS_ORIGINS", mode="before")
5466
@classmethod
5567
def parse_cors_origins(cls, v):
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
"""Distributed execution dispatcher with optional Celery and Ray backends."""
from __future__ import annotations

from dataclasses import dataclass
from typing import Optional
import logging

from backend.config import settings
from backend.core.executor import PipelineExecutor
from backend.models.pipeline import Execution, Pipeline

logger = logging.getLogger(__name__)

# Backend names the dispatcher accepts; any other value is rejected by
# DistributedExecutionDispatcher.run and replaced with "local".
SUPPORTED_BACKENDS = {"local", "celery", "ray"}
15+
16+
17+
@dataclass
class DispatchResult:
    """Result wrapper for execution dispatch metadata."""

    # Execution record produced by whichever backend actually ran the pipeline.
    execution: Execution
    # Backend that performed the run: "local", "celery", or "ray".
    # May differ from the requested backend when a fallback occurred.
    backend_used: str
23+
24+
25+
class DistributedExecutionDispatcher:
    """Run pipeline execution on the local runtime or a distributed framework.

    Backend selection order: the explicit ``backend_override`` argument, then
    ``settings.DISTRIBUTED_EXECUTION_BACKEND``, then ``"local"``. Unknown or
    unavailable backends fall back to local execution; the fallback reason is
    recorded in the execution's context under ``"distributed_execution"``.
    """

    # Upper bound (seconds) to wait for a Celery task result before the
    # dispatcher gives up and falls back to local execution.
    CELERY_RESULT_TIMEOUT: int = 600

    def __init__(self):
        # The local executor doubles as the fallback path for every backend.
        self.executor = PipelineExecutor()

    def run(self, pipeline: Pipeline, backend_override: Optional[str] = None) -> DispatchResult:
        """Execute *pipeline* on the selected backend.

        Args:
            pipeline: Pipeline model to execute.
            backend_override: Optional backend name overriding the configured
                default ("local", "celery", or "ray"; case-insensitive).

        Returns:
            DispatchResult with the finished Execution and the backend that
            actually ran it.
        """
        backend = (backend_override or settings.DISTRIBUTED_EXECUTION_BACKEND or "local").lower().strip()

        if backend not in SUPPORTED_BACKENDS:
            logger.warning("Unsupported backend '%s'. Falling back to local.", backend)
            backend = "local"

        if backend == "celery":
            execution, used_backend = self._execute_with_celery(pipeline)
        elif backend == "ray":
            execution, used_backend = self._execute_with_ray(pipeline)
        else:
            execution, used_backend = self.executor.execute(pipeline), "local"

        return DispatchResult(execution=execution, backend_used=used_backend)

    def _fallback_to_local(
        self, pipeline: Pipeline, requested_backend: str, exc: Exception
    ) -> tuple[Execution, str]:
        """Execute locally after *requested_backend* failed, recording why.

        Shared by the Celery and Ray paths so the fallback bookkeeping
        (warning log + context annotation) stays in one place.
        """
        # capitalize() yields "Celery" / "Ray", matching the original messages.
        logger.warning(
            "%s backend unavailable (%s). Executing locally.",
            requested_backend.capitalize(),
            exc,
        )
        execution = self.executor.execute(pipeline)
        execution.context.setdefault("distributed_execution", {})
        execution.context["distributed_execution"].update(
            {
                "requested_backend": requested_backend,
                "fallback_backend": "local",
                "fallback_reason": str(exc),
            }
        )
        return execution, "local"

    def _execute_with_celery(self, pipeline: Pipeline) -> tuple[Execution, str]:
        """Try the Celery path; fall back to local execution when unavailable."""
        try:
            # Imported lazily so Celery remains an optional dependency.
            from celery import Celery

            app = Celery(
                "flexiroaster",
                broker=settings.CELERY_BROKER_URL,
                backend=settings.CELERY_RESULT_BACKEND,
            )
            task_name = settings.CELERY_EXECUTION_TASK

            # The worker receives a JSON-serializable pipeline payload and is
            # expected to return a serialized Execution.
            payload = pipeline.model_dump(mode="json")
            async_result = app.send_task(task_name, kwargs={"pipeline": payload})
            remote_output = async_result.get(timeout=self.CELERY_RESULT_TIMEOUT)
            execution = Execution.model_validate(remote_output)
            logger.info("Pipeline %s executed via Celery task %s", pipeline.id, task_name)
            return execution, "celery"
        except Exception as exc:  # broad by design: any failure means "run locally"
            return self._fallback_to_local(pipeline, "celery", exc)

    def _execute_with_ray(self, pipeline: Pipeline) -> tuple[Execution, str]:
        """Try the Ray path; fall back to local execution when unavailable."""
        try:
            # Imported lazily so Ray remains an optional dependency.
            import ray

            if not ray.is_initialized():
                ray.init(
                    address=settings.RAY_ADDRESS,
                    namespace=settings.RAY_NAMESPACE,
                    ignore_reinit_error=True,
                )

            @ray.remote
            def execute_pipeline_remote(pipeline_payload: dict):
                # Re-import inside the remote function: it runs in a separate
                # Ray worker process with its own module state.
                from backend.core.executor import PipelineExecutor
                from backend.models.pipeline import Pipeline

                model = Pipeline.model_validate(pipeline_payload)
                result = PipelineExecutor().execute(model)
                return result.model_dump(mode="json")

            payload = pipeline.model_dump(mode="json")
            remote_ref = execute_pipeline_remote.remote(payload)
            remote_output = ray.get(remote_ref)
            execution = Execution.model_validate(remote_output)
            logger.info("Pipeline %s executed via Ray remote function", pipeline.id)
            return execution, "ray"
        except Exception as exc:  # broad by design: any failure means "run locally"
            return self._fallback_to_local(pipeline, "ray", exc)
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
import asyncio
2+
3+
from fastapi import BackgroundTasks
4+
5+
from backend.api.routes.executions import create_execution, execute_pipeline_background, executions_db
6+
from backend.api.routes.pipelines import create_pipeline, pipelines_db
7+
from backend.api.schemas import ExecutionCreate, PipelineCreate, StageCreate, StageTypeSchema
8+
from backend.core.distributed_executor import DistributedExecutionDispatcher
9+
from backend.models.pipeline import ExecutionStatus
10+
11+
12+
def _build_pipeline():
    """Reset the in-memory stores and create a fresh two-stage pipeline."""
    pipelines_db.clear()
    executions_db.clear()

    input_stage = StageCreate(
        id="in",
        name="Input",
        type=StageTypeSchema.INPUT,
        config={"source": "x", "data": [1, 2]},
    )
    output_stage = StageCreate(
        id="out",
        name="Output",
        type=StageTypeSchema.OUTPUT,
        config={"destination": "console"},
        dependencies=["in"],
    )
    request = PipelineCreate(
        name="distributed-pipeline",
        description="pipeline for distributed testing",
        stages=[input_stage, output_stage],
    )
    return asyncio.run(create_pipeline(request))
33+
34+
35+
def test_dispatcher_falls_back_to_local_for_unknown_backend():
36+
pipeline = _build_pipeline()
37+
38+
dispatcher = DistributedExecutionDispatcher()
39+
result = dispatcher.run(pipeline, backend_override="spark")
40+
41+
assert result.backend_used == "local"
42+
assert result.execution.status == ExecutionStatus.COMPLETED
43+
44+
45+
def test_create_execution_tracks_requested_backend_and_backend_used():
    """Requesting Celery with no broker records the request and the local fallback."""
    pipeline = _build_pipeline()

    request = ExecutionCreate(pipeline_id=pipeline.id, execution_backend="celery")
    execution = asyncio.run(create_execution(request, BackgroundTasks()))

    assert execution.context["requested_execution_backend"] == "celery"

    # Run the background task synchronously; with no Celery broker available
    # the dispatcher should fall back to the local executor.
    asyncio.run(execute_pipeline_background(pipeline.id, execution.id, "celery"))

    stored = executions_db[execution.id]
    assert stored.context["requested_execution_backend"] == "celery"
    assert stored.context["distributed_execution"]["backend_used"] == "local"

0 commit comments

Comments
 (0)