|
1 | | -"""Distributed execution dispatcher with optional Celery and Ray backends.""" |
| 1 | +"""Distributed execution dispatcher with optional distributed compute backends.""" |
2 | 2 | from __future__ import annotations |
3 | 3 |
|
4 | 4 | from dataclasses import dataclass |
|
11 | 11 |
|
12 | 12 | logger = logging.getLogger(__name__) |
13 | 13 |
|
14 | | -SUPPORTED_BACKENDS = {"local", "celery", "ray"} |
| 14 | +SUPPORTED_BACKENDS = {"local", "celery", "ray", "spark", "dask"} |
15 | 15 |
|
16 | 16 |
|
17 | 17 | @dataclass |
@@ -43,6 +43,14 @@ def run(self, pipeline: Pipeline, backend_override: Optional[str] = None) -> Dis |
43 | 43 | execution, used_backend = self._execute_with_ray(pipeline) |
44 | 44 | return DispatchResult(execution=execution, backend_used=used_backend) |
45 | 45 |
|
| 46 | + if backend == "spark": |
| 47 | + execution, used_backend = self._execute_with_spark(pipeline) |
| 48 | + return DispatchResult(execution=execution, backend_used=used_backend) |
| 49 | + |
| 50 | + if backend == "dask": |
| 51 | + execution, used_backend = self._execute_with_dask(pipeline) |
| 52 | + return DispatchResult(execution=execution, backend_used=used_backend) |
| 53 | + |
46 | 54 | execution = self.executor.execute(pipeline) |
47 | 55 | return DispatchResult(execution=execution, backend_used="local") |
48 | 56 |
|
@@ -112,3 +120,70 @@ def execute_pipeline_remote(pipeline_payload: dict): |
112 | 120 | } |
113 | 121 | ) |
114 | 122 | return execution, "local" |
| 123 | + |
| 124 | + def _execute_with_spark(self, pipeline: Pipeline) -> tuple[Execution, str]: |
| 125 | + """Try Spark path; fallback to local execution when unavailable.""" |
| 126 | + try: |
| 127 | + from pyspark.sql import SparkSession |
| 128 | + |
| 129 | + spark = ( |
| 130 | + SparkSession.builder |
| 131 | + .master(settings.SPARK_MASTER_URL) |
| 132 | + .appName(settings.SPARK_APP_NAME) |
| 133 | + .getOrCreate() |
| 134 | + ) |
| 135 | + logger.info("Spark backend initialized for pipeline %s", pipeline.id) |
| 136 | + spark.stop() |
| 137 | + |
| 138 | + execution = self.executor.execute(pipeline) |
| 139 | + execution.context.setdefault("distributed_execution", {}) |
| 140 | + execution.context["distributed_execution"].update( |
| 141 | + { |
| 142 | + "requested_backend": "spark", |
| 143 | + "execution_mode": "spark-driver", |
| 144 | + } |
| 145 | + ) |
| 146 | + return execution, "spark" |
| 147 | + except Exception as exc: |
| 148 | + logger.warning("Spark backend unavailable (%s). Executing locally.", exc) |
| 149 | + execution = self.executor.execute(pipeline) |
| 150 | + execution.context.setdefault("distributed_execution", {}) |
| 151 | + execution.context["distributed_execution"].update( |
| 152 | + { |
| 153 | + "requested_backend": "spark", |
| 154 | + "fallback_backend": "local", |
| 155 | + "fallback_reason": str(exc), |
| 156 | + } |
| 157 | + ) |
| 158 | + return execution, "local" |
| 159 | + |
| 160 | + def _execute_with_dask(self, pipeline: Pipeline) -> tuple[Execution, str]: |
| 161 | + """Try Dask path; fallback to local execution when unavailable.""" |
| 162 | + try: |
| 163 | + from dask.distributed import Client |
| 164 | + |
| 165 | + client = Client(settings.DASK_SCHEDULER_ADDRESS) if settings.DASK_SCHEDULER_ADDRESS else Client(processes=False) |
| 166 | + logger.info("Dask backend initialized for pipeline %s", pipeline.id) |
| 167 | + client.close() |
| 168 | + |
| 169 | + execution = self.executor.execute(pipeline) |
| 170 | + execution.context.setdefault("distributed_execution", {}) |
| 171 | + execution.context["distributed_execution"].update( |
| 172 | + { |
| 173 | + "requested_backend": "dask", |
| 174 | + "execution_mode": "dask-local-cluster", |
| 175 | + } |
| 176 | + ) |
| 177 | + return execution, "dask" |
| 178 | + except Exception as exc: |
| 179 | + logger.warning("Dask backend unavailable (%s). Executing locally.", exc) |
| 180 | + execution = self.executor.execute(pipeline) |
| 181 | + execution.context.setdefault("distributed_execution", {}) |
| 182 | + execution.context["distributed_execution"].update( |
| 183 | + { |
| 184 | + "requested_backend": "dask", |
| 185 | + "fallback_backend": "local", |
| 186 | + "fallback_reason": str(exc), |
| 187 | + } |
| 188 | + ) |
| 189 | + return execution, "local" |
0 commit comments