feat: enhance pipeline validation with parameter and dependency checks

VTvito · VTvito · commit 3d9891fc78ee · 2026-03-02T13:15:29.000+01:00
diff --git a/ai_agent/pipeline_agent.py b/ai_agent/pipeline_agent.py
@@ -128,6 +128,11 @@ def validate_pipeline(pipeline_def: dict, registry: dict) -> tuple[list[str], li
     for i, step in enumerate(pipeline["steps"]):
         step_id = step.get("id", f"step_{i}")
 
+        params = step.get("params")  # only missing/None defaults to {}; falsy non-dicts ([], "", 0) must still be rejected
+        if params is not None and not isinstance(params, dict):
+            errors.append(f"Step '{step_id}': 'params' must be an object/dict")
+        params = params if isinstance(params, dict) else {}
+
         if step_id in step_ids:
             errors.append(f"Duplicate step ID: '{step_id}'")
         step_ids.add(step_id)
@@ -150,16 +155,35 @@ def validate_pipeline(pipeline_def: dict, registry: dict) -> tuple[list[str], li
             for pname, pinfo in svc_info.get("params", {}).items():
                 if pname == "dataset_name":
                     continue  # auto-injected by the compiler
-                if pinfo.get("required") and pname not in step.get("params", {}):
+                if pinfo.get("required") and pname not in params:
                     errors.append(
                         f"Step '{step_id}': missing required param '{pname}' for service '{service}'"
                     )
 
+        # Normalize depends_on: missing/None means no dependencies; any other non-list value is an error
+        depends_on = step.get("depends_on", [])
+        if depends_on is None:
+            depends_on = []
+        if not isinstance(depends_on, list):
+            errors.append(f"Step '{step_id}': 'depends_on' must be a list")
+            depends_on = []
+
         # Validate depends_on references
-        for dep in step.get("depends_on", []):
+        for dep in depends_on:
             if dep not in all_step_ids:
                 errors.append(f"Step '{step_id}': depends_on references unknown step '{dep}'")
 
+        if service in valid_services:
+            svc_type = registry["services"][service]["type"]
+            if svc_type == "extract" and depends_on:
+                errors.append(f"Step '{step_id}': extract steps must not have depends_on")
+            if svc_type != "extract" and not depends_on:
+                errors.append(f"Step '{step_id}': non-extract steps require depends_on")
+            if service == "join_datasets" and len(depends_on) != 2:
+                errors.append(f"Step '{step_id}': join_datasets requires exactly 2 depends_on entries")
+            if service != "join_datasets" and len(depends_on) > 1:
+                errors.append(f"Step '{step_id}': only join_datasets supports multiple depends_on entries")
+
     if not has_extract:
         errors.append("Pipeline must have at least one extract step")
 
diff --git a/ai_agent/pipeline_compiler.py b/ai_agent/pipeline_compiler.py
@@ -189,6 +189,9 @@ def _build_dispatch_registry(prep) -> dict[str, Callable]:
     }
 
 
+_EXTRACT_SERVICES = {"extract_csv", "extract_excel", "extract_api", "extract_sql"}
+
+
 # ── Pipeline Compiler ──────────────────────────────────────────────
 
 class PipelineCompiler:
@@ -417,4 +420,10 @@ def _dispatch_step(
                 "(one for each input dataset)"
             )
 
+        if service not in _EXTRACT_SERVICES and input_data is None:
+            raise ValueError(
+                f"Service '{service}' requires input data. "
+                "Check depends_on and upstream outputs."
+            )
+
         return handler(params, input_data, dataset_name, input_data_2)
diff --git a/airflow/Dockerfile b/airflow/Dockerfile
@@ -6,6 +6,7 @@ FROM apache/airflow:2.10.4
 ENV AIRFLOW__METRICS__STATSD_ON=True
 ENV AIRFLOW__METRICS__STATSD_HOST=statsd-exporter
 ENV AIRFLOW__METRICS__STATSD_PORT=9125
+ENV PYTHONWARNINGS="ignore:invalid escape sequence:SyntaxWarning"
 
 # Crea le cartelle necessarie
 RUN mkdir -p /opt/airflow/dags /opt/airflow/logs /opt/airflow/plugins
diff --git a/services/data-quality-service/app/dq.py b/services/data-quality-service/app/dq.py
@@ -102,6 +102,9 @@ def basic_quality_checks(arrow_table, rules=None):
     if range_rules:
         range_results = {}
         for col, bounds in range_rules.items():
+            if not isinstance(bounds, dict):
+                range_results[col] = {"pass": False, "reason": "invalid bounds"}
+                continue
             if col not in df.columns:
                 range_results[col] = {"pass": False, "reason": "column not found"}
                 continue
@@ -111,15 +114,22 @@ def basic_quality_checks(arrow_table, rules=None):
             col_min = float(df[col].min()) if df[col].notna().any() else None
             col_max = float(df[col].max()) if df[col].notna().any() else None
             ok = True
-            if "min" in bounds and col_min is not None:
-                ok = ok and col_min >= bounds["min"]
-            if "max" in bounds and col_max is not None:
-                ok = ok and col_max <= bounds["max"]
+            try:
+                expected_min = float(bounds["min"]) if "min" in bounds else None
+                expected_max = float(bounds["max"]) if "max" in bounds else None
+            except (TypeError, ValueError):
+                range_results[col] = {"pass": False, "reason": "invalid bounds"}
+                continue
+
+            if expected_min is not None and col_min is not None:
+                ok = ok and col_min >= expected_min
+            if expected_max is not None and col_max is not None:
+                ok = ok and col_max <= expected_max
             range_results[col] = {
                 "actual_min": col_min,
                 "actual_max": col_max,
-                "expected_min": bounds.get("min"),
-                "expected_max": bounds.get("max"),
+                "expected_min": expected_min,
+                "expected_max": expected_max,
                 "pass": bool(ok)
             }
         result["checks"]["value_range"] = range_results
diff --git a/services/outlier-detection-service/app/outliers.py b/services/outlier-detection-service/app/outliers.py
@@ -40,7 +40,9 @@ def detect_and_remove_outliers(arrow_table, column, z_threshold=3.0):
         return pa.Table.from_pandas(df), 0
 
     z_score = (series_numeric - mean_val).abs() / std_val
-    filtered_df = df[z_score <= z_threshold]
+    # Keep rows where z_score is NaN to avoid dropping non-numeric rows silently
+    keep_mask = (z_score <= z_threshold) | z_score.isna()
+    filtered_df = df[keep_mask]
     removed_count = before_rows - filtered_df.shape[0]
 
     new_table = pa.Table.from_pandas(filtered_df)
diff --git a/tests/unit/test_pipeline_agent.py b/tests/unit/test_pipeline_agent.py
@@ -42,6 +42,14 @@
                 "dataset_name": {"type": "string", "required": True, "description": "Dataset name"},
             },
         },
+        "join_datasets": {
+            "type": "transform",
+            "params": {
+                "dataset_name": {"type": "string", "required": True, "description": "Dataset name"},
+                "join_key": {"type": "string", "required": False},
+                "join_type": {"type": "string", "required": False},
+            },
+        },
     }
 }
 
@@ -171,6 +179,62 @@ def test_step_missing_service_field(self):
         errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
         assert any("service" in e.lower() for e in errors)
 
+    def test_non_extract_requires_depends_on(self):
+        pipeline = _pipeline([
+            {"id": "extract", "service": "extract_csv", "params": {"file_path": "/data/f.csv"}},
+            {"id": "clean", "service": "clean_nan"},  # transform step deliberately left without depends_on
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("require depends_on" in e.lower() for e in errors)  # validator wording is "... steps require depends_on"
+
+    def test_extract_cannot_have_depends_on(self):
+        pipeline = _pipeline([
+            {"id": "extract", "service": "extract_csv", "params": {"file_path": "/data/f.csv"}, "depends_on": ["x"]},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("must not have depends_on" in e.lower() for e in errors)
+
+    def test_params_must_be_dict(self):
+        pipeline = _pipeline([
+            {"id": "extract", "service": "extract_csv", "params": []},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("params" in e.lower() and "dict" in e.lower() for e in errors)
+
+    def test_depends_on_must_be_list(self):
+        pipeline = _pipeline([
+            {"id": "extract", "service": "extract_csv", "params": {"file_path": "/data/f.csv"}},
+            {"id": "clean", "service": "clean_nan", "depends_on": "extract"},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("depends_on" in e.lower() and "list" in e.lower() for e in errors)
+
+    def test_non_join_multiple_depends_on_invalid(self):
+        pipeline = _pipeline([
+            {"id": "extract1", "service": "extract_csv", "params": {"file_path": "/data/a.csv"}},
+            {"id": "extract2", "service": "extract_csv", "params": {"file_path": "/data/b.csv"}},
+            {"id": "clean", "service": "clean_nan", "depends_on": ["extract1", "extract2"]},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("multiple depends_on" in e.lower() for e in errors)
+
+    def test_join_requires_two_depends_on(self):
+        pipeline = _pipeline([
+            {"id": "extract1", "service": "extract_csv", "params": {"file_path": "/data/a.csv"}},
+            {"id": "join", "service": "join_datasets", "depends_on": ["extract1"]},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert any("join_datasets" in e.lower() and "exactly 2" in e.lower() for e in errors)
+
+    def test_join_accepts_two_depends_on(self):
+        pipeline = _pipeline([
+            {"id": "extract1", "service": "extract_csv", "params": {"file_path": "/data/a.csv"}},
+            {"id": "extract2", "service": "extract_csv", "params": {"file_path": "/data/b.csv"}},
+            {"id": "join", "service": "join_datasets", "depends_on": ["extract1", "extract2"]},
+        ])
+        errors, warnings = validate_pipeline(pipeline, MINIMAL_REGISTRY)
+        assert errors == []
+
 
 # ── Return type contract ──────────────────────────────────────────────────────