feat: enhance Streamlit UI with platform readiness checks and quick Airflow triggers; improve error handling in data quality checks; update documentation for new features

VTvito · VTvito · commit 75a78965c2a6 · 2026-03-05T13:07:24.000+01:00
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@
 - **Natural Language Pipelines** &mdash; Describe what you need in plain text; the AI agent generates and executes a validated YAML pipeline
 - **11 Composable Services** &mdash; Extract (CSV, SQL, API, Excel), Transform (clean, filter, join, quality checks, outlier detection, LLM), Load (CSV, Excel, JSON, Parquet)
 - **High-Performance Data Transfer** &mdash; Apache Arrow IPC binary format between all services (zero-copy, no CSV/JSON parsing overhead)
-- **Visual Pipeline Builder** &mdash; Streamlit UI with YAML editor, real-time execution monitor, dataset explorer (browse outputs, preview, download), and service catalog
+- **Visual Pipeline Builder** &mdash; Streamlit UI with YAML editor, platform readiness checks, one-click Airflow triggers, real-time execution monitor, dataset explorer (browse outputs, preview, download), and service catalog
 - **Airflow Orchestration** &mdash; Production-ready DAGs with file-based XCom for large datasets
 - **Full Observability** &mdash; Prometheus metrics + Grafana dashboards + structured JSON logging + correlation ID tracing
 - **Extensible** &mdash; Add a new service in minutes using the included scaffold template and step-by-step guide
@@ -64,7 +64,16 @@ Trigger one of the pre-built DAGs from the Airflow UI:
 
 Or paste a YAML from [`examples/pipelines/`](examples/pipelines/) into the Streamlit YAML Editor.
 
-After execution, switch to the **Datasets** tab to browse output files, preview data, and download results.
+After execution, switch to the **Datasets** tab to browse output files, preview data, download results, and compare the latest run against the previous successful run.
+
+### New in Streamlit UX
+
+- **Platform Readiness** panel in Execution tab: live checks for Airflow, Streamlit, Prometheus, Grafana, including Airflow scheduler heartbeat status
+- **Quick Airflow Triggers** in Execution tab: trigger `hr_analytics_pipeline`, `ecommerce_pipeline`, or `weather_api_pipeline` without leaving Streamlit
+- **Execution insights**: successful steps, processed data volume, slowest step, and orchestration overhead (%)
+- **Run diagnostics** in Datasets tab: per-run active processing vs queue/orchestration gap timeline
+- **Run Comparison** in Datasets tab: current run vs previous successful run deltas for duration, final rows, and removed outliers
+- **Business KPI snapshot** from latest output file (domain-aware: HR, e-commerce, weather, or generic completeness)
 
 ---
 
@@ -215,7 +224,7 @@ Results including PNG charts and an interactive Plotly report are saved to `benc
 ### Testing
 
 ```bash
-make test              # Run all 208 tests (unit + integration)
+make test              # Run all tests (unit + integration)
 make test-coverage     # With coverage report
 make lint              # Ruff linter
 ```
diff --git a/airflow/dags/xcom_file_utils.py b/airflow/dags/xcom_file_utils.py
@@ -18,6 +18,16 @@
 SHARED_DATA_ROOT = "/app/data"
 
 
+def _shared_dir_mode() -> int:
+    raw_mode = os.getenv("ETL_SHARED_DIR_MODE", "775")
+    if raw_mode.startswith("0o"):
+        raw_mode = raw_mode[2:]
+    try:
+        return int(raw_mode, 8)
+    except ValueError:
+        return 0o775
+
+
 def save_ipc_to_shared(ipc_data: bytes, dataset_name: str, step_name: str) -> str:
     """
     Save Arrow IPC data to the shared volume.
@@ -32,10 +42,11 @@ def save_ipc_to_shared(ipc_data: bytes, dataset_name: str, step_name: str) -> st
     """
     xcom_dir = os.path.join(SHARED_DATA_ROOT, dataset_name, "xcom")
     os.makedirs(xcom_dir, exist_ok=True)
-    # Ensure the dataset dir and xcom dir are writable by all containers
+    # Mode is configurable via ETL_SHARED_DIR_MODE; the 0o775 default is group-writable, not world-writable.
+    mode = _shared_dir_mode()
     try:
-        os.chmod(os.path.join(SHARED_DATA_ROOT, dataset_name), 0o777)
-        os.chmod(xcom_dir, 0o777)
+        os.chmod(os.path.join(SHARED_DATA_ROOT, dataset_name), mode)
+        os.chmod(xcom_dir, mode)
     except OSError:
         pass
 
diff --git a/benchmark/monolithic_pipeline.py b/benchmark/monolithic_pipeline.py
@@ -55,11 +55,13 @@ def run_monolithic_pipeline(
 
     # ── Step 2: Data Quality Check ──
     t0 = time.time()
-    assert len(df) >= 10, f"Dataset has only {len(df)} rows (min: 10)"
+    if len(df) < 10:
+        raise ValueError(f"Dataset has only {len(df)} rows (min: 10)")
     total_cells = df.shape[0] * df.shape[1]
     total_nulls = df.isnull().sum().sum()
     null_ratio = total_nulls / total_cells if total_cells > 0 else 0
-    assert null_ratio <= null_threshold, f"Null ratio {null_ratio:.3f} exceeds threshold {null_threshold}"
+    if null_ratio > null_threshold:
+        raise ValueError(f"Null ratio {null_ratio:.3f} exceeds threshold {null_threshold}")
     results["quality_check"] = {
         "duration_sec": time.time() - t0,
         "null_ratio": float(null_ratio),
diff --git a/docs/access-credentials.md b/docs/access-credentials.md
@@ -61,6 +61,11 @@ cp .env.example .env
 | `LLM_PROVIDER` | `openai` | AI provider: `openai` or `local` |
 | `OPENAI_API_KEY` | *(empty)* | OpenAI API key — required if `LLM_PROVIDER=openai` |
 | `OPENAI_MODEL` | `gpt-4o-mini` | OpenAI model to use |
+| `AIRFLOW_BASE_URL` | `http://localhost:8080` | Base URL of the Airflow instance targeted by Streamlit quick triggers (optional override) |
+| `AIRFLOW_USERNAME` | `admin` | Airflow username used by Streamlit quick triggers (optional override) |
+| `AIRFLOW_PASSWORD` | `admin` | Airflow password used by Streamlit quick triggers (optional override) |
+
+> The Streamlit UI shows a warning when Airflow still uses default credentials.
 
 ---
 
diff --git a/docs/architecture.md b/docs/architecture.md
@@ -518,6 +518,21 @@ Every service writes a JSON file to `/app/data/<dataset_name>/metadata/` after p
 These files are low-latency (written locally on shared volume) and allow
 reconstructing what happened at each step without depending on external systems.
 
+### Streamlit observability surface (operator UX)
+
+The Streamlit UI now exposes runtime diagnostics directly in the **Execution** and
+**Datasets** tabs to reduce context switching between Airflow, Grafana, and logs.
+
+- **Platform Readiness**: probes Airflow (`/health`), Streamlit, Prometheus, and Grafana
+- **Airflow scheduler heartbeat status**: explicit healthy/not-ready signal from Airflow health payload
+- **Execution insights**: successful steps, total processed KB, slowest step, orchestration overhead (%)
+- **Run timeline diagnostics**: active processing vs queue/orchestration gap per step
+- **Run comparison**: latest run vs previous successful run deltas for duration, final rows, and outliers removed
+
+This UX layer does not replace Prometheus/Grafana but gives immediate operator feedback
+for common questions like "is the platform ready?", "where is time spent?", and
+"did this run improve or regress vs the previous one?".
+
 ### Prometheus scraping
 
 ```
diff --git a/docs/demo-guide.md b/docs/demo-guide.md
@@ -54,6 +54,11 @@ http://localhost:8501
 
 You'll see four tabs: **Pipeline Editor**, **Execution**, **Datasets**, **Services**.
 
+The **Execution** tab now includes:
+- **Platform Readiness** checks (Airflow, Streamlit, Prometheus, Grafana)
+- **Quick Airflow Triggers** (trigger core DAGs directly from Streamlit)
+- **Execution Insights** (slowest step, processed data, orchestration overhead)
+
 ### 2. Chat Tab — describe the pipeline in natural language
 
 In the chat panel, type something like:
@@ -74,6 +79,8 @@ The panel shows:
 Click **Execute**. You'll see steps complete sequentially (or in parallel if independent).
 When done, a table preview and saved file path in `/app/data/` appear.
 
+You can also use **Quick Airflow Triggers** in the same tab to launch DAGs directly via the Airflow API.
+
 ---
 
 ## Scenario B — YAML Editor (pre-built pipeline)
@@ -233,9 +240,14 @@ to the shared volume at `/app/data/<dataset_name>/`.
 Open http://localhost:8501 → **Datasets** tab to:
 
 - **Output Files** — preview CSV, Parquet, JSON, or Excel files and download them directly
-- **Pipeline Runs** — view run history grouped by `correlation_id`, with per-step duration, row counts, and service details
+- **Pipeline Runs** — view run history grouped by `correlation_id`, with:
+    - run timeline (active processing vs queue/orchestration gap)
+    - compact run comparison (current vs previous successful run)
+    - per-step duration, row counts, and service details
 - **Raw Metadata** — inspect the JSON metadata files written by each service
 
+The dataset overview also includes a **Business KPI snapshot** from the latest output (for example: revenue/AOV for e-commerce, attrition rate for HR).
+
 Select any dataset from the sidebar dropdown to explore its contents.
 
 ---
diff --git a/services/common/path_utils.py b/services/common/path_utils.py
@@ -6,6 +6,16 @@
 _DATASET_NAME_PATTERN = re.compile(r"^[A-Za-z0-9._-]{1,128}$")
 
 
+def _shared_dir_mode():
+    raw_mode = os.getenv("ETL_SHARED_DIR_MODE", "775")
+    if raw_mode.startswith("0o"):
+        raw_mode = raw_mode[2:]
+    try:
+        return int(raw_mode, 8)
+    except ValueError:
+        return 0o775
+
+
 def sanitize_dataset_name(dataset_name):
     if not isinstance(dataset_name, str):
         raise ValueError("Parameter 'dataset_name' must be a string")
@@ -35,11 +45,11 @@ def ensure_dataset_dirs(dataset_name):
     dataset_folder = resolved_dataset_folder
     metadata_dir = os.path.join(dataset_folder, "metadata")
     os.makedirs(metadata_dir, exist_ok=True)
-    # Ensure directories are world-writable so all containers (services run as
-    # root, Airflow runs as uid 50000) can read/write to the same shared volume.
+    # Mode is configurable via ETL_SHARED_DIR_MODE; the 0o775 default is group-writable, not world-writable.
+    mode = _shared_dir_mode()
     try:
-        os.chmod(dataset_folder, 0o777)
-        os.chmod(metadata_dir, 0o777)
+        os.chmod(dataset_folder, mode)
+        os.chmod(metadata_dir, mode)
     except OSError:
         pass  # best-effort; may fail on read-only mounts
     return dataset_folder, metadata_dir
diff --git a/services/extract-excel-service/app/extract.py b/services/extract-excel-service/app/extract.py
@@ -4,6 +4,7 @@
 import pyarrow as pa
 from common.path_utils import resolve_input_path
 
+
 def process_excel(file_path):
     """
     Load Excel into DataFrame and return Arrow Table.
diff --git a/streamlit_app/app.py b/streamlit_app/app.py
diff --git a/tests/integration/test_service_endpoints.py b/tests/integration/test_service_endpoints.py