feat: enhance Streamlit UI with Dataset Explorer and update pipeline descriptions

VTvito · VTvito · commit 27176eb7182a · 2026-03-01T22:27:24.000+01:00
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -141,7 +141,7 @@ etl_microservices/
 │   └── pipeline_compiler.py       # Parallel pipeline execution via Preparator SDK (dispatch registry + topological layering)
 │
 ├── streamlit_app/
-│   ├── app.py                     # Streamlit UI: chat, YAML editor, execution monitor, service catalog, data preview/download, health dashboard
+│   ├── app.py                     # Streamlit UI: pipeline editor, YAML validation, execution monitor, dataset explorer (browse/preview/download outputs), service catalog
 │   ├── Dockerfile
 │   └── requirements.txt
 │
@@ -646,7 +646,7 @@ These are hard-won insights from building and debugging the platform. They shoul
 | **Orchestration** | Apache Airflow | 2.10.4 | PostgreSQL 16 backend, DAG-based |
 | **AI (cloud)** | OpenAI API | GPT-4o-mini default | Pipeline generation |
 | **AI (local)** | HuggingFace Transformers | Llama 3.2 1B Instruct | Text completion service |
-| **UI** | Streamlit | 1.30+ | Chat + pipeline builder + data preview/download + health dashboard |
+| **UI** | Streamlit | 1.30+ | Pipeline editor + execution monitor + dataset explorer + service catalog |
 | **Containers** | Docker Compose | v2 | Single bridge network |
 | **Monitoring** | Prometheus + Grafana | latest | Per-service metrics |
 | **Testing** | pytest | 7.x+ | Unit + integration |
diff --git a/README.md b/README.md
@@ -17,7 +17,7 @@
 - **Natural Language Pipelines** &mdash; Describe what you need in plain text; the AI agent generates and executes a validated YAML pipeline
 - **11 Composable Services** &mdash; Extract (CSV, SQL, API, Excel), Transform (clean, filter, join, quality checks, outlier detection, LLM), Load (CSV, Excel, JSON, Parquet)
 - **High-Performance Data Transfer** &mdash; Apache Arrow IPC binary format between all services (zero-copy, no CSV/JSON parsing overhead)
-- **Visual Pipeline Builder** &mdash; Streamlit UI with chat panel, YAML editor, real-time execution monitor, and data preview/download
+- **Visual Pipeline Builder** &mdash; Streamlit UI with YAML editor, real-time execution monitor, dataset explorer (browse outputs, preview, download), and service catalog
 - **Airflow Orchestration** &mdash; Production-ready DAGs with file-based XCom for large datasets
 - **Full Observability** &mdash; Prometheus metrics + Grafana dashboards + structured JSON logging + correlation ID tracing
 - **Extensible** &mdash; Add a new service in minutes using the included scaffold template and step-by-step guide
@@ -46,7 +46,7 @@ The Airflow admin user (`admin`/`admin`) is created automatically on first boot.
 
 | Interface | URL | Credentials |
 |---|---|---|
-| **Streamlit** (AI Pipeline Builder) | http://localhost:8501 | &mdash; |
+| **Streamlit** (Pipeline Builder + Dataset Explorer) | http://localhost:8501 | &mdash; |
 | **Airflow** | http://localhost:8080 | admin / admin |
 | **Grafana** (pre-provisioned dashboard) | http://localhost:3000 | admin / *GF_SECURITY_ADMIN_PASSWORD from .env* |
 | **Prometheus** | http://localhost:9090 | &mdash; |
@@ -64,6 +64,8 @@ Trigger one of the pre-built DAGs from the Airflow UI:
 
 Or paste a YAML from [`examples/pipelines/`](examples/pipelines/) into the Streamlit YAML Editor.
 
+After execution, switch to the **Datasets** tab to browse output files, preview data, and download results.
+
 ---
 
 ## How It Works
@@ -243,7 +245,7 @@ Full walkthrough: [docs/extending.md](docs/extending.md)
 <summary>Click to expand</summary>
 
 ```
-├── docker-compose.yml          # Full stack (17 containers)
+├── docker-compose.yml          # Full stack (18 containers)
 ├── Makefile                    # Common commands
 ├── data/demo/                  # Bundled demo datasets
 │   ├── hr_sample.csv
diff --git a/airflow/dags/hr_analytics_pipeline.py b/airflow/dags/hr_analytics_pipeline.py
@@ -43,9 +43,9 @@
     description="HR People Analytics ETL pipeline (IBM HR Attrition dataset)",
     tags=["hr", "analytics", "etl", "v4"],
     params={
-        "dataset_name": Param("hr_attrition", type="string", description="Dataset identifier"),
+        "dataset_name": Param("hr_demo", type="string", description="Dataset identifier"),
         "file_path": Param(
-            "/app/data/hr_attrition/WA_Fn-UseC_-HR-Employee-Attrition.csv",
+            "/app/data/hr_demo/data.csv",
             type="string",
             description="Path to HR CSV file on shared volume",
         ),
diff --git a/airflow/dags/xcom_file_utils.py b/airflow/dags/xcom_file_utils.py
@@ -32,6 +32,12 @@ def save_ipc_to_shared(ipc_data: bytes, dataset_name: str, step_name: str) -> st
     """
     xcom_dir = os.path.join(SHARED_DATA_ROOT, dataset_name, "xcom")
     os.makedirs(xcom_dir, exist_ok=True)
+    # Ensure the dataset dir and xcom dir are writable by all containers
+    try:
+        os.chmod(os.path.join(SHARED_DATA_ROOT, dataset_name), 0o777)
+        os.chmod(xcom_dir, 0o777)
+    except OSError:
+        pass
 
     timestamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
     unique_id = uuid.uuid4().hex[:8]
diff --git a/docs/demo-guide.md b/docs/demo-guide.md
@@ -52,11 +52,11 @@ curl http://localhost:5002/health  # clean-nan-service
 
 http://localhost:8501
 
-You'll see four tabs: **Chat**, **YAML Editor**, **Service Catalog**, **Health Dashboard**.
+You'll see four tabs: **Pipeline Editor**, **Execution**, **Datasets**, **Services**.
 
 ### 2. Chat Tab — describe the pipeline in natural language
 
-In the text box, type something like:
+In the chat panel, type something like:
 
 > *"Load the HR dataset, check data quality, remove outliers on monthly salary and save as CSV"*
 
@@ -88,7 +88,7 @@ cat examples/pipelines/hr_analytics.yaml
 
 ### 2. Paste it in the editor
 
-Open http://localhost:8501 → tab **YAML Editor** → paste the content.
+Open http://localhost:8501 → **Pipeline Editor** tab → paste the content into the YAML editor.
 
 The validator shows any errors (nonexistent service, missing parameter, cycle in graph)
 before execution even starts.
@@ -225,6 +225,21 @@ Click **Trigger DAG w/ config** and pass JSON:
 
 ---
 
+## Browsing Results — Dataset Explorer
+
+After any pipeline completes (UI, Airflow, or SDK), output files and metadata are written
+to the shared volume at `/app/data/<dataset_name>/`.
+
+Open http://localhost:8501 → **Datasets** tab to:
+
+- **Output Files** — preview CSV, Parquet, JSON, or Excel files and download them directly
+- **Pipeline Runs** — view run history grouped by `correlation_id`, with per-step duration, row counts, and service details
+- **Raw Metadata** — inspect the JSON metadata files written by each service
+
+Select any dataset from the sidebar dropdown to explore its contents.
+
+---
+
 ## What to observe during execution
 
 ### Metrics in Grafana
@@ -270,5 +285,6 @@ docker exec extract-csv-service ls /app/data/hr_demo/metadata/
 | `curl /health` → connection refused | service not started | `docker compose ps` + `docker compose up -d <service>` |
 | Streamlit shows "AI agent not available" | `OPENAI_API_KEY` missing | add key to `.env` + `docker compose restart streamlit-app` |
 | Pipeline fails with "file not found" | demo data not loaded | `make demo-data` |
+| Datasets tab shows no datasets | no pipelines have run yet | run a pipeline first, then refresh |
 | Airflow task fail "No module named..." | Airflow container not updated | `docker compose up -d --build airflow` |
 | Grafana "No data" in panels | services not scraped yet | wait 15s or run a pipeline to generate traffic |
diff --git a/preparator/preparator_v4.py b/preparator/preparator_v4.py
@@ -122,6 +122,29 @@ def run_service_ipc_in_ipc_out_with_header(self, service_key, ipc_data, header_d
         self._handle_error_response(resp, service_key)
         return resp.content  # Arrow IPC bytes in case of success
 
+    def run_service_ipc_in_json_out_with_header(self, service_key, ipc_data, header_dict):
+        """
+        Executes a POST request to the microservice identified by `service_key`,
+        sending `ipc_data` in the body and parameters (header_dict) in a header called 'X-Params' (as JSON).
+        Returns the parsed JSON response body as a dict (used by services that respond with JSON, e.g. load-data).
+        """
+        header_json = json.dumps(header_dict)
+        self.logger.info(f"Calling {service_key} with IPC data (size={len(ipc_data)}). Header: {header_json}")
+        url = self.services[service_key]
+
+        resp = self.session.post(
+            url,
+            data=ipc_data,  # Arrow IPC in body
+            headers={
+                "Content-Type": "application/vnd.apache.arrow.stream",
+                "X-Params": header_json,
+                "X-Correlation-ID": self.correlation_id,
+            },
+            timeout=self.timeout
+        )
+        self._handle_error_response(resp, service_key)
+        return resp.json()  # JSON dict in case of success
+
     # ================================================================
     # EXTRACTION
     # ================================================================
@@ -297,9 +320,11 @@ def text_completion_llm(
     def load_data(self, ipc_data, format='csv', dataset_name="default_dataset"):
         """
         'load_data' microservice
+        The load-data-service returns a JSON status response (not Arrow IPC),
+        so we use run_service_ipc_in_json_out_with_header.
         """
         header_dict = {
             "dataset_name": dataset_name,
             "format": format
         }
-        return self.run_service_ipc_in_ipc_out_with_header("load_data", ipc_data, header_dict)
+        return self.run_service_ipc_in_json_out_with_header("load_data", ipc_data, header_dict)
diff --git a/services/common/path_utils.py b/services/common/path_utils.py
@@ -35,6 +35,13 @@ def ensure_dataset_dirs(dataset_name):
     dataset_folder = resolved_dataset_folder
     metadata_dir = os.path.join(dataset_folder, "metadata")
     os.makedirs(metadata_dir, exist_ok=True)
+    # Ensure directories are world-writable so all containers (services run as
+    # root, Airflow runs as uid 50000) can read/write to the same shared volume.
+    try:
+        os.chmod(dataset_folder, 0o777)
+        os.chmod(metadata_dir, 0o777)
+    except OSError:
+        pass  # best-effort; may fail on read-only mounts or when another uid owns the directory
     return dataset_folder, metadata_dir
 
 
diff --git a/streamlit_app/app.py b/streamlit_app/app.py