rename just commands, fix error in tests, fix max_workers not working for run-local

Michael Aydinbas · Michael Aydinbas · commit 7f8823059dfb · 2026-02-26T11:51:30.000+01:00
diff --git a/a4d-python/SETUP.md b/a4d-python/SETUP.md
@@ -58,10 +58,13 @@ gcloud config set project a4dphase2
 # Test with a single file (fastest)
 just run-file /path/to/tracker.xlsx
 
-# Process all trackers in A4D_DATA_ROOT, skip GCS/BigQuery
-just run --skip-upload
+# Process all files already in A4D_DATA_ROOT — no GCS
+just run-local
 
-# Full pipeline (downloads from GCS, uploads results, loads into BigQuery)
+# Download latest files from GCS, process locally — no upload
+just run-download
+
+# Full pipeline: download from GCS, process, upload results + load BigQuery
 just run
 ```
 
diff --git a/a4d-python/justfile b/a4d-python/justfile
@@ -59,30 +59,27 @@ clean:
     find . -type d -name __pycache__ -exec rm -rf {} +
     find . -type f -name "*.pyc" -delete
 
-# Run full pipeline (extract + clean + tables)
+# Full pipeline: download from GCS, process, upload to GCS + BigQuery
 run *ARGS:
-    uv run a4d process-patient {{ARGS}}
+    uv run a4d run-pipeline {{ARGS}}
 
-# Run pipeline with 8 workers (parallel processing)
-run-parallel:
-    uv run a4d process-patient --workers 8
+# Download from GCS, process locally, no upload
+run-download *ARGS:
+    uv run a4d run-pipeline --skip-upload {{ARGS}}
 
-# Extract and clean only (skip table creation)
-run-clean:
-    uv run a4d process-patient --workers 8 --skip-tables
+# Process local files only, no GCS (use files already in data_root)
+# Optionally pass a path: just run-local --data-root /path/to/trackers
+run-local *ARGS:
+    uv run a4d process-patient {{ARGS}}
 
-# Force reprocess all files (ignore existing outputs)
-run-force:
-    uv run a4d process-patient --workers 8 --force
+# Process a single tracker file
+run-file FILE:
+    uv run a4d process-patient --file "{{FILE}}"
 
 # Create tables from existing cleaned parquet files
 create-tables INPUT:
     uv run a4d create-tables --input "{{INPUT}}"
 
-# Process a single tracker file
-run-file FILE:
-    uv run a4d process-patient --file "{{FILE}}"
-
 # Build Docker image
 docker-build:
     docker build -t a4d-python:latest .
diff --git a/a4d-python/src/a4d/cli.py b/a4d-python/src/a4d/cli.py
@@ -8,7 +8,7 @@
 from rich.console import Console
 from rich.table import Table
 
-from a4d.pipeline.patient import process_patient_tables, run_patient_pipeline
+from a4d.pipeline.patient import discover_tracker_files, process_patient_tables, run_patient_pipeline
 from a4d.tables.logs import create_table_logs
 
 app = typer.Typer(
@@ -69,30 +69,36 @@ def process_patient_cmd(
         ),
     ] = None,
     workers: Annotated[
-        int, typer.Option("--workers", "-w", help="Number of parallel workers (1 = sequential)")
-    ] = 1,
+        int | None, typer.Option("--workers", "-w", help="Number of parallel workers (default: A4D_MAX_WORKERS)")
+    ] = None,
     skip_tables: Annotated[
         bool, typer.Option("--skip-tables", help="Skip table creation (only extract + clean)")
     ] = False,
     force: Annotated[
         bool, typer.Option("--force", help="Force reprocessing (ignore existing outputs)")
     ] = False,
-    clean: Annotated[
-        bool,
-        typer.Option("--clean", help="Wipe output directory before running (default when --file is used)"),
-    ] = False,
+    data_root: Annotated[
+        Path | None,
+        typer.Option("--data-root", "-d", help="Directory containing tracker files (default: from config)"),
+    ] = None,
     output_root: Annotated[
         Path | None, typer.Option("--output", "-o", help="Output directory (default: from config)")
     ] = None,
 ):
     """Process patient data pipeline.
 
+    Output is always cleaned before each run so tables reflect only the
+    current run's files.
+
     \b
     Examples:
-        # Process all trackers in data_root
+        # Process all trackers in data_root (from config)
         uv run a4d process-patient
 
-        # Process specific file (output is always cleaned first)
+        # Process all trackers in a specific directory
+        uv run a4d process-patient --data-root /path/to/trackers
+
+        # Process specific file
         uv run a4d process-patient --file /path/to/tracker.xlsx
 
         # Parallel processing with 8 workers
@@ -101,25 +107,45 @@ def process_patient_cmd(
         # Just extract + clean, skip tables
         uv run a4d process-patient --skip-tables
     """
-    console.print("\n[bold blue]A4D Patient Pipeline[/bold blue]\n")
+    from a4d.config import settings as _settings
 
-    # Prepare tracker files list
-    tracker_files = [file] if file else None
+    console.print("\n[bold blue]A4D Patient Pipeline[/bold blue]\n")
 
-    # Single-file mode always cleans first — there's no reason to keep stale
-    # outputs from previous runs when testing a specific file.
-    clean_output = clean or (file is not None)
+    if file:
+        tracker_files = [file]
+        data_root_display = f"{file} (single file)"
+    elif data_root:
+        tracker_files = discover_tracker_files(data_root)
+        if not tracker_files:
+            console.print(f"[bold red]Error: No tracker files found in {data_root}[/bold red]\n")
+            raise typer.Exit(1)
+        data_root_display = str(data_root)
+    else:
+        tracker_files = None  # pipeline uses settings.data_root
+        data_root_display = str(_settings.data_root)
+
+    _output_root = output_root or _settings.output_root
+    _workers = workers if workers is not None else _settings.max_workers
+
+    console.print(f"Data root:   {data_root_display}")
+    console.print(f"Output root: {_output_root}")
+    console.print(f"Workers:     {_workers}")
+    if skip_tables:
+        console.print("Tables:      skipped")
+    if force:
+        console.print("Force:       yes")
+    console.print()
 
     # Step 1: Extract + clean (table creation handled below for visible progress)
     console.print("[bold]Step 1/3:[/bold] Extracting and cleaning tracker files...")
     try:
         result = run_patient_pipeline(
             tracker_files=tracker_files,
-            max_workers=workers,
+            max_workers=_workers,
             output_root=output_root,
             skip_tables=True,  # tables created below with console feedback
             force=force,
-            clean_output=clean_output,
+            clean_output=True,
             show_progress=True,
             console_log_level="ERROR",
         )
@@ -130,9 +156,6 @@ def process_patient_cmd(
     # Step 2+3: Table and log creation with console feedback
     tables: dict[str, Path] = {}
     if not skip_tables and result.successful_trackers > 0:
-        from a4d.config import settings as _settings
-
-        _output_root = output_root or _settings.output_root
         cleaned_dir = _output_root / "patient_data_cleaned"
         tables_dir = _output_root / "tables"
         logs_dir = _output_root / "logs"
@@ -483,14 +506,18 @@ def upload_output_cmd(
 @app.command("run-pipeline")
 def run_pipeline_cmd(
     workers: Annotated[
-        int, typer.Option("--workers", "-w", help="Number of parallel workers (1 = sequential)")
-    ] = 4,
+        int | None, typer.Option("--workers", "-w", help="Number of parallel workers (default: A4D_MAX_WORKERS)")
+    ] = None,
     force: Annotated[
         bool, typer.Option("--force", help="Force reprocessing (ignore existing outputs)")
     ] = False,
+    skip_download: Annotated[
+        bool,
+        typer.Option("--skip-download", help="Skip GCS download (use files already in data_root)"),
+    ] = False,
     skip_upload: Annotated[
         bool,
-        typer.Option("--skip-upload", help="Skip GCS and BigQuery uploads (local testing)"),
+        typer.Option("--skip-upload", help="Skip GCS and BigQuery upload steps"),
     ] = False,
 ):
     """Run the full end-to-end A4D pipeline.
@@ -506,28 +533,33 @@ def run_pipeline_cmd(
 
     \b
     Examples:
-        # Full pipeline with 4 workers
+        # Full pipeline (download + process + upload)
         uv run a4d run-pipeline
 
-        # Force reprocess all files
-        uv run a4d run-pipeline --force
-
-        # Local testing without GCS/BigQuery uploads
+        # Download latest files, process locally, skip upload
         uv run a4d run-pipeline --skip-upload
+
+        # Process local files only, no download or upload
+        uv run a4d run-pipeline --skip-download --skip-upload
     """
     from a4d.config import settings
     from a4d.gcp.bigquery import load_pipeline_tables
     from a4d.gcp.storage import download_tracker_files, upload_output
 
+    _workers = workers if workers is not None else settings.max_workers
+
     console.print("\n[bold blue]A4D Full Pipeline[/bold blue]\n")
     console.print(f"Data root:   {settings.data_root}")
     console.print(f"Output root: {settings.output_root}")
-    console.print(f"Workers:     {workers}")
+    console.print(f"Workers:     {_workers}")
     console.print(f"Project:     {settings.project_id}")
-    console.print(f"Dataset:     {settings.dataset}\n")
+    console.print(f"Dataset:     {settings.dataset}")
+    console.print(f"Download:    {'yes' if not skip_download else 'skipped (--skip-download)'}")
+    console.print(f"Upload:      {'yes' if not skip_upload else 'skipped (--skip-upload)'}")
+    console.print()
 
     # Step 1 – Download tracker files from GCS
-    if not skip_upload:
+    if not skip_download:
         console.print("[bold]Step 1/5:[/bold] Downloading tracker files from GCS...")
         try:
             downloaded = download_tracker_files(destination=settings.data_root)
@@ -536,13 +568,13 @@ def run_pipeline_cmd(
             console.print(f"\n[bold red]Error during download: {e}[/bold red]\n")
             raise typer.Exit(1) from e
     else:
-        console.print("[bold]Step 1/5:[/bold] Skipping GCS download (--skip-upload)\n")
+        console.print("[bold]Step 1/5:[/bold] Skipping GCS download (--skip-download)\n")
 
     # Step 2+3 – Extract, clean and build tables
     console.print("[bold]Steps 2–3/5:[/bold] Processing tracker files...\n")
     try:
         result = run_patient_pipeline(
-            max_workers=workers,
+            max_workers=_workers,
             force=force,
             show_progress=True,
             console_log_level="WARNING",
diff --git a/a4d-python/src/a4d/pipeline/patient.py b/a4d-python/src/a4d/pipeline/patient.py
@@ -132,8 +132,8 @@ def run_patient_pipeline(
 
     Pipeline steps:
     1. For each tracker (optionally parallel):
-       - Extract patient data from Excel → raw parquet
-       - Clean raw data → cleaned parquet
+        - Extract patient data from Excel → raw parquet
+        - Clean raw data → cleaned parquet
     2. Create final tables from all cleaned parquets (if not skipped)
 
     Args:
@@ -142,6 +142,7 @@ def run_patient_pipeline(
         output_root: Output directory (None = use settings.output_root)
         skip_tables: If True, only extract + clean, skip table creation
         force: If True, reprocess even if outputs exist
+        clean_output: If True, wipe patient_data_raw/, patient_data_cleaned/, tables/, logs/ before run
         progress_callback: Optional callback(tracker_name, success) called after each tracker
         show_progress: If True, show tqdm progress bar
         console_log_level: Console log level (None=INFO, ERROR=quiet, etc)
@@ -175,10 +176,9 @@ def run_patient_pipeline(
     if output_root is None:
         output_root = settings.output_root
 
-    # Wipe previous run's intermediate outputs so tables only reflect this run.
-    # Does not delete logs (useful for debugging) or the tables dir itself.
+    # Wipe previous run's outputs so tables reflect only this run.
     if clean_output:
-        for subdir in ("patient_data_raw", "patient_data_cleaned", "tables"):
+        for subdir in ("patient_data_raw", "patient_data_cleaned", "tables", "logs"):
             target = output_root / subdir
             if target.exists():
                 shutil.rmtree(target)
@@ -215,11 +215,7 @@ def run_patient_pipeline(
         logger.info("Processing trackers sequentially")
 
         # Use tqdm if requested
-        iterator = (
-            tqdm(tracker_files, desc="Processing trackers", unit="file")
-            if show_progress
-            else tracker_files
-        )
+        iterator = tqdm(tracker_files, desc="Processing trackers", unit="file") if show_progress else tracker_files
 
         for tracker_file in iterator:
             if show_progress:
@@ -265,9 +261,7 @@ def run_patient_pipeline(
             # Collect results as they complete
             futures_iterator = as_completed(futures)
             if show_progress:
-                futures_iterator = tqdm(
-                    futures_iterator, total=len(futures), desc="Processing trackers", unit="file"
-                )
+                futures_iterator = tqdm(futures_iterator, total=len(futures), desc="Processing trackers", unit="file")
 
             for future in futures_iterator:
                 tracker_file = futures[future]
diff --git a/a4d-python/tests/test_cli/test_cli.py b/a4d-python/tests/test_cli/test_cli.py
@@ -43,6 +43,7 @@ def test_upload_tables_help(self):
     def test_run_pipeline_help(self):
         result = runner.invoke(app, ["run-pipeline", "--help"])
         assert result.exit_code == 0
+        assert "--skip-download" in result.output
         assert "--skip-upload" in result.output
 
 
@@ -92,6 +93,7 @@ def test_skip_upload_calls_pipeline(self, mock_settings, mock_run_pipeline, tmp_
         mock_settings.output_root = tmp_path / "output"
         mock_settings.project_id = "test-project"
         mock_settings.dataset = "test-dataset"
+        mock_settings.max_workers = 4
 
         (tmp_path / "data").mkdir()
         (tmp_path / "output").mkdir()
@@ -105,7 +107,7 @@ def test_skip_upload_calls_pipeline(self, mock_settings, mock_run_pipeline, tmp_
         mock_result.tables = {}
         mock_run_pipeline.return_value = mock_result
 
-        result = runner.invoke(app, ["run-pipeline", "--skip-upload"])
+        result = runner.invoke(app, ["run-pipeline", "--skip-download", "--skip-upload"])
 
         mock_run_pipeline.assert_called_once()
         assert result.exit_code == 0
@@ -117,6 +119,7 @@ def test_pipeline_failure_exits_nonzero(self, mock_settings, mock_run_pipeline,
         mock_settings.output_root = tmp_path / "output"
         mock_settings.project_id = "test-project"
         mock_settings.dataset = "test-dataset"
+        mock_settings.max_workers = 4
 
         (tmp_path / "data").mkdir()
         (tmp_path / "output").mkdir()
@@ -132,7 +135,7 @@ def test_pipeline_failure_exits_nonzero(self, mock_settings, mock_run_pipeline,
         mock_result.tables = {}
         mock_run_pipeline.return_value = mock_result
 
-        result = runner.invoke(app, ["run-pipeline", "--skip-upload"])
+        result = runner.invoke(app, ["run-pipeline", "--skip-download", "--skip-upload"])
 
         assert result.exit_code == 1