Add require_artifacts flag and LLM validation error (#62)

parashardhapola · web-flow · commit fdf94c2765e5 · 2026-02-19T12:09:13.000+01:00
* Add require_artifacts flag and LLM validation error

Bump version to 0.15.0. Introduce a require_artifacts parameter to CyteType.run (default True) so callers can choose whether artifact build/upload failures should abort the run; when False the run continues without uploaded_files and logs a warning. Add LLMValidationError and map the API error code LLM_VALIDATION_FAILED to this exception, and export it from the api package. Add tests covering artifact-failure behavior (raising by default and continuing when require_artifacts=False).

* docs update
diff --git a/README.md b/README.md
@@ -80,10 +80,7 @@ sc.pl.umap(adata, color='cytetype_annotation_clusters')
 ```
 🚀 [Try it in Google Colab](https://colab.research.google.com/drive/1aRLsI3mx8JR8u5BKHs48YUbLsqRsh2N7?usp=sharing)
 
-> **Note:** No API keys required for default configuration. See [custom LLM configuration](docs/configuration.md#llm-configuration) for advanced options.
->
-> `run()` now handles artifact packaging and upload automatically (`vars.h5` + `obs.duckdb`) before annotation.
-> Generated artifact files are kept on disk by default; use `cleanup_artifacts=True` to remove them after run completion/failure.
+> **Note:** No API keys required for default configuration. See [Configuration](docs/configuration.md) for LLM setup, artifact handling, and advanced options.
 
 **Using R/Seurat?** → [CyteTypeR](https://github.com/NygenAnalytics/CyteTypeR)
 
diff --git a/cytetype/__init__.py b/cytetype/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "0.14.1"
+__version__ = "0.15.0"
 
 import requests
 
diff --git a/cytetype/api/__init__.py b/cytetype/api/__init__.py
@@ -23,6 +23,7 @@
     QuotaExceededError,
     JobNotFoundError,
     JobFailedError,
+    LLMValidationError,
     TimeoutError,
     NetworkError,
 )
@@ -49,6 +50,7 @@
     "QuotaExceededError",
     "JobNotFoundError",
     "JobFailedError",
+    "LLMValidationError",
     "TimeoutError",
     "NetworkError",
 ]
diff --git a/cytetype/api/exceptions.py b/cytetype/api/exceptions.py
@@ -50,6 +50,12 @@ class JobFailedError(APIError):
     pass
 
 
+class LLMValidationError(APIError):
+    """LLM validation failed - LLM_VALIDATION_FAILED."""
+
+    pass
+
+
 # Client-side errors with default messages
 class TimeoutError(CyteTypeError):
     """Client-side timeout waiting for results."""
@@ -80,6 +86,7 @@ def __init__(
     "QUOTA_EXCEEDED": QuotaExceededError,
     "JOB_NOT_FOUND": JobNotFoundError,
     "JOB_FAILED": JobFailedError,
+    "LLM_VALIDATION_FAILED": LLMValidationError,
     "JOB_PROCESSING": APIError,  # Generic - expected during polling
     "JOB_NOT_COMPLETED": APIError,  # Generic
     "HTTP_ERROR": APIError,  # Generic
diff --git a/cytetype/main.py b/cytetype/main.py
@@ -268,6 +268,7 @@ def run(
         obs_duckdb_path: str = "obs.duckdb",
         upload_timeout_seconds: int = 3600,
         cleanup_artifacts: bool = False,
+        require_artifacts: bool = True,
         show_progress: bool = True,
         override_existing_results: bool = False,
     ) -> anndata.AnnData:
@@ -310,6 +311,9 @@ def run(
                 Defaults to 3600.
             cleanup_artifacts (bool, optional): Whether to delete generated artifact files after run
                 completes or fails. Defaults to False.
+            require_artifacts (bool, optional): Whether to raise an error if artifact building or
+                uploading fails. When True (default), any artifact failure stops the run. Set to
+                False to skip artifacts and continue with annotation only. Defaults to True.
             show_progress (bool, optional): Whether to display progress updates with spinner and
                 cluster status. Set to False to disable all visual progress output. Defaults to True.
             override_existing_results (bool, optional): Whether to override existing results with the
@@ -362,12 +366,27 @@ def run(
 
         artifact_paths = [vars_h5_path, obs_duckdb_path]
         try:
-            uploaded_file_refs = self._build_and_upload_artifacts(
-                vars_h5_path=vars_h5_path,
-                obs_duckdb_path=obs_duckdb_path,
-                upload_timeout_seconds=upload_timeout_seconds,
-            )
-            payload["uploaded_files"] = uploaded_file_refs
+            try:
+                uploaded_file_refs = self._build_and_upload_artifacts(
+                    vars_h5_path=vars_h5_path,
+                    obs_duckdb_path=obs_duckdb_path,
+                    upload_timeout_seconds=upload_timeout_seconds,
+                )
+                payload["uploaded_files"] = uploaded_file_refs
+            except Exception as exc:
+                if require_artifacts:
+                    logger.error(
+                        "Artifact build/upload failed. "
+                        "Rerun with `require_artifacts=False` to skip this error.\n"
+                        "Please report the error below in a new issue at "
+                        "https://github.com/NygenAnalytics/CyteType\n"
+                        f"({type(exc).__name__}: {exc})"
+                    )
+                    raise
+                logger.warning(
+                    "Artifact build/upload failed. Continuing without artifacts. "
+                    "Set `require_artifacts=True` to see the full traceback."
+                )
 
             # Save query if requested
             if save_query:
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -29,14 +29,6 @@ adata = annotator.run(
 )
 ```
 
-`run()` now performs the full upload pipeline internally:
-- Creates `vars.h5` from `adata.X`
-- Creates `obs.duckdb` from `adata.obs`
-- Uploads both artifacts to the CyteType API
-- Calls `/annotate` with uploaded file references
-
-If artifact creation or upload fails, `run()` fails fast.
-
 ## LLM Configuration
 You can provide your own LLM providers/models:
 ```python
@@ -64,20 +56,42 @@ adata = annotator.run(
 )
 ```
 
-## Advanced
+## Artifacts
+
+`run()` automatically builds and uploads two artifact files before submitting an annotation job:
+
+- **`vars.h5`** — a compressed HDF5 file containing the normalized expression matrix (`adata.X`) and variable metadata (`adata.var`). Used by the server for on-demand gene expression lookups during annotation and in the interactive report.
+- **`obs.duckdb`** — a DuckDB database containing the observation metadata (`adata.obs`). Used by the server to power metadata queries and filtering in the interactive report.
+
+Both files are created locally and then uploaded to the CyteType API. The uploaded references are attached to the `/annotate` payload so the server can link them to the job.
+
+### Artifact Parameters
+
 ```python
 adata = annotator.run(
     ...
-    poll_interval_seconds=30,   # How often to poll (default)
-    timeout_seconds=7200,       # Max wait time (default: 2 hours)
-    api_url="https://custom-api.example.com",  # Custom API endpoint if needed
-    vars_h5_path="vars.h5",     # Local artifact output path
-    obs_duckdb_path="obs.duckdb",  # Local artifact output path
-    upload_timeout_seconds=3600,   # Per-upload socket read timeout
-    cleanup_artifacts=False,       # Keep artifacts by default
+    vars_h5_path="vars.h5",        # Local output path for vars artifact
+    obs_duckdb_path="obs.duckdb",   # Local output path for obs artifact
+    upload_timeout_seconds=3600,    # Socket read timeout per upload (seconds)
+    cleanup_artifacts=False,        # Delete local artifact files after run
+    require_artifacts=True,         # Raise on artifact failure (False: warn and continue)
 )
 ```
 
+| Parameter | Default | Description |
+|-----------|---------|-------------|
+| `vars_h5_path` | `"vars.h5"` | Local path where the vars HDF5 file is written |
+| `obs_duckdb_path` | `"obs.duckdb"` | Local path where the obs DuckDB file is written |
+| `upload_timeout_seconds` | `3600` | Socket read timeout for each artifact upload, in seconds |
+| `cleanup_artifacts` | `False` | Delete local artifact files after run completes or fails |
+| `require_artifacts` | `True` | Raise on artifact build/upload failure. Set to `False` to log a warning and continue with annotation only (without artifacts) when a failure occurs |
+
+### Error Handling
+
+By default (`require_artifacts=True`), any failure during artifact building or uploading stops the run: the original exception is re-raised, and an error is logged that includes the exception type and message along with a link to report the issue on GitHub.
+
+If you want the annotation to proceed even when artifacts fail (e.g. due to disk space or network issues), set `require_artifacts=False`. The job will submit without artifacts — annotation still works, but the interactive report will not have expression lookups or metadata filtering.
+
 ### Memory Recommendation for Large Datasets
 
 For large datasets, open your AnnData object in backed mode to reduce memory usage while building `vars.h5`:
@@ -86,4 +100,15 @@ For large datasets, open your AnnData object in backed mode to reduce memory usa
 import scanpy as sc
 
 adata = sc.read_h5ad("input.h5ad", backed="r")
+```
+
+## Advanced
+
+```python
+adata = annotator.run(
+    ...
+    poll_interval_seconds=30,   # How often to poll (default)
+    timeout_seconds=7200,       # Max wait time (default: 2 hours)
+    api_url="https://custom-api.example.com",  # Custom API endpoint if needed
+)
 ```
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
@@ -5,4 +5,5 @@
 - Make sure you have valid gene symbols in the AnnData object and are passing the correct gene symbols column name to parameter `gene_symbols_column`.
 - If you are using a custom LLM, make sure you have the correct API key and base URL.
 - For large datasets, load AnnData in backed mode (`sc.read_h5ad(..., backed="r")`) to reduce memory use during artifact generation.
-- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
+- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
+- If artifact building or uploading fails, `run()` will raise an error by default. Set `require_artifacts=False` to log a warning instead and continue with annotation only, without artifacts.
diff --git a/tests/test_cytetype_integration.py b/tests/test_cytetype_integration.py
@@ -149,6 +149,59 @@ def test_cytetype_run_auto_uploads_artifacts(
     }
 
 
+@patch("cytetype.main.wait_for_completion")
+@patch("cytetype.main.submit_annotation_job")
+def test_cytetype_run_artifact_failure_raises_by_default(
+    mock_submit: MagicMock,
+    mock_wait: MagicMock,
+    mock_adata: anndata.AnnData,
+    mock_api_response: dict[str, Any],
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Test run() raises when artifact build fails and require_artifacts=True (default)."""
+    mock_submit.return_value = "job_no_artifacts"
+    mock_wait.return_value = mock_api_response
+
+    monkeypatch.setattr(
+        "cytetype.main.save_features_matrix",
+        MagicMock(side_effect=RuntimeError("disk full")),
+    )
+
+    ct = CyteType(mock_adata, group_key="leiden")
+    with pytest.raises(RuntimeError, match="disk full"):
+        ct.run(study_context="Test")
+
+
+@patch("cytetype.main.wait_for_completion")
+@patch("cytetype.main.submit_annotation_job")
+def test_cytetype_run_artifact_failure_continues_when_not_required(
+    mock_submit: MagicMock,
+    mock_wait: MagicMock,
+    mock_adata: anndata.AnnData,
+    mock_api_response: dict[str, Any],
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:
+    """Test run() proceeds without uploaded_files when require_artifacts=False."""
+    mock_submit.return_value = "job_no_artifacts"
+    mock_wait.return_value = mock_api_response
+
+    monkeypatch.setattr(
+        "cytetype.main.save_features_matrix",
+        MagicMock(side_effect=RuntimeError("disk full")),
+    )
+
+    ct = CyteType(mock_adata, group_key="leiden")
+    result = ct.run(study_context="Test", require_artifacts=False)
+
+    # Job should still complete successfully
+    assert result is not None
+    assert mock_submit.called
+
+    # Payload must not contain uploaded_files
+    payload = mock_submit.call_args.args[2]
+    assert "uploaded_files" not in payload
+
+
 @patch("cytetype.main.wait_for_completion")
 @patch("cytetype.main.submit_annotation_job")
 def test_cytetype_run_cleanup_artifacts(

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-__version__ = "0.14.1"`
	`1`	`+__version__ = "0.15.0"`
`2`	`2`
`3`	`3`	`import requests`
`4`	`4`