Skip to content

Commit fdf94c2

Browse files
Add require_artifacts flag and LLM validation error (#62)
* Add require_artifacts flag and LLM validation error

  Bump version to 0.15.0. Introduce a require_artifacts parameter to CyteType.run (default True) so callers can choose whether artifact build/upload failures should abort the run; when False the run continues without uploaded_files and logs a warning. Add LLMValidationError and map the API error code LLM_VALIDATION_FAILED to this exception, and export it from the api package. Add tests covering artifact-failure behavior (raising by default and continuing when require_artifacts=False).

* docs update
1 parent b18cada commit fdf94c2

8 files changed

Lines changed: 132 additions & 28 deletions

File tree

README.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -80,10 +80,7 @@ sc.pl.umap(adata, color='cytetype_annotation_clusters')
8080
```
8181
🚀 [Try it in Google Colab](https://colab.research.google.com/drive/1aRLsI3mx8JR8u5BKHs48YUbLsqRsh2N7?usp=sharing)
8282

83-
> **Note:** No API keys required for default configuration. See [custom LLM configuration](docs/configuration.md#llm-configuration) for advanced options.
84-
>
85-
> `run()` now handles artifact packaging and upload automatically (`vars.h5` + `obs.duckdb`) before annotation.
86-
> Generated artifact files are kept on disk by default; use `cleanup_artifacts=True` to remove them after run completion/failure.
83+
> **Note:** No API keys required for default configuration. See [Configuration](docs/configuration.md) for LLM setup, artifact handling, and advanced options.
8784
8885
**Using R/Seurat?** [CyteTypeR](https://github.com/NygenAnalytics/CyteTypeR)
8986

cytetype/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
__version__ = "0.14.1"
1+
__version__ = "0.15.0"
22

33
import requests
44

cytetype/api/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
QuotaExceededError,
2424
JobNotFoundError,
2525
JobFailedError,
26+
LLMValidationError,
2627
TimeoutError,
2728
NetworkError,
2829
)
@@ -49,6 +50,7 @@
4950
"QuotaExceededError",
5051
"JobNotFoundError",
5152
"JobFailedError",
53+
"LLMValidationError",
5254
"TimeoutError",
5355
"NetworkError",
5456
]

cytetype/api/exceptions.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ class JobFailedError(APIError):
5050
pass
5151

5252

53+
class LLMValidationError(APIError):
54+
"""LLM validation failed - LLM_VALIDATION_FAILED."""
55+
56+
pass
57+
58+
5359
# Client-side errors with default messages
5460
class TimeoutError(CyteTypeError):
5561
"""Client-side timeout waiting for results."""
@@ -80,6 +86,7 @@ def __init__(
8086
"QUOTA_EXCEEDED": QuotaExceededError,
8187
"JOB_NOT_FOUND": JobNotFoundError,
8288
"JOB_FAILED": JobFailedError,
89+
"LLM_VALIDATION_FAILED": LLMValidationError,
8390
"JOB_PROCESSING": APIError, # Generic - expected during polling
8491
"JOB_NOT_COMPLETED": APIError, # Generic
8592
"HTTP_ERROR": APIError, # Generic

cytetype/main.py

Lines changed: 25 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,7 @@ def run(
268268
obs_duckdb_path: str = "obs.duckdb",
269269
upload_timeout_seconds: int = 3600,
270270
cleanup_artifacts: bool = False,
271+
require_artifacts: bool = True,
271272
show_progress: bool = True,
272273
override_existing_results: bool = False,
273274
) -> anndata.AnnData:
@@ -310,6 +311,9 @@ def run(
310311
Defaults to 3600.
311312
cleanup_artifacts (bool, optional): Whether to delete generated artifact files after run
312313
completes or fails. Defaults to False.
314+
require_artifacts (bool, optional): Whether to raise an error if artifact building or
315+
uploading fails. When True (default), any artifact failure stops the run. Set to
316+
False to log a warning and continue with annotation without artifacts if they fail. Defaults to True.
313317
show_progress (bool, optional): Whether to display progress updates with spinner and
314318
cluster status. Set to False to disable all visual progress output. Defaults to True.
315319
override_existing_results (bool, optional): Whether to override existing results with the
@@ -362,12 +366,27 @@ def run(
362366

363367
artifact_paths = [vars_h5_path, obs_duckdb_path]
364368
try:
365-
uploaded_file_refs = self._build_and_upload_artifacts(
366-
vars_h5_path=vars_h5_path,
367-
obs_duckdb_path=obs_duckdb_path,
368-
upload_timeout_seconds=upload_timeout_seconds,
369-
)
370-
payload["uploaded_files"] = uploaded_file_refs
369+
try:
370+
uploaded_file_refs = self._build_and_upload_artifacts(
371+
vars_h5_path=vars_h5_path,
372+
obs_duckdb_path=obs_duckdb_path,
373+
upload_timeout_seconds=upload_timeout_seconds,
374+
)
375+
payload["uploaded_files"] = uploaded_file_refs
376+
except Exception as exc:
377+
if require_artifacts:
378+
logger.error(
379+
"Artifact build/upload failed. "
380+
"Rerun with `require_artifacts=False` to skip this error.\n"
381+
"Please report the error below in a new issue at "
382+
"https://github.com/NygenAnalytics/CyteType\n"
383+
f"({type(exc).__name__}: {exc})"
384+
)
385+
raise
386+
logger.warning(
387+
"Artifact build/upload failed. Continuing without artifacts. "
388+
"Set `require_artifacts=True` to see the full traceback."
389+
)
371390

372391
# Save query if requested
373392
if save_query:

docs/configuration.md

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -29,14 +29,6 @@ adata = annotator.run(
2929
)
3030
```
3131

32-
`run()` now performs the full upload pipeline internally:
33-
- Creates `vars.h5` from `adata.X`
34-
- Creates `obs.duckdb` from `adata.obs`
35-
- Uploads both artifacts to the CyteType API
36-
- Calls `/annotate` with uploaded file references
37-
38-
If artifact creation or upload fails, `run()` fails fast.
39-
4032
## LLM Configuration
4133
You can provide your own LLM providers/models:
4234
```python
@@ -64,20 +56,42 @@ adata = annotator.run(
6456
)
6557
```
6658

67-
## Advanced
59+
## Artifacts
60+
61+
`run()` automatically builds and uploads two artifact files before submitting an annotation job:
62+
63+
- **`vars.h5`** — a compressed HDF5 file containing the normalized expression matrix (`adata.X`) and variable metadata (`adata.var`). Used by the server for on-demand gene expression lookups during annotation and in the interactive report.
64+
- **`obs.duckdb`** — a DuckDB database containing the observation metadata (`adata.obs`). Used by the server to power metadata queries and filtering in the interactive report.
65+
66+
Both files are created locally and then uploaded to the CyteType API. The uploaded references are attached to the `/annotate` payload so the server can link them to the job.
67+
68+
### Artifact Parameters
69+
6870
```python
6971
adata = annotator.run(
7072
...
71-
poll_interval_seconds=30, # How often to poll (default)
72-
timeout_seconds=7200, # Max wait time (default: 2 hours)
73-
api_url="https://custom-api.example.com", # Custom API endpoint if needed
74-
vars_h5_path="vars.h5", # Local artifact output path
75-
obs_duckdb_path="obs.duckdb", # Local artifact output path
76-
upload_timeout_seconds=3600, # Per-upload socket read timeout
77-
cleanup_artifacts=False, # Keep artifacts by default
73+
vars_h5_path="vars.h5", # Local output path for vars artifact
74+
obs_duckdb_path="obs.duckdb", # Local output path for obs artifact
75+
upload_timeout_seconds=3600, # Socket read timeout per upload (seconds)
76+
cleanup_artifacts=False, # Delete local artifact files after run
77+
require_artifacts=True, # Raise on artifact failure (set False to skip)
7878
)
7979
```
8080

81+
| Parameter | Default | Description |
82+
|-----------|---------|-------------|
83+
| `vars_h5_path` | `"vars.h5"` | Local path where the vars HDF5 file is written |
84+
| `obs_duckdb_path` | `"obs.duckdb"` | Local path where the obs DuckDB file is written |
85+
| `upload_timeout_seconds` | `3600` | Socket read timeout for each artifact upload |
86+
| `cleanup_artifacts` | `False` | Delete local artifact files after run completes or fails |
87+
| `require_artifacts` | `True` | Raise on artifact build/upload failure. Set to `False` to log a warning and continue with annotation without artifacts when the build or upload fails |
88+
89+
### Error Handling
90+
91+
By default (`require_artifacts=True`), any failure during artifact building or uploading stops the run and surfaces the full error. The error message includes a link to report the issue on GitHub.
92+
93+
If you want the annotation to proceed even when artifacts fail (e.g. due to disk space or network issues), set `require_artifacts=False`. The job will submit without artifacts — annotation still works, but the interactive report will not have expression lookups or metadata filtering.
94+
8195
### Memory Recommendation for Large Datasets
8296

8397
For large datasets, open your AnnData object in backed mode to reduce memory usage while building `vars.h5`:
@@ -86,4 +100,15 @@ For large datasets, open your AnnData object in backed mode to reduce memory usa
86100
import scanpy as sc
87101

88102
adata = sc.read_h5ad("input.h5ad", backed="r")
103+
```
104+
105+
## Advanced
106+
107+
```python
108+
adata = annotator.run(
109+
...
110+
poll_interval_seconds=30, # How often to poll (default)
111+
timeout_seconds=7200, # Max wait time (default: 2 hours)
112+
api_url="https://custom-api.example.com", # Custom API endpoint if needed
113+
)
89114
```

docs/troubleshooting.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,5 @@
55
- Make sure you have valid gene symbols in the AnnData object and are passing the correct gene symbols column name to parameter `gene_symbols_column`.
66
- If you are using a custom LLM, make sure you have the correct API key and base URL.
77
- For large datasets, load AnnData in backed mode (`sc.read_h5ad(..., backed="r")`) to reduce memory use during artifact generation.
8-
- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
8+
- `run()` creates `vars.h5` and `obs.duckdb` before annotation. Use `cleanup_artifacts=True` if you do not want to keep these local files.
9+
- If artifact building or uploading fails, `run()` will raise an error by default. Set `require_artifacts=False` to log a warning and continue with annotation without the artifacts instead.

tests/test_cytetype_integration.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,59 @@ def test_cytetype_run_auto_uploads_artifacts(
149149
}
150150

151151

152+
@patch("cytetype.main.wait_for_completion")
153+
@patch("cytetype.main.submit_annotation_job")
154+
def test_cytetype_run_artifact_failure_raises_by_default(
155+
mock_submit: MagicMock,
156+
mock_wait: MagicMock,
157+
mock_adata: anndata.AnnData,
158+
mock_api_response: dict[str, Any],
159+
monkeypatch: pytest.MonkeyPatch,
160+
) -> None:
161+
"""Test run() raises when artifact build fails and require_artifacts=True (default)."""
162+
mock_submit.return_value = "job_no_artifacts"
163+
mock_wait.return_value = mock_api_response
164+
165+
monkeypatch.setattr(
166+
"cytetype.main.save_features_matrix",
167+
MagicMock(side_effect=RuntimeError("disk full")),
168+
)
169+
170+
ct = CyteType(mock_adata, group_key="leiden")
171+
with pytest.raises(RuntimeError, match="disk full"):
172+
ct.run(study_context="Test")
173+
174+
175+
@patch("cytetype.main.wait_for_completion")
176+
@patch("cytetype.main.submit_annotation_job")
177+
def test_cytetype_run_artifact_failure_continues_when_not_required(
178+
mock_submit: MagicMock,
179+
mock_wait: MagicMock,
180+
mock_adata: anndata.AnnData,
181+
mock_api_response: dict[str, Any],
182+
monkeypatch: pytest.MonkeyPatch,
183+
) -> None:
184+
"""Test run() proceeds without uploaded_files when require_artifacts=False."""
185+
mock_submit.return_value = "job_no_artifacts"
186+
mock_wait.return_value = mock_api_response
187+
188+
monkeypatch.setattr(
189+
"cytetype.main.save_features_matrix",
190+
MagicMock(side_effect=RuntimeError("disk full")),
191+
)
192+
193+
ct = CyteType(mock_adata, group_key="leiden")
194+
result = ct.run(study_context="Test", require_artifacts=False)
195+
196+
# Job should still complete successfully
197+
assert result is not None
198+
assert mock_submit.called
199+
200+
# Payload must not contain uploaded_files
201+
payload = mock_submit.call_args.args[2]
202+
assert "uploaded_files" not in payload
203+
204+
152205
@patch("cytetype.main.wait_for_completion")
153206
@patch("cytetype.main.submit_annotation_job")
154207
def test_cytetype_run_cleanup_artifacts(

0 commit comments

Comments
 (0)