Skip to content

Commit cc2736c

Browse files
ehsanestaji, SPPearce, nictru
authored
Add Zarr support to scanpy/pca (#11697)
* Add zarr support to scanpy pca
* Fix scanpy/pca CI checks
* Move scanpy pca zarr fixture to test-datasets
* review: align scanpy pca zarr outputs
* Harshil-align

---------

Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
Co-authored-by: Nico Trummer <nictru32@gmail.com>
1 parent e340200 commit cc2736c

6 files changed

Lines changed: 145 additions & 73 deletions

File tree

modules/nf-core/scanpy/pca/main.nf

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,27 +8,29 @@ process SCANPY_PCA {
88
: 'community.wave.seqera.io/library/python_pyyaml_scanpy:ffeb5af98a9a5bde'}"
99

1010
input:
11-
tuple val(meta), path(h5ad)
11+
tuple val(meta), path(anndata)
1212
val key_added
1313

1414
output:
15-
tuple val(meta), path("*.h5ad") , emit: h5ad
16-
tuple val(meta), path("X_*.pkl"), emit: obsm
17-
path "versions.yml" , emit: versions
15+
tuple val(meta), path("*.{h5ad,zarr}"), emit: anndata
16+
tuple val(meta), path("X_*.pkl") , emit: obsm
17+
path "versions.yml" , emit: versions, topic: versions
1818

1919
when:
2020
task.ext.when == null || task.ext.when
2121

2222
script:
2323
prefix = task.ext.prefix ?: "${meta.id}_pca"
24-
if ("${prefix}.h5ad" == "${h5ad}") {
24+
output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
25+
if (output_file == anndata.name) {
2526
error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
2627
}
2728
template('pca.py')
2829

2930
stub:
3031
prefix = task.ext.prefix ?: "${meta.id}_pca"
31-
if ("${prefix}.h5ad" == "${h5ad}") {
32+
output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
33+
if (output_file == anndata.name) {
3234
error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
3335
}
3436
"""
@@ -37,13 +39,17 @@ process SCANPY_PCA {
3739
export MPLCONFIGDIR=./tmp/mpl
3840
export NUMBA_CACHE_DIR=./tmp/numba
3941
40-
touch ${prefix}.h5ad
42+
if [[ "${output_file}" == *.zarr ]]; then
43+
mkdir -p "${output_file}"
44+
else
45+
touch "${output_file}"
46+
fi
4147
touch X_${prefix}.pkl
4248
4349
cat <<-END_VERSIONS > versions.yml
4450
"${task.process}":
4551
python: \$(python3 -c 'import platform; print(platform.python_version())')
46-
scanpy: \$(python3 -c 'import scanpy; print(scanpy.__version__)')
52+
scanpy: \$(python3 -c 'import importlib.metadata; print(importlib.metadata.version("scanpy"))')
4753
END_VERSIONS
4854
"""
4955
}

modules/nf-core/scanpy/pca/meta.yml

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,30 +23,32 @@ input:
2323
description: |
2424
Groovy Map containing sample information
2525
e.g. [ id:'test' ]
26-
- h5ad:
26+
- anndata:
2727
type: file
28-
description: AnnData object in h5ad format
29-
pattern: "*.{h5ad}"
28+
description: AnnData object in h5ad or zarr format
29+
pattern: "*.{h5ad,zarr}"
3030
ontologies:
3131
- edam: "http://edamontology.org/format_3590" # HDF5 format
32+
- edam: "http://edamontology.org/format_3915" # Zarr format
3233
- key_added:
3334
type: string
3435
description: |
3536
Key to add to obsm with PCA coordinates, usually 'X_pca'
3637
3738
output:
38-
h5ad:
39+
anndata:
3940
- - meta:
4041
type: map
4142
description: |
4243
Groovy Map containing sample information
4344
e.g. [ id:'test' ]
44-
- "*.h5ad":
45+
- "*.{h5ad,zarr}":
4546
type: file
46-
description: AnnData object with PCA coordinates added
47-
pattern: "*.h5ad"
47+
description: AnnData object with PCA coordinates added in h5ad or zarr format
48+
pattern: "*.{h5ad,zarr}"
4849
ontologies:
4950
- edam: "http://edamontology.org/format_3590" # HDF5 format
51+
- edam: "http://edamontology.org/format_3915" # Zarr format
5052
obsm:
5153
- - meta:
5254
type: map
@@ -66,6 +68,14 @@ output:
6668
pattern: "versions.yml"
6769
ontologies:
6870
- edam: http://edamontology.org/format_3750 # YAML
71+
topics:
72+
versions:
73+
- versions.yml:
74+
type: file
75+
description: File containing software versions
76+
pattern: "versions.yml"
77+
ontologies:
78+
- edam: http://edamontology.org/format_3750 # YAML
6979

7080
authors:
7181
- "@nictru"

modules/nf-core/scanpy/pca/templates/pca.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba"
99

1010
import platform
11+
from pathlib import Path
1112

13+
import anndata as ad
1214
import numpy as np
1315
import pandas as pd
1416
import scanpy as sc
@@ -18,7 +20,17 @@
1820
threadpool_limits(int("${task.cpus}"))
1921
sc.settings.n_jobs = int("${task.cpus}")
2022

21-
adata = sc.read_h5ad("${h5ad}")
23+
input_file = "${anndata}"
24+
output_file = "${output_file}"
25+
26+
input_suffix = Path(input_file).suffix
27+
if input_suffix == ".h5ad":
28+
adata = ad.read_h5ad(input_file)
29+
elif input_suffix == ".zarr":
30+
adata = ad.read_zarr(input_file)
31+
else:
32+
raise ValueError(f"Unsupported AnnData input format: {input_suffix}")
33+
2234
prefix = "${prefix}"
2335
key_added = "${key_added}"
2436

@@ -29,7 +41,14 @@
2941
# This ensures hashes are stable
3042
adata.obsm[key_added] = np.round(adata.obsm[key_added], 8)
3143

32-
adata.write_h5ad(f"{prefix}.h5ad")
44+
output_suffix = Path(output_file).suffix
45+
if output_suffix == ".h5ad":
46+
adata.write_h5ad(output_file)
47+
elif output_suffix == ".zarr":
48+
adata.write_zarr(output_file)
49+
else:
50+
raise ValueError(f"Unsupported AnnData output format: {output_suffix}")
51+
3352
df = pd.DataFrame(adata.obsm[key_added], index=adata.obs_names)
3453
df.to_pickle(f"X_{prefix}.pkl")
3554

modules/nf-core/scanpy/pca/tests/main.nf.test

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,83 @@ nextflow_process {
88
tag "modules_nfcore"
99
tag "scanpy"
1010
tag "scanpy/pca"
11+
tag "untar"
12+
13+
test("Should emit zarr output for zarr input - stub") {
14+
15+
options '-stub'
16+
17+
setup {
18+
run("UNTAR") {
19+
config "./nextflow.config"
20+
script "modules/nf-core/untar/main.nf"
21+
process {
22+
"""
23+
input[0] = [
24+
[ id: 'test_zarr' ],
25+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
26+
]
27+
"""
28+
}
29+
}
30+
}
31+
32+
when {
33+
process {
34+
"""
35+
input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
36+
input[1] = "X_pca"
37+
"""
38+
}
39+
}
40+
41+
then {
42+
assertAll(
43+
{ assert process.success },
44+
{ assert process.out.anndata },
45+
{ assert file(process.out.anndata[0][1]).name == "test_zarr_pca.zarr" }
46+
)
47+
}
48+
49+
}
50+
51+
test("Should run with zarr input") {
52+
53+
setup {
54+
run("UNTAR") {
55+
config "./nextflow.config"
56+
script "modules/nf-core/untar/main.nf"
57+
process {
58+
"""
59+
input[0] = [
60+
[ id: 'test_zarr' ],
61+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
62+
]
63+
"""
64+
}
65+
}
66+
}
67+
68+
when {
69+
process {
70+
"""
71+
input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
72+
input[1] = "X_pca"
73+
"""
74+
}
75+
}
76+
77+
then {
78+
assertAll(
79+
{ assert process.success },
80+
{ assert process.out.anndata },
81+
{ assert file(process.out.anndata[0][1]).name == "test_zarr_pca.zarr" },
82+
{ assert file(process.out.anndata[0][1] + "/obsm/X_pca/.zarray").exists() },
83+
{ assert file(process.out.obsm[0][1]).name == "X_test_zarr_pca.pkl" }
84+
)
85+
}
86+
87+
}
1188

1289
test("Should run without failures") {
1390

@@ -27,8 +104,12 @@ nextflow_process {
27104
then {
28105
assertAll(
29106
{ assert process.success },
30-
{ assert snapshot(process.out).match() },
31-
{ assert "X_pca" in anndata(process.out.h5ad[0][1]).obsm }
107+
{ assert process.out.anndata },
108+
{ assert file(process.out.anndata[0][1]).name == "test_pca.h5ad" },
109+
{ assert process.out.obsm },
110+
{ assert file(process.out.obsm[0][1]).name == "X_test_pca.pkl" },
111+
{ assert process.out.versions },
112+
{ assert "X_pca" in anndata(process.out.anndata[0][1]).obsm }
32113
)
33114
}
34115

modules/nf-core/scanpy/pca/tests/main.nf.test.snap

Lines changed: 5 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
"2": [
2222
"versions.yml:md5,c56ab37f2451ae0c75d5ed843bcffc2d"
2323
],
24-
"h5ad": [
24+
"anndata": [
2525
[
2626
{
2727
"id": "test"
@@ -42,59 +42,10 @@
4242
]
4343
}
4444
],
45+
"timestamp": "2026-05-28T10:39:29.434058",
4546
"meta": {
46-
"nf-test": "0.9.2",
47-
"nextflow": "25.04.3"
48-
},
49-
"timestamp": "2025-08-01T18:51:27.339434146"
50-
},
51-
"Should run without failures": {
52-
"content": [
53-
{
54-
"0": [
55-
[
56-
{
57-
"id": "test"
58-
},
59-
"test_pca.h5ad:md5,9959a48a84953cda0489e804077383da"
60-
]
61-
],
62-
"1": [
63-
[
64-
{
65-
"id": "test"
66-
},
67-
"X_test_pca.pkl:md5,16ca5954c16bf62e50385e41bd0cb556"
68-
]
69-
],
70-
"2": [
71-
"versions.yml:md5,6ccd7660c2690433e6b70c9600511ee5"
72-
],
73-
"h5ad": [
74-
[
75-
{
76-
"id": "test"
77-
},
78-
"test_pca.h5ad:md5,9959a48a84953cda0489e804077383da"
79-
]
80-
],
81-
"obsm": [
82-
[
83-
{
84-
"id": "test"
85-
},
86-
"X_test_pca.pkl:md5,16ca5954c16bf62e50385e41bd0cb556"
87-
]
88-
],
89-
"versions": [
90-
"versions.yml:md5,6ccd7660c2690433e6b70c9600511ee5"
91-
]
92-
}
93-
],
94-
"meta": {
95-
"nf-test": "0.9.2",
96-
"nextflow": "25.04.3"
97-
},
98-
"timestamp": "2025-08-01T18:51:19.796989014"
47+
"nf-test": "0.9.5",
48+
"nextflow": "26.04.1"
49+
}
9950
}
10051
}
modules/nf-core/scanpy/pca/tests/nextflow.config

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
process {
2+
withName: UNTAR {
3+
ext.prefix = "test_zarr.zarr"
4+
}
5+
}

0 commit comments

Comments
 (0)