Skip to content

Commit 5795e46

Browse files
ehsanestaji and nictru authored
Add Zarr support to scanpy/filter (#11756)
* Add zarr support to scanpy filter
* fix: address scanpy filter review feedback
* style: use suffix matching for scanpy filter io
* review: unify scanpy filter anndata output
* review: pin scanpy filter python patch
* style: keep scanpy filter template lint-compatible
* Harshil-align

---------

Co-authored-by: Nico Trummer <nictru32@gmail.com>
1 parent 6ed53ff commit 5795e46

7 files changed

Lines changed: 202 additions & 75 deletions

File tree

modules/nf-core/scanpy/filter/environment.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@ channels:
44
- conda-forge
55
- bioconda
66
dependencies:
7-
- conda-forge::python=3.12.11
7+
- conda-forge::python=3.14.5
88
- conda-forge::pyyaml=6.0.2
9-
- conda-forge::scanpy=1.11.2
9+
- conda-forge::scanpy=1.11.4

modules/nf-core/scanpy/filter/main.nf

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@ process SCANPY_FILTER {
44

55
conda "${moduleDir}/environment.yml"
66
container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
7-
? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/16/168ecbbe27ccef766741ccbf937b0d2675be2e19b0565035e0719f1e9ea5ee95/data'
8-
: 'community.wave.seqera.io/library/python_pyyaml_scanpy:b5509a698e9aae25'}"
7+
? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/81/8158c8824afb7a57da5327fbd188082d24d205e078103ca249d74e93cc1cd603/data'
8+
: 'community.wave.seqera.io/library/python_pyyaml_scanpy:da8fc259e2b95ada'}"
99

1010
input:
11-
tuple val(meta), path(h5ad)
11+
tuple val(meta), path(anndata)
1212
val min_genes
1313
val min_cells
1414
val min_counts_gene
@@ -17,22 +17,24 @@ process SCANPY_FILTER {
1717
val symbol_col
1818

1919
output:
20-
tuple val(meta), path("*.h5ad"), emit: h5ad
21-
path "versions.yml" , emit: versions
20+
tuple val(meta), path("*.{h5ad,zarr}"), emit: anndata
21+
path "versions.yml" , emit: versions, topic: versions
2222

2323
when:
2424
task.ext.when == null || task.ext.when
2525

2626
script:
2727
prefix = task.ext.prefix ?: "${meta.id}_filtered"
28-
if ("${prefix}.h5ad" == "${h5ad}") {
28+
output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
29+
if (output_file == anndata.name) {
2930
error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
3031
}
3132
template('filter.py')
3233

3334
stub:
3435
prefix = task.ext.prefix ?: "${meta.id}_filtered"
35-
if ("${prefix}.h5ad" == "${h5ad}") {
36+
output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
37+
if (output_file == anndata.name) {
3638
error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
3739
}
3840
"""
@@ -41,7 +43,11 @@ process SCANPY_FILTER {
4143
export MPLCONFIGDIR=./tmp/mpl
4244
export NUMBA_CACHE_DIR=./tmp/numba
4345
44-
touch ${prefix}.h5ad
46+
if [[ "${output_file}" == *.zarr ]]; then
47+
mkdir -p "${output_file}"
48+
else
49+
touch "${output_file}"
50+
fi
4551
4652
cat <<-END_VERSIONS > versions.yml
4753
"${task.process}":

modules/nf-core/scanpy/filter/meta.yml

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -15,58 +15,70 @@ tools:
1515
documentation: https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.filter_cells.html
1616
tool_dev_url: https://github.com/scverse/scanpy
1717
doi: 10.1186/s13059-017-1382-0
18-
licence: ["BSD-3-Clause"]
19-
18+
licence:
19+
- "BSD-3-Clause"
20+
identifier: biotools:scanpy
2021
input:
2122
- - meta:
2223
type: map
2324
description: |
2425
Groovy Map containing sample information
2526
e.g. [ id:'test' ]
26-
- h5ad:
27+
- anndata:
2728
type: file
28-
description: AnnData object in h5ad format
29-
pattern: "*.{h5ad}"
29+
description: AnnData object in h5ad or zarr format
30+
pattern: "*.{h5ad,zarr}"
3031
ontologies:
31-
- edam: "http://edamontology.org/format_3590" # HDF5 format
32-
- - min_genes:
33-
type: integer
34-
description: Minimum number of genes expressed per cell
35-
- - min_cells:
36-
type: integer
37-
description: Minimum number of cells expressing each gene
38-
- - min_counts_gene:
39-
type: integer
40-
description: Minimum number of counts per gene
41-
- - min_counts_cell:
42-
type: integer
43-
description: Minimum number of counts per cell
44-
- - max_mito_percentage:
45-
type: integer
46-
description: Maximum percentage of mitochondrial genes per cell
47-
- - symbol_col:
48-
type: string
49-
description: Column name of the gene symbols in the `var` of the AnnData object. Use `index` if the gene symbols are the row names.
50-
32+
- edam: "http://edamontology.org/format_3590"
33+
- edam: "http://edamontology.org/format_3915"
34+
- min_genes:
35+
type: integer
36+
description: Minimum number of genes expressed per cell
37+
- min_cells:
38+
type: integer
39+
description: Minimum number of cells expressing each gene
40+
- min_counts_gene:
41+
type: integer
42+
description: Minimum number of counts per gene
43+
- min_counts_cell:
44+
type: integer
45+
description: Minimum number of counts per cell
46+
- max_mito_percentage:
47+
type: integer
48+
description: Maximum percentage of mitochondrial genes per cell
49+
- symbol_col:
50+
type: string
51+
description: Column name of the gene symbols in the `var` of the AnnData
52+
object. Use `index` if the gene symbols are the row names.
5153
output:
52-
h5ad:
53-
- meta:
54-
type: map
55-
description: |
56-
Groovy Map containing sample information
57-
e.g. [ id:'test' ]
58-
- "*.h5ad":
54+
anndata:
55+
- - meta:
56+
type: map
57+
description: |
58+
Groovy Map containing sample information
59+
e.g. [ id:'test' ]
60+
- "*.{h5ad,zarr}":
61+
type: file
62+
description: Filtered AnnData object in h5ad or zarr format
63+
pattern: "*.{h5ad,zarr}"
64+
ontologies:
65+
- edam: "http://edamontology.org/format_3590"
66+
- edam: "http://edamontology.org/format_3915"
67+
versions:
68+
- versions.yml:
5969
type: file
60-
description: Filtered AnnData object
61-
pattern: "*.h5ad"
70+
description: File containing software versions
71+
pattern: "versions.yml"
6272
ontologies:
63-
- edam: "http://edamontology.org/format_3590" # HDF5 format
73+
- edam: http://edamontology.org/format_3750
74+
topics:
6475
versions:
6576
- versions.yml:
6677
type: file
6778
description: File containing software versions
6879
pattern: "versions.yml"
69-
80+
ontologies:
81+
- edam: http://edamontology.org/format_3750
7082
authors:
7183
- "@nictru"
7284
maintainers:

modules/nf-core/scanpy/filter/templates/filter.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,15 +8,27 @@
88
os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba"
99

1010
import platform
11+
from pathlib import Path
1112

13+
import anndata as ad
1214
import scanpy as sc
1315
import yaml
1416
from threadpoolctl import threadpool_limits
1517

1618
threadpool_limits(int("${task.cpus}"))
1719
sc.settings.n_jobs = int("${task.cpus}")
1820

19-
adata = sc.read_h5ad("${h5ad}")
21+
input_file = "${anndata}"
22+
output_file = "${output_file}"
23+
24+
input_suffix = Path(input_file).suffix
25+
if input_suffix == ".h5ad":
26+
adata = ad.read_h5ad(input_file)
27+
elif input_suffix == ".zarr":
28+
adata = ad.read_zarr(input_file)
29+
else:
30+
raise ValueError(f"Unsupported AnnData input format: {input_suffix}")
31+
2032
prefix = "${prefix}"
2133
symbol_col = "${symbol_col}"
2234

@@ -36,7 +48,13 @@
3648
sc.pp.filter_cells(adata, min_genes=int("${min_genes}"))
3749
sc.pp.filter_genes(adata, min_cells=int("${min_cells}"))
3850

39-
adata.write_h5ad(f"{prefix}.h5ad")
51+
output_suffix = Path(output_file).suffix
52+
if output_suffix == ".h5ad":
53+
adata.write_h5ad(output_file)
54+
elif output_suffix == ".zarr":
55+
adata.write_zarr(output_file)
56+
else:
57+
raise ValueError(f"Unsupported AnnData output format: {output_suffix}")
4058

4159
# Versions
4260

modules/nf-core/scanpy/filter/tests/main.nf.test

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,92 @@ nextflow_process {
88
tag "modules_nfcore"
99
tag "scanpy"
1010
tag "scanpy/filter"
11+
tag "untar"
12+
13+
test("Should emit zarr output for zarr input - stub") {
14+
15+
options '-stub'
16+
17+
setup {
18+
run("UNTAR") {
19+
config "./nextflow.config"
20+
script "modules/nf-core/untar/main.nf"
21+
process {
22+
"""
23+
input[0] = [
24+
[ id: 'test_zarr' ],
25+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
26+
]
27+
"""
28+
}
29+
}
30+
}
31+
32+
when {
33+
process {
34+
"""
35+
input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
36+
input[1] = 0
37+
input[2] = 0
38+
input[3] = 0
39+
input[4] = 0
40+
input[5] = 100
41+
input[6] = 'index'
42+
"""
43+
}
44+
}
45+
46+
then {
47+
assertAll(
48+
{ assert process.success },
49+
{ assert process.out.anndata },
50+
{ assert file(process.out.anndata[0][1]).name == "test_zarr_filtered.zarr" }
51+
)
52+
}
53+
54+
}
55+
56+
test("Should run with zarr input") {
57+
58+
setup {
59+
run("UNTAR") {
60+
config "./nextflow.config"
61+
script "modules/nf-core/untar/main.nf"
62+
process {
63+
"""
64+
input[0] = [
65+
[ id: 'test_zarr' ],
66+
file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
67+
]
68+
"""
69+
}
70+
}
71+
}
72+
73+
when {
74+
process {
75+
"""
76+
input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
77+
input[1] = 0
78+
input[2] = 0
79+
input[3] = 0
80+
input[4] = 0
81+
input[5] = 100
82+
input[6] = 'index'
83+
"""
84+
}
85+
}
86+
87+
then {
88+
assertAll(
89+
{ assert process.success },
90+
{ assert process.out.anndata },
91+
{ assert file(process.out.anndata[0][1]).name == "test_zarr_filtered.zarr" },
92+
{ assert file(process.out.anndata[0][1] + "/X/.zarray").exists() }
93+
)
94+
}
95+
96+
}
1197

1298
test("Should run without failures") {
1399

0 commit comments

Comments
 (0)