Add Zarr support to scanpy/filter (#11756)

ehsanestaji · nictru · web-flow · commit 5795e46cae24 · 2026-05-27T21:13:59.000Z
* Add zarr support to scanpy filter

* fix: address scanpy filter review feedback

* style: use suffix matching for scanpy filter io

* review: unify scanpy filter anndata output

* review: pin scanpy filter python patch

* style: keep scanpy filter template lint-compatible

* Harshil-align

---------

Co-authored-by: Nico Trummer <nictru32@gmail.com>
diff --git a/modules/nf-core/scanpy/filter/environment.yml b/modules/nf-core/scanpy/filter/environment.yml
@@ -4,6 +4,6 @@ channels:
   - conda-forge
   - bioconda
 dependencies:
-  - conda-forge::python=3.12.11
+  - conda-forge::python=3.14.5
   - conda-forge::pyyaml=6.0.2
-  - conda-forge::scanpy=1.11.2
+  - conda-forge::scanpy=1.11.4
diff --git a/modules/nf-core/scanpy/filter/main.nf b/modules/nf-core/scanpy/filter/main.nf
@@ -4,11 +4,11 @@ process SCANPY_FILTER {
 
     conda "${moduleDir}/environment.yml"
     container "${workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container
-        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/16/168ecbbe27ccef766741ccbf937b0d2675be2e19b0565035e0719f1e9ea5ee95/data'
-        : 'community.wave.seqera.io/library/python_pyyaml_scanpy:b5509a698e9aae25'}"
+        ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/81/8158c8824afb7a57da5327fbd188082d24d205e078103ca249d74e93cc1cd603/data'
+        : 'community.wave.seqera.io/library/python_pyyaml_scanpy:da8fc259e2b95ada'}"
 
     input:
-    tuple val(meta), path(h5ad)
+    tuple val(meta), path(anndata)
     val min_genes
     val min_cells
     val min_counts_gene
@@ -17,22 +17,24 @@ process SCANPY_FILTER {
     val symbol_col
 
     output:
-    tuple val(meta), path("*.h5ad"), emit: h5ad
-    path "versions.yml"            , emit: versions
+    tuple val(meta), path("*.{h5ad,zarr}"), emit: anndata
+    path "versions.yml"                   , emit: versions, topic: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     prefix     = task.ext.prefix ?: "${meta.id}_filtered"
-    if ("${prefix}.h5ad" == "${h5ad}") {
+    output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
+    if (output_file == anndata.name) {
         error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
     }
     template('filter.py')
 
     stub:
     prefix = task.ext.prefix ?: "${meta.id}_filtered"
-    if ("${prefix}.h5ad" == "${h5ad}") {
+    output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
+    if (output_file == anndata.name) {
         error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
     }
     """
@@ -41,7 +43,11 @@ process SCANPY_FILTER {
     export MPLCONFIGDIR=./tmp/mpl
     export NUMBA_CACHE_DIR=./tmp/numba
 
-    touch ${prefix}.h5ad
+    if [[ "${output_file}" == *.zarr ]]; then
+        mkdir -p "${output_file}"
+    else
+        touch "${output_file}"
+    fi
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
diff --git a/modules/nf-core/scanpy/filter/meta.yml b/modules/nf-core/scanpy/filter/meta.yml
@@ -15,58 +15,70 @@ tools:
       documentation: https://scanpy.readthedocs.io/en/stable/api/generated/scanpy.pp.filter_cells.html
       tool_dev_url: https://github.com/scverse/scanpy
       doi: 10.1186/s13059-017-1382-0
-      licence: ["BSD-3-Clause"]
-
+      licence:
+        - "BSD-3-Clause"
+      identifier: biotools:scanpy
 input:
   - - meta:
         type: map
         description: |
           Groovy Map containing sample information
           e.g. [ id:'test' ]
-    - h5ad:
+    - anndata:
         type: file
-        description: AnnData object in h5ad format
-        pattern: "*.{h5ad}"
+        description: AnnData object in h5ad or zarr format
+        pattern: "*.{h5ad,zarr}"
         ontologies:
-          - edam: "http://edamontology.org/format_3590" # HDF5 format
-  - - min_genes:
-        type: integer
-        description: Minimum number of genes expressed per cell
-  - - min_cells:
-        type: integer
-        description: Minimum number of cells expressing each gene
-  - - min_counts_gene:
-        type: integer
-        description: Minimum number of counts per gene
-  - - min_counts_cell:
-        type: integer
-        description: Minimum number of counts per cell
-  - - max_mito_percentage:
-        type: integer
-        description: Maximum percentage of mitochondrial genes per cell
-  - - symbol_col:
-        type: string
-        description: Column name of the gene symbols in the `var` of the AnnData object. Use `index` if the gene symbols are the row names.
-
+          - edam: "http://edamontology.org/format_3590"
+          - edam: "http://edamontology.org/format_3915"
+  - min_genes:
+      type: integer
+      description: Minimum number of genes expressed per cell
+  - min_cells:
+      type: integer
+      description: Minimum number of cells expressing each gene
+  - min_counts_gene:
+      type: integer
+      description: Minimum number of counts per gene
+  - min_counts_cell:
+      type: integer
+      description: Minimum number of counts per cell
+  - max_mito_percentage:
+      type: integer
+      description: Maximum percentage of mitochondrial genes per cell
+  - symbol_col:
+      type: string
+      description: Column name of the gene symbols in the `var` of the AnnData
+        object. Use `index` if the gene symbols are the row names.
 output:
-  h5ad:
-    - meta:
-        type: map
-        description: |
-          Groovy Map containing sample information
-          e.g. [ id:'test' ]
-    - "*.h5ad":
+  anndata:
+    - - meta:
+          type: map
+          description: |
+            Groovy Map containing sample information
+            e.g. [ id:'test' ]
+      - "*.{h5ad,zarr}":
+          type: file
+          description: Filtered AnnData object in h5ad or zarr format
+          pattern: "*.{h5ad,zarr}"
+          ontologies:
+            - edam: "http://edamontology.org/format_3590"
+            - edam: "http://edamontology.org/format_3915"
+  versions:
+    - versions.yml:
         type: file
-        description: Filtered AnnData object
-        pattern: "*.h5ad"
+        description: File containing software versions
+        pattern: "versions.yml"
         ontologies:
-          - edam: "http://edamontology.org/format_3590" # HDF5 format
+          - edam: http://edamontology.org/format_3750
+topics:
   versions:
     - versions.yml:
         type: file
         description: File containing software versions
         pattern: "versions.yml"
-
+        ontologies:
+          - edam: http://edamontology.org/format_3750
 authors:
   - "@nictru"
 maintainers:
diff --git a/modules/nf-core/scanpy/filter/templates/filter.py b/modules/nf-core/scanpy/filter/templates/filter.py
@@ -8,15 +8,27 @@
 os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba"
 
 import platform
+from pathlib import Path
 
+import anndata as ad
 import scanpy as sc
 import yaml
 from threadpoolctl import threadpool_limits
 
 threadpool_limits(int("${task.cpus}"))
 sc.settings.n_jobs = int("${task.cpus}")
 
-adata = sc.read_h5ad("${h5ad}")
+input_file = "${anndata}"
+output_file = "${output_file}"
+
+input_suffix = Path(input_file).suffix
+if input_suffix == ".h5ad":
+    adata = ad.read_h5ad(input_file)
+elif input_suffix == ".zarr":
+    adata = ad.read_zarr(input_file)
+else:
+    raise ValueError(f"Unsupported AnnData input format: {input_suffix}")
+
 prefix = "${prefix}"
 symbol_col = "${symbol_col}"
 
@@ -36,7 +48,13 @@
 sc.pp.filter_cells(adata, min_genes=int("${min_genes}"))
 sc.pp.filter_genes(adata, min_cells=int("${min_cells}"))
 
-adata.write_h5ad(f"{prefix}.h5ad")
+output_suffix = Path(output_file).suffix
+if output_suffix == ".h5ad":
+    adata.write_h5ad(output_file)
+elif output_suffix == ".zarr":
+    adata.write_zarr(output_file)
+else:
+    raise ValueError(f"Unsupported AnnData output format: {output_suffix}")
 
 # Versions
 
diff --git a/modules/nf-core/scanpy/filter/tests/main.nf.test b/modules/nf-core/scanpy/filter/tests/main.nf.test
@@ -8,6 +8,92 @@ nextflow_process {
     tag "modules_nfcore"
     tag "scanpy"
     tag "scanpy/filter"
+    tag "untar"
+
+    test("Should emit zarr output for zarr input - stub") {
+
+        options '-stub'
+
+        setup {
+            run("UNTAR") {
+                config "./nextflow.config"
+                script "modules/nf-core/untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                        [ id: 'test_zarr' ],
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
+                input[1] = 0
+                input[2] = 0
+                input[3] = 0
+                input[4] = 0
+                input[5] = 100
+                input[6] = 'index'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert process.out.anndata },
+            { assert file(process.out.anndata[0][1]).name == "test_zarr_filtered.zarr" }
+            )
+        }
+
+    }
+
+    test("Should run with zarr input") {
+
+        setup {
+            run("UNTAR") {
+                config "./nextflow.config"
+                script "modules/nf-core/untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                        [ id: 'test_zarr' ],
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
+                input[1] = 0
+                input[2] = 0
+                input[3] = 0
+                input[4] = 0
+                input[5] = 100
+                input[6] = 'index'
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert process.out.anndata },
+            { assert file(process.out.anndata[0][1]).name == "test_zarr_filtered.zarr" },
+            { assert file(process.out.anndata[0][1] + "/X/.zarray").exists() }
+            )
+        }
+
+    }
 
     test("Should run without failures") {
 
diff --git a/modules/nf-core/scanpy/filter/tests/main.nf.test.snap b/modules/nf-core/scanpy/filter/tests/main.nf.test.snap
diff --git a/modules/nf-core/scanpy/filter/tests/nextflow.config b/modules/nf-core/scanpy/filter/tests/nextflow.config