Add Zarr support to scanpy/pca (#11697)

ehsanestaji · SPPearce · nictru · web-flow · commit cc2736c07cd7 · 2026-05-28T09:25:40.000Z
* Add zarr support to scanpy pca

* Fix scanpy/pca CI checks

* Move scanpy pca zarr fixture to test-datasets

* review: align scanpy pca zarr outputs

* Harshil-align

---------

Co-authored-by: Simon Pearce <24893913+SPPearce@users.noreply.github.com>
Co-authored-by: Nico Trummer <nictru32@gmail.com>
diff --git a/modules/nf-core/scanpy/pca/main.nf b/modules/nf-core/scanpy/pca/main.nf
@@ -8,27 +8,29 @@ process SCANPY_PCA {
         : 'community.wave.seqera.io/library/python_pyyaml_scanpy:ffeb5af98a9a5bde'}"
 
     input:
-    tuple val(meta), path(h5ad)
+    tuple val(meta), path(anndata)
     val key_added
 
     output:
-    tuple val(meta), path("*.h5ad") , emit: h5ad
-    tuple val(meta), path("X_*.pkl"), emit: obsm
-    path "versions.yml"             , emit: versions
+    tuple val(meta), path("*.{h5ad,zarr}"), emit: anndata
+    tuple val(meta), path("X_*.pkl")      , emit: obsm
+    path "versions.yml"                   , emit: versions, topic: versions
 
     when:
     task.ext.when == null || task.ext.when
 
     script:
     prefix = task.ext.prefix ?: "${meta.id}_pca"
-    if ("${prefix}.h5ad" == "${h5ad}") {
+    output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
+    if (output_file == anndata.name) {
         error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
     }
     template('pca.py')
 
     stub:
     prefix = task.ext.prefix ?: "${meta.id}_pca"
-    if ("${prefix}.h5ad" == "${h5ad}") {
+    output_file = anndata.name.endsWith(".zarr") ? "${prefix}.zarr" : "${prefix}.h5ad"
+    if (output_file == anndata.name) {
         error("Input and output names are the same, use \"task.ext.prefix\" to disambiguate!")
     }
     """
@@ -37,13 +39,17 @@ process SCANPY_PCA {
     export MPLCONFIGDIR=./tmp/mpl
     export NUMBA_CACHE_DIR=./tmp/numba
 
-    touch ${prefix}.h5ad
+    if [[ "${output_file}" == *.zarr ]]; then
+        mkdir -p "${output_file}"
+    else
+        touch "${output_file}"
+    fi
     touch X_${prefix}.pkl
 
     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
         python: \$(python3 -c 'import platform; print(platform.python_version())')
-        scanpy: \$(python3 -c 'import scanpy; print(scanpy.__version__)')
+        scanpy: \$(python3 -c 'import importlib.metadata; print(importlib.metadata.version("scanpy"))')
     END_VERSIONS
     """
 }
diff --git a/modules/nf-core/scanpy/pca/meta.yml b/modules/nf-core/scanpy/pca/meta.yml
@@ -23,30 +23,32 @@ input:
         description: |
           Groovy Map containing sample information
           e.g. [ id:'test' ]
-    - h5ad:
+    - anndata:
         type: file
-        description: AnnData object in h5ad format
-        pattern: "*.{h5ad}"
+        description: AnnData object in h5ad or zarr format
+        pattern: "*.{h5ad,zarr}"
         ontologies:
           - edam: "http://edamontology.org/format_3590" # HDF5 format
+          - edam: "http://edamontology.org/format_3915" # Zarr format
   - key_added:
       type: string
       description: |
         Key to add to obsm with PCA coordinates, usually 'X_pca'
 
 output:
-  h5ad:
+  anndata:
     - - meta:
           type: map
           description: |
             Groovy Map containing sample information
             e.g. [ id:'test' ]
-      - "*.h5ad":
+      - "*.{h5ad,zarr}":
           type: file
-          description: AnnData object with PCA coordinates added
-          pattern: "*.h5ad"
+          description: AnnData object with PCA coordinates added in h5ad or zarr format
+          pattern: "*.{h5ad,zarr}"
           ontologies:
             - edam: "http://edamontology.org/format_3590" # HDF5 format
+            - edam: "http://edamontology.org/format_3915" # Zarr format
   obsm:
     - - meta:
           type: map
@@ -66,6 +68,14 @@ output:
         pattern: "versions.yml"
         ontologies:
           - edam: http://edamontology.org/format_3750 # YAML
+topics:
+  versions:
+    - versions.yml:
+        type: file
+        description: File containing software versions
+        pattern: "versions.yml"
+        ontologies:
+          - edam: http://edamontology.org/format_3750 # YAML
 
 authors:
   - "@nictru"
diff --git a/modules/nf-core/scanpy/pca/templates/pca.py b/modules/nf-core/scanpy/pca/templates/pca.py
@@ -8,7 +8,9 @@
 os.environ["NUMBA_CACHE_DIR"] = "./tmp/numba"
 
 import platform
+from pathlib import Path
 
+import anndata as ad
 import numpy as np
 import pandas as pd
 import scanpy as sc
@@ -18,7 +20,17 @@
 threadpool_limits(int("${task.cpus}"))
 sc.settings.n_jobs = int("${task.cpus}")
 
-adata = sc.read_h5ad("${h5ad}")
+input_file = "${anndata}"
+output_file = "${output_file}"
+
+input_suffix = Path(input_file).suffix
+if input_suffix == ".h5ad":
+    adata = ad.read_h5ad(input_file)
+elif input_suffix == ".zarr":
+    adata = ad.read_zarr(input_file)
+else:
+    raise ValueError(f"Unsupported AnnData input format: {input_suffix}")
+
 prefix = "${prefix}"
 key_added = "${key_added}"
 
@@ -29,7 +41,14 @@
 # This ensures hashes are stable
 adata.obsm[key_added] = np.round(adata.obsm[key_added], 8)
 
-adata.write_h5ad(f"{prefix}.h5ad")
+output_suffix = Path(output_file).suffix
+if output_suffix == ".h5ad":
+    adata.write_h5ad(output_file)
+elif output_suffix == ".zarr":
+    adata.write_zarr(output_file)
+else:
+    raise ValueError(f"Unsupported AnnData output format: {output_suffix}")
+
 df = pd.DataFrame(adata.obsm[key_added], index=adata.obs_names)
 df.to_pickle(f"X_{prefix}.pkl")
 
diff --git a/modules/nf-core/scanpy/pca/tests/main.nf.test b/modules/nf-core/scanpy/pca/tests/main.nf.test
@@ -8,6 +8,83 @@ nextflow_process {
     tag "modules_nfcore"
     tag "scanpy"
     tag "scanpy/pca"
+    tag "untar"
+
+    test("Should emit zarr output for zarr input - stub") {
+
+        options '-stub'
+
+        setup {
+            run("UNTAR") {
+                config "./nextflow.config"
+                script "modules/nf-core/untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                        [ id: 'test_zarr' ],
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
+                input[1] = "X_pca"
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert process.out.anndata },
+            { assert file(process.out.anndata[0][1]).name == "test_zarr_pca.zarr" }
+            )
+        }
+
+    }
+
+    test("Should run with zarr input") {
+
+        setup {
+            run("UNTAR") {
+                config "./nextflow.config"
+                script "modules/nf-core/untar/main.nf"
+                process {
+                    """
+                    input[0] = [
+                        [ id: 'test_zarr' ],
+                        file(params.modules_testdata_base_path + 'genomics/homo_sapiens/scrnaseq/zarr/test_zarr.zarr.tar.gz', checkIfExists: true)
+                    ]
+                    """
+                }
+            }
+        }
+
+        when {
+            process {
+                """
+                input[0] = UNTAR.out.untar.map { meta, zarr -> [ meta, zarr ] }
+                input[1] = "X_pca"
+                """
+            }
+        }
+
+        then {
+            assertAll(
+            { assert process.success },
+            { assert process.out.anndata },
+            { assert file(process.out.anndata[0][1]).name == "test_zarr_pca.zarr" },
+            { assert file(process.out.anndata[0][1] + "/obsm/X_pca/.zarray").exists() },
+            { assert file(process.out.obsm[0][1]).name == "X_test_zarr_pca.pkl" }
+            )
+        }
+
+    }
 
     test("Should run without failures") {
 
@@ -27,8 +104,12 @@ nextflow_process {
         then {
             assertAll(
             { assert process.success },
-            { assert snapshot(process.out).match() },
-            { assert "X_pca" in anndata(process.out.h5ad[0][1]).obsm }
+            { assert process.out.anndata },
+            { assert file(process.out.anndata[0][1]).name == "test_pca.h5ad" },
+            { assert process.out.obsm },
+            { assert file(process.out.obsm[0][1]).name == "X_test_pca.pkl" },
+            { assert process.out.versions },
+            { assert "X_pca" in anndata(process.out.anndata[0][1]).obsm }
             )
         }
 
diff --git a/modules/nf-core/scanpy/pca/tests/main.nf.test.snap b/modules/nf-core/scanpy/pca/tests/main.nf.test.snap
@@ -21,7 +21,7 @@
                 "2": [
                     "versions.yml:md5,c56ab37f2451ae0c75d5ed843bcffc2d"
                 ],
-                "h5ad": [
+                "anndata": [
                     [
                         {
                             "id": "test"
@@ -42,59 +42,10 @@
                 ]
             }
         ],
+        "timestamp": "2026-05-28T10:39:29.434058",
         "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.04.3"
-        },
-        "timestamp": "2025-08-01T18:51:27.339434146"
-    },
-    "Should run without failures": {
-        "content": [
-            {
-                "0": [
-                    [
-                        {
-                            "id": "test"
-                        },
-                        "test_pca.h5ad:md5,9959a48a84953cda0489e804077383da"
-                    ]
-                ],
-                "1": [
-                    [
-                        {
-                            "id": "test"
-                        },
-                        "X_test_pca.pkl:md5,16ca5954c16bf62e50385e41bd0cb556"
-                    ]
-                ],
-                "2": [
-                    "versions.yml:md5,6ccd7660c2690433e6b70c9600511ee5"
-                ],
-                "h5ad": [
-                    [
-                        {
-                            "id": "test"
-                        },
-                        "test_pca.h5ad:md5,9959a48a84953cda0489e804077383da"
-                    ]
-                ],
-                "obsm": [
-                    [
-                        {
-                            "id": "test"
-                        },
-                        "X_test_pca.pkl:md5,16ca5954c16bf62e50385e41bd0cb556"
-                    ]
-                ],
-                "versions": [
-                    "versions.yml:md5,6ccd7660c2690433e6b70c9600511ee5"
-                ]
-            }
-        ],
-        "meta": {
-            "nf-test": "0.9.2",
-            "nextflow": "25.04.3"
-        },
-        "timestamp": "2025-08-01T18:51:19.796989014"
+            "nf-test": "0.9.5",
+            "nextflow": "26.04.1"
+        }
     }
 }
diff --git a/modules/nf-core/scanpy/pca/tests/nextflow.config b/modules/nf-core/scanpy/pca/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+    withName: UNTAR {
+        ext.prefix = "test_zarr.zarr"
+    }
+}