# Dask Q&A

Here we will go through some common questions and answers about `dask`, with a special focus on its integration with `scanpy` and `anndata`. For more comprehensive tutorials or other topics like {doc}`launching a cluster <dask:deploying>`, head over to their documentation.

## Quickstart

### How do I monitor the {doc}`dask dashboard <dask:dashboard>`?

If you are in a Jupyter notebook, when you render the `repr` of your `client`, you will see a link, usually something like `http://localhost:8787/status`.
If you are working locally, this link alone should suffice.

If you are working on some sort of remote notebook from a web browser, you will need to replace `http://localhost` with the root URL of the notebook server.

If you are in VS Code, there is a [`dask` extension] that lets you monitor the dashboard from within the editor.
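
Regardless of the environment, you can also ask the client itself for the address; a minimal sketch using a local {class}`dask.distributed.Client`:

```python
from dask.distributed import Client

client = Client()  # start a local cluster; in a notebook, displaying `client` also shows the link
print(client.dashboard_link)  # e.g. http://localhost:8787/status
```
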
### How do I know how to allocate resources?

In `dask`, every worker will receive an equal share of the memory available.
So if you request e.g., a slurm job with 256 GB of RAM and then start 8 workers, each will have 32 GB of memory.

`dask` distributes jobs to workers generally based on the chunking of the array.
So if you have dense chunks of `(30_000, 30_000)` with 32-bit integers, each worker will need at least 3.6 GB just to load a single chunk.
If you then do something like matrix multiplication, you will need double that or even more.
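
As a rough sketch of the scenario above (the numbers are only illustrative), a local cluster with an explicit per-worker memory limit could look like this:

```python
from dask.distributed import Client, LocalCluster

# 8 workers sharing 256 GB of RAM -> 32 GB per worker
cluster = LocalCluster(n_workers=8, threads_per_worker=4, memory_limit="32GB")
client = Client(cluster)

# rough lower bound for holding one dense (30_000, 30_000) chunk of 32-bit values
chunk_gb = 30_000 * 30_000 * 4 / 1e9  # ~3.6 GB
```
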
### How do I read my data into a `dask` array?

{func}`anndata.experimental.read_elem_lazy` or {func}`anndata.experimental.read_lazy` can help you if you already have data on-disk that was written to the `anndata` file format.
If you use {func}`dask.array.to_zarr`, the data _cannot_ be read in using `anndata`'s functionality, as `anndata` will look for its {doc}`specified file format metadata <anndata:fileformat-prose>`.
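
For example, a minimal sketch of lazily reading the `X` element from a zarr store previously written by `anndata` (the file name is a placeholder):

```python
import zarr
from anndata.experimental import read_elem_lazy

store = zarr.open("data.zarr", mode="r")  # e.g. written earlier via adata.write_zarr("data.zarr")
x = read_elem_lazy(store["X"])            # a lazily loaded (dask-backed) version of X
# or lazily load the whole object: adata = anndata.experimental.read_lazy("data.zarr")
```
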
If you need to implement custom IO, we have generally found that {func}`dask.array.map_blocks` provides a nice way to do so.
See [our custom h5 io code] for an example.
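
As a rough illustration of that pattern (not the actual `anndata` implementation; the file path, dataset name, and shapes here are hypothetical), one can build a dask array whose blocks are read on demand from an HDF5 dataset:

```python
import dask.array as da
import h5py
import numpy as np

def load_block(block_info=None, *, path, dataset):
    # the output location of this block tells us which rows/columns to read from disk
    (r0, r1), (c0, c1) = block_info[None]["array-location"]
    with h5py.File(path, "r") as f:
        return f[dataset][r0:r1, c0:c1]

n_obs, n_vars, chunk_rows = 100_000, 2_000, 10_000
x = da.map_blocks(
    load_block,
    path="data.h5",
    dataset="X",
    chunks=((chunk_rows,) * (n_obs // chunk_rows), (n_vars,)),
    meta=np.array([], dtype=np.float32),
    dtype=np.float32,
)
```
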
## Advanced use and how-to-contribute

### How do `scanpy` and `anndata` handle sparse matrices?

While there is some {class}`scipy.sparse.csr_matrix` and {class}`scipy.sparse.csc_matrix` support for `dask`, it is not comprehensive and is missing key functions like summation, mean, etc.
We have implemented custom functionality, much of which lives in {mod}`fast_array_utils`, although we have also had to implement custom algorithms like `pca` for sparse-in-dask.
In the future, an [`array-api`]-compatible sparse matrix like [`finch`] would help us considerably, as `dask` supports the [`array-api`].

Therefore, if you run into a puzzling error after trying to run a function like {func}`numpy.sum` (or similar) on a sparse-in-dask array, consider checking {mod}`fast_array_utils`.
If you need to implement the function yourself, see the next point.
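
For reference, "sparse-in-dask" here means a dask array whose blocks are `scipy.sparse` matrices. One way to construct a small one for experimentation, assuming the data fits in memory (sizes are arbitrary):

```python
import dask.array as da
import numpy as np
import scipy.sparse as sp

csr = sp.random(10_000, 500, density=0.01, format="csr", dtype=np.float32)
# chunk over rows only; each block of the resulting dask array is itself a csr_matrix
x = da.from_array(csr, chunks=(1_000, -1))
print(x)  # note the chunk structure; the blocks remain sparse rather than numpy arrays
```
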
### Custom block-wise array operations

Sometimes you may want to perform an operation on an array that is not implemented anywhere.
Generally, we have found {func}`dask.array.map_blocks` to be versatile enough that most operations can be expressed with it. Click on the link to see `dask`'s own tutorial about the function.

Take this (simplified) example of calculating a gram matrix from {func}`scanpy.pp.pca` for sparse-in-dask:

```python
import dask.array as da
import numpy as np
from scipy import sparse

def gram_block(x_part):
    # x_part is a single (chunk_size, n_vars) CSR block of the sparse-in-dask array `x`
    gram_matrix = x_part.T @ x_part
    if sparse.issparse(gram_matrix):
        # each block's gram matrix is a small (n_vars, n_vars) matrix, so densify it
        gram_matrix = gram_matrix.toarray()
    # add a leading axis so the per-block results can later be summed over it
    return gram_matrix[None, ...]

gram_matrix_dask = da.map_blocks(
    gram_block,
    x,  # a CSR sparse-in-dask array chunked over rows, i.e. (chunk_size, n_vars) chunks
    new_axis=(1,),
    chunks=((1,) * x.blocks.size, (x.shape[1],), (x.shape[1],)),
    meta=np.array([], dtype=x.dtype),
    dtype=x.dtype,
).sum(axis=0)
```
This algorithm goes through the rows chunk by chunk (`chunk_size` rows at a time) and calculates the gram matrix for each chunk, producing a collection of `(n_vars, n_vars)`-sized matrices.
These are then summed together to produce a single `(n_vars, n_vars)` matrix, which is the gram matrix.

Because `dask` does not implement matrix multiplication for sparse-in-dask, we do it ourselves.
We use `map_blocks` over a CSR sparse-in-dask array where the chunking looks something like `(chunk_size, n_vars)`.
When we compute the individual block's gram matrix, we add an axis via `[None, ...]` so that we can sum over that axis, i.e., the `da.map_blocks` call produces a `(n_obs // chunk_size, n_vars, n_vars)`-sized array which is summed over the first dimension.
However, to make this work, we need to be very specific about what the result of `da.map_blocks` should look like, which is done via `new_axis` and `chunks`.
`new_axis` tells `da.map_blocks` that the output has one more axis than the input (the per-block axis we added via `[None, ...]`).
The `chunks` argument specifies that the output of `da.map_blocks` should consist of `x.blocks.size` blocks of shape `(1, n_vars, n_vars)`.
This `chunks` argument thus allows the shape of the output to be inferred.
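
To sanity-check the snippet above on toy data, one could build a small sparse-in-dask `x` (as in the construction example earlier), run the `da.map_blocks` code with it, and compare against a dense reference:

```python
import dask.array as da
import numpy as np
import scipy.sparse as sp

rng = np.random.default_rng(0)
x_dense = rng.random((100, 20)).astype(np.float32)
x_dense[x_dense < 0.8] = 0  # make it mostly zeros so the CSR representation makes sense
x = da.from_array(sp.csr_matrix(x_dense), chunks=(25, -1))  # 4 row blocks

# ... run the `da.map_blocks` snippet above with this `x` ...
np.testing.assert_allclose(gram_matrix_dask.compute(), x_dense.T @ x_dense, rtol=1e-4)
```
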
While this example is a bit complicated, it shows how you can go from a matrix of one shape and chunking to another by operating in a clean way over blocks.

## FAQ

### What is `persist` used for in RSC notebooks?

In the {doc}`multi-gpu showcase notebook for rapids-singlecell <rapids-singlecell:notebooks/06-multi_gpu_show>`, {meth}`dask.array.Array.persist` appears throughout the notebook.
This loads the entire dataset into memory while keeping the representation as a dask array.
Thus, lazy computation still works, but only a single read into memory is needed.
The catch is that you need enough memory to use `persist`, but if you do, it greatly speeds up the computation.
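
As a sketch of the pattern (the variable names are placeholders), `persist` is typically called once, right after the lazy array has been set up:

```python
from dask.distributed import wait

x = x.persist()  # start loading every chunk into (distributed) memory; returns immediately
wait(x)          # optionally block until all chunks are actually in memory
# downstream operations still build lazy task graphs, but now read from memory, not disk
```
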
### I'm out of memory, what now?

You can always reduce the number of workers you use, which will cause more memory to be allocated per worker.
Some algorithms may have the limitation of loading all data onto a single node; see {issue}`dask/dask-ml#985` for an example.
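
For instance, on a single machine you might start the local cluster with fewer workers (a sketch; the worker count is arbitrary):

```python
from dask.distributed import Client, LocalCluster

# with the default memory_limit="auto", total memory is split evenly across workers,
# so 2 workers each get a much larger share than 8 workers would
cluster = LocalCluster(n_workers=2)
client = Client(cluster)
```
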
### How do I choose chunk sizes?

Have a look at the {doc}`dask docs for chunking <dask:array-chunks>`; the general rule of thumb there is to use larger chunks in memory than on disk.
In this sense, it is probably a good idea to use the largest chunk size in memory allowed by your memory limits (and the algorithms you use), in order to make the most of any thread-level parallelization within algorithms.
For sparse data, where the chunks in memory do not map to those on disk, maxing out the available memory by choosing a large chunk size becomes even more important.
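
For instance, if an array was loaded with small on-disk chunks, you might rechunk it into larger row blocks before heavy computation (a sketch; the chunk size is arbitrary and must fit your per-worker memory budget):

```python
# `x` is a dask array whose on-disk chunks are small, e.g. (1_000, n_vars)
x = x.rechunk((100_000, -1))  # fewer, larger in-memory row chunks
# rough per-chunk footprint if the chunks were dense:
chunk_gb = 100_000 * x.shape[1] * x.dtype.itemsize / 1e9
```
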
[`dask` extension]: https://marketplace.visualstudio.com/items?itemName=joyceerhl.vscode-das
[our custom h5 io code]: https://github.com/scverse/anndata/blob/089ed929393a02200b389395f278b7c920e5bc4a/src/anndata/_io/specs/lazy_methods.py#L179-L205
[`array-api`]: https://data-apis.org/array-api/latest/index.html
[`finch`]: https://github.com/finch-tensor/finch-tensor-python