
Commit 8b351bf

Merge branch 'main' into pr/scverse-bot/241

2 parents 7697afe + c375a44

6 files changed: 158 additions & 56 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -6,6 +6,7 @@ buck-out/
 # Compiled files
 .venv/
 __pycache__/
+.ipynb_checkpoints/
 .*cache/

 # Distribution / packaging
```

.pre-commit-config.yaml

Lines changed: 21 additions & 0 deletions
```diff
@@ -36,3 +36,24 @@ repos:
       # Check that there are no merge conflicts (could be generated by template sync)
       - id: check-merge-conflict
         args: [--assume-in-merge]
+  - repo: https://github.com/python-jsonschema/check-jsonschema
+    rev: '0.34.0'
+    hooks:
+      - id: check-jsonschema
+        files: "tutorial-registry/schema.json"
+        args: ["--check-metaschema"]
+  - repo: https://github.com/python-jsonschema/check-jsonschema
+    rev: '0.34.0'
+    hooks:
+      - id: check-jsonschema
+        files: "tutorial-registry/tutorials/.*/meta.yaml"
+        args: ["--schemafile", "tutorial-registry/schema.json"]
+  - repo: local
+    hooks:
+      - id: forbid-to-commit
+        name: Check files in `tutorials` directory
+        entry: |
+          Only files named `meta.yaml` or `icon.xxx` are permitted in the tutorials directory
+        language: fail
+        files: "^tutorial-registry/tutorials/.*$"
+        exclude: "^tutorial-registry/tutorials/.*/(meta\\.yaml|icon\\.(svg|png|webp))$"
```

docs/conf.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -53,6 +53,7 @@
     "sphinx.ext.intersphinx",
     "sphinx.ext.autosummary",
     "sphinx.ext.napoleon",
+    "sphinx_issues",
    "sphinxcontrib.bibtex",
     "sphinx_autodoc_typehints",
     "sphinx_tabs.tabs",
@@ -93,9 +94,13 @@

 intersphinx_mapping = {
     "python": ("https://docs.python.org/3", None),
-    "anndata": ("https://anndata.readthedocs.io/en/stable/", None),
-    "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None),
+    "anndata": ("https://anndata.readthedocs.io/en/latest/", None),  # TODO: change back to stable after 0.12 release
     "numpy": ("https://numpy.org/doc/stable/", None),
+    "scanpy": ("https://scanpy.readthedocs.io/en/stable/", None),
+    "fast-array-utils": ("https://icb-fast-array-utils.readthedocs-hosted.com/en/stable", None),
+    "dask": ("https://docs.dask.org/en/stable", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy", None),
+    "rapids-singlecell": ("https://rapids-singlecell.readthedocs.io/en/stable/", None),
 }

 # List of patterns, relative to source directory, that match files and
```

docs/how-to-dask.md

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# Dask Q&A

Here we will go through some common questions and answers about `dask`, with a special focus on its integration with `scanpy` and `anndata`. For more comprehensive tutorials, or for other topics like {doc}`launching a cluster <dask:deploying>`, head over to their documentation.

## Quickstart
### How do I monitor the {doc}`dask dashboard <dask:dashboard>`?

If you are in a Jupyter notebook, the rendered `repr` of your `client` will contain a link, usually something like `http://localhost:8787/status`.
If you are working locally, this link alone should suffice.

If you are working with some sort of remote notebook from a web browser, you will need to replace `http://localhost` with the root URL of the notebook.

If you are in VS Code, there is a [`dask` extension] that lets you monitor the dashboard from within the editor.
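A minimal sketch of getting that link programmatically, assuming a local machine with `dask.distributed` installed:

```python
from dask.distributed import Client

client = Client()  # start a local cluster with default settings
print(client.dashboard_link)  # e.g. http://localhost:8787/status
```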
### How do I know how to allocate resources?

In `dask`, every worker receives an equal share of the memory available.
So if you request e.g. a slurm job with 256 GB of RAM and then start 8 workers, each will have 32 GB of memory.

`dask` generally distributes jobs to each worker based on the chunking of the array.
So if you have dense chunks of `(30_000, 30_000)` with 32-bit integers, each worker will need at least 3.6 GB just to load a single chunk into memory.
If you then do something like matrix multiplication, you will need double that or even more.
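As a minimal sketch (the numbers are hypothetical), the per-worker split can be made explicit when starting a local cluster:

```python
from dask.distributed import Client, LocalCluster

# 8 workers, each capped at 32 GB: 256 GB in total
cluster = LocalCluster(n_workers=8, memory_limit="32GB")
client = Client(cluster)
```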
### How do I read my data into a `dask` array?

{func}`anndata.experimental.read_elem_lazy` or {func}`anndata.experimental.read_lazy` can help you if you already have on-disk data that was written in the `anndata` file format.
If you use {func}`dask.array.to_zarr`, the data _cannot_ be read back in using `anndata`'s functionality, as `anndata` will look for its {doc}`specified file format metadata <anndata:fileformat-prose>`.

If you need to implement custom IO, we have generally found that {func}`dask.array.map_blocks` provides a good way to do so.
See [our custom h5 io code] for an example.
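A minimal sketch (the path is hypothetical; the store must have been written by `anndata`):

```python
import anndata as ad

adata = ad.experimental.read_lazy("data/pbmc.zarr")  # lazy AnnData
x = adata.X  # a dask array backed by the on-disk chunks
```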
## Advanced use and how to contribute

### How do `scanpy` and `anndata` handle sparse matrices?

While there is some {class}`scipy.sparse.csr_matrix` and {class}`scipy.sparse.csc_matrix` support in `dask`, it is not comprehensive and is missing key functions like summation and mean.
We have implemented custom functionality, much of which lives in {mod}`fast_array_utils`, although we have also had to implement custom algorithms like `pca` for sparse-in-dask.
In the future, an [`array-api`]-compatible sparse matrix like [`finch`] would help us considerably, as `dask` supports the [`array-api`].

Therefore, if you run into a puzzling error after running a function like {func}`numpy.sum` (or similar) on a sparse-in-dask array, consider checking {mod}`fast_array_utils`.
If you need to implement the function yourself, see the next point.
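For example, a minimal sketch, assuming `fast-array-utils` is installed and `x` is a sparse-in-dask array:

```python
import fast_array_utils.stats

# works where a plain sum over sparse chunks may not
totals = fast_array_utils.stats.sum(x, axis=0)
```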
### Custom block-wise array operations

Sometimes you may want to perform an operation on an array that is not implemented anywhere.
Generally, we have found {func}`dask.array.map_blocks` to be versatile enough that most operations can be expressed with it. Click the link to see `dask`'s own tutorial on the function.

Take this (simplified) example of calculating a gram matrix from {func}`scanpy.pp.pca` for sparse-in-dask:
```python
import dask.array as da
import numpy as np

# x is a CSR sparse-in-dask array with chunks of shape (chunk_size, n_vars)


def gram_block(x_part):
    # gram matrix of one block of rows: (n_vars, n_vars)
    gram_matrix = x_part.T @ x_part
    # prepend an axis so the per-block results can later be summed over it
    return gram_matrix[None, ...]


gram_matrix_dask = da.map_blocks(
    gram_block,
    x,
    new_axis=(1,),
    chunks=((1,) * x.blocks.size, (x.shape[1],), (x.shape[1],)),
    meta=np.array([], dtype=x.dtype),
    dtype=x.dtype,
).sum(axis=0)
```
This algorithm goes through the rows `chunk_size` at a time and calculates the gram matrix for each group of rows, producing a collection of `(n_vars, n_vars)`-sized matrices.
These are then summed together to produce a single `(n_vars, n_vars)` matrix, which is the gram matrix.

Because `dask` does not implement matrix multiplication for sparse-in-dask, we do it ourselves.
We use `map_blocks` over a CSR sparse-in-dask array where the chunking looks something like `(chunk_size, n_vars)`.
When we compute an individual block's gram matrix, we add an axis via `[None, ...]` so that we can sum over that axis, i.e., the `da.map_blocks` call produces a `(n_obs // chunk_size, n_vars, n_vars)`-sized array which is summed over the first dimension.
However, to make this work, we need to be very specific about what `da.map_blocks` should expect its result to look like, which is done via `new_axis` and `chunks`.
`new_axis` indicates that we are adding a single new axis at the front.
The `chunks` argument specifies that the output of `da.map_blocks` should consist of `x.blocks.size` blocks of shape `(1, n_vars, n_vars)`.
The `chunks` argument thus allows `dask` to infer the shape of the output.

While this example is a bit complicated, it shows how you can go from a matrix of one shape and chunking to another by operating over blocks in a clean way.
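As a quick sanity check of the recipe (hypothetical shapes, and dense random data for simplicity, since for dense arrays `dask` can compute the gram matrix directly):

```python
x = da.random.random((1_000, 50), chunks=(100, 50))
result = da.map_blocks(
    gram_block,
    x,
    new_axis=(1,),
    chunks=((1,) * x.blocks.size, (x.shape[1],), (x.shape[1],)),
    meta=np.array([], dtype=x.dtype),
    dtype=x.dtype,
).sum(axis=0)
np.testing.assert_allclose(result.compute(), (x.T @ x).compute())
```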
## FAQ

### What is `persist` used for in RSC notebooks?

In the {doc}`multi-gpu showcase notebook for rapids-singlecell <rapids-singlecell:notebooks/06-multi_gpu_show>`, {meth}`dask.array.Array.persist` appears throughout the notebook.
It loads the entire dataset into memory while keeping the representation as a dask array.
Thus lazy computation still works, but only a single read into memory is needed.
The catch is that you need enough memory to use `persist`, but if you do, it greatly speeds up the computation.
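A minimal sketch, assuming the full matrix fits into the cluster's memory:

```python
x = x.persist()  # x is still a dask array, but its chunks now live in memory
```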
### I'm out of memory, what now?

You can always reduce the number of workers you use, which will allocate more memory to each worker.
Some algorithms are limited by having to load all data onto a single node; see {issue}`dask/dask-ml#985` for an example.
### How do I choose chunk sizes?

Have a look at the {doc}`dask docs for chunking <dask:array-chunks>`; the general rule of thumb there is to use larger chunks in memory than on disk.
It is therefore probably a good idea to use the largest in-memory chunk size your memory limits (and the algorithms you use) allow, in order to make the most of any thread-level parallelization within algorithms.
For sparse data, where the in-memory chunks do not map to those on disk, maxing out the available memory by choosing a large chunk size becomes even more important.
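A minimal sketch (hypothetical sizes): data stored with small on-disk chunks can be rechunked to larger in-memory chunks after loading:

```python
# full-width rows, 100,000 observations per in-memory chunk
x = x.rechunk((100_000, x.shape[1]))
```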
[`dask` extension]: https://marketplace.visualstudio.com/items?itemName=joyceerhl.vscode-das
[our custom h5 io code]: https://github.com/scverse/anndata/blob/089ed929393a02200b389395f278b7c920e5bc4a/src/anndata/_io/specs/lazy_methods.py#L179-L205
[`array-api`]: https://data-apis.org/array-api/latest/index.html
[`finch`]: https://github.com/finch-tensor/finch-tensor-python

docs/index.md

Lines changed: 1 addition & 1 deletion
````diff
@@ -12,6 +12,6 @@ notebooks/tutorial_axes_anndata_mudata
 notebooks/scverse_data_backed
 notebooks/scverse_data_interoperability
 notebooks/tutorial_concatenation_anndata_mudata
-
+how-to-dask.md
 references.md
 ```
````

pyproject.toml

Lines changed: 26 additions & 53 deletions
```diff
@@ -22,70 +22,43 @@ classifiers = [
   "Programming Language :: Python :: 3.12",
   "Programming Language :: Python :: 3.13",
 ]
-dependencies = [
-  "anndata",
-  # for debug logging (referenced from the issue template)
-  "session-info2",
-]
-optional-dependencies.dev = [
-  "pre-commit",
-  "twine>=4.0.2",
-]
-optional-dependencies.doc = [
-  "docutils>=0.8,!=0.18.*,!=0.19.*",
+docs = [
+  "sphinx>=7",
+  "sphinx-book-theme>=1.1.0",
+  "sphinx-issues>=5.0.1",
+  "myst-nb>=1.1.0",
+  "sphinxcontrib-bibtex>=1.0.0",
+  "sphinx-autodoc-typehints",
+  "sphinxext-opengraph",
+  # For notebooks
   "ipykernel",
   "ipython",
-  "myst-nb>=1.1",
-  "pandas",
-  # Until pybtex >0.24.0 releases: https://bitbucket.org/pybtex-devs/pybtex/issues/169/
-  "setuptools",
-  "sphinx>=8.1",
-  "sphinx-autodoc-typehints",
-  "sphinx-book-theme>=1",
   "sphinx-copybutton",
-  "sphinx-tabs",
-  "sphinxcontrib-bibtex>=1",
-  "sphinxext-opengraph",
-]
-optional-dependencies.test = [
-  "coverage>=7.10",
-  "pytest",
-  "pytest-cov", # For VS Code’s coverage functionality
 ]
-# https://docs.pypi.org/project_metadata/#project-urls
-urls.Documentation = "https://scverse-tutorials.readthedocs.io/"
-urls.Homepage = "https://github.com/scverse/scverse-tutorials"
-urls.Source = "https://github.com/scverse/scverse-tutorials"

 [tool.hatch.envs.default]
 installer = "uv"
 features = [ "dev" ]

-[tool.hatch.envs.docs]
-features = [ "doc" ]
-scripts.build = "sphinx-build -M html docs docs/_build -W {args}"
-scripts.open = "python -m webbrowser -t docs/_build/html/index.html"
-scripts.clean = "git clean -fdX -- {args:docs}"
-
-# Test the lowest and highest supported Python versions with normal deps
-[[tool.hatch.envs.hatch-test.matrix]]
-deps = [ "stable" ]
-python = [ "3.10", "3.13" ]
-
-# Test the newest supported Python version also with pre-release deps
-[[tool.hatch.envs.hatch-test.matrix]]
-deps = [ "pre" ]
-python = [ "3.13" ]
+[tool.hatch.envs.registry]
+features = [ "registry" ]
+[tool.hatch.envs.registry.scripts]
+validate = "python tutorial-registry/validate.py {args}"

-[tool.hatch.envs.hatch-test]
-features = [ "dev", "test" ]
-
-[tool.hatch.envs.hatch-test.overrides]
-# If the matrix variable `deps` is set to "pre",
-# set the environment variable `UV_PRERELEASE` to "allow".
-matrix.deps.env-vars = [
-  { key = "UV_PRERELEASE", value = "allow", if = [ "pre" ] },
+[tool.hatch.envs.docs]
+features = [ "docs" ]
+extra-dependencies = [
+  "setuptools", # undeclared dependency in pybtex
+  # fix from here: https://github.com/executablebooks/MyST-NB/pull/597
+  "myst-nb @ git+https://github.com/flying-sheep/MyST-NB.git@eval-metadata",
 ]
+[tool.hatch.envs.docs.scripts]
+build = "sphinx-build -M html docs docs/_build {args}"
+open = "python3 -m webbrowser -t docs/_build/html/index.html"
+clean = "git clean -fdX -- {args:docs}"
+
+[tool.hatch.build.targets.wheel]
+bypass-selection = true # This is not a package

 [tool.ruff]
 line-length = 120
```
