diff --git a/.github/workflows/codspeed.yml b/.github/workflows/codspeed.yml index 7192b6d78a..bc4f1c1d4c 100644 --- a/.github/workflows/codspeed.yml +++ b/.github/workflows/codspeed.yml @@ -1,13 +1,10 @@ name: CodSpeed Benchmarks on: - push: - branches: - - "main" + schedule: + - cron: '0 9 * * 1' # Every Monday at 9am UTC pull_request: - types: [labeled, synchronize] - # `workflow_dispatch` allows CodSpeed to trigger backtest - # performance analysis in order to generate initial data. + types: [labeled] workflow_dispatch: permissions: @@ -17,15 +14,14 @@ jobs: benchmarks: name: Run benchmarks runs-on: codspeed-macro - # Only run benchmarks for: pushes to main, manual triggers, or PRs with 'benchmark' label if: | - github.event_name == 'push' || + github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'benchmark')) steps: - uses: actions/checkout@v6 with: - fetch-depth: 0 # grab all branches and tags + fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v6 with: diff --git a/.github/workflows/releases.yml b/.github/workflows/releases.yml index e3e3c446fa..bb9256568c 100644 --- a/.github/workflows/releases.yml +++ b/.github/workflows/releases.yml @@ -10,6 +10,10 @@ on: permissions: contents: read +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + jobs: build_artifacts: @@ -35,7 +39,7 @@ jobs: version: '1.16.5' - name: Build wheel and sdist run: hatch build - - uses: actions/upload-artifact@v6 + - uses: actions/upload-artifact@v7 with: name: releases path: dist @@ -55,16 +59,24 @@ jobs: ls dist upload_pypi: - needs: [build_artifacts] + needs: [build_artifacts, test_dist_pypi] runs-on: ubuntu-latest if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') + environment: + name: releases + url: https://pypi.org/p/zarr + permissions: + id-token: write + attestations: write + 
artifact-metadata: write steps: - uses: actions/download-artifact@v7 with: name: releases path: dist - - uses: pypa/gh-action-pypi-publish@v1.13.0 + - name: Generate artifact attestation + uses: actions/attest@v4 with: - user: __token__ - password: ${{ secrets.pypi_password }} - # To test: repository_url: https://test.pypi.org/legacy/ + subject-path: dist/* + - name: Publish package to PyPI + uses: pypa/gh-action-pypi-publish@v1.13.0 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f86fb3d9c1..37f41b8222 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -54,6 +54,15 @@ repos: rev: v1.10.0 hooks: - id: numpydoc-validation + - repo: local + hooks: + - id: ban-lstrip-rstrip + name: ban lstrip/rstrip + language: pygrep + # Matches .lstrip() or .rstrip() where the string argument is 2+ characters. + entry: "\\.(lstrip|rstrip)\\([\"'][^\"']{2,}[\"']\\)" + types: [python] + files: ^(src|tests)/ - repo: https://github.com/twisted/towncrier rev: 25.8.0 hooks: diff --git a/changes/3562.misc.md b/changes/3562.misc.md deleted file mode 100644 index e164ab39f8..0000000000 --- a/changes/3562.misc.md +++ /dev/null @@ -1 +0,0 @@ -Add continuous performance benchmarking infrastructure. \ No newline at end of file diff --git a/changes/3603.bugfix.md b/changes/3603.bugfix.md deleted file mode 100644 index 37e1da5cb1..0000000000 --- a/changes/3603.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Correct the target bytes number for auto-chunking when auto-sharding. \ No newline at end of file diff --git a/changes/3605.misc.md b/changes/3605.misc.md deleted file mode 100644 index b8c0757b69..0000000000 --- a/changes/3605.misc.md +++ /dev/null @@ -1 +0,0 @@ -Fix a bug in the test suite that prevented stand-alone example scripts from being tested. 
\ No newline at end of file diff --git a/changes/3619.misc.md b/changes/3619.misc.md deleted file mode 100644 index 8c36e473b5..0000000000 --- a/changes/3619.misc.md +++ /dev/null @@ -1 +0,0 @@ -Remove upper bounds on `pytest` and `pytest-asyncio` test dependencies. \ No newline at end of file diff --git a/changes/3623.misc.md b/changes/3623.misc.md deleted file mode 100644 index 4060e55e5f..0000000000 --- a/changes/3623.misc.md +++ /dev/null @@ -1,5 +0,0 @@ -This PR contains minor, non-function-altering, changes to use `ZarrFormat` across the repo as opposed to duplicating is with `Literal[2,3]`. - -Additionally, it fixes broken linting by using a `Literal[True, False]` type hint for Numpy hypothesis testing, as opposed to `bool`. - -Basically improves the typehints and reduces fat-finger error surface area slightly. diff --git a/changes/3636.misc.md b/changes/3636.misc.md deleted file mode 100644 index a814160c8b..0000000000 --- a/changes/3636.misc.md +++ /dev/null @@ -1 +0,0 @@ -The minimum required version of NumPy is now 2.0. diff --git a/changes/3648.misc.md b/changes/3648.misc.md deleted file mode 100644 index 156f8671de..0000000000 --- a/changes/3648.misc.md +++ /dev/null @@ -1 +0,0 @@ -Fix deprecation of setting a shape on an array directly in ``numpy`` 2.5+. diff --git a/changes/3655.bugfix.md b/changes/3655.bugfix.md deleted file mode 100644 index 67d384f00d..0000000000 --- a/changes/3655.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fixed a bug in the sharding codec that prevented nested shard reads in certain cases. \ No newline at end of file diff --git a/changes/3656.misc.md b/changes/3656.misc.md deleted file mode 100644 index 159f24d072..0000000000 --- a/changes/3656.misc.md +++ /dev/null @@ -1 +0,0 @@ -Removed *rich* and *mypy* from the `[test]` dependencies, and added a new `[dev]` dependency group that can be used to install all the development dependencies. 
diff --git a/changes/3657.bugfix.md b/changes/3657.bugfix.md deleted file mode 100644 index 1411704674..0000000000 --- a/changes/3657.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fix obstore _transform_list_dir implementation to correctly relativize paths (removing lstrip usage). \ No newline at end of file diff --git a/changes/3658.misc.md b/changes/3658.misc.md deleted file mode 100644 index f400d97473..0000000000 --- a/changes/3658.misc.md +++ /dev/null @@ -1 +0,0 @@ -Switch from `pre-commit` to [`prek`](https://github.com/j178/prek) for pre-commit checks. \ No newline at end of file diff --git a/changes/3668.feature.md b/changes/3668.feature.md deleted file mode 100644 index def196ec8a..0000000000 --- a/changes/3668.feature.md +++ /dev/null @@ -1,4 +0,0 @@ -Exposes the array runtime configuration as an attribute called `config` on the `Array` and -`AsyncArray` classes. The previous `AsyncArray._config` attribute is now a deprecated alias for `AsyncArray.config`. - -Adds a method for creating a new `Array` / `AsyncArray` instance with a new runtime configuration, and fixes inaccurate documentation about the `write_empty_chunks` configuration parameter. \ No newline at end of file diff --git a/changes/3673.misc.md b/changes/3673.misc.md deleted file mode 100644 index 83643f5d3c..0000000000 --- a/changes/3673.misc.md +++ /dev/null @@ -1 +0,0 @@ -Benchmark CI now only runs for PRs with the `benchmark` label, reducing CodSpeed credit usage. diff --git a/changes/3695.bugfix.md b/changes/3695.bugfix.md deleted file mode 100644 index a7d847e4f1..0000000000 --- a/changes/3695.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Raise error when trying to encode :class:`numpy.dtypes.StringDType` with `na_object` set. \ No newline at end of file diff --git a/changes/3700.bugfix.md b/changes/3700.bugfix.md deleted file mode 100644 index 86acb71d0e..0000000000 --- a/changes/3700.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -CacheStore, LoggingStore and LatencyStore now support with_read_only. 
\ No newline at end of file diff --git a/changes/3702.bugfix.md b/changes/3702.bugfix.md deleted file mode 100644 index 94a2902567..0000000000 --- a/changes/3702.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Skip chunk coordinate enumeration in resize when the array is only growing, avoiding unbounded memory usage for large arrays. \ No newline at end of file diff --git a/changes/3704.misc.md b/changes/3704.misc.md deleted file mode 100644 index d15d4924e0..0000000000 --- a/changes/3704.misc.md +++ /dev/null @@ -1 +0,0 @@ -Remove an expensive `isinstance` check from the bytes codec decoding routine. \ No newline at end of file diff --git a/changes/3705.bugfix.md b/changes/3705.bugfix.md deleted file mode 100644 index 2abcb4ee7c..0000000000 --- a/changes/3705.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Fix a performance bug in morton curve generation. \ No newline at end of file diff --git a/changes/3706.misc.md b/changes/3706.misc.md deleted file mode 100644 index 70a0e44c58..0000000000 --- a/changes/3706.misc.md +++ /dev/null @@ -1 +0,0 @@ -Allow NumPy ints as input when declaring a shape. \ No newline at end of file diff --git a/changes/3708.misc.md b/changes/3708.misc.md deleted file mode 100644 index dce7546c97..0000000000 --- a/changes/3708.misc.md +++ /dev/null @@ -1 +0,0 @@ -Optimize Morton order computation with hypercube optimization, vectorized decoding, and singleton dimension removal, providing 10-45x speedup for typical chunk shapes. diff --git a/changes/3710.bugfix.md b/changes/3710.bugfix.md deleted file mode 100644 index a40ddcee23..0000000000 --- a/changes/3710.bugfix.md +++ /dev/null @@ -1 +0,0 @@ -Add a dedicated in-memory cache for byte-range requests to the experimental `CacheStore`. \ No newline at end of file diff --git a/changes/3712.misc.md b/changes/3712.misc.md deleted file mode 100644 index 8fa2f2d2f7..0000000000 --- a/changes/3712.misc.md +++ /dev/null @@ -1 +0,0 @@ -Added benchmarks for Morton order computation in sharded arrays. 
diff --git a/changes/3713.misc.md b/changes/3713.misc.md deleted file mode 100644 index 9b0680dfc0..0000000000 --- a/changes/3713.misc.md +++ /dev/null @@ -1 +0,0 @@ -Vectorize get_chunk_slice for faster sharded array writes. diff --git a/changes/3717.misc.md b/changes/3717.misc.md deleted file mode 100644 index 5fed76b2b7..0000000000 --- a/changes/3717.misc.md +++ /dev/null @@ -1 +0,0 @@ -Add benchmarks for Morton order computation with non-power-of-2 and near-miss shard shapes, covering both pure computation and end-to-end read/write performance. diff --git a/changes/3721.misc.md b/changes/3721.misc.md deleted file mode 100644 index c170712882..0000000000 --- a/changes/3721.misc.md +++ /dev/null @@ -1 +0,0 @@ -Adds synchronous (non-async) encoding and decoding methods to CPU-bound codecs. This is necessary for performance optimizations based on avoiding `asyncio` overhead. These new methods are described by a new protocol: `SupportsSyncCodec`. \ No newline at end of file diff --git a/changes/3728.misc.md b/changes/3728.misc.md deleted file mode 100644 index a3cbb8d3f0..0000000000 --- a/changes/3728.misc.md +++ /dev/null @@ -1 +0,0 @@ -Move development dependencies (`test`, `remote_tests`, `docs`, `dev`) from optional dependencies to [dependency groups](https://packaging.python.org/en/latest/specifications/dependency-groups/). This may cause breakage for anyone who used e.g. `pip install zarr[test]` to get access to test dependencies. To install these dependency groups from a local checkout, use `pip install --group ` (pip 25.1+) or `uv run --group `. \ No newline at end of file diff --git a/changes/3773.bugfix.md b/changes/3773.bugfix.md new file mode 100644 index 0000000000..60d8fa1594 --- /dev/null +++ b/changes/3773.bugfix.md @@ -0,0 +1,5 @@ +Fix divergent behavior between `MemoryStore` and `LocalStore` `list_prefix` methods. 
+ +Both stores now consistently use string prefix matching (checking if keys start with the given prefix string), +rather than `LocalStore` treating the prefix as a filesystem directory path. This ensures consistent +behavior across different store implementations and aligns with the documented behavior of `list_prefix`. diff --git a/changes/3797.bugfix.md b/changes/3797.bugfix.md new file mode 100644 index 0000000000..c683213c5d --- /dev/null +++ b/changes/3797.bugfix.md @@ -0,0 +1,2 @@ +Fix an issue that prevents the correct parsing of special NumPy ``uint32`` dtypes resulting e.g. +from bitwise operations on ``uint32`` arrays on Windows. diff --git a/docs/_static/favicon-96x96.png b/docs/_static/favicon-96x96.png new file mode 100644 index 0000000000..e77977ccf4 Binary files /dev/null and b/docs/_static/favicon-96x96.png differ diff --git a/docs/contributing.md b/docs/contributing.md index a4bbaafbd5..b2c1ae635c 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -36,6 +36,32 @@ If you have an idea about a new feature or some other improvement to Zarr, pleas We very much welcome ideas and suggestions for how to improve Zarr, but please bear in mind that we are likely to be conservative in accepting proposals for new features. The reasons for this are that we would like to keep the Zarr code base lean and focused on a core set of functionalities, and available time for development, review and maintenance of new features is limited. But if you have a great idea, please don't let that stop you from posting it on GitHub, just please don't be offended if we respond cautiously. +## AI-assisted contributions + +AI coding tools are increasingly common in open source development. These tools are welcome in Zarr-Python, but the same standards apply to all contributions regardless of how they were produced — whether written by hand, with AI assistance, or generated entirely by an AI tool. 
+ +### You are responsible for your changes + +If you submit a pull request, you are responsible for understanding and having fully reviewed the changes. You must be able to explain why each change is correct and how it fits into the project. + +### Communication must be your own + +PR descriptions, issue comments, and review responses must be in your own words. The substance and reasoning must come from you. Using AI to polish grammar or phrasing is fine, but do not paste AI-generated text as comments or review responses. + +### Review every line + +You must have personally reviewed and understood all changes before submitting. If you used AI to generate code, you are expected to have read it critically and tested it. The PR description should explain the approach and reasoning — do not leave it to reviewers to figure out what the code does and why. + +### Keep PRs reviewable + +Generating code with AI is fast; reviewing it is not. A large diff shifts the burden from the contributor to the reviewer. PRs that cannot be reviewed in reasonable time with reasonable effort may be closed, regardless of their potential usefulness or correctness. Use AI tools not only to write code but to prepare better, more reviewable PRs — well-structured commits, clear descriptions, and minimal scope. + +If you are planning a large AI-assisted contribution (e.g., a significant refactor or a new subsystem), **open an issue first** to discuss the scope and approach with maintainers. Maintainers may also request that large changes be broken into smaller, reviewable pieces. + +### Documentation + +The same principles apply to documentation. Zarr has domain-specific semantics (chunked storage, codec pipelines, Zarr v2/v3 format details) that AI tools frequently get wrong. Do not submit documentation that you haven't carefully read and verified. 
+ ## Contributing code and/or documentation ### Forking the repository diff --git a/docs/overrides/stylesheets/extra.css b/docs/overrides/stylesheets/extra.css index fab7e4ba13..6cb7c74e8d 100644 --- a/docs/overrides/stylesheets/extra.css +++ b/docs/overrides/stylesheets/extra.css @@ -52,15 +52,10 @@ color: white; } -/* Search box styling */ -.md-search__input { +/* Search box styling in the header */ +.md-header .md-search__input { background-color: rgba(255, 255, 255, 0.15); border: 1px solid rgba(255, 255, 255, 0.2); - color: white; -} - -.md-search__input::placeholder { - color: rgba(255, 255, 255, 0.7); } /* Navigation tabs */ diff --git a/docs/release-notes.md b/docs/release-notes.md index 71c095e19a..b974ae10c7 100644 --- a/docs/release-notes.md +++ b/docs/release-notes.md @@ -2,7 +2,43 @@ -# zarr 3.1.5 (2025-11-21) +## 3.1.6 (2026-03-19) + +### Features + +- Exposes the array runtime configuration as an attribute called `config` on the `Array` and + `AsyncArray` classes. The previous `AsyncArray._config` attribute is now a deprecated alias for `AsyncArray.config`. ([#3668](https://github.com/zarr-developers/zarr-python/issues/3668)) +- Adds a method for creating a new `Array` / `AsyncArray` instance with a new runtime configuration, and fixes inaccurate documentation about the `write_empty_chunks` configuration parameter. ([#3668](https://github.com/zarr-developers/zarr-python/issues/3668)) +- Adds synchronous methods to stores that do not benefit from an async event loop. The shape of these methods is defined by protocol classes to support structural subtyping. ([#3725](https://github.com/zarr-developers/zarr-python/pull/3725)) +- Fix near-miss penalty in `_morton_order` with hybrid ceiling+argsort strategy. ([#3718](https://github.com/zarr-developers/zarr-python/pull/3718)) + +### Bugfixes + +- Correct the target bytes number for auto-chunking when auto-sharding. 
([#3603](https://github.com/zarr-developers/zarr-python/issues/3603)) +- Fixed a bug in the sharding codec that prevented nested shard reads in certain cases. ([#3655](https://github.com/zarr-developers/zarr-python/issues/3655)) +- Fix obstore `_transform_list_dir` implementation to correctly relativize paths (removing `lstrip` usage). ([#3657](https://github.com/zarr-developers/zarr-python/issues/3657)) +- Raise error when trying to encode `numpy.dtypes.StringDType` with `na_object` set. ([#3695](https://github.com/zarr-developers/zarr-python/issues/3695)) +- `CacheStore`, `LoggingStore` and `LatencyStore` now support `with_read_only`. ([#3700](https://github.com/zarr-developers/zarr-python/issues/3700)) +- Skip chunk coordinate enumeration in resize when the array is only growing, avoiding unbounded memory usage for large arrays. ([#3702](https://github.com/zarr-developers/zarr-python/issues/3702)) +- Fix a performance bug in morton curve generation. ([#3705](https://github.com/zarr-developers/zarr-python/issues/3705)) +- Add a dedicated in-memory cache for byte-range requests to the experimental `CacheStore`. ([#3710](https://github.com/zarr-developers/zarr-python/issues/3710)) +- `BaseFloat._check_scalar` rejects invalid string values. ([#3586](https://github.com/zarr-developers/zarr-python/issues/3586)) +- Apply `drop_axes` squeeze in partial decode path for sharding. ([#3763](https://github.com/zarr-developers/zarr-python/issues/3763)) +- Set `copy=False` in reshape operation. ([#3649](https://github.com/zarr-developers/zarr-python/issues/3649)) +- Validate that dask-style chunks have regular shapes. ([#3779](https://github.com/zarr-developers/zarr-python/issues/3779)) + +### Improved Documentation + +- Add documentation example for creating uncompressed arrays in the Compression section of the user guide. ([#3464](https://github.com/zarr-developers/zarr-python/issues/3464)) +- Add AI-assisted code policy to the contributing guide. 
([#3769](https://github.com/zarr-developers/zarr-python/issues/3769)) +- Added a glossary. ([#3767](https://github.com/zarr-developers/zarr-python/issues/3767)) + +### Misc + +- [#3562](https://github.com/zarr-developers/zarr-python/issues/3562), [#3605](https://github.com/zarr-developers/zarr-python/issues/3605), [#3619](https://github.com/zarr-developers/zarr-python/issues/3619), [#3623](https://github.com/zarr-developers/zarr-python/issues/3623), [#3636](https://github.com/zarr-developers/zarr-python/issues/3636), [#3648](https://github.com/zarr-developers/zarr-python/issues/3648), [#3656](https://github.com/zarr-developers/zarr-python/issues/3656), [#3658](https://github.com/zarr-developers/zarr-python/issues/3658), [#3673](https://github.com/zarr-developers/zarr-python/issues/3673), [#3704](https://github.com/zarr-developers/zarr-python/issues/3704), [#3706](https://github.com/zarr-developers/zarr-python/issues/3706), [#3708](https://github.com/zarr-developers/zarr-python/issues/3708), [#3712](https://github.com/zarr-developers/zarr-python/issues/3712), [#3713](https://github.com/zarr-developers/zarr-python/issues/3713), [#3717](https://github.com/zarr-developers/zarr-python/issues/3717), [#3721](https://github.com/zarr-developers/zarr-python/issues/3721), [#3728](https://github.com/zarr-developers/zarr-python/issues/3728), [#3778](https://github.com/zarr-developers/zarr-python/issues/3778) + + +## zarr 3.1.5 (2025-11-21) ## Bugfixes diff --git a/docs/user-guide/arrays.md b/docs/user-guide/arrays.md index cd6a93cac9..a44c096b73 100644 --- a/docs/user-guide/arrays.md +++ b/docs/user-guide/arrays.md @@ -224,6 +224,13 @@ print(z.info_complete()) If you don't specify a compressor, by default Zarr uses the Zstandard compressor. 
+To create an array without any compression, set `compressors=None`: + +```python exec="true" session="arrays" source="above" result="ansi" +z_no_compress = zarr.create_array(store='data/example-uncompressed.zarr', shape=(10000, 10000), chunks=(1000, 1000), dtype='int32', compressors=None) +print(f"Compressors: {z_no_compress.compressors}") +``` + In addition to Blosc and Zstandard, other compression libraries can also be used. For example, here is an array using Gzip compression, level 1: diff --git a/docs/user-guide/glossary.md b/docs/user-guide/glossary.md new file mode 100644 index 0000000000..a490b7c341 --- /dev/null +++ b/docs/user-guide/glossary.md @@ -0,0 +1,110 @@ +# Glossary + +This page defines key terms used throughout the zarr-python documentation and API. + +## Array Structure + +### Array + +An N-dimensional typed array stored in a Zarr [store](#store). An array's +[metadata](#metadata) defines its shape, data type, chunk layout, and codecs. + +### Chunk + +The fundamental unit of data in a Zarr array. An array is divided into chunks +along each dimension according to the [chunk grid](#chunk-grid), which is currently +part of Zarr's private API. Each chunk is independently compressed and encoded +through the array's [codec](#codec) pipeline. + +When [sharding](#shard) is used, "chunk" refers to the inner chunks within each +shard, because those are the compressible units. The chunks are the smallest units +that can be read independently. + +!!! warning "Convention specific to zarr-python" + The use of "chunk" to mean the inner sub-chunk within a shard is a convention + adopted by zarr-python's `Array` API. In the Zarr V3 specification and in other + Zarr implementations, "chunk" may refer to the top-level grid cells (which + zarr-python calls "shards" when the sharding codec is used). Be aware of this + distinction when working across libraries. + +**API**: [`Array.chunks`][zarr.Array.chunks] returns the chunk shape. 
When +sharding is used, this is the inner chunk shape. + +### Chunk Grid + +The partitioning of an array's elements into [chunks](#chunk). In Zarr V3, the +chunk grid is defined in the array [metadata](#metadata) and determines the +boundaries of each storage object. + +When sharding is used, the chunk grid defines the [shard](#shard) boundaries, +not the inner chunk boundaries. The inner chunk shape is defined within the +[sharding codec](#shard). + +**API**: The `chunk_grid` field in array metadata contains the storage-level +grid. + +### Shard + +A storage object that contains one or more [chunks](#chunk). Sharding reduces the +number of objects in a [store](#store) by grouping chunks together, which +improves performance on file systems and object storage. + +Within each shard, chunks are compressed independently and can be read +individually. However, writing requires updating the full shard for consistency, +making shards the unit of writing and chunks the unit of reading. + +Sharding is implemented as a [codec](#codec) (the sharding indexed codec). +When sharding is used: + +- The [chunk grid](#chunk-grid) in metadata defines the shard boundaries +- The sharding codec's `chunk_shape` defines the inner chunk size +- Each shard contains `shard_shape / chunk_shape` chunks per dimension + +**API**: [`Array.shards`][zarr.Array.shards] returns the shard shape, or `None` +if sharding is not used. [`Array.chunks`][zarr.Array.chunks] returns the inner +chunk shape. + +## Storage + +### Store + +A key-value storage backend that holds Zarr data and metadata. Stores implement +the [`zarr.abc.store.Store`][] interface. Examples include local file systems, +cloud object storage (S3, GCS, Azure), zip files, and in-memory dictionaries. + +Each [chunk](#chunk) or [shard](#shard) is stored as a single value (object or +file) in the store, addressed by a key derived from its grid coordinates. 
+ +### Metadata + +The JSON document (`zarr.json`) that describes an [array](#array) or group. For +arrays, metadata includes the shape, data type, [chunk grid](#chunk-grid), fill +value, and [codec](#codec) pipeline. Metadata is stored alongside the data in +the [store](#store). Zarr-Python does not yet expose its internal metadata +representation as part of its public API. + +## Codecs + +### Codec + +A transformation applied to array data during reading and writing. Codecs are +chained into a pipeline and come in three types: + +- **Array-to-array**: Transforms like transpose that rearrange array elements +- **Array-to-bytes**: Serialization that converts an array to a byte sequence + (exactly one required) +- **Bytes-to-bytes**: Compression or checksums applied to the serialized bytes + +The [sharding indexed codec](#shard) is a special array-to-bytes codec that +groups multiple [chunks](#chunk) into a single storage object. + +## API Properties + +The following properties are available on [`zarr.Array`][]: + +| Property | Description | +|----------|-------------| +| `.chunks` | Chunk shape — the inner chunk shape when sharding is used | +| `.shards` | Shard shape, or `None` if no sharding | +| `.nchunks` | Total number of independently compressible units across the array | +| `.cdata_shape` | Number of independently compressible units per dimension | diff --git a/docs/user-guide/groups.md b/docs/user-guide/groups.md index e093590dfe..58a9c1c806 100644 --- a/docs/user-guide/groups.md +++ b/docs/user-guide/groups.md @@ -133,5 +133,3 @@ Groups also have the [`zarr.Group.tree`][] method, e.g.: print(root.tree()) ``` -!!! note - [`zarr.Group.tree`][] requires the optional [rich](https://rich.readthedocs.io/en/stable/) dependency. It can be installed with the `[tree]` extra. 
\ No newline at end of file diff --git a/docs/user-guide/index.md b/docs/user-guide/index.md index fda9bcaa90..ff6e354d80 100644 --- a/docs/user-guide/index.md +++ b/docs/user-guide/index.md @@ -35,6 +35,10 @@ Take your skills to the next level: - **[Extending](extending.md)** - Extend functionality with custom code - **[Consolidated Metadata](consolidated_metadata.md)** - Advanced metadata management +## Reference + +- **[Glossary](glossary.md)** - Definitions of key terms (chunks, shards, codecs, etc.) + ## Need Help? - Browse the [API Reference](../api/zarr/index.md) for detailed function documentation diff --git a/docs/user-guide/installation.md b/docs/user-guide/installation.md index 4d323643f1..6c1414e81a 100644 --- a/docs/user-guide/installation.md +++ b/docs/user-guide/installation.md @@ -26,7 +26,7 @@ These can be installed using `pip install "zarr[]"`, e.g. `pip install "z - `gpu`: support for GPUs - `remote`: support for reading/writing to remote data stores -Additional optional dependencies include `rich`, `universal_pathlib`. These must be installed separately. +Additional optional dependencies include `universal_pathlib`. These must be installed separately. 
## conda diff --git a/mkdocs.yml b/mkdocs.yml index 61872b6234..e2c4148e15 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,7 @@ nav: - user-guide/gpu.md - user-guide/consolidated_metadata.md - user-guide/experimental.md + - user-guide/glossary.md - Examples: - user-guide/examples/custom_dtype.md - API Reference: @@ -84,6 +85,7 @@ theme: name: material custom_dir: docs/overrides logo: _static/logo_bw.png + favicon: _static/favicon-96x96.png palette: # Light mode @@ -214,7 +216,6 @@ plugins: 'developers/index.html.md': 'contributing.md' 'developers/roadmap.html.md': 'https://zarr.readthedocs.io/en/v3.0.8/developers/roadmap.html' 'api/zarr/creation.md': 'api/zarr/deprecated/creation.md' - 'api/zarr/codecs/numcodecs.md': 'api/zarr/deprecated/creation.md' 'api.md': 'api/zarr/index.md' 'api/zarr/metadata/migrate_v3.md': 'api/zarr/metadata.md' diff --git a/pyproject.toml b/pyproject.toml index 18bdeda07c..b1077e3e5d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,7 +70,7 @@ gpu = [ "cupy-cuda12x", ] cli = ["typer"] -optional = ["rich", "universal-pathlib"] +optional = ["universal-pathlib"] [project.scripts] zarr = "zarr._cli.cli:app" @@ -122,7 +122,6 @@ docs = [ "towncrier", # Optional dependencies to run examples "numcodecs[msgpack]", - "rich", "s3fs>=2023.10.0", "astroid<4", "pytest", @@ -131,7 +130,6 @@ dev = [ {include-group = "test"}, {include-group = "remote-tests"}, {include-group = "docs"}, - "rich", "universal-pathlib", "mypy", ] @@ -447,7 +445,8 @@ checks = [ directory = 'changes' filename = "docs/release-notes.md" underlines = ["", "", ""] -issue_format = "[#{issue}](https://github.com/zarr-developers/zarr-python/issues{issue})" +title_format = "## {version} ({project_date})" +issue_format = "[#{issue}](https://github.com/zarr-developers/zarr-python/issues/{issue})" start_string = "\n" [tool.codespell] diff --git a/src/zarr/__init__.py b/src/zarr/__init__.py index 3c6195c28f..cdf3840c3b 100644 --- a/src/zarr/__init__.py +++ b/src/zarr/__init__.py 
@@ -78,7 +78,6 @@ def print_packages(packages: list[str]) -> None: "s3fs", "gcsfs", "universal-pathlib", - "rich", "obstore", ] diff --git a/src/zarr/abc/store.py b/src/zarr/abc/store.py index f677c197dc..d2ab353d43 100644 --- a/src/zarr/abc/store.py +++ b/src/zarr/abc/store.py @@ -16,7 +16,16 @@ from zarr.core.buffer import Buffer, BufferPrototype -__all__ = ["ByteGetter", "ByteSetter", "Store", "set_or_delete"] +__all__ = [ + "ByteGetter", + "ByteSetter", + "Store", + "SupportsDeleteSync", + "SupportsGetSync", + "SupportsSetSync", + "SupportsSyncStore", + "set_or_delete", +] @dataclass(frozen=True, slots=True) @@ -700,6 +709,31 @@ async def delete(self) -> None: ... async def set_if_not_exists(self, default: Buffer) -> None: ... +@runtime_checkable +class SupportsGetSync(Protocol): + def get_sync( + self, + key: str, + *, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: ... + + +@runtime_checkable +class SupportsSetSync(Protocol): + def set_sync(self, key: str, value: Buffer) -> None: ... + + +@runtime_checkable +class SupportsDeleteSync(Protocol): + def delete_sync(self, key: str) -> None: ... + + +@runtime_checkable +class SupportsSyncStore(SupportsGetSync, SupportsSetSync, SupportsDeleteSync, Protocol): ... 
+ + async def set_or_delete(byte_setter: ByteSetter, value: Buffer | None) -> None: """Set or delete a value in a byte setter diff --git a/src/zarr/api/asynchronous.py b/src/zarr/api/asynchronous.py index 6164cda957..66cf3bad7e 100644 --- a/src/zarr/api/asynchronous.py +++ b/src/zarr/api/asynchronous.py @@ -24,7 +24,7 @@ from zarr.core.common import ( JSON, AccessModeLiteral, - DimensionNames, + DimensionNamesLike, MemoryOrder, ZarrFormat, _default_zarr_format, @@ -914,7 +914,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, config: ArrayConfigLike | None = None, **kwargs: Any, diff --git a/src/zarr/api/synchronous.py b/src/zarr/api/synchronous.py index 1204eba3c9..4e718a234e 100644 --- a/src/zarr/api/synchronous.py +++ b/src/zarr/api/synchronous.py @@ -33,7 +33,7 @@ from zarr.core.common import ( JSON, AccessModeLiteral, - DimensionNames, + DimensionNamesLike, MemoryOrder, ShapeLike, ZarrFormat, @@ -649,7 +649,7 @@ def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, config: ArrayConfigLike | None = None, **kwargs: Any, @@ -832,7 +832,7 @@ def create_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -1003,7 +1003,7 @@ def from_array( zarr_format: ZarrFormat | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: 
DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, diff --git a/src/zarr/core/_tree.py b/src/zarr/core/_tree.py index eed807ec95..a528bf9876 100644 --- a/src/zarr/core/_tree.py +++ b/src/zarr/core/_tree.py @@ -1,17 +1,11 @@ -import io -import os +import sys +from collections import deque from collections.abc import Sequence +from html import escape as html_escape from typing import Any from zarr.core.group import AsyncGroup -try: - import rich - import rich.console - import rich.tree -except ImportError as e: - raise ImportError("'rich' is required for Group.tree") from e - class TreeRepr: """ @@ -21,45 +15,120 @@ class TreeRepr: of Zarr's public API. """ - def __init__(self, tree: rich.tree.Tree) -> None: - self._tree = tree + def __init__(self, text: str, html: str, truncated: str = "") -> None: + self._text = text + self._html = html + self._truncated = truncated def __repr__(self) -> str: - color_system = os.environ.get("OVERRIDE_COLOR_SYSTEM", rich.get_console().color_system) - console = rich.console.Console(file=io.StringIO(), color_system=color_system) - console.print(self._tree) - return str(console.file.getvalue()) + if self._truncated: + return self._truncated + self._text + return self._text def _repr_mimebundle_( self, - include: Sequence[str], - exclude: Sequence[str], + include: Sequence[str] | None = None, + exclude: Sequence[str] | None = None, **kwargs: Any, ) -> dict[str, str]: + text = self._truncated + self._text if self._truncated else self._text # For jupyter support. - # Unsure why mypy infers the return type to by Any - return self._tree._repr_mimebundle_(include=include, exclude=exclude, **kwargs) # type: ignore[no-any-return] + html_body = self._truncated + self._html if self._truncated else self._html + html = ( + '
"
+            f"{html_body}
\n" + ) + return {"text/plain": text, "text/html": html} + +async def group_tree_async( + group: AsyncGroup, + max_depth: int | None = None, + *, + max_nodes: int = 500, + plain: bool = False, +) -> TreeRepr: + members: list[tuple[str, Any]] = [] + truncated = False + async for item in group.members(max_depth=max_depth): + if len(members) == max_nodes: + truncated = True + break + members.append(item) + members.sort(key=lambda key_node: key_node[0]) -async def group_tree_async(group: AsyncGroup, max_depth: int | None = None) -> TreeRepr: - tree = rich.tree.Tree(label=f"[bold]{group.name}[/bold]") - nodes = {"": tree} - members = sorted([x async for x in group.members(max_depth=max_depth)]) + # Set up styling tokens: ANSI bold for terminals, HTML for Jupyter, + # or empty strings when plain=True (useful for LLMs, logging, files). + if plain: + ansi_open = ansi_close = html_open = html_close = "" + else: + # Avoid emitting ANSI escape codes when output is piped or in CI. + use_ansi = sys.stdout.isatty() + ansi_open = "\x1b[1m" if use_ansi else "" + ansi_close = "\x1b[0m" if use_ansi else "" + html_open = "" + html_close = "" + # Group members by parent key so we can render the tree level by level. + nodes: dict[str, list[tuple[str, Any]]] = {} for key, node in members: if key.count("/") == 0: parent_key = "" else: parent_key = key.rsplit("/", 1)[0] - parent = nodes[parent_key] + nodes.setdefault(parent_key, []).append((key, node)) - # We want what the spec calls the node "name", the part excluding all leading - # /'s and path segments. But node.name includes all that, so we build it here. + # Render the tree iteratively (not recursively) to avoid hitting + # Python's recursion limit on deeply nested hierarchies. + # Each stack frame is (prefix_string, remaining_children_at_this_level). 
+ text_lines = [f"{ansi_open}{group.name}{ansi_close}"] + html_lines = [f"{html_open}{html_escape(group.name)}{html_close}"] + stack = [("", deque(nodes.get("", [])))] + while stack: + prefix, remaining = stack[-1] + if not remaining: + stack.pop() + continue + key, node = remaining.popleft() name = key.rsplit("/")[-1] + escaped_name = html_escape(name) + # if we popped the last item then remaining will + # now be empty - that's how we got past the if not remaining + # above, but this can still be true. + is_last = not remaining + connector = "└── " if is_last else "├── " if isinstance(node, AsyncGroup): - label = f"[bold]{name}[/bold]" + text_lines.append(f"{prefix}{connector}{ansi_open}{name}{ansi_close}") + html_lines.append(f"{prefix}{connector}{html_open}{escaped_name}{html_close}") else: - label = f"[bold]{name}[/bold] {node.shape} {node.dtype}" - nodes[key] = parent.add(label) - - return TreeRepr(tree) + text_lines.append( + f"{prefix}{connector}{ansi_open}{name}{ansi_close} {node.shape} {node.dtype}" + ) + html_lines.append( + f"{prefix}{connector}{html_open}{escaped_name}{html_close}" + f" {html_escape(str(node.shape))} {html_escape(str(node.dtype))}" + ) + # Descend into children with an accumulated prefix: + # Example showing how prefix accumulates: + # / + # ├── a prefix = "" + # │ ├── b prefix = "" + "│ " + # │ │ └── x prefix = "" + "│ " + "│ " + # │ └── c prefix = "" + "│ " + # └── d prefix = "" + # └── e prefix = "" + " " + if children := nodes.get(key, []): + if is_last: + child_prefix = prefix + " " + else: + child_prefix = prefix + "│ " + stack.append((child_prefix, deque(children))) + text = "\n".join(text_lines) + "\n" + html = "\n".join(html_lines) + "\n" + note = ( + f"Truncated at max_nodes={max_nodes}, some nodes and their children may be missing\n" + if truncated + else "" + ) + return TreeRepr(text, html, truncated=note) diff --git a/src/zarr/core/array.py b/src/zarr/core/array.py index 564d0e915a..b82c77fa9c 100644 --- 
a/src/zarr/core/array.py +++ b/src/zarr/core/array.py @@ -53,7 +53,7 @@ ZARR_JSON, ZARRAY_JSON, ZATTRS_JSON, - DimensionNames, + DimensionNamesLike, MemoryOrder, ShapeLike, ZarrFormat, @@ -389,7 +389,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -417,7 +417,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # runtime overwrite: bool = False, data: npt.ArrayLike | None = None, @@ -445,7 +445,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -479,7 +479,7 @@ async def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -630,7 +630,7 @@ async def _create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # v2 only chunks: ShapeLike | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -742,7 +742,7 @@ def _create_metadata_v3( fill_value: Any | None = DEFAULT_FILL_VALUE, chunk_key_encoding: ChunkKeyEncodingLike | None = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, ) -> ArrayV3Metadata: """ @@ -803,7 +803,7 @@ 
async def _create_v3( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, attributes: dict[str, JSON] | None = None, overwrite: bool = False, ) -> AsyncArrayV3: @@ -1235,24 +1235,28 @@ def basename(self) -> str: @property def cdata_shape(self) -> tuple[int, ...]: """ - The shape of the chunk grid for this array. + The number of chunks along each dimension. + + When sharding is used, this counts inner chunks (not shards) per dimension. Returns ------- tuple[int, ...] - The shape of the chunk grid for this array. + The number of chunks along each dimension. """ return self._chunk_grid_shape @property def _chunk_grid_shape(self) -> tuple[int, ...]: """ - The shape of the chunk grid for this array. + The number of chunks along each dimension. + + When sharding is used, this counts inner chunks (not shards) per dimension. Returns ------- tuple[int, ...] - The shape of the chunk grid for this array. + The number of chunks along each dimension. """ return tuple(starmap(ceildiv, zip(self.shape, self.chunks, strict=True))) @@ -1994,7 +1998,7 @@ def create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # v2 only chunks: tuple[int, ...] | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -2138,7 +2142,7 @@ def _create( | None ) = None, codecs: Iterable[Codec | dict[str, JSON]] | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, # v2 only chunks: tuple[int, ...] | None = None, dimension_separator: Literal[".", "/"] | None = None, @@ -2399,14 +2403,23 @@ def compressors(self) -> tuple[Numcodec, ...] | tuple[BytesBytesCodec, ...]: @property def cdata_shape(self) -> tuple[int, ...]: """ - The shape of the chunk grid for this array. + The number of chunks along each dimension. 
+ + When sharding is used, this counts inner chunks (not shards) per dimension. """ return self.async_array._chunk_grid_shape @property def _chunk_grid_shape(self) -> tuple[int, ...]: """ - The shape of the chunk grid for this array. + The number of chunks along each dimension. + + When sharding is used, this counts inner chunks (not shards) per dimension. + + Returns + ------- + tuple[int, ...] + The number of chunks along each dimension. """ return self.async_array._chunk_grid_shape @@ -4253,7 +4266,7 @@ async def from_array( zarr_format: ZarrFormat | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -4522,7 +4535,7 @@ async def init_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, overwrite: bool = False, config: ArrayConfigLike | None = None, ) -> AnyAsyncArray: @@ -4738,7 +4751,7 @@ async def create_array( zarr_format: ZarrFormat | None = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -4922,7 +4935,7 @@ def _parse_keep_array_attr( order: MemoryOrder | None, zarr_format: ZarrFormat | None, chunk_key_encoding: ChunkKeyEncodingLike | None, - dimension_names: DimensionNames, + dimension_names: DimensionNamesLike, ) -> tuple[ tuple[int, ...] 
| Literal["auto"], ShardsLike | None, @@ -4933,7 +4946,7 @@ def _parse_keep_array_attr( MemoryOrder | None, ZarrFormat, ChunkKeyEncodingLike | None, - DimensionNames, + DimensionNamesLike, ]: if isinstance(data, Array): if chunks == "keep": diff --git a/src/zarr/core/chunk_grids.py b/src/zarr/core/chunk_grids.py index 2c7945fa64..c903eba013 100644 --- a/src/zarr/core/chunk_grids.py +++ b/src/zarr/core/chunk_grids.py @@ -126,11 +126,14 @@ def normalize_chunks(chunks: Any, shape: tuple[int, ...], typesize: int) -> tupl chunks = tuple(int(chunks) for _ in shape) # handle dask-style chunks (iterable of iterables) - if all(isinstance(c, (tuple | list)) for c in chunks): - # take first chunk size for each dimension - chunks = tuple( - c[0] for c in chunks - ) # TODO: check/error/warn for irregular chunks (e.g. if c[0] != c[1:-1]) + if all(isinstance(c, (tuple, list)) for c in chunks): + for i, c in enumerate(chunks): + if any(x != y for x, y in itertools.pairwise(c[:-1])) or (len(c) > 1 and c[-1] > c[0]): + raise ValueError( + f"Irregular chunk sizes in dimension {i}: {tuple(c)}. " + "Only uniform chunks (with an optional smaller final chunk) are supported." 
+ ) + chunks = tuple(c[0] for c in chunks) # handle bad dimensionality if len(chunks) > len(shape): diff --git a/src/zarr/core/codec_pipeline.py b/src/zarr/core/codec_pipeline.py index fd557ac43e..eed49556d3 100644 --- a/src/zarr/core/codec_pipeline.py +++ b/src/zarr/core/codec_pipeline.py @@ -263,6 +263,8 @@ async def read_batch( chunk_array_batch, batch_info, strict=False ): if chunk_array is not None: + if drop_axes: + chunk_array = chunk_array.squeeze(axis=drop_axes) out[out_selection] = chunk_array else: out[out_selection] = fill_value_or_default(chunk_spec) @@ -285,7 +287,7 @@ async def read_batch( ): if chunk_array is not None: tmp = chunk_array[chunk_selection] - if drop_axes != (): + if drop_axes: tmp = tmp.squeeze(axis=drop_axes) out[out_selection] = tmp else: @@ -324,7 +326,7 @@ def _merge_chunk_array( else: chunk_value = value[out_selection] # handle missing singleton dimensions - if drop_axes != (): + if drop_axes: item = tuple( None # equivalent to np.newaxis if idx in drop_axes diff --git a/src/zarr/core/common.py b/src/zarr/core/common.py index 275d062eba..a61271a941 100644 --- a/src/zarr/core/common.py +++ b/src/zarr/core/common.py @@ -47,7 +47,8 @@ MemoryOrder = Literal["C", "F"] AccessModeLiteral = Literal["r", "r+", "a", "w", "w-"] ANY_ACCESS_MODE: Final = "r", "r+", "a", "w", "w-" -DimensionNames = Iterable[str | None] | None +DimensionNamesLike = Iterable[str | None] | None +DimensionNames = DimensionNamesLike # for backwards compatibility TName = TypeVar("TName", bound=str) TConfig = TypeVar("TConfig", bound=Mapping[str, object]) diff --git a/src/zarr/core/dtype/npy/float.py b/src/zarr/core/dtype/npy/float.py index 0be2cbca9b..2a23cb429d 100644 --- a/src/zarr/core/dtype/npy/float.py +++ b/src/zarr/core/dtype/npy/float.py @@ -201,6 +201,16 @@ def _check_scalar(self, data: object) -> TypeGuard[FloatLike]: TypeGuard[FloatLike] True if the input is a valid scalar value, False otherwise. 
""" + if isinstance(data, str): + # Only accept strings that are valid float representations (e.g. "NaN", "inf"). + # Plain strings that cannot be converted should return False so that cast_scalar + # raises TypeError rather than a confusing ValueError. + try: + self.to_native_dtype().type(data) + except (ValueError, OverflowError): + return False + else: + return True return isinstance(data, FloatLike) def _cast_scalar_unchecked(self, data: FloatLike) -> TFloatScalar_co: diff --git a/src/zarr/core/dtype/npy/int.py b/src/zarr/core/dtype/npy/int.py index f71f535abb..e5007b7acb 100644 --- a/src/zarr/core/dtype/npy/int.py +++ b/src/zarr/core/dtype/npy/int.py @@ -1070,6 +1070,28 @@ class UInt32(BaseInt[np.dtypes.UInt32DType, np.uint32], HasEndianness): _zarr_v3_name: ClassVar[Literal["uint32"]] = "uint32" _zarr_v2_names: ClassVar[tuple[Literal[">u4"], Literal["u4", " TypeGuard[np.dtypes.UInt32DType]: + """ + A type guard that checks if the input is assignable to the type of ``cls.dtype_class`` + + This method is overridden for this particular data type because of a Windows-specific issue + where ``np.array([1], dtype=np.uint32) & 1`` creates an instance of ``np.dtypes.UIntDType``, + rather than an instance of ``np.dtypes.UInt32DType``, even though both represent 32-bit + unsigned integers. (In contrast to ``np.dtype('i')``, ``np.dtype('u')`` raises an error.) + + Parameters + ---------- + dtype : TDType + The dtype to check. + + Returns + ------- + Bool + True if the dtype matches, False otherwise. 
+ """ + return super()._check_native_dtype(dtype) or dtype == np.dtypes.UInt32DType() + @classmethod def from_native_dtype(cls, dtype: TBaseDType) -> Self: """ diff --git a/src/zarr/core/group.py b/src/zarr/core/group.py index 9b5fee275b..17b8b541b1 100644 --- a/src/zarr/core/group.py +++ b/src/zarr/core/group.py @@ -40,7 +40,7 @@ ZATTRS_JSON, ZGROUP_JSON, ZMETADATA_V2_JSON, - DimensionNames, + DimensionNamesLike, NodeType, ShapeLike, ZarrFormat, @@ -1032,7 +1032,7 @@ async def create_array( order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -1588,12 +1588,17 @@ async def array_values( async for _, array in self.arrays(): yield array - async def tree(self, expand: bool | None = None, level: int | None = None) -> Any: + async def tree( + self, + expand: bool | None = None, + level: int | None = None, + *, + max_nodes: int = 500, + plain: bool = False, + ) -> Any: """ Return a tree-like representation of a hierarchy. - This requires the optional ``rich`` dependency. - Parameters ---------- expand : bool, optional @@ -1601,6 +1606,12 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An it's used. level : int, optional The maximum depth below this Group to display in the tree. + max_nodes : int + Maximum number of nodes to display before truncating. Default is 500. + plain : bool, optional + If True, return a plain-text tree without ANSI styling. This is + useful when the output will be consumed by an LLM or written to a + file. Default is False. 
Returns ------- @@ -1611,7 +1622,7 @@ async def tree(self, expand: bool | None = None, level: int | None = None) -> An if expand is not None: raise NotImplementedError("'expand' is not yet implemented.") - return await group_tree_async(self, max_depth=level) + return await group_tree_async(self, max_depth=level, max_nodes=max_nodes, plain=plain) async def empty(self, *, name: str, shape: tuple[int, ...], **kwargs: Any) -> AnyAsyncArray: """Create an empty array with the specified shape in this Group. The contents will @@ -2371,12 +2382,17 @@ def array_values(self) -> Generator[AnyArray, None]: for _, array in self.arrays(): yield array - def tree(self, expand: bool | None = None, level: int | None = None) -> Any: + def tree( + self, + expand: bool | None = None, + level: int | None = None, + *, + max_nodes: int = 500, + plain: bool = False, + ) -> Any: """ Return a tree-like representation of a hierarchy. - This requires the optional ``rich`` dependency. - Parameters ---------- expand : bool, optional @@ -2384,13 +2400,21 @@ def tree(self, expand: bool | None = None, level: int | None = None) -> Any: it's used. level : int, optional The maximum depth below this Group to display in the tree. + max_nodes : int + Maximum number of nodes to display before truncating. Default is 500. + plain : bool, optional + If True, return a plain-text tree without ANSI styling. This is + useful when the output will be consumed by an LLM or written to a + file. Default is False. Returns ------- TreeRepr A pretty-printable object displaying the hierarchy. """ - return self._sync(self._async_group.tree(expand=expand, level=level)) + return self._sync( + self._async_group.tree(expand=expand, level=level, max_nodes=max_nodes, plain=plain) + ) def create_group(self, name: str, **kwargs: Any) -> Group: """Create a sub-group. 
@@ -2459,7 +2483,7 @@ def create( order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -2603,7 +2627,7 @@ def create_array( order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, @@ -3001,7 +3025,7 @@ def array( order: MemoryOrder | None = None, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, storage_options: dict[str, Any] | None = None, overwrite: bool = False, config: ArrayConfigLike | None = None, diff --git a/src/zarr/core/metadata/v3.py b/src/zarr/core/metadata/v3.py index 5ce155bd9a..2a5da50c7b 100644 --- a/src/zarr/core/metadata/v3.py +++ b/src/zarr/core/metadata/v3.py @@ -33,7 +33,7 @@ from zarr.core.common import ( JSON, ZARR_JSON, - DimensionNames, + DimensionNamesLike, NamedConfig, parse_named_configuration, parse_shapelike, @@ -220,7 +220,7 @@ def __init__( fill_value: object, codecs: Iterable[Codec | dict[str, JSON] | NamedConfig[str, Any] | str], attributes: dict[str, JSON] | None, - dimension_names: DimensionNames, + dimension_names: DimensionNamesLike, storage_transformers: Iterable[dict[str, JSON]] | None = None, extra_fields: Mapping[str, AllowedExtraField] | None = None, ) -> None: diff --git a/src/zarr/storage/_common.py b/src/zarr/storage/_common.py index 4bea04f024..08c05864aa 100644 --- a/src/zarr/storage/_common.py +++ b/src/zarr/storage/_common.py @@ -5,7 
+5,13 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, Self, TypeAlias -from zarr.abc.store import ByteRequest, Store +from zarr.abc.store import ( + ByteRequest, + Store, + SupportsDeleteSync, + SupportsGetSync, + SupportsSetSync, +) from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.common import ( ANY_ACCESS_MODE, @@ -228,6 +234,37 @@ async def is_empty(self) -> bool: """ return await self.store.is_empty(self.path) + # ------------------------------------------------------------------- + # Synchronous IO delegation + # ------------------------------------------------------------------- + + def get_sync( + self, + *, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + """Synchronous read — delegates to ``self.store.get_sync(self.path, ...)``.""" + if not isinstance(self.store, SupportsGetSync): + raise TypeError(f"Store {type(self.store).__name__} does not support synchronous get.") + if prototype is None: + prototype = default_buffer_prototype() + return self.store.get_sync(self.path, prototype=prototype, byte_range=byte_range) + + def set_sync(self, value: Buffer) -> None: + """Synchronous write — delegates to ``self.store.set_sync(self.path, value)``.""" + if not isinstance(self.store, SupportsSetSync): + raise TypeError(f"Store {type(self.store).__name__} does not support synchronous set.") + self.store.set_sync(self.path, value) + + def delete_sync(self) -> None: + """Synchronous delete — delegates to ``self.store.delete_sync(self.path)``.""" + if not isinstance(self.store, SupportsDeleteSync): + raise TypeError( + f"Store {type(self.store).__name__} does not support synchronous delete." 
+ ) + self.store.delete_sync(self.path) + def __truediv__(self, other: str) -> StorePath: """Combine this store path with another path""" return self.__class__(self.store, _dereference_path(self.path, other)) diff --git a/src/zarr/storage/_local.py b/src/zarr/storage/_local.py index 80233a112d..aa6271db81 100644 --- a/src/zarr/storage/_local.py +++ b/src/zarr/storage/_local.py @@ -187,6 +187,56 @@ def __repr__(self) -> str: def __eq__(self, other: object) -> bool: return isinstance(other, type(self)) and self.root == other.root + # ------------------------------------------------------------------- + # Synchronous store methods + # ------------------------------------------------------------------- + + def _ensure_open_sync(self) -> None: + if not self._is_open: + if not self.read_only: + self.root.mkdir(parents=True, exist_ok=True) + if not self.root.exists(): + raise FileNotFoundError(f"{self.root} does not exist") + self._is_open = True + + def get_sync( + self, + key: str, + *, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + if prototype is None: + prototype = default_buffer_prototype() + self._ensure_open_sync() + assert isinstance(key, str) + path = self.root / key + try: + return _get(path, prototype, byte_range) + except (FileNotFoundError, IsADirectoryError, NotADirectoryError): + return None + + def set_sync(self, key: str, value: Buffer) -> None: + self._ensure_open_sync() + self._check_writable() + assert isinstance(key, str) + if not isinstance(value, Buffer): + raise TypeError( + f"LocalStore.set(): `value` must be a Buffer instance. " + f"Got an instance of {type(value)} instead." 
+ ) + path = self.root / key + _put(path, value) + + def delete_sync(self, key: str) -> None: + self._ensure_open_sync() + self._check_writable() + path = self.root / key + if path.is_dir(): + shutil.rmtree(path) + else: + path.unlink(missing_ok=True) + async def get( self, key: str, @@ -290,11 +340,14 @@ async def list(self) -> AsyncIterator[str]: async def list_prefix(self, prefix: str) -> AsyncIterator[str]: # docstring inherited + # Use string prefix matching to be consistent with MemoryStore behavior. + # The prefix should match keys as strings, not as filesystem paths. to_strip = self.root.as_posix() + "/" - prefix = prefix.rstrip("/") - for p in (self.root / prefix).rglob("*"): + for p in list(self.root.rglob("*")): if p.is_file(): - yield p.as_posix().replace(to_strip, "") + key = p.as_posix().replace(to_strip, "") + if key.startswith(prefix): + yield key async def list_dir(self, prefix: str) -> AsyncIterator[str]: # docstring inherited diff --git a/src/zarr/storage/_memory.py b/src/zarr/storage/_memory.py index e6f9b7a512..1194894b9d 100644 --- a/src/zarr/storage/_memory.py +++ b/src/zarr/storage/_memory.py @@ -77,6 +77,49 @@ def __eq__(self, other: object) -> bool: and self.read_only == other.read_only ) + # ------------------------------------------------------------------- + # Synchronous store methods + # ------------------------------------------------------------------- + + def get_sync( + self, + key: str, + *, + prototype: BufferPrototype | None = None, + byte_range: ByteRequest | None = None, + ) -> Buffer | None: + if prototype is None: + prototype = default_buffer_prototype() + if not self._is_open: + self._is_open = True + assert isinstance(key, str) + try: + value = self._store_dict[key] + start, stop = _normalize_byte_range_index(value, byte_range) + return prototype.buffer.from_buffer(value[start:stop]) + except KeyError: + return None + + def set_sync(self, key: str, value: Buffer) -> None: + self._check_writable() + if not self._is_open: + 
self._is_open = True + assert isinstance(key, str) + if not isinstance(value, Buffer): + raise TypeError( + f"MemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." + ) + self._store_dict[key] = value + + def delete_sync(self, key: str) -> None: + self._check_writable() + if not self._is_open: + self._is_open = True + try: + del self._store_dict[key] + except KeyError: + logger.debug("Key %s does not exist.", key) + async def get( self, key: str, @@ -122,7 +165,6 @@ async def set(self, key: str, value: Buffer, byte_range: tuple[int, int] | None raise TypeError( f"MemoryStore.set(): `value` must be a Buffer instance. Got an instance of {type(value)} instead." ) - if byte_range is not None: buf = self._store_dict[key] buf[byte_range[0] : byte_range[1]] = value diff --git a/src/zarr/testing/store.py b/src/zarr/testing/store.py index 1b8e85ed98..ce83715b86 100644 --- a/src/zarr/testing/store.py +++ b/src/zarr/testing/store.py @@ -11,7 +11,6 @@ if TYPE_CHECKING: from typing import Any - from zarr.abc.store import ByteRequest from zarr.core.buffer.core import BufferPrototype import pytest @@ -22,6 +21,9 @@ RangeByteRequest, Store, SuffixByteRequest, + SupportsDeleteSync, + SupportsGetSync, + SupportsSetSync, ) from zarr.core.buffer import Buffer, default_buffer_prototype from zarr.core.sync import _collect_aiterator, sync @@ -39,6 +41,27 @@ class StoreTests(Generic[S, B]): store_cls: type[S] buffer_cls: type[B] + @staticmethod + def _require_get_sync(store: S) -> SupportsGetSync: + """Skip unless *store* implements :class:`SupportsGetSync`.""" + if not isinstance(store, SupportsGetSync): + pytest.skip("store does not implement SupportsGetSync") + return store # type: ignore[unreachable] + + @staticmethod + def _require_set_sync(store: S) -> SupportsSetSync: + """Skip unless *store* implements :class:`SupportsSetSync`.""" + if not isinstance(store, SupportsSetSync): + pytest.skip("store does not implement SupportsSetSync") + 
return store # type: ignore[unreachable] + + @staticmethod + def _require_delete_sync(store: S) -> SupportsDeleteSync: + """Skip unless *store* implements :class:`SupportsDeleteSync`.""" + if not isinstance(store, SupportsDeleteSync): + pytest.skip("store does not implement SupportsDeleteSync") + return store # type: ignore[unreachable] + @abstractmethod async def set(self, store: S, key: str, value: Buffer) -> None: """ @@ -579,6 +602,52 @@ def test_get_json_sync(self, store: S) -> None: sync(self.set(store, key, self.buffer_cls.from_bytes(data_bytes))) assert store._get_json_sync(key, prototype=default_buffer_prototype()) == data + # ------------------------------------------------------------------- + # Synchronous store methods (SupportsSyncStore protocol) + # ------------------------------------------------------------------- + + def test_get_sync(self, store: S) -> None: + getter = self._require_get_sync(store) + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "sync_get" + sync(self.set(store, key, data_buf)) + result = getter.get_sync(key) + assert result is not None + assert_bytes_equal(result, data_buf) + + def test_get_sync_missing(self, store: S) -> None: + getter = self._require_get_sync(store) + result = getter.get_sync("nonexistent") + assert result is None + + def test_set_sync(self, store: S) -> None: + setter = self._require_set_sync(store) + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "sync_set" + setter.set_sync(key, data_buf) + result = sync(self.get(store, key)) + assert_bytes_equal(result, data_buf) + + def test_delete_sync(self, store: S) -> None: + setter = self._require_set_sync(store) + deleter = self._require_delete_sync(store) + getter = self._require_get_sync(store) + if not store.supports_deletes: + pytest.skip("store does not support deletes") + data_buf = self.buffer_cls.from_bytes(b"\x01\x02\x03\x04") + key = "sync_delete" + setter.set_sync(key, data_buf) + deleter.delete_sync(key) + result 
= getter.get_sync(key) + assert result is None + + def test_delete_sync_missing(self, store: S) -> None: + deleter = self._require_delete_sync(store) + if not store.supports_deletes: + pytest.skip("store does not support deletes") + # should not raise + deleter.delete_sync("nonexistent_sync") + class LatencyStore(WrapperStore[Store]): """ diff --git a/tests/conftest.py b/tests/conftest.py index 23a1e87d0a..86db02f6bf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -25,7 +25,7 @@ from zarr.core.chunk_grids import RegularChunkGrid, _auto_partition from zarr.core.common import ( JSON, - DimensionNames, + DimensionNamesLike, MemoryOrder, ShapeLike, ZarrFormat, @@ -313,7 +313,7 @@ def create_array_metadata( zarr_format: ZarrFormat, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, ) -> ArrayV2Metadata | ArrayV3Metadata: """ Create array metadata @@ -452,7 +452,7 @@ def meta_from_array( zarr_format: ZarrFormat = 3, attributes: dict[str, JSON] | None = None, chunk_key_encoding: ChunkKeyEncoding | ChunkKeyEncodingLike | None = None, - dimension_names: DimensionNames = None, + dimension_names: DimensionNamesLike = None, ) -> ArrayV3Metadata | ArrayV2Metadata: """ Create array metadata from an array diff --git a/tests/test_api.py b/tests/test_api.py index 07c3c8590d..a306ff3dc3 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -599,7 +599,6 @@ def test_load_local(tmp_path: Path, path: str | None, load_read_only: bool) -> N def test_tree() -> None: - pytest.importorskip("rich") g1 = zarr.group() g1.create_group("foo") g3 = g1.create_group("bar") diff --git a/tests/test_chunk_grids.py b/tests/test_chunk_grids.py index 4c69c483ae..2920b5d6f3 100644 --- a/tests/test_chunk_grids.py +++ b/tests/test_chunk_grids.py @@ -35,6 +35,10 @@ def test_guess_chunks(shape: tuple[int, ...], itemsize: int) -> None: ((30, 
None, None), (100, 20, 10), 1, (30, 20, 10)), ((30, 20, None), (100, 20, 10), 1, (30, 20, 10)), ((30, 20, 10), (100, 20, 10), 1, (30, 20, 10)), + # dask-style chunks (uniform with optional smaller final chunk) + (((100, 100, 100), (50, 50)), (300, 100), 1, (100, 50)), + (((100, 100, 50),), (250,), 1, (100,)), + (((100,),), (100,), 1, (100,)), # auto chunking (None, (100,), 1, (100,)), (-1, (100,), 1, (100,)), @@ -52,3 +56,8 @@ def test_normalize_chunks_errors() -> None: normalize_chunks("foo", (100,), 1) with pytest.raises(ValueError): normalize_chunks((100, 10), (100,), 1) + # dask-style irregular chunks should raise + with pytest.raises(ValueError, match="Irregular chunk sizes"): + normalize_chunks(((10, 20, 30),), (60,), 1) + with pytest.raises(ValueError, match="Irregular chunk sizes"): + normalize_chunks(((100, 100), (10, 20)), (200, 30), 1) diff --git a/tests/test_codecs/test_blosc.py b/tests/test_codecs/test_blosc.py index 6f4821f8b1..0201beb8de 100644 --- a/tests/test_codecs/test_blosc.py +++ b/tests/test_codecs/test_blosc.py @@ -6,11 +6,12 @@ from packaging.version import Version import zarr +from zarr.abc.codec import SupportsSyncCodec from zarr.codecs import BloscCodec from zarr.codecs.blosc import BloscShuffle, Shuffle -from zarr.core.array_spec import ArraySpec +from zarr.core.array_spec import ArrayConfig, ArraySpec from zarr.core.buffer import default_buffer_prototype -from zarr.core.dtype import UInt16 +from zarr.core.dtype import UInt16, get_data_type_from_native_dtype from zarr.storage import MemoryStore, StorePath @@ -110,3 +111,27 @@ async def test_typesize() -> None: else: expected_size = 10216 assert size == expected_size, msg + + +def test_blosc_codec_supports_sync() -> None: + assert isinstance(BloscCodec(), SupportsSyncCodec) + + +def test_blosc_codec_sync_roundtrip() -> None: + codec = BloscCodec(typesize=8) + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, 
+ dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) diff --git a/tests/test_codecs/test_crc32c.py b/tests/test_codecs/test_crc32c.py new file mode 100644 index 0000000000..3ab1070f60 --- /dev/null +++ b/tests/test_codecs/test_crc32c.py @@ -0,0 +1,33 @@ +from __future__ import annotations + +import numpy as np + +from zarr.abc.codec import SupportsSyncCodec +from zarr.codecs.crc32c_ import Crc32cCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import default_buffer_prototype +from zarr.core.dtype import get_data_type_from_native_dtype + + +def test_crc32c_codec_supports_sync() -> None: + assert isinstance(Crc32cCodec(), SupportsSyncCodec) + + +def test_crc32c_codec_sync_roundtrip() -> None: + codec = Crc32cCodec() + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) diff --git a/tests/test_codecs/test_endian.py b/tests/test_codecs/test_endian.py index ab64afb1b8..c505cee828 100644 --- a/tests/test_codecs/test_endian.py +++ b/tests/test_codecs/test_endian.py @@ -4,8 +4,12 @@ import pytest import zarr 
+from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import BytesCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import NDBuffer, default_buffer_prototype +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy @@ -33,6 +37,31 @@ async def test_endian(store: Store, endian: Literal["big", "little"]) -> None: assert np.array_equal(data, readback_data) +def test_bytes_codec_supports_sync() -> None: + assert isinstance(BytesCodec(), SupportsSyncCodec) + + +def test_bytes_codec_sync_roundtrip() -> None: + codec = BytesCodec() + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) + + codec = codec.evolve_from_array_spec(spec) + + encoded = codec._encode_sync(nd_buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + np.testing.assert_array_equal(arr, decoded.as_numpy_array()) + + @pytest.mark.filterwarnings("ignore:The endianness of the requested serializer") @pytest.mark.parametrize("store", ["local", "memory"], indirect=["store"]) @pytest.mark.parametrize("dtype_input_endian", [">u2", " None: a[:, :] = data assert np.array_equal(data, a[:, :]) + + +def test_gzip_codec_supports_sync() -> None: + assert isinstance(GzipCodec(), SupportsSyncCodec) + + +def test_gzip_codec_sync_roundtrip() -> None: + codec = GzipCodec(level=1) + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + 
prototype=default_buffer_prototype(), + ) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) diff --git a/tests/test_codecs/test_sharding.py b/tests/test_codecs/test_sharding.py index d0e2d09b7c..d7cbeb5bdb 100644 --- a/tests/test_codecs/test_sharding.py +++ b/tests/test_codecs/test_sharding.py @@ -490,7 +490,8 @@ def test_invalid_shard_shape() -> None: with pytest.raises( ValueError, match=re.escape( - "The array's `chunk_shape` (got (16, 16)) needs to be divisible by the shard's inner `chunk_shape` (got (9,))." + "The array's `chunk_shape` (got (16, 16)) needs to be divisible " + "by the shard's inner `chunk_shape` (got (9,))." ), ): zarr.create_array( @@ -501,3 +502,56 @@ def test_invalid_shard_shape() -> None: dtype=np.dtype("uint8"), fill_value=0, ) + + +@pytest.mark.parametrize("store", ["local"], indirect=["store"]) +def test_sharding_mixed_integer_list_indexing(store: Store) -> None: + """Regression test for https://github.com/zarr-developers/zarr-python/issues/3691. + + Mixed integer/list indexing on sharded arrays should return the same + shape and data as on equivalent chunked arrays. 
+ """ + import numpy as np + + data = np.arange(200 * 100 * 10, dtype=np.uint8).reshape(200, 100, 10) + + chunked = zarr.create_array( + store, + name="chunked", + shape=(200, 100, 10), + dtype=np.uint8, + chunks=(200, 100, 1), + overwrite=True, + ) + chunked[:, :, :] = data + + sharded = zarr.create_array( + store, + name="sharded", + shape=(200, 100, 10), + dtype=np.uint8, + chunks=(200, 100, 1), + shards=(200, 100, 10), + overwrite=True, + ) + sharded[:, :, :] = data + + # Mixed integer + list indexing + c = chunked[0:10, 0, [0, 1]] # type: ignore[index] + s = sharded[0:10, 0, [0, 1]] # type: ignore[index] + assert c.shape == s.shape == (10, 2), ( # type: ignore[union-attr] + f"Expected (10, 2), got chunked={c.shape}, sharded={s.shape}" # type: ignore[union-attr] + ) + np.testing.assert_array_equal(c, s) + + # Multiple integer axes + c2 = chunked[0, 0, [0, 1, 2]] # type: ignore[index] + s2 = sharded[0, 0, [0, 1, 2]] # type: ignore[index] + assert c2.shape == s2.shape == (3,) # type: ignore[union-attr] + np.testing.assert_array_equal(c2, s2) + + # Slice + integer + slice + c3 = chunked[0:5, 1, 0:3] + s3 = sharded[0:5, 1, 0:3] + assert c3.shape == s3.shape == (5, 3) # type: ignore[union-attr] + np.testing.assert_array_equal(c3, s3) diff --git a/tests/test_codecs/test_transpose.py b/tests/test_codecs/test_transpose.py index 06ec668ad3..949bb72a62 100644 --- a/tests/test_codecs/test_transpose.py +++ b/tests/test_codecs/test_transpose.py @@ -3,9 +3,13 @@ import zarr from zarr import AsyncArray, config +from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import TransposeCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import NDBuffer, default_buffer_prototype from zarr.core.common import MemoryOrder +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath from .test_codecs import _AsyncArrayProxy @@ -93,3 +97,27 @@ def test_transpose_invalid( 
chunk_key_encoding={"name": "v2", "separator": "."}, filters=[TransposeCodec(order=order)], # type: ignore[arg-type] ) + + +def test_transpose_codec_supports_sync() -> None: + assert isinstance(TransposeCodec(order=(0, 1)), SupportsSyncCodec) + + +def test_transpose_codec_sync_roundtrip() -> None: + codec = TransposeCodec(order=(1, 0)) + arr = np.arange(12, dtype="float64").reshape(3, 4) + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + nd_buf: NDBuffer = default_buffer_prototype().nd_buffer.from_numpy_array(arr) + + encoded = codec._encode_sync(nd_buf, spec) + assert encoded is not None + resolved_spec = codec.resolve_metadata(spec) + decoded = codec._decode_sync(encoded, resolved_spec) + np.testing.assert_array_equal(arr, decoded.as_numpy_array()) diff --git a/tests/test_codecs/test_vlen.py b/tests/test_codecs/test_vlen.py index cf0905daca..f3445824b3 100644 --- a/tests/test_codecs/test_vlen.py +++ b/tests/test_codecs/test_vlen.py @@ -5,9 +5,10 @@ import zarr from zarr import Array -from zarr.abc.codec import Codec +from zarr.abc.codec import Codec, SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.codecs.vlen_utf8 import VLenBytesCodec, VLenUTF8Codec from zarr.core.dtype import get_data_type_from_native_dtype from zarr.core.dtype.npy.string import _NUMPY_SUPPORTS_VLEN_STRING from zarr.core.metadata.v3 import ArrayV3Metadata @@ -62,3 +63,11 @@ def test_vlen_string( assert np.array_equal(data, b[:, :]) assert b.metadata.data_type == get_data_type_from_native_dtype(data.dtype) assert a.dtype == data.dtype + + +def test_vlen_utf8_codec_supports_sync() -> None: + assert isinstance(VLenUTF8Codec(), SupportsSyncCodec) + + +def test_vlen_bytes_codec_supports_sync() -> None: + assert isinstance(VLenBytesCodec(), SupportsSyncCodec) 
diff --git a/tests/test_codecs/test_zstd.py b/tests/test_codecs/test_zstd.py index 6068f53443..3f3f15a41a 100644 --- a/tests/test_codecs/test_zstd.py +++ b/tests/test_codecs/test_zstd.py @@ -2,8 +2,12 @@ import pytest import zarr +from zarr.abc.codec import SupportsSyncCodec from zarr.abc.store import Store from zarr.codecs import ZstdCodec +from zarr.core.array_spec import ArrayConfig, ArraySpec +from zarr.core.buffer import default_buffer_prototype +from zarr.core.dtype import get_data_type_from_native_dtype from zarr.storage import StorePath @@ -23,3 +27,27 @@ def test_zstd(store: Store, checksum: bool) -> None: a[:, :] = data assert np.array_equal(data, a[:, :]) + + +def test_zstd_codec_supports_sync() -> None: + assert isinstance(ZstdCodec(), SupportsSyncCodec) + + +def test_zstd_codec_sync_roundtrip() -> None: + codec = ZstdCodec(level=1) + arr = np.arange(100, dtype="float64") + zdtype = get_data_type_from_native_dtype(arr.dtype) + spec = ArraySpec( + shape=arr.shape, + dtype=zdtype, + fill_value=zdtype.cast_scalar(0), + config=ArrayConfig(order="C", write_empty_chunks=True), + prototype=default_buffer_prototype(), + ) + buf = default_buffer_prototype().buffer.from_array_like(arr.view("B")) + + encoded = codec._encode_sync(buf, spec) + assert encoded is not None + decoded = codec._decode_sync(encoded, spec) + result = np.frombuffer(decoded.as_numpy_array(), dtype="float64") + np.testing.assert_array_equal(arr, result) diff --git a/tests/test_dtype/test_npy/test_float.py b/tests/test_dtype/test_npy/test_float.py index 1bbcbbc81f..8d8e768263 100644 --- a/tests/test_dtype/test_npy/test_float.py +++ b/tests/test_dtype/test_npy/test_float.py @@ -65,7 +65,10 @@ class TestFloat16(_BaseTestFloat): (Float16(), -1.0, np.float16(-1.0)), (Float16(), "NaN", np.float16("NaN")), ) - invalid_scalar_params = ((Float16(), {"set!"}),) + invalid_scalar_params = ( + (Float16(), {"set!"}), + (Float16(), "not_a_float"), + ) hex_string_params = (("0x7fc0", np.nan), ("0x7fc1", 
np.nan), ("0x3c00", 1.0)) item_size_params = (Float16(),) @@ -113,7 +116,10 @@ class TestFloat32(_BaseTestFloat): (Float32(), -1.0, np.float32(-1.0)), (Float32(), "NaN", np.float32("NaN")), ) - invalid_scalar_params = ((Float32(), {"set!"}),) + invalid_scalar_params = ( + (Float32(), {"set!"}), + (Float32(), "not_a_float"), + ) hex_string_params = (("0x7fc00000", np.nan), ("0x7fc00001", np.nan), ("0x3f800000", 1.0)) item_size_params = (Float32(),) @@ -160,7 +166,10 @@ class TestFloat64(_BaseTestFloat): (Float64(), -1.0, np.float64(-1.0)), (Float64(), "NaN", np.float64("NaN")), ) - invalid_scalar_params = ((Float64(), {"set!"}),) + invalid_scalar_params = ( + (Float64(), {"set!"}), + (Float64(), "not_a_float"), + ) hex_string_params = ( ("0x7ff8000000000000", np.nan), ("0x7ff8000000000001", np.nan), diff --git a/tests/test_dtype/test_npy/test_int.py b/tests/test_dtype/test_npy/test_int.py index f53ec7f5ae..9eab053080 100644 --- a/tests/test_dtype/test_npy/test_int.py +++ b/tests/test_dtype/test_npy/test_int.py @@ -216,7 +216,16 @@ class TestUInt16(BaseTestZDType): class TestUInt32(BaseTestZDType): test_cls = UInt32 scalar_type = np.uint32 - valid_dtype = (np.dtype(">u4"), np.dtype("u4"), np.dtype(" Buffer | None: + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__getitem__", key_suffix] += 1 + return super().get_sync(key, prototype=prototype, byte_range=byte_range) + + def set_sync(self, key: str, value: Buffer) -> None: + key_suffix = "/".join(key.split("/")[1:]) + self.counter["__setitem__", key_suffix] += 1 + return super().set_sync(key, value) + def test_normalize_integer_selection() -> None: assert 1 == normalize_integer_selection(1, 100) diff --git a/tests/test_tree.py b/tests/test_tree.py index b4a5106998..78ea121f4d 100644 --- a/tests/test_tree.py +++ b/tests/test_tree.py @@ -1,4 +1,3 @@ -import os import textwrap from typing import Any @@ -6,12 +5,19 @@ import zarr -pytest.importorskip("rich") - @pytest.mark.parametrize("root_name", [None, 
"root"]) -def test_tree(root_name: Any) -> None: - os.environ["OVERRIDE_COLOR_SYSTEM"] = "truecolor" +@pytest.mark.parametrize("atty", [True, False]) +@pytest.mark.parametrize("plain", [True, False]) +def test_tree(root_name: Any, atty: bool, plain: bool, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdout.isatty", lambda: atty) + + if atty and not plain: + BOPEN = "\x1b[1m" + BCLOSE = "\x1b[0m" + else: + BOPEN = "" + BCLOSE = "" g = zarr.group(path=root_name) A = g.create_group("A") @@ -25,12 +31,9 @@ def test_tree(root_name: Any) -> None: C.create_array(name="x", shape=(0,), dtype="float64") D.create_array(name="x", shape=(0,), dtype="float64") - result = repr(g.tree()) + result = repr(g.tree(plain=plain)) root = root_name or "" - BOPEN = "\x1b[1m" - BCLOSE = "\x1b[0m" - expected = textwrap.dedent(f"""\ {BOPEN}/{root}{BCLOSE} ├── {BOPEN}A{BCLOSE} @@ -46,15 +49,54 @@ def test_tree(root_name: Any) -> None: assert result == expected - result = repr(g.tree(level=0)) + result = repr(g.tree(level=0, plain=plain)) expected = textwrap.dedent(f"""\ {BOPEN}/{root}{BCLOSE} ├── {BOPEN}A{BCLOSE} └── {BOPEN}B{BCLOSE} """) - assert result == expected + if not plain: + tree = g.tree(plain=False) + bundle = tree._repr_mimebundle_() + assert "text/plain" in bundle + assert "text/html" in bundle + assert "A" in bundle["text/html"] + assert "x" in bundle["text/html"] + assert " None: + g = zarr.group() + g.create_group("a") + g.create_group("b") + g.create_group("c") + g.create_group("d") + g.create_group("e") + + result = repr(g.tree(max_nodes=3, plain=True)) + assert "Truncated at max_nodes=3" in result + # Should show exactly 3 nodes (lines with ── connectors). + lines = result.strip().split("\n") + node_lines = [line for line in lines if "──" in line] + assert len(node_lines) == 3 + + # Full tree should not show truncation message. 
+ full = repr(g.tree(max_nodes=500, plain=True)) + assert "truncated" not in full + + +def test_tree_html_escaping() -> None: + g = zarr.group() + g.create_group("") + + tree = g.tree() + bundle = tree._repr_mimebundle_() + assert "<img" in bundle["text/html"] + assert "" in bundle["text/plain"] + def test_expand_not_implemented() -> None: g = zarr.group()