-
Notifications
You must be signed in to change notification settings - Fork 63
Updates zarr-parser to use obstore list_async instead of concurrent_map
#892
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
aa93b8b
37dff68
2fa25a7
626d0b9
9d6a312
bab147d
17e35cc
6cbb7c0
b400a34
19122a7
e22981f
fda8ce6
bbd6a1f
9cba9e8
f50b724
1be91cc
4ed8295
9114613
31c8ed0
e0ddfc2
d96d5c5
716a0bb
7e76088
5df7705
08232a8
3d7ebfc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,5 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import re | ||
| from collections.abc import ( | ||
| Callable, | ||
|
|
@@ -8,13 +10,16 @@ | |
| ValuesView, | ||
| ) | ||
| from pathlib import PosixPath | ||
| from typing import Any, NewType, TypedDict, cast | ||
| from typing import TYPE_CHECKING, Any, NewType, TypedDict, cast | ||
|
|
||
| import numpy as np | ||
|
|
||
| from virtualizarr.manifests.utils import construct_chunk_pattern, parse_manifest_index | ||
| from virtualizarr.types import ChunkKey | ||
|
|
||
| if TYPE_CHECKING: | ||
| import pyarrow as pa # type: ignore[import-untyped,import-not-found] | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd strongly suggest not tying to pyarrow
|
||
|
|
||
| # doesn't guarantee that writers actually handle these | ||
| VALID_URI_PREFIXES = { | ||
| "s3://", | ||
|
|
@@ -322,6 +327,55 @@ def from_arrays( | |
|
|
||
| return obj | ||
|
|
||
| @classmethod | ||
| def _from_arrow( | ||
| cls, | ||
| *, | ||
| paths: "pa.StringArray", | ||
| offsets: "pa.UInt64Array", | ||
| lengths: "pa.UInt64Array", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest typing these as Then this API will automatically support any arrow input, including polars, duckdb, arro3, etc apache/arrow#39195 (comment) |
||
| shape: tuple[int, ...], | ||
| ) -> "ChunkManifest": | ||
| """ | ||
| Create a ChunkManifest from flat 1D PyArrow arrays. | ||
|
|
||
| Avoids intermediate Python dicts by converting Arrow arrays directly | ||
| to the numpy arrays used internally by ChunkManifest. | ||
|
|
||
| Parameters | ||
| ---------- | ||
| paths | ||
| Full paths to chunks, as a PyArrow StringArray. Nulls represent missing chunks. | ||
| offsets | ||
| Byte offsets of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks. | ||
| lengths | ||
| Byte lengths of chunks, as a PyArrow UInt64Array. Nulls represent missing chunks. | ||
| shape | ||
| Shape to reshape the flat arrays into. | ||
| """ | ||
| import pyarrow as pa # type: ignore[import-untyped,import-not-found] | ||
| import pyarrow.compute as pc # type: ignore[import-untyped,import-not-found] | ||
|
|
||
| arrow_paths = pc.if_else(pc.is_null(paths), "", paths) | ||
| arrow_offsets = pc.if_else( | ||
| pc.is_null(offsets), pa.scalar(0, pa.uint64()), offsets | ||
| ) | ||
| arrow_lengths = pc.if_else( | ||
| pc.is_null(lengths), pa.scalar(0, pa.uint64()), lengths | ||
| ) | ||
|
Comment on lines
+359
to
+365
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Requiring a pyarrow dependency just for these three lines is not worth it IMO. Much better to just document that the users must remove any null values before passing in arguments.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And then you can probably use arro3-core for all your needs and save the big pyarrow dependency. |
||
|
|
||
| np_paths = arrow_paths.to_numpy(zero_copy_only=False).astype( | ||
| np.dtypes.StringDType() | ||
| ) | ||
| np_offsets = arrow_offsets.to_numpy(zero_copy_only=False) | ||
| np_lengths = arrow_lengths.to_numpy(zero_copy_only=False) | ||
|
|
||
| return cls.from_arrays( | ||
| paths=np_paths.reshape(shape), | ||
| offsets=np_offsets.reshape(shape), | ||
| lengths=np_lengths.reshape(shape), | ||
| ) | ||
|
|
||
| @property | ||
| def ndim_chunk_grid(self) -> int: | ||
| """ | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Feel free to disregard, but ideally you shouldn't need to depend on both dependencies. Pyarrow is very large. I looked recently and it looks like it's gotten even larger
50MB compressed is huge.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah thanks for the feedback @kylebarron. Good to know about pyarrow.