Skip to content

Commit 91362de

Browse files
authored
Support big-endian HDF5 and NetCDF3 files (#640)
* Support big-endian HDF5 and NetCDF3 files * Mark test as requires_kerchunk
1 parent 62ca069 commit 91362de

5 files changed

Lines changed: 31 additions & 35 deletions

File tree

virtualizarr/codecs.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import zarr
55
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
66
from zarr.abc.codec import Codec as ZarrCodec
7+
from zarr.codecs import BytesCodec
78
from zarr.core.codec_pipeline import BatchedCodecPipeline
89
from zarr.core.metadata.v3 import ArrayV3Metadata
910

@@ -64,7 +65,6 @@ def convert_to_codec_pipeline(
6465
-------
6566
BatchedCodecPipeline
6667
"""
67-
from zarr.core.array import _get_default_chunk_encoding_v3
6868
from zarr.registry import get_codec_class
6969

7070
zarr_codecs: tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] = ()
@@ -78,7 +78,10 @@ def convert_to_codec_pipeline(
7878
arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs)
7979

8080
if arraybytes_codec is None:
81-
arraybytes_codec = _get_default_chunk_encoding_v3(dtype)[1]
81+
if dtype.byteorder == ">":
82+
arraybytes_codec = BytesCodec(endian="big")
83+
else:
84+
arraybytes_codec = BytesCodec()
8285

8386
codec_pipeline = BatchedCodecPipeline(
8487
array_array_codecs=arrayarray_codecs,

virtualizarr/manifests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def create_v3_array_metadata(
5555
"""
5656
return ArrayV3Metadata(
5757
shape=shape,
58-
data_type=data_type,
58+
data_type=data_type.name if hasattr(data_type, "name") else data_type,
5959
chunk_grid={
6060
"name": "regular",
6161
"configuration": {"chunk_shape": chunk_shape},

virtualizarr/tests/test_parsers/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,4 +457,6 @@ def big_endian_dtype_hdf5_file(tmpdir):
457457
filepath = f"{tmpdir}/big_endian.nc"
458458
f = h5py.File(filepath, "w")
459459
f.create_dataset("data", shape=(10,), dtype=">f4")
460+
dset = f["data"]
461+
dset[...] = 10
460462
return filepath

virtualizarr/tests/test_parsers/test_hdf/test_hdf.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import h5py # type: ignore
22
import numpy as np
33
import pytest
4+
import xarray as xr
45

56
from virtualizarr import open_virtual_dataset
67
from virtualizarr.parsers import HDFParser
@@ -221,19 +222,23 @@ def test_coord_names(
221222
) as vds:
222223
assert set(vds.coords) == {"lat", "lon"}
223224

224-
@pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support")
225225
def test_big_endian(
226226
self,
227227
big_endian_dtype_hdf5_file,
228228
):
229229
store = obstore_local(file_url=big_endian_dtype_hdf5_file)
230230
parser = HDFParser()
231-
with open_virtual_dataset(
232-
file_url=big_endian_dtype_hdf5_file,
233-
object_store=store,
234-
parser=parser,
235-
) as vds:
236-
print(vds)
231+
with (
232+
parser(
233+
file_url=big_endian_dtype_hdf5_file, object_store=store
234+
) as manifest_store,
235+
xr.open_dataset(big_endian_dtype_hdf5_file) as expected,
236+
):
237+
observed = xr.open_dataset(
238+
manifest_store, engine="zarr", consolidated=False, zarr_format=3
239+
)
240+
assert isinstance(observed, xr.Dataset)
241+
xr.testing.assert_identical(observed.load(), expected.load())
237242

238243

239244
@requires_hdf5plugin

virtualizarr/tests/test_parsers/test_netcdf3.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,31 @@
1-
import pytest
21
import xarray as xr
32
import xarray.testing as xrt
43

54
from virtualizarr import open_virtual_dataset
6-
from virtualizarr.manifests import ChunkManifest, ManifestArray
75
from virtualizarr.parsers import NetCDF3Parser
8-
from virtualizarr.tests import requires_network, requires_scipy
6+
from virtualizarr.tests import requires_kerchunk, requires_network, requires_scipy
97
from virtualizarr.tests.utils import obstore_http, obstore_local
108

119

1210
@requires_scipy
13-
@pytest.mark.xfail(
14-
reason="Big endian not yet supported by zarr-python 3.0"
15-
) # https://github.com/zarr-developers/zarr-python/issues/2324
1611
def test_read_netcdf3(netcdf3_file, array_v3_metadata):
1712
filepath = str(netcdf3_file)
1813
store = obstore_local(file_url=filepath)
1914
parser = NetCDF3Parser()
20-
with open_virtual_dataset(
21-
file_url=filepath,
22-
parser=parser,
23-
object_store=store,
24-
) as vds:
25-
assert isinstance(vds, xr.Dataset)
26-
assert list(vds.variables.keys()) == ["foo"]
27-
assert isinstance(vds["foo"].data, ManifestArray)
28-
29-
expected_manifest = ChunkManifest(
30-
entries={"0": {"path": filepath, "offset": 80, "length": 12}}
15+
with (
16+
parser(file_url=filepath, object_store=store) as manifest_store,
17+
xr.open_dataset(filepath) as expected,
18+
):
19+
observed = xr.open_dataset(
20+
manifest_store, engine="zarr", consolidated=False, zarr_format=3
3121
)
32-
metadata = array_v3_metadata(shape=(3,), chunks=(3,))
33-
expected_ma = ManifestArray(chunkmanifest=expected_manifest, metadata=metadata)
34-
expected_vds = xr.Dataset({"foo": xr.Variable(data=expected_ma, dims=["x"])})
35-
36-
xrt.assert_identical(vds, expected_vds)
22+
assert isinstance(observed, xr.Dataset)
23+
assert list(observed.variables.keys()) == ["foo"]
24+
xrt.assert_identical(observed.load(), expected.load())
3725

3826

27+
@requires_kerchunk
3928
@requires_network
40-
@pytest.mark.xfail(
41-
reason="Big endian not yet supported by zarr-python 3.0"
42-
) # https://github.com/zarr-developers/zarr-python/issues/2324
4329
def test_read_http_netcdf3(array_v3_metadata):
4430
file_url = "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc"
4531
store = obstore_http(file_url=file_url)

0 commit comments

Comments (0)