Skip to content

Commit 91362de

Browse files
authored
Support big-endian HDF5 and NetCDF3 files (#640)
* Support big-endian HDF5 and NetCDF3 files * Mark test as requires_kerchunk
1 parent 62ca069 commit 91362de

5 files changed

Lines changed: 31 additions & 35 deletions

File tree

virtualizarr/codecs.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import zarr
55
from zarr.abc.codec import ArrayArrayCodec, ArrayBytesCodec, BytesBytesCodec
66
from zarr.abc.codec import Codec as ZarrCodec
7+
from zarr.codecs import BytesCodec
78
from zarr.core.codec_pipeline import BatchedCodecPipeline
89
from zarr.core.metadata.v3 import ArrayV3Metadata
910

@@ -64,7 +65,6 @@ def convert_to_codec_pipeline(
6465
-------
6566
BatchedCodecPipeline
6667
"""
67-
from zarr.core.array import _get_default_chunk_encoding_v3
6868
from zarr.registry import get_codec_class
6969

7070
zarr_codecs: tuple[ArrayArrayCodec | ArrayBytesCodec | BytesBytesCodec, ...] = ()
@@ -78,7 +78,10 @@ def convert_to_codec_pipeline(
7878
arrayarray_codecs, arraybytes_codec, bytesbytes_codecs = extract_codecs(zarr_codecs)
7979

8080
if arraybytes_codec is None:
81-
arraybytes_codec = _get_default_chunk_encoding_v3(dtype)[1]
81+
if dtype.byteorder == ">":
82+
arraybytes_codec = BytesCodec(endian="big")
83+
else:
84+
arraybytes_codec = BytesCodec()
8285

8386
codec_pipeline = BatchedCodecPipeline(
8487
array_array_codecs=arrayarray_codecs,

virtualizarr/manifests/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ def create_v3_array_metadata(
5555
"""
5656
return ArrayV3Metadata(
5757
shape=shape,
58-
data_type=data_type,
58+
data_type=data_type.name if hasattr(data_type, "name") else data_type,
5959
chunk_grid={
6060
"name": "regular",
6161
"configuration": {"chunk_shape": chunk_shape},

virtualizarr/tests/test_parsers/conftest.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -457,4 +457,6 @@ def big_endian_dtype_hdf5_file(tmpdir):
457457
filepath = f"{tmpdir}/big_endian.nc"
458458
f = h5py.File(filepath, "w")
459459
f.create_dataset("data", shape=(10,), dtype=">f4")
460+
dset = f["data"]
461+
dset[...] = 10
460462
return filepath

virtualizarr/tests/test_parsers/test_hdf/test_hdf.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import h5py # type: ignore
22
import numpy as np
33
import pytest
4+
import xarray as xr
45

56
from virtualizarr import open_virtual_dataset
67
from virtualizarr.parsers import HDFParser
@@ -221,19 +222,23 @@ def test_coord_names(
221222
) as vds:
222223
assert set(vds.coords) == {"lat", "lon"}
223224

224-
@pytest.mark.xfail(reason="Requires Zarr v3 big endian dtype support")
225225
def test_big_endian(
226226
self,
227227
big_endian_dtype_hdf5_file,
228228
):
229229
store = obstore_local(file_url=big_endian_dtype_hdf5_file)
230230
parser = HDFParser()
231-
with open_virtual_dataset(
232-
file_url=big_endian_dtype_hdf5_file,
233-
object_store=store,
234-
parser=parser,
235-
) as vds:
236-
print(vds)
231+
with (
232+
parser(
233+
file_url=big_endian_dtype_hdf5_file, object_store=store
234+
) as manifest_store,
235+
xr.open_dataset(big_endian_dtype_hdf5_file) as expected,
236+
):
237+
observed = xr.open_dataset(
238+
manifest_store, engine="zarr", consolidated=False, zarr_format=3
239+
)
240+
assert isinstance(observed, xr.Dataset)
241+
xr.testing.assert_identical(observed.load(), expected.load())
237242

238243

239244
@requires_hdf5plugin

virtualizarr/tests/test_parsers/test_netcdf3.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,45 +1,31 @@
1-
import pytest
21
import xarray as xr
32
import xarray.testing as xrt
43

54
from virtualizarr import open_virtual_dataset
6-
from virtualizarr.manifests import ChunkManifest, ManifestArray
75
from virtualizarr.parsers import NetCDF3Parser
8-
from virtualizarr.tests import requires_network, requires_scipy
6+
from virtualizarr.tests import requires_kerchunk, requires_network, requires_scipy
97
from virtualizarr.tests.utils import obstore_http, obstore_local
108

119

1210
@requires_scipy
13-
@pytest.mark.xfail(
14-
reason="Big endian not yet supported by zarr-python 3.0"
15-
) # https://github.com/zarr-developers/zarr-python/issues/2324
1611
def test_read_netcdf3(netcdf3_file, array_v3_metadata):
1712
filepath = str(netcdf3_file)
1813
store = obstore_local(file_url=filepath)
1914
parser = NetCDF3Parser()
20-
with open_virtual_dataset(
21-
file_url=filepath,
22-
parser=parser,
23-
object_store=store,
24-
) as vds:
25-
assert isinstance(vds, xr.Dataset)
26-
assert list(vds.variables.keys()) == ["foo"]
27-
assert isinstance(vds["foo"].data, ManifestArray)
28-
29-
expected_manifest = ChunkManifest(
30-
entries={"0": {"path": filepath, "offset": 80, "length": 12}}
15+
with (
16+
parser(file_url=filepath, object_store=store) as manifest_store,
17+
xr.open_dataset(filepath) as expected,
18+
):
19+
observed = xr.open_dataset(
20+
manifest_store, engine="zarr", consolidated=False, zarr_format=3
3121
)
32-
metadata = array_v3_metadata(shape=(3,), chunks=(3,))
33-
expected_ma = ManifestArray(chunkmanifest=expected_manifest, metadata=metadata)
34-
expected_vds = xr.Dataset({"foo": xr.Variable(data=expected_ma, dims=["x"])})
35-
36-
xrt.assert_identical(vds, expected_vds)
22+
assert isinstance(observed, xr.Dataset)
23+
assert list(observed.variables.keys()) == ["foo"]
24+
xrt.assert_identical(observed.load(), expected.load())
3725

3826

27+
@requires_kerchunk
3928
@requires_network
40-
@pytest.mark.xfail(
41-
reason="Big endian not yet supported by zarr-python 3.0"
42-
) # https://github.com/zarr-developers/zarr-python/issues/2324
4329
def test_read_http_netcdf3(array_v3_metadata):
4430
file_url = "https://github.com/pydata/xarray-data/raw/master/air_temperature.nc"
4531
store = obstore_http(file_url=file_url)

0 commit comments

Comments (0)