Handle big-endian pixel data correctly on read

brendancol · brendancol · commit ed1e40f2af44 · 2026-03-20T07:31:43.000-07:00
Big-endian TIFFs (byte order marker 'MM') now byte-swap pixel data
to native order after decompression. Previously, the reader did
.view(dtype) with a native-order dtype, producing garbage values
for multi-byte types (uint16, int32, float32, float64).

Fix: _decode_strip_or_tile takes a new byte_order argument, views the
decoded chunk with dtype.newbyteorder(byte_order), and then calls
.astype(dtype) to convert to native order if a swap is needed.
Single-byte types (uint8) need no swap. The COG HTTP reader path
(_read_cog_http) has the same fix.

Also fixed the test conftest: make_minimal_tiff(big_endian=True) now
actually writes multi-byte pixel data in big-endian order on the
non-tiled path.

7 new tests: float32, uint16, int32, float64, uint8 (no swap),
windowed read, and public API -- all with big-endian TIFFs.
diff --git a/xrspatial/geotiff/_reader.py b/xrspatial/geotiff/_reader.py
@@ -194,9 +194,17 @@ def _packed_byte_count(pixel_count: int, bps: int) -> int:
 
 
 def _decode_strip_or_tile(data_slice, compression, width, height, samples,
-                          bps, bytes_per_sample, is_sub_byte, dtype, pred):
+                          bps, bytes_per_sample, is_sub_byte, dtype, pred,
+                          byte_order='<'):
     """Decompress, apply predictor, unpack sub-byte, and reshape a strip/tile.
 
+    Parameters
+    ----------
+    byte_order : str
+        '<' for little-endian, '>' for big-endian.  When the file byte
+        order differs from the system's native order, pixel data is
+        byte-swapped after decompression.
+
     Returns an array shaped (height, width) or (height, width, samples).
     """
     pixel_count = width * height * samples
@@ -217,13 +225,21 @@ def _decode_strip_or_tile(data_slice, compression, width, height, samples,
     if is_sub_byte:
         pixels = unpack_bits(chunk, bps, pixel_count)
     else:
-        pixels = chunk.view(dtype)
+        # Use the file's byte order for the view, then convert to native
+        file_dtype = dtype.newbyteorder(byte_order)
+        pixels = chunk.view(file_dtype)
+        if file_dtype.byteorder not in ('=', '|', _NATIVE_ORDER):
+            pixels = pixels.astype(dtype)
 
     if samples > 1:
         return pixels.reshape(height, width, samples)
     return pixels.reshape(height, width)
 
 
+import sys as _sys
+_NATIVE_ORDER = '<' if _sys.byteorder == 'little' else '>'
+
+
 # ---------------------------------------------------------------------------
 # Strip reader
 # ---------------------------------------------------------------------------
@@ -305,7 +321,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
                 strip_data = data[offsets[global_idx]:offsets[global_idx] + byte_counts[global_idx]]
                 strip_pixels = _decode_strip_or_tile(
                     strip_data, compression, width, strip_rows, 1,
-                    bps, bytes_per_sample, is_sub_byte, dtype, pred)
+                    bps, bytes_per_sample, is_sub_byte, dtype, pred,
+                    byte_order=header.byte_order)
 
                 src_r0 = max(r0 - strip_row, 0)
                 src_r1 = min(r1 - strip_row, strip_rows)
@@ -326,7 +343,8 @@ def _read_strips(data: bytes, ifd: IFD, header: TIFFHeader,
             strip_data = data[offsets[strip_idx]:offsets[strip_idx] + byte_counts[strip_idx]]
             strip_pixels = _decode_strip_or_tile(
                 strip_data, compression, width, strip_rows, samples,
-                bps, bytes_per_sample, is_sub_byte, dtype, pred)
+                bps, bytes_per_sample, is_sub_byte, dtype, pred,
+                byte_order=header.byte_order)
 
             src_r0 = max(r0 - strip_row, 0)
             src_r1 = min(r1 - strip_row, strip_rows)
@@ -424,7 +442,8 @@ def _read_tiles(data: bytes, ifd: IFD, header: TIFFHeader,
                 tile_data = data[offsets[tile_idx]:offsets[tile_idx] + byte_counts[tile_idx]]
                 tile_pixels = _decode_strip_or_tile(
                     tile_data, compression, tw, th, tile_samples,
-                    bps, bytes_per_sample, is_sub_byte, dtype, pred)
+                    bps, bytes_per_sample, is_sub_byte, dtype, pred,
+                    byte_order=header.byte_order)
 
                 tile_r0 = tr * th
                 tile_c0 = tc * tw
@@ -552,10 +571,13 @@ def _read_cog_http(url: str, overview_level: int | None = None,
                     chunk = chunk.copy()
                 chunk = _apply_predictor(chunk, pred, tw, th, bytes_per_sample * samples)
 
+            file_dtype = dtype.newbyteorder(header.byte_order)
             if samples > 1:
-                tile_pixels = chunk.view(dtype).reshape(th, tw, samples)
+                tile_pixels = chunk.view(file_dtype).reshape(th, tw, samples)
             else:
-                tile_pixels = chunk.view(dtype).reshape(th, tw)
+                tile_pixels = chunk.view(file_dtype).reshape(th, tw)
+            if file_dtype.byteorder not in ('=', '|', _NATIVE_ORDER):
+                tile_pixels = tile_pixels.astype(dtype)
 
             # Place tile
             y0 = tr * th
diff --git a/xrspatial/geotiff/tests/conftest.py b/xrspatial/geotiff/tests/conftest.py
@@ -63,7 +63,10 @@ def make_minimal_tiff(
         pixel_bytes = b''.join(tile_blobs)
         tile_byte_counts = [len(b) for b in tile_blobs]
     else:
-        pixel_bytes = pixel_data.tobytes()
+        if big_endian and pixel_data.dtype.itemsize > 1:
+            pixel_bytes = pixel_data.astype(pixel_data.dtype.newbyteorder('>')).tobytes()
+        else:
+            pixel_bytes = pixel_data.tobytes()
 
     # --- Collect tags as (tag_id, type_id, value_bytes) ---
     # value_bytes is the serialized value; if len <= 4 it's inline, else overflow.
diff --git a/xrspatial/geotiff/tests/test_features.py b/xrspatial/geotiff/tests/test_features.py
@@ -421,6 +421,111 @@ def test_no_crs_no_wkt(self, tmp_path):
 # Arbitrary tag preservation
 # -----------------------------------------------------------------------
 
+# -----------------------------------------------------------------------
+# Big-endian pixel data
+# -----------------------------------------------------------------------
+
+class TestBigEndian:
+
+    def test_float32_big_endian(self, tmp_path):
+        """Read a big-endian float32 TIFF."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(16, dtype=np.float32).reshape(4, 4)
+        tiff_data = make_minimal_tiff(4, 4, np.dtype('float32'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_f32.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path)
+        assert result.dtype == np.float32
+        np.testing.assert_array_equal(result, expected)
+
+    def test_uint16_big_endian(self, tmp_path):
+        """Read a big-endian uint16 TIFF."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(20, dtype=np.uint16).reshape(4, 5) * 1000
+        tiff_data = make_minimal_tiff(5, 4, np.dtype('uint16'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_u16.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path)
+        assert result.dtype == np.uint16
+        np.testing.assert_array_equal(result, expected)
+
+    def test_int32_big_endian(self, tmp_path):
+        """Read a big-endian int32 TIFF."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(16, dtype=np.int32).reshape(4, 4) - 8
+        tiff_data = make_minimal_tiff(4, 4, np.dtype('int32'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_i32.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path)
+        assert result.dtype == np.int32
+        np.testing.assert_array_equal(result, expected)
+
+    def test_float64_big_endian(self, tmp_path):
+        """Read a big-endian float64 TIFF."""
+        from .conftest import make_minimal_tiff
+        expected = np.linspace(-1.0, 1.0, 16, dtype=np.float64).reshape(4, 4)
+        tiff_data = make_minimal_tiff(4, 4, np.dtype('float64'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_f64.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path)
+        assert result.dtype == np.float64
+        np.testing.assert_array_almost_equal(result, expected)
+
+    def test_uint8_big_endian_no_swap_needed(self, tmp_path):
+        """uint8 big-endian needs no byte swap (single byte per sample)."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(16, dtype=np.uint8).reshape(4, 4)
+        tiff_data = make_minimal_tiff(4, 4, np.dtype('uint8'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_u8.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path)
+        np.testing.assert_array_equal(result, expected)
+
+    def test_big_endian_windowed(self, tmp_path):
+        """Windowed read of a big-endian TIFF."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(64, dtype=np.float32).reshape(8, 8)
+        tiff_data = make_minimal_tiff(8, 8, np.dtype('float32'),
+                                       pixel_data=expected, big_endian=True)
+        path = str(tmp_path / 'be_window.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        result, _ = read_to_array(path, window=(2, 3, 6, 7))
+        np.testing.assert_array_equal(result, expected[2:6, 3:7])
+
+    def test_big_endian_via_public_api(self, tmp_path):
+        """read_geotiff handles big-endian files."""
+        from .conftest import make_minimal_tiff
+        expected = np.arange(16, dtype=np.float32).reshape(4, 4)
+        tiff_data = make_minimal_tiff(
+            4, 4, np.dtype('float32'), pixel_data=expected,
+            big_endian=True,
+            geo_transform=(-120.0, 45.0, 0.001, -0.001), epsg=4326)
+        path = str(tmp_path / 'be_api.tif')
+        with open(path, 'wb') as f:
+            f.write(tiff_data)
+
+        da = read_geotiff(path)
+        assert da.attrs['crs'] == 4326
+        np.testing.assert_array_equal(da.values, expected)
+
+
 class TestExtraTags:
 
     def _make_tiff_with_extra_tags(self, tmp_path):