Skip to content

Commit 8762f91

Browse files
Merge pull request #279 from lucas-diedrich/image-reader-chunkwise
Chunkwise image loader
2 parents 2f541f3 + 355b695 commit 8762f91

8 files changed

Lines changed: 734 additions & 22 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ temp/
66

77
# Compiled files
88
__pycache__/
9+
.ipynb_checkpoints/
910

1011
# Distribution / packaging
1112
/build/
@@ -42,3 +43,4 @@ data
4243
data/
4344
tests/data
4445
uv.lock
46+
.asv/

asv.conf.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"project": "spatialdata-io",
44
"project_url": "https://github.com/scverse/spatialdata-io",
55
"repo": ".",
6-
"branches": ["main", "xenium-labels-dask", "xenium-labels-dask-zipstore"],
6+
"branches": ["image-reader-chunkwise"],
77
"dvcs": "git",
88
"environment_type": "virtualenv",
99
"pythons": ["3.12"],

benchmarks/benchmark_image.py

Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
"""Benchmarks for SpatialData IO operations for large images.
2+
3+
Instructions:
4+
See the module docstring of benchmark_xenium.py for how to run asv; select this class with `-b IOBenchmarkImage`.
5+
"""
6+
7+
import logging
8+
import logging.handlers
9+
import tempfile
10+
from pathlib import Path
11+
from typing import Any
12+
13+
import numpy as np
14+
import tifffile
15+
from spatialdata import SpatialData
16+
from spatialdata._logging import logger
17+
from xarray import DataArray
18+
19+
from spatialdata_io import image # type: ignore[attr-defined]
20+
21+
# =============================================================================
22+
# CONFIGURATION - Edit these values to match your setup
23+
# =============================================================================
24+
# Image dimensions: (channels, height, width)
25+
IMAGE_SHAPE = (3, 30000, 30000)
26+
# =============================================================================
27+
28+
29+
class IOBenchmarkImage:
    """Benchmark reading a large TIFF with ``spatialdata_io.image`` and writing it to Zarr.

    Every benchmark is parametrized over three axes (see ``params``):

    * ``scale_factors``: ``None`` for a single-scale image, or a list of
      downscaling factors for a multiscale image.
    * ``memmap_compressed``: a ``(use_tiff_memmap, compressed)`` pair that
      selects the reader mode and which input file (compressed or
      uncompressed TIFF) is read.
    * ``chunks``: the ``(c, y, x)`` chunk shape passed to the reader.
    """

    # asv benchmark attributes.
    timeout = 3600
    repeat = 3
    number = 1
    warmup_time = 0
    processes = 1

    # Parameter combinations: scale_factors, (use_tiff_memmap, compressed), chunks
    # Combinations: (memmap=False, compressed=True), (memmap=False, compressed=False),
    # (memmap=True, compressed=False)
    params = [
        [None, [2, 2]],  # scale_factors
        [(False, True), (False, False), (True, False)],  # (use_tiff_memmap, compressed)
        [(1, 250, 250), (3, 250, 250)],  # chunks
    ]
    param_names = ["scale_factors", "memmap_compressed", "chunks"]

    # Class-level temp directory for the input images, shared by every
    # benchmark that runs in the same Python process.
    # NOTE(review): if asv executes benchmarks in separate processes the
    # images are regenerated once per process -- confirm; asv's
    # ``setup_cache`` may be a better fit for this fixture.
    _images_temp_dir: tempfile.TemporaryDirectory[str] | None = None
    _path_read_uncompressed: Path | None = None
    _path_read_compressed: Path | None = None

    @classmethod
    def _setup_images(cls) -> None:
        """Create the fake input images once per process.

        Writes one uncompressed and one zlib-compressed TIFF of shape
        ``IMAGE_SHAPE`` into a temporary directory. The class attributes are
        only assigned after both files were written successfully, so a failed
        attempt does not leave the class in a half-initialized state in which
        later calls would return early and point at missing files.
        """
        if cls._images_temp_dir is not None:
            return

        temp_dir = tempfile.TemporaryDirectory()
        try:
            images_dir = Path(temp_dir.name)
            path_uncompressed = images_dir / "image_uncompressed.tif"
            path_compressed = images_dir / "image_compressed.tif"

            # Generate fake image data covering the full uint8 range.
            # ``endpoint=True`` makes the upper bound inclusive; without it
            # the value 255 would never be produced.
            rng = np.random.default_rng(42)
            data = rng.integers(
                0,
                np.iinfo(np.uint8).max,
                size=IMAGE_SHAPE,
                dtype=np.uint8,
                endpoint=True,
            )

            # Write uncompressed TIFF (memmappable)
            tifffile.imwrite(path_uncompressed, data, compression=None)
            # Write compressed TIFF (not memmappable)
            tifffile.imwrite(path_compressed, data, compression="zlib")
        except BaseException:
            # Do not leave partially written multi-GB files behind.
            temp_dir.cleanup()
            raise

        cls._path_read_uncompressed = path_uncompressed
        cls._path_read_compressed = path_compressed
        cls._images_temp_dir = temp_dir

    def setup(self, *_: Any) -> None:
        """Prepare input paths and a fresh output directory for one run.

        The benchmark parameters passed by asv are accepted and ignored.
        """
        # Create images once (shared across benchmark runs in this process)
        self._setup_images()
        self.path_read_uncompressed = self._path_read_uncompressed
        self.path_read_compressed = self._path_read_compressed

        # Create a separate temp directory for output (cleaned up after each run)
        self._output_temp_dir = tempfile.TemporaryDirectory()
        self.path_write = Path(self._output_temp_dir.name) / "data_benchmark.zarr"

    def teardown(self, *_: Any) -> None:
        """Clean up the output directory after each benchmark run."""
        # ``setup`` may have failed before the attribute was created.
        if hasattr(self, "_output_temp_dir"):
            self._output_temp_dir.cleanup()

    @staticmethod
    def _assert_chunks(array: Any, chunks: tuple[int, ...]) -> None:
        """Check that the first y/x chunk of ``array`` has the requested size.

        A chunk spanning the whole axis is accepted as well. ``chunks`` and
        ``array.shape`` are both indexed as ``(c, y, x)``.
        """
        for axis, dim in ((2, "x"), (1, "y")):
            first_chunk = array.chunksizes[dim][0]
            assert first_chunk in (chunks[axis], array.shape[axis]), (
                f"Unexpected chunk size {first_chunk} along {dim!r}"
            )

    def _convert_image(
        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
    ) -> SpatialData:
        """Read the input image with the given parameters and wrap it in a ``SpatialData``.

        Parameters
        ----------
        scale_factors
            ``None`` for a single-scale image, otherwise the multiscale factors.
        memmap_compressed
            ``(use_tiff_memmap, compressed)``; ``compressed`` selects the input file.
        chunks
            Chunk shape as ``(c, y, x)``.

        Returns
        -------
        A ``SpatialData`` object holding the parsed image under the key ``"image"``.

        Raises
        ------
        AssertionError
            If a sanity check on the logged warnings or on the chunking fails.
        """
        use_tiff_memmap, compressed = memmap_compressed
        # Select file based on compression setting
        path_read = self.path_read_compressed if compressed else self.path_read_uncompressed
        assert path_read is not None

        # Capture log messages to verify memmappable warning behavior.
        # The MemoryHandler is used without a target, purely as an in-memory buffer.
        log_capture = logging.handlers.MemoryHandler(capacity=100)
        log_capture.setLevel(logging.WARNING)
        original_propagate = logger.propagate
        logger.addHandler(log_capture)
        logger.propagate = True

        captured: list[logging.LogRecord] = []
        try:
            im = image(
                input=path_read,
                data_axes=("c", "y", "x"),
                coordinate_system="global",
                use_tiff_memmap=use_tiff_memmap,
                chunks=chunks,
                scale_factors=scale_factors,
            )
        finally:
            # Always restore the shared logger, then snapshot the records
            # before closing the handler so it is not left open.
            logger.removeHandler(log_capture)
            logger.propagate = original_propagate
            captured = list(log_capture.buffer)
            log_capture.close()

        # Check warning behavior: when use_tiff_memmap=True with uncompressed file, no warning should be raised
        log_messages = [record.getMessage() for record in captured]
        has_memmap_warning = any("image data is not memory-mappable" in msg for msg in log_messages)
        if use_tiff_memmap and not compressed:
            assert not has_memmap_warning, (
                "Uncompressed TIFF with memmap=True should not trigger memory-mappable warning"
            )

        sdata = SpatialData.init_from_elements({"image": im})
        # sanity check: chunks is (c, y, x)
        if scale_factors is None:
            assert isinstance(sdata["image"], DataArray)
            if chunks is not None:
                self._assert_chunks(sdata["image"], chunks)
        else:
            # One level per scale factor plus the full-resolution level.
            assert len(sdata["image"].keys()) == len(scale_factors) + 1
            if chunks is not None:
                self._assert_chunks(sdata["image"]["scale0"]["image"], chunks)

        return sdata

    def time_io(
        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
    ) -> None:
        """Wall time for reading the image and writing it to ``self.path_write``."""
        sdata = self._convert_image(scale_factors, memmap_compressed, chunks)
        sdata.write(self.path_write)

    def peakmem_io(
        self, scale_factors: list[int] | None, memmap_compressed: tuple[bool, bool], chunks: tuple[int, ...]
    ) -> None:
        """Peak memory for reading the image and writing it to ``self.path_write``."""
        sdata = self._convert_image(scale_factors, memmap_compressed, chunks)
        sdata.write(self.path_write)
167+
168+
169+
# if __name__ == "__main__":
170+
# # Run a single test case for quick verification
171+
# bench = IOBenchmarkImage()
172+
#
173+
# bench.setup()
174+
# bench.time_io(None, (True, False), (1, 5000, 5000))
175+
# bench.teardown()
176+
#
177+
# # Clean up the shared images temp directory at the end
178+
# if IOBenchmarkImage._images_temp_dir is not None:
179+
# IOBenchmarkImage._images_temp_dir.cleanup()
180+
# IOBenchmarkImage._images_temp_dir = None
Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@
1111
cd /path/to/spatialdata-io
1212
1313
# Quick benchmark (single run, for testing):
14-
asv run --python=same -b IOBenchmark --quick --show-stderr -v
14+
asv run --python=same -b IOBenchmarkXenium --quick --show-stderr -v
1515
1616
# Full benchmark (multiple runs, for accurate results):
17-
asv run --python=same -b IOBenchmark --show-stderr -v
17+
asv run --python=same -b IOBenchmarkXenium --show-stderr -v
1818
1919
Comparing branches:
2020
# Run on specific commits:
21-
asv run main^! -b IOBenchmark --show-stderr -v
22-
asv run xenium-labels-dask^! -b IOBenchmark --show-stderr -v
21+
asv run main^! -b IOBenchmarkXenium --show-stderr -v
22+
asv run xenium-labels-dask^! -b IOBenchmarkXenium --show-stderr -v
2323
2424
# Or compare two branches directly:
25-
asv continuous main xenium-labels-dask -b IOBenchmark --show-stderr -v
25+
asv continuous main xenium-labels-dask -b IOBenchmarkXenium --show-stderr -v
2626
2727
# View comparison:
2828
asv compare main xenium-labels-dask
@@ -36,7 +36,6 @@
3636
import inspect
3737
import shutil
3838
from pathlib import Path
39-
from typing import TYPE_CHECKING
4039

4140
from spatialdata import SpatialData
4241

@@ -62,9 +61,7 @@ def get_paths() -> tuple[Path, Path]:
6261
return path_read, path_write
6362

6463

65-
class IOBenchmark:
66-
"""Benchmark IO read operations."""
67-
64+
class IOBenchmarkXenium:
6865
timeout = 3600
6966
repeat = 3
7067
number = 1
@@ -106,4 +103,6 @@ def peakmem_io(self) -> None:
106103

107104

108105
if __name__ == "__main__":
109-
IOBenchmark().time_io()
106+
benchmark = IOBenchmarkXenium()
107+
benchmark.setup()
108+
benchmark.time_io()

0 commit comments

Comments
 (0)