Skip to content

Commit 7943fb8

Browse files
Merge pull request #152 from quentinblampey/merscope_rioxarray
merscope reader update
2 parents c67db5b + 953b0aa commit 7943fb8

2 files changed

Lines changed: 142 additions & 44 deletions

File tree

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,18 @@ and this project adheres to [Semantic Versioning][].
1515
- (Xenium) support reading multi-polygon selection files from the Xenium Explorer
1616
- (ISS) An experimental loader to load elemental ISS data objects, e.g. raw.tif, label.tif and anndata.h5ad
1717
- (Stereo-seq) Added reader @LLehner @timtreis @florianingelfinger #70
18+
- (MERSCOPE) Optional rioxarray backend for MERSCOPE data (reads chunks)
19+
- (MERSCOPE) Can choose which elements should be loaded
1820

1921
### Fixed
2022

2123
- (Visium) Fixed issue with joining a SpatialElement with a table due to index values not being unique.
2224
obs_names_make_unique is now called internally to enforce unique index values allowing for join operations.
2325

26+
### Changed
27+
28+
- (MERSCOPE) "global" coordinate system is used as a default instead of "microns"
29+
2430
## [0.1.2] - 2024-03-30
2531

2632
### Added

src/spatialdata_io/readers/merscope.py

Lines changed: 136 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,30 @@
11
from __future__ import annotations
22

33
import re
4+
import warnings
45
from collections.abc import Mapping
56
from pathlib import Path
67
from types import MappingProxyType
7-
from typing import Any
8+
from typing import Any, Callable, Literal
89

910
import anndata
1011
import dask.dataframe as dd
1112
import geopandas
1213
import numpy as np
1314
import pandas as pd
15+
import xarray
1416
from dask import array as da
1517
from dask_image.imread import imread
1618
from spatialdata import SpatialData
1719
from spatialdata._logging import logger
1820
from spatialdata.models import Image2DModel, PointsModel, ShapesModel, TableModel
19-
from spatialdata.transformations import Affine, Identity
21+
from spatialdata.transformations import Affine, BaseTransformation
2022

2123
from spatialdata_io._constants._constants import MerscopeKeys
2224
from spatialdata_io._docs import inject_docs
2325

26+
SUPPORTED_BACKENDS = ["dask_image", "rioxarray"]
27+
2428

2529
def _get_channel_names(images_dir: Path) -> list[str]:
2630
exp = r"mosaic_(?P<stain>[\w|-]+[0-9]?)_z(?P<z>[0-9]+).tif"
@@ -82,6 +86,11 @@ def merscope(
8286
z_layers: int | list[int] | None = 3,
8387
region_name: str | None = None,
8488
slide_name: str | None = None,
89+
backend: Literal["dask_image", "rioxarray"] | None = None,
90+
transcripts: bool = True,
91+
cells_boundaries: bool = True,
92+
cells_table: bool = True,
93+
mosaic_images: bool = True,
8594
imread_kwargs: Mapping[str, Any] = MappingProxyType({}),
8695
image_models_kwargs: Mapping[str, Any] = MappingProxyType({}),
8796
) -> SpatialData:
@@ -120,6 +129,16 @@ def merscope(
120129
Name of the region of interest, e.g., `'region_0'`. If `None` then the name of the `path` directory is used.
121130
slide_name
122131
Name of the slide/run. If `None` then the name of the parent directory of `path` is used (whose name starts with a date).
132+
backend
133+
Either `"dask_image"` or `"rioxarray"` (the latter uses less RAM, but requires `rioxarray` to be installed). By default, uses `"rioxarray"` if and only if the `rioxarray` library is installed.
134+
transcripts
135+
Whether to read transcripts.
136+
cells_boundaries
137+
Whether to read cell boundaries (polygons).
138+
cells_table
139+
Whether to read cells table.
140+
mosaic_images
141+
Whether to read the mosaic images.
123142
imread_kwargs
124143
Keyword arguments to pass to the image reader.
125144
image_models_kwargs
@@ -140,13 +159,18 @@ def merscope(
140159
assert isinstance(image_models_kwargs, dict)
141160
image_models_kwargs["scale_factors"] = [2, 2, 2, 2]
142161

162+
assert (
163+
backend is None or backend in SUPPORTED_BACKENDS
164+
), f"Backend '{backend} not supported. Should be one of: {', '.join(SUPPORTED_BACKENDS)}"
165+
143166
path = Path(path).absolute()
144167
count_path, obs_path, boundaries_path = _get_file_paths(path, vpt_outputs)
145168
images_dir = path / MerscopeKeys.IMAGES_DIR
146169

147170
microns_to_pixels = Affine(
148171
np.genfromtxt(images_dir / MerscopeKeys.TRANSFORMATION_FILE), input_axes=("x", "y"), output_axes=("x", "y")
149172
)
173+
transformations = {"global": microns_to_pixels}
150174

151175
vizgen_region = path.name if region_name is None else region_name
152176
slide_name = path.parent.name if slide_name is None else slide_name
@@ -156,73 +180,141 @@ def merscope(
156180
# Images
157181
images = {}
158182

159-
z_layers = [z_layers] if isinstance(z_layers, int) else z_layers or []
160-
161-
stainings = _get_channel_names(images_dir)
162-
if stainings:
163-
for z_layer in z_layers:
164-
im = da.stack(
165-
[
166-
imread(images_dir / f"mosaic_{stain}_z{z_layer}.tif", **imread_kwargs).squeeze()
167-
for stain in stainings
168-
],
169-
axis=0,
170-
)
171-
parsed_im = Image2DModel.parse(
172-
im,
173-
dims=("c", "y", "x"),
174-
transformations={"microns": microns_to_pixels.inverse()},
175-
c_coords=stainings,
176-
**image_models_kwargs,
177-
)
178-
images[f"{dataset_id}_z{z_layer}"] = parsed_im
183+
if mosaic_images:
184+
z_layers = [z_layers] if isinstance(z_layers, int) else z_layers or []
185+
186+
reader = _get_reader(backend)
187+
188+
stainings = _get_channel_names(images_dir)
189+
if stainings:
190+
for z_layer in z_layers:
191+
images[f"{dataset_id}_z{z_layer}"] = reader(
192+
images_dir,
193+
stainings,
194+
z_layer,
195+
image_models_kwargs,
196+
**imread_kwargs,
197+
)
179198

180199
# Transcripts
181200
points = {}
182-
transcript_path = path / MerscopeKeys.TRANSCRIPTS_FILE
183-
if transcript_path.exists():
184-
points[f"{dataset_id}_transcripts"] = _get_points(transcript_path)
185-
else:
186-
logger.warning(f"Transcript file {transcript_path} does not exist. Transcripts are not loaded.")
201+
202+
if transcripts:
203+
transcript_path = path / MerscopeKeys.TRANSCRIPTS_FILE
204+
if transcript_path.exists():
205+
points[f"{dataset_id}_transcripts"] = _get_points(transcript_path, transformations)
206+
else:
207+
logger.warning(f"Transcript file {transcript_path} does not exist. Transcripts are not loaded.")
187208

188209
# Polygons
189210
shapes = {}
190-
if boundaries_path.exists():
191-
shapes[f"{dataset_id}_polygons"] = _get_polygons(boundaries_path)
192-
else:
193-
logger.warning(f"Boundary file {boundaries_path} does not exist. Cell boundaries are not loaded.")
194-
195-
# Table
196-
table = None
197-
if count_path.exists() and obs_path.exists():
198-
table = _get_table(count_path, obs_path, vizgen_region, slide_name, dataset_id, region)
199-
else:
200-
logger.warning(
201-
f"At least one of the following files does not exist: {count_path}, {obs_path}. The table is not loaded."
211+
212+
if cells_boundaries:
213+
if boundaries_path.exists():
214+
shapes[f"{dataset_id}_polygons"] = _get_polygons(boundaries_path, transformations)
215+
else:
216+
logger.warning(f"Boundary file {boundaries_path} does not exist. Cell boundaries are not loaded.")
217+
218+
# Tables
219+
tables = {}
220+
221+
if cells_table:
222+
if count_path.exists() and obs_path.exists():
223+
tables["table"] = _get_table(count_path, obs_path, vizgen_region, slide_name, dataset_id, region)
224+
else:
225+
logger.warning(
226+
f"At least one of the following files does not exist: {count_path}, {obs_path}. The table is not loaded."
227+
)
228+
229+
return SpatialData(shapes=shapes, points=points, images=images, tables=tables)
230+
231+
232+
def _get_reader(backend: str | None) -> Callable: # type: ignore[type-arg]
233+
if backend is not None:
234+
return _rioxarray_load_merscope if backend == "rioxarray" else _dask_image_load_merscope
235+
try:
236+
import rioxarray # noqa: F401
237+
238+
return _rioxarray_load_merscope
239+
except ModuleNotFoundError:
240+
return _dask_image_load_merscope
241+
242+
243+
def _rioxarray_load_merscope(
244+
images_dir: Path,
245+
stainings: list[str],
246+
z_layer: int,
247+
image_models_kwargs: Mapping[str, Any],
248+
**kwargs: Any,
249+
) -> Image2DModel:
250+
logger.info("Using rioxarray backend.")
251+
252+
try:
253+
import rioxarray
254+
except ModuleNotFoundError:
255+
raise ModuleNotFoundError(
256+
"Using rioxarray backend requires to install the rioxarray library (`pip install rioxarray`)"
202257
)
258+
from rasterio.errors import NotGeoreferencedWarning
259+
260+
warnings.simplefilter("ignore", category=NotGeoreferencedWarning)
261+
262+
im = xarray.concat(
263+
[
264+
rioxarray.open_rasterio(
265+
images_dir / f"mosaic_{stain}_z{z_layer}.tif",
266+
chunks=image_models_kwargs["chunks"],
267+
**kwargs,
268+
)
269+
.rename({"band": "c"})
270+
.reset_coords("spatial_ref", drop=True)
271+
for stain in stainings
272+
],
273+
dim="c",
274+
)
275+
276+
return Image2DModel.parse(im, c_coords=stainings, **image_models_kwargs)
277+
203278

204-
return SpatialData(shapes=shapes, points=points, images=images, table=table)
279+
def _dask_image_load_merscope(
280+
images_dir: Path,
281+
stainings: list[str],
282+
z_layer: int,
283+
image_models_kwargs: Mapping[str, Any],
284+
**kwargs: Any,
285+
) -> Image2DModel:
286+
im = da.stack(
287+
[imread(images_dir / f"mosaic_{stain}_z{z_layer}.tif", **kwargs).squeeze() for stain in stainings],
288+
axis=0,
289+
)
290+
291+
return Image2DModel.parse(
292+
im,
293+
dims=("c", "y", "x"),
294+
c_coords=stainings,
295+
**image_models_kwargs,
296+
)
205297

206298

207-
def _get_points(transcript_path: Path) -> dd.DataFrame:
299+
def _get_points(transcript_path: Path, transformations: dict[str, BaseTransformation]) -> dd.DataFrame:
208300
transcript_df = dd.read_csv(transcript_path)
209301
transcripts = PointsModel.parse(
210302
transcript_df,
211303
coordinates={"x": MerscopeKeys.GLOBAL_X, "y": MerscopeKeys.GLOBAL_Y},
212-
transformations={"microns": Identity()},
304+
transformations=transformations,
213305
)
214306
transcripts["gene"] = transcripts["gene"].astype("category")
215307
return transcripts
216308

217309

218-
def _get_polygons(boundaries_path: Path) -> geopandas.GeoDataFrame:
310+
def _get_polygons(boundaries_path: Path, transformations: dict[str, BaseTransformation]) -> geopandas.GeoDataFrame:
219311
geo_df = geopandas.read_parquet(boundaries_path)
220312
geo_df = geo_df.rename_geometry("geometry")
221313
geo_df = geo_df[geo_df[MerscopeKeys.Z_INDEX] == 0] # Avoid duplicate boundaries on all z-levels
222314
geo_df.geometry = geo_df.geometry.map(lambda x: x.geoms[0]) # The MultiPolygons contain only one polygon
223315
geo_df.index = geo_df[MerscopeKeys.METADATA_CELL_KEY].astype(str)
224316

225-
return ShapesModel.parse(geo_df, transformations={"microns": Identity()})
317+
return ShapesModel.parse(geo_df, transformations=transformations)
226318

227319

228320
def _get_table(

0 commit comments

Comments
 (0)