11from __future__ import annotations
22
33import re
4+ import warnings
45from collections .abc import Mapping
56from pathlib import Path
67from types import MappingProxyType
7- from typing import Any
8+ from typing import Any , Callable , Literal
89
910import anndata
1011import dask .dataframe as dd
1112import geopandas
1213import numpy as np
1314import pandas as pd
15+ import xarray
1416from dask import array as da
1517from dask_image .imread import imread
1618from spatialdata import SpatialData
1719from spatialdata ._logging import logger
1820from spatialdata .models import Image2DModel , PointsModel , ShapesModel , TableModel
19- from spatialdata .transformations import Affine , Identity
21+ from spatialdata .transformations import Affine , BaseTransformation
2022
2123from spatialdata_io ._constants ._constants import MerscopeKeys
2224from spatialdata_io ._docs import inject_docs
2325
26+ SUPPORTED_BACKENDS = ["dask_image" , "rioxarray" ]
27+
2428
2529def _get_channel_names (images_dir : Path ) -> list [str ]:
2630 exp = r"mosaic_(?P<stain>[\w|-]+[0-9]?)_z(?P<z>[0-9]+).tif"
@@ -82,6 +86,11 @@ def merscope(
8286 z_layers : int | list [int ] | None = 3 ,
8387 region_name : str | None = None ,
8488 slide_name : str | None = None ,
89+ backend : Literal ["dask_image" , "rioxarray" ] | None = None ,
90+ transcripts : bool = True ,
91+ cells_boundaries : bool = True ,
92+ cells_table : bool = True ,
93+ mosaic_images : bool = True ,
8594 imread_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
8695 image_models_kwargs : Mapping [str , Any ] = MappingProxyType ({}),
8796) -> SpatialData :
@@ -120,6 +129,16 @@ def merscope(
120129 Name of the region of interest, e.g., `'region_0'`. If `None` then the name of the `path` directory is used.
121130 slide_name
122131 Name of the slide/run. If `None` then the name of the parent directory of `path` is used (whose name starts with a date).
132+ backend
133+ Either `"dask_image"` or `"rioxarray"` (the latter uses less RAM, but requires `rioxarray` to be installed). By default, uses `"rioxarray"` if and only if the `rioxarray` library is installed.
134+ transcripts
135+ Whether to read transcripts.
136+ cells_boundaries
137+ Whether to read cell boundaries (polygons).
138+ cells_table
139+ Whether to read cells table.
140+ mosaic_images
141+ Whether to read the mosaic images.
123142 imread_kwargs
124143 Keyword arguments to pass to the image reader.
125144 image_models_kwargs
@@ -140,13 +159,18 @@ def merscope(
140159 assert isinstance (image_models_kwargs , dict )
141160 image_models_kwargs ["scale_factors" ] = [2 , 2 , 2 , 2 ]
142161
162+ assert (
163+ backend is None or backend in SUPPORTED_BACKENDS
164+ ), f"Backend '{ backend } not supported. Should be one of: { ', ' .join (SUPPORTED_BACKENDS )} "
165+
143166 path = Path (path ).absolute ()
144167 count_path , obs_path , boundaries_path = _get_file_paths (path , vpt_outputs )
145168 images_dir = path / MerscopeKeys .IMAGES_DIR
146169
147170 microns_to_pixels = Affine (
148171 np .genfromtxt (images_dir / MerscopeKeys .TRANSFORMATION_FILE ), input_axes = ("x" , "y" ), output_axes = ("x" , "y" )
149172 )
173+ transformations = {"global" : microns_to_pixels }
150174
151175 vizgen_region = path .name if region_name is None else region_name
152176 slide_name = path .parent .name if slide_name is None else slide_name
@@ -156,73 +180,141 @@ def merscope(
156180 # Images
157181 images = {}
158182
159- z_layers = [z_layers ] if isinstance (z_layers , int ) else z_layers or []
160-
161- stainings = _get_channel_names (images_dir )
162- if stainings :
163- for z_layer in z_layers :
164- im = da .stack (
165- [
166- imread (images_dir / f"mosaic_{ stain } _z{ z_layer } .tif" , ** imread_kwargs ).squeeze ()
167- for stain in stainings
168- ],
169- axis = 0 ,
170- )
171- parsed_im = Image2DModel .parse (
172- im ,
173- dims = ("c" , "y" , "x" ),
174- transformations = {"microns" : microns_to_pixels .inverse ()},
175- c_coords = stainings ,
176- ** image_models_kwargs ,
177- )
178- images [f"{ dataset_id } _z{ z_layer } " ] = parsed_im
183+ if mosaic_images :
184+ z_layers = [z_layers ] if isinstance (z_layers , int ) else z_layers or []
185+
186+ reader = _get_reader (backend )
187+
188+ stainings = _get_channel_names (images_dir )
189+ if stainings :
190+ for z_layer in z_layers :
191+ images [f"{ dataset_id } _z{ z_layer } " ] = reader (
192+ images_dir ,
193+ stainings ,
194+ z_layer ,
195+ image_models_kwargs ,
196+ ** imread_kwargs ,
197+ )
179198
180199 # Transcripts
181200 points = {}
182- transcript_path = path / MerscopeKeys .TRANSCRIPTS_FILE
183- if transcript_path .exists ():
184- points [f"{ dataset_id } _transcripts" ] = _get_points (transcript_path )
185- else :
186- logger .warning (f"Transcript file { transcript_path } does not exist. Transcripts are not loaded." )
201+
202+ if transcripts :
203+ transcript_path = path / MerscopeKeys .TRANSCRIPTS_FILE
204+ if transcript_path .exists ():
205+ points [f"{ dataset_id } _transcripts" ] = _get_points (transcript_path , transformations )
206+ else :
207+ logger .warning (f"Transcript file { transcript_path } does not exist. Transcripts are not loaded." )
187208
188209 # Polygons
189210 shapes = {}
190- if boundaries_path .exists ():
191- shapes [f"{ dataset_id } _polygons" ] = _get_polygons (boundaries_path )
192- else :
193- logger .warning (f"Boundary file { boundaries_path } does not exist. Cell boundaries are not loaded." )
194-
195- # Table
196- table = None
197- if count_path .exists () and obs_path .exists ():
198- table = _get_table (count_path , obs_path , vizgen_region , slide_name , dataset_id , region )
199- else :
200- logger .warning (
201- f"At least one of the following files does not exist: { count_path } , { obs_path } . The table is not loaded."
211+
212+ if cells_boundaries :
213+ if boundaries_path .exists ():
214+ shapes [f"{ dataset_id } _polygons" ] = _get_polygons (boundaries_path , transformations )
215+ else :
216+ logger .warning (f"Boundary file { boundaries_path } does not exist. Cell boundaries are not loaded." )
217+
218+ # Tables
219+ tables = {}
220+
221+ if cells_table :
222+ if count_path .exists () and obs_path .exists ():
223+ tables ["table" ] = _get_table (count_path , obs_path , vizgen_region , slide_name , dataset_id , region )
224+ else :
225+ logger .warning (
226+ f"At least one of the following files does not exist: { count_path } , { obs_path } . The table is not loaded."
227+ )
228+
229+ return SpatialData (shapes = shapes , points = points , images = images , tables = tables )
230+
231+
232+ def _get_reader (backend : str | None ) -> Callable : # type: ignore[type-arg]
233+ if backend is not None :
234+ return _rioxarray_load_merscope if backend == "rioxarray" else _dask_image_load_merscope
235+ try :
236+ import rioxarray # noqa: F401
237+
238+ return _rioxarray_load_merscope
239+ except ModuleNotFoundError :
240+ return _dask_image_load_merscope
241+
242+
243+ def _rioxarray_load_merscope (
244+ images_dir : Path ,
245+ stainings : list [str ],
246+ z_layer : int ,
247+ image_models_kwargs : Mapping [str , Any ],
248+ ** kwargs : Any ,
249+ ) -> Image2DModel :
250+ logger .info ("Using rioxarray backend." )
251+
252+ try :
253+ import rioxarray
254+ except ModuleNotFoundError :
255+ raise ModuleNotFoundError (
256+ "Using rioxarray backend requires to install the rioxarray library (`pip install rioxarray`)"
202257 )
258+ from rasterio .errors import NotGeoreferencedWarning
259+
260+ warnings .simplefilter ("ignore" , category = NotGeoreferencedWarning )
261+
262+ im = xarray .concat (
263+ [
264+ rioxarray .open_rasterio (
265+ images_dir / f"mosaic_{ stain } _z{ z_layer } .tif" ,
266+ chunks = image_models_kwargs ["chunks" ],
267+ ** kwargs ,
268+ )
269+ .rename ({"band" : "c" })
270+ .reset_coords ("spatial_ref" , drop = True )
271+ for stain in stainings
272+ ],
273+ dim = "c" ,
274+ )
275+
276+ return Image2DModel .parse (im , c_coords = stainings , ** image_models_kwargs )
277+
203278
204- return SpatialData (shapes = shapes , points = points , images = images , table = table )
279+ def _dask_image_load_merscope (
280+ images_dir : Path ,
281+ stainings : list [str ],
282+ z_layer : int ,
283+ image_models_kwargs : Mapping [str , Any ],
284+ ** kwargs : Any ,
285+ ) -> Image2DModel :
286+ im = da .stack (
287+ [imread (images_dir / f"mosaic_{ stain } _z{ z_layer } .tif" , ** kwargs ).squeeze () for stain in stainings ],
288+ axis = 0 ,
289+ )
290+
291+ return Image2DModel .parse (
292+ im ,
293+ dims = ("c" , "y" , "x" ),
294+ c_coords = stainings ,
295+ ** image_models_kwargs ,
296+ )
205297
206298
207- def _get_points (transcript_path : Path ) -> dd .DataFrame :
299+ def _get_points (transcript_path : Path , transformations : dict [ str , BaseTransformation ] ) -> dd .DataFrame :
208300 transcript_df = dd .read_csv (transcript_path )
209301 transcripts = PointsModel .parse (
210302 transcript_df ,
211303 coordinates = {"x" : MerscopeKeys .GLOBAL_X , "y" : MerscopeKeys .GLOBAL_Y },
212- transformations = { "microns" : Identity ()} ,
304+ transformations = transformations ,
213305 )
214306 transcripts ["gene" ] = transcripts ["gene" ].astype ("category" )
215307 return transcripts
216308
217309
218- def _get_polygons (boundaries_path : Path ) -> geopandas .GeoDataFrame :
310+ def _get_polygons (boundaries_path : Path , transformations : dict [ str , BaseTransformation ] ) -> geopandas .GeoDataFrame :
219311 geo_df = geopandas .read_parquet (boundaries_path )
220312 geo_df = geo_df .rename_geometry ("geometry" )
221313 geo_df = geo_df [geo_df [MerscopeKeys .Z_INDEX ] == 0 ] # Avoid duplicate boundaries on all z-levels
222314 geo_df .geometry = geo_df .geometry .map (lambda x : x .geoms [0 ]) # The MultiPolygons contain only one polygon
223315 geo_df .index = geo_df [MerscopeKeys .METADATA_CELL_KEY ].astype (str )
224316
225- return ShapesModel .parse (geo_df , transformations = { "microns" : Identity ()} )
317+ return ShapesModel .parse (geo_df , transformations = transformations )
226318
227319
228320def _get_table (
0 commit comments