make nuc segmentation and cut dup code

stephenwilliams22 · stephenwilliams22 · commit a6d7e9f38175 · 2025-10-02T12:49:11.000-07:00
diff --git a/src/spatialdata_io/_constants/_constants.py b/src/spatialdata_io/_constants/_constants.py
@@ -359,8 +359,10 @@ class VisiumHDKeys(ModeEnum):
     FILTERED_COUNTS_FILE = "filtered_feature_bc_matrix.h5"
     RAW_COUNTS_FILE = "raw_feature_bc_matrix.h5"
     TISSUE_POSITIONS_FILE = "tissue_positions.parquet"
+    BARCODE_MAPPINGS_FILE = "barcode_mappings.parquet"
     FILTERED_CELL_COUNTS_FILE = "filtered_feature_cell_matrix.h5"
-    SEGMENTATION_GEOJSON_PATH = "cell_segmentations.geojson"
+    CELL_SEGMENTATION_GEOJSON_PATH = "cell_segmentations.geojson"
+    NUCLEUS_SEGMENTATION_GEOJSON_PATH = "nucleus_segmentations.geojson"
 
     # images
     IMAGE_HIRES_FILE = "tissue_hires_image.png"
@@ -405,3 +407,4 @@ class VisiumHDKeys(ModeEnum):
 
     # Cell Segmentation keys
     CELL_SEG_KEY_HD = 'cell_segmentations'
+    NUCLEUS_SEG_KEY_HD = 'nucleus_segmentations'
diff --git a/src/spatialdata_io/readers/visium_hd.py b/src/spatialdata_io/readers/visium_hd.py
@@ -7,7 +7,9 @@
 from types import MappingProxyType
 from typing import TYPE_CHECKING, Any
 from shapely.geometry import Polygon
-
+import pyarrow.parquet as pq
+import anndata
+from scipy.sparse import csc_matrix
 import h5py
 import numpy as np
 import pandas as pd
@@ -39,12 +41,113 @@
 
 RNG = default_rng(0)
 
+def make_filtered_nucleus_adata(
+    filtered_matrix_h5_path: str,
+    barcode_mappings_parquet_path: str,
+    bin_col_name: str = 'square_002um',
+    aggregate_col_name: str = 'cell_id'
+) -> anndata.AnnData:
+    """Generate a filtered AnnData object by aggregating 2um binned data 
+    based on nucleus segmentation.
+    Uses a 2um filtered_feature_bc_matrix.h5 file and a barcode_mappings.parquet file containing 
+    barcode mappings, filters the data to include only valid nucleus mappings, 
+    and aggregates the data based on specified bin into cell IDs which only contain
+    the 2um square data under segmented nuclei.
+    
+    Parameters:
+    -----------
+    filtered_matrix_h5_path : str
+        Path to the 10x Genomics HDF5 matrix file.
+    barcode_mappings_parquet_path : str
+        Path to the Parquet file containing barcode mappings.
+    bin_col_name : str, optional
+        Column name in the barcode mappings that specifies the spatial bin (default is 'square_002um').
+    aggregate_col_name : str, optional
+        Column name in the barcode mappings that specifies the aggregate cell ID (default is 'cell_id').
+    Returns:
+    --------
+    anndata.AnnData
+        An AnnData object where the observations correspond to filtered cell IDs 
+        and the variables correspond to the original features from the input data.
+    """
+    # Read in the necessary files
+    adata_2um = sc.read_10x_h5(filtered_matrix_h5_path)
+    barcode_mappings = pq.read_table(barcode_mappings_parquet_path)
+
+    # Filter to only include valid cell IDs that are in both nucleus and cell
+    barcode_mappings = barcode_mappings.filter((barcode_mappings['cell_id'].is_valid()) and barcode_mappings["in_nucleus"] and barcode_mappings["in_cell"])
+    
+    # Filter the 2um adata to only include squares present in the barcode mappings
+    valid_squares = barcode_mappings[bin_col_name].unique()
+    squares_to_keep = np.intersect1d(adata_2um.obs_names, valid_squares)
+    adata_filtered = adata_2um[squares_to_keep, :].copy()
+
+    # Map each square to its corresponding cell ID
+    square_to_cell_map = dict(zip(
+        barcode_mappings[bin_col_name].to_pylist(),
+        barcode_mappings[aggregate_col_name].to_pylist()
+
+    ))
+    ordered_cell_ids = [square_to_cell_map[square] for square in adata_filtered.obs_names]
+    unique_cells = list(dict.fromkeys(ordered_cell_ids).keys())
+    cell_to_idx = {cell: i for i, cell in enumerate(unique_cells)}
+
+    # Make the aggregation matrix
+    col_indices = [cell_to_idx[cell] for cell in ordered_cell_ids]
+    row_indices = np.arange(len(ordered_cell_ids))
+    data = np.ones_like(row_indices)
+
+    aggregation_matrix = csc_matrix(
+        (data, (row_indices, col_indices)),
+        shape=(adata_filtered.n_obs, len(unique_cells))
+    )
+
+    # Make the final AnnData object where cell IDs are filtered
+    # to the data under the segmented nuclei
+    nucleus_matrix_sparse = adata_filtered.X.T.dot(aggregation_matrix)
+    adata_nucleus = sc.AnnData(nucleus_matrix_sparse.T)
+    adata_nucleus.obs_names = unique_cells
+    adata_nucleus.var = adata_filtered.var
+    
+    return adata_nucleus
+def make_shapes_transformation(scale_factors_path: Path, dataset_id: str) -> dict[str, Scale]:
+    """Load scale factors for lowres and hires images and create transformations.
+
+    Parameters
+    ----------
+    scale_factors_path : Path
+        Path to the scale factors JSON file.
+    dataset_id : str
+        Unique identifier of the dataset.
+
+    Returns
+    -------
+    dict[str, Scale]
+        A dictionary containing the transformations for lowres and hires images.
+    """
+    with open(scale_factors_path, 'r') as f:
+        scale_data_hd = json.load(f)
+    lowres_scale_factor_hd = scale_data_hd['tissue_lowres_scalef']
+    hires_scale_factor_hd = scale_data_hd['tissue_hires_scalef']
+
+    return {
+        f"{dataset_id}_downscaled_lowres": Scale(np.array([lowres_scale_factor_hd, lowres_scale_factor_hd]), axes=("x", "y")),
+        f"{dataset_id}_downscaled_hires": Scale(np.array([hires_scale_factor_hd, hires_scale_factor_hd]), axes=("x", "y"))
+    }
+def make_geojson_features_map(geojson_path: Path) -> dict[str, Any]:
+    with open(geojson_path, 'r') as f:
+        geojson_data = json.load(f)
+    return {
+        f"cellid_{feature['properties']['cell_id']:09d}-1": feature
+        for feature in geojson_data['features']
+    }
 @inject_docs(vx=VisiumHDKeys)
 def visium_hd(
     path: str | Path,
     dataset_id: str | None = None,
     filtered_counts_file: bool = True,
     load_segmentations_only: bool = True,
+    load_nucleus_segmentations: bool = False,
     bin_size: int | list[int] | None = None,
     bins_as_squares: bool = True,
     annotate_table_by_labels: bool = False,
@@ -108,10 +211,14 @@ def visium_hd(
     # Check for segmentation files
     SEGMENTED_OUTPUTS_PATH = path / VisiumHDKeys.SEGMENTATION_OUTPUTS
     COUNT_MATRIX_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.FILTERED_CELL_COUNTS_FILE
-    GEOJSON_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.SEGMENTATION_GEOJSON_PATH
+    CELL_GEOJSON_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.CELL_SEGMENTATION_GEOJSON_PATH
+    NUCLEUS_GEOJSON_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.NUCLEUS_SEGMENTATION_GEOJSON_PATH
     SCALE_FACTORS_PATH = SEGMENTED_OUTPUTS_PATH / VisiumHDKeys.SPATIAL / VisiumHDKeys.SCALEFACTORS_FILE
-    segmentation_files_exist = COUNT_MATRIX_PATH.exists() and GEOJSON_PATH.exists() and SCALE_FACTORS_PATH.exists()
-    
+    BARCODE_MAPPINGS_PATH = path / VisiumHDKeys.BARCODE_MAPPINGS_FILE
+    FILTERED_MATRIX_2U_PATH = path / VisiumHDKeys.BINNED_OUTPUTS / f"{VisiumHDKeys.BIN_PREFIX}002um" / VisiumHDKeys.FILTERED_COUNTS_FILE
+    cell_segmentation_files_exist = COUNT_MATRIX_PATH.exists() and CELL_GEOJSON_PATH.exists() and SCALE_FACTORS_PATH.exists()
+    nucleus_segmentation_files_exist = NUCLEUS_GEOJSON_PATH.exists() and BARCODE_MAPPINGS_PATH.exists() and FILTERED_MATRIX_2U_PATH.exists()
+
     if dataset_id is None:
         dataset_id = _infer_dataset_id(path)
 
@@ -277,29 +384,14 @@ def _get_bins(path_bins: Path) -> list[str]:
                 tables[bin_size_str].var_names_make_unique()
 
     # Integrate the segmentation data (skipped if segmentation files are not found)
-    if segmentation_files_exist:
+    if cell_segmentation_files_exist:
         print("Found segmentation data. Incorporating cell_segmentations.")
         adata_hd = sc.read_10x_h5(COUNT_MATRIX_PATH)
         adata_hd.var_names_make_unique()
-
-        with open(SCALE_FACTORS_PATH, 'r') as f:
-            scale_data_hd = json.load(f)
-
-        with open(GEOJSON_PATH, 'r') as f:
-            geojson_data = json.load(f)
-
-        lowres_scale_factor_hd = scale_data_hd['tissue_lowres_scalef']
-        hires_scale_factor_hd = scale_data_hd['tissue_hires_scalef']
-
-        shapes_transformations_hd = {
-            f"{dataset_id}_downscaled_lowres": Scale(np.array([lowres_scale_factor_hd, lowres_scale_factor_hd]), axes=("x", "y")),
-            f"{dataset_id}_downscaled_hires": Scale(np.array([hires_scale_factor_hd, hires_scale_factor_hd]), axes=("x", "y"))
-        }
-
-        geojson_features_map = {
-            f"cellid_{feature['properties']['cell_id']:09d}-1": feature
-            for feature in geojson_data['features']
-        }
+        
+        shapes_transformations_hd = make_shapes_transformation(scale_factors_path=SCALE_FACTORS_PATH, dataset_id=dataset_id)
+        
+        geojson_features_map = make_geojson_features_map(CELL_GEOJSON_PATH)
 
         geometries = []
         cell_ids_ordered = []
@@ -324,18 +416,63 @@ def _get_bins(path_bins: Path) -> list[str]:
         }, index=cell_ids_ordered)
         
         SHAPES_KEY_HD = f"{dataset_id}_{VisiumHDKeys.CELL_SEG_KEY_HD}"
-        adata_hd.obs['cell_id'] = adata_hd.obs.index
-        adata_hd.obs['region'] = SHAPES_KEY_HD
-        adata_hd.obs['region'] = adata_hd.obs['region'].astype('category')
-        adata_hd = adata_hd[shapes_gdf.index].copy()        
+        cell_adata_hd = adata_hd.copy()
+        cell_adata_hd.obs['cell_id'] = cell_adata_hd.obs.index
+        cell_adata_hd.obs['region'] = SHAPES_KEY_HD
+        cell_adata_hd.obs['region'] = cell_adata_hd.obs['region'].astype('category')
+        cell_adata_hd = cell_adata_hd[shapes_gdf.index].copy()        
 
         shapes[SHAPES_KEY_HD] = ShapesModel.parse(shapes_gdf, transformations=shapes_transformations_hd)
         tables[VisiumHDKeys.CELL_SEG_KEY_HD] = TableModel.parse(
-            adata_hd,
+            cell_adata_hd,
             region=SHAPES_KEY_HD,
             region_key='region',
             instance_key='cell_id'
         )
+        # load nucleus segmentations if available
+        if nucleus_segmentation_files_exist and load_nucleus_segmentations:
+            print("Found nucleus segmentation data. Incorporating nucleus_segmentations.")
+            
+            nucleus_adata_hd = make_filtered_nucleus_adata(filtered_matrix_h5_path=FILTERED_MATRIX_2U_PATH,barcode_mappings_parquet_path=BARCODE_MAPPINGS_PATH)
+            
+            with open(NUCLEUS_GEOJSON_PATH, 'r') as f:
+                geojson_data = json.load(f)
+
+            geometries = []
+            cell_ids_ordered = []
+
+            for obs_index_str in adata_hd.obs.index:
+                feature = geojson_features_map.get(obs_index_str)
+                if feature:
+                    polygon_coords = np.array(feature['geometry']['coordinates'][0])
+                    geometries.append(Polygon(polygon_coords))
+                    cell_ids_ordered.append(obs_index_str)
+                else:
+                    geometries.append(None)
+                    cell_ids_ordered.append(obs_index_str)
+
+            valid_indices = [i for i, geom in enumerate(geometries) if geom is not None]
+            geometries = [geometries[i] for i in valid_indices]
+            cell_ids_ordered = [cell_ids_ordered[i] for i in valid_indices]
+
+            shapes_gdf = GeoDataFrame({
+                'cell_id': cell_ids_ordered,
+                'geometry': geometries
+            }, index=cell_ids_ordered)
+            
+            SHAPES_KEY_HD = f"{dataset_id}_{VisiumHDKeys.NUCLEUS_SEG_KEY_HD}"
+            nucleus_adata_hd.obs['cell_id'] = nucleus_adata_hd.obs.index
+            nucleus_adata_hd.obs['region'] = SHAPES_KEY_HD
+            nucleus_adata_hd.obs['region'] = nucleus_adata_hd.obs['region'].astype('category')
+            nucleus_adata_hd = nucleus_adata_hd[shapes_gdf.index].copy()      
+
+            shapes[SHAPES_KEY_HD] = ShapesModel.parse(shapes_gdf, transformations=shapes_transformations_hd)
+            tables[VisiumHDKeys.CELL_SEG_KEY_HD] = TableModel.parse(
+                nucleus_adata_hd,
+                region=SHAPES_KEY_HD,
+                region_key='region',
+                instance_key='cell_id'
+            )
 
     # Read all images and add transformations for both binning and segmentation
     fullres_image_file_paths = []
@@ -394,7 +531,7 @@ def _get_bins(path_bins: Path) -> list[str]:
                 },
                 set_all=True,
             )
-        if segmentation_files_exist:
+        if cell_segmentation_files_exist:
             set_transformation(
                 images[dataset_id + "_hires_image"],
                 {f"{dataset_id}_downscaled_hires": Identity()},
@@ -422,7 +559,7 @@ def _get_bins(path_bins: Path) -> list[str]:
                 },
                 set_all=True,
             )
-        if segmentation_files_exist:
+        if cell_segmentation_files_exist:
             set_transformation(
                 images[dataset_id + "_lowres_image"],
                 {f"{dataset_id}_downscaled_lowres": Identity()},