MICS-Lab
diff --git a/‎src/_3_check_alignment/_3-1_build_patches.py‎
Lines changed: 507 additions & 0 deletions b/‎src/_3_check_alignment/_3-1_build_patches.py‎
Lines changed: 507 additions & 0 deletions
diff --git a/‎src/_3_check_alignment/_3-2_group_npz.py‎
Lines changed: 233 additions & 0 deletions b/‎src/_3_check_alignment/_3-2_group_npz.py‎
Lines changed: 233 additions & 0 deletions
diff --git a/‎src/_3_check_alignment/_3-3_add_align.py‎
Lines changed: 112 additions & 0 deletions b/‎src/_3_check_alignment/_3-3_add_align.py‎
Lines changed: 112 additions & 0 deletions
@@ -0,0 +1,233 @@
+"""Group all npz in one unique file"""
+
+
+import os
+from scipy.sparse import load_npz, vstack, save_npz
+from tqdm import tqdm
+import argparse
+import gc
+import re
+import numpy as np
+
+
+
def consolidate_chunks(folder_path, chunk_prefix, output_file):
    """
    Consolidate multiple sparse chunk files into a single sparse .npz file.

    Only files named exactly ``<chunk_prefix>_<index>.npz`` are treated as
    chunks. They are loaded in increasing numeric ``index`` order, stacked
    row-wise with ``scipy.sparse.vstack`` and saved to ``output_file``.
    Files that merely share the prefix (for example nested chunks named
    ``<chunk_prefix>_<i>_chunk_<j>.npz``) are reported and ignored, so they
    can never be stacked in lexicographic order by accident.
    The chunk files themselves are left on disk.

    Args:
        folder_path (str): Path to the folder containing chunk files.
        chunk_prefix (str): Prefix of the chunk files (e.g., 'masks_chunk', 'masks_cells_chunk').
        output_file (str): Path to save the consolidated .npz file.

    Returns:
        None. Nothing is written when no chunk file is found.
    """
    print(f"\n-> Consolidating chunks with prefix '{chunk_prefix}' in {folder_path}...")

    # The prefix is escaped so that it is always matched literally.
    chunk_pattern = re.compile(rf"{re.escape(chunk_prefix)}_(\d+)\.npz")

    indexed_chunks = []
    ignored_files = []
    for file_name in os.listdir(folder_path):
        if not (file_name.startswith(chunk_prefix) and file_name.endswith(".npz")):
            continue
        match = chunk_pattern.fullmatch(file_name)
        if match:
            indexed_chunks.append((int(match.group(1)), file_name))
        else:
            ignored_files.append(file_name)

    if ignored_files:
        print(f"Ignoring {len(ignored_files)} file(s) not named '{chunk_prefix}_<index>.npz': {sorted(ignored_files)}")

    if not indexed_chunks:
        print(f"No chunks found with prefix '{chunk_prefix}'. Skipping.")
        return

    # Numeric sort: chunk 2 must come before chunk 10.
    chunk_files = [os.path.join(folder_path, file_name) for _, file_name in sorted(indexed_chunks)]

    # Load all sparse chunks
    sparse_matrices = [
        load_npz(chunk_file)
        for chunk_file in tqdm(chunk_files, desc=f"Loading {chunk_prefix}", unit="chunk")
    ]

    # Combine into a single sparse matrix
    print(f"-> Combining {len(sparse_matrices)} chunks...")
    final_sparse_matrix = vstack(sparse_matrices)

    # The individual chunks are no longer needed once stacked.
    del sparse_matrices

    # Save the combined sparse matrix
    print(f"-> Saving...")
    save_npz(output_file, final_sparse_matrix)

    print(f"Done.")
+
+
+
def consolidate_npy_chunks(folder_path, file_prefix, output_file):
    """
    Consolidate multiple .npy chunk files into a single .npy file.

    Only files named exactly ``<file_prefix>_<index>.npy`` are treated as
    chunks. They are loaded in increasing numeric ``index`` order,
    concatenated along axis 0 and saved to ``output_file``. Any other file
    that shares the prefix is reported and ignored instead of aborting the
    whole consolidation. The chunk files themselves are left on disk.

    Args:
        folder_path (str): Path to the folder containing chunk files.
        file_prefix (str): Prefix of the chunk files (e.g., 'images_chunk').
        output_file (str): Path to save the consolidated .npy file.

    Returns:
        None. Nothing is written when no chunk file is found.
    """
    print(f"\n-> Consolidating .npy chunks with prefix '{file_prefix}' in {folder_path}...")

    # The prefix is escaped so that it is always matched literally.
    chunk_pattern = re.compile(rf"{re.escape(file_prefix)}_(\d+)\.npy")

    indexed_chunks = []
    ignored_files = []
    for file_name in os.listdir(folder_path):
        if not (file_name.startswith(file_prefix) and file_name.endswith(".npy")):
            continue
        match = chunk_pattern.fullmatch(file_name)
        if match:
            indexed_chunks.append((int(match.group(1)), file_name))
        else:
            ignored_files.append(file_name)

    if ignored_files:
        print(f"Ignoring {len(ignored_files)} file(s) not named '{file_prefix}_<index>.npy': {sorted(ignored_files)}")

    if not indexed_chunks:
        print(f"No .npy chunks found with prefix '{file_prefix}'. Skipping.")
        return

    # Numeric sort: chunk 2 must come before chunk 10.
    chunk_files = [os.path.join(folder_path, file_name) for _, file_name in sorted(indexed_chunks)]

    # Load all chunks
    arrays = [
        np.load(chunk_file)
        for chunk_file in tqdm(chunk_files, desc=f"Loading {file_prefix}", unit="chunk")
    ]

    # Concatenate and save the final array
    final_array = np.concatenate(arrays, axis=0)

    # The individual chunks are no longer needed once concatenated.
    del arrays

    np.save(output_file, final_array)

    print(f"Done.")
+
+
+
+
def consolidate_nested_chunks(folder_path, chunk_prefix, output_file):
    """
    Consolidate nested sparse chunk files into a single sparse .npz file.

    Only files named exactly ``<chunk_prefix>_<i>_chunk_<j>.npz`` are
    treated as nested chunks. They are loaded ordered by ``i`` and then by
    ``j`` (both compared numerically), stacked row-wise with
    ``scipy.sparse.vstack`` and saved to ``output_file``. Any other file
    that shares the prefix is reported and ignored. The chunk files
    themselves are left on disk.

    Args:
        folder_path (str): Path to the folder containing chunk files.
        chunk_prefix (str): Prefix of the chunk files (e.g., 'masks_chunk').
        output_file (str): Path to save the consolidated .npz file.

    Returns:
        None. Nothing is written when no nested chunk file is found.
    """
    print(f"\n-> Consolidating nested chunks with prefix '{chunk_prefix}' in {folder_path}...")

    # The prefix is escaped so that it is always matched literally.
    nested_pattern = re.compile(rf"{re.escape(chunk_prefix)}_(\d+)_chunk_(\d+)\.npz")

    indexed_chunks = []
    ignored_files = []
    for file_name in os.listdir(folder_path):
        if not (file_name.startswith(chunk_prefix) and file_name.endswith(".npz")):
            continue
        match = nested_pattern.fullmatch(file_name)
        if match:
            indexed_chunks.append((int(match.group(1)), int(match.group(2)), file_name))
        else:
            ignored_files.append(file_name)

    if ignored_files:
        print(f"Ignoring {len(ignored_files)} file(s) not named '{chunk_prefix}_<i>_chunk_<j>.npz': {sorted(ignored_files)}")

    if not indexed_chunks:
        print(f"No nested chunks found with prefix '{chunk_prefix}'. Skipping.")
        return

    # Tuples sort by outer index i first, then inner index j, both numerically.
    nested_chunk_files = [
        os.path.join(folder_path, file_name) for _, _, file_name in sorted(indexed_chunks)
    ]

    # Load all nested sparse chunks
    sparse_matrices = [
        load_npz(nested_chunk_file)
        for nested_chunk_file in tqdm(nested_chunk_files, desc=f"Loading {chunk_prefix}", unit="nested_chunk")
    ]

    # Combine into a single sparse matrix
    print(f"-> Combining {len(sparse_matrices)} nested chunks...")
    final_sparse_matrix = vstack(sparse_matrices)

    # The individual chunks are no longer needed once stacked.
    del sparse_matrices

    # Save the combined sparse matrix
    print(f"-> Saving...")
    save_npz(output_file, final_sparse_matrix)

    print(f"Done.")
+
+
+
+
def process_slide_folders(slide_ids, folder_name):
    """
    Process all slide folders to consolidate chunk files into single files.

    For each slide, the layout of ``<folder_name>/<slide_id>`` decides what
    is consolidated:

    - ``images_chunk*.npy`` files present: images, types and patch ids are
      consolidated into ``images.npy``, ``types.npy`` and ``patch_ids.npy``,
      and the nested mask chunks into ``masks.npz`` / ``masks_cells.npz``.
      This layout takes precedence because consolidation leaves the chunk
      files on disk and creates ``images.npy``; checking ``images.npy``
      first would make a second run treat the nested mask chunks as flat
      ones.
    - otherwise, ``images.npy`` present: only the flat mask chunks are
      consolidated into ``masks.npz`` / ``masks_cells.npz``.
    - neither: the slide is skipped with a warning.

    Args:
        slide_ids (list): List of slide IDs to process.
        folder_name (str): Path to the parent folder containing slide subfolders.

    Returns:
        None.
    """
    skipped_slides = []

    for slide_id in slide_ids:
        print(f"\n===== PROCESSING SLIDE: {slide_id} =====")
        slide_folder = os.path.join(folder_name, slide_id)

        # isdir (not exists): a regular file with the slide name cannot be listed.
        if not os.path.isdir(slide_folder):
            print(f"Slide folder '{slide_folder}' does not exist. Skipping.")
            skipped_slides.append(slide_id)
            continue

        images_file = os.path.join(slide_folder, "images.npy")
        has_chunked_images = any(
            f.startswith("images_chunk") and f.endswith(".npy")
            for f in os.listdir(slide_folder)
        )

        if has_chunked_images:
            print(f"[INFO] Chunked 'images_chunk' files detected for slide {slide_id}.")
            consolidate_npy_chunks(slide_folder, "images_chunk", os.path.join(slide_folder, "images.npy"))
            consolidate_npy_chunks(slide_folder, "types_chunk", os.path.join(slide_folder, "types.npy"))
            consolidate_npy_chunks(slide_folder, "patch_ids_chunk", os.path.join(slide_folder, "patch_ids.npy"))
            consolidate_nested_chunks(slide_folder, "masks_chunk", os.path.join(slide_folder, "masks.npz"))
            consolidate_nested_chunks(slide_folder, "masks_cells_chunk", os.path.join(slide_folder, "masks_cells.npz"))
        elif os.path.exists(images_file):
            print(f"[INFO] Single 'images.npy' file detected for slide {slide_id}.")
            consolidate_chunks(slide_folder, "masks_chunk", os.path.join(slide_folder, "masks.npz"))
            consolidate_chunks(slide_folder, "masks_cells_chunk", os.path.join(slide_folder, "masks_cells.npz"))
        else:
            print(f"[WARNING] Neither 'images.npy' nor 'images_chunk' files found for slide {slide_id}. Skipping.")
            skipped_slides.append(slide_id)

    if skipped_slides:
        print(f"\nFinished. Skipped {len(skipped_slides)} slide(s): {', '.join(map(str, skipped_slides))}")
    else:
        print("\nAll slides processed successfully.")
+
+
+
+
if __name__ == "__main__":
    # Command-line entry point: read the slide IDs and the parent folder,
    # then consolidate the chunk files of every requested slide.
    cli = argparse.ArgumentParser(
        description="Consolidate sparse mask chunks for multiple slides."
    )
    cli.add_argument(
        "--slide_ids",
        type=str,
        nargs="+",
        required=True,
        help="List of slide IDs to process.",
    )
    cli.add_argument(
        "--folder_name",
        type=str,
        default="/Volumes/DD_FGS/MICS/data_HE2CellType/CT_DS/check_align_patches/patches_xenium",
        help="Parent folder containing slide subfolders.",
    )
    cli_args = cli.parse_args()

    process_slide_folders(cli_args.slide_ids, cli_args.folder_name)
@@ -0,0 +1,112 @@
+"""
+Before running this script, you need to apply CellVit to the patches to predict the comparison segmentation mask.
+In the HE2CT folder:
+- Prepare the dataset using cell_segmentation/datasets/prepare_pannuke.py with ['Neoplastic','Inflammatory','Connective','Dead','Epithelial'] as classes
+- Apply the Macenko normalization using cell_segmentation/datasets/macenko_normalization.py (optional: whether this step is needed is still an open question)
+- Convert the dataset to zip using cell_segmentation/datasets/convert_into_zip.py
+- Run CellVit inference (file cell_segmentation/inference/inference_cellvit_experiment_pannuke.py):
+    - modify config.yaml
+    - uncomment the part of inference_cellvit_experiment_pannuke.py that produces the instance_map predictions and the pixel-count predictions of the PanNuke label for the ground truth (look for the CHOOSE OR NOT marker)
+    - if you want to use the real cell type mask instead of the fake one: dynamically adapt the mask to handle PanNuke categories in datasets/pannuke.py by uncommenting type_map[type_map != 0] = 1 at the end of the load_maskfile function (look for the CHOOSE marker)
+    - --cell_tokens nucleus will be useful later for the H&E features
+    - run inference_cellvit_experiment_pannuke.py from the terminal: python3 cell_segmentation/inference/inference_cellvit_experiment_pannuke.py --run_dir /Volumes/DD_FGS/MICS/data_HE2CellType/CT_DS/check_align_patches/apply_cellvit/output_cellvit/heart_s0 --checkpoint_name CellViT-SAM-H-x40.pth --gpu mps --magnification 40 --cell_tokens nucleus
+    OR use ruche with slurm_cellvit_checkalign.sh
+Then use this file to add the metrics of each patch to the sdata object.
+!!!! If the slide was too big to run inference in a single pass, first run the script optional_group_output_cellvit.py to group the CellVit outputs into a single file for the given slide. !!!!
+"""
+
+import argparse
+import os
+import json
+import pandas as pd
+import spatialdata as sd
+
+
+
def open_json_metrics(output_cellvit_folder, slide_id):
    """
    Load the inference results JSON of one slide.

    Reads ``<output_cellvit_folder>/<slide_id>/inference_results.json``.
    The file is decoded as UTF-8 explicitly so the result does not depend
    on the platform's default text encoding.

    Args:
        output_cellvit_folder (str): Folder containing one subfolder per slide.
        slide_id (str): Name of the slide subfolder.

    Returns:
        The parsed JSON content.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    json_path = os.path.join(output_cellvit_folder, slide_id, 'inference_results.json')
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)
+
+
+
+
def build_df_metrics(metric_json_file):
    """
    Turn the 'image_metrics' entry of the results into a DataFrame.

    Each key of ``metric_json_file['image_metrics']`` becomes one row; the
    key itself is stored in an 'image' column and the values of the inner
    mapping become the remaining columns.

    Args:
        metric_json_file (dict): Parsed results containing an 'image_metrics' mapping.

    Returns:
        pandas.DataFrame: One row per image, with a default integer index.
    """
    per_image_metrics = metric_json_file['image_metrics']
    metrics_table = pd.DataFrame.from_dict(per_image_metrics, orient='index')

    # Move the image names out of the index into a regular 'image' column.
    return metrics_table.reset_index().rename(columns={'index': 'image'})
+
+
+
+
def add_metrics_in_sdata(sdata, df_metrics):
    """
    Attach the per-patch 'Dice', 'Jaccard' and 'bPQ' metrics to
    ``sdata.shapes['he_patches']`` and rewrite that element on disk.

    The patch id of each metrics row is the 'image' name with '.png'
    removed, converted to int. Metrics are joined to the patches on
    'patch_id'; patches without a metrics row get -1 in all three columns.

    Args:
        sdata: Object exposing ``shapes['he_patches']`` (a table with a
            'patch_id' key), ``delete_element_from_disk`` and ``write_element``.
        df_metrics (pandas.DataFrame): Table with columns 'image', 'Dice',
            'Jaccard' and 'bPQ'. It is not modified.

    Raises:
        pandas.errors.MergeError: If several metrics rows map to the same patch id.
    """
    metric_columns = ['Dice', 'Jaccard', 'bPQ']

    # Work on a copy so the caller's table is left untouched.
    metrics = df_metrics[['image'] + metric_columns].copy()
    # regex=False: '.png' is a literal suffix, the dot must not match any character.
    metrics['patch_id'] = metrics['image'].str.replace('.png', '', regex=False).astype(int)
    metrics = metrics.drop(columns='image')

    # Drop metric columns left by a previous run; otherwise the merge would
    # produce suffixed 'Dice_x' / 'Dice_y' columns.
    he_patches = sdata.shapes['he_patches'].drop(columns=metric_columns, errors='ignore')

    # A left merge keeps one output row per patch, in patch order, as long as
    # each patch id appears at most once in the metrics (enforced by validate).
    he_patches = he_patches.merge(metrics, on='patch_id', how='left', validate='many_to_one')

    filled_metrics = he_patches[metric_columns].fillna(-1) # -1 will correspond to no cell in xenium mask

    for column in metric_columns:
        # merge() returns a fresh 0..n-1 index, so assign by position: assigning
        # a Series would align on index and misplace values when the patches
        # table uses any other index.
        sdata.shapes['he_patches'][column] = filled_metrics[column].to_numpy()

    print(sdata.shapes['he_patches'].head())
    print("\n\n")
    print(sdata)

    print("\n\nSaving on disk...")
    sdata.delete_element_from_disk("he_patches")
    sdata.write_element("he_patches")
    print("Done.")
+
+
+
+
def main(args):
    """
    Load the metrics of one slide and add them to its sdata object.

    Args:
        args: Parsed arguments providing ``slide_id``,
            ``output_cellvit_folder`` and ``sdata_folder``.
    """
    print(f"\n==== Proccessing {args.slide_id} ====")

    # Get metrics output from CellVit
    print("Loading metrics...")
    metric_json_file = open_json_metrics(args.output_cellvit_folder, args.slide_id)
    df_metrics = build_df_metrics(metric_json_file)

    # Load sdata
    print("Loading sdata...")
    sdata_path = os.path.join(args.sdata_folder, f'sdata_{args.slide_id}.zarr')
    sdata = sd.read_zarr(sdata_path, selection=('shapes',))

    # Remove metric columns left by a previous run. Each column is checked on
    # its own so that one missing column does not leave the others in place.
    for column in ('Dice', 'Jaccard', 'bPQ'):
        if column in sdata.shapes['he_patches'].columns:
            del sdata.shapes['he_patches'][column]

    print(sdata.shapes['he_patches'].head())

    # Add metrics in sdata
    print("\n\nAdding metrics in sdata...")
    add_metrics_in_sdata(sdata, df_metrics)
+
+
+
if __name__ == "__main__":
    # Command-line entry point: collect the slide id and the two input
    # folders, then hand them to main().
    cli = argparse.ArgumentParser(
        description="Add metrics to check alignment patches in sdata"
    )
    cli.add_argument(
        "--slide_id",
        type=str,
        default="heart_s0",
        help="Slide id",
    )
    cli.add_argument(
        "--output_cellvit_folder",
        type=str,
        default="/Volumes/DD_FGS/MICS/data_HE2CellType/CT_DS/check_align_patches/apply_cellvit/output_cellvit",
        help="Output folder of CellVit for align checking",
    )
    cli.add_argument(
        "--sdata_folder",
        type=str,
        default="/Volumes/SAUV_FGS/MICS/data_HE2CellType/CT_DS/sdata_final",
        help="Folder containing final sdata",
    )

    main(cli.parse_args())