From 606cde8a29a618859915977fe5a783d7751a025d Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 10:44:53 -0500 Subject: [PATCH 01/36] Add minian to `Processing` table --- element_miniscope/miniscope.py | 1050 +++++++++++++++++++++++++++----- setup.py | 4 + 2 files changed, 895 insertions(+), 159 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 98a611c..d1c42b3 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -382,7 +382,6 @@ def make(self, key): nframes = total_frames nchannels = 1 # Assumes a single channel - elif acq_software == "Inscopix": inscopix_metadata = next(recording_path.glob("session.json")) @@ -472,7 +471,10 @@ class ProcessingMethod(dj.Lookup): processing_method_desc: varchar(1000) """ - contents = [("caiman", "caiman analysis suite")] + contents = [ + ("caiman", "caiman analysis suite"), + ("minian", "minian analysis suite"), + ] @schema @@ -717,6 +719,7 @@ def make_fetch(self, key): avi_files = (RecordingInfo.File & key).fetch("file_path") processing_params = (ProcessingParamSet & key).fetch1("params") sampling_rate = (RecordingInfo & key).fetch1("fps") + px_height, px_width = (RecordingInfo & key).fetch1("px_height", "px_width") return ( task_mode, @@ -725,6 +728,8 @@ def make_fetch(self, key): avi_files, processing_params, sampling_rate, + px_height, + px_width, ) def make_compute( @@ -736,13 +741,15 @@ def make_compute( avi_files, processing_params, sampling_rate, + px_height, + px_width, ): """ Execute the miniscope analysis defined by the ProcessingTask. - task_mode: 'load', confirm that the results are already computed. - task_mode: 'trigger' runs the analysis. 
""" - if method != "caiman": + if method not in ["caiman", "minian"]: raise NotImplementedError(f"Method {method} is not supported") params = copy.deepcopy(processing_params) @@ -766,182 +773,594 @@ def make_compute( if method == "caiman": loaded_caiman = loaded_result key = {**key, "processing_time": loaded_caiman.creation_time} + elif method == "minian": + loaded_minian = loaded_result + key = {**key, "processing_time": loaded_minian.creation_time} else: raise NotImplementedError( f"Loading of {method} data is not yet supported" ) elif task_mode == "trigger": - import multiprocessing - import caiman as cm - from caiman.motion_correction import MotionCorrect - from caiman.source_extraction.cnmf.cnmf import CNMF - from caiman.source_extraction.cnmf.params import CNMFParams - from element_interface.run_caiman import _save_mc - - extra_params = params.pop("extra_dj_params", {}) - avi_files = [ find_full_path(get_miniscope_root_data_dir(), avi_file).as_posix() for avi_file in avi_files ] - params["fnames"] = avi_files - params["fr"] = sampling_rate - params["is3D"] = False - if "indices" in params: - params["motion"] = { - "indices": ( - slice(*params.get("indices")[0]), - slice(*params.get("indices")[1]), - ) - } - else: - params["motion"] = {"indices": (slice(None), slice(None))} + if method == "caiman": + import multiprocessing + import caiman as cm + from caiman.motion_correction import MotionCorrect + from caiman.source_extraction.cnmf.cnmf import CNMF + from caiman.source_extraction.cnmf.params import CNMFParams + from element_interface.run_caiman import _save_mc + + extra_params = params.pop("extra_dj_params", {}) + + params["fnames"] = avi_files + params["fr"] = sampling_rate + params["is3D"] = False + if "indices" in params: + params["motion"] = { + "indices": ( + slice(*params.get("indices")[0]), + slice(*params.get("indices")[1]), + ) + } + else: + params["motion"] = {"indices": (slice(None), slice(None))} - @memoized_result( - uniqueness_dict=params, - 
output_directory=output_dir, - ) - def _run_processing(): - mc_indices = params["motion"].get("indices") - caiman_temp = os.environ.get("CAIMAN_TEMP") - os.environ["CAIMAN_TEMP"] = str(output_dir) - n_processes = np.floor(multiprocessing.cpu_count() * 0.6) - n_processes = int(os.getenv("CAIMAN_MC_N_PROCESSES", n_processes)) - _, dview, n_processes = cm.cluster.setup_cluster( - backend="multiprocessing", - n_processes=n_processes, - maxtasksperchild=1, + @memoized_result( + uniqueness_dict=params, + output_directory=output_dir, ) - try: - opts = CNMFParams(params_dict=params) - cnm = CNMF(n_processes, params=opts, dview=dview) - fnames = cnm.params.get("data", "fnames") - mc = MotionCorrect(fnames, dview=cnm.dview, **cnm.params.motion) - mc_base_attrs = list(mc.__dict__) - logger.info("Starting motion correction (CaImAn)...") - mc.motion_correct(save_movie=mc_indices is None) - mc_results = { - k: v for k, v in mc.__dict__.items() if k not in mc_base_attrs - } - if cnm.params.get("motion", "pw_rigid"): - mc_results["b0"] = np.ceil( - np.max(np.abs(mc.shifts_rig)) - ).astype(int) - cnm.estimates.shifts = mc.shifts_rig - if cnm.params.get("motion", "is3D"): - cnm.estimates.shifts = [ - mc.x_shifts_els, - mc.y_shifts_els, - mc.z_shifts_els, - ] - else: - cnm.estimates.shifts = [mc.x_shifts_els, mc.y_shifts_els] - else: - mc_results["b0"] = np.ceil( - np.max(np.abs(mc.shifts_rig)) - ).astype(int) - cnm.estimates.shifts = mc.shifts_rig - - base_name = pathlib.Path(fnames[0]).stem - fname_mc = ( - mc.fname_tot_els - if cnm.params.motion["pw_rigid"] - else mc.fname_tot_rig + def _run_processing(): + mc_indices = params["motion"].get("indices") + caiman_temp = os.environ.get("CAIMAN_TEMP") + os.environ["CAIMAN_TEMP"] = str(output_dir) + n_processes = np.floor(multiprocessing.cpu_count() * 0.6) + n_processes = int(os.getenv("CAIMAN_MC_N_PROCESSES", n_processes)) + _, dview, n_processes = cm.cluster.setup_cluster( + backend="multiprocessing", + n_processes=n_processes, + 
maxtasksperchild=1, ) - if all(fname_mc): - logger.info("Generating C-order memmap file...") - border_to_0 = 0 if mc.border_nan == "copy" else mc.border_to_0 - fname_new = cm.mmapping.save_memmap( - fname_mc, - base_name=base_name + "_mc", - order="C", - var_name_hdf5=cnm.params.get("data", "var_name_hdf5"), - border_to_0=border_to_0, + try: + opts = CNMFParams(params_dict=params) + cnm = CNMF(n_processes, params=opts, dview=dview) + fnames = cnm.params.get("data", "fnames") + mc = MotionCorrect(fnames, dview=cnm.dview, **cnm.params.motion) + mc_base_attrs = list(mc.__dict__) + logger.info("Starting motion correction (CaImAn)...") + mc.motion_correct(save_movie=mc_indices is None) + mc_results = { + k: v + for k, v in mc.__dict__.items() + if k not in mc_base_attrs + } + if cnm.params.get("motion", "pw_rigid"): + mc_results["b0"] = np.ceil( + np.max(np.abs(mc.shifts_rig)) + ).astype(int) + cnm.estimates.shifts = mc.shifts_rig + if cnm.params.get("motion", "is3D"): + cnm.estimates.shifts = [ + mc.x_shifts_els, + mc.y_shifts_els, + mc.z_shifts_els, + ] + else: + cnm.estimates.shifts = [ + mc.x_shifts_els, + mc.y_shifts_els, + ] + else: + mc_results["b0"] = np.ceil( + np.max(np.abs(mc.shifts_rig)) + ).astype(int) + cnm.estimates.shifts = mc.shifts_rig + + base_name = pathlib.Path(fnames[0]).stem + fname_mc = ( + mc.fname_tot_els + if cnm.params.motion["pw_rigid"] + else mc.fname_tot_rig ) - else: + if all(fname_mc): + logger.info("Generating C-order memmap file...") + border_to_0 = ( + 0 if mc.border_nan == "copy" else mc.border_to_0 + ) + fname_new = cm.mmapping.save_memmap( + fname_mc, + base_name=base_name + "_mc", + order="C", + var_name_hdf5=cnm.params.get("data", "var_name_hdf5"), + border_to_0=border_to_0, + ) + else: + logger.info( + "Applying shifts, then generating C-order memmap file..." 
+ ) + fname_new = mc.apply_shifts_movie( + fnames, + save_memmap=True, + save_base_name=base_name + "_mc", + order="C", + ) + mc.mmap_file = [fname_new] + Yr, dims, T = cm.mmapping.load_memmap(fname_new) + images = np.reshape(Yr.T, [T] + list(dims), order="F") + cnm.mmap_file = fname_new + # terminate the previous cluster and setup a new one with fewer + # processes for CNMF because it is memory intensive + dview.terminate() + n_processes = np.floor(multiprocessing.cpu_count() * 0.2) + n_processes = int( + os.getenv("CAIMAN_CNMF_N_PROCESSES", n_processes) + ) + _, dview, n_processes = cm.cluster.setup_cluster( + backend="multiprocessing", + n_processes=n_processes, + maxtasksperchild=1, + ) + cnm.dview = dview logger.info( - "Applying shifts, then generating C-order memmap file..." + f"Starting CNMF analysis with {n_processes} processes..." ) - fname_new = mc.apply_shifts_movie( - fnames, - save_memmap=True, - save_base_name=base_name + "_mc", - order="C", + + cnm.fit(images, indices=(slice(None), slice(None))) + cnm.estimates.evaluate_components( + images, cnm.params, dview=cnm.dview ) - mc.mmap_file = [fname_new] - Yr, dims, T = cm.mmapping.load_memmap(fname_new) - images = np.reshape(Yr.T, [T] + list(dims), order="F") - cnm.mmap_file = fname_new - # terminate the previous cluster and setup a new one with fewer - # processes for CNMF because it is memory intensive - dview.terminate() - n_processes = np.floor(multiprocessing.cpu_count() * 0.2) - n_processes = int(os.getenv("CAIMAN_CNMF_N_PROCESSES", n_processes)) - _, dview, n_processes = cm.cluster.setup_cluster( - backend="multiprocessing", - n_processes=n_processes, - maxtasksperchild=1, + cnm.estimates.detrend_df_f(quantileMin=8, frames_window=250) + logger.info("Computing summary images...") + correlation_image, _ = cm.summary_images.correlation_pnr( + images[:: max(T // 1000, 1)], + gSig=cnm.params.init["gSig"][0], + swap_dim=False, + ) + correlation_image[np.isnan(correlation_image)] = 0 + cnm.estimates.Cn 
= correlation_image + fname_hdf5 = cnm.mmap_file[:-4] + "hdf5" + cnm.save(fname_hdf5) + cnmf_output_file = pathlib.Path(fname_hdf5) + summary_images = { + "average_image": np.mean( + images[:: max(T // 1000, 1)], axis=0 + ), + "max_image": np.max(images[:: max(T // 1000, 1)], axis=0), + "correlation_image": correlation_image, + } + _save_mc( + mc, + cnmf_output_file.as_posix(), + params["is3D"], + summary_images=summary_images, + ) + except Exception as e: + dview.terminate() + raise e + else: + cm.stop_server(dview=dview) + logger.info("CNMF analysis complete. Resulted saved.") + caiman_temp = os.environ.get("CAIMAN_TEMP") + if caiman_temp is not None: + os.environ["CAIMAN_TEMP"] = caiman_temp + else: + del os.environ["CAIMAN_TEMP"] + + _run_processing() + _, imaging_dataset = get_loader_result( + key, ProcessingTask, full_output_dir=output_dir + ) + caiman_dataset = imaging_dataset + key["processing_time"] = caiman_dataset.creation_time + key["package_version"] = cm.__version__ + file_entries = [ + { + **key, + "file_name": f.relative_to( + get_processed_root_data_dir() + ).as_posix(), + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] + + elif method == "minian": + import multiprocessing + import psutil + from dask.distributed import Client, LocalCluster + from minian.cnmf import ( + compute_AtC, + compute_trace, + get_noise_fft, + smooth_sig, + unit_merge, + update_spatial, + update_temporal, + update_background, + ) + from minian.initialization import ( + gmm_refine, + initA, + initC, + intensity_refine, + ks_refine, + pnr_refine, + seeds_init, + seeds_merge, + ) + from minian.motion_correction import apply_transform, estimate_motion + from minian.preprocessing import denoise, remove_background + + from minian.utilities import ( + TaskAnnotation, + get_optimal_chk, + load_videos, + open_minian, + save_minian, + ) + from minian.visualization import ( + CNMFViewer, + export_plot, + VArrayViewer, + generate_videos, + 
visualize_gmm_fit, + visualize_motion, + visualize_preprocess, + visualize_seeds, + visualize_spatial_update, + visualize_temporal_update, + write_video, + ) + + # Setup Dask cluster with env vars or auto-config + n_workers = int( + os.getenv( + "MINIAN_NWORKERS", + max(1, int(multiprocessing.cpu_count() * 0.8)), + ) + ) + memory_total = psutil.virtual_memory().total + memory_per_worker = int(memory_total * 0.8 / n_workers / 1e9) + memory_limit = os.getenv( + "MINIAN_MEMORY_LIMIT", f"{memory_per_worker}GB" + ) + + # Set intermediate storage path + temp_path = str(output_dir / "intermediate") + os.makedirs(temp_path, exist_ok=True) + os.environ["MINIAN_INTERMEDIATE"] = temp_path + + # Start Dask cluster + logger.info( + f"Starting Minian processing with {n_workers} workers, {memory_limit} memory limit..." + ) + cluster = LocalCluster( + n_workers=n_workers, + memory_limit=memory_limit, + resources={"MEM": 1}, + threads_per_worker=2, + dashboard_address=None, # Disable dashboard in automated pipeline + ) + annotation_plugin = TaskAnnotation() + cluster.scheduler.add_plugin(annotation_plugin) + client = Client(cluster) + + try: + + # ===== LOAD VIDEOS ===== + logger.info("Loading videos...") + param_load_videos = params.get( + "param_load_videos", + {"pattern": "r'.*\.avi$'", "downsample_strategy": "subset"}, + ) + video_array = load_videos( + str(avi_files[0].parent), **param_load_videos + ) + chunk_size, _ = get_optimal_chk(video_array, dtype=float) + video_array = video_array.chunk( + {"frame": chunk_size["frame"], "height": -1, "width": -1} ) - cnm.dview = dview - logger.info(f"Starting CNMF analysis with {n_processes} processes...") - cnm.fit(images, indices=(slice(None), slice(None))) - cnm.estimates.evaluate_components( - images, cnm.params, dview=cnm.dview + # ===== PREPROCESSING ===== + logger.info("Preprocessing: glow removal...") + subset = None + video_array_base_ref = video_array.sel(subset) + video_min_removed = 
video_array_base_ref.min("frame").compute() + video_array_ref = video_array_base_ref - video_min_removed + logger.info("Preprocessing: denoising...") + param_denoise = params.get( + "param_denoise", {"method": "median", "ksize": 7} ) - cnm.estimates.detrend_df_f(quantileMin=8, frames_window=250) - logger.info("Computing summary images...") - correlation_image, _ = cm.summary_images.correlation_pnr( - images[:: max(T // 1000, 1)], - gSig=cnm.params.init["gSig"][0], - swap_dim=False, + video_array_denoised = denoise(video_array_ref, **param_denoise) + + logger.info("Preprocessing: background removal...") + param_background_removal = params.get( + "param_background_removal", {"method": "tophat", "wnd": 10} ) - correlation_image[np.isnan(correlation_image)] = 0 - cnm.estimates.Cn = correlation_image - fname_hdf5 = cnm.mmap_file[:-4] + "hdf5" - cnm.save(fname_hdf5) - cnmf_output_file = pathlib.Path(fname_hdf5) - summary_images = { - "average_image": np.mean(images[:: max(T // 1000, 1)], axis=0), - "max_image": np.max(images[:: max(T // 1000, 1)], axis=0), - "correlation_image": correlation_image, - } - _save_mc( - mc, - cnmf_output_file.as_posix(), - params["is3D"], - summary_images=summary_images, + video_array_bg_removed = remove_background( + video_array_denoised, **param_background_removal + ) + + logger.info("Preprocessing: saving pre-processed video...") + video_array_preprocessed = save_minian( + video_array_bg_removed.rename("video_array_preprocessed"), + dpath=temp_path, + overwrite=True, + ) + + # ===== MOTION CORRECTION ===== + logger.info("Estimating motion...") + param_estimate_motion = params.get( + "param_estimate_motion", {"dim": "frame"} + ) + motion = estimate_motion( + video_array_preprocessed, **param_estimate_motion + ) + motion = save_minian( + motion.rename("motion"), dpath=temp_path, overwrite=True + ) + + logger.info("Applying motion correction...") + Y = apply_transform(video_array_preprocessed, motion, fill=0) + Y_fm_chk = save_minian( + 
Y.astype(float).rename("Y_fm_chk"), temp_path, overwrite=True + ) + Y_hw_chk = save_minian( + Y_fm_chk.rename("Y_hw_chk"), + temp_path, + overwrite=True, + chunks={ + "frame": -1, + "height": chunk_size["height"], + "width": chunk_size["width"], + }, + ) + write_video(Y_fm_chk, "motion_corrected_movie.mp4", output_dir) + + logger.info("Motion Correction: Create and save max projection...") + max_proj = Y_fm_chk.max("frame").compute() + max_proj = save_minian( + max_proj.rename("max_proj"), + **{"dpath": output_dir, "overwrite": True}, + ) + + # ===== INITIALIZATION ===== + logger.info("Initializing seeds...") + param_seeds_init = params.get("param_seeds_init", {}) + seeds = seeds_init(Y_fm_chk, **param_seeds_init) + + logger.info("Refining seeds with PNR...") + param_pnr_refine = params.get("param_pnr_refine", {}) + seeds, pnr, gmm = pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) + + logger.info("Refining seeds with KS test...") + param_ks_refine = params.get("param_ks_refine", {}) + seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) + + logger.info("Merging seeds...") + param_seeds_merge = params.get("param_seeds_merge", {}) + seeds_final = seeds[ + seeds["mask_ks"] & seeds["mask_pnr"] + ].reset_index(drop=True) + seeds_final = seeds_merge( + Y_hw_chk, max_proj, seeds_final, **param_seeds_merge + ) + + logger.info( + "Initializing spatial footprints (A) and temporal traces (C)..." 
+ ) + param_initialize = params.get("param_initialize", {}) + A_init = initA( + Y_hw_chk, + seeds_final[seeds_final["mask_mrg"]], + **param_initialize, + ) + A_init = save_minian( + A_init.rename("A_init"), temp_path, overwrite=True + ) + + C_init = initC(Y_fm_chk, A_init) + C_init = save_minian( + C_init.rename("C_init"), + temp_path, + overwrite=True, + chunks={"unit_id": 1, "frame": -1}, + ) + + # Initial merge + logger.info("Merging initial components...") + param_init_merge = params.get("param_init_merge", {}) + A, C = unit_merge(A_init, C_init, **param_init_merge) + A = save_minian(A.rename("A"), temp_path, overwrite=True) + C = save_minian(C.rename("C"), temp_path, overwrite=True) + C_chk = save_minian( + C.rename("C_chk"), + temp_path, + overwrite=True, + chunks={"unit_id": -1, "frame": chunk_size["frame"]}, + ) + + # ===== CNMF - FIRST ITERATION ===== + logger.info("CNMF: Computing noise statistics...") + param_get_noise = params.get("param_get_noise", {}) + sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) + sn_spatial = save_minian( + sn_spatial.rename("sn_spatial"), temp_path, overwrite=True + ) + + # Initialize background + logger.info("CNMF: Initializing background...") + b, f = update_background(Y_fm_chk, A, C_chk) + b = save_minian(b.rename("b"), temp_path, overwrite=True) + f = save_minian(f.rename("f"), temp_path, overwrite=True) + + # First spatial update + logger.info("CNMF: First spatial update...") + param_first_spatial = params.get("param_first_spatial", {}) + A_new, mask_spatial = update_spatial( + Y_hw_chk, A, C, sn_spatial, **param_first_spatial + ) + A = save_minian( + A_new.rename("A"), + temp_path, + overwrite=True, + chunks={"unit_id": 1, "height": -1, "width": -1}, + ) + + # First temporal update + logger.info("CNMF: First temporal update...") + param_first_temporal = params.get("param_first_temporal", {}) + YrA = compute_trace(Y_fm_chk, A, b, C_chk, f) + YrA = save_minian( + YrA.rename("YrA"), + temp_path, + overwrite=True, + 
chunks={"unit_id": 1, "frame": -1}, + ) + C_new, S_new, b0_new, c0_new, g, mask_temporal = update_temporal( + A, C, YrA=YrA, **param_first_temporal + ) + C = save_minian( + C_new.rename("C").chunk({"unit_id": 1, "frame": -1}), + temp_path, + overwrite=True, + ) + C_chk = save_minian( + C.rename("C_chk"), + temp_path, + overwrite=True, + chunks={"unit_id": -1, "frame": chunk_size["frame"]}, + ) + S = save_minian( + S_new.rename("S").chunk({"unit_id": 1, "frame": -1}), + temp_path, + overwrite=True, + ) + b0 = save_minian( + b0_new.rename("b0").chunk({"unit_id": 1, "frame": -1}), + temp_path, + overwrite=True, + ) + c0 = save_minian( + c0_new.rename("c0").chunk({"unit_id": 1, "frame": -1}), + temp_path, + overwrite=True, + ) + + # First merge + logger.info("CNMF: Merging after first iteration...") + param_first_merge = params.get("param_first_merge", {}) + A, C = unit_merge(A, C, **param_first_merge) + A = save_minian(A.rename("A"), temp_path, overwrite=True) + C = save_minian(C.rename("C"), temp_path, overwrite=True) + C_chk = save_minian( + C.rename("C_chk"), + temp_path, + overwrite=True, + chunks={"unit_id": -1, "frame": chunk_size["frame"]}, + ) + + # ===== CNMF - SECOND ITERATION ===== + # Second spatial update + logger.info("CNMF: Second spatial update...") + param_second_spatial = params.get("param_second_spatial", {}) + A_new, mask_spatial = update_spatial( + Y_hw_chk, A, C, sn_spatial, **param_second_spatial + ) + A = save_minian( + A_new.rename("A"), + temp_path, + overwrite=True, + chunks={"unit_id": 1, "height": -1, "width": -1}, + ) + b_new, f_new = update_background(Y_fm_chk, A, C_chk) + b = save_minian(b_new.rename("b"), temp_path, overwrite=True) + f = save_minian( + f_new.chunk({"frame": chunk_size["frame"]}).rename("f"), + temp_path, + overwrite=True, + ) + + # Second temporal update + logger.info("CNMF: Second temporal update...") + param_second_temporal = params.get("param_second_temporal", {}) + YrA = compute_trace(Y_fm_chk, A, b, C_chk, f) + 
YrA = save_minian( + YrA.rename("YrA"), + temp_path, + overwrite=True, + chunks={"unit_id": 1, "frame": -1}, + ) + C_new, S_new, b0_new, c0_new, g, mask_temporal = update_temporal( + A, C, YrA=YrA, **param_second_temporal + ) + + # Apply temporal mask to keep only valid units + A = A.sel(unit_id=C_new.coords["unit_id"].values) + + # ===== SAVE FINAL RESULTS ===== + logger.info("Saving final results...") + A = save_minian( + A.rename("A"), **{"dpath": output_dir, "overwrite": True} + ) + C = save_minian( + C_new.rename("C"), **{"dpath": output_dir, "overwrite": True} + ) + S = save_minian( + S_new.rename("S"), **{"dpath": output_dir, "overwrite": True} + ) + c0 = save_minian( + c0_new.rename("c0"), **{"dpath": output_dir, "overwrite": True} + ) + b0 = save_minian( + b0_new.rename("b0"), **{"dpath": output_dir, "overwrite": True} + ) + b = save_minian( + b_new.rename("b"), **{"dpath": output_dir, "overwrite": True} + ) + f = save_minian( + f_new.rename("f"), **{"dpath": output_dir, "overwrite": True} + ) + motion = save_minian( + motion.rename("motion"), + **{"dpath": output_dir, "overwrite": True}, + ) + logger.info( + f"Minian processing complete. {A.sizes['unit_id']} units detected." ) except Exception as e: - dview.terminate() + client.close() + cluster.close() + logger.error(f"Minian processing failed: {e}") raise e - else: - cm.stop_server(dview=dview) - logger.info("CNMF analysis complete. 
Resulted saved.") - caiman_temp = os.environ.get("CAIMAN_TEMP") - if caiman_temp is not None: - os.environ["CAIMAN_TEMP"] = caiman_temp - else: - del os.environ["CAIMAN_TEMP"] + finally: + client.close() + cluster.close() + # Load results and prepare for insertion + minian_loader = MinianLoader(output_dir) + key["processing_time"] = minian_loader.creation_time + + # Get minian version if available + try: + import minian + + key["package_version"] = getattr(minian, "__version__", "") + except (ImportError, AttributeError): + key["package_version"] = "" + + file_entries = [ + { + **key, + "file_name": f.name, + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] - _run_processing() - _, imaging_dataset = get_loader_result( - key, ProcessingTask, full_output_dir=output_dir - ) - caiman_dataset = imaging_dataset - key["processing_time"] = caiman_dataset.creation_time - key["package_version"] = cm.__version__ - file_entries = [ - { - **key, - "file_name": f.relative_to( - get_processed_root_data_dir() - ).as_posix(), - "file": f.as_posix(), - } - for f in output_dir.rglob("*") - if f.is_file() - ] else: raise ValueError(f"Unknown task mode: {task_mode}") return (file_entries, output_dir) @@ -1131,6 +1550,38 @@ def make(self, key): } self.Summary.insert1(summary_images) + elif method == "minian": + minian_dataset = loaded_result + + self.insert1( + {**key, "motion_correct_channel": minian_dataset.alignment_channel} + ) + + # Minian uses rigid motion correction + rigid_correction = minian_dataset.extract_rigid_mc() + if rigid_correction is not None: + rigid_correction.update(**key) + self.RigidMotionCorrection.insert1(rigid_correction) + + # -- summary images -- + ref_image = minian_dataset.ref_image + mean_image = minian_dataset.mean_image + max_proj_image = minian_dataset.max_proj_image + correlation_image = minian_dataset.correlation_map + + summary_images = { + **key, + "ref_image": ( + ref_image if ref_image is not None else np.zeros((1, 
1, 1)) + ), + "average_image": ( + mean_image if mean_image is not None else np.zeros((1, 1, 1)) + ), + "correlation_image": correlation_image, + "max_proj_image": max_proj_image, + } + self.Summary.insert1(summary_images) + else: raise NotImplementedError("Unknown/unimplemented method: {}".format(method)) @@ -1238,6 +1689,57 @@ def make(self, key): cells, ignore_extra_fields=True, allow_direct_insert=True ) + elif method == "minian": + minian_dataset = loaded_result + + # infer "segmentation_channel" - from params if available, else from minian loader + params = (ProcessingParamSet * ProcessingTask & key).fetch1("params") + segmentation_channel = params.get( + "segmentation_channel", minian_dataset.segmentation_channel + ) + + masks, cells = [], [] + for mask in minian_dataset.masks: + masks.append( + { + **key, + "segmentation_channel": segmentation_channel, + "mask": mask["mask_id"], + "mask_npix": mask["mask_npix"], + "mask_center_x": mask["mask_center_x"], + "mask_center_y": mask["mask_center_y"], + "mask_center_z": mask["mask_center_z"], + "mask_xpix": mask["mask_xpix"], + "mask_ypix": mask["mask_ypix"], + "mask_zpix": mask["mask_zpix"], + "mask_weights": mask["mask_weights"], + } + ) + if mask["accepted"]: + cells.append( + { + **key, + "mask_classification_method": "minian_default_classifier", + "mask": mask["mask_id"], + "mask_type": "soma", + } + ) + + self.insert1(key) + self.Mask.insert(masks, ignore_extra_fields=True) + + if cells: + MaskClassification.insert1( + { + **key, + "mask_classification_method": "minian_default_classifier", + }, + allow_direct_insert=True, + ) + MaskClassification.MaskType.insert( + cells, ignore_extra_fields=True, allow_direct_insert=True + ) + else: raise NotImplementedError(f"Unknown/unimplemented method: {method}") @@ -1255,7 +1757,7 @@ class MaskClassificationMethod(dj.Lookup): mask_classification_method: varchar(48) """ - contents = zip(["caiman_default_classifier"]) + contents = zip(["caiman_default_classifier", 
"minian_default_classifier"]) @schema @@ -1360,6 +1862,29 @@ def make(self, key): self.insert1(key) self.Trace.insert(fluo_traces) + elif method == "minian": + minian_dataset = loaded_result + + # infer "segmentation_channel" - from params if available, else from minian loader + params = (ProcessingParamSet * ProcessingTask & key).fetch1("params") + segmentation_channel = params.get( + "segmentation_channel", minian_dataset.segmentation_channel + ) + + fluo_traces = [] + for mask in minian_dataset.masks: + fluo_traces.append( + { + **key, + "mask": mask["mask_id"], + "fluorescence_channel": segmentation_channel, + "fluorescence": mask["inferred_trace"], + } + ) + + self.insert1(key) + self.Trace.insert(fluo_traces) + else: raise NotImplementedError("Unknown/unimplemented method: {}".format(method)) @@ -1369,14 +1894,14 @@ class ActivityExtractionMethod(dj.Lookup): """Lookup table for activity extraction methods. Attributes: - extraction_method (foreign key, varchar(32) ): Extraction method from CaImAn. + extraction_method (foreign key, varchar(32) ): Extraction method from CaImAn or Minian. 
""" definition = """ extraction_method: varchar(32) """ - contents = zip(["caiman_deconvolution", "caiman_dff"]) + contents = zip(["caiman_deconvolution", "caiman_dff", "minian_deconvolution"]) @schema @@ -1421,7 +1946,15 @@ def key_source(self): & 'extraction_method LIKE "caiman%"' ) - return caiman_key_source.proj() + minian_key_source = ( + Fluorescence + * ActivityExtractionMethod + * ProcessingParamSet.proj("processing_method") + & 'processing_method = "minian"' + & 'extraction_method LIKE "minian%"' + ) + + return caiman_key_source.proj() + minian_key_source.proj() def make(self, key): """Populates table with activity trace data.""" @@ -1456,6 +1989,28 @@ def make(self, key): for mask in caiman_dataset.masks ) + elif method == "minian": + minian_dataset = loaded_result + + if key["extraction_method"] == "minian_deconvolution": + # infer "segmentation_channel" - from params if available, else from minian loader + params = (ProcessingParamSet * ProcessingTask & key).fetch1("params") + segmentation_channel = params.get( + "segmentation_channel", minian_dataset.segmentation_channel + ) + + self.insert1(key) + self.Trace.insert( + dict( + key, + mask=mask["mask_id"], + fluorescence_channel=segmentation_channel, + activity_trace=mask["spikes"], + ) + for mask in minian_dataset.masks + if "spikes" in mask + ) + else: raise NotImplementedError("Unknown/unimplemented method: {}".format(method)) @@ -1526,6 +2081,181 @@ def make(self, key): # Helper Functions --------------------------------------------------------------------- +class MinianLoader: + """Loader class for Minian analysis results. + + Provides a consistent interface for accessing Minian outputs similar to CaImAn loader. + """ + + def __init__(self, output_dir): + """Initialize the MinianLoader. + + Args: + output_dir: Path to the directory containing Minian zarr outputs. 
+ """ + from minian.utilities import open_minian + + self.output_dir = pathlib.Path(output_dir) + self._minian_ds = open_minian(str(self.output_dir)) + + # Load core arrays + self._A = self._minian_ds.get( + "A" + ) # Spatial footprints (unit_id, height, width) + self._C = self._minian_ds.get("C") # Temporal traces (unit_id, frame) + self._S = self._minian_ds.get( + "S" + ) # Deconvolved activity/spikes (unit_id, frame) + self._b = self._minian_ds.get("b") # Background spatial (height, width) + self._f = self._minian_ds.get("f") # Background temporal (frame) + self._b0 = self._minian_ds.get("b0") # Baseline (unit_id, frame) + self._c0 = self._minian_ds.get("c0") # Initial calcium (unit_id, frame) + + # Try to load motion correction data + self._motion = self._minian_ds.get("motion") + + # Try to load reference/max projection images + self._max_proj = self._minian_ds.get("max_proj") + self._varr_ref = self._minian_ds.get("varr_ref") + + @property + def minian_dataset(self): + """Return the raw Minian xarray Dataset.""" + return self._minian_ds + + @property + def creation_time(self): + """Get the creation time of the Minian output.""" + # Use the modification time of the output directory + return datetime.fromtimestamp(self.output_dir.stat().st_mtime, tz=timezone.utc) + + @property + def alignment_channel(self): + """Channel used for motion correction (default 0 for miniscope).""" + return 0 + + @property + def segmentation_channel(self): + """Channel used for segmentation (default 0 for miniscope).""" + return 0 + + @property + def is_pw_rigid(self): + """Minian uses rigid motion correction by default.""" + return False + + @property + def motion_shifts(self): + """Return motion correction shifts as dict with 'x' and 'y' keys.""" + if self._motion is not None: + motion_data = self._motion.compute() + return { + "x": motion_data.sel(shift_dim="width").values, + "y": motion_data.sel(shift_dim="height").values, + } + return None + + def extract_rigid_mc(self): + 
"""Extract rigid motion correction data in format compatible with MotionCorrection table.""" + shifts = self.motion_shifts + if shifts is None: + return None + + return { + "x_shifts": shifts["x"], + "y_shifts": shifts["y"], + "x_std": np.std(shifts["x"]), + "y_std": np.std(shifts["y"]), + } + + @property + def ref_image(self): + """Return reference image used for motion correction.""" + if self._varr_ref is not None: + # Take mean across frames for reference + return self._varr_ref.mean(dim="frame").compute().values[np.newaxis, :, :] + return None + + @property + def mean_image(self): + """Return mean image (average across frames).""" + if self._varr_ref is not None: + return self._varr_ref.mean(dim="frame").compute().values[np.newaxis, :, :] + return None + + @property + def max_proj_image(self): + """Return maximum projection image.""" + if self._max_proj is not None: + return self._max_proj.compute().values[np.newaxis, :, :] + elif self._varr_ref is not None: + return self._varr_ref.max(dim="frame").compute().values[np.newaxis, :, :] + return None + + @property + def correlation_map(self): + """Return correlation image (computed during initialization).""" + # Minian doesn't store correlation image by default + # Return None or compute if needed + return None + + @property + def masks(self): + """Extract mask information in format compatible with Segmentation table. + + Yields dict for each unit with mask properties. 
+ """ + if self._A is None: + return [] + + A_data = self._A.compute() + C_data = self._C.compute() if self._C is not None else None + S_data = self._S.compute() if self._S is not None else None + + masks = [] + for unit_idx, unit_id in enumerate(A_data.coords["unit_id"].values): + footprint = A_data.sel(unit_id=unit_id).values + + # Find non-zero pixels + mask_indices = np.where(footprint > 0) + if len(mask_indices[0]) == 0: + continue + + y_pix = mask_indices[0] + x_pix = mask_indices[1] + weights = footprint[y_pix, x_pix] + + mask_dict = { + "mask_id": int(unit_id), + "mask_npix": len(x_pix), + "mask_center_x": int(np.mean(x_pix)), + "mask_center_y": int(np.mean(y_pix)), + "mask_center_z": None, + "mask_xpix": x_pix, + "mask_ypix": y_pix, + "mask_zpix": None, + "mask_weights": weights, + "accepted": True, # All units accepted (no manual curation) + } + + # Add trace data if available + if C_data is not None: + mask_dict["inferred_trace"] = C_data.sel(unit_id=unit_id).values + if S_data is not None: + mask_dict["spikes"] = S_data.sel(unit_id=unit_id).values + + masks.append(mask_dict) + + return masks + + @property + def num_units(self): + """Return number of detected units.""" + if self._A is not None: + return len(self._A.coords["unit_id"]) + return 0 + + def get_loader_result(key, table, full_output_dir=None) -> tuple: """Retrieve the loaded processed imaging results from the loader (e.g. caiman, etc.) 
@@ -1548,6 +2278,8 @@ def get_loader_result(key, table, full_output_dir=None) -> tuple: from element_interface import caiman_loader loaded_output = caiman_loader.CaImAn(output_dir) + elif method == "minian": + loaded_output = MinianLoader(output_dir) else: raise NotImplementedError("Unknown/unimplemented method: {}".format(method)) diff --git a/setup.py b/setup.py index ddfc8bf..56f6b2c 100644 --- a/setup.py +++ b/setup.py @@ -41,5 +41,9 @@ "element-session @ git+https://github.com/datajoint/element-session.git", ], "tests": ["pytest", "pytest-cov", "shutils"], + "minian": [ + "dask", + "minian @ git+https://github.com/kushalbakshi/minian.git", + ] }, ) \ No newline at end of file From 1b6263910a371956eabe04a6c0bed3a4354dc191 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:37:04 -0500 Subject: [PATCH 02/36] version pin xarray --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 56f6b2c..649e0ee 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ "tests": ["pytest", "pytest-cov", "shutils"], "minian": [ "dask", + "xarray==2022.3.0", "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From 20a7370c8d752833f98689fb4d4bfb69ac9dfc43 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:39:45 -0500 Subject: [PATCH 03/36] version pin dask --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 649e0ee..342dbf7 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ ], "tests": ["pytest", "pytest-cov", "shutils"], "minian": [ - "dask", + "dask==2022.5.0", "xarray==2022.3.0", "minian @ git+https://github.com/kushalbakshi/minian.git", ] From 569deec859a7d35533c7ac00021099b010a825bc Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:41:39 -0500 Subject: [PATCH 04/36] downgrade pandas --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 342dbf7..aad3b79 100644 --- a/setup.py +++ 
b/setup.py @@ -44,6 +44,7 @@ "minian": [ "dask==2022.5.0", "xarray==2022.3.0", + "pandas==1.5.3", "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From c4adef98858b4a225a3137992c2f3529f51eda61 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:49:20 -0500 Subject: [PATCH 05/36] Remove unused imports --- element_miniscope/miniscope.py | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index d1c42b3..bd97c89 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -972,20 +972,16 @@ def _run_processing(): import psutil from dask.distributed import Client, LocalCluster from minian.cnmf import ( - compute_AtC, compute_trace, get_noise_fft, - smooth_sig, unit_merge, update_spatial, update_temporal, update_background, ) from minian.initialization import ( - gmm_refine, initA, initC, - intensity_refine, ks_refine, pnr_refine, seeds_init, @@ -998,22 +994,9 @@ def _run_processing(): TaskAnnotation, get_optimal_chk, load_videos, - open_minian, save_minian, ) - from minian.visualization import ( - CNMFViewer, - export_plot, - VArrayViewer, - generate_videos, - visualize_gmm_fit, - visualize_motion, - visualize_preprocess, - visualize_seeds, - visualize_spatial_update, - visualize_temporal_update, - write_video, - ) + from minian.visualization import write_video # Setup Dask cluster with env vars or auto-config n_workers = int( From 1a692a8e0504c94eac3e108d207b99114a28534b Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:54:28 -0500 Subject: [PATCH 06/36] Update video pattern --- element_miniscope/miniscope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index bd97c89..32d3261 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1037,10 +1037,10 @@ def _run_processing(): 
logger.info("Loading videos...") param_load_videos = params.get( "param_load_videos", - {"pattern": "r'.*\.avi$'", "downsample_strategy": "subset"}, + {"pattern": "r'Miniscope_video.avi$'", "downsample_strategy": "subset"}, ) video_array = load_videos( - str(avi_files[0].parent), **param_load_videos + str(pathlib.Path(avi_files[0]).parent), **param_load_videos ) chunk_size, _ = get_optimal_chk(video_array, dtype=float) video_array = video_array.chunk( From 20ca3e67be505e2a4c525353ac943b1144796711 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 12:58:02 -0500 Subject: [PATCH 07/36] hardcode video pattern --- element_miniscope/miniscope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 32d3261..52dcf3f 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1037,7 +1037,7 @@ def _run_processing(): logger.info("Loading videos...") param_load_videos = params.get( "param_load_videos", - {"pattern": "r'Miniscope_video.avi$'", "downsample_strategy": "subset"}, + {"pattern": "Miniscope_video.avi", "downsample_strategy": "subset"}, ) video_array = load_videos( str(pathlib.Path(avi_files[0]).parent), **param_load_videos From 8cde0ac21ca7f58613c74218deadeb539420b64d Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 13:07:13 -0500 Subject: [PATCH 08/36] set more conversative memory and worker count --- element_miniscope/miniscope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 52dcf3f..acd54b8 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1002,11 +1002,11 @@ def _run_processing(): n_workers = int( os.getenv( "MINIAN_NWORKERS", - max(1, int(multiprocessing.cpu_count() * 0.8)), + max(1, int(multiprocessing.cpu_count() * 0.4)), ) ) memory_total = psutil.virtual_memory().total - memory_per_worker = 
int(memory_total * 0.8 / n_workers / 1e9) + memory_per_worker = int(memory_total * 0.4 / n_workers / 1e9) memory_limit = os.getenv( "MINIAN_MEMORY_LIMIT", f"{memory_per_worker}GB" ) From 1d8a00d0cd9c4a480e2409645d34ce46a61890d9 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 13:49:41 -0500 Subject: [PATCH 09/36] version pin distributed --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index aad3b79..4741da3 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ "dask==2022.5.0", "xarray==2022.3.0", "pandas==1.5.3", + "distributed<2024.1.0" "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From 0b1f989ed46bc847660bc9e031d2a379a9547aa3 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 13:50:23 -0500 Subject: [PATCH 10/36] fix typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 4741da3..45347cd 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "dask==2022.5.0", "xarray==2022.3.0", "pandas==1.5.3", - "distributed<2024.1.0" + "distributed<2024.1.0", "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From 35b1a43d99688cd0f31400c8a71993bfc1f23ddc Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 13:58:38 -0500 Subject: [PATCH 11/36] Version pin distributed --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 45347cd..4a054cb 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "dask==2022.5.0", "xarray==2022.3.0", "pandas==1.5.3", - "distributed<2024.1.0", + "distributed=2023.12.1", "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From 8d0fe23a8e72b5d9c87f719fce50760f810e9ddc Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 13:59:48 -0500 Subject: [PATCH 12/36] version pin distributed to an even older version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 
4a054cb..a036b3b 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ "dask==2022.5.0", "xarray==2022.3.0", "pandas==1.5.3", - "distributed=2023.12.1", + "distributed==2022.5.0", "minian @ git+https://github.com/kushalbakshi/minian.git", ] }, From 4ce9f96aecdcf4b06fe8fd60781b35b400f86875 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 14:52:28 -0500 Subject: [PATCH 13/36] Fix reading from environment variables --- element_miniscope/miniscope.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index acd54b8..a068a3a 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1007,9 +1007,13 @@ def _run_processing(): ) memory_total = psutil.virtual_memory().total memory_per_worker = int(memory_total * 0.4 / n_workers / 1e9) - memory_limit = os.getenv( - "MINIAN_MEMORY_LIMIT", f"{memory_per_worker}GB" - ) + + memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") + if memory_limit_env: + # Add GB suffix if user provided just a number + memory_limit = memory_limit_env if any(c.isalpha() for c in memory_limit_env) else f"{memory_limit_env}GB" + else: + memory_limit = f"{memory_per_worker}GB" # Set intermediate storage path temp_path = str(output_dir / "intermediate") From a770a02e95149033515fa11709371f19d6d71f07 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 18:06:46 -0500 Subject: [PATCH 14/36] fix loading videos --- element_miniscope/miniscope.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index a068a3a..2c84b23 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1039,10 +1039,12 @@ def _run_processing(): # ===== LOAD VIDEOS ===== logger.info("Loading videos...") - param_load_videos = params.get( - "param_load_videos", - {"pattern": "Miniscope_video.avi", "downsample_strategy": "subset"}, 
- ) + default_load_params = { + "pattern": "Miniscope_video.avi", + "dtype": "uint8", + "downsample_strategy": "subset" + } + param_load_videos = {**default_load_params, **params.get("param_load_videos", {})} video_array = load_videos( str(pathlib.Path(avi_files[0]).parent), **param_load_videos ) From 6bf56182c34fb478dcd55e6ca72b6251a888661b Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 4 Feb 2026 18:32:30 -0500 Subject: [PATCH 15/36] General fixes to minian implementation --- element_miniscope/miniscope.py | 395 ++++++++++++++++++++++----------- 1 file changed, 269 insertions(+), 126 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 2c84b23..12d7ba9 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -989,7 +989,6 @@ def _run_processing(): ) from minian.motion_correction import apply_transform, estimate_motion from minian.preprocessing import denoise, remove_background - from minian.utilities import ( TaskAnnotation, get_optimal_chk, @@ -997,23 +996,51 @@ def _run_processing(): save_minian, ) from minian.visualization import write_video + - # Setup Dask cluster with env vars or auto-config + # ===== CONTAINER-AWARE MEMORY DETECTION ===== + def get_container_memory_limit(): + """Get memory limit respecting container cgroups (v1 and v2).""" + # Try cgroup v2 first (modern Docker/Kubernetes) + try: + with open("/sys/fs/cgroup/memory.max") as f: + limit = f.read().strip() + if limit != "max": + return int(limit) + except (FileNotFoundError, PermissionError): + pass + # Try cgroup v1 (older Docker) + try: + with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f: + limit = int(f.read().strip()) + # cgroup v1 returns a very large number if unlimited + if limit < 9223372036854771712: + return limit + except (FileNotFoundError, PermissionError): + pass + # Fall back to psutil (bare metal or unrestricted container) + return psutil.virtual_memory().total + + # ===== DASK CLUSTER 
CONFIGURATION ===== + memory_total = get_container_memory_limit() n_workers = int( os.getenv( "MINIAN_NWORKERS", - max(1, int(multiprocessing.cpu_count() * 0.4)), + max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), # Cap at 8 workers ) ) - memory_total = psutil.virtual_memory().total - memory_per_worker = int(memory_total * 0.4 / n_workers / 1e9) + # Reserve 20% for system overhead, distribute rest among workers + memory_per_worker = int(memory_total * 0.8 / n_workers) memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") if memory_limit_env: - # Add GB suffix if user provided just a number - memory_limit = memory_limit_env if any(c.isalpha() for c in memory_limit_env) else f"{memory_limit_env}GB" + memory_limit = ( + memory_limit_env + if any(c.isalpha() for c in memory_limit_env) + else f"{memory_limit_env}GB" + ) else: - memory_limit = f"{memory_per_worker}GB" + memory_limit = f"{memory_per_worker // (1024**3)}GB" # Set intermediate storage path temp_path = str(output_dir / "intermediate") @@ -1022,7 +1049,8 @@ def _run_processing(): # Start Dask cluster logger.info( - f"Starting Minian processing with {n_workers} workers, {memory_limit} memory limit..." + f"Starting Minian processing with {n_workers} workers, " + f"{memory_limit} memory limit per worker..." 
) cluster = LocalCluster( n_workers=n_workers, @@ -1036,46 +1064,54 @@ def _run_processing(): client = Client(cluster) try: - # ===== LOAD VIDEOS ===== logger.info("Loading videos...") default_load_params = { - "pattern": "Miniscope_video.avi", - "dtype": "uint8", - "downsample_strategy": "subset" + "pattern": r".*\.avi$", + "dtype": np.uint8, + "downsample": dict(frame=1, height=1, width=1), + "downsample_strategy": "subset", + } + param_load_videos = { + **default_load_params, + **params.get("param_load_videos", {}), } - param_load_videos = {**default_load_params, **params.get("param_load_videos", {})} video_array = load_videos( str(pathlib.Path(avi_files[0]).parent), **param_load_videos ) chunk_size, _ = get_optimal_chk(video_array, dtype=float) - video_array = video_array.chunk( - {"frame": chunk_size["frame"], "height": -1, "width": -1} + video_array = save_minian( + video_array.chunk( + {"frame": chunk_size["frame"], "height": -1, "width": -1} + ).rename("video_array"), + temp_path, + overwrite=True, + ) + logger.info( + f"Loaded video: {video_array.sizes['frame']} frames, " + f"{video_array.sizes['height']}x{video_array.sizes['width']} pixels" ) # ===== PREPROCESSING ===== - logger.info("Preprocessing: glow removal...") - subset = None - video_array_base_ref = video_array.sel(subset) - video_min_removed = video_array_base_ref.min("frame").compute() - video_array_ref = video_array_base_ref - video_min_removed + logger.info("Preprocessing: glow removal (subtracting minimum)...") + video_min = video_array.min("frame").compute() + video_array_ref = video_array - video_min + logger.info("Preprocessing: denoising...") param_denoise = params.get( "param_denoise", {"method": "median", "ksize": 7} ) - video_array_denoised = denoise(video_array_ref, **param_denoise) + video_array_ref = denoise(video_array_ref, **param_denoise) logger.info("Preprocessing: background removal...") param_background_removal = params.get( "param_background_removal", {"method": "tophat", 
"wnd": 10} ) - video_array_bg_removed = remove_background( - video_array_denoised, **param_background_removal - ) + video_array_ref = remove_background(video_array_ref, **param_background_removal) - logger.info("Preprocessing: saving pre-processed video...") - video_array_preprocessed = save_minian( - video_array_bg_removed.rename("video_array_preprocessed"), + logger.info("Preprocessing: saving preprocessed video...") + video_array_ref = save_minian( + video_array_ref.rename("video_array_ref"), dpath=temp_path, overwrite=True, ) @@ -1085,17 +1121,21 @@ def _run_processing(): param_estimate_motion = params.get( "param_estimate_motion", {"dim": "frame"} ) - motion = estimate_motion( - video_array_preprocessed, **param_estimate_motion - ) + motion = estimate_motion(video_array_ref, **param_estimate_motion) motion = save_minian( - motion.rename("motion"), dpath=temp_path, overwrite=True + motion.rename("motion").chunk({"frame": chunk_size["frame"]}), + dpath=temp_path, + overwrite=True, ) logger.info("Applying motion correction...") - Y = apply_transform(video_array_preprocessed, motion, fill=0) + Y = apply_transform(video_array_ref, motion, fill=0) + + # Save two versions with different chunking strategies Y_fm_chk = save_minian( - Y.astype(float).rename("Y_fm_chk"), temp_path, overwrite=True + Y.astype(float).rename("Y_fm_chk"), + temp_path, + overwrite=True, ) Y_hw_chk = save_minian( Y_fm_chk.rename("Y_hw_chk"), @@ -1107,50 +1147,75 @@ def _run_processing(): "width": chunk_size["width"], }, ) - write_video(Y_fm_chk, "motion_corrected_movie.mp4", output_dir) - logger.info("Motion Correction: Create and save max projection...") - max_proj = Y_fm_chk.max("frame").compute() + # Save motion corrected video + logger.info("Saving motion corrected video...") + write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) + + # Create and save max projection + logger.info("Creating max projection...") max_proj = save_minian( - max_proj.rename("max_proj"), - **{"dpath": 
output_dir, "overwrite": True}, - ) + Y_fm_chk.max("frame").rename("max_proj"), + dpath=str(output_dir), + overwrite=True, + ).compute() - # ===== INITIALIZATION ===== + # ===== SEED INITIALIZATION ===== logger.info("Initializing seeds...") - param_seeds_init = params.get("param_seeds_init", {}) + param_seeds_init = params.get( + "param_seeds_init", + { + "wnd_size": 1000, + "method": "rolling", + "stp_size": 500, + "max_wnd": 15, + "diff_thres": 3, + }, + ) seeds = seeds_init(Y_fm_chk, **param_seeds_init) + logger.info(f"Initial seeds: {len(seeds)}") logger.info("Refining seeds with PNR...") - param_pnr_refine = params.get("param_pnr_refine", {}) + param_pnr_refine = params.get( + "param_pnr_refine", {"noise_freq": 0.06, "thres": 1} + ) seeds, pnr, gmm = pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) logger.info("Refining seeds with KS test...") - param_ks_refine = params.get("param_ks_refine", {}) + param_ks_refine = params.get("param_ks_refine", {"sig": 0.05}) seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) logger.info("Merging seeds...") - param_seeds_merge = params.get("param_seeds_merge", {}) - seeds_final = seeds[ - seeds["mask_ks"] & seeds["mask_pnr"] - ].reset_index(drop=True) + param_seeds_merge = params.get( + "param_seeds_merge", + {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, + ) + seeds_final = seeds[seeds["mask_ks"] & seeds["mask_pnr"]].reset_index( + drop=True + ) seeds_final = seeds_merge( Y_hw_chk, max_proj, seeds_final, **param_seeds_merge ) + n_seeds = seeds_final["mask_mrg"].sum() + logger.info(f"Seeds after filtering and merging: {n_seeds}") - logger.info( - "Initializing spatial footprints (A) and temporal traces (C)..." + if n_seeds == 0: + raise ValueError( + "No seeds remaining after refinement. " + "Consider adjusting param_seeds_init or param_pnr_refine thresholds." 
+ ) + + # ===== INITIALIZE A AND C ===== + logger.info("Initializing spatial footprints (A)...") + param_initialize = params.get( + "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} ) - param_initialize = params.get("param_initialize", {}) A_init = initA( - Y_hw_chk, - seeds_final[seeds_final["mask_mrg"]], - **param_initialize, - ) - A_init = save_minian( - A_init.rename("A_init"), temp_path, overwrite=True + Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize ) + A_init = save_minian(A_init.rename("A_init"), temp_path, overwrite=True) + logger.info("Initializing temporal traces (C)...") C_init = initC(Y_fm_chk, A_init) C_init = save_minian( C_init.rename("C_init"), @@ -1159,9 +1224,9 @@ def _run_processing(): chunks={"unit_id": 1, "frame": -1}, ) - # Initial merge - logger.info("Merging initial components...") - param_init_merge = params.get("param_init_merge", {}) + # Initial unit merge + logger.info("Initial unit merge...") + param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) A, C = unit_merge(A_init, C_init, **param_init_merge) A = save_minian(A.rename("A"), temp_path, overwrite=True) C = save_minian(C.rename("C"), temp_path, overwrite=True) @@ -1171,27 +1236,49 @@ def _run_processing(): overwrite=True, chunks={"unit_id": -1, "frame": chunk_size["frame"]}, ) + logger.info(f"Units after initial merge: {A.sizes['unit_id']}") - # ===== CNMF - FIRST ITERATION ===== - logger.info("CNMF: Computing noise statistics...") - param_get_noise = params.get("param_get_noise", {}) + # ===== INITIALIZE BACKGROUND ===== + logger.info("Initializing background terms...") + b, f = update_background(Y_fm_chk, A, C_chk) + b = save_minian(b.rename("b"), temp_path, overwrite=True) + f = save_minian(f.rename("f"), temp_path, overwrite=True) + + # ===== COMPUTE NOISE STATISTICS ===== + logger.info("Computing noise statistics...") + param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) sn_spatial = 
get_noise_fft(Y_hw_chk, **param_get_noise) sn_spatial = save_minian( sn_spatial.rename("sn_spatial"), temp_path, overwrite=True ) - # Initialize background - logger.info("CNMF: Initializing background...") - b, f = update_background(Y_fm_chk, A, C_chk) - b = save_minian(b.rename("b"), temp_path, overwrite=True) - f = save_minian(f.rename("f"), temp_path, overwrite=True) + # ======================================== + # CNMF ITERATION 1 + # ======================================== - # First spatial update - logger.info("CNMF: First spatial update...") - param_first_spatial = params.get("param_first_spatial", {}) - A_new, mask_spatial = update_spatial( + # ----- First Spatial Update ----- + logger.info("CNMF Iteration 1: Spatial update...") + param_first_spatial = params.get( + "param_first_spatial", + {"dl_wnd": 5, "sparse_penal": 0.01, "size_thres": (25, None)}, + ) + A_new, mask, norm_fac = update_spatial( Y_hw_chk, A, C, sn_spatial, **param_first_spatial ) + + # Apply mask and normalization to C (CRITICAL) + C_new = save_minian( + (C.sel(unit_id=mask) * norm_fac).rename("C_new"), + temp_path, + overwrite=True, + ) + C_chk_new = save_minian( + (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), + temp_path, + overwrite=True, + ) + + # Save updated A A = save_minian( A_new.rename("A"), temp_path, @@ -1199,19 +1286,43 @@ def _run_processing(): chunks={"unit_id": 1, "height": -1, "width": -1}, ) - # First temporal update - logger.info("CNMF: First temporal update...") - param_first_temporal = params.get("param_first_temporal", {}) - YrA = compute_trace(Y_fm_chk, A, b, C_chk, f) + # Update background after spatial update + b_new, f_new = update_background(Y_fm_chk, A, C_chk_new) + b = save_minian(b_new.rename("b"), temp_path, overwrite=True) + f = save_minian( + f_new.chunk({"frame": chunk_size["frame"]}).rename("f"), + temp_path, + overwrite=True, + ) + C = save_minian(C_new.rename("C"), temp_path, overwrite=True) + C_chk = save_minian(C_chk_new.rename("C_chk"), 
temp_path, overwrite=True) + + logger.info(f"Units after first spatial update: {A.sizes['unit_id']}") + + # ----- First Temporal Update ----- + logger.info("CNMF Iteration 1: Temporal update...") YrA = save_minian( - YrA.rename("YrA"), + compute_trace(Y_fm_chk, A, b, C_chk, f).rename("YrA"), temp_path, overwrite=True, chunks={"unit_id": 1, "frame": -1}, ) - C_new, S_new, b0_new, c0_new, g, mask_temporal = update_temporal( + + param_first_temporal = params.get( + "param_first_temporal", + { + "noise_freq": 0.06, + "sparse_penal": 0.5, + "p": 1, + "add_lag": 20, + "jac_thres": 0.2, + }, + ) + C_new, S_new, b0_new, c0_new, g, mask = update_temporal( A, C, YrA=YrA, **param_first_temporal ) + + # Save temporal results C = save_minian( C_new.rename("C").chunk({"unit_id": 1, "frame": -1}), temp_path, @@ -1239,12 +1350,20 @@ def _run_processing(): overwrite=True, ) - # First merge - logger.info("CNMF: Merging after first iteration...") - param_first_merge = params.get("param_first_merge", {}) - A, C = unit_merge(A, C, **param_first_merge) - A = save_minian(A.rename("A"), temp_path, overwrite=True) - C = save_minian(C.rename("C"), temp_path, overwrite=True) + # Sync A with C after temporal update drops units + A = A.sel(unit_id=C.coords["unit_id"].values) + + logger.info(f"Units after first temporal update: {A.sizes['unit_id']}") + + # ----- First Merge ----- + logger.info("CNMF Iteration 1: Merging units...") + param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) + A_mrg, C_mrg, [sig_mrg] = unit_merge( + A, C, [C + b0 + c0], **param_first_merge + ) + + A = save_minian(A_mrg.rename("A"), temp_path, overwrite=True) + C = save_minian(C_mrg.rename("C"), temp_path, overwrite=True) C_chk = save_minian( C.rename("C_chk"), temp_path, @@ -1252,82 +1371,107 @@ def _run_processing(): chunks={"unit_id": -1, "frame": chunk_size["frame"]}, ) - # ===== CNMF - SECOND ITERATION ===== - # Second spatial update - logger.info("CNMF: Second spatial update...") - 
param_second_spatial = params.get("param_second_spatial", {}) - A_new, mask_spatial = update_spatial( + logger.info(f"Units after first merge: {A.sizes['unit_id']}") + + # ======================================== + # CNMF ITERATION 2 + # ======================================== + + # ----- Second Spatial Update ----- + logger.info("CNMF Iteration 2: Spatial update...") + param_second_spatial = params.get( + "param_second_spatial", + {"dl_wnd": 5, "sparse_penal": 0.01, "size_thres": (25, None)}, + ) + A_new, mask, norm_fac = update_spatial( Y_hw_chk, A, C, sn_spatial, **param_second_spatial ) + + # Apply mask and normalization to C + C_new = save_minian( + (C.sel(unit_id=mask) * norm_fac).rename("C_new"), + temp_path, + overwrite=True, + ) + C_chk_new = save_minian( + (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), + temp_path, + overwrite=True, + ) + + # Save updated A A = save_minian( A_new.rename("A"), temp_path, overwrite=True, chunks={"unit_id": 1, "height": -1, "width": -1}, ) - b_new, f_new = update_background(Y_fm_chk, A, C_chk) + + # Update background + b_new, f_new = update_background(Y_fm_chk, A, C_chk_new) b = save_minian(b_new.rename("b"), temp_path, overwrite=True) f = save_minian( f_new.chunk({"frame": chunk_size["frame"]}).rename("f"), temp_path, overwrite=True, ) + C = save_minian(C_new.rename("C"), temp_path, overwrite=True) + C_chk = save_minian(C_chk_new.rename("C_chk"), temp_path, overwrite=True) + + logger.info(f"Units after second spatial update: {A.sizes['unit_id']}") - # Second temporal update - logger.info("CNMF: Second temporal update...") - param_second_temporal = params.get("param_second_temporal", {}) - YrA = compute_trace(Y_fm_chk, A, b, C_chk, f) + # ----- Second Temporal Update ----- + logger.info("CNMF Iteration 2: Temporal update...") YrA = save_minian( - YrA.rename("YrA"), + compute_trace(Y_fm_chk, A, b, C_chk, f).rename("YrA"), temp_path, overwrite=True, chunks={"unit_id": 1, "frame": -1}, ) - C_new, S_new, b0_new, 
c0_new, g, mask_temporal = update_temporal( + + param_second_temporal = params.get( + "param_second_temporal", + { + "noise_freq": 0.06, + "sparse_penal": 0.5, + "p": 1, + "add_lag": 20, + "jac_thres": 0.2, + }, + ) + C_new, S_new, b0_new, c0_new, g, mask = update_temporal( A, C, YrA=YrA, **param_second_temporal ) - # Apply temporal mask to keep only valid units + # Sync A with final C A = A.sel(unit_id=C_new.coords["unit_id"].values) + logger.info(f"Final units after second temporal update: {A.sizes['unit_id']}") + # ===== SAVE FINAL RESULTS ===== - logger.info("Saving final results...") - A = save_minian( - A.rename("A"), **{"dpath": output_dir, "overwrite": True} - ) - C = save_minian( - C_new.rename("C"), **{"dpath": output_dir, "overwrite": True} - ) - S = save_minian( - S_new.rename("S"), **{"dpath": output_dir, "overwrite": True} - ) - c0 = save_minian( - c0_new.rename("c0"), **{"dpath": output_dir, "overwrite": True} - ) - b0 = save_minian( - b0_new.rename("b0"), **{"dpath": output_dir, "overwrite": True} - ) - b = save_minian( - b_new.rename("b"), **{"dpath": output_dir, "overwrite": True} - ) - f = save_minian( - f_new.rename("f"), **{"dpath": output_dir, "overwrite": True} - ) - motion = save_minian( - motion.rename("motion"), - **{"dpath": output_dir, "overwrite": True}, - ) + logger.info("Saving final results to output directory...") + final_save_params = {"dpath": str(output_dir), "overwrite": True} + + A = save_minian(A.rename("A"), **final_save_params) + C = save_minian(C_new.rename("C"), **final_save_params) + S = save_minian(S_new.rename("S"), **final_save_params) + c0 = save_minian(c0_new.rename("c0"), **final_save_params) + b0 = save_minian(b0_new.rename("b0"), **final_save_params) + b = save_minian(b_new.rename("b"), **final_save_params) + f = save_minian(f_new.rename("f"), **final_save_params) + motion = save_minian(motion.rename("motion"), **final_save_params) + logger.info( f"Minian processing complete. {A.sizes['unit_id']} units detected." 
) + except Exception as e: - client.close() - cluster.close() logger.error(f"Minian processing failed: {e}") raise e finally: client.close() cluster.close() + # Load results and prepare for insertion minian_loader = MinianLoader(output_dir) key["processing_time"] = minian_loader.creation_time @@ -1335,7 +1479,6 @@ def _run_processing(): # Get minian version if available try: import minian - key["package_version"] = getattr(minian, "__version__", "") except (ImportError, AttributeError): key["package_version"] = "" From 92ce84d087f30f555f77708de6840e81f42489ca Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 5 Feb 2026 18:46:23 -0500 Subject: [PATCH 16/36] Updates to minian routine after testing --- element_miniscope/miniscope.py | 364 +++++++++++++++------------------ 1 file changed, 165 insertions(+), 199 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 12d7ba9..dca8836 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -970,9 +970,75 @@ def _run_processing(): elif method == "minian": import multiprocessing import psutil + import shutil from dask.distributed import Client, LocalCluster + + # ===== APPLY COMPATIBILITY PATCHES ===== + # Fix for NetworkX 3.0+ API changes + import networkx as nx + import scipy.sparse + from scipy.sparse import issparse + import minian.cnmf as cnmf_module + + def label_connected_fixed(adj, only_connected=False): + """Fixed label_connected for NetworkX 3.0+ compatibility.""" + if issparse(adj): + adj = adj.toarray() + adj = adj.copy() + np.fill_diagonal(adj, 0) + adj = np.triu(adj) + g = nx.from_numpy_array(adj) + labels = np.zeros(adj.shape[0], dtype=int) + for icomp, comp in enumerate(nx.connected_components(g)): + for node in comp: + labels[node] = icomp + if only_connected: + iso_mask = np.array([len(c) == 1 for c in nx.connected_components(g)]) + labels[np.isin(labels, np.where(iso_mask)[0])] = -1 + return labels + + cnmf_module.label_connected = 
label_connected_fixed + + # Fix for sparse array auto-densification + import sparse.numba_backend._sparse_array as sparse_mod + sparse_mod.AUTO_DENSIFY = True + + # Patch update_temporal to handle sparse.COO arrays + _original_update_temporal = cnmf_module.update_temporal + + def patched_update_temporal(A, C, b=None, f=None, Y=None, YrA=None, + noise_freq=0.25, p=2, add_lag="p", jac_thres=0.1, + sparse_penal=1, bseg=None, med_wd=None, + zero_thres=1e-8, max_iters=200, use_smooth=True, + normalize=True, warm_start=False, post_scal=False, + scs_fallback=False, concurrent_update=False): + """Patched update_temporal that handles sparse.COO arrays properly.""" + _original_csc_matrix = scipy.sparse.csc_matrix + + class PatchedCSCMatrix(scipy.sparse.csc_matrix): + def __new__(cls, arg1, shape=None, dtype=None, copy=False): + if hasattr(arg1, 'todense'): + arg1 = arg1.todense() + return _original_csc_matrix(arg1, shape=shape, dtype=dtype, copy=copy) + + scipy.sparse.csc_matrix = PatchedCSCMatrix + try: + result = _original_update_temporal( + A, C, b=b, f=f, Y=Y, YrA=YrA, noise_freq=noise_freq, + p=p, add_lag=add_lag, jac_thres=jac_thres, sparse_penal=sparse_penal, + bseg=bseg, med_wd=med_wd, zero_thres=zero_thres, max_iters=max_iters, + use_smooth=use_smooth, normalize=normalize, warm_start=warm_start, + post_scal=post_scal, scs_fallback=scs_fallback, + concurrent_update=concurrent_update + ) + finally: + scipy.sparse.csc_matrix = _original_csc_matrix + return result + + cnmf_module.update_temporal = patched_update_temporal + + # Now import minian modules (after patches are applied) from minian.cnmf import ( - compute_trace, get_noise_fft, unit_merge, update_spatial, @@ -996,12 +1062,10 @@ def _run_processing(): save_minian, ) from minian.visualization import write_video - # ===== CONTAINER-AWARE MEMORY DETECTION ===== def get_container_memory_limit(): """Get memory limit respecting container cgroups (v1 and v2).""" - # Try cgroup v2 first (modern Docker/Kubernetes) try: 
with open("/sys/fs/cgroup/memory.max") as f: limit = f.read().strip() @@ -1009,16 +1073,13 @@ def get_container_memory_limit(): return int(limit) except (FileNotFoundError, PermissionError): pass - # Try cgroup v1 (older Docker) try: with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f: limit = int(f.read().strip()) - # cgroup v1 returns a very large number if unlimited if limit < 9223372036854771712: return limit except (FileNotFoundError, PermissionError): pass - # Fall back to psutil (bare metal or unrestricted container) return psutil.virtual_memory().total # ===== DASK CLUSTER CONFIGURATION ===== @@ -1026,10 +1087,9 @@ def get_container_memory_limit(): n_workers = int( os.getenv( "MINIAN_NWORKERS", - max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), # Cap at 8 workers + max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), ) ) - # Reserve 20% for system overhead, distribute rest among workers memory_per_worker = int(memory_total * 0.8 / n_workers) memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") @@ -1047,6 +1107,14 @@ def get_container_memory_limit(): os.makedirs(temp_path, exist_ok=True) os.environ["MINIAN_INTERMEDIATE"] = temp_path + # Helper function to clean intermediate files + def clean_intermediate_files(file_names): + """Remove intermediate zarr files to avoid conflicts.""" + for fname in file_names: + path = os.path.join(temp_path, f"{fname}.zarr") + if os.path.exists(path): + shutil.rmtree(path) + # Start Dask cluster logger.info( f"Starting Minian processing with {n_workers} workers, " @@ -1057,7 +1125,7 @@ def get_container_memory_limit(): memory_limit=memory_limit, resources={"MEM": 1}, threads_per_worker=2, - dashboard_address=None, # Disable dashboard in automated pipeline + dashboard_address=None, ) annotation_plugin = TaskAnnotation() cluster.scheduler.add_plugin(annotation_plugin) @@ -1076,89 +1144,83 @@ def get_container_memory_limit(): **default_load_params, **params.get("param_load_videos", {}), } - video_array = 
load_videos( + varr = load_videos( str(pathlib.Path(avi_files[0]).parent), **param_load_videos ) - chunk_size, _ = get_optimal_chk(video_array, dtype=float) - video_array = save_minian( - video_array.chunk( - {"frame": chunk_size["frame"], "height": -1, "width": -1} - ).rename("video_array"), + chk, _ = get_optimal_chk(varr, dtype=float) + + # Save raw video to zarr + logger.info("Saving raw video to zarr...") + varr = save_minian( + varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), temp_path, overwrite=True, ) logger.info( - f"Loaded video: {video_array.sizes['frame']} frames, " - f"{video_array.sizes['height']}x{video_array.sizes['width']} pixels" + f"Loaded video: {varr.sizes['frame']} frames, " + f"{varr.sizes['height']}x{varr.sizes['width']} pixels" ) # ===== PREPROCESSING ===== - logger.info("Preprocessing: glow removal (subtracting minimum)...") - video_min = video_array.min("frame").compute() - video_array_ref = video_array - video_min + logger.info("Preprocessing: glow removal...") + varr_min = varr.min("frame").compute() + varr_ref = varr - varr_min + varr_ref = varr_ref.clip(min=0) logger.info("Preprocessing: denoising...") param_denoise = params.get( "param_denoise", {"method": "median", "ksize": 7} ) - video_array_ref = denoise(video_array_ref, **param_denoise) + varr_ref = denoise(varr_ref, **param_denoise) logger.info("Preprocessing: background removal...") param_background_removal = params.get( "param_background_removal", {"method": "tophat", "wnd": 10} ) - video_array_ref = remove_background(video_array_ref, **param_background_removal) + varr_ref = remove_background(varr_ref, **param_background_removal) - logger.info("Preprocessing: saving preprocessed video...") - video_array_ref = save_minian( - video_array_ref.rename("video_array_ref"), - dpath=temp_path, - overwrite=True, - ) + # Keep it chunked + varr_ref = varr_ref.chunk({"frame": chk["frame"], "height": -1, "width": -1}) # ===== MOTION CORRECTION ===== 
logger.info("Estimating motion...") param_estimate_motion = params.get( "param_estimate_motion", {"dim": "frame"} ) - motion = estimate_motion(video_array_ref, **param_estimate_motion) + motion = estimate_motion(varr_ref, **param_estimate_motion) motion = save_minian( - motion.rename("motion").chunk({"frame": chunk_size["frame"]}), - dpath=temp_path, + motion.rename("motion").chunk({"frame": chk["frame"]}), + temp_path, overwrite=True, ) logger.info("Applying motion correction...") - Y = apply_transform(video_array_ref, motion, fill=0) + Y = apply_transform(varr_ref, motion, fill=0) - # Save two versions with different chunking strategies + # Save two versions with different chunking + logger.info("Saving motion-corrected video (frame-chunked)...") Y_fm_chk = save_minian( Y.astype(float).rename("Y_fm_chk"), temp_path, overwrite=True, ) + + logger.info("Saving motion-corrected video (spatial-chunked)...") Y_hw_chk = save_minian( Y_fm_chk.rename("Y_hw_chk"), temp_path, overwrite=True, - chunks={ - "frame": -1, - "height": chunk_size["height"], - "width": chunk_size["width"], - }, + chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, ) - # Save motion corrected video - logger.info("Saving motion corrected video...") + # Save motion corrected video as mp4 + logger.info("Writing motion corrected video...") write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) # Create and save max projection - logger.info("Creating max projection...") - max_proj = save_minian( - Y_fm_chk.max("frame").rename("max_proj"), - dpath=str(output_dir), - overwrite=True, - ).compute() + logger.info("Computing max projection...") + max_proj = Y_fm_chk.max("frame").compute() + max_proj = save_minian(max_proj.rename("max_proj"), temp_path, overwrite=True) # ===== SEED INITIALIZATION ===== logger.info("Initializing seeds...") @@ -1180,24 +1242,22 @@ def get_container_memory_limit(): "param_pnr_refine", {"noise_freq": 0.06, "thres": 1} ) seeds, pnr, gmm = 
pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) + logger.info(f"Seeds after PNR refine: {seeds['mask_pnr'].sum()} / {len(seeds)}") logger.info("Refining seeds with KS test...") param_ks_refine = params.get("param_ks_refine", {"sig": 0.05}) seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) + logger.info(f"Seeds after KS refine: {seeds['mask_ks'].sum()} / {len(seeds)}") logger.info("Merging seeds...") param_seeds_merge = params.get( "param_seeds_merge", {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, ) - seeds_final = seeds[seeds["mask_ks"] & seeds["mask_pnr"]].reset_index( - drop=True - ) - seeds_final = seeds_merge( - Y_hw_chk, max_proj, seeds_final, **param_seeds_merge - ) + seeds_final = seeds[seeds["mask_ks"] & seeds["mask_pnr"]].reset_index(drop=True) + seeds_final = seeds_merge(Y_hw_chk, max_proj, seeds_final, **param_seeds_merge) n_seeds = seeds_final["mask_mrg"].sum() - logger.info(f"Seeds after filtering and merging: {n_seeds}") + logger.info(f"Seeds after merge: {n_seeds} / {len(seeds_final)}") if n_seeds == 0: raise ValueError( @@ -1210,10 +1270,9 @@ def get_container_memory_limit(): param_initialize = params.get( "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} ) - A_init = initA( - Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize - ) + A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) A_init = save_minian(A_init.rename("A_init"), temp_path, overwrite=True) + logger.info(f"A_init shape: {A_init.shape}") logger.info("Initializing temporal traces (C)...") C_init = initC(Y_fm_chk, A_init) @@ -1223,6 +1282,7 @@ def get_container_memory_limit(): overwrite=True, chunks={"unit_id": 1, "frame": -1}, ) + logger.info(f"C_init shape: {C_init.shape}") # Initial unit merge logger.info("Initial unit merge...") @@ -1234,39 +1294,39 @@ def get_container_memory_limit(): C.rename("C_chk"), temp_path, overwrite=True, - chunks={"unit_id": -1, "frame": chunk_size["frame"]}, + 
chunks={"unit_id": -1, "frame": chk["frame"]}, ) logger.info(f"Units after initial merge: {A.sizes['unit_id']}") # ===== INITIALIZE BACKGROUND ===== logger.info("Initializing background terms...") b, f = update_background(Y_fm_chk, A, C_chk) - b = save_minian(b.rename("b"), temp_path, overwrite=True) f = save_minian(f.rename("f"), temp_path, overwrite=True) + b = save_minian(b.rename("b"), temp_path, overwrite=True) + logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") # ===== COMPUTE NOISE STATISTICS ===== logger.info("Computing noise statistics...") param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) - sn_spatial = save_minian( - sn_spatial.rename("sn_spatial"), temp_path, overwrite=True - ) + sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), temp_path, overwrite=True) # ======================================== # CNMF ITERATION 1 # ======================================== + # Clean intermediate files that might conflict + clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) + # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") param_first_spatial = params.get( "param_first_spatial", - {"dl_wnd": 5, "sparse_penal": 0.01, "size_thres": (25, None)}, + {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, ) A_new, mask, norm_fac = update_spatial( Y_hw_chk, A, C, sn_spatial, **param_first_spatial ) - - # Apply mask and normalization to C (CRITICAL) C_new = save_minian( (C.sel(unit_id=mask) * norm_fac).rename("C_new"), temp_path, @@ -1277,192 +1337,98 @@ def get_container_memory_limit(): temp_path, overwrite=True, ) + logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") - # Save updated A - A = save_minian( - A_new.rename("A"), - temp_path, - overwrite=True, - chunks={"unit_id": 1, "height": -1, "width": -1}, - ) - - # Update background after spatial update - b_new, f_new = 
update_background(Y_fm_chk, A, C_chk_new) - b = save_minian(b_new.rename("b"), temp_path, overwrite=True) - f = save_minian( - f_new.chunk({"frame": chunk_size["frame"]}).rename("f"), - temp_path, - overwrite=True, - ) - C = save_minian(C_new.rename("C"), temp_path, overwrite=True) - C_chk = save_minian(C_chk_new.rename("C_chk"), temp_path, overwrite=True) - - logger.info(f"Units after first spatial update: {A.sizes['unit_id']}") + # Update background after first spatial + logger.info("Updating background...") + b_new, f_new = update_background(Y_fm_chk, A_new, C_chk_new) # ----- First Temporal Update ----- logger.info("CNMF Iteration 1: Temporal update...") - YrA = save_minian( - compute_trace(Y_fm_chk, A, b, C_chk, f).rename("YrA"), - temp_path, - overwrite=True, - chunks={"unit_id": 1, "frame": -1}, - ) - param_first_temporal = params.get( "param_first_temporal", { "noise_freq": 0.06, - "sparse_penal": 0.5, + "sparse_penal": 1, "p": 1, "add_lag": 20, "jac_thres": 0.2, }, ) C_new, S_new, b0_new, c0_new, g, mask = update_temporal( - A, C, YrA=YrA, **param_first_temporal + A_new, C_new, + Y=Y_fm_chk, + b=b_new, f=f_new, + **param_first_temporal ) - - # Save temporal results - C = save_minian( - C_new.rename("C").chunk({"unit_id": 1, "frame": -1}), - temp_path, - overwrite=True, - ) - C_chk = save_minian( - C.rename("C_chk"), - temp_path, - overwrite=True, - chunks={"unit_id": -1, "frame": chunk_size["frame"]}, - ) - S = save_minian( - S_new.rename("S").chunk({"unit_id": 1, "frame": -1}), - temp_path, - overwrite=True, - ) - b0 = save_minian( - b0_new.rename("b0").chunk({"unit_id": 1, "frame": -1}), - temp_path, - overwrite=True, - ) - c0 = save_minian( - c0_new.rename("c0").chunk({"unit_id": 1, "frame": -1}), - temp_path, - overwrite=True, - ) - - # Sync A with C after temporal update drops units - A = A.sel(unit_id=C.coords["unit_id"].values) - - logger.info(f"Units after first temporal update: {A.sizes['unit_id']}") + logger.info(f"Units after first temporal 
update: {C_new.sizes['unit_id']}") # ----- First Merge ----- logger.info("CNMF Iteration 1: Merging units...") param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) - A_mrg, C_mrg, [sig_mrg] = unit_merge( - A, C, [C + b0 + c0], **param_first_merge + A_mrg, C_mrg, [S_mrg] = unit_merge( + A_new.sel(unit_id=mask), C_new, [S_new], **param_first_merge ) - - A = save_minian(A_mrg.rename("A"), temp_path, overwrite=True) - C = save_minian(C_mrg.rename("C"), temp_path, overwrite=True) - C_chk = save_minian( - C.rename("C_chk"), - temp_path, - overwrite=True, - chunks={"unit_id": -1, "frame": chunk_size["frame"]}, - ) - - logger.info(f"Units after first merge: {A.sizes['unit_id']}") + logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") # ======================================== # CNMF ITERATION 2 # ======================================== + # Clean intermediate files before second iteration + clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) + # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: Spatial update...") param_second_spatial = params.get( "param_second_spatial", - {"dl_wnd": 5, "sparse_penal": 0.01, "size_thres": (25, None)}, + {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, ) - A_new, mask, norm_fac = update_spatial( - Y_hw_chk, A, C, sn_spatial, **param_second_spatial + A_new2, mask2, norm_fac2 = update_spatial( + Y_hw_chk, A_mrg, C_mrg, sn_spatial, **param_second_spatial ) + C_new2 = C_mrg.sel(unit_id=mask2) * norm_fac2 + logger.info(f"Units after second spatial update: {A_new2.sizes['unit_id']}") - # Apply mask and normalization to C - C_new = save_minian( - (C.sel(unit_id=mask) * norm_fac).rename("C_new"), - temp_path, - overwrite=True, - ) - C_chk_new = save_minian( - (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), - temp_path, - overwrite=True, - ) - - # Save updated A - A = save_minian( - A_new.rename("A"), - temp_path, - overwrite=True, - chunks={"unit_id": 
1, "height": -1, "width": -1}, - ) - - # Update background - b_new, f_new = update_background(Y_fm_chk, A, C_chk_new) - b = save_minian(b_new.rename("b"), temp_path, overwrite=True) - f = save_minian( - f_new.chunk({"frame": chunk_size["frame"]}).rename("f"), - temp_path, - overwrite=True, - ) - C = save_minian(C_new.rename("C"), temp_path, overwrite=True) - C_chk = save_minian(C_chk_new.rename("C_chk"), temp_path, overwrite=True) - - logger.info(f"Units after second spatial update: {A.sizes['unit_id']}") + # Update background after second spatial + logger.info("Updating background...") + b_new2, f_new2 = update_background(Y_fm_chk, A_new2, C_new2) # ----- Second Temporal Update ----- logger.info("CNMF Iteration 2: Temporal update...") - YrA = save_minian( - compute_trace(Y_fm_chk, A, b, C_chk, f).rename("YrA"), - temp_path, - overwrite=True, - chunks={"unit_id": 1, "frame": -1}, - ) - param_second_temporal = params.get( "param_second_temporal", { "noise_freq": 0.06, - "sparse_penal": 0.5, + "sparse_penal": 1, "p": 1, "add_lag": 20, - "jac_thres": 0.2, + "jac_thres": 0.4, }, ) - C_new, S_new, b0_new, c0_new, g, mask = update_temporal( - A, C, YrA=YrA, **param_second_temporal + C_final, S_final, b0_final, c0_final, g_final, mask_final = update_temporal( + A_new2, C_new2, + Y=Y_fm_chk, + b=b_new2, f=f_new2, + **param_second_temporal ) - - # Sync A with final C - A = A.sel(unit_id=C_new.coords["unit_id"].values) - - logger.info(f"Final units after second temporal update: {A.sizes['unit_id']}") + A_final = A_new2.sel(unit_id=mask_final) + logger.info(f"Final units: {A_final.sizes['unit_id']}") # ===== SAVE FINAL RESULTS ===== logger.info("Saving final results to output directory...") final_save_params = {"dpath": str(output_dir), "overwrite": True} - A = save_minian(A.rename("A"), **final_save_params) - C = save_minian(C_new.rename("C"), **final_save_params) - S = save_minian(S_new.rename("S"), **final_save_params) - c0 = save_minian(c0_new.rename("c0"), 
**final_save_params) - b0 = save_minian(b0_new.rename("b0"), **final_save_params) - b = save_minian(b_new.rename("b"), **final_save_params) - f = save_minian(f_new.rename("f"), **final_save_params) - motion = save_minian(motion.rename("motion"), **final_save_params) + A_final = save_minian(A_final.rename("A"), **final_save_params) + C_final = save_minian(C_final.rename("C"), **final_save_params) + S_final = save_minian(S_final.rename("S"), **final_save_params) + b_final = save_minian(b_new2.rename("b"), **final_save_params) + f_final = save_minian(f_new2.rename("f"), **final_save_params) + motion_final = save_minian(motion.rename("motion"), **final_save_params) + max_proj_final = save_minian(max_proj.rename("max_proj"), **final_save_params) logger.info( - f"Minian processing complete. {A.sizes['unit_id']} units detected." + f"Minian processing complete. {A_final.sizes['unit_id']} units detected." ) except Exception as e: From 443d0d40d851d603d3d85cbb5d0353f7d4376d77 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 5 Feb 2026 18:47:24 -0500 Subject: [PATCH 17/36] Remove file cleanup for now --- element_miniscope/miniscope.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index dca8836..335fd6f 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1315,9 +1315,6 @@ def clean_intermediate_files(file_names): # CNMF ITERATION 1 # ======================================== - # Clean intermediate files that might conflict - clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) - # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") param_first_spatial = params.get( @@ -1375,9 +1372,6 @@ def clean_intermediate_files(file_names): # CNMF ITERATION 2 # ======================================== - # Clean intermediate files before second iteration - clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", 
"YrA"]) - # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: Spatial update...") param_second_spatial = params.get( From 1620e0559a086bfce7d3f31a8935616ff204b359 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 5 Feb 2026 22:40:11 -0500 Subject: [PATCH 18/36] patch compability for mixed array types --- element_miniscope/miniscope.py | 45 ++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 335fd6f..9dcb8b0 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -971,9 +971,11 @@ def _run_processing(): import multiprocessing import psutil import shutil + import dask.array as darr from dask.distributed import Client, LocalCluster # ===== APPLY COMPATIBILITY PATCHES ===== + # Fix for NetworkX 3.0+ API changes import networkx as nx import scipy.sparse @@ -1000,9 +1002,50 @@ def label_connected_fixed(adj, only_connected=False): cnmf_module.label_connected = label_connected_fixed # Fix for sparse array auto-densification + import sparse import sparse.numba_backend._sparse_array as sparse_mod sparse_mod.AUTO_DENSIFY = True + # Fix for darr.block with mixed sparse array types + _original_darr_block = darr.block + + def patched_darr_block(arrays, allow_unknown_chunksizes=False): + """Patched darr.block that ensures sparse array type consistency.""" + def convert_to_coo(arr): + if arr is None: + return arr + if isinstance(arr, sparse.COO): + return arr + if isinstance(arr, sparse.SparseArray): + return sparse.COO(arr) + if isinstance(arr, np.ndarray): + return sparse.COO.from_numpy(arr) + if hasattr(arr, 'todense'): + return sparse.COO.from_numpy(np.asarray(arr.todense())) + return arr + + def recursive_convert(obj): + if isinstance(obj, list): + return [recursive_convert(item) for item in obj] + elif isinstance(obj, np.ndarray) and obj.dtype == object: + result = np.empty_like(obj) + for idx in np.ndindex(obj.shape): + 
result[idx] = convert_to_coo(obj[idx]) + return result + else: + return convert_to_coo(obj) + + try: + return _original_darr_block(arrays, allow_unknown_chunksizes=allow_unknown_chunksizes) + except ValueError as e: + if "All arrays must be instances of SparseArray" in str(e): + converted = recursive_convert(arrays) + return _original_darr_block(converted, allow_unknown_chunksizes=allow_unknown_chunksizes) + raise + + darr.block = patched_darr_block + darr.core.block = patched_darr_block + # Patch update_temporal to handle sparse.COO arrays _original_update_temporal = cnmf_module.update_temporal @@ -1037,6 +1080,8 @@ def __new__(cls, arg1, shape=None, dtype=None, copy=False): cnmf_module.update_temporal = patched_update_temporal + logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block)") + # Now import minian modules (after patches are applied) from minian.cnmf import ( get_noise_fft, From f821da825fb9b77c055dd0009c283047a002ef97 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 5 Feb 2026 22:53:32 -0500 Subject: [PATCH 19/36] patch sparse array error pt2 --- element_miniscope/miniscope.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 9dcb8b0..8262c58 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1046,6 +1046,30 @@ def recursive_convert(obj): darr.block = patched_darr_block darr.core.block = patched_darr_block + # ===== PATCH: sparse/dense concatenation compatibility ===== + import sparse + import sparse.numba_backend._coo.common as _sparse_coo_common + import numpy as np + + _original_sparse_concat = _sparse_coo_common.concatenate + + def _patched_sparse_concat(arrays, axis=0): + """Convert any dense arrays to sparse.COO before concatenation.""" + converted = [] + for arr in arrays: + if isinstance(arr, sparse.SparseArray): + converted.append(arr) + elif isinstance(arr, np.ndarray): + 
converted.append(sparse.COO.from_numpy(arr)) + else: + try: + converted.append(sparse.COO.from_numpy(np.asarray(arr))) + except Exception: + converted.append(arr) + return _original_sparse_concat(converted, axis=axis) + + _sparse_coo_common.concatenate = _patched_sparse_concat + # Patch update_temporal to handle sparse.COO arrays _original_update_temporal = cnmf_module.update_temporal From 07cd10492e337abd16a98d6a251ce511c984c662 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 5 Feb 2026 23:15:31 -0500 Subject: [PATCH 20/36] patch sparse array issue pt3 --- element_miniscope/miniscope.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 8262c58..742aa5e 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1070,6 +1070,21 @@ def _patched_sparse_concat(arrays, axis=0): _sparse_coo_common.concatenate = _patched_sparse_concat + # ===== PATCH: sparse/dense concatenation compatibility ===== + import sparse + import numpy as np + import sparse.numba_backend._coo.common as _sparse_coo_common + + _original_check = _sparse_coo_common.check_consistent_fill_value + + def _patched_check(arrays): + """Auto-convert any dense arrays to sparse.COO before validation.""" + for i, arr in enumerate(arrays): + if not isinstance(arr, sparse.SparseArray): + arrays[i] = sparse.COO.from_numpy(np.asarray(arr)) + return _original_check(arrays) + + _sparse_coo_common.check_consistent_fill_value = _patched_check # Patch update_temporal to handle sparse.COO arrays _original_update_temporal = cnmf_module.update_temporal From 90f19a8ab30066e46d64487e2053e323c40f7aea Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Fri, 6 Feb 2026 12:07:13 -0500 Subject: [PATCH 21/36] Create new intermediate dirs for cnmf and update env --- element_miniscope/miniscope.py | 52 ++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git 
a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 742aa5e..e19f351 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1186,16 +1186,20 @@ def get_container_memory_limit(): else: memory_limit = f"{memory_per_worker // (1024**3)}GB" - # Set intermediate storage path - temp_path = str(output_dir / "intermediate") - os.makedirs(temp_path, exist_ok=True) - os.environ["MINIAN_INTERMEDIATE"] = temp_path + # Set minian intermediate storage paths + pre_cnmf_path = str(output_dir / "pre_cnmf") + first_cnmf_path = str(output_dir / "first_cnmf") + second_cnmf_path = str(output_dir / "second_cnmf") + os.makedirs(pre_cnmf_path, exist_ok=True) + os.makedirs(first_cnmf_path, exist_ok=True) + os.makedirs(second_cnmf_path, exist_ok=True) + os.environ["MINIAN_INTERMEDIATE"] = pre_cnmf_path # Helper function to clean intermediate files - def clean_intermediate_files(file_names): + def clean_intermediate_files(directory_path, file_names): """Remove intermediate zarr files to avoid conflicts.""" for fname in file_names: - path = os.path.join(temp_path, f"{fname}.zarr") + path = os.path.join(directory_path, f"{fname}.zarr") if os.path.exists(path): shutil.rmtree(path) @@ -1237,7 +1241,7 @@ def clean_intermediate_files(file_names): logger.info("Saving raw video to zarr...") varr = save_minian( varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), - temp_path, + pre_cnmf_path, overwrite=True, ) logger.info( @@ -1274,7 +1278,7 @@ def clean_intermediate_files(file_names): motion = estimate_motion(varr_ref, **param_estimate_motion) motion = save_minian( motion.rename("motion").chunk({"frame": chk["frame"]}), - temp_path, + pre_cnmf_path, overwrite=True, ) @@ -1285,14 +1289,14 @@ def clean_intermediate_files(file_names): logger.info("Saving motion-corrected video (frame-chunked)...") Y_fm_chk = save_minian( Y.astype(float).rename("Y_fm_chk"), - temp_path, + pre_cnmf_path, overwrite=True, ) logger.info("Saving 
motion-corrected video (spatial-chunked)...") Y_hw_chk = save_minian( Y_fm_chk.rename("Y_hw_chk"), - temp_path, + pre_cnmf_path, overwrite=True, chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, ) @@ -1304,7 +1308,7 @@ def clean_intermediate_files(file_names): # Create and save max projection logger.info("Computing max projection...") max_proj = Y_fm_chk.max("frame").compute() - max_proj = save_minian(max_proj.rename("max_proj"), temp_path, overwrite=True) + max_proj = save_minian(max_proj.rename("max_proj"), pre_cnmf_path, overwrite=True) # ===== SEED INITIALIZATION ===== logger.info("Initializing seeds...") @@ -1355,14 +1359,14 @@ def clean_intermediate_files(file_names): "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} ) A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) - A_init = save_minian(A_init.rename("A_init"), temp_path, overwrite=True) + A_init = save_minian(A_init.rename("A_init"), pre_cnmf_path, overwrite=True) logger.info(f"A_init shape: {A_init.shape}") logger.info("Initializing temporal traces (C)...") C_init = initC(Y_fm_chk, A_init) C_init = save_minian( C_init.rename("C_init"), - temp_path, + pre_cnmf_path, overwrite=True, chunks={"unit_id": 1, "frame": -1}, ) @@ -1372,11 +1376,11 @@ def clean_intermediate_files(file_names): logger.info("Initial unit merge...") param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) A, C = unit_merge(A_init, C_init, **param_init_merge) - A = save_minian(A.rename("A"), temp_path, overwrite=True) - C = save_minian(C.rename("C"), temp_path, overwrite=True) + A = save_minian(A.rename("A"), pre_cnmf_path, overwrite=True) + C = save_minian(C.rename("C"), pre_cnmf_path, overwrite=True) C_chk = save_minian( C.rename("C_chk"), - temp_path, + pre_cnmf_path, overwrite=True, chunks={"unit_id": -1, "frame": chk["frame"]}, ) @@ -1385,19 +1389,23 @@ def clean_intermediate_files(file_names): # ===== INITIALIZE BACKGROUND ===== 
logger.info("Initializing background terms...") b, f = update_background(Y_fm_chk, A, C_chk) - f = save_minian(f.rename("f"), temp_path, overwrite=True) - b = save_minian(b.rename("b"), temp_path, overwrite=True) + f = save_minian(f.rename("f"), pre_cnmf_path, overwrite=True) + b = save_minian(b.rename("b"), pre_cnmf_path, overwrite=True) logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") # ===== COMPUTE NOISE STATISTICS ===== logger.info("Computing noise statistics...") param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) - sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), temp_path, overwrite=True) + sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), pre_cnmf_path, overwrite=True) # ======================================== # CNMF ITERATION 1 # ======================================== + # clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", + # "g", "YrA"]) + + os.environ["MINIAN_INTERMEDIATE"] = first_cnmf_path # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") @@ -1410,12 +1418,12 @@ def clean_intermediate_files(file_names): ) C_new = save_minian( (C.sel(unit_id=mask) * norm_fac).rename("C_new"), - temp_path, + first_cnmf_path, overwrite=True, ) C_chk_new = save_minian( (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), - temp_path, + first_cnmf_path, overwrite=True, ) logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") @@ -1456,6 +1464,8 @@ def clean_intermediate_files(file_names): # CNMF ITERATION 2 # ======================================== + os.environ["MINIAN_INTERMEDIATE"] = second_cnmf_path + # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: Spatial update...") param_second_spatial = params.get( From f88560564f88819f4e22560887f1b68e91c3bd5b Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Fri, 6 Feb 2026 12:18:37 -0500 Subject: [PATCH 22/36] Fix 
minian intermediate --- element_miniscope/miniscope.py | 44 ++++++++++++++-------------------- 1 file changed, 18 insertions(+), 26 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index e19f351..5ec3139 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1187,13 +1187,9 @@ def get_container_memory_limit(): memory_limit = f"{memory_per_worker // (1024**3)}GB" # Set minian intermediate storage paths - pre_cnmf_path = str(output_dir / "pre_cnmf") - first_cnmf_path = str(output_dir / "first_cnmf") - second_cnmf_path = str(output_dir / "second_cnmf") - os.makedirs(pre_cnmf_path, exist_ok=True) - os.makedirs(first_cnmf_path, exist_ok=True) - os.makedirs(second_cnmf_path, exist_ok=True) - os.environ["MINIAN_INTERMEDIATE"] = pre_cnmf_path + minian_data_path = str(output_dir / "minian_data") + os.makedirs(minian_data_path, exist_ok=True) + os.environ["MINIAN_INTERMEDIATE"] = minian_data_path # Helper function to clean intermediate files def clean_intermediate_files(directory_path, file_names): @@ -1241,7 +1237,7 @@ def clean_intermediate_files(directory_path, file_names): logger.info("Saving raw video to zarr...") varr = save_minian( varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), - pre_cnmf_path, + minian_data_path, overwrite=True, ) logger.info( @@ -1278,7 +1274,7 @@ def clean_intermediate_files(directory_path, file_names): motion = estimate_motion(varr_ref, **param_estimate_motion) motion = save_minian( motion.rename("motion").chunk({"frame": chk["frame"]}), - pre_cnmf_path, + minian_data_path, overwrite=True, ) @@ -1289,14 +1285,14 @@ def clean_intermediate_files(directory_path, file_names): logger.info("Saving motion-corrected video (frame-chunked)...") Y_fm_chk = save_minian( Y.astype(float).rename("Y_fm_chk"), - pre_cnmf_path, + minian_data_path, overwrite=True, ) logger.info("Saving motion-corrected video (spatial-chunked)...") Y_hw_chk = save_minian( 
Y_fm_chk.rename("Y_hw_chk"), - pre_cnmf_path, + minian_data_path, overwrite=True, chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, ) @@ -1308,7 +1304,7 @@ def clean_intermediate_files(directory_path, file_names): # Create and save max projection logger.info("Computing max projection...") max_proj = Y_fm_chk.max("frame").compute() - max_proj = save_minian(max_proj.rename("max_proj"), pre_cnmf_path, overwrite=True) + max_proj = save_minian(max_proj.rename("max_proj"), minian_data_path, overwrite=True) # ===== SEED INITIALIZATION ===== logger.info("Initializing seeds...") @@ -1359,14 +1355,14 @@ def clean_intermediate_files(directory_path, file_names): "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} ) A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) - A_init = save_minian(A_init.rename("A_init"), pre_cnmf_path, overwrite=True) + A_init = save_minian(A_init.rename("A_init"), minian_data_path, overwrite=True) logger.info(f"A_init shape: {A_init.shape}") logger.info("Initializing temporal traces (C)...") C_init = initC(Y_fm_chk, A_init) C_init = save_minian( C_init.rename("C_init"), - pre_cnmf_path, + minian_data_path, overwrite=True, chunks={"unit_id": 1, "frame": -1}, ) @@ -1376,11 +1372,11 @@ def clean_intermediate_files(directory_path, file_names): logger.info("Initial unit merge...") param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) A, C = unit_merge(A_init, C_init, **param_init_merge) - A = save_minian(A.rename("A"), pre_cnmf_path, overwrite=True) - C = save_minian(C.rename("C"), pre_cnmf_path, overwrite=True) + A = save_minian(A.rename("A"), minian_data_path, overwrite=True) + C = save_minian(C.rename("C"), minian_data_path, overwrite=True) C_chk = save_minian( C.rename("C_chk"), - pre_cnmf_path, + minian_data_path, overwrite=True, chunks={"unit_id": -1, "frame": chk["frame"]}, ) @@ -1389,15 +1385,15 @@ def clean_intermediate_files(directory_path, file_names): # 
===== INITIALIZE BACKGROUND ===== logger.info("Initializing background terms...") b, f = update_background(Y_fm_chk, A, C_chk) - f = save_minian(f.rename("f"), pre_cnmf_path, overwrite=True) - b = save_minian(b.rename("b"), pre_cnmf_path, overwrite=True) + f = save_minian(f.rename("f"), minian_data_path, overwrite=True) + b = save_minian(b.rename("b"), minian_data_path, overwrite=True) logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") # ===== COMPUTE NOISE STATISTICS ===== logger.info("Computing noise statistics...") param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) - sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), pre_cnmf_path, overwrite=True) + sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), minian_data_path, overwrite=True) # ======================================== # CNMF ITERATION 1 @@ -1405,8 +1401,6 @@ def clean_intermediate_files(directory_path, file_names): # clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", # "g", "YrA"]) - os.environ["MINIAN_INTERMEDIATE"] = first_cnmf_path - # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") param_first_spatial = params.get( @@ -1418,12 +1412,12 @@ def clean_intermediate_files(directory_path, file_names): ) C_new = save_minian( (C.sel(unit_id=mask) * norm_fac).rename("C_new"), - first_cnmf_path, + minian_data_path, overwrite=True, ) C_chk_new = save_minian( (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), - first_cnmf_path, + minian_data_path, overwrite=True, ) logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") @@ -1463,8 +1457,6 @@ def clean_intermediate_files(directory_path, file_names): # ======================================== # CNMF ITERATION 2 # ======================================== - - os.environ["MINIAN_INTERMEDIATE"] = second_cnmf_path # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: 
Spatial update...") From c601e1cc3e88d30bd73c2a585cd10aabcbd06155 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 11 Feb 2026 17:45:00 -0500 Subject: [PATCH 23/36] Update minian data path --- element_miniscope/miniscope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 5ec3139..3c0b2ed 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1519,7 +1519,7 @@ def clean_intermediate_files(directory_path, file_names): cluster.close() # Load results and prepare for insertion - minian_loader = MinianLoader(output_dir) + minian_loader = MinianLoader(minian_data_path) key["processing_time"] = minian_loader.creation_time # Get minian version if available From 36db0f14c22165ddea3e6558d2a10b06414f1e61 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 12 Feb 2026 09:36:56 -0500 Subject: [PATCH 24/36] Fix mismatched units --- element_miniscope/miniscope.py | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 3c0b2ed..e3d9ce5 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1398,9 +1398,10 @@ def clean_intermediate_files(directory_path, file_names): # ======================================== # CNMF ITERATION 1 # ======================================== - # clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", - # "g", "YrA"]) - + + # Clean intermediate files that might conflict + clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) + # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") param_first_spatial = params.get( @@ -1449,15 +1450,29 @@ def clean_intermediate_files(directory_path, file_names): # ----- First Merge ----- logger.info("CNMF Iteration 1: Merging units...") param_first_merge = params.get("param_first_merge", 
{"thres_corr": 0.8}) + + # <<< PATCHED: unit_merge coordinate fix + # update_temporal returns C_new/S_new already filtered by mask internally. + # Sync A with C_new's actual coordinates instead of using the boolean mask, + # which can cause alignment mismatches on large datasets. + A_filtered = A_new.sel(unit_id=C_new.coords["unit_id"].values) + logger.info( + f"Unit sync check — A: {A_filtered.sizes['unit_id']}, " + f"C: {C_new.sizes['unit_id']}, S: {S_new.sizes['unit_id']}" + ) A_mrg, C_mrg, [S_mrg] = unit_merge( - A_new.sel(unit_id=mask), C_new, [S_new], **param_first_merge + A_filtered, C_new, [S_new], **param_first_merge ) + # >>> END PATCH logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") # ======================================== # CNMF ITERATION 2 # ======================================== - + + # Clean intermediate files before second iteration + clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) + # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: Spatial update...") param_second_spatial = params.get( @@ -1492,7 +1507,11 @@ def clean_intermediate_files(directory_path, file_names): b=b_new2, f=f_new2, **param_second_temporal ) - A_final = A_new2.sel(unit_id=mask_final) + + # <<< PATCHED: unit_merge coordinate fix (same pattern as iteration 1) + # Sync A with C_final's actual coordinates + A_final = A_new2.sel(unit_id=C_final.coords["unit_id"].values) + # >>> END PATCH logger.info(f"Final units: {A_final.sizes['unit_id']}") # ===== SAVE FINAL RESULTS ===== From 0299a55235daade59e2ab139eeb321c66c24dc91 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 12 Feb 2026 10:13:51 -0500 Subject: [PATCH 25/36] Remove intermediate file clean up --- element_miniscope/miniscope.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index e3d9ce5..f66fb40 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ 
-1399,9 +1399,6 @@ def clean_intermediate_files(directory_path, file_names): # CNMF ITERATION 1 # ======================================== - # Clean intermediate files that might conflict - clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) - # ----- First Spatial Update ----- logger.info("CNMF Iteration 1: Spatial update...") param_first_spatial = params.get( @@ -1470,9 +1467,6 @@ def clean_intermediate_files(directory_path, file_names): # CNMF ITERATION 2 # ======================================== - # Clean intermediate files before second iteration - clean_intermediate_files(["C_new", "S_new", "b0_new", "c0_new", "g", "YrA"]) - # ----- Second Spatial Update ----- logger.info("CNMF Iteration 2: Spatial update...") param_second_spatial = params.get( From 568e5f2d44188c75a10b99f9018cedeeef03224d Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Thu, 12 Feb 2026 20:52:58 -0500 Subject: [PATCH 26/36] sanitize arrays to avoid nans and add failure flag --- element_miniscope/miniscope.py | 61 ++++++++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index f66fb40..768d94d 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -971,9 +971,19 @@ def _run_processing(): import multiprocessing import psutil import shutil + import dask import dask.array as darr + import scipy.linalg  # <<< NEW: for solve_triangular patch + import xarray as xr  # <<< NEW: for sanitize_array from dask.distributed import Client, LocalCluster + # <<< NEW: Prevent infinite retry loops on NaN/Inf errors >>> + # If a Dask task fails (e.g. solve_triangular ValueError), don't + # keep retrying until OOM — fail fast after 3 attempts.
+ dask.config.set({ + "distributed.scheduler.allowed-failures": 3, + }) + # ===== APPLY COMPATIBILITY PATCHES ===== # Fix for NetworkX 3.0+ API changes @@ -1119,7 +1129,20 @@ def __new__(cls, arg1, shape=None, dtype=None, copy=False): cnmf_module.update_temporal = patched_update_temporal - logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block)") + # <<< NEW PATCH: scipy.linalg.solve_triangular NaN/Inf guard >>> + # Prevents ValueError('array must not contain infs or NaNs') from + # crashing Dask tasks and triggering infinite retry → OOM loops. + _original_scipy_solve_tri = scipy.linalg.solve_triangular + + def patched_solve_triangular(a, b, **kwargs): + """Sanitize NaN/Inf before calling solve_triangular.""" + a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) + b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) + return _original_scipy_solve_tri(a, b, **kwargs) + + scipy.linalg.solve_triangular = patched_solve_triangular + + logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block, solve_triangular NaN guard)") # Now import minian modules (after patches are applied) from minian.cnmf import ( @@ -1199,6 +1222,29 @@ def clean_intermediate_files(directory_path, file_names): if os.path.exists(path): shutil.rmtree(path) + def sanitize_array(arr, name="array"): + """Replace NaN/Inf in an xarray DataArray with 0. 
+ Works with both lazy (dask-backed) and in-memory arrays.""" + logger.info(f"Sanitizing {name} (replacing NaN/Inf with 0)...") + if hasattr(arr.data, "dask"): + sanitized = xr.apply_ufunc( + lambda x: np.nan_to_num( + x, nan=0.0, posinf=0.0, neginf=0.0 + ), + arr, + dask="parallelized", + output_dtypes=[arr.dtype], + ) + else: + sanitized = xr.DataArray( + np.nan_to_num( + arr.values, nan=0.0, posinf=0.0, neginf=0.0 + ), + coords=arr.coords, + dims=arr.dims, + ) + return sanitized + # Start Dask cluster logger.info( f"Starting Minian processing with {n_workers} workers, " @@ -1281,6 +1327,14 @@ def clean_intermediate_files(directory_path, file_names): logger.info("Applying motion correction...") Y = apply_transform(varr_ref, motion, fill=0) + # <<< NEW: Sanitize after motion correction >>> + # apply_transform can introduce NaN at frame borders where the + # shift goes beyond the image boundary. The fill=0 param should + # handle this, but edge cases slip through on large datasets + # with float64 precision. This prevents downstream + # solve_triangular crashes in CNMF. 
+ Y = sanitize_array(Y, "Y_post_motion_correction") + # Save two versions with different chunking logger.info("Saving motion-corrected video (frame-chunked)...") Y_fm_chk = save_minian( @@ -1443,7 +1497,8 @@ def clean_intermediate_files(directory_path, file_names): **param_first_temporal ) logger.info(f"Units after first temporal update: {C_new.sizes['unit_id']}") - + C_new = sanitize_array(C_new, "C_new_pre_merge") + S_new = sanitize_array(S_new, "S_new_pre_merge") # ----- First Merge ----- logger.info("CNMF Iteration 1: Merging units...") param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) @@ -1501,6 +1556,8 @@ def clean_intermediate_files(directory_path, file_names): b=b_new2, f=f_new2, **param_second_temporal ) + C_final = sanitize_array(C_final, "C_final_post_temporal_2") + S_final = sanitize_array(S_final, "S_final_post_temporal_2") # <<< PATCHED: unit_merge coordinate fix (same pattern as iteration 1) # Sync A with C_final's actual coordinates From 6e7984ab4342cedd36e25ead244d5c1c3e1bc805 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Fri, 13 Feb 2026 08:53:37 -0500 Subject: [PATCH 27/36] Fixes for unexpected NaNs in minian output --- element_miniscope/miniscope.py | 80 ++++++++++++++++++++++++++++++---- 1 file changed, 71 insertions(+), 9 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 768d94d..77c1954 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1,6 +1,7 @@ import csv import copy import cv2 +import gc import importlib import inspect import json @@ -1132,15 +1133,24 @@ def __new__(cls, arg1, shape=None, dtype=None, copy=False): # <<< NEW PATCH: scipy.linalg.solve_triangular NaN/Inf guard >>> # Prevents ValueError('array must not contain infs or NaNs') from # crashing Dask tasks and triggering infinite retry → OOM loops. 
- _original_scipy_solve_tri = scipy.linalg.solve_triangular - - def patched_solve_triangular(a, b, **kwargs): - """Sanitize NaN/Inf before calling solve_triangular.""" - a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) - b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) - return _original_scipy_solve_tri(a, b, **kwargs) - - scipy.linalg.solve_triangular = patched_solve_triangular + from distributed import WorkerPlugin + + class SolveTriangularNaNGuard(WorkerPlugin): + """Patch scipy.linalg.solve_triangular on each Dask worker process + to sanitize NaN/Inf inputs instead of crashing.""" + + def setup(self, worker): + import scipy.linalg + import numpy as np + + _original = scipy.linalg.solve_triangular + + def patched_solve_triangular(a, b, **kwargs): + a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) + b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) + return _original(a, b, **kwargs) + + scipy.linalg.solve_triangular = patched_solve_triangular logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block, solve_triangular NaN guard)") @@ -1260,6 +1270,7 @@ def sanitize_array(arr, name="array"): annotation_plugin = TaskAnnotation() cluster.scheduler.add_plugin(annotation_plugin) client = Client(cluster) + client.register_plugin(SolveTriangularNaNGuard()) try: # ===== LOAD VIDEOS ===== @@ -1517,6 +1528,57 @@ def sanitize_array(arr, name="array"): ) # >>> END PATCH logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") + A_mrg = save_minian(A_mrg.rename("A_mrg"), minian_data_path, overwrite=True) + C_mrg = save_minian(C_mrg.rename("C_mrg"), minian_data_path, overwrite=True) + S_mrg = save_minian(S_mrg.rename("S_mrg"), minian_data_path, overwrite=True) + gc.collect() + logger.info("Saved merged results to zarr for iteration 2") + # Scale down workers for memory-intensive CNMF spatial updates. 
+ # Preprocessing/motion correction can use 8 workers fine, but + # update_spatial with 71k+ frames needs fewer workers with more + # memory each to avoid OOM on intermediate QR/tensordot products. + n_frames = varr.sizes["frame"] + if n_frames > 5000 and n_workers > 2: + logger.info( + f"Large dataset ({n_frames} frames) — scaling to 2 workers " + f"for CNMF iteration 2 to avoid memory pressure" + ) + client.close() + cluster.close() + gc.collect() + + cnmf_n_workers = 2 + cnmf_mem_per_worker = f"{int(memory_total * 0.8 / cnmf_n_workers / (1024**3))}GB" + cluster = LocalCluster( + n_workers=cnmf_n_workers, + memory_limit=cnmf_mem_per_worker, + resources={"MEM": 1}, + threads_per_worker=2, + dashboard_address=None, + ) + annotation_plugin = TaskAnnotation() + cluster.scheduler.add_plugin(annotation_plugin) + client = Client(cluster) + # Re-register the solve_triangular guard on new workers + client.register_plugin(SolveTriangularNaNGuard()) + logger.info( + f"Restarted cluster: {cnmf_n_workers} workers × " + f"{cnmf_mem_per_worker} each" + ) + + # Reload zarr-backed arrays for the new cluster + Y_fm_chk = save_minian( + xr.open_zarr(minian_data_path + "/Y_fm_chk.zarr")["Y_fm_chk"].rename("Y_fm_chk"), + minian_data_path, overwrite=True, + ) + Y_hw_chk = save_minian( + xr.open_zarr(minian_data_path + "/Y_hw_chk.zarr")["Y_hw_chk"].rename("Y_hw_chk"), + minian_data_path, overwrite=True, + ) + sn_spatial = save_minian( + xr.open_zarr(minian_data_path + "/sn_spatial.zarr")["sn_spatial"].rename("sn_spatial"), + minian_data_path, overwrite=True, + ) # ======================================== # CNMF ITERATION 2 From 65ba4a23aa9305894a1af9d2254ddbc2e1d9b122 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Fri, 13 Feb 2026 08:58:38 -0500 Subject: [PATCH 28/36] fix worker plugin syntax attempt 1 --- element_miniscope/miniscope.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 
77c1954..4aa191e 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1270,7 +1270,7 @@ def sanitize_array(arr, name="array"): annotation_plugin = TaskAnnotation() cluster.scheduler.add_plugin(annotation_plugin) client = Client(cluster) - client.register_plugin(SolveTriangularNaNGuard()) + client.register_worker_plugin(SolveTriangularNaNGuard()) try: # ===== LOAD VIDEOS ===== @@ -1560,7 +1560,7 @@ def sanitize_array(arr, name="array"): cluster.scheduler.add_plugin(annotation_plugin) client = Client(cluster) # Re-register the solve_triangular guard on new workers - client.register_plugin(SolveTriangularNaNGuard()) + client.register_worker_plugin(SolveTriangularNaNGuard()) logger.info( f"Restarted cluster: {cnmf_n_workers} workers × " f"{cnmf_mem_per_worker} each" From ea45ff0caae79fcd9cfd5f6f40617d643767123d Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Fri, 13 Feb 2026 09:19:38 -0500 Subject: [PATCH 29/36] Remove scaling down workers for now --- element_miniscope/miniscope.py | 46 ---------------------------------- 1 file changed, 46 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 4aa191e..19af1e4 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1533,52 +1533,6 @@ def sanitize_array(arr, name="array"): S_mrg = save_minian(S_mrg.rename("S_mrg"), minian_data_path, overwrite=True) gc.collect() logger.info("Saved merged results to zarr for iteration 2") - # Scale down workers for memory-intensive CNMF spatial updates. - # Preprocessing/motion correction can use 8 workers fine, but - # update_spatial with 71k+ frames needs fewer workers with more - # memory each to avoid OOM on intermediate QR/tensordot products. 
- n_frames = varr.sizes["frame"] - if n_frames > 5000 and n_workers > 2: - logger.info( - f"Large dataset ({n_frames} frames) — scaling to 2 workers " - f"for CNMF iteration 2 to avoid memory pressure" - ) - client.close() - cluster.close() - gc.collect() - - cnmf_n_workers = 2 - cnmf_mem_per_worker = f"{int(memory_total * 0.8 / cnmf_n_workers / (1024**3))}GB" - cluster = LocalCluster( - n_workers=cnmf_n_workers, - memory_limit=cnmf_mem_per_worker, - resources={"MEM": 1}, - threads_per_worker=2, - dashboard_address=None, - ) - annotation_plugin = TaskAnnotation() - cluster.scheduler.add_plugin(annotation_plugin) - client = Client(cluster) - # Re-register the solve_triangular guard on new workers - client.register_worker_plugin(SolveTriangularNaNGuard()) - logger.info( - f"Restarted cluster: {cnmf_n_workers} workers × " - f"{cnmf_mem_per_worker} each" - ) - - # Reload zarr-backed arrays for the new cluster - Y_fm_chk = save_minian( - xr.open_zarr(minian_data_path + "/Y_fm_chk.zarr")["Y_fm_chk"].rename("Y_fm_chk"), - minian_data_path, overwrite=True, - ) - Y_hw_chk = save_minian( - xr.open_zarr(minian_data_path + "/Y_hw_chk.zarr")["Y_hw_chk"].rename("Y_hw_chk"), - minian_data_path, overwrite=True, - ) - sn_spatial = save_minian( - xr.open_zarr(minian_data_path + "/sn_spatial.zarr")["sn_spatial"].rename("sn_spatial"), - minian_data_path, overwrite=True, - ) # ======================================== # CNMF ITERATION 2 From 5476a45707df6e4e40f562be763bbcbff02926c7 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Mon, 16 Feb 2026 15:45:05 -0500 Subject: [PATCH 30/36] Add dask timeouts --- element_miniscope/miniscope.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 19af1e4..5256b5f 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -983,6 +983,10 @@ def _run_processing(): # keep retrying until OOM — fail fast after 3 attempts. 
dask.config.set({ "distributed.scheduler.allowed-failures": 3, + "distributed.comm.timeouts.connect": "300s", + "distributed.comm.timeouts.tcp": "7200s", # 2 hours + "distributed.worker.lifetime.stale": "7200s", # don't mark workers stale quickly + "distributed.scheduler.work-stealing": False, }) # ===== APPLY COMPATIBILITY PATCHES ===== @@ -1349,7 +1353,7 @@ def sanitize_array(arr, name="array"): # Save two versions with different chunking logger.info("Saving motion-corrected video (frame-chunked)...") Y_fm_chk = save_minian( - Y.astype(float).rename("Y_fm_chk"), + Y.astype(np.float32).rename("Y_fm_chk"), minian_data_path, overwrite=True, ) From 1c89ba1f05849cc64776c963545101ee69aa2304 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Tue, 17 Feb 2026 09:59:13 -0500 Subject: [PATCH 31/36] Increase worker timeout and do not save movie for now --- element_miniscope/miniscope.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 5256b5f..d2f3602 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -984,9 +984,9 @@ def _run_processing(): dask.config.set({ "distributed.scheduler.allowed-failures": 3, "distributed.comm.timeouts.connect": "300s", - "distributed.comm.timeouts.tcp": "7200s", # 2 hours - "distributed.worker.lifetime.stale": "7200s", # don't mark workers stale quickly + "distributed.comm.timeouts.tcp": "7200s", "distributed.scheduler.work-stealing": False, + "distributed.scheduler.worker-ttl": "7200s", # ← ADD THIS: 1 hour before declaring worker dead }) # ===== APPLY COMPATIBILITY PATCHES ===== @@ -1366,9 +1366,9 @@ def sanitize_array(arr, name="array"): chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, ) - # Save motion corrected video as mp4 - logger.info("Writing motion corrected video...") - write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) + # # Save motion corrected video as mp4 + # 
logger.info("Writing motion corrected video...") + # write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) # Create and save max projection logger.info("Computing max projection...") From 81ad94c0e7387e72841eceac664b489025f90b7c Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Mon, 23 Feb 2026 13:08:53 -0500 Subject: [PATCH 32/36] Add Docker container execution path for original minian (py38) When extra_dj_params.docker_image is set in ProcessingParamSet, the make_compute() method spawns a sibling Python 3.8 container running the original denisecailab/minian instead of the inline modernized path. This ensures result parity with the customer team's original pipeline. Co-Authored-By: Claude Opus 4.6 --- element_miniscope/miniscope.py | 1373 +++++++++++++++++--------------- 1 file changed, 742 insertions(+), 631 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index d2f3602..69fc3b1 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -969,665 +969,776 @@ def _run_processing(): ] elif method == "minian": - import multiprocessing - import psutil - import shutil - import dask - import dask.array as darr - import scipy.linalg # <<< NEW: for solve_triangular patch - import xarray as xr # <<< NEW: for sanitize_array - from dask.distributed import Client, LocalCluster - - # <<< NEW: Prevent infinite retry loops on NaN/Inf errors >>> - # If a Dask task fails (e.g. solve_triangular ValueError), don't - # keep retrying until OOM — fail fast after 3 attempts. 
- dask.config.set({ - "distributed.scheduler.allowed-failures": 3, - "distributed.comm.timeouts.connect": "300s", - "distributed.comm.timeouts.tcp": "7200s", - "distributed.scheduler.work-stealing": False, - "distributed.scheduler.worker-ttl": "7200s", # ← ADD THIS: 1 hour before declaring worker dead - }) - - # ===== APPLY COMPATIBILITY PATCHES ===== - - # Fix for NetworkX 3.0+ API changes - import networkx as nx - import scipy.sparse - from scipy.sparse import issparse - import minian.cnmf as cnmf_module - - def label_connected_fixed(adj, only_connected=False): - """Fixed label_connected for NetworkX 3.0+ compatibility.""" - if issparse(adj): - adj = adj.toarray() - adj = adj.copy() - np.fill_diagonal(adj, 0) - adj = np.triu(adj) - g = nx.from_numpy_array(adj) - labels = np.zeros(adj.shape[0], dtype=int) - for icomp, comp in enumerate(nx.connected_components(g)): - for node in comp: - labels[node] = icomp - if only_connected: - iso_mask = np.array([len(c) == 1 for c in nx.connected_components(g)]) - labels[np.isin(labels, np.where(iso_mask)[0])] = -1 - return labels - - cnmf_module.label_connected = label_connected_fixed - - # Fix for sparse array auto-densification - import sparse - import sparse.numba_backend._sparse_array as sparse_mod - sparse_mod.AUTO_DENSIFY = True - - # Fix for darr.block with mixed sparse array types - _original_darr_block = darr.block - - def patched_darr_block(arrays, allow_unknown_chunksizes=False): - """Patched darr.block that ensures sparse array type consistency.""" - def convert_to_coo(arr): - if arr is None: - return arr - if isinstance(arr, sparse.COO): - return arr - if isinstance(arr, sparse.SparseArray): - return sparse.COO(arr) - if isinstance(arr, np.ndarray): - return sparse.COO.from_numpy(arr) - if hasattr(arr, 'todense'): - return sparse.COO.from_numpy(np.asarray(arr.todense())) - return arr - - def recursive_convert(obj): - if isinstance(obj, list): - return [recursive_convert(item) for item in obj] - elif 
isinstance(obj, np.ndarray) and obj.dtype == object: - result = np.empty_like(obj) - for idx in np.ndindex(obj.shape): - result[idx] = convert_to_coo(obj[idx]) - return result - else: - return convert_to_coo(obj) - - try: - return _original_darr_block(arrays, allow_unknown_chunksizes=allow_unknown_chunksizes) - except ValueError as e: - if "All arrays must be instances of SparseArray" in str(e): - converted = recursive_convert(arrays) - return _original_darr_block(converted, allow_unknown_chunksizes=allow_unknown_chunksizes) - raise - - darr.block = patched_darr_block - darr.core.block = patched_darr_block - - # ===== PATCH: sparse/dense concatenation compatibility ===== - import sparse - import sparse.numba_backend._coo.common as _sparse_coo_common - import numpy as np - - _original_sparse_concat = _sparse_coo_common.concatenate - - def _patched_sparse_concat(arrays, axis=0): - """Convert any dense arrays to sparse.COO before concatenation.""" - converted = [] - for arr in arrays: - if isinstance(arr, sparse.SparseArray): - converted.append(arr) - elif isinstance(arr, np.ndarray): - converted.append(sparse.COO.from_numpy(arr)) - else: - try: - converted.append(sparse.COO.from_numpy(np.asarray(arr))) - except Exception: - converted.append(arr) - return _original_sparse_concat(converted, axis=axis) - - _sparse_coo_common.concatenate = _patched_sparse_concat - - # ===== PATCH: sparse/dense concatenation compatibility ===== - import sparse - import numpy as np - import sparse.numba_backend._coo.common as _sparse_coo_common - - _original_check = _sparse_coo_common.check_consistent_fill_value - - def _patched_check(arrays): - """Auto-convert any dense arrays to sparse.COO before validation.""" - for i, arr in enumerate(arrays): - if not isinstance(arr, sparse.SparseArray): - arrays[i] = sparse.COO.from_numpy(np.asarray(arr)) - return _original_check(arrays) - - _sparse_coo_common.check_consistent_fill_value = _patched_check - # Patch update_temporal to handle sparse.COO 
arrays - _original_update_temporal = cnmf_module.update_temporal - - def patched_update_temporal(A, C, b=None, f=None, Y=None, YrA=None, - noise_freq=0.25, p=2, add_lag="p", jac_thres=0.1, - sparse_penal=1, bseg=None, med_wd=None, - zero_thres=1e-8, max_iters=200, use_smooth=True, - normalize=True, warm_start=False, post_scal=False, - scs_fallback=False, concurrent_update=False): - """Patched update_temporal that handles sparse.COO arrays properly.""" - _original_csc_matrix = scipy.sparse.csc_matrix - - class PatchedCSCMatrix(scipy.sparse.csc_matrix): - def __new__(cls, arg1, shape=None, dtype=None, copy=False): - if hasattr(arg1, 'todense'): - arg1 = arg1.todense() - return _original_csc_matrix(arg1, shape=shape, dtype=dtype, copy=copy) - - scipy.sparse.csc_matrix = PatchedCSCMatrix - try: - result = _original_update_temporal( - A, C, b=b, f=f, Y=Y, YrA=YrA, noise_freq=noise_freq, - p=p, add_lag=add_lag, jac_thres=jac_thres, sparse_penal=sparse_penal, - bseg=bseg, med_wd=med_wd, zero_thres=zero_thres, max_iters=max_iters, - use_smooth=use_smooth, normalize=normalize, warm_start=warm_start, - post_scal=post_scal, scs_fallback=scs_fallback, - concurrent_update=concurrent_update - ) - finally: - scipy.sparse.csc_matrix = _original_csc_matrix - return result - - cnmf_module.update_temporal = patched_update_temporal - - # <<< NEW PATCH: scipy.linalg.solve_triangular NaN/Inf guard >>> - # Prevents ValueError('array must not contain infs or NaNs') from - # crashing Dask tasks and triggering infinite retry → OOM loops. 
- from distributed import WorkerPlugin + extra_params = params.pop("extra_dj_params", {}) + docker_image = extra_params.get("docker_image", None) - class SolveTriangularNaNGuard(WorkerPlugin): - """Patch scipy.linalg.solve_triangular on each Dask worker process - to sanitize NaN/Inf inputs instead of crashing.""" - - def setup(self, worker): - import scipy.linalg - import numpy as np - - _original = scipy.linalg.solve_triangular - - def patched_solve_triangular(a, b, **kwargs): - a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) - b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) - return _original(a, b, **kwargs) - - scipy.linalg.solve_triangular = patched_solve_triangular + if docker_image: + # === DOCKER CONTAINER EXECUTION PATH === + # Runs the original minian (Python 3.8) in a sibling container + import docker as docker_sdk + import json as json_mod - logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block, solve_triangular NaN guard)") + logger.info( + f"Running minian via Docker container: {docker_image}" + ) - # Now import minian modules (after patches are applied) - from minian.cnmf import ( - get_noise_fft, - unit_merge, - update_spatial, - update_temporal, - update_background, - ) - from minian.initialization import ( - initA, - initC, - ks_refine, - pnr_refine, - seeds_init, - seeds_merge, - ) - from minian.motion_correction import apply_transform, estimate_motion - from minian.preprocessing import denoise, remove_background - from minian.utilities import ( - TaskAnnotation, - get_optimal_chk, - load_videos, - save_minian, - ) - from minian.visualization import write_video + # Resolve host-level paths for sibling container volume mounts + host_s3_root = os.environ["HOST_S3_ROOT"] + host_outbox = os.environ["HOST_OUTBOX"] + container_raw_root = os.environ.get( + "RAW_ROOT_DATA_DIR", "/home/jovyan/s3/inbox" + ) + container_processed_root = os.environ.get( + "PROCESSED_ROOT_DATA_DIR", "/home/jovyan/efs/outbox" + ) - # 
===== CONTAINER-AWARE MEMORY DETECTION ===== - def get_container_memory_limit(): - """Get memory limit respecting container cgroups (v1 and v2).""" - try: - with open("/sys/fs/cgroup/memory.max") as f: - limit = f.read().strip() - if limit != "max": - return int(limit) - except (FileNotFoundError, PermissionError): - pass - try: - with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f: - limit = int(f.read().strip()) - if limit < 9223372036854771712: - return limit - except (FileNotFoundError, PermissionError): - pass - return psutil.virtual_memory().total - - # ===== DASK CLUSTER CONFIGURATION ===== - memory_total = get_container_memory_limit() - n_workers = int( - os.getenv( - "MINIAN_NWORKERS", - max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), + # Map container paths -> host paths for sibling container mounts + input_dir = str(pathlib.Path(avi_files[0]).parent) + input_dir_host = input_dir.replace( + container_raw_root, host_s3_root + "/inbox", 1 + ) + output_dir_host = str(output_dir).replace( + container_processed_root, host_outbox, 1 ) - ) - memory_per_worker = int(memory_total * 0.8 / n_workers) - memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") - if memory_limit_env: + # Write config JSON to shared filesystem + n_workers = int(os.getenv("MINIAN_NWORKERS", 2)) + memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT", "4") memory_limit = ( memory_limit_env if any(c.isalpha() for c in memory_limit_env) else f"{memory_limit_env}GB" ) - else: - memory_limit = f"{memory_per_worker // (1024**3)}GB" - - # Set minian intermediate storage paths - minian_data_path = str(output_dir / "minian_data") - os.makedirs(minian_data_path, exist_ok=True) - os.environ["MINIAN_INTERMEDIATE"] = minian_data_path - - # Helper function to clean intermediate files - def clean_intermediate_files(directory_path, file_names): - """Remove intermediate zarr files to avoid conflicts.""" - for fname in file_names: - path = os.path.join(directory_path, f"{fname}.zarr") - if 
os.path.exists(path): - shutil.rmtree(path) - - def sanitize_array(arr, name="array"): - """Replace NaN/Inf in an xarray DataArray with 0. - Works with both lazy (dask-backed) and in-memory arrays.""" - logger.info(f"Sanitizing {name} (replacing NaN/Inf with 0)...") - if hasattr(arr.data, "dask"): - sanitized = xr.apply_ufunc( - lambda x: np.nan_to_num( - x, nan=0.0, posinf=0.0, neginf=0.0 - ), - arr, - dask="parallelized", - output_dtypes=[arr.dtype], - ) - else: - sanitized = xr.DataArray( - np.nan_to_num( - arr.values, nan=0.0, posinf=0.0, neginf=0.0 - ), - coords=arr.coords, - dims=arr.dims, - ) - return sanitized - - # Start Dask cluster - logger.info( - f"Starting Minian processing with {n_workers} workers, " - f"{memory_limit} memory limit per worker..." - ) - cluster = LocalCluster( - n_workers=n_workers, - memory_limit=memory_limit, - resources={"MEM": 1}, - threads_per_worker=2, - dashboard_address=None, - ) - annotation_plugin = TaskAnnotation() - cluster.scheduler.add_plugin(annotation_plugin) - client = Client(cluster) - client.register_worker_plugin(SolveTriangularNaNGuard()) - - try: - # ===== LOAD VIDEOS ===== - logger.info("Loading videos...") - default_load_params = { - "pattern": r".*\.avi$", - "dtype": np.uint8, - "downsample": dict(frame=1, height=1, width=1), - "downsample_strategy": "subset", - } - param_load_videos = { - **default_load_params, - **params.get("param_load_videos", {}), + config = { + "input_dir": "/data/input", + "output_dir": "/data/output", + "intermediate_dir": "/data/output/minian_data", + "params": params, + "n_workers": n_workers, + "memory_limit": memory_limit, } - varr = load_videos( - str(pathlib.Path(avi_files[0]).parent), **param_load_videos - ) - chk, _ = get_optimal_chk(varr, dtype=float) - - # Save raw video to zarr - logger.info("Saving raw video to zarr...") - varr = save_minian( - varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), - minian_data_path, - overwrite=True, + config_path 
= output_dir / "minian_config.json" + with open(config_path, "w") as f: + json_mod.dump(config, f, indent=2, default=str) + + # Spawn sibling container + client = docker_sdk.from_env() + container_result = client.containers.run( + image=docker_image, + command=[ + "python", + "/opt/run_minian.py", + "/data/output/minian_config.json", + ], + volumes={ + input_dir_host: {"bind": "/data/input", "mode": "ro"}, + output_dir_host: {"bind": "/data/output", "mode": "rw"}, + }, + mem_limit=extra_params.get("container_mem_limit", "24g"), + environment={ + "MINIAN_NWORKERS": str(n_workers), + "MINIAN_MEMORY_LIMIT": memory_limit_env, + "MKL_NUM_THREADS": "1", + "OPENBLAS_NUM_THREADS": "1", + "OMP_NUM_THREADS": "1", + }, + remove=True, + detach=False, + stdout=True, + stderr=True, ) logger.info( - f"Loaded video: {varr.sizes['frame']} frames, " - f"{varr.sizes['height']}x{varr.sizes['width']} pixels" - ) - - # ===== PREPROCESSING ===== - logger.info("Preprocessing: glow removal...") - varr_min = varr.min("frame").compute() - varr_ref = varr - varr_min - varr_ref = varr_ref.clip(min=0) - - logger.info("Preprocessing: denoising...") - param_denoise = params.get( - "param_denoise", {"method": "median", "ksize": 7} - ) - varr_ref = denoise(varr_ref, **param_denoise) - - logger.info("Preprocessing: background removal...") - param_background_removal = params.get( - "param_background_removal", {"method": "tophat", "wnd": 10} + f"Container output:\n{container_result.decode()}" ) - varr_ref = remove_background(varr_ref, **param_background_removal) - # Keep it chunked - varr_ref = varr_ref.chunk({"frame": chk["frame"], "height": -1, "width": -1}) - - # ===== MOTION CORRECTION ===== - logger.info("Estimating motion...") - param_estimate_motion = params.get( - "param_estimate_motion", {"dim": "frame"} - ) - motion = estimate_motion(varr_ref, **param_estimate_motion) - motion = save_minian( - motion.rename("motion").chunk({"frame": chk["frame"]}), - minian_data_path, - overwrite=True, - ) - 
- logger.info("Applying motion correction...") - Y = apply_transform(varr_ref, motion, fill=0) - - # <<< NEW: Sanitize after motion correction >>> - # apply_transform can introduce NaN at frame borders where the - # shift goes beyond the image boundary. The fill=0 param should - # handle this, but edge cases slip through on large datasets - # with float64 precision. This prevents downstream - # solve_triangular crashes in CNMF. - Y = sanitize_array(Y, "Y_post_motion_correction") - - # Save two versions with different chunking - logger.info("Saving motion-corrected video (frame-chunked)...") - Y_fm_chk = save_minian( - Y.astype(np.float32).rename("Y_fm_chk"), - minian_data_path, - overwrite=True, - ) - - logger.info("Saving motion-corrected video (spatial-chunked)...") - Y_hw_chk = save_minian( - Y_fm_chk.rename("Y_hw_chk"), - minian_data_path, - overwrite=True, - chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, - ) - - # # Save motion corrected video as mp4 - # logger.info("Writing motion corrected video...") - # write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) + # Verify completion marker + if (output_dir / ".minian_error").exists(): + with open(output_dir / ".minian_error") as f: + err = json_mod.load(f) + raise RuntimeError( + f"Minian container error: {err.get('error')}" + ) + if not (output_dir / ".minian_complete").exists(): + raise RuntimeError( + "Minian container exited without completion marker" + ) - # Create and save max projection - logger.info("Computing max projection...") - max_proj = Y_fm_chk.max("frame").compute() - max_proj = save_minian(max_proj.rename("max_proj"), minian_data_path, overwrite=True) + # Load results (reuses existing MinianLoader) + minian_loader = MinianLoader(str(output_dir)) + key["processing_time"] = minian_loader.creation_time + key["package_version"] = "1.2.1-py38-docker" - # ===== SEED INITIALIZATION ===== - logger.info("Initializing seeds...") - param_seeds_init = params.get( - 
"param_seeds_init", + file_entries = [ { - "wnd_size": 1000, - "method": "rolling", - "stp_size": 500, - "max_wnd": 15, - "diff_thres": 3, - }, - ) - seeds = seeds_init(Y_fm_chk, **param_seeds_init) - logger.info(f"Initial seeds: {len(seeds)}") - - logger.info("Refining seeds with PNR...") - param_pnr_refine = params.get( - "param_pnr_refine", {"noise_freq": 0.06, "thres": 1} - ) - seeds, pnr, gmm = pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) - logger.info(f"Seeds after PNR refine: {seeds['mask_pnr'].sum()} / {len(seeds)}") - - logger.info("Refining seeds with KS test...") - param_ks_refine = params.get("param_ks_refine", {"sig": 0.05}) - seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) - logger.info(f"Seeds after KS refine: {seeds['mask_ks'].sum()} / {len(seeds)}") - - logger.info("Merging seeds...") - param_seeds_merge = params.get( - "param_seeds_merge", - {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, - ) - seeds_final = seeds[seeds["mask_ks"] & seeds["mask_pnr"]].reset_index(drop=True) - seeds_final = seeds_merge(Y_hw_chk, max_proj, seeds_final, **param_seeds_merge) - n_seeds = seeds_final["mask_mrg"].sum() - logger.info(f"Seeds after merge: {n_seeds} / {len(seeds_final)}") - - if n_seeds == 0: - raise ValueError( - "No seeds remaining after refinement. " - "Consider adjusting param_seeds_init or param_pnr_refine thresholds." 
- ) + **key, + "file_name": f.name, + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] - # ===== INITIALIZE A AND C ===== - logger.info("Initializing spatial footprints (A)...") - param_initialize = params.get( - "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} - ) - A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) - A_init = save_minian(A_init.rename("A_init"), minian_data_path, overwrite=True) - logger.info(f"A_init shape: {A_init.shape}") - - logger.info("Initializing temporal traces (C)...") - C_init = initC(Y_fm_chk, A_init) - C_init = save_minian( - C_init.rename("C_init"), - minian_data_path, - overwrite=True, - chunks={"unit_id": 1, "frame": -1}, - ) - logger.info(f"C_init shape: {C_init.shape}") - - # Initial unit merge - logger.info("Initial unit merge...") - param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) - A, C = unit_merge(A_init, C_init, **param_init_merge) - A = save_minian(A.rename("A"), minian_data_path, overwrite=True) - C = save_minian(C.rename("C"), minian_data_path, overwrite=True) - C_chk = save_minian( - C.rename("C_chk"), - minian_data_path, - overwrite=True, - chunks={"unit_id": -1, "frame": chk["frame"]}, - ) - logger.info(f"Units after initial merge: {A.sizes['unit_id']}") - - # ===== INITIALIZE BACKGROUND ===== - logger.info("Initializing background terms...") - b, f = update_background(Y_fm_chk, A, C_chk) - f = save_minian(f.rename("f"), minian_data_path, overwrite=True) - b = save_minian(b.rename("b"), minian_data_path, overwrite=True) - logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") - - # ===== COMPUTE NOISE STATISTICS ===== - logger.info("Computing noise statistics...") - param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) - sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) - sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), minian_data_path, overwrite=True) - - # 
======================================== - # CNMF ITERATION 1 - # ======================================== - - # ----- First Spatial Update ----- - logger.info("CNMF Iteration 1: Spatial update...") - param_first_spatial = params.get( - "param_first_spatial", - {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, - ) - A_new, mask, norm_fac = update_spatial( - Y_hw_chk, A, C, sn_spatial, **param_first_spatial - ) - C_new = save_minian( - (C.sel(unit_id=mask) * norm_fac).rename("C_new"), - minian_data_path, - overwrite=True, + else: + # === EXISTING INLINE EXECUTION PATH === + import multiprocessing + import psutil + import shutil + import dask + import dask.array as darr + import scipy.linalg # <<< NEW: for solve_triangular patch + import xarray as xr # <<< NEW: for sanitize_array + from dask.distributed import Client, LocalCluster + + # <<< NEW: Prevent infinite retry loops on NaN/Inf errors >>> + # If a Dask task fails (e.g. solve_triangular ValueError), don't + # keep retrying until OOM — fail fast after 3 attempts. 
+ dask.config.set({ + "distributed.scheduler.allowed-failures": 3, + "distributed.comm.timeouts.connect": "300s", + "distributed.comm.timeouts.tcp": "7200s", + "distributed.scheduler.work-stealing": False, + "distributed.scheduler.worker-ttl": "7200s", # ← ADD THIS: 1 hour before declaring worker dead + }) + + # ===== APPLY COMPATIBILITY PATCHES ===== + + # Fix for NetworkX 3.0+ API changes + import networkx as nx + import scipy.sparse + from scipy.sparse import issparse + import minian.cnmf as cnmf_module + + def label_connected_fixed(adj, only_connected=False): + """Fixed label_connected for NetworkX 3.0+ compatibility.""" + if issparse(adj): + adj = adj.toarray() + adj = adj.copy() + np.fill_diagonal(adj, 0) + adj = np.triu(adj) + g = nx.from_numpy_array(adj) + labels = np.zeros(adj.shape[0], dtype=int) + for icomp, comp in enumerate(nx.connected_components(g)): + for node in comp: + labels[node] = icomp + if only_connected: + iso_mask = np.array([len(c) == 1 for c in nx.connected_components(g)]) + labels[np.isin(labels, np.where(iso_mask)[0])] = -1 + return labels + + cnmf_module.label_connected = label_connected_fixed + + # Fix for sparse array auto-densification + import sparse + import sparse.numba_backend._sparse_array as sparse_mod + sparse_mod.AUTO_DENSIFY = True + + # Fix for darr.block with mixed sparse array types + _original_darr_block = darr.block + + def patched_darr_block(arrays, allow_unknown_chunksizes=False): + """Patched darr.block that ensures sparse array type consistency.""" + def convert_to_coo(arr): + if arr is None: + return arr + if isinstance(arr, sparse.COO): + return arr + if isinstance(arr, sparse.SparseArray): + return sparse.COO(arr) + if isinstance(arr, np.ndarray): + return sparse.COO.from_numpy(arr) + if hasattr(arr, 'todense'): + return sparse.COO.from_numpy(np.asarray(arr.todense())) + return arr + + def recursive_convert(obj): + if isinstance(obj, list): + return [recursive_convert(item) for item in obj] + elif 
isinstance(obj, np.ndarray) and obj.dtype == object: + result = np.empty_like(obj) + for idx in np.ndindex(obj.shape): + result[idx] = convert_to_coo(obj[idx]) + return result + else: + return convert_to_coo(obj) + + try: + return _original_darr_block(arrays, allow_unknown_chunksizes=allow_unknown_chunksizes) + except ValueError as e: + if "All arrays must be instances of SparseArray" in str(e): + converted = recursive_convert(arrays) + return _original_darr_block(converted, allow_unknown_chunksizes=allow_unknown_chunksizes) + raise + + darr.block = patched_darr_block + darr.core.block = patched_darr_block + + # ===== PATCH: sparse/dense concatenation compatibility ===== + import sparse + import sparse.numba_backend._coo.common as _sparse_coo_common + import numpy as np + + _original_sparse_concat = _sparse_coo_common.concatenate + + def _patched_sparse_concat(arrays, axis=0): + """Convert any dense arrays to sparse.COO before concatenation.""" + converted = [] + for arr in arrays: + if isinstance(arr, sparse.SparseArray): + converted.append(arr) + elif isinstance(arr, np.ndarray): + converted.append(sparse.COO.from_numpy(arr)) + else: + try: + converted.append(sparse.COO.from_numpy(np.asarray(arr))) + except Exception: + converted.append(arr) + return _original_sparse_concat(converted, axis=axis) + + _sparse_coo_common.concatenate = _patched_sparse_concat + + # ===== PATCH: sparse/dense concatenation compatibility ===== + import sparse + import numpy as np + import sparse.numba_backend._coo.common as _sparse_coo_common + + _original_check = _sparse_coo_common.check_consistent_fill_value + + def _patched_check(arrays): + """Auto-convert any dense arrays to sparse.COO before validation.""" + for i, arr in enumerate(arrays): + if not isinstance(arr, sparse.SparseArray): + arrays[i] = sparse.COO.from_numpy(np.asarray(arr)) + return _original_check(arrays) + + _sparse_coo_common.check_consistent_fill_value = _patched_check + # Patch update_temporal to handle sparse.COO 
arrays + _original_update_temporal = cnmf_module.update_temporal + + def patched_update_temporal(A, C, b=None, f=None, Y=None, YrA=None, + noise_freq=0.25, p=2, add_lag="p", jac_thres=0.1, + sparse_penal=1, bseg=None, med_wd=None, + zero_thres=1e-8, max_iters=200, use_smooth=True, + normalize=True, warm_start=False, post_scal=False, + scs_fallback=False, concurrent_update=False): + """Patched update_temporal that handles sparse.COO arrays properly.""" + _original_csc_matrix = scipy.sparse.csc_matrix + + class PatchedCSCMatrix(scipy.sparse.csc_matrix): + def __new__(cls, arg1, shape=None, dtype=None, copy=False): + if hasattr(arg1, 'todense'): + arg1 = arg1.todense() + return _original_csc_matrix(arg1, shape=shape, dtype=dtype, copy=copy) + + scipy.sparse.csc_matrix = PatchedCSCMatrix + try: + result = _original_update_temporal( + A, C, b=b, f=f, Y=Y, YrA=YrA, noise_freq=noise_freq, + p=p, add_lag=add_lag, jac_thres=jac_thres, sparse_penal=sparse_penal, + bseg=bseg, med_wd=med_wd, zero_thres=zero_thres, max_iters=max_iters, + use_smooth=use_smooth, normalize=normalize, warm_start=warm_start, + post_scal=post_scal, scs_fallback=scs_fallback, + concurrent_update=concurrent_update + ) + finally: + scipy.sparse.csc_matrix = _original_csc_matrix + return result + + cnmf_module.update_temporal = patched_update_temporal + + # <<< NEW PATCH: scipy.linalg.solve_triangular NaN/Inf guard >>> + # Prevents ValueError('array must not contain infs or NaNs') from + # crashing Dask tasks and triggering infinite retry → OOM loops. 
+ from distributed import WorkerPlugin + + class SolveTriangularNaNGuard(WorkerPlugin): + """Patch scipy.linalg.solve_triangular on each Dask worker process + to sanitize NaN/Inf inputs instead of crashing.""" + + def setup(self, worker): + import scipy.linalg + import numpy as np + + _original = scipy.linalg.solve_triangular + + def patched_solve_triangular(a, b, **kwargs): + a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) + b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) + return _original(a, b, **kwargs) + + scipy.linalg.solve_triangular = patched_solve_triangular + + logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block, solve_triangular NaN guard)") + + # Now import minian modules (after patches are applied) + from minian.cnmf import ( + get_noise_fft, + unit_merge, + update_spatial, + update_temporal, + update_background, ) - C_chk_new = save_minian( - (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), - minian_data_path, - overwrite=True, + from minian.initialization import ( + initA, + initC, + ks_refine, + pnr_refine, + seeds_init, + seeds_merge, ) - logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") - - # Update background after first spatial - logger.info("Updating background...") - b_new, f_new = update_background(Y_fm_chk, A_new, C_chk_new) - - # ----- First Temporal Update ----- - logger.info("CNMF Iteration 1: Temporal update...") - param_first_temporal = params.get( - "param_first_temporal", - { - "noise_freq": 0.06, - "sparse_penal": 1, - "p": 1, - "add_lag": 20, - "jac_thres": 0.2, - }, + from minian.motion_correction import apply_transform, estimate_motion + from minian.preprocessing import denoise, remove_background + from minian.utilities import ( + TaskAnnotation, + get_optimal_chk, + load_videos, + save_minian, ) - C_new, S_new, b0_new, c0_new, g, mask = update_temporal( - A_new, C_new, - Y=Y_fm_chk, - b=b_new, f=f_new, - **param_first_temporal + from 
minian.visualization import write_video + + # ===== CONTAINER-AWARE MEMORY DETECTION ===== + def get_container_memory_limit(): + """Get memory limit respecting container cgroups (v1 and v2).""" + try: + with open("/sys/fs/cgroup/memory.max") as f: + limit = f.read().strip() + if limit != "max": + return int(limit) + except (FileNotFoundError, PermissionError): + pass + try: + with open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f: + limit = int(f.read().strip()) + if limit < 9223372036854771712: + return limit + except (FileNotFoundError, PermissionError): + pass + return psutil.virtual_memory().total + + # ===== DASK CLUSTER CONFIGURATION ===== + memory_total = get_container_memory_limit() + n_workers = int( + os.getenv( + "MINIAN_NWORKERS", + max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), + ) ) - logger.info(f"Units after first temporal update: {C_new.sizes['unit_id']}") - C_new = sanitize_array(C_new, "C_new_pre_merge") - S_new = sanitize_array(S_new, "S_new_pre_merge") - # ----- First Merge ----- - logger.info("CNMF Iteration 1: Merging units...") - param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) - - # <<< PATCHED: unit_merge coordinate fix - # update_temporal returns C_new/S_new already filtered by mask internally. - # Sync A with C_new's actual coordinates instead of using the boolean mask, - # which can cause alignment mismatches on large datasets. 
- A_filtered = A_new.sel(unit_id=C_new.coords["unit_id"].values) + memory_per_worker = int(memory_total * 0.8 / n_workers) + + memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") + if memory_limit_env: + memory_limit = ( + memory_limit_env + if any(c.isalpha() for c in memory_limit_env) + else f"{memory_limit_env}GB" + ) + else: + memory_limit = f"{memory_per_worker // (1024**3)}GB" + + # Set minian intermediate storage paths + minian_data_path = str(output_dir / "minian_data") + os.makedirs(minian_data_path, exist_ok=True) + os.environ["MINIAN_INTERMEDIATE"] = minian_data_path + + # Helper function to clean intermediate files + def clean_intermediate_files(directory_path, file_names): + """Remove intermediate zarr files to avoid conflicts.""" + for fname in file_names: + path = os.path.join(directory_path, f"{fname}.zarr") + if os.path.exists(path): + shutil.rmtree(path) + + def sanitize_array(arr, name="array"): + """Replace NaN/Inf in an xarray DataArray with 0. + Works with both lazy (dask-backed) and in-memory arrays.""" + logger.info(f"Sanitizing {name} (replacing NaN/Inf with 0)...") + if hasattr(arr.data, "dask"): + sanitized = xr.apply_ufunc( + lambda x: np.nan_to_num( + x, nan=0.0, posinf=0.0, neginf=0.0 + ), + arr, + dask="parallelized", + output_dtypes=[arr.dtype], + ) + else: + sanitized = xr.DataArray( + np.nan_to_num( + arr.values, nan=0.0, posinf=0.0, neginf=0.0 + ), + coords=arr.coords, + dims=arr.dims, + ) + return sanitized + + # Start Dask cluster logger.info( - f"Unit sync check — A: {A_filtered.sizes['unit_id']}, " - f"C: {C_new.sizes['unit_id']}, S: {S_new.sizes['unit_id']}" - ) - A_mrg, C_mrg, [S_mrg] = unit_merge( - A_filtered, C_new, [S_new], **param_first_merge - ) - # >>> END PATCH - logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") - A_mrg = save_minian(A_mrg.rename("A_mrg"), minian_data_path, overwrite=True) - C_mrg = save_minian(C_mrg.rename("C_mrg"), minian_data_path, overwrite=True) - S_mrg = 
save_minian(S_mrg.rename("S_mrg"), minian_data_path, overwrite=True) - gc.collect() - logger.info("Saved merged results to zarr for iteration 2") - - # ======================================== - # CNMF ITERATION 2 - # ======================================== - - # ----- Second Spatial Update ----- - logger.info("CNMF Iteration 2: Spatial update...") - param_second_spatial = params.get( - "param_second_spatial", - {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, + f"Starting Minian processing with {n_workers} workers, " + f"{memory_limit} memory limit per worker..." ) - A_new2, mask2, norm_fac2 = update_spatial( - Y_hw_chk, A_mrg, C_mrg, sn_spatial, **param_second_spatial + cluster = LocalCluster( + n_workers=n_workers, + memory_limit=memory_limit, + resources={"MEM": 1}, + threads_per_worker=2, + dashboard_address=None, ) - C_new2 = C_mrg.sel(unit_id=mask2) * norm_fac2 - logger.info(f"Units after second spatial update: {A_new2.sizes['unit_id']}") - - # Update background after second spatial - logger.info("Updating background...") - b_new2, f_new2 = update_background(Y_fm_chk, A_new2, C_new2) - - # ----- Second Temporal Update ----- - logger.info("CNMF Iteration 2: Temporal update...") - param_second_temporal = params.get( - "param_second_temporal", + annotation_plugin = TaskAnnotation() + cluster.scheduler.add_plugin(annotation_plugin) + client = Client(cluster) + client.register_worker_plugin(SolveTriangularNaNGuard()) + + try: + # ===== LOAD VIDEOS ===== + logger.info("Loading videos...") + default_load_params = { + "pattern": r".*\.avi$", + "dtype": np.uint8, + "downsample": dict(frame=1, height=1, width=1), + "downsample_strategy": "subset", + } + param_load_videos = { + **default_load_params, + **params.get("param_load_videos", {}), + } + varr = load_videos( + str(pathlib.Path(avi_files[0]).parent), **param_load_videos + ) + chk, _ = get_optimal_chk(varr, dtype=float) + + # Save raw video to zarr + logger.info("Saving raw video to zarr...") + 
varr = save_minian( + varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), + minian_data_path, + overwrite=True, + ) + logger.info( + f"Loaded video: {varr.sizes['frame']} frames, " + f"{varr.sizes['height']}x{varr.sizes['width']} pixels" + ) + + # ===== PREPROCESSING ===== + logger.info("Preprocessing: glow removal...") + varr_min = varr.min("frame").compute() + varr_ref = varr - varr_min + varr_ref = varr_ref.clip(min=0) + + logger.info("Preprocessing: denoising...") + param_denoise = params.get( + "param_denoise", {"method": "median", "ksize": 7} + ) + varr_ref = denoise(varr_ref, **param_denoise) + + logger.info("Preprocessing: background removal...") + param_background_removal = params.get( + "param_background_removal", {"method": "tophat", "wnd": 10} + ) + varr_ref = remove_background(varr_ref, **param_background_removal) + + # Keep it chunked + varr_ref = varr_ref.chunk({"frame": chk["frame"], "height": -1, "width": -1}) + + # ===== MOTION CORRECTION ===== + logger.info("Estimating motion...") + param_estimate_motion = params.get( + "param_estimate_motion", {"dim": "frame"} + ) + motion = estimate_motion(varr_ref, **param_estimate_motion) + motion = save_minian( + motion.rename("motion").chunk({"frame": chk["frame"]}), + minian_data_path, + overwrite=True, + ) + + logger.info("Applying motion correction...") + Y = apply_transform(varr_ref, motion, fill=0) + + # <<< NEW: Sanitize after motion correction >>> + # apply_transform can introduce NaN at frame borders where the + # shift goes beyond the image boundary. The fill=0 param should + # handle this, but edge cases slip through on large datasets + # with float64 precision. This prevents downstream + # solve_triangular crashes in CNMF. 
+ Y = sanitize_array(Y, "Y_post_motion_correction") + + # Save two versions with different chunking + logger.info("Saving motion-corrected video (frame-chunked)...") + Y_fm_chk = save_minian( + Y.astype(np.float32).rename("Y_fm_chk"), + minian_data_path, + overwrite=True, + ) + + logger.info("Saving motion-corrected video (spatial-chunked)...") + Y_hw_chk = save_minian( + Y_fm_chk.rename("Y_hw_chk"), + minian_data_path, + overwrite=True, + chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, + ) + + # # Save motion corrected video as mp4 + # logger.info("Writing motion corrected video...") + # write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) + + # Create and save max projection + logger.info("Computing max projection...") + max_proj = Y_fm_chk.max("frame").compute() + max_proj = save_minian(max_proj.rename("max_proj"), minian_data_path, overwrite=True) + + # ===== SEED INITIALIZATION ===== + logger.info("Initializing seeds...") + param_seeds_init = params.get( + "param_seeds_init", + { + "wnd_size": 1000, + "method": "rolling", + "stp_size": 500, + "max_wnd": 15, + "diff_thres": 3, + }, + ) + seeds = seeds_init(Y_fm_chk, **param_seeds_init) + logger.info(f"Initial seeds: {len(seeds)}") + + logger.info("Refining seeds with PNR...") + param_pnr_refine = params.get( + "param_pnr_refine", {"noise_freq": 0.06, "thres": 1} + ) + seeds, pnr, gmm = pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) + logger.info(f"Seeds after PNR refine: {seeds['mask_pnr'].sum()} / {len(seeds)}") + + logger.info("Refining seeds with KS test...") + param_ks_refine = params.get("param_ks_refine", {"sig": 0.05}) + seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) + logger.info(f"Seeds after KS refine: {seeds['mask_ks'].sum()} / {len(seeds)}") + + logger.info("Merging seeds...") + param_seeds_merge = params.get( + "param_seeds_merge", + {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, + ) + seeds_final = seeds[seeds["mask_ks"] & 
seeds["mask_pnr"]].reset_index(drop=True) + seeds_final = seeds_merge(Y_hw_chk, max_proj, seeds_final, **param_seeds_merge) + n_seeds = seeds_final["mask_mrg"].sum() + logger.info(f"Seeds after merge: {n_seeds} / {len(seeds_final)}") + + if n_seeds == 0: + raise ValueError( + "No seeds remaining after refinement. " + "Consider adjusting param_seeds_init or param_pnr_refine thresholds." + ) + + # ===== INITIALIZE A AND C ===== + logger.info("Initializing spatial footprints (A)...") + param_initialize = params.get( + "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} + ) + A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) + A_init = save_minian(A_init.rename("A_init"), minian_data_path, overwrite=True) + logger.info(f"A_init shape: {A_init.shape}") + + logger.info("Initializing temporal traces (C)...") + C_init = initC(Y_fm_chk, A_init) + C_init = save_minian( + C_init.rename("C_init"), + minian_data_path, + overwrite=True, + chunks={"unit_id": 1, "frame": -1}, + ) + logger.info(f"C_init shape: {C_init.shape}") + + # Initial unit merge + logger.info("Initial unit merge...") + param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) + A, C = unit_merge(A_init, C_init, **param_init_merge) + A = save_minian(A.rename("A"), minian_data_path, overwrite=True) + C = save_minian(C.rename("C"), minian_data_path, overwrite=True) + C_chk = save_minian( + C.rename("C_chk"), + minian_data_path, + overwrite=True, + chunks={"unit_id": -1, "frame": chk["frame"]}, + ) + logger.info(f"Units after initial merge: {A.sizes['unit_id']}") + + # ===== INITIALIZE BACKGROUND ===== + logger.info("Initializing background terms...") + b, f = update_background(Y_fm_chk, A, C_chk) + f = save_minian(f.rename("f"), minian_data_path, overwrite=True) + b = save_minian(b.rename("b"), minian_data_path, overwrite=True) + logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") + + # ===== COMPUTE NOISE STATISTICS ===== + 
logger.info("Computing noise statistics...") + param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) + sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) + sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), minian_data_path, overwrite=True) + + # ======================================== + # CNMF ITERATION 1 + # ======================================== + + # ----- First Spatial Update ----- + logger.info("CNMF Iteration 1: Spatial update...") + param_first_spatial = params.get( + "param_first_spatial", + {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, + ) + A_new, mask, norm_fac = update_spatial( + Y_hw_chk, A, C, sn_spatial, **param_first_spatial + ) + C_new = save_minian( + (C.sel(unit_id=mask) * norm_fac).rename("C_new"), + minian_data_path, + overwrite=True, + ) + C_chk_new = save_minian( + (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), + minian_data_path, + overwrite=True, + ) + logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") + + # Update background after first spatial + logger.info("Updating background...") + b_new, f_new = update_background(Y_fm_chk, A_new, C_chk_new) + + # ----- First Temporal Update ----- + logger.info("CNMF Iteration 1: Temporal update...") + param_first_temporal = params.get( + "param_first_temporal", + { + "noise_freq": 0.06, + "sparse_penal": 1, + "p": 1, + "add_lag": 20, + "jac_thres": 0.2, + }, + ) + C_new, S_new, b0_new, c0_new, g, mask = update_temporal( + A_new, C_new, + Y=Y_fm_chk, + b=b_new, f=f_new, + **param_first_temporal + ) + logger.info(f"Units after first temporal update: {C_new.sizes['unit_id']}") + C_new = sanitize_array(C_new, "C_new_pre_merge") + S_new = sanitize_array(S_new, "S_new_pre_merge") + # ----- First Merge ----- + logger.info("CNMF Iteration 1: Merging units...") + param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) + + # <<< PATCHED: unit_merge coordinate fix + # update_temporal returns C_new/S_new 
already filtered by mask internally. + # Sync A with C_new's actual coordinates instead of using the boolean mask, + # which can cause alignment mismatches on large datasets. + A_filtered = A_new.sel(unit_id=C_new.coords["unit_id"].values) + logger.info( + f"Unit sync check — A: {A_filtered.sizes['unit_id']}, " + f"C: {C_new.sizes['unit_id']}, S: {S_new.sizes['unit_id']}" + ) + A_mrg, C_mrg, [S_mrg] = unit_merge( + A_filtered, C_new, [S_new], **param_first_merge + ) + # >>> END PATCH + logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") + A_mrg = save_minian(A_mrg.rename("A_mrg"), minian_data_path, overwrite=True) + C_mrg = save_minian(C_mrg.rename("C_mrg"), minian_data_path, overwrite=True) + S_mrg = save_minian(S_mrg.rename("S_mrg"), minian_data_path, overwrite=True) + gc.collect() + logger.info("Saved merged results to zarr for iteration 2") + + # ======================================== + # CNMF ITERATION 2 + # ======================================== + + # ----- Second Spatial Update ----- + logger.info("CNMF Iteration 2: Spatial update...") + param_second_spatial = params.get( + "param_second_spatial", + {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, + ) + A_new2, mask2, norm_fac2 = update_spatial( + Y_hw_chk, A_mrg, C_mrg, sn_spatial, **param_second_spatial + ) + C_new2 = C_mrg.sel(unit_id=mask2) * norm_fac2 + logger.info(f"Units after second spatial update: {A_new2.sizes['unit_id']}") + + # Update background after second spatial + logger.info("Updating background...") + b_new2, f_new2 = update_background(Y_fm_chk, A_new2, C_new2) + + # ----- Second Temporal Update ----- + logger.info("CNMF Iteration 2: Temporal update...") + param_second_temporal = params.get( + "param_second_temporal", + { + "noise_freq": 0.06, + "sparse_penal": 1, + "p": 1, + "add_lag": 20, + "jac_thres": 0.4, + }, + ) + C_final, S_final, b0_final, c0_final, g_final, mask_final = update_temporal( + A_new2, C_new2, + Y=Y_fm_chk, + b=b_new2, f=f_new2, + 
**param_second_temporal + ) + C_final = sanitize_array(C_final, "C_final_post_temporal_2") + S_final = sanitize_array(S_final, "S_final_post_temporal_2") + + # <<< PATCHED: unit_merge coordinate fix (same pattern as iteration 1) + # Sync A with C_final's actual coordinates + A_final = A_new2.sel(unit_id=C_final.coords["unit_id"].values) + # >>> END PATCH + logger.info(f"Final units: {A_final.sizes['unit_id']}") + + # ===== SAVE FINAL RESULTS ===== + logger.info("Saving final results to output directory...") + final_save_params = {"dpath": str(output_dir), "overwrite": True} + + A_final = save_minian(A_final.rename("A"), **final_save_params) + C_final = save_minian(C_final.rename("C"), **final_save_params) + S_final = save_minian(S_final.rename("S"), **final_save_params) + b_final = save_minian(b_new2.rename("b"), **final_save_params) + f_final = save_minian(f_new2.rename("f"), **final_save_params) + motion_final = save_minian(motion.rename("motion"), **final_save_params) + max_proj_final = save_minian(max_proj.rename("max_proj"), **final_save_params) + + logger.info( + f"Minian processing complete. {A_final.sizes['unit_id']} units detected." 
+ ) + + except Exception as e: + logger.error(f"Minian processing failed: {e}") + raise e + finally: + client.close() + cluster.close() + + # Load results and prepare for insertion + minian_loader = MinianLoader(minian_data_path) + key["processing_time"] = minian_loader.creation_time + + # Get minian version if available + try: + import minian + key["package_version"] = getattr(minian, "__version__", "") + except (ImportError, AttributeError): + key["package_version"] = "" + + file_entries = [ { - "noise_freq": 0.06, - "sparse_penal": 1, - "p": 1, - "add_lag": 20, - "jac_thres": 0.4, - }, - ) - C_final, S_final, b0_final, c0_final, g_final, mask_final = update_temporal( - A_new2, C_new2, - Y=Y_fm_chk, - b=b_new2, f=f_new2, - **param_second_temporal - ) - C_final = sanitize_array(C_final, "C_final_post_temporal_2") - S_final = sanitize_array(S_final, "S_final_post_temporal_2") - - # <<< PATCHED: unit_merge coordinate fix (same pattern as iteration 1) - # Sync A with C_final's actual coordinates - A_final = A_new2.sel(unit_id=C_final.coords["unit_id"].values) - # >>> END PATCH - logger.info(f"Final units: {A_final.sizes['unit_id']}") - - # ===== SAVE FINAL RESULTS ===== - logger.info("Saving final results to output directory...") - final_save_params = {"dpath": str(output_dir), "overwrite": True} - - A_final = save_minian(A_final.rename("A"), **final_save_params) - C_final = save_minian(C_final.rename("C"), **final_save_params) - S_final = save_minian(S_final.rename("S"), **final_save_params) - b_final = save_minian(b_new2.rename("b"), **final_save_params) - f_final = save_minian(f_new2.rename("f"), **final_save_params) - motion_final = save_minian(motion.rename("motion"), **final_save_params) - max_proj_final = save_minian(max_proj.rename("max_proj"), **final_save_params) - - logger.info( - f"Minian processing complete. {A_final.sizes['unit_id']} units detected." 
- ) - - except Exception as e: - logger.error(f"Minian processing failed: {e}") - raise e - finally: - client.close() - cluster.close() - - # Load results and prepare for insertion - minian_loader = MinianLoader(minian_data_path) - key["processing_time"] = minian_loader.creation_time - - # Get minian version if available - try: - import minian - key["package_version"] = getattr(minian, "__version__", "") - except (ImportError, AttributeError): - key["package_version"] = "" - - file_entries = [ - { - **key, - "file_name": f.name, - "file": f.as_posix(), - } - for f in output_dir.rglob("*") - if f.is_file() - ] + **key, + "file_name": f.name, + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] else: raise ValueError(f"Unknown task mode: {task_mode}") From 001822c3799ca100c8619e7faa980f3870959ad8 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Mon, 23 Feb 2026 13:20:22 -0500 Subject: [PATCH 33/36] Replace inline minian with Docker-only execution path Remove the ~650 lines of inline minian processing + compatibility patches. The minian method now always runs via the py38 sibling container. Docker image configurable via MINIAN_DOCKER_IMAGE env var. 
Co-Authored-By: Claude Opus 4.6 --- element_miniscope/miniscope.py | 857 ++++----------------------------- 1 file changed, 95 insertions(+), 762 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 69fc3b1..1c6e96e 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -969,776 +969,109 @@ def _run_processing(): ] elif method == "minian": - extra_params = params.pop("extra_dj_params", {}) - docker_image = extra_params.get("docker_image", None) - - if docker_image: - # === DOCKER CONTAINER EXECUTION PATH === - # Runs the original minian (Python 3.8) in a sibling container - import docker as docker_sdk - import json as json_mod + import docker as docker_sdk + import json as json_mod - logger.info( - f"Running minian via Docker container: {docker_image}" - ) + docker_image = os.environ.get( + "MINIAN_DOCKER_IMAGE", "datajoint/minian-py38:latest" + ) + logger.info(f"Running minian via Docker container: {docker_image}") - # Resolve host-level paths for sibling container volume mounts - host_s3_root = os.environ["HOST_S3_ROOT"] - host_outbox = os.environ["HOST_OUTBOX"] - container_raw_root = os.environ.get( - "RAW_ROOT_DATA_DIR", "/home/jovyan/s3/inbox" - ) - container_processed_root = os.environ.get( - "PROCESSED_ROOT_DATA_DIR", "/home/jovyan/efs/outbox" - ) + # Resolve host-level paths for sibling container volume mounts + host_s3_root = os.environ["HOST_S3_ROOT"] + host_outbox = os.environ["HOST_OUTBOX"] + container_raw_root = os.environ.get( + "RAW_ROOT_DATA_DIR", "/home/jovyan/s3/inbox" + ) + container_processed_root = os.environ.get( + "PROCESSED_ROOT_DATA_DIR", "/home/jovyan/efs/outbox" + ) - # Map container paths -> host paths for sibling container mounts - input_dir = str(pathlib.Path(avi_files[0]).parent) - input_dir_host = input_dir.replace( - container_raw_root, host_s3_root + "/inbox", 1 - ) - output_dir_host = str(output_dir).replace( - container_processed_root, host_outbox, 1 - ) + # 
Map container paths -> host paths for sibling container mounts + input_dir = str(pathlib.Path(avi_files[0]).parent) + input_dir_host = input_dir.replace( + container_raw_root, host_s3_root + "/inbox", 1 + ) + output_dir_host = str(output_dir).replace( + container_processed_root, host_outbox, 1 + ) - # Write config JSON to shared filesystem - n_workers = int(os.getenv("MINIAN_NWORKERS", 2)) - memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT", "4") - memory_limit = ( - memory_limit_env - if any(c.isalpha() for c in memory_limit_env) - else f"{memory_limit_env}GB" - ) - config = { - "input_dir": "/data/input", - "output_dir": "/data/output", - "intermediate_dir": "/data/output/minian_data", - "params": params, - "n_workers": n_workers, - "memory_limit": memory_limit, - } - config_path = output_dir / "minian_config.json" - with open(config_path, "w") as f: - json_mod.dump(config, f, indent=2, default=str) - - # Spawn sibling container - client = docker_sdk.from_env() - container_result = client.containers.run( - image=docker_image, - command=[ - "python", - "/opt/run_minian.py", - "/data/output/minian_config.json", - ], - volumes={ - input_dir_host: {"bind": "/data/input", "mode": "ro"}, - output_dir_host: {"bind": "/data/output", "mode": "rw"}, - }, - mem_limit=extra_params.get("container_mem_limit", "24g"), - environment={ - "MINIAN_NWORKERS": str(n_workers), - "MINIAN_MEMORY_LIMIT": memory_limit_env, - "MKL_NUM_THREADS": "1", - "OPENBLAS_NUM_THREADS": "1", - "OMP_NUM_THREADS": "1", - }, - remove=True, - detach=False, - stdout=True, - stderr=True, + # Write config JSON to shared filesystem + n_workers = int(os.getenv("MINIAN_NWORKERS", 2)) + memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT", "4") + memory_limit = ( + memory_limit_env + if any(c.isalpha() for c in memory_limit_env) + else f"{memory_limit_env}GB" + ) + container_mem_limit = os.getenv("MINIAN_CONTAINER_MEM_LIMIT", "24g") + + config = { + "input_dir": "/data/input", + "output_dir": "/data/output", + 
"intermediate_dir": "/data/output/minian_data", + "params": params, + "n_workers": n_workers, + "memory_limit": memory_limit, + } + config_path = output_dir / "minian_config.json" + with open(config_path, "w") as f: + json_mod.dump(config, f, indent=2, default=str) + + # Spawn sibling container + docker_client = docker_sdk.from_env() + container_result = docker_client.containers.run( + image=docker_image, + command=[ + "python", + "/opt/run_minian.py", + "/data/output/minian_config.json", + ], + volumes={ + input_dir_host: {"bind": "/data/input", "mode": "ro"}, + output_dir_host: {"bind": "/data/output", "mode": "rw"}, + }, + mem_limit=container_mem_limit, + environment={ + "MINIAN_NWORKERS": str(n_workers), + "MINIAN_MEMORY_LIMIT": memory_limit_env, + "MKL_NUM_THREADS": "1", + "OPENBLAS_NUM_THREADS": "1", + "OMP_NUM_THREADS": "1", + }, + remove=True, + detach=False, + stdout=True, + stderr=True, + ) + logger.info(f"Container output:\n{container_result.decode()}") + + # Verify completion marker + if (output_dir / ".minian_error").exists(): + with open(output_dir / ".minian_error") as f: + err = json_mod.load(f) + raise RuntimeError( + f"Minian container error: {err.get('error')}" ) - logger.info( - f"Container output:\n{container_result.decode()}" + if not (output_dir / ".minian_complete").exists(): + raise RuntimeError( + "Minian container exited without completion marker" ) - # Verify completion marker - if (output_dir / ".minian_error").exists(): - with open(output_dir / ".minian_error") as f: - err = json_mod.load(f) - raise RuntimeError( - f"Minian container error: {err.get('error')}" - ) - if not (output_dir / ".minian_complete").exists(): - raise RuntimeError( - "Minian container exited without completion marker" - ) - - # Load results (reuses existing MinianLoader) - minian_loader = MinianLoader(str(output_dir)) - key["processing_time"] = minian_loader.creation_time - key["package_version"] = "1.2.1-py38-docker" - - file_entries = [ - { - **key, - 
"file_name": f.name, - "file": f.as_posix(), - } - for f in output_dir.rglob("*") - if f.is_file() - ] + # Load results + minian_loader = MinianLoader(str(output_dir)) + key["processing_time"] = minian_loader.creation_time + key["package_version"] = "1.2.1-py38-docker" - else: - # === EXISTING INLINE EXECUTION PATH === - import multiprocessing - import psutil - import shutil - import dask - import dask.array as darr - import scipy.linalg # <<< NEW: for solve_triangular patch - import xarray as xr # <<< NEW: for sanitize_array - from dask.distributed import Client, LocalCluster - - # <<< NEW: Prevent infinite retry loops on NaN/Inf errors >>> - # If a Dask task fails (e.g. solve_triangular ValueError), don't - # keep retrying until OOM — fail fast after 3 attempts. - dask.config.set({ - "distributed.scheduler.allowed-failures": 3, - "distributed.comm.timeouts.connect": "300s", - "distributed.comm.timeouts.tcp": "7200s", - "distributed.scheduler.work-stealing": False, - "distributed.scheduler.worker-ttl": "7200s", # ← ADD THIS: 1 hour before declaring worker dead - }) - - # ===== APPLY COMPATIBILITY PATCHES ===== - - # Fix for NetworkX 3.0+ API changes - import networkx as nx - import scipy.sparse - from scipy.sparse import issparse - import minian.cnmf as cnmf_module - - def label_connected_fixed(adj, only_connected=False): - """Fixed label_connected for NetworkX 3.0+ compatibility.""" - if issparse(adj): - adj = adj.toarray() - adj = adj.copy() - np.fill_diagonal(adj, 0) - adj = np.triu(adj) - g = nx.from_numpy_array(adj) - labels = np.zeros(adj.shape[0], dtype=int) - for icomp, comp in enumerate(nx.connected_components(g)): - for node in comp: - labels[node] = icomp - if only_connected: - iso_mask = np.array([len(c) == 1 for c in nx.connected_components(g)]) - labels[np.isin(labels, np.where(iso_mask)[0])] = -1 - return labels - - cnmf_module.label_connected = label_connected_fixed - - # Fix for sparse array auto-densification - import sparse - import 
sparse.numba_backend._sparse_array as sparse_mod - sparse_mod.AUTO_DENSIFY = True - - # Fix for darr.block with mixed sparse array types - _original_darr_block = darr.block - - def patched_darr_block(arrays, allow_unknown_chunksizes=False): - """Patched darr.block that ensures sparse array type consistency.""" - def convert_to_coo(arr): - if arr is None: - return arr - if isinstance(arr, sparse.COO): - return arr - if isinstance(arr, sparse.SparseArray): - return sparse.COO(arr) - if isinstance(arr, np.ndarray): - return sparse.COO.from_numpy(arr) - if hasattr(arr, 'todense'): - return sparse.COO.from_numpy(np.asarray(arr.todense())) - return arr - - def recursive_convert(obj): - if isinstance(obj, list): - return [recursive_convert(item) for item in obj] - elif isinstance(obj, np.ndarray) and obj.dtype == object: - result = np.empty_like(obj) - for idx in np.ndindex(obj.shape): - result[idx] = convert_to_coo(obj[idx]) - return result - else: - return convert_to_coo(obj) - - try: - return _original_darr_block(arrays, allow_unknown_chunksizes=allow_unknown_chunksizes) - except ValueError as e: - if "All arrays must be instances of SparseArray" in str(e): - converted = recursive_convert(arrays) - return _original_darr_block(converted, allow_unknown_chunksizes=allow_unknown_chunksizes) - raise - - darr.block = patched_darr_block - darr.core.block = patched_darr_block - - # ===== PATCH: sparse/dense concatenation compatibility ===== - import sparse - import sparse.numba_backend._coo.common as _sparse_coo_common - import numpy as np - - _original_sparse_concat = _sparse_coo_common.concatenate - - def _patched_sparse_concat(arrays, axis=0): - """Convert any dense arrays to sparse.COO before concatenation.""" - converted = [] - for arr in arrays: - if isinstance(arr, sparse.SparseArray): - converted.append(arr) - elif isinstance(arr, np.ndarray): - converted.append(sparse.COO.from_numpy(arr)) - else: - try: - converted.append(sparse.COO.from_numpy(np.asarray(arr))) - 
except Exception: - converted.append(arr) - return _original_sparse_concat(converted, axis=axis) - - _sparse_coo_common.concatenate = _patched_sparse_concat - - # ===== PATCH: sparse/dense concatenation compatibility ===== - import sparse - import numpy as np - import sparse.numba_backend._coo.common as _sparse_coo_common - - _original_check = _sparse_coo_common.check_consistent_fill_value - - def _patched_check(arrays): - """Auto-convert any dense arrays to sparse.COO before validation.""" - for i, arr in enumerate(arrays): - if not isinstance(arr, sparse.SparseArray): - arrays[i] = sparse.COO.from_numpy(np.asarray(arr)) - return _original_check(arrays) - - _sparse_coo_common.check_consistent_fill_value = _patched_check - # Patch update_temporal to handle sparse.COO arrays - _original_update_temporal = cnmf_module.update_temporal - - def patched_update_temporal(A, C, b=None, f=None, Y=None, YrA=None, - noise_freq=0.25, p=2, add_lag="p", jac_thres=0.1, - sparse_penal=1, bseg=None, med_wd=None, - zero_thres=1e-8, max_iters=200, use_smooth=True, - normalize=True, warm_start=False, post_scal=False, - scs_fallback=False, concurrent_update=False): - """Patched update_temporal that handles sparse.COO arrays properly.""" - _original_csc_matrix = scipy.sparse.csc_matrix - - class PatchedCSCMatrix(scipy.sparse.csc_matrix): - def __new__(cls, arg1, shape=None, dtype=None, copy=False): - if hasattr(arg1, 'todense'): - arg1 = arg1.todense() - return _original_csc_matrix(arg1, shape=shape, dtype=dtype, copy=copy) - - scipy.sparse.csc_matrix = PatchedCSCMatrix - try: - result = _original_update_temporal( - A, C, b=b, f=f, Y=Y, YrA=YrA, noise_freq=noise_freq, - p=p, add_lag=add_lag, jac_thres=jac_thres, sparse_penal=sparse_penal, - bseg=bseg, med_wd=med_wd, zero_thres=zero_thres, max_iters=max_iters, - use_smooth=use_smooth, normalize=normalize, warm_start=warm_start, - post_scal=post_scal, scs_fallback=scs_fallback, - concurrent_update=concurrent_update - ) - finally: - 
scipy.sparse.csc_matrix = _original_csc_matrix - return result - - cnmf_module.update_temporal = patched_update_temporal - - # <<< NEW PATCH: scipy.linalg.solve_triangular NaN/Inf guard >>> - # Prevents ValueError('array must not contain infs or NaNs') from - # crashing Dask tasks and triggering infinite retry → OOM loops. - from distributed import WorkerPlugin - - class SolveTriangularNaNGuard(WorkerPlugin): - """Patch scipy.linalg.solve_triangular on each Dask worker process - to sanitize NaN/Inf inputs instead of crashing.""" - - def setup(self, worker): - import scipy.linalg - import numpy as np - - _original = scipy.linalg.solve_triangular - - def patched_solve_triangular(a, b, **kwargs): - a = np.nan_to_num(a, nan=0.0, posinf=0.0, neginf=0.0) - b = np.nan_to_num(b, nan=0.0, posinf=0.0, neginf=0.0) - return _original(a, b, **kwargs) - - scipy.linalg.solve_triangular = patched_solve_triangular - - logger.info("Applied minian compatibility patches (NetworkX, sparse arrays, dask.block, solve_triangular NaN guard)") - - # Now import minian modules (after patches are applied) - from minian.cnmf import ( - get_noise_fft, - unit_merge, - update_spatial, - update_temporal, - update_background, - ) - from minian.initialization import ( - initA, - initC, - ks_refine, - pnr_refine, - seeds_init, - seeds_merge, - ) - from minian.motion_correction import apply_transform, estimate_motion - from minian.preprocessing import denoise, remove_background - from minian.utilities import ( - TaskAnnotation, - get_optimal_chk, - load_videos, - save_minian, - ) - from minian.visualization import write_video - - # ===== CONTAINER-AWARE MEMORY DETECTION ===== - def get_container_memory_limit(): - """Get memory limit respecting container cgroups (v1 and v2).""" - try: - with open("/sys/fs/cgroup/memory.max") as f: - limit = f.read().strip() - if limit != "max": - return int(limit) - except (FileNotFoundError, PermissionError): - pass - try: - with 
open("/sys/fs/cgroup/memory/memory.limit_in_bytes") as f: - limit = int(f.read().strip()) - if limit < 9223372036854771712: - return limit - except (FileNotFoundError, PermissionError): - pass - return psutil.virtual_memory().total - - # ===== DASK CLUSTER CONFIGURATION ===== - memory_total = get_container_memory_limit() - n_workers = int( - os.getenv( - "MINIAN_NWORKERS", - max(1, min(int(multiprocessing.cpu_count() * 0.4), 8)), - ) - ) - memory_per_worker = int(memory_total * 0.8 / n_workers) - - memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT") - if memory_limit_env: - memory_limit = ( - memory_limit_env - if any(c.isalpha() for c in memory_limit_env) - else f"{memory_limit_env}GB" - ) - else: - memory_limit = f"{memory_per_worker // (1024**3)}GB" - - # Set minian intermediate storage paths - minian_data_path = str(output_dir / "minian_data") - os.makedirs(minian_data_path, exist_ok=True) - os.environ["MINIAN_INTERMEDIATE"] = minian_data_path - - # Helper function to clean intermediate files - def clean_intermediate_files(directory_path, file_names): - """Remove intermediate zarr files to avoid conflicts.""" - for fname in file_names: - path = os.path.join(directory_path, f"{fname}.zarr") - if os.path.exists(path): - shutil.rmtree(path) - - def sanitize_array(arr, name="array"): - """Replace NaN/Inf in an xarray DataArray with 0. - Works with both lazy (dask-backed) and in-memory arrays.""" - logger.info(f"Sanitizing {name} (replacing NaN/Inf with 0)...") - if hasattr(arr.data, "dask"): - sanitized = xr.apply_ufunc( - lambda x: np.nan_to_num( - x, nan=0.0, posinf=0.0, neginf=0.0 - ), - arr, - dask="parallelized", - output_dtypes=[arr.dtype], - ) - else: - sanitized = xr.DataArray( - np.nan_to_num( - arr.values, nan=0.0, posinf=0.0, neginf=0.0 - ), - coords=arr.coords, - dims=arr.dims, - ) - return sanitized - - # Start Dask cluster - logger.info( - f"Starting Minian processing with {n_workers} workers, " - f"{memory_limit} memory limit per worker..." 
- ) - cluster = LocalCluster( - n_workers=n_workers, - memory_limit=memory_limit, - resources={"MEM": 1}, - threads_per_worker=2, - dashboard_address=None, - ) - annotation_plugin = TaskAnnotation() - cluster.scheduler.add_plugin(annotation_plugin) - client = Client(cluster) - client.register_worker_plugin(SolveTriangularNaNGuard()) - - try: - # ===== LOAD VIDEOS ===== - logger.info("Loading videos...") - default_load_params = { - "pattern": r".*\.avi$", - "dtype": np.uint8, - "downsample": dict(frame=1, height=1, width=1), - "downsample_strategy": "subset", - } - param_load_videos = { - **default_load_params, - **params.get("param_load_videos", {}), - } - varr = load_videos( - str(pathlib.Path(avi_files[0]).parent), **param_load_videos - ) - chk, _ = get_optimal_chk(varr, dtype=float) - - # Save raw video to zarr - logger.info("Saving raw video to zarr...") - varr = save_minian( - varr.chunk({"frame": chk["frame"], "height": -1, "width": -1}).rename("varr"), - minian_data_path, - overwrite=True, - ) - logger.info( - f"Loaded video: {varr.sizes['frame']} frames, " - f"{varr.sizes['height']}x{varr.sizes['width']} pixels" - ) - - # ===== PREPROCESSING ===== - logger.info("Preprocessing: glow removal...") - varr_min = varr.min("frame").compute() - varr_ref = varr - varr_min - varr_ref = varr_ref.clip(min=0) - - logger.info("Preprocessing: denoising...") - param_denoise = params.get( - "param_denoise", {"method": "median", "ksize": 7} - ) - varr_ref = denoise(varr_ref, **param_denoise) - - logger.info("Preprocessing: background removal...") - param_background_removal = params.get( - "param_background_removal", {"method": "tophat", "wnd": 10} - ) - varr_ref = remove_background(varr_ref, **param_background_removal) - - # Keep it chunked - varr_ref = varr_ref.chunk({"frame": chk["frame"], "height": -1, "width": -1}) - - # ===== MOTION CORRECTION ===== - logger.info("Estimating motion...") - param_estimate_motion = params.get( - "param_estimate_motion", {"dim": "frame"} - 
) - motion = estimate_motion(varr_ref, **param_estimate_motion) - motion = save_minian( - motion.rename("motion").chunk({"frame": chk["frame"]}), - minian_data_path, - overwrite=True, - ) - - logger.info("Applying motion correction...") - Y = apply_transform(varr_ref, motion, fill=0) - - # <<< NEW: Sanitize after motion correction >>> - # apply_transform can introduce NaN at frame borders where the - # shift goes beyond the image boundary. The fill=0 param should - # handle this, but edge cases slip through on large datasets - # with float64 precision. This prevents downstream - # solve_triangular crashes in CNMF. - Y = sanitize_array(Y, "Y_post_motion_correction") - - # Save two versions with different chunking - logger.info("Saving motion-corrected video (frame-chunked)...") - Y_fm_chk = save_minian( - Y.astype(np.float32).rename("Y_fm_chk"), - minian_data_path, - overwrite=True, - ) - - logger.info("Saving motion-corrected video (spatial-chunked)...") - Y_hw_chk = save_minian( - Y_fm_chk.rename("Y_hw_chk"), - minian_data_path, - overwrite=True, - chunks={"frame": -1, "height": chk["height"], "width": chk["width"]}, - ) - - # # Save motion corrected video as mp4 - # logger.info("Writing motion corrected video...") - # write_video(Y_fm_chk, "motion_corrected.mp4", str(output_dir)) - - # Create and save max projection - logger.info("Computing max projection...") - max_proj = Y_fm_chk.max("frame").compute() - max_proj = save_minian(max_proj.rename("max_proj"), minian_data_path, overwrite=True) - - # ===== SEED INITIALIZATION ===== - logger.info("Initializing seeds...") - param_seeds_init = params.get( - "param_seeds_init", - { - "wnd_size": 1000, - "method": "rolling", - "stp_size": 500, - "max_wnd": 15, - "diff_thres": 3, - }, - ) - seeds = seeds_init(Y_fm_chk, **param_seeds_init) - logger.info(f"Initial seeds: {len(seeds)}") - - logger.info("Refining seeds with PNR...") - param_pnr_refine = params.get( - "param_pnr_refine", {"noise_freq": 0.06, "thres": 1} - ) - 
seeds, pnr, gmm = pnr_refine(Y_hw_chk, seeds, **param_pnr_refine) - logger.info(f"Seeds after PNR refine: {seeds['mask_pnr'].sum()} / {len(seeds)}") - - logger.info("Refining seeds with KS test...") - param_ks_refine = params.get("param_ks_refine", {"sig": 0.05}) - seeds = ks_refine(Y_hw_chk, seeds, **param_ks_refine) - logger.info(f"Seeds after KS refine: {seeds['mask_ks'].sum()} / {len(seeds)}") - - logger.info("Merging seeds...") - param_seeds_merge = params.get( - "param_seeds_merge", - {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, - ) - seeds_final = seeds[seeds["mask_ks"] & seeds["mask_pnr"]].reset_index(drop=True) - seeds_final = seeds_merge(Y_hw_chk, max_proj, seeds_final, **param_seeds_merge) - n_seeds = seeds_final["mask_mrg"].sum() - logger.info(f"Seeds after merge: {n_seeds} / {len(seeds_final)}") - - if n_seeds == 0: - raise ValueError( - "No seeds remaining after refinement. " - "Consider adjusting param_seeds_init or param_pnr_refine thresholds." - ) - - # ===== INITIALIZE A AND C ===== - logger.info("Initializing spatial footprints (A)...") - param_initialize = params.get( - "param_initialize", {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06} - ) - A_init = initA(Y_hw_chk, seeds_final[seeds_final["mask_mrg"]], **param_initialize) - A_init = save_minian(A_init.rename("A_init"), minian_data_path, overwrite=True) - logger.info(f"A_init shape: {A_init.shape}") - - logger.info("Initializing temporal traces (C)...") - C_init = initC(Y_fm_chk, A_init) - C_init = save_minian( - C_init.rename("C_init"), - minian_data_path, - overwrite=True, - chunks={"unit_id": 1, "frame": -1}, - ) - logger.info(f"C_init shape: {C_init.shape}") - - # Initial unit merge - logger.info("Initial unit merge...") - param_init_merge = params.get("param_init_merge", {"thres_corr": 0.8}) - A, C = unit_merge(A_init, C_init, **param_init_merge) - A = save_minian(A.rename("A"), minian_data_path, overwrite=True) - C = save_minian(C.rename("C"), minian_data_path, 
overwrite=True) - C_chk = save_minian( - C.rename("C_chk"), - minian_data_path, - overwrite=True, - chunks={"unit_id": -1, "frame": chk["frame"]}, - ) - logger.info(f"Units after initial merge: {A.sizes['unit_id']}") - - # ===== INITIALIZE BACKGROUND ===== - logger.info("Initializing background terms...") - b, f = update_background(Y_fm_chk, A, C_chk) - f = save_minian(f.rename("f"), minian_data_path, overwrite=True) - b = save_minian(b.rename("b"), minian_data_path, overwrite=True) - logger.info(f"Background initialized - b: {b.shape}, f: {f.shape}") - - # ===== COMPUTE NOISE STATISTICS ===== - logger.info("Computing noise statistics...") - param_get_noise = params.get("param_get_noise", {"noise_range": (0.06, 0.5)}) - sn_spatial = get_noise_fft(Y_hw_chk, **param_get_noise) - sn_spatial = save_minian(sn_spatial.rename("sn_spatial"), minian_data_path, overwrite=True) - - # ======================================== - # CNMF ITERATION 1 - # ======================================== - - # ----- First Spatial Update ----- - logger.info("CNMF Iteration 1: Spatial update...") - param_first_spatial = params.get( - "param_first_spatial", - {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, - ) - A_new, mask, norm_fac = update_spatial( - Y_hw_chk, A, C, sn_spatial, **param_first_spatial - ) - C_new = save_minian( - (C.sel(unit_id=mask) * norm_fac).rename("C_new"), - minian_data_path, - overwrite=True, - ) - C_chk_new = save_minian( - (C_chk.sel(unit_id=mask) * norm_fac).rename("C_chk_new"), - minian_data_path, - overwrite=True, - ) - logger.info(f"Units after first spatial update: {A_new.sizes['unit_id']}") - - # Update background after first spatial - logger.info("Updating background...") - b_new, f_new = update_background(Y_fm_chk, A_new, C_chk_new) - - # ----- First Temporal Update ----- - logger.info("CNMF Iteration 1: Temporal update...") - param_first_temporal = params.get( - "param_first_temporal", - { - "noise_freq": 0.06, - "sparse_penal": 1, - "p": 1, - 
"add_lag": 20, - "jac_thres": 0.2, - }, - ) - C_new, S_new, b0_new, c0_new, g, mask = update_temporal( - A_new, C_new, - Y=Y_fm_chk, - b=b_new, f=f_new, - **param_first_temporal - ) - logger.info(f"Units after first temporal update: {C_new.sizes['unit_id']}") - C_new = sanitize_array(C_new, "C_new_pre_merge") - S_new = sanitize_array(S_new, "S_new_pre_merge") - # ----- First Merge ----- - logger.info("CNMF Iteration 1: Merging units...") - param_first_merge = params.get("param_first_merge", {"thres_corr": 0.8}) - - # <<< PATCHED: unit_merge coordinate fix - # update_temporal returns C_new/S_new already filtered by mask internally. - # Sync A with C_new's actual coordinates instead of using the boolean mask, - # which can cause alignment mismatches on large datasets. - A_filtered = A_new.sel(unit_id=C_new.coords["unit_id"].values) - logger.info( - f"Unit sync check — A: {A_filtered.sizes['unit_id']}, " - f"C: {C_new.sizes['unit_id']}, S: {S_new.sizes['unit_id']}" - ) - A_mrg, C_mrg, [S_mrg] = unit_merge( - A_filtered, C_new, [S_new], **param_first_merge - ) - # >>> END PATCH - logger.info(f"Units after first merge: {A_mrg.sizes['unit_id']}") - A_mrg = save_minian(A_mrg.rename("A_mrg"), minian_data_path, overwrite=True) - C_mrg = save_minian(C_mrg.rename("C_mrg"), minian_data_path, overwrite=True) - S_mrg = save_minian(S_mrg.rename("S_mrg"), minian_data_path, overwrite=True) - gc.collect() - logger.info("Saved merged results to zarr for iteration 2") - - # ======================================== - # CNMF ITERATION 2 - # ======================================== - - # ----- Second Spatial Update ----- - logger.info("CNMF Iteration 2: Spatial update...") - param_second_spatial = params.get( - "param_second_spatial", - {"dl_wnd": 10, "sparse_penal": 0.01, "size_thres": (25, None)}, - ) - A_new2, mask2, norm_fac2 = update_spatial( - Y_hw_chk, A_mrg, C_mrg, sn_spatial, **param_second_spatial - ) - C_new2 = C_mrg.sel(unit_id=mask2) * norm_fac2 - logger.info(f"Units after 
second spatial update: {A_new2.sizes['unit_id']}") - - # Update background after second spatial - logger.info("Updating background...") - b_new2, f_new2 = update_background(Y_fm_chk, A_new2, C_new2) - - # ----- Second Temporal Update ----- - logger.info("CNMF Iteration 2: Temporal update...") - param_second_temporal = params.get( - "param_second_temporal", - { - "noise_freq": 0.06, - "sparse_penal": 1, - "p": 1, - "add_lag": 20, - "jac_thres": 0.4, - }, - ) - C_final, S_final, b0_final, c0_final, g_final, mask_final = update_temporal( - A_new2, C_new2, - Y=Y_fm_chk, - b=b_new2, f=f_new2, - **param_second_temporal - ) - C_final = sanitize_array(C_final, "C_final_post_temporal_2") - S_final = sanitize_array(S_final, "S_final_post_temporal_2") - - # <<< PATCHED: unit_merge coordinate fix (same pattern as iteration 1) - # Sync A with C_final's actual coordinates - A_final = A_new2.sel(unit_id=C_final.coords["unit_id"].values) - # >>> END PATCH - logger.info(f"Final units: {A_final.sizes['unit_id']}") - - # ===== SAVE FINAL RESULTS ===== - logger.info("Saving final results to output directory...") - final_save_params = {"dpath": str(output_dir), "overwrite": True} - - A_final = save_minian(A_final.rename("A"), **final_save_params) - C_final = save_minian(C_final.rename("C"), **final_save_params) - S_final = save_minian(S_final.rename("S"), **final_save_params) - b_final = save_minian(b_new2.rename("b"), **final_save_params) - f_final = save_minian(f_new2.rename("f"), **final_save_params) - motion_final = save_minian(motion.rename("motion"), **final_save_params) - max_proj_final = save_minian(max_proj.rename("max_proj"), **final_save_params) - - logger.info( - f"Minian processing complete. {A_final.sizes['unit_id']} units detected." 
- ) - - except Exception as e: - logger.error(f"Minian processing failed: {e}") - raise e - finally: - client.close() - cluster.close() - - # Load results and prepare for insertion - minian_loader = MinianLoader(minian_data_path) - key["processing_time"] = minian_loader.creation_time - - # Get minian version if available - try: - import minian - key["package_version"] = getattr(minian, "__version__", "") - except (ImportError, AttributeError): - key["package_version"] = "" - - file_entries = [ - { - **key, - "file_name": f.name, - "file": f.as_posix(), - } - for f in output_dir.rglob("*") - if f.is_file() - ] + file_entries = [ + { + **key, + "file_name": f.name, + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] else: raise ValueError(f"Unknown task mode: {task_mode}") From db39743f6802b5ce61fad5894da652e74a3676da Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Tue, 24 Feb 2026 10:09:58 -0500 Subject: [PATCH 34/36] use dockerized minian --- element_miniscope/miniscope.py | 361 ++++++++++++++++++++++++--------- 1 file changed, 264 insertions(+), 97 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 1c6e96e..cc2f90e 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -76,6 +76,17 @@ def activate( add_objects=_linking_module.__dict__, ) +# ─── Configuration ──────────────────────────────────────────────────────── + +# Docker image for minian processing. +# Override with env var MINIAN_DOCKER_IMAGE if using a custom registry. +MINIAN_DOCKER_IMAGE = os.getenv( + "MINIAN_DOCKER_IMAGE", "datajoint/minian-base:0.1.0" +) + +# Timeout for the docker run command (seconds). Default: 24 hours. +# Large datasets (70k+ frames) can take many hours. 
+MINIAN_DOCKER_TIMEOUT = int(os.getenv("MINIAN_DOCKER_TIMEOUT", "86400")) # Functions required by the element-miniscope ----------------------------------------- @@ -781,6 +792,17 @@ def make_compute( raise NotImplementedError( f"Loading of {method} data is not yet supported" ) + file_entries = [ + { + **key, + "file_name": f.relative_to( + get_processed_root_data_dir() + ).as_posix(), + "file": f.as_posix(), + } + for f in output_dir.rglob("*") + if f.is_file() + ] elif task_mode == "trigger": avi_files = [ find_full_path(get_miniscope_root_data_dir(), avi_file).as_posix() @@ -967,102 +989,25 @@ def _run_processing(): for f in output_dir.rglob("*") if f.is_file() ] - elif method == "minian": - import docker as docker_sdk - import json as json_mod - - docker_image = os.environ.get( - "MINIAN_DOCKER_IMAGE", "datajoint/minian-py38:latest" - ) - logger.info(f"Running minian via Docker container: {docker_image}") - - # Resolve host-level paths for sibling container volume mounts - host_s3_root = os.environ["HOST_S3_ROOT"] - host_outbox = os.environ["HOST_OUTBOX"] - container_raw_root = os.environ.get( - "RAW_ROOT_DATA_DIR", "/home/jovyan/s3/inbox" - ) - container_processed_root = os.environ.get( - "PROCESSED_ROOT_DATA_DIR", "/home/jovyan/efs/outbox" - ) + logger.info("Running minian via Docker container...") - # Map container paths -> host paths for sibling container mounts - input_dir = str(pathlib.Path(avi_files[0]).parent) - input_dir_host = input_dir.replace( - container_raw_root, host_s3_root + "/inbox", 1 - ) - output_dir_host = str(output_dir).replace( - container_processed_root, host_outbox, 1 + # Run the minian pipeline in a Docker container + status = _run_minian_in_container( + avi_files=avi_files, + output_dir=output_dir, + params=params, ) - # Write config JSON to shared filesystem - n_workers = int(os.getenv("MINIAN_NWORKERS", 2)) - memory_limit_env = os.getenv("MINIAN_MEMORY_LIMIT", "4") - memory_limit = ( - memory_limit_env - if any(c.isalpha() for c in 
memory_limit_env) - else f"{memory_limit_env}GB" + # Set processing time from container output + key["processing_time"] = status.get( + "timestamp", time.strftime("%Y-%m-%dT%H:%M:%S") ) - container_mem_limit = os.getenv("MINIAN_CONTAINER_MEM_LIMIT", "24g") - - config = { - "input_dir": "/data/input", - "output_dir": "/data/output", - "intermediate_dir": "/data/output/minian_data", - "params": params, - "n_workers": n_workers, - "memory_limit": memory_limit, - } - config_path = output_dir / "minian_config.json" - with open(config_path, "w") as f: - json_mod.dump(config, f, indent=2, default=str) - - # Spawn sibling container - docker_client = docker_sdk.from_env() - container_result = docker_client.containers.run( - image=docker_image, - command=[ - "python", - "/opt/run_minian.py", - "/data/output/minian_config.json", - ], - volumes={ - input_dir_host: {"bind": "/data/input", "mode": "ro"}, - output_dir_host: {"bind": "/data/output", "mode": "rw"}, - }, - mem_limit=container_mem_limit, - environment={ - "MINIAN_NWORKERS": str(n_workers), - "MINIAN_MEMORY_LIMIT": memory_limit_env, - "MKL_NUM_THREADS": "1", - "OPENBLAS_NUM_THREADS": "1", - "OMP_NUM_THREADS": "1", - }, - remove=True, - detach=False, - stdout=True, - stderr=True, - ) - logger.info(f"Container output:\n{container_result.decode()}") - - # Verify completion marker - if (output_dir / ".minian_error").exists(): - with open(output_dir / ".minian_error") as f: - err = json_mod.load(f) - raise RuntimeError( - f"Minian container error: {err.get('error')}" - ) - if not (output_dir / ".minian_complete").exists(): - raise RuntimeError( - "Minian container exited without completion marker" - ) - # Load results - minian_loader = MinianLoader(str(output_dir)) - key["processing_time"] = minian_loader.creation_time - key["package_version"] = "1.2.1-py38-docker" + # Get minian version (from status.json or default) + key["package_version"] = status.get("minian_version", "") + # Collect output files for insertion into 
DataJoint file_entries = [ { **key, @@ -1072,9 +1017,8 @@ def _run_processing(): for f in output_dir.rglob("*") if f.is_file() ] - - else: - raise ValueError(f"Unknown task mode: {task_mode}") + else: + raise ValueError(f"Unknown task mode: {task_mode}") return (file_entries, output_dir) def make_insert(self, key, file_entries, output_dir): @@ -1087,9 +1031,14 @@ def make_insert(self, key, file_entries, output_dir): ).as_posix(), } ) - self.insert1(dict(**key, processing_time=datetime.now(timezone.utc))) - # for file in file_entries: - # self.File.insert1(file, ignore_extra_fields=True) + self.insert1( + dict( + **key, + processing_time=key.get( + "processing_time", datetime.now(timezone.utc) + ), + ) + ) # Motion Correction -------------------------------------------------------------------- @@ -1805,10 +1754,22 @@ def __init__(self, output_dir): Args: output_dir: Path to the directory containing Minian zarr outputs. """ - from minian.utilities import open_minian + import xarray as xr self.output_dir = pathlib.Path(output_dir) - self._minian_ds = open_minian(str(self.output_dir)) + + # Load each zarr variable into a combined Dataset + # (replicates what minian's open_minian does without requiring + # the minian package to be installed) + ds = xr.Dataset() + for zarr_path in sorted(self.output_dir.glob("*.zarr")): + try: + var_ds = xr.open_zarr(str(zarr_path)) + for name, da in var_ds.data_vars.items(): + ds[name] = da + except Exception: + logger.debug(f"Skipping non-zarr path: {zarr_path}") + self._minian_ds = ds # Load core arrays self._A = self._minian_ds.get( @@ -1996,3 +1957,209 @@ def get_loader_result(key, table, full_output_dir=None) -> tuple: raise NotImplementedError("Unknown/unimplemented method: {}".format(method)) return method, loaded_output + + +def _run_minian_in_container( + avi_files: list, + output_dir: pathlib.Path, + params: dict, + docker_image: str = MINIAN_DOCKER_IMAGE, + timeout: int = MINIAN_DOCKER_TIMEOUT, +): + """ + Run the minian 
pipeline inside a Docker container. + + This function: + 1. Prepares a params.json file + 2. Identifies the input directory containing the AVI files + 3. Runs `docker run` with volume mounts for input, output, and params + 4. Streams container logs to the pipeline logger + 5. Checks the exit code and status.json for success/failure + + Args: + avi_files: List of paths to AVI video files (must all be in same directory) + output_dir: Path where minian results will be saved + params: Dictionary of minian pipeline parameters + docker_image: Docker image to use (default: datajoint/minian-base:0.1.0) + timeout: Max seconds to wait for the container (default: 86400 = 24h) + + Raises: + RuntimeError: If the container exits with non-zero code or status.json + reports an error. + subprocess.TimeoutExpired: If the container exceeds the timeout. + """ + import json + import subprocess + import tempfile + + # ── Resolve input directory ────────────────────────────────────────── + # All AVI files should be in the same directory (or we use the parent + # of the first file). If files are in different directories, we'd need + # to copy them — but that's not typical for miniscope recordings. + input_dir = pathlib.Path(avi_files[0]).parent.resolve() + + # Verify all files are in the same directory + for avi in avi_files: + if pathlib.Path(avi).parent.resolve() != input_dir: + raise ValueError( + f"All AVI files must be in the same directory. 
" + f"Found files in {input_dir} and {pathlib.Path(avi).parent}" + ) + + # ── Prepare params ─────────────────────────────────────────────────── + # Write params to a temp directory that will be mounted into the container + params_dir = tempfile.mkdtemp(prefix="minian_params_") + params_file = os.path.join(params_dir, "params.json") + + # Convert any tuple values to lists for JSON serialization + def _serialize_params(obj): + if isinstance(obj, tuple): + return list(obj) + if isinstance(obj, dict): + return {k: _serialize_params(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_serialize_params(v) for v in obj] + return obj + + serializable_params = _serialize_params(params) + + with open(params_file, "w") as f: + json.dump(serializable_params, f, indent=2) + + logger.info(f"Params written to {params_file}") + + # ── Prepare output directory ───────────────────────────────────────── + output_dir = pathlib.Path(output_dir).resolve() + os.makedirs(output_dir, exist_ok=True) + + # ── Build docker run command ───────────────────────────────────────── + # + # Volume mounts: + # input_dir -> /data/input (read-only: we never modify source videos) + # output_dir -> /data/output (read-write: results written here) + # params_dir -> /data/params (read-only: params.json) + # + # We also pass through resource-related env vars if set. 
+ docker_cmd = [ + "docker", + "run", + "--rm", # Auto-remove container after exit + # Volume mounts + "-v", + f"{input_dir}:/data/input:ro", + "-v", + f"{output_dir}:/data/output", + "-v", + f"{params_dir}:/data/params:ro", + ] + + # Pass through resource configuration env vars + for env_var in ["MINIAN_NWORKERS", "MINIAN_MEMORY_LIMIT"]: + val = os.getenv(env_var) + if val: + docker_cmd.extend(["-e", f"{env_var}={val}"]) + + # Memory limit for the container itself (if set) + container_memory = os.getenv("MINIAN_CONTAINER_MEMORY") + if container_memory: + docker_cmd.extend(["--memory", container_memory]) + + # CPU limit (if set) + container_cpus = os.getenv("MINIAN_CONTAINER_CPUS") + if container_cpus: + docker_cmd.extend(["--cpus", container_cpus]) + + # SHM size — Dask workers need shared memory for inter-process comms + docker_cmd.extend(["--shm-size", os.getenv("MINIAN_SHM_SIZE", "8g")]) + + # The image + docker_cmd.append(docker_image) + + logger.info(f"Docker command: {' '.join(docker_cmd)}") + + # ── Pull image if not present ──────────────────────────────────────── + try: + subprocess.run( + ["docker", "image", "inspect", docker_image], + capture_output=True, + check=True, + ) + logger.info(f"Docker image {docker_image} found locally") + except subprocess.CalledProcessError: + logger.info(f"Pulling Docker image {docker_image}...") + subprocess.run( + ["docker", "pull", docker_image], + check=True, + timeout=600, # 10 min pull timeout + ) + + # ── Run the container ──────────────────────────────────────────────── + logger.info("Starting minian container...") + start_time = time.time() + + process = subprocess.Popen( + docker_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True, + bufsize=1, # Line-buffered + ) + + # Stream container logs to the pipeline logger in real time + try: + for line in process.stdout: + line = line.rstrip() + if line: + logger.info(f"[minian-container] {line}") + + process.wait(timeout=timeout) + except 
subprocess.TimeoutExpired: + logger.error( + f"Minian container timed out after {timeout}s. Killing..." + ) + process.kill() + process.wait() + raise RuntimeError( + f"Minian container exceeded timeout of {timeout} seconds" + ) + + elapsed = time.time() - start_time + logger.info( + f"Container exited with code {process.returncode} " + f"after {elapsed:.1f}s ({elapsed/3600:.1f}h)" + ) + + # ── Check results ──────────────────────────────────────────────────── + status_file = output_dir / "status.json" + + if process.returncode != 0: + error_msg = f"Minian container exited with code {process.returncode}" + if status_file.exists(): + with open(status_file) as f: + status = json.load(f) + error_msg += f": {status.get('message', 'unknown error')}" + raise RuntimeError(error_msg) + + if not status_file.exists(): + raise RuntimeError( + "Minian container exited successfully but no status.json found" + ) + + with open(status_file) as f: + status = json.load(f) + + if status.get("status") != "success": + raise RuntimeError( + f"Minian container reported failure: {status.get('message', 'unknown')}" + ) + + logger.info( + f"Minian processing complete: {status.get('n_units', '?')} units detected" + ) + + # ── Cleanup temp params dir ────────────────────────────────────────── + import shutil + + shutil.rmtree(params_dir, ignore_errors=True) + + return status From cc0440f396ea34b650c8fec981350e8d000e904d Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Tue, 10 Mar 2026 15:01:23 -0400 Subject: [PATCH 35/36] Update minian Docker integration with host path translation and Dask tuning - Change default image to minian-py38:latest (conda-forge based) - Refactor _run_minian_in_container: write params as unified config JSON, translate container paths to host paths via HOST_S3_ROOT/HOST_OUTBOX env vars, pass Dask timeout and memory allocator env vars to container - Add test_minian_pipeline.py for end-to-end Minian integration testing Co-Authored-By: Claude Opus 4.6 --- 
element_miniscope/miniscope.py | 108 ++++---- notebooks/test_minian_pipeline.py | 396 ++++++++++++++++++++++++++++++ 2 files changed, 449 insertions(+), 55 deletions(-) create mode 100644 notebooks/test_minian_pipeline.py diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index cc2f90e..777ad77 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -7,6 +7,7 @@ import json import os import pathlib +import time from datetime import datetime, timezone from typing import Union @@ -81,7 +82,7 @@ def activate( # Docker image for minian processing. # Override with env var MINIAN_DOCKER_IMAGE if using a custom registry. MINIAN_DOCKER_IMAGE = os.getenv( - "MINIAN_DOCKER_IMAGE", "datajoint/minian-base:0.1.0" + "MINIAN_DOCKER_IMAGE", "datajoint/minian-py38:latest" ) # Timeout for the docker run command (seconds). Default: 24 hours. @@ -1966,39 +1967,12 @@ def _run_minian_in_container( docker_image: str = MINIAN_DOCKER_IMAGE, timeout: int = MINIAN_DOCKER_TIMEOUT, ): - """ - Run the minian pipeline inside a Docker container. - - This function: - 1. Prepares a params.json file - 2. Identifies the input directory containing the AVI files - 3. Runs `docker run` with volume mounts for input, output, and params - 4. Streams container logs to the pipeline logger - 5. Checks the exit code and status.json for success/failure - - Args: - avi_files: List of paths to AVI video files (must all be in same directory) - output_dir: Path where minian results will be saved - params: Dictionary of minian pipeline parameters - docker_image: Docker image to use (default: datajoint/minian-base:0.1.0) - timeout: Max seconds to wait for the container (default: 86400 = 24h) - - Raises: - RuntimeError: If the container exits with non-zero code or status.json - reports an error. - subprocess.TimeoutExpired: If the container exceeds the timeout. 
- """ import json import subprocess - import tempfile # ── Resolve input directory ────────────────────────────────────────── - # All AVI files should be in the same directory (or we use the parent - # of the first file). If files are in different directories, we'd need - # to copy them — but that's not typical for miniscope recordings. input_dir = pathlib.Path(avi_files[0]).parent.resolve() - # Verify all files are in the same directory for avi in avi_files: if pathlib.Path(avi).parent.resolve() != input_dir: raise ValueError( @@ -2006,12 +1980,15 @@ def _run_minian_in_container( f"Found files in {input_dir} and {pathlib.Path(avi).parent}" ) - # ── Prepare params ─────────────────────────────────────────────────── - # Write params to a temp directory that will be mounted into the container - params_dir = tempfile.mkdtemp(prefix="minian_params_") - params_file = os.path.join(params_dir, "params.json") + # ── Prepare output directory ───────────────────────────────────────── + output_dir = pathlib.Path(output_dir).resolve() + os.makedirs(output_dir, exist_ok=True) + + # ── Prepare params (under output_dir so it's on a shared volume) ───── + params_dir = output_dir / ".params" + os.makedirs(params_dir, exist_ok=True) + params_file = params_dir / "params.json" - # Convert any tuple values to lists for JSON serialization def _serialize_params(obj): if isinstance(obj, tuple): return list(obj) @@ -2020,37 +1997,43 @@ def _serialize_params(obj): if isinstance(obj, list): return [_serialize_params(v) for v in obj] return obj - + serializable_params = _serialize_params(params) + config = { + "input_dir": "/data/input", + "output_dir": "/data/output", + "params": serializable_params, + } + with open(params_file, "w") as f: - json.dump(serializable_params, f, indent=2) + json.dump(config, f, indent=2) logger.info(f"Params written to {params_file}") - # ── Prepare output directory ───────────────────────────────────────── - output_dir = pathlib.Path(output_dir).resolve() - 
os.makedirs(output_dir, exist_ok=True) + # ── Translate container paths → host paths ─────────────────────────── + # The host Docker daemon resolves bind-mount paths on the HOST + # filesystem, not inside this worker container. We use env vars + # set by docker-compose to map container paths to host paths. + host_s3_root = os.getenv("HOST_S3_ROOT") + host_outbox = os.getenv("HOST_OUTBOX") + + def _to_host_path(container_path, container_prefix, host_prefix): + s = str(container_path) + if host_prefix and s.startswith(container_prefix): + return host_prefix + s[len(container_prefix):] + return s # fallback: use as-is (works if running directly on host) + + host_input_dir = _to_host_path(input_dir, "/home/jovyan/s3", host_s3_root) + host_output_dir = _to_host_path(output_dir, "/home/jovyan/efs/outbox", host_outbox) + host_params_dir = _to_host_path(params_dir, "/home/jovyan/efs/outbox", host_outbox) # ── Build docker run command ───────────────────────────────────────── - # - # Volume mounts: - # input_dir -> /data/input (read-only: we never modify source videos) - # output_dir -> /data/output (read-write: results written here) - # params_dir -> /data/params (read-only: params.json) - # - # We also pass through resource-related env vars if set. 
docker_cmd = [ - "docker", - "run", - "--rm", # Auto-remove container after exit - # Volume mounts - "-v", - f"{input_dir}:/data/input:ro", - "-v", - f"{output_dir}:/data/output", - "-v", - f"{params_dir}:/data/params:ro", + "docker", "run", "--rm", + "-v", f"{host_input_dir}:/data/input:ro", + "-v", f"{host_output_dir}:/data/output", + "-v", f"{host_params_dir}:/data/params:ro", ] # Pass through resource configuration env vars @@ -2059,6 +2042,20 @@ def _serialize_params(obj): if val: docker_cmd.extend(["-e", f"{env_var}={val}"]) + # Dask timeout config (prevents worker kills during long computations) + dask_env = { + "DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL": "3600s", + "DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP": "7200s", + "DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT": "300s", + "DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING": "False", + } + for key, val in dask_env.items(): + docker_cmd.extend(["-e", f"{key}={val}"]) + + # Memory allocator tuning (reduces heap fragmentation under heavy load) + docker_cmd.extend(["-e", "MALLOC_TRIM_THRESHOLD_=131072"]) + docker_cmd.extend(["-e", "MALLOC_MMAP_THRESHOLD_=131072"]) + # Memory limit for the container itself (if set) container_memory = os.getenv("MINIAN_CONTAINER_MEMORY") if container_memory: @@ -2074,6 +2071,7 @@ def _serialize_params(obj): # The image docker_cmd.append(docker_image) + docker_cmd.append("/data/params/params.json") logger.info(f"Docker command: {' '.join(docker_cmd)}") diff --git a/notebooks/test_minian_pipeline.py b/notebooks/test_minian_pipeline.py new file mode 100644 index 0000000..28d4e6f --- /dev/null +++ b/notebooks/test_minian_pipeline.py @@ -0,0 +1,396 @@ +""" +Test script for Minian integration in element-miniscope. + +This script sets up the database, populates required tables, and runs +the Minian processing pipeline. 
+ +Prerequisites: +- DataJoint database connection configured (dj.config) +- Minian package installed +- Test miniscope video files available + +Usage: + python test_minian_pipeline.py --data-dir /path/to/miniscope/videos +""" + +import os +import sys +import argparse +import datajoint as dj +import numpy as np +from pathlib import Path +from datetime import datetime + +# ============================================================================ +# Configuration - Modify these paths for your environment +# ============================================================================ + +# Root directory containing raw miniscope data +# This should be the parent directory containing session folders with .avi files +DEFAULT_DATA_DIR = "./miniscope_test/" + +# Root directory for processed outputs +DEFAULT_PROCESSED_DIR = "./processed/" + +# Database schema prefix (schemas will be named: {prefix}_miniscope, etc.) +SCHEMA_PREFIX = "test" + + +# ============================================================================ +# Linking Module - Required functions for element-miniscope +# ============================================================================ + +def get_miniscope_root_data_dir(): + """Return the root directory for raw miniscope data.""" + return [os.environ.get("MINISCOPE_ROOT_DATA_DIR", DEFAULT_DATA_DIR)] + + +def get_processed_root_data_dir(): + """Return the root directory for processed data.""" + processed_dir = os.environ.get("MINISCOPE_PROCESSED_DIR", DEFAULT_PROCESSED_DIR) + os.makedirs(processed_dir, exist_ok=True) + return processed_dir + + +def get_session_directory(session_key): + """Return the session directory for a given session key. 
+ + For testing, we assume the directory structure is: + {root_data_dir}/{subject}/{session_date}/ + """ + from element_miniscope import miniscope + + # For a simple test setup, use the session key directly + subject = session_key.get("subject", "test_subject") + session_datetime = session_key.get("session_datetime", datetime.now()) + + if isinstance(session_datetime, datetime): + session_date = session_datetime.strftime("%Y-%m-%d") + else: + session_date = str(session_datetime).split()[0] + + return f"{subject}/{session_date}" + + +# Create a module-like object for the linking module +class LinkingModule: + get_miniscope_root_data_dir = staticmethod(get_miniscope_root_data_dir) + get_processed_root_data_dir = staticmethod(get_processed_root_data_dir) + get_session_directory = staticmethod(get_session_directory) + + +# ============================================================================ +# Schema Setup +# ============================================================================ + +def setup_schemas(): + """Activate the miniscope schemas.""" + from element_miniscope import miniscope + from element_miniscope import miniscope_report + + # Create minimal upstream tables for testing + schema = dj.schema(f"{SCHEMA_PREFIX}_lab") + + @schema + class Subject(dj.Manual): + definition = """ + subject: varchar(32) + """ + + @schema + class Session(dj.Manual): + definition = """ + -> Subject + session_datetime: datetime + """ + + @schema + class Device(dj.Lookup): + definition = """ + device: varchar(32) + """ + contents = [("Miniscope_V4",)] + + @schema + class AnatomicalLocation(dj.Lookup): + definition = """ + location: varchar(32) + """ + contents = [("CA1",), ("mPFC",)] + + # Add required attributes to linking module + LinkingModule.Subject = Subject + LinkingModule.Session = Session + LinkingModule.Device = Device + LinkingModule.AnatomicalLocation = AnatomicalLocation + + # Activate miniscope schema + miniscope.activate( + f"{SCHEMA_PREFIX}_miniscope", + 
linking_module=LinkingModule, + create_schema=True, + create_tables=True, + ) + + # Activate report schema + miniscope_report.activate( + f"{SCHEMA_PREFIX}_miniscope_report", + create_schema=True, + create_tables=True, + ) + + print(f"Schemas activated: {SCHEMA_PREFIX}_lab, {SCHEMA_PREFIX}_miniscope, {SCHEMA_PREFIX}_miniscope_report") + + return Subject, Session, Device, miniscope + + +def populate_test_data(Subject, Session, miniscope, data_dir): + """Populate the database with test data entries.""" + + # Insert test subject + subject_key = {"subject": "test_mouse"} + Subject.insert1(subject_key, skip_duplicates=True) + print(f"Inserted subject: {subject_key}") + + # Insert test session + session_key = { + **subject_key, + "session_datetime": datetime.now().replace(microsecond=0), + } + Session.insert1(session_key, skip_duplicates=True) + print(f"Inserted session: {session_key}") + + # Insert recording + recording_key = { + **session_key, + "recording_id": 0, + "acq_software": "Miniscope-DAQ-V4", + } + miniscope.Recording.insert1(recording_key, skip_duplicates=True) + print(f"Inserted recording: {recording_key}") + + # Populate RecordingInfo (this reads metadata from the video files) + miniscope.RecordingInfo.populate(display_progress=True) + print("Populated RecordingInfo") + + return session_key, recording_key + + +def create_minian_paramset(miniscope): + """Create a ProcessingParamSet for Minian analysis.""" + + minian_params = { + # Video loading + "load_videos": { + "pattern": r".*\.avi$", + "dtype": "uint8", + "downsample": {"frame": 1, "height": 1, "width": 1}, + "downsample_strategy": "subset", + }, + # Preprocessing + "denoise": {"method": "median", "ksize": 7}, + "background_removal": {"method": "tophat", "wnd": 10}, + # Motion correction + "estimate_motion": {"dim": "frame"}, + # Seeds initialization + "seeds_init": { + "wnd_size": 1000, + "method": "rolling", + "stp_size": 500, + "max_wnd": 15, + "diff_thres": 6.5, + }, + "pnr_refine": {"noise_freq": 
0.05, "thres": 1}, + "ks_refine": {"sig": 0.05}, + "seeds_merge": {"thres_dist": 10, "thres_corr": 0.8, "noise_freq": 0.06}, + "initialize": {"thres_corr": 0.8, "wnd": 10, "noise_freq": 0.06}, + "init_merge": {"thres_corr": 0.8}, + # CNMF + "get_noise": {"noise_range": (0.06, 0.5)}, + "first_spatial": { + "dl_wnd": 5, + "sparse_penal": 0.001, + "size_thres": (20, None), + }, + "first_temporal": { + "noise_freq": 0.06, + "sparse_penal": 0.001, + "p": 1, + "add_lag": 10, + "jac_thres": 0.1, + }, + "first_merge": {"thres_corr": 0.6}, + "second_spatial": { + "dl_wnd": 10, + "sparse_penal": 0.001, + "size_thres": (20, None), + }, + "second_temporal": { + "noise_freq": 0.06, + "sparse_penal": 0.01, + "p": 1, + "add_lag": 10, + "jac_thres": 0.2, + "zero_thres": 1e-10, + }, + } + + paramset_idx = 0 + + miniscope.ProcessingParamSet.insert_new_params( + processing_method="minian", + paramset_idx=paramset_idx, + paramset_desc="Default Minian parameters for miniscope analysis", + params=minian_params, + ) + + print(f"Created Minian ProcessingParamSet with idx={paramset_idx}") + return paramset_idx + + +def create_processing_task(miniscope, recording_key, paramset_idx): + """Create a ProcessingTask entry.""" + + task_key = { + **recording_key, + "paramset_idx": paramset_idx, + } + + # Infer output directory + output_dir = miniscope.ProcessingTask.infer_output_dir( + task_key, relative=True, mkdir=True + ) + + miniscope.ProcessingTask.insert1( + { + **task_key, + "processing_output_dir": str(output_dir), + "task_mode": "trigger", + }, + skip_duplicates=True, + ) + + print(f"Created ProcessingTask: {task_key}") + print(f"Output directory: {output_dir}") + + return task_key + + +def run_processing(miniscope): + """Run the processing pipeline.""" + print("\n" + "=" * 60) + print("Starting Minian Processing...") + print("=" * 60 + "\n") + + miniscope.Processing.populate(display_progress=True) + + print("\nProcessing complete!") + print(f"Entries in Processing table: 
{len(miniscope.Processing())}") + + +def populate_downstream_tables(miniscope, miniscope_report): + """Populate downstream analysis tables.""" + print("\n" + "=" * 60) + print("Populating downstream tables...") + print("=" * 60 + "\n") + + print("Populating MotionCorrection...") + miniscope.MotionCorrection.populate(display_progress=True) + + print("Populating Segmentation...") + miniscope.Segmentation.populate(display_progress=True) + + print("Populating Fluorescence...") + miniscope.Fluorescence.populate(display_progress=True) + + print("Populating Activity...") + miniscope.Activity.populate(display_progress=True) + + print("Populating visualizations...") + miniscope_report.MinianProcessingVisualization.populate(display_progress=True) + + print("\nDownstream tables populated!") + + +def verify_results(miniscope): + """Print summary of results.""" + print("\n" + "=" * 60) + print("Results Summary") + print("=" * 60 + "\n") + + print(f"Processing entries: {len(miniscope.Processing())}") + print(f"MotionCorrection entries: {len(miniscope.MotionCorrection())}") + print(f"Segmentation entries: {len(miniscope.Segmentation())}") + print(f"Segmentation.Mask entries: {len(miniscope.Segmentation.Mask())}") + print(f"Fluorescence entries: {len(miniscope.Fluorescence())}") + print(f"Fluorescence.Trace entries: {len(miniscope.Fluorescence.Trace())}") + print(f"Activity entries: {len(miniscope.Activity())}") + + # Show some mask statistics if available + if len(miniscope.Segmentation.Mask()) > 0: + masks = miniscope.Segmentation.Mask.fetch() + print(f"\nDetected {len(masks)} ROIs/cells") + + +def main(): + parser = argparse.ArgumentParser(description="Test Minian integration") + parser.add_argument( + "--data-dir", + type=str, + default=DEFAULT_DATA_DIR, + help="Path to directory containing miniscope video files", + ) + parser.add_argument( + "--processed-dir", + type=str, + default=DEFAULT_PROCESSED_DIR, + help="Path to directory for processed outputs", + ) + 
parser.add_argument( + "--skip-processing", + action="store_true", + help="Skip processing and just verify existing results", + ) + args = parser.parse_args() + + # Set environment variables + os.environ["MINISCOPE_ROOT_DATA_DIR"] = os.path.abspath(args.data_dir) + os.environ["MINISCOPE_PROCESSED_DIR"] = os.path.abspath(args.processed_dir) + + print(f"Data directory: {os.environ['MINISCOPE_ROOT_DATA_DIR']}") + print(f"Processed directory: {os.environ['MINISCOPE_PROCESSED_DIR']}") + + # Import after setting paths + from element_miniscope import miniscope_report + + # Setup schemas + Subject, Session, Device, miniscope = setup_schemas() + + if not args.skip_processing: + # Populate test data + session_key, recording_key = populate_test_data( + Subject, Session, miniscope, args.data_dir + ) + + # Create parameter set + paramset_idx = create_minian_paramset(miniscope) + + # Create processing task + task_key = create_processing_task(miniscope, recording_key, paramset_idx) + + # Run processing + run_processing(miniscope) + + # Populate downstream tables + populate_downstream_tables(miniscope, miniscope_report) + + # Verify results + verify_results(miniscope) + + print("\n" + "=" * 60) + print("Test complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() From ac6d5dac9acab42e954a2518827035f9134c9ba7 Mon Sep 17 00:00:00 2001 From: Kushal Bakshi Date: Wed, 18 Mar 2026 13:48:41 -0400 Subject: [PATCH 36/36] Fix MinianLoader summary images and container env var passthrough MinianLoader: - Load ref_image and mean_proj zarrs directly instead of computing from full varr_ref video (which was never in the final output) - Remove dead varr_ref fallback from max_proj_image _run_minian_in_container: - Pass through Dask env vars from environment instead of hardcoding - Include DASK_DISTRIBUTED__WORKER__MEMORY__* vars - Fix MINIAN_CONTAINER_MEMORY -> MINIAN_CONTAINER_MEM_LIMIT to match docker-compose naming - Remove unused /data/params/params.json CLI argument 
Co-Authored-By: Claude Opus 4.6 (1M context) --- element_miniscope/miniscope.py | 46 +++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/element_miniscope/miniscope.py b/element_miniscope/miniscope.py index 777ad77..97f16ce 100644 --- a/element_miniscope/miniscope.py +++ b/element_miniscope/miniscope.py @@ -1788,9 +1788,10 @@ def __init__(self, output_dir): # Try to load motion correction data self._motion = self._minian_ds.get("motion") - # Try to load reference/max projection images + # Summary images self._max_proj = self._minian_ds.get("max_proj") - self._varr_ref = self._minian_ds.get("varr_ref") + self._mean_proj = self._minian_ds.get("mean_proj") + self._ref_image = self._minian_ds.get("ref_image") @property def minian_dataset(self): @@ -1845,16 +1846,15 @@ def extract_rigid_mc(self): @property def ref_image(self): """Return reference image used for motion correction.""" - if self._varr_ref is not None: - # Take mean across frames for reference - return self._varr_ref.mean(dim="frame").compute().values[np.newaxis, :, :] + if self._ref_image is not None: + return self._ref_image.compute().values[np.newaxis, :, :] return None @property def mean_image(self): """Return mean image (average across frames).""" - if self._varr_ref is not None: - return self._varr_ref.mean(dim="frame").compute().values[np.newaxis, :, :] + if self._mean_proj is not None: + return self._mean_proj.compute().values[np.newaxis, :, :] return None @property @@ -1862,8 +1862,6 @@ def max_proj_image(self): """Return maximum projection image.""" if self._max_proj is not None: return self._max_proj.compute().values[np.newaxis, :, :] - elif self._varr_ref is not None: - return self._varr_ref.max(dim="frame").compute().values[np.newaxis, :, :] return None @property @@ -2036,28 +2034,31 @@ def _to_host_path(container_path, container_prefix, host_prefix): "-v", f"{host_params_dir}:/data/params:ro", ] - # Pass through resource configuration env vars - 
for env_var in ["MINIAN_NWORKERS", "MINIAN_MEMORY_LIMIT"]: + # Pass through resource configuration and Dask env vars + _passthrough_env_vars = [ + "MINIAN_NWORKERS", + "MINIAN_MEMORY_LIMIT", + # Dask distributed config (set via docker-compose with defaults) + "DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL", + "DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP", + "DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT", + "DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING", + "DASK_DISTRIBUTED__WORKER__MEMORY__TARGET", + "DASK_DISTRIBUTED__WORKER__MEMORY__SPILL", + "DASK_DISTRIBUTED__WORKER__MEMORY__PAUSE", + "DASK_DISTRIBUTED__WORKER__MEMORY__TERMINATE", + ] + for env_var in _passthrough_env_vars: val = os.getenv(env_var) if val: docker_cmd.extend(["-e", f"{env_var}={val}"]) - # Dask timeout config (prevents worker kills during long computations) - dask_env = { - "DASK_DISTRIBUTED__SCHEDULER__WORKER_TTL": "3600s", - "DASK_DISTRIBUTED__COMM__TIMEOUTS__TCP": "7200s", - "DASK_DISTRIBUTED__COMM__TIMEOUTS__CONNECT": "300s", - "DASK_DISTRIBUTED__SCHEDULER__WORK_STEALING": "False", - } - for key, val in dask_env.items(): - docker_cmd.extend(["-e", f"{key}={val}"]) - # Memory allocator tuning (reduces heap fragmentation under heavy load) docker_cmd.extend(["-e", "MALLOC_TRIM_THRESHOLD_=131072"]) docker_cmd.extend(["-e", "MALLOC_MMAP_THRESHOLD_=131072"]) # Memory limit for the container itself (if set) - container_memory = os.getenv("MINIAN_CONTAINER_MEMORY") + container_memory = os.getenv("MINIAN_CONTAINER_MEM_LIMIT") if container_memory: docker_cmd.extend(["--memory", container_memory]) @@ -2071,7 +2072,6 @@ def _to_host_path(container_path, container_prefix, host_prefix): # The image docker_cmd.append(docker_image) - docker_cmd.append("/data/params/params.json") logger.info(f"Docker command: {' '.join(docker_cmd)}")