InSARdev
diff --git a/‎insardev/insardev/Batch.py‎
Lines changed: 273 additions & 215 deletions b/‎insardev/insardev/Batch.py‎
Lines changed: 273 additions & 215 deletions
diff --git a/‎insardev/insardev/BatchCore.py‎
Lines changed: 1249 additions & 655 deletions b/‎insardev/insardev/BatchCore.py‎
Lines changed: 1249 additions & 655 deletions
diff --git a/‎insardev/insardev/Stack.py‎
Lines changed: 101 additions & 69 deletions b/‎insardev/insardev/Stack.py‎
Lines changed: 101 additions & 69 deletions
diff --git a/‎insardev/insardev/Stack_ps.py‎
Lines changed: 10 additions & 70 deletions b/‎insardev/insardev/Stack_ps.py‎
Lines changed: 10 additions & 70 deletions
diff --git a/‎insardev/insardev/Stack_stl.py‎
Lines changed: 61 additions & 34 deletions b/‎insardev/insardev/Stack_stl.py‎
Lines changed: 61 additions & 34 deletions
@@ -149,7 +149,6 @@ def psfunction(self, device='auto', allow_rechunk=True, debug=False):
         # Disable automatic rechunking (manual chunk control)
         psf = stack.chunk({'y': 2048, 'x': 2048}).psfunction(allow_rechunk=False)
         """
-        import dask
         import dask.array
         import numpy as np
         import torch
@@ -184,62 +183,11 @@ def psfunction(self, device='auto', allow_rechunk=True, debug=False):
             if not isinstance(slc_data.data, dask.array.Array):
                 slc_data = slc_data.chunk({'y': 512, 'x': 512})
 
-            # Save original spatial chunks for restoring output
-            original_y_chunks = slc_data.chunks[1] if len(slc_data.chunks) > 1 else None
-            original_x_chunks = slc_data.chunks[2] if len(slc_data.chunks) > 2 else None
-
             if debug:
                 print(f'DEBUG: psfunction for {key}: shape={slc_data.shape}, chunks={slc_data.chunks}')
 
-            # Calculate target chunks for memory efficiency
-            from .utils_dask import compute_aligned_chunks_3d
-            dask_chunk_bytes = dask_chunk_mb * 1024 * 1024
-            element_bytes = slc_data.dtype.itemsize
-            target_chunks = compute_aligned_chunks_3d(
-                slc_data.shape, slc_data.chunks, dask_chunk_bytes, element_bytes,
-                min_chunk=256, keep_first_dim=True
-            )
-
-            # Get current spatial chunk sizes (use first chunk as representative)
-            orig_y = slc_data.chunks[1][0] if len(slc_data.chunks) > 1 and slc_data.chunks[1] else slc_data.shape[1]
-            orig_x = slc_data.chunks[2][0] if len(slc_data.chunks) > 2 and slc_data.chunks[2] else slc_data.shape[2]
-
-            # Calculate chunk memory sizes
-            n_dates = slc_data.shape[0]
-            orig_chunk_mb = n_dates * orig_y * orig_x * element_bytes / (1024 * 1024)
-            # target_chunks is tuple of tuples: ((n,), (y1, y2, ...), (x1, x2, ...))
-            target_y = max(target_chunks[1]) if target_chunks[1] else orig_y
-            target_x = max(target_chunks[2]) if target_chunks[2] else orig_x
-            target_chunk_mb = n_dates * target_y * target_x * element_bytes / (1024 * 1024)
-
-            # Determine if rechunking is needed (only if original is larger than target)
-            needs_rechunk = (orig_y > target_y) or (orig_x > target_x)
-
-            # Print NOTE about chunks (only once for first burst)
-            if not note_printed:
-                chunk_size_note = f"dask.config['array.chunk-size']={dask_chunk_mb} MB"
-                if needs_rechunk:
-                    if allow_rechunk:
-                        print(f'NOTE psfunction: rechunking from ({n_dates}, {orig_y}, {orig_x}) [{orig_chunk_mb:.0f} MB] '
-                              f'to ({n_dates}, {target_chunks[1]}, {target_chunks[2]}) [{target_chunk_mb:.0f} MB] for {chunk_size_note}')
-                    else:
-                        print(f'NOTE psfunction: chunks ({n_dates}, {orig_y}, {orig_x}) [{orig_chunk_mb:.0f} MB] exceed '
-                              f'{chunk_size_note}, recommended: ({n_dates}, {target_chunks[1]}, {target_chunks[2]}) [{target_chunk_mb:.0f} MB]. '
-                              f'Use allow_rechunk=True or .chunk() manually.')
-                else:
-                    # Chunks already fit - just confirm, no "optimal" claim
-                    print(f'NOTE psfunction: chunks ({n_dates}, {orig_y}, {orig_x}) [{orig_chunk_mb:.0f} MB] '
-                          f'fit {chunk_size_note}')
-                note_printed = True
-
-            # Apply rechunking if needed and allowed (pass exact chunk tuples)
-            if needs_rechunk and allow_rechunk:
-                slc_data = slc_data.chunk({'date': -1, 'y': target_chunks[1], 'x': target_chunks[2]})
-                if debug:
-                    print(f'DEBUG: after rechunk: chunks={slc_data.chunks}')
-            else:
-                # Just ensure date is single chunk
-                slc_data = slc_data.chunk({'date': -1})
+            # Merge all dates into a single chunk along 'date'; keep the input spatial chunks as-is.
+            slc_data = slc_data.chunk({'date': -1})
 
             # Create wrapper that captures device and debug
             def make_wrapper(dev, dbg):
@@ -262,24 +210,16 @@ def process_wrapper(slc_chunk):
             # Use xr.apply_ufunc with dask='parallelized' for lazy execution
             # Core dim is 'date' (reduction), chunked dims are y, x
             # Note: input_core_dims moves 'date' to last axis, wrapper transposes back
-            # Use GPU annotation to prevent MPS command buffer conflicts
             # Provide explicit meta to avoid ComplexWarning when dask infers
             # output type from complex input (we intentionally convert to real)
-            with dask.annotate(resources={'gpu': 1} if device != 'cpu' else {}):
-                psf_da = xr.apply_ufunc(
-                    wrapper,
-                    slc_data,
-                    input_core_dims=[['date']],
-                    output_core_dims=[[]],
-                    dask='parallelized',
-                    dask_gufunc_kwargs={'meta': np.array((), dtype=np.float32)},
-                )
-
-            # Restore original spatial chunks if we rechunked (pass full tuples)
-            if allow_rechunk and original_y_chunks is not None:
-                psf_da = psf_da.chunk({'y': original_y_chunks, 'x': original_x_chunks})
-                if debug:
-                    print(f'DEBUG: restored output chunks: {psf_da.chunks}')
+            psf_da = xr.apply_ufunc(
+                wrapper,
+                slc_data,
+                input_core_dims=[['date']],
+                output_core_dims=[[]],
+                dask='parallelized',
+                dask_gufunc_kwargs={'meta': np.array((), dtype=np.float32)},
+            )
 
             # Assign name to match SLC variable
             psf_da.name = var_name
 
@@ -151,41 +151,71 @@ def _stl(self, data, freq='W', periods=52, robust=False):
         n_dates_out = len(dt_periodic)
         n_dates_in = data.date.size
 
-        # Use rechunk2d for uniform chunk sizes based on memory
-        # Multiplier accounts for STL internal memory (trend, seasonal, resid, weights, etc.)
-        # Effective memory: 8 * n_dates_in * 16 bytes (complex128) per pixel
-        from .utils_dask import rechunk2d
-        mem_per_pixel = 8 * n_dates_in * 16  # complex128 = 16 bytes
-        optimal = rechunk2d((data.y.size, data.x.size), element_bytes=mem_per_pixel)
-        chunks_y, chunks_x = optimal['y'], optimal['x']
-
-        # Rechunk: all dates together (-1), auto-chunked y,x
-        first_dim = data.dims[0]
-        data = data.chunk({first_dim: -1, 'y': chunks_y, 'x': chunks_x})
-
-        # Use blockwise to avoid embedding large arrays in the graph
-        def process_block(data_block):
-            # data_block: (n_dates, y_chunk, x_chunk)
-            # transpose to (y, x, n_dates) for vectorized STL
-            data_transposed = data_block.transpose(1, 2, 0)
+        # No rechunk on dim 0: each spatial block hands the kernel its list of delayed dim-0 chunks.
+        data_dask = data.data
+
+        y_chunks = data_dask.chunks[1]
+        x_chunks = data_dask.chunks[2]
+        y_breaks = [0] + list(np.cumsum(y_chunks))
+        x_breaks = [0] + list(np.cumsum(x_chunks))
+
+        def process_chunks(data_chunks):
+            import math
+            from .utils_dask import get_dask_chunk_size_mb
+            chunks = [np.asarray(c) for c in data_chunks]
+            ny, nx = chunks[0].shape[1], chunks[0].shape[2]
+            n_dates_in_local = sum(c.shape[0] for c in chunks)
+            result = np.empty((3, n_dates_out, ny, nx), dtype=np.float32)
             vec_stl = np.vectorize(
                 lambda ts: utils_stl.stl1d(ts, dt, dt_periodic, periods, robust),
                 signature='(n)->(m),(m),(m)'
             )
-            # result: (3, y, x, n_dates_out) after asarray
-            block = np.asarray(vec_stl(data_transposed))
-            del vec_stl, data_transposed
-            # transpose to (3, n_dates_out, y, x)
-            return block.transpose(0, 3, 1, 2).astype(np.float32)
-
-        data_dask = data.data
-        models = dask.array.map_blocks(
-            process_block, data_dask,
-            dtype=np.float32,
-            drop_axis=0,
-            new_axis=[0, 1],
-            chunks=(3, n_dates_out) + data_dask.chunks[1:],
-        )
+            # Calculate sub-tile size from dask chunk budget.
+            # Per sub-tile memory, assuming 4-byte elements: input (n_dates_in × sub_pixels × 4) + output (3 × n_dates_out × sub_pixels × 4)
+            per_pixel_bytes = (n_dates_in_local + 3 * n_dates_out) * 4
+            budget_bytes = int(get_dask_chunk_size_mb() * 1024 * 1024)
+            max_sub_pixels = max(256, budget_bytes // max(1, per_pixel_bytes))
+            sub_side = int(math.sqrt(max_sub_pixels))
+            sub_h = min(sub_side, ny)
+            sub_w = min(sub_side, nx)
+            for ty0 in range(0, ny, sub_h):
+                ty1 = min(ty0 + sub_h, ny)
+                for tx0 in range(0, nx, sub_w):
+                    tx1 = min(tx0 + sub_w, nx)
+                    if len(chunks) == 1:
+                        tile = chunks[0][:, ty0:ty1, tx0:tx1]
+                    else:
+                        tile = np.concatenate(
+                            [c[:, ty0:ty1, tx0:tx1] for c in chunks], axis=0
+                        )
+                    # (n_dates, sub_h, sub_w) -> (sub_h, sub_w, n_dates)
+                    tile_t = tile.transpose(1, 2, 0)
+                    del tile
+                    # result: (3, sub_h, sub_w, n_dates_out) after asarray
+                    block = np.asarray(vec_stl(tile_t))
+                    del tile_t
+                    # (3, sub_h, sub_w, n_dates_out) -> (3, n_dates_out, sub_h, sub_w)
+                    result[:, :, ty0:ty1, tx0:tx1] = block.transpose(0, 3, 1, 2)
+                    del block
+            del vec_stl
+            return result
+
+        blocks_rows = []
+        for bj in range(len(y_breaks) - 1):
+            y0, y1 = y_breaks[bj], y_breaks[bj + 1]
+            blocks_row = []
+            for bk in range(len(x_breaks) - 1):
+                x0, x1 = x_breaks[bk], x_breaks[bk + 1]
+                td_list = data_dask[:, y0:y1, x0:x1].to_delayed().ravel().tolist()
+                block = dask.array.from_delayed(
+                    dask.delayed(process_chunks)(td_list),
+                    shape=(3, n_dates_out, y1 - y0, x1 - x0),
+                    dtype=np.float32,
+                )
+                blocks_row.append(block)
+            blocks_rows.append(dask.array.concatenate(blocks_row, axis=3))
+
+        models = dask.array.concatenate(blocks_rows, axis=2)
 
         coords = {'date': dt_periodic.astype('datetime64[ns]'), 'y': data.y, 'x': data.x}
 
@@ -194,9 +224,6 @@ def process_block(data_block):
         keys_vars = {}
         for varidx, varname in enumerate(varnames):
             var_data = models[varidx]
-            # Rechunk to date=1 for efficient per-slice downstream operations (preserve spatial chunks)
-            if hasattr(var_data, 'rechunk'):
-                var_data = var_data.rechunk({0: 1})
             keys_vars[varname] = xr.DataArray(var_data, coords=coords)
         model = xr.Dataset({**keys_vars})
         del models