"""
Given a categorical raster and a pixel-count threshold, replaces
connected regions smaller than the threshold with the value of
their largest spatial neighbor that is already at or above the
threshold. Matches the single-pass semantics of GDAL's
``GDALSieveFilter`` / ``rasterio.features.sieve``.

Pairs with classification functions (``natural_breaks``,
``reclassify``, etc.) and ``polygonize`` for cleaning results
before vectorization.

Supports all four backends: numpy, cupy, dask+numpy, dask+cupy.
"""

from __future__ import annotations

from collections import defaultdict
from typing import Sequence
@@ -40,7 +43,6 @@ class cupy:
4043 ngjit ,
4144)
4245
43- _MAX_ITERATIONS = 50
4446
4547
4648# ---------------------------------------------------------------------------
@@ -205,67 +207,75 @@ def _collect(a, b):
205207
206208
def _sieve_numpy(data, threshold, neighborhood, skip_values):
    """Single-pass sieve matching GDAL's ``GDALSieveFilter`` semantics.

    A small region (pixel count < *threshold*) is only merged into a
    neighbor whose current size is **>= threshold**. If no such
    neighbor exists the region stays. Regions are processed
    smallest-first with in-place size updates so that earlier merges
    can grow a neighbor above threshold for later candidates within
    the same pass.

    Parameters
    ----------
    data : numpy.ndarray
        Categorical raster values. Integer or float; for float input,
        NaN pixels are treated as nodata and never relabeled.
    threshold : int
        Minimum pixel count a region needs to survive (and to be a
        valid merge target).
    neighborhood : object
        Connectivity specification forwarded to ``_label_connected``
        and ``_build_adjacency``.
        # NOTE(review): presumably 4- vs 8-connectivity — confirm
        # against those helpers, which are defined elsewhere in
        # this module.
    skip_values : sequence or None
        Pixel values whose regions are never merged away, regardless
        of size. ``None`` disables skipping.

    Returns
    -------
    numpy.ndarray
        float64 copy of *data* with small regions replaced in place.
        (float64 so NaN can represent nodata even for integer input.)
    """
    # Work on a float64 copy so NaN-based nodata is representable and
    # the caller's array is never mutated.
    result = data.astype(np.float64, copy=True)
    is_float = np.issubdtype(data.dtype, np.floating)
    valid = ~np.isnan(result) if is_float else np.ones(result.shape, dtype=bool)
    skip_set = set(skip_values) if skip_values is not None else set()

    # Label connected same-value regions once; region id 0 appears to
    # denote the invalid/nodata background (ids below start at 1).
    region_map, region_val, uid = _label_connected(
        result, valid, neighborhood
    )
    # Pixel count per region id, indexed by region id.
    region_size = np.bincount(
        region_map.ravel(), minlength=uid
    ).astype(np.int64)

    # Candidate regions: below threshold and not carrying a protected value.
    small_ids = [
        rid
        for rid in range(1, uid)
        if region_size[rid] < threshold
        and region_val[rid] not in skip_set
    ]
    if not small_ids:
        return result

    # rid -> set of adjacent region ids; mutated in place as regions merge.
    adjacency = _build_adjacency(region_map, neighborhood)

    # Process smallest regions first so earlier merges can grow
    # a neighbor above threshold for later candidates.
    small_ids.sort(key=lambda r: region_size[r])

    for rid in small_ids:
        # Re-check: an earlier merge may have emptied this region
        # (size 0) or grown it past the threshold.
        if region_size[rid] == 0 or region_size[rid] >= threshold:
            continue

        neighbors = adjacency.get(rid)
        if not neighbors:
            continue  # surrounded by nodata only

        # Only merge into a neighbor that is already >= threshold.
        valid_neighbors = [
            n for n in neighbors if region_size[n] >= threshold
        ]
        if not valid_neighbors:
            continue

        largest_nid = max(valid_neighbors, key=lambda n: region_size[n])
        mask = region_map == rid
        result[mask] = region_val[largest_nid]

        # Update tracking in place
        region_map[mask] = largest_nid
        region_size[largest_nid] += region_size[rid]
        region_size[rid] = 0

        # Rewire the adjacency graph: every former neighbor of rid is
        # now adjacent to largest_nid (rid's pixels became its pixels).
        for n in neighbors:
            if n != largest_nid:
                adjacency[n].discard(rid)
                adjacency[n].add(largest_nid)
                adjacency.setdefault(largest_nid, set()).add(n)
        if largest_nid in adjacency:
            adjacency[largest_nid].discard(rid)
        del adjacency[rid]

    return result
269279
270280
271281# ---------------------------------------------------------------------------
def _sieve_cupy(data, threshold, neighborhood, skip_values):
    """CuPy backend: run the sieve on a host copy, then move back to GPU.

    Connected-component labeling is CPU-only here, so the array is
    transferred to host memory, sieved with ``_sieve_numpy``, and the
    result transferred back to the device.
    """
    import cupy as cp

    return cp.asarray(
        _sieve_numpy(data.get(), threshold, neighborhood, skip_values)
    )
284294
285295
286296# ---------------------------------------------------------------------------
@@ -320,10 +330,10 @@ def _sieve_dask(data, threshold, neighborhood, skip_values):
320330 )
321331
322332 np_data = data .compute ()
323- result , converged = _sieve_numpy (
333+ result = _sieve_numpy (
324334 np_data , threshold , neighborhood , skip_values
325335 )
326- return da .from_array (result , chunks = data .chunks ), converged
336+ return da .from_array (result , chunks = data .chunks )
327337
328338
329339def _sieve_dask_cupy (data , threshold , neighborhood , skip_values ):
@@ -345,10 +355,10 @@ def _sieve_dask_cupy(data, threshold, neighborhood, skip_values):
345355 pass
346356
347357 cp_data = data .compute ()
348- result , converged = _sieve_cupy (
358+ result = _sieve_cupy (
349359 cp_data , threshold , neighborhood , skip_values
350360 )
351- return da .from_array (result , chunks = data .chunks ), converged
361+ return da .from_array (result , chunks = data .chunks )
352362
353363
354364# ---------------------------------------------------------------------------
@@ -367,7 +377,10 @@ def sieve(
367377
368378 Identifies connected components of same-value pixels and replaces
369379 regions smaller than *threshold* pixels with the value of their
370- largest spatial neighbor. NaN pixels are always preserved.
380+ largest spatial neighbor that is already at or above *threshold*.
381+ Regions whose only neighbors are also below *threshold* are left
382+ unchanged, matching GDAL's single-pass semantics. NaN pixels
383+ are always preserved.
371384
372385 Parameters
373386 ----------
@@ -417,6 +430,11 @@ def sieve(
417430
418431 Notes
419432 -----
433+ Uses single-pass semantics matching GDAL's ``GDALSieveFilter``.
434+ A small region is only merged into a neighbor whose current size
435+ is >= *threshold*. If no such neighbor exists the region is left
436+ unchanged.
437+
420438 This is a global operation: for dask-backed arrays the entire raster
421439 is computed into memory before sieving. Connected-component labeling
422440 cannot be performed on individual chunks because regions may span
@@ -442,35 +460,21 @@ def sieve(
442460 data = raster .data
443461
444462 if isinstance (data , np .ndarray ):
445- out , converged = _sieve_numpy (
446- data , threshold , neighborhood , skip_values
447- )
463+ out = _sieve_numpy (data , threshold , neighborhood , skip_values )
448464 elif has_cuda_and_cupy () and is_cupy_array (data ):
449- out , converged = _sieve_cupy (
450- data , threshold , neighborhood , skip_values
451- )
465+ out = _sieve_cupy (data , threshold , neighborhood , skip_values )
452466 elif da is not None and isinstance (data , da .Array ):
453467 if is_dask_cupy (raster ):
454- out , converged = _sieve_dask_cupy (
468+ out = _sieve_dask_cupy (
455469 data , threshold , neighborhood , skip_values
456470 )
457471 else :
458- out , converged = _sieve_dask (
459- data , threshold , neighborhood , skip_values
460- )
472+ out = _sieve_dask (data , threshold , neighborhood , skip_values )
461473 else :
462474 raise TypeError (
463475 f"Unsupported array type { type (data ).__name__ } for sieve()"
464476 )
465477
466- if not converged :
467- warnings .warn (
468- f"sieve() did not converge after { _MAX_ITERATIONS } iterations. "
469- f"The result may still contain regions smaller than "
470- f"threshold={ threshold } ." ,
471- stacklevel = 2 ,
472- )
473-
474478 return DataArray (
475479 out ,
476480 name = name ,
0 commit comments