|
| 1 | +"""DuckDB-native equivalent of GEOS ST_CoverageClean. |
| 2 | +
|
| 3 | +Cleans coverage errors in `_01` by: |
| 4 | +
|
| 5 | +1. Identifying which interior holes of `ST_Union_Agg(_01)` are lakes (preserved) |
| 6 | + versus slivers (absorbed into a neighbour) without modifying any original |
| 7 | + geometry. Classification uses two scale-invariant gates (max-inscribed-circle |
| 8 | + diameter; Polsby-Popper compactness). |
| 9 | +
|
| 10 | +2. Turning `_01` into lines via `ST_Boundary`, then `ST_Node` + `ST_Polygonize` |
| 11 | + to produce atomic regions whose boundaries are made entirely of original |
| 12 | + polygon vertices (plus crossing vertices added by `ST_Node` — never moved, |
| 13 | + only inserted, and shared identically between every atom that touches the |
| 14 | + crossing). |
| 15 | +
|
| 16 | +3. Joining atoms with each polygon's representative point to assign each atom |
| 17 | + to a fid. Atoms that match multiple polygons (overlap regions) go to the |
| 18 | + `overlap_strategy` winner. Atoms that match no polygon are gaps — slivers |
| 19 | + go to their longest-border neighbour, lakes are skipped. |
| 20 | +
|
| 21 | +4. `ST_Union_Agg`-ing atoms per fid for the cleaned `_01`. |
| 22 | +
|
| 23 | +The crucial property versus a per-loser `ST_Difference` approach: every shared |
| 24 | +edge between adjacent cleaned polygons inherits its coordinates from the same |
| 25 | +`ST_Polygonize` output, so `lines.py`'s exact `ST_Intersection` matches them |
| 26 | +without sub-pixel residue. Modifications to `_01` are only made where dirty |
| 27 | +input topology demands them; clean coverages take the early-exit path and pass |
| 28 | +through bit-for-bit. |
| 29 | +
|
| 30 | +This module is the swap boundary: when DuckDB-spatial wraps GEOS 3.13's |
| 31 | +`coverage_clean`, the body of `main` collapses to a single |
| 32 | +`SELECT ST_CoverageClean(list(geom), ...)` call. Signature mirrors PostGIS |
| 33 | +`ST_CoverageClean(geom_array, snapping_distance, gap_maximum_width, |
| 34 | +overlap_merge_strategy)` so the swap is a body-only change. |
| 35 | +""" |
| 36 | + |
| 37 | +from logging import getLogger |
| 38 | +from typing import Literal |
| 39 | + |
| 40 | +from duckdb import DuckDBPyConnection |
| 41 | + |
| 42 | +from .config import debug |
| 43 | + |
| 44 | +OverlapStrategy = Literal["largest_area", "merge_longest_border"] |
| 45 | + |
| 46 | +# Per-strategy SQL expression that ranks (atom, fid) candidates. Materialised |
| 47 | +# as the `metric` column in the candidates CTE so QUALIFY can ORDER BY it. |
| 48 | +# Only the chosen strategy's expression is computed — `merge_longest_border`'s |
| 49 | +# `ST_Intersection(ST_Boundary, ST_Boundary)` is the most expensive op in the |
| 50 | +# whole CTE, so skipping it under `largest_area` is a real win on dirty data. |
| 51 | +_STRATEGY_METRIC: dict[OverlapStrategy, str] = { |
| 52 | + "largest_area": "ST_Area(p.geom)", |
| 53 | + "merge_longest_border": ( |
| 54 | + "ST_Length(ST_Intersection(ST_Boundary(a.atom), ST_Boundary(p.geom)))" |
| 55 | + ), |
| 56 | +} |
| 57 | + |
| 58 | +logger = getLogger(__name__) |
| 59 | + |
| 60 | + |
| 61 | +def main( # noqa: PLR0913 - mirrors ST_CoverageClean signature |
| 62 | + conn: DuckDBPyConnection, |
| 63 | + name: str, |
| 64 | + *, |
| 65 | + snapping_distance: float = 0.0, # noqa: ARG001 - reserved for ST_CoverageClean swap |
| 66 | + gap_maximum_width: float = 0.0001, |
| 67 | + gap_max_thinness: float = 0.05, |
| 68 | + overlap_strategy: OverlapStrategy = "largest_area", |
| 69 | +) -> None: |
| 70 | + """Clean coverage errors in `_01` via polygonize-and-reattribute. |
| 71 | +
|
| 72 | + A hole is treated as a fillable sliver if EITHER: |
| 73 | + - its max-inscribed-circle diameter ≤ ``gap_maximum_width`` (small |
| 74 | + round artifact, sub-pixel safety net), OR |
| 75 | + - its Polsby-Popper compactness ``4πA/P²`` ≤ ``gap_max_thinness`` |
| 76 | + (stringy/elongated shape, primary discriminator). |
| 77 | +
|
| 78 | + Lakes and intentional small wedges are large AND compact — they fail |
| 79 | + both gates and are preserved as holes in the cleaned coverage. |
| 80 | + """ |
| 81 | + # Early exit when the input is already a valid coverage. ST_CoverageInvalidEdges |
| 82 | + # flags overlaps and unmatched shared edges (sliver-gap boundaries) but does |
| 83 | + # NOT flag legitimate interior holes like lakes, so a dataset whose only |
| 84 | + # holes are lakes returns NULL here and skips cleaning. |
| 85 | + has_errors = conn.execute(f"""--sql |
| 86 | + WITH agg AS ( |
| 87 | + SELECT ST_CoverageInvalidEdges_Agg(geom) AS g FROM "{name}_01" |
| 88 | + ) |
| 89 | + SELECT g IS NOT NULL AND NOT ST_IsEmpty(g) FROM agg |
| 90 | + """).fetchall()[0][0] |
| 91 | + # Persistent tracker — merge.py uses this to know which polygons need |
| 92 | + # their full `_02a` exterior edges added to the polygonize input. Empty if |
| 93 | + # clean exited early (uniform interface). |
| 94 | + conn.execute(f"""--sql |
| 95 | + CREATE OR REPLACE TABLE "{name}_01_modified_fids" (fid INTEGER) |
| 96 | + """) |
| 97 | + |
| 98 | + if not has_errors: |
| 99 | + logger.info("clean: no coverage errors in %s_01, skipping", name) |
| 100 | + return |
| 101 | + |
| 102 | + strategy_metric = _STRATEGY_METRIC[overlap_strategy] |
| 103 | + |
| 104 | + # Full snapshot of `_01` (attributes AND geometry). Geometry is preserved so |
| 105 | + # that any polygon which ends up with no assigned atoms (e.g. fully |
| 106 | + # contained by a strategy winner — unusual) can fall back to its original |
| 107 | + # geom rather than disappearing from the output. |
| 108 | + conn.execute(f"""--sql |
| 109 | + CREATE OR REPLACE TABLE "{name}_01_tmp0" AS |
| 110 | + SELECT * FROM "{name}_01" |
| 111 | + """) |
| 112 | + |
| 113 | + # Lakes: interior rings of the union that are large AND compact (fail BOTH |
| 114 | + # the width and thinness gates). These are preserved as holes — atoms |
| 115 | + # falling inside them are not assigned to any fid. |
| 116 | + conn.execute(f"""--sql |
| 117 | + CREATE OR REPLACE TABLE "{name}_01_tmp1" AS |
| 118 | + WITH unioned AS ( |
| 119 | + SELECT ST_Union_Agg(geom) AS u FROM "{name}_01" |
| 120 | + ), |
| 121 | + shells AS ( |
| 122 | + SELECT UNNEST(ST_Dump(u)).geom AS shell FROM unioned |
| 123 | + ), |
| 124 | + holes AS ( |
| 125 | + SELECT ST_MakePolygon(ST_InteriorRingN(shell, n)) AS hole_geom |
| 126 | + FROM shells, generate_series(1, ST_NumInteriorRings(shell)) AS s(n) |
| 127 | + WHERE ST_NumInteriorRings(shell) > 0 |
| 128 | + ) |
| 129 | + SELECT hole_geom |
| 130 | + FROM holes |
| 131 | + WHERE NOT ( |
| 132 | + 2 * (ST_MaximumInscribedCircle(hole_geom)).radius |
| 133 | + <= {gap_maximum_width!r} |
| 134 | + OR 4 * pi() * ST_Area(hole_geom) |
| 135 | + / NULLIF(pow(ST_Perimeter(hole_geom), 2), 0) |
| 136 | + <= {gap_max_thinness!r} |
| 137 | + ) |
| 138 | + """) |
| 139 | + |
| 140 | + # One representative point per polygon part. Same shape as merge.py's |
| 141 | + # `_05_tmp4` — both attribute downstream cells to a fid via point-in-atom. |
| 142 | + conn.execute(f"""--sql |
| 143 | + CREATE OR REPLACE TABLE "{name}_01_tmp2" AS |
| 144 | + WITH parts AS ( |
| 145 | + SELECT fid, UNNEST(ST_Dump(geom)).geom AS part_geom |
| 146 | + FROM "{name}_01" |
| 147 | + ) |
| 148 | + SELECT fid, ST_PointOnSurface(part_geom) AS pt |
| 149 | + FROM parts |
| 150 | + """) |
| 151 | + |
| 152 | + # Atomic regions from `ST_Node` + `ST_Polygonize` over every polygon's |
| 153 | + # boundary. ST_Node only ADDS crossing vertices — it never moves an |
| 154 | + # existing one — so unaffected polygon edges keep their original vertices |
| 155 | + # exactly, and every atom that touches a crossing inherits identical |
| 156 | + # coordinates from the same ST_Node output. Adjacent polygons therefore |
| 157 | + # share their cleaned-coverage boundaries vertex-for-vertex. |
| 158 | + conn.execute(f"""--sql |
| 159 | + CREATE OR REPLACE TABLE "{name}_01_tmp3" AS |
| 160 | + WITH lines AS ( |
| 161 | + SELECT ST_Boundary(geom) AS line FROM "{name}_01" |
| 162 | + ), |
| 163 | + noded AS ( |
| 164 | + SELECT ST_Node(ST_Collect(list(line))) AS line FROM lines |
| 165 | + ), |
| 166 | + atoms AS ( |
| 167 | + SELECT UNNEST(ST_Dump(ST_Polygonize(list(line)))).geom AS atom |
| 168 | + FROM noded |
| 169 | + ) |
| 170 | + SELECT ROW_NUMBER() OVER () AS aid, atom FROM atoms |
| 171 | + """) |
| 172 | + |
| 173 | + # Atom-to-fid assignment. |
| 174 | + # |
| 175 | + # `DISTINCT` in candidates collapses multipart polygons that produce more |
| 176 | + # than one (aid, fid) candidate row. Sliver-to-neighbour uses longest |
| 177 | + # shared boundary regardless of `overlap_strategy`: gaps live between |
| 178 | + # polygons, not inside them, so "largest area" is meaningless for slivers. |
| 179 | + # See merge.py for the bbox-prefilter rationale (avoids SPATIAL_JOIN OOM). |
| 180 | + conn.execute(f"""--sql |
| 181 | + CREATE OR REPLACE TABLE "{name}_01_tmp4" AS |
| 182 | + WITH |
| 183 | + candidates AS ( |
| 184 | + SELECT DISTINCT a.aid, a.atom, pt.fid, |
| 185 | + {strategy_metric} AS metric |
| 186 | + FROM "{name}_01_tmp3" a |
| 187 | + JOIN "{name}_01_tmp2" pt |
| 188 | + ON ST_X(pt.pt) >= ST_XMin(a.atom) |
| 189 | + AND ST_X(pt.pt) <= ST_XMax(a.atom) |
| 190 | + AND ST_Y(pt.pt) >= ST_YMin(a.atom) |
| 191 | + AND ST_Y(pt.pt) <= ST_YMax(a.atom) |
| 192 | + AND ST_Within(pt.pt, a.atom) |
| 193 | + JOIN "{name}_01" p ON p.fid = pt.fid |
| 194 | + ), |
| 195 | + interior_winners AS ( |
| 196 | + SELECT aid, atom, fid |
| 197 | + FROM candidates |
| 198 | + QUALIFY ROW_NUMBER() OVER ( |
| 199 | + PARTITION BY aid ORDER BY metric DESC, fid ASC |
| 200 | + ) = 1 |
| 201 | + ), |
| 202 | + gap_atoms AS ( |
| 203 | + SELECT a.aid, a.atom |
| 204 | + FROM "{name}_01_tmp3" a |
| 205 | + WHERE NOT EXISTS ( |
| 206 | + SELECT 1 FROM "{name}_01_tmp2" pt |
| 207 | + WHERE ST_Within(pt.pt, a.atom) |
| 208 | + ) |
| 209 | + ), |
| 210 | + sliver_atoms AS ( |
| 211 | + SELECT g.aid, g.atom |
| 212 | + FROM gap_atoms g |
| 213 | + WHERE NOT EXISTS ( |
| 214 | + SELECT 1 FROM "{name}_01_tmp1" lake |
| 215 | + WHERE ST_Within(ST_PointOnSurface(g.atom), lake.hole_geom) |
| 216 | + ) |
| 217 | + ), |
| 218 | + sliver_assigned AS ( |
| 219 | + SELECT s.aid, s.atom, p.fid |
| 220 | + FROM sliver_atoms s |
| 221 | + JOIN "{name}_01" p |
| 222 | + ON ST_XMax(s.atom) >= ST_XMin(p.geom) |
| 223 | + AND ST_XMin(s.atom) <= ST_XMax(p.geom) |
| 224 | + AND ST_YMax(s.atom) >= ST_YMin(p.geom) |
| 225 | + AND ST_YMin(s.atom) <= ST_YMax(p.geom) |
| 226 | + AND ST_Intersects(ST_Boundary(s.atom), ST_Boundary(p.geom)) |
| 227 | + QUALIFY ROW_NUMBER() OVER ( |
| 228 | + PARTITION BY s.aid ORDER BY |
| 229 | + ST_Length(ST_Intersection( |
| 230 | + ST_Boundary(s.atom), ST_Boundary(p.geom) |
| 231 | + )) DESC, |
| 232 | + p.fid ASC |
| 233 | + ) = 1 |
| 234 | + ) |
| 235 | + SELECT aid, atom, fid FROM interior_winners |
| 236 | + UNION ALL |
| 237 | + SELECT aid, atom, fid FROM sliver_assigned |
| 238 | + """) |
| 239 | + |
| 240 | + # Rewrite `_01`: union assigned atoms per fid, re-attach attributes. A |
| 241 | + # polygon with no assigned atoms (e.g. swallowed entirely by a larger |
| 242 | + # strategy winner — rare) falls back to its original geometry from the |
| 243 | + # snapshot, so no row is ever silently dropped here. |
| 244 | + conn.execute(f"""--sql |
| 245 | + CREATE OR REPLACE TABLE "{name}_01" AS |
| 246 | + WITH per_fid AS ( |
| 247 | + SELECT fid, ST_Union_Agg(atom) AS new_geom |
| 248 | + FROM "{name}_01_tmp4" |
| 249 | + GROUP BY fid |
| 250 | + ) |
| 251 | + SELECT a.* EXCLUDE (geom), |
| 252 | + COALESCE(p.new_geom, a.geom) AS geom |
| 253 | + FROM "{name}_01_tmp0" a |
| 254 | + LEFT JOIN per_fid p ON a.fid = p.fid |
| 255 | + """) |
| 256 | + |
| 257 | + # Populate the persistent modified-fids tracker — every fid whose cleaned |
| 258 | + # geometry differs from the snapshot. |
| 259 | + conn.execute(f"""--sql |
| 260 | + INSERT INTO "{name}_01_modified_fids" |
| 261 | + SELECT a.fid |
| 262 | + FROM "{name}_01" a JOIN "{name}_01_tmp0" o ON a.fid = o.fid |
| 263 | + WHERE NOT ST_Equals(a.geom, o.geom) |
| 264 | + """) |
| 265 | + |
| 266 | + if not debug: |
| 267 | + conn.execute(f'DROP TABLE IF EXISTS "{name}_01_tmp0"') |
| 268 | + conn.execute(f'DROP TABLE IF EXISTS "{name}_01_tmp1"') |
| 269 | + conn.execute(f'DROP TABLE IF EXISTS "{name}_01_tmp2"') |
| 270 | + conn.execute(f'DROP TABLE IF EXISTS "{name}_01_tmp3"') |
| 271 | + conn.execute(f'DROP TABLE IF EXISTS "{name}_01_tmp4"') |
0 commit comments