Skip to content

Commit 72b178a

Browse files
committed
[SEDONA-2880] GeoParquet reader: ignore [0,0,0,0] bbox fingerprint
Defensively guard `LeafFilter.evaluate` against legacy GeoParquet files written by buggy versions of Sedona that emitted `bbox: [0, 0, 0, 0]` in the metadata of zero-row files. When a file's bbox matches that exact fingerprint, treat the bbox as untrusted and skip bbox-based pruning, so the file is always retained. Without this guard, a legacy file carrying `bbox: [0, 0, 0, 0]` would be retained for any query window intersecting Null Island (the point (0, 0)) and pruned for any query window that does not; in both cases the decision is driven by a bogus bbox rather than by the file's real (empty) contents. Note: in theory this could over-include a real file whose geometries all sit exactly at (0, 0) with zero extent, but such a file would also be uninteresting for any spatial query, so the over-inclusion is harmless.
1 parent 3b308d8 commit 72b178a

2 files changed

Lines changed: 59 additions & 1 deletion

File tree

spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,13 @@ trait GeoParquetSpatialFilter {
3434

3535
object GeoParquetSpatialFilter {
3636

37+
/**
 * Fingerprint of the bogus bounding box that buggy versions of Sedona's GeoParquet writer
 * recorded for files holding no geometries, e.g. when an empty Spark partition was written
 * out (issue #2880). A file whose bbox equals this value must be left out of bbox-based
 * pruning, because the value does not describe a real spatial extent.
 */
private[geoparquet] val LegacyEmptyBboxFingerprint: Seq[Double] = Seq.fill(4)(0.0)
43+
3744
case class AndFilter(left: GeoParquetSpatialFilter, right: GeoParquetSpatialFilter)
3845
extends GeoParquetSpatialFilter {
3946
override def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = {
@@ -72,6 +79,13 @@ object GeoParquetSpatialFilter {
7279
if (bbox.isEmpty) {
7380
return true
7481
}
82+
// Defend against legacy GeoParquet files written by buggy versions of Sedona that
83+
// emitted `bbox: [0, 0, 0, 0]` in the metadata of zero-row files (see issue #2880).
84+
// The only legitimate file matching this fingerprint would contain geometries lying
85+
// exactly at the origin with zero extent — pathological and not worth pruning on.
86+
if (bbox == LegacyEmptyBboxFingerprint) {
87+
return true
88+
}
7589

7690
val columnEnvelope =
7791
queryWindow.getFactory.toGeometry(new Envelope(bbox(0), bbox(2), bbox(1), bbox(3)))

spark/common/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,8 @@ import org.apache.spark.sql.DataFrame
2929
import org.apache.spark.sql.SparkSession
3030
import org.apache.spark.sql.execution.FileSourceScanExec
3131
import org.apache.spark.sql.execution.SimpleMode
32-
import org.apache.spark.sql.execution.datasources.geoparquet.{GeoParquetFileFormat, GeoParquetMetaData, GeoParquetSpatialFilter}
32+
import org.apache.spark.sql.execution.datasources.geoparquet.{GeoParquetFileFormat, GeoParquetMetaData, GeoParquetSpatialFilter, GeometryFieldMetaData}
33+
import org.apache.sedona.core.spatialOperator.SpatialPredicate
3334
import org.locationtech.jts.geom.Coordinate
3435
import org.locationtech.jts.geom.Geometry
3536
import org.locationtech.jts.geom.GeometryFactory
@@ -307,6 +308,49 @@ class GeoParquetSpatialFilterPushDownSuite extends TestBaseScala with TableDrive
307308
assert(explainString.contains("with spatial filter"))
308309
}
309310

311+
it("LeafFilter ignores legacy [0,0,0,0] bbox fingerprint (issue #2880)") {
  // Regression test for issue #2880: buggy GeoParquet writers recorded [0, 0, 0, 0] as the
  // bbox of zero-row files. That value says nothing about the file's real extent, so the
  // reader must never prune on it. Files from fixed writers carry no bbox at all when they
  // are empty; that case is exercised below as well.
  val window = new GeometryFactory().toGeometry(
    new org.locationtech.jts.geom.Envelope(100.0, 200.0, 100.0, 200.0))
  val leaf = GeoParquetSpatialFilter.LeafFilter(
    columnName = "geom",
    predicateType = SpatialPredicate.INTERSECTS,
    queryWindow = window)

  // Wraps the given geometry types and bbox in single-column metadata for "geom" and
  // reports whether the filter keeps (true) or prunes (false) such a file.
  def keepsFile(types: Seq[String], box: Option[Seq[Double]]): Boolean = {
    val meta = GeometryFieldMetaData(encoding = "WKB", geometryTypes = types, bbox = box)
    leaf.evaluate(Map("geom" -> meta))
  }

  // Legacy buggy file: the bbox is exactly the [0, 0, 0, 0] fingerprint, so it is kept
  // even though the query window is nowhere near the origin.
  assert(keepsFile(Seq.empty, Some(Seq(0.0, 0.0, 0.0, 0.0))))

  // Spec-compliant empty file: no bbox recorded, so there is nothing to prune on.
  assert(keepsFile(Seq.empty, None))

  // A real file touching the origin but with non-zero extent is not the fingerprint and
  // must still be pruned by a disjoint query window.
  assert(!keepsFile(Seq("Point"), Some(Seq(0.0, 0.0, 1.0, 1.0))))

  // Empty geometry_types with a real bbox: the spec lets `geometry_types: []` mean "types
  // are not known" without invalidating the bbox, so pruning still applies.
  assert(!keepsFile(Seq.empty, Some(Seq(50.0, 50.0, 60.0, 60.0))))
}
353+
310354
it("Manually disable spatial filter push-down") {
311355
withConf(Map("spark.sedona.geoparquet.spatialFilterPushDown" -> "false")) {
312356
val dfFiltered = geoParquetDf.where(

0 commit comments

Comments
 (0)