From 3b308d80bb31eaa80af17890cad8b7f6db94afd5 Mon Sep 17 00:00:00 2001
From: James Willis
Date: Mon, 4 May 2026 10:48:31 -0700
Subject: [PATCH] [SEDONA-2880] GeoParquet writer: omit bbox in metadata for empty files

When a Spark partition has zero rows the GeoParquet writer was emitting
`bbox: [0, 0, 0, 0]` in the per-column geo metadata. Per the GeoParquet
1.1 spec, `bbox` is the bounding box of the geometries in the file and
is optional ("if specified, MUST be encoded..."), so for a file with no
geometries we should omit it rather than fabricate an extent.

The fabricated `[0, 0, 0, 0]` is especially harmful: it places a phantom
"data at Null Island" claim in the metadata, breaking bbox-based file
pruning in downstream readers (Sedona's own GeoParquetSpatialFilter,
DuckDB Spatial, GDAL's OGR_GEOPARQUET driver, GeoPandas) and corrupting
dataset-level extent aggregation.

This change makes `GeometryFieldMetaData.bbox` an `Option[Seq[Double]]`
and writes `None` (which json4s omits from JSON) when no geometries were
observed. All consumers of the case class are updated.
--- .../geoparquet/GeoParquetMetaData.scala | 6 ++++-- .../geoparquet/GeoParquetSpatialFilter.scala | 2 +- .../geoparquet/GeoParquetWriteSupport.scala | 18 ++++++++++++------ .../sql/sedona_sql/io/stac/StacBatch.scala | 2 +- .../apache/sedona/sql/geoparquetIOTests.scala | 7 +++++-- ...ParquetMetadataPartitionReaderFactory.scala | 2 +- ...ParquetMetadataPartitionReaderFactory.scala | 2 +- ...ParquetMetadataPartitionReaderFactory.scala | 2 +- ...ParquetMetadataPartitionReaderFactory.scala | 2 +- 9 files changed, 27 insertions(+), 16 deletions(-) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala index c4649316f67..7517b8e5fc4 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala @@ -34,7 +34,9 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject, * @param geometryTypes * The geometry types of all geometries, or an empty array if they are not known. * @param bbox - * Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5. + * Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5. None + * if the file contains no geometries (per the GeoParquet 1.1 spec, bbox is optional and should + * be omitted when there is no extent to describe). * @param crs * The CRS of the geometries in the file. None if crs metadata is absent, Some(JNull) if crs is * null, Some(value) if the crs is present and not null. 
@@ -44,7 +46,7 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject, case class GeometryFieldMetaData( encoding: String, geometryTypes: Seq[String], - bbox: Seq[Double], + bbox: Option[Seq[Double]], crs: Option[JValue] = None, covering: Option[Covering] = None) diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala index 3d8b1442065..cacc64d94b4 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala @@ -68,7 +68,7 @@ object GeoParquetSpatialFilter { extends GeoParquetSpatialFilter { def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = { columns.get(columnName).forall { column => - val bbox = column.bbox + val bbox = column.bbox.getOrElse(return true) if (bbox.isEmpty) { return true } diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala index ca6f7e090e3..c63437d22f8 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala @@ -245,13 +245,19 @@ class GeoParquetWriteSupport extends WriteSupport[InternalRow] with Logging { val columns = geometryColumnInfoMap.map { case (ordinal, columnInfo) => val columnName = schema.fields(ordinal).name val geometryTypes = columnInfo.seenGeometryTypes.toSeq + // Omit bbox from column metadata when no geometries were observed (e.g. 
an empty + // Spark partition produces a zero-row file). Per the GeoParquet 1.1 spec, bbox is + // optional and represents the extent of the geometries in the file; emitting + // [0, 0, 0, 0] for an empty file falsely advertises data at Null Island and breaks + // bbox-based file pruning in downstream readers. val bbox = if (geometryTypes.nonEmpty) { - Seq( - columnInfo.bbox.minX, - columnInfo.bbox.minY, - columnInfo.bbox.maxX, - columnInfo.bbox.maxY) - } else Seq(0.0, 0.0, 0.0, 0.0) + Some( + Seq( + columnInfo.bbox.minX, + columnInfo.bbox.minY, + columnInfo.bbox.maxX, + columnInfo.bbox.maxY)) + } else None val crs = geoParquetColumnCrsMap.getOrElse( columnName, { if (!userExplicitlySetDefaultCrs) { diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala index e97bbdc2273..5717bec15c5 100644 --- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala +++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala @@ -328,7 +328,7 @@ case class StacBatch( val geometryFieldMetaData = GeometryFieldMetaData( encoding = "WKB", geometryTypes = geometryTypes, - bbox = bbox, + bbox = Some(bbox), crs = None, covering = None) diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala index 3041757ed20..5d2a13f4867 100644 --- a/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala +++ b/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala @@ -252,9 +252,12 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll { validateGeoParquetMetadata(geoParquetSavePath) { geo => implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats val g0Types = (geo \ "columns" \ "g" \ "geometry_types").extract[Seq[String]] - val g0BBox = (geo \ 
"columns" \ "g" \ "bbox").extract[Seq[Double]] assert(g0Types.isEmpty) - assert(g0BBox == Seq(0.0, 0.0, 0.0, 0.0)) + // Per the GeoParquet spec, bbox is optional and represents the extent of the geometries + // in the file; for a file with no geometries we omit it entirely rather than emit a + // bogus [0, 0, 0, 0] (which would falsely advertise data at Null Island and break + // bbox-based file pruning in downstream readers). See issue #2880. + assert((geo \ "columns" \ "g" \ "bbox") == org.json4s.JNothing) } } diff --git a/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 22e02d172d9..d60ebcb07a2 100644 --- a/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -83,7 +83,7 @@ object GeoParquetMetadataPartitionReaderFactory { val columnMetadataFields: Array[Any] = Array( UTF8String.fromString(columnMetadata.encoding), new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray), - new GenericArrayData(columnMetadata.bbox.toArray), + columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull, columnMetadata.crs .map(projjson => UTF8String.fromString(compact(render(projjson)))) .getOrElse(UTF8String.fromString("")), diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 683160e93b2..b15c8888023 100644 
--- a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory { val columnMetadataFields: Array[Any] = Array( UTF8String.fromString(columnMetadata.encoding), new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray), - new GenericArrayData(columnMetadata.bbox.toArray), + columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull, columnMetadata.crs .map(projjson => UTF8String.fromString(compact(render(projjson)))) .getOrElse(UTF8String.fromString("")), diff --git a/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 683160e93b2..b15c8888023 100644 --- a/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory { val columnMetadataFields: Array[Any] = Array( UTF8String.fromString(columnMetadata.encoding), new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray), - new GenericArrayData(columnMetadata.bbox.toArray), + columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull, columnMetadata.crs .map(projjson => UTF8String.fromString(compact(render(projjson)))) .getOrElse(UTF8String.fromString("")), diff --git 
a/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala index 683160e93b2..b15c8888023 100644 --- a/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala +++ b/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala @@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory { val columnMetadataFields: Array[Any] = Array( UTF8String.fromString(columnMetadata.encoding), new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray), - new GenericArrayData(columnMetadata.bbox.toArray), + columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull, columnMetadata.crs .map(projjson => UTF8String.fromString(compact(render(projjson)))) .getOrElse(UTF8String.fromString("")),