[SEDONA-2880] GeoParquet writer: omit bbox in metadata for empty files

james-willis · james-willis · commit 3b308d80bb31 · 2026-05-04T10:48:31.000-07:00
When a Spark partition has zero rows the GeoParquet writer was emitting
`bbox: [0, 0, 0, 0]` in the per-column geo metadata. Per the GeoParquet
1.1 spec, `bbox` is the bounding box of the geometries in the file and
is optional ("if specified, MUST be encoded..."), so for a file with no
geometries we should omit it rather than fabricate an extent.

The fabricated `[0, 0, 0, 0]` is especially harmful: it places a phantom
"data at Null Island" claim in the metadata, breaking bbox-based file
pruning in downstream readers (Sedona's own GeoParquetSpatialFilter,
DuckDB Spatial, GDAL's OGR_GEOPARQUET driver, GeoPandas) and corrupting
dataset-level extent aggregation.

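This change makes `GeometryFieldMetaData.bbox` an `Option[Seq[Double]]`
and writes `None` (which json4s omits from JSON) when no geometries
were observed. All consumers of the case class are updated:
`GeoParquetSpatialFilter` returns `true` (the file is not pruned) when a
column has no bbox, `StacBatch` wraps its bbox in `Some`, and the
GeoParquet metadata readers for Spark 3.4/3.5/4.0/4.1 emit a null `bbox`
value when it is absent.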
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala
@@ -34,7 +34,9 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject,
  * @param geometryTypes
  *   The geometry types of all geometries, or an empty array if they are not known.
  * @param bbox
- *   Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5.
+ *   Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5. None
+ *   if the file contains no geometries: the GeoParquet 1.1 spec makes bbox optional, so it is
+ *   omitted rather than fabricated when there is no extent to describe.
  * @param crs
  *   The CRS of the geometries in the file. None if crs metadata is absent, Some(JNull) if crs is
  *   null, Some(value) if the crs is present and not null.
@@ -44,7 +46,7 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject,
 case class GeometryFieldMetaData(
     encoding: String,
     geometryTypes: Seq[String],
-    bbox: Seq[Double],
+    bbox: Option[Seq[Double]],
     crs: Option[JValue] = None,
     covering: Option[Covering] = None)
 
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala
@@ -68,7 +68,7 @@ object GeoParquetSpatialFilter {
       extends GeoParquetSpatialFilter {
     def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = {
       columns.get(columnName).forall { column =>
-        val bbox = column.bbox
+        val bbox = column.bbox.getOrElse(return true)
         if (bbox.isEmpty) {
           return true
         }
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala b/spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala
@@ -245,13 +245,19 @@ class GeoParquetWriteSupport extends WriteSupport[InternalRow] with Logging {
       val columns = geometryColumnInfoMap.map { case (ordinal, columnInfo) =>
         val columnName = schema.fields(ordinal).name
         val geometryTypes = columnInfo.seenGeometryTypes.toSeq
+        // Omit bbox from column metadata when no geometries were observed (e.g. an empty
+        // Spark partition produces a zero-row file). Per the GeoParquet 1.1 spec, bbox is
+        // optional and represents the extent of the geometries in the file; emitting
+        // [0, 0, 0, 0] for an empty file falsely advertises data at Null Island and breaks
+        // bbox-based file pruning in downstream readers.
         val bbox = if (geometryTypes.nonEmpty) {
-          Seq(
-            columnInfo.bbox.minX,
-            columnInfo.bbox.minY,
-            columnInfo.bbox.maxX,
-            columnInfo.bbox.maxY)
-        } else Seq(0.0, 0.0, 0.0, 0.0)
+          Some(
+            Seq(
+              columnInfo.bbox.minX,
+              columnInfo.bbox.minY,
+              columnInfo.bbox.maxX,
+              columnInfo.bbox.maxY))
+        } else None
         val crs = geoParquetColumnCrsMap.getOrElse(
           columnName, {
             if (!userExplicitlySetDefaultCrs) {
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala
@@ -328,7 +328,7 @@ case class StacBatch(
             val geometryFieldMetaData = GeometryFieldMetaData(
               encoding = "WKB",
               geometryTypes = geometryTypes,
-              bbox = bbox,
+              bbox = Some(bbox),
               crs = None,
               covering = None)
 
diff --git a/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala b/spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala
@@ -252,9 +252,12 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll {
       validateGeoParquetMetadata(geoParquetSavePath) { geo =>
         implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats
         val g0Types = (geo \ "columns" \ "g" \ "geometry_types").extract[Seq[String]]
-        val g0BBox = (geo \ "columns" \ "g" \ "bbox").extract[Seq[Double]]
         assert(g0Types.isEmpty)
-        assert(g0BBox == Seq(0.0, 0.0, 0.0, 0.0))
+        // Per the GeoParquet spec, bbox is optional and represents the extent of the geometries
+        // in the file; for a file with no geometries we omit it entirely rather than emit a
+        // bogus [0, 0, 0, 0] (which would falsely advertise data at Null Island and break
+        // bbox-based file pruning in downstream readers). See issue #2880.
+        assert((geo \ "columns" \ "g" \ "bbox") == org.json4s.JNothing)
       }
     }
 
diff --git a/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala
@@ -83,7 +83,7 @@ object GeoParquetMetadataPartitionReaderFactory {
           val columnMetadataFields: Array[Any] = Array(
             UTF8String.fromString(columnMetadata.encoding),
             new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
-            new GenericArrayData(columnMetadata.bbox.toArray),
+            columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
             columnMetadata.crs
               .map(projjson => UTF8String.fromString(compact(render(projjson))))
               .getOrElse(UTF8String.fromString("")),
diff --git a/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
           val columnMetadataFields: Array[Any] = Array(
             UTF8String.fromString(columnMetadata.encoding),
             new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
-            new GenericArrayData(columnMetadata.bbox.toArray),
+            columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
             columnMetadata.crs
               .map(projjson => UTF8String.fromString(compact(render(projjson))))
               .getOrElse(UTF8String.fromString("")),
diff --git a/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
           val columnMetadataFields: Array[Any] = Array(
             UTF8String.fromString(columnMetadata.encoding),
             new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
-            new GenericArrayData(columnMetadata.bbox.toArray),
+            columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
             columnMetadata.crs
               .map(projjson => UTF8String.fromString(compact(render(projjson))))
               .getOrElse(UTF8String.fromString("")),
diff --git a/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala b/spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
           val columnMetadataFields: Array[Any] = Array(
             UTF8String.fromString(columnMetadata.encoding),
             new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
-            new GenericArrayData(columnMetadata.bbox.toArray),
+            columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
             columnMetadata.crs
               .map(projjson => UTF8String.fromString(compact(render(projjson))))
               .getOrElse(UTF8String.fromString("")),

Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@ object GeoParquetSpatialFilter {`
`68`	`68`	`extends GeoParquetSpatialFilter {`
`69`	`69`	`def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = {`
`70`	`70`	`columns.get(columnName).forall { column =>`
`71`		`- val bbox = column.bbox`
	`71`	`+ val bbox = column.bbox.getOrElse(return true)`
`72`	`72`	`if (bbox.isEmpty) {`
`73`	`73`	`return true`
`74`	`74`	`}`
Original file line number	Diff line number	Diff line change
`@@ -252,9 +252,12 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll {`
`252`	`252`	`validateGeoParquetMetadata(geoParquetSavePath) { geo =>`
`253`	`253`	`implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats`
`254`	`254`	`val g0Types = (geo \ "columns" \ "g" \ "geometry_types").extract[Seq[String]]`
`255`		`- val g0BBox = (geo \ "columns" \ "g" \ "bbox").extract[Seq[Double]]`
`256`	`255`	`assert(g0Types.isEmpty)`
`257`		`- assert(g0BBox == Seq(0.0, 0.0, 0.0, 0.0))`
	`256`	`+ // Per the GeoParquet spec, bbox is optional and represents the extent of the geometries`
	`257`	`+ // in the file; for a file with no geometries we omit it entirely rather than emit a`
	`258`	`+ // bogus [0, 0, 0, 0] (which would falsely advertise data at Null Island and break`
	`259`	`+ // bbox-based file pruning in downstream readers). See issue #2880.`
	`260`	`+ assert((geo \ "columns" \ "g" \ "bbox") == org.json4s.JNothing)`
`258`	`261`	`}`
`259`	`262`	`}`
`260`	`263`