Skip to content

Commit 3b308d8

Browse files
committed
[SEDONA-2880] GeoParquet writer: omit bbox in metadata for empty files
When a Spark partition has zero rows the GeoParquet writer was emitting `bbox: [0, 0, 0, 0]` in the per-column geo metadata. Per the GeoParquet 1.1 spec, `bbox` is the bounding box of the geometries in the file and is optional ("if specified, MUST be encoded..."), so for a file with no geometries we should omit it rather than fabricate an extent. The fabricated `[0, 0, 0, 0]` is especially harmful: it places a phantom "data at Null Island" claim in the metadata, breaking bbox-based file pruning in downstream readers (Sedona's own GeoParquetSpatialFilter, DuckDB Spatial, GDAL's OGR_GEOPARQUET driver, GeoPandas) and corrupting dataset-level extent aggregation. This change makes `GeometryFieldMetaData.bbox` an `Option[Seq[Double]]` and writes `None` (which json4s omits from JSON) when no geometries were observed. All consumers of the case class are updated accordingly: `GeoParquetSpatialFilter` treats a missing bbox the same way as an empty one (`evaluate` returns `true`), the GeoParquet metadata readers for Spark 3.4, 3.5, 4.0 and 4.1 emit `null` for the bbox field instead of an array, and `StacBatch` wraps its computed bbox in `Some`.
1 parent 3c280e7 commit 3b308d8

9 files changed

Lines changed: 27 additions & 16 deletions

File tree

spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetMetaData.scala

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject,
3434
* @param geometryTypes
3535
* The geometry types of all geometries, or an empty array if they are not known.
3636
* @param bbox
37-
* Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5.
37+
* Bounding Box of the geometries in the file, formatted according to RFC 7946, section 5. None
38+
* if the file contains no geometries (per the GeoParquet 1.1 spec, bbox is optional and should
39+
* be omitted when there is no extent to describe).
3840
* @param crs
3941
* The CRS of the geometries in the file. None if crs metadata is absent, Some(JNull) if crs is
4042
* null, Some(value) if the crs is present and not null.
@@ -44,7 +46,7 @@ import org.json4s.{DefaultFormats, Extraction, JField, JNothing, JNull, JObject,
4446
case class GeometryFieldMetaData(
4547
encoding: String,
4648
geometryTypes: Seq[String],
47-
bbox: Seq[Double],
49+
bbox: Option[Seq[Double]],
4850
crs: Option[JValue] = None,
4951
covering: Option[Covering] = None)
5052

spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetSpatialFilter.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ object GeoParquetSpatialFilter {
6868
extends GeoParquetSpatialFilter {
6969
def evaluate(columns: Map[String, GeometryFieldMetaData]): Boolean = {
7070
columns.get(columnName).forall { column =>
71-
val bbox = column.bbox
71+
val bbox = column.bbox.getOrElse(return true)
7272
if (bbox.isEmpty) {
7373
return true
7474
}

spark/common/src/main/scala/org/apache/spark/sql/execution/datasources/geoparquet/GeoParquetWriteSupport.scala

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -245,13 +245,19 @@ class GeoParquetWriteSupport extends WriteSupport[InternalRow] with Logging {
245245
val columns = geometryColumnInfoMap.map { case (ordinal, columnInfo) =>
246246
val columnName = schema.fields(ordinal).name
247247
val geometryTypes = columnInfo.seenGeometryTypes.toSeq
248+
// Omit bbox from column metadata when no geometries were observed (e.g. an empty
249+
// Spark partition produces a zero-row file). Per the GeoParquet 1.1 spec, bbox is
250+
// optional and represents the extent of the geometries in the file; emitting
251+
// [0, 0, 0, 0] for an empty file falsely advertises data at Null Island and breaks
252+
// bbox-based file pruning in downstream readers.
248253
val bbox = if (geometryTypes.nonEmpty) {
249-
Seq(
250-
columnInfo.bbox.minX,
251-
columnInfo.bbox.minY,
252-
columnInfo.bbox.maxX,
253-
columnInfo.bbox.maxY)
254-
} else Seq(0.0, 0.0, 0.0, 0.0)
254+
Some(
255+
Seq(
256+
columnInfo.bbox.minX,
257+
columnInfo.bbox.minY,
258+
columnInfo.bbox.maxX,
259+
columnInfo.bbox.maxY))
260+
} else None
255261
val crs = geoParquetColumnCrsMap.getOrElse(
256262
columnName, {
257263
if (!userExplicitlySetDefaultCrs) {

spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/io/stac/StacBatch.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,7 @@ case class StacBatch(
328328
val geometryFieldMetaData = GeometryFieldMetaData(
329329
encoding = "WKB",
330330
geometryTypes = geometryTypes,
331-
bbox = bbox,
331+
bbox = Some(bbox),
332332
crs = None,
333333
covering = None)
334334

spark/common/src/test/scala/org/apache/sedona/sql/geoparquetIOTests.scala

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,9 +252,12 @@ class geoparquetIOTests extends TestBaseScala with BeforeAndAfterAll {
252252
validateGeoParquetMetadata(geoParquetSavePath) { geo =>
253253
implicit val formats: org.json4s.Formats = org.json4s.DefaultFormats
254254
val g0Types = (geo \ "columns" \ "g" \ "geometry_types").extract[Seq[String]]
255-
val g0BBox = (geo \ "columns" \ "g" \ "bbox").extract[Seq[Double]]
256255
assert(g0Types.isEmpty)
257-
assert(g0BBox == Seq(0.0, 0.0, 0.0, 0.0))
256+
// Per the GeoParquet spec, bbox is optional and represents the extent of the geometries
257+
// in the file; for a file with no geometries we omit it entirely rather than emit a
258+
// bogus [0, 0, 0, 0] (which would falsely advertise data at Null Island and break
259+
// bbox-based file pruning in downstream readers). See issue #2880.
260+
assert((geo \ "columns" \ "g" \ "bbox") == org.json4s.JNothing)
258261
}
259262
}
260263

spark/spark-3.4/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ object GeoParquetMetadataPartitionReaderFactory {
8383
val columnMetadataFields: Array[Any] = Array(
8484
UTF8String.fromString(columnMetadata.encoding),
8585
new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
86-
new GenericArrayData(columnMetadata.bbox.toArray),
86+
columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
8787
columnMetadata.crs
8888
.map(projjson => UTF8String.fromString(compact(render(projjson))))
8989
.getOrElse(UTF8String.fromString("")),

spark/spark-3.5/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
8484
val columnMetadataFields: Array[Any] = Array(
8585
UTF8String.fromString(columnMetadata.encoding),
8686
new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
87-
new GenericArrayData(columnMetadata.bbox.toArray),
87+
columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
8888
columnMetadata.crs
8989
.map(projjson => UTF8String.fromString(compact(render(projjson))))
9090
.getOrElse(UTF8String.fromString("")),

spark/spark-4.0/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
8484
val columnMetadataFields: Array[Any] = Array(
8585
UTF8String.fromString(columnMetadata.encoding),
8686
new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
87-
new GenericArrayData(columnMetadata.bbox.toArray),
87+
columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
8888
columnMetadata.crs
8989
.map(projjson => UTF8String.fromString(compact(render(projjson))))
9090
.getOrElse(UTF8String.fromString("")),

spark/spark-4.1/src/main/scala/org/apache/spark/sql/execution/datasources/v2/geoparquet/metadata/GeoParquetMetadataPartitionReaderFactory.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ object GeoParquetMetadataPartitionReaderFactory {
8484
val columnMetadataFields: Array[Any] = Array(
8585
UTF8String.fromString(columnMetadata.encoding),
8686
new GenericArrayData(columnMetadata.geometryTypes.map(UTF8String.fromString).toArray),
87-
new GenericArrayData(columnMetadata.bbox.toArray),
87+
columnMetadata.bbox.map(b => new GenericArrayData(b.toArray)).orNull,
8888
columnMetadata.crs
8989
.map(projjson => UTF8String.fromString(compact(render(projjson))))
9090
.getOrElse(UTF8String.fromString("")),

0 commit comments

Comments
 (0)