@@ -100,7 +100,9 @@ def write_partitioned_dataset(
100100 partition_dir = output_dir / f"geohash_prefix={ prefix } "
101101 partition_dir .mkdir ()
102102 group .sort_values ("geohash_sort" )[cols ].to_parquet (
103- partition_dir / "part-0.parquet"
103+ partition_dir / "part-0.parquet" ,
104+ write_covering_bbox = True ,
105+ row_group_size = 100_000 ,
104106 )
105107 if (i + 1 ) % 100 == 0 :
106108 print (f" { i + 1 } /{ n_partitions } partitions written..." )
@@ -180,6 +182,32 @@ def write_label_partitioned_dataset(
180182 Partition values that are not alphanumeric (e.g., ``"Fast Food
181183 Restaurant"``) are URL-encoded in the directory name. DuckDB's
182184 ``hive_partitioning=1`` decodes these transparently at read time.
185+
186+ Output files include a GeoParquet 1.1 ``covering.bbox`` struct column
187+ (``write_covering_bbox=True``) and use ``row_group_size=100_000`` so
188+ spatial bbox predicates can prune row groups. Gotchas for consumers:
189+
190+ * Predicate pushdown only fires when filters reference the ``bbox``
191+ struct fields directly — e.g., ``WHERE bbox.xmin <= $qxmax AND
192+ bbox.xmax >= $qxmin AND bbox.ymin <= $qymax AND bbox.ymax >=
193+ $qymin`` for a query window ``(qxmin, qymin, qxmax, qymax)``.
194+ ``ST_Intersects(geometry, …)`` alone will not trigger bbox pruning
195+ in DuckDB; add the bbox predicate alongside it, or use GeoPandas's
196+ ``read_parquet(..., bbox=...)``, which adds that filter automatically.
197+ * Minimum reader versions: DuckDB ≥ 0.10, PyArrow ≥ 15, GeoPandas
198+ ≥ 1.0, GDAL ≥ 3.8. Older readers see the ``bbox`` column as a
199+ plain struct and silently skip the pruning optimization.
200+ * The ``bbox`` column appears in ``DESCRIBE`` output and in
201+ ``SELECT *`` results. Downstream pipelines that materialize all
202+ columns will pick it up; drop it explicitly if it conflicts with a
203+ schema contract.
204+ * Writing requires ``geopandas >= 1.0``; calling this function from
205+ an older environment will fail with an unexpected-kwarg error on
206+ ``write_covering_bbox``.
207+ * ``row_group_size=100_000`` is fine for small partitions (they end
208+ up as a single row group) but it caps pruning granularity for very
209+ large partitions. Tune downward if a single file grows past ~5M
210+ rows and viewport queries still feel slow.
183211 """
184212 output_dir = Path (output_dir )
185213
@@ -218,7 +246,9 @@ def write_label_partitioned_dataset(
218246 partition_dir = output_dir / f"{ partition_col } ={ safe_value } "
219247 partition_dir .mkdir ()
220248 group .sort_values (sort_col )[cols ].to_parquet (
221- partition_dir / "part-0.parquet"
249+ partition_dir / "part-0.parquet" ,
250+ write_covering_bbox = True ,
251+ row_group_size = 100_000 ,
222252 )
223253 if (i + 1 ) % 25 == 0 :
224254 print (f" { i + 1 } /{ n_partitions } partitions written..." )
0 commit comments