44//! SpatialBench benchmark implementation
55
66use std:: fs;
7+ use std:: path:: Path ;
78
89use url:: Url ;
910
@@ -13,9 +14,13 @@ use crate::Engine;
1314use crate :: Format ;
1415use crate :: TableSpec ;
1516use crate :: spatialbench:: datagen;
17+ use crate :: spatialbench:: datagen:: Table ;
1618use crate :: utils:: file:: resolve_data_url;
1719use crate :: workspace_root;
1820
/// Name of the data-dir subfolder that holds the native-geometry Vortex files, i.e. the files
/// read by the `vortex-native` lane.
pub const NATIVE_DIR: &str = "vortex-native";
23+
1924/// SpatialBench geospatial benchmark (Apache Sedona): a `trip` point table, `building` polygons, and
2025/// a `customer` attribute table, queried with spatial filters and joins. `zone` polygons are sourced
2126/// externally and registered when present. See <https://sedona.apache.org/spatialbench/>.
@@ -34,6 +39,21 @@ impl SpatialBenchBenchmark {
3439 scale_factor,
3540 } )
3641 }
42+
43+ /// Tables to materialize and register: the in-process base tables (`trip`, `building`,
44+ /// `customer`) plus the externally-sourced `zone` when its parquet is present. Shared by native
45+ /// data-gen and table registration so both lanes cover the same set.
46+ fn base_tables ( & self ) -> Vec < Table > {
47+ let mut tables = vec ! [ Table :: Trip , Table :: Building , Table :: Customer ] ;
48+ let zone_present = match self . data_url . to_file_path ( ) {
49+ Ok ( base) => zone_parquet_present ( & base. join ( Format :: Parquet . name ( ) ) ) ,
50+ Err ( ( ) ) => true ,
51+ } ;
52+ if zone_present {
53+ tables. push ( Table :: Zone ) ;
54+ }
55+ tables
56+ }
3757}
3858
3959#[ async_trait:: async_trait]
@@ -58,6 +78,16 @@ impl Benchmark for SpatialBenchBenchmark {
5878 . collect ( ) )
5979 }
6080
81+ /// On the `vortex-native` lane, geometry columns surface as `GEOMETRY`, so drop the
82+ /// `ST_GeomFromWKB(..)` wrappers and let DuckDB's `spatial` extension evaluate the `ST_*`
83+ /// predicates directly on the native geometry.
84+ fn query_for_format ( & self , query : & str , format : Format ) -> String {
85+ match format {
86+ Format :: VortexNative => strip_wkb_wrappers ( query) ,
87+ _ => query. to_string ( ) ,
88+ }
89+ }
90+
6191 async fn generate_base_data ( & self ) -> anyhow:: Result < ( ) > {
6292 if self . data_url . scheme ( ) != "file" {
6393 return Ok ( ( ) ) ;
@@ -66,14 +96,37 @@ impl Benchmark for SpatialBenchBenchmark {
6696 . data_url
6797 . to_file_path ( )
6898 . map_err ( |_| anyhow:: anyhow!( "Invalid file URL: {}" , self . data_url. as_str( ) ) ) ?;
69- datagen:: generate_tables ( & self . scale_factor , base_data_dir) . await ?;
99+ datagen:: generate_tables ( & self . scale_factor , base_data_dir. clone ( ) ) . await ?;
100+ Ok ( ( ) )
101+ }
102+
103+ /// The `vortex-native` lane decodes each table's WKB geometry to native GeoArrow once, into the
104+ /// `vortex-native` dir, so its queries read DuckDB `GEOMETRY` directly. Idempotent.
105+ async fn prepare_format ( & self , format : Format , base_path : & Path ) -> anyhow:: Result < ( ) > {
106+ if format == Format :: VortexNative {
107+ let parquet_dir = base_path. join ( Format :: Parquet . name ( ) ) ;
108+ let native_dir = base_path. join ( NATIVE_DIR ) ;
109+ for table in self . base_tables ( ) {
110+ datagen:: write_native_vortex ( table, & parquet_dir, & native_dir) . await ?;
111+ }
112+ }
70113 Ok ( ( ) )
71114 }
72115
73116 fn data_url ( & self ) -> & Url {
74117 & self . data_url
75118 }
76119
120+ /// The `vortex-native` lane reads the native-geometry Vortex dir; every other format reads its
121+ /// own `{format}` subfolder.
122+ fn format_path ( & self , format : Format , base_url : & Url ) -> anyhow:: Result < Url > {
123+ let dir = match format {
124+ Format :: VortexNative => NATIVE_DIR ,
125+ other => other. name ( ) ,
126+ } ;
127+ Ok ( base_url. join ( & format ! ( "{dir}/" ) ) ?)
128+ }
129+
77130 fn expected_row_counts ( & self ) -> Option < Vec < usize > > {
78131 // Indexed by `query_idx` (1-based), so index 0 is a dummy and Q1's count is at index 1 (TPC-H
79132 // convention). Only SF1.0 and SF10.0 are validated (like TPC-H); other scale factors return
@@ -101,22 +154,32 @@ impl Benchmark for SpatialBenchBenchmark {
101154 format ! ( "spatialbench(sf={})" , self . scale_factor)
102155 }
103156
157+ /// Both lanes register the same tables (WKB reads `parquet`/`vortex`, native reads
158+ /// `vortex-native`); `zone` is externally sourced and optional, registered only when present.
104159 fn table_specs ( & self ) -> Vec < TableSpec > {
105- let mut specs = vec ! [
106- TableSpec :: new( "trip" , None ) ,
107- TableSpec :: new( "building" , None ) ,
108- TableSpec :: new( "customer" , None ) ,
109- ] ;
110- // `zone` is externally sourced and optional; register it only when present so queries that
111- // don't need it don't fail on the missing glob.
112- let zone_present = match self . data_url . to_file_path ( ) {
113- Ok ( base) => zone_parquet_present ( & base. join ( Format :: Parquet . name ( ) ) ) ,
114- Err ( ( ) ) => true ,
115- } ;
116- if zone_present {
117- specs. push ( TableSpec :: new ( "zone" , None ) ) ;
160+ self . base_tables ( )
161+ . iter ( )
162+ . map ( |table| TableSpec :: new ( table. name ( ) , None ) )
163+ . collect ( )
164+ }
165+
166+ /// DuckDB's view star-expansion drops native `GEOMETRY` columns down to `BLOB`, so `ST_*` fail to
167+ /// bind. Re-cast every geometry column back to `GEOMETRY` in the view's projection.
168+ fn view_projection ( & self , table_name : & str , format : Format ) -> String {
169+ if format == Format :: VortexNative
170+ && let Some ( table) = Table :: from_name ( table_name)
171+ {
172+ let geometry_columns = table. geometry_columns ( ) ;
173+ if !geometry_columns. is_empty ( ) {
174+ let casts = geometry_columns
175+ . iter ( )
176+ . map ( |column| format ! ( "{name}::GEOMETRY AS {name}" , name = column. name) )
177+ . collect :: < Vec < _ > > ( )
178+ . join ( ", " ) ;
179+ return format ! ( "* REPLACE ({casts})" ) ;
180+ }
118181 }
119- specs
182+ "*" . to_string ( )
120183 }
121184
122185 /// Scope each table to its own `{table}_*.{ext}` files; the default globs every file in the
@@ -141,8 +204,33 @@ impl Benchmark for SpatialBenchBenchmark {
141204
142205/// Whether an externally-sourced `zone_*.parquet` exists under `parquet_dir` (generated by the
143206/// upstream `spatialbench-cli`; see the module docs).
144- fn zone_parquet_present ( parquet_dir : & std :: path :: Path ) -> bool {
207+ fn zone_parquet_present ( parquet_dir : & Path ) -> bool {
145208 glob:: glob ( & parquet_dir. join ( "zone_*.parquet" ) . to_string_lossy ( ) )
146209 . map ( |mut paths| paths. next ( ) . is_some ( ) )
147210 . unwrap_or ( false )
148211}
212+
/// Strip `ST_GeomFromWKB(<inner>)` → `<inner>` so the native lane reads the already-`GEOMETRY`
/// column directly.
///
/// The wrapper's closing parenthesis is found by balancing nested parentheses, so an inner
/// expression that itself contains calls (e.g. `ST_GeomFromWKB(f(a) || b)`) is unwrapped to
/// exactly `f(a) || b`. Parentheses inside SQL string literals are not treated specially.
///
/// A wrapper whose parentheses never balance is left in the output verbatim. The match on the
/// function name is case-sensitive.
fn strip_wkb_wrappers(sql: &str) -> String {
    const OPEN: &str = "ST_GeomFromWKB(";
    let mut out = String::with_capacity(sql.len());
    let mut rest = sql;
    while let Some(pos) = rest.find(OPEN) {
        out.push_str(&rest[..pos]);
        let after = &rest[pos + OPEN.len()..];
        match matching_close_paren(after) {
            Some(close) => {
                // Keep the wrapped expression, drop the wrapper's own `)`.
                out.push_str(&after[..close]);
                rest = &after[close + 1..];
            }
            // Unbalanced wrapper: emit the opener verbatim and keep scanning what follows.
            None => {
                out.push_str(OPEN);
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}

/// Byte offset in `s` of the `)` that closes a `(` opened immediately before `s`, or `None` when
/// the parentheses never balance.
fn matching_close_paren(s: &str) -> Option<usize> {
    // One paren is already open: the wrapper's own.
    let mut depth = 1usize;
    // Scanning bytes is safe: `(` and `)` are ASCII and never occur inside a multi-byte UTF-8
    // sequence, so every returned offset is a char boundary.
    for (idx, byte) in s.bytes().enumerate() {
        match byte {
            b'(' => depth += 1,
            b')' => {
                depth -= 1;
                if depth == 0 {
                    return Some(idx);
                }
            }
            _ => {}
        }
    }
    None
}
0 commit comments