Skip to content

Commit 3e7098c

Browse files
feat(vortex-bench): infra of SpatialBench on DuckDB, plain binary (#8598)
<!-- Thank you for submitting a pull request! We appreciate your time and effort. Please make sure to provide enough information so that we can review your pull request. The Summary and Testing sections below contain guidance on what to include. --> ## Summary <!-- If this PR is related to a tracked effort, please link to the relevant issue here (e.g., `Closes: #123`). Otherwise, feel free to ignore / delete this. In this section, please: 1. Explain the rationale for this change. 2. Summarize the changes included in this PR. A general rule of thumb is that larger PRs should have larger summaries. If there are a lot of changes, please help us review the code by explaining what was changed and why. If there is an issue or discussion attached, there is no need to duplicate all the details, but clarity is always preferred over brevity. --> Adds a SpatialBench benchmark that runs on DuckDB across two storage lanes, Vortex binary BLOB and Parquet binary BLOB. <!-- ## API Changes Uncomment this section if there are any user-facing changes. Consider whether the change affects users in one of the following ways: 1. Breaks public APIs in some way. 2. Changes the underlying behavior of one of the engine integrations. 3. Should some documentation be updated to reflect this change? If a public API is changed in a breaking manner, make sure to add the appropriate label. --> ## Benchmark results <!-- Please describe how this change was tested. Here are some common categories for testing in Vortex: 1. Verifying existing behavior is maintained. 2. Verifying new behavior and functionality works correctly. 3. Serialization compatibility (backwards and forwards) should be maintained or explicitly broken. --> Local, warm, median of 5 (ms). 
| Q | Query | Parquet | Vortex | Winner |
|---|---|--:|--:|:--|
| Q1 | trips within 50km of Sedona | 170.2 ms | 132.7 ms | Vortex 1.3× |
| Q2 | trips in Coconino County | 379.5 ms | 249.2 ms | Vortex 1.5× |
| Q3 | monthly stats within 15km | 235.5 ms | 203.3 ms | Vortex 1.2× |
| Q4 | top-1000-tip zone distribution | 404.8 ms | 114.8 ms | Vortex 3.5× |
| Q5 | repeat-customer convex hulls | 3174.8 ms | 3187.3 ms | tie |
| Q6 | zone stats in bbox | 718.8 ms | 306.1 ms | Vortex 2.3× |
| Q7 | route-detour ratios | 938.6 ms | 1046.1 ms | tie (~noise) |
| Q8 | nearby pickups per building (500m) | 1000.6 ms | 1071.2 ms | tie (~noise) |
| Q9 | building conflation (IoU) | 36.4 ms | 40.6 ms | tie |
| Q10 | zone stats (LEFT JOIN) | timeout | timeout | — |
| Q11 | cross-zone trips | timeout | timeout | — |
| Q12 | 5 nearest buildings (KNN) | timeout | timeout | — |

## Takeaways

Vortex wins the scan/filter-bound queries (Q1–Q4, Q6) by up to 3.5×. Q5/Q7/Q8/Q9 are compute-bound (DuckDB spatial join dominates), so the formats converge. Q10–Q12 time out on both, because the unindexed spatial join is the wall, independent of storage format.

---------

Signed-off-by: Nemo Yu <zyu379@wisc.edu>
1 parent 797b650 commit 3e7098c

16 files changed

Lines changed: 1331 additions & 299 deletions

File tree

Cargo.lock

Lines changed: 619 additions & 296 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -244,6 +244,14 @@ similar = "3.0.0"
244244
sketches-ddsketch = "0.4.0"
245245
smallvec = "1.15.1"
246246
smol = "2.0.2"
247+
spatialbench = "0.2"
248+
spatialbench-arrow = "0.2"
249+
# spatialbench still pins arrow 56, two majors behind the workspace arrow. Until upstream
250+
# catches up, write its generated batches with a matching parquet instead of converting
251+
# arrow versions at the boundary.
252+
spatialbench-parquet = { package = "parquet", version = "56", features = [
253+
"async",
254+
] }
247255
static_assertions = "1.1"
248256
strum = "0.28"
249257
syn = { version = "2.0.117", features = ["full"] }

benchmarks/duckdb-bench/src/lib.rs

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ pub struct DuckClient {
2626
connection: Option<Connection>,
2727
pub db_path: PathBuf,
2828
pub threads: Option<usize>,
29+
/// SQL statements replayed each time the connection is reopened, e.g. `INSTALL spatial; LOAD spatial;` for SpatialBench.
30+
init_sql: Vec<String>,
2931
}
3032

3133
impl DuckClient {
@@ -67,9 +69,19 @@ impl DuckClient {
6769
connection: Some(connection),
6870
db_path,
6971
threads,
72+
init_sql: Vec::new(),
7073
})
7174
}
7275

76+
/// Run `statements` now and after every subsequent [`DuckClient::reopen`].
77+
pub fn set_init_sql(&mut self, statements: Vec<String>) -> Result<()> {
78+
for stmt in &statements {
79+
self.connection().query(stmt)?;
80+
}
81+
self.init_sql = statements;
82+
Ok(())
83+
}
84+
7385
pub fn open_and_setup_database(
7486
path: Option<PathBuf>,
7587
threads: Option<usize>,
@@ -108,6 +120,14 @@ impl DuckClient {
108120
self.db = Some(db);
109121
self.connection = Some(connection);
110122

123+
// Replay init SQL (e.g. LOAD spatial).
124+
for stmt in &self.init_sql {
125+
self.connection
126+
.as_ref()
127+
.vortex_expect("connection just opened")
128+
.query(stmt)?;
129+
}
130+
111131
Ok(())
112132
}
113133

@@ -123,6 +143,7 @@ impl DuckClient {
123143
connection: Some(connection),
124144
db_path,
125145
threads: None,
146+
init_sql: Vec::new(),
126147
})
127148
}
128149

benchmarks/duckdb-bench/src/main.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -171,12 +171,13 @@ fn main() -> anyhow::Result<()> {
171171
&filtered_queries,
172172
mode,
173173
|format| {
174-
let ctx = DuckClient::new(
174+
let mut ctx = DuckClient::new(
175175
&*benchmark,
176176
format,
177177
args.delete_duckdb_database,
178178
args.threads,
179179
)?;
180+
ctx.set_init_sql(benchmark.engine_init_sql(Engine::DuckDB))?;
180181
ctx.register_tables(&*benchmark, format)?;
181182

182183
// Duckdb doesn't support octet_length for strings but we need this

vortex-bench/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ vortex = { workspace = true, features = [
2323
"tokio",
2424
"zstd",
2525
] }
26+
vortex-geo = { workspace = true }
2627
vortex-tensor = { workspace = true } # TODO(connor): In the future, this might be inside vortex.
2728

2829
anyhow = { workspace = true }
@@ -48,6 +49,9 @@ regex = { workspace = true }
4849
reqwest = { workspace = true, features = ["stream"] }
4950
serde = { workspace = true, features = ["derive"] }
5051
serde_json = { workspace = true }
52+
spatialbench = { workspace = true }
53+
spatialbench-arrow = { workspace = true }
54+
spatialbench-parquet = { workspace = true }
5155
sysinfo = { workspace = true }
5256
tabled = { workspace = true, features = ["std"] }
5357
target-lexicon = { workspace = true }

vortex-bench/spatialbench.sql

Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
-- SpatialBench queries (DuckDB dialect), from sedona-spatialbench DuckDBSpatialBenchBenchmark
2+
-- (spatialbench-queries/print_queries.py). Query logic is unchanged, only reformatted for readability
3+
-- and numbered Q1..Q12 (canonical order). The harness splits the file on semicolons, so a comment
4+
-- must never contain one.
5+
6+
-- Q1: trips starting within 50km of Sedona city center, ordered by distance.
7+
SELECT
8+
t.t_tripkey,
9+
ST_X(t.t_pickuploc) AS pickup_lon,
10+
ST_Y(t.t_pickuploc) AS pickup_lat,
11+
t.t_pickuptime,
12+
ST_Distance(t.t_pickuploc, ST_GeomFromText('POINT (-111.7610 34.8697)')) AS distance_to_center
13+
FROM trip t
14+
WHERE ST_DWithin(t.t_pickuploc, ST_GeomFromText('POINT (-111.7610 34.8697)'), 0.45)
15+
ORDER BY distance_to_center ASC, t.t_tripkey ASC;
16+
17+
-- Q2: count trips starting within the Coconino County (Arizona) zone.
18+
SELECT COUNT(*) AS trip_count_in_coconino_county
19+
FROM trip t
20+
WHERE ST_Intersects(
21+
t.t_pickuploc,
22+
(SELECT z.z_boundary FROM zone z WHERE z.z_name = 'Coconino County' LIMIT 1)
23+
);
24+
25+
-- Q3: monthly trip statistics within 15km of Sedona city center (10km bbox + 5km buffer).
26+
SELECT
27+
DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
28+
COUNT(t.t_tripkey) AS total_trips,
29+
AVG(t.t_distance) AS avg_distance,
30+
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
31+
AVG(t.t_fare) AS avg_fare
32+
FROM trip t
33+
WHERE ST_DWithin(
34+
t.t_pickuploc,
35+
ST_GeomFromText('POLYGON((-111.9060 34.7347, -111.6160 34.7347, -111.6160 35.0047, -111.9060 35.0047, -111.9060 34.7347))'),
36+
0.045
37+
)
38+
GROUP BY pickup_month
39+
ORDER BY pickup_month;
40+
41+
-- Q4: zone distribution of the top 1000 trips by tip amount.
42+
SELECT z.z_zonekey, z.z_name, COUNT(*) AS trip_count
43+
FROM zone z
44+
JOIN (
45+
SELECT t.t_pickuploc
46+
FROM trip t
47+
ORDER BY t.t_tip DESC, t.t_tripkey ASC
48+
LIMIT 1000
49+
) top_trips ON ST_Within(top_trips.t_pickuploc, z.z_boundary)
50+
GROUP BY z.z_zonekey, z.z_name
51+
ORDER BY trip_count DESC, z.z_zonekey ASC;
52+
53+
-- Q5: monthly travel patterns for repeat customers (convex hull of dropoff locations).
54+
SELECT
55+
c.c_custkey,
56+
c.c_name AS customer_name,
57+
DATE_TRUNC('month', t.t_pickuptime) AS pickup_month,
58+
ST_Area(ST_ConvexHull(ST_Collect(ARRAY_AGG(t.t_dropoffloc)))) AS monthly_travel_hull_area,
59+
COUNT(*) AS dropoff_count
60+
FROM trip t
61+
JOIN customer c ON t.t_custkey = c.c_custkey
62+
GROUP BY c.c_custkey, c.c_name, pickup_month
63+
HAVING dropoff_count > 5
64+
ORDER BY dropoff_count DESC, c.c_custkey ASC;
65+
66+
-- Q6: zone statistics for trips intersecting a bounding box.
67+
SELECT
68+
z.z_zonekey,
69+
z.z_name,
70+
COUNT(t.t_tripkey) AS total_pickups,
71+
AVG(t.t_totalamount) AS avg_distance,
72+
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration
73+
FROM trip t, zone z
74+
WHERE ST_Intersects(
75+
ST_GeomFromText('POLYGON((-112.2110 34.4197, -111.3110 34.4197, -111.3110 35.3197, -112.2110 35.3197, -112.2110 34.4197))'),
76+
z.z_boundary
77+
)
78+
AND ST_Within(t.t_pickuploc, z.z_boundary)
79+
GROUP BY z.z_zonekey, z.z_name
80+
ORDER BY total_pickups DESC, z.z_zonekey ASC;
81+
82+
-- Q7: detect potential route detours by comparing reported vs. geometric distances.
83+
WITH trip_lengths AS (
84+
SELECT
85+
t.t_tripkey,
86+
t.t_distance AS reported_distance_m,
87+
ST_Length(ST_MakeLine(t.t_pickuploc, t.t_dropoffloc)) / 0.000009 AS line_distance_m
88+
FROM trip t
89+
)
90+
SELECT
91+
t.t_tripkey,
92+
t.reported_distance_m,
93+
t.line_distance_m,
94+
t.reported_distance_m / NULLIF(t.line_distance_m, 0) AS detour_ratio
95+
FROM trip_lengths t
96+
ORDER BY detour_ratio DESC NULLS LAST, reported_distance_m DESC, t_tripkey ASC;
97+
98+
-- Q8: count nearby pickups for each building within ~500m.
99+
SELECT b.b_buildingkey, b.b_name, COUNT(*) AS nearby_pickup_count
100+
FROM trip t
101+
JOIN building b ON ST_DWithin(t.t_pickuploc, b.b_boundary, 0.0045)
102+
GROUP BY b.b_buildingkey, b.b_name
103+
ORDER BY nearby_pickup_count DESC, b.b_buildingkey ASC;
104+
105+
-- Q9: building conflation (duplicate/overlap detection via IoU).
106+
WITH b1 AS (
107+
SELECT b_buildingkey AS id, b_boundary AS geom FROM building
108+
),
109+
b2 AS (
110+
SELECT b_buildingkey AS id, b_boundary AS geom FROM building
111+
),
112+
pairs AS (
113+
SELECT
114+
b1.id AS building_1,
115+
b2.id AS building_2,
116+
ST_Area(b1.geom) AS area1,
117+
ST_Area(b2.geom) AS area2,
118+
ST_Area(ST_Intersection(b1.geom, b2.geom)) AS overlap_area
119+
FROM b1
120+
JOIN b2 ON b1.id < b2.id AND ST_Intersects(b1.geom, b2.geom)
121+
)
122+
SELECT
123+
building_1,
124+
building_2,
125+
area1,
126+
area2,
127+
overlap_area,
128+
CASE
129+
WHEN overlap_area = 0 THEN 0.0
130+
WHEN (area1 + area2 - overlap_area) = 0 THEN 1.0
131+
ELSE overlap_area / (area1 + area2 - overlap_area)
132+
END AS iou
133+
FROM pairs
134+
ORDER BY iou DESC, building_1 ASC, building_2 ASC;
135+
136+
-- Q10: zone statistics for trips starting within each zone.
137+
SELECT
138+
z.z_zonekey,
139+
z.z_name AS pickup_zone,
140+
AVG(t.t_dropofftime - t.t_pickuptime) AS avg_duration,
141+
AVG(t.t_distance) AS avg_distance,
142+
COUNT(t.t_tripkey) AS num_trips
143+
FROM zone z
144+
LEFT JOIN trip t ON ST_Within(t.t_pickuploc, z.z_boundary)
145+
GROUP BY z.z_zonekey, z.z_name
146+
ORDER BY avg_duration DESC NULLS LAST, z.z_zonekey ASC;
147+
148+
-- Q11: count trips that cross between different zones.
149+
SELECT COUNT(*) AS cross_zone_trip_count
150+
FROM trip t
151+
JOIN zone pickup_zone ON ST_Within(t.t_pickuploc, pickup_zone.z_boundary)
152+
JOIN zone dropoff_zone ON ST_Within(t.t_dropoffloc, dropoff_zone.z_boundary)
153+
WHERE pickup_zone.z_zonekey != dropoff_zone.z_zonekey;
154+
155+
-- Q12: five nearest buildings per trip pickup (CROSS JOIN LATERAL, since DuckDB spatial has no ST_KNN).
156+
SELECT
157+
t.t_tripkey,
158+
t.t_pickuploc,
159+
nb.b_buildingkey,
160+
nb.building_name,
161+
nb.distance_to_building
162+
FROM trip t
163+
CROSS JOIN LATERAL (
164+
SELECT
165+
b.b_buildingkey,
166+
b.b_name AS building_name,
167+
ST_Distance(t.t_pickuploc, b.b_boundary) AS distance_to_building
168+
FROM building b
169+
ORDER BY distance_to_building
170+
LIMIT 5
171+
) AS nb
172+
ORDER BY nb.distance_to_building, nb.b_buildingkey;

vortex-bench/src/benchmark.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ use glob::Pattern;
88
use url::Url;
99

1010
use crate::BenchmarkDataset;
11+
use crate::Engine;
1112
use crate::Format;
1213

1314
/// Specification for a table in a benchmark dataset.
@@ -32,6 +33,12 @@ pub trait Benchmark: Send + Sync {
3233
/// Get all available queries for this benchmark
3334
fn queries(&self) -> anyhow::Result<Vec<(usize, String)>>;
3435

36+
/// SQL an `engine` must run before this benchmark's queries (e.g. loading engine
37+
/// extensions). Runners replay these after every (re)open. Default: none.
38+
fn engine_init_sql(&self, _engine: Engine) -> Vec<String> {
39+
Vec::new()
40+
}
41+
3542
/// Generate or prepare base data for the benchmark (typically Parquet format).
3643
/// This is the canonical source data that can be converted to other formats.
3744
/// This should be idempotent - safe to call multiple times.

0 commit comments

Comments
 (0)