Skip to content

Commit 15926c3

Browse files
Add FSST URL compare benchmark with ClickBench-style data (#6906)
Benchmarks FSST compare operations on 100k realistic URL strings modeled after ClickBench URL distributions. Tests high (~50%) and low (~1%) selectivity for:
- Eq pushdown (compare in FSST-encoded space)
- Eq canonicalize (decompress, then compare)
- LIKE substring matching (always goes through canonicalization)
claude help
1 parent 7f6856e commit 15926c3

2 files changed

Lines changed: 263 additions & 0 deletions

File tree

encodings/fsst/Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
3939
name = "fsst_compress"
4040
harness = false
4141

42+
[[bench]]
43+
name = "fsst_url_compare"
44+
harness = false
45+
4246
[[bench]]
4347
name = "chunked_dict_fsst_builder"
4448
harness = false
Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use std::sync::LazyLock;
7+
8+
use divan::Bencher;
9+
use rand::Rng;
10+
use rand::SeedableRng;
11+
use rand::rngs::StdRng;
12+
use vortex_array::IntoArray;
13+
use vortex_array::RecursiveCanonical;
14+
use vortex_array::VortexSessionExecute;
15+
use vortex_array::arrays::ConstantArray;
16+
use vortex_array::arrays::VarBinArray;
17+
use vortex_array::builtins::ArrayBuiltins;
18+
use vortex_array::compute::warm_up_vtables;
19+
use vortex_array::dtype::DType;
20+
use vortex_array::dtype::Nullability;
21+
use vortex_array::expr::like;
22+
use vortex_array::expr::lit;
23+
use vortex_array::expr::root;
24+
use vortex_array::scalar::Scalar;
25+
use vortex_array::scalar_fn::fns::operators::Operator;
26+
use vortex_array::session::ArraySession;
27+
use vortex_fsst::fsst_compress;
28+
use vortex_fsst::fsst_train_compressor;
29+
use vortex_session::VortexSession;
30+
31+
/// Benchmark entry point.
///
/// Calls `warm_up_vtables()` first and only then hands control to divan's runner.
fn main() {
    // Warm-up is done once, before `divan::main()` starts executing any benchmark.
    warm_up_vtables();

    divan::main();
}
35+
36+
/// Session shared by all benchmarks in this file.
///
/// Built lazily on first access: an empty `VortexSession` with `ArraySession` attached.
static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
    let session = VortexSession::empty();
    session.with::<ArraySession>()
});
38+
39+
/// Number of URLs in the generated benchmark dataset.
const NUM_URLS: usize = 100_000;

/// A high-frequency domain that appears in ~50% of generated URLs.
const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru";

/// A low-frequency domain that appears in ~1% of generated URLs.
const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com";

/// `(domain, weight)` pairs modeled after real ClickBench URL distributions.
///
/// The weights sum to 1000, so a weight of 10 corresponds to ~1% of generated URLs.
/// The high- and low-match entries reference `HIGH_MATCH_DOMAIN` / `LOW_MATCH_DOMAIN`
/// directly so the table cannot drift away from the domains the benchmarks look for.
const DOMAINS: &[(&str, u32)] = &[
    (HIGH_MATCH_DOMAIN, 500), // ~50%
    ("auto.ru", 150),         // ~15%
    ("komme.ru", 100),        // ~10%
    ("yandex.ru", 80),        // ~8%
    ("mail.ru", 60),          // ~6%
    ("livejournal.com", 40),  // ~4%
    ("vk.com", 30),           // ~3%
    ("avito.ru", 20),         // ~2%
    ("kinopoisk.ru", 10),     // ~1%
    (LOW_MATCH_DOMAIN, 10),   // ~1%
];
60+
61+
/// URL path components; `generate_url_data` picks one uniformly at random per URL.
const PATHS: &[&str] = &[
    "/GameMain.aspx",
    "/index.php",
    "/catalog/item",
    "/search",
    "/news/article",
    "/user/profile",
    "/collection/view",
    "/cars/used/sale",
    "/forum/thread",
    "/photo/album",
    "/video/watch",
    "/download/file",
    "/api/v1/resource",
    "/shop/product",
    "/blog/post",
];
78+
79+
/// Generate 100k realistic ClickBench-style URLs.
80+
fn generate_url_data() -> VarBinArray {
81+
let mut rng = StdRng::seed_from_u64(42);
82+
83+
// Build a weighted domain lookup.
84+
let total_weight: u32 = DOMAINS.iter().map(|(_, w)| w).sum();
85+
let urls: Vec<Option<Box<[u8]>>> = (0..NUM_URLS)
86+
.map(|_| {
87+
let domain_roll = rng.random_range(0..total_weight);
88+
let mut cumulative = 0u32;
89+
let mut domain = DOMAINS[0].0;
90+
for &(d, w) in DOMAINS {
91+
cumulative += w;
92+
if domain_roll < cumulative {
93+
domain = d;
94+
break;
95+
}
96+
}
97+
98+
let path = PATHS[rng.random_range(0..PATHS.len())];
99+
let query_id: u32 = rng.random_range(1..100_000);
100+
let tab: u16 = rng.random_range(1..20);
101+
102+
let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}");
103+
Some(url.into_bytes().into_boxed_slice())
104+
})
105+
.collect();
106+
107+
VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable))
108+
}
109+
110+
/// The generated URL dataset, built by `generate_url_data` on first access and then
/// shared by every benchmark.
static URL_DATA: LazyLock<VarBinArray> = LazyLock::new(generate_url_data);
111+
112+
// ---------------------------------------------------------------------------
113+
// Eq compare benchmarks (FSST pushdown vs canonicalize)
114+
// ---------------------------------------------------------------------------
115+
116+
/// Pick a concrete URL from the dataset that uses the given domain.
117+
fn pick_url_with_domain(data: &VarBinArray, domain: &str) -> String {
118+
use vortex_array::accessor::ArrayAccessor;
119+
data.with_iterator(|iter| {
120+
iter.flatten()
121+
.map(|b| std::str::from_utf8(b).unwrap().to_string())
122+
.find(|u| u.contains(domain))
123+
.unwrap_or_else(|| format!("http://{domain}/missing"))
124+
})
125+
}
126+
127+
#[divan::bench]
128+
fn eq_pushdown_high_match(bencher: Bencher) {
129+
let data = &*URL_DATA;
130+
let compressor = fsst_train_compressor(data);
131+
let fsst_array = fsst_compress(data, &compressor);
132+
let match_url = pick_url_with_domain(data, HIGH_MATCH_DOMAIN);
133+
let constant = ConstantArray::new(Scalar::from(match_url.as_str()), NUM_URLS);
134+
135+
bencher
136+
.with_inputs(|| (&fsst_array, &constant, SESSION.create_execution_ctx()))
137+
.bench_refs(|(fsst_array, constant, ctx)| {
138+
fsst_array
139+
.clone()
140+
.into_array()
141+
.binary(constant.clone().into_array(), Operator::Eq)
142+
.unwrap()
143+
.execute::<RecursiveCanonical>(ctx)
144+
.unwrap()
145+
});
146+
}
147+
148+
#[divan::bench]
149+
fn eq_pushdown_low_match(bencher: Bencher) {
150+
let data = &*URL_DATA;
151+
let compressor = fsst_train_compressor(data);
152+
let fsst_array = fsst_compress(data, &compressor);
153+
let match_url = pick_url_with_domain(data, LOW_MATCH_DOMAIN);
154+
let constant = ConstantArray::new(Scalar::from(match_url.as_str()), NUM_URLS);
155+
156+
bencher
157+
.with_inputs(|| (&fsst_array, &constant, SESSION.create_execution_ctx()))
158+
.bench_refs(|(fsst_array, constant, ctx)| {
159+
fsst_array
160+
.clone()
161+
.into_array()
162+
.binary(constant.clone().into_array(), Operator::Eq)
163+
.unwrap()
164+
.execute::<RecursiveCanonical>(ctx)
165+
.unwrap()
166+
});
167+
}
168+
169+
#[divan::bench]
170+
fn eq_canonicalize_high_match(bencher: Bencher) {
171+
let data = &*URL_DATA;
172+
let compressor = fsst_train_compressor(data);
173+
let fsst_array = fsst_compress(data, &compressor);
174+
let match_url = pick_url_with_domain(data, HIGH_MATCH_DOMAIN);
175+
let constant = ConstantArray::new(Scalar::from(match_url.as_str()), NUM_URLS);
176+
177+
bencher
178+
.with_inputs(|| (&fsst_array, &constant, SESSION.create_execution_ctx()))
179+
.bench_refs(|(fsst_array, constant, ctx)| {
180+
fsst_array
181+
.to_canonical()
182+
.unwrap()
183+
.as_ref()
184+
.to_array()
185+
.binary(constant.clone().into_array(), Operator::Eq)
186+
.unwrap()
187+
.execute::<RecursiveCanonical>(ctx)
188+
.unwrap()
189+
});
190+
}
191+
192+
#[divan::bench]
193+
fn eq_canonicalize_low_match(bencher: Bencher) {
194+
let data = &*URL_DATA;
195+
let compressor = fsst_train_compressor(data);
196+
let fsst_array = fsst_compress(data, &compressor);
197+
let match_url = pick_url_with_domain(data, LOW_MATCH_DOMAIN);
198+
let constant = ConstantArray::new(Scalar::from(match_url.as_str()), NUM_URLS);
199+
200+
bencher
201+
.with_inputs(|| (&fsst_array, &constant, SESSION.create_execution_ctx()))
202+
.bench_refs(|(fsst_array, constant, ctx)| {
203+
fsst_array
204+
.to_canonical()
205+
.unwrap()
206+
.as_ref()
207+
.to_array()
208+
.binary(constant.clone().into_array(), Operator::Eq)
209+
.unwrap()
210+
.execute::<RecursiveCanonical>(ctx)
211+
.unwrap()
212+
});
213+
}
214+
215+
// ---------------------------------------------------------------------------
216+
// LIKE substring benchmarks (always goes through canonicalization for FSST)
217+
// ---------------------------------------------------------------------------
218+
219+
#[divan::bench]
220+
fn like_substr_high_match(bencher: Bencher) {
221+
let data = &*URL_DATA;
222+
let compressor = fsst_train_compressor(data);
223+
let fsst_array = fsst_compress(data, &compressor);
224+
let pattern = format!("%{HIGH_MATCH_DOMAIN}%");
225+
let expr = like(root(), lit(pattern.as_str()));
226+
227+
bencher
228+
.with_inputs(|| (&fsst_array, SESSION.create_execution_ctx()))
229+
.bench_refs(|(fsst_array, ctx)| {
230+
fsst_array
231+
.clone()
232+
.into_array()
233+
.apply(&expr)
234+
.unwrap()
235+
.execute::<RecursiveCanonical>(ctx)
236+
.unwrap()
237+
});
238+
}
239+
240+
#[divan::bench]
241+
fn like_substr_low_match(bencher: Bencher) {
242+
let data = &*URL_DATA;
243+
let compressor = fsst_train_compressor(data);
244+
let fsst_array = fsst_compress(data, &compressor);
245+
let pattern = format!("%{LOW_MATCH_DOMAIN}%");
246+
let expr = like(root(), lit(pattern.as_str()));
247+
248+
bencher
249+
.with_inputs(|| (&fsst_array, SESSION.create_execution_ctx()))
250+
.bench_refs(|(fsst_array, ctx)| {
251+
fsst_array
252+
.clone()
253+
.into_array()
254+
.apply(&expr)
255+
.unwrap()
256+
.execute::<RecursiveCanonical>(ctx)
257+
.unwrap()
258+
});
259+
}

0 commit comments

Comments
 (0)