Skip to content

Commit ad7b09f

Browse files
chore[ci]: clean up the fsst benchmarks and add like (#6934)
Shares the benchmark data-generation code between the FSST benchmarks. Adds LIKE benchmarks for FSST-compressed arrays. --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk> Co-authored-by: Claude <noreply@anthropic.com>
1 parent 252a1a9 commit ad7b09f

5 files changed

Lines changed: 654 additions & 75 deletions

File tree

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@ rust-version = "1.90"
8181
version = "0.1.0"
8282

8383
[workspace.dependencies]
84+
aho-corasick = "1.1.3"
8485
anyhow = "1.0.97"
8586
arbitrary = "1.3.2"
8687
arc-swap = "1.8"
@@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [
121122
"cuda-12050",
122123
] }
123124
custom-labels = "0.4.4"
125+
daachorse = "1.0.0"
124126
dashmap = "6.1.0"
125127
datafusion = { version = "52", default-features = false, features = ["sql"] }
126128
datafusion-catalog = { version = "52" }
@@ -155,6 +157,7 @@ indicatif = "0.18.0"
155157
insta = "1.43"
156158
inventory = "0.3.20"
157159
itertools = "0.14.0"
160+
jetscii = "0.5.3"
158161
jiff = "0.2.0"
159162
kanal = "0.1.1"
160163
lending-iterator = "0.1.7"
@@ -163,6 +166,7 @@ libloading = "0.8"
163166
liblzma = "0.4"
164167
log = { version = "0.4.21" }
165168
loom = { version = "0.7", features = ["checkpoint"] }
169+
memchr = "2.8.0"
166170
memmap2 = "0.9.5"
167171
mimalloc = "0.1.42"
168172
moka = { version = "0.12.10", default-features = false }
@@ -196,6 +200,7 @@ rand = "0.9.0"
196200
rand_distr = "0.5"
197201
ratatui = { version = "0.30", default-features = false }
198202
regex = "1.11.0"
203+
regex-automata = "0.4"
199204
reqwest = { version = "0.12.4", features = [
200205
"charset",
201206
"http2",

encodings/fsst/Cargo.toml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,17 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
3838
[[bench]]
3939
name = "fsst_compress"
4040
harness = false
41+
required-features = ["_test-harness"]
42+
43+
[[bench]]
44+
name = "fsst_contains"
45+
harness = false
46+
required-features = ["_test-harness"]
4147

4248
[[bench]]
4349
name = "fsst_url_compare"
4450
harness = false
51+
required-features = ["_test-harness"]
4552

4653
[[bench]]
4754
name = "chunked_dict_fsst_builder"
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
#![allow(clippy::unwrap_used)]
5+
6+
use std::fmt;
7+
use std::sync::LazyLock;
8+
9+
use divan::Bencher;
10+
use vortex_array::Canonical;
11+
use vortex_array::IntoArray;
12+
use vortex_array::VortexSessionExecute;
13+
use vortex_array::arrays::ConstantArray;
14+
use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
15+
use vortex_array::scalar_fn::fns::like::Like;
16+
use vortex_array::scalar_fn::fns::like::LikeOptions;
17+
use vortex_array::session::ArraySession;
18+
use vortex_fsst::FSSTArray;
19+
use vortex_fsst::test_utils::NUM_STRINGS;
20+
use vortex_fsst::test_utils::make_fsst_clickbench_urls;
21+
use vortex_fsst::test_utils::make_fsst_emails;
22+
use vortex_fsst::test_utils::make_fsst_file_paths;
23+
use vortex_fsst::test_utils::make_fsst_json_strings;
24+
use vortex_fsst::test_utils::make_fsst_log_lines;
25+
use vortex_fsst::test_utils::make_fsst_rare_match;
26+
use vortex_fsst::test_utils::make_fsst_short_urls;
27+
use vortex_session::VortexSession;
28+
29+
/// Benchmark binary entry point: hands control straight to `divan::main()`.
// NOTE(review): presumably this is the `fsst_contains` bench target declared with
// `harness = false` in encodings/fsst/Cargo.toml — the file name is missing here; confirm.
fn main() {
30+
divan::main();
31+
}
32+
33+
/// Lazily built session: an empty `VortexSession` extended with `ArraySession`.
/// `fsst_like` asks it for a fresh execution context on every benchmark iteration.
static SESSION: LazyLock<VortexSession> =
34+
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
35+
36+
/// Size argument handed to every `make_fsst_*` generator in this file; an alias for
/// `NUM_STRINGS` from `vortex_fsst::test_utils`.
const N: usize = NUM_STRINGS;
37+
38+
// One FSST array per dataset. Each static is a `LazyLock`, so the matching
// `make_fsst_*` generator (called with `N`) only runs on first access and the
// result is reused afterwards.
static FSST_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_short_urls(N));
39+
static FSST_CB_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_clickbench_urls(N));
40+
static FSST_LOG_LINES: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_log_lines(N));
41+
static FSST_JSON_STRINGS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_json_strings(N));
42+
static FSST_FILE_PATHS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_file_paths(N));
43+
static FSST_EMAILS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_emails(N));
44+
static FSST_RARE_MATCH: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_rare_match(N));
45+
46+
/// Selects which benchmark dataset to use. Each variant is tied to one of the
/// `FSST_*` statics (see `fsst_array`) and to one LIKE pattern (see `pattern`).
enum Dataset {
47+
Urls,
48+
Cb,
49+
Log,
50+
Json,
51+
Path,
52+
Email,
53+
Rare,
54+
}
55+
56+
/// Writes a short, lowercase label for each variant.
// NOTE(review): presumably this is the text divan shows for each benchmark
// argument in its report — confirm against the divan `args` documentation.
impl fmt::Display for Dataset {
57+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
58+
match self {
59+
Self::Urls => f.write_str("urls"),
60+
Self::Cb => f.write_str("cb"),
61+
Self::Log => f.write_str("log"),
62+
Self::Json => f.write_str("json"),
63+
Self::Path => f.write_str("path"),
64+
Self::Email => f.write_str("email"),
65+
Self::Rare => f.write_str("rare"),
66+
}
67+
}
68+
}
69+
70+
// Per-dataset lookups used by the `fsst_like` benchmark.
impl Dataset {
71+
/// Returns the shared FSST array for this dataset.
///
/// Each arm borrows a `LazyLock<FSSTArray>` static; turning that borrow into
/// `&'static FSSTArray` dereferences the lock, so the array is generated the
/// first time its dataset is requested.
fn fsst_array(&self) -> &'static FSSTArray {
72+
match self {
73+
Self::Urls => &FSST_URLS,
74+
Self::Cb => &FSST_CB_URLS,
75+
Self::Log => &FSST_LOG_LINES,
76+
Self::Json => &FSST_JSON_STRINGS,
77+
Self::Path => &FSST_FILE_PATHS,
78+
Self::Email => &FSST_EMAILS,
79+
Self::Rare => &FSST_RARE_MATCH,
80+
}
81+
}
82+
83+
/// Returns the LIKE pattern benchmarked against this dataset.
///
/// Every pattern is a literal wrapped in `%` on both sides.
// NOTE(review): presumably `%` is the SQL-style "any sequence" wildcard, making
// each of these a substring (contains) match — confirm against `Like`.
fn pattern(&self) -> &'static str {
84+
match self {
85+
Self::Urls => "%google%",
86+
Self::Cb => "%yandex%",
87+
Self::Log => "%Googlebot%",
88+
Self::Json => "%enterprise%",
89+
Self::Path => "%target/release%",
90+
Self::Email => "%gmail%",
91+
Self::Rare => "%xyzzy%",
92+
}
93+
}
94+
}
95+
96+
/// Benchmarks evaluating `Like` over an FSST array, once per `Dataset` variant
/// listed in `args`.
///
/// The input array and the constant pattern array are prepared before
/// `bench_local` is called; the closure passed to `bench_local` clones both,
/// builds the `Like` array and executes it to `Canonical`.
#[divan::bench(args = [
97+
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
98+
Dataset::Path, Dataset::Email, Dataset::Rare,
99+
])]
100+
fn fsst_like(bencher: Bencher, dataset: &Dataset) {
101+
// Setup, done once per dataset and outside the benchmarked closure.
let fsst = dataset.fsst_array();
102+
let len = fsst.len();
103+
let arr = fsst.clone().into_array();
104+
// The pattern is a constant array with the same length as the input array.
let pattern = ConstantArray::new(dataset.pattern(), len).into_array();
105+
bencher.bench_local(|| {
106+
// Both inputs are cloned on every iteration, so the clones are part of
// what the closure does. Either `unwrap` panics if construction or
// execution returns an error.
Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()])
107+
.unwrap()
108+
.into_array()
109+
.execute::<Canonical>(&mut SESSION.create_execution_ctx())
110+
.unwrap()
111+
});
112+
}

encodings/fsst/benches/fsst_url_compare.rs

Lines changed: 5 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,13 @@
66
use std::sync::LazyLock;
77

88
use divan::Bencher;
9-
use rand::Rng;
10-
use rand::SeedableRng;
11-
use rand::rngs::StdRng;
129
use vortex_array::IntoArray;
1310
use vortex_array::RecursiveCanonical;
1411
use vortex_array::VortexSessionExecute;
1512
use vortex_array::arrays::ConstantArray;
1613
use vortex_array::arrays::VarBinArray;
1714
use vortex_array::builtins::ArrayBuiltins;
1815
use vortex_array::compute::warm_up_vtables;
19-
use vortex_array::dtype::DType;
20-
use vortex_array::dtype::Nullability;
2116
use vortex_array::expr::like;
2217
use vortex_array::expr::lit;
2318
use vortex_array::expr::root;
@@ -26,6 +21,10 @@ use vortex_array::scalar_fn::fns::operators::Operator;
2621
use vortex_array::session::ArraySession;
2722
use vortex_fsst::fsst_compress;
2823
use vortex_fsst::fsst_train_compressor;
24+
use vortex_fsst::test_utils::HIGH_MATCH_DOMAIN;
25+
use vortex_fsst::test_utils::LOW_MATCH_DOMAIN;
26+
use vortex_fsst::test_utils::NUM_STRINGS;
27+
use vortex_fsst::test_utils::generate_url_data;
2928
use vortex_session::VortexSession;
3029

3130
fn main() {
@@ -36,76 +35,7 @@ fn main() {
3635
static SESSION: LazyLock<VortexSession> =
3736
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
3837

39-
const NUM_URLS: usize = 100_000;
40-
41-
/// A high-frequency domain that appears in ~50% of generated URLs.
42-
const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru";
43-
44-
/// A low-frequency domain that appears in ~1% of generated URLs.
45-
const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com";
46-
47-
// Domains modeled after real ClickBench URL distributions.
48-
const DOMAINS: &[(&str, u32)] = &[
49-
("smeshariki.ru", 500), // ~50%
50-
("auto.ru", 150), // ~15%
51-
("komme.ru", 100), // ~10%
52-
("yandex.ru", 80), // ~8%
53-
("mail.ru", 60), // ~6%
54-
("livejournal.com", 40), // ~4%
55-
("vk.com", 30), // ~3%
56-
("avito.ru", 20), // ~2%
57-
("kinopoisk.ru", 10), // ~1%
58-
("rare-example-domain.com", 10), // ~1%
59-
];
60-
61-
const PATHS: &[&str] = &[
62-
"/GameMain.aspx",
63-
"/index.php",
64-
"/catalog/item",
65-
"/search",
66-
"/news/article",
67-
"/user/profile",
68-
"/collection/view",
69-
"/cars/used/sale",
70-
"/forum/thread",
71-
"/photo/album",
72-
"/video/watch",
73-
"/download/file",
74-
"/api/v1/resource",
75-
"/shop/product",
76-
"/blog/post",
77-
];
78-
79-
/// Generate 100k realistic ClickBench-style URLs.
80-
fn generate_url_data() -> VarBinArray {
81-
// Fixed seed: the same seed is used on every call.
let mut rng = StdRng::seed_from_u64(42);
82-
83-
// Build a weighted domain lookup.
84-
let total_weight: u32 = DOMAINS.iter().map(|(_, w)| w).sum();
85-
let urls: Vec<Option<Box<[u8]>>> = (0..NUM_URLS)
86-
.map(|_| {
87-
// Weighted pick: roll in `0..total_weight`, then walk the table adding up
// weights until the running total exceeds the roll.
let domain_roll = rng.random_range(0..total_weight);
88-
let mut cumulative = 0u32;
89-
// Starting value only; the loop below overwrites it on the first match.
let mut domain = DOMAINS[0].0;
90-
for &(d, w) in DOMAINS {
91-
cumulative += w;
92-
if domain_roll < cumulative {
93-
domain = d;
94-
break;
95-
}
96-
}
97-
98-
// Path is chosen uniformly; the ranges below are half-open, so the
// upper bounds (100_000 and 20) are never produced.
let path = PATHS[rng.random_range(0..PATHS.len())];
99-
let query_id: u32 = rng.random_range(1..100_000);
100-
let tab: u16 = rng.random_range(1..20);
101-
102-
// `query_id` appears twice: as the `id` query parameter and in the fragment.
let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}");
103-
Some(url.into_bytes().into_boxed_slice())
104-
})
105-
.collect();
106-
107-
// Every element is `Some`, matching the non-nullable UTF-8 dtype used here.
VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable))
108-
}
38+
const NUM_URLS: usize = NUM_STRINGS;
10939

11040
static URL_DATA: LazyLock<VarBinArray> = LazyLock::new(generate_url_data);
11141

0 commit comments

Comments
 (0)