Skip to content

Commit cf1c050

Browse files
perf[fsst]: like pushdown using a dfa (#6935)
FSST `LIKE` execution without decompression. This uses a DFA built over the symbol table and the `LIKE` pattern. Once this approach is proven out, we could think about moving it into fsst-rs. --------- Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
1 parent d260d42 commit cf1c050

20 files changed

Lines changed: 1684 additions & 14 deletions

File tree

.github/workflows/fuzz.yml

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,42 @@ jobs:
9999
gh_token: ${{ secrets.GITHUB_TOKEN }}
100100
incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }}
101101

102+
# ============================================================================
103+
# FSST LIKE Fuzzer
104+
# ============================================================================
105+
fsst_like_fuzz:
106+
name: "FSST LIKE Fuzz"
107+
uses: ./.github/workflows/run-fuzzer.yml
108+
with:
109+
fuzz_target: fsst_like
110+
jobs: 16
111+
secrets:
112+
R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
113+
R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}
114+
115+
report-fsst-like-fuzz-failures:
116+
name: "Report FSST LIKE Fuzz Failures"
117+
needs: fsst_like_fuzz
118+
if: always() && needs.fsst_like_fuzz.outputs.crashes_found == 'true'
119+
permissions:
120+
issues: write
121+
contents: read
122+
id-token: write
123+
pull-requests: read
124+
uses: ./.github/workflows/report-fuzz-crash.yml
125+
with:
126+
fuzz_target: fsst_like
127+
crash_file: ${{ needs.fsst_like_fuzz.outputs.first_crash_name }}
128+
artifact_url: ${{ needs.fsst_like_fuzz.outputs.artifact_url }}
129+
artifact_name: fsst_like-crash-artifacts
130+
logs_artifact_name: fsst_like-logs
131+
branch: ${{ github.ref_name }}
132+
commit: ${{ github.sha }}
133+
secrets:
134+
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
135+
gh_token: ${{ secrets.GITHUB_TOKEN }}
136+
incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }}
137+
102138
# ============================================================================
103139
# Compress Roundtrip Fuzzer
104140
# ============================================================================

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

_typos.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,13 @@
22
extend-ignore-identifiers-re = ["ffor", "FFOR", "FoR", "typ", "ratatui"]
33
# We support a few common special comments to tell the checker to ignore sections of code
44
extend-ignore-re = [
5-
"(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line
6-
"(?Rm)^.*(#|//)\\s*spellchecker:disable-line$", # Ignore line that ends with this hint
5+
"(#|//)\\s*spellchecker:ignore-next-line\\n.*", # Ignore the next line
6+
"(?Rm)^.*(#|//)\\s*spellchecker:disable-line$", # Ignore line that ends with this hint
77
"(?s)(#|//)\\s*spellchecker:off.*?\\n\\s*(#|//)\\s*spellchecker:on", # Ignore block between hints
88
]
99

1010
[files]
11-
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**"]
11+
extend-exclude = ["/vortex-bench/**", "/docs/references.bib", "benchmarks/**", "vortex-sqllogictest/slt/**", "encodings/fsst/src/dfa/tests.rs", "encodings/fsst/src/dfa/flat_contains.rs"]
1212

1313
[type.py]
1414
extend-ignore-identifiers-re = [

encodings/fsst/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ harness = false
4141
required-features = ["_test-harness"]
4242

4343
[[bench]]
44-
name = "fsst_contains"
44+
name = "fsst_like"
4545
harness = false
4646
required-features = ["_test-harness"]
4747

encodings/fsst/README.md

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,23 @@
22

33
A Vortex Encoding for Binary and Utf8 data that utilizes the [Fast Static Symbol Table](https://github.com/spiraldb/fsst)
44
compression algorithm.
5+
6+
## LIKE Pushdown
7+
8+
The FSST encoding has a specialized LIKE fast path for a narrow subset of
9+
patterns:
10+
11+
- `prefix%`
12+
- `%needle%`
13+
14+
Unsupported shapes, such as patterns containing `_`, `%suffix`, and patterns with interior
15+
wildcards, fall back to ordinary decompression-based LIKE evaluation.
16+
17+
There are also two implementation limits on the pushdown path, both measured in
18+
pattern bytes:
19+
20+
- `prefix%` supports up to 253 bytes.
21+
- `%needle%` supports up to 254 bytes.
22+
23+
Patterns beyond those limits are still evaluated correctly, but they run via
24+
the fallback path instead of the DFA matcher.
Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,19 @@ impl Dataset {
8080
}
8181
}
8282

83-
fn pattern(&self) -> &'static str {
83+
fn prefix_pattern(&self) -> &'static str {
84+
match self {
85+
Self::Urls => "https%",
86+
Self::Cb => "https://www.%",
87+
Self::Log => "192.168%",
88+
Self::Json => r#"{"id%"#,
89+
Self::Path => "/home%",
90+
Self::Email => "john%",
91+
Self::Rare => "xyz%",
92+
}
93+
}
94+
95+
fn contains_pattern(&self) -> &'static str {
8496
match self {
8597
Self::Urls => "%google%",
8698
Self::Cb => "%yandex%",
@@ -93,15 +105,10 @@ impl Dataset {
93105
}
94106
}
95107

96-
#[divan::bench(args = [
97-
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
98-
Dataset::Path, Dataset::Email, Dataset::Rare,
99-
])]
100-
fn fsst_like(bencher: Bencher, dataset: &Dataset) {
101-
let fsst = dataset.fsst_array();
108+
fn bench_like(bencher: Bencher, fsst: &FSSTArray, pattern: &str) {
102109
let len = fsst.len();
103110
let arr = fsst.clone().into_array();
104-
let pattern = ConstantArray::new(dataset.pattern(), len).into_array();
111+
let pattern = ConstantArray::new(pattern, len).into_array();
105112
bencher.bench_local(|| {
106113
Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()])
107114
.unwrap()
@@ -110,3 +117,19 @@ fn fsst_like(bencher: Bencher, dataset: &Dataset) {
110117
.unwrap()
111118
});
112119
}
120+
121+
#[divan::bench(args = [
122+
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
123+
Dataset::Path, Dataset::Email, Dataset::Rare,
124+
])]
125+
fn fsst_prefix(bencher: Bencher, dataset: &Dataset) {
126+
bench_like(bencher, dataset.fsst_array(), dataset.prefix_pattern());
127+
}
128+
129+
#[divan::bench(args = [
130+
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
131+
Dataset::Path, Dataset::Email, Dataset::Rare,
132+
])]
133+
fn fsst_contains(bencher: Bencher, dataset: &Dataset) {
134+
bench_like(bencher, dataset.fsst_array(), dataset.contains_pattern());
135+
}

encodings/fsst/public-api.lock

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_fsst::FSS
3030

3131
pub fn vortex_fsst::FSST::cast(array: &vortex_fsst::FSSTArray, dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
3232

33+
impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_fsst::FSST
34+
35+
pub fn vortex_fsst::FSST::like(array: &vortex_fsst::FSSTArray, pattern: &vortex_array::array::ArrayRef, options: vortex_array::scalar_fn::fns::like::LikeOptions, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>
36+
3337
impl vortex_array::vtable::VTable for vortex_fsst::FSST
3438

3539
pub type vortex_fsst::FSST::Array = vortex_fsst::FSSTArray

0 commit comments

Comments
 (0)