Skip to content

Commit d65969b

Browse files
committed
Add OnPair string array encoding with predicate pushdown
Introduces two new crates that integrate the OnPair C++ short-string compression library (gargiulofrancesco/onpair_cpp, arXiv:2508.02280) as a first-class Vortex array. * `encodings/onpair-sys`: build.rs uses cmake-rs to FetchContent the upstream onpair_cpp at configure time, applies a small in-tree patch that swaps `boost::unordered_flat_map` for `std::unordered_map` (plus a `std::hash<std::pair<...>>` specialisation), and links a C-ABI shim (`cxx/onpair_shim.{h,cpp}`) into a static archive. Safe Rust wraps the shim in a `Column` owning handle exposing compress / serialise / decompress and the compressed-domain predicates. * `encodings/onpair`: Vortex `Array` impl mirroring `vortex-fsst`. Stores the serialised OnPair column (`ONPAIR01` magic + dictionary + bit-packed token stream) as a single opaque buffer plus an `uncompressed_lengths` child for cheap canonicalisation. Default preset is "dict-12" (12-bit codes, dictionary capped at 4 096 entries). Wires equals / starts-with / contains pushdown straight through to the C++ scan implementation via `CompareKernel` and `LikeKernel`, so `arr = const` and `arr LIKE 'prefix%' / '%substr%'` evaluate on the compressed stream without decoding rows. * Tests cover roundtrip, nullable canonicalisation, scalar_at, and all three pushdown predicates end-to-end through the C++ stack (7/7 pass locally with cmake + g++). Build requirements: cmake >= 3.21, a C++20 compiler, and network access on the first build (subsequent builds are cached under `$OUT_DIR/onpair-build/_deps`). No Boost dependency at build time. Signed-off-by: Claude <noreply@anthropic.com>
1 parent 7668bef commit d65969b

28 files changed

Lines changed: 2381 additions & 0 deletions

Cargo.lock

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ members = [
4848
"encodings/alp",
4949
"encodings/datetime-parts",
5050
"encodings/fsst",
51+
"encodings/onpair",
52+
"encodings/onpair-sys",
5153
"encodings/pco",
5254
"encodings/sparse",
5355
"encodings/zigzag",
@@ -284,6 +286,8 @@ vortex-fastlanes = { version = "0.1.0", path = "./encodings/fastlanes", default-
284286
vortex-file = { version = "0.1.0", path = "./vortex-file", default-features = false }
285287
vortex-flatbuffers = { version = "0.1.0", path = "./vortex-flatbuffers", default-features = false }
286288
vortex-fsst = { version = "0.1.0", path = "./encodings/fsst", default-features = false }
289+
vortex-onpair = { version = "0.1.0", path = "./encodings/onpair", default-features = false }
290+
vortex-onpair-sys = { version = "0.1.0", path = "./encodings/onpair-sys", default-features = false }
287291
vortex-io = { version = "0.1.0", path = "./vortex-io", default-features = false }
288292
vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = false }
289293
vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false }

encodings/onpair-sys/Cargo.toml

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[package]
2+
name = "vortex-onpair-sys"
3+
authors = { workspace = true }
4+
categories = { workspace = true }
5+
description = "Native FFI bindings to the OnPair short-string compression library"
6+
edition = { workspace = true }
7+
homepage = { workspace = true }
8+
include = [
9+
"build.rs",
10+
"src/**/*.rs",
11+
"cxx/**/*",
12+
"cmake/**/*",
13+
"Cargo.toml",
14+
"README.md",
15+
]
16+
keywords = { workspace = true }
17+
license = { workspace = true }
18+
links = "onpair_shim"
19+
readme = "README.md"
20+
repository = { workspace = true }
21+
rust-version = { workspace = true }
22+
version = { workspace = true }
23+
24+
[lints]
25+
workspace = true
26+
27+
[dependencies]
28+
29+
[build-dependencies]
30+
cmake = "0.1"

encodings/onpair-sys/README.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# vortex-onpair-sys
2+
3+
Low-level FFI bindings to the [OnPair][onpair] short-string compression library.
4+
5+
OnPair is a dictionary-based compressor with **random access** and
6+
**compressed-domain predicate evaluation** (substring, prefix, exact-match),
7+
making it a natural fit for column scans with filter pushdown.
8+
9+
This crate is the unsafe `*-sys` layer used by [`vortex-onpair`][onpair-rs].
10+
End users should depend on `vortex-onpair`, not this crate.
11+
12+
## Build
13+
14+
The build script uses CMake's `FetchContent` to pull
15+
`gargiulofrancesco/onpair_cpp` at the pin recorded in `cmake/onpair_pin.cmake`,
16+
applies a small patch that replaces `boost::unordered_flat_map` with
17+
`std::unordered_map` to avoid the Boost dependency, and compiles both OnPair
18+
and a thin C ABI shim (`cxx/onpair_shim.{h,cpp}`) into a single static archive
19+
that is linked into the Rust crate.
20+
21+
### Requirements
22+
23+
- CMake >= 3.21
24+
- A C++20-capable compiler (GCC >= 11, Clang >= 13, MSVC >= 19.29)
25+
- Network access on first build (for `FetchContent`)
26+
27+
After the first build the source tree is cached under
28+
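`$OUT_DIR/build/_deps` (the `cmake` crate's default build directory), so subsequent builds reuse it. They are fully offline only when the pin is an immutable commit hash; a branch pin is re-fetched whenever CMake re-configures.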
29+
30+
[onpair]: https://arxiv.org/abs/2508.02280
31+
[onpair-rs]: ../onpair

encodings/onpair-sys/build.rs

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
//
4+
// Builds the OnPair C++ library plus a thin C-ABI shim into a static archive
5+
// that gets linked into this crate. The CMake configuration lives in
6+
// `cmake/CMakeLists.txt` and fetches `gargiulofrancesco/onpair_cpp` via
7+
// `FetchContent`.
8+
9+
fn main() {
10+
let cmake_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("cmake");
11+
12+
println!("cargo:rerun-if-changed={}", cmake_dir.display());
13+
println!(
14+
"cargo:rerun-if-changed={}",
15+
std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
16+
.join("cxx")
17+
.display()
18+
);
19+
println!("cargo:rerun-if-env-changed=VORTEX_ONPAIR_FORCE_REBUILD");
20+
21+
let dst = cmake::Config::new(&cmake_dir)
22+
.profile("Release")
23+
.define("CMAKE_POLICY_DEFAULT_CMP0077", "NEW")
24+
.define("CMAKE_POSITION_INDEPENDENT_CODE", "ON")
25+
.define("ONPAIR_BUILD_TESTS", "OFF")
26+
.define("ONPAIR_BUILD_EXAMPLES", "OFF")
27+
.build();
28+
29+
println!("cargo:rustc-link-search=native={}/lib", dst.display());
30+
// The shim depends on onpair; both are static archives.
31+
println!("cargo:rustc-link-lib=static=onpair_shim");
32+
println!("cargo:rustc-link-lib=static=onpair");
33+
34+
// C++ standard library, picked by host platform.
35+
let target = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
36+
match target.as_str() {
37+
"macos" | "ios" => println!("cargo:rustc-link-lib=c++"),
38+
"windows" => {} // MSVC links the runtime automatically.
39+
_ => println!("cargo:rustc-link-lib=stdc++"),
40+
}
41+
}
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
cmake_minimum_required(VERSION 3.21)
project(onpair_shim CXX)

# Project-wide C++ defaults. These are set as variables (rather than per-target
# properties) so that they also initialise the targets created by the fetched
# onpair_cpp project below.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

include(FetchContent)
include("${CMAKE_CURRENT_LIST_DIR}/onpair_pin.cmake")

# Fail early with a readable message if the pin file did not provide the
# repository URL or revision, instead of letting FetchContent fail obscurely.
foreach(pin_var IN ITEMS ONPAIR_CPP_REPO ONPAIR_CPP_TAG)
  if(NOT DEFINED ${pin_var} OR "${${pin_var}}" STREQUAL "")
    message(FATAL_ERROR
      "onpair_shim: ${pin_var} is not set; check cmake/onpair_pin.cmake")
  endif()
endforeach()

# Skip onpair_cpp's own tests/examples and turn off its LTO and native-arch
# switches. Boost is not disabled through an option: the PATCH_COMMAND below
# runs strip_boost.cmake, which rewrites the fetched sources instead.
set(ONPAIR_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(ONPAIR_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(ONPAIR_ENABLE_LTO OFF CACHE BOOL "" FORCE)
set(ONPAIR_NATIVE_ARCH OFF CACHE BOOL "" FORCE)

FetchContent_Declare(
  onpair_cpp
  GIT_REPOSITORY ${ONPAIR_CPP_REPO}
  GIT_TAG ${ONPAIR_CPP_TAG}
  # <SOURCE_DIR> is substituted with the checkout location of onpair_cpp.
  PATCH_COMMAND ${CMAKE_COMMAND}
    -DSRC_DIR=<SOURCE_DIR>
    -P "${CMAKE_CURRENT_LIST_DIR}/strip_boost.cmake"
)
FetchContent_MakeAvailable(onpair_cpp)

# The link and install rules below rely on these two target names; report a
# clear error if the fetched revision no longer provides them.
if(NOT TARGET OnPair::onpair OR NOT TARGET onpair)
  message(FATAL_ERROR
    "onpair_shim: fetched onpair_cpp (${ONPAIR_CPP_TAG}) does not define the "
    "'onpair' / 'OnPair::onpair' targets expected by this build")
endif()

# Thin C-ABI wrapper around OnPair, consumed by the Rust crate.
add_library(onpair_shim STATIC
  "${CMAKE_CURRENT_LIST_DIR}/../cxx/onpair_shim.cpp"
)
target_include_directories(onpair_shim
  PUBLIC "${CMAKE_CURRENT_LIST_DIR}/../cxx"
)
target_link_libraries(onpair_shim PUBLIC OnPair::onpair)
set_target_properties(onpair_shim PROPERTIES POSITION_INDEPENDENT_CODE ON)

# Install both archives into <prefix>/lib, where build.rs points the linker.
install(TARGETS onpair_shim onpair
  ARCHIVE DESTINATION lib
  LIBRARY DESTINATION lib)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Repository and revision of gargiulofrancesco/onpair_cpp consumed by
# FetchContent. Bump both fields when updating.
#
# NOTE(review): "main" looks like a branch name rather than a commit hash or
# release tag. If so, this is not a reproducible pin: every fresh fetch builds
# whatever upstream currently has. TODO: replace it with a full commit SHA.
#
# Both values may be overridden at configure time, e.g.
#   -DONPAIR_CPP_TAG=<commit-sha>
# The defaults below apply only when no override is given.
if(NOT DEFINED ONPAIR_CPP_REPO)
  set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git")
endif()
if(NOT DEFINED ONPAIR_CPP_TAG)
  set(ONPAIR_CPP_TAG "main")
endif()
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# Replaces boost::unordered_flat_{map,set} with std::unordered_{map,set}
# in the fetched onpair_cpp source tree. Idempotent: running it again on an
# already-patched tree changes nothing and does not touch file timestamps.
#
# Invoked by FetchContent_Declare(PATCH_COMMAND ...) in script mode:
#   cmake -DSRC_DIR=<onpair_cpp checkout> -P strip_boost.cmake
#
# We rewrite `#include <boost/unordered/...>` to `#include <unordered_{map,set}>`
# and substitute the qualified types. OnPair only uses the public, std-compatible
# subset of boost::unordered_flat_map (operator[], find, emplace, size, iterators),
# so this is a sound substitution.

if(NOT DEFINED SRC_DIR)
  message(FATAL_ERROR "strip_boost.cmake: SRC_DIR not set")
endif()
if(NOT IS_DIRECTORY "${SRC_DIR}")
  message(FATAL_ERROR "strip_boost.cmake: SRC_DIR is not a directory: ${SRC_DIR}")
endif()

file(GLOB_RECURSE ONPAIR_SOURCES
  "${SRC_DIR}/include/onpair/*.h"
  "${SRC_DIR}/include/onpair/*.hpp"
  "${SRC_DIR}/src/onpair/*.cpp"
  "${SRC_DIR}/src/onpair/*.h"
  "${SRC_DIR}/src/onpair/*.hpp"
)

# Text prepended to files that key an unordered container by std::pair.
# The first line doubles as the "already patched" marker searched for below.
# The preprocessor guard keeps the specialization from being defined twice
# when several patched headers end up in the same translation unit.
set(_PAIR_HASH_BLOCK
  "// strip_boost.cmake: std::hash<std::pair<uint64_t, uint8_t>> for unordered_map keys\n#ifndef ONPAIR_STRIP_BOOST_PAIR_HASH\n#define ONPAIR_STRIP_BOOST_PAIR_HASH\n#include <cstdint>\n#include <functional>\n#include <utility>\nnamespace std {\ntemplate<> struct hash<std::pair<uint64_t, uint8_t>> {\n    size_t operator()(const std::pair<uint64_t, uint8_t>& p) const noexcept {\n        return std::hash<uint64_t>{}(p.first) ^ (std::hash<uint8_t>{}(p.second) << 1);\n    }\n};\n} // namespace std\n#endif // ONPAIR_STRIP_BOOST_PAIR_HASH\n")

foreach(F IN LISTS ONPAIR_SOURCES)
  file(READ "${F}" ORIGINAL)
  set(CONTENT "${ORIGINAL}")

  # Headers: Boost container includes -> standard container includes.
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered/unordered_flat_map\\.hpp>"
    "#include <unordered_map>" CONTENT "${CONTENT}")
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered/unordered_flat_set\\.hpp>"
    "#include <unordered_set>" CONTENT "${CONTENT}")
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered\\.hpp>"
    "#include <unordered_map>\n#include <unordered_set>" CONTENT "${CONTENT}")

  # Types: both the short and the fully nested Boost spellings.
  string(REPLACE "boost::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}")

  # Inject the pair-hash specialization once, at the top of any file that
  # keys an unordered_map/unordered_set by std::pair. std::hash<std::pair<...>>
  # does not exist by default; boost::unordered_flat_map shipped its own.
  string(FIND "${CONTENT}" "unordered_map<std::pair" _has_pair_map_key)
  string(FIND "${CONTENT}" "unordered_set<std::pair" _has_pair_set_key)
  if(NOT _has_pair_map_key EQUAL -1 OR NOT _has_pair_set_key EQUAL -1)
    string(FIND "${CONTENT}" "strip_boost.cmake: std::hash<std::pair" _has_block)
    if(_has_block EQUAL -1)
      set(CONTENT "${_PAIR_HASH_BLOCK}${CONTENT}")
    endif()
  endif()

  # Only rewrite files that actually changed, so re-running the patch step
  # does not bump timestamps and force a rebuild of untouched sources.
  string(COMPARE EQUAL "${ORIGINAL}" "${CONTENT}" _unchanged)
  if(NOT _unchanged)
    file(WRITE "${F}" "${CONTENT}")
  endif()
endforeach()

# Drop find_package(Boost) and Boost link lines from onpair_cpp's CMake files
# so the build doesn't error out looking for Boost on the host.
file(GLOB_RECURSE ONPAIR_CMAKE
  "${SRC_DIR}/CMakeLists.txt"
  "${SRC_DIR}/cmake/*.cmake"
)
foreach(F IN LISTS ONPAIR_CMAKE)
  file(READ "${F}" ORIGINAL)
  set(CONTENT "${ORIGINAL}")
  string(REGEX REPLACE "find_package\\([ \t]*Boost[^)]*\\)" "" CONTENT "${CONTENT}")
  string(REGEX REPLACE "FetchContent_Declare\\([ \t\r\n]*Boost[^)]*\\)" "" CONTENT "${CONTENT}")
  string(REGEX REPLACE "FetchContent_MakeAvailable\\([ \t]*Boost[ \t]*\\)" "" CONTENT "${CONTENT}")
  string(REGEX REPLACE "Boost::[A-Za-z_]+" "" CONTENT "${CONTENT}")

  # Same write-if-changed rule as for the C++ sources above.
  string(COMPARE EQUAL "${ORIGINAL}" "${CONTENT}" _unchanged)
  if(NOT _unchanged)
    file(WRITE "${F}" "${CONTENT}")
  endif()
endforeach()

0 commit comments

Comments
 (0)