Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
d65969b
Add OnPair string array encoding with predicate pushdown
claude May 14, 2026
0fb5929
Add 100k-row smoke test for OnPair encoding
claude May 14, 2026
87f217f
Refactor OnPair to FSST-shape: dict-as-blob, u16 codes child, Rust de…
claude May 14, 2026
70947a8
Wire OnPair as a btrblocks string scheme
claude May 14, 2026
803bc4e
Make OnPair the default string-fragmentation scheme + register globally
claude May 14, 2026
6a9a2a2
Move OnPair default-feature flag up to the vortex umbrella crate
claude May 14, 2026
7ae6718
Round out OnPair CI: widen-on-decode + public-api locks + lints
claude May 14, 2026
83651e4
Add file-write roundtrip skeleton + track Misaligned buffer follow-up
claude May 14, 2026
f0e03a3
OnPair layout v3: all integer arrays as buffers, file roundtrip works
claude May 14, 2026
ce16314
Thorough multi-column / multi-chunk OnPair file round-trip tests
claude May 14, 2026
15b7300
Wire the OnPair roundtrip suite through the full Vortex session
claude May 14, 2026
d229d6e
SIMD-friendly OnPair decode + divan bench
claude May 14, 2026
5432766
Fix Misaligned buffer on read by reordering OnPair buffers
claude May 14, 2026
d9a6c8c
OnPair: FSST-shape ABI — codes / codes_offsets / dict_offsets as slot…
claude May 14, 2026
15569bb
OnPair decoder: combined (offset|length) table + skip canonicalize do…
claude May 14, 2026
adeda19
OnPair decoder: drop redundant dict_offsets widen + tighter hot path
claude May 14, 2026
53c3ea4
OnPair: filter shares dict (TPC-H Q22 SF=10 fix) + token-aware predic…
claude May 14, 2026
18f0cf2
OnPair: drop Like pushdown for now, keep Compare token-aware path
claude May 14, 2026
87011ec
OnPair: fast LIKE on compressed codes (PrefixAutomaton + bloom + filt…
claude May 14, 2026
a1ba67f
OnPair: regression tests for narrowed codes_offsets in filter
claude May 14, 2026
c3bcb2e
OnPair: drop `%contains%` pushdown (fall through to canonical decode)…
claude May 15, 2026
cd71c15
OnPair: FineWeb-shape bench + decode-vs-fallback measurements
claude May 15, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ members = [
"encodings/alp",
"encodings/datetime-parts",
"encodings/fsst",
"encodings/onpair",
"encodings/onpair-sys",
"encodings/pco",
"encodings/sparse",
"encodings/zigzag",
Expand Down Expand Up @@ -289,6 +291,8 @@ vortex-ipc = { version = "0.1.0", path = "./vortex-ipc", default-features = fals
vortex-layout = { version = "0.1.0", path = "./vortex-layout", default-features = false }
vortex-mask = { version = "0.1.0", path = "./vortex-mask", default-features = false }
vortex-metrics = { version = "0.1.0", path = "./vortex-metrics", default-features = false }
vortex-onpair = { version = "0.1.0", path = "./encodings/onpair", default-features = false }
vortex-onpair-sys = { version = "0.1.0", path = "./encodings/onpair-sys", default-features = false }
vortex-pco = { version = "0.1.0", path = "./encodings/pco", default-features = false }
vortex-proto = { version = "0.1.0", path = "./vortex-proto", default-features = false }
vortex-runend = { version = "0.1.0", path = "./encodings/runend", default-features = false }
Expand Down
30 changes: 30 additions & 0 deletions encodings/onpair-sys/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[package]
name = "vortex-onpair-sys"
authors = { workspace = true }
categories = { workspace = true }
description = "Native FFI bindings to the OnPair short-string compression library"
edition = { workspace = true }
homepage = { workspace = true }
include = [
"build.rs",
"src/**/*.rs",
"cxx/**/*",
"cmake/**/*",
"Cargo.toml",
"README.md",
]
keywords = { workspace = true }
license = { workspace = true }
links = "onpair_shim"
readme = "README.md"
repository = { workspace = true }
rust-version = { workspace = true }
version = { workspace = true }

[lints]
workspace = true

[dependencies]

[build-dependencies]
cmake = "0.1"
31 changes: 31 additions & 0 deletions encodings/onpair-sys/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# vortex-onpair-sys

Low-level FFI bindings to the [OnPair][onpair] short-string compression library.

OnPair is a dictionary-based compressor with **random access** and
**compressed-domain predicate evaluation** (substring, prefix, exact-match),
making it a natural fit for column scans with filter pushdown.

This crate is the unsafe `*-sys` layer used by [`vortex-onpair`][onpair-rs].
End users should depend on `vortex-onpair`, not this crate.

## Build

The build script uses CMake's `FetchContent` to pull
`gargiulofrancesco/onpair_cpp` at the pin recorded in `cmake/onpair_pin.cmake`,
applies a small patch (`cmake/strip_boost.cmake`) that replaces
`boost::unordered_flat_{map,set}` with `std::unordered_{map,set}` to avoid the
Boost dependency, and compiles OnPair and a thin C ABI shim
(`cxx/onpair_shim.{h,cpp}`) into two static archives (`libonpair` and
`libonpair_shim`) that are linked into the Rust crate.

### Requirements

- CMake >= 3.21
- A C++20-capable compiler (GCC >= 11, Clang >= 13, MSVC >= 19.29)
- Network access on first build (for `FetchContent`)

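After the first build the fetched source tree is cached in the CMake build
directory under `$OUT_DIR/build/_deps`, so subsequent builds are offline for as
long as that `OUT_DIR` is kept (for example, until `cargo clean`).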

[onpair]: https://arxiv.org/abs/2508.02280
[onpair-rs]: ../onpair
41 changes: 41 additions & 0 deletions encodings/onpair-sys/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors
//
// Builds the OnPair C++ library plus a thin C-ABI shim into a static archive
// that gets linked into this crate. The CMake configuration lives in
// `cmake/CMakeLists.txt` and fetches `gargiulofrancesco/onpair_cpp` via
// `FetchContent`.

/// Returns the name of the C++ runtime library that must be linked for the
/// given compilation *target*, or `None` when the toolchain links it on its own.
///
/// The arguments are the values of `CARGO_CFG_TARGET_OS`, `CARGO_CFG_TARGET_ENV`
/// and `CARGO_CFG_TARGET_VENDOR` respectively.
fn cxx_runtime_lib(
    target_os: &str,
    target_env: &str,
    target_vendor: &str,
) -> Option<&'static str> {
    // MSVC pulls in its C++ runtime automatically. Keying on the target *env*
    // rather than the OS keeps MinGW (`*-windows-gnu`) on the libstdc++ path.
    if target_env == "msvc" {
        return None;
    }
    // Every Apple platform ships libc++.
    if target_vendor == "apple" {
        return Some("c++");
    }
    match target_os {
        "macos" | "ios" | "freebsd" | "openbsd" => Some("c++"),
        "android" => Some("c++_shared"),
        _ => Some("stdc++"),
    }
}

/// Build-script entry point.
///
/// Configures and builds the CMake project in `cmake/` (which fetches OnPair and
/// compiles the C-ABI shim), then emits the Cargo directives needed to link the
/// two resulting static archives and the C++ runtime they depend on.
fn main() {
    let manifest_dir = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    let cmake_dir = manifest_dir.join("cmake");
    let cxx_dir = manifest_dir.join("cxx");

    // Re-run when the CMake project or the shim sources change.
    println!("cargo:rerun-if-changed={}", cmake_dir.display());
    println!("cargo:rerun-if-changed={}", cxx_dir.display());
    println!("cargo:rerun-if-env-changed=VORTEX_ONPAIR_FORCE_REBUILD");

    let dst = cmake::Config::new(&cmake_dir)
        .profile("Release")
        .define("CMAKE_POLICY_DEFAULT_CMP0077", "NEW")
        .define("CMAKE_POSITION_INDEPENDENT_CODE", "ON")
        .define("ONPAIR_BUILD_TESTS", "OFF")
        .define("ONPAIR_BUILD_EXAMPLES", "OFF")
        .build();

    println!(
        "cargo:rustc-link-search=native={}",
        dst.join("lib").display()
    );
    // The shim depends on onpair; both are static archives, so the dependent
    // archive (the shim) must be listed before its dependency.
    println!("cargo:rustc-link-lib=static=onpair_shim");
    println!("cargo:rustc-link-lib=static=onpair");

    // C++ standard library, picked from the compilation target (not the host),
    // so cross-compilation selects the right runtime.
    let target_os = std::env::var("CARGO_CFG_TARGET_OS").unwrap_or_default();
    let target_env = std::env::var("CARGO_CFG_TARGET_ENV").unwrap_or_default();
    let target_vendor = std::env::var("CARGO_CFG_TARGET_VENDOR").unwrap_or_default();
    if let Some(lib) = cxx_runtime_lib(&target_os, &target_env, &target_vendor) {
        println!("cargo:rustc-link-lib={lib}");
    }
}
42 changes: 42 additions & 0 deletions encodings/onpair-sys/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors

cmake_minimum_required(VERSION 3.21)
project(onpair_shim LANGUAGES CXX)

# Project-wide defaults, set before any target exists so that the fetched
# OnPair targets pick them up too: strict C++20 and position-independent code.
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# ONPAIR_CPP_REPO / ONPAIR_CPP_TAG come from the pin file next to this one.
include("${CMAKE_CURRENT_LIST_DIR}/onpair_pin.cmake")
include(FetchContent)

# Switch off onpair_cpp's tests, examples, LTO and native-arch options.
# (The Boost dependency is not controlled by an option; it is removed by the
# PATCH_COMMAND below.)
foreach(onpair_option IN ITEMS
    ONPAIR_BUILD_TESTS
    ONPAIR_BUILD_EXAMPLES
    ONPAIR_ENABLE_LTO
    ONPAIR_NATIVE_ARCH)
  set(${onpair_option} OFF CACHE BOOL "" FORCE)
endforeach()

# Fetch the pinned upstream sources and run strip_boost.cmake over the checkout.
FetchContent_Declare(
  onpair_cpp
  GIT_REPOSITORY ${ONPAIR_CPP_REPO}
  GIT_TAG ${ONPAIR_CPP_TAG}
  PATCH_COMMAND ${CMAKE_COMMAND}
    -DSRC_DIR=<SOURCE_DIR>
    -P "${CMAKE_CURRENT_LIST_DIR}/strip_boost.cmake"
)
FetchContent_MakeAvailable(onpair_cpp)

# The C-ABI shim: one translation unit, linked against OnPair.
add_library(onpair_shim STATIC)
target_sources(onpair_shim
  PRIVATE "${CMAKE_CURRENT_LIST_DIR}/../cxx/onpair_shim.cpp"
)
target_include_directories(onpair_shim
  PUBLIC "${CMAKE_CURRENT_LIST_DIR}/../cxx"
)
target_link_libraries(onpair_shim PUBLIC OnPair::onpair)
set_property(TARGET onpair_shim PROPERTY POSITION_INDEPENDENT_CODE ON)

# Install both archives into <prefix>/lib.
install(
  TARGETS onpair_shim onpair
  ARCHIVE DESTINATION lib
  LIBRARY DESTINATION lib
)
8 changes: 8 additions & 0 deletions encodings/onpair-sys/cmake/onpair_pin.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors
#
# Pin of gargiulofrancesco/onpair_cpp consumed by FetchContent.
# Bump `ONPAIR_CPP_TAG` to a full commit SHA when updating — never use a
# branch name in CI, otherwise builds become non-reproducible.
# Git URL handed to FetchContent_Declare(GIT_REPOSITORY ...) in CMakeLists.txt.
set(ONPAIR_CPP_REPO "https://github.com/gargiulofrancesco/onpair_cpp.git")
# Revision handed to FetchContent_Declare(GIT_TAG ...); keep this a full commit SHA.
set(ONPAIR_CPP_TAG "ae590713515c7bb7893e14a757b484545e5339c3")
70 changes: 70 additions & 0 deletions encodings/onpair-sys/cmake/strip_boost.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors
#
# Replaces boost::unordered_flat_{map,set} with std::unordered_{map,set}
# in the fetched onpair_cpp source tree. Idempotent.
#
# Invoked by FetchContent_Declare(PATCH_COMMAND ...).
#
# We rewrite `#include <boost/unordered/...>` to `#include <unordered_{map,set}>`
# and substitute the qualified types. OnPair only uses the public, std-compatible
# subset of boost::unordered_flat_map (operator[], find, emplace, size, iterators),
# so this is a sound substitution.

# SRC_DIR is passed on the command line (-DSRC_DIR=<path>) and must point at
# the root of the fetched onpair_cpp checkout. Fail loudly on a missing or
# wrong path: the globs below would otherwise match nothing and the script
# would "succeed" without patching anything.
if(NOT DEFINED SRC_DIR)
  message(FATAL_ERROR "strip_boost.cmake: SRC_DIR not set")
endif()
if(NOT IS_DIRECTORY "${SRC_DIR}")
  message(FATAL_ERROR
    "strip_boost.cmake: SRC_DIR is not an existing directory: '${SRC_DIR}'")
endif()

# Headers and translation units of the fetched onpair_cpp tree to rewrite.
file(GLOB_RECURSE ONPAIR_SOURCES
  "${SRC_DIR}/include/onpair/*.h"
  "${SRC_DIR}/include/onpair/*.hpp"
  "${SRC_DIR}/src/onpair/*.cpp"
  "${SRC_DIR}/src/onpair/*.h"
  "${SRC_DIR}/src/onpair/*.hpp"
)
if(NOT ONPAIR_SOURCES)
  # An empty match usually means SRC_DIR is wrong or the upstream layout moved.
  message(WARNING
    "strip_boost.cmake: no OnPair sources found under '${SRC_DIR}'; "
    "no C++ sources were patched")
endif()

# C++ snippet prepended to files that key an unordered_map by std::pair.
# The first line doubles as the "already injected" marker searched for below,
# so it must stay unchanged. The preprocessor guard keeps the specialization
# from being defined twice when several patched headers end up in the same
# translation unit.
set(_PAIR_HASH_BLOCK [=[
// strip_boost.cmake: std::hash<std::pair<uint64_t, uint8_t>> for unordered_map keys
#ifndef VORTEX_ONPAIR_STRIP_BOOST_PAIR_HASH
#define VORTEX_ONPAIR_STRIP_BOOST_PAIR_HASH
#include <cstddef>
#include <cstdint>
#include <functional>
#include <utility>
namespace std {
template<> struct hash<std::pair<uint64_t, uint8_t>> {
  size_t operator()(const std::pair<uint64_t, uint8_t>& p) const noexcept {
    return std::hash<uint64_t>{}(p.first) ^ (std::hash<uint8_t>{}(p.second) << 1);
  }
};
} // namespace std
#endif // VORTEX_ONPAIR_STRIP_BOOST_PAIR_HASH
]=])

foreach(F IN LISTS ONPAIR_SOURCES)
  file(READ "${F}" ORIGINAL)
  set(CONTENT "${ORIGINAL}")

  # Boost includes -> their standard-library counterparts.
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered/unordered_flat_map\\.hpp>"
    "#include <unordered_map>" CONTENT "${CONTENT}")
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered/unordered_flat_set\\.hpp>"
    "#include <unordered_set>" CONTENT "${CONTENT}")
  string(REGEX REPLACE
    "#include[ \t]+<boost/unordered\\.hpp>"
    "#include <unordered_map>\n#include <unordered_set>" CONTENT "${CONTENT}")

  # Qualified Boost container names -> std containers.
  string(REPLACE "boost::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered::unordered_flat_map" "std::unordered_map" CONTENT "${CONTENT}")
  string(REPLACE "boost::unordered::unordered_flat_set" "std::unordered_set" CONTENT "${CONTENT}")

  # Inject the pair-hash specialization once, at the top of any file that
  # keys an unordered_map by std::pair. std::hash<std::pair<...>> does not
  # exist by default; boost::unordered_flat_map shipped its own.
  string(FIND "${CONTENT}" "unordered_map<std::pair" _has_pair_key)
  if(NOT _has_pair_key EQUAL -1)
    string(FIND "${CONTENT}" "strip_boost.cmake: std::hash<std::pair" _has_block)
    if(_has_block EQUAL -1)
      set(CONTENT "${_PAIR_HASH_BLOCK}${CONTENT}")
    endif()
  endif()

  # Only touch the file when something changed, so re-running the patch step
  # does not bump mtimes and force a rebuild of untouched sources.
  if(NOT CONTENT STREQUAL ORIGINAL)
    file(WRITE "${F}" "${CONTENT}")
  endif()
endforeach()

# Drop find_package(Boost), the FetchContent calls for Boost, and Boost::*
# link items from onpair_cpp's CMake files so the build doesn't error out
# looking for Boost on the host.
file(GLOB_RECURSE ONPAIR_CMAKE
  "${SRC_DIR}/CMakeLists.txt"
  "${SRC_DIR}/cmake/*.cmake"
)
foreach(F IN LISTS ONPAIR_CMAKE)
  file(READ "${F}" ORIGINAL)
  string(REGEX REPLACE "find_package\\([ \t]*Boost[^)]*\\)" "" CONTENT "${ORIGINAL}")
  string(REGEX REPLACE "FetchContent_Declare\\([ \t\r\n]*Boost[^)]*\\)" "" CONTENT "${CONTENT}")
  string(REGEX REPLACE "FetchContent_MakeAvailable\\([ \t]*Boost[ \t]*\\)" "" CONTENT "${CONTENT}")
  string(REGEX REPLACE "Boost::[A-Za-z_]+" "" CONTENT "${CONTENT}")
  # Rewrite only when the file actually changed: touching an unmodified
  # CMakeLists.txt would make the build system regenerate for no reason.
  if(NOT CONTENT STREQUAL ORIGINAL)
    file(WRITE "${F}" "${CONTENT}")
  endif()
endforeach()
Loading
Loading