Skip to content

Commit bbf1e12

Browse files
committed
DuckDB FSST Export
Signed-off-by: Nicholas Gates <nick@nickgates.com>
1 parent bf95b6f commit bbf1e12

8 files changed

Lines changed: 504 additions & 4 deletions

File tree

vortex-duckdb/build.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ fn try_build_duckdb(
281281
}
282282
}
283283

284-
fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
284+
fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path, duckdb_fsst_include_dir: &Path) {
285285
let bindings = bindgen::Builder::default()
286286
.header("cpp/include/duckdb_vx.h")
287287
.override_abi(Abi::CUnwind, ".*")
@@ -302,6 +302,7 @@ fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
302302
.rustified_non_exhaustive_enum("DUCKDB_TYPE")
303303
.size_t_is_usize(true)
304304
.clang_arg(format!("-I{}", duckdb_include_dir.display()))
305+
.clang_arg(format!("-I{}", duckdb_fsst_include_dir.display()))
305306
.clang_arg(format!("-I{}", crate_dir.join("cpp/include").display()))
306307
.generate_comments(true)
307308
// Tell cargo to invalidate the built crate whenever any of the
@@ -328,12 +329,13 @@ fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) {
328329
}
329330
}
330331

331-
fn cpp(duckdb_include_dir: &Path) {
332+
fn cpp(duckdb_include_dir: &Path, duckdb_fsst_include_dir: &Path) {
332333
cc::Build::new()
333334
.std("c++20")
334335
.flags(["-Wall", "-Wextra", "-Wpedantic"])
335336
.cpp(true)
336337
.include(duckdb_include_dir)
338+
.include(duckdb_fsst_include_dir)
337339
.include("cpp/include")
338340
.files(SOURCE_FILES)
339341
.compile("vortex-duckdb-extras");
@@ -449,7 +451,8 @@ fn main() {
449451
};
450452

451453
let duckdb_include_dir = inner_dir.join("src").join("include");
452-
c2rust(&crate_dir, &duckdb_include_dir);
453-
cpp(&duckdb_include_dir);
454+
let duckdb_fsst_include_dir = inner_dir.join("third_party").join("fsst");
455+
c2rust(&crate_dir, &duckdb_include_dir, &duckdb_fsst_include_dir);
456+
cpp(&duckdb_include_dir, &duckdb_fsst_include_dir);
454457
rust2c(&crate_dir);
455458
}

vortex-duckdb/cpp/include/duckdb_vx/vector.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,21 @@ void duckdb_vx_vector_set_data_ptr(duckdb_vector ffi_vector, void *ptr);
5353
// Converts a duckdb flat vector into a Sequence vector.
5454
void duckdb_vx_sequence_vector(duckdb_vector c_vector, int64_t start, int64_t step, idx_t capacity);
5555

56+
// Finalize a vector as an FSST vector using externally managed compressed string data and a
57+
// symbol table owned by Vortex.
58+
void duckdb_vx_fsst_vector_set(
59+
duckdb_vector ffi_vector,
60+
const uint64_t *symbols,
61+
const uint8_t *symbol_lengths,
62+
idx_t symbol_count,
63+
idx_t string_block_limit,
64+
idx_t count,
65+
duckdb_vx_vector_buffer buffer
66+
);
67+
68+
// Returns whether the vector is currently an FSST vector.
69+
bool duckdb_vx_vector_is_fsst(duckdb_vector ffi_vector);
70+
5671
void duckdb_vector_flatten(duckdb_vector vector, unsigned long len);
5772

5873
const char *duckdb_vector_to_string(duckdb_vector vector, unsigned long len, duckdb_vx_error *err);

vortex-duckdb/cpp/vector.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55

66
DUCKDB_INCLUDES_BEGIN
77
#include "duckdb/common/vector.hpp"
8+
#include "duckdb/common/fsst.hpp"
89
#include "duckdb/common/types/value.hpp"
10+
#include "duckdb/common/types/vector_buffer.hpp"
911
#include "duckdb/common/types/vector.hpp"
1012
DUCKDB_INCLUDES_END
1113

@@ -60,6 +62,8 @@ class DataVector : public Vector {
6062

6163
} // namespace vortex
6264

65+
static constexpr uint64_t VX_FSST_CORRUPT = 32774747032022883ULL;
66+
6367
extern "C" void duckdb_vx_string_vector_add_vector_data_buffer(duckdb_vector ffi_vector,
6468
duckdb_vx_vector_buffer buffer) {
6569
auto vector = reinterpret_cast<Vector *>(ffi_vector);
@@ -81,6 +85,49 @@ extern "C" void duckdb_vx_vector_set_data_ptr(duckdb_vector ffi_vector, void *pt
8185
dvector->SetDataPtr((data_ptr_t)ptr);
8286
}
8387

88+
extern "C" void duckdb_vx_fsst_vector_set(duckdb_vector ffi_vector,
89+
const uint64_t *symbols,
90+
const uint8_t *symbol_lengths,
91+
idx_t symbol_count,
92+
idx_t string_block_limit,
93+
idx_t count,
94+
duckdb_vx_vector_buffer buffer) {
95+
auto vector = reinterpret_cast<Vector *>(ffi_vector);
96+
D_ASSERT(vector);
97+
D_ASSERT(symbol_count <= 255);
98+
99+
buffer_ptr<void> decoder_buffer = make_buffer<duckdb_fsst_decoder_t>();
100+
auto *decoder = reinterpret_cast<duckdb_fsst_decoder_t *>(decoder_buffer.get());
101+
decoder->version = 0;
102+
decoder->zeroTerminated = 0;
103+
for (idx_t i = 0; i < 255; i++) {
104+
decoder->len[i] = 8;
105+
decoder->symbol[i] = VX_FSST_CORRUPT;
106+
}
107+
for (idx_t i = 0; i < symbol_count; i++) {
108+
decoder->len[i] = symbol_lengths[i];
109+
decoder->symbol[i] = symbols[i];
110+
}
111+
112+
FSSTVector::RegisterDecoder(*vector, decoder_buffer, string_block_limit);
113+
114+
if (buffer) {
115+
auto data = reinterpret_cast<shared_ptr<vortex::ExternalVectorBuffer> *>(buffer);
116+
auto aux = vector->GetAuxiliary();
117+
D_ASSERT(aux);
118+
D_ASSERT(aux->GetBufferType() == VectorBufferType::FSST_BUFFER);
119+
aux->Cast<VectorFSSTStringBuffer>().AddHeapReference(*data);
120+
}
121+
122+
FSSTVector::SetCount(*vector, count);
123+
vector->SetVectorType(VectorType::FSST_VECTOR);
124+
}
125+
126+
extern "C" bool duckdb_vx_vector_is_fsst(duckdb_vector ffi_vector) {
127+
auto vector = reinterpret_cast<Vector *>(ffi_vector);
128+
return vector && vector->GetVectorType() == VectorType::FSST_VECTOR;
129+
}
130+
84131
extern "C" duckdb_value duckdb_vx_vector_get_value(duckdb_vector ffi_vector, idx_t index) {
85132
auto vector = reinterpret_cast<Vector *>(ffi_vector);
86133
auto value = duckdb::make_uniq<Value>(vector->GetValue(index));

vortex-duckdb/src/duckdb/vector.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,31 @@ impl VectorRef {
101101
unsafe { cpp::duckdb_vx_sequence_vector(self.as_ptr(), start, stop, capacity) }
102102
}
103103

104+
pub fn set_fsst(
105+
&mut self,
106+
symbols: &[u64],
107+
symbol_lengths: &[u8],
108+
string_block_limit: usize,
109+
count: usize,
110+
buffer: &VectorBufferRef,
111+
) {
112+
unsafe {
113+
cpp::duckdb_vx_fsst_vector_set(
114+
self.as_ptr(),
115+
symbols.as_ptr(),
116+
symbol_lengths.as_ptr(),
117+
symbols.len() as idx_t,
118+
string_block_limit as idx_t,
119+
count as idx_t,
120+
buffer.as_ptr(),
121+
)
122+
}
123+
}
124+
125+
pub fn is_fsst(&self) -> bool {
126+
unsafe { cpp::duckdb_vx_vector_is_fsst(self.as_ptr()) }
127+
}
128+
104129
/// Converts a vector into a flat uncompressed vector vortex call this `canonicalize`.
105130
pub fn flatten(&self, length: u64) {
106131
unsafe { cpp::duckdb_vector_flatten(self.as_ptr(), length) }

vortex-duckdb/src/e2e_test/vortex_scan_test.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,11 @@ use vortex::array::arrays::VarBinArray;
3030
use vortex::array::arrays::VarBinViewArray;
3131
use vortex::array::validity::Validity;
3232
use vortex::buffer::buffer;
33+
use vortex::dtype::DType;
3334
use vortex::dtype::Nullability;
3435
use vortex::dtype::PType;
36+
use vortex::encodings::fsst::fsst_compress;
37+
use vortex::encodings::fsst::fsst_train_compressor;
3538
use vortex::file::WriteOptionsSessionExt;
3639
use vortex::io::runtime::BlockingRuntime;
3740
use vortex::scalar::PValue;
@@ -202,6 +205,29 @@ fn test_vortex_scan_strings() {
202205
assert_eq!(result, "Hello,Hi,Hey");
203206
}
204207

208+
#[test]
209+
fn test_vortex_scan_fsst_strings() {
210+
let file = RUNTIME.block_on(async {
211+
let strings = VarBinArray::from_iter(
212+
[Some("Hello"), Some("Hi"), Some("Hey")],
213+
DType::Utf8(Nullability::Nullable),
214+
);
215+
let compressor = fsst_train_compressor(&strings);
216+
let strings = fsst_compress(
217+
&strings,
218+
strings.len(),
219+
&DType::Utf8(Nullability::Nullable),
220+
&compressor,
221+
);
222+
write_single_column_vortex_file("strings", strings).await
223+
});
224+
225+
let result: String =
226+
scan_vortex_file_single_row(file, "SELECT string_agg(strings, ',') FROM ?", 0);
227+
228+
assert_eq!(result, "Hello,Hi,Hey");
229+
}
230+
205231
#[test]
206232
fn test_vortex_scan_strings_contains() {
207233
let file = RUNTIME.block_on(async {

vortex-duckdb/src/exporter/fixed_size_list.rs

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,8 +101,13 @@ impl ColumnExporter for FixedSizeListExporter {
101101
mod tests {
102102
use vortex::array::IntoArray as _;
103103
use vortex::array::VortexSessionExecute;
104+
use vortex::array::arrays::VarBinArray;
104105
use vortex::array::validity::Validity;
105106
use vortex::buffer::buffer;
107+
use vortex::dtype::DType;
108+
use vortex::dtype::Nullability;
109+
use vortex::encodings::fsst::fsst_compress;
110+
use vortex::encodings::fsst::fsst_train_compressor;
106111
use vortex::error::VortexExpect;
107112

108113
use super::*;
@@ -196,6 +201,39 @@ mod tests {
196201
verify_array_elements(vector, &[1, 2, 3, 4, 5, 6], 2, 3);
197202
}
198203

204+
#[test]
205+
fn test_export_fsst_children_fall_back_to_flat_strings() {
206+
let strings = VarBinArray::from_iter(
207+
[Some("alpha"), Some("beta"), Some("gamma"), Some("delta")],
208+
DType::Utf8(Nullability::Nullable),
209+
);
210+
let compressor = fsst_train_compressor(&strings);
211+
let fsst = fsst_compress(
212+
&strings,
213+
strings.len(),
214+
&DType::Utf8(Nullability::Nullable),
215+
&compressor,
216+
);
217+
let fsl = FixedSizeListArray::new(fsst.into_array(), 2, Validity::AllValid, 2);
218+
219+
let array_type = LogicalType::array_type(LogicalType::varchar(), 2)
220+
.vortex_expect("array type should be valid");
221+
let mut chunk = DataChunk::new([array_type]);
222+
let mut ctx = SESSION.create_execution_ctx();
223+
224+
new_exporter(fsl, &ConversionCache::default(), &mut ctx)
225+
.unwrap()
226+
.export(0, 2, chunk.get_vector_mut(0), &mut ctx)
227+
.unwrap();
228+
chunk.set_len(2);
229+
230+
let child = chunk.get_vector(0).array_vector_get_child();
231+
assert!(
232+
!child.is_fsst(),
233+
"nested exports should flatten FSST children"
234+
);
235+
}
236+
199237
#[test]
200238
fn test_export_fixed_size_list_with_nulls() {
201239
// Create a FixedSizeListArray with 4 lists of size 3, with 2nd list null.

0 commit comments

Comments
 (0)