Skip to content

Commit f77e4a5

Browse files
committed
DuckDB FSST Export
Signed-off-by: Nicholas Gates <nick@nickgates.com>
1 parent bbf1e12 commit f77e4a5

3 files changed

Lines changed: 20 additions & 16 deletions

File tree

vortex-duckdb/cpp/include/duckdb_vx/vector.h

Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -55,15 +55,13 @@ void duckdb_vx_sequence_vector(duckdb_vector c_vector, int64_t start, int64_t st
5555

5656
// Finalize a vector as an FSST vector using externally managed compressed string data and a
5757
// symbol table owned by Vortex.
58-
void duckdb_vx_fsst_vector_set(
59-
duckdb_vector ffi_vector,
60-
const uint64_t *symbols,
61-
const uint8_t *symbol_lengths,
62-
idx_t symbol_count,
63-
idx_t string_block_limit,
64-
idx_t count,
65-
duckdb_vx_vector_buffer buffer
66-
);
58+
void duckdb_vx_fsst_vector_set(duckdb_vector ffi_vector,
59+
const uint64_t *symbols,
60+
const uint8_t *symbol_lengths,
61+
idx_t symbol_count,
62+
idx_t string_block_limit,
63+
idx_t count,
64+
duckdb_vx_vector_buffer buffer);
6765

6866
// Returns whether the vector is currently an FSST vector.
6967
bool duckdb_vx_vector_is_fsst(duckdb_vector ffi_vector);

vortex-duckdb/cpp/vector.cpp

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -109,6 +109,10 @@ extern "C" void duckdb_vx_fsst_vector_set(duckdb_vector ffi_vector,
109109
decoder->symbol[i] = symbols[i];
110110
}
111111

112+
// DuckDB can reuse vector instances across chunk exports. Replace the FSST auxiliary buffer
113+
// on each export so heap references to prior compressed byte buffers are dropped instead of
114+
// accumulating for the lifetime of the reused vector.
115+
vector->SetAuxiliary(make_buffer<VectorFSSTStringBuffer>());
112116
FSSTVector::RegisterDecoder(*vector, decoder_buffer, string_block_limit);
113117

114118
if (buffer) {

vortex-duckdb/src/exporter/fsst.rs

Lines changed: 9 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@ use vortex::array::match_each_integer_ptype;
1515
use vortex::buffer::ByteBuffer;
1616
use vortex::encodings::fsst::FSST;
1717
use vortex::encodings::fsst::FSSTArrayExt;
18+
use vortex::error::vortex_err;
1819
use vortex::error::VortexResult;
1920

2021
use crate::cpp;
@@ -102,7 +103,7 @@ impl ColumnExporter for FSSTExporter {
102103
let start: usize = offsets[offset + row].as_();
103104
let end: usize = offsets[offset + row + 1].as_();
104105
let value = &bytes[start..end];
105-
out[row] = PtrString::new(value);
106+
out[row] = PtrString::new(value)?;
106107
}
107108
});
108109

@@ -125,32 +126,33 @@ struct PtrString {
125126
}
126127

127128
impl PtrString {
128-
fn new(value: &[u8]) -> Self {
129-
let length = u32::try_from(value.len()).expect("FSST code length must fit in u32");
129+
fn new(value: &[u8]) -> VortexResult<Self> {
130+
let length = u32::try_from(value.len())
131+
.map_err(|_| vortex_err!("FSST code length {} exceeds u32", value.len()))?;
130132
if value.len() <= 12 {
131133
let mut inlined = [0_i8; 12];
132134
for (dst, src) in inlined.iter_mut().zip(value) {
133135
*dst = *src as i8;
134136
}
135-
Self {
137+
Ok(Self {
136138
value: PtrStringValue {
137139
inlined: PtrStringInlined { length, inlined },
138140
},
139-
}
141+
})
140142
} else {
141143
let mut prefix = [0_i8; 4];
142144
for (dst, src) in prefix.iter_mut().zip(value.iter().copied()) {
143145
*dst = src as i8;
144146
}
145-
Self {
147+
Ok(Self {
146148
value: PtrStringValue {
147149
pointer: PtrStringPointer {
148150
length,
149151
prefix,
150152
ptr: value.as_ptr().cast_mut().cast::<c_char>(),
151153
},
152154
},
153-
}
155+
})
154156
}
155157
}
156158
}

0 commit comments

Comments
 (0)