@@ -32,9 +32,9 @@ capabilities to the host:
3232## Motivation
3333
3434Vortex is a Rust project, and thus integration with non-Rust clients requires a
35- decent amount of scaffolding[ ^ 3 ] which lowers down the probability of such
35+ decent amount of scaffolding[ ^ 3 ] which lowers the probability of such
3636integration. As reading files is most important feature for a file format,
37- exposing it to C-compatible uses lowers down the integration costs.
37+ exposing it to C-compatible users lowers integration costs.
3838
3939Additional motivation is our DuckDB integration which requires C++-to-Rust
4040and Rust-to-C++ bridging. Replacing it with direct calls to C API would make
@@ -49,9 +49,9 @@ reasons:
4949- Lack of support for exporting to ` ArrowSchema ` and ` ArrowArrayStream ` .
5050- Lack of introspection over produced ` vx_array ` objects.
5151
52- Vortex also has a C++ API in vortex-cxx which wraps Rust code. Its future is
53- outside of scope of this proposal but it's expected to wrap C scan API,
54- removing dependency on cxx.rs for vortex-cxx.
52+ > Vortex also has a C++ API in vortex-cxx which wraps Rust code. Its future is
53+ > outside of scope of this proposal but it's expected to wrap C scan API,
54+ > removing dependency on cxx.rs for vortex-cxx.
5555
5656## Motivation for layered API
5757
@@ -70,10 +70,9 @@ no way to suspend to host on IO operations. Exposing only the middle layer
7070requires sync hosts to manage their own event loop which is tedious.
7171
7272Another benefit of splitting API into layers is that it can be discussed and
73- implemented separately.
74-
75- As this is a big change, this PR will from now on focus just on the high level
76- scan API. Other levels may be implemented later on demand.
73+ implemented separately. As this is a big change, this PR will from now on focus
74+ just on the high level scan API. Other levels may be implemented later on
75+ demand.
7776
7877## Overview
7978
@@ -94,6 +93,15 @@ scan API. Other levels may be implemented later on demand.
9493 └─►Array (thread unsafe)
9594```
9695
96+ ## Event loop
97+
98+ Vortex manages the event loop by using a CurrentThreadRuntime runtime handle
99+ which in turn has a ` smol::Executor<'static> ` without a background thread pool.
100+ This means the event loop will run on host's thread pool scheduler (in other
101+ terms, use the calling thread) implicitly. In case of multiple threads, the
102+ underlying executor is shared between them, and drives from from all of these
103+ threads. The executor is initialized once on loading FFI library.
104+
97105## DataSource
98106
99107A DataSource is a reference to multiple possibly remote files. When created, it
@@ -105,38 +113,41 @@ scans from a DataSource.
105113// bindgen
106114typedef struct vx_data_source vx_data_source;
107115typedef struct vx_file_handle vx_file_handle;
108- typedef struct vx_cache_handle *vx_cache_handle;
109116
110- typedef void (* vx_list_callback)(void* userdata, const char* name, int is_dir);
111- typedef void (* vx_glob_callback)(void* userdata, const char* file);
117+ typedef void * vx_fs_userdata;
118+ typedef void (* vx_list_callback)(vx_fs_userdata userdata, const char* name,
119+ int is_dir);
120+ typedef void (* vx_glob_callback)(vx_fs_userdata userdata, const char* file);
112121
113122typedef struct vx_data_source_options {
123+ const char* files;
124+
114125 // (1) Filesystem customization
115- bool ( * fs_use_vortex)(const char * schema, const char * path) ;
116- void (* fs_set_userdata)(void * userdata );
126+ vx_fs_userdata fs_userdata ;
127+ void (*fs_free_userdata)(vx_fs_userdata fs_userdata );
117128
118129 // Called after glob expansion, single-file mode
119- vx_file_handle (*fs_open)(void* userdata, const char* path,
130+ vx_file_handle (*fs_open)(vx_fs_userdata userdata, const char* path,
120131 vx_error** err);
121- vx_file_handle (*fs_create)(void* userdata, const char* path,
132+ vx_file_handle (*fs_create)(vx_fs_userdata userdata, const char* path,
122133 vx_error** err);
123134
124135 // non-recursive, callback is invoked with each path
125- void fs_list(void* userdata, const char* path, vx_list_callback cb ,
126- vx_error** err);
136+ void (*fs_list)(vx_fs_userdata userdata, const char* path,
137+ vx_list_callback cb, vx_error** err);
127138
128- void fs_close(vx_file_handle handle);
129- uint64_t fs_size(vx_file_handle handle, vx_error * err);
139+ void (* fs_close) (vx_file_handle handle);
140+ uint64_t (* fs_size) (vx_file_handle handle, vx_error** err);
130141
131142 // positional read, doesn't change file open cursor
132- void fs_read(vx_file_handle handle, uint64_t offset, size_t len,
143+ void (* fs_read) (vx_file_handle handle, uint64_t offset, uint64_t len,
133144 uint8_t *buffer, vx_error** err);
134145
135146 // not needed for scanning but makes FS API complete
136- void fs_write(vx_file_handle handle, uint64_t offset, size_t len,
147+ void (* fs_write) (vx_file_handle handle, uint64_t offset, uint64_t len,
137148 uint8_t *buffer, vx_error** err);
138149
139- void fs_sync(vx_file_handle handle, vx_error** err);
150+ void (* fs_sync) (vx_file_handle handle, vx_error** err);
140151
141152 // (2) Globbing customization
142153 void (*glob)(const char* glob, vx_glob_callback cb, vx_error** err);
@@ -146,12 +157,12 @@ typedef struct vx_data_source_options {
146157// be freed by the caller using release callback. If err is populated, out
147158// is not set.
148159void vx_dtype_to_arrow_schema(const vx_dtype* dtype, ArrowSchema* out,
149- vx_error* err);
160+ vx_error** err);
150161
151162/// Create a new owned datasource which must be freed by the caller.
152163const vx_data_source *
153164vx_data_source_new(const vx_session * session,
154- const vx_data_source_options * opts, vx_error_out err);
165+ const vx_data_source_options * opts, vx_error ** err);
155166
156167// datasource is Arc'd inside so a clone creates another reference rather
157168// than cloning the state fully.
@@ -176,19 +187,32 @@ void vx_data_source_get_row_count(const vx_data_source *ds,
176187void vx_data_source_free(const vx_data_source * ds);
177188```
178189
190+ > const vx_data_source* may be misleading as it's actually owned, but that's
191+ > limitation of arc_dyn_wrapper! as for now. This would likely be changed soon
192+
1791931. Open local or remote file. Allow using Vortex's filesystem or host's
180- filesystem e.g. DuckDB fs. Allow partial customization e.g. DuckDB fs for
181- local reads, but Vortex fs for remote reads. Host filesystem customization
194+ filesystem e.g. DuckDB fs. Host filesystem customization
182195 point has the benefit of not duplicating credentials e.g. S3 access key
183196 between host and Vortex. Local implementation may be more performant.
197+ User must either implement all `fs_*` callback and set `fs_userdata`, or set
198+ all callbacks in (1) and `fs_userdata` to NULL.
199+
1842002. Open single file or multiple files. Hosts may have their own glob expansion
185201 [^4] which does HTTP IO.
186202
187203When both customization points are implemented, Vortex offloads all IO
188204to a query engine.
189205
190- Memory allocation customization is out of scope of this proposal, but it's
191- possible for Vortex to expose bringing allocator from outside for buffers.
206+ > Memory allocation customization is out of scope of this proposal, but it's
207+ > possible for Vortex to expose bringing allocator from outside for buffers.
208+
209+ To allow stateful filesystem operations, a DataSource allows providing a custom
210+ filesystem userdata with associated deleter. It may be set per DataSource.
211+ Filesystem customizations do syncrohonous IO without batching. Clients who want
212+ control over IO should use the middle layer or lower.
213+ As filesystem callbacks are synchronous, middle layer would use different option
214+ structs, and there would be no way to re-use DataSource or other structures
215+ between layers.
192216
193217## Scan
194218
@@ -207,13 +231,14 @@ either and this is a low priority task.
207231Scan options:
208232
209233```c
234+ // roaring bitmap include and exclude won't be supported as for now because of
235+ // exposure complexity
210236typedef enum {
211237 VX_S_INCLUDE_ALL = 0,
212238 VX_S_INCLUDE_RANGE = 1,
213239 VX_S_EXCLUDE_RANGE = 2,
214240} vx_scan_selection_include;
215241
216- // Roaring bitmaps won't be supported.
217242// If selection is VX_S_INCLUDE_ALL, idx is not read. idx is copied
218243// by host on scan invocation and can be freed after first partition is
219244// requested
@@ -226,12 +251,13 @@ typedef struct {
226251typedef struct vx_scan_options {
227252 const vx_expression *projection; // NULL means "return all columns"
228253 const vx_expression *filter; // NULL means "no filter"
229- uint64_t row_range_begin; // Inclusive, [0; 0) means "all rows"
254+ // special case: row_range_begin == 0 && row_range_end == 0 means "all rows"
255+ uint64_t row_range_begin; // Inclusive
230256 uint64_t row_range_end; // Exclusive
231257 vx_scan_selection selection;
232258 uint64_t limit; // 0 means no limit
233259 uint64_t max_threads; // 0 means no limit.
234- bool ordered; // 0 means unordered scan
260+ int ordered; // 0 means unordered scan
235261} vx_scan_options;
236262```
237263
@@ -249,7 +275,7 @@ typedef enum {
249275 VX_ESTIMATE_INEXACT = 2,
250276} vx_estimate_boundary;
251277
252- // If type is VX_P_ESTIMATE_UNKNOWN , estimate is not populated.
278+ // If type is VX_ESTIMATE_UNKNOWN , estimate is not populated.
253279typedef struct {
254280 uint64_t estimate;
255281 vx_estimate_boundary type;
@@ -259,10 +285,17 @@ typedef struct {
259285// estimate->estimate to distribute work.
260286// options and estimate fields may be NUL.
261287// Calling vx_data_source_scan doesn't do IO unless vx_scan_next is called.
262- // vx_scan can outlive vx_data_source.
288+ //
289+ // As an implementation detail, a scan has a reference to a data source thus
290+ // a scan can outlive a data source.
263291vx_scan *
264292vx_data_source_scan (const vx_data_source * ds, const vx_scan_options * options,
265293 vx_estimate* estimate, vx_error ** err);
294+
295+ // Return scan progress from 0.0 to 1.0.
296+ // Reports estimates of read vs. total rows. Thread-safe.
297+ // May be inaccurate, main use case is progress bars in clients.
298+ double vx_scan_progress(const vx_scan * scan);
266299```
267300
268301## Partition
@@ -277,21 +310,36 @@ data.
277310// Get next owned partition out of a scan request.
278311// Caller must free this partition using vx_partition_free.
279312// This method is thread-safe.
313+ // You can call vx_scan_free or vx_data_source_free while you are holding
314+ // partitions.
280315// Hosts are encouraged to create a worker thread per partition.
281- // Returns NULL and doesn't set err on exhaustion.
282- // Returns NULL and sets err on error.
316+ // Returns NULL and sets * err to NULL on exhaustion.
317+ // Returns NULL and sets * err on error.
283318vx_partition *vx_scan_next(vx_scan *scan, vx_error **err);
284319
285320// Request an array stream in Arrow format from a partition, consuming it
286- // fully. Not thread-safe, should be called once.
287- // stream is owned and must be freed using the release callback
321+ // fully. Calling vx_partition_next on partition after calling this function
322+ // is undefined behaviour.
323+ // User does not need to call vx_partition_free in partition after calling
324+ // this function.
325+ // Not thread-safe, should be called once.
326+ // stream is owned and must be freed using the release callback.
327+ // Does not modify stream and sets err on error.
288328void vx_partition_scan_arrow(const vx_partition *partition,
289- ArrowArrayStream *stream, vx_error_out err);
329+ ArrowArrayStream *stream, vx_error** err);
290330
291331// Thread-unsafe. Get an owned vx_array of an iterator.
292- // Returns NULL and sets err to NULL if iterator is exhausted .
332+ // Calling from multiple threads is undefined behaviour .
293333// Array must be freed by caller.
334+ // You can call vx_scan_free, vx_data_source_free, or vx_partition_free
335+ // while you are holding arrays from it.
336+ // Returns NULL and sets *err to NULL on exhaustion.
337+ // Returns NULL and sets *err on error.
294338const vx_array *vx_partition_next(vx_partition *partition, vx_error **err);
339+
340+ // Return a (possibly exact) estimate of rows in a partition.
341+ void vx_partition_row_count(const vx_partition *partition, vx_estimate *count,
342+ vx_error **err);
295343```
296344
297345## Array introspection
@@ -300,7 +348,9 @@ The main question is how to transform outputs of iteration, `vx_array`, into
300348something query engines can operate with. You need to execute the array
301349iteratively till you recognize data and start exporting it. Thus API provides a
302350way to scan partitions directly into ArrowArrayStream which should be good
303- enough for most hosts.
351+ enough for most hosts. However, some hosts may want to work with vx_array
352+ directly as it provides types like ConstantArray which Arrow doesn't have, and
353+ thus convertion requires CPU-intensive work.
304354
305355## Compatibility
306356
@@ -311,19 +361,21 @@ eventually.
311361
312362- Dividing scan requests into Partitions for threads is taken from DataFusion's
313363 [ partitioned streams] ( https://docs.rs/datafusion/latest/datafusion/physical_plan/enum.Partitioning.html ) .
314- - Making Partitions produce Arrays without synchronization mimicks
364+ - Making Partitions produce Arrays without synchronization mimics
315365 [ Arrow Stream] ( https://arrow.apache.org/docs/format/CStreamInterface.html )
316366 interface.
317367
318368## Unresolved Questions
319369
370+ - Should high level API support reading a file from a buffer and not from a
371+ path? Is there a use case for that?
320372- Should high level API have a way to use host's persistent caching options?
321373 In-memory caching may be implemented using host allocator only but for
322374 persistent caching we need additional filesystem customizations i.e. cache
323375 location path.
324376- Should high level API expose batch reads from Partition? There are plans to
325377 deprecate Partitions on Rust side.
326- - What introspecion should high level API have for hosts which aren't satisfied
378+ - What introspection should high level API have for hosts which aren't satisfied
327379 with ` Vortex -> ArrowArrayStream ` conversion? Should there be iterative
328380 execution API?
329381- How should API expose definitions of ` ArrowSchema ` and ` ArrowArrayStream ` ?
@@ -337,6 +389,9 @@ eventually.
337389 #include " vortex.h"
338390 ```
339391
392+ Current implementation copy-pastes ArrowSchema and ArrowArrayStream and gates
393+ them behind a macro.
394+
340395## High level API integration example: DuckDB
341396
342397```
@@ -355,15 +410,15 @@ duckdb -> TableFunction vtable -> ffi_wrapping -> vx_scan()*
355410* - vx_array -> DataChunk reuses existing Rust code
356411```
357412
358- https://github.com/vortex-data/vortex/tree/myrrc/scan-api-duckdb has an
359- implementation of using C Scan API for Duckdb scan integration. Duckdb has a
360- sync multi-threaded runtime, and table function is called from multiple threads
361- simultaneously. Users can save a per-thread state.
413+ https://github.com/vortex-data/vortex/tree/myrrc/scan-api-duckdb has a PoC (not
414+ production-ready) implementation of using C Scan API for Duckdb scan
415+ integration. Duckdb has a sync multi-threaded runtime, and table function is
416+ called from multiple threads simultaneously. Users can save a per-thread state.
362417
363418The integration splits into following parts:
364- - DType -> LogicalType integration, done sans Temporal extension.
419+ - DType -> LogicalType integration, done except for Temporal extension support .
365420- Table function binding (creating a DataSource), done.
366- - Global state initialization (creating a Scan), done sans filter pushdown.
421+ - Global state initialization (creating a Scan), done except filter pushdown.
367422- Local state initialization (export batch id), done.
368423- Utility functions like cardinality estimates, done.
369424- vx_array -> DataChunk export, delegated to existing Rust code.
@@ -385,15 +440,16 @@ works on its own partition without synchronization.
385440[ ^ 1 ] : Clients of this API would mostly be query engines like Velox or
386441 ClickHouse, but may as well be our own integrations like vortex-duckdb.
387442[ ^ 2 ] : Like opening the first file in a glob expression to determine schema.
388- [ ^ 3 ] : Exposed Rust ABI is not stable, so clients can 't use cbindgen. C++ clients
389- can use cxx.rs but this still requires writing manual bindings and runtime
390- bridging.
443+ [ ^ 3 ] : Rust ABI is not stable if you don 't use ` repr(C) ` , so clients can't use
444+ cbindgen directly. C++ clients can use cxx.rs but this still requires
445+ writing manual bindings and runtime bridging.
391446[ ^ 4 ] : DuckDB MultiFileReader and MultiFileList.
392447[ ^ 5 ] : The name may be misleading as it doesn't correspond to Rust side's
393- Partitions.
448+ Partitions. Alternative considered was vx_split but it conflicts with
449+ register_splits in LayoutReader.
394450[ ^ 6 ] : DuckDB integration currently hides Partitions and Arrays behind a single
395451 [ thread-safe iterator] ( https://github.com/vortex-data/vortex/blob/e8cd130c8ccac45082a0b458b1f843c4313555bf/vortex-duckdb/src/datasource.rs#L151 )
396- which implies unnecessary intra-thread synchronization on pulling data.
397- On the other hand, the producer, an async crossbeam queue, allows smoothing
398- out uneven workloads from the Vortex side, and if that's removed, Vortex's
452+ which implies unnecessary intra-thread synchronization on pulling data. On
453+ the other hand, the producer, an async crossbeam queue, allows smoothing out
454+ uneven workloads from the Vortex side, and if that's removed, Vortex's
399455 partition scheduling must be precise.
0 commit comments