|
| 1 | +- Start Date: 2026-03-13 |
| 2 | +- Authors: Mikhail Kot |
| 3 | + |
| 4 | +# C Scan API |
| 5 | + |
| 6 | +There is a scan API for Rust-compatible code available at |
| 7 | +https://github.com/vortex-data/vortex/tree/develop/vortex-scan. |
| 8 | + |
| 9 | +The goal of introducing C scan API is to make integration with non-Rust query |
| 10 | +engines like Velox easier. In theory, such engines can use cxx.rs, but it |
| 11 | +requires a lot of binding code and runtime bridging (see below). |
| 12 | + |
| 13 | +There exists a partial scan API for C exposed over files [1], but it's limited |
| 14 | +to single-file URIs without globs, and it's also not thread-safe. Its main |
| 15 | +flaws, however, are: |
| 16 | + |
| 17 | +- Inability to export to well-known format like ArrowArrayStream, |
| 18 | +- Lack of introspection over produced `vx_array`s, and |
| 19 | +- Inability to control scan on a level lower than just getting partitions and |
| 20 | + `vx_array`s with filters and projections pre-configured. |
| 21 | + |
| 22 | +  Why does the Scan API need to expose `vx_array`s? What's the benefit of |
| 23 | +  using its own format over ArrowArrayStream? |
| 24 | + |
| 25 | + The answer is "compression". Vortex DTypes don't exactly match with Arrow |
| 26 | +    physical encodings, so if you have e.g. a ConstantArray, you need to |
| 27 | + decompress it into something Arrow-compatible. This was a major regression |
| 28 | + in Duckdb integration. |
| 29 | + |
| 30 | +The C++ API addresses this by producing an ArrowArrayStream interface out of |
| 31 | +ScanBuilder, but it uses Rust code directly via cxx.rs which we want to avoid |
| 32 | +while adding C interfaces. C++ API future is outside of scope of this proposal |
| 33 | +but it's expected to wrap C API directly over time, removing dependency on |
| 34 | +cxx.rs for vortex-cxx. |
| 35 | + |
| 36 | +## Customization points |
| 37 | + |
| 38 | +Main goal of providing customization points is to do as little work as possible |
| 39 | +in Vortex code and as much work as possible in the query engine. Some engines |
| 40 | +may request control over scan execution like pruning. Engines like Duckdb have |
| 41 | +own remote storage, caching, and globbing implementations. The API still needs |
| 42 | +the ability to fall back to its own implementation. |
| 43 | + |
| 44 | +Still, Scan API is a relatively high-level concept, and if its level is not |
| 45 | +sufficient, engines can resort to using a layout reader plan and executing it |
| 46 | +directly. |
| 47 | + |
| 48 | +## Datasource |
| 49 | + |
| 50 | +A Datasource is a reference to multiple possibly remote files. When created, it |
| 51 | +opens first file to determine the schema from DType, all other operations are |
| 52 | +deferred till a scan is requested. You can request multiple file scans from a |
| 53 | +Datasource. |
| 54 | + |
| 55 | +```c |
| 56 | +// Opaque, generated by bindgen |
| 57 | +typedef struct vx_data_source vx_data_source; |
| 58 | +typedef struct vx_file_handle vx_file_handle; |
| 59 | + |
| 60 | +typedef void (*vx_list_callback)(void* userdata, const char* name, int is_dir); |
| 61 | +typedef void (*vx_glob_callback)(void* userdata, const char* file); |
| 62 | + |
| 63 | +typedef struct vx_data_source_options { |
| 64 | + // (1) Filesystem customization |
| 65 | + |
| 66 | + bool (*fs_use_vortex)(const char* schema, const char* path); |
| 67 | + void (*fs_set_userdata)(void* userdata); |
| 68 | + |
| 69 | + // should be called after glob expansion, single-file mode |
| 70 | + vx_file_handle (*fs_open)(void* userdata, const char* path, vx_error** err); |
| 71 | + vx_file_handle (*fs_create)(void* userdata, const char* path, vx_error** err); |
| 72 | + |
| 73 | +  // non-recursive, callback is invoked with each path |
| 74 | +  void (*fs_list)(void* userdata, const char* path, vx_list_callback cb, vx_error *err); |
| 75 | + |
| 76 | +  void (*fs_close)(vx_file_handle handle); |
| 77 | +  uint64_t (*fs_size)(vx_file_handle handle, vx_error *err); |
| 78 | + |
| 79 | +  // positional read, doesn't change file open cursor |
| 80 | +  void (*fs_read)(vx_file_handle handle, uint64_t offset, size_t len, uint8_t *buffer, |
| 81 | +                  vx_error *err); |
| 82 | + |
| 83 | +  // not needed for scanning but makes FS API complete |
| 84 | +  void (*fs_write)(vx_file_handle handle, uint64_t offset, size_t len, uint8_t *buffer, |
| 85 | +                   vx_error *err); |
| 86 | +  void (*fs_sync)(vx_file_handle handle, vx_error *err); |
| 87 | + |
| 88 | + // (2) Globbing customization |
| 89 | + |
| 90 | + void (*glob)(const char* glob, vx_glob_callback cb, vx_error* err); |
| 91 | + |
| 92 | + /// (3) Cache customization |
| 93 | + |
| 94 | + void* (*cache_init)(vx_error* err); |
| 95 | + void (*cache_free)(void* cache, vx_error* err); |
| 96 | + void (*cache_get)(void* cache, const char* key, void** value, vx_error* err); |
| 97 | + void (*cache_put)(void* cache, const char* key, void* value, vx_error* err); |
| 98 | + void (*cache_delete)(void* cache, const char* key, vx_error* err); |
| 99 | +} vx_data_source_options; |
| 100 | + |
| 101 | +// Addition to existing DType API, returns owned ArrowSchema which needs to |
| 102 | +// be freed by the caller using release callback. If err is populated, out |
| 103 | +// is not set. |
| 104 | +void vx_dtype_to_arrow_schema(const vx_dtype* dtype, ArrowSchema* out, vx_error* err); |
| 105 | + |
| 106 | +/// Create a new owned datasource which must be freed by the caller. |
| 107 | +const vx_data_source * |
| 108 | +vx_data_source_new(const vx_session *session, const vx_data_source_options *opts, vx_error_out err); |
| 109 | + |
| 110 | +// datasource is Arc'd inside so a clone creates another reference rather |
| 111 | +// than cloning the state fully. |
| 112 | +const vx_data_source *vx_data_source_clone(const vx_data_source *ptr); |
| 113 | + |
| 114 | +// vx_dtype's lifetime is bound to datasource's lifetime, caller doesn't need |
| 115 | +// to free it |
| 116 | +const vx_dtype *vx_data_source_dtype(const vx_data_source *ds); |
| 117 | + |
| 118 | +typedef enum { |
| 119 | + VX_CARD_UNKNOWN = 0, |
| 120 | + VX_CARD_ESTIMATE = 1, |
| 121 | + VX_CARD_MAXIMUM = 2, |
| 122 | +} vx_cardinality; |
| 123 | +typedef struct { |
| 124 | + vx_cardinality cardinality; |
| 125 | + uint64_t rows; |
| 126 | +} vx_data_source_row_count; |
| 127 | + |
| 128 | +void vx_data_source_get_row_count(const vx_data_source *ds, vx_data_source_row_count *rc); |
| 129 | +void vx_data_source_free(const vx_data_source *ptr); |
| 130 | +``` |
| 131 | +
|
| 132 | +1. Open local or remote file. Allow using vortex's filesystem or query engine's |
| 133 | + filesystem e.g. duckdb fs. Allow partial customization e.g. duckdb fs for |
| 134 | + local reads, but vortex fs for remote reads. Remote filesystem customization |
| 135 | + point has the benefit of not duplicating credentials e.g. S3 access key |
| 136 | + between query engine and vortex. Local implementation may be more performant. |
| 137 | + Vortex rolled back full duckdb fs usage due to performance implications. |
| 138 | +2. Open single file or multiple files. Query engines may have their own glob |
| 139 | + expansion [4] which does HTTP IO. |
| 140 | +3. Cache intermediate results. Main use case for Vortex is caching schema in |
| 141 | + memory for footer cache and conversion results. Benefits are in no |
| 142 | + requirement to open first file in a glob eagerly if there's a cache hit. |
| 143 | + Vortex had had an integration with Duckdb object cache which was deleted in |
| 144 | + favor of own implementation which led to a performance regression. |
| 145 | +
|
| 146 | +When all three customization points are implemented, Vortex offloads all IO |
| 147 | +to a query engine. |
| 148 | +
|
| 149 | + Why not expose API to consume byte ranges or emit byte range requests for |
| 150 | + the query engine to read and populate buffers? |
| 151 | +
|
| 152 | +    Emitting byte-range requests is indeed easier for the query engine than |
| 153 | +    implementing a vtable, which requires slightly more scaffolding on the |
| 154 | +    query engine side; however, the vtable is significantly easier to implement |
| 155 | +    on the Vortex side and is coherent with the current Rust implementation. |
| 156 | + Similar implementation can be found in Arrow's RandomAccessFile or |
| 157 | + parquet-rs's AsyncFileReader. |
| 158 | +
|
| 159 | + However, as we're thinking of changing our Rust API, we can try to invest |
| 160 | + time into this approach as well. |
| 161 | +
|
| 162 | +Coupled with https://github.com/vortex-data/vortex/pull/7012 it also allows |
| 163 | +Duckdb integration to abstract memory allocations to the database. |
| 164 | +
|
| 165 | +## Runtime bridging |
| 166 | +
|
| 167 | +In Rust API, a Datasource produces a stream of Partitions. A Partition produces |
| 168 | +a stream of Arrays. API is required to be used in an async runtime, current |
| 169 | +runtime for Vortex is tokio. |
| 170 | +
|
| 171 | +Velox uses a non-coroutine but async runtime based on Folly executors. Engines |
| 172 | +like CoroBase use a coroutine-based runtime. Duckdb and ClickHouse runtimes are sync |
| 173 | +based on thread pools. Postgres runtime is sync based on processes. Some of |
| 174 | +engines may use OS-specific IO like `io_uring`. |
| 175 | +
|
| 176 | +All potential usages of our API may be grouped into 4 cases: |
| 177 | +
|
| 178 | +- sync, single-thread runtime, trivial. (1) |
| 179 | +- sync, multi-thread runtime. (2) |
| 180 | +- async, multi-thread/coroutine. (3) |
| 181 | +- async, single-thread. (4) |
| 182 | +
|
| 183 | +Scan/Partition suggestions outlined below play well with (2) but not with (3) |
| 184 | +and (4) because Vortex has its own runtime which will block on current thread |
| 185 | +when e.g. getting an Array out of a Partition. An async-friendly API basically |
| 186 | +means exposing a coroutine/state machine which hands control over to the host on |
| 187 | +IO. |
| 188 | +
|
| 189 | +As Joe mentioned, we want to get away from the concept of partitions and emit |
| 190 | +chunks of vx_array's directly from the scan. In this case, such state machine |
| 191 | +may be expressed with roughly the following states: |
| 192 | +
|
| 193 | +``` |
| 194 | + Passed a file handle |
| 195 | +START -> NEED_IO (offset, len) -> EXECUTE -> DONE |
| 196 | + ^ When passed a file handle, instructs host to read following byte |
| 197 | + range into buffer and return a handle to this buffer. |
| 198 | + ^ Decompresses the buffer (executes the Array) |
| 199 | + one step into other buffer |
| 200 | + ^ |
| 201 | + Array is executed/canonicalized to the form host can work with. |
| 202 | + Host now transfers data from buffers to its own output format. |
| 203 | +``` |
| 204 | +
|
| 205 | +However, as the future of such an approach is unclear, an async-unfriendly |
| 206 | +option is described below. |
| 207 | +
|
| 208 | +## Scan |
| 209 | +
|
| 210 | +Scan iterators: |
| 211 | +
|
| 212 | +```c |
| 213 | +``` |
| 214 | + |
| 215 | +Scan options: |
| 216 | + |
| 217 | +```c |
| 218 | +typedef enum { |
| 219 | + VX_S_INCLUDE_ALL = 0, |
| 220 | + VX_S_INCLUDE_RANGE = 1, |
| 221 | + VX_S_EXCLUDE_RANGE = 2, |
| 222 | +} vx_scan_selection_include; |
| 223 | + |
| 224 | +typedef struct { |
| 225 | + uint64_t *idx; |
| 226 | + size_t idx_len; |
| 227 | + // Roaring bitmaps won't be supported as for now |
| 228 | + // If selection is VX_S_INCLUDE_ALL, these are not read. idx is copied by query |
| 229 | + // engine on scan invocation and can be freed after a scan iterator is requested |
| 230 | + vx_scan_selection_include include; |
| 231 | +} vx_scan_selection; |
| 232 | + |
| 233 | +// NOTE(review): duplicate, conflicting redefinition of vx_scan_selection — |
| 233 | +// one of the two variants above/below should be removed before merging. |
| 233 | +typedef struct vx_scan_selection { |
| 234 | +  const size_t* idx; |
| 235 | +  size_t idx_len; |
| 236 | +} vx_scan_selection; |
| 237 | + |
| 238 | +typedef struct vx_scan_options { |
| 239 | + // May be NULL which means "return all columns" |
| 240 | + const vx_expression *projection; |
| 241 | + |
| 242 | + // May be NULL which means "no filter" |
| 243 | + const vx_expression *filter; |
| 244 | + |
| 245 | + // Set both to 0 to indicate no range request |
| 246 | + // Inclusive |
| 247 | + uint64_t row_range_begin; |
| 248 | + // Exclusive |
| 249 | + uint64_t row_range_end; |
| 250 | + |
| 251 | + vx_scan_selection selection; |
| 252 | + |
| 253 | + // 0 for no limit |
| 254 | + uint64_t limit; |
| 255 | + int ordered; |
| 256 | +} vx_scan_options; |
| 257 | +``` |
| 258 | + |
| 259 | +Scan interface: |
| 260 | + |
| 261 | +```c |
| 262 | +typedef struct vx_scan vx_scan; |
| 263 | + |
| 264 | +/** |
| 265 | + * A partition is a contiguous chunk of memory from which you can iteratively |
| 266 | + * get vx_arrays. |
| 267 | + */ |
| 268 | +typedef struct vx_partition vx_partition; |
| 269 | + |
| 270 | +typedef enum { |
| 271 | + VX_ESTIMATE_UNKNOWN = 0, |
| 272 | + VX_ESTIMATE_EXACT = 1, |
| 273 | + VX_ESTIMATE_INEXACT = 2, |
| 274 | +} vx_estimate_boundary; |
| 275 | + |
| 276 | +typedef struct { |
| 277 | +  // If boundary is VX_ESTIMATE_UNKNOWN, the estimate field is not populated |
| 278 | + uint64_t estimate; |
| 279 | + vx_estimate_boundary boundary; |
| 280 | +} vx_estimate; |
| 281 | + |
| 282 | +// Users are encouraged to create worker threads depending on est->estimate to |
| 283 | +// distribute work. |
| 284 | +// opts and est may be nullptr. |
| 285 | +// Requesting a scan doesn't do anything unless vx_partition_next is called. |
| 286 | +vx_scan * |
| 287 | +vx_data_source_scan(const vx_data_source *data_source, const vx_scan_options *options, vx_error_out err); |
| 288 | + |
| 289 | +/** |
| 290 | + * Get next owned partition out of a scan request. |
| 291 | + * Caller must free this partition using vx_partition_free. |
| 292 | + * This method is thread-safe. |
| 293 | + * If using in a sync multi-thread runtime, users are encouraged to create a |
| 294 | + * worker thread per partition. |
| 295 | + * Returns NULL and doesn't set err on exhaustion. |
| 296 | + * Returns NULL and sets err on error. |
| 297 | + */ |
| 298 | +vx_partition *vx_scan_next(vx_scan *scan, vx_error_out err); |
| 299 | + |
| 300 | +// Request an array stream in Arrow format from a partition, consuming it |
| 301 | +// fully. Not thread-safe, should be called once. |
| 302 | +// stream is owned and must be freed using the release callback |
| 303 | +void vx_partition_scan_arrow(const vx_partition *partition, FFI_ArrowArrayStream *stream, vx_error_out err); |
| 304 | + |
| 305 | +// Thread-unsafe. Get an owned vx_array of an iterator. |
| 306 | +// Returns NULL if iterator is exhausted. Array is owned and must be |
| 307 | +// freed by caller. |
| 308 | +const vx_array *vx_partition_next(vx_partition *partition, vx_error_out err); |
| 309 | +``` |
| 310 | +
|
| 311 | +There are examples of APIs also exposing batch reads, but I doubt this is a good |
| 312 | +option as for every ArrayRef the work that needs to be done to execute it may be |
| 313 | +significant, and if you want to parallelize work, you can use this with |
| 314 | +partitions, so each thread will be still busy with one ArrayRef at a time. |
| 315 | +It can be introduced in the future. |
| 316 | +
|
| 317 | +Scan functions are lazy as they operate on streams and it is |
| 318 | +the consumer code's responsibility to apply parallelism to the desired degree. |
| 319 | +
|
| 320 | +## What to do with `vx_array` |
| 321 | +
|
| 322 | +The main question is how to transform outputs of iteration, vx_array, into |
| 323 | +something query engines can operate with. You need to execute the array |
| 324 | +iteratively till you recognize data and start exporting it. Duckdb integration |
| 325 | +is mostly written in Rust with C++ code calling Rust's vtable functions. Rust |
| 326 | +code does all data export. PoC implementation moves Duckdb to use C API but |
| 327 | +leaves existing Rust code for exporting `vx_array` into DataChunk. |
| 328 | +
|
| 329 | +However, the goal is not to interface with Rust code, so as a baseline the API |
| 330 | +provides a way to scan partitions directly into ArrowArrayStream which should be |
| 331 | +good enough for most consumers. |
| 332 | +
|
| 333 | +## Cancellation |
| 334 | +
|
| 335 | +There will be no option to cancel the scan as this isn't possible on the Rust API |
| 336 | +either and this is a low priority task. |
| 337 | +
|
| 338 | +## Testing |
| 339 | +
|
| 340 | +C API doesn't have any testing. I suggest setting up a Catch2 testing target and |
| 341 | +a CMake library for C API using FetchContent to download Catch. This way people |
| 342 | +not working on Duckdb integration or FFI wouldn't need CMake and Catch. To |
| 343 | +integrate C tests with `cargo test`, we can write a `build.rs` extension which |
| 344 | +parses C test names and generates Rust test targets calling into Catch. |
| 345 | +
|
| 346 | +## Duckdb integration PoC |
| 347 | +
|
| 348 | +``` |
| 349 | +before: |
| 350 | +Duckdb side Vortex side |
| 351 | + |
| 352 | +C++ C++ Rust |
| 353 | +duckdb -> TableFunction vtable -> ffi wrapping -> vtable implementation |
| 354 | + |
| 355 | +after: |
| 356 | + |
| 357 | +C++ C++ C |
| 358 | +duckdb -> TableFunction vtable -> ffi_wrapping -> vx_scan()* |
| 359 | + |
| 360 | +* - vx_array -> DataChunk reuses existing Rust code |
| 361 | +``` |
| 362 | +
|
| 363 | +https://github.com/vortex-data/vortex/tree/myrrc/scan-api-duckdb has an |
| 364 | +implementation of using C Scan API for Duckdb scan integration. Duckdb has a |
| 365 | +sync multi-threaded runtime, and table function is called from multiple threads |
| 366 | +simultaneously. Users can save a per-thread state. |
| 367 | +
|
| 368 | +The integration splits into following parts: |
| 369 | +- DType -> LogicalType integration, done sans Temporal extension. |
| 370 | +- Table function binding (creating a DataSource), done. |
| 371 | +- Global state initialization (creating a Scan), done sans filter pushdown. |
| 372 | +- Local state initialization (export batch id), done. |
| 373 | +- Utility functions like cardinality estimates, done. |
| 374 | +- vx_array -> DataChunk export, delegated to existing Rust code. |
| 375 | +
|
| 376 | +On filter pushdown: projection pushdown requires exposing only `select()` |
| 377 | +expression. On the other hand, filter pushdown requires `TableFilter -> Vortex |
| 378 | +Expression` conversion which is significant porting so left out. |
| 379 | +
|
| 380 | +On DataChunk export: it requires exposing features like array optimization, |
| 381 | +validity masks, and other features, so left out. |
| 382 | +
|
| 383 | +Table function uses Vortex _partition_ concepts as a work splitting term only, |
| 384 | +i.e. one worker thread operating on one or multiple partitions. Each thread |
| 385 | +pulls out partitions from `vx_scan_next` (thus it's thread-safe) and then |
| 386 | +works on its own partition without synchronization. |
| 387 | +
|
| 388 | +[1] `vx_file_scan` |
| 389 | +
|
| 390 | +[2] Need to control pruning |
| 391 | + https://spiraldb.slack.com/archives/C0AJS0HDS6R/p1773068549282999 |
| 392 | +
|
| 393 | +[4] e.g. Duckdb MultiFileReader / MultiFileList |
0 commit comments