From 015ff487a5ed79671e33544af1406e88a36e9ab0 Mon Sep 17 00:00:00 2001 From: Mikhail Kot Date: Fri, 17 Apr 2026 15:12:46 +0100 Subject: [PATCH] initial Signed-off-by: Mikhail Kot --- .github/workflows/rust-instrumented.yml | 19 ++- vortex-ffi/CMakeLists.txt | 20 ++- vortex-ffi/README.md | 72 ++++---- vortex-ffi/cbindgen.toml | 13 +- vortex-ffi/cinclude/vortex.h | 86 +++++----- vortex-ffi/examples/CMakeLists.txt | 18 ++ vortex-ffi/examples/Makefile | 75 --------- vortex-ffi/examples/README.md | 30 ---- vortex-ffi/examples/dtype.c | 156 ++++++++++++++++++ vortex-ffi/examples/hello-vortex.c | 116 ------------- vortex-ffi/examples/scan.c | 209 ++++++++++++++++++++++++ vortex-ffi/examples/scan_to_arrow.c | 124 ++++++++++++++ vortex-ffi/examples/write_sample.c | 142 ++++++++++++++++ vortex-ffi/src/data_source.rs | 42 ++--- vortex-ffi/src/scan.rs | 31 ++-- vortex-ffi/test/CMakeLists.txt | 9 - vortex-ffi/test/scan.cpp | 7 +- vortex-file/src/multi/mod.rs | 15 +- 18 files changed, 818 insertions(+), 366 deletions(-) create mode 100644 vortex-ffi/examples/CMakeLists.txt delete mode 100644 vortex-ffi/examples/Makefile delete mode 100644 vortex-ffi/examples/README.md create mode 100644 vortex-ffi/examples/dtype.c delete mode 100644 vortex-ffi/examples/hello-vortex.c create mode 100644 vortex-ffi/examples/scan.c create mode 100644 vortex-ffi/examples/scan_to_arrow.c create mode 100644 vortex-ffi/examples/write_sample.c diff --git a/.github/workflows/rust-instrumented.yml b/.github/workflows/rust-instrumented.yml index 9af1471f44f..e134d8b5912 100644 --- a/.github/workflows/rust-instrumented.yml +++ b/.github/workflows/rust-instrumented.yml @@ -187,15 +187,28 @@ jobs: cargo +$NIGHTLY_TOOLCHAIN build --locked --no-default-features \ --target x86_64-unknown-linux-gnu -Zbuild-std \ -p vortex-ffi - - name: Build FFI library tests + - name: Build FFI library tests and examples run: | cd vortex-ffi - cmake -Bbuild -DBUILD_TESTS=1 -DSANITIZER=${{ matrix.sanitizer }} -DTARGET_TRIPLE="x86_64-unknown-linux-gnu" + cmake -Bbuild -DBUILD_TESTS=1 -DBUILD_EXAMPLES=1 -DSANITIZER=${{ matrix.sanitizer }} -DTARGET_TRIPLE="x86_64-unknown-linux-gnu" cmake --build build -j - name: Run tests run: | set -o pipefail - ./vortex-ffi/build/test/vortex_ffi_test 2>&1 | rustfilt -i- + ./vortex-ffi/build/test/vortex_ffi_test 2>&1 | rustfilt + - name: Run examples + run: | + set -o pipefail + + # Failed to create data source: Object store error: Generic LocalFileSystem + # error: Unable to walk dir: File system loop found + rm -fr vortex-ffi/build/_deps/nanoarrow-src/python + + ./vortex-ffi/build/examples/write_sample file.vortex 2>&1 | rustfilt + ./vortex-ffi/build/examples/write_sample file2.vortex 2>&1 | rustfilt + ./vortex-ffi/build/examples/dtype '*.vortex' 2>&1 | rustfilt + ./vortex-ffi/build/examples/scan '*.vortex' 2>&1 | rustfilt + ./vortex-ffi/build/examples/scan_to_arrow '*.vortex' 2>&1 | rustfilt miri: name: "Rust tests (miri)" diff --git a/vortex-ffi/CMakeLists.txt b/vortex-ffi/CMakeLists.txt index dc6a231fc55..68107b0d244 100644 --- a/vortex-ffi/CMakeLists.txt +++ b/vortex-ffi/CMakeLists.txt @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright the Vortex contributors cmake_minimum_required(VERSION 3.10) +include(FetchContent) + project(VortexFFI VERSION 0.0.1 LANGUAGES C) @@ -10,6 +12,7 @@ set(CMAKE_C_STANDARD 17) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Werror -Wextra -Wpedantic") option(BUILD_TESTS "Build tests" OFF) +option(BUILD_EXAMPLES "Build examples" OFF) set(SANITIZER "" CACHE STRING "Build with sanitizers") set(TARGET_TRIPLE "" CACHE STRING "Rust target triple for FFI library") @@ -76,10 +79,10 @@ Static library path ${LIBRARY_PATH} Headers path ${LIBRARY_HEADERS}") if (NOT EXISTS "${LIBRARY_PATH_SHARED}") - message(FATAL_ERROR "Shared library not found") + message(FATAL_ERROR "Shared library not found, run `cargo build --release -p vortex-ffi`") endif() if (NOT EXISTS "${LIBRARY_PATH}") - message(FATAL_ERROR "Static library not found") + message(FATAL_ERROR "Static library not found, run `cargo build --release -p vortex-ffi`") endif() add_library(vortex_ffi STATIC IMPORTED) @@ -95,6 +98,15 @@ set_target_properties(vortex_ffi_shared PROPERTIES INTERFACE_LINK_OPTIONS "LINKER:-rpath,${LIBRARY_DIR}" ) +if (BUILD_TESTS OR BUILD_EXAMPLES) + FetchContent_Declare( + Nanoarrow + GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow + GIT_TAG apache-arrow-nanoarrow-0.8.0 + ) + FetchContent_MakeAvailable(Nanoarrow) +endif() + if (BUILD_TESTS) enable_language(CXX) set(CMAKE_CXX_STANDARD 20) @@ -103,3 +115,7 @@ if (BUILD_TESTS) enable_testing() add_subdirectory(test) endif() + +if (BUILD_EXAMPLES) + add_subdirectory(examples) +endif() diff --git a/vortex-ffi/README.md b/vortex-ffi/README.md index 472bc89f71c..34df8cc2a1a 100644 --- a/vortex-ffi/README.md +++ b/vortex-ffi/README.md @@ -1,58 +1,50 @@ -# Foreign Function Interface +# Vortex C interface +## Usage from a CMake project -Vortex is a file format that can be used by any execution engine. Nearly every programming language supports -the C ABI (Application Binary Interface), so by providing an FFI interface to work with Vortex objects we can -make it easy to support a variety of languages. - -Check out the [`examples`](./examples/) directory to see an example of how to use the API to build -a real native application. - -## Design - -The FFI is designed to be very simple and follows a very object-oriented approach: - -- **Constructors** are simple C functions that return opaque pointers -- **Methods** are functions that receive an opaque pointer as the first argument, followed by subsequent arguments. - Methods may return a value or void. -- **Destructors** free native resources (allocations, file handles, network sockets) and must be explicitly called by - the foreign language to avoid leaking resources. - -Constructors will generally allocate rust memory, and destructors free that memory. - -## Documentation - -The FFI API is documented in `docs/api/c` with explicit inclusion of types, enums, and functions, etc. Note that an -item cannot be referenced in the documentation if it does not have a documentation comment. +``` +# in vortex folder +cargo build --release -p vortex-ffi -## Updating Headers +# in your CMakeLists.txt +include_directory(vortex/vortex-ffi) +target_link_libraries(my_target, vortex_ffi_shared) +# or target_link_libraries(my_target, vortex_ffi) +``` -To rebuild the header file: +## Running C examples: ```sh -cargo +nightly build -p vortex-ffi +cmake -Bbuild -DBUILD_EXAMPLES=1 +cmake --build build +./build/examples/dtype +./build/examples/scan +./build/examples/scan_to_arrow +./build/examples/write_sample ``` -The header generation uses cbindgen's macro expansion feature which requires nightly. -Stable builds use the checked-in header file at `cinclude/vortex.h`. +## Updating Headers + +If you're developing FFI and want to rebuild `cinclude/vortex.h`, run +`cargo +nightly build -p vortex-ffi`. -### Testing C part +## Testing C part -Build the test library +Build the test library: ```sh -cmake -Bbuild -cmake --build build -j $(nproc) +cmake -Bbuild -DBUILD_TESTS=1 +cmake --build build ``` -Run the tests +Run the tests: ```sh ctest --test-dir build -j $(nproc) ``` -You would need C++ compiler toolchain to run the tests since they use Catch2. +You will need C++ compiler toolchain to run the tests since they use Catch2. -### Testing Rust part with sanitizers +## Testing Rust part with sanitizers AddressSanitizer: @@ -90,20 +82,20 @@ with sanitizers. - `allow-abi-mismatch` is safe because in our dependency graph only crates like `compiler_builtins` unset sanitization, and they do it on purpose. - Make sure to use `cargo test` and not `cargo nextest` as nextest reports less -leaks. + leaks. - If you want stack trace symbolization, install `llvm-symbolizer`. -### Testing Rust and C with sanitizers +## Testing Rust and C with sanitizers 1. Build FFI library with external sanitizer runtime: ```sh RUSTFLAGS="-Zsanitizer=address -Zexternal-clangrt" \ cargo +nightly build -Zbuild-std --target= \ ---no-default-features -p vortex-ffi + --no-default-features -p vortex-ffi ``` -2. Build tests with target triple +2. Build tests with target triple: ```sh cmake -Bbuild -DWITH_ASAN=1 -DTARGET_TRIPLE= diff --git a/vortex-ffi/cbindgen.toml b/vortex-ffi/cbindgen.toml index 5153eb0f9fc..46d80cd72c7 100644 --- a/vortex-ffi/cbindgen.toml +++ b/vortex-ffi/cbindgen.toml @@ -16,10 +16,15 @@ header = """ // // https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions -// We don't want to bundle nanoarrow or similar just for these two definitions. -// If you use your own Arrow library, define this macro and -// typedef FFI_ArrowSchema ArrowSchema; -// typedef FFI_ArrowArrayStream ArrowArrayStream; +// If you want to use your own Arrow library like nanoarrow, define this macro +// and typedef your types: +// +// #include "nanoarrow/common/inline_types.h" +// #define USE_OWN_ARROW +// typedef struct ArrowSchema FFI_ArrowSchema; +// typedef struct ArrowArrayStream FFI_ArrowArrayStream; +// #include "vortex.h" +// #ifndef USE_OWN_ARROW struct ArrowSchema { const char* format; diff --git a/vortex-ffi/cinclude/vortex.h b/vortex-ffi/cinclude/vortex.h index bb4495b34de..ae152752ba2 100644 --- a/vortex-ffi/cinclude/vortex.h +++ b/vortex-ffi/cinclude/vortex.h @@ -8,10 +8,15 @@ // // https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions -// We don't want to bundle nanoarrow or similar just for these two definitions. -// If you use your own Arrow library, define this macro and -// typedef FFI_ArrowSchema ArrowSchema; -// typedef FFI_ArrowArrayStream ArrowArrayStream; +// If you want to use your own Arrow library like nanoarrow, define this macro +// and typedef your types: +// +// #include "nanoarrow/common/inline_types.h" +// #define USE_OWN_ARROW +// typedef struct ArrowSchema FFI_ArrowSchema; +// typedef struct ArrowArrayStream FFI_ArrowArrayStream; +// #include "vortex.h" +// #ifndef USE_OWN_ARROW struct ArrowSchema { const char *format; @@ -175,10 +180,19 @@ typedef enum { } vx_validity_type; typedef enum { - VX_CARD_UNKNOWN = 0, - VX_CARD_ESTIMATE = 1, - VX_CARD_MAXIMUM = 2, -} vx_cardinality; + /** + * No estimate is available. + */ + VX_ESTIMATE_UNKNOWN = 0, + /** + * The value in vx_estimate.estimate is exact. + */ + VX_ESTIMATE_EXACT = 1, + /** + * The value in vx_estimate.estimate is an upper bound. + */ + VX_ESTIMATE_INEXACT = 2, +} vx_estimate_type; /** * Equalities, inequalities, and boolean operations over possibly null values. @@ -282,21 +296,6 @@ typedef enum { VX_SELECTION_EXCLUDE_RANGE = 2, } vx_scan_selection_include; -typedef enum { - /** - * No estimate is available. - */ - VX_ESTIMATE_UNKNOWN = 0, - /** - * The value in vx_estimate.estimate is exact. - */ - VX_ESTIMATE_EXACT = 1, - /** - * The value in vx_estimate.estimate is an upper bound. - */ - VX_ESTIMATE_INEXACT = 2, -} vx_estimate_type; - /** * Physical type enum, represents the in-memory physical layout but might represent a different logical type. */ @@ -490,6 +489,10 @@ typedef struct vx_file vx_file; */ typedef struct vx_partition vx_partition; +/** + * A scan is a single traversal of a data source with projections and + * filters. A scan can be consumed only once. + */ typedef struct vx_scan vx_scan; /** @@ -537,13 +540,17 @@ typedef struct { const char *paths; } vx_data_source_options; +/** + * Used for estimating number of partitions in a data source or number of rows + * in a partition. + */ typedef struct { - vx_cardinality cardinality; + vx_estimate_type type; /** - * Set only when "cardinality" is not VX_CARD_UNKNOWN + * Set only when "type" is not VX_ESTIMATE_UNKNOWN. */ - uint64_t rows; -} vx_data_source_row_count; + uint64_t estimate; +} vx_estimate; /** * Options supplied for opening a file. @@ -662,18 +669,6 @@ typedef struct { bool ordered; } vx_scan_options; -/** - * Used for estimating number of partitions in a data source or number of rows - * in a partition. - */ -typedef struct { - vx_estimate_type type; - /** - * Set only when "type" is not VX_ESTIMATE_UNKNOWN. - */ - uint64_t estimate; -} vx_estimate; - #ifdef __cplusplus extern "C" { #endif // __cplusplus @@ -921,7 +916,7 @@ const vx_dtype *vx_data_source_dtype(const vx_data_source *ds); /** * Write data source's row count estimate into "row_count". */ -void vx_data_source_get_row_count(const vx_data_source *ds, vx_data_source_row_count *row_count); +void vx_data_source_get_row_count(const vx_data_source *ds, vx_estimate *row_count); /** * Clone a borrowed [`vx_dtype`], returning an owned [`vx_dtype`]. @@ -1319,6 +1314,17 @@ vx_partition *vx_scan_next_partition(vx_scan *scan, vx_error **err); */ int vx_partition_row_count(const vx_partition *partition, vx_estimate *count, vx_error **err); +/** + * Scan partition to ArrowArrayStream. + * Consumes partition fully: subsequent calls to vx_partition_scan_arrow or + * vx_partition_next are undefined behaviour. + * This call blocks current thread until underlying stream is fully consumed. + * + * Caller must not free partition after calling this function. + * + * On success, sets "stream" and returns 0. + * On error, sets "err" and returns 1, freeing the partition. + */ int vx_partition_scan_arrow(const vx_session *session, vx_partition *partition, FFI_ArrowArrayStream *stream, diff --git a/vortex-ffi/examples/CMakeLists.txt b/vortex-ffi/examples/CMakeLists.txt new file mode 100644 index 00000000000..47228d9a903 --- /dev/null +++ b/vortex-ffi/examples/CMakeLists.txt @@ -0,0 +1,18 @@ +# SPDX-License-Identifier: CC-BY-4.0 +# SPDX-FileCopyrightText: Copyright the Vortex contributors + +# allow linking with vortex_ffi_shared although it's not in current folder +cmake_policy(SET CMP0079 NEW) + +add_executable(scan scan.c) +target_link_libraries(scan PRIVATE vortex_ffi_shared) + +add_executable(scan_to_arrow scan_to_arrow.c) +target_link_libraries(scan_to_arrow PRIVATE + nanoarrow_shared vortex_ffi_shared) + +add_executable(dtype dtype.c) +target_link_libraries(dtype PRIVATE vortex_ffi_shared) + +add_executable(write_sample write_sample.c) +target_link_libraries(write_sample PRIVATE vortex_ffi_shared) diff --git a/vortex-ffi/examples/Makefile b/vortex-ffi/examples/Makefile deleted file mode 100644 index c22056524fd..00000000000 --- a/vortex-ffi/examples/Makefile +++ /dev/null @@ -1,75 +0,0 @@ -# SPDX-License-Identifier: CC-BY-4.0 -# SPDX-FileCopyrightText: Copyright the Vortex contributors - -# Directory containing the Rust crate -ROOT_DIR = ../.. -RUST_CRATE_DIR = .. -# Directory where cargo places the built libraries -RUST_TARGET_DIR = $(ROOT_DIR)/target -# The library name (without lib prefix and .so/.dylib suffix) -LIBRARY_NAME = vortex_ffi - -# Determine platform-specific library extension -UNAME_S := $(shell uname -s) -ifeq ($(UNAME_S),Linux) - LIB_EXT = so -else ifeq ($(UNAME_S),Darwin) - LIB_EXT = dylib -else - # Default to .dll for Windows - LIB_EXT = dll -endif - -# Path to the library depending on build mode -DEBUG_LIB = $(RUST_TARGET_DIR)/debug/lib$(LIBRARY_NAME).$(LIB_EXT) -RELEASE_LIB = $(RUST_TARGET_DIR)/release/lib$(LIBRARY_NAME).$(LIB_EXT) - -# Default to debug mode -LIB = $(DEBUG_LIB) - -# C flags -CFLAGS = -Wall -I$(RUST_CRATE_DIR)/cinclude - -# Linking flags -LDFLAGS = -L$(RUST_TARGET_DIR)/debug -l$(LIBRARY_NAME) - -# Final executable name -EXECUTABLE = hello_vortex - -.PHONY: all clean run release debug - -# Default target -all: debug - -# Debug build -debug: LDFLAGS = -L$(RUST_TARGET_DIR)/debug -l$(LIBRARY_NAME) -debug: build - -# Release build -release: LDFLAGS = -L$(RUST_TARGET_DIR)/release -l$(LIBRARY_NAME) -release: LIB = $(RELEASE_LIB) -release: build - -# Build rule -build: $(LIB) $(EXECUTABLE) - -# Build the Rust library -$(DEBUG_LIB): - cd $(RUST_CRATE_DIR) && cargo build - -$(RELEASE_LIB): - cd $(RUST_CRATE_DIR) && cargo build --release - -# Build the C executable -$(EXECUTABLE): hello-vortex.c - $(CC) $(CFLAGS) -o $@ $< $(LDFLAGS) - -# Run the executable with the correct library path -run: build - @echo "Running example..." - LD_LIBRARY_PATH=$(dir $(LIB)) ./$(EXECUTABLE) - -# Clean build artifacts -clean: - rm -f $(EXECUTABLE) - cd $(RUST_CRATE_DIR) && cargo clean diff --git a/vortex-ffi/examples/README.md b/vortex-ffi/examples/README.md deleted file mode 100644 index 5ebc99f9bfe..00000000000 --- a/vortex-ffi/examples/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# C FFI example - -This example shows how to interface with the FFI of this crate using C code. - -Run `make` to build the `hello_vortex` binary. - -The binary expects a single argument, which is the path to a Vortex file. A new streaming -scan will be created that will materialize file splits as Vortex in-memory arrays, and print -some information about them. - -Here's an example from the TPC-H `partsupp` dataset: - -``` -$ make run file:///tmp/partsupp.vortex -Scanning file: file:///tmp/partsupp.vortex -Chunk 0: 65536 -Chunk 1: 65536 -Chunk 2: 65536 -Chunk 3: 65536 -Chunk 4: 65536 -Chunk 5: 65536 -Chunk 6: 65536 -Chunk 7: 65536 -Chunk 8: 65536 -Chunk 9: 65536 -Chunk 10: 65536 -Chunk 11: 65536 -Chunk 12: 13568 -Scanning complete -``` diff --git a/vortex-ffi/examples/dtype.c b/vortex-ffi/examples/dtype.c new file mode 100644 index 00000000000..007c3993e33 --- /dev/null +++ b/vortex-ffi/examples/dtype.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: CC-BY-4.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "vortex.h" +#include + +const char *usage = "Print dtype of files\n" + "Usage: dtype \n"; + +void print_dtype(const vx_dtype *dtype); + +void print_ptype(const vx_dtype *type) { + const char *ptype = NULL; + + switch (vx_dtype_primitive_ptype(type)) { + case PTYPE_U8: + ptype = "uint8_t"; + break; + case PTYPE_U16: + ptype = "uint16_t"; + break; + case PTYPE_U32: + ptype = "uint32_t"; + break; + case PTYPE_U64: + ptype = "uint64_t"; + break; + case PTYPE_I8: + ptype = "int8_t"; + break; + case PTYPE_I16: + ptype = "int16_t"; + break; + case PTYPE_I32: + ptype = "int32_t"; + break; + case PTYPE_I64: + ptype = "int64_t"; + break; + case PTYPE_F16: + ptype = "float16"; + break; + case PTYPE_F32: + ptype = "float"; + break; + case PTYPE_F64: + ptype = "double"; + break; + default: + __builtin_unreachable(); + } + + printf("primitive(%s)", ptype); +} + +void print_struct_dtype(const vx_dtype *dtype) { + const vx_struct_fields *fields = vx_dtype_struct_dtype(dtype); + + printf("struct(\n"); + for (uint64_t i = 0; i < vx_struct_fields_nfields(fields); ++i) { + const vx_dtype *field_dtype = vx_struct_fields_field_dtype(fields, i); + const vx_string *field_name = vx_struct_fields_field_name(fields, i); + printf(" %.*s = ", (int)vx_string_len(field_name), vx_string_ptr(field_name)); + print_dtype(field_dtype); + vx_dtype_free(field_dtype); + } + printf(")"); +} + +void print_list_dtype(const vx_dtype *dtype) { + printf("list("); + print_dtype(vx_dtype_list_element(dtype)); + printf(")"); +} + +void print_fixed_list_dtype(const vx_dtype *dtype) { + printf("fixed list(size=%d, ", vx_dtype_fixed_size_list_size(dtype)); + print_dtype(vx_dtype_fixed_size_list_element(dtype)); + printf(")"); +} + +void print_decimal_dtype(const vx_dtype *dtype) { + const uint8_t precision = vx_dtype_decimal_precision(dtype); + const int8_t scale = vx_dtype_decimal_scale(dtype); + printf("decimal(precision=%u, scale=%d)", precision, scale); +} + +void print_dtype(const vx_dtype *dtype) { + switch (vx_dtype_get_variant(dtype)) { + case DTYPE_NULL: + printf("null"); + break; + case DTYPE_BOOL: + printf("bool"); + break; + case DTYPE_UTF8: + printf("utf8"); + break; + case DTYPE_BINARY: + printf("binary"); + break; + case DTYPE_EXTENSION: + printf("extension"); + break; + case DTYPE_PRIMITIVE: + print_ptype(dtype); + break; + case DTYPE_STRUCT: + print_struct_dtype(dtype); + break; + case DTYPE_LIST: + print_list_dtype(dtype); + break; + case DTYPE_FIXED_SIZE_LIST: + print_fixed_list_dtype(dtype); + break; + case DTYPE_DECIMAL: + print_decimal_dtype(dtype); + break; + } + printf("%c\n", vx_dtype_is_nullable(dtype) ? '?' : ' '); +} + +void print_error(const char *what, const vx_error *error) { + const vx_string *str = vx_error_get_message(error); + fprintf(stderr, "%s: %.*s\n", what, (int)vx_string_len(str), vx_string_ptr(str)); +} + +int main(int argc, char **argv) { + if (argc != 2) { + fprintf(stderr, "%s", usage); + return 1; + } + + vx_error *error = NULL; + vx_session *const session = vx_session_new(); + if (session == NULL) { + fprintf(stderr, "Failed to create Vortex session\n"); + return 1; + } + + vx_data_source_options ds_options = {.paths = argv[1]}; + const vx_data_source *data_source = vx_data_source_new(session, &ds_options, &error); + if (data_source == NULL) { + print_error("Failed to create data source", error); + vx_error_free(error); + vx_session_free(session); + return 1; + } + + printf("dtype: "); + print_dtype(vx_data_source_dtype(data_source)); + + vx_data_source_free(data_source); + vx_session_free(session); + return 0; +} diff --git a/vortex-ffi/examples/hello-vortex.c b/vortex-ffi/examples/hello-vortex.c deleted file mode 100644 index 192e4d73ae3..00000000000 --- a/vortex-ffi/examples/hello-vortex.c +++ /dev/null @@ -1,116 +0,0 @@ -// SPDX-License-Identifier: CC-BY-4.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -#include "vortex.h" -#include -#include -#include - -int main(int argc, char *argv[]) { - if (argc < 2) { - printf("Usage: %s \n", argv[0]); - return 1; - } - - vx_error *error = NULL; - vx_session *session = vx_session_new(); - if (session == NULL) { - fprintf(stderr, "Failed to create Vortex session\n"); - return -1; - } - - // Open the file - char *uri = argv[1]; - printf("Opening file: %s\n", uri); - - vx_file_open_options open_opts = { - .uri = uri, - .property_keys = NULL, - .property_vals = NULL, - .property_len = 0, - }; - - const vx_file *file = vx_file_open_reader(session, &open_opts, &error); - if (error != NULL) { - fprintf(stderr, "Failed to open file: %s\n%s", uri, vx_string_ptr(vx_error_get_message(error))); - vx_error_free(error); - vx_session_free(session); - return -1; - } - - // Print file metadata - this will satisfy the "File contains" check - uint64_t row_count = vx_file_row_count(file); - printf("File contains %llu total rows\n", (unsigned long long)row_count); - - // Get and display file dtype - const vx_dtype *file_dtype = vx_file_dtype(file); - vx_dtype_variant variant = vx_dtype_get_variant(file_dtype); - bool nullable = vx_dtype_is_nullable(file_dtype); - printf("File DType variant: %d, nullable: %s\n", variant, - nullable ? "true" : "false"); - - // Start scanning - printf("\nScanning file...\n"); - vx_array_iterator *scan = vx_file_scan(session, file, NULL, &error); - if (error != NULL) { - fprintf(stderr, "Failed to create file scan iterator\n"); - vx_error_free(error); - vx_file_free(file); - vx_session_free(session); - return -1; - } - - int chunk_count = 0; - const vx_array *batch = vx_array_iterator_next(scan, &error); - - while (batch != NULL && error == NULL) { - size_t batch_len = vx_array_len(batch); - printf("Chunk %d: %zu rows\n", chunk_count, batch_len); - - // For the first chunk, show additional API coverage including struct introspection - if (chunk_count == 0 && batch_len > 0) { - const vx_dtype *dtype = vx_array_dtype(batch); - vx_dtype_variant batch_variant = vx_dtype_get_variant(dtype); - printf(" First chunk DType variant: %d\n", batch_variant); - - // Test null count API - uint32_t null_count = vx_array_null_count(batch, &error); - if (error == NULL) { - printf(" Null count: %u\n", null_count); - } else { - printf(" Null count check failed (expected for some array types)\n"); - vx_error_free(error); - error = NULL; - } - - // Test struct field count if it's a struct - if (batch_variant == DTYPE_STRUCT) { - const vx_struct_fields *fields = vx_dtype_struct_dtype(dtype); - size_t n_fields = vx_struct_fields_nfields(fields); - printf(" Struct with %zu fields\n", n_fields); - } - } - - vx_array_free(batch); - batch = vx_array_iterator_next(scan, &error); - chunk_count++; - } - - printf("Total chunks processed: %d\n", chunk_count); - - // Clean up resources - vx_array_iterator_free(scan); - vx_file_free(file); - - if (error != NULL) { - fprintf(stderr, "Error during scan operation\n"); - vx_error_free(error); - vx_session_free(session); - return -1; - } - - printf("Scanning completed successfully\n"); - vx_session_free(session); - - return 0; -} diff --git a/vortex-ffi/examples/scan.c b/vortex-ffi/examples/scan.c new file mode 100644 index 00000000000..6db36c52684 --- /dev/null +++ b/vortex-ffi/examples/scan.c @@ -0,0 +1,209 @@ +// SPDX-License-Identifier: CC-BY-4.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "vortex.h" +#include +#include +#include + +#define MAX_THREADS 64 + +const char *usage = "Multi-threaded file scan\n" + "Usage: scan [-j threads] \n"; + +void print_estimate(const char *what, const vx_estimate *estimate) { + switch (estimate->type) { + case VX_ESTIMATE_UNKNOWN: + printf("%s: unknown\n", what); + return; + case VX_ESTIMATE_EXACT: + printf("%s: %lu\n", what, estimate->estimate); + return; + case VX_ESTIMATE_INEXACT: + printf("%s: approximately %lu\n", what, estimate->estimate); + break; + } +} + +void print_error(const char *what, const vx_error *error) { + const vx_string *str = vx_error_get_message(error); + fprintf(stderr, "%s: %.*s\n", what, (int)vx_string_len(str), vx_string_ptr(str)); +} + +struct scan_thread_info { + pthread_t thread_id; + pthread_mutex_t *mutex; + vx_scan *scan; + size_t partitions, arrays, rows; + vx_error *error; +}; + +void *execute_scan_thread(void *arg) { + struct scan_thread_info *info = arg; + while (true) { + // A partition is an independent unit of work a thread can work on. + pthread_mutex_lock(info->mutex); + vx_partition *partition = vx_scan_next_partition(info->scan, &info->error); + pthread_mutex_unlock(info->mutex); + + if (partition == NULL && info->error == NULL) { + break; // partition iterator exhausted + } + if (partition == NULL && info->error != NULL) { + return NULL; // partition was not scanned due to an error + } + ++info->partitions; + + vx_estimate row_count; + if (vx_partition_row_count(partition, &row_count, &info->error)) { + vx_partition_free(partition); + return NULL; + } + + printf("Thread %lu processing partition %lu, ", info->thread_id + 1, info->partitions); + print_estimate("row count", &row_count); + + // An array is a batch of rows from a partition + const vx_array *array = NULL; + while ((array = vx_partition_next(partition, &info->error)) != NULL) { + ++info->arrays; + info->rows += vx_array_len(array); + vx_array_free(array); + } + + vx_partition_free(partition); + + if (info->error != NULL) { + return NULL; + } + } + + printf("Thread %lu finished, processed %lu partitions, %lu arrays, %lu rows\n", + info->thread_id + 1, + info->partitions, + info->arrays, + info->rows); + return NULL; +} + +vx_error *execute_scan(vx_scan *scan, int num_threads) { + pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER; + pthread_t threads[MAX_THREADS]; + struct scan_thread_info infos[MAX_THREADS] = {0}; + + printf("Starting scan, using %d threads\n", num_threads); + for (int i = 0; i < num_threads; ++i) { + struct scan_thread_info *info = &infos[i]; + info->thread_id = i; + info->mutex = &mutex; + info->scan = scan; + pthread_create(&threads[i], NULL, execute_scan_thread, info); + } + + size_t partitions = 0, arrays = 0, rows = 0; + for (int i = 0; i < num_threads; ++i) { + pthread_join(threads[i], NULL); + struct scan_thread_info *info = &infos[i]; + + if (info->error != NULL) { + // Don't join other threads as program will be terminated early + return info->error; + } + + partitions += info->partitions; + arrays += info->arrays; + rows += info->rows; + } + + printf("Finished scan, processed %lu partitions, %lu arrays, %lu rows\n", partitions, arrays, rows); + return NULL; +} + +int parse_options(int argc, char *argv[], int *threads, char **paths) { + int opt; + while ((opt = getopt(argc, argv, "j:")) != -1) { + switch (opt) { + case 'j': + *threads = atoi(optarg); + break; + default: + fprintf(stderr, "%s", usage); + return 1; + } + } + + if (*threads != 0 && (*threads < 1 || *threads > MAX_THREADS)) { + fprintf(stderr, "Invalid thread count %d, expected [1; 64]\n", *threads); + return 1; + } + + if (optind + 1 != argc) { + fprintf(stderr, "%s", usage); + return 1; + } + + *paths = argv[optind]; + return 0; +} + +int main(int argc, char *argv[]) { + int threads = 0; + char *paths; + if (parse_options(argc, argv, &threads, &paths)) { + return 1; + } + + vx_session *session = vx_session_new(); + if (session == NULL) { + fprintf(stderr, "Failed to create Vortex session\n"); + return 1; + } + + printf("Opening files: %s\n", paths); + + // A datasource is a reference to some files. + // We can request multiple scans from a data source. + vx_data_source_options ds_options = {.paths = paths}; + vx_error *error = NULL; + const vx_data_source *data_source = vx_data_source_new(session, &ds_options, &error); + if (data_source == NULL) { + print_error("Failed to create data source", error); + // Returned errors are owned and need to be freed + vx_error_free(error); + vx_session_free(session); + return 1; + } + + vx_estimate row_count; + vx_data_source_get_row_count(data_source, &row_count); + print_estimate("Data source row count", &row_count); + + // A scan is a single traversal of a data source. + // Here we request a scan without any filters, projections, or limiting. + vx_scan_options scan_options = {.max_threads = threads}; + vx_estimate partition_estimate; + vx_scan *scan = vx_data_source_scan(data_source, &scan_options, &partition_estimate, &error); + if (scan == NULL) { + print_error("Failed to create scan", error); + vx_error_free(error); + vx_data_source_free(data_source); + vx_session_free(session); + return 1; + } + + // Caller can use partition estimates to schedule worker threads. + print_estimate("Partition count", &partition_estimate); + if (threads == 0 && partition_estimate.type != VX_ESTIMATE_UNKNOWN) { + threads = partition_estimate.estimate; + } + + error = execute_scan(scan, threads); + if (error != NULL) { + print_error("Failed to scan", error); + vx_error_free(error); + } + + vx_scan_free(scan); + vx_data_source_free(data_source); + vx_session_free(session); + return 0; +} diff --git a/vortex-ffi/examples/scan_to_arrow.c b/vortex-ffi/examples/scan_to_arrow.c new file mode 100644 index 00000000000..deb9d885634 --- /dev/null +++ b/vortex-ffi/examples/scan_to_arrow.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: CC-BY-4.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "nanoarrow/common/inline_types.h" +#include "nanoarrow/nanoarrow.h" + +#define USE_OWN_ARROW +typedef struct ArrowSchema FFI_ArrowSchema; +typedef struct ArrowArrayStream FFI_ArrowArrayStream; +#include "vortex.h" + +#include +#include + +const char *usage = "Scan vortex files to Arrow\n" + "Usage: scan_to_arrow \n"; + +void print_error(const char *what, const vx_error *error) { + const vx_string *str = vx_error_get_message(error); + fprintf(stderr, "%s: %.*s\n", what, (int)vx_string_len(str), vx_string_ptr(str)); +} + +void execute_scan(vx_session *session, vx_scan *scan) { + vx_error *error = NULL; + + // Returned dtype is owned and mustn't be freed + const vx_dtype *dtype = vx_scan_dtype(scan, &error); + if (dtype == NULL) { + print_error("Failed to get scan dtype", error); + vx_error_free(error); + return; + } + + struct ArrowSchema schema; + if (vx_dtype_to_arrow_schema(dtype, &schema, &error)) { + print_error("Failed to convert dtype to Arrow schema", error); + vx_error_free(error); + return; + } + + char schema_buf[1024 * 10]; + const int schema_len = ArrowSchemaToString(&schema, schema_buf, sizeof schema_buf, 1); + printf("arrow schema: %.*s\n", schema_len, schema_buf); + if (schema.release) { + schema.release(&schema); + } + + struct ArrowError arrow_error; + ArrowErrorInit(&arrow_error); + + vx_partition *partition; + size_t partitions = 0, arrays = 0, rows = 0; + + while ((partition = vx_scan_next_partition(scan, &error)) != NULL) { + struct ArrowArrayStream stream; + // Partition is consumed, we must not free it or use it after + if (vx_partition_scan_arrow(session, partition, &stream, &error)) { + print_error("Failed to scan partition to Arrow", error); + vx_error_free(error); + error = NULL; + break; + } + + struct ArrowArray array = {0}; + while (ArrowArrayStreamGetNext(&stream, &array, &arrow_error) == NANOARROW_OK && + array.release != NULL) { + rows += array.length; + ++arrays; + array.release(&array); + memset(&array, 0, sizeof(array)); + } + + printf("Read Partition %lu to arrow, %lu arrays, %lu rows\n", partitions, arrays, rows); + rows = 0; + arrays = 0; + + stream.release(&stream); + ++partitions; + } + + if (error) { + print_error("Failed scanning partition", error); + vx_error_free(error); + } +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "%s", usage); + return 1; + } + const char *paths = argv[1]; + + vx_session *session = vx_session_new(); + if (session == NULL) { + fprintf(stderr, "Failed to create Vortex session\n"); + return -1; + } + + vx_data_source_options ds_options = {.paths = paths}; + vx_error *error = NULL; + const vx_data_source *data_source = vx_data_source_new(session, &ds_options, &error); + if (data_source == NULL) { + print_error("Failed to create data source", error); + // Returned errors are owned and need to be freed + vx_error_free(error); + vx_session_free(session); + return 1; + } + + vx_scan *scan = vx_data_source_scan(data_source, NULL, NULL, &error); + if (scan == NULL) { + print_error("Failed to create scan", error); + vx_error_free(error); + vx_data_source_free(data_source); + vx_session_free(session); + return 1; + } + + execute_scan(session, scan); + + vx_scan_free(scan); + vx_data_source_free(data_source); + vx_session_free(session); +} diff --git a/vortex-ffi/examples/write_sample.c b/vortex-ffi/examples/write_sample.c new file mode 100644 index 00000000000..441fdea3c2c --- /dev/null +++ b/vortex-ffi/examples/write_sample.c @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: CC-BY-4.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +#include "vortex.h" +#include +#include +#include + +#define SAMPLE_ROWS 200 + +const char *usage = "Write a sample 200 rows .vortex file\n" + "Usage: write_sample \n"; + +// StructArray { age=u8, height=u16? } +const vx_dtype *sample_dtype(void) { + vx_struct_fields_builder *builder = vx_struct_fields_builder_new(); + + const char *age = "age"; + const vx_string *age_name = vx_string_new(age, strlen(age)); + const vx_dtype *age_type = vx_dtype_new_primitive(PTYPE_U8, false); + vx_struct_fields_builder_add_field(builder, age_name, age_type); + + const char *height = "height"; + const vx_string *height_name = vx_string_new(height, strlen(height)); + const vx_dtype *height_type = vx_dtype_new_primitive(PTYPE_U16, true); + vx_struct_fields_builder_add_field(builder, height_name, height_type); + + vx_struct_fields *fields = vx_struct_fields_builder_finalize(builder); + return vx_dtype_new_struct(fields, false); +} + +void print_error(const char *what, const vx_error *error) { + const vx_string *str = vx_error_get_message(error); + fprintf(stderr, "%s: %.*s\n", what, (int)vx_string_len(str), vx_string_ptr(str)); +} + +const vx_array *sample_array(void) { + vx_validity validity = {.type = VX_VALIDITY_NON_NULLABLE}; + vx_struct_column_builder *builder = vx_struct_column_builder_new(&validity, SAMPLE_ROWS); + + uint8_t age_buffer[SAMPLE_ROWS]; + uint16_t height_buffer[SAMPLE_ROWS]; + for (uint8_t i = 0; i < SAMPLE_ROWS; ++i) { + age_buffer[i] = i; + height_buffer[i] = rand() % (i + 1); + } + + vx_error *error = NULL; + const vx_array *age_array = vx_array_new_primitive(PTYPE_U8, age_buffer, SAMPLE_ROWS, &validity, &error); + if (error != NULL) { + print_error("Error creating age array", error); + vx_error_free(error); + vx_struct_column_builder_free(builder); + return NULL; + } + + vx_struct_column_builder_add_field(builder, "age", age_array, &error); + vx_array_free(age_array); + if (error != NULL) { + print_error("Error adding age array field to root array", error); + vx_error_free(error); + vx_struct_column_builder_free(builder); + return NULL; + } + + validity.type = VX_VALIDITY_ALL_VALID; + const vx_array *height_array = + vx_array_new_primitive(PTYPE_U16, height_buffer, SAMPLE_ROWS, &validity, &error); + if (error != NULL) { + print_error("Error adding age array field to root array", error); + vx_error_free(error); + vx_struct_column_builder_free(builder); + return NULL; + } + + vx_struct_column_builder_add_field(builder, "height", height_array, &error); + vx_array_free(height_array); + if (error != NULL) { + print_error("Error adding height array field to root array", error); + vx_error_free(error); + vx_struct_column_builder_free(builder); + return NULL; + } + + const vx_array *array = vx_struct_column_builder_finalize(builder, &error); + if (error != NULL) { + print_error("Error creating struct array", error); + vx_error_free(error); + return NULL; + } + + return array; +} + +int main(int argc, char *argv[]) { + if (argc != 2) { + fprintf(stderr, "%s", usage); + return 1; + } + const char *output = argv[1]; + + vx_session *const session = vx_session_new(); + if (session == NULL) { + fprintf(stderr, "Failed to create Vortex session\n"); + return 1; + } + + const vx_dtype *dtype = sample_dtype(); + + vx_error *error = NULL; + vx_array_sink *sink = vx_array_sink_open_file(session, output, dtype, &error); + + vx_dtype_free(dtype); + if (error != NULL) { + vx_session_free(session); + return 1; + } + + const vx_array *array = sample_array(); + if (array == NULL) { + // We already have an error, so we can ignore a potential error + // from this operation + vx_array_sink_close(sink, &error); + vx_session_free(session); + return 1; + } + + vx_array_sink_push(sink, array, &error); + if (error != NULL) { + vx_array_sink_close(sink, &error); + vx_session_free(session); + return 1; + } + vx_array_free(array); + + vx_array_sink_close(sink, &error); + if (error != NULL) { + print_error("Error closing output sink", error); + vx_error_free(error); + } + + vx_session_free(session); +} diff --git a/vortex-ffi/src/data_source.rs b/vortex-ffi/src/data_source.rs index 2055e697209..33ebe937b3a 100644 --- a/vortex-ffi/src/data_source.rs +++ b/vortex-ffi/src/data_source.rs @@ -20,6 +20,8 @@ use crate::RUNTIME; use crate::dtype::vx_dtype; use crate::error::try_or; use crate::error::vx_error; +use crate::scan::vx_estimate; +use crate::scan::vx_estimate_type; use crate::session::vx_session; use crate::to_string; @@ -106,39 +108,24 @@ pub unsafe extern "C-unwind" fn vx_data_source_dtype(ds: *const vx_data_source) vx_dtype::new_ref(vx_data_source::as_ref(ds).dtype()) } -#[repr(C)] -#[cfg_attr(test, derive(PartialEq, Debug))] -enum vx_cardinality { - VX_CARD_UNKNOWN = 0, - VX_CARD_ESTIMATE = 1, - VX_CARD_MAXIMUM = 2, -} - -#[repr(C)] -pub struct vx_data_source_row_count { - cardinality: vx_cardinality, - /// Set only when "cardinality" is not VX_CARD_UNKNOWN - rows: u64, -} - /// Write data source's row count estimate into "row_count". #[unsafe(no_mangle)] pub unsafe extern "C-unwind" fn vx_data_source_get_row_count( ds: *const vx_data_source, - row_count: *mut vx_data_source_row_count, + row_count: *mut vx_estimate, ) { let rc = unsafe { &mut *row_count }; match vx_data_source::as_ref(ds).row_count() { Some(Exact(rows)) => { - rc.cardinality = vx_cardinality::VX_CARD_MAXIMUM; - rc.rows = rows; + rc.r#type = vx_estimate_type::VX_ESTIMATE_EXACT; + rc.estimate = rows; } Some(Inexact(rows)) => { - rc.cardinality = vx_cardinality::VX_CARD_ESTIMATE; - rc.rows = rows; + rc.r#type = vx_estimate_type::VX_ESTIMATE_INEXACT; + rc.estimate = rows; } None => { - rc.cardinality = vx_cardinality::VX_CARD_UNKNOWN; + rc.r#type = vx_estimate_type::VX_ESTIMATE_UNKNOWN; } } } @@ -152,14 +139,14 @@ mod tests { use std::ffi::CString; use std::ptr; - use crate::data_source::vx_cardinality; use crate::data_source::vx_data_source_dtype; use crate::data_source::vx_data_source_free; use crate::data_source::vx_data_source_get_row_count; use crate::data_source::vx_data_source_new; use crate::data_source::vx_data_source_options; - use crate::data_source::vx_data_source_row_count; use crate::dtype::vx_dtype; + use crate::scan::vx_estimate; + use crate::scan::vx_estimate_type; use crate::session::vx_session_free; use crate::session::vx_session_new; use crate::tests::SAMPLE_ROWS; @@ -221,13 +208,10 @@ mod tests { let dtype = vx_dtype::as_ref(vx_data_source_dtype(ds)); assert_eq!(dtype, struct_array.dtype()); - let mut row_count = vx_data_source_row_count { - cardinality: vx_cardinality::VX_CARD_UNKNOWN, - rows: 0, - }; + let mut row_count = vx_estimate::default(); vx_data_source_get_row_count(ds, &raw mut row_count); - assert_eq!(row_count.cardinality, vx_cardinality::VX_CARD_MAXIMUM); - assert_eq!(row_count.rows, SAMPLE_ROWS as u64); + assert_eq!(row_count.r#type, vx_estimate_type::VX_ESTIMATE_EXACT); + assert_eq!(row_count.estimate, SAMPLE_ROWS as u64); vx_data_source_free(ds); vx_session_free(session); diff --git a/vortex-ffi/src/scan.rs b/vortex-ffi/src/scan.rs index fdae46aca4f..7d728aa135c 100644 --- a/vortex-ffi/src/scan.rs +++ b/vortex-ffi/src/scan.rs @@ -49,7 +49,11 @@ pub enum VxScan { Started(PartitionStream), Finished, } -crate::box_wrapper!(VxScan, vx_scan); +crate::box_wrapper!( + /// A scan is a single traversal of a data source with projections and + /// filters. A scan can be consumed only once. + VxScan, + vx_scan); pub enum VxPartitionScan { Pending(Box), @@ -114,8 +118,10 @@ pub struct vx_scan_options { } #[repr(C)] +#[cfg_attr(test, derive(Debug, PartialEq, Eq, Default))] pub enum vx_estimate_type { /// No estimate is available. + #[cfg_attr(test, default)] VX_ESTIMATE_UNKNOWN = 0, /// The value in vx_estimate.estimate is exact. VX_ESTIMATE_EXACT = 1, @@ -126,10 +132,11 @@ pub enum vx_estimate_type { /// Used for estimating number of partitions in a data source or number of rows /// in a partition. #[repr(C)] +#[cfg_attr(test, derive(Default))] pub struct vx_estimate { - r#type: vx_estimate_type, + pub r#type: vx_estimate_type, /// Set only when "type" is not VX_ESTIMATE_UNKNOWN. - estimate: u64, + pub estimate: u64, } fn scan_request(opts: *const vx_scan_options) -> VortexResult { @@ -325,15 +332,15 @@ pub unsafe extern "C-unwind" fn vx_partition_row_count( }) } -// Scan partition to ArrowArrayStream. -// Consumes partition fully: subsequent calls to vx_partition_scan_arrow or -// vx_partition_next are undefined behaviour. -// This call blocks current thread until underlying stream is fully consumed. -// -// Caller must not free partition after calling this function. -// -// On success, sets "stream" and returns 0. -// On error, sets "err" and returns 1, freeing the partition. +/// Scan partition to ArrowArrayStream. +/// Consumes partition fully: subsequent calls to vx_partition_scan_arrow or +/// vx_partition_next are undefined behaviour. +/// This call blocks current thread until underlying stream is fully consumed. +/// +/// Caller must not free partition after calling this function. +/// +/// On success, sets "stream" and returns 0. +/// On error, sets "err" and returns 1, freeing the partition. #[unsafe(no_mangle)] pub unsafe extern "C-unwind" fn vx_partition_scan_arrow( session: *const vx_session, diff --git a/vortex-ffi/test/CMakeLists.txt b/vortex-ffi/test/CMakeLists.txt index 6ec4c130800..be0288ef954 100644 --- a/vortex-ffi/test/CMakeLists.txt +++ b/vortex-ffi/test/CMakeLists.txt @@ -1,8 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright the Vortex contributors include(CTest) -include(FetchContent) - FetchContent_Declare( Catch GIT_REPOSITORY https://github.com/catchorg/Catch2.git @@ -13,13 +11,6 @@ include(Catch) # https://github.com/catchorg/Catch2/issues/1833 target_compile_definitions(Catch2 PRIVATE CATCH_CONFIG_NO_POSIX_SIGNALS) -FetchContent_Declare( - Nanoarrow - GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow - GIT_TAG apache-arrow-nanoarrow-0.8.0 -) -FetchContent_MakeAvailable(Nanoarrow) - file(GLOB TEST_FILES "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp") message(NOTICE "Test files ${TEST_FILES}") add_executable(vortex_ffi_test ${TEST_FILES}) diff --git a/vortex-ffi/test/scan.cpp b/vortex-ffi/test/scan.cpp index 93a476ce614..10d8716c90b 100644 --- a/vortex-ffi/test/scan.cpp +++ b/vortex-ffi/test/scan.cpp @@ -1,6 +1,5 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#include #include #include #include @@ -252,11 +251,11 @@ TEST_CASE("Write file and read dtypes", "[datasource]") { vx_data_source_free(ds); }; - vx_data_source_row_count row_count = {}; + vx_estimate row_count; vx_data_source_get_row_count(ds, &row_count); - CHECK(row_count.cardinality == VX_CARD_MAXIMUM); - CHECK(row_count.rows == SAMPLE_ROWS); + CHECK(row_count.type == VX_ESTIMATE_EXACT); + CHECK(row_count.estimate == SAMPLE_ROWS); const vx_dtype *data_source_dtype = vx_data_source_dtype(ds); REQUIRE(vx_dtype_get_variant(data_source_dtype) == DTYPE_STRUCT); diff --git a/vortex-file/src/multi/mod.rs b/vortex-file/src/multi/mod.rs index 5e7f5466921..0b97376c362 100644 --- a/vortex-file/src/multi/mod.rs +++ b/vortex-file/src/multi/mod.rs @@ -77,9 +77,20 @@ impl MultiFileDataSource { /// /// The glob path should be relative to the filesystem's base URL. Pass `None` for the /// filesystem to use the local filesystem (auto-created in [`Self::build`]). + /// + /// Relative paths are resolved against the process working directory. pub fn with_glob(mut self, glob: impl Into, fs: Option) -> Self { - let glob_str = glob.into().trim_start_matches('/').to_string(); - self.glob_sources.push((glob_str, fs)); + let glob = glob.into(); + let glob = if fs.is_none() && std::path::Path::new(&glob).is_relative() { + std::env::current_dir() + .map(|cwd| cwd.join(&glob).to_string_lossy().into_owned()) + .unwrap_or(glob) + .trim_start_matches('/') + .to_string() + } else { + glob.trim_start_matches('/').to_string() + }; + self.glob_sources.push((glob, fs)); self }