|
| 1 | +// SPDX-License-Identifier: Apache-2.0 |
| 2 | +// SPDX-FileCopyrightText: Copyright The Lance Authors |
| 3 | + |
| 4 | +//! Fragment writer C API: write Arrow data to local fragment files without committing. |
| 5 | +//! |
| 6 | +//! Designed for embedded / robotics C++ pipelines where sensor data is ingested |
| 7 | +//! at high frequency on edge devices. The C++ process writes Lance fragment files |
| 8 | +//! locally with minimal overhead (no manifest, no coordination). A separate Rust |
| 9 | +//! finalizer process later reads the file footers, reconstructs fragment metadata, |
| 10 | +//! and commits them into a dataset on a remote data lake (S3, GCS, etc.). |
| 11 | +//! |
| 12 | +//! # Two-process workflow |
| 13 | +//! |
| 14 | +//! **1. Writer process (C/C++ on edge device):** |
| 15 | +//! ```c |
| 16 | +//! // Stream sensor batches into local fragment files. |
| 17 | +//! int32_t rc = lance_write_fragments( |
| 18 | +//! "file:///data/staging/robot.lance", &schema, &stream, NULL); |
| 19 | +//! ``` |
| 20 | +//! |
| 21 | +//! **2. Finalizer process (Rust, runs periodically or on sync):** |
| 22 | +//! ```text |
| 23 | +//! // Scan data/*.lance files, reconstruct Fragment metadata from file footers, |
| 24 | +//! // then commit via CommitBuilder to publish to the data lake. |
| 25 | +//! ``` |
| 26 | +
|
| 27 | +use std::ffi::c_char; |
| 28 | +use std::sync::Arc; |
| 29 | + |
| 30 | +use arrow::ffi::FFI_ArrowSchema; |
| 31 | +use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; |
| 32 | +use arrow::record_batch::RecordBatchReader; |
| 33 | +use arrow_schema::Schema as ArrowSchema; |
| 34 | +use lance::dataset::{InsertBuilder, WriteParams}; |
| 35 | +use lance_core::Result; |
| 36 | +use lance_io::object_store::{ObjectStoreParams, StorageOptionsAccessor}; |
| 37 | + |
| 38 | +use crate::error::ffi_try; |
| 39 | +use crate::helpers; |
| 40 | +use crate::runtime::block_on; |
| 41 | + |
| 42 | +/// Write an Arrow record batch stream to fragment files at `uri`. |
| 43 | +/// |
| 44 | +/// The data is written but **not committed** — no dataset manifest is created |
| 45 | +/// or updated. The written `.lance` files under `<uri>/data/` contain full |
| 46 | +/// metadata in their footers (schema with field IDs, row counts, format version). |
| 47 | +/// A Rust finalizer can reconstruct `Fragment` metadata by reading these footers |
| 48 | +/// and commit via `CommitBuilder`. |
| 49 | +/// |
| 50 | +/// - `uri`: Directory URI where fragment files are written (`file://`, `s3://`, etc.) |
| 51 | +/// - `schema`: Required Arrow schema. The stream's schema must match; the call |
| 52 | +/// fails fast with `LANCE_ERR_INVALID_ARGUMENT` on mismatch. |
| 53 | +/// - `stream`: Arrow C Data Interface stream consumed by this call. The caller |
| 54 | +/// must not use the stream after this function returns. |
| 55 | +/// - `storage_opts`: NULL-terminated key-value pairs `["key","val",NULL]`, or NULL. |
| 56 | +/// |
| 57 | +/// Returns 0 on success, -1 on error. |
| 58 | +#[unsafe(no_mangle)] |
| 59 | +pub unsafe extern "C" fn lance_write_fragments( |
| 60 | + uri: *const c_char, |
| 61 | + schema: *const FFI_ArrowSchema, |
| 62 | + stream: *mut FFI_ArrowArrayStream, |
| 63 | + storage_opts: *const *const c_char, |
| 64 | +) -> i32 { |
| 65 | + ffi_try!( |
| 66 | + unsafe { write_fragments_inner(uri, schema, stream, storage_opts) }, |
| 67 | + neg |
| 68 | + ) |
| 69 | +} |
| 70 | + |
| 71 | +unsafe fn write_fragments_inner( |
| 72 | + uri: *const c_char, |
| 73 | + schema: *const FFI_ArrowSchema, |
| 74 | + stream: *mut FFI_ArrowArrayStream, |
| 75 | + storage_opts: *const *const c_char, |
| 76 | +) -> Result<i32> { |
| 77 | + if uri.is_null() || schema.is_null() || stream.is_null() { |
| 78 | + return Err(lance_core::Error::InvalidInput { |
| 79 | + source: "uri, schema, and stream must not be NULL".into(), |
| 80 | + location: snafu::location!(), |
| 81 | + }); |
| 82 | + } |
| 83 | + |
| 84 | + let uri_str = unsafe { helpers::parse_c_string(uri)? }.ok_or_else(|| { |
| 85 | + lance_core::Error::InvalidInput { |
| 86 | + source: "uri must not be empty".into(), |
| 87 | + location: snafu::location!(), |
| 88 | + } |
| 89 | + })?; |
| 90 | + |
| 91 | + // Import the caller-provided schema from the Arrow C Data Interface. |
| 92 | + let expected_schema = ArrowSchema::try_from(unsafe { &*schema }).map_err(|e| { |
| 93 | + lance_core::Error::InvalidInput { |
| 94 | + source: format!("invalid schema: {e}").into(), |
| 95 | + location: snafu::location!(), |
| 96 | + } |
| 97 | + })?; |
| 98 | + |
| 99 | + let opts = unsafe { helpers::parse_storage_options(storage_opts)? }; |
| 100 | + |
| 101 | + // Consume the C stream into an Arrow RecordBatch reader. |
| 102 | + let reader = unsafe { ArrowArrayStreamReader::from_raw(stream) }.map_err(|e| { |
| 103 | + lance_core::Error::InvalidInput { |
| 104 | + source: e.to_string().into(), |
| 105 | + location: snafu::location!(), |
| 106 | + } |
| 107 | + })?; |
| 108 | + |
| 109 | + // Fail fast: compare the stream schema against the caller-provided schema. |
| 110 | + let stream_schema = reader.schema(); |
| 111 | + if stream_schema.fields() != expected_schema.fields() { |
| 112 | + return Err(lance_core::Error::InvalidInput { |
| 113 | + source: format!( |
| 114 | + "stream schema does not match the provided schema.\n expected: {expected_schema}\n got: {stream_schema}" |
| 115 | + ) |
| 116 | + .into(), |
| 117 | + location: snafu::location!(), |
| 118 | + }); |
| 119 | + } |
| 120 | + |
| 121 | + let mut params = WriteParams::default(); |
| 122 | + if !opts.is_empty() { |
| 123 | + params.store_params = Some(ObjectStoreParams { |
| 124 | + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( |
| 125 | + opts, |
| 126 | + ))), |
| 127 | + ..ObjectStoreParams::default() |
| 128 | + }); |
| 129 | + } |
| 130 | + |
| 131 | + // Write fragment data files. The Transaction result is discarded — |
| 132 | + // the finalizer reconstructs Fragment metadata from the file footers. |
| 133 | + let _transaction = block_on( |
| 134 | + InsertBuilder::new(uri_str) |
| 135 | + .with_params(¶ms) |
| 136 | + .execute_uncommitted_stream(reader), |
| 137 | + )?; |
| 138 | + |
| 139 | + Ok(0) |
| 140 | +} |
0 commit comments