diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index ee8de347ad7..aa56d54de95 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -631,6 +631,7 @@ fn create_dataset<'local>( &storage_options_obj, &initial_bases, &target_bases, + &JObject::null(), // allow_external_blob_outside_bases not used for Dataset.write() )?; // Set up namespace commit handler and storage options provider if namespace is provided diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 05b72c58a51..3b2a7ba6b22 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -89,15 +89,16 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( dataset_uri: JString, arrow_array_addr: jlong, arrow_schema_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map - namespace_obj: JObject, // LanceNamespace (can be null) - table_id_obj: JObject, // List (can be null) + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + namespace_obj: JObject, // LanceNamespace (can be null) + table_id_obj: JObject, // List (can be null) + allow_external_blob_outside_bases: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -115,6 +116,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( storage_options_obj, namespace_obj, table_id_obj, + allow_external_blob_outside_bases, ), JObject::default() ) @@ -126,15 +128,16 @@ fn 
inner_create_with_ffi_array<'local>( dataset_uri: JString, arrow_array_addr: jlong, arrow_schema_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map - namespace_obj: JObject, // LanceNamespace (can be null) - table_id_obj: JObject, // List (can be null) + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + namespace_obj: JObject, // LanceNamespace (can be null) + table_id_obj: JObject, // List (can be null) + allow_external_blob_outside_bases: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -161,6 +164,7 @@ fn inner_create_with_ffi_array<'local>( storage_options_obj, namespace_obj, table_id_obj, + allow_external_blob_outside_bases, reader, ) } @@ -171,15 +175,16 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( _obj: JObject, dataset_uri: JString, arrow_array_stream_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map - namespace_obj: JObject, // LanceNamespace (can be null) - table_id_obj: JObject, // List (can be null) + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // 
Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + namespace_obj: JObject, // LanceNamespace (can be null) + table_id_obj: JObject, // List (can be null) + allow_external_blob_outside_bases: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -196,6 +201,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( storage_options_obj, namespace_obj, table_id_obj, + allow_external_blob_outside_bases, ), JObject::null() ) @@ -206,15 +212,16 @@ fn inner_create_with_ffi_stream<'local>( env: &mut JNIEnv<'local>, dataset_uri: JString, arrow_array_stream_addr: jlong, - max_rows_per_file: JObject, // Optional - max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map - namespace_obj: JObject, // LanceNamespace (can be null) - table_id_obj: JObject, // List (can be null) + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + namespace_obj: JObject, // LanceNamespace (can be null) + table_id_obj: JObject, // List (can be null) + allow_external_blob_outside_bases: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -231,6 +238,7 @@ fn inner_create_with_ffi_stream<'local>( storage_options_obj, namespace_obj, table_id_obj, + allow_external_blob_outside_bases, reader, ) } @@ -239,15 +247,16 @@ fn inner_create_with_ffi_stream<'local>( fn create_fragment<'a>( env: &mut JNIEnv<'a>, dataset_uri: JString, - max_rows_per_file: JObject, // Optional - 
max_rows_per_group: JObject, // Optional - max_bytes_per_file: JObject, // Optional - mode: JObject, // Optional - enable_stable_row_ids: JObject, // Optional - data_storage_version: JObject, // Optional - storage_options_obj: JObject, // Map - namespace_obj: JObject, // LanceNamespace (can be null) - table_id_obj: JObject, // List (can be null) + max_rows_per_file: JObject, // Optional + max_rows_per_group: JObject, // Optional + max_bytes_per_file: JObject, // Optional + mode: JObject, // Optional + enable_stable_row_ids: JObject, // Optional + data_storage_version: JObject, // Optional + storage_options_obj: JObject, // Map + namespace_obj: JObject, // LanceNamespace (can be null) + table_id_obj: JObject, // List (can be null) + allow_external_blob_outside_bases: JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -264,6 +273,7 @@ fn create_fragment<'a>( &storage_options_obj, &JObject::null(), // not used when creating fragments &JObject::null(), // not used when creating fragments + &allow_external_blob_outside_bases, )?; // Set up storage options provider if namespace is provided diff --git a/java/lance-jni/src/utils.rs b/java/lance-jni/src/utils.rs index 113769dd17f..3d7ef2ebdaf 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -49,8 +49,9 @@ pub fn extract_write_params( data_storage_version: &JObject, enable_v2_manifest_paths: Option<&JObject>, storage_options_obj: &JObject, - initial_bases: &JObject, // Optional - target_bases: &JObject, // Optional + initial_bases: &JObject, // Optional + target_bases: &JObject, // Optional + allow_external_blob_outside_bases: &JObject, // Optional ) -> Result { let mut write_params = WriteParams::default(); @@ -97,6 +98,10 @@ pub fn extract_write_params( write_params.target_base_names_or_paths = Some(names); } + if let Some(allow) = env.get_boolean_opt(allow_external_blob_outside_bases)? 
{ + write_params.allow_external_blob_outside_bases = allow; + } + // Create storage options accessor from static storage_options let accessor = if storage_options.is_empty() { None diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index 367e03b58d4..89a61560561 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -279,7 +279,8 @@ static List create( params.getDataStorageVersion(), params.getStorageOptions(), namespaceClient, - tableId); + tableId, + params.getAllowExternalBlobOutsideBases()); } } @@ -304,7 +305,8 @@ static List create( params.getDataStorageVersion(), params.getStorageOptions(), namespaceClient, - tableId); + tableId, + params.getAllowExternalBlobOutsideBases()); } /** Create a fragment from the given arrow array and schema. */ @@ -320,7 +322,8 @@ private static native List createWithFfiArray( Optional dataStorageVersion, Map storageOptions, LanceNamespace namespaceClient, - List tableId); + List tableId, + Optional allowExternalBlobOutsideBases); /** Create a fragment from the given arrow stream. 
*/ private static native List createWithFfiStream( @@ -334,5 +337,6 @@ private static native List createWithFfiStream( Optional dataStorageVersion, Map storageOptions, LanceNamespace namespaceClient, - List tableId); + List tableId, + Optional allowExternalBlobOutsideBases); } diff --git a/java/src/main/java/org/lance/WriteParams.java b/java/src/main/java/org/lance/WriteParams.java index 8a04a5d5b96..6f5d4545ca9 100644 --- a/java/src/main/java/org/lance/WriteParams.java +++ b/java/src/main/java/org/lance/WriteParams.java @@ -40,6 +40,7 @@ public enum WriteMode { private Map storageOptions = new HashMap<>(); private final Optional> initialBases; private final Optional> targetBases; + private final Optional allowExternalBlobOutsideBases; private WriteParams( Optional maxRowsPerFile, @@ -51,7 +52,8 @@ private WriteParams( Optional enableV2ManifestPaths, Map storageOptions, Optional> initialBases, - Optional> targetBases) { + Optional> targetBases, + Optional allowExternalBlobOutsideBases) { this.maxRowsPerFile = maxRowsPerFile; this.maxRowsPerGroup = maxRowsPerGroup; this.maxBytesPerFile = maxBytesPerFile; @@ -62,6 +64,7 @@ private WriteParams( this.storageOptions = storageOptions; this.initialBases = initialBases; this.targetBases = targetBases; + this.allowExternalBlobOutsideBases = allowExternalBlobOutsideBases; } public Optional getMaxRowsPerFile() { @@ -109,6 +112,18 @@ public Optional> getTargetBases() { return targetBases; } + /** + * Get whether external blob URIs outside registered bases are allowed. + * + *
<p>
When true, blob v2 columns can reference external URIs that are not under any registered + * base path. The URI is stored as an absolute external reference with base_id=0. + * + * @return Optional containing the setting, or empty if not set + */ + public Optional getAllowExternalBlobOutsideBases() { + return allowExternalBlobOutsideBases; + } + @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -132,6 +147,7 @@ public static class Builder { private Map storageOptions = new HashMap<>(); private Optional> initialBases = Optional.empty(); private Optional> targetBases = Optional.empty(); + private Optional allowExternalBlobOutsideBases = Optional.empty(); public Builder withMaxRowsPerFile(int maxRowsPerFile) { this.maxRowsPerFile = Optional.of(maxRowsPerFile); @@ -183,6 +199,21 @@ public Builder withTargetBases(List targetBases) { return this; } + /** + * Allow external blob URIs outside registered bases. + * + *
<p>
When true, blob v2 columns can reference external URIs (e.g. pointing to blob files in + * another Lance dataset) that are not under any registered base path. The URI is stored as an + * absolute external reference with base_id=0. + * + * @param allow true to allow external blob URIs outside bases + * @return this builder + */ + public Builder withAllowExternalBlobOutsideBases(boolean allow) { + this.allowExternalBlobOutsideBases = Optional.of(allow); + return this; + } + public WriteParams build() { return new WriteParams( maxRowsPerFile, @@ -194,7 +225,8 @@ public WriteParams build() { enableV2ManifestPaths, storageOptions, initialBases, - targetBases); + targetBases, + allowExternalBlobOutsideBases); } } }