Skip to content

Commit 54196a2

Browse files
beinan and claude authored
feat(java): add allowExternalBlobOutsideBases to WriteParams (#6330)
## Summary

Add support for writing blob v2 columns with external URI references that are outside registered base paths. This enables use cases like INSERT INTO SELECT across Lance tables where the target table stores external blob references pointing to the source table's blob files instead of copying the actual blob bytes.

## Changes

- **WriteParams.java**: Add `allowExternalBlobOutsideBases` `Optional<Boolean>` field, getter, and builder method
- **Fragment.java**: Pass the new field through `createWithFfiArray` and `createWithFfiStream` native methods
- **fragment.rs (JNI)**: Thread the new `Optional<Boolean>` parameter through all fragment creation functions to `extract_write_params`
- **utils.rs (JNI)**: Parse the new parameter and set `allow_external_blob_outside_bases` on Rust `WriteParams`
- **blocking_dataset.rs (JNI)**: Pass `JObject::null()` for the new param in `Dataset.write()` path (not needed there)

## Context

This is a prerequisite for lance-spark blob JOIN support (lance-format/lance-spark#355). When blob data flows through Spark's shuffle during JOIN + INSERT INTO, the target table needs to write external blob references pointing to the source table's physical blob files. The Rust `BlobPreprocessor` already supports this via `allow_external_blob_outside_bases`, but the Java SDK had no way to set it.

Ref: #6321, #6322

## Test plan

- [x] Rust JNI code compiles cleanly (no errors in changed files)
- [ ] Java unit tests (CI)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 4f6e3c1 commit 54196a2

5 files changed

Lines changed: 105 additions & 53 deletions

File tree

java/lance-jni/src/blocking_dataset.rs

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -631,6 +631,7 @@ fn create_dataset<'local>(
631631
&storage_options_obj,
632632
&initial_bases,
633633
&target_bases,
634+
&JObject::null(), // allow_external_blob_outside_bases not used for Dataset.write()
634635
)?;
635636

636637
// Set up namespace commit handler and storage options provider if namespace is provided

java/lance-jni/src/fragment.rs

Lines changed: 55 additions & 45 deletions
Original file line number | Diff line number | Diff line change
@@ -89,15 +89,16 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>(
8989
dataset_uri: JString,
9090
arrow_array_addr: jlong,
9191
arrow_schema_addr: jlong,
92-
max_rows_per_file: JObject, // Optional<Integer>
93-
max_rows_per_group: JObject, // Optional<Integer>
94-
max_bytes_per_file: JObject, // Optional<Long>
95-
mode: JObject, // Optional<String>
96-
enable_stable_row_ids: JObject, // Optional<Boolean>
97-
data_storage_version: JObject, // Optional<String>
98-
storage_options_obj: JObject, // Map<String, String>
99-
namespace_obj: JObject, // LanceNamespace (can be null)
100-
table_id_obj: JObject, // List<String> (can be null)
92+
max_rows_per_file: JObject, // Optional<Integer>
93+
max_rows_per_group: JObject, // Optional<Integer>
94+
max_bytes_per_file: JObject, // Optional<Long>
95+
mode: JObject, // Optional<String>
96+
enable_stable_row_ids: JObject, // Optional<Boolean>
97+
data_storage_version: JObject, // Optional<String>
98+
storage_options_obj: JObject, // Map<String, String>
99+
namespace_obj: JObject, // LanceNamespace (can be null)
100+
table_id_obj: JObject, // List<String> (can be null)
101+
allow_external_blob_outside_bases: JObject, // Optional<Boolean>
101102
) -> JObject<'local> {
102103
ok_or_throw_with_return!(
103104
env,
@@ -115,6 +116,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>(
115116
storage_options_obj,
116117
namespace_obj,
117118
table_id_obj,
119+
allow_external_blob_outside_bases,
118120
),
119121
JObject::default()
120122
)
@@ -126,15 +128,16 @@ fn inner_create_with_ffi_array<'local>(
126128
dataset_uri: JString,
127129
arrow_array_addr: jlong,
128130
arrow_schema_addr: jlong,
129-
max_rows_per_file: JObject, // Optional<Integer>
130-
max_rows_per_group: JObject, // Optional<Integer>
131-
max_bytes_per_file: JObject, // Optional<Long>
132-
mode: JObject, // Optional<String>
133-
enable_stable_row_ids: JObject, // Optional<Boolean>
134-
data_storage_version: JObject, // Optional<String>
135-
storage_options_obj: JObject, // Map<String, String>
136-
namespace_obj: JObject, // LanceNamespace (can be null)
137-
table_id_obj: JObject, // List<String> (can be null)
131+
max_rows_per_file: JObject, // Optional<Integer>
132+
max_rows_per_group: JObject, // Optional<Integer>
133+
max_bytes_per_file: JObject, // Optional<Long>
134+
mode: JObject, // Optional<String>
135+
enable_stable_row_ids: JObject, // Optional<Boolean>
136+
data_storage_version: JObject, // Optional<String>
137+
storage_options_obj: JObject, // Map<String, String>
138+
namespace_obj: JObject, // LanceNamespace (can be null)
139+
table_id_obj: JObject, // List<String> (can be null)
140+
allow_external_blob_outside_bases: JObject, // Optional<Boolean>
138141
) -> Result<JObject<'local>> {
139142
let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray;
140143
let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema;
@@ -161,6 +164,7 @@ fn inner_create_with_ffi_array<'local>(
161164
storage_options_obj,
162165
namespace_obj,
163166
table_id_obj,
167+
allow_external_blob_outside_bases,
164168
reader,
165169
)
166170
}
@@ -171,15 +175,16 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>(
171175
_obj: JObject,
172176
dataset_uri: JString,
173177
arrow_array_stream_addr: jlong,
174-
max_rows_per_file: JObject, // Optional<Integer>
175-
max_rows_per_group: JObject, // Optional<Integer>
176-
max_bytes_per_file: JObject, // Optional<Long>
177-
mode: JObject, // Optional<String>
178-
enable_stable_row_ids: JObject, // Optional<Boolean>
179-
data_storage_version: JObject, // Optional<String>
180-
storage_options_obj: JObject, // Map<String, String>
181-
namespace_obj: JObject, // LanceNamespace (can be null)
182-
table_id_obj: JObject, // List<String> (can be null)
178+
max_rows_per_file: JObject, // Optional<Integer>
179+
max_rows_per_group: JObject, // Optional<Integer>
180+
max_bytes_per_file: JObject, // Optional<Long>
181+
mode: JObject, // Optional<String>
182+
enable_stable_row_ids: JObject, // Optional<Boolean>
183+
data_storage_version: JObject, // Optional<String>
184+
storage_options_obj: JObject, // Map<String, String>
185+
namespace_obj: JObject, // LanceNamespace (can be null)
186+
table_id_obj: JObject, // List<String> (can be null)
187+
allow_external_blob_outside_bases: JObject, // Optional<Boolean>
183188
) -> JObject<'a> {
184189
ok_or_throw_with_return!(
185190
env,
@@ -196,6 +201,7 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>(
196201
storage_options_obj,
197202
namespace_obj,
198203
table_id_obj,
204+
allow_external_blob_outside_bases,
199205
),
200206
JObject::null()
201207
)
@@ -206,15 +212,16 @@ fn inner_create_with_ffi_stream<'local>(
206212
env: &mut JNIEnv<'local>,
207213
dataset_uri: JString,
208214
arrow_array_stream_addr: jlong,
209-
max_rows_per_file: JObject, // Optional<Integer>
210-
max_rows_per_group: JObject, // Optional<Integer>
211-
max_bytes_per_file: JObject, // Optional<Long>
212-
mode: JObject, // Optional<String>
213-
enable_stable_row_ids: JObject, // Optional<Boolean>
214-
data_storage_version: JObject, // Optional<String>
215-
storage_options_obj: JObject, // Map<String, String>
216-
namespace_obj: JObject, // LanceNamespace (can be null)
217-
table_id_obj: JObject, // List<String> (can be null)
215+
max_rows_per_file: JObject, // Optional<Integer>
216+
max_rows_per_group: JObject, // Optional<Integer>
217+
max_bytes_per_file: JObject, // Optional<Long>
218+
mode: JObject, // Optional<String>
219+
enable_stable_row_ids: JObject, // Optional<Boolean>
220+
data_storage_version: JObject, // Optional<String>
221+
storage_options_obj: JObject, // Map<String, String>
222+
namespace_obj: JObject, // LanceNamespace (can be null)
223+
table_id_obj: JObject, // List<String> (can be null)
224+
allow_external_blob_outside_bases: JObject, // Optional<Boolean>
218225
) -> Result<JObject<'local>> {
219226
let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream;
220227
let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?;
@@ -231,6 +238,7 @@ fn inner_create_with_ffi_stream<'local>(
231238
storage_options_obj,
232239
namespace_obj,
233240
table_id_obj,
241+
allow_external_blob_outside_bases,
234242
reader,
235243
)
236244
}
@@ -239,15 +247,16 @@ fn inner_create_with_ffi_stream<'local>(
239247
fn create_fragment<'a>(
240248
env: &mut JNIEnv<'a>,
241249
dataset_uri: JString,
242-
max_rows_per_file: JObject, // Optional<Integer>
243-
max_rows_per_group: JObject, // Optional<Integer>
244-
max_bytes_per_file: JObject, // Optional<Long>
245-
mode: JObject, // Optional<String>
246-
enable_stable_row_ids: JObject, // Optional<Boolean>
247-
data_storage_version: JObject, // Optional<String>
248-
storage_options_obj: JObject, // Map<String, String>
249-
namespace_obj: JObject, // LanceNamespace (can be null)
250-
table_id_obj: JObject, // List<String> (can be null)
250+
max_rows_per_file: JObject, // Optional<Integer>
251+
max_rows_per_group: JObject, // Optional<Integer>
252+
max_bytes_per_file: JObject, // Optional<Long>
253+
mode: JObject, // Optional<String>
254+
enable_stable_row_ids: JObject, // Optional<Boolean>
255+
data_storage_version: JObject, // Optional<String>
256+
storage_options_obj: JObject, // Map<String, String>
257+
namespace_obj: JObject, // LanceNamespace (can be null)
258+
table_id_obj: JObject, // List<String> (can be null)
259+
allow_external_blob_outside_bases: JObject, // Optional<Boolean>
251260
source: impl StreamingWriteSource,
252261
) -> Result<JObject<'a>> {
253262
let path_str = dataset_uri.extract(env)?;
@@ -264,6 +273,7 @@ fn create_fragment<'a>(
264273
&storage_options_obj,
265274
&JObject::null(), // not used when creating fragments
266275
&JObject::null(), // not used when creating fragments
276+
&allow_external_blob_outside_bases,
267277
)?;
268278

269279
// Set up storage options provider if namespace is provided

java/lance-jni/src/utils.rs

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -49,8 +49,9 @@ pub fn extract_write_params(
4949
data_storage_version: &JObject,
5050
enable_v2_manifest_paths: Option<&JObject>,
5151
storage_options_obj: &JObject,
52-
initial_bases: &JObject, // Optional<BasePath>
53-
target_bases: &JObject, // Optional<String>
52+
initial_bases: &JObject, // Optional<BasePath>
53+
target_bases: &JObject, // Optional<String>
54+
allow_external_blob_outside_bases: &JObject, // Optional<Boolean>
5455
) -> Result<WriteParams> {
5556
let mut write_params = WriteParams::default();
5657

@@ -97,6 +98,10 @@ pub fn extract_write_params(
9798
write_params.target_base_names_or_paths = Some(names);
9899
}
99100

101+
if let Some(allow) = env.get_boolean_opt(allow_external_blob_outside_bases)? {
102+
write_params.allow_external_blob_outside_bases = allow;
103+
}
104+
100105
// Create storage options accessor from static storage_options
101106
let accessor = if storage_options.is_empty() {
102107
None

java/src/main/java/org/lance/Fragment.java

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -279,7 +279,8 @@ static List<FragmentMetadata> create(
279279
params.getDataStorageVersion(),
280280
params.getStorageOptions(),
281281
namespaceClient,
282-
tableId);
282+
tableId,
283+
params.getAllowExternalBlobOutsideBases());
283284
}
284285
}
285286

@@ -304,7 +305,8 @@ static List<FragmentMetadata> create(
304305
params.getDataStorageVersion(),
305306
params.getStorageOptions(),
306307
namespaceClient,
307-
tableId);
308+
tableId,
309+
params.getAllowExternalBlobOutsideBases());
308310
}
309311

310312
/** Create a fragment from the given arrow array and schema. */
@@ -320,7 +322,8 @@ private static native List<FragmentMetadata> createWithFfiArray(
320322
Optional<String> dataStorageVersion,
321323
Map<String, String> storageOptions,
322324
LanceNamespace namespaceClient,
323-
List<String> tableId);
325+
List<String> tableId,
326+
Optional<Boolean> allowExternalBlobOutsideBases);
324327

325328
/** Create a fragment from the given arrow stream. */
326329
private static native List<FragmentMetadata> createWithFfiStream(
@@ -334,5 +337,6 @@ private static native List<FragmentMetadata> createWithFfiStream(
334337
Optional<String> dataStorageVersion,
335338
Map<String, String> storageOptions,
336339
LanceNamespace namespaceClient,
337-
List<String> tableId);
340+
List<String> tableId,
341+
Optional<Boolean> allowExternalBlobOutsideBases);
338342
}

java/src/main/java/org/lance/WriteParams.java

Lines changed: 34 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -40,6 +40,7 @@ public enum WriteMode {
4040
private Map<String, String> storageOptions = new HashMap<>();
4141
private final Optional<List<BasePath>> initialBases;
4242
private final Optional<List<String>> targetBases;
43+
private final Optional<Boolean> allowExternalBlobOutsideBases;
4344

4445
private WriteParams(
4546
Optional<Integer> maxRowsPerFile,
@@ -51,7 +52,8 @@ private WriteParams(
5152
Optional<Boolean> enableV2ManifestPaths,
5253
Map<String, String> storageOptions,
5354
Optional<List<BasePath>> initialBases,
54-
Optional<List<String>> targetBases) {
55+
Optional<List<String>> targetBases,
56+
Optional<Boolean> allowExternalBlobOutsideBases) {
5557
this.maxRowsPerFile = maxRowsPerFile;
5658
this.maxRowsPerGroup = maxRowsPerGroup;
5759
this.maxBytesPerFile = maxBytesPerFile;
@@ -62,6 +64,7 @@ private WriteParams(
6264
this.storageOptions = storageOptions;
6365
this.initialBases = initialBases;
6466
this.targetBases = targetBases;
67+
this.allowExternalBlobOutsideBases = allowExternalBlobOutsideBases;
6568
}
6669

6770
public Optional<Integer> getMaxRowsPerFile() {
@@ -109,6 +112,18 @@ public Optional<List<String>> getTargetBases() {
109112
return targetBases;
110113
}
111114

115+
/**
116+
* Get whether external blob URIs outside registered bases are allowed.
117+
*
118+
* <p>When true, blob v2 columns can reference external URIs that are not under any registered
119+
* base path. The URI is stored as an absolute external reference with base_id=0.
120+
*
121+
* @return Optional containing the setting, or empty if not set
122+
*/
123+
public Optional<Boolean> getAllowExternalBlobOutsideBases() {
124+
return allowExternalBlobOutsideBases;
125+
}
126+
112127
@Override
113128
public String toString() {
114129
return MoreObjects.toStringHelper(this)
@@ -132,6 +147,7 @@ public static class Builder {
132147
private Map<String, String> storageOptions = new HashMap<>();
133148
private Optional<List<BasePath>> initialBases = Optional.empty();
134149
private Optional<List<String>> targetBases = Optional.empty();
150+
private Optional<Boolean> allowExternalBlobOutsideBases = Optional.empty();
135151

136152
public Builder withMaxRowsPerFile(int maxRowsPerFile) {
137153
this.maxRowsPerFile = Optional.of(maxRowsPerFile);
@@ -183,6 +199,21 @@ public Builder withTargetBases(List<String> targetBases) {
183199
return this;
184200
}
185201

202+
/**
203+
* Allow external blob URIs outside registered bases.
204+
*
205+
* <p>When true, blob v2 columns can reference external URIs (e.g. pointing to blob files in
206+
* another Lance dataset) that are not under any registered base path. The URI is stored as an
207+
* absolute external reference with base_id=0.
208+
*
209+
* @param allow true to allow external blob URIs outside bases
210+
* @return this builder
211+
*/
212+
public Builder withAllowExternalBlobOutsideBases(boolean allow) {
213+
this.allowExternalBlobOutsideBases = Optional.of(allow);
214+
return this;
215+
}
216+
186217
public WriteParams build() {
187218
return new WriteParams(
188219
maxRowsPerFile,
@@ -194,7 +225,8 @@ public WriteParams build() {
194225
enableV2ManifestPaths,
195226
storageOptions,
196227
initialBases,
197-
targetBases);
228+
targetBases,
229+
allowExternalBlobOutsideBases);
198230
}
199231
}
200232
}

0 commit comments

Comments
 (0)