lance-format
diff --git a/lance-spark-3.4_2.12/src/test/java/org/lance/spark/BlobJoinTest.java b/lance-spark-3.4_2.12/src/test/java/org/lance/spark/BlobJoinTest.java
Lines changed: 19 additions & 0 deletions
diff --git a/lance-spark-3.5_2.12/src/test/java/org/lance/spark/BlobJoinTest.java b/lance-spark-3.5_2.12/src/test/java/org/lance/spark/BlobJoinTest.java
Lines changed: 19 additions & 0 deletions
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentColumnarBatchScanner.java
Lines changed: 41 additions & 1 deletion
diff --git a/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java b/lance-spark-base_2.12/src/main/java/org/lance/spark/internal/LanceFragmentScanner.java
Lines changed: 61 additions & 33 deletions
@@ -0,0 +1,19 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark;
+
+/** Binds the shared {@link BaseBlobJoinTest} suite to the Spark 3.4 module. */
+public class BlobJoinTest extends BaseBlobJoinTest {
+  // Intentionally empty: every test method is inherited from BaseBlobJoinTest.
+}
@@ -0,0 +1,19 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.spark;
+
+/** Binds the shared {@link BaseBlobJoinTest} suite to the Spark 3.5 module. */
+public class BlobJoinTest extends BaseBlobJoinTest {
+  // Intentionally empty: every test method is inherited from BaseBlobJoinTest.
+}
@@ -19,6 +19,7 @@
 import org.lance.spark.vectorized.LanceArrowColumnVector;
 
 import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.complex.StructVector;
 import org.apache.arrow.vector.ipc.ArrowReader;
@@ -36,6 +37,7 @@
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 
 public class LanceFragmentColumnarBatchScanner implements AutoCloseable {
   private final LanceFragmentScanner fragmentScanner;
@@ -117,6 +119,10 @@ private List<ColumnVector> buildSparkOrderedVectors(
       actualFields.put(rootVectors.get(i).getField().getName(), rootVectors.get(i));
     }
 
+    // Extract row addresses for blob reference support
+    Set<String> blobColumnNames = fragmentScanner.getBlobColumnNames();
+    long[] rowAddresses = extractRowAddresses(rootVectors, blobColumnNames, root.getRowCount());
+
     List<ColumnVector> fieldVectors = new ArrayList<>(schema.size());
     StructField[] fields = schema.fields();
     for (StructField field : fields) {
@@ -150,12 +156,46 @@ private List<ColumnVector> buildSparkOrderedVectors(
           throw new IllegalStateException(
               "Lance scan did not return expected field '" + fieldName + "'");
         }
-        fieldVectors.add(new LanceArrowColumnVector(vector));
+        LanceArrowColumnVector colVec = new LanceArrowColumnVector(vector);
+
+        // Set blob reference context so getBinary() produces blob references
+        if (rowAddresses != null && blobColumnNames.contains(fieldName)) {
+          BlobStructAccessor blobAccessor = colVec.getBlobStructAccessor();
+          if (blobAccessor != null) {
+            blobAccessor.setBlobReferenceContext(
+                fragmentScanner.getDatasetUri(), fieldName, rowAddresses);
+          }
+        }
+
+        fieldVectors.add(colVec);
       }
     }
     return fieldVectors;
   }
 
+  /**
+   * Copies the batch's {@code _rowaddr} values into a {@code long[]} for building blob references.
+   * Returns {@code null} if there are no blob columns or no matching {@link UInt8Vector} column.
+   */
+  private long[] extractRowAddresses(
+      List<FieldVector> rootVectors, Set<String> blobColumnNames, int rowCount) {
+    if (blobColumnNames.isEmpty()) {
+      return null;
+    }
+    for (FieldVector candidate : rootVectors) {
+      String columnName = candidate.getField().getName();
+      if (candidate instanceof UInt8Vector && LanceConstant.ROW_ADDRESS.equals(columnName)) {
+        UInt8Vector addrVector = (UInt8Vector) candidate;
+        long[] addresses = new long[rowCount];
+        for (int row = 0; row < addresses.length; row++) {
+          addresses[row] = addrVector.get(row);
+        }
+        return addresses;
+      }
+    }
+    return null;
+  }
+
   // Virtual column vector for blob position
   private static class BlobPositionColumnVector extends ColumnVector {
     private final BlobStructAccessor accessor;
 
@@ -21,6 +21,7 @@
 import org.lance.spark.LanceRuntime;
 import org.lance.spark.LanceSparkReadOptions;
 import org.lance.spark.read.LanceInputPartition;
+import org.lance.spark.utils.BlobUtils;
 import org.lance.spark.utils.Utils;
 
 import org.apache.arrow.vector.ipc.ArrowReader;
@@ -29,51 +30,55 @@
 
 import java.io.IOException;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 import java.util.stream.Collectors;
 
 public class LanceFragmentScanner implements AutoCloseable {
   private final Dataset dataset;
   private final LanceScanner scanner;
   private final int fragmentId;
-  private final boolean withFragemtId;
+  private final boolean withFragmentId;
   private final LanceInputPartition inputPartition;
   private final long datasetOpenTimeNs;
   private final long scannerCreateTimeNs;
 
+  /**
+   * Whether the scanner requested _rowaddr for blob reference support. When true, the _rowaddr
+   * column in the Arrow batch was implicitly added and should be stripped from user-visible output.
+   */
+  private final boolean withRowAddrForBlobs;
+
+  /** The names of blob columns in the projected schema. */
+  private final Set<String> blobColumnNames;
+
   private LanceFragmentScanner(
       Dataset dataset,
       LanceScanner scanner,
       int fragmentId,
       boolean withFragmentId,
       LanceInputPartition inputPartition,
       long datasetOpenTimeNs,
-      long scannerCreateTimeNs) {
+      long scannerCreateTimeNs,
+      boolean withRowAddrForBlobs,
+      Set<String> blobColumnNames) {
     this.dataset = dataset;
     this.scanner = scanner;
     this.fragmentId = fragmentId;
-    this.withFragemtId = withFragmentId;
+    this.withFragmentId = withFragmentId;
     this.inputPartition = inputPartition;
     this.datasetOpenTimeNs = datasetOpenTimeNs;
     this.scannerCreateTimeNs = scannerCreateTimeNs;
+    this.withRowAddrForBlobs = withRowAddrForBlobs;
+    this.blobColumnNames = blobColumnNames;
   }
 
   public static LanceFragmentScanner create(int fragmentId, LanceInputPartition inputPartition) {
     Dataset dataset = null;
     LanceScanner lanceScanner = null;
     try {
       LanceSparkReadOptions readOptions = inputPartition.getReadOptions();
-      // Optionally rebuild the namespace client on the executor so the dataset open routes through
-      // Utils.OpenDatasetBuilder's namespaceClient branch. This preserves the storage options
-      // provider on the Rust side, which refreshes short-lived vended credentials (e.g. STS
-      // tokens) during long-running scans. The price is an eager describeTable() RPC against the
-      // namespace on every fragment open.
-      //
-      // For catalogs whose backing service authenticates per-call (e.g. Hive Metastore over
-      // Kerberos) executors typically lack a TGT and that RPC fails with "GSS initiate failed".
-      // Setting LanceSparkReadOptions.CONFIG_EXECUTOR_CREDENTIAL_REFRESH=false makes executors
-      // skip the rebuild and open the dataset by URI using the initialStorageOptions the driver
-      // already obtained, at the cost of losing the Rust-side credential refresh callback.
       if (inputPartition.getNamespaceImpl() != null && readOptions.isExecutorCredentialRefresh()) {
         if (LanceRuntime.useNamespaceOnWorkers(inputPartition.getNamespaceImpl())) {
           readOptions.setNamespace(
@@ -97,31 +102,34 @@ public static LanceFragmentScanner create(int fragmentId, LanceInputPartition in
                 fragmentId, readOptions.getDatasetUri(), readOptions.getVersion()));
       }
       ScanOptions.Builder scanOptions = new ScanOptions.Builder();
+
+      // Detect blob columns in the schema
+      Set<String> blobColumnNames = getBlobColumnNames(inputPartition.getSchema());
+      boolean hasBlobColumns = !blobColumnNames.isEmpty();
+
       List<String> projectedColumns = getColumnNames(inputPartition.getSchema());
       if (projectedColumns.isEmpty() && inputPartition.getSchema().isEmpty()) {
-        // Lance requires at least one projected column. Use _rowid as a lightweight
-        // sentinel so the scanner still returns the correct row count (e.g. SELECT 1).
         scanOptions.withRowId(true);
       }
       if (hasField(inputPartition.getSchema(), LanceConstant.ROW_ID)) {
         scanOptions.withRowId(true);
       }
-      if (hasField(inputPartition.getSchema(), LanceConstant.ROW_ADDRESS)) {
+
+      // Request _rowaddr when blob columns are present so we can build blob references.
+      boolean userRequestedRowAddr =
+          hasField(inputPartition.getSchema(), LanceConstant.ROW_ADDRESS);
+      boolean withRowAddrForBlobs = hasBlobColumns && !userRequestedRowAddr;
+      if (hasBlobColumns || userRequestedRowAddr) {
         scanOptions.withRowAddress(true);
       }
+
       scanOptions.columns(projectedColumns);
       if (inputPartition.getWhereCondition().isPresent()) {
         scanOptions.filter(inputPartition.getWhereCondition().get());
       }
       scanOptions.batchSize(readOptions.getBatchSize());
       if (readOptions.getNearest() != null) {
         scanOptions.nearest(readOptions.getNearest());
-        // We strictly set `prefilter = true` here to ensure query correctness.
-        // This is necessary due to the combination of two factors:
-        // 1. Spark currently performs the vector search by individually scanning each fragment.
-        // 2. Lance mandates that `prefilter` must be enabled for fragmented vector queries.
-        // If Spark's execution model or Lance's search functionality changes in the future,
-        // we need to revisit this.
         scanOptions.prefilter(true);
       }
       if (inputPartition.getLimit().isPresent()) {
@@ -145,7 +153,9 @@ public static LanceFragmentScanner create(int fragmentId, LanceInputPartition in
           withFragmentId,
           inputPartition,
           dsOpenTimeNs,
-          scanCreateTimeNs);
+          scanCreateTimeNs,
+          withRowAddrForBlobs,
+          blobColumnNames);
     } catch (Throwable throwable) {
       if (lanceScanner != null) {
         try {
@@ -211,8 +221,8 @@ public int fragmentId() {
     return fragmentId;
   }
 
-  public boolean withFragemtId() {
-    return withFragemtId;
+  public boolean withFragmentId() {
+    return withFragmentId;
   }
 
   public LanceInputPartition getInputPartition() {
@@ -227,19 +237,37 @@ public long getScannerCreateTimeNs() {
     return scannerCreateTimeNs;
   }
 
-  /**
-   * Builds the projection column list for the scanner. Row ID and row address are requested through
-   * explicit scan flags so Lance computes them from the active fragment metadata instead of reading
-   * them as regular columns.
-   */
+  /** Whether {@code _rowaddr} was requested solely for blob references, not by the read schema. */
+  public boolean isWithRowAddrForBlobs() {
+    return withRowAddrForBlobs;
+  }
+
+  /** Returns the stored set of blob column names; the set is not copied, so do not mutate it. */
+  public Set<String> getBlobColumnNames() {
+    return blobColumnNames;
+  }
+
+  /** Returns the dataset URI from the partition's read options, used to build blob references. */
+  public String getDatasetUri() {
+    return inputPartition.getReadOptions().getDatasetUri();
+  }
+
+  /**
+   * Collects the names of all fields in {@code schema} accepted by BlobUtils.isBlobSparkField.
+   */
+  private static Set<String> getBlobColumnNames(StructType schema) {
+    return Arrays.stream(schema.fields())
+        .filter(BlobUtils::isBlobSparkField)
+        .map(StructField::name)
+        .collect(Collectors.toCollection(HashSet::new));
+  }
+
   private static List<String> getColumnNames(StructType schema) {
-    // Collect all field names in the schema for quick lookup
     java.util.Set<String> schemaFields = new java.util.HashSet<>();
     for (StructField field : schema.fields()) {
       schemaFields.add(field.name());
     }
 
-    // Regular data columns (exclude all special/metadata columns)
     List<String> columns =
         Arrays.stream(schema.fields())
             .map(StructField::name)