fix: fix memory safety issue in native c2r (#3367)

andygrove · web-flow · commit d2fbd6e3e9ba · 2026-02-04T05:43:46.000-07:00
diff --git a/spark/src/main/scala/org/apache/comet/NativeColumnarToRowConverter.scala b/spark/src/main/scala/org/apache/comet/NativeColumnarToRowConverter.scala
@@ -139,6 +139,6 @@ private class NativeRowIterator(info: NativeColumnarToRowInfo, unsafeRow: Unsafe
     unsafeRow.pointTo(null, rowAddress, rowSize)
     currentIdx += 1
 
-    unsafeRow
+    unsafeRow.copy()
   }
 }
diff --git a/spark/src/test/scala/org/apache/comet/exec/CometNativeColumnarToRowSuite.scala b/spark/src/test/scala/org/apache/comet/exec/CometNativeColumnarToRowSuite.scala
@@ -32,7 +32,7 @@ import org.apache.spark.sql.comet.CometNativeColumnarToRowExec
 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
 import org.apache.spark.sql.types._
 
-import org.apache.comet.CometConf
+import org.apache.comet.{CometConf, NativeColumnarToRowConverter}
 import org.apache.comet.testing.{DataGenOptions, FuzzDataGenerator, SchemaGenOptions}
 
 /**
@@ -470,6 +470,70 @@ class CometNativeColumnarToRowSuite extends CometTestBase with AdaptiveSparkPlan
     }
   }
 
+  // Regression test for https://github.com/apache/datafusion-comet/issues/3308
+  // Native columnar-to-row returns UnsafeRow pointing into a Rust-owned buffer that is
+  // cleared/reused on each convert() call. This test directly exercises the converter:
+  // it converts multiple batches and holds row references from earlier batches, then
+  // verifies they still contain correct data. Without a fix (e.g., copying rows),
+  // rows from earlier batches will contain corrupted data from buffer reuse.
+  test("rows from earlier batches are not corrupted by subsequent convert() calls") {
+    import org.apache.spark.sql.catalyst.InternalRow
+    import org.apache.spark.sql.comet.execution.arrow.CometArrowConverters
+    import org.apache.spark.unsafe.types.UTF8String
+
+    import scala.collection.mutable.ArrayBuffer
+
+    val schema = new StructType().add("id", IntegerType).add("str", StringType)
+
+    // Input: totalRows rows, row i being (i, "value_i"), split into small batches
+    val numBatches = 10
+    val rowsPerBatch = 5
+    val totalRows = numBatches * rowsPerBatch
+
+    val rows = (0 until totalRows).map { i =>
+      InternalRow(i, UTF8String.fromString(s"value_$i"))
+    }
+
+    // Create batches using rowToArrowBatchIter which handles shading internally
+    val batchIter = CometArrowConverters
+      .rowToArrowBatchIter(rows.iterator, schema, rowsPerBatch, "UTC", null)
+
+    val converter = new NativeColumnarToRowConverter(schema, rowsPerBatch)
+    try {
+      // Hold a reference to every returned row across all convert() calls.
+      // The converter returns rows that should be independent copies.
+      val allRows = new ArrayBuffer[InternalRow]()
+      var batchCount = 0
+
+      while (batchIter.hasNext) {
+        val batch = batchIter.next()
+        batchCount += 1
+        // Convert this batch and keep all of its rows for later verification
+        val rowIter = converter.convert(batch)
+        while (rowIter.hasNext) {
+          allRows += rowIter.next()
+        }
+        batch.close()
+      }
+
+      assert(batchCount == numBatches, s"Expected $numBatches batches, got $batchCount")
+      assert(allRows.length == totalRows, s"Expected $totalRows rows, got ${allRows.length}")
+
+      // Verify every held row against its input, in order and on both columns.
+      // If rows weren't copied, all entries would point to the same reused
+      // UnsafeRow object with stale data and earlier rows would not match.
+      allRows.zipWithIndex.foreach { case (row, i) =>
+        assert(
+          row.getInt(0) == i && row.getUTF8String(1).toString == s"value_$i",
+          s"UnsafeRow reuse bug: held row $i should be (id=$i, str=value_$i) but has " +
+            s"id=${row.getInt(0)}. This means rows were not copied and references " +
+            "point to a reused UnsafeRow object.")
+      }
+    } finally {
+      converter.close()
+    }
+  }
+
   /**
    * Helper to create a parquet table from a DataFrame and run a function with it.
    */

Original file line number	Diff line number	Diff line change
`@@ -139,6 +139,6 @@ private class NativeRowIterator(info: NativeColumnarToRowInfo, unsafeRow: Unsafe`
`139`	`139`	`unsafeRow.pointTo(null, rowAddress, rowSize)`
`140`	`140`	`currentIdx += 1`
`141`	`141`
`142`		`- unsafeRow`
	`142`	`+ unsafeRow.copy()`
`143`	`143`	`}`
`144`	`144`	`}`