Switch to task-attempt-ID-keyed state for CometUDFs.

mbutrovich · mbutrovich · commit 17b2714b0528 · 2026-05-14T14:55:11.000-04:00
diff --git a/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java b/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java
@@ -19,8 +19,7 @@
 
 package org.apache.comet.udf;
 
-import java.util.LinkedHashMap;
-import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 import org.apache.arrow.c.ArrowArray;
 import org.apache.arrow.c.ArrowSchema;
@@ -30,31 +29,48 @@
 import org.apache.arrow.vector.ValueVector;
 import org.apache.spark.TaskContext;
 import org.apache.spark.comet.CometTaskContextShim;
+import org.apache.spark.util.TaskCompletionListener;
 
 /**
  * JNI entry point for native execution to invoke a {@link CometUDF}. Matches the static-method
  * pattern used by CometScalarSubquery so the native side can dispatch via
  * call_static_method_unchecked.
+ *
+ * <p>Cache invariants:
+ *
+ * <ol>
+ *   <li>For each live Spark task attempt there is at most one {@link CometUDF} instance per class
+ *       name.
+ *   <li>A {@link CometUDF} instance is visible only within the Spark task attempt that instantiated
+ *       it. Two task attempts observing the same class name receive distinct instances.
+ *   <li>At any instant at most one thread is inside {@code evaluate()} for a given {@code
+ *       taskAttemptId}. This follows from Spark executing one native future per partition and Tokio
+ *       polling one future per worker at a time. The shared {@code -1L} bucket is not covered.
+ *   <li>All instances for a task are dropped by the {@link TaskCompletionListener} registered on
+ *       the first cache miss for that task. No task-keyed cache entry outlives its task.
+ *   <li>When {@code taskContext} is {@code null} (unit tests, direct native driver) the fallback
+ *       key {@code -1L} is used; that bucket is never evicted because no task-completion event will
+ *       fire.
+ * </ol>
+ *
+ * <p>Keying by {@code taskAttemptId} rather than by thread keeps the cache correct under Tokio
+ * work-stealing: on the scan-free execution path the same Spark task can be polled by different
+ * Tokio workers across batches, so a thread-local cache would lose per-task state on migration. The
+ * task attempt ID is stable for the life of the task regardless of which worker is polling.
  */
 public class CometUdfBridge {
 
-  // Per-thread, bounded LRU of UDF instances keyed by class name. Comet
-  // native execution threads (Tokio/DataFusion worker pool) are reused
-  // across tasks within an executor, so the effective lifetime of cached
-  // entries is the worker thread (i.e. the executor JVM). Fine for
-  // stateless UDFs; future stateful UDFs would need explicit per-task
-  // isolation.
-  private static final int CACHE_CAPACITY = 64;
+  /**
+   * Task-scoped cache of {@link CometUDF} instances. Outer map keys are Spark task attempt IDs (or
+   * {@code -1L} when no {@link TaskContext} is available). Inner maps hold one instance per UDF
+   * class name. An entry keyed by a real task attempt ID is removed by the {@link
+   * TaskCompletionListener} registered when it is created; the {@code -1L} entry is never removed.
+   */
+  private static final ConcurrentHashMap<Long, ConcurrentHashMap<String, CometUDF>> INSTANCES =
+      new ConcurrentHashMap<>();
 
-  private static final ThreadLocal<LinkedHashMap<String, CometUDF>> INSTANCES =
-      ThreadLocal.withInitial(
-          () ->
-              new LinkedHashMap<String, CometUDF>(CACHE_CAPACITY, 0.75f, true) {
-                @Override
-                protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
-                  return size() > CACHE_CAPACITY;
-                }
-              });
+  /** Sentinel key for calls that carry no {@link TaskContext} (unit tests, direct driver). */
+  private static final long NO_TASK_ID = -1L;
 
   /**
    * Called from native via JNI.
@@ -76,7 +92,9 @@ protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
    *     / {@code Uuid} / {@code MonotonicallyIncreasingID} that read the partition index via {@code
    *     TaskContext.get().partitionId()}) sees the real context rather than null. The thread-local
    *     is cleared in a {@code finally} so Tokio workers don't leak a stale TaskContext across
-   *     invocations.
+   *     invocations. The task attempt ID drawn from this context also keys the UDF-instance cache,
+   *     so a UDF holding per-task state in fields sees a consistent instance for every call within
+   *     the task regardless of which Tokio worker is polling.
    */
   public static void evaluate(
       String udfClassName,
@@ -86,14 +104,31 @@ public static void evaluate(
       long outSchemaPtr,
       int numRows,
       TaskContext taskContext) {
+    assert udfClassName != null && !udfClassName.isEmpty() : "udfClassName must be non-empty";
+    assert inputArrayPtrs != null && inputSchemaPtrs != null
+        : "input pointer arrays must be non-null";
+    assert inputArrayPtrs.length == inputSchemaPtrs.length
+        : "input array pointer count must equal schema pointer count";
+    assert numRows >= 0 : "numRows must be non-negative";
+    assert outArrayPtr != 0L : "outArrayPtr must be a valid FFI pointer";
+    assert outSchemaPtr != 0L : "outSchemaPtr must be a valid FFI pointer";
+
     boolean installedTaskContext = false;
     if (taskContext != null && TaskContext.get() == null) {
       CometTaskContextShim.set(taskContext);
       installedTaskContext = true;
+      assert TaskContext.get() == taskContext
+          : "TaskContext install did not take effect on this thread";
     }
     try {
       evaluateInternal(
-          udfClassName, inputArrayPtrs, inputSchemaPtrs, outArrayPtr, outSchemaPtr, numRows);
+          udfClassName,
+          inputArrayPtrs,
+          inputSchemaPtrs,
+          outArrayPtr,
+          outSchemaPtr,
+          numRows,
+          taskContext);
     } finally {
       if (installedTaskContext) {
         CometTaskContextShim.unset();
@@ -107,24 +142,50 @@ private static void evaluateInternal(
       long[] inputSchemaPtrs,
       long outArrayPtr,
       long outSchemaPtr,
-      int numRows) {
-    LinkedHashMap<String, CometUDF> cache = INSTANCES.get();
-    CometUDF udf = cache.get(udfClassName);
-    if (udf == null) {
-      try {
-        // Resolve via the executor's context classloader so user-supplied UDF jars
-        // (added via spark.jars / --jars) are visible.
-        ClassLoader cl = Thread.currentThread().getContextClassLoader();
-        if (cl == null) {
-          cl = CometUdfBridge.class.getClassLoader();
-        }
-        udf =
-            (CometUDF) Class.forName(udfClassName, true, cl).getDeclaredConstructor().newInstance();
-      } catch (ReflectiveOperationException e) {
-        throw new RuntimeException("Failed to instantiate CometUDF: " + udfClassName, e);
-      }
-      cache.put(udfClassName, udf);
-    }
+      int numRows,
+      TaskContext taskContext) {
+    long taskAttemptId = (taskContext != null) ? taskContext.taskAttemptId() : NO_TASK_ID;
+
+    ConcurrentHashMap<String, CometUDF> perTask =
+        INSTANCES.computeIfAbsent(
+            taskAttemptId,
+            id -> {
+              ConcurrentHashMap<String, CometUDF> fresh = new ConcurrentHashMap<>();
+              if (taskContext != null) {
+                // Runs at most once while the key is present: one listener per cache entry.
+                // NOTE(review): confirm it never fires inline; remove(id) would re-enter the map.
+                taskContext.addTaskCompletionListener(
+                    (TaskCompletionListener)
+                        ctx -> {
+                          ConcurrentHashMap<String, CometUDF> removed = INSTANCES.remove(id);
+                          assert removed != null
+                              : "task-completion listener fired but cache already removed "
+                                  + "entry for task "
+                                  + id;
+                        });
+              }
+              return fresh;
+            });
+    assert perTask != null : "per-task cache must be non-null after computeIfAbsent";
+
+    CometUDF udf =
+        perTask.computeIfAbsent(
+            udfClassName,
+            name -> {
+              try {
+                // Resolve via the executor's context classloader so user-supplied UDF jars
+                // (added via spark.jars / --jars) are visible.
+                ClassLoader cl = Thread.currentThread().getContextClassLoader();
+                if (cl == null) {
+                  cl = CometUdfBridge.class.getClassLoader();
+                }
+                return (CometUDF)
+                    Class.forName(name, true, cl).getDeclaredConstructor().newInstance();
+              } catch (ReflectiveOperationException e) {
+                throw new RuntimeException("Failed to instantiate CometUDF: " + name, e);
+              }
+            });
+    assert udf != null : "reflective instantiation returned null for " + udfClassName;
 
     BufferAllocator allocator = org.apache.comet.package$.MODULE$.CometArrowAllocator();
 
@@ -138,6 +199,9 @@ private static void evaluateInternal(
       }
 
       result = udf.evaluate(inputs, numRows);
+      assert result instanceof FieldVector
+          : "CometUDF implementations must return FieldVector; got "
+              + (result == null ? "null" : result.getClass().getName());
       if (!(result instanceof FieldVector)) {
         throw new RuntimeException(
             "CometUDF.evaluate() must return a FieldVector, got: " + result.getClass().getName());
diff --git a/common/src/main/scala/org/apache/comet/udf/CometUDF.scala b/common/src/main/scala/org/apache/comet/udf/CometUDF.scala
@@ -34,8 +34,14 @@ import org.apache.arrow.vector.ValueVector
  * `numRows`; UDFs that may be called with zero data columns (e.g. a zero-arg ScalaUDF through the
  * codegen dispatcher) need `numRows` to know how many rows to produce.
  *
- * Implementations must have a public no-arg constructor and should be stateless: instances are
- * cached per executor thread for the lifetime of the JVM.
+ * Implementations must have a public no-arg constructor. One instance is created per Spark task
+ * attempt per class, reused for every call in that task, and dropped at task completion. Fields
+ * may hold per-task state (counters, compiled patterns, scratch buffers) but not state that must
+ * persist across tasks. Without a TaskContext, one shared instance per class is never dropped.
+ *
+ * At most one thread calls `evaluate` on a given instance at a time: Spark runs one native future
+ * per partition and Tokio polls one future per worker, so the per-task instance is never touched
+ * concurrently even if the task's future migrates between Tokio workers across batches.
  */
 trait CometUDF {
   def evaluate(inputs: Array[ValueVector], numRows: Int): ValueVector
diff --git a/native/core/src/execution/planner.rs b/native/core/src/execution/planner.rs
@@ -751,6 +751,13 @@ impl PhysicalPlanner {
                     to_arrow_datatype(udf.return_type.as_ref().ok_or_else(|| {
                         GeneralError("JvmScalarUdf missing return_type".to_string())
                     })?);
+                // Invariant: task_context is propagated for every JvmScalarUdfExpr built during
+                // normal execution. The TEST_EXEC_CONTEXT_ID path is the only context in which
+                // task_context may legitimately be None (unit tests, direct native driver runs).
+                debug_assert!(
+                    self.task_context.is_some() || self.exec_context_id == TEST_EXEC_CONTEXT_ID,
+                    "task_context must be set for non-test execution"
+                );
                 Ok(Arc::new(JvmScalarUdfExpr::new(
                     udf.class_name.clone(),
                     args,
diff --git a/native/spark-expr/src/jvm_udf/mod.rs b/native/spark-expr/src/jvm_udf/mod.rs
@@ -62,6 +62,10 @@ impl JvmScalarUdfExpr {
         return_nullable: bool,
         task_context: Option<Arc<Global<JObject<'static>>>>,
     ) -> Self {
+        debug_assert!(
+            !class_name.is_empty(),
+            "JvmScalarUdfExpr requires a non-empty class name"
+        );
         Self {
             class_name,
             args,
@@ -159,6 +163,13 @@ impl PhysicalExpr for JvmScalarUdfExpr {
             .map(|b| b.as_ref() as *const FFI_ArrowSchema as i64)
             .collect();
 
+        debug_assert!(!self.class_name.is_empty(), "class_name must not be empty");
+        debug_assert_eq!(
+            in_arr_ptrs.len(),
+            in_sch_ptrs.len(),
+            "input array and schema pointer counts must match"
+        );
+
         let mut out_array = Box::new(FFI_ArrowArray::empty());
         let mut out_schema = Box::new(FFI_ArrowSchema::empty());
         let out_arr_ptr = out_array.as_mut() as *mut FFI_ArrowArray as i64;
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala
@@ -439,6 +439,26 @@ class CometCodegenDispatchSmokeSuite extends CometTestBase with AdaptiveSparkPla
     }
   }
 
+  test("per-task cache isolates UDF state across sequential task runs in one session") {
+    // Regression guard for the CometUdfBridge cache-scoping invariant: an instance lives for
+    // exactly one Spark task and is dropped on task completion, so a stateful kernel starts
+    // fresh in every task. Running the same `monotonically_increasing_id()` query twice in one
+    // session must therefore return identical rows. A cache that outlived its task, or one keyed
+    // by Tokio worker thread instead of task attempt ID, could let the second run resume from
+    // the first run's final counter value, making the second run's IDs diverge.
+    // NOTE(review): presumes this expression is dispatched through CometUdfBridge in this suite;
+    // if it falls back to Spark the assertion passes vacuously -- confirm against the plan.
+    val rows = (0 until 2048).map(i => s"row_$i")
+    withSubjects(rows: _*) {
+      val q = "SELECT s, monotonically_increasing_id() AS mid FROM t"
+      val first = sql(q).collect().map(r => (r.getString(0), r.getLong(1))).toSeq
+      val second = sql(q).collect().map(r => (r.getString(0), r.getLong(1))).toSeq
+      assert(
+        first == second,
+        s"per-task cache leaked state across runs: first=${first.take(5)} second=${second.take(5)}")
+    }
+  }
+
   /**
    * Scalar ScalaUDF smoke tests. These prove that user-registered UDFs route through the codegen
    * dispatcher rather than forcing a whole-plan Spark fallback. Spark's `ScalaUDF.doGenCode`