apache
diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml
Lines changed: 1 addition & 5 deletions
diff --git a/common/src/main/java/org/apache/comet/udf/CometBatchKernel.java b/common/src/main/java/org/apache/comet/udf/CometBatchKernel.java
Lines changed: 68 additions & 0 deletions
diff --git a/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java b/common/src/main/java/org/apache/comet/udf/CometUdfBridge.java
Lines changed: 45 additions & 32 deletions
diff --git a/common/src/main/scala/org/apache/comet/CometConf.scala b/common/src/main/scala/org/apache/comet/CometConf.scala
Lines changed: 40 additions & 0 deletions
@@ -84,9 +84,5 @@ jobs:
             ${{ runner.os }}-benchmark-maven-
 
       - name: Check Scala compilation and linting
-        # Pin to spark-4.0 (Scala 2.13.16) because the default profile is now
-        # spark-4.1 / Scala 2.13.17, and semanticdb-scalac_2.13.17 is not yet
-        # published, which breaks `-Psemanticdb`. See pr_build_linux.yml for
-        # the same exclusion in the main lint matrix.
         run: |
-          ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -Pspark-4.0 -DskipTests
+          ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.comet.udf;
+
+import org.apache.arrow.vector.FieldVector;
+import org.apache.arrow.vector.ValueVector;
+
+/**
+ * Abstract base class for the Janino-compiled batch kernel emitted by {@code
+ * CometBatchKernelCodegen}. It extends {@code CometInternalRow}, so the generated subclass is a
+ * row itself and Spark's {@code BoundReference.genCode} can call {@code this.getUTF8String(ord)}
+ * directly. The subclass carries typed input fields baked at codegen time, one per input column;
+ * expression evaluation plus Arrow read/write fuse into one method per expression tree.
+ *
+ * <p>Inputs are passed as a {@code ValueVector[]}; the generated subclass casts each slot to the
+ * concrete Arrow type the compile-time schema specified. Output is a generic {@code FieldVector};
+ * the generated subclass casts it to the concrete type matching the bound expression's {@code
+ * dataType}. Widen input support by adding vector classes to the getter switch in {@code
+ * CometBatchKernelCodegen.typedInputAccessors}; widen output support by adding cases in {@code
+ * CometBatchKernelCodegen.allocateOutput} and {@code outputWriter}.
+ */
+public abstract class CometBatchKernel extends CometInternalRow {
+
+  protected final Object[] references; // caller's array, stored without copying
+
+  protected CometBatchKernel(Object[] references) {
+    this.references = references;
+  }
+
+  /**
+   * Processes one batch of {@code numRows} rows; implementations write results to {@code output}.
+   *
+   * @param inputs Arrow input vectors; length and concrete classes must match the schema the kernel
+   *     was compiled against
+   * @param output Arrow output vector; caller allocates to the expression's {@code dataType}
+   * @param numRows number of rows in this batch
+   */
+  public abstract void process(ValueVector[] inputs, FieldVector output, int numRows);
+
+  /**
+   * Runs partition-dependent initialization; this base implementation does nothing. The generated
+   * subclass overrides it to execute statements collected via {@code
+   * CodegenContext.addPartitionInitializationStatement}, e.g. reseeding {@code Rand}'s {@code
+   * XORShiftRandom} from {@code seed + partitionIndex}. Deterministic expressions keep the no-op.
+   *
+   * <p>The caller must invoke this before the first {@code process} call of each partition. The
+   * generated subclass is not thread-safe across concurrent {@code process} calls, so kernels are
+   * allocated per dispatcher invocation and init is run once on the fresh instance.
+   */
+  public void init(int partitionIndex) {}
+}
@@ -19,7 +19,8 @@
 
 package org.apache.comet.udf;
 
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.LinkedHashMap;
+import java.util.Map;
 
 import org.apache.arrow.c.ArrowArray;
 import org.apache.arrow.c.ArrowSchema;
@@ -35,10 +36,23 @@
  */
 public class CometUdfBridge {
 
-  // Process-wide cache of UDF instances keyed by class name. CometUDF
-  // implementations are required to be stateless (see CometUDF), so a
-  // single shared instance per class is safe across native worker threads.
-  private static final ConcurrentHashMap<String, CometUDF> INSTANCES = new ConcurrentHashMap<>();
+  // Per-thread, bounded LRU of UDF instances keyed by class name: an access-ordered
+  // LinkedHashMap that evicts its eldest entry once it holds more than CACHE_CAPACITY
+  // classes. Comet native execution threads (Tokio/DataFusion worker pool) are reused
+  // across tasks within an executor, so cached entries live as long as the worker thread
+  // (i.e. the executor JVM). This is fine for stateless UDFs like RegExpLikeUDF; future
+  // stateful UDFs would need explicit per-task isolation.
+  private static final int CACHE_CAPACITY = 64;
+
+  private static final ThreadLocal<LinkedHashMap<String, CometUDF>> INSTANCES =
+      ThreadLocal.withInitial(
+          () ->
+              new LinkedHashMap<String, CometUDF>(CACHE_CAPACITY, 0.75f, true) {
+                @Override
+                protected boolean removeEldestEntry(Map.Entry<String, CometUDF> eldest) {
+                  return size() > CACHE_CAPACITY;
+                }
+              });
 
   /**
    * Called from native via JNI.
@@ -48,30 +62,35 @@ public class CometUdfBridge {
    * @param inputSchemaPtrs addresses of pre-allocated FFI_ArrowSchema structs (one per input)
    * @param outArrayPtr address of pre-allocated FFI_ArrowArray for the result
    * @param outSchemaPtr address of pre-allocated FFI_ArrowSchema for the result
+   * @param numRows number of rows in the current batch. Mirrors DataFusion's
+   *     {@code ScalarFunctionArgs.number_rows} and gives UDFs an explicit batch-size signal for
+   *     cases where no input arg is a batch-length array (e.g. a zero-arg non-deterministic
+   *     ScalaUDF). UDFs that already read size from their input vectors can ignore it.
    */
   public static void evaluate(
       String udfClassName,
       long[] inputArrayPtrs,
       long[] inputSchemaPtrs,
       long outArrayPtr,
-      long outSchemaPtr) {
-    CometUDF udf =
-        INSTANCES.computeIfAbsent(
-            udfClassName,
-            name -> {
-              try {
-                // Resolve via the executor's context classloader so user-supplied UDF jars
-                // (added via spark.jars / --jars) are visible.
-                ClassLoader cl = Thread.currentThread().getContextClassLoader();
-                if (cl == null) {
-                  cl = CometUdfBridge.class.getClassLoader();
-                }
-                return (CometUDF)
-                    Class.forName(name, true, cl).getDeclaredConstructor().newInstance();
-              } catch (ReflectiveOperationException e) {
-                throw new RuntimeException("Failed to instantiate CometUDF: " + name, e);
-              }
-            });
+      long outSchemaPtr,
+      int numRows) {
+    LinkedHashMap<String, CometUDF> cache = INSTANCES.get();
+    CometUDF udf = cache.get(udfClassName);
+    if (udf == null) {
+      try {
+        // Resolve via the executor's context classloader so user-supplied UDF jars
+        // (added via spark.jars / --jars) are visible.
+        ClassLoader cl = Thread.currentThread().getContextClassLoader();
+        if (cl == null) {
+          cl = CometUdfBridge.class.getClassLoader();
+        }
+        udf =
+            (CometUDF) Class.forName(udfClassName, true, cl).getDeclaredConstructor().newInstance();
+      } catch (ReflectiveOperationException e) {
+        throw new RuntimeException("Failed to instantiate CometUDF: " + udfClassName, e);
+      }
+      cache.put(udfClassName, udf);
+    }
 
     BufferAllocator allocator = org.apache.comet.package$.MODULE$.CometArrowAllocator();
 
@@ -84,23 +103,17 @@ public static void evaluate(
         inputs[i] = Data.importVector(allocator, inArr, inSch, null);
       }
 
-      result = udf.evaluate(inputs);
+      result = udf.evaluate(inputs, numRows);
       if (!(result instanceof FieldVector)) {
         throw new RuntimeException(
             "CometUDF.evaluate() must return a FieldVector, got: " + result.getClass().getName());
       }
-      // Result length must match the longest input. Scalar (length-1) inputs
-      // are allowed to be shorter, but a vector input bounds the output.
-      int expectedLen = 0;
-      for (ValueVector v : inputs) {
-        expectedLen = Math.max(expectedLen, v.getValueCount());
-      }
-      if (result.getValueCount() != expectedLen) {
+      if (result.getValueCount() != numRows) {
         throw new RuntimeException(
             "CometUDF.evaluate() returned "
                 + result.getValueCount()
                 + " rows, expected "
-                + expectedLen);
+                + numRows);
       }
       ArrowArray outArr = ArrowArray.wrap(outArrayPtr);
       ArrowSchema outSch = ArrowSchema.wrap(outSchemaPtr);
 
@@ -380,6 +380,46 @@ object CometConf extends ShimCometConf {
       .booleanConf
       .createWithDefault(false)
 
+  val REGEXP_ENGINE_RUST = "rust" // selects the native DataFusion regexp engine
+  val REGEXP_ENGINE_JAVA = "java" // selects the JVM-side UDF (java.util.regex.Pattern)
+
+  val COMET_REGEXP_ENGINE: ConfigEntry[String] =
+    conf("spark.comet.exec.regexp.engine")
+      .category(CATEGORY_EXEC)
+      .doc(
+        "Experimental. Selects the engine used to evaluate supported regular-expression " +
+          s"expressions. `$REGEXP_ENGINE_RUST` uses the native DataFusion regexp engine. " +
+          s"`$REGEXP_ENGINE_JAVA` routes through a JVM-side UDF (java.util.regex.Pattern) for " +
+          "Spark-compatible semantics, at the cost of JNI roundtrips per batch. Expressions " +
+          "routed when set to java: rlike, regexp_extract, regexp_extract_all, regexp_replace, " +
+          "regexp_instr, and split.")
+      .stringConf
+      .transform(_.toLowerCase(Locale.ROOT)) // fixed locale keeps lowercasing locale-independent
+      .checkValues(Set(REGEXP_ENGINE_RUST, REGEXP_ENGINE_JAVA))
+      .createWithDefault(REGEXP_ENGINE_JAVA) // the JVM engine is the default
+
+  val CODEGEN_DISPATCH_AUTO = "auto" // each expression's serde picks its preferred path
+  val CODEGEN_DISPATCH_DISABLED = "disabled" // codegen dispatch is never used
+  val CODEGEN_DISPATCH_FORCE = "force" // every serde tries codegen dispatch first
+
+  val COMET_CODEGEN_DISPATCH_MODE: ConfigEntry[String] =
+    conf("spark.comet.exec.codegenDispatch.mode")
+      .category(CATEGORY_EXEC)
+      .doc("Controls whether Comet routes eligible scalar expressions through the Arrow-direct " +
+        "codegen dispatcher (`CometCodegenDispatchUDF`) rather than through a native " +
+        s"DataFusion implementation or a hand-coded JVM UDF. `$CODEGEN_DISPATCH_AUTO` lets " +
+        "each expression's serde decide its preferred path based on measured evidence " +
+        "(e.g. for regex, codegen is preferred when " +
+        s"spark.comet.exec.regexp.engine=$REGEXP_ENGINE_JAVA). " +
+        s"`$CODEGEN_DISPATCH_DISABLED` never uses codegen dispatch. `$CODEGEN_DISPATCH_FORCE` " +
+        "inverts the chain: every serde tries codegen first and falls through to its next " +
+        "preferred path only when `canHandle` rejects the expression. Useful for debugging " +
+        "and benchmarking.")
+      .stringConf
+      .transform(_.toLowerCase(Locale.ROOT)) // fixed locale keeps lowercasing locale-independent
+      .checkValues(Set(CODEGEN_DISPATCH_AUTO, CODEGEN_DISPATCH_DISABLED, CODEGEN_DISPATCH_FORCE))
+      .createWithDefault(CODEGEN_DISPATCH_AUTO) // serde-chosen path is the default
+
   val COMET_EXEC_SHUFFLE_WITH_HASH_PARTITIONING_ENABLED: ConfigEntry[Boolean] =
     conf("spark.comet.native.shuffle.partitioning.hash.enabled")
       .category(CATEGORY_SHUFFLE)