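Strengthen per-task UDF cache isolation test: route the query through the dispatcher and check both runs against Spark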

mbutrovich · mbutrovich · commit b1e1c5585b57 · 2026-05-15T16:46:35.000-04:00
diff --git a/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala b/spark/src/test/scala/org/apache/comet/CometCodegenDispatchSmokeSuite.scala
@@ -263,20 +263,23 @@ class CometCodegenDispatchSmokeSuite extends CometTestBase with AdaptiveSparkPla
   test("per-task cache isolates UDF state across sequential task runs in one session") {
     // Regression guard for the cache-scoping invariant on CometUdfBridge: instances live for
     // exactly one Spark task and are dropped on task completion, so a stateful kernel sees a
-    // fresh instance per task. Running the same `monotonically_increasing_id()`-carrying query
-    // twice in one session must produce identical results each run. Under a cache that outlived
-    // a task and got reused by the next one, the counter would continue from the previous run's
-    // final value and the second run's IDs would diverge. Under a cache that was keyed by Tokio
-    // worker thread rather than task attempt ID, worker reuse across tasks would cause the same
-    // leak whenever the second task happened to be polled by the same worker.
+    // fresh instance per task. The query has to actually route through the dispatcher for this
+    // to test anything, so wrap `monotonically_increasing_id()` in a ScalaUDF identity. Running
+    // it twice in one session must produce results matching Spark each time. Under a cache that
+    // outlived a task and got reused by the next one, the counter would continue from the
+    // previous run's final value and the second run's IDs would diverge from Spark. Under a
+    // cache that was keyed by Tokio worker thread rather than task attempt ID, worker reuse
+    // across tasks would cause the same leak whenever the second task happened to be polled by
+    // the same worker. Two `checkSparkAnswerAndOperator` calls are stronger than asserting
+    // first == second: equality alone could pass if both runs are wrong-but-consistent (e.g.
+    // `init(partitionIndex)` never fires); matching Spark on both runs rules that out and
+    // implies cross-run equality because Spark is deterministic on the same query.
+    spark.udf.register("idPassthrough", (id: Long) => id)
     val rows = (0 until 2048).map(i => s"row_$i")
     withSubjects(rows: _*) {
-      val q = "SELECT s, monotonically_increasing_id() AS mid FROM t"
-      val first = sql(q).collect().map(r => (r.getString(0), r.getLong(1))).toSeq
-      val second = sql(q).collect().map(r => (r.getString(0), r.getLong(1))).toSeq
-      assert(
-        first == second,
-        s"per-task cache leaked state across runs: first=${first.take(5)} second=${second.take(5)}")
+      val q = "SELECT s, idPassthrough(monotonically_increasing_id()) AS mid FROM t"
+      checkSparkAnswerAndOperator(sql(q))
+      checkSparkAnswerAndOperator(sql(q))
     }
   }