Merge branch 'apache:main' into main

kazantsev-maksim · web-flow · commit 231aa905dbd2 · 2025-12-17T08:20:06.000-08:00
diff --git a/native/core/src/execution/shuffle/row.rs b/native/core/src/execution/shuffle/row.rs
@@ -765,9 +765,6 @@ pub fn process_sorted_row_partition(
     initial_checksum: Option<u32>,
     codec: &CompressionCodec,
 ) -> Result<(i64, Option<u32>), CometError> {
-    // TODO: We can tune this parameter automatically based on row size and cache size.
-    let row_step = 10;
-
     // The current row number we are reading
     let mut current_row = 0;
     // Total number of bytes written
@@ -790,26 +787,19 @@ pub fn process_sorted_row_partition(
         })?;
 
         // Appends rows to the array builders.
-        let mut row_start: usize = current_row;
-        while row_start < current_row + n {
-            let row_end = std::cmp::min(row_start + row_step, current_row + n);
-
-            // For each column, iterating over rows and appending values to corresponding array
-            // builder.
-            for (idx, builder) in data_builders.iter_mut().enumerate() {
-                append_columns(
-                    row_addresses_ptr,
-                    row_sizes_ptr,
-                    row_start,
-                    row_end,
-                    schema,
-                    idx,
-                    builder,
-                    prefer_dictionary_ratio,
-                )?;
-            }
-
-            row_start = row_end;
+        // For each column, iterate over the rows and append their values to the
+        // corresponding array builder.
+        for (idx, builder) in data_builders.iter_mut().enumerate() {
+            append_columns(
+                row_addresses_ptr,
+                row_sizes_ptr,
+                current_row,
+                current_row + n,
+                schema,
+                idx,
+                builder,
+                prefer_dictionary_ratio,
+            )?;
         }
 
         // Writes a record batch generated from the array builders to the output file.
diff --git a/native/spark-expr/src/math_funcs/modulo_expr.rs b/native/spark-expr/src/math_funcs/modulo_expr.rs
@@ -100,10 +100,16 @@ pub fn create_modulo_expr(
                 SparkCastOptions::new_without_timezone(EvalMode::Legacy, false),
             ));
 
+            // The UDF's return type must match what Arrow's rem function will actually return.
+            // Since we're operating on Decimal256 inputs, rem will return Decimal256.
+            let decimal256_return_type = match &data_type {
+                DataType::Decimal128(p, s) => DataType::Decimal256(*p, *s),
+                other => other.clone(),
+            };
             let modulo_scalar_func = create_modulo_scalar_function(
                 left_256,
                 right_256,
-                &data_type,
+                &decimal256_return_type,
                 registry,
                 fail_on_error,
             )?;
diff --git a/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometExpressionSuite.scala
@@ -1772,8 +1772,24 @@ class CometExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelper {
     }
   }
 
+  test("Decimal modulus with Decimal256 intermediate type") {
+    // regression test for https://github.com/apache/datafusion-comet/issues/2911
+    withTable("test") {
+      sql("create table test(a decimal(33, 29), b decimal(28, 17)) using parquet")
+      sql(
+        "insert into test values (-6788.53035340376888409034576923353, " +
+          "70948216565.90127985418365471)")
+      withSQLConf(
+        "spark.comet.enabled" -> "true",
+        "spark.sql.decimalOperations.allowPrecisionLoss" -> "true") {
+        val df = sql("select a, b, a % b from test")
+        df.collect()
+      }
+    }
+  }
+
   test("Decimal random number tests") {
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     def makeNum(p: Int, s: Int): String = {
       val int1 = rand.nextLong()
       val int2 = rand.nextLong().abs
diff --git a/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala b/spark/src/test/scala/org/apache/comet/parquet/ParquetReadSuite.scala
@@ -348,7 +348,7 @@ abstract class ParquetReadSuite extends CometTestBase {
   }
 
   test("mixed nulls and non-nulls") {
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     val data = (0 to 100).map { i =>
       val row: (Boolean, Integer, java.lang.Long, java.lang.Float, java.lang.Double, String) = {
         if (rand.nextBoolean()) {
@@ -403,7 +403,7 @@ abstract class ParquetReadSuite extends CometTestBase {
         pageSize = pageSize,
         dictionaryPageSize = pageSize)
 
-      val rand = scala.util.Random
+      val rand = new scala.util.Random(42)
       val expected = (0 until n).map { i =>
         if (rand.nextBoolean()) {
           None
@@ -626,7 +626,7 @@ abstract class ParquetReadSuite extends CometTestBase {
         dictionaryPageSize = dictionaryPageSize,
         pageRowCountLimit = pageRowCount)
 
-      val rand = scala.util.Random
+      val rand = new scala.util.Random(42)
       val expected = (0 until n).map { i =>
         // use a single value for the first page, to make sure dictionary encoding kicks in
         val value = if (i < pageRowCount) i % 8 else i
@@ -814,7 +814,7 @@ abstract class ParquetReadSuite extends CometTestBase {
         dictionaryPageSize = pageSize,
         rowGroupSize = 1024 * 128)
 
-      val rand = scala.util.Random
+      val rand = new scala.util.Random(42)
       val expected = (0 until n).map { i =>
         if (rand.nextBoolean()) {
           None
@@ -1564,7 +1564,7 @@ abstract class ParquetReadSuite extends CometTestBase {
         pageSize = pageSize,
         dictionaryPageSize = pageSize)
 
-      val rand = scala.util.Random
+      val rand = new scala.util.Random(42)
       val expected = (0 until n).map { i =>
         if (rand.nextBoolean()) {
           None
@@ -1662,7 +1662,7 @@ abstract class ParquetReadSuite extends CometTestBase {
         dictionaryPageSize = dictionaryPageSize,
         pageRowCountLimit = pageRowCount)
 
-      val rand = scala.util.Random
+      val rand = new scala.util.Random(42)
       val expected = (0 until n).map { i =>
         // use a single value for the first page, to make sure dictionary encoding kicks in
         val value = if (i < pageRowCount) i % 8 else i
diff --git a/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala b/spark/src/test/scala/org/apache/spark/sql/CometTestBase.scala
@@ -694,7 +694,7 @@ abstract class CometTestBase
 
     val idGenerator = new AtomicInteger(0)
 
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     val data = (begin until end).map { i =>
       if (nullEnabled && rand.nextBoolean()) {
         None
@@ -788,7 +788,7 @@ abstract class CometTestBase
       rowGroupSize = rowGroupSize)
     val div = if (dictionaryEnabled) 10 else n // maps value to a small range for dict to kick in
 
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     val expected = (0 until n).map { i =>
       if (rand.nextBoolean()) {
         None
@@ -842,7 +842,7 @@ abstract class CometTestBase
       rowGroupSize = rowGroupSize)
     val div = if (dictionaryEnabled) 10 else n // maps value to a small range for dict to kick in
 
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     val expected = (0 until n).map { i =>
       if (rand.nextBoolean()) {
         None
@@ -1240,7 +1240,7 @@ abstract class CometTestBase
     val schema = MessageTypeParser.parseMessageType(schemaStr)
     val writer = createParquetWriter(schema, path, dictionaryEnabled = true)
 
-    val rand = scala.util.Random
+    val rand = new scala.util.Random(42)
     val expected = (0 until total).map { i =>
       // use a single value for the first page, to make sure dictionary encoding kicks in
       if (rand.nextBoolean()) None