chore: specify heap, metadata mem sizes for sql_core* tests (apache#4623)

comphead · web-flow · commit a049750c5e9a · 2026-06-11T16:32:18.000-07:00
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -56,6 +56,8 @@ jobs:
       github.event_name != 'pull_request' ||
       github.event.action != 'labeled' ||
       github.event.label.name == 'run-spark-3.4-tests' ||
+
+      github.event.label.name == 'run-spark-4.0-tests' ||
       github.event.label.name == 'run-spark-4.1-tests'
     steps:
       - uses: actions/checkout@v6
@@ -231,11 +233,15 @@ jobs:
   spark_4_0:
     name: Spark SQL Tests (Spark 4.0)
     needs: changes
+    # Main-only by default; PRs need the `run-spark-4.0-tests` label. Swapped
+    # with spark_4_1 on the `oom` branch to validate the memory caps against
+    # Spark 4.1 by default.
     if: |
       needs.changes.outputs.spark_4_0 == 'true' &&
       (github.event_name == 'push' ||
        github.event_name == 'workflow_dispatch' ||
-       github.event_name == 'pull_request')
+       (github.event_name == 'pull_request' &&
+        contains(github.event.pull_request.labels.*.name, 'run-spark-4.0-tests')))
     uses: ./.github/workflows/spark_sql_test_reusable.yml
     with:
       spark-short: '4.0'
@@ -245,13 +251,11 @@ jobs:
   spark_4_1:
     name: Spark SQL Tests (Spark 4.1)
     needs: changes
-    # Main-only by default; PRs need the `run-spark-4.1-tests` label.
     if: |
       needs.changes.outputs.spark_4_1 == 'true' &&
       (github.event_name == 'push' ||
        github.event_name == 'workflow_dispatch' ||
-       (github.event_name == 'pull_request' &&
-        contains(github.event.pull_request.labels.*.name, 'run-spark-4.1-tests')))
+       github.event_name == 'pull_request')
     uses: ./.github/workflows/spark_sql_test_reusable.yml
     with:
       spark-short: '4.1'
diff --git a/.github/workflows/spark_sql_test_reusable.yml b/.github/workflows/spark_sql_test_reusable.yml
@@ -108,9 +108,12 @@ jobs:
       matrix:
         module:
           - {name: "catalyst", args1: "catalyst/test", args2: ""}
-          - {name: "sql_core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest"}
-          - {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
-          - {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
+          # sql_core-* set HEAP_SIZE / METASPACE_SIZE so SparkBuild.scala caps
+          # forked test JVMs below the Spark defaults (-Xmx4g, MaxMetaspaceSize=1300m),
+          # leaving headroom for SBT (SBT_MEM=3072) inside the 7 GB runner budget.
+          - {name: "sql_core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest", heap: "3g", metaspace: "1g"}
+          - {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest", heap: "3g", metaspace: "1g"}
+          - {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest", heap: "3g", metaspace: "1g"}
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
@@ -152,9 +155,18 @@ jobs:
           if [ "${{ inputs.spark-short }}" != "4.0" ] || [ "${{ inputs.java }}" != "21" ]; then
             export SERIAL_SBT_TESTS=1
           fi
+          # Per-row forked-test-JVM caps (read by Spark's SparkBuild.scala). Only
+          # exported when the matrix entry sets them; rows without these fields
+          # keep Spark's defaults (-Xmx4g, -XX:MaxMetaspaceSize=1300m).
+          if [ -n "${{ matrix.module.heap }}" ]; then
+            export HEAP_SIZE="${{ matrix.module.heap }}"
+          fi
+          if [ -n "${{ matrix.module.metaspace }}" ]; then
+            export METASPACE_SIZE="${{ matrix.module.metaspace }}"
+          fi
           # Cap parallel forked test JVMs at 1 so that even when
           # SparkParallelTestGrouping is enabled we don't blow the
-          # 7 GB runner budget (each forked test JVM has -Xmx2g).
+          # 7 GB runner budget.
           NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ inputs.collect-fallback-logs }} \
             build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM \
               'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \
diff --git a/native/spark-expr/src/array_funcs/list_extract.rs b/native/spark-expr/src/array_funcs/list_extract.rs
@@ -126,8 +126,17 @@ impl PhysicalExpr for ListExtract {
     }
 
     fn nullable(&self, input_schema: &Schema) -> DataFusionResult<bool> {
-        // Only non-nullable if fail_on_error is enabled and the element is non-nullable
-        Ok(!self.fail_on_error || self.child_field(input_schema)?.is_nullable())
+        // The result may be null if any of the following holds:
+        //  - fail_on_error is disabled (an out-of-bounds index yields null), or
+        //  - the array is nullable (a null array row yields null), or
+        //  - the ordinal is nullable (a null index yields null), or
+        //  - the extracted element is itself nullable.
+        // It is only non-nullable when fail_on_error is enabled and neither the inputs
+        // nor the element can be null.
+        Ok(!self.fail_on_error
+            || self.child.nullable(input_schema)?
+            || self.ordinal.nullable(input_schema)?
+            || self.child_field(input_schema)?.is_nullable())
     }
 
     fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult<ColumnarValue> {
@@ -330,10 +339,40 @@ impl Display for ListExtract {
 mod test {
     use super::*;
     use arrow::array::{Array, Int32Array, ListArray};
-    use arrow::datatypes::Int32Type;
+    use arrow::datatypes::{Field, Int32Type};
     use datafusion::common::{Result, ScalarValue};
+    use datafusion::physical_expr::expressions::Column;
     use datafusion::physical_plan::ColumnarValue;
 
+    #[test]
+    fn test_nullable_when_array_is_nullable() -> Result<()> {
+        // Regression test for SPARK-55747: GetArrayItem over a nullable array (e.g. the
+        // result of split() on a null input) must report nullable=true even with
+        // fail_on_error enabled (ANSI), because a null array row yields a null result.
+        // The list element field is non-nullable here, mirroring split()'s output.
+        let element_field = Arc::new(Field::new("item", DataType::Int32, false));
+        let schema = Schema::new(vec![
+            Field::new("arr", DataType::List(element_field), true),
+            Field::new("idx", DataType::Int32, false),
+        ]);
+
+        let list_extract = ListExtract::new(
+            Arc::new(Column::new("arr", 0)),
+            Arc::new(Column::new("idx", 1)),
+            None,
+            false, // one_based (GetArrayItem)
+            true,  // fail_on_error (ANSI enabled)
+            None,
+            crate::create_query_context_map(),
+        );
+
+        assert!(
+            list_extract.nullable(&schema)?,
+            "ListExtract over a nullable array must be nullable even with fail_on_error"
+        );
+        Ok(())
+    }
+
     #[test]
     fn test_list_extract_default_value() -> Result<()> {
         let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
diff --git a/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala b/spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala
@@ -1106,4 +1106,20 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp
       }
     }
   }
+
+  // https://issues.apache.org/jira/browse/SPARK-55747
+  test("(ansi) GetArrayItem on null array from split()") {
+    withSQLConf(
+      SQLConf.ANSI_ENABLED.key -> "true",
+      CometConf.COMET_ENABLED.key -> "true",
+      CometConf.COMET_EXEC_ENABLED.key -> "true") {
+      withTable("test_split_null") {
+        sql("CREATE TABLE test_split_null(s STRING) USING parquet")
+        sql("INSERT INTO test_split_null VALUES ('a,b,c'), (NULL)")
+        // split(NULL, ...) yields a null array; arr[0] on a null array must return NULL
+        // rather than failing the non-nullable schema validation in native execution.
+        checkSparkAnswerAndOperator(sql("SELECT split(s, ',')[0] FROM test_split_null"))
+      }
+    }
+  }
 }

Original file line number	Diff line number	Diff line change
`@@ -1106,4 +1106,20 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp`
`1106`	`1106`	`}`
`1107`	`1107`	`}`
`1108`	`1108`	`}`
	`1109`	`+`
	`1110`	`+ // https://issues.apache.org/jira/browse/SPARK-55747`
	`1111`	`+ test("(ansi) GetArrayItem on null array from split()") {`
	`1112`	`+ withSQLConf(`
	`1113`	`+ SQLConf.ANSI_ENABLED.key -> "true",`
	`1114`	`+ CometConf.COMET_ENABLED.key -> "true",`
	`1115`	`+ CometConf.COMET_EXEC_ENABLED.key -> "true") {`
	`1116`	`+ withTable("test_split_null") {`
	`1117`	`+ sql("CREATE TABLE test_split_null(s STRING) USING parquet")`
	`1118`	`+ sql("INSERT INTO test_split_null VALUES ('a,b,c'), (NULL)")`
	`1119`	`+ // split(NULL, ...) yields a null array; arr[0] on a null array must return NULL`
	`1120`	`+ // rather than failing the non-nullable schema validation in native execution.`
	`1121`	`+ checkSparkAnswerAndOperator(sql("SELECT split(s, ',')[0] FROM test_split_null"))`
	`1122`	`+ }`
	`1123`	`+ }`
	`1124`	`+ }`
`1109`	`1125`	`}`