Skip to content

Commit a049750

Browse files
authored
chore: specify heap, metadata mem sizes for sql_core* tests (apache#4623)
1 parent 2e35d9b commit a049750

4 files changed

Lines changed: 82 additions & 11 deletions

File tree

.github/workflows/ci.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,8 @@ jobs:
5656
github.event_name != 'pull_request' ||
5757
github.event.action != 'labeled' ||
5858
github.event.label.name == 'run-spark-3.4-tests' ||
59+
60+
github.event.label.name == 'run-spark-4.0-tests' ||
5961
github.event.label.name == 'run-spark-4.1-tests'
6062
steps:
6163
- uses: actions/checkout@v6
@@ -231,11 +233,15 @@ jobs:
231233
spark_4_0:
232234
name: Spark SQL Tests (Spark 4.0)
233235
needs: changes
236+
# Main-only by default; PRs need the `run-spark-4.0-tests` label. Swapped
237+
# with spark_4_1 on the `oom` branch to validate the memory caps against
238+
# Spark 4.1 by default.
234239
if: |
235240
needs.changes.outputs.spark_4_0 == 'true' &&
236241
(github.event_name == 'push' ||
237242
github.event_name == 'workflow_dispatch' ||
238-
github.event_name == 'pull_request')
243+
(github.event_name == 'pull_request' &&
244+
contains(github.event.pull_request.labels.*.name, 'run-spark-4.0-tests')))
239245
uses: ./.github/workflows/spark_sql_test_reusable.yml
240246
with:
241247
spark-short: '4.0'
@@ -245,13 +251,11 @@ jobs:
245251
spark_4_1:
246252
name: Spark SQL Tests (Spark 4.1)
247253
needs: changes
248-
# Main-only by default; PRs need the `run-spark-4.1-tests` label.
249254
if: |
250255
needs.changes.outputs.spark_4_1 == 'true' &&
251256
(github.event_name == 'push' ||
252257
github.event_name == 'workflow_dispatch' ||
253-
(github.event_name == 'pull_request' &&
254-
contains(github.event.pull_request.labels.*.name, 'run-spark-4.1-tests')))
258+
github.event_name == 'pull_request')
255259
uses: ./.github/workflows/spark_sql_test_reusable.yml
256260
with:
257261
spark-short: '4.1'

.github/workflows/spark_sql_test_reusable.yml

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -108,9 +108,12 @@ jobs:
108108
matrix:
109109
module:
110110
- {name: "catalyst", args1: "catalyst/test", args2: ""}
111-
- {name: "sql_core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest"}
112-
- {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest"}
113-
- {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest"}
111+
# sql_core-* set HEAP_SIZE / METASPACE_SIZE so SparkBuild.scala caps
112+
# forked test JVMs below the Spark defaults (-Xmx4g, MaxMetaspaceSize=1300m),
113+
# leaving headroom for SBT (SBT_MEM=3072) inside the 7 GB runner budget.
114+
- {name: "sql_core-1", args1: "", args2: "sql/testOnly * -- -l org.apache.spark.tags.ExtendedSQLTest -l org.apache.spark.tags.SlowSQLTest", heap: "3g", metaspace: "1g"}
115+
- {name: "sql_core-2", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.ExtendedSQLTest", heap: "3g", metaspace: "1g"}
116+
- {name: "sql_core-3", args1: "", args2: "sql/testOnly * -- -n org.apache.spark.tags.SlowSQLTest", heap: "3g", metaspace: "1g"}
114117
- {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
115118
- {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
116119
- {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
@@ -152,9 +155,18 @@ jobs:
152155
if [ "${{ inputs.spark-short }}" != "4.0" ] || [ "${{ inputs.java }}" != "21" ]; then
153156
export SERIAL_SBT_TESTS=1
154157
fi
158+
# Per-row forked-test-JVM caps (read by Spark's SparkBuild.scala). Only
159+
# exported when the matrix entry sets them; rows without these fields
160+
# keep Spark's defaults (-Xmx4g, -XX:MaxMetaspaceSize=1300m).
161+
if [ -n "${{ matrix.module.heap }}" ]; then
162+
export HEAP_SIZE="${{ matrix.module.heap }}"
163+
fi
164+
if [ -n "${{ matrix.module.metaspace }}" ]; then
165+
export METASPACE_SIZE="${{ matrix.module.metaspace }}"
166+
fi
155167
# Cap parallel forked test JVMs at 1 so that even when
156168
# SparkParallelTestGrouping is enabled we don't blow the
157-
# 7 GB runner budget (each forked test JVM has -Xmx2g).
169+
# 7 GB runner budget.
158170
NOLINT_ON_COMPILE=true ENABLE_COMET=true ENABLE_COMET_ONHEAP=true ENABLE_COMET_LOG_FALLBACK_REASONS=${{ inputs.collect-fallback-logs }} \
159171
build/sbt -Dsbt.log.noformat=true -mem $SBT_MEM \
160172
'set Global / concurrentRestrictions := Seq(Tags.limit(Tags.ForkedTestGroup, 1))' \

native/spark-expr/src/array_funcs/list_extract.rs

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -126,8 +126,17 @@ impl PhysicalExpr for ListExtract {
126126
}
127127

128128
fn nullable(&self, input_schema: &Schema) -> DataFusionResult<bool> {
129-
// Only non-nullable if fail_on_error is enabled and the element is non-nullable
130-
Ok(!self.fail_on_error || self.child_field(input_schema)?.is_nullable())
129+
// The result is null if any of the following holds:
130+
// - fail_on_error is disabled (an out-of-bounds index yields null), or
131+
// - the array itself is null (a null array row yields null), or
132+
// - the ordinal is null (a null index yields null), or
133+
// - the extracted element is itself nullable.
134+
// It is only non-nullable when fail_on_error is enabled and none of the inputs
135+
// nor the element can be null.
136+
Ok(!self.fail_on_error
137+
|| self.child.nullable(input_schema)?
138+
|| self.ordinal.nullable(input_schema)?
139+
|| self.child_field(input_schema)?.is_nullable())
131140
}
132141

133142
fn evaluate(&self, batch: &RecordBatch) -> DataFusionResult<ColumnarValue> {
@@ -330,10 +339,40 @@ impl Display for ListExtract {
330339
mod test {
331340
use super::*;
332341
use arrow::array::{Array, Int32Array, ListArray};
333-
use arrow::datatypes::Int32Type;
342+
use arrow::datatypes::{Field, Int32Type};
334343
use datafusion::common::{Result, ScalarValue};
344+
use datafusion::physical_expr::expressions::Column;
335345
use datafusion::physical_plan::ColumnarValue;
336346

347+
#[test]
348+
fn test_nullable_when_array_is_nullable() -> Result<()> {
349+
// Regression test for SPARK-55747: GetArrayItem over a nullable array (e.g. the
350+
// result of split() on a null input) must report nullable=true even with
351+
// fail_on_error enabled (ANSI), because a null array row yields a null result.
352+
// The list element field is non-nullable here, mirroring split()'s output.
353+
let element_field = Arc::new(Field::new("item", DataType::Int32, false));
354+
let schema = Schema::new(vec![
355+
Field::new("arr", DataType::List(element_field), true),
356+
Field::new("idx", DataType::Int32, false),
357+
]);
358+
359+
let list_extract = ListExtract::new(
360+
Arc::new(Column::new("arr", 0)),
361+
Arc::new(Column::new("idx", 1)),
362+
None,
363+
false, // one_based (GetArrayItem)
364+
true, // fail_on_error (ANSI enabled)
365+
None,
366+
crate::create_query_context_map(),
367+
);
368+
369+
assert!(
370+
list_extract.nullable(&schema)?,
371+
"ListExtract over a nullable array must be nullable even with fail_on_error"
372+
);
373+
Ok(())
374+
}
375+
337376
#[test]
338377
fn test_list_extract_default_value() -> Result<()> {
339378
let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![

spark/src/test/scala/org/apache/comet/CometArrayExpressionSuite.scala

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,4 +1106,20 @@ class CometArrayExpressionSuite extends CometTestBase with AdaptiveSparkPlanHelp
11061106
}
11071107
}
11081108
}
1109+
1110+
// https://issues.apache.org/jira/browse/SPARK-55747
1111+
test("(ansi) GetArrayItem on null array from split()") {
1112+
withSQLConf(
1113+
SQLConf.ANSI_ENABLED.key -> "true",
1114+
CometConf.COMET_ENABLED.key -> "true",
1115+
CometConf.COMET_EXEC_ENABLED.key -> "true") {
1116+
withTable("test_split_null") {
1117+
sql("CREATE TABLE test_split_null(s STRING) USING parquet")
1118+
sql("INSERT INTO test_split_null VALUES ('a,b,c'), (NULL)")
1119+
// split(NULL, ...) yields a null array; arr[0] on a null array must return NULL
1120+
// rather than failing the non-nullable schema validation in native execution.
1121+
checkSparkAnswerAndOperator(sql("SELECT split(s, ',')[0] FROM test_split_null"))
1122+
}
1123+
}
1124+
}
11091125
}

0 commit comments

Comments
 (0)