Merge branch 'skip-ci-bench' into improve-date-time-bench

andygrove · andygrove · commit 15d8515ba319 · 2026-01-02T15:25:13.000-07:00
diff --git a/.github/workflows/pr_benchmark_check.yml b/.github/workflows/pr_benchmark_check.yml
@@ -0,0 +1,85 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Lightweight CI for benchmark-only changes - verifies compilation and linting
+# without running full test suites
+
+name: PR Benchmark Check
+
+concurrency:
+  group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
+  cancel-in-progress: true
+
+on:
+  push:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  pull_request:
+    paths:
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
+  workflow_dispatch:
+
+env:
+  RUST_VERSION: stable
+
+jobs:
+  benchmark-check:
+    name: Benchmark Compile & Lint Check
+    runs-on: ubuntu-latest
+    container:
+      image: amd64/rust
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{ env.RUST_VERSION }}
+          jdk-version: 17
+
+      - name: Check Cargo fmt
+        run: |
+          cd native
+          cargo fmt --all -- --check --color=never
+
+      - name: Check Cargo clippy
+        run: |
+          cd native
+          cargo clippy --color=never --all-targets --workspace -- -D warnings
+
+      - name: Check benchmark compilation
+        run: |
+          cd native
+          cargo check --benches
+
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-benchmark-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-benchmark-maven-
+
+      - name: Check Scala compilation and linting
+        run: |
+          ./mvnw -B compile test-compile scalafix:scalafix -Dscalafix.mode=CHECK -Psemanticdb -DskipTests
diff --git a/.github/workflows/pr_build_linux.yml b/.github/workflows/pr_build_linux.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/pr_build_macos.yml b/.github/workflows/pr_build_macos.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
diff --git a/.github/workflows/spark_sql_test.yml b/.github/workflows/spark_sql_test.yml
@@ -27,11 +27,17 @@ on:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   pull_request:
     paths-ignore:
       - "doc/**"
       - "docs/**"
       - "**.md"
+      - "native/core/benches/**"
+      - "native/spark-expr/benches/**"
+      - "spark/src/test/scala/org/apache/spark/sql/benchmark/**"
   # manual trigger
   # https://docs.github.com/en/actions/managing-workflow-runs/manually-running-a-workflow
   workflow_dispatch:
@@ -59,6 +65,10 @@ jobs:
           - {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
           - {name: "sql_hive-2", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.ExtendedHiveTest"}
           - {name: "sql_hive-3", args1: "", args2: "hive/testOnly * -- -n org.apache.spark.tags.SlowHiveTest"}
+        # Skip sql_hive-1 for Spark 4.0 due to https://github.com/apache/datafusion-comet/issues/2946
+        exclude:
+          - spark-version: {short: '4.0', full: '4.0.1', java: 17}
+            module: {name: "sql_hive-1", args1: "", args2: "hive/testOnly * -- -l org.apache.spark.tags.ExtendedHiveTest -l org.apache.spark.tags.SlowHiveTest"}
       fail-fast: false
     name: spark-sql-${{ matrix.module.name }}/${{ matrix.os }}/spark-${{ matrix.spark-version.full }}/java-${{ matrix.spark-version.java }}
     runs-on: ${{ matrix.os }}
diff --git a/native/spark-expr/Cargo.toml b/native/spark-expr/Cargo.toml
@@ -80,6 +80,10 @@ harness = false
 name = "padding"
 harness = false
 
+[[bench]]
+name = "normalize_nan"
+harness = false
+
 [[test]]
 name = "test_udf_registration"
 path = "tests/spark_expr_reg.rs"
diff --git a/native/spark-expr/benches/normalize_nan.rs b/native/spark-expr/benches/normalize_nan.rs
@@ -0,0 +1,88 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Benchmarks for NormalizeNaNAndZero expression
+
+use arrow::array::Float64Array;
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
+use datafusion::physical_expr::expressions::Column;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion_comet_spark_expr::NormalizeNaNAndZero;
+use std::hint::black_box;
+use std::sync::Arc;
+
+const BATCH_SIZE: usize = 8192;
+
+fn make_col(name: &str, index: usize) -> Arc<dyn PhysicalExpr> {
+    Arc::new(Column::new(name, index))
+}
+
+/// Create a batch with float64 column containing various values including NaN and -0.0
+fn create_float_batch(nan_pct: usize, neg_zero_pct: usize, null_pct: usize) -> RecordBatch {
+    let mut values: Vec<Option<f64>> = Vec::with_capacity(BATCH_SIZE);
+
+    for i in 0..BATCH_SIZE {
+        if null_pct > 0 && i % (100 / null_pct.max(1)) == 0 {
+            values.push(None);
+        } else if nan_pct > 0 && i % (100 / nan_pct.max(1)) == 1 {
+            values.push(Some(f64::NAN));
+        } else if neg_zero_pct > 0 && i % (100 / neg_zero_pct.max(1)) == 2 {
+            values.push(Some(-0.0));
+        } else {
+            values.push(Some(i as f64 * 1.5));
+        }
+    }
+
+    let array = Float64Array::from(values);
+    let schema = Schema::new(vec![Field::new("c1", DataType::Float64, true)]);
+
+    RecordBatch::try_new(Arc::new(schema), vec![Arc::new(array)]).unwrap()
+}
+
+fn bench_normalize_nan_and_zero(c: &mut Criterion) {
+    let mut group = c.benchmark_group("normalize_nan_and_zero");
+
+    // Test with different percentages of special values
+    let test_cases = [
+        ("no_special", 0, 0, 0),
+        ("10pct_nan", 10, 0, 0),
+        ("10pct_neg_zero", 0, 10, 0),
+        ("10pct_null", 0, 0, 10),
+        ("mixed_10pct", 5, 5, 5),
+        ("all_normal", 0, 0, 0),
+    ];
+
+    for (name, nan_pct, neg_zero_pct, null_pct) in test_cases {
+        let batch = create_float_batch(nan_pct, neg_zero_pct, null_pct);
+
+        let normalize_expr = Arc::new(NormalizeNaNAndZero::new(
+            DataType::Float64,
+            make_col("c1", 0),
+        ));
+
+        group.bench_with_input(BenchmarkId::new("float64", name), &batch, |b, batch| {
+            b.iter(|| black_box(normalize_expr.evaluate(black_box(batch)).unwrap()));
+        });
+    }
+
+    group.finish();
+}
+
+criterion_group!(benches, bench_normalize_nan_and_zero);
+criterion_main!(benches);
diff --git a/native/spark-expr/src/math_funcs/internal/normalize_nan.rs b/native/spark-expr/src/math_funcs/internal/normalize_nan.rs
@@ -15,10 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use arrow::compute::unary;
 use arrow::datatypes::{DataType, Schema};
 use arrow::{
-    array::{as_primitive_array, ArrayAccessor, ArrayIter, Float32Array, Float64Array},
-    datatypes::{ArrowNativeType, Float32Type, Float64Type},
+    array::{as_primitive_array, Float32Array, Float64Array},
+    datatypes::{Float32Type, Float64Type},
     record_batch::RecordBatch,
 };
 use datafusion::logical_expr::ColumnarValue;
@@ -78,14 +79,16 @@ impl PhysicalExpr for NormalizeNaNAndZero {
 
         match &self.data_type {
             DataType::Float32 => {
-                let v = eval_typed(as_primitive_array::<Float32Type>(&array));
-                let new_array = Float32Array::from(v);
-                Ok(ColumnarValue::Array(Arc::new(new_array)))
+                let input = as_primitive_array::<Float32Type>(&array);
+                // Use unary which operates directly on values buffer without intermediate allocation
+                let result: Float32Array = unary(input, normalize_float);
+                Ok(ColumnarValue::Array(Arc::new(result)))
             }
             DataType::Float64 => {
-                let v = eval_typed(as_primitive_array::<Float64Type>(&array));
-                let new_array = Float64Array::from(v);
-                Ok(ColumnarValue::Array(Arc::new(new_array)))
+                let input = as_primitive_array::<Float64Type>(&array);
+                // Use unary which operates directly on values buffer without intermediate allocation
+                let result: Float64Array = unary(input, normalize_float);
+                Ok(ColumnarValue::Array(Arc::new(result)))
             }
             dt => panic!("Unexpected data type {dt:?}"),
         }
@@ -106,60 +109,21 @@ impl PhysicalExpr for NormalizeNaNAndZero {
     }
 }
 
-fn eval_typed<V: FloatDouble, T: ArrayAccessor<Item = V>>(input: T) -> Vec<Option<V>> {
-    let iter = ArrayIter::new(input);
-    iter.map(|o| {
-        o.map(|v| {
-            if v.is_nan() {
-                v.nan()
-            } else if v.is_neg_zero() {
-                v.zero()
-            } else {
-                v
-            }
-        })
-    })
-    .collect()
+/// Normalize a floating point value by converting all NaN representations to a canonical NaN
+/// and negative zero to positive zero. This is used for Spark's comparison semantics.
+#[inline]
+fn normalize_float<T: num::Float>(v: T) -> T {
+    if v.is_nan() {
+        T::nan()
+    } else if v == T::neg_zero() {
+        T::zero()
+    } else {
+        v
+    }
 }
 
 impl Display for NormalizeNaNAndZero {
     fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
         write!(f, "FloatNormalize [child: {}]", self.child)
     }
 }
-
-trait FloatDouble: ArrowNativeType {
-    fn is_nan(&self) -> bool;
-    fn nan(&self) -> Self;
-    fn is_neg_zero(&self) -> bool;
-    fn zero(&self) -> Self;
-}
-
-impl FloatDouble for f32 {
-    fn is_nan(&self) -> bool {
-        f32::is_nan(*self)
-    }
-    fn nan(&self) -> Self {
-        f32::NAN
-    }
-    fn is_neg_zero(&self) -> bool {
-        *self == -0.0
-    }
-    fn zero(&self) -> Self {
-        0.0
-    }
-}
-impl FloatDouble for f64 {
-    fn is_nan(&self) -> bool {
-        f64::is_nan(*self)
-    }
-    fn nan(&self) -> Self {
-        f64::NAN
-    }
-    fn is_neg_zero(&self) -> bool {
-        *self == -0.0
-    }
-    fn zero(&self) -> Self {
-        0.0
-    }
-}