Skip to content

Commit 9bbc28b

Browse files
raushanprabhakar1Raushan Prabhakarblagininadriangb
authored
Adding Use of arrow's has_true() / has_false() (apache#21806)
## Which issue does this PR close? Closes apache#21784 ## Rationale for this change Apache Arrow added `BooleanArray::has_true()` and `has_false()` so callers can answer “any true/false?” without a full bit count. That can short-circuit and avoid unnecessary work compared to patterns like `true_count() == 0` or `true_count() > 0`. This PR applies those APIs across DataFusion where the logic is purely existential (or equivalent via null-safe “all true” / “no true” checks), matching the audit suggested in the issue. ## What changes are included in this PR? - Replace hot-path checks that only needed existence or emptiness with `has_true()` / `has_false()` (and `null_count()` where needed), including: - Nested/array helpers (`array_has`, list replace), Spark `array_contains` null-semantics fast path - Physical expressions: `evaluate_selection`, binary AND/OR short-circuit, CASE/IN list loops - `scatter` fast paths - Top-K filter handling, sort-merge join filter, nested-loop join bitmap checks - Parquet column stats (`metadata.rs`, `has_any_exact_match`) - Keep `true_count()` / `false_count()` where an actual count is required (row counts, metrics, selectivity, `to_array(n)`, etc.) - Import `arrow::array::Array` where `null_count()` is used on `BooleanArray` in trait-heavy paths ## Are these changes tested? Existing tests cover this behavior; the edits are semantics-preserving refactors (same conditions, cheaper primitives). No new tests were added. ## Are there any user-facing changes? No. Behavior should be unchanged; this is an internal performance/clarity improvement. --------- Co-authored-by: Raushan Prabhakar <ros@Raushans-MacBook-Air.local> Co-authored-by: Dmitrii Blaginin <dmitrii@blaginin.me> Co-authored-by: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com>
1 parent d648982 commit 9bbc28b

12 files changed

Lines changed: 38 additions & 53 deletions

File tree

datafusion/datasource-parquet/src/metadata.rs

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -520,12 +520,10 @@ fn summarize_column_statistics(
520520

521521
// handle the common special case when all row groups have exact statistics
522522
let exactness = &is_max_value_exact_stat;
523-
if !exactness.is_empty()
524-
&& exactness.null_count() == 0
525-
&& exactness.true_count() == exactness.len()
523+
if !exactness.is_empty() && exactness.null_count() == 0 && !exactness.has_false()
526524
{
527525
accumulators.is_max_value_exact[logical_schema_index] = Some(true);
528-
} else if exactness.true_count() == 0 {
526+
} else if !exactness.has_true() {
529527
accumulators.is_max_value_exact[logical_schema_index] = Some(false);
530528
} else {
531529
let val = max_acc.evaluate()?;
@@ -539,12 +537,10 @@ fn summarize_column_statistics(
539537

540538
// handle the common special case when all row groups have exact statistics
541539
let exactness = &is_min_value_exact_stat;
542-
if !exactness.is_empty()
543-
&& exactness.null_count() == 0
544-
&& exactness.true_count() == exactness.len()
540+
if !exactness.is_empty() && exactness.null_count() == 0 && !exactness.has_false()
545541
{
546542
accumulators.is_min_value_exact[logical_schema_index] = Some(true);
547-
} else if exactness.true_count() == 0 {
543+
} else if !exactness.has_true() {
548544
accumulators.is_min_value_exact[logical_schema_index] = Some(false);
549545
} else {
550546
let val = min_acc.evaluate()?;
@@ -675,7 +671,7 @@ fn has_any_exact_match(
675671
let scalar_array = value.to_scalar().ok()?;
676672
let eq_mask = eq(&scalar_array, &array).ok()?;
677673
let combined_mask = and(&eq_mask, exactness).ok()?;
678-
Some(combined_mask.true_count() > 0)
674+
Some(combined_mask.has_true())
679675
}
680676

681677
/// Wrapper to implement [`FileMetadata`] for [`ParquetMetaData`].

datafusion/functions-nested/src/array_has.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -338,7 +338,7 @@ fn array_has_dispatch_for_array<'a>(
338338
let is_nested = arr.data_type().is_nested();
339339
let needle_row = Scalar::new(needle.slice(i, 1));
340340
let eq_array = compare_with_eq(&arr, &needle_row, is_nested)?;
341-
result.append(eq_array.true_count() > 0);
341+
result.append(eq_array.has_true());
342342
}
343343

344344
Ok(Arc::new(BooleanArray::new(result.finish(), combined_nulls)))

datafusion/functions-nested/src/replace.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ fn general_replace<O: OffsetSizeTrait>(
347347
let mut counter = 0;
348348

349349
// All elements are false, no need to replace, just copy original data
350-
if eq_array.false_count() == eq_array.len() {
350+
if !eq_array.has_true() {
351351
mutable.extend(
352352
original_idx.to_usize().unwrap(),
353353
start.to_usize().unwrap(),

datafusion/physical-expr-common/src/physical_expr.rs

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ use std::sync::Arc;
2323

2424
use crate::utils::scatter;
2525

26-
use arrow::array::{ArrayRef, BooleanArray, new_empty_array};
26+
use arrow::array::{Array, ArrayRef, BooleanArray, new_empty_array};
2727
use arrow::compute::filter_record_batch;
2828
use arrow::datatypes::{DataType, Field, FieldRef, Schema};
2929
use arrow::record_batch::RecordBatch;
@@ -109,17 +109,15 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash {
109109
);
110110
}
111111

112-
let selection_count = selection.true_count();
113-
114112
// First, check if we can avoid filtering altogether.
115-
if selection_count == row_count {
113+
if selection.null_count() == 0 && !selection.has_false() {
116114
// All values from the `selection` filter are true and match the input batch.
117115
// No need to perform any filtering.
118116
return self.evaluate(batch);
119117
}
120118

121119
// Next, prepare the result array for each 'true' row in the selection vector.
122-
let filtered_result = if selection_count == 0 {
120+
let filtered_result = if !selection.has_true() {
123121
// Do not call `evaluate` when the selection is empty.
124122
// `evaluate_selection` is used to conditionally evaluate expressions.
125123
// When the expression in question is fallible, evaluating it with an empty
@@ -132,7 +130,7 @@ pub trait PhysicalExpr: Any + Send + Sync + Display + Debug + DynEq + DynHash {
132130
// If we reach this point, there's no other option than to filter the batch.
133131
// This is a fairly costly operation since it requires creating partial copies
134132
// (worst case of length `row_count - 1`) of all the arrays in the record batch.
135-
// The resulting `filtered_batch` will contain `selection_count` rows.
133+
// The resulting `filtered_batch` will contain one row per true in `selection`.
136134
let filtered_batch = filter_record_batch(batch, selection)?;
137135
self.evaluate(&filtered_batch)?
138136
};

datafusion/physical-expr-common/src/utils.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,18 @@ pub fn scatter(mask: &BooleanArray, truthy: &dyn Array) -> Result<ArrayRef> {
8383
};
8484

8585
let output_len = mask.len();
86-
let count = mask.true_count();
8786

8887
// Fast path: no true values mean all-null object
89-
if count == 0 {
88+
if !mask.has_true() {
9089
return Ok(new_null_array(truthy.data_type(), output_len));
9190
}
9291

9392
// Fast path: all true means output = truthy
94-
if count == output_len {
93+
if mask.null_count() == 0 && !mask.has_false() {
9594
return Ok(truthy.slice(0, truthy.len()));
9695
}
9796

97+
let count = mask.true_count();
9898
let selectivity = count as f64 / output_len as f64;
9999
let mask_buffer = mask.values();
100100

datafusion/physical-expr/src/expressions/binary.rs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -289,11 +289,11 @@ impl PhysicalExpr for BinaryExpr {
289289
ColumnarValue::Array(array) => {
290290
// When the array on the right is all true or all false, skip the scatter process
291291
let boolean_array = array.as_boolean();
292-
let true_count = boolean_array.true_count();
293-
let length = boolean_array.len();
294-
if true_count == length {
292+
if boolean_array.null_count() == 0 && !boolean_array.has_false() {
295293
return Ok(lhs);
296-
} else if true_count == 0 && boolean_array.null_count() == 0 {
294+
} else if boolean_array.null_count() == 0
295+
&& !boolean_array.has_true()
296+
{
297297
// If the right-hand array is returned at this point,the lengths will be inconsistent;
298298
// returning a scalar can avoid this issue
299299
return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(

datafusion/physical-expr/src/expressions/case.rs

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -814,17 +814,15 @@ impl CaseBody {
814814
}
815815
}?;
816816

817-
// `true_count` ignores `true` values where the validity bit is not set, so there's
818-
// no need to call `prep_null_mask_filter`.
819-
let when_true_count = when_value.true_count();
820-
821-
// If the 'when' predicate did not match any rows, continue to the next branch immediately
822-
if when_true_count == 0 {
817+
// If the 'when' predicate did not match any rows, continue to the next branch immediately.
818+
// Only counts valid slots that are true (masked-null predicate slots are ignored),
819+
// so no `prep_null_mask_filter` needed here.
820+
if !when_value.has_true() {
823821
continue;
824822
}
825823

826824
// If the 'when' predicate matched all remaining rows, there is no need to filter
827-
if when_true_count == remainder_batch.num_rows() {
825+
if when_value.null_count() == 0 && !when_value.has_false() {
828826
let then_expression = &self.when_then_expr[i].1;
829827
let then_value = then_expression.evaluate(&remainder_batch)?;
830828
result_builder.add_branch_result(&remainder_rows, then_value)?;
@@ -903,17 +901,15 @@ impl CaseBody {
903901
internal_datafusion_err!("WHEN expression did not return a BooleanArray")
904902
})?;
905903

906-
// `true_count` ignores `true` values where the validity bit is not set, so there's
907-
// no need to call `prep_null_mask_filter`.
908-
let when_true_count = when_value.true_count();
909-
910-
// If the 'when' predicate did not match any rows, continue to the next branch immediately
911-
if when_true_count == 0 {
904+
// If the 'when' predicate did not match any rows, continue to the next branch immediately.
905+
// Only counts valid slots that are true (masked-null predicate slots are ignored)
906+
// so no `prep_null_mask_filter` needed here.
907+
if !when_value.has_true() {
912908
continue;
913909
}
914910

915911
// If the 'when' predicate matched all remaining rows, there is no need to filter
916-
if when_true_count == remainder_batch.num_rows() {
912+
if when_value.null_count() == 0 && !when_value.has_false() {
917913
let then_expression = &self.when_then_expr[i].1;
918914
let then_value = then_expression.evaluate(&remainder_batch)?;
919915
result_builder.add_branch_result(&remainder_rows, then_value)?;
@@ -1165,11 +1161,10 @@ impl CaseExpr {
11651161
)
11661162
})?;
11671163

1168-
let true_count = when_value.true_count();
1169-
if true_count == when_value.len() {
1164+
if when_value.null_count() == 0 && !when_value.has_false() {
11701165
// All input rows are true, just call the 'then' expression
11711166
self.body.when_then_expr[0].1.evaluate(batch)
1172-
} else if true_count == 0 {
1167+
} else if !when_value.has_true() {
11731168
// All input rows are false/null, just call the 'else' expression
11741169
match &self.body.else_expr {
11751170
Some(else_expr) => else_expr.evaluate(batch),

datafusion/physical-expr/src/expressions/in_list.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,7 +377,7 @@ impl PhysicalExpr for InListExpr {
377377
for expr in self.list.iter().skip(1) {
378378
// Short-circuit: if every non-null row is already true,
379379
// no further list items can change the result.
380-
if found.null_count() == 0 && found.true_count() == num_rows {
380+
if found.null_count() == 0 && !found.has_false() {
381381
break;
382382
}
383383
found = or_kleene(&found, &compare_one(expr)?)?;

datafusion/physical-plan/src/joins/nested_loop_join.rs

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2105,7 +2105,7 @@ impl NestedLoopJoinStream {
21052105
return Ok(None);
21062106
}
21072107

2108-
if cur_right_bitmap.true_count() == 0 {
2108+
if !cur_right_bitmap.has_true() {
21092109
// If none of the pairs has passed the join predicate/filter
21102110
Ok(None)
21112111
} else {
@@ -2300,11 +2300,8 @@ impl NestedLoopJoinStream {
23002300
) -> Result<()> {
23012301
let left_data = self.get_left_data()?;
23022302

2303-
// number of successfully joined pairs from (l_index x cur_right_batch)
2304-
let joined_len = r_matched_bitmap.true_count();
2305-
23062303
// 1. Maybe update the left bitmap
2307-
if need_produce_result_in_final(self.join_type) && (joined_len > 0) {
2304+
if need_produce_result_in_final(self.join_type) && r_matched_bitmap.has_true() {
23082305
let mut bitmap = left_data.bitmap().lock();
23092306
bitmap.set_bit(l_index, true);
23102307
}
@@ -2696,7 +2693,7 @@ fn build_unmatched_batch(
26962693
not(&batch_bitmap)?
26972694
};
26982695

2699-
if bitmap.true_count() == 0 {
2696+
if !bitmap.has_true() {
27002697
return Ok(None);
27012698
}
27022699

datafusion/physical-plan/src/joins/sort_merge_join/filter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ pub fn filter_record_batch_by_join_type(
331331
.unwrap();
332332

333333
// All rows passed the filter — no null-joining needed
334-
if kept_corrected.true_count() == kept_corrected.len() {
334+
if !kept_corrected.has_false() {
335335
return Ok(kept_batch);
336336
}
337337

0 commit comments

Comments
 (0)