Merge pull request #407 from posit-dev/fix-avoid-collect-counting-slowdown

rich-iannone · web-flow · commit e45f0c99b8da · 2026-06-30T17:00:15.000-04:00
fix: avoid inefficient collect/count at interrogation time
diff --git a/pointblank/_utils.py b/pointblank/_utils.py
@@ -395,6 +395,61 @@ def _count_null_values_in_column(
     return int(result.item())
 
 
+def _count_validation_units(
+    tbl: IntoFrame,
+    column: str,
+) -> tuple[int, int, int, int]:
+    """
+    Compute the row count and pass/fail/null counts for a results table in a single pass.
+
+    Given a results table with a boolean `column` (typically ``pb_is_good_``), this returns
+    the total number of rows, the number of `True` values (passing test units), the number of
+    `False` values (failing test units), and the number of Null values (counted separately).
+
+    Computing all four quantities in one aggregation is important for LazyFrames: otherwise each
+    separate count would trigger its own `collect()`, re-executing the entire (potentially
+    expensive) lazy plan multiple times.
+
+    Parameters
+    ----------
+    tbl
+        A Narwhals-compatible DataFrame or table-like object.
+    column
+        The boolean column to summarize.
+
+    Returns
+    -------
+    tuple[int, int, int, int]
+        A tuple of ``(n, n_passed, n_failed, n_null)``.
+    """
+
+    # Convert the DataFrame to a Narwhals DataFrame (no detrimental effect if
+    # already a Narwhals DataFrame)
+    tbl_nw = nw.from_native(tbl)
+
+    # Build a single aggregation that computes all counts at once. Booleans are cast to Int64
+    # before summing: PySpark can't sum booleans, and an Int32 sum can overflow past 2**31 - 1
+    # on backends that don't widen it (e.g. Polars). Sums skip Nulls, so pass/fail exclude them.
+    result = tbl_nw.select(
+        nw.len().alias("n"),
+        nw.col(column).cast(nw.Int64).sum().alias("n_passed"),
+        (~nw.col(column)).cast(nw.Int64).sum().alias("n_failed"),
+        nw.col(column).is_null().cast(nw.Int64).sum().alias("n_null"),
+    )
+
+    if is_narwhals_lazyframe(result):
+        result = result.collect()
+
+    row = result.rows(named=True)[0]
+    # `or 0` guards against a sum that comes back as None rather than 0
+    n = int(row["n"])
+    n_passed = int(row["n_passed"] or 0)
+    n_failed = int(row["n_failed"] or 0)
+    n_null = int(row["n_null"] or 0)
+
+    return n, n_passed, n_failed, n_null
+
+
 def _is_numeric_dtype(dtype: str) -> bool:
     """
     Check if a given data type string represents a numeric type.
diff --git a/pointblank/validate.py b/pointblank/validate.py
@@ -96,8 +96,7 @@
     _check_invalid_fields,
     _column_test_prep,
     _copy_dataframe,
-    _count_null_values_in_column,
-    _count_true_values_in_column,
+    _count_validation_units,
     _derive_bounds,
     _format_to_integer_value,
     _get_fn_name,
@@ -15437,22 +15436,23 @@ def interrogate(
             # called `pb_is_good_` that contains boolean values; we can then use this table to
             # determine the number of test units that passed and failed
             if results_tbl is not None:
-                # Count the number of passing and failing test units
-                validation.n_passed = _count_true_values_in_column(
+                # Count passing/failing test units and the total row count in a single pass.
+                # Doing this together avoids re-executing the (possibly lazy) results-table plan
+                # multiple times, which would otherwise scan the data once per count.
+                n_units, n_passed, n_failed, n_null = _count_validation_units(
                     tbl=results_tbl, column="pb_is_good_"
                 )
-                validation.n_failed = _count_true_values_in_column(
-                    tbl=results_tbl, column="pb_is_good_", inverse=True
-                )
+
+                validation.n_passed = n_passed
+                validation.n_failed = n_failed
 
                 # Solely for the col_vals_in_set assertion type, any Null values in the
                 # `pb_is_good_` column are counted as failing test units
                 if assertion_type == "col_vals_in_set":
-                    null_count = _count_null_values_in_column(tbl=results_tbl, column="pb_is_good_")
-                    validation.n_failed += null_count
+                    validation.n_failed += n_null
 
                 # For column-value validations, the number of test units is the number of rows
-                validation.n = get_row_count(data=results_tbl)
+                validation.n = n_units
 
                 # Set the `all_passed` attribute based on whether there are any failing test units
                 validation.all_passed = validation.n_failed == 0
diff --git a/tests/test__utils.py b/tests/test__utils.py
@@ -24,6 +24,7 @@
     _copy_dataframe,
     _count_null_values_in_column,
     _count_true_values_in_column,
+    _count_validation_units,
     _derive_bounds,
     _derive_single_bound,
     _format_to_float_value,
@@ -364,6 +365,31 @@ def test_count_null_values_in_column(tbl_type):
     assert _count_null_values_in_column(tbl=data, column="c") == 2
 
 
+@pytest.mark.parametrize("tbl_type", ["polars", "duckdb"])
+def test_count_validation_units(tbl_type):
+    """Check the single-pass counts on `small_table` for each parametrized backend."""
+
+    small_table = load_dataset(dataset="small_table", tbl_type=tbl_type)
+    counts = _count_validation_units(tbl=small_table, column="e")
+
+    # Column `e` has 8 True and 5 False values (13 rows total, no nulls), which fixes
+    # every element of the `(n, n_passed, n_failed, n_null)` tuple
+    expected = (13, 8, 5, 0)
+    assert counts == expected
+
+
+def test_count_validation_units_with_nulls():
+    """Nulls are excluded from pass/fail counts and reported separately (eager and lazy)."""
+    import polars as pl
+
+    eager = pl.DataFrame({"pb_is_good_": [True, False, True, None, None]})
+    expected = (5, 2, 1, 2)  # (n, n_passed, n_failed, n_null)
+
+    # The same data is checked as a DataFrame and as a LazyFrame; both must yield equal counts
+    for frame in (eager, eager.lazy()):
+        assert _count_validation_units(tbl=frame, column="pb_is_good_") == expected
+
+
 def test_format_to_integer_value():
     assert _format_to_integer_value(0) == "0"
     assert _format_to_integer_value(0.3) == "0"