Skip to content

Commit 9d92944

Browse files
timsaucerclaude
andauthored
fix: preserve Inexact precision in Statistics (#22146)
## Which issue does this PR close? No issue reported ## Rationale for this change `Statistics::with_fetch` unconditionally returned `Precision::Exact(0)` when the input had `nr <= skip`, even when the input was `Inexact(nr)` — promoting an estimated upper bound into an exact zero. The exactness flag then misleads downstream consumers (notably `AggregateStatistics` via `Count::value_from_stats`) into trusting a derived "0" and folding the count subtree to a literal. Concrete user-visible symptom reported on TPC-H Q22: ```rust let df = ctx.sql(q22_sql).await?; df.clone().show().await?; // prints 7 rows (correct) df.count().await?; // returns 0 (wrong) ``` `EXPLAIN` for the count plan shows the outer count aggregate collapsed to `ProjectionExec([lit(0)]) -> PlaceholderRowExec`. After PR #21240 left uncorrelated scalar subqueries in the filter rather than rewriting them to joins, `FilterExec` can't use interval analysis on `ScalarSubqueryExpr`, falls back to the 20% default selectivity, and produces a small `Inexact` row estimate. A `LeftAnti` join whose estimated semi-overlap covers the outer estimate then yields `Inexact(0)`. That zero propagates through grouped aggregates whose `estimate_num_rows` returns the child stats unchanged when `value == 0`. The pre-existing `with_fetch` bug on a downstream `SortExec` finally promotes it to `Exact(0)`, which `AggregateStatistics` trusts. The root cause is the precision promotion in `with_fetch`. The PR fixes that; the surrounding plan-shape changes after #21240 just made it reachable. ## What changes are included in this PR? - `Statistics::with_fetch`: when `nr <= skip`, preserve the exactness of the input via `check_num_rows(Some(0), self.num_rows.is_exact().unwrap())` instead of always returning `Exact(0)`. - `datafusion-common`: new unit test `test_with_fetch_skip_all_rows_inexact` pinning the new behaviour. - `datafusion-physical-plan`: update the existing `test_row_number_statistics_for_global_limit` expectation that encoded the old (incorrect) promotion to expect `Inexact(0)` now. - `datafusion/sqllogictest/test_files/subquery.slt`: SLT regression test reproducing the user-visible `count(*)` symptom over a query that contains a scalar subquery, `not exists`, and a group-by on a derived column, backed by parquet sources so the data sources report Exact statistics. ## Are these changes tested? Yes: - Unit test in `datafusion-common` for the precision-preserving behaviour. - Updated unit test in `datafusion-physical-plan/limit.rs`. - SLT regression test in `subquery.slt` that fails without the fix (`count(*)` returns `0` instead of `2`) and passes with it. ## Are there any user-facing changes? Only the bug fix. No public API changes. --------- Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 89391b5 commit 9d92944

3 files changed

Lines changed: 109 additions & 4 deletions

File tree

datafusion/common/src/stats.rs

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -553,10 +553,12 @@ impl Statistics {
553553
num_rows: Precision::Inexact(nr),
554554
..
555555
} => {
556-
// Here, the inexact case gives us an upper bound on the number of rows.
556+
// Here, the inexact case gives us an estimate of the number of rows.
557557
if nr <= skip {
558-
// All input data will be skipped:
559-
Precision::Exact(0)
558+
// All input data will be skipped. Preserve the exactness of
559+
// the input estimate: if the input was inexact, the
560+
// resulting zero is also inexact.
561+
check_num_rows(Some(0), self.num_rows.is_exact().unwrap())
560562
} else if nr <= fetch_val && skip == 0 {
561563
// If the input does not reach the `fetch` globally, and `skip`
562564
// is zero (meaning the input and output are identical), return
@@ -2336,6 +2338,22 @@ mod tests {
23362338
assert_eq!(result.total_byte_size, Precision::Inexact(0));
23372339
}
23382340

2341+
#[test]
2342+
fn test_with_fetch_skip_all_rows_inexact() {
2343+
// When the input num_rows is Inexact (an upper-bound estimate), an
2344+
// `nr <= skip` outcome must remain Inexact: the estimate could be
2345+
// wrong, so we cannot promote 0 to Exact.
2346+
let original_stats = Statistics {
2347+
num_rows: Precision::Inexact(0),
2348+
total_byte_size: Precision::Inexact(0),
2349+
column_statistics: vec![col_stats_i64(10)],
2350+
};
2351+
2352+
let result = original_stats.clone().with_fetch(None, 0, 1).unwrap();
2353+
2354+
assert_eq!(result.num_rows, Precision::Inexact(0));
2355+
}
2356+
23392357
#[test]
23402358
fn test_with_fetch_no_limit() {
23412359
// Test when fetch is None and skip is 0 (no limit applied)

datafusion/physical-plan/src/limit.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -791,9 +791,12 @@ mod tests {
791791
row_number_inexact_statistics_for_global_limit(5, Some(10)).await?;
792792
assert_eq!(row_count, Precision::Inexact(10));
793793

794+
// Input was Inexact, so an `nr <= skip` outcome must remain Inexact:
795+
// the inexact estimate could be wrong, so we cannot promote 0 to
796+
// Exact.
794797
let row_count =
795798
row_number_inexact_statistics_for_global_limit(400, Some(10)).await?;
796-
assert_eq!(row_count, Precision::Exact(0));
799+
assert_eq!(row_count, Precision::Inexact(0));
797800

798801
let row_count =
799802
row_number_inexact_statistics_for_global_limit(398, Some(10)).await?;

datafusion/sqllogictest/test_files/subquery.slt

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2187,3 +2187,87 @@ ORDER BY column1;
21872187

21882188
statement ok
21892189
DROP TABLE subquery_partitioned;
2190+
2191+
# Regression test: a `count(*)` aggregate wrapping a query whose filter
2192+
# contains an uncorrelated scalar subquery used to be folded to a literal
2193+
# `0` by the `AggregateStatistics` physical-optimizer rule.
2194+
#
2195+
# The chain that triggered it requires Exact source statistics (parquet),
2196+
# a filter the interval analyzer cannot reason about (the scalar subquery)
2197+
# so a small default-selectivity Inexact upper-bound flows out, and a
2198+
# LeftAnti join whose semi-join estimate matches the outer estimate.
2199+
# `Statistics::with_fetch` then incorrectly promoted the resulting
2200+
# `Inexact(0)` to `Exact(0)`, which the count statistics fast-path
2201+
# trusted.
2202+
query I
2203+
COPY (SELECT column1 AS c_custkey,
2204+
column2 AS c_phone,
2205+
arrow_cast(column3, 'Decimal128(15, 2)') AS c_acctbal
2206+
FROM (VALUES (1::BIGINT, '13-a', 10.0),
2207+
(2::BIGINT, '17-b', 20.0),
2208+
(3::BIGINT, '18-c', 30.0),
2209+
(4::BIGINT, '23-d', 5.0),
2210+
(5::BIGINT, '29-e', 40.0)))
2211+
TO 'test_files/scratch/subquery/count_scalar_sq/customer.parquet';
2212+
----
2213+
5
2214+
2215+
query I
2216+
COPY (SELECT column1 AS o_custkey FROM (VALUES (1::BIGINT), (4::BIGINT)))
2217+
TO 'test_files/scratch/subquery/count_scalar_sq/orders.parquet';
2218+
----
2219+
2
2220+
2221+
statement ok
2222+
CREATE EXTERNAL TABLE sq_count_customer
2223+
STORED AS PARQUET
2224+
LOCATION 'test_files/scratch/subquery/count_scalar_sq/customer.parquet';
2225+
2226+
statement ok
2227+
CREATE EXTERNAL TABLE sq_count_orders
2228+
STORED AS PARQUET
2229+
LOCATION 'test_files/scratch/subquery/count_scalar_sq/orders.parquet';
2230+
2231+
# Inner query result: 2 distinct cntrycodes survive the filters/anti-join.
2232+
query TIR
2233+
select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
2234+
from (
2235+
select substring(c_phone from 1 for 2) as cntrycode, c_acctbal
2236+
from sq_count_customer
2237+
where c_acctbal > (
2238+
select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00
2239+
)
2240+
and not exists (
2241+
select * from sq_count_orders where o_custkey = c_custkey
2242+
)
2243+
) as custsale
2244+
group by cntrycode
2245+
order by cntrycode;
2246+
----
2247+
18 1 30
2248+
29 1 40
2249+
2250+
# `count(*)` over the same query must agree with the row count above.
2251+
query I
2252+
select count(*) from (
2253+
select cntrycode, count(*) as numcust, sum(c_acctbal) as totacctbal
2254+
from (
2255+
select substring(c_phone from 1 for 2) as cntrycode, c_acctbal
2256+
from sq_count_customer
2257+
where c_acctbal > (
2258+
select avg(c_acctbal) from sq_count_customer where c_acctbal > 0.00
2259+
)
2260+
and not exists (
2261+
select * from sq_count_orders where o_custkey = c_custkey
2262+
)
2263+
) as custsale
2264+
group by cntrycode
2265+
) as q;
2266+
----
2267+
2
2268+
2269+
statement ok
2270+
DROP TABLE sq_count_customer;
2271+
2272+
statement ok
2273+
DROP TABLE sq_count_orders;

0 commit comments

Comments
 (0)