@@ -303,14 +303,11 @@ pub(crate) struct SharedBuildAccumulator {
303303 on_right : Vec < PhysicalExprRef > ,
304304 /// Schema of the probe (right) side for evaluating filter expressions
305305 probe_schema : Arc < Schema > ,
306- /// Cap on the cross-partition merged-InList distinct count. Reuses
307- /// `optimizer.hash_join_inlist_pushdown_max_distinct_values` (the same
308- /// option that gates per-partition InList pushdown). When the union of
309- /// every reported partition's deduplicated InList values stays at or
310- /// below this many distinct entries we collapse them into a single
311- /// `IN (SET)` predicate that can participate in parquet stats /
312- /// bloom-filter pruning at the scan; otherwise we use
313- /// `multi_hash_lookup` over every partition's hash table instead.
306+ /// Maximum distinct entries the cross-partition merged InList may
307+ /// contain before we fall back to `multi_hash_lookup`. Sourced from
308+ /// `optimizer.hash_join_inlist_pushdown_max_distinct_values` so the
309+ /// same threshold caps both per-partition and cross-partition InList
310+ /// pushdown.
314311 inlist_max_distinct_values : usize ,
315312}
316313
@@ -734,31 +731,26 @@ impl SharedBuildAccumulator {
734731 } ) )
735732 }
736733
737- /// Build the dynamic filter for `PartitionMode::Partitioned`. The filter
738- /// is decoupled from the repartition strategy: regardless of whether
739- /// individual partitions chose Map or InList for their pushdown, we
740- /// always emit `(global_minmax AND ([merged_in_list AND] multi_hash_lookup))`.
741- /// This drops the legacy `CASE hash_repartition % N WHEN p THEN … END`
742- /// routing expression entirely.
734+ /// Build the dynamic filter for `PartitionMode::Partitioned`. Emits
735+ /// `global_minmax AND (merged_in_list if built, else multi_hash_lookup)` —
736+ /// independent of how the build side was repartitioned.
743737 ///
744738 /// * `global_minmax` — envelope of every partition's per-column min/max.
745739 /// Cheap short-circuit and the only piece visible to scan-level
746740 /// `pruning_predicate` extraction.
747- /// * `merged_in_list` — concatenated build keys when every reported
748- /// partition produced an `InList` array and the cross-partition
749- /// *deduplicated* set has at most `inlist_max_distinct_values` distinct
750- /// entries (the same option that gates the per-partition InList path,
751- /// `optimizer.hash_join_inlist_pushdown_max_distinct_values`). Worth
752- /// carrying because a small `IN (SET)` participates in parquet stats /
753- /// bloom-filter pruning, which `multi_hash_lookup` cannot .
754- /// * `multi_hash_lookup` — runtime hash-table probe across every
755- /// partition's `Map`, hashing the join keys once .
741+ /// * `merged_in_list` — concatenated, deduplicated build keys when every
742+ /// reported partition contributed an `InList` array and the
743+ /// cross-partition union has at most
744+ /// `optimizer.hash_join_inlist_pushdown_max_distinct_values` distinct
745+ /// entries. A small `IN (SET)` participates in parquet stats /
746+ /// bloom-filter pruning, which `multi_hash_lookup` does not. When
747+ /// present it fully replaces the lookup.
748+ /// * `multi_hash_lookup` — hashes the join keys once and ORs
749+ /// `contain_hashes()` across every partition's hash table.
756750 ///
757- /// The `has_canceled_unknown` case is the only one that can't safely use
758- /// this shape (we'd be missing maps for the canceled partitions). We
759- /// could keep a CASE just for that case, but the query is in the middle
760- /// of being torn down — emit `lit(true)` and let the join do whatever
761- /// filtering it can on its own.
751+ /// In the `has_canceled_unknown` case the whole filter short-circuits to
752+ /// `lit(true)`: the canceled partitions' maps are missing, so the lookup
753+ /// cannot cover them, and the query is being torn down anyway.
762754 fn build_partitioned_filter (
763755 & self ,
764756 real_partitions : & [ & PartitionData ] ,
@@ -777,12 +769,10 @@ impl SharedBuildAccumulator {
777769 . as_ref ( )
778770 . and_then ( |b| create_bounds_predicate ( & self . on_right , b) ) ;
779771
780- // Try to build a merged InList. Only fires when *every* reported
781- // partition contributed an InList array AND the cross-partition
782- // deduplicated union has at most `inlist_max_distinct_values`
783- // entries. The merged InList already covers the union of every
784- // partition's build-side keys, so when present it fully replaces
785- // `multi_hash_lookup` — no need to AND a redundant probe on top.
772+ // The merged InList covers the union of every partition's
773+ // build-side keys, so when it fires it stands alone — there is no
774+ // need to also AND a `multi_hash_lookup` (which would just probe
775+ // the same data via a different structure).
786776 let membership_expr =
787777 if let Some ( merged) = self . try_build_merged_inlist ( real_partitions) ? {
788778 Some ( merged)
@@ -815,14 +805,13 @@ impl SharedBuildAccumulator {
815805 /// If every reported partition contributed an InList array, concatenate
816806 /// them, deduplicate by scalar value, and gate on the
817807 /// `inlist_max_distinct_values` cap. Returns the merged
818- /// `(struct(...))? IN (SET) ([…])` predicate built over the *deduplicated*
819- /// keys when the cap is satisfied; `None` otherwise.
808+ /// `(struct(...))? IN (SET) ([…])` predicate built over the
809+ /// deduplicated keys when the cap is satisfied; `None` otherwise.
820810 ///
821- /// Per-partition arrays carry duplicates (the build side never dedups
822- /// before shipping), so each partition's `arr.len()` is an upper bound on
823- /// its distinct count. We start with a cheap pre-check (sum of lengths ≤
824- /// some fast-reject limit) before the dedup walk to keep the cost
825- /// proportional to actual partition sizes.
811+ /// Per-partition arrays carry duplicates — each partition ships its raw
812+ /// build-side join keys, so dedup happens here. The dedup walk aborts as
813+ /// soon as the distinct count exceeds the cap, so an over-cap input costs
814+ /// only the rows needed to find `cap + 1` distinct values, not a full scan.
826815 fn try_build_merged_inlist (
827816 & self ,
828817 real_partitions : & [ & PartitionData ] ,
0 commit comments