@@ -302,14 +302,11 @@ pub(crate) struct SharedBuildAccumulator {
302302 on_right : Vec < PhysicalExprRef > ,
303303 /// Schema of the probe (right) side for evaluating filter expressions
304304 probe_schema : Arc < Schema > ,
305- /// Cap on the cross-partition merged-InList distinct count. Reuses
306- /// `optimizer.hash_join_inlist_pushdown_max_distinct_values` (the same
307- /// option that gates per-partition InList pushdown). When the union of
308- /// every reported partition's deduplicated InList values stays at or
309- /// below this many distinct entries we collapse them into a single
310- /// `IN (SET)` predicate that can participate in parquet stats /
311- /// bloom-filter pruning at the scan; otherwise we use
312- /// `multi_hash_lookup` over every partition's hash table instead.
305+ /// Maximum distinct entries the cross-partition merged InList may
306+ /// contain before we fall back to `multi_hash_lookup`. Sourced from
307+ /// `optimizer.hash_join_inlist_pushdown_max_distinct_values` so the
308+ /// same threshold caps both per-partition and cross-partition InList
309+ /// pushdown.
313310 inlist_max_distinct_values : usize ,
314311}
315312
@@ -733,31 +730,26 @@ impl SharedBuildAccumulator {
733730 } ) )
734731 }
735732
736- /// Build the dynamic filter for `PartitionMode::Partitioned`. The filter
737- /// is decoupled from the repartition strategy: regardless of whether
738- /// individual partitions chose Map or InList for their pushdown, we
739- /// always emit `(global_minmax AND ([merged_in_list AND] multi_hash_lookup))`.
740- /// This drops the legacy `CASE hash_repartition % N WHEN p THEN … END`
741- /// routing expression entirely.
733+ /// Build the dynamic filter for `PartitionMode::Partitioned`. Independent
734+ /// of how the build side was repartitioned, emits `global_minmax AND m`,
735+ /// where `m` is `merged_in_list` if it can be built, else `multi_hash_lookup`.
742736 ///
743737 /// * `global_minmax` — envelope of every partition's per-column min/max.
744738 /// Cheap short-circuit and the only piece visible to scan-level
745739 /// `pruning_predicate` extraction.
746- /// * `merged_in_list` — concatenated build keys when every reported
747- /// partition produced an `InList` array and the cross-partition
748- /// *deduplicated* set has at most `inlist_max_distinct_values` distinct
749- /// entries (the same option that gates the per-partition InList path,
750- /// `optimizer.hash_join_inlist_pushdown_max_distinct_values`). Worth
751- /// carrying because a small `IN (SET)` participates in parquet stats /
752- /// bloom-filter pruning, which `multi_hash_lookup` cannot .
753- /// * `multi_hash_lookup` — runtime hash-table probe across every
754- /// partition's `Map`, hashing the join keys once .
740+ /// * `merged_in_list` — concatenated, deduplicated build keys when every
741+ /// reported partition contributed an `InList` array and the
742+ /// cross-partition union fits under
743+ /// `optimizer.hash_join_inlist_pushdown_max_distinct_values`. A small
744+ /// `IN (SET)` participates in parquet stats / bloom-filter pruning,
745+ /// which `multi_hash_lookup` does not. When present it fully replaces
746+ /// `multi_hash_lookup`.
747+ /// * `multi_hash_lookup` — hashes the join keys once and ORs
748+ /// `contain_hashes()` across every partition's hash table .
755749 ///
756- /// The `has_canceled_unknown` case is the only one that can't safely use
757- /// this shape (we'd be missing maps for the canceled partitions). We
758- /// could keep a CASE just for that case, but the query is in the middle
759- /// of being torn down — emit `lit(true)` and let the join do whatever
760- /// filtering it can on its own.
750+ /// In the `has_canceled_unknown` case the whole filter short-circuits to
751+ /// `lit(true)`: the canceled partitions' maps are missing, so the lookup
752+ /// cannot cover them, and the query is being torn down anyway.
761753 fn build_partitioned_filter (
762754 & self ,
763755 real_partitions : & [ & PartitionData ] ,
@@ -776,12 +768,10 @@ impl SharedBuildAccumulator {
776768 . as_ref ( )
777769 . and_then ( |b| create_bounds_predicate ( & self . on_right , b) ) ;
778770
779- // Try to build a merged InList. Only fires when *every* reported
780- // partition contributed an InList array AND the cross-partition
781- // deduplicated union has at most `inlist_max_distinct_values`
782- // entries. The merged InList already covers the union of every
783- // partition's build-side keys, so when present it fully replaces
784- // `multi_hash_lookup` — no need to AND a redundant probe on top.
771+ // The merged InList covers the union of every partition's
772+ // build-side keys, so when it fires it stands alone — there is no
773+ // need to also AND a `multi_hash_lookup` (which would just probe
774+ // the same data via a different structure).
785775 let membership_expr =
786776 if let Some ( merged) = self . try_build_merged_inlist ( real_partitions) ? {
787777 Some ( merged)
@@ -814,14 +804,13 @@ impl SharedBuildAccumulator {
814804 /// If every reported partition contributed an InList array, concatenate
815805 /// them, deduplicate by scalar value, and gate on the
816806 /// `inlist_max_distinct_values` cap. Returns the merged
817- /// `(struct(...))? IN (SET) ([…])` predicate built over the *deduplicated*
818- /// keys when the cap is satisfied; `None` otherwise.
807+ /// `(struct(...))? IN (SET) ([…])` predicate built over the
808+ /// deduplicated keys when the cap is satisfied; `None` otherwise.
819809 ///
820- /// Per-partition arrays carry duplicates (the build side never dedups
821- /// before shipping), so each partition's `arr.len()` is an upper bound on
822- /// its distinct count. We start with a cheap pre-check (sum of lengths ≤
823- /// some fast-reject limit) before the dedup walk to keep the cost
824- /// proportional to actual partition sizes.
810+ /// Per-partition arrays carry duplicates — each partition ships its raw
811+ /// build-side join keys, and deduplication happens here. The dedup walk
812+ /// aborts as soon as the distinct count exceeds the cap, so an over-cap
813+ /// input costs only the rows scanned up to that point, not the full input.
825814 fn try_build_merged_inlist (
826815 & self ,
827816 real_partitions : & [ & PartitionData ] ,
0 commit comments