@@ -842,6 +842,36 @@ config_namespace! {
842842 /// will be collected into a single partition
843843 pub hash_join_single_partition_threshold_rows: usize , default = 1024 * 128
844844
845+ /// Maximum size in bytes for the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
846+ /// Build sides larger than this will use hash table lookups instead.
847+ /// Set to 0 to always use hash table lookups.
848+ ///
849+ /// InList pushdown can be more efficient for small build sides because it can result in better
850+ /// statistics pruning as well as use any bloom filters present on the scan side.
851+ /// InList expressions are also more transparent and easier to serialize over the network in distributed uses of DataFusion.
852+ /// On the other hand InList pushdown requires making a copy of the data and thus adds some overhead to the build side and uses more memory.
853+ ///
854+ /// This setting is per-partition, so we may end up using `hash_join_inlist_pushdown_max_size` * `target_partitions` memory.
855+ ///
856+ /// The default is 128kB per partition.
857+ /// This should allow point lookup joins (e.g. joining on a unique primary key) to use InList pushdown in most cases
858+ /// but avoids excessive memory usage or overhead for larger joins.
859+ pub hash_join_inlist_pushdown_max_size: usize , default = 128 * 1024
860+
861+ /// Maximum number of distinct values (rows) in the build side of a hash join to be pushed down as an InList expression for dynamic filtering.
862+ /// Build sides with more rows than this will use hash table lookups instead.
863+ /// Set to 0 to always use hash table lookups.
864+ ///
865+ /// This provides an additional limit beyond `hash_join_inlist_pushdown_max_size` to prevent
866+ /// very large IN lists that might not provide much benefit over hash table lookups.
867+ ///
868+ /// This uses the deduplicated row count once the build side has been evaluated.
869+ ///
870+ /// The default is 150 values per partition.
871+ /// This is inspired by Trino's `max-filter-keys-per-column` setting.
872+ /// See: <https://trino.io/docs/current/admin/dynamic-filtering.html#dynamic-filter-collection-thresholds>
873+ pub hash_join_inlist_pushdown_max_distinct_values: usize , default = 150
874+
845875 /// The default filter selectivity used by Filter Statistics
846876 /// when an exact selectivity cannot be determined. Valid values are
847877 /// between 0 (no selectivity) and 100 (all rows are selected).
0 commit comments