@@ -20,7 +20,7 @@ use datafusion::{arrow::datatypes::DataType, logical_expr::Volatility};
2020use std:: { any:: Any , sync:: Arc } ;
2121
2222use crate :: bloom_filter:: spark_bloom_filter;
23- use crate :: bloom_filter:: spark_bloom_filter:: SparkBloomFilter ;
23+ use crate :: bloom_filter:: spark_bloom_filter:: { SparkBloomFilter , SparkBloomFilterVersion } ;
2424
2525use arrow:: array:: ArrayRef ;
2626use arrow:: array:: BinaryArray ;
@@ -37,6 +37,10 @@ pub struct BloomFilterAgg {
3737 signature : Signature ,
3838 num_items : i32 ,
3939 num_bits : i32 ,
40+ /// Output serialization version. Spark <= 4.0 only knows V1; Spark 4.1+'s
41+ /// `BloomFilter.create` defaults to V2, so the JVM serde sets this to V2 on
42+ /// 4.1+ to keep `bloom_filter_agg` byte-equivalent with Spark's aggregator.
43+ version : SparkBloomFilterVersion ,
4044}
4145
4246#[ inline]
@@ -54,6 +58,7 @@ impl BloomFilterAgg {
5458 num_items : Arc < dyn PhysicalExpr > ,
5559 num_bits : Arc < dyn PhysicalExpr > ,
5660 data_type : DataType ,
61+ version : SparkBloomFilterVersion ,
5762 ) -> Self {
5863 assert ! ( matches!( data_type, DataType :: Binary ) ) ;
5964 Self {
@@ -70,6 +75,7 @@ impl BloomFilterAgg {
7075 ) ,
7176 num_items : extract_i32_from_literal ( num_items) ,
7277 num_bits : extract_i32_from_literal ( num_bits) ,
78+ version,
7379 }
7480 }
7581}
@@ -92,10 +98,13 @@ impl AggregateUDFImpl for BloomFilterAgg {
9298 }
9399
// Diff hunk for `accumulator` in `impl AggregateUDFImpl for BloomFilterAgg`
// (`-` lines are removed by the change, `+` lines are added, others are context).
//
// Before the change: the accumulator is built with `SparkBloomFilter::from` on a
// tuple of (`optimal_num_hash_functions(num_items, num_bits)`, `num_bits`).
// After the change: it is built with `SparkBloomFilter::new`, passing
// `self.version` first, then the same hash-function count and `num_bits`, and
// finally a literal `0` (the seed, per the inline comment below).
//
// In both versions the filter is boxed as `dyn Accumulator` and returned in `Ok`;
// `_acc_args` is not read.
94100 fn accumulator ( & self , _acc_args : AccumulatorArgs ) -> Result < Box < dyn Accumulator > > {
95- Ok ( Box :: new ( SparkBloomFilter :: from ( (
101+ Ok ( Box :: new ( SparkBloomFilter :: new (
102+ self . version ,
96103 spark_bloom_filter:: optimal_num_hash_functions ( self . num_items , self . num_bits ) ,
97104 self . num_bits ,
98- ) ) ) )
105+ // Spark's BloomFilterAggregate always uses BloomFilterImplV2.DEFAULT_SEED (= 0).
106+ 0 ,
107+ ) ) )
99108 }
100109
101110 fn state_fields ( & self , _args : StateFieldsArgs ) -> Result < Vec < FieldRef > > {
0 commit comments