1515// specific language governing permissions and limitations
1616// under the License.
1717
18+ use std:: sync:: atomic:: { AtomicI64 , Ordering } ;
19+
1820use crate :: bit_iterator:: { BitIndexIterator , BitIterator , BitSliceIterator } ;
1921use crate :: buffer:: BooleanBuffer ;
2022use crate :: { Buffer , MutableBuffer } ;
2123
/// Sentinel stored in a [`NullCount::Lazy`] cache while the null count has not been computed.
///
/// Real null counts are derived from a `usize`, so a negative value can never
/// collide with a cached result.
const UNINITIALIZED_NULL_COUNT: i64 = -1;
25+
/// Null-count bookkeeping for a validity bitmap.
///
/// The count is either known up front ([`NullCount::Eager`]) or computed on
/// first use and cached in an atomic ([`NullCount::Lazy`]).
#[derive(Debug)]
pub enum NullCount {
    /// The null count is already known.
    Eager(usize),
    /// The null count is cached here once computed; `-1` means "not computed yet".
    Lazy(AtomicI64),
}

impl Clone for NullCount {
    /// Copies the current state.
    ///
    /// A `Lazy` clone gets its own atomic seeded with a snapshot of the cached
    /// value, so later updates to either side are not shared.
    fn clone(&self) -> Self {
        match self {
            Self::Eager(count) => Self::Eager(*count),
            // `AtomicI64` is not `Clone`; only the integer itself is carried over,
            // so a relaxed load is sufficient.
            Self::Lazy(cached) => Self::Lazy(AtomicI64::new(cached.load(Ordering::Relaxed))),
        }
    }
}
43+
2244/// A [`BooleanBuffer`] used to encode validity for arrow arrays
2345///
2446/// As per the [Arrow specification], array validity is encoded in a packed bitmask with a
2547/// `true` value indicating the corresponding slot is not null, and `false` indicating
2648/// that it is null.
2749///
2850/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
29- #[ derive( Debug , Clone , Eq , PartialEq ) ]
51+ #[ derive( Debug , Clone ) ]
3052pub struct NullBuffer {
3153 buffer : BooleanBuffer ,
32- null_count : usize ,
54+ null_count : NullCount ,
55+ }
56+
57+ impl PartialEq for NullBuffer {
58+ fn eq ( & self , other : & Self ) -> bool {
59+ self . buffer == other. buffer
60+ }
3361}
3462
63+ impl Eq for NullBuffer { }
64+
3565impl NullBuffer {
3666 /// Create a new [`NullBuffer`] computing the null count
3767 pub fn new ( buffer : BooleanBuffer ) -> Self {
38- let null_count = buffer. len ( ) - buffer. count_set_bits ( ) ;
68+ // Expensive to calc the null count, we should lazily compute it when
69+ let null_count = NullCount :: Lazy ( AtomicI64 :: new ( UNINITIALIZED_NULL_COUNT ) ) ;
3970 Self { buffer, null_count }
4071 }
4172
4273 /// Create a new [`NullBuffer`] of length `len` where all values are null
4374 pub fn new_null ( len : usize ) -> Self {
4475 Self {
4576 buffer : BooleanBuffer :: new_unset ( len) ,
46- null_count : len,
77+ null_count : NullCount :: Eager ( len) ,
4778 }
4879 }
4980
@@ -53,7 +84,7 @@ impl NullBuffer {
5384 pub fn new_valid ( len : usize ) -> Self {
5485 Self {
5586 buffer : BooleanBuffer :: new_set ( len) ,
56- null_count : 0 ,
87+ null_count : NullCount :: Eager ( 0 ) ,
5788 }
5889 }
5990
@@ -63,7 +94,10 @@ impl NullBuffer {
6394 ///
6495 /// `buffer` must contain `null_count` `0` bits
6596 pub unsafe fn new_unchecked ( buffer : BooleanBuffer , null_count : usize ) -> Self {
66- Self { buffer, null_count }
97+ Self {
98+ buffer,
99+ null_count : NullCount :: Eager ( null_count) ,
100+ }
67101 }
68102
69103 /// Computes the union of the nulls in two optional [`NullBuffer`]
@@ -81,9 +115,12 @@ impl NullBuffer {
81115
82116 /// Returns true if all nulls in `other` also exist in self
83117 pub fn contains ( & self , other : & NullBuffer ) -> bool {
84- if other. null_count == 0 {
85- return true ;
118+ if let NullCount :: Eager ( v) = & other. null_count {
119+ if * v == 0 {
120+ return true ;
121+ }
86122 }
123+
87124 let lhs = self . inner ( ) . bit_chunks ( ) . iter_padded ( ) ;
88125 let rhs = other. inner ( ) . bit_chunks ( ) . iter_padded ( ) ;
89126 lhs. zip ( rhs) . all ( |( l, r) | ( l & !r) == 0 )
@@ -106,9 +143,17 @@ impl NullBuffer {
106143 crate :: bit_util:: set_bit ( buffer. as_mut ( ) , i * count + j)
107144 }
108145 }
146+
147+ let null_count = if let NullCount :: Eager ( v) = & self . null_count {
148+ NullCount :: Eager ( v * count)
149+ } else {
150+ // TODO: consider loading the atomic and, if a count was already computed, reusing it (multiplied by `count`) instead of resetting the cache
151+ NullCount :: Lazy ( AtomicI64 :: new ( UNINITIALIZED_NULL_COUNT ) )
152+ } ;
153+
109154 Self {
110155 buffer : BooleanBuffer :: new ( buffer. into ( ) , 0 , capacity) ,
111- null_count : self . null_count * count ,
156+ null_count,
112157 }
113158 }
114159
@@ -131,9 +176,20 @@ impl NullBuffer {
131176 }
132177
133178 /// Returns the null count for this [`NullBuffer`]
134- #[ inline]
135179 pub fn null_count ( & self ) -> usize {
136- self . null_count
180+ match & self . null_count {
181+ NullCount :: Eager ( v) => * v,
182+ NullCount :: Lazy ( v) => {
183+ let cached_null_count = v. load ( Ordering :: Acquire ) ;
184+ if cached_null_count != UNINITIALIZED_NULL_COUNT {
185+ return cached_null_count as usize ;
186+ }
187+
188+ let computed_null_count = self . buffer . len ( ) - self . buffer . count_set_bits ( ) ;
189+ v. store ( computed_null_count as i64 , Ordering :: Release ) ;
190+ computed_null_count
191+ }
192+ }
137193 }
138194
139195 /// Returns `true` if the value at `idx` is not null
@@ -189,8 +245,10 @@ impl NullBuffer {
189245 & self ,
190246 f : F ,
191247 ) -> Result < ( ) , E > {
192- if self . null_count == self . len ( ) {
193- return Ok ( ( ) ) ;
248+ if let NullCount :: Eager ( v) = & self . null_count {
249+ if * v == self . len ( ) {
250+ return Ok ( ( ) ) ;
251+ }
194252 }
195253 self . valid_indices ( ) . try_for_each ( f)
196254 }
0 commit comments