1010
1111use serde:: ser:: { Serialize , Serializer } ;
1212use std:: borrow:: Borrow ;
13- use std:: collections:: HashMap ;
13+ use std:: collections:: HashSet ;
1414use std:: hash:: Hash ;
1515
1616/// A Vec-backed map that provides HashMap-like lookup by key.
@@ -28,28 +28,56 @@ use std::hash::Hash;
/// speak. [VecMap::len], [VecMap::iter], and others just delegate to the underlying `Vec`, and
/// won't deduplicate.
3030///
31- /// Explicit deduplication is currently being done automatically and on-the-fly during
32- /// serialization. If needed, in the future, we might trigger deduplication on other events, for
33- /// example at insertion if the size is bigger than a threshold.
34- #[ derive( Clone , Debug , PartialEq , Default ) ]
35- pub struct VecMap < K , V > ( Vec < ( K , V ) > ) ;
31+ /// Explicit deduplication is currently being done on-demand by [VecMap::dedup]. An internal flag is
32+ /// used to avoid undue deduplication (see [VecMap::dedup]). `VecMap` is automatically deduped
33+ /// before serialization.
34+ ///
35+ /// In the future, we could trigger deduplication on other events, for example at insertion if the
36+ /// size is bigger than a threshold (and we haven't deduped for `x` operations).
#[derive(Clone, Debug)]
pub struct VecMap<K, V> {
    /// The entries, in insertion order. May contain several entries with the same key.
    data: Vec<(K, V)>,
    /// Deduped is a flag that is set after entry deduplication. It is dirtied (set to `false`)
    /// when a modification that could introduce duplicates is performed (`deduped == false`
    /// doesn't imply there are actual duplicates, just that there might be). This is useful to
    /// avoid performing deduplication several times in the export pipeline.
    deduped: bool,
}

/// Equality only looks at the stored entries.
///
/// The `deduped` flag is a cache of "we know there are no duplicates", not part of the map's
/// content: two maps holding the exact same entries must compare equal whether or not one of them
/// went through deduplication.
impl<K: PartialEq, V: PartialEq> PartialEq for VecMap<K, V> {
    fn eq(&self, other: &Self) -> bool {
        self.data == other.data
    }
}
46+
47+ impl < K , V > Default for VecMap < K , V > {
48+ fn default ( ) -> Self {
49+ Self {
50+ data : Default :: default ( ) ,
51+ deduped : false ,
52+ }
53+ }
54+ }
3655
3756impl < K , V > VecMap < K , V > {
3857 #[ must_use]
3958 #[ inline]
4059 pub fn new ( ) -> Self {
41- VecMap ( Vec :: new ( ) )
60+ Self :: default ( )
61+ }
62+
63+ /// Dirty the `dedup` flag after a mutation that could introduce duplicates.
64+ fn dirty ( & mut self ) {
65+ self . deduped = false ;
4266 }
4367
4468 #[ must_use]
4569 #[ inline]
4670 pub fn with_capacity ( capacity : usize ) -> Self {
47- VecMap ( Vec :: with_capacity ( capacity) )
71+ VecMap {
72+ data : Vec :: with_capacity ( capacity) ,
73+ deduped : false ,
74+ }
4875 }
4976
5077 #[ inline]
5178 pub fn insert ( & mut self , key : K , value : V ) {
52- self . 0 . push ( ( key, value) ) ;
79+ self . data . push ( ( key, value) ) ;
80+ self . dirty ( ) ;
5381 }
5482
5583 #[ inline]
@@ -58,7 +86,7 @@ impl<K, V> VecMap<K, V> {
5886 K : Borrow < Q > ,
5987 Q : ?Sized + PartialEq ,
6088 {
61- self . 0
89+ self . data
6290 . iter ( )
6391 . rev ( )
6492 . find ( |( k, _) | k. borrow ( ) == key)
@@ -71,7 +99,7 @@ impl<K, V> VecMap<K, V> {
7199 K : Borrow < Q > ,
72100 Q : ?Sized + PartialEq ,
73101 {
74- self . 0
102+ self . data
75103 . iter_mut ( )
76104 . rev ( )
77105 . find ( |( k, _) | ( * k) . borrow ( ) == key)
@@ -84,7 +112,7 @@ impl<K, V> VecMap<K, V> {
84112 K : Borrow < Q > ,
85113 Q : ?Sized + PartialEq ,
86114 {
87- self . 0 . iter ( ) . any ( |( k, _) | k. borrow ( ) == key)
115+ self . data . iter ( ) . any ( |( k, _) | k. borrow ( ) == key)
88116 }
89117
90118 /// Remove all entries matching this key from the map. This method uses [Vec::retain], and is
@@ -97,42 +125,83 @@ impl<K, V> VecMap<K, V> {
97125 K : Borrow < Q > ,
98126 Q : ?Sized + PartialEq ,
99127 {
100- self . 0 . retain ( |( k, _) | k. borrow ( ) != key) ;
128+ self . data . retain ( |( k, _) | k. borrow ( ) != key) ;
101129 }
102130
103131 /// Iterate over the element, including duplicate entries.
104132 #[ inline]
105133 pub fn iter ( & self ) -> std:: slice:: Iter < ' _ , ( K , V ) > {
106- self . 0 . iter ( )
134+ self . data . iter ( )
107135 }
108136
109137 /// Iterate mutably over the elements, including duplicate entries.
110138 #[ inline]
111139 pub fn iter_mut ( & mut self ) -> std:: slice:: IterMut < ' _ , ( K , V ) > {
112- self . 0 . iter_mut ( )
140+ self . dirty ( ) ;
141+ self . data . iter_mut ( )
113142 }
114143
115144 /// Return the length of the underlying vector, thus including duplicate entries.
116145 #[ inline]
117146 pub fn len ( & self ) -> usize {
118- self . 0 . len ( )
147+ self . data . len ( )
119148 }
120149
121150 #[ inline]
122151 pub fn is_empty ( & self ) -> bool {
123- self . 0 . is_empty ( )
152+ self . data . is_empty ( )
153+ }
154+
155+ /// Return `true` if the map hasn't been extended since the last call to [Self::dedup],
156+ /// guaranteeing that the underlying vector doesn't have any duplicate key.
157+ ///
158+ /// If `is_deduped` returns `false`, the map may have duplicate keys.
159+ #[ inline]
160+ pub fn is_deduped ( & self ) -> bool {
161+ self . deduped
162+ }
163+ }
164+
165+ impl < K : Hash + Eq + Clone , V > VecMap < K , V > {
166+ /// Remove entries with a duplicate key, only keeping the last one. After this, a flag is set
167+ /// internally, such that as long as the map isn't extended or mutably iterated, the next
168+ /// [Self::dedup] doesn't perform the work again.
169+ pub fn dedup ( & mut self ) {
170+ if self . deduped {
171+ return ;
172+ }
173+
174+ // Since we're going to shuffle elements around, it's not easy to keep references to keys in
175+ // the deduping set. The simplest is to clone them.
176+ let mut seen = HashSet :: with_capacity ( self . len ( ) ) ;
177+
178+ self . data . reverse ( ) ;
179+ self . data . retain ( |( k, _) | seen. insert ( k. clone ( ) ) ) ;
180+ self . deduped = true ;
124181 }
125182}
126183
127184impl < K , V > From < Vec < ( K , V ) > > for VecMap < K , V > {
128- fn from ( vec : Vec < ( K , V ) > ) -> Self {
129- VecMap ( vec)
185+ fn from ( data : Vec < ( K , V ) > ) -> Self {
186+ Self {
187+ data,
188+ deduped : false ,
189+ }
190+ }
191+ }
192+
193+ impl < K , V > From < VecMap < K , V > > for Vec < ( K , V ) > {
194+ fn from ( value : VecMap < K , V > ) -> Self {
195+ value. data
130196 }
131197}
132198
133199impl < K , V > FromIterator < ( K , V ) > for VecMap < K , V > {
134200 fn from_iter < I : IntoIterator < Item = ( K , V ) > > ( iter : I ) -> Self {
135- VecMap ( iter. into_iter ( ) . collect ( ) )
201+ Self {
202+ data : iter. into_iter ( ) . collect ( ) ,
203+ deduped : false ,
204+ }
136205 }
137206}
138207
@@ -141,7 +210,7 @@ impl<K, V> IntoIterator for VecMap<K, V> {
141210 type IntoIter = std:: vec:: IntoIter < ( K , V ) > ;
142211
143212 fn into_iter ( self ) -> Self :: IntoIter {
144- self . 0 . into_iter ( )
213+ self . data . into_iter ( )
145214 }
146215}
147216
@@ -150,7 +219,7 @@ impl<'a, K, V> IntoIterator for &'a VecMap<K, V> {
150219 type IntoIter = std:: slice:: Iter < ' a , ( K , V ) > ;
151220
152221 fn into_iter ( self ) -> Self :: IntoIter {
153- self . 0 . iter ( )
222+ self . data . iter ( )
154223 }
155224}
156225
@@ -159,28 +228,44 @@ impl<'a, K, V> IntoIterator for &'a mut VecMap<K, V> {
159228 type IntoIter = std:: slice:: IterMut < ' a , ( K , V ) > ;
160229
161230 fn into_iter ( self ) -> Self :: IntoIter {
162- self . 0 . iter_mut ( )
231+ self . data . iter_mut ( )
163232 }
164233}
165234
166235impl < K , V > Extend < ( K , V ) > for VecMap < K , V > {
167236 fn extend < I : IntoIterator < Item = ( K , V ) > > ( & mut self , iter : I ) {
168- self . 0 . extend ( iter) ;
237+ self . dirty ( ) ;
238+ self . data . extend ( iter) ;
169239 }
170240}
171241
172242impl < K : Serialize + Eq + Hash , V : Serialize > Serialize for VecMap < K , V > {
173243 fn serialize < S : Serializer > ( & self , serializer : S ) -> Result < S :: Ok , S :: Error > {
244+ use serde:: ser:: SerializeMap ;
245+ use std:: collections:: HashMap ;
246+
174247 // We pre-compute the deduped map. If deduplication were done on the fly during
175248 // serialization, we couldn't provide a length up front to the serializer, and the current
176249 // one (rmp) will allocate an intermediate buffer defensively.
177- self . 0
178- . iter ( )
179- . map ( |( k, v) | ( k, v) )
180- // Since the iterator is sized, `collect()` should pre-allocate with the right capacity
181- // in one shot.
182- . collect :: < HashMap < & K , & V > > ( )
183- . serialize ( serializer)
250+ if self . deduped {
251+ let mut map_ser = serializer. serialize_map ( Some ( self . len ( ) ) ) ?;
252+
253+ for ( k, v) in self {
254+ map_ser. serialize_entry ( k, v) ?;
255+ }
256+
257+ map_ser. end ( )
258+ } else {
259+ // Note: using `dedup` would need an additional `clone()` of the whole map here. We can
260+ // use references instead.
261+ self . data
262+ . iter ( )
263+ . map ( |( k, v) | ( k, v) )
264+ // Since the iterator is sized, `collect()` should pre-allocate with the right
265+ // capacity in one shot.
266+ . collect :: < HashMap < & K , & V > > ( )
267+ . serialize ( serializer)
268+ }
184269 }
185270}
186271
@@ -243,6 +328,118 @@ mod tests {
243328 assert_eq ! ( pairs, vec![ ( "a" , 1 ) , ( "b" , 2 ) ] ) ;
244329 }
245330
/// A freshly constructed map is not flagged as deduped.
#[test]
fn is_deduped_false_initially() {
    let fresh = VecMap::<&str, i32>::new();
    assert!(!fresh.is_deduped());
}
336+
/// Converting from a `Vec` doesn't flag the map as deduped.
#[test]
fn is_deduped_false_after_from() {
    let converted = VecMap::<&str, i32>::from(vec![("a", 1)]);
    assert!(!converted.is_deduped());
}
342+
/// Collecting from an iterator doesn't flag the map as deduped.
#[test]
fn is_deduped_false_after_collect() {
    let collected = vec![("a", 1)].into_iter().collect::<VecMap<&str, i32>>();
    assert!(!collected.is_deduped());
}
348+
/// `dedup` flips the flag from unset to set.
#[test]
fn dedup_sets_flag() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    assert!(!map.is_deduped());

    map.dedup();
    assert!(map.is_deduped());
}
357+
/// Deduplicating an empty map sets the flag and leaves the map empty.
#[test]
fn dedup_on_empty_map() {
    let mut empty = VecMap::<String, i32>::new();
    empty.dedup();

    assert!(empty.is_deduped());
    assert!(empty.is_empty());
}
365+
/// Deduplicating a map with unique keys doesn't drop or alter any entry.
#[test]
fn dedup_no_duplicates() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.insert("b", 2);
    map.insert("c", 3);

    map.dedup();

    assert_eq!(map.len(), 3);
    assert_eq!(map.get("a"), Some(&1));
    assert_eq!(map.get("b"), Some(&2));
    assert_eq!(map.get("c"), Some(&3));
}
378+
/// With repeated keys, only the most recently inserted value of each key survives.
#[test]
fn dedup_keeps_last_value() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.insert("b", 10);
    map.insert("a", 2);
    map.insert("a", 3);
    map.insert("b", 20);

    map.dedup();

    assert_eq!(map.len(), 2);
    assert_eq!(map.get("a"), Some(&3));
    assert_eq!(map.get("b"), Some(&20));
}
392+
/// Calling `dedup` a second time changes neither the flag nor the content.
#[test]
fn dedup_is_idempotent() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.insert("a", 2);

    // First pass removes the stale entry.
    map.dedup();
    assert!(map.is_deduped());
    assert_eq!(map.len(), 1);

    // Second pass is a no-op.
    map.dedup();
    assert!(map.is_deduped());
    assert_eq!(map.len(), 1);
    assert_eq!(map.get("a"), Some(&2));
}
406+
/// Inserting after a `dedup` clears the flag again.
#[test]
fn insert_dirties_dedup_flag() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.dedup();
    assert!(map.is_deduped());

    map.insert("b", 2);
    assert!(!map.is_deduped());
}
417+
/// Extending after a `dedup` clears the flag again.
#[test]
fn extend_dirties_dedup_flag() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.dedup();
    assert!(map.is_deduped());

    let more = vec![("b", 2)];
    map.extend(more);
    assert!(!map.is_deduped());
}
428+
/// Taking a mutable iterator after a `dedup` clears the flag again.
#[test]
fn iter_mut_dirties_dedup_flag() {
    let mut map = VecMap::new();
    map.insert("a", 1);
    map.dedup();
    assert!(map.is_deduped());

    map.iter_mut().for_each(|(_, value)| *value += 1);

    assert!(!map.is_deduped());
}
442+
246443 #[ test]
247444 fn serialize_deduplicates_keeping_last ( ) {
248445 let mut m = VecMap :: new ( ) ;
0 commit comments