Skip to content

Commit 7e38b23

Browse files
committed
feat: in-place deduplication
1 parent 33a91d2 commit 7e38b23

1 file changed

Lines changed: 228 additions & 31 deletions

File tree

libdd-trace-utils/src/span/vec_map.rs

Lines changed: 228 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
1111
use serde::ser::{Serialize, Serializer};
1212
use std::borrow::Borrow;
13-
use std::collections::HashMap;
13+
use std::collections::HashSet;
1414
use std::hash::Hash;
1515

1616
/// A Vec-backed map that provides HashMap-like lookup by key.
@@ -28,28 +28,56 @@ use std::hash::Hash;
2828
/// speak. [VecMap::len], [VecMap::iter], and others just delegate to the underlying `Vec`, and
2929
/// won't deduplicate.
3030
///
31-
/// Explicit deduplication is currently being done automatically and on-the-fly during
32-
/// serialization. If needed, in the future, we might trigger deduplication on other events, for
33-
/// example at insertion if the size is bigger than a threshold.
34-
#[derive(Clone, Debug, PartialEq, Default)]
35-
pub struct VecMap<K, V>(Vec<(K, V)>);
31+
/// Explicit deduplication is currently being done on-demand by [VecMap::dedup]. An internal flag is
32+
/// used to avoid undue deduplication (see [VecMap::dedup]). `VecMap` is automatically deduped
33+
/// before serialization.
34+
///
35+
/// In the future, we could trigger deduplication on other events, for example at insertion if the
36+
/// size is bigger than a threshold (and we haven't deduped for `x` operations).
37+
#[derive(Clone, Debug, PartialEq)]
38+
pub struct VecMap<K, V> {
39+
data: Vec<(K, V)>,
40+
/// Deduped is a flag that is set after entry deduplication. It is dirtied (set to `false`)
41+
/// when any modification is performed (`deduped == false` doesn't imply there are actual
42+
/// duplicates, just that there might be). This is useful to avoid performing deduplication
43+
/// several times in the export pipeline.
44+
deduped: bool,
45+
}
46+
47+
impl<K, V> Default for VecMap<K, V> {
48+
fn default() -> Self {
49+
Self {
50+
data: Default::default(),
51+
deduped: false,
52+
}
53+
}
54+
}
3655

3756
impl<K, V> VecMap<K, V> {
3857
#[must_use]
3958
#[inline]
4059
pub fn new() -> Self {
41-
VecMap(Vec::new())
60+
Self::default()
61+
}
62+
63+
/// Dirty the `deduped` flag after a mutation that could introduce duplicates.
64+
fn dirty(&mut self) {
65+
self.deduped = false;
4266
}
4367

4468
#[must_use]
4569
#[inline]
4670
pub fn with_capacity(capacity: usize) -> Self {
47-
VecMap(Vec::with_capacity(capacity))
71+
VecMap {
72+
data: Vec::with_capacity(capacity),
73+
deduped: false,
74+
}
4875
}
4976

5077
#[inline]
5178
pub fn insert(&mut self, key: K, value: V) {
52-
self.0.push((key, value));
79+
self.data.push((key, value));
80+
self.dirty();
5381
}
5482

5583
#[inline]
@@ -58,7 +86,7 @@ impl<K, V> VecMap<K, V> {
5886
K: Borrow<Q>,
5987
Q: ?Sized + PartialEq,
6088
{
61-
self.0
89+
self.data
6290
.iter()
6391
.rev()
6492
.find(|(k, _)| k.borrow() == key)
@@ -71,7 +99,7 @@ impl<K, V> VecMap<K, V> {
7199
K: Borrow<Q>,
72100
Q: ?Sized + PartialEq,
73101
{
74-
self.0
102+
self.data
75103
.iter_mut()
76104
.rev()
77105
.find(|(k, _)| (*k).borrow() == key)
@@ -84,7 +112,7 @@ impl<K, V> VecMap<K, V> {
84112
K: Borrow<Q>,
85113
Q: ?Sized + PartialEq,
86114
{
87-
self.0.iter().any(|(k, _)| k.borrow() == key)
115+
self.data.iter().any(|(k, _)| k.borrow() == key)
88116
}
89117

90118
/// Remove all entries matching this key from the map. This method uses [Vec::retain], and is
@@ -97,42 +125,83 @@ impl<K, V> VecMap<K, V> {
97125
K: Borrow<Q>,
98126
Q: ?Sized + PartialEq,
99127
{
100-
self.0.retain(|(k, _)| k.borrow() != key);
128+
self.data.retain(|(k, _)| k.borrow() != key);
101129
}
102130

103131
/// Iterate over the elements, including duplicate entries.
104132
#[inline]
105133
pub fn iter(&self) -> std::slice::Iter<'_, (K, V)> {
106-
self.0.iter()
134+
self.data.iter()
107135
}
108136

109137
/// Iterate mutably over the elements, including duplicate entries.
110138
#[inline]
111139
pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, (K, V)> {
112-
self.0.iter_mut()
140+
self.dirty();
141+
self.data.iter_mut()
113142
}
114143

115144
/// Return the length of the underlying vector, thus including duplicate entries.
116145
#[inline]
117146
pub fn len(&self) -> usize {
118-
self.0.len()
147+
self.data.len()
119148
}
120149

121150
#[inline]
122151
pub fn is_empty(&self) -> bool {
123-
self.0.is_empty()
152+
self.data.is_empty()
153+
}
154+
155+
/// Return `true` if the map hasn't been extended or mutably iterated since the last call to
156+
/// [Self::dedup], guaranteeing that the underlying vector doesn't have any duplicate key.
157+
///
158+
/// If `is_deduped` returns `false`, the map may have duplicate keys.
159+
#[inline]
160+
pub fn is_deduped(&self) -> bool {
161+
self.deduped
162+
}
163+
}
164+
165+
impl<K: Hash + Eq + Clone, V> VecMap<K, V> {
166+
/// Remove entries with a duplicate key, only keeping the last one. After this, a flag is set
167+
/// internally, such that as long as the map isn't extended or mutably iterated, the next
168+
/// [Self::dedup] doesn't perform the work again.
169+
pub fn dedup(&mut self) {
170+
if self.deduped {
171+
return;
172+
}
173+
174+
// Since we're going to shuffle elements around, it's not easy to keep references to keys in
175+
// the deduping set. The simplest is to clone them.
176+
let mut seen = HashSet::with_capacity(self.len());
177+
178+
self.data.reverse();
179+
self.data.retain(|(k, _)| seen.insert(k.clone()));
180+
self.deduped = true;
124181
}
125182
}
126183

127184
impl<K, V> From<Vec<(K, V)>> for VecMap<K, V> {
128-
fn from(vec: Vec<(K, V)>) -> Self {
129-
VecMap(vec)
185+
fn from(data: Vec<(K, V)>) -> Self {
186+
Self {
187+
data,
188+
deduped: false,
189+
}
190+
}
191+
}
192+
193+
impl<K, V> From<VecMap<K, V>> for Vec<(K, V)> {
194+
fn from(value: VecMap<K, V>) -> Self {
195+
value.data
130196
}
131197
}
132198

133199
impl<K, V> FromIterator<(K, V)> for VecMap<K, V> {
134200
fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I) -> Self {
135-
VecMap(iter.into_iter().collect())
201+
Self {
202+
data: iter.into_iter().collect(),
203+
deduped: false,
204+
}
136205
}
137206
}
138207

@@ -141,7 +210,7 @@ impl<K, V> IntoIterator for VecMap<K, V> {
141210
type IntoIter = std::vec::IntoIter<(K, V)>;
142211

143212
fn into_iter(self) -> Self::IntoIter {
144-
self.0.into_iter()
213+
self.data.into_iter()
145214
}
146215
}
147216

@@ -150,7 +219,7 @@ impl<'a, K, V> IntoIterator for &'a VecMap<K, V> {
150219
type IntoIter = std::slice::Iter<'a, (K, V)>;
151220

152221
fn into_iter(self) -> Self::IntoIter {
153-
self.0.iter()
222+
self.data.iter()
154223
}
155224
}
156225

@@ -159,28 +228,44 @@ impl<'a, K, V> IntoIterator for &'a mut VecMap<K, V> {
159228
type IntoIter = std::slice::IterMut<'a, (K, V)>;
160229

161230
fn into_iter(self) -> Self::IntoIter {
162-
self.0.iter_mut()
231+
self.data.iter_mut()
163232
}
164233
}
165234

166235
impl<K, V> Extend<(K, V)> for VecMap<K, V> {
167236
fn extend<I: IntoIterator<Item = (K, V)>>(&mut self, iter: I) {
168-
self.0.extend(iter);
237+
self.dirty();
238+
self.data.extend(iter);
169239
}
170240
}
171241

172242
impl<K: Serialize + Eq + Hash, V: Serialize> Serialize for VecMap<K, V> {
173243
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
244+
use serde::ser::SerializeMap;
245+
use std::collections::HashMap;
246+
174247
// We pre-compute the deduped map. If deduplication were done on the fly during
175248
// serialization, we couldn't provide a length up front to the serializer, and the current
176249
// one (rmp) will allocate an intermediate buffer defensively.
177-
self.0
178-
.iter()
179-
.map(|(k, v)| (k, v))
180-
// Since the iterator is sized, `collect()` should pre-allocate with the right capacity
181-
// in one shot.
182-
.collect::<HashMap<&K, &V>>()
183-
.serialize(serializer)
250+
if self.deduped {
251+
let mut map_ser = serializer.serialize_map(Some(self.len()))?;
252+
253+
for (k, v) in self {
254+
map_ser.serialize_entry(k, v)?;
255+
}
256+
257+
map_ser.end()
258+
} else {
259+
// Note: using `dedup` would need an additional `clone()` of the whole map here. We can
260+
// use references instead.
261+
self.data
262+
.iter()
263+
.map(|(k, v)| (k, v))
264+
// Since the iterator is sized, `collect()` should pre-allocate with the right
265+
// capacity in one shot.
266+
.collect::<HashMap<&K, &V>>()
267+
.serialize(serializer)
268+
}
184269
}
185270
}
186271

@@ -243,6 +328,118 @@ mod tests {
243328
assert_eq!(pairs, vec![("a", 1), ("b", 2)]);
244329
}
245330

331+
#[test]
332+
fn is_deduped_false_initially() {
333+
let m: VecMap<&str, i32> = VecMap::new();
334+
assert!(!m.is_deduped());
335+
}
336+
337+
#[test]
338+
fn is_deduped_false_after_from() {
339+
let m: VecMap<&str, i32> = vec![("a", 1)].into();
340+
assert!(!m.is_deduped());
341+
}
342+
343+
#[test]
344+
fn is_deduped_false_after_collect() {
345+
let m: VecMap<&str, i32> = vec![("a", 1)].into_iter().collect();
346+
assert!(!m.is_deduped());
347+
}
348+
349+
#[test]
350+
fn dedup_sets_flag() {
351+
let mut m = VecMap::new();
352+
m.insert("a", 1);
353+
assert!(!m.is_deduped());
354+
m.dedup();
355+
assert!(m.is_deduped());
356+
}
357+
358+
#[test]
359+
fn dedup_on_empty_map() {
360+
let mut m: VecMap<String, i32> = VecMap::new();
361+
m.dedup();
362+
assert!(m.is_deduped());
363+
assert!(m.is_empty());
364+
}
365+
366+
#[test]
367+
fn dedup_no_duplicates() {
368+
let mut m = VecMap::new();
369+
m.insert("a", 1);
370+
m.insert("b", 2);
371+
m.insert("c", 3);
372+
m.dedup();
373+
assert_eq!(m.len(), 3);
374+
assert_eq!(m.get("a"), Some(&1));
375+
assert_eq!(m.get("b"), Some(&2));
376+
assert_eq!(m.get("c"), Some(&3));
377+
}
378+
379+
#[test]
380+
fn dedup_keeps_last_value() {
381+
let mut m = VecMap::new();
382+
m.insert("a", 1);
383+
m.insert("b", 10);
384+
m.insert("a", 2);
385+
m.insert("a", 3);
386+
m.insert("b", 20);
387+
m.dedup();
388+
assert_eq!(m.len(), 2);
389+
assert_eq!(m.get("a"), Some(&3));
390+
assert_eq!(m.get("b"), Some(&20));
391+
}
392+
393+
#[test]
394+
fn dedup_is_idempotent() {
395+
let mut m = VecMap::new();
396+
m.insert("a", 1);
397+
m.insert("a", 2);
398+
m.dedup();
399+
assert!(m.is_deduped());
400+
assert_eq!(m.len(), 1);
401+
m.dedup();
402+
assert!(m.is_deduped());
403+
assert_eq!(m.len(), 1);
404+
assert_eq!(m.get("a"), Some(&2));
405+
}
406+
407+
#[test]
408+
fn insert_dirties_dedup_flag() {
409+
let mut m = VecMap::new();
410+
m.insert("a", 1);
411+
m.dedup();
412+
assert!(m.is_deduped());
413+
414+
m.insert("b", 2);
415+
assert!(!m.is_deduped());
416+
}
417+
418+
#[test]
419+
fn extend_dirties_dedup_flag() {
420+
let mut m = VecMap::new();
421+
m.insert("a", 1);
422+
m.dedup();
423+
assert!(m.is_deduped());
424+
425+
m.extend(vec![("b", 2)]);
426+
assert!(!m.is_deduped());
427+
}
428+
429+
#[test]
430+
fn iter_mut_dirties_dedup_flag() {
431+
let mut m = VecMap::new();
432+
m.insert("a", 1);
433+
m.dedup();
434+
assert!(m.is_deduped());
435+
436+
for (_, v) in m.iter_mut() {
437+
*v += 1;
438+
}
439+
440+
assert!(!m.is_deduped());
441+
}
442+
246443
#[test]
247444
fn serialize_deduplicates_keeping_last() {
248445
let mut m = VecMap::new();

0 commit comments

Comments
 (0)