Skip to content

Commit 331b904

Browse files
yannhambengl
andauthored
feat(trace-utils)!: add dedup convenience to VecMap (#2049)
# What does this PR do? Follow-up of #2022. This split off some work required in #2043 for deduplication before msgpack encoding as a separate PR. # Motivation See #2022 for overall motivation. To integrate `VecMap`, it turns out we can't currently rely on the agent to properly handle maps with deuplicate entries (the "last win" semantics is the current implementation, but not guaranteed). We need to perform deduplication before msgpack encoding. This PR backports the required machinery into `VecMap` as a separate PR. # Additional Notes N/A # How to test the change? Tests included. Co-authored-by: bryan.english <bryan.english@datadoghq.com>
1 parent 2a6c295 commit 331b904

1 file changed

Lines changed: 170 additions & 26 deletions

File tree

libdd-trace-utils/src/span/vec_map.rs

Lines changed: 170 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
1111
use serde::ser::{Serialize, Serializer};
1212
use std::borrow::Borrow;
13-
use std::collections::HashSet;
13+
use std::collections::{HashMap, HashSet};
1414
use std::hash::Hash;
15+
use std::slice;
16+
use std::sync::atomic::{AtomicBool, Ordering};
1517

1618
/// A Vec-backed map that provides HashMap-like lookup by key.
1719
///
@@ -39,7 +41,7 @@ use std::hash::Hash;
3941
///
4042
/// As this is a map, iteration order is not defined nor guaranteed. In practice, iteration follows
4143
/// insertion order, but [Self::dedup] will reverse the underlying vector.
42-
#[derive(Clone, Debug, PartialEq)]
44+
#[derive(Clone, Debug)]
4345
pub struct VecMap<K, V> {
4446
data: Vec<(K, V)>,
4547
/// Deduped is a flag that is set after entry deduplication. It is dirtied (set to `false`)
@@ -59,6 +61,21 @@ impl<K, V> Default for VecMap<K, V> {
5961
}
6062
}
6163

64+
// This implementation allocates, which isn't expected for equality testing (not allocating would be
65+
// rather tedious though). It's fine for tests (where `PartialEq` is currently needed), so we
66+
// cfg-gate it to avoid unintended usage in prod.
67+
#[cfg(any(test, feature = "test-utils"))]
68+
impl<K: Eq + Hash, V: PartialEq> PartialEq for VecMap<K, V> {
69+
fn eq(&self, other: &Self) -> bool {
70+
let lhs: HashMap<&K, &V> = self.data.iter().map(|(k, v)| (k, v)).collect();
71+
let rhs: HashMap<&K, &V> = other.data.iter().map(|(k, v)| (k, v)).collect();
72+
lhs == rhs
73+
}
74+
}
75+
76+
#[cfg(any(test, feature = "test-utils"))]
77+
impl<K: Eq + Hash, V: Eq> Eq for VecMap<K, V> {}
78+
6279
impl<K, V> VecMap<K, V> {
6380
#[must_use]
6481
#[inline]
@@ -136,13 +153,13 @@ impl<K, V> VecMap<K, V> {
136153

137154
/// Iterate over the element, including duplicate entries.
138155
#[inline]
139-
pub fn iter(&self) -> std::slice::Iter<'_, (K, V)> {
156+
pub fn iter(&self) -> slice::Iter<'_, (K, V)> {
140157
self.data.iter()
141158
}
142159

143160
/// Iterate mutably over the elements, including duplicate entries.
144161
#[inline]
145-
pub fn iter_mut(&mut self) -> std::slice::IterMut<'_, (K, V)> {
162+
pub fn iter_mut(&mut self) -> slice::IterMut<'_, (K, V)> {
146163
self.dirty();
147164
self.data.iter_mut()
148165
}
@@ -168,6 +185,37 @@ impl<K, V> VecMap<K, V> {
168185
}
169186
}
170187

188+
impl<K: Eq + Hash, V> VecMap<K, V> {
189+
/// Returns a deduped map, that either borrows from `self` without performing any work if the
190+
/// map is already deduped, or dedup the entries in a new separate vec otherwise. As opposed to
191+
/// [Self::dedup], `as_deduped_map` takes an immutable reference to `self` but might allocate.
192+
/// Prefer [Self::dedup] when applicable.
193+
pub fn as_deduped_map(&self) -> DedupedVecMap<'_, K, V> {
194+
if self.deduped {
195+
DedupedVecMap::Borrowed(self)
196+
} else {
197+
DedupedVecMap::Owned(self.data.iter().map(|(k, v)| (k, v)).collect())
198+
}
199+
}
200+
201+
/// This is a convenience wrapper around [Self::as_deduped_map] used in the msgpack encoder,
202+
/// where we expect the map to be deduped, but call `as_deduped_map` as a defensive measure. If
203+
/// the latter had to deduplicate and allocate a new vec, we log a warning (at most once).
204+
#[allow(unused)]
205+
pub(crate) fn defensive_dedup(&self) -> DedupedVecMap<'_, K, V> {
206+
if !self.is_deduped() {
207+
static WARNED: AtomicBool = AtomicBool::new(false);
208+
if !WARNED.swap(true, Ordering::Relaxed) {
209+
tracing::warn!(
210+
"VecMap not deduped before encoding. Performing defensive on-the-fly dedup"
211+
);
212+
}
213+
}
214+
215+
self.as_deduped_map()
216+
}
217+
}
218+
171219
impl<K: Hash + Eq + Clone, V> VecMap<K, V> {
172220
/// Remove entries with a duplicate key, only keeping the last one. After this, a flag is set
173221
/// internally, such that as long as the map isn't extended or mutably iterated, the next
@@ -222,7 +270,7 @@ impl<K, V> IntoIterator for VecMap<K, V> {
222270

223271
impl<'a, K, V> IntoIterator for &'a VecMap<K, V> {
224272
type Item = &'a (K, V);
225-
type IntoIter = std::slice::Iter<'a, (K, V)>;
273+
type IntoIter = slice::Iter<'a, (K, V)>;
226274

227275
fn into_iter(self) -> Self::IntoIter {
228276
self.data.iter()
@@ -231,7 +279,7 @@ impl<'a, K, V> IntoIterator for &'a VecMap<K, V> {
231279

232280
impl<'a, K, V> IntoIterator for &'a mut VecMap<K, V> {
233281
type Item = &'a mut (K, V);
234-
type IntoIter = std::slice::IterMut<'a, (K, V)>;
282+
type IntoIter = slice::IterMut<'a, (K, V)>;
235283

236284
fn into_iter(self) -> Self::IntoIter {
237285
// Since we iterate on keys as well, they can modified, and introduce duplicates.
@@ -250,33 +298,74 @@ impl<K, V> Extend<(K, V)> for VecMap<K, V> {
250298
impl<K: Serialize + Eq + Hash, V: Serialize> Serialize for VecMap<K, V> {
251299
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
252300
use serde::ser::SerializeMap;
253-
use std::collections::HashMap;
254301

255-
// We pre-compute the deduped map. If deduplication were done on the fly during
256-
// serialization, we couldn't provide a length up front to the serializer, and the current
257-
// one (rmp) will allocate an intermediate buffer defensively.
258-
if self.deduped {
259-
let mut map_ser = serializer.serialize_map(Some(self.len()))?;
302+
let deduped = self.as_deduped_map();
303+
let mut map_ser = serializer.serialize_map(Some(deduped.len()))?;
260304

261-
for (k, v) in self {
262-
map_ser.serialize_entry(k, v)?;
263-
}
305+
for (k, v) in deduped.iter() {
306+
map_ser.serialize_entry(k, v)?;
307+
}
264308

265-
map_ser.end()
266-
} else {
267-
// Note: using `dedup` would need an additional `clone()` of the whole map here. We can
268-
// use references instead.
269-
self.data
270-
.iter()
271-
.map(|(k, v)| (k, v))
272-
// Since the iterator is sized, `collect()` should pre-allocate with the right
273-
// capacity in one shot.
274-
.collect::<HashMap<&K, &V>>()
275-
.serialize(serializer)
309+
map_ser.end()
310+
}
311+
}
312+
313+
pub enum DedupedVecMap<'a, K, V> {
314+
Borrowed(&'a VecMap<K, V>),
315+
Owned(HashMap<&'a K, &'a V>),
316+
}
317+
318+
impl<'a, K, V> DedupedVecMap<'a, K, V> {
319+
#[inline]
320+
pub fn iter(&self) -> DedupedVecMapIter<'_, 'a, K, V> {
321+
match self {
322+
DedupedVecMap::Borrowed(vec_map) => DedupedVecMapIter::Borrowed(vec_map.iter()),
323+
DedupedVecMap::Owned(map) => DedupedVecMapIter::Owned(map.iter()),
276324
}
277325
}
326+
327+
#[inline]
328+
pub fn len(&self) -> usize {
329+
match self {
330+
DedupedVecMap::Borrowed(vec_map) => vec_map.len(),
331+
DedupedVecMap::Owned(map) => map.len(),
332+
}
333+
}
334+
335+
#[inline]
336+
pub fn is_empty(&self) -> bool {
337+
match self {
338+
DedupedVecMap::Borrowed(vec_map) => vec_map.is_empty(),
339+
DedupedVecMap::Owned(map) => map.is_empty(),
340+
}
341+
}
342+
}
343+
344+
pub enum DedupedVecMapIter<'b, 'a: 'b, K, V> {
345+
Borrowed(slice::Iter<'a, (K, V)>),
346+
Owned(std::collections::hash_map::Iter<'b, &'a K, &'a V>),
278347
}
279348

349+
impl<'b, 'a: 'b, K, V> Iterator for DedupedVecMapIter<'b, 'a, K, V> {
350+
type Item = (&'a K, &'a V);
351+
352+
fn next(&mut self) -> Option<Self::Item> {
353+
match self {
354+
DedupedVecMapIter::Borrowed(iter) => iter.next().map(|(k, v)| (k, v)),
355+
DedupedVecMapIter::Owned(iter) => iter.next().map(|(&k, &v)| (k, v)),
356+
}
357+
}
358+
359+
fn size_hint(&self) -> (usize, Option<usize>) {
360+
match self {
361+
DedupedVecMapIter::Borrowed(iter) => iter.size_hint(),
362+
DedupedVecMapIter::Owned(iter) => iter.size_hint(),
363+
}
364+
}
365+
}
366+
367+
impl<'b, 'a: 'b, K, V> ExactSizeIterator for DedupedVecMapIter<'b, 'a, K, V> {}
368+
280369
#[cfg(test)]
281370
mod tests {
282371
use super::*;
@@ -448,6 +537,61 @@ mod tests {
448537
assert!(!m.is_deduped());
449538
}
450539

540+
#[test]
541+
fn deduped_vec_map_borrowed_iter_and_len() {
542+
let mut m = VecMap::new();
543+
m.insert("a", 1);
544+
m.insert("b", 2);
545+
m.dedup();
546+
547+
let d = m.defensive_dedup();
548+
assert!(matches!(d, DedupedVecMap::Borrowed(_)));
549+
assert_eq!(d.len(), 2);
550+
551+
let mut items: Vec<_> = d.iter().collect();
552+
items.sort_by_key(|(k, _)| **k);
553+
assert_eq!(items, vec![(&"a", &1), (&"b", &2)]);
554+
}
555+
556+
#[test]
557+
fn deduped_vec_map_copy_iter_and_len() {
558+
let mut m = VecMap::new();
559+
m.insert("a", 1);
560+
m.insert("a", 2);
561+
m.insert("b", 3);
562+
563+
let d = m.defensive_dedup();
564+
assert!(matches!(d, DedupedVecMap::Owned(_)));
565+
assert_eq!(d.len(), 2);
566+
567+
let items: HashMap<&&str, &i32> = d.iter().collect();
568+
assert_eq!(items[&"a"], &2);
569+
assert_eq!(items[&"b"], &3);
570+
}
571+
572+
#[test]
573+
fn deduped_vec_map_iter_exact_size() {
574+
let mut m = VecMap::new();
575+
m.insert("a", 1);
576+
m.insert("b", 2);
577+
m.insert("c", 3);
578+
m.dedup();
579+
580+
let d = m.defensive_dedup();
581+
let mut iter = d.iter();
582+
assert_eq!(iter.len(), 3);
583+
iter.next();
584+
assert_eq!(iter.len(), 2);
585+
}
586+
587+
#[test]
588+
fn deduped_vec_map_empty() {
589+
let m: VecMap<String, i32> = VecMap::new();
590+
let d = DedupedVecMap::Borrowed(&m);
591+
assert_eq!(d.len(), 0);
592+
assert_eq!(d.iter().count(), 0);
593+
}
594+
451595
#[test]
452596
fn serialize_deduplicates_keeping_last() {
453597
let mut m = VecMap::new();

0 commit comments

Comments
 (0)