perf(sampling): optimize matching and limit cache memory [APMSP-2948] (#1977)

bantonsson · web-flow · commit 3d1e6c1ff323 · 2026-05-15T10:08:47.000Z
# What does this PR do? Optimizes the glob matching and limits the memory taken up by the LRU match cache. # Motivation A follow up PR for bigger performance questions that came up in #1927 as well as bounding the cache memory for the LRU cache that came up in a security review. # How to test the change? Unit tests and benchmarks are in the PR. Co-authored-by: bjorn.antonsson <bjorn.antonsson@datadoghq.com>
diff --git a/benchmark/run_benchmarks_ci.sh b/benchmark/run_benchmarks_ci.sh
@@ -22,7 +22,7 @@ pushd "${PROJECT_DIR}" > /dev/null
 
 # Run benchmarks
 message "Running benchmarks"
-cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span -- --warm-up-time 1 --measurement-time 5 --sample-size=200
+cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
 message "Finished running benchmarks"
 
 # Copy the benchmark results to the output directory
diff --git a/libdd-sampling/Cargo.toml b/libdd-sampling/Cargo.toml
@@ -22,6 +22,12 @@ harness = false
 path = "benches/sampling_bench.rs"
 required-features = ["v04_span"]
 
+[[bench]]
+name = "glob_matcher_bench"
+harness = false
+path = "benches/glob_matcher_bench.rs"
+required-features = ["bench-internals"]
+
 [dependencies]
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
@@ -31,6 +37,9 @@ libdd-trace-utils = { path = "../libdd-trace-utils", version = "3.0.1", optional
 
 [features]
 v04_span = ["dep:libdd-trace-utils"]
+# Exposes internal modules (e.g. `glob_matcher`) for benchmarks. Not intended for downstream
+# consumers — enable only when running benches in this crate.
+bench-internals = []
 
 [dev-dependencies]
 criterion = "0.5"
diff --git a/libdd-sampling/benches/glob_matcher_bench.rs b/libdd-sampling/benches/glob_matcher_bench.rs
@@ -0,0 +1,130 @@
+// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
+// SPDX-License-Identifier: Apache-2.0
+
+//! Microbenchmarks for `GlobMatcher` covering the `*` short-circuit, ASCII fast path (with and
+//! without wildcards, including backtracking), and Unicode fallback path.
+
+use std::alloc::System;
+use std::hint::black_box;
+
+use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
+use libdd_common::bench_utils::{
+    memory_allocated_measurement, AllocatedBytesMeasurement, ReportingAllocator,
+};
+use libdd_sampling::glob_matcher::GlobMatcher;
+
+#[global_allocator]
+static GLOBAL: ReportingAllocator<System> = ReportingAllocator::new(System);
+
+struct BenchCase {
+    name: &'static str,
+    pattern: &'static str,
+    subject: &'static str,
+}
+
+fn cases() -> Vec<BenchCase> {
+    vec![
+        BenchCase {
+            name: "star_short_circuit",
+            pattern: "*",
+            subject: "anything-goes-here",
+        },
+        BenchCase {
+            name: "ascii_exact_match",
+            pattern: "my-service",
+            subject: "my-service",
+        },
+        BenchCase {
+            name: "ascii_exact_miss",
+            pattern: "my-service",
+            subject: "other-service",
+        },
+        BenchCase {
+            name: "ascii_case_insensitive_match",
+            pattern: "my-service",
+            subject: "MY-SERVICE",
+        },
+        BenchCase {
+            name: "ascii_wildcard_star_match",
+            pattern: "svc-*",
+            subject: "svc-web",
+        },
+        BenchCase {
+            name: "ascii_wildcard_question_match",
+            pattern: "svc-???",
+            subject: "svc-web",
+        },
+        BenchCase {
+            name: "ascii_wildcard_backtrack_match",
+            pattern: "*-controller",
+            subject: "users-controller",
+        },
+        // Worst-case shape for the two-pointer backtracking algorithm.
+        BenchCase {
+            name: "ascii_wildcard_heavy_backtrack",
+            pattern: "a*a*a*a*b",
+            subject: "aaaaaaaaaaaaaaaaaaaab",
+        },
+        BenchCase {
+            name: "unicode_pattern_wildcard_match",
+            pattern: "caf\u{00e9}-*",
+            subject: "CAF\u{00c9}-PAYMENT",
+        },
+        BenchCase {
+            name: "unicode_pattern_ascii_subject",
+            pattern: "caf\u{00e9}-*",
+            subject: "CAFE-PAYMENT",
+        },
+        BenchCase {
+            name: "ascii_pattern_unicode_subject",
+            pattern: "caf*",
+            subject: "caf\u{00e9}-controller",
+        },
+        BenchCase {
+            name: "unicode_exact_match",
+            pattern: "caf\u{00e9}",
+            subject: "CAF\u{00c9}",
+        },
+    ]
+}
+
+fn bench_wall_time(c: &mut Criterion) {
+    for case in cases() {
+        let matcher = GlobMatcher::new(case.pattern);
+        c.bench_function(&format!("glob_matcher/{}/wall_time", case.name), |b| {
+            b.iter_batched(
+                || (),
+                |_| {
+                    black_box(matcher.matches(black_box(case.subject)));
+                },
+                BatchSize::SmallInput,
+            )
+        });
+    }
+}
+
+fn bench_allocs(c: &mut Criterion<AllocatedBytesMeasurement<System>>) {
+    for case in cases() {
+        let matcher = GlobMatcher::new(case.pattern);
+        c.bench_function(
+            &format!("glob_matcher/{}/allocated_bytes", case.name),
+            |b| {
+                b.iter_batched(
+                    || (),
+                    |_| {
+                        black_box(matcher.matches(black_box(case.subject)));
+                    },
+                    BatchSize::SmallInput,
+                )
+            },
+        );
+    }
+}
+
+criterion_group!(benches, bench_wall_time);
+criterion_group!(
+    name = alloc_benches;
+    config = memory_allocated_measurement(&GLOBAL);
+    targets = bench_allocs
+);
+criterion_main!(alloc_benches, benches);
diff --git a/libdd-sampling/benches/sampling_bench.rs b/libdd-sampling/benches/sampling_bench.rs
@@ -435,4 +435,4 @@ criterion_group!(
     config = memory_allocated_measurement(&GLOBAL);
     targets = criterion_benchmark_allocs
 );
-criterion_main!(benches, alloc_benches);
+criterion_main!(alloc_benches, benches);
diff --git a/libdd-sampling/src/bounded_byte_cache.rs b/libdd-sampling/src/bounded_byte_cache.rs
@@ -0,0 +1,219 @@
+// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
+// SPDX-License-Identifier: Apache-2.0
+
+//! LRU cache wrapper with dual limits: maximum entry count AND maximum tracked byte size.
+//!
+//! `lru::LruCache` only supports an entry-count capacity, which is unsafe for caches keyed on
+//! arbitrary user strings: a few very large keys can balloon memory. `BoundedByteCache` adds
+//! a byte budget on top, evicting least-recently-used entries until both limits are satisfied.
+
+use lru::LruCache;
+use std::borrow::Borrow;
+use std::hash::Hash;
+use std::mem::size_of;
+use std::num::NonZeroUsize;
+
+/// Default maximum entry count.
+pub const DEFAULT_MAX_ENTRIES: usize = 256;
+
+/// Default maximum tracked byte size (64 KiB).
+///
+/// Under normal use (short keys such as service or resource names), the entry count cap binds
+/// first. This byte budget acts as a safety valve against memory exhaustion from unexpectedly
+/// large keys due to misconfiguration or adversarial input.
+pub const DEFAULT_MAX_BYTES: usize = 256 * 256;
+
+/// LRU cache bounded by both entry count and total tracked byte size.
+///
+/// Byte accounting covers `key.as_ref().len() + size_of::<V>()`. Heap-allocated value contents
+/// are not tracked; this wrapper assumes small inline values (e.g. `bool`).
+pub struct BoundedByteCache<K, V>
+where
+    K: Hash + Eq + AsRef<[u8]>,
+{
+    inner: LruCache<K, V>,
+    current_bytes: usize,
+    max_bytes: usize,
+}
+
+impl<K, V> BoundedByteCache<K, V>
+where
+    K: Hash + Eq + AsRef<[u8]>,
+{
+    /// `max_entries` of zero is treated as 1 (a cache with no slots is nonsensical).
+    pub fn new(max_entries: usize, max_bytes: usize) -> Self {
+        let entry_cap = NonZeroUsize::new(max_entries).unwrap_or(NonZeroUsize::MIN);
+        Self {
+            inner: LruCache::new(entry_cap),
+            current_bytes: 0,
+            max_bytes,
+        }
+    }
+
+    #[inline]
+    pub fn get<Q>(&mut self, key: &Q) -> Option<&V>
+    where
+        K: Borrow<Q>,
+        Q: Hash + Eq + ?Sized,
+    {
+        self.inner.get(key)
+    }
+
+    /// Insert `key -> value`. Entries larger than `max_bytes` are dropped silently. Otherwise
+    /// LRU entries are evicted until the new entry fits.
+    pub fn put(&mut self, key: K, value: V) {
+        let entry_bytes = Self::entry_size(&key);
+
+        if entry_bytes > self.max_bytes {
+            return;
+        }
+
+        // Replacing an existing key: deduct its bytes first.
+        if self.inner.pop(&key).is_some() {
+            self.current_bytes = self.current_bytes.saturating_sub(entry_bytes);
+        }
+
+        while self.current_bytes + entry_bytes > self.max_bytes {
+            match self.inner.pop_lru() {
+                Some((evicted_key, _)) => {
+                    self.current_bytes = self
+                        .current_bytes
+                        .saturating_sub(Self::entry_size(&evicted_key));
+                }
+                None => break,
+            }
+        }
+
+        // `push` may evict an LRU entry to honor the entry-count cap; deduct its bytes.
+        if let Some((replaced_key, _)) = self.inner.push(key, value) {
+            self.current_bytes = self
+                .current_bytes
+                .saturating_sub(Self::entry_size(&replaced_key));
+        }
+        self.current_bytes += entry_bytes;
+    }
+
+    #[cfg(test)]
+    #[inline]
+    pub fn current_bytes(&self) -> usize {
+        self.current_bytes
+    }
+
+    pub fn len(&self) -> usize {
+        self.inner.len()
+    }
+
+    fn entry_size(key: &K) -> usize {
+        Self::PER_ENTRY_OVERHEAD + key.as_ref().len() + size_of::<V>()
+    }
+
+    /// Approximate per-entry fixed heap overhead, in bytes. Covers:
+    /// - `Vec<u8>` header on the key (24 B on 64-bit targets)
+    /// - `lru` doubly-linked-list node (prev/next pointers, ~24 B)
+    /// - `HashMap` bucket amortized (~16 B)
+    ///
+    /// Rounded up so `max_bytes` is a pessimistic upper bound on actual heap usage. Recheck
+    /// if the `lru` crate is upgraded across a major version — the linked-list node layout
+    /// is the volatile piece.
+    const PER_ENTRY_OVERHEAD: usize = 64;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_basic_put_and_get() {
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 1024);
+        cache.put(b"hello".to_vec(), true);
+        assert_eq!(cache.get(b"hello".as_ref()), Some(&true));
+        assert_eq!(cache.len(), 1);
+    }
+
+    #[test]
+    fn test_evicts_lru_when_over_byte_budget() {
+        // Each entry costs PER_ENTRY_OVERHEAD (64) + 4 (key) + 1 (bool) = 69 bytes. Budget
+        // of 150 fits two entries (138 B) but not three.
+        let budget = 150;
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, budget);
+        cache.put(b"aaaa".to_vec(), true);
+        cache.put(b"bbbb".to_vec(), false);
+        assert_eq!(cache.len(), 2);
+        cache.put(b"cccc".to_vec(), true);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.get(b"aaaa".as_ref()), None);
+        assert_eq!(cache.get(b"bbbb".as_ref()), Some(&false));
+        assert_eq!(cache.get(b"cccc".as_ref()), Some(&true));
+        assert!(cache.current_bytes() <= budget);
+    }
+
+    #[test]
+    fn test_evicts_lru_when_over_entry_count() {
+        // Generous byte budget; entry-count cap of 2 drives eviction.
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(2, 1024);
+        cache.put(b"a".to_vec(), true);
+        cache.put(b"b".to_vec(), false);
+        cache.put(b"c".to_vec(), true);
+        assert_eq!(cache.len(), 2);
+        assert_eq!(cache.get(b"a".as_ref()), None);
+        assert_eq!(cache.get(b"b".as_ref()), Some(&false));
+        assert_eq!(cache.get(b"c".as_ref()), Some(&true));
+    }
+
+    #[test]
+    fn test_oversize_entry_is_rejected() {
+        // Any entry costs at least PER_ENTRY_OVERHEAD bytes, so a 32-byte budget rejects
+        // every insertion.
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 32);
+        cache.put(b"small".to_vec(), true);
+        assert_eq!(cache.len(), 0);
+        assert_eq!(cache.current_bytes(), 0);
+    }
+
+    #[test]
+    fn test_replacing_key_does_not_double_count() {
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 1024);
+        cache.put(b"k".to_vec(), true);
+        let bytes_after_first = cache.current_bytes();
+        cache.put(b"k".to_vec(), false);
+        assert_eq!(cache.current_bytes(), bytes_after_first);
+        assert_eq!(cache.get(b"k".as_ref()), Some(&false));
+        assert_eq!(cache.len(), 1);
+    }
+
+    #[test]
+    fn test_get_bumps_recency() {
+        // Budget for exactly two 4-byte-keyed entries (69 B each = 138 B total).
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 150);
+        cache.put(b"aaaa".to_vec(), true);
+        cache.put(b"bbbb".to_vec(), true);
+        let _ = cache.get(b"aaaa".as_ref());
+        cache.put(b"cccc".to_vec(), true);
+        assert_eq!(cache.get(b"aaaa".as_ref()), Some(&true));
+        assert_eq!(cache.get(b"bbbb".as_ref()), None);
+    }
+
+    #[test]
+    fn test_many_inserts_stay_within_both_limits() {
+        let max_entries = 8;
+        // 8 entries * (PER_ENTRY_OVERHEAD 64 + 8-byte key + 1) = 584 bytes; round up.
+        let max_bytes = 600;
+        let mut cache: BoundedByteCache<Vec<u8>, bool> =
+            BoundedByteCache::new(max_entries, max_bytes);
+        for i in 0u16..1000 {
+            cache.put(format!("key-{:04}", i).into_bytes(), i % 2 == 0);
+            assert!(cache.current_bytes() <= max_bytes);
+            assert!(cache.len() <= max_entries);
+        }
+    }
+
+    #[test]
+    fn test_zero_entries_clamps_to_one() {
+        let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(0, 1024);
+        cache.put(b"a".to_vec(), true);
+        cache.put(b"b".to_vec(), false);
+        assert_eq!(cache.len(), 1);
+        assert_eq!(cache.get(b"a".as_ref()), None);
+        assert_eq!(cache.get(b"b".as_ref()), Some(&false));
+    }
+}
diff --git a/libdd-sampling/src/glob_matcher.rs b/libdd-sampling/src/glob_matcher.rs
diff --git a/libdd-sampling/src/lib.rs b/libdd-sampling/src/lib.rs