Skip to content

Commit 3d1e6c1

Browse files
authored
perf(sampling): optimize matching and limit cache memory [APMSP-2948] (#1977)
# What does this PR do? Optimizes glob matching and limits the memory used by the LRU match cache. # Motivation A follow-up PR that addresses the broader performance questions raised in #1927 and bounds the memory of the LRU cache, an issue raised in a security review. # How to test the change? Unit tests and benchmarks are included in the PR. Co-authored-by: bjorn.antonsson <bjorn.antonsson@datadoghq.com>
1 parent 68c6519 commit 3d1e6c1

7 files changed

Lines changed: 644 additions & 99 deletions

File tree

benchmark/run_benchmarks_ci.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ pushd "${PROJECT_DIR}" > /dev/null
2222

2323
# Run benchmarks
2424
message "Running benchmarks"
25-
cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span -- --warm-up-time 1 --measurement-time 5 --sample-size=200
25+
cargo bench --workspace --features libdd-crashtracker/benchmarking,libdd-sampling/v04_span,libdd-sampling/bench-internals -- --warm-up-time 1 --measurement-time 5 --sample-size=200
2626
message "Finished running benchmarks"
2727

2828
# Copy the benchmark results to the output directory

libdd-sampling/Cargo.toml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,12 @@ harness = false
2222
path = "benches/sampling_bench.rs"
2323
required-features = ["v04_span"]
2424

25+
[[bench]]
26+
name = "glob_matcher_bench"
27+
harness = false
28+
path = "benches/glob_matcher_bench.rs"
29+
required-features = ["bench-internals"]
30+
2531
[dependencies]
2632
serde = { version = "1.0", features = ["derive"] }
2733
serde_json = "1.0"
@@ -31,6 +37,9 @@ libdd-trace-utils = { path = "../libdd-trace-utils", version = "3.0.1", optional
3137

3238
[features]
3339
v04_span = ["dep:libdd-trace-utils"]
40+
# Exposes internal modules (e.g. `glob_matcher`) for benchmarks. Not intended for downstream
41+
# consumers — enable only when running benches in this crate.
42+
bench-internals = []
3443

3544
[dev-dependencies]
3645
criterion = "0.5"
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! Microbenchmarks for `GlobMatcher` covering the `*` short-circuit, ASCII fast path (with and
5+
//! without wildcards, including backtracking), and Unicode fallback path.
6+
7+
use std::alloc::System;
8+
use std::hint::black_box;
9+
10+
use criterion::{criterion_group, criterion_main, BatchSize, Criterion};
11+
use libdd_common::bench_utils::{
12+
memory_allocated_measurement, AllocatedBytesMeasurement, ReportingAllocator,
13+
};
14+
use libdd_sampling::glob_matcher::GlobMatcher;
15+
16+
#[global_allocator]
17+
static GLOBAL: ReportingAllocator<System> = ReportingAllocator::new(System);
18+
19+
struct BenchCase {
20+
name: &'static str,
21+
pattern: &'static str,
22+
subject: &'static str,
23+
}
24+
25+
fn cases() -> Vec<BenchCase> {
26+
vec![
27+
BenchCase {
28+
name: "star_short_circuit",
29+
pattern: "*",
30+
subject: "anything-goes-here",
31+
},
32+
BenchCase {
33+
name: "ascii_exact_match",
34+
pattern: "my-service",
35+
subject: "my-service",
36+
},
37+
BenchCase {
38+
name: "ascii_exact_miss",
39+
pattern: "my-service",
40+
subject: "other-service",
41+
},
42+
BenchCase {
43+
name: "ascii_case_insensitive_match",
44+
pattern: "my-service",
45+
subject: "MY-SERVICE",
46+
},
47+
BenchCase {
48+
name: "ascii_wildcard_star_match",
49+
pattern: "svc-*",
50+
subject: "svc-web",
51+
},
52+
BenchCase {
53+
name: "ascii_wildcard_question_match",
54+
pattern: "svc-???",
55+
subject: "svc-web",
56+
},
57+
BenchCase {
58+
name: "ascii_wildcard_backtrack_match",
59+
pattern: "*-controller",
60+
subject: "users-controller",
61+
},
62+
// Worst-case shape for the two-pointer backtracking algorithm.
63+
BenchCase {
64+
name: "ascii_wildcard_heavy_backtrack",
65+
pattern: "a*a*a*a*b",
66+
subject: "aaaaaaaaaaaaaaaaaaaab",
67+
},
68+
BenchCase {
69+
name: "unicode_pattern_wildcard_match",
70+
pattern: "caf\u{00e9}-*",
71+
subject: "CAF\u{00c9}-PAYMENT",
72+
},
73+
BenchCase {
74+
name: "unicode_pattern_ascii_subject",
75+
pattern: "caf\u{00e9}-*",
76+
subject: "CAFE-PAYMENT",
77+
},
78+
BenchCase {
79+
name: "ascii_pattern_unicode_subject",
80+
pattern: "caf*",
81+
subject: "caf\u{00e9}-controller",
82+
},
83+
BenchCase {
84+
name: "unicode_exact_match",
85+
pattern: "caf\u{00e9}",
86+
subject: "CAF\u{00c9}",
87+
},
88+
]
89+
}
90+
91+
fn bench_wall_time(c: &mut Criterion) {
92+
for case in cases() {
93+
let matcher = GlobMatcher::new(case.pattern);
94+
c.bench_function(&format!("glob_matcher/{}/wall_time", case.name), |b| {
95+
b.iter_batched(
96+
|| (),
97+
|_| {
98+
black_box(matcher.matches(black_box(case.subject)));
99+
},
100+
BatchSize::SmallInput,
101+
)
102+
});
103+
}
104+
}
105+
106+
fn bench_allocs(c: &mut Criterion<AllocatedBytesMeasurement<System>>) {
107+
for case in cases() {
108+
let matcher = GlobMatcher::new(case.pattern);
109+
c.bench_function(
110+
&format!("glob_matcher/{}/allocated_bytes", case.name),
111+
|b| {
112+
b.iter_batched(
113+
|| (),
114+
|_| {
115+
black_box(matcher.matches(black_box(case.subject)));
116+
},
117+
BatchSize::SmallInput,
118+
)
119+
},
120+
);
121+
}
122+
}
123+
124+
criterion_group!(benches, bench_wall_time);
125+
criterion_group!(
126+
name = alloc_benches;
127+
config = memory_allocated_measurement(&GLOBAL);
128+
targets = bench_allocs
129+
);
130+
criterion_main!(alloc_benches, benches);

libdd-sampling/benches/sampling_bench.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -435,4 +435,4 @@ criterion_group!(
435435
config = memory_allocated_measurement(&GLOBAL);
436436
targets = criterion_benchmark_allocs
437437
);
438-
criterion_main!(benches, alloc_benches);
438+
criterion_main!(alloc_benches, benches);
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
//! LRU cache wrapper with dual limits: maximum entry count AND maximum tracked byte size.
5+
//!
6+
//! `lru::LruCache` only supports an entry-count capacity, which is unsafe for caches keyed on
7+
//! arbitrary user strings: a few very large keys can balloon memory. `BoundedByteCache` adds
8+
//! a byte budget on top, evicting least-recently-used entries until both limits are satisfied.
9+
10+
use lru::LruCache;
11+
use std::borrow::Borrow;
12+
use std::hash::Hash;
13+
use std::mem::size_of;
14+
use std::num::NonZeroUsize;
15+
16+
/// Default maximum entry count.
17+
pub const DEFAULT_MAX_ENTRIES: usize = 256;
18+
19+
/// Default maximum tracked byte size (64 KiB).
20+
///
21+
/// Under normal use (short keys such as service or resource names), the entry count cap binds
22+
/// first. This byte budget acts as a safety valve against memory exhaustion from unexpectedly
23+
/// large keys due to misconfiguration or adversarial input.
24+
pub const DEFAULT_MAX_BYTES: usize = 256 * 256;
25+
26+
/// LRU cache bounded by both entry count and total tracked byte size.
27+
///
28+
/// Byte accounting covers `PER_ENTRY_OVERHEAD + key.as_ref().len() + size_of::<V>()` per entry.
29+
/// Heap-allocated value contents are not tracked; this wrapper assumes small inline values (e.g. `bool`).
30+
pub struct BoundedByteCache<K, V>
31+
where
32+
K: Hash + Eq + AsRef<[u8]>,
33+
{
34+
inner: LruCache<K, V>,
35+
current_bytes: usize,
36+
max_bytes: usize,
37+
}
38+
39+
impl<K, V> BoundedByteCache<K, V>
40+
where
41+
K: Hash + Eq + AsRef<[u8]>,
42+
{
43+
/// `max_entries` of zero is treated as 1 (a cache with no slots is nonsensical).
44+
pub fn new(max_entries: usize, max_bytes: usize) -> Self {
45+
let entry_cap = NonZeroUsize::new(max_entries).unwrap_or(NonZeroUsize::MIN);
46+
Self {
47+
inner: LruCache::new(entry_cap),
48+
current_bytes: 0,
49+
max_bytes,
50+
}
51+
}
52+
53+
#[inline]
54+
pub fn get<Q>(&mut self, key: &Q) -> Option<&V>
55+
where
56+
K: Borrow<Q>,
57+
Q: Hash + Eq + ?Sized,
58+
{
59+
self.inner.get(key)
60+
}
61+
62+
/// Insert `key -> value`. Entries whose accounted size exceeds `max_bytes` are dropped silently.
63+
/// Otherwise LRU entries are evicted until the new entry fits both the byte budget and the entry-count cap.
64+
pub fn put(&mut self, key: K, value: V) {
65+
let entry_bytes = Self::entry_size(&key);
66+
67+
if entry_bytes > self.max_bytes {
68+
return;
69+
}
70+
71+
// Replacing an existing key: deduct its bytes first. This uses the new key's size, so it assumes equal keys have the same byte length.
72+
if self.inner.pop(&key).is_some() {
73+
self.current_bytes = self.current_bytes.saturating_sub(entry_bytes);
74+
}
75+
76+
while self.current_bytes + entry_bytes > self.max_bytes {
77+
match self.inner.pop_lru() {
78+
Some((evicted_key, _)) => {
79+
self.current_bytes = self
80+
.current_bytes
81+
.saturating_sub(Self::entry_size(&evicted_key));
82+
}
83+
None => break,
84+
}
85+
}
86+
87+
// `push` may evict an LRU entry to honor the entry-count cap; deduct its bytes.
88+
if let Some((replaced_key, _)) = self.inner.push(key, value) {
89+
self.current_bytes = self
90+
.current_bytes
91+
.saturating_sub(Self::entry_size(&replaced_key));
92+
}
93+
self.current_bytes += entry_bytes;
94+
}
95+
96+
#[cfg(test)]
97+
#[inline]
98+
pub fn current_bytes(&self) -> usize {
99+
self.current_bytes
100+
}
101+
102+
pub fn len(&self) -> usize {
103+
self.inner.len()
104+
}
105+
106+
fn entry_size(key: &K) -> usize {
107+
Self::PER_ENTRY_OVERHEAD + key.as_ref().len() + size_of::<V>()
108+
}
109+
110+
/// Approximate per-entry fixed heap overhead, in bytes. Covers:
111+
/// - `Vec<u8>` header on the key (24 B on 64-bit targets)
112+
/// - `lru` doubly-linked-list node (prev/next pointers, ~24 B)
113+
/// - `HashMap` bucket amortized (~16 B)
114+
///
115+
/// Rounded up so `max_bytes` is a pessimistic upper bound on actual heap usage. Recheck
116+
/// if the `lru` crate is upgraded across a major version — the linked-list node layout
117+
/// is the volatile piece.
118+
const PER_ENTRY_OVERHEAD: usize = 64;
119+
}
120+
121+
#[cfg(test)]
122+
mod tests {
123+
use super::*;
124+
125+
#[test]
126+
fn test_basic_put_and_get() {
127+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 1024);
128+
cache.put(b"hello".to_vec(), true);
129+
assert_eq!(cache.get(b"hello".as_ref()), Some(&true));
130+
assert_eq!(cache.len(), 1);
131+
}
132+
133+
#[test]
134+
fn test_evicts_lru_when_over_byte_budget() {
135+
// Each entry costs PER_ENTRY_OVERHEAD (64) + 4 (key) + 1 (bool) = 69 bytes. Budget
136+
// of 150 fits two entries (138 B) but not three.
137+
let budget = 150;
138+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, budget);
139+
cache.put(b"aaaa".to_vec(), true);
140+
cache.put(b"bbbb".to_vec(), false);
141+
assert_eq!(cache.len(), 2);
142+
cache.put(b"cccc".to_vec(), true);
143+
assert_eq!(cache.len(), 2);
144+
assert_eq!(cache.get(b"aaaa".as_ref()), None);
145+
assert_eq!(cache.get(b"bbbb".as_ref()), Some(&false));
146+
assert_eq!(cache.get(b"cccc".as_ref()), Some(&true));
147+
assert!(cache.current_bytes() <= budget);
148+
}
149+
150+
#[test]
151+
fn test_evicts_lru_when_over_entry_count() {
152+
// Generous byte budget; entry-count cap of 2 drives eviction.
153+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(2, 1024);
154+
cache.put(b"a".to_vec(), true);
155+
cache.put(b"b".to_vec(), false);
156+
cache.put(b"c".to_vec(), true);
157+
assert_eq!(cache.len(), 2);
158+
assert_eq!(cache.get(b"a".as_ref()), None);
159+
assert_eq!(cache.get(b"b".as_ref()), Some(&false));
160+
assert_eq!(cache.get(b"c".as_ref()), Some(&true));
161+
}
162+
163+
#[test]
164+
fn test_oversize_entry_is_rejected() {
165+
// Any entry costs at least PER_ENTRY_OVERHEAD bytes, so a 32-byte budget rejects
166+
// every insertion.
167+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 32);
168+
cache.put(b"small".to_vec(), true);
169+
assert_eq!(cache.len(), 0);
170+
assert_eq!(cache.current_bytes(), 0);
171+
}
172+
173+
#[test]
174+
fn test_replacing_key_does_not_double_count() {
175+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 1024);
176+
cache.put(b"k".to_vec(), true);
177+
let bytes_after_first = cache.current_bytes();
178+
cache.put(b"k".to_vec(), false);
179+
assert_eq!(cache.current_bytes(), bytes_after_first);
180+
assert_eq!(cache.get(b"k".as_ref()), Some(&false));
181+
assert_eq!(cache.len(), 1);
182+
}
183+
184+
#[test]
185+
fn test_get_bumps_recency() {
186+
// Budget for exactly two 4-byte-keyed entries (69 B each = 138 B total).
187+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(256, 150);
188+
cache.put(b"aaaa".to_vec(), true);
189+
cache.put(b"bbbb".to_vec(), true);
190+
let _ = cache.get(b"aaaa".as_ref());
191+
cache.put(b"cccc".to_vec(), true);
192+
assert_eq!(cache.get(b"aaaa".as_ref()), Some(&true));
193+
assert_eq!(cache.get(b"bbbb".as_ref()), None);
194+
}
195+
196+
#[test]
197+
fn test_many_inserts_stay_within_both_limits() {
198+
let max_entries = 8;
199+
// 8 entries * (PER_ENTRY_OVERHEAD 64 + 8-byte key + 1) = 584 bytes; round up.
200+
let max_bytes = 600;
201+
let mut cache: BoundedByteCache<Vec<u8>, bool> =
202+
BoundedByteCache::new(max_entries, max_bytes);
203+
for i in 0u16..1000 {
204+
cache.put(format!("key-{:04}", i).into_bytes(), i % 2 == 0);
205+
assert!(cache.current_bytes() <= max_bytes);
206+
assert!(cache.len() <= max_entries);
207+
}
208+
}
209+
210+
#[test]
211+
fn test_zero_entries_clamps_to_one() {
212+
let mut cache: BoundedByteCache<Vec<u8>, bool> = BoundedByteCache::new(0, 1024);
213+
cache.put(b"a".to_vec(), true);
214+
cache.put(b"b".to_vec(), false);
215+
assert_eq!(cache.len(), 1);
216+
assert_eq!(cache.get(b"a".as_ref()), None);
217+
assert_eq!(cache.get(b"b".as_ref()), Some(&false));
218+
}
219+
}

0 commit comments

Comments
 (0)