Skip to content

Commit e460e28

Browse files
jbachorikclaude
andcommitted
perf(profiling): eliminate allocations in OTLP encoder hot paths
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 60049bd commit e460e28

8 files changed

Lines changed: 377 additions & 261 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,4 @@ mise*.local.toml
9393

9494
# Exclude kotlin build files
9595
.kotlin
96+
.claude/state/

dd-java-agent/agent-profiling/profiling-otel/build.gradle.kts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@ jmh {
2727
"-XX:+DebugNonSafepoints"
2828
)
2929
}
30+
31+
// async-profiler CPU flamegraph
32+
// Usage: ./gradlew jmh -PjmhAsyncProfiler=/path/to/libasyncProfiler.so (libasyncProfiler.dylib on macOS)
33+
// Output: /tmp/jmh-async-profile.html
34+
if (project.hasProperty("jmhAsyncProfiler")) {
35+
val lib = project.property("jmhAsyncProfiler") as String
36+
jvmArgs = listOf(
37+
"-XX:+UnlockDiagnosticVMOptions",
38+
"-XX:+DebugNonSafepoints",
39+
"-agentpath:$lib=start,event=cpu,file=/tmp/jmh-async-profile.html,flamegraph"
40+
)
41+
}
3042
}
3143

3244
// OTLP validation tests removed - use profcheck validation instead (see validateOtlp task below)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
package com.datadog.profiling.otel.benchmark;
2+
3+
import static java.util.concurrent.TimeUnit.SECONDS;
4+
import static org.openjdk.jmh.annotations.Mode.Throughput;
5+
6+
import com.datadog.profiling.otel.JfrToOtlpConverter;
7+
import java.io.IOException;
8+
import java.nio.file.Path;
9+
import java.nio.file.Paths;
10+
import java.time.Instant;
11+
import org.openjdk.jmh.annotations.Benchmark;
12+
import org.openjdk.jmh.annotations.BenchmarkMode;
13+
import org.openjdk.jmh.annotations.Fork;
14+
import org.openjdk.jmh.annotations.Level;
15+
import org.openjdk.jmh.annotations.Measurement;
16+
import org.openjdk.jmh.annotations.OutputTimeUnit;
17+
import org.openjdk.jmh.annotations.Param;
18+
import org.openjdk.jmh.annotations.Scope;
19+
import org.openjdk.jmh.annotations.Setup;
20+
import org.openjdk.jmh.annotations.State;
21+
import org.openjdk.jmh.annotations.Warmup;
22+
import org.openjdk.jmh.infra.Blackhole;
23+
24+
/** Benchmarks JFR-to-OTLP conversion on real production JFR recordings. */
25+
@State(Scope.Benchmark)
26+
@BenchmarkMode(Throughput)
27+
@OutputTimeUnit(SECONDS)
28+
@Fork(value = 1)
29+
@Warmup(iterations = 3, time = 5)
30+
@Measurement(iterations = 5, time = 10)
31+
public class RealFileConversionBenchmark {
32+
33+
@Param({"/tmp/inventory-cache.jfr", "/tmp/sbom.jfr", "/tmp/otelp.jfr"})
34+
String jfrFilePath;
35+
36+
private Path jfrFile;
37+
private JfrToOtlpConverter converter;
38+
private Instant start;
39+
private Instant end;
40+
41+
@Setup(Level.Trial)
42+
public void setup() throws IOException {
43+
jfrFile = Paths.get(jfrFilePath);
44+
converter = new JfrToOtlpConverter();
45+
start = Instant.EPOCH;
46+
end = Instant.now();
47+
}
48+
49+
@Benchmark
50+
public void convertJfrToOtlp(Blackhole bh) throws IOException {
51+
byte[] result = converter.addFile(jfrFile, start, end).convert();
52+
bh.consume(result);
53+
converter.reset();
54+
}
55+
}

dd-java-agent/agent-profiling/profiling-otel/src/main/java/com/datadog/profiling/otel/JfrToOtlpConverter.java

Lines changed: 111 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,73 @@ public enum Kind {
6767
JSON_PRETTY
6868
}
6969

70+
/**
71+
* Open-addressing long→int hash map. Eliminates boxing overhead of {@code HashMap<Long,Integer>}
72+
* for the stack-trace and frame caches.
73+
*/
74+
private static final class LongIntMap {
75+
private static final long EMPTY = Long.MIN_VALUE;
76+
private static final int INITIAL_CAPACITY = 1024; // power of 2
77+
78+
private long[] keys;
79+
private int[] values;
80+
private int mask;
81+
82+
LongIntMap() {
83+
keys = new long[INITIAL_CAPACITY];
84+
values = new int[INITIAL_CAPACITY];
85+
mask = INITIAL_CAPACITY - 1;
86+
java.util.Arrays.fill(keys, EMPTY);
87+
}
88+
89+
int get(long key) {
90+
int slot = (int) (mix(key) & mask);
91+
while (keys[slot] != EMPTY) {
92+
if (keys[slot] == key) return values[slot];
93+
slot = (slot + 1) & mask;
94+
}
95+
return -1;
96+
}
97+
98+
void put(long key, int value) {
99+
if (size * 2 >= keys.length) rehash();
100+
int slot = (int) (mix(key) & mask);
101+
while (keys[slot] != EMPTY && keys[slot] != key) {
102+
slot = (slot + 1) & mask;
103+
}
104+
if (keys[slot] == EMPTY) size++;
105+
keys[slot] = key;
106+
values[slot] = value;
107+
}
108+
109+
void clear() {
110+
java.util.Arrays.fill(keys, EMPTY);
111+
size = 0;
112+
}
113+
114+
private int size;
115+
116+
private void rehash() {
117+
long[] oldKeys = keys;
118+
int[] oldValues = values;
119+
keys = new long[oldKeys.length * 2];
120+
values = new int[oldKeys.length * 2];
121+
mask = keys.length - 1;
122+
java.util.Arrays.fill(keys, EMPTY);
123+
size = 0;
124+
for (int i = 0; i < oldKeys.length; i++) {
125+
if (oldKeys[i] != EMPTY) put(oldKeys[i], oldValues[i]);
126+
}
127+
}
128+
129+
private static long mix(long key) {
130+
key ^= key >>> 33;
131+
key *= 0xff51afd7ed558ccdL;
132+
key ^= key >>> 33;
133+
return key;
134+
}
135+
}
136+
70137
private static final class PathEntry {
71138
final Path path;
72139
final boolean ephemeral;
@@ -109,8 +176,16 @@ public int hashCode() {
109176
private final AttributeTable attributeTable = new AttributeTable();
110177

111178
// Stack trace cache: maps (stackTraceId + chunkId) → stack index
112-
// This avoids redundant frame processing for duplicate stack traces
113-
private final java.util.Map<Long, Integer> stackTraceCache = new java.util.HashMap<>();
179+
private final LongIntMap stackTraceCache = new LongIntMap();
180+
181+
// Frame cache: maps (methodId + chunkId + lineNumber) → locationIndex
182+
private final LongIntMap frameCache = new LongIntMap();
183+
184+
// Per-type attribute index arrays, lazily initialized once per conversion session and shared
185+
// across all samples of the same type to avoid per-sample allocation.
186+
private int[] cpuAttrIndices;
187+
private int[] wallAttrIndices;
188+
private int[] lockAttrIndices;
114189

115190
// Sample collectors by profile type
116191
private final List<SampleData> cpuSamples = new ArrayList<>();
@@ -287,6 +362,10 @@ public void reset() {
287362
linkTable.reset();
288363
attributeTable.reset();
289364
stackTraceCache.clear();
365+
frameCache.clear();
366+
cpuAttrIndices = null;
367+
wallAttrIndices = null;
368+
lockAttrIndices = null;
290369
cpuSamples.clear();
291370
wallSamples.clear();
292371
allocSamples.clear();
@@ -372,8 +451,8 @@ private void handleExecutionSample(ExecutionSample event, Control ctl) {
372451
int linkIndex = extractLinkIndex(event.spanId(), event.localRootSpanId());
373452
long timestamp = convertTimestamp(event.startTime(), ctl);
374453

375-
int[] attributeIndices = new int[] {getSampleTypeAttributeIndex("cpu")};
376-
cpuSamples.add(new SampleData(stackIndex, linkIndex, 1, timestamp, attributeIndices));
454+
if (cpuAttrIndices == null) cpuAttrIndices = new int[] {getSampleTypeAttributeIndex("cpu")};
455+
cpuSamples.add(new SampleData(stackIndex, linkIndex, 1, timestamp, cpuAttrIndices));
377456
}
378457

379458
private void handleMethodSample(MethodSample event, Control ctl) {
@@ -384,8 +463,8 @@ private void handleMethodSample(MethodSample event, Control ctl) {
384463
int linkIndex = extractLinkIndex(event.spanId(), event.localRootSpanId());
385464
long timestamp = convertTimestamp(event.startTime(), ctl);
386465

387-
int[] attributeIndices = new int[] {getSampleTypeAttributeIndex("wall")};
388-
wallSamples.add(new SampleData(stackIndex, linkIndex, 1, timestamp, attributeIndices));
466+
if (wallAttrIndices == null) wallAttrIndices = new int[] {getSampleTypeAttributeIndex("wall")};
467+
wallSamples.add(new SampleData(stackIndex, linkIndex, 1, timestamp, wallAttrIndices));
389468
}
390469

391470
private void handleObjectSample(ObjectSample event, Control ctl) {
@@ -446,8 +525,8 @@ private void handleMonitorEnter(JavaMonitorEnter event, Control ctl) {
446525
long timestamp = convertTimestamp(event.startTime(), ctl);
447526
long durationNanos = ctl.chunkInfo().asDuration(event.duration()).toNanos();
448527

449-
int[] attributeIndices = new int[] {getSampleTypeAttributeIndex("lock-contention")};
450-
lockSamples.add(new SampleData(stackIndex, 0, durationNanos, timestamp, attributeIndices));
528+
if (lockAttrIndices == null) lockAttrIndices = new int[] {getSampleTypeAttributeIndex("lock-contention")};
529+
lockSamples.add(new SampleData(stackIndex, 0, durationNanos, timestamp, lockAttrIndices));
451530
}
452531

453532
private void handleMonitorWait(JavaMonitorWait event, Control ctl) {
@@ -458,8 +537,8 @@ private void handleMonitorWait(JavaMonitorWait event, Control ctl) {
458537
long timestamp = convertTimestamp(event.startTime(), ctl);
459538
long durationNanos = ctl.chunkInfo().asDuration(event.duration()).toNanos();
460539

461-
int[] attributeIndices = new int[] {getSampleTypeAttributeIndex("lock-contention")};
462-
lockSamples.add(new SampleData(stackIndex, 0, durationNanos, timestamp, attributeIndices));
540+
if (lockAttrIndices == null) lockAttrIndices = new int[] {getSampleTypeAttributeIndex("lock-contention")};
541+
lockSamples.add(new SampleData(stackIndex, 0, durationNanos, timestamp, lockAttrIndices));
463542
}
464543

465544
private JfrStackTrace safeGetStackTrace(java.util.function.Supplier<JfrStackTrace> supplier) {
@@ -479,8 +558,8 @@ private int convertStackTrace(
479558
long cacheKey = stackTraceId ^ ((long) System.identityHashCode(ctl.chunkInfo()) << 32);
480559

481560
// Check cache first - avoid resolving stack trace if cached
482-
Integer cachedIndex = stackTraceCache.get(cacheKey);
483-
if (cachedIndex != null) {
561+
int cachedIndex = stackTraceCache.get(cacheKey);
562+
if (cachedIndex != -1) {
484563
return cachedIndex;
485564
}
486565

@@ -499,15 +578,15 @@ private int convertStackTrace(
499578

500579
int[] locationIndices = new int[frames.length];
501580
for (int i = 0; i < frames.length; i++) {
502-
locationIndices[i] = convertFrame(frames[i]);
581+
locationIndices[i] = convertFrame(frames[i], ctl);
503582
}
504583

505584
int stackIndex = stackTable.intern(locationIndices);
506585
stackTraceCache.put(cacheKey, stackIndex);
507586
return stackIndex;
508587
}
509588

510-
private int convertFrame(JfrStackFrame frame) {
589+
private int convertFrame(JfrStackFrame frame, Control ctl) {
511590
if (frame == null) {
512591
return 0;
513592
}
@@ -517,33 +596,40 @@ private int convertFrame(JfrStackFrame frame) {
517596
return 0;
518597
}
519598

520-
// Get class and method names
599+
int lineNumber = frame.lineNumber();
600+
long methodId = frame.methodId();
601+
602+
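// Cache key mirrors the stackTraceCache pattern: methodId is tagged with the chunk's identity hash
603+
// so per-chunk CP indices are kept apart across chunks. NOTE(review): the XOR-combined key is lossy; distinct (methodId, chunk, lineNumber) triples can map to the same key and return the wrong location — confirm this is acceptable.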
604+
long cacheKey =
605+
methodId ^ ((long) System.identityHashCode(ctl.chunkInfo()) << 32) ^ (lineNumber * 1000003L);
606+
int cached = frameCache.get(cacheKey);
607+
if (cached != -1) {
608+
return cached;
609+
}
610+
611+
// Cache miss — full processing
521612
String methodName = method.name();
522613
JfrClass type = method.type();
523614
String className = type != null ? type.name() : null;
524615

525-
// Get line number
526-
int lineNumber = frame.lineNumber();
527616
long line = Math.max(lineNumber, 0);
528617

529-
// Build full name
530618
String fullName;
531619
if (className != null && !className.isEmpty()) {
532620
fullName = className + "." + (methodName != null ? methodName : "");
533621
} else {
534622
fullName = methodName != null ? methodName : "";
535623
}
536624

537-
// Intern strings
538625
int nameIndex = stringTable.intern(fullName);
539626
int classNameIndex = stringTable.intern(className);
540627
int methodNameIndex = stringTable.intern(methodName);
541-
542-
// Intern function
543628
int functionIndex = functionTable.intern(nameIndex, methodNameIndex, classNameIndex, 0);
629+
int locationIndex = locationTable.intern(0, 0, functionIndex, line, 0);
544630

545-
// Create location entry
546-
return locationTable.intern(0, 0, functionIndex, line, 0);
631+
frameCache.put(cacheKey, locationIndex);
632+
return locationIndex;
547633
}
548634

549635
private int extractLinkIndex(long spanId, long localRootSpanId) {
@@ -725,12 +811,11 @@ private void encodeSample(ProtobufEncoder encoder, SampleData sample) {
725811
encoder.writeVarintField(OtlpProtoFields.Sample.LINK_INDEX, sample.linkIndex);
726812

727813
// Field 4: values (packed)
728-
encoder.writePackedVarintField(OtlpProtoFields.Sample.VALUES, new long[] {sample.value});
814+
encoder.writePackedVarintField(OtlpProtoFields.Sample.VALUES, sample.value);
729815

730816
// Field 5: timestamps_unix_nano (packed)
731817
if (sample.timestampNanos > 0) {
732-
encoder.writePackedFixed64Field(
733-
OtlpProtoFields.Sample.TIMESTAMPS_UNIX_NANO, new long[] {sample.timestampNanos});
818+
encoder.writePackedFixed64Field(OtlpProtoFields.Sample.TIMESTAMPS_UNIX_NANO, sample.timestampNanos);
734819
}
735820
}
736821

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,15 @@
11
package com.datadog.profiling.otel.jfr;
22

3+
import io.jafar.parser.api.JfrField;
34
import io.jafar.parser.api.JfrType;
45

56
/** Represents a JFR stack frame. */
67
@JfrType("jdk.types.StackFrame")
78
public interface JfrStackFrame {
89
JfrMethod method();
910

11+
@JfrField(value = "method", raw = true)
12+
long methodId();
13+
1014
int lineNumber();
1115
}

0 commit comments

Comments
 (0)