-
Notifications
You must be signed in to change notification settings - Fork 35
Expand file tree
/
Copy pathRunMetrics.java
More file actions
134 lines (119 loc) · 6.4 KB
/
Copy pathRunMetrics.java
File metadata and controls
134 lines (119 loc) · 6.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package org.beehive.gpullama3.auxiliary;
/**
 * Singleton that accumulates fine-grained performance metrics across one inference run.
 *
 * <p>Metrics are set incrementally by different layers of the stack:</p>
 * <ul>
 *   <li>{@link #setLoadDuration} - called from {@code LlamaApp} around model file loading</li>
 *   <li>{@link #setTornadoMetrics} - called from TornadoVM plan constructors</li>
 *   <li>{@link #setInferenceMetrics} - called from InferenceEngine variants at end of generation</li>
 *   <li>{@link #setHasPrefillPhase} - called from prefill-decode engine variants</li>
 * </ul>
 *
 * <p>All durations are stored in nanoseconds. {@link #printMetrics()} writes a report to
 * {@code stderr}:</p>
 * <ul>
 *   <li>Standard engine: one line,
 *       {@code achieved tok/s: X. Tokens: N, seconds: S}</li>
 *   <li>Prefill-decode engines: three lines ({@code Total}, {@code Prefill}, {@code Decode}),
 *       each of the form {@code <Phase> achieved tok/s: X. Tokens: N, seconds: S}</li>
 *   <li>Additionally, when the system property {@code llama.EnableTimingForTornadoVMInit} is
 *       {@code true} and a non-zero plan-creation time was recorded, four lines with the
 *       model-load and TornadoVM initialisation timings in milliseconds</li>
 * </ul>
 *
 * <p>The setters simply overwrite the previously stored values; this class performs no
 * synchronization of its own.</p>
 */
public final class RunMetrics {

    // -- Unit conversion ------------------------------------------------------
    private static final double NANOS_PER_SECOND = 1e9;
    private static final double NANOS_PER_MILLI = 1_000_000.0;

    /** System property that enables the TornadoVM initialisation section of the report. */
    private static final String TORNADO_INIT_TIMING_PROPERTY = "llama.EnableTimingForTornadoVMInit";

    // -- Core metrics (nanoseconds) -------------------------------------------
    private long totalDurationNs;
    private long loadDurationNs;
    private int promptEvalCount;
    private long promptEvalDurationNs;
    private int evalCount;
    private long evalDurationNs;
    private boolean hasPrefillPhase;

    // -- TornadoVM-specific metrics (nanoseconds) -----------------------------
    private long tornadoPlanCreationNs;
    private long tornadoJitNs;
    private long readOnlyWeightsCopyInNs;

    // -- Singleton ------------------------------------------------------------
    private static final RunMetrics INSTANCE = new RunMetrics();

    private RunMetrics() {}

    // -- Setters --------------------------------------------------------------

    /**
     * Records the time spent loading the model file (not including TornadoVM initialisation).
     *
     * @param ns load duration in nanoseconds
     */
    public static void setLoadDuration(long ns) {
        INSTANCE.loadDurationNs = ns;
    }

    /**
     * Records TornadoVM-specific initialisation durations.
     *
     * @param planCreationNs task-graph construction ({@code createExecutionPlan()})
     * @param jitNs          JIT compilation ({@code withPreCompilation()})
     * @param weightCopyNs   first-execution weight upload ({@code forceCopyInReadOnlyData()})
     */
    public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
        INSTANCE.tornadoPlanCreationNs = planCreationNs;
        INSTANCE.tornadoJitNs = jitNs;
        INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
    }

    /**
     * Records inference-phase durations at the end of a generation run.
     *
     * @param promptCount    number of prompt tokens processed (prefill)
     * @param prefillNs      wall-clock time spent in the prefill phase
     * @param generatedCount number of tokens generated (decode)
     * @param decodeNs       wall-clock time spent in the decode phase
     * @param totalNs        total wall-clock time for the full inference call
     */
    public static void setInferenceMetrics(int promptCount, long prefillNs,
                                           int generatedCount, long decodeNs,
                                           long totalNs) {
        INSTANCE.promptEvalCount = promptCount;
        INSTANCE.promptEvalDurationNs = prefillNs;
        INSTANCE.evalCount = generatedCount;
        INSTANCE.evalDurationNs = decodeNs;
        INSTANCE.totalDurationNs = totalNs;
    }

    /**
     * Signals that prefill and decode are distinct timed phases.
     * Called by {@code InferenceEngineWithPrefillDecode} and
     * {@code InferenceEngineWithBatchPrefillDecode} before returning.
     *
     * @param value {@code true} to make {@link #printMetrics()} emit separate prefill and
     *              decode lines in addition to the total
     */
    public static void setHasPrefillPhase(boolean value) {
        INSTANCE.hasPrefillPhase = value;
    }

    // -- Output ---------------------------------------------------------------

    /** Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled. */
    public static void printMetrics() {
        RunMetrics m = INSTANCE;

        int totalTokens = m.promptEvalCount + m.evalCount;
        double totalSecs = m.totalDurationNs / NANOS_PER_SECOND;
        double totalRate = tokensPerSecond(totalTokens, totalSecs);

        System.err.println();
        System.err.println("==== Performance Metrics ====");

        if (m.hasPrefillPhase) {
            double prefillSecs = m.promptEvalDurationNs / NANOS_PER_SECOND;
            double decodeSecs = m.evalDurationNs / NANOS_PER_SECOND;
            double prefillRate = tokensPerSecond(m.promptEvalCount, prefillSecs);
            double decodeRate = tokensPerSecond(m.evalCount, decodeSecs);

            // Each label starts at the beginning of its own line so the report stays grep-able.
            System.err.printf(
                    "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
                            + "Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
                            + "Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs,
                    prefillRate, m.promptEvalCount, prefillSecs,
                    decodeRate, m.evalCount, decodeSecs);
        } else {
            System.err.printf("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs);
        }

        // The init section is opt-in and is skipped when no plan-creation time was recorded.
        boolean initTimingEnabled =
                Boolean.parseBoolean(System.getProperty(TORNADO_INIT_TIMING_PROPERTY, "false"));
        if (initTimingEnabled && m.tornadoPlanCreationNs > 0) {
            // NOTE(review): "Compilation & CodeGen" is fed by the plan-creation time and
            // "Warmup" by the JIT time; the labels are kept unchanged here - confirm that
            // this mapping is the intended one.
            System.err.printf(
                    "GGUF Model Load: %.2f ms%n"
                            + "Compilation & CodeGen: %.2f ms%n"
                            + "Warmup: %.2f ms%n"
                            + "Read-only weights Copy-in: %.2f ms%n",
                    m.loadDurationNs / NANOS_PER_MILLI,
                    m.tornadoPlanCreationNs / NANOS_PER_MILLI,
                    m.tornadoJitNs / NANOS_PER_MILLI,
                    m.readOnlyWeightsCopyInNs / NANOS_PER_MILLI);
        }

        System.err.println();
    }

    /**
     * Computes a throughput figure, returning {@code 0} instead of dividing when either the
     * token count or the elapsed time is not positive.
     *
     * @param tokens  number of tokens processed
     * @param seconds elapsed time in seconds
     * @return tokens per second, or {@code 0} when no meaningful rate can be computed
     */
    private static double tokensPerSecond(int tokens, double seconds) {
        return (tokens > 0 && seconds > 0) ? tokens / seconds : 0;
    }
}