|
1 | 1 | package org.beehive.gpullama3.auxiliary; |
2 | 2 |
|
| 3 | +import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer; |
| 4 | +import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer; |
| 5 | +import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer; |
| 6 | +import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer; |
| 7 | +import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot; |
| 8 | + |
| 9 | +import java.io.IOException; |
| 10 | +import java.io.UncheckedIOException; |
| 11 | +import java.nio.file.Files; |
| 12 | +import java.nio.file.Path; |
| 13 | + |
3 | 14 | /** |
4 | 15 | * Singleton that accumulates fine-grained performance metrics across one inference run. |
5 | 16 | * |
6 | 17 | * <p>Metrics are set incrementally by different layers of the stack:</p> |
7 | 18 | * <ul> |
8 | | - * <li>{@link #setLoadDuration} — called from {@code LlamaApp} around model file loading</li> |
| 19 | + * <li>{@link #setLoadDuration} — called from {@code ModelLoader}</li> |
9 | 20 | * <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li> |
10 | 21 | * <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li> |
11 | 22 | * <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li> |
12 | 23 | * </ul> |
13 | 24 | * |
14 | | - * <p>All durations are stored in nanoseconds. {@link #printMetrics()} prints throughput only:</p> |
| 25 | + * <p>All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable |
| 26 | + * {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.</p> |
| 27 | + * |
| 28 | + * <p>Configurable via system properties:</p> |
15 | 29 | * <ul> |
16 | | - * <li>Standard engine: {@code Total: X tok/s}</li> |
17 | | - * <li>Prefill-decode engines: {@code Prefill: X tok/s | Decode: Y tok/s | Total: Z tok/s}</li> |
| 30 | + * <li>{@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}</li> |
| 31 | + * <li>{@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}</li> |
| 32 | + * <li>{@code llama.metrics.file} — target path when {@code output=file}</li> |
18 | 33 | * </ul> |
19 | 34 | */ |
20 | 35 | public final class RunMetrics { |
21 | 36 |
|
22 | 37 | // ── Core metrics (nanoseconds) ──────────────────────────────────────────── |
23 | | - private long totalDurationNs; |
24 | | - private long loadDurationNs; |
25 | | - private int promptEvalCount; |
26 | | - private long promptEvalDurationNs; |
27 | | - private int evalCount; |
28 | | - private long evalDurationNs; |
| 38 | + private long totalDurationNs; |
| 39 | + private long loadDurationNs; |
| 40 | + private int promptEvalCount; |
| 41 | + private long promptEvalDurationNs; |
| 42 | + private int evalCount; |
| 43 | + private long evalDurationNs; |
29 | 44 | private boolean hasPrefillPhase; |
30 | 45 |
|
31 | 46 | // ── TornadoVM-specific metrics (nanoseconds) ────────────────────────────── |
@@ -53,9 +68,9 @@ public static void setLoadDuration(long ns) { |
53 | 68 | * @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()}) |
54 | 69 | */ |
55 | 70 | public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) { // stores the three TornadoVM timings (ns) on INSTANCE, replacing earlier values |
56 | | - INSTANCE.tornadoPlanCreationNs = planCreationNs; |
57 | | - INSTANCE.tornadoJitNs = jitNs; |
58 | | - INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs; |
| 71 | + INSTANCE.tornadoPlanCreationNs = planCreationNs; |
| 72 | + INSTANCE.tornadoJitNs = jitNs; |
| 73 | + INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs; |
59 | 74 | } |
60 | 75 |
|
61 | 76 | /** |
@@ -86,49 +101,57 @@ public static void setHasPrefillPhase(boolean value) { |
86 | 101 | INSTANCE.hasPrefillPhase = value; |
87 | 102 | } |
88 | 103 |
|
| 104 | + // ── Snapshot ────────────────────────────────────────────────────────────── |
| 105 | + |
| 106 | + /** Captures every metric recorded so far in a new immutable {@link RunMetricsSnapshot}. */ |
| 107 | + public static RunMetricsSnapshot snapshot() { |
| 108 | + final RunMetrics current = INSTANCE; |
| 109 | + return RunMetricsSnapshot.of( |
| 110 | + current.totalDurationNs, current.loadDurationNs, |
| 111 | + current.promptEvalCount, current.promptEvalDurationNs, |
| 112 | + current.evalCount, current.evalDurationNs, |
| 113 | + current.hasPrefillPhase, |
| 114 | + current.tornadoPlanCreationNs, current.tornadoJitNs, |
| 115 | + current.readOnlyWeightsCopyInNs); |
| 116 | + } |
| 117 | + |
89 | 118 | // ── Output ──────────────────────────────────────────────────────────────── |
90 | 119 |
|
91 | | - /** Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled. */ |
| 120 | + /** |
| 121 | + * Builds a snapshot, renders it as selected by {@code llama.metrics.format} (unknown values: human), and writes it to the |
| 122 | + * {@code llama.metrics.output} sink (unknown values: stderr). Values are lower-cased with {@code Locale.ROOT} so matching ignores the default locale. |
| 123 | + */ |
92 | 124 | public static void printMetrics() { |
93 | | - RunMetrics m = INSTANCE; |
| 125 | + RunMetricsSnapshot snap = snapshot(); |
| 126 | + |
| 127 | + MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase(java.util.Locale.ROOT)) { |
| 128 | + case "json" -> new JsonMetricsRenderer(); |
| 129 | + case "github" -> new GitHubMetricsRenderer(); |
| 130 | + default -> new HumanMetricsRenderer(); |
| 131 | + }; |
94 | 132 |
|
95 | | - int totalTokens = m.promptEvalCount + m.evalCount; |
96 | | - double totalSecs = m.totalDurationNs / 1e9; |
97 | | - double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0; |
98 | | - |
99 | | - System.err.println(); |
100 | | - System.err.println("==== Performance Metrics ===="); |
101 | | - if (m.hasPrefillPhase) { |
102 | | - double prefillSecs = m.promptEvalDurationNs / 1e9; |
103 | | - double decodeSecs = m.evalDurationNs / 1e9; |
104 | | - double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0) |
105 | | - ? m.promptEvalCount / prefillSecs : 0; |
106 | | - double decodeRate = (decodeSecs > 0 && m.evalCount > 0) |
107 | | - ? m.evalCount / decodeSecs : 0; |
108 | | - System.err.printf( |
109 | | - "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + |
110 | | - "¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" + |
111 | | - "¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", |
112 | | - totalRate, totalTokens, totalSecs, |
113 | | - prefillRate, m.promptEvalCount, prefillSecs, |
114 | | - decodeRate, m.evalCount, decodeSecs); |
115 | | - } else { |
116 | | - System.err.printf("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n", |
117 | | - totalRate, totalTokens, totalSecs); |
| 133 | + String rendered = renderer.render(snap); |
| 134 | + |
| 135 | + switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase(java.util.Locale.ROOT)) { |
| 136 | + case "stdout" -> System.out.print(rendered); |
| 137 | + case "file" -> writeToFile(rendered); |
| 138 | + default -> System.err.print(rendered); |
118 | 139 | } |
| 140 | + } |
119 | 141 |
|
120 | | - if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false")) |
121 | | - && m.tornadoPlanCreationNs > 0) { |
122 | | - System.err.printf( |
123 | | - "GGUF Model Load: %.2f ms%n" + |
124 | | - "Compilation & CodeGen: %.2f ms%n" + |
125 | | - "Warmup: %.2f ms%n" + |
126 | | - "Read-only weights Copy-in: %.2f ms%n", |
127 | | - m.loadDurationNs / 1_000_000.0, |
128 | | - m.tornadoPlanCreationNs / 1_000_000.0, |
129 | | - m.tornadoJitNs / 1_000_000.0, |
130 | | - m.readOnlyWeightsCopyInNs / 1_000_000.0); |
| 142 | + /** Writes {@code content} to the path in {@code llama.metrics.file}, creating its parent directories when it has one. */ |
| 143 | + private static void writeToFile(String content) { |
| 144 | + final String configuredPath = System.getProperty("llama.metrics.file"); |
| 145 | + if (configuredPath == null || configuredPath.isBlank()) { |
| 146 | + throw new IllegalStateException("llama.metrics.output=file requires llama.metrics.file to be set"); |
| 147 | + } |
| 148 | + final Path target = Path.of(configuredPath); |
| 149 | + try { |
| 150 | + final Path directory = target.getParent(); |
| 151 | + if (directory != null) Files.createDirectories(directory); |
| 152 | + Files.writeString(target, content); |
| 153 | + } catch (IOException failure) { |
| 154 | + throw new UncheckedIOException("Failed to write metrics to " + configuredPath, failure); |
131 | 155 | } |
132 | | - System.err.println(); |
133 | 156 | } |
134 | 157 | } |
0 commit comments