Skip to content

Commit e966ed1

Browse files
Add RunMetricsRenderer abstraction with support for human-readable, JSON, and GitHub Step Summary formats
1 parent 964927b commit e966ed1

6 files changed

Lines changed: 297 additions & 50 deletions

File tree

Lines changed: 73 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,46 @@
11
package org.beehive.gpullama3.auxiliary;
22

3+
import org.beehive.gpullama3.auxiliary.metrics.GitHubMetricsRenderer;
4+
import org.beehive.gpullama3.auxiliary.metrics.HumanMetricsRenderer;
5+
import org.beehive.gpullama3.auxiliary.metrics.JsonMetricsRenderer;
6+
import org.beehive.gpullama3.auxiliary.metrics.MetricsRenderer;
7+
import org.beehive.gpullama3.auxiliary.metrics.RunMetricsSnapshot;
8+
9+
import java.io.IOException;
10+
import java.io.UncheckedIOException;
11+
import java.nio.file.Files;
12+
import java.nio.file.Path;
13+
314
/**
415
* Singleton that accumulates fine-grained performance metrics across one inference run.
516
*
617
* <p>Metrics are set incrementally by different layers of the stack:</p>
718
* <ul>
8-
* <li>{@link #setLoadDuration} — called from {@code LlamaApp} around model file loading</li>
19+
* <li>{@link #setLoadDuration} — called from {@code ModelLoader}</li>
920
* <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li>
1021
* <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li>
1122
* <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li>
1223
* </ul>
1324
*
14-
* <p>All durations are stored in nanoseconds. {@link #printMetrics()} prints throughput only:</p>
25+
* <p>All durations are stored in nanoseconds. {@link #printMetrics()} builds an immutable
26+
* {@link RunMetricsSnapshot}, selects a {@link MetricsRenderer}, and writes to the configured sink.</p>
27+
*
28+
* <p>Configurable via system properties:</p>
1529
* <ul>
16-
* <li>Standard engine: {@code Total: X tok/s}</li>
17-
* <li>Prefill-decode engines: {@code Prefill: X tok/s | Decode: Y tok/s | Total: Z tok/s}</li>
30+
* <li>{@code llama.metrics.format} — {@code human} (default) | {@code json} | {@code github}</li>
31+
* <li>{@code llama.metrics.output} — {@code stderr} (default) | {@code stdout} | {@code file}</li>
32+
* <li>{@code llama.metrics.file} — target path when {@code output=file}</li>
1833
* </ul>
1934
*/
2035
public final class RunMetrics {
2136

2237
// ── Core metrics (nanoseconds) ────────────────────────────────────────────
23-
private long totalDurationNs;
24-
private long loadDurationNs;
25-
private int promptEvalCount;
26-
private long promptEvalDurationNs;
27-
private int evalCount;
28-
private long evalDurationNs;
38+
private long totalDurationNs;
39+
private long loadDurationNs;
40+
private int promptEvalCount;
41+
private long promptEvalDurationNs;
42+
private int evalCount;
43+
private long evalDurationNs;
2944
private boolean hasPrefillPhase;
3045

3146
// ── TornadoVM-specific metrics (nanoseconds) ──────────────────────────────
@@ -53,9 +68,9 @@ public static void setLoadDuration(long ns) {
5368
* @param weightCopyNs first-execution weight upload ({@code forceCopyInReadOnlyData()})
5469
*/
5570
public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
56-
INSTANCE.tornadoPlanCreationNs = planCreationNs;
57-
INSTANCE.tornadoJitNs = jitNs;
58-
INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
71+
INSTANCE.tornadoPlanCreationNs = planCreationNs;
72+
INSTANCE.tornadoJitNs = jitNs;
73+
INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
5974
}
6075

6176
/**
@@ -86,49 +101,57 @@ public static void setHasPrefillPhase(boolean value) {
86101
INSTANCE.hasPrefillPhase = value;
87102
}
88103

104+
// ── Snapshot ──────────────────────────────────────────────────────────────
105+
106+
/** Returns an immutable snapshot of all currently collected metrics. */
107+
public static RunMetricsSnapshot snapshot() {
108+
RunMetrics m = INSTANCE;
109+
return RunMetricsSnapshot.of(
110+
m.totalDurationNs, m.loadDurationNs,
111+
m.promptEvalCount, m.promptEvalDurationNs,
112+
m.evalCount, m.evalDurationNs,
113+
m.hasPrefillPhase,
114+
m.tornadoPlanCreationNs, m.tornadoJitNs,
115+
m.readOnlyWeightsCopyInNs);
116+
}
117+
89118
// ── Output ────────────────────────────────────────────────────────────────
90119

91-
/** Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled. */
120+
/**
121+
* Builds a snapshot, selects a renderer based on {@code llama.metrics.format},
122+
* and writes the result to the sink configured by {@code llama.metrics.output}.
123+
*/
92124
public static void printMetrics() {
93-
RunMetrics m = INSTANCE;
125+
RunMetricsSnapshot snap = snapshot();
126+
127+
MetricsRenderer renderer = switch (System.getProperty("llama.metrics.format", "human").toLowerCase()) {
128+
case "json" -> new JsonMetricsRenderer();
129+
case "github" -> new GitHubMetricsRenderer();
130+
default -> new HumanMetricsRenderer();
131+
};
94132

95-
int totalTokens = m.promptEvalCount + m.evalCount;
96-
double totalSecs = m.totalDurationNs / 1e9;
97-
double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0;
98-
99-
System.err.println();
100-
System.err.println("==== Performance Metrics ====");
101-
if (m.hasPrefillPhase) {
102-
double prefillSecs = m.promptEvalDurationNs / 1e9;
103-
double decodeSecs = m.evalDurationNs / 1e9;
104-
double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0)
105-
? m.promptEvalCount / prefillSecs : 0;
106-
double decodeRate = (decodeSecs > 0 && m.evalCount > 0)
107-
? m.evalCount / decodeSecs : 0;
108-
System.err.printf(
109-
"Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
110-
"¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
111-
"¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
112-
totalRate, totalTokens, totalSecs,
113-
prefillRate, m.promptEvalCount, prefillSecs,
114-
decodeRate, m.evalCount, decodeSecs);
115-
} else {
116-
System.err.printf("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
117-
totalRate, totalTokens, totalSecs);
133+
String rendered = renderer.render(snap);
134+
135+
switch (System.getProperty("llama.metrics.output", "stderr").toLowerCase()) {
136+
case "stdout" -> System.out.print(rendered);
137+
case "file" -> writeToFile(rendered);
138+
default -> System.err.print(rendered);
118139
}
140+
}
119141

120-
if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"))
121-
&& m.tornadoPlanCreationNs > 0) {
122-
System.err.printf(
123-
"GGUF Model Load: %.2f ms%n" +
124-
"Compilation & CodeGen: %.2f ms%n" +
125-
"Warmup: %.2f ms%n" +
126-
"Read-only weights Copy-in: %.2f ms%n",
127-
m.loadDurationNs / 1_000_000.0,
128-
m.tornadoPlanCreationNs / 1_000_000.0,
129-
m.tornadoJitNs / 1_000_000.0,
130-
m.readOnlyWeightsCopyInNs / 1_000_000.0);
142+
private static void writeToFile(String content) {
143+
String filePath = System.getProperty("llama.metrics.file");
144+
if (filePath == null || filePath.isBlank()) {
145+
throw new IllegalStateException(
146+
"llama.metrics.output=file requires llama.metrics.file to be set");
147+
}
148+
Path path = Path.of(filePath);
149+
try {
150+
Path parent = path.getParent();
151+
if (parent != null) Files.createDirectories(parent);
152+
Files.writeString(path, content);
153+
} catch (IOException e) {
154+
throw new UncheckedIOException("Failed to write metrics to " + filePath, e);
131155
}
132-
System.err.println();
133156
}
134157
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
/**
4+
* Renders metrics as a Markdown table suitable for appending to {@code $GITHUB_STEP_SUMMARY}.
5+
* TornadoVM rows (compile, JIT, weight copy-in) are included only when plan-creation duration
6+
* is non-zero, i.e. on GPU runs.
7+
*
8+
* <p>Enable via system properties and append the output file to the step summary:</p>
9+
* <pre>
10+
* -Dllama.metrics.format=github
11+
* -Dllama.metrics.output=file
12+
* -Dllama.metrics.file=/tmp/metrics.md
13+
* </pre>
14+
*
15+
* <p>In a GitHub Actions workflow step:</p>
16+
* <pre>
17+
* cat /tmp/metrics.md >> $GITHUB_STEP_SUMMARY
18+
* </pre>
19+
*/
20+
public final class GitHubMetricsRenderer implements MetricsRenderer {
21+
22+
@Override
23+
public String render(RunMetricsSnapshot s) {
24+
StringBuilder sb = new StringBuilder();
25+
sb.append("| metric | value |\n");
26+
sb.append("|---|---:|\n");
27+
sb.append(String.format("| eval tok/s | %.2f |%n", s.evalRate()));
28+
sb.append(String.format("| prompt eval tok/s | %.2f |%n", s.promptEvalRate()));
29+
sb.append(String.format("| total tok/s | %.2f |%n", s.totalRate()));
30+
sb.append(String.format("| load ms | %.2f |%n", s.loadDuration() / 1_000_000.0));
31+
sb.append(String.format("| eval tokens | %d |%n", s.evalCount()));
32+
sb.append(String.format("| prompt tokens | %d |%n", s.promptEvalCount()));
33+
sb.append(String.format("| total tokens | %d |%n", s.totalCount()));
34+
if (s.tornadoPlanCreationDuration() > 0) {
35+
sb.append(String.format("| compile ms | %.2f |%n",
36+
s.tornadoPlanCreationDuration() / 1_000_000.0));
37+
sb.append(String.format("| jit ms | %.2f |%n",
38+
s.tornadoJitDuration() / 1_000_000.0));
39+
sb.append(String.format("| weight copy-in ms | %.2f |%n",
40+
s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
41+
}
42+
return sb.toString();
43+
}
44+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
/**
4+
* Renders metrics in human-readable format to {@code stderr}.
5+
*
6+
* <p>This is the default renderer — no configuration needed. To enable explicitly:</p>
7+
* <pre>
8+
* -Dllama.metrics.format=human (default, can be omitted)
9+
* -Dllama.metrics.output=stderr (default, can be omitted)
10+
* </pre>
11+
*
12+
* <p>To also print TornadoVM initialisation timings (plan creation, JIT, weight copy-in),
13+
* additionally set:</p>
14+
* <pre>
15+
* -Dllama.EnableTimingForTornadoVMInit=true
16+
* </pre>
17+
*/
18+
public final class HumanMetricsRenderer implements MetricsRenderer {
19+
20+
@Override
21+
public String render(RunMetricsSnapshot s) {
22+
StringBuilder sb = new StringBuilder();
23+
sb.append("\n==== Performance Metrics ====\n");
24+
25+
if (s.hasPrefillPhase()) {
26+
sb.append(String.format(
27+
"Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
28+
"¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
29+
"¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
30+
s.totalRate(), s.totalCount(), s.totalDuration() / 1e9,
31+
s.promptEvalRate(), s.promptEvalCount(), s.promptEvalDuration() / 1e9,
32+
s.evalRate(), s.evalCount(), s.evalDuration() / 1e9));
33+
} else {
34+
sb.append(String.format("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
35+
s.totalRate(), s.totalCount(), s.totalDuration() / 1e9));
36+
}
37+
38+
if (Boolean.parseBoolean(System.getProperty("llama.EnableTimingForTornadoVMInit", "false"))
39+
&& s.tornadoPlanCreationDuration() > 0) {
40+
sb.append(String.format(
41+
"GGUF Model Load: %.2f ms%n" +
42+
"Compilation & CodeGen: %.2f ms%n" +
43+
"Warmup: %.2f ms%n" +
44+
"Read-only weights Copy-in: %.2f ms%n",
45+
s.loadDuration() / 1_000_000.0,
46+
s.tornadoPlanCreationDuration() / 1_000_000.0,
47+
s.tornadoJitDuration() / 1_000_000.0,
48+
s.tornadoReadOnlyWeightsCopyInDuration() / 1_000_000.0));
49+
}
50+
51+
return sb.toString();
52+
}
53+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
import java.util.ArrayList;
4+
import java.util.List;
5+
6+
/**
7+
* Renders metrics as an Ollama-compatible JSON object.
8+
* All duration fields are in nanoseconds; rate fields are in tokens per second.
9+
* The {@code tornadovm} nested object is always included (fields are zero on CPU runs).
10+
*
11+
* <p>Enable via system properties:</p>
12+
* <pre>
13+
* -Dllama.metrics.format=json
14+
* -Dllama.metrics.output=stdout # pipe to jq or another tool
15+
* </pre>
16+
*
17+
* <p>Or write to a file:</p>
18+
* <pre>
19+
* -Dllama.metrics.format=json
20+
* -Dllama.metrics.output=file
21+
* -Dllama.metrics.file=/path/to/metrics.json
22+
* </pre>
23+
*/
24+
public final class JsonMetricsRenderer implements MetricsRenderer {
25+
26+
@Override
27+
public String render(RunMetricsSnapshot s) {
28+
List<String> fields = new ArrayList<>();
29+
fields.add(field(" ", "total_duration", s.totalDuration()));
30+
fields.add(field(" ", "load_duration", s.loadDuration()));
31+
fields.add(field(" ", "prompt_eval_count", s.promptEvalCount()));
32+
fields.add(field(" ", "prompt_eval_duration", s.promptEvalDuration()));
33+
fields.add(field(" ", "eval_count", s.evalCount()));
34+
fields.add(field(" ", "eval_duration", s.evalDuration()));
35+
fields.add(field(" ", "total_count", s.totalCount()));
36+
fields.add(field(" ", "prompt_eval_rate", s.promptEvalRate()));
37+
fields.add(field(" ", "eval_rate", s.evalRate()));
38+
fields.add(field(" ", "total_rate", s.totalRate()));
39+
fields.add(field(" ", "has_prefill_phase", s.hasPrefillPhase()));
40+
fields.add(tornadoObject(s));
41+
return "{\n" + String.join(",\n", fields) + "\n}";
42+
}
43+
44+
private static String tornadoObject(RunMetricsSnapshot s) {
45+
List<String> inner = new ArrayList<>();
46+
inner.add(field(" ", "plan_creation_duration", s.tornadoPlanCreationDuration()));
47+
inner.add(field(" ", "jit_duration", s.tornadoJitDuration()));
48+
inner.add(field(" ", "read_only_weights_copy_in_duration", s.tornadoReadOnlyWeightsCopyInDuration()));
49+
return " \"tornadovm\": {\n" + String.join(",\n", inner) + "\n }";
50+
}
51+
52+
private static String field(String indent, String key, long value) {
53+
return indent + "\"" + key + "\": " + value;
54+
}
55+
56+
private static String field(String indent, String key, int value) {
57+
return indent + "\"" + key + "\": " + value;
58+
}
59+
60+
private static String field(String indent, String key, double value) {
61+
return indent + "\"" + key + "\": " + String.format("%.4f", value);
62+
}
63+
64+
private static String field(String indent, String key, boolean value) {
65+
return indent + "\"" + key + "\": " + value;
66+
}
67+
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
/** Renders a {@link RunMetricsSnapshot} to a string for a specific output format. */
4+
@FunctionalInterface
5+
public interface MetricsRenderer {
6+
String render(RunMetricsSnapshot snapshot);
7+
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package org.beehive.gpullama3.auxiliary.metrics;
2+
3+
/**
4+
* Immutable snapshot of all performance metrics for a single inference run.
5+
*
6+
* <p>Raw duration fields are in nanoseconds. Derived rate fields are in tokens per second.
7+
* Construct via {@link #of} — it computes the derived values from the raw inputs.</p>
8+
*/
9+
public record RunMetricsSnapshot(
10+
long totalDuration,
11+
long loadDuration,
12+
int promptEvalCount,
13+
long promptEvalDuration,
14+
int evalCount,
15+
long evalDuration,
16+
boolean hasPrefillPhase,
17+
long tornadoPlanCreationDuration,
18+
long tornadoJitDuration,
19+
long tornadoReadOnlyWeightsCopyInDuration,
20+
// derived
21+
int totalCount,
22+
double promptEvalRate,
23+
double evalRate,
24+
double totalRate
25+
) {
26+
public static RunMetricsSnapshot of(
27+
long totalDuration, long loadDuration,
28+
int promptEvalCount, long promptEvalDuration,
29+
int evalCount, long evalDuration,
30+
boolean hasPrefillPhase,
31+
long tornadoPlanCreationDuration,
32+
long tornadoJitDuration,
33+
long tornadoReadOnlyWeightsCopyInDuration) {
34+
35+
int totalCount = promptEvalCount + evalCount;
36+
double promptEvalRate = tokensPerSecond(promptEvalCount, promptEvalDuration);
37+
double evalRate = tokensPerSecond(evalCount, evalDuration);
38+
double totalRate = tokensPerSecond(totalCount, totalDuration);
39+
40+
return new RunMetricsSnapshot(
41+
totalDuration, loadDuration,
42+
promptEvalCount, promptEvalDuration,
43+
evalCount, evalDuration,
44+
hasPrefillPhase,
45+
tornadoPlanCreationDuration, tornadoJitDuration,
46+
tornadoReadOnlyWeightsCopyInDuration,
47+
totalCount, promptEvalRate, evalRate, totalRate);
48+
}
49+
50+
private static double tokensPerSecond(int tokens, long durationNs) {
51+
return (durationNs > 0 && tokens > 0) ? tokens / (durationNs / 1e9) : 0.0;
52+
}
53+
}

0 commit comments

Comments
 (0)