Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion LlamaTornadoCli.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
package org.beehive.gpullama3.cli;

import org.beehive.gpullama3.Options;
import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;

Expand Down
2 changes: 0 additions & 2 deletions llama-tornado
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,6 @@ class LlamaRunner:

if args.cuda_graphs:
cmd.append("-Dllama.cudaGraphs=true")
elif args.no_cuda_graphs:
cmd.append("-Dllama.cudaGraphs=false")

# Debug options
debug_config = []
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/beehive/gpullama3/LlamaApp.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.beehive.gpullama3;

import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.model.Model;

Expand All @@ -18,7 +18,7 @@ private static void runSingleInstruction(Model model, Sampler sampler, Options o
String response = model.runInstructOnce(sampler, options);
System.out.println(response);
if (SHOW_PERF_INTERACTIVE) {
LastRunMetrics.printMetrics();
RunMetrics.printMetrics();
}
}

Expand Down
33 changes: 0 additions & 33 deletions src/main/java/org/beehive/gpullama3/auxiliary/LastRunMetrics.java

This file was deleted.

134 changes: 134 additions & 0 deletions src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
package org.beehive.gpullama3.auxiliary;

/**
 * Process-wide holder for the performance counters of the most recent inference run.
 *
 * <p>The class is a singleton: state lives in one private instance and is written
 * through the static setters below. Every setter overwrites the previous value;
 * nothing is accumulated across calls.</p>
 * <ul>
 *   <li>{@link #setLoadDuration} - time spent loading the model file</li>
 *   <li>{@link #setTornadoMetrics} - TornadoVM initialisation timings</li>
 *   <li>{@link #setInferenceMetrics} - token counts and phase durations of a generation run</li>
 *   <li>{@link #setHasPrefillPhase} - whether prefill and decode are reported separately</li>
 * </ul>
 *
 * <p>All durations are stored in nanoseconds. {@link #printMetrics()} writes to
 * {@code stderr}, after a {@code ==== Performance Metrics ====} header:</p>
 * <ul>
 *   <li>prefill flag unset: one line,
 *       {@code achieved tok/s: R. Tokens: N, seconds: S}</li>
 *   <li>prefill flag set: three lines, {@code Total achieved tok/s: ...},
 *       then a prefill line and a decode line in the same layout</li>
 * </ul>
 *
 * <p>Numbers are always formatted with {@link java.util.Locale#ROOT}, so the decimal
 * separator is {@code '.'} regardless of the JVM default locale.</p>
 *
 * <p>The fields are plain (not {@code volatile}) and no synchronisation is performed
 * by this class.</p>
 */
public final class RunMetrics {

    /** Fixed output locale: keeps the metric lines parseable on any host configuration. */
    private static final java.util.Locale OUTPUT_LOCALE = java.util.Locale.ROOT;

    /** System property that switches on the TornadoVM initialisation timing report. */
    private static final String TORNADO_INIT_TIMING_PROPERTY = "llama.EnableTimingForTornadoVMInit";

    private static final double NANOS_PER_SECOND = 1e9;
    private static final double NANOS_PER_MILLI = 1_000_000.0;

    // ---- Core metrics (durations in nanoseconds) ----
    private long totalDurationNs;
    private long loadDurationNs;
    private int promptEvalCount;
    private long promptEvalDurationNs;
    private int evalCount;
    private long evalDurationNs;
    private boolean hasPrefillPhase;

    // ---- TornadoVM-specific metrics (nanoseconds) ----
    private long tornadoPlanCreationNs;
    private long tornadoJitNs;
    private long readOnlyWeightsCopyInNs;

    // ---- Singleton ----
    private static final RunMetrics INSTANCE = new RunMetrics();

    private RunMetrics() {}

    // ---- Setters ----

    /**
     * Records the time spent loading the model file (not including TornadoVM initialisation).
     *
     * @param ns load duration in nanoseconds
     */
    public static void setLoadDuration(long ns) {
        INSTANCE.loadDurationNs = ns;
    }

    /**
     * Records TornadoVM-specific initialisation durations.
     *
     * @param planCreationNs task-graph construction ({@code createExecutionPlan()})
     * @param jitNs          JIT compilation ({@code withPreCompilation()})
     * @param weightCopyNs   first-execution weight upload ({@code forceCopyInReadOnlyData()})
     */
    public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
        INSTANCE.tornadoPlanCreationNs = planCreationNs;
        INSTANCE.tornadoJitNs = jitNs;
        INSTANCE.readOnlyWeightsCopyInNs = weightCopyNs;
    }

    /**
     * Records inference-phase counters at the end of a generation run.
     *
     * <p>This does not touch the prefill flag: once {@link #setHasPrefillPhase} has been
     * called with {@code true}, later runs keep the three-line report until the flag is
     * explicitly cleared.</p>
     *
     * @param promptCount    number of prompt tokens processed (prefill)
     * @param prefillNs      wall-clock time spent in the prefill phase
     * @param generatedCount number of tokens generated (decode)
     * @param decodeNs       wall-clock time spent in the decode phase
     * @param totalNs        total wall-clock time for the full inference call
     */
    public static void setInferenceMetrics(int promptCount, long prefillNs,
                                           int generatedCount, long decodeNs,
                                           long totalNs) {
        INSTANCE.promptEvalCount = promptCount;
        INSTANCE.promptEvalDurationNs = prefillNs;
        INSTANCE.evalCount = generatedCount;
        INSTANCE.evalDurationNs = decodeNs;
        INSTANCE.totalDurationNs = totalNs;
    }

    /**
     * Selects between the single-line and the three-line (total / prefill / decode) report.
     *
     * @param value {@code true} when prefill and decode were timed as distinct phases
     */
    public static void setHasPrefillPhase(boolean value) {
        INSTANCE.hasPrefillPhase = value;
    }

    // ---- Output ----

    /**
     * Prints throughput metrics to {@code stderr}.
     *
     * <p>When the {@code llama.EnableTimingForTornadoVMInit} system property is {@code true}
     * and a non-zero plan-creation time has been recorded, the load and TornadoVM
     * initialisation timings are printed as well, in milliseconds.</p>
     */
    public static void printMetrics() {
        RunMetrics m = INSTANCE;

        int totalTokens = m.promptEvalCount + m.evalCount;
        double totalSecs = m.totalDurationNs / NANOS_PER_SECOND;
        // A zero duration means no run was recorded; report 0 rather than Infinity/NaN.
        double totalRate = totalSecs > 0 ? totalTokens / totalSecs : 0;

        System.err.println();
        System.err.println("==== Performance Metrics ====");
        if (m.hasPrefillPhase) {
            double prefillSecs = m.promptEvalDurationNs / NANOS_PER_SECOND;
            double decodeSecs = m.evalDurationNs / NANOS_PER_SECOND;
            double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0)
                    ? m.promptEvalCount / prefillSecs : 0;
            double decodeRate = (decodeSecs > 0 && m.evalCount > 0)
                    ? m.evalCount / decodeSecs : 0;
            // \u00AC is the NOT SIGN that prefixes the per-phase lines; it is written as an
            // escape so the source compiles under any javac source encoding.
            System.err.printf(OUTPUT_LOCALE,
                    "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
                            + "\u00ACPrefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n"
                            + "\u00ACDecode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs,
                    prefillRate, m.promptEvalCount, prefillSecs,
                    decodeRate, m.evalCount, decodeSecs);
        } else {
            System.err.printf(OUTPUT_LOCALE,
                    "achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs);
        }

        // Boolean.getBoolean is true only when the property exists and equals "true"
        // (case-insensitive), i.e. the report is off by default.
        if (Boolean.getBoolean(TORNADO_INIT_TIMING_PROPERTY) && m.tornadoPlanCreationNs > 0) {
            // NOTE(review): "Compilation & CodeGen" prints the plan-creation time and
            // "Warmup" prints the JIT time - confirm these labels are the intended ones.
            System.err.printf(OUTPUT_LOCALE,
                    "GGUF Model Load: %.2f ms%n"
                            + "Compilation & CodeGen: %.2f ms%n"
                            + "Warmup: %.2f ms%n"
                            + "Read-only weights Copy-in: %.2f ms%n",
                    m.loadDurationNs / NANOS_PER_MILLI,
                    m.tornadoPlanCreationNs / NANOS_PER_MILLI,
                    m.tornadoJitNs / NANOS_PER_MILLI,
                    m.readOnlyWeightsCopyInNs / NANOS_PER_MILLI);
        }
        System.err.println();
    }
}
63 changes: 29 additions & 34 deletions src/main/java/org/beehive/gpullama3/inference/InferenceEngine.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.beehive.gpullama3.inference;

import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
Expand Down Expand Up @@ -132,10 +132,9 @@ public static List<Integer> generateTokensLlama(Model model, State state, int st

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down Expand Up @@ -213,10 +212,9 @@ public static List<Integer> generateTokensQwen3(Model model, State state, int st

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand All @@ -225,6 +223,7 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
IntConsumer onTokenGenerated) {

long startNanos = System.nanoTime();
long inferenceStartNanos = 0;
if (maxTokens < 0 || model.configuration().contextLength() < maxTokens) {
maxTokens = model.configuration().contextLength();
}
Expand All @@ -245,6 +244,9 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta
System.err.print(Tokenizer.replaceControlCharacters(model.tokenizer().decode(List.of(nextToken))));
}
} else {
if (inferenceStartNanos == 0) {
inferenceStartNanos = System.nanoTime();
}
nextToken = sampler.sampleToken(state.logits);
if (echo) {
// log inferred token
Expand All @@ -266,10 +268,9 @@ public static List<Integer> generateTokensPhi3(Model model, State state, int sta

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;

Expand Down Expand Up @@ -356,11 +357,9 @@ public static List<Integer> generateTokensGPULlama(Model model, State state, int

// === Performance Metrics ===
long endNanos = System.nanoTime();
double totalSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

// Set metrics for tokens achieved
LastRunMetrics.setMetrics(totalTokens, totalSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down Expand Up @@ -449,10 +448,9 @@ public static List<Integer> generateTokensGPUQwen3(Model model, State state, int

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down Expand Up @@ -524,10 +522,9 @@ public static List<Integer> generateTokensGPUPhi3(Model model, State state, int

// Calculate and print performance metrics
long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down Expand Up @@ -591,10 +588,9 @@ public static List<Integer> generateTokensGranite(Model model, State state, int
}

long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down Expand Up @@ -658,10 +654,9 @@ public static List<Integer> generateTokensGPUGranite(Model model, State state, i
}

long endNanos = System.nanoTime();
double totalTimeSeconds = (endNanos - startNanos) / 1_000_000_000.0;
int totalTokens = promptIndex + generatedTokens.size();

LastRunMetrics.setMetrics(totalTokens, totalTimeSeconds);
long decodeStart = inferenceStartNanos > 0 ? inferenceStartNanos : endNanos;
RunMetrics.setInferenceMetrics(promptIndex, decodeStart - startNanos,
generatedTokens.size(), endNanos - decodeStart, endNanos - startNanos);

return generatedTokens;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.beehive.gpullama3.inference;

import org.beehive.gpullama3.auxiliary.LastRunMetrics;
import org.beehive.gpullama3.auxiliary.RunMetrics;
import org.beehive.gpullama3.inference.sampler.Sampler;
import org.beehive.gpullama3.inference.state.State;
import org.beehive.gpullama3.model.Configuration;
Expand Down Expand Up @@ -102,6 +102,7 @@ public static List<Integer> generateTokensLlama(
}

state.latestToken = currentToken;
long decodeStartNanos = System.nanoTime();

// ── Decode ────────────────────────────────────────────────────────────
while (pos < actualMaxTokens) {
Expand Down Expand Up @@ -129,8 +130,9 @@ public static List<Integer> generateTokensLlama(
}

long endNanos = System.nanoTime();
int totalTokens = promptTokens.size() + generatedTokens.size();
LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
RunMetrics.setHasPrefillPhase(true);

return generatedTokens;
}
Expand Down Expand Up @@ -197,6 +199,7 @@ public static List<Integer> generateTokensGPULlama(
currentToken = promptTokens.get(N - 1);
pos = startPosition + N;
state.latestToken = currentToken;
long decodeStartNanos = System.nanoTime();

// ── Decode ────────────────────────────────────────────────────────────
while (pos < actualMaxTokens) {
Expand Down Expand Up @@ -224,8 +227,9 @@ public static List<Integer> generateTokensGPULlama(
}

long endNanos = System.nanoTime();
int totalTokens = promptTokens.size() + generatedTokens.size();
LastRunMetrics.setMetrics(totalTokens, (endNanos - startNanos) / 1_000_000_000.0);
RunMetrics.setInferenceMetrics(promptTokens.size(), decodeStartNanos - startNanos,
generatedTokens.size(), endNanos - decodeStartNanos, endNanos - startNanos);
RunMetrics.setHasPrefillPhase(true);

return generatedTokens;
}
Expand Down
Loading
Loading