GPULlama3.java/src/main/java/org/beehive/gpullama3/auxiliary/RunMetrics.java at 964927b65dcddc8259a01dae39ffb9ac04f13caa · beehive-lab/GPULlama3.java · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
package org.beehive.gpullama3.auxiliary;

/**
 * Process-wide holder for the performance metrics of one inference run.
 *
 * <p>Metrics are set incrementally by different layers of the stack:</p>
 * <ul>
 *   <li>{@link #setLoadDuration} — called from {@code LlamaApp} around model file loading</li>
 *   <li>{@link #setTornadoMetrics} — called from TornadoVM plan constructors</li>
 *   <li>{@link #setInferenceMetrics} — called from InferenceEngine variants at end of generation</li>
 *   <li>{@link #setHasPrefillPhase} — called from prefill-decode engine variants</li>
 * </ul>
 *
 * <p>All durations are stored in nanoseconds. {@link #printMetrics()} writes to {@code stderr}:</p>
 * <ul>
 *   <li>Without a distinct prefill phase: a single line
 *       {@code achieved tok/s: R. Tokens: N, seconds: S}.</li>
 *   <li>With a distinct prefill phase: three lines reporting total, prefill and decode
 *       throughput, each with its token count and duration in seconds.</li>
 *   <li>Additionally, when the system property {@code llama.EnableTimingForTornadoVMInit} is
 *       {@code true} and a plan-creation time greater than zero was recorded: the model load
 *       and TornadoVM initialisation timings in milliseconds.</li>
 * </ul>
 *
 * <p>Every setter overwrites the previously stored values; nothing is accumulated or reset
 * automatically between runs. The fields are plain (non-volatile) and no synchronisation is
 * performed by this class.</p>
 */
public final class RunMetrics {

    // ── Core metrics (nanoseconds) ────────────────────────────────────────────
    private long totalDurationNs;
    private long loadDurationNs;
    private int  promptEvalCount;
    private long promptEvalDurationNs;
    private int  evalCount;
    private long evalDurationNs;
    private boolean hasPrefillPhase;

    // ── TornadoVM-specific metrics (nanoseconds) ──────────────────────────────
    private long tornadoPlanCreationNs;
    private long tornadoJitNs;
    private long readOnlyWeightsCopyInNs;

    // ── Singleton ─────────────────────────────────────────────────────────────
    private static final RunMetrics INSTANCE = new RunMetrics();

    /** Not instantiable from outside; all access goes through the static methods. */
    private RunMetrics() {}

    // ── Setters ───────────────────────────────────────────────────────────────

    /**
     * Records the time spent loading the model file (not including TornadoVM initialisation).
     *
     * @param ns load duration in nanoseconds
     */
    public static void setLoadDuration(long ns) {
        INSTANCE.loadDurationNs = ns;
    }

    /**
     * Records TornadoVM-specific initialisation durations.
     *
     * @param planCreationNs  task-graph construction ({@code createExecutionPlan()})
     * @param jitNs           JIT compilation ({@code withPreCompilation()})
     * @param weightCopyNs    first-execution weight upload ({@code forceCopyInReadOnlyData()})
     */
    public static void setTornadoMetrics(long planCreationNs, long jitNs, long weightCopyNs) {
        INSTANCE.tornadoPlanCreationNs    = planCreationNs;
        INSTANCE.tornadoJitNs             = jitNs;
        INSTANCE.readOnlyWeightsCopyInNs  = weightCopyNs;
    }

    /**
     * Records inference-phase durations at the end of a generation run.
     *
     * @param promptCount    number of prompt tokens processed (prefill)
     * @param prefillNs      wall-clock time spent in the prefill phase
     * @param generatedCount number of tokens generated (decode)
     * @param decodeNs       wall-clock time spent in the decode phase
     * @param totalNs        total wall-clock time for the full inference call
     */
    public static void setInferenceMetrics(int promptCount, long prefillNs,
                                           int generatedCount, long decodeNs,
                                           long totalNs) {
        INSTANCE.promptEvalCount      = promptCount;
        INSTANCE.promptEvalDurationNs = prefillNs;
        INSTANCE.evalCount            = generatedCount;
        INSTANCE.evalDurationNs       = decodeNs;
        INSTANCE.totalDurationNs      = totalNs;
    }

    /**
     * Signals that prefill and decode are distinct timed phases.
     * Called by {@code InferenceEngineWithPrefillDecode} and
     * {@code InferenceEngineWithBatchPrefillDecode} before returning.
     *
     * @param value {@code true} to make {@link #printMetrics()} report prefill and decode
     *              throughput separately in addition to the total
     */
    public static void setHasPrefillPhase(boolean value) {
        INSTANCE.hasPrefillPhase = value;
    }

    // ── Output ────────────────────────────────────────────────────────────────

    /**
     * Prints throughput metrics to {@code stderr}, and TornadoVM init metrics when enabled.
     *
     * <p>A rate is reported as {@code 0} when its duration is not positive (and, for the
     * prefill/decode rates, when the corresponding token count is not positive), so that no
     * division by zero reaches the output.</p>
     */
    public static void printMetrics() {
        RunMetrics m = INSTANCE;

        // Sum in long: two int counts can exceed Integer.MAX_VALUE and would otherwise wrap
        // to a negative token count and a negative throughput.
        long   totalTokens = (long) m.promptEvalCount + m.evalCount;
        double totalSecs   = m.totalDurationNs / 1e9;
        double totalRate   = totalSecs > 0 ? totalTokens / totalSecs : 0;

        System.err.println();
        System.err.println("==== Performance Metrics ====");
        if (m.hasPrefillPhase) {
            double prefillSecs = m.promptEvalDurationNs / 1e9;
            double decodeSecs  = m.evalDurationNs / 1e9;
            double prefillRate = (prefillSecs > 0 && m.promptEvalCount > 0)
                    ? m.promptEvalCount / prefillSecs : 0;
            double decodeRate  = (decodeSecs > 0 && m.evalCount > 0)
                    ? m.evalCount / decodeSecs : 0;
            System.err.printf(
                    "Total achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
                    "¬Prefill achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n" +
                    "¬Decode achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs,
                    prefillRate, m.promptEvalCount, prefillSecs,
                    decodeRate,  m.evalCount,       decodeSecs);
        } else {
            System.err.printf("achieved tok/s: %.2f. Tokens: %d, seconds: %.2f%n",
                    totalRate, totalTokens, totalSecs);
        }

        // Init timings are opt-in via system property and only shown when a plan-creation
        // time greater than zero has been recorded.
        if (Boolean.getBoolean("llama.EnableTimingForTornadoVMInit")
                && m.tornadoPlanCreationNs > 0) {
            System.err.printf(
                    "GGUF Model Load: %.2f ms%n" +
                    "Compilation & CodeGen: %.2f ms%n" +
                    "Warmup: %.2f ms%n" +
                    "Read-only weights Copy-in: %.2f ms%n",
                    m.loadDurationNs          / 1_000_000.0,
                    m.tornadoPlanCreationNs   / 1_000_000.0,
                    m.tornadoJitNs            / 1_000_000.0,
                    m.readOnlyWeightsCopyInNs / 1_000_000.0);
        }
        System.err.println();
    }
}