Skip to content

Commit 28fb15b

Browse files
authored
Merge branch 'master' into fix/per-dollar-png-axis-and-extension
2 parents 4b6b0ce + 186fab1 commit 28fb15b

14 files changed

Lines changed: 244 additions & 7 deletions

File tree

.claude/skills/write-inferencex-blog/SKILL.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ Common gotchas:
3636
- **Workload mismatch**: chart headers can mislead. Verify ISL/OSL from the data itself — 1k/1k and 8k/1k give wildly different `tok/s/GPU` and `$/M tokens` numbers. The blog title, lede, tables, and chart caption must all use the same ISL/OSL.
3737
- **Latest run only**: filter to the highest `run_attempt` per `github_run_id`, then take the latest `date` per `(config_id, conc, isl, osl)`. See the `inferencex-data` skill for the exact filter.
3838
- **Model spec verification**: never invent parameter counts. Always `WebSearch` the model's released specs (total params, active params, expert count, attention type) before writing the architecture paragraph. Cite sources. GLM-5 is _not_ GLM-4.5 — the numbers changed.
39-
- **TCO values**: pull from the [SemiAnalysis AI Cloud TCO Model](https://newsletter.semianalysis.com/p/ai-cloud-economics). Current values (verify if older than a quarter):
39+
- **TCO values**: pull from the [SemiAnalysis AI Cloud TCO Model](https://semianalysis.com/ai-cloud-tco-model/). Current values in $/GPU/hr (re-verify if they are more than a quarter old):
4040
- H100 $1.30, H200 $1.41, B200 $1.95, B300 $2.34, GB200 $2.21, GB300 $2.652
4141
- MI300X $1.12, MI325X $1.28, MI355X $1.48
4242
- **Cost per million tokens formula**: `$/M tok = TCO_$/GPU/hr * 1e6 / (3600 * tput_per_gpu)`, where `tput_per_gpu` is in tok/s/GPU. Equivalently in Python: `cost = tco / (3600 * tput / 1e6)`. Throughput is per-GPU, so GPU count cancels out for aggregated configs.

.github/CODEOWNERS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
* @adibarra

packages/app/content/blog/gb300-nvl72-vs-gb200-nvl72-dsv4-pro-vllm-fp4.mdx

Lines changed: 181 additions & 0 deletions
Large diffs are not rendered by default.
252 KB
Loading
252 KB
Loading
458 KB
Loading
458 KB
Loading

packages/app/src/components/inference/inference-chart-config.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,10 @@
9595
"y_measuredJPerOutputToken_label": "Measured J per Output Token (J/tok)",
9696
"y_measuredJPerOutputToken_title": "Measured Joules per Output Token",
9797
"y_measuredJPerOutputToken_roofline": "lower_right",
98+
"y_measuredJPerTotalToken": "measuredJPerTotalToken.y",
99+
"y_measuredJPerTotalToken_label": "Measured J per Token (J/tok)",
100+
"y_measuredJPerTotalToken_title": "Measured Joules per Token (incl. prompt)",
101+
"y_measuredJPerTotalToken_roofline": "lower_right",
98102
"y_cost_limit": 5,
99103
"y_latency_limit": 60
100104
},
@@ -193,6 +197,10 @@
193197
"y_measuredJPerOutputToken_label": "Measured J per Output Token (J/tok)",
194198
"y_measuredJPerOutputToken_title": "Measured Joules per Output Token",
195199
"y_measuredJPerOutputToken_roofline": "lower_left",
200+
"y_measuredJPerTotalToken": "measuredJPerTotalToken.y",
201+
"y_measuredJPerTotalToken_label": "Measured J per Token (J/tok)",
202+
"y_measuredJPerTotalToken_title": "Measured Joules per Token (incl. prompt)",
203+
"y_measuredJPerTotalToken_roofline": "lower_left",
196204
"y_cost_limit": 5,
197205
"y_latency_limit": 60
198206
}

packages/app/src/components/inference/types.ts

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,9 +68,10 @@ export interface AggDataEntry {
6868
std_e2el: number;
6969
p99_e2el: number;
7070
// Measured GPU telemetry (emitted by runner's aggregate_power.py).
71-
// Optional because historical runs predate the field.
71+
// Optional because historical runs predate the fields.
7272
avg_power_w?: number;
7373
joules_per_output_token?: number;
74+
joules_per_total_token?: number;
7475
disagg: boolean;
7576
num_prefill_gpu: number;
7677
num_decode_gpu: number;
@@ -162,6 +163,7 @@ export interface InferenceData extends Partial<Omit<AggDataEntry, AggDataConflic
162163
// emit these fields.
163164
measuredAvgPower?: { y: number; roof: boolean };
164165
measuredJPerOutputToken?: { y: number; roof: boolean };
166+
measuredJPerTotalToken?: { y: number; roof: boolean };
165167
}
166168

167169
/**
@@ -189,7 +191,8 @@ export type YAxisMetricKey =
189191
| 'jOutput'
190192
| 'jInput'
191193
| 'measuredAvgPower'
192-
| 'measuredJPerOutputToken';
194+
| 'measuredJPerOutputToken'
195+
| 'measuredJPerTotalToken';
193196

194197
/**
195198
* Defines the configuration and labels for a specific chart.
@@ -302,6 +305,10 @@ export interface ChartDefinition {
302305
y_measuredJPerOutputToken_label?: string;
303306
y_measuredJPerOutputToken_title?: string;
304307
y_measuredJPerOutputToken_roofline?: 'upper_right' | 'upper_left' | 'lower_left' | 'lower_right';
308+
y_measuredJPerTotalToken?: string;
309+
y_measuredJPerTotalToken_label?: string;
310+
y_measuredJPerTotalToken_title?: string;
311+
y_measuredJPerTotalToken_roofline?: 'upper_right' | 'upper_left' | 'lower_left' | 'lower_right';
305312
y_cost_limit?: number;
306313
y_latency_limit?: number;
307314
}

packages/app/src/components/inference/ui/ChartControls.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ const METRIC_GROUPS: { label: string; metrics: string[]; gated?: boolean }[] = [
5656
{ label: 'All-in Provisioned Energy per Token', metrics: ['y_jTotal', 'y_jOutput', 'y_jInput'] },
5757
{
5858
label: 'Measured Energy',
59-
metrics: ['y_measuredAvgPower', 'y_measuredJPerOutputToken'],
59+
metrics: ['y_measuredAvgPower', 'y_measuredJPerOutputToken', 'y_measuredJPerTotalToken'],
6060
gated: true,
6161
},
6262
{ label: 'Custom User Values', metrics: ['y_costUser', 'y_powerUser'] },

0 commit comments

Comments
 (0)