diff --git a/.vscode/settings.json b/.vscode/settings.json index d20e9ff2..b43223df 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -25,6 +25,7 @@ "dotenv", "esbuild", "evals", + "frontmatter", "gptoss", "gsutil", "hyperscaler", @@ -38,6 +39,7 @@ "kimi", "kimik", "lefthook", + "llms", "maxage", "minimaxm", "mooncake", diff --git a/AGENTS.md b/AGENTS.md index 29d68225..984524b3 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -34,13 +34,15 @@ pnpm test:e2e # Cypress E2E tests ``` packages/ ├── app/ # Next.js frontend (@semianalysisai/inferencex-app) +│ ├── content/blog/ # MDX blog posts (frontmatter + content) │ └── src/ │ ├── app/ # Pages, layouts, API routes (/api/v1/*) +│ │ └── blog/ # Blog list + [slug] post pages, OG image generation │ ├── components/ # Tab sections: inference/, evaluation/, historical-trends/, -│ │ # throughput-calculator/, reliability/, gpu-specs/, ui/ +│ │ # throughput-calculator/, reliability/, gpu-specs/, blog/, ui/ │ ├── hooks/api/ # React Query hooks (use-benchmarks, use-availability, etc.) -│ └── lib/ # Utilities, constants, d3-chart/, chart-utils, data-mappings -├── constants/ # Shared constants (GPU keys, model mappings) +│ └── lib/ # Utilities, constants, d3-chart/, chart-utils, blog, data-mappings +├── constants/ # Shared constants (GPU keys, model mappings, SEO) └── db/ # DB layer, ETL, migrations, queries, ingest scripts ``` @@ -66,6 +68,15 @@ API routes (`packages/app/src/app/api/v1/`): **API routes return raw DB data** — no presentation logic. Frontend handles all transformations. 
+Static content routes (no DB): + +- `/blog` — blog listing (statically generated from MDX files in `content/blog/`) +- `/blog/[slug]` — blog post page with MDX rendering and OG image generation +- `/feed.xml` — RSS 2.0 feed +- `/llms.txt` — LLM-readable site index +- `/llms-full.txt` — full article content for LLM ingestion +- `/sitemap.xml` — dynamic sitemap (includes blog posts) + ## Code Style & Tooling - **Linter**: oxlint — `pnpm lint` / `pnpm lint:fix` @@ -87,7 +98,7 @@ All interactive elements should have `track()` from `@/lib/analytics` (autocaptu **Convention**: `[section]_[action]` — e.g., `latency_zoom_reset`, `calculator_bar_selected`, `tab_changed` -**Prefixes**: `latency_`, `interactivity_`, `gpu_timeseries_`, `inference_`, `calculator_`, `evaluation_`, `reliability_`, `tab_`, `selector_` +**Prefixes**: `latency_`, `interactivity_`, `gpu_timeseries_`, `inference_`, `calculator_`, `evaluation_`, `reliability_`, `tab_`, `selector_`, `blog_`, `social_` ## Tab Structure @@ -118,6 +129,25 @@ Order: `inference` → `evaluation` → `historical` → `calculator` → `relia 6. Add disagg caveat banner in `ChartDisplay.tsx` for per-GPU or per-MW metrics (animated amber `border-l-2` banner pattern) 7. Expose in UI state: `InferenceContext.tsx` +### Add a new blog post + +1. Create `packages/app/content/blog/<slug>.mdx` with frontmatter: `title`, `subtitle`, `date` (all required) and `tags`, `modifiedDate` (both optional) +2. Write content using Markdown + custom MDX components (`Figure`, `Blur`) +3. No code changes needed — the post automatically appears in the blog list, sitemap, RSS feed, llms.txt, and gets a generated OG image + +See [Blog](./docs/blog.md) for content format, available MDX components, and design details. 
+ +### Modify blog components + +- Blog library (posts, headings, reading time): `src/lib/blog.ts` +- Blog list page: `src/app/blog/page.tsx` +- Blog post page: `src/app/blog/[slug]/page.tsx` +- MDX components: `src/components/blog/mdx-components.tsx` +- TOC sidebar: `src/components/blog/blog-toc.tsx` +- OG image generation: `src/app/blog/[slug]/og-image-render.tsx` +- RSS feed: `src/app/feed.xml/route.ts` +- SEO constants: `packages/constants/src/seo.ts` + ### Add a new model or GPU **First ask for the PR / GitHub Actions run URL** — see [Adding Entities](./docs/adding-entities.md) for the full workflow. Never ask other questions before getting the URL. @@ -144,6 +174,7 @@ Detailed design rationale (the "why" and "how", not the "what") lives in [docs/] - **[Testing](./docs/testing.md)** — Requirements, quality standards, pre-commit checklist - **[Data Transforms](./docs/data-transforms.md)** — BenchmarkRow → AggDataEntry → InferenceData pipeline, hardware key construction, derived metrics - **[State Ownership](./docs/state-ownership.md)** — Context provider state map, availability filtering cascade, comparison dates, URL params +- **[Blog](./docs/blog.md)** — MDX content system, SEO features, TOC sidebar, reading progress, analytics events ## Claude AI Agents diff --git a/docs/blog.md b/docs/blog.md new file mode 100644 index 00000000..8ff8f3aa --- /dev/null +++ b/docs/blog.md @@ -0,0 +1,117 @@ +# Blog Infrastructure + +## Why MDX + Static Generation + +Blog posts are MDX files in `packages/app/content/blog/`, compiled at build time via `next-mdx-remote`. This was chosen over a CMS or database because: + +- **No runtime dependency**: Posts are part of the repo, versioned in git, reviewed in PRs. No CMS outage can break the blog. +- **MDX flexibility**: Authors can embed custom React components (`Figure`, `Blur`) alongside Markdown. This matters for image-heavy technical articles with captions, paywall teasers, and code blocks. 
+- **Static generation**: `generateStaticParams()` pre-renders all post pages. No server-side rendering at request time. + +Syntax highlighting uses Shiki with dual light/dark themes (CSS class-based switching, not runtime theme detection). + +## Content Format + +```yaml +# Frontmatter (required: title, subtitle, date) +title: string +subtitle: string +date: YYYY-MM-DD +modifiedDate?: YYYY-MM-DD # Used in sitemap and JSON-LD +tags?: string[] # Used for filtering on /blog and in RSS categories +``` + +Slug is derived from the filename (e.g., `my-post.mdx` -> `my-post`), not from frontmatter. Reading time is calculated at 265 WPM. + +## MDX Components Available to Authors + +| Component | Usage | Notes | +| ---------------------------------------------- | -------------------------------- | --------------------------------------------------------------------------------- | +| `# / ## / ###` | Headings with auto-generated IDs | IDs are deduped: second `## Details` under `## Results` becomes `results-details` | +| `[text](url)` | Links | Internal links use ``, external get `target="_blank"` | +| `![alt](src)` | Images | Rendered via `next/image` with lazy loading (first image is eager) | +| `
` | Captioned figures | Uses `` (not `next/image`) for external URLs | +| `...` | Paywall teaser blur overlay | Content is blurred, unselectable, and not clickable | + +Heading ID deduplication: if two headings share a slug, the second gets prefixed with its parent heading's slug (e.g., `overview-details`). If no parent exists, a level suffix is appended (`intro-2`). + +## Blog Library (`src/lib/blog.ts`) + +| Function | Purpose | +| ------------------------- | --------------------------------------------------------- | +| `getAllPosts()` | All posts sorted newest-first | +| `getPostBySlug(slug)` | Single post meta + raw MDX content | +| `getAdjacentPosts(slug)` | `{ prev, next }` — prev is older, next is newer | +| `extractHeadings(rawMdx)` | h1-h3 headings with unique IDs (strips code blocks first) | +| `slugify(raw)` | URL-safe slug generation | +| `getReadingTime(content)` | Word count / 265, minimum 1 minute | + +## SEO Features + +### Dynamic OG Images (`/blog/[slug]/opengraph-image.tsx`) + +1200x630px images generated at build time with `next/og` (Satori). Design: decorative tile sidebar + dark content panel with title, subtitle, date, and logo. Title font size scales (56-72px) based on length for readability at thumbnail sizes. + +### RSS Feed (`/feed.xml`) + +RSS 2.0 with Dublin Core and Atom extensions. Includes all posts with title, link, description, creator, pubDate, categories. Cached 1 hour. + +### LLM Discovery (`/llms.txt`, `/llms-full.txt`) + +- `/llms.txt`: Site description + article index with titles, URLs, and subtitles +- `/llms-full.txt`: Full raw MDX content of every post, for LLM context ingestion + +### Sitemap Integration + +Blog index at priority 0.8 (weekly), individual posts at priority 0.7 (monthly, uses `modifiedDate` if present). 
+ +### JSON-LD + +- `/blog` page: `Blog` schema +- `/blog/[slug]` page: `BlogPosting` schema (headline, author, publisher, dates, wordCount, timeRequired) + +## UI Components + +### Table of Contents (`blog-toc.tsx`) + +Two modes based on available viewport space: + +- **Sidebar** (>= 240px right of content): Fixed position, follows scroll via imperative DOM updates in a scroll handler. Active heading tracked via `IntersectionObserver` with `rootMargin: '0px 0px -80% 0px'`. Falls back to last heading when scrolled to page bottom. +- **Inline** (narrow screens): Collapsible `
` card. + +The sidebar position is calculated relative to the `[data-blog-section]` element and updated on scroll/resize. + +### Reading Progress Bar (`reading-progress-bar.tsx`) + +Fixed-top 0.5px bar tracking scroll position within the `
` element. Fires milestone events at 25/50/75/100% thresholds (each fires only once per page load). + +### Heading Links (`heading-link.tsx`) + +Copy-to-clipboard button shown on heading hover. State cycle: idle -> copied ("Link copied" text) -> fade out -> idle. + +### Post Navigation (`blog-post-nav.tsx`) + +Previous (older) / Next (newer) post links with title display. Uses `getAdjacentPosts()`. + +## Analytics Events + +Blog analytics use the `blog_` prefix per the `[section]_[action]` convention; share buttons use the `social_` prefix: + +| Event | Trigger | +| ------------------------------------------------ | ---------------------------- | +| `blog_post_clicked` | Click post card on list page | +| `blog_toc_clicked` | Click TOC heading | +| `blog_read_milestone` | Scroll past 25/50/75/100% | +| `blog_heading_link_copied` | Copy heading link | +| `blog_nav_prev` / `blog_nav_next` | Click prev/next post | +| `blog_back_clicked` | Click back to articles | +| `blog_tag_filtered` | Click tag filter | +| `social_share_twitter` / `social_share_linkedin` | Click share buttons | + +## Adding a New Blog Post + +1. Create `packages/app/content/blog/<slug>.mdx` with required frontmatter (`title`, `subtitle`, `date`) +2. Add optional `tags` and `modifiedDate` frontmatter +3. Write content using standard Markdown + available MDX components +4. The post automatically appears in: blog list, sitemap, RSS feed, llms.txt, OG image generation +5. No code changes needed — just the MDX file diff --git a/docs/index.md b/docs/index.md index 4745b981..8a8e0d69 100644 --- a/docs/index.md +++ b/docs/index.md @@ -14,3 +14,4 @@ Design rationale and non-obvious conventions. 
See [CLAUDE.md](../CLAUDE.md) for - [Testing](./testing.md) — Requirements, quality standards, pre-commit checklist - [Data Transforms](./data-transforms.md) — Full pipeline from BenchmarkRow to RenderableGraph: type hierarchy, hardware key construction, derived metrics, memoization strategy - [State Ownership](./state-ownership.md) — Which context owns which state, availability filtering cascade, comparison date mechanics, URL param sync +- [Blog](./blog.md) — MDX content system, SEO features (OG images, RSS, llms.txt, JSON-LD), TOC sidebar, reading progress, heading links, analytics events diff --git a/packages/app/content/blog/inferencemax-open-source-inference-benchmarking.mdx b/packages/app/content/blog/inferencemax-open-source-inference-benchmarking.mdx new file mode 100644 index 00000000..9266869f --- /dev/null +++ b/packages/app/content/blog/inferencemax-open-source-inference-benchmarking.mdx @@ -0,0 +1,615 @@ +--- +title: 'InferenceMAX: Open Source Inference Benchmarking' +subtitle: 'NVIDIA GB200 NVL72, AMD MI355X, Throughput Token per GPU, Latency Tok/s/user, Perf per Dollar, Cost per Million Tokens, Tokens per Provisioned Megawatt, DeepSeek R1 670B, GPTOSS 120B, Llama3 70B' +date: '2025-10-09' +tags: + - benchmark + - gpu + - inference + - announcement +--- + +LLM Inference performance is driven by two pillars, hardware and software. While hardware innovation drives step jumps in performance every year through the release of new GPUs/XPUs and new systems, software evolves every single day, delivering continuous performance gains on top of these step jumps. + +AI software like SGLang, vLLM, TensorRT-LLM, CUDA, and ROCm achieve continuous improvement in performance through kernel-level optimizations, distributed inference strategies, and scheduling innovations that increase the Pareto frontier of performance in incremental releases that can be just days apart. 
+ +This pace of software advancement creates a challenge: benchmarks conducted at a fixed point in time quickly go stale and do not represent the performance that can be achieved with the latest software packages. + +InferenceMAX, [an open-source automated benchmark](https://github.com/InferenceMAX/InferenceMAX) designed to move at the same rapid speed as the software ecosystem itself, is built to address this challenge. + +
+ +InferenceMAX runs our suite of benchmarks every night on hundreds of chips, continually re-benchmarking the world's most popular open-source inference frameworks and models to track real performance in real-time. As these software stacks improve, InferenceMAX captures that progress in near real-time, providing a live indicator of inference performance progress. A live dashboard is available for free publicly at [https://inferencemax.ai/](https://inferencemax.ai/). + +
+ +AMD and Nvidia GPUs can both deliver competitive performance for different sets of workloads, with AMD performing best for some types of workloads and Nvidia excelling at others. Indeed, both ecosystems are advancing rapidly! + +There are many nuances and considerations when analyzing the results from InferenceMAX, and this is in no small part because it is designed to be a neutral benchmark, not cherry-picked to promote any specific vendor or solution. As such, there are models and interactivity (tok/s/user) levels where AMD currently does better against Nvidia GPUs of the same generation, and there are also interactivity levels where Nvidia currently does better. The goal of InferenceMAX is simple but ambitious -- to provide benchmarks that both emulate real world applications as much as possible and reflect the continuous pace of software innovation. + +For the initial InferenceMAX v1 release, we are benchmarking the GB200 NVL72, B200, MI355X, H200, MI325X, H100 and MI300X. Over the next two months, we're expanding InferenceMAX to include Google TPU and AWS Trainium backends, making it the first truly multi-vendor open benchmark across AMD, NVIDIA, and custom accelerators. + +InferenceMAX v1 is far from perfect, but we believe that it is a good first step in the right direction. There will be room in future releases to refine workloads, extend model coverage, and better reflect real-world workloads. + +## Acknowledgements + +Thank you to Lisa Su and Anush Elangovan for providing the MI355X and CDNA3 GPUs for this free and open-source project. We want to recognize Anush, Quentin Colombet, and dozens of additional AMD contributors for their responsiveness and help debugging, optimizing, and validating performance across AMD GPUs. 
Whenever we encounter ROCm issues (we note these issues are occurring at a far lower frequency than at the end of 2024!), they have immediately jumped in to help find temporary fixes that unblock us, following up with permanent patches into ROCm to ensure long-term stability. Quentin and his team embody the [AMD 2.0 sense of urgency](https://semianalysis.com/2025/04/23/amd-2-0-new-sense-of-urgency-mi450x-chance-to-beat-nvidia-nvidias-new-moat/) that [many customers such as xAI are very appreciative of](https://www.youtube.com/live/5dmFa9iXPWI?si=5HHNsDd7bw3lDASk&t=1073). + +We're also grateful to Jensen Huang and Ian Buck for supporting this open-source effort by providing access to a GB200 NVL72 rack (through OCI) and B200 GPUs. Thank you to Kedar Pandurang Potdar, Sridhar Ramaswamy, Kyle Kranen, ptrblck, the NVIDIA inference team, NVIDIA Dynamo team, NCCL team, as well as the Nvidia firmware/driver team for helping validate and optimize Blackwell and Hopper configurations and for fixing bugs with a fast time to resolution. + +We also want to recognize the SGLang, vLLM, and TensorRT-LLM maintainers for building a world-class software stack and open sourcing it to the entire world. Furthermore, we want to thank Simon Mo, Kaichao You, Michael Goin, and Robert Shaw whose help was invaluable for resolving a few critical Blackwell bugs. + +Finally, we're grateful to Crusoe, CoreWeave, Nebius, TensorWave, Oracle and TogetherAI for supporting open-source innovation through compute resources, enabling this project and we are thankful to the broader community for pushing inference benchmarking forward. + +## We are Hiring + +We are looking for an engineer to join our special projects team. This is a unique opportunity to work on high-visibility special projects such as InferenceMAX with support from many industry leaders and CEOs. 
If you're passionate about performance engineering and system reliability, and want to work at the intersection of hardware and software, this is a rare chance to make an industry-wide impact. + +**What you'll work on:** + +- Building and running large-scale benchmarks across multiple vendors (AMD, NVIDIA, TPU, Trainium, etc.) +- Designing reproducible CI/CD pipelines to automate benchmarking workflows +- Ensuring reliability and scalability of systems used by industry partners + +**What we're looking for:** + +- Strong skills in Python +- Background in Site Reliability Engineering (SRE) or systems-level problem solving +- Experience with CI/CD pipelines and modern DevOps practices +- Curiosity about GPUs, TPUs, Trainium, multi-cloud, and performance benchmarking + +Link to apply: [https://app.dover.com/apply/SemiAnalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1](https://app.dover.com/apply/SemiAnalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1) + +## InferenceMAX Initiative Supporters + +The InferenceMAX initiative is supported by many major buyers of compute and prominent members of the ML community, including those from OpenAI, Microsoft, the PyTorch Foundation, etc.: + +> _"As we build systems at unprecedented scale, it's critical for the ML community to have open, transparent benchmarks that reflect how inference really performs across hardware and software. InferenceMAX's head-to-head benchmarks cut through the noise and provide a living picture of token throughput, performance per dollar, and tokens per Megawatt. This kind of open source effort strengthens the entire ecosystem and helps everyone, from researchers to operators of frontier datacenters, make smarter decisions."_ +> +> -- Peter Hoeschele, VP of Infrastructure and Industrial Compute, OpenAI Stargate + +> _"Open collaboration is driving the next era of AI innovation. The open-source InferenceMAX benchmark gives the community transparent, nightly results that inspire trust and accelerate progress. 
It highlights the competitive TCO performance of our AMD Instinct MI300, MI325X, and MI355X GPUs across diverse workloads, underscoring the strength of our platform and our commitment to giving developers real-time visibility into our software progress."_ +> +> -- Dr. Lisa Su, Chair and CEO, AMD + +> _"Inference demand is growing exponentially, driven by long-context reasoning. NVIDIA Grace Blackwell NVL72 was invented for this new era of thinking AI. NVIDIA is meeting that demand through constant hardware and software innovation to enable what's next in AI. By benchmarking frequently, InferenceMAX gives the industry a transparent view of LLM inference performance on real-world workloads. The results are clear: Grace Blackwell NVL72 with TRT-LLM and Dynamo delivers unmatched performance per dollar and per megawatt -- powering the most productive and cost-effective AI factories in the world."_ +> +> -- Jensen Huang, Founder & CEO, NVIDIA + +> _"Speed is the moat. InferenceMAX's nightly benchmarks match the speed of improvement of the AMD software stack. It's fantastic to see AMD's MI300, MI325, and MI355 GPUs performing so well across diverse workloads and interactivity levels."_ +> +> -- Anush Elangovan, VP GPU Software, AMD + +> _"InferenceMAX highlights workloads that the ML community cares about. At NVIDIA, we welcome these comparisons because they underscore the advantage of our full-stack approach -- from GPUs hardware to NVLink networking to NVL72 Rack Scale to Dynamo disaggregated serving that consistently delivers industry-leading inference performance and ROI at scale."_ +> +> -- Ian Buck, VP & GM, Hyperscale, NVIDIA & Inventor of CUDA + +> _"InferenceMAX's nightly results highlight the rapid pace of progress in the AMD software stack. It's exciting to witness the birth of an open project that provides a tied feedback loop between what the software team works on here at AMD and how it affects specific ML use cases across our MI300, MI325, and MI355 GPUs. 
I'm looking forward to see what's next for InferenceMAX and to showcase what the AMD platform can do. AMD GPUs will continue to get faster every week."_ +> +> -- Quentin Colombet, Senior Director, AMD, Ex-Brium CEO + +> _"Our mission at Azure is to give customers the most performant, efficient, and cost-effective cloud for AI. SemiAnalysis InferenceMAX supports that mission by providing transparent, reproducible benchmarks that track inference performance across GPUs and software stacks under realistic workloads. This continuous data on throughput, efficiency, and cost per watt strengthens our ability to tune Azure's inference platform for scale, helping customers build with confidence on Microsoft Cloud."_ +> +> -- Scott Guthrie, Executive Vice President, Microsoft Cloud & AI + +> _"At Microsoft, delivering the best inference performance and economics for our customers at scale requires a deep understanding of how AI models interact with real-world hardware and software. Open-source, reproducible benchmarks, like InferenceMAX, are essential for generating transparent insights into throughput, efficiency, and cost under realistic workloads. These continuous signals help guide our platform strategy, enabling us to optimize the entire stack from silicon, to systems, to software, so that every layer works together to unlock the full potential of our infrastructure."_ +> +> -- Saurabh Dighe, Corporate Vice President, Azure Strategic Planning & Architecture + +> _"The gap between theoretical peak and real-world inference throughput is often determined by systems software: inference engine, distributed strategies, and low-level kernels. InferenceMAX is valuable because it benchmarks the latest software showing how optimizations like FP4, MTP, speculative decode, and wide-EP actually play out across various hardware. 
Open, reproducible results like these help the whole community move faster."_ +> +> -- Tri Dao, Chief Scientist of Together AI & Inventor of Flash Attention + +> _"The industry needs many public, reproducible benchmarks of inference performance. We're excited to collaborate with InferenceMAX from the vLLM team. More diverse workloads and scenarios that everyone can trust and reference will help the ecosystem move forward. Fair, transparent measurements drive progress across every layer of the stack, from model architectures to inference engines to hardware."_ +> +> -- Simon Mo, vLLM Project Co-Lead + +> _"The benchmark is good sir"_ +> +> -- Michael Goin, vLLM maintainer + +> _"InferenceMAX benchmark is pogchamp & W in chat"_ +> +> -- Kaichao You, vLLM Project Co-lead + +> _"InferenceMAX demonstrates how an open ecosystem can operate in practice. Many leading inference stacks such as vLLM, SGLang, and TensorRT-LLM are built on PyTorch, and benchmarks like this show how innovations across kernels, runtimes, and frameworks translate into measurable performance on a range of hardware platforms, including NVIDIA and AMD GPUs. By being open source and running nightly, InferenceMAX offers a transparent, community-driven approach to tracking progress and providing PyTorch users with data-driven insights."_ +> +> -- Matt White, Executive Director, PyTorch Foundation + +> _"Oracle Cloud Infrastructure is built to give frontier labs & enterprises flexibility and choice, with many GPU SKUs available for AI at scale. InferenceMAX strengthens that mission by delivering open source, reproducible benchmarks that reflect real-world performance, efficiency, and cost on the latest hardware and software. 
With this transparency, customers can confidently select the platforms that best align with their AI strategies."_ +> +> -- Jay Jackson, Vice President, Oracle Cloud Infrastructure + +> _"InferenceMAX raises the bar by delivering open, transparent benchmarks that track how inference really performs across the latest GPUs and software stacks. For customers, having reproducible data that measures real world tokens per dollar & tokens per watt, turns abstract marketing numbers into actionable insight. At CoreWeave, we support this effort because it brings clarity to a fast-moving space and helps the entire ecosystem build with confidence."_ +> +> -- Peter Salanki, CTO, CoreWeave + +> _"InferenceMAX sets a new standard by providing open, transparent benchmarks that reveal how inference performs across today's leading GPUs and software stacks. With reproducible data measuring real-world tokens per dollar and tokens per watt, customers can move beyond marketing claims to actionable insights. For us at Nebius, as a full-stack AI cloud provider, this initiative helps us build our inference platform with confidence and ensure we are aligned with the ecosystem."_ +> +> -- Roman Chernin, co-founder and Chief Business Officer, Nebius + +> _"At Crusoe, we believe being a great partner means empowering our customers with choice and clarity. That's why we're proud to support InferenceMAX, which provides the entire AI community with open-source, reproducible benchmarks for the latest hardware. By delivering transparent, real-world data on throughput, efficiency, and cost, InferenceMAX cuts through the hype and helps our customers confidently select the very best platform for their unique workloads."_ +> +> -- Chase Lochmiller, Co-Founder & CEO, Crusoe + +> _"Supermicro is excited about the launch of InferenceMAX, the SemiAnalysis benchmarking system that measures real-world throughput, performance per dollar, and energy efficiency. 
This open-source tool provides reproducible benchmarks running on the latest hardware and software enabling AI labs and enterprises to choose the best platforms at scale."_ +> +> -- Charles Liang, Founder & CEO, Supermicro + +> _"At TensorWave, we're building a next-generation cloud on AMD GPUs because we believe innovation thrives when customers have strong alternatives. InferenceMAX reinforces that vision by providing open source, reproducible benchmarks that track throughput, efficiency, and cost across the latest hardware and software. By cutting through synthetic numbers and highlighting real-world inference performance, it helps customers see the full potential of AMD platforms for AI at scale."_ +> +> -- Darrick Horton, CEO, TensorWave + +> _"Vultr is committed to providing an open ecosystem that gives developers freedom in how they build and scale AI -- whether on NVIDIA or AMD GPUs. With InferenceMAX, customers gain open, reproducible benchmarks that deliver clear insights into throughput, efficiency, and cost across cutting-edge hardware and software. By showcasing real-world performance, we empower teams to confidently choose the right platform for their AI workloads."_ +> +> -- Nathan Goulding, SVP of Engineering, Vultr + +## The Fundamental Trade-off between Throughput (tok/s/gpu) & Latency/Interactivity (tok/s/user) + +The fundamental trade-off that comes with serving LLMs at scale is that of throughput versus interactivity (measured in units of tokens per second per user). Throughput is the rate at which each GPU can process tokens (tok/s/gpu), whereas interactivity describes the rate at which tokens are generated for each individual user (tokens/sec/user). Put simply, you can serve individual users fast and efficiently, usually by serving fewer users at a time, but doing so will come with the cost of lower overall GPU throughput. 
+ +This trade-off exists because LLM inference relies on matrix multiplications that benefit from batching multiple requests together -- that is, serving many more users at the same time. Large batches enable better GPU utilization and higher token throughput, but they split available resources across more requests, slowing down token processing per user. Conversely, small batches concentrate GPU resources on fewer requests -- that is, fewer users, delivering high interactivity at the expense of overall throughput. In practice, most providers aim for a balance between these extremes. The optimal point on this trade-off depends on the use case: some applications prioritize responsiveness while others prioritize throughput. However, the target interactivity level translates directly to cost of inference. Higher interactivity means higher costs. + +Owning or renting a GPU system for inference typically comes with a fixed $/hour cost. Thus, as interactivity increases and overall throughput decreases, fewer tokens are processed per hour, driving up the unit cost per token (measured in cost per million tokens). To maintain profitability, providers must set their price per token above their cost to serve. This means that higher interactivity use cases will need higher prices per token to support this higher cost, while high throughput applications can be served at a lower price. + +A simple analogy illustrates the entire trade-off. A metro bus and a Ferrari may have a very similar absolute dollar cost of ownership, but the bus amortizes that cost across dozens of passengers while the Ferrari serves only one or two. The Ferrari delivers superior responsiveness with immediate departure, direct routes, and a premium experience, but at a fundamentally higher cost per passenger. LLM serving operates under a similar constraint. + +
+ +## Pareto Frontier Curve + +There is always a trade-off between throughput and latency. To identify the Pareto Frontier Curve, we try to find every data point P such that there is no point that is better than point P in both throughput and latency. This means data point P is **Pareto optimal**, i.e. no other point improves one axis without sacrificing the other. When we connect the Pareto-optimal points, we get the Pareto Frontier Curve. + +
+ +## InferenceMAXv1 Benchmark Methodology + +Providing benchmarks that reflect the full spectrum of possibilities across many levels of interactivity for different GPUs, inference engines and workloads is a core goal of InferenceMAX. In this section we will describe how the benchmark methodology is designed to meet this goal. + +For each benchmark run, we set up an inference server and a benchmark client. An inference server listens for requests and processes them. We use vLLM, SGLang, or TRT-LLM depending on the model. For benchmark clients, we use the vLLM benchmark serving script with vLLM dependencies removed. A benchmark client sends requests, records the runtime, and saves metrics related to the inference job. + +For now, we opted to benchmark requests consisting of random sequences, which avoids prefix caching, because taking prefix caching into consideration is complex. Prefix caching varies significantly by workload and requires a careful survey of request patterns in order to pick representative prefix ratios. In future iterations of InferenceMAX, we will be using datasets like ShareGPT instead of random data. We set the request rate to infinite and set the max number of concurrent requests, so we capture the inference server behavior when processing a specific number of requests. We also set the total number of requests to be sufficiently large so that cold start instabilities, e.g. JIT compile time, are amortized. + +For input / output sequence lengths, we converged on three sets: 1024 input tokens / 1024 output tokens representing chat workloads, 1024 input tokens / 8192 output tokens representing reasoning workloads, and 8192 input tokens / 1024 output tokens representing summarization workloads. To mimic real-world requests having different input sequence lengths, we randomly vary each request's input length from 80% to 100% of the specified input sequence length. 
+ +The config options for a benchmark run are as follows: + +- **Model**: LLaMA 70B, DeepSeek R1, gpt-oss 120B +- **Precision**: MXFP4 weights, FP8, FP4 +- **GPU**: H100, H200, B200, GB200 NVL72, MI300X, MI325X, MI355X +- **Open source Frameworks**: [vLLM](https://github.com/vllm-project/vllm), [SGLang](https://github.com/sgl-project/sglang), [TRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) +- **Parallelism**: 1, 2, 4, 8, etc. +- **Max concurrency**: 4, 8, 16, 32, 64, etc. + +Starting with models, we picked LLaMA3 70B to represent dense enterprise model deployments. + +To benchmark sparse MoE models, we decided on DeepSeekV3 670B. In terms of arithmetic intensity, approximate active, total parameter count, and memory access patterns, DeepSeekV3's model architecture is the model that best matches frontier closed models such as OpenAI's 4o/5 model architecture. Thus, DeepSeek is the best proxy model for benchmarking to what OpenAI's internal model architecture likely is. + +For smaller, sparse MoE models, we decided on GPT-OSS 120B MoE because it is the closest to GPT-5 mini in terms of arithmetic intensity, approximate active, total parameter count, and memory access patterns. + +We are benchmarking FP8, FP4 and MX4 weights across the models depending on whether the hardware supports it. We sweep through different max concurrency users (a concept similar to batch size) to plot the full throughput and latency curve. We also sweep through different model parallelism schemes because larger model parallelism can reduce memory loading time, which in turn increases throughput at a low latency regime to a certain extent to find the pareto frontier curve. + +To prevent a [restart of the SGLang vs vLLM benchmark wars](https://x.com/dylan522p/status/1920638653677596836) and to save compute time, we decided to first pick only one of vLLM or SGLang as the default engine for each model. 
Back in July, we let AMD & Nvidia know that we would be using SGLang for DeepSeek 670B, vLLM for Llama3 70B as well as vLLM for Llama4. We have since replaced Llama4 with GPT-OSS 120B since nobody uses Llama4 and GPT-OSS 120B more closely matches a smaller "mini" frontier model. + +We want server configs to reflect real world deployments as much as possible, thus we have asked AMD and Nvidia to submit configs that are decently close to what their documentation guides reference when they discuss how to deploy these models on their hardware: + +- [https://docs.nvidia.com/llm-inference-quick-start-recipes/index.html](https://docs.nvidia.com/llm-inference-quick-start-recipes/index.html) +- [recipes.vllm.ai](https://docs.vllm.ai/projects/recipes/en/latest/) +- [https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-vllm-gpt-oss-120b.html](https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-vllm-gpt-oss-120b.html) + +We hadn't clearly specified whether warmup was allowed for InferenceMAX, so Nvidia has included a warmup phase in their SGLang DeepSeek submission to handle certain JIT-compiled kernels. Towards the end of our work in developing the benchmarks, AMD noticed the above fact regarding Nvidia's submission and asked if warmup was permitted, since they hadn't realized they could do the same. After some discussion between AMD, Nvidia, and the SemiAnalysis AI Engineering team, all parties agreed that warmup would be disallowed for now, and that the DeepSeek benchmark length would instead be extended by up to 5x to ensure fairness. The confusion experienced was our fault for not being explicit from the start about warmup rules. We plan to revisit the topic after launch given that in real-world production inference, warmup often occurs before the Kubernetes control plane reports a pod as healthy. 
+ +## Discussion: Strategies for Serving DeepSeek R1 + +We allow providers to optionally submit disaggregated serving configs for DeepSeek R1. Disaggregated serving assigns the two stages of inference, prefill and decode, to different GPU resources. By separating the two stages, requests at different stages won't interfere with each other, enabling better SLA guarantees, especially at high concurrency scenarios. + +We additionally combined disaggregated serving with large scale expert parallelism (wide EP). Wide EP is enabled by multiple techniques, most notably DeepEP. DeepEP provides two dispatch modes: normal and low latency. Normal mode specializes in improving throughput of the prefill stage, while low-latency mode is tailored for lowering the latency of decode stage. + +For disaggregated serving DeepSeek R1, we also received submissions with multi-token prediction (MTP) enabled. DeepSeek R1 implements MTP, where the model is trained to predict multiple tokens every forward pass with the help of additional MTP modules. According to DeepSeek, training with MTP improves the model's planning abilities. In addition, using MTP modules during inference boosts token throughput with minimal model quality loss. + +
+ +Nvidia has submitted runs for DeepSeek R1 on GB200 NVL72 with disaggregated serving, wide EP, and MTP. Nvidia has also submitted specific configs to plot out the Pareto frontier, and we plan to expand to sweep a larger config space in the future. + +
+ +When serving DeepSeek R1, SGLang offers multiple parallelism strategies, including **tensor parallel (TP)**, **data parallel (DP)**, and **expert parallel (EP)**. Parallelism strategies split up work between GPUs to lower the memory usage per GPU and improve hardware utilization. + +
+ +Typically, we use tensor parallel to split up work in the attention layer along the number of heads dimension, which is typically 128. However, this doesn't fit well with DeepSeek R1 because it uses Multi-Latent Attention (MLA), a special type of attention where there is only one KV head, leading to KV cache duplication. To tackle this issue, SGLang uses data parallel attention for lower interactivity and splits work along the batch dimension, removing the need to duplicate KV cache and reducing communication load. + +DeepSeek R1 also has a lot of expert layers, so we apply expert parallel and assign each GPU a set of expert layers. This lowers the memory usage at the cost of higher communication load. + +
+ +## Architecture of InferenceMAX + +InferenceMAX uses GitHub Actions to orchestrate benchmark runs. A GitHub Action runs each benchmark config as a [job](https://docs.github.com/en/actions/get-started/understand-github-actions#jobs) and executes it on a [runner](https://docs.github.com/en/actions/get-started/understand-github-actions#runners). We hook GPU servers into GitHub Actions as runners, so they listen to requests and execute jobs. When executing a job, a runner will execute a runner launch script written for that server, which in turn uses Docker or SLURM, depending on the server setup. The launch script will then execute the benchmark script, which contains the concrete benchmark config. + +We define the logic of parallelism strategies + max concurrency benchmark sweep as a parameterized [workflow](https://docs.github.com/en/actions/get-started/understand-github-actions#workflows), and we incrementally compose the workflow to execute all GPU types for all models and GPUs, as well as different input / output sequence lengths. + +
+ +## Performance Results -- Throughput vs E2E Latency/Interactivity (tok/s/user) + +Below is the performance snapshot for the nightly run on October 7th, 2025 at the time of writing this article. For the full set of nightly results, visit our dashboards on [http://inferencemax.ai/](http://inferencemax.ai/). + +When interpreting throughput vs latency/interactivity graphs, keep in mind that most practical applications operate somewhere between the extremes. Benchmark results measuring only one or a limited level of throughput or interactivity can sometimes be misleading. + +For instance, if GPU A delivers 4x the throughput of GPU B at a given interactivity level -- take 5 tokens/s/user as an example for a human facing AI chatbot application, the fact that this interactivity level is far too slow to be practical means that this performance difference has little real-world significance. Instead -- a realistic interactivity level should be chosen for the given applications. + +Later in this report, we will also be normalizing throughput of GPUs by the total cost of ownership (TCO) of those GPUs. + +TCO per million tokens is the true north star that customers care about -- performance is merely a stepping stone to calculating this metric. For example, a B200 could deliver 1.5x higher throughput than an MI355X, but if it has 2x the TCO per hour -- the MI355X would be the better choice as it delivers better performance per TCO even if the MI355X delivers lower absolute performance in terms of throughput per GPU. + +Let's step through a few benchmark examples to explain how to analyze the results. + +In our first result, the H100 vLLM vs MI300X ROCm 7.0 vLLM comparison for Llama 3.3 70B FP8 in our reasoning scenario (1k in/ 8k out) shows a strong MI300X performance especially at low interactivity levels (20 to 30 tok/s/user) due to the MI300X's better memory bandwidth and memory capacity advantages when running at TP1. + +
+ +
+ +We are seeing competitive results comparing the H200 and the MI325X on vLLM GPT-OSS 120B with MX4 weights for a summarizing workload. The MI325X has an advantage over H200 for interactivity below 110 tok/s/user and still is somewhat competitive with Nvidia for levels above 110 tok/s/user. + +
+ +
+ +
+ +When it comes to LLaMA 70B FP4, B200 significantly outperforms MI355X across all three workload types in terms of throughput performance. This shows that AMD's FP4 kernels have room for improvement. + +Moving on to the B200 (vLLM and TRT-LLM) vs. MI355X vLLM for GPT-OSS 120B, we can see that MI355X is competitive with B200 vLLM after normalizing by TCO. In the next section, we will see that, across some interactivity ranges, the MI355X delivers better perf per TCO than Nvidia. The throughput-latency graph appears to show a tighter race, with the MI355X never more than ~15 seconds slower than the B200 for a given tok/s/gpu throughput. The most practical range of interactivity we see in the real world is approximately 150-200 tok/s/user for GPT-OSS 120B. + +
+ +
+ +Moving to DeepSeek 670B MoE FP8, when comparing the MI325X on SGLang vs. the H200 on SGLang, we observe that the MI325X lags significantly in both latency and interactivity for a given level of throughput. The H200 on SGLang serves inference consistently at approximately 40% lower latency than the MI325X for comparable throughputs. Furthermore, we also see a steady gap when comparing the Pareto frontier of their interactivities. Comparing the MI355X on SGLang and the B200 on SGLang tells a similar story to our MI325X vs H200 comparison. There appears to be lots of room for improvement for AMD when it comes to SGLang images. + +We can also see that for GB200 NVL72 SGLang Dynamo FP8 rack scale inferencing, it is not yet optimized and there is still room for improvement. + +
+ +
+ +Moving on to FP4 DeepSeek 670B MoE, we see that GB200 NVL72 rack scale TRT-LLM inference beats single node SGLang inference by a wide margin. We look forward to benchmarking wideEP + disagg prefill on multi-node 8-way machines over the next couple of months. + +
+ +
+ +Next, we compare GB200 with Multi-Token Prediction (MTP) On and Off for DeepSeek R1 in an 8K input / 1K output scenario -- an input/output ratio that is meant to reflect a summarization use case. The MTP On benefit is particularly noticeable comparing throughput vs. interactivity. Between the range of 70-140 tok/s/user, we see significantly higher throughput/GPU for the MTP On scenario when compared to the MTP Off -- up to 2-3x the throughput for some iso-interactivity (tok/s/user). + +
+ +
+ +## Performance Results -- TCO Per Million Tokens Versus Interactivity (tok/s/user) + +However, comparing token throughput per GPU is just one of a few data points needed to get to the real bottom line, namely total cost of ownership (TCO) per token. + +ML inference engineers typically measure this in units of TCO per million tokens. To get from throughput per GPU to TCO per million tokens, we must normalize by the total cost of ownership in units of USD/hr/GPU when comparing chips to chips. For example, if a B200 delivered 1.5x higher throughput than the MI355X but had 2x the TCO per hour -- the MI355X would be the better choice, even if it delivers lower absolute performance. + +At our InferenceMAX portal, located at [http://inferencemax.ai/](http://inferencemax.ai/), we have estimated the TCO per million vs. latency/interactivity for various customers segments such as: + +- Hyperscalers and Tier 1 Frontier Labs Buying & Owning Chips (4-year Economic Useful Life) +- Neocloud Giants and Giant Managed Inference providers that plan to own their own chips (4-year Economic Useful Life) +- Renting GPUs from Neoclouds with 3 Year contract, with 25% upfront payment + +Modeling Total Cost of Ownership per Token is no mean feat and it involves multiple SemiAnalysis teams and practice areas. In the AI Token Factory Economics stack, we show all the assumptions that are used to derive this north star metric, as well as the SemiAnalysis Models used to determine these quantities. + +
+ +In particular, the [SemiAnalysis AI TCO Model](https://semianalysis.com/ai-cloud-tco-model/) provides comprehensive modeling of total cost of ownership for combinations of various AI server solutions and networking architectures (i.e. InfiniBand vs SpectrumX vs Arista Ethernet vs WhiteBox Ethernet) and is the main source for the total cost of ownership per GPU as well as Neocloud rental market prices used in InferenceMAX. + +The SemiAnalysis GPU Cloud Market Rental Price Report is built on surveys with 70+ GPU Clouds and 100+ end users that rent from GPU clouds. In the future, we will explore implementing dashboards on the InferenceMAX.ai portal for different rental pricing contract lengths like 1 year or 1 month. We also plan on allowing custom inputs such that you can input your own $/GPU/hr quotes to determine the GPU that best matches your interactivity targets and costs. + +In our analysis below, we focus on cost per million tokens for Hyperscaler tier operators that are owning chips and underwrite their business case to a 4-year economic lifespan. + +We see that across all interactivity levels, the cost per million tokens on MI325X on vLLM beats the cost per million tokens on the H200 using vLLM. When we bring in Nvidia's (mostly) open source TRT-LLM, we see that the H200 current software stack wins against the MI325X using today's vLLM stack. + +
+ +When we compare the B200 on vLLM vs the MI355 on ROCm 7.0 vLLM when running Llama3 70B FP4 on reasoning input/output length scenarios, the B200 currently outperforms the MI355. This also illustrates our suggestion that AMD focuses more on optimizing FP4 for Llama3. + +
+ +For GPT-OSS 120B FP4 summarization tasks, we see that the MI355X on vLLM has a lower TCO per million tokens than the B200 on vLLM and can even beat B200 on TRT-LLM when interactivity is below 225 tok/s/user. For interactivity levels greater than 225 tok/s/user, we see that the B200 on TRT-LLM as well as other inference engines are more optimized and deliver lower TCO per performance than the MI355X on vLLM. + +
+ +On GPT-OSS 120B with MX4 weights, we see very strong performance per TCO from the MI300X compared to the H100 across the entire interactivity range. + +
+ +For gpt-oss 120B using MX4 weights, the H200 on TRT-LLM is neck and neck with the MI325X in terms of perf per TCO for interactivity levels less than 135 tok/s/user. Above this level, the MI325X vLLM takes the lead in terms of TCO per million tokens compared to H200 TRT-LLM. + +What is surprising about this result is that true open source vLLM for Hopper is faster than "mostly" open source TRT-LLM hopper. Even the MI325X on vLLM beats the H200 on TRT-LLM for interactivity levels greater than 135 tok/s/user. + +
+ +Moving on to DeepSeek 670B MoE using FP8, we see that when we hold TCO per million tokens constant, the B200 on SGLang delivers 1.5x faster interactivity as compared to the MI355X on SGLang. We note that there is still lots of optimizations currently in ROCm AITER that ROCm is integrating into SGLang, and so we expect that SGLang DeepSeek 670B MoE performance per TCO will improve soon. + +When holding interactivity constant at ~35 tok/s/user, the GB200 NVL72 beats everything else, delivering 4x better TCO per million tokens. We note that the Dynamo team has so far only had time to implement optimizations sufficient to lower the parallelism cost pareto frontier at the 30 tok/s/user region. There is still room for them to further optimize to push down the cost pareto frontier for interactivity levels of around 40 and above for the GB200 NVL72 using FP8. + +
+ +Moving on to DeepSeek R1 using FP4 for a summarization use case, we see that below 90 tok/s/user interactivity, the GB200 NVL72 on the TRT-LLM engine using Dynamo disagg prefill decisively outperforms all single node 8-GPU servers on TCO per million tokens. Interestingly, for interactivity levels above 90 tok/s/user, the B200 on TRT-LLM beats the GB200 NVL72. However, as it stands now, a single node B200 server can drive better TCO per performance than the GB200 NVL72 for high interactivity use cases. + +
+ +In the benchmark below focused on a reasoning use case, we see that the B200 on SGLang currently outperforms the MI355X on SGLang. + +
+ +For the summarization scenario, the GB200 using today's TRT-LLM Dynamo software outperforms a B200 single node for interactivity levels under 80 tok/s/user. Comparing the MI355X on SGLang to the B200 on SGLang, we see that the B200 delivers better TCO per million tokens. + +
+ +We also benchmarked workloads running on FP4 using Multi-Token Prediction (MTP), which is a feature implemented by the DeepSeek team during training. We see that when holding TCO per million tokens constant, MTP can deliver 2-3x greater interactivity (tok/s/user) than without MTP for that given cost level. Indeed, most frontier labs and tier 1 managed DeepSeek REST API endpoint providers have already enabled MTP for production workloads. + +
+ +## Estimated Token Throughput per All In Provisioned Utility Megawatt vs Interactivity (tok/s/user) + +Power is the ultimate constraint for AI infrastructure. Every datacenter operates within a finite power envelope, usually measured in Megawatts (MW). This directly determines how much useful computation, and ultimately, how many tokens can be produced by a given datacenter. + +Inference economics can also be analyzed not only through a lens of GPU performance in terms of throughput/GPU vs TCO, but also in terms of throughput per power as measured in terms of tokens/s per all-in provisioned MW of total utility power. Total utility power encompasses power requirement for GPUs, CPUs, networking equipment, other related cluster IT equipment as well as facility overhead. Facility overhead includes items such as electrical distribution losses and power expended on cooling equipment such as chillers, CDUs and cooling towers, among others. The greater the number of tokens processed per MW, the greater the potential revenue and profit per unit of energy. Please note that for InferenceMAX, we use all-in provisioned utility MW, which accounts for the aforementioned facility overhead, as opposed to tokens per all-in Critical IT MW, which does not account for facility overhead. These differ across sites, but we chose a representative for the industry based on our [AI TCO model](https://semianalysis.com/ai-cloud-tco-model/) and [Datacenter model](https://semianalysis.com/datacenter-industry-model/). + +Do note that colocation rent and electricity cost typically make up less than 20% of the total cost of ownership. This means that if a given GPU delivers 20% lower tokens per MW compared to another GPU, this would only translate to a delta of less than 4% of the total cost of ownership (i.e. 20% \* 20% = 4%). The lion's share of the TCO contribution is from the gross margins each GPU hardware vendor charges. Some charge up 75% gross margins (i.e. 
a 4x markup over cost of goods sold), while others less than 50% gross margins (i.e. less than 2x cost of goods sold). + +We use rate units -- i.e. token/s per MW as opposed to an accumulated amount of energy per token such as Joules per Token. This is because datacenter capacity is commissioned in terms of Megawatts (MW), which is a rate unit and is equivalent to 1 megajoule (MJ) per second. If we integrate the rate unit over a given time period -- we get the absolute quantity of energy consumed over that period. + +For now, we build up our estimate for MW needed for a given cluster by adding up Thermal Design Power (TDP) of each component in the datacenter. TDP is not the same as expected average power. Using an example to explain, for memory bandwidth bounded decode workloads, power consumption of the system should never reach TDP and will instead hover at a lower power level -- the expected average power. In the future, we will benchmark the actual power consumption of each system (and networking equipment) through ipmitool. Only then will we pivot to an accumulated quantity of energy per token. + +We estimate throughput per provisioned power based on the raw InferenceMAX results combined with data on total utility power for AI clusters from our [AI Datacenter Industry Model](https://semianalysis.com/datacenter-industry-model/). This model quantifies total utility power through power-normalized estimates across vendors, architectures, and inference stacks. Full estimates and ongoing nightly benchmarks are available at [InferenceMAX.ai](https://inferencemax.semianalysis.com/). 
+ +## Performance per MW Results + +We see that for gpt-oss 120B, using MX4 weights for reasoning scenarios (1K input tokens / 8K output tokens) at the 90 tok/s/user interactivity level, the MI300X is able to process 750,000 token/s per all in provisioned MW (again this is measured per utility MW, and not per MW of Critical IT Power) while the MI355X is able to process 2,550,000 token/s per all in provisioned MW. This represents a ~3x improvement in power efficiency from the CDNA3 generation to the CDNA4 generation. + +
+ +We see a similar trend when comparing across generations for the Nvidia camp. Looking at the HGX H100 vs the HGX B200 for gpt-oss 120B using FP4 weights, an H100 can process 900,000 token/s per MW while a B200 can process 2.8M token/s per MW -- ~3x better power efficiency on a B200 vs an H100. When we look at even higher interactivity levels of around 180 tok/s/user, the B200 delivers an eye-popping 7x power efficiency gain. + +
+ +Let's compare power efficiency for GPUs of the same generation across AMD and Nvidia. We will first look at tokens/s per provisioned all in utility MW for GPTOSS 120B. Based on our initial InferenceMAX result snapshot below, we see that Blackwell is 20% more energy efficient compared to the CDNA4 architecture when measured by this throughput per power metric. A large factor in this divergence is the fact that the MI355X has a much higher TDP for the GPU alone at 1.4kW/GPU vs 1kW/GPU for the B200. + +
+ +In our next benchmark, we look at tokens per power at an interactivity level of 30 tok/s/user for DeepSeek R1. When comparing a single node H200 FP8 vs a GB200 NVL72 FP4 (without Multi Token Prediction), the GB200 NVL72 delivers an ~8x improvement in token/s processed per all-in provisioned MW. Note that both the H200 and B200 results are for single nodes. We will explore the potential for greater token throughput per MW for the B200 and H200 that can be unlocked by implementing disaggregated prefill and wide expert parallelism over SpectrumX as well as InfiniBand. SGLang's [GB200 NVL72 analysis](https://lmsys.org/blog/2025-09-25-gb200-part-2/) shows that 8-GPU systems can indeed achieve strong performance gains through implementing wide expert parallelism. However, the SGLang blog also shows that GB200 NVL72 still beats Hopper even when both implement disaggregated prefill and wide EP. + +
+ +Staying on DeepSeek, but turning to FP8, we see that the GB200 also dominates all the single-node systems on tok/s/gpu vs tok/s/user. We note that there are some nuances here -- the B200 and the MI355X are both running single node SGLang even though for DeepSeek, vLLM could deliver better results than SGLang on MI355X. We will explore adding DeepSeek on vLLM for the MI355X and/or adding SGLang multi-node wideEP to all of the 8-GPU servers as well. Furthermore, as we called out earlier, note that the Dynamo team has only had time to implement optimizations sufficient to achieve a shift lower in the parallelism pareto frontier up to around 30 tok/s/user. Further optimization can be done to push the pareto frontier lower, and thus lift the throughput per power up on GB200 NVL72 FP8 for higher interactivity levels. + +
+ +## AMD Bugs and NVIDIA Blackwell Bugs + +There were a few Blackwell bugs that were quite interesting to troubleshoot. The first bug was that the Blackwell vLLM image we started using back in July 2025 would lead to the instance stalling for up to 30 minutes on our bare metal B200 machines. This was especially challenging to replicate and debug as other people tried using the exact same image on their Blackwell cluster without encountering any hanging issues. + +The first tool we turned to in order to debug this hanging issue was [py-spy](https://github.com/benfred/py-spy), a python profiler, to collect a trace. We noticed that it was stuck on [ncclCommInitRank](https://github.com/NVIDIA/nccl/blob/8d26308e6aba7f1667b24a861b5dc73f0f2e1f40/src/init.cc#L1974) which is strange - as many ML performance engineers know, this function should run very quickly on a single node. Another thing to note was that vLLM was using their [own custom FFI bindings](https://github.com/vllm-project/vllm/blob/3d1f67616da88cbf0033bf5027cc0c6e5e9cacf6/vllm/distributed/device_communicators/pynccl_wrapper.py#L144) to NCCL due to [various technical reasons](https://github.com/vllm-project/vllm/blob/3d1f67616da88cbf0033bf5027cc0c6e5e9cacf6/vllm/distributed/device_communicators/pynccl_wrapper.py#L4-L23). + +
+ +Reading through vLLM's NCCL bindings, we weren't convinced that the FFI binding was the root cause of issue. Running nvidia-smi, we saw that the GPU_UTIL was not at 100% but instead at 0% - indicating that no kernels were running on the GPU, leading us to conclude this was not a device side NCCL deadlock. + +Next, we used linux [perf](https://perfwiki.github.io/main/) top profiler to look beneath the python layer and try to gain more insight into what specific shared library could be triggering this issue. We noted that most of the CPU cycles for this process (and sub processes) were running on "libnvidia-ptxjitcompiler.so". Reading the docs on "libnvidia-ptxjitcompiler", we came across a description that reads: _"The PTX JIT Compiler library (/usr/lib/libnvidia-ptxjitcompiler.so.575.57.08) is a JIT compiler which compiles PTX into GPU machine code and is used by the CUDA driver"._ This is extremely strange as we are not sure why this is calling the PTX compiler on init given that there are no just in time kernels to compile because typically all the NCCL kernels are prebuilt at build time. + +
+ +We were too ~~lazy~~ busy to rebuild the whole container image to compile NCCL from scratch with debug symbols enabled. Thus, we next used [strace](https://man7.org/linux/man-pages/man1/strace.1.html) to figure out what syscall calls ptxjitcompiler was making in order to dive one layer deeper into which functions are being called. We see that ptxjitcompiler was creating and adding files to ~/.nv/ComputeCache/ inside the container. + +
+ +Peeling back yet another layer of the onion, we read up on what ~/.nv/ComputeCache/ does. According to documentation, it is the cache to convert PTX virtual ISA to SASS machine code. This was also very puzzling to us as typically NCCL is built with the machine code already bundled in addition to the PTX virtual ISA. We started reading the [NCCL build scripts and we noticed that SM100 (Blackwell) wasn't enabled for CUDA 12](https://github.com/NVIDIA/nccl/commit/80f6bda4378b99d99e82b4d76a633791cc45fef0#diff-45a9034a0c75cbfbbb34e853a43f6513c1d4c933eccf6adca705abe234fc1113R42-R49) which was what we were using and found out that they had only enabled it for the upcoming CUDA 13. This means that SM100 SASS was not bundled in and we were JIT converting compute_90 (hopper) PTX to SM100 SASS resulting in the process taking an extremely long time. The reason why other people didn't see this bug when they ran it was that they were using an internal cluster using slurm with a setting that manually mounted their home directory. Since the SASS JIT cache is stored in the home directory, ~/.nv/ComputeCache/, the SASS was already cached! + +It turns out that the vLLM July container image was based on the pytorch container image which used a version of NCCL that didn't have Blackwell SM100 prebuilt. The fix is to use a [post fix version of 2.26.2](https://pypi.org/project/nvidia-nccl-cu12/2.26.2.post1/) that has Blackwell bundled such that we don't waste 30 minutes compiling virtual ISA to machine code. This bug has since been fixed in the latest vLLM container images. Thank you to simon-mo, youkaichao, mgoin, Robert-shaw, ptrblck, and Kedar Potdar for helping implement the permanent fix and immediate action on the quick resolution. + +
+ +Another Blackwell issue we ran into is that a sub-dependency of vLLM/SGLang, Flashinfer, was running into file lock race conditions. For whatever reason, Nvidia decided that instead of bundling compiled kernels into the container image, they were going to download them at server launch time. Since we have up to 8 processes per node (1 process per GPU), if the code was not process safe, we would run into race conditions while downloading these compiled kernels. + +It turns out that this race condition was introduced due to [an attempt to prevent race conditions from happening](https://github.com/flashinfer-ai/flashinfer/pull/1779)! Instead of relying on the builtin FileLock python package's lock cleanup, flashinfer manually cleans it up which results in a race condition. [This has been patched in Flashinfer](https://github.com/flashinfer-ai/flashinfer/pull/1779) but has not yet been upstreamed to vLLM/SGLang Blackwell release container images. Huge thanks to the Flashinfer team and Kedar Potdar for jumping in and helping debug and patch this in record time - all within 4 hours of getting connected with the team. + +There is another Blackwell bug with respect to Flashinfer changing a build environment flag name to FLASHINFER_CUDA_ARCH_LIST but the Nvidians did not inform the vLLM/SGLang maintainers or contribute their own PR thus for a couple weeks, [vLLM](https://github.com/vllm-project/vllm/pull/25730) and [SGLang](https://github.com/sgl-project/sglang/pull/11226) did not support AOT for flashinfer.
+ +We were seeing that every so often, our Nvidia container toolkit would completely error out and display this message: + +> _"docker: Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running prestart hook #0: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy'_ +> +> _nvidia-container-cli: initialization error: driver rpc error: timed out: unknown"_ + +Trying to use the nvidia-smi in CLI would trigger a stall as well. This indicates that the entire Nvidia driver has actually crashed. After a detailed debug session with the Nvidia firmware/driver team and the Nvidia NCCL team, we found out that there is a slow resource leak bug around since NCCL 2.26 given that we are using CUDA graphs and launching over 500 Blackwell containers per night. + +Because we are stopping and starting so many Blackwell containers, all of these starts and stops accumulate until they eventually crash the driver. The specific cause of the resource leak bug stems from the fact that when CUDA graph is enabled, NCCL will by default enable user buffers. If it wasn't for the resource leakage bug, the NCCL user buffer feature would have reduced the data movement between the application level buffer and the internal NCCL buffer by having NCCL use the application buffer enabling zero-copy. [The temporary fix is to not enable NCCL user buffer](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-graph-register) in the interim period until the bug fix can be rolled out. The ETA for the fix is around Oct 20th, and it is expected to ship as part of a minor update to NCCL 2.28. Thank you to Kedar Potdar and the numerous Nvidia team members for promptly identifying the root cause and fixing the bug with incredible speed and support.
+ +On the AMD front, we ran into fewer bugs while developing InferenceMAX, and these bugs were easier to fix. One such bug was that AMD's CUDNN equivalent, AITER, was crashing in a helper function because it did not account for the fact that "/opt/rocm/llvm/bin/amdgpu-arch" can return not only the bare compute architecture (e.g. gfx942) but also gfx942 with a suffix appended. AITER is meant to pattern match to figure out which architecture it is working with, but it did not account for a suffix being present. It was easy enough to [craft a temporary fix](https://github.com/InferenceMAX/InferenceMAX/blob/3b8879031799cac260ef00bd8911dabbe5982d49/benchmarks/70b_fp8_mi325x_slurm.sh#L39), but there will be a permanent fix coming into AITER in the next couple of weeks. Thank you to Quentin for helping patch this one! + +We also encountered a bug when benchmarking MI355X, where the benchmark runs crashed and dumped 1TB of files named gpucore.XXX. After investigation, we found out the root cause was that the chunked prefill size in the server configs was set too high. Lowering it from 196608 to 32768 fixed the issue ([PR link](https://github.com/InferenceMAX/InferenceMAX/pull/80/files)). + +AMD [recently added pyxis support](https://instinct.docs.amd.com/projects/container-toolkit/en/release-1.1.x/container-runtime/enroot-pyxis-installation.html), which has resulted in a good UX for using containers in SLURM, especially when it comes to multi-node training or multi-node offline batch inference jobs. However, we ran into one bug related to their ROCm 7.0 SGLang image _"rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915"_ which caused a hard crash when trying to run this image through pyxis SLURM. The root cause of this stems from how permissions are handled with some of the layers that make up that docker image, causing permission conflicts between layers. 
The AMD team is looking into how to permanently fix it and prevent such errors from happening again. + +Back in July, when we tried to enable AITER on SGLang for AMD GPUs, the process took 10x longer than normal (~30 minutes in total) given a slow process to finish compilation for DeepSeek V3 ([GitHub issue here](https://github.com/sgl-project/sglang/issues/7826)). This issue was eventually resolved in later releases and is currently fixed. + +## GitHub Action CI/CD Bugs + +GitHub Actions' [self-hosted runner](https://docs.github.com/en/actions/how-tos/manage-runners/self-hosted-runners/add-runners) support provides a straightforward solution for the benchmarks we wanted to run with InferenceMAX. The integration was quick to set up and allowed for running reproducible workflows on various GPU clusters without building custom infrastructure. However, as InferenceMAX began to scale up to include more jobs, some limitations of GitHub Actions were uncovered. + +Each benchmark variation runs as an individual job. For each model, we benchmark different combinations of the following: different GPUs, input/output sequence lengths, precision, tensor parallelism, and concurrency. This creates a [combinatorial explosion](https://en.wikipedia.org/wiki/Combinatorial_explosion) in the number of jobs per workflow as more configurations are added. + +To illustrate: InferenceMAX currently benchmarks 3 models across up to 7 GPU types, 3 distinct ISL/OSL pairs, 2 precision settings, and roughly 4 options each for concurrency and tensor parallelism. Not every model uses all possible configurations, but this worst-case estimate gives us 3 \* 7 \* 3 \* 2 \* 4 \* 4 = 2016 distinct jobs. At this scale, the GitHub Actions workflow visualization hits a limitation: the server times out after ten seconds when attempting to render the DAG, resulting in an [error message](https://github.com/503.html). This makes it extremely difficult to debug the run. 
Our workaround for this involved splitting up the single nightly workflow into three, splitting by ISL/OSL pairs. This reduced jobs per workflow from approximately 1500 to 500, which the server appears to handle reliably. + +Another bug involved a hard limit when using the [download-artifacts@v5](https://github.com/actions/download-artifact) action. At the end of each full sweep workflow, a job runs that collects and aggregates the performance results from all jobs, which are stored as artifacts of the workflow. As part of the collection process, the download-artifacts@v5 action is called. This initializes an [artifact client](https://github.com/actions/toolkit/blob/main/packages/artifact/src/internal/client.ts), which in turn invokes a [list artifacts function](https://github.com/actions/toolkit/blob/main/packages/artifact/src/internal/find/list-artifacts.ts) (needed to list all artifacts and then pattern match to find the requested one) that enforces a hard limit of 1000 for "performance reasons." There allegedly should have been a warning printed when the client tries to list more than 1000 artifacts, but we never observed this behavior. + +
+ +We would like to thank Scott Guthrie for connecting us with the right people at GitHub, and thank those team members for helping us implement temporary workarounds for these bugs. We look forward to continued use of GitHub Actions to create one of the largest GPU CI/CD fleets in the open-source world. + +## Recommendations to Nvidia and AMD + +Even though a great number of users and GPUs are running on SGLang and vLLM, Nvidia has been allocating most of their inference engineers towards working on TensorRT-LLM and have relatively few engineering resources dedicated towards supporting SGLang and vLLM. We recommend that Jensen allocate more inference engineering resources toward supporting and contributing to popular inference engines like vLLM and SGLang. This will allow Nvidia to better fulfill their mission of accelerating workloads no matter which inference engines users select. + +Furthermore, the ML community would benefit from a surge of additional time and resources from Nvidia for QA'ing their Blackwell software to minimize the number of bugs that end users encounter as they ramp applications on these new platforms. When developing InferenceMAX, we ran into many bugs that are only encountered in Blackwell and are not present on Hopper or other platforms. + +On the AMD front, we have suggested that they reduce the number of ROCm specific flags that need to be manually enabled to achieve reasonable performance. AMD has recognized this and has already commenced work on ensuring that optimized configs are set by default. In fact, many changes that act to reduce number of flags needed have already merged into master. + +We recommended the same for Nvidia's Blackwell platform and also suggest that Nvidia work on reducing the number of flags needed to get reasonable performance by moving towards [enabling performance optimizations](https://github.com/vllm-project/vllm/issues/25689) [by default](https://github.com/vllm-project/vllm/pull/25924). 
+ +## InferenceMAX Next Steps + +Over the next couple months, we're expanding InferenceMAX's hardware coverage by integrating Google TPU and Amazon Trainium and we plan on going live within the next two months. This will enable unified, apples-to-apples comparisons across AMD, NVIDIA, Google, and AWS accelerators. This marks an important step toward making InferenceMAX a fully cross-vendor open benchmarking platform for the entire industry. + +Furthermore, we're also introducing nightly evals, including MATH-500 and GPQA-Diamond, on FP4 models, allowing the community to measure throughput vs. quality trade-offs in a consistent, transparent way. This will help highlight how low-precision inference affects accuracy across diverse model families and deployment scenarios. In addition, we will also track output token throughput to provide more extensive insights. + +On the NVIDIA & AMD systems front, several exciting initiatives are underway. We're working on DeepSeek's disaggregated prefill + multi node expert parallelism configurations on MI300- and MI355-series GPUs as well as B200 GPUs, testing how these advanced parallelism optimizations scale across inference workloads. At the same time, we are excited to test both HGX B300 Blackwell Ultra & GB300 NVL72 Blackwell Ultra to see what the performance gains are over GB200 NVL72. + +InferenceMAX is not perfect, but it is our strong belief that we are heading in the right direction toward having a benchmark that matches the pace of AI software progress, and we will continue to integrate feedback from AI chip vendors, frontier labs & major consumers of accelerators. + +Next, we will deep dive into breaking down the different components that make up the TCO of the GPUs currently used in InferenceMAXv1, such as the H100, H200, B200, GB200 NVL72, MI300X, MI325X, and MI355X. 
+ +## Hyperscaler Total Cost Of Ownership - Hopper, Blackwell, GB200 NVL72, MI300X, MI325X, MI355X + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + + + +--- + +_This article continues on our Substack. [Subscribe to SemiAnalysis](https://newsletter.semianalysis.com/subscribe) to read the complete article._ diff --git a/packages/app/content/blog/inferencex-v2-nvidia-blackwell-vs-amd-vs-hopper.mdx b/packages/app/content/blog/inferencex-v2-nvidia-blackwell-vs-amd-vs-hopper.mdx new file mode 100644 index 00000000..740db151 --- /dev/null +++ b/packages/app/content/blog/inferencex-v2-nvidia-blackwell-vs-amd-vs-hopper.mdx @@ -0,0 +1,871 @@ +--- +title: 'InferenceX v2: NVIDIA Blackwell Vs AMD vs Hopper - Formerly InferenceMAX' +subtitle: 'GB300 NVL72, MI355X, B200, H100, Disaggregated Serving, Wide Expert Parallelism, Large Mixture of Experts, SGLang, vLLM, TRTLLM' +date: '2026-02-16' +tags: + - benchmark + - gpu + - inference + - announcement +--- + +## Introduction + +InferenceXv2 (formerly InferenceMAX) builds on the foundation established by InferenceMAXv1, [our open-source, continuously updated inference benchmark](https://github.com/SemiAnalysisAI/InferenceX) that has set a new standard for AI inference performance and economics. InferenceMAXv1 moved beyond static, point-in-time benchmarks by running continuous tests across hundreds of chips and popular open-source frameworks. 
[Free dashboard available here.](https://inferencemax.ai/) + +[Our benchmark has been widely reproduced, validated and/or supported by almost every major buyer](https://inferencemax.semianalysis.com/quotes) of compute from [Google Cloud](https://cloud.google.com/blog/products/compute/scaling-moe-inference-with-nvidia-dynamo-on-google-cloud-a4x) to [Microsoft Azure](https://blog.aks.azure.com/2025/10/24/dynamo-on-aks#enterprise-scale-inference-experiments--dynamo-with-gb200-running-on-aks) to [Oracle, OpenAI](https://inferencemax.semianalysis.com/quotes), and many more. + +InferenceXv2 builds on this foundation. It expands coverage to include large scale DeepSeek MoE disaggregated inference (disagg prefill, or simply “disagg”) with wide expert parallelism (wideEP) optimization to **all 6 NVIDIA western GPU SKUs from the past 4 years **as well as to every single AMD western GPU SKU released in the past 3 years – in total InferenceXv2 utilizes close to 1000 frontier GPUs for a full benchmark run across all SKUs. + +With today’s release, InferenceXv2 is now the first suite to benchmark the Blackwell Ultra GB300 NVL72 and B300 across the whole pareto frontier curve, and it is the first third party benchmark to test disagg+wideEP multi-node FP4 and FP8 MI355X performance. In future iterations of InferenceX, we will continue to focus heavily on disaggregated serving with wide expert parallelism as that is what is deployed in production at Frontier AI Labs like OpenAI, Anthropic, xAI, Google Deepmind, DeepSeek as well as advanced API providers like TogetherAI, Baseten, and Fireworks. In this article, we will also break down the system engineering principles and economics in play around the [latest Claude Code Fast mode feature](https://code.claude.com/docs/en/fast-mode). + +Our benchmark is completely open-source under Apache 2.0 – this means that we are able to move at the same rapid speed at which the AI software ecosystem is advancing. 
If you like our work and would like to show us some support, [please drop a star on our GitHub](https://github.com/SemiAnalysisAI/InferenceX)! We also provide a free data visualizer at [https://inferencex.com](https://inferencex.semianalysis.com/) for everyone in the ML community to explore the complete dataset themselves. + +We will add DeepSeekv4 and other popular Chinese frontier models with day 0 support as over the past 6 months, we now have cleaned up a lot of tech debt and are able to [move fast with stable infrastructure](https://www.cnet.com/tech/mobile/zuckerberg-move-fast-and-break-things-isnt-how-we-operate-anymore/). We will also be adding TPUv7 Ironwood and Trainium3 to InferenceX later this year! If you want to contribute to our impactful mission while earning a competitive compensation, [consider applying here](https://app.dover.com/apply/semianalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1). + +
+ +## Key Observations and Results to Highlight + +We see competitive perf per TCO results on FP8 MI355X disagg+wideEP SGLang on AMD compared to FP8 B200 disagg+wideEP SGLang, but when compared to widely used Dynamo TRTLLM B200 FP8, TRT continues to framemog. It is amazing news that AMD SGLang Disagg prefill+wideEP for FP8 is able to match NVIDIA’s SGLang performance. + +We also see that for single node aggregated serving, AMD’s SGLang delivers better perf per TCO than NVIDIA’s SGLang for FP8. [It is also great to see that AMD has deprecated their second class fork of vllm to move further upstream and closer to delivering a first class experience.](https://x.com/vllm_project/status/2013928644302033208) Stay tuned for our “State of AMD” article where we talk about the many areas where AMD’s pace of improvement has been rapid & also the areas where the pace of improvement has been lackluster. We recommend that NVIDIA focus even more on the SGLang & vLLM ecosystem in addition to their TRTLLM engine. [Jensen needs to staff more resources & engineers towards contributing to open ecosystems like SGLang & vLLM](https://www.linkedin.com/in/akbarnurlybayev?trk=feed-detail_main-feed-card_feed-actor-image). + +When it comes to the latest inference techniques that are used by the most prominent frontier large-scale inference services (such as disagg prefill+wideEP+FP4), Nvidia absolutely frame mogs with the B200, B300 and ASU frat leader, rack scale GB200/GB300 NVL72 across both SGLang and TRTLLM. Nvidia GPUs also dominate when it comes to energy efficiency, with much lower all-in provisioned picoJoules of energy per token across all workloads. + +Turning to AMD, we find that the biggest issue with inference on their systems and using their software is _[composability](https://en.wikipedia.org/wiki/Composability)_. 
That is, many of AMD’s inference optimization implementations work well in isolation, but when combined with other optimizations, the result is not as competitive as one would expect. Specifically, the composability of disagg prefill, wideEP and FP4 inference optimizations needs significant improvement. + +While performance is competitive on AMD when enabling just a subset of the SOTA inference optimizations, when enabling all three major optimizations that labs use, AMD’s performance is currently not competitive with Nvidia’s. We strongly recommend to AMD that they focus heavily on composability of different inference optimizations. We have been told that AMD will start focusing on software composability of FP4+distributed inferencing across their whole software stack. This will happen after Chinese New Year as most of their disagg prefill+wideEP 10x inference engineers are based in China. + +Nvidia’s GB300 NVL72 doesn’t disappoint. It achieves up to 100x on FP8 vs FP4 compared to even a strong H100 disagg+wideEP+MTP baseline and 65x on FP8 vs FP8. On H100 vs GB200 NVL72, we see up to a 55x realized performance difference at 75 tok/s/user. Rack scale Blackwell NVL72 is framemogging Hopper and makes Hopper look like it is jestermaxxing. As Jensen said at GTC 2025, [he is chief revenue destroyer.](https://newsletter.semianalysis.com/i/174558496/ai-total-cost-of-ownership-cost-declines) + +At GTC 2024, Jensen claimed that Blackwell would deliver up to 30x perf on inference compared to H100. Jensen under-promised & overdelivered on Blackwell inference performance. This should curtail the instances of analysts cracking “Jensen Math” jokes for some time. + +
+ +## Acknowledgments and InferenceX™ (formerly InferenceMAX) Initiative Supporters + +We would like to thank Jensen Huang and Ian Buck for supporting this open-source effort by providing access to the latest GB300 NVL72 systems along with access to servers representing all GPU SKUs that they have produced for the past four years. We would like to thank the Nvidia team for allowing us to conduct independent benchmarks across this close to 1000 GPUs. Thank you to Jatin Gangani, Kedar Potdar, Sridhar Ramaswamy, Ishan Dhanani, Sahithi Chigurupati, along with many other Nvidia inference engineers for helping to validate and optimize Blackwell & Hopper configurations. + +We’re also grateful to Lisa Su and Anush Elangovan for their support of InferenceMAX and for supporting our work with the dozens of AMD engineers like Chun, Andy, Bill, Ramine, Theresa, Parth, etc that contributed to InferenceMAX & upstream vLLM/SGLang bug fixes, as well as for their responsiveness on helping debug and triage AMD exclusive bugs so as to help optimize AMD performance. + +We also want to recognize the SGLang, vLLM, and TensorRT-LLM maintainers for building a world-class software stack and open sourcing it to the entire world. You can check their articles on InferenceX here: + +- [SemiAnalysis InferenceMAX: vLLM maintainers & NVIDIA accelerate Blackwell Inference](https://blog.vllm.ai/2025/10/09/blackwell-inferencemax.html) +- [GPT-OSS Performance Optimizations: Pushing Pareto Frontier](https://blog.vllm.ai/2026/02/01/gpt-oss-optimizations.html) +- [SGLang & NVIDIA Accelerating SemiAnalysis InferenceMAX & GB200 Together](https://lmsys.org/blog/2025-10-14-sa-inference-max/) + +The InferenceX initiative is also supported by many major buyers of compute and prominent members of the ML community including those from OpenAI, Microsoft, vLLM, Tri Dao, PyTorch Foundation, Oracle and more. [You can find the full list here](https://inferencemax.semianalysis.com/quotes). 
+ +## A Primer on Important Technical Concepts + +In this section, we will give a brief primer on technical concepts that may help the reader better interpret results. Some readers may not need this and can skip directly to our analysis of results. We will take a deeper dive into some of these topics after the results analysis. + +## Interactivity vs Throughput Tradeoff + +The fundamental tradeoff with LLM inference is throughput versus latency. _Interactivity_ (tok/s/user) describes how fast each user of a system receives tokens – it is the inverse of time per output token (TPOT). _Throughput_ (tok/s) describes how many total tokens a system can crank out across all users. One can achieve higher total throughput by batching requests, but each request will be allocated less FLOPs and thus complete slower. This is analogous to the choice of riding a metro bus vs a race car. The metro bus serves many riders, but also makes frequent stops which takes time, but the cost of the metro bus can be amortized across many passengers. The race car can only carry one or two passengers, but it will make few if any additional stops meaning a faster travel time overall, but it is much more expensive to ride per passenger. The metro bus might make more sense for people heading to the park on a weekend, while the race car might be better for bringing a celebrity to their destination. There is no one size fits all solution. + +
+ +Most InferenceX benchmark results we will show in this article are curves. It is important to analyze throughput at various levels of interactivity/latency instead of just looking at maximum achieved throughput (which normally can only be achieved at a single low interactivity). With inference, there is no one size fits all use case. The level of interactivity and throughput needed depends on the use case. For instance, real-time speech models require extremely low latency so that the end user can maintain a natural “conversation” with the LLM, whereas a basic QA chatbot may allow for higher latency. We leave it up to the reader to look at the curve and apply this principle to identify where their use case falls on the throughput-interactivity curve. + +The Cost/Perf per TCO vs Interactivity/End-to-End Latency curve mostly follows the Throughput vs Interactivity/End-to-End Latency Curve: More tokens/hour leads to a lower cost per token as fixed $/hour costs are amortized over more tokens produced. + +### Prefill and Decode Phases + +Inference contains two main phases: prefill and decode. _Prefill_ occurs during the first forward pass of a request’s lifetime. It is computationally intensive since all tokens in the request are processed in parallel. This phase is responsible for “filling up” the KV cache for a sequence. After prefill, responses are generated (or _decoded_) one token at a time. Each forward pass loads the entire KV cache for a sequence from HBM, while only performing the computation for a single token, making decode memory (bandwidth) intensive. + +When prefill and decode are performed on the same engine, prefill constantly disrupts decode batches, leading to worse overall performance. + +### Disaggregated Prefill + +Disaggregated prefill (aka PD disaggregation or simply “disagg”) is the practice of separating the prefill and decode phases across separate pools of GPUs or clusters. 
These separate prefill and decode pools can be tuned independently and scaled to match the needs of workloads. + +## Tensor Parallel, Expert Parallel, Data Parallel (TP, EP, DP) + +TP maximizes interactivity at small batch sizes, but it must carry out an all-reduce at every layer. EP shards experts, exploiting MoE sparsity, with the drawback being that an all-to-all collective (which is more costly than simpler collectives like all-reduce) is carried out for MoE layers and can be imbalanced at small batches. DP replicates the entire model (or just parts of a model, like attention) on multiple groups of GPUs (ranks) and then load balances requests among ranks. It is the simplest to scale, but repeats weight loading which can be wasteful at scale. + +## Tracking Improvements Over Time + +One of the main goals of InferenceX is to visualize performance improvements over time. While new chips are released on an O(yearly) cadence, software releases happen on an O(weekly) cadence. Our goal is to constantly update recipes with the latest and greatest software improvements and benchmark the configurations. + +## DeepSeek R1 + +The AMD team has significantly improved performance for all configurations of SGLang DeepSeek R1 FP4. For the same interactivity, AMD has almost doubled the amount of throughput in the span of less than 2 months. Moreover, we have pushed AMD to upstream performance enhancing changes from their forked SGLang images into the official SGLang image. From December 2025 to January 2026, AMD’s software was improved up to 2x in performance. + +
+ +In order to continue becoming closer to a first class experience, AMD needs to increase their support of vLLM & SGLang maintainers through compute contributions and code contributions & having more reviewers that work for AMD to speed up the review process of AMD PRs into the upstream. + +
+ +On the other hand, Nvidia’s results were more consistent, with minor improvements for B200 SGLang over a similar time period. + +
+ +Many of the mature SKUs had minimal improvements. For example, H200 TRT single node has not changed in performance in the span of 4 months since October, but this is because Hopper support has been excellent since day 1, and performance has close to peak theoretical for this workload all along, making it hard to deliver incremental performance gains. + +
+ +MI300X and MI325X have seen some improvements, mainly from the most recent SGLang release. Note that for much of the history of InferenceX, AMD was using “private” ROCm images that were not upstreamed, so runs prior to ~Jan 2026 cannot be compared directly to those that are more recent. + +
+ +GB200 Dynamo TRT-LLM disagg has seen some significant improvements as well, with a 20% increase in max throughput in the span of a little over 1 month. We also see improvements in the middle interactivities, where wide EP is deployed. This is likely due to maturing wide EP kernels on GB200. + +
+ +B200 SGLang has seen steady and continuous improvement for both FP4 and FP8 scenarios since our initial launch, with throughput per GPU doubling at some interactivity levels since last October. + +
+ +For MI355X Disaggregated inference serving, AMD recommends using SGLang with MoRI. [MoRI is AMD’s MoE dispatch/combine collective and KV Cache transfer library](https://github.com/ROCm/mori/tree/main) built from first principles by AMD’s cracked 10x China-based engineering team. Although MoRI needs much more open CI and testing, we are strong supporters of the direction that MoRI is taking. This is because instead of taking AMD’s historical approach, which was to fork NVIDIA’s NCCL into RCCL, MoRI is built from scratch by taking the lessons from RCCL/NCCL and building an entirely new package from first principles. The use of MoRI has also delivered good speedups in the span of more than a month, with throughput per GPU increasing by more than 20% in the 20-45 tok/s/user interactivity range. + +
+ +## GPT-OSS 120B + +For MI300X and MI325X, we have seen marginal improvements across the board. Some AITER optimizations helped MI300X performance across all interactivities, and switching to the upstream vLLM ROCm image led to improvements. + +
+ +In the case of the MI325X, it appears that not all performance enhancements that were present in the downstream ROCm fork image (used during the October 5th, 2025 run) have made it into the official vLLM ROCm image. +Unfortunately, the MI355X literally still uses a fork of the vLLM 0.10.1 build (`rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1`). We would love to have seen it updated by now, but unfortunately the current official image (0.15.1, at the time this article was written) is not yet optimized for the MI355X and runs into hard errors. We had also run into hard crashes on MI355X for vLLM 0.14. Word on the street is that vLLM 0.16.0 will finally deliver all the changes needed for better MI355X performance. + +
+ +Turning back to Nvidia’s systems, both Hopper and Blackwell saw a steady performance increase between vLLM 0.11.2 and 0.13.0. Soon, we will update recipes for Nvidia GPUs to use the latest vLLM version and we expect even greater performance gains after making the switch. We also observed a performance bump in the latest 1.2.0 version of TRT-LLM. + +
+ +
+ +## Disaggregated Inference Frameworks + +NVIDIA uses Dynamo for its disaggregated inference setup. [Dynamo](https://docs.nvidia.com/dynamo/design-docs/overall-architecture) is an inference framework designed for multi-node distributed inference, featuring techniques such as prefill-decode disaggregation, request routing, and KV cache offloading. It is inference-engine agnostic, allowing us to use SGLang and TRT LLM as backends in our benchmark. For AMD, we use SGLang with two different KV cache transfer frameworks: MoRI and Mooncake. [MoRI](https://github.com/rocm/mori) is a high-performance communication interface focusing on RDMA and GPU integration, offering applications such as network collective operations and expert parallel kernels. Mooncake, which [recently joined the PyTorch ecosystem](https://pytorch.org/blog/mooncake-joins-pytorch-ecosystem/), supports prefill-decode disaggregation and many fault tolerant multi-node features. + +## DeepSeek Disagg +WideEP Results Deep Dive + +At almost all interactivity levels, disagg outperforms aggregated inference (grey lines) in terms of total token throughput per GPU. Multi-node disaggregated prefill framemogs single node aggregated serving. + +
+ +Nvidia continues to push new updates for B200/GB200 FP8. The latest data on DeepSeek FP8 B200 TRT single node (both MTP enabled/disabled) vs GB200 Dynamo+TRT disagg (both MTP enabled/disabled). This indicates consistent engineering effort to improve rack-scale inference software and wideEP kernels. + +
+ +When comparing MI355X disaggregated inference vs aggregated inference, we noticed a similar pattern. Disaggregated inference only overtakes aggregated inference at low interactivity, high batch sizes. This is true across FP4, and it is likely due to poorly optimized kernels. + +
+ +When composing disagg prefill+wideEP with FP4 on the MI355X, we observe subpar performance. + +Although theoretical modeling shows that disagg inference on MI355Xs should perform way better than single node, disagg actually performs worse for higher interactivity levels due to a lack of kernel and collective optimization in the ROCm software stack when composing multiple SOTA inference optimizations together. + +
+ +### Nvidia TensorRT LLM and NVL72 + +TensorRT LLM already serves billions of tokens per hour globally across providers like TogetherAI and other advanced providers, and it has really allowed the GB200 NVL72 and GB300 NVL72 to shine, delivering more than double the performance at high throughput. MTP boosts these results even further, making use of the chips’ full potential. + +
+ +
+ +The benefits delivered from the larger world size of the NVL72 family are also evident if we look at cost graphs. At a fixed interactivity level of 60 tok/s/user, each GB200 NVL72 GPU produces slightly less than triple the number of tokens/s that each B200 does. + +
+ +This gap shrinks as interactivity increases. At 130 tok/s/user, the GB200 NVL72 has nearly no advantage and is even more expensive on a $/Million tokens basis. At low batch sizes, the inference workload shrinks enough to fit within a single HGX node’s NVLink domain (i.e. 8 GPUs), and the GB200 NVL72’s larger scale-out advantage starts to disappear. + +
+ +## Nvidia versus AMD Disagg Prefill + +With today’s release of InferenceXv2, for the first time the ML community is able to see a full Pareto frontier for open-source MI355X distributed inference. We show Pareto curves for the B200 and MI355X with and without enabling MTP. + +For FP8 disagg prefill, MI355X (MoRI SGLang) is quite competitive with B200 (Dynamo SGLang). Wide EP is not used for either of these configs as all prefill/decode instances run using EP8 at the most. At both ends of the throughput versus interactivity Pareto frontier, MI355X falls behind the B200 slightly. However, MI355X disagg has a slight advantage for certain levels of interactivity in the middle of the curve. Both the B200 and the MI355X benefit from employing MTP, and we observe the same relative performance improvement for both chips when using MTP. + +
+ +However, if we were to only measure output (decode) token throughput, we see that output token throughput is much higher for the B200 than for the MI355X at lower interactivity levels. Note that when looking at output token only throughput for disaggregated inference configurations, we normalize throughput by the number of decode GPUs, not total GPUs. It is possible that different numbers of GPUs are used for output when running inference jobs on the B200 and MI355X, but the bottom line is that whatever configuration decode is run on, B200 gets the decode job done faster. + +
+ +Despite the MI355X being competitive in FP8 disagg, its FP4 performance suffers from composability issues. AMD single node FP4 performance is decent, but when we compare AMD FP4 disagg prefill to Nvidia, performance is subpar and the MI355X gets absolutely mogged by Nvidia’s B200. In a 1k1k scenario, the MI355X (MoRI SGLang) with MTP barely manages to beat the B200 (Dynamo SGLang) without MTP. + +
+ +Once we bring Dynamo TRT-LLM into the equation, the B200’s performance is boosted even more to the point that the MI355X even with MTP can’t match the B200’s performance with Dynamo TRT-LLM and MTP. The MI355X can only match the B200 (without MTP) in performance by using MTP, and only for a range of interactivities from ~60 tok/s/user through ~120 tok/s/user. + +
+ +When comparing Dynamo TRTLLM B200 disagg prefill to SGLang MoRI MI355 disagg prefill, AMD gets framemogged due to the more mature implementation of disagg prefill on TRTLLM. + +
+ +
+ +The diagram below shows us the various parallelism configurations that form up the MI355X (MoRI SGLang) Pareto frontier. Note that currently, wide EP is not employed for any points (i.e., configurations with EP 16, 32, etc.). + +
+ +## Unpacking Inference Providers’ Unit Economics + +Below is a list from OpenRouter of all inference providers that serve DeepSeek R1 0528 FP8, along with the cost per million input/output tokens and average interactivity listed for each. Disregarding Chutes, the middle-of-the-pack provider serves at an interactivity of around 35 tok/s/user. + +
+ +We can then use real InferenceX data to interpolate the cost per million input/output tokens at an interactivity level of 35 tok/sec/user, which is a reasonable interactivity level given the data above. + +As we mention later in the article, this is best understood as *baseline* data and not completely representative of real-world inference, mainly because InferenceX benchmarks on random data and disables prefix caching. In other words, performance/cost will be *at least* this good. It is also important to note that there are not data points for _each GPU_ at *each* interactivity level. Thus we cannot make *exact* comparisons at each degree of interactivity. We nevertheless think the bar chart comparisons presented below are (very) reasonable interpolations in lieu of using exact data points. + +Comparing disagg+wideEP configs at this interactivity level, we see just how effective distributed inference techniques are when it comes to both perf/TCO and overall throughput. We also see how large scale-up domains (like GB300 and GB200 NVL72) absolutely dominate in total throughput per GPU. + +It is interesting to note that at this interactivity level (on an 8k1k workload type), the B200 can achieve the best perf/TCO when MTP is enabled. Below we also list the Total Cost of Ownership (TCO) (Owning – Hyperscaler) for each GPU: + +
+ +
+ +
+ +
+ +Let’s use the findings above to dig deeper into the unit economics of serving LLMs at scale. From the OpenRouter data above, we see that Crusoe serves at 36 tok/sec/user at $1.35/M input tokens and $5.40/M output tokens. If we assume no cache hits and that Crusoe is using at least H200s with SOTA inference techniques like MTP, disagg, and wide EP, the data above suggests they incur a cost of *no more than* $0.226/M input tokens and $2.955/M output tokens, for a gross margin (depreciation counted in cost of goods sold) of up to 83% on input tokens and 45% on output tokens. + +Of course, these assumptions may not be *exactly* correct and these calculations don’t account for downtime or underutilization, but this gives an idea of some cool math you can do with InferenceX data. More analysis on the economics of inference can be found in the [SemiAnalysis Tokenomics Model](https://semianalysis.com/tokenomics-model/). + +The OpenRouter data also shows Nebius AI Studio (Fast) serving DeepSeek FP4 at 167 tok/sec/user at $2/M input, $6/M output tokens. Adjusting the interactivity level in InferenceX accordingly, we see the following data. + +
+ +
+ +
+ +At this high of interactivity, it becomes necessary to employ speculative decoding techniques like MTP to achieve high enough throughput to make inference economical. Luckily, MTP can increase throughput with relatively low risk to overall model accuracy. We will go on to talk more about MTP, and how it can be applied to increase throughput / decrease cost, in later sections of this article. + +Lastly, we show one more chart of an FP8 DeepSeek workload served at 125 tok/s/user. This is another low latency workload where MTP considerably improves economic viability. As with the previous example, we note that at these higher ranges of interactivity, the cheapest configs all use MTP. + +
+ +### Nvidia Disagg Prefill and WideEP + +EP requires all-to-all communication, where every GPU needs to send tokens to every other GPU. This is extremely bandwidth hungry. Recall that Nvidia’s servers have two separate networking domains – the scale-up NVLink domain, and the Scale-out Domain, usually using InfiniBand or Ethernet as the networking protocol. + +- NVLink domain (within the NVL72 rack): 72 GPUs connected via NVLink with 900 GB/s uni-directional bandwidth per GPU. This is roughly 7-10x the bandwidth of the InfiniBand/Ethernet based scale-out network. +- InfiniBand/RoCEv2 Ethernet (outside of the NVL72 rack): Typically 400-800 Gbit/s per GPU uni-directional (50-100 GB/s). Note that all our testing for Nvidia was conducted on InfiniBand based clusters. + +TP shards every layer’s weight matrices across GPUs. This means that every single token at every single layer requires up to two all-reduce communications (one after the column-parallel GEMM, one after the row-parallel GEMM). For EP, all-to-all is done only at MoE layers. Each GPU sends only the tokens routed to each expert. This means cheaper comms across all layers for EP vs TP. + +Because EP’s all-to-all communication bandwidth requirements scale with the number of participants, staying within the high-bandwidth NVLink domain before having to cross the slower IB/Eth fabric is better. With NVL72, EP across 72 GPUs is possible without ever leaving NVLink, whereas previous generations (with only 8-GPU NVLink domains) could only do EP across 8 GPUs at NVLink speed before hitting the slower IB/Eth networks. + +Wide EP also has a major advantage in weight loading efficiency. For a model like DeepSeek R1, decode is memory-bandwidth-bound: the bottleneck is how fast GPUs can load weights from HBM. With wide EP (e.g., DEP32), 32 GPUs collectively hold and load the 670B weights once, each loading only its shard (~21B). The total HBM bandwidth of all 32 chips is applied to loading a single copy of the model. 
By contrast, with narrower EP and more DP replicas (e.g., 5xDEP8), each of the 5 replicas needs its own full copy of the 670B weights, that’s 5×670B = 3.35T of redundant weight loading across the system. EP amortizes weights across chips; DP replicates them. This is why wider EP, enabled by high-bandwidth interconnects like NVLink, delivers significantly better throughput per GPU. + +
+ +Generally, TP is preferred at lower concurrencies due to load balancing. At small batch sizes, EP suffers from uneven token-to-expert routing, leaving some GPUs underutilized while others are overloaded. TP avoids this since each GPU holds a slice of every expert and always gets an equal share of work. At lower concurrency, the cost of this load imbalance outweighs TP’s additional communication overhead. + +At higher concurrencies, this tradeoff changes. Expert activation becomes more evenly distributed across larger batch sizes, and EP’s communication and weight-loading advantages dominate over TP’s expensive per-layer all-reduce. In the middle of the curve, hybrid TP+EP configurations balance both concerns using small TP groups within each expert for load balancing while EP is used across the wider set of GPUs to amortize weights and reduce communication. + +For higher interactivity levels (low batch size), large scale-up world sizes tend not to deliver stronger performance. B300 disagg over IB has the same performance as GB300 with NVL72, since the workload is latency-bound, not bandwidth-bound. The massive NVLink bandwidth advantage of NVL72 doesn’t matter because not even the much slower IB link is saturated by the tiny batches of tokens in flight. + +Prefill/decode disaggregation also plays a role. Prefill is compute-heavy and bursty; decode is memory-bandwidth-bound and steady-state. When they share the same GPUs, they interfere with each other, causing latency jitter and wasted capacity. Separating them onto dedicated GPU pools lets each run a workload matched to its characteristics, improving effective utilization. This is why disaggregated B200 configs outperform single-node B200 in the middle of the throughput-interactivity curve. PD separation combined with wider EP across more GPUs over IB amortizes weights more efficiently than cramming both phases onto a single 8-GPU node. 
+ +[Side Note: the 10x inference engineers at TogetherAI noticed a pattern in multi-turn traffic where the requirements of first-turn prefill are much different from those of the following turns’ prefills, and disaggregated them, leading to better TTFT performance.](https://www.together.ai/blog/cache-aware-disaggregated-inference) + +
+ +## Jensen Under Promising and Overdelivering - Hopper vs Blackwell vs Rack Scale NVL72 + +At GTC 2024, Jensen was on stage promising up to 30x performance gains from H100 to GB200 NVL72, [everyone thought it was classic marketing lookmaxxing and would not be achievable in real world.](https://newsletter.semianalysis.com/p/nvidia-blackwell-perf-tco-analysis) Many looked to come up with labels for this perceived use of a reality distortion field so they could crack more Jensen Math jokes. Indeed – [we did point to the comparison of 30x performance difference between the worst case](https://newsletter.semianalysis.com/i/175661150/benchmarking-the-h200-on-its-bad-hair-day) for H200 on FP8 to a reasonable case of the GB200 on FP4. + +
+ +But it turns out the joke is on them. Fast forward almost two years later, and we can now see that it wasn’t marketing hype lookmaxing after all, and Jensen was actually under promising on Blackwell performance the whole time. From our testing, Blackwell is so good at large scale MoE inferencing compared to even a strong H100 disagg+wideEP FP8 baseline that it, at 116 toks/s/user, delivers up to 98x better perf on GB200 NVL72 FP4 and up to 100x better perf on GB300 NVL72 FP4! Maybe the new Jensen Math rule is that he delivers double whatever he promises in terms of token throughput. The more you spend, the more you save indeed! + +
+ +Even when factoring in the increased total cost of ownership of Blackwell and Blackwell Ultra, we see a 9.7x (at 40 tok/s/user) up to 65x (at 116 tok/s/user) improvement in tokens per dollar compared to Hopper. [You can explore Hopper vs Blackwell performance in detail on our free website](https://inferencemax.semianalysis.com/?i_seq=8k%2F1k&g_model=DeepSeek-R1-0528&g_rundate=2026-02-12&g_runid=21928999802&i_prec=fp4%2Cfp8&i_metric=y_costh&i_log=1#inference). Blackwell performance is so good compared to Hopper that we needed to add a log scale to our dashboard in order to visualize it. + +
+ +As mentioned earlier in the article, B300 servers only connect at most 8 GPUs using the 900GByte/s/GPU NVLink scale-up network whereas GB300 NVL72 servers connect 72 GPUs using the NVLink scale-up network. So when we need more than 8 GPUs (but fewer than 72 GPUs) for the inference setup, we need to bring in multiple nodes of B300 servers to form our inference system, which means communication falls back to the lower-bandwidth InfiniBand XDR scale-out network featuring 800Gbit/s (uni-di) per GPU of bandwidth. Compare this to a rack-scale GB300 NVL72, which connects 72 GPUs over NVLink delivering 900GByte/s (uni-di) per GPU of bandwidth, and we can see that the rack-scale server allows the GPUs in the inference setup to talk to each other with over 9x higher bandwidth compared to the case of the multiple nodes of B300 servers. + +
+ +Admittedly the GB300 NVL72 has a higher all-in cost per GPU, but this only reduces the bandwidth per TCO advantage to being 8x faster. The bandwidth advantage of the rack-scale architecture directly drives a much lower cost per token. Google TPU, AWS Trainium and Nvidia are the only AI chips to have rack scale system designs deployed today. Engineering samples and low volume production of AMD’s first rack scale MI455X UALoE72 system will be in H2 2026 while due to manufacturing delays, the mass production ramp and first production tokens will only be generated on an MI455X UALoE72 by Q2 2027. + +
+ +## Blackwell vs Blackwell Ultra + +On paper, the newly released Blackwell Ultra has the same memory bandwidth as Blackwell, the same FP8 performance and only 1.5x higher FP4 performance, but when measuring we actually see up to 1.5x better FP8 performance on the Blackwell Ultra, though we only see 1.1x better performance on FP4. This may be due to Blackwell Ultra being a newly released GPU, meaning software is not fully optimized yet. + +
+ +
+ +## MI355X vs MI325X vs MI300X + +On AMD SKUs, we see up to 10x better performance on the MI355X vs the MI300X. AMD has only gotten DeepSeek SGLang Disaggregated Inferencing to work on the MI355X so far; AMD has not submitted MI300X or MI325X disaggregated inferencing results, potentially due to software issues on older SKUs that are still being solved. + +
+ +
+ +
+ +Turning to cost, for DeepSeek R1 on FP8, at an interactivity of 24 tok/s/user, the MI355X delivers inference at a cost that is slightly less than 3x cheaper than the MI325X. The throughput of each MI355X GPU is slightly less than 4 times that of the MI325X. + +
+ +## AMD Composability Issue on FP4, Distributed Inferencing and Wide Expert Parallelism + +While AMD performs somewhat decently on single node FP4 and performs competitively to B200 SGLang on FP8 distributed inferencing, the issue with the current AMD open source inferencing stack is that, while individual inference optimizations perform well, real customers deploy with multiple optimizations composed together. Top tier AI labs are all using FP4 **with** disaggregated inferencing **with** wide expert parallelism all enabled at the same time, and this is where the issue occurs. + +AMD software is still not meeting the mark, and the theoretical speed of light modelling at SemiAnalysis and at AMD shows that for FP4, disaggregated inferencing with wide expert parallelism should perform better than inference on a single node of MI355X. Unfortunately, software continues to be a massive bottleneck for AMD GPUs. AMD management needs to continue to sharpen resource allocation of their engineering talent, for instance, re-allocate their engineering resources away from pet single node projects that nobody uses like ATOM towards fixing the aforementioned issues with composability of inference optimizations between disaggregated inferencing, wide expert parallelism and FP4. The current subpar software is due to lack of focus and incorrect prioritization relative to where the industry already is. All top tier labs are already using disaggregated inferencing and wide expert parallelism; AMD needs to stop focusing on single node and heavily invest focus into multi node inferencing for open source solutions. + +AMD is more than six months behind on open source distributed inferencing and wide expert parallelism and FP4 composability, as shown by [Nvidia and SGLang team showing off their NVFP4 performance on DeepSeek six months ago](https://lmsys.org/blog/2025-09-25-gb200-part-2/). + +
+ +## AMD ATOM Engine + +AMD has launched a new inference engine called ATOM. Atom can deliver slightly better single node performance, but it is completely lacking on a lot of features that makes it unusable for real workloads. One such example is that it does not support NVMe or CPU KVCache offloading, tool parsing, wide expert parallelism, or disaggregated serving. This has led to zero customers using it in production. Unlike Nvidia’s TRTLLM which generates billions of tokens per hour globally at companies like TogetherAI, etc and [does support tool parsing and other features](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve/trtllm-serve.html#cmdoption-trtllm-serve-serve-tool_parser), there are no token factories currently using ATOM due to the lack of the aforementioned features. + +Furthermore, maintainers of open-source inference engines like vLLM are disappointed in AMD due to a lack of engineering and GPU resources provided by AMD. For example, Simon Mo, lead vLLM maintainer, states in this GitHub RFC that there is still no working MI355X that he can add to vLLM CI, hence the poor user experience. There are currently zero Mi355X tests on vLLM, while NVIDIA’s B200 has many tests on vLLM. Similarly, there are still not enough MI300X CI machines on vLLM. Upstream vLLM needs at least 20 more MI300 machines, 20 more MI325 machines and 20 more MI355X machines to reach the same level of usability as CUDA. + +We at SemiAnalysis have been trying to get AMD to contribute more compute to vLLM and have had some success on that within the couple weeks. vLLM will start to get a couple of MI355X machines such that they can bring their CI test parity from 0% to non-0%. We will talk more about AMD’s previous lackluster contribution towards vLLM, SGLang, PyTorch CI machine situation & how Anush started to fix it in our upcoming State of AMD article. 
At SemiAnalysis, we will have an internal dashboard to track the number and quality of tests that AMD & NVIDIA run on vLLM, SGLang, PyTorch, & JAX. + +Moreover, the vLLM maintainers say that they cannot provide day 0 vLLM support for ROCm due to this lack of machine resources. This huge disparity in time to market continues to lead to ROCm lagging behind and leaves a huge opening for Nvidia to continue to charge an insane 75% gross margin (4x markup on cost of goods). + +
+ +Lastly, AMD has not had enough committers “who demonstrated sustained upstream engagement through feature shepherding and code ownership” and has a lack of reviewers that can review their own code. This is why the pace of development on ROCm vLLM has been much slower than for CUDA vLLM. + +There are many talented 10x engineers at AMD that work on ATOM and we would encourage AMD management to think about re-deploying these 10x engineers towards working on libraries and frameworks that people actually use, such as vLLM and SGLang. + +As we mentioned earlier, AMD also needs to prioritize addressing composability issues with FP4, wideEP and disaggregated serving as opposed to overly focusing on optimizing FP4 for a single node. + +
+ +## Multi Token Prediction (MTP) + +Speculative decoding reduces the cost of autoregressive generation by using a small, inexpensive draft model to propose several tokens ahead. The large model then checks the proposed tokens in a single forward pass that resembles a prefill computation. For a given input sequence length, a single forward pass can take roughly the same time when the input has N more tokens. Speculative decoding uses this property to run inference on a smaller model to draft multiple tokens for the main model to verify with a single forward pass, producing at most N additional tokens in a similar time budget. + +
+ +This assumption regarding additional token production with the same time budget is strongest for dense models because batched verification can reuse the same weight stream across multiple positions. For Mixture-of-Experts models, different tokens may route to different experts, so verifying multiple draft tokens can activate more experts than single-token decoding and force additional expert weights to be fetched from memory. As shown in the Mixtral 8x7B Instruct model results in the EAGLE paper, this extra memory traffic erodes bandwidth savings and can make verification notably comparable to a standard decoding step. + +Multi-token prediction pursues similar benefits without requiring a separate draft model. Auxiliary prediction heads are added to the model architecture, so a single model can propose several future tokens from the same underlying representation. This improves distribution alignment because the proposals come from the same model that ultimately scores them. Multi-token prediction also avoids the operational complexity of serving an additional model while still enabling multi-token generation strategies but requires the MTP heads to be pretrained alongside the main model. + +
+ +Across all SKUs, enabling MTP results in performance gains. By making use of the typically unused logits to verify the extra tokens, minimal compute overhead is added, saving extra expensive weight loads during decode. + +
+ +At large batch sizes, the inference regime is less memory-bandwidth bound than at low batch sizes. Since speculative decoding (including MTP) works by trading excess compute for fewer memory-bound decoding steps, the extra verification work from speculative tokens may not fit cleanly into the available compute slack, resulting in smaller improvements at high batch sizes. + +In terms of cost, MTP can drive huge cost savings. In the table below, we see that DeepSeek-R1-0528 run on FP4 using Dynamo TRT costs $0.251 per million total tokens, but enabling MTP can push costs down dramatically to only $0.057 per million total tokens. + +
+ +In all configs, when all else is held equal, using MTP with DeepSeek R1 increases interactivity with no significant impact on model accuracy. This is in line with the DeepSeek V3 tech report findings. + +
+ +Regarding the validity of MTP performance numbers, one may argue that the distribution of a synthetic dataset may not resemble real data. However, comparing MTP acceptance behavior between MTBench and our 1k1k benchmark, we see a very similar distribution confirming that our InferenceX benchmark is a good proxy for real world production performance. That said, InferenceX is not perfect and we are always looking to improve. If you want to be part of the mission, [apply to join our special projects team here](https://app.dover.com/apply/semianalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1). + +
+ +## Accuracy Evaluations + +Throughput optimizations can sometimes quietly trade off accuracy (e.g. via aggressively relaxed acceptance rates, decoding tweaks, numerically unstable kernels, or endpoint misconfiguration). Without evals, a misconfigured server (truncation, bad decoding, wrong endpoint params) can still produce great throughput numbers but deliver garbage answers. For example, this additional layer of checks has helped us discover issues with some DP attention implementations for GPT-OSS. + +Each representative throughput config now has an associated numerical accuracy check. Currently we are only using GSM8k, but because it is a very easy benchmark, the evaluation scores may not change much from differences in numerical calculation, and a harder benchmark may have a larger delta with respect to numerical accuracy. Thus, we plan to expand towards harder ones in the future, such as GPQA, HLE, MATH-500, and SWE-Bench Verified. + +Another form of performance-accuracy tradeoff is quantization. Serving models at lower precision may result in worse model outputs. For DeepSeek R1, FP8 runs have very slightly higher evaluation scores than FP4. Note that GSM8k evals are saturated, and QAT/PAT is often calibrated on popular datasets such as GSM8k and MATH-500, which sometimes leads to evals showing great results while real-world end-user evaluation is subpar. If you want to be part of the team figuring out how to properly evaluate inference engine accuracy, [apply to join the mission here](https://app.dover.com/apply/semianalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1). + +
+ +## Anthropic Fast Mode Inferencing Explained + +Anthropic recently released “[fast mode](https://code.claude.com/docs/en/fast-mode)” alongside Opus 4.6. The value proposition: the same model quality at roughly 2.5× the speed, for around 6–12× the price. Both figures might seem surprising, and some users have speculated that [this must require new hardware](https://x.com/Yuchenj_UW/status/2020214926133063705). It doesn’t. In fact, this is just the fundamental tradeoff at play. Any model can be served at a wide range of interactivity levels (tokens/sec per user), and the cost per million tokens (CPMT) shifts accordingly. Mercedes makes metro busses as well as race cars, to follow long with our analogy. + +Bean counters may think that fast mode is more expensive, but when looking at it through a total cost of ownership lens, fast mode is actually way cheaper for some situations. For example, a GB200 NVL72 rack can cost 3.3 million dollars, and as such, if claude code agentic loops (which runs on Trainium in production) that tool use call NVL72 racks, and these racks run inference 2.5x slower, you would need 2.5x more racks to deliver inference, meaning that not enabling fast mode would cost close to 5 million dollars in extra spend. + +
+ +
+ +Consider a DeepSeek R1 0528 FP4 coding workflow served on B200s with TRT-LLM. At an interactivity of 50 tok/sec/user, inference cost is approximately $0.56/M output tokens. At an interactivity of 125 tok/sec/user, this rises to around $4/M output tokens, a 2.5× speed increase for a ~7× price increase, closely mirroring what we see with Anthropic’s fast mode. Note that this assumes DeepSeek R1 is similar to Opus 4.6, which isn’t the case. Still, the general principle holds true. + +
+ +
+ +This follows directly from the fundamental latency-throughput tradeoff in LLM inference. At high batch sizes, GPUs achieve better utilization and greater total token throughput, meaning more users served concurrently and lower cost per token. At low batch sizes with greater parallelism per request, each user gets faster responses, but total token throughput drops. Since the [hourly cost of the accelerators](https://semianalysis.com/ai-cloud-tco-model/) is fixed regardless of how they’re used, lower throughput means fewer tokens over which to amortize that cost, and thus a higher price per token. + +In short, fast mode isn’t necessarily a hardware story, but merely the natural consequence of trading throughput for latency on the same GPUs. + +
+ +Furthermore, we observe that inference optimization techniques such as speculative decoding, as explained earlier, can directly lead to cheaper inference; no new chips are required. + +Take the following example, DeepSeek R1 FP4 on an 8k/1k workload. At an interactivity level of 150 tok/sec/user, the baseline GB300 Dynamo TRT cost per million tokens is approximately $2.35, whereas enabling MTP decreases the price to approximately $0.11. This is a ~21x price decrease at this interactivity level simply by employing an inference optimization technique. + +
+ +
+ +
+ +Fixing an interactivity level of 50 tok/sec/user, we further see how much MTP can effectively decrease CPMT across a variety of chips. + +
+ +## Wide Expert Parallelism (WideEP) and Disaggregated Prefill + +In this section, we will go deeper on expert parallelism and go on to explain what *wide *expert parallelism is. We will then explain the idea of Disaggregated Prefill, how it is different from WideEP, and how WideEP and Disaggregated Prefill are used in unison to achieve SOTA performance. + +## WideEP + +By now, most frontier AI labs employ Mixture of Experts (MoE) model architectures as opposed to dense. In MoE architectures, only a subset of “experts” are activated for each token. For instance, DeepSeek R1 has 671B total parameters, but only 37B active parameters. Specifically, DeepSeek R1 has 256 routed experts (and 1 shared expert) with each token being routed to 8 distinct experts. This architecture lends itself naturally to expert parallelism (EP), which evenly distributes expert weights across some number of GPUs. + +Consider serving DeepSeek R1 on a single 8-GPU server. At 671B parameters, some form of parallelism is required to fit the model across available HBM. The naive approach is tensor parallelism (TP), which shards every weight matrix across all GPUs. This works well for dense models but ignores the sparse activation pattern of MoE. With TP=8, each expert’s weights are sharded across all 8 GPUs, meaning every expert activation requires an all-reduce across all GPUs & the reduction dims of the GEMM is smaller leading to lower arithmetic intensity, even though only 8 of 256 experts activate per token. TP treats each expert like a dense layer, paying full cross-GPU communication cost while the model’s sparsity goes unexploited. + +Expert parallelism takes a more well-suited approach, assigning whole experts to individual GPUs. With EP=8, we divide the 256 experts per layer across 8 GPUs for a total of 32 experts/layer/GPU. 
Each GPU holds approximately 1/8th of the expert weights plus a full replica of the non-expert weights (attention projections, embeddings, normalization, and the shared expert). Since roughly 90%+ of DeepSeek R1’s parameters are routed expert weights, EP captures most of the memory savings, and replicating the remaining less than 30B non-expert parameters across all 8 GPUs is affordable. + +The forward pass proceeds in two phases per layer. During attention, each GPU acts as an independent data-parallel rank, processing its own subset of requests using its replicated non-expert weights, no inter-GPU communication is needed. During the MoE phase, a lightweight router determines which experts each token requires, and tokens are dispatched to the appropriate GPUs via all-to-all communication. Each GPU executes its local experts on only the tokens routed to it, and results are returned via a second all-to-all. + +
+ +The obvious way to scale is replication: deploy N independent EP8 instances across N nodes. Each instance serves requests independently with no cross-node communication. This scales throughput linearly, but each GPU still holds 32 experts per layer, and each token activates at most 8 of those 32 local experts. 75% of expert weights sit cold in HBM. + +**Wide expert parallelism** (WideEP) takes a different approach by scaling EP *across* nodes rather than replicating independent instances. On a 64-GPU cluster (8 nodes), DP64/EP64 places only 256/64 = 4 experts per layer per GPU, each still holding a full replica of the non-expert weights. During the MoE phase, tokens from all 64 DP ranks are dispatched via all-to-all to the GPUs hosting their routed experts. + +This yields three compounding benefits over the single-node EP8 baseline. First, reducing expert footprint from 32 to 4 experts/GPU frees substantial HBM for KV cache, directly increasing per-GPU batch size capacity. Second, 64 DP ranks funneling tokens through fewer experts per GPU increases tokens-per-expert, raising arithmetic intensity (more FLOPs per byte of weights loaded) and improving compute utilization. The same expert weights service 8x more tokens per step. Third, aggregate HBM bandwidth scales linearly with GPU count; 64 GPUs loading expert weights simultaneously provide 8x the memory bandwidth of a single node, reducing memory bottleneck. + +
+ +The above configurations use only DP+EP (also known as DEP), where each GPU holds a full replica of all non-expert weights. As GPU count grows, this replication becomes increasingly wasteful. On a 64-GPU DP64/EP64 deployment, every GPU stores an identical copy of the ~40B non-expert parameters. + +Adding tensor parallelism within groups of GPUs addresses this. In an EP64/DP8/TP8 configuration, the 64 GPUs are organized into 8 DP groups of 8 GPUs each. Within each TP group, the attention projections, shared expert, normalization, and LM head are sharded 8 ways, so each GPU holds only 1/8th of the non-expert weights. Across the full cluster, the 256 experts are still distributed one-per-4-GPUs as before. + +Pure DEP has a single communication pattern: all-to-all for expert routing. Adding TP introduces a second all-reduce within each TP group for the attention and non-expert computations. The key design principle is to place TP groups within a single node, where NVLink or MNNVL provides high-bandwidth interconnect, and run EP/DP across nodes, where the all-to-all communication pattern can tolerate higher latency. + +As always, the tradeoff is that of throughput versus latency. TP=8 within a group means those 8 GPUs now share a batch and must synchronize every decode step, reducing effective DP degree from 64 to 8. Per-GPU batching independence on the attention side is lost. But each DP group now processes attention 8x faster per step, since the matmul is split 8 ways across the TP group. Per-token latency drops while peak concurrency also drops, sliding the configuration along the latency-throughput Pareto frontier relative to pure DEP. + +## Disaggregated Prefill + +Disaggregated prefill, sometimes referred to as prefill-decode (PD) disaggregation, is the process of performing prefill and decode phases of LLM inference on separate nodes. 
Prefill occurs when a request is first processed, and a forward pass is computed on all tokens at once, thereby “prefilling” the KV cache for this request. This is a compute-intensive operation as all tokens feed through the forward pass in parallel. Tokens are then generated or “decoded” one at a time, loading the KV cache from HBM at each decode step. This is a memory-intensive process as the growing KV cache is constantly being loaded. + +In traditional single-node inference, engines interleave prefill and decode on the same GPUs. Incoming prefill requests stall in-flight decode batches, increasing both time-to-first-token and inter-token latency. Chunked prefill mitigates this by breaking long prefills into smaller pieces, but the fundamental resource contention remains. Disaggregated prefill eliminates this entirely! + +
+ +Disaggregation also enables independent scaling and optimization of each phase. With separate nodes, each phase can be tuned independently: different parallelism strategies, different batch sizes, and different memory allocation ratios. The ratio of prefill to decode nodes can also be matched to the workload’s input-output length ratio. For instance, prefill-dominated workloads (long input, short output e.g., summarization, RAG, agentic coding with large context windows) allocate more prefill instances. Decode-dominated workloads (short input, long output e.g., chain-of-thought reasoning, long-form generation) allocate more decode instances. Workloads with high cache hit rates also tend toward more decode, since reused KV cache entries from shared system prompts or multi-turn conversation history skip prefill entirely. + +The key cost of disaggregation is KV cache transfer. After prefill completes, the full KV cache for that request must be transmitted from the prefill node to the decode node before the first decode token can be generated. For a model like DeepSeek R1 with 61 layers and FP8 KV cache, an 8192-token prefill produces roughly 500MB of KV data that must cross the network, adding directly to TTFT. This transfer is performed over RDMA (typically RoCE or InfiniBand) using zero-copy GPU-to-GPU data movement without CPU involvement. Libraries like NIXL (NVIDIA Inference Transfer Library) abstract the data movement layer behind a unified asynchronous API with pluggable backends for UCX, GPUDirect Storage, and other transports. This decouples the inference engine from any specific transfer protocol and enables disaggregation across heterogeneous hardware where prefill and decode instances may span different device types or interconnects. + +
+ +## Optimizing Inference with Wide EP + Disaggregated Serving + +Wide EP and disaggregated prefill are separate techniques that are often used together to achieve Pareto optimal performance. In this section, we walk through real results from InferenceX to build intuition for which combinations of parallelism strategy, wide EP, and disaggregated prefill are appropriate at different interactivity levels. + +It helps to first understand what parallelism strategies fall on what parts of the Pareto frontier for single-node configurations. Take the example of DeepSeek R1 FP4 8k/1k on a single 8-GPU B200 node with TRT-LLM. The optimal strategy shifts as you move along the frontier, driven primarily by batch size and its effect on expert activation density. + +At the highest interactivity levels (batch 1-16), pure TP outperforms any configuration involving EP. At low batch sizes, only a small fraction of experts activate per step. With EP, these activations are distributed unevenly across GPUs: at batch 4, only 32 of 256 experts fire, and any given GPU has roughly a low double digit percent chance of receiving zero routed tokens in a given layer. TP avoids this by sharding every expert across all GPUs, so all 8 GPUs participate equally in every expert computation regardless of which experts the router selects. We collected expert activation ratio versus batch size data while profiling DeepSeek R1, which confirms that at batch sizes 16 and below, expert activation per layer is very low. + +
+ +As we move to slightly lower interactivities, batch sizes remain small enough that expert weights are still sharded via TP rather than EP. The crossover occurs around batch 32, where approximately 50-60% of experts activate per layer. At this density, EP’s load imbalance becomes tolerable and its token-routing overhead is cheaper than the per-expert all-reduce required by TP. Configurations in this range use TEP: tensor parallelism for attention (all GPUs collaborate on each attention computation), expert parallelism for MoE layers (experts assigned to specific GPUs with all-to-all routing). In the highest throughput, lowest interactivity region of the frontier, batch sizes are large (128+) and configurations shift to full DEP: attention weights are fully replicated across all GPUs as independent data-parallel ranks, experts are distributed via EP, and batch capacity is maximized at the cost of per-token latency. + +
+ +We observe the same general pattern when extending to wide EP with disaggregated prefill. Prefill and decode run with separate parallelism strategies and node counts, both tuned to the workload and target interactivity level. Take an 8k/1k workload (prefill heavy) at the high-throughput, low-interactivity end of the frontier. Prefill is the bottleneck as each request requires a forward pass of 8192 input tokens, which is computationally expensive. Recipes in this region allocate more prefill nodes than decode (4P1D, 7P2D, 4P3D) to sustain high prefill throughput. These prefill nodes run DEP configurations, replicating attention weights across independent data-parallel ranks so that multiple long-context prefills can be processed simultaneously. Decode nodes are fewer but run wide DEP with large batch sizes by the same principle as with single node. + +On the high-interactivity end of the frontier, there are fewer concurrent requests in flight, so a single prefill instance can keep pace with incoming demand. Yet each request still requires 1024 decode steps, and at high interactivity those steps must be fast. Recipes in this region shift to more decode nodes than prefill (1P3D, 1P4D), with each decode instance running TEP at low batch size. Tensor parallelism on attention minimizes per-step latency by sharding the computation across all GPUs in the instance, while expert parallelism handles MoE routing at the moderate batch sizes where EP load balance is sufficient. Multiple small-batch decode instances, rather than fewer large-batch ones, keep per-token latency low while still providing enough concurrent serving capacity. + +
+ +
+ +
+ +## Dive into DeepSeek R1 Single Node Results + +On DeepSeek R1 FP8 1k1k, we see that MI355X is competitive with its counterpart B200 on single node scenarios, despite getting mogged on FP4 multi node scenarios. MI355X (SGLang) even beats B200 (SGLang) in throughput performance at lower interactivity levels. Moreover, MI355X (SGLang) beats B200 (TRT and SGLang) in most cases from a perf/TCO perspective. + +Unfortunately, the year is 2026, and most frontier labs and inference providers are not running FP8 nor single node inference. + +This result goes to show that AMD’s chips are great and can be extremely competitive with Nvidia if only they could move faster on the software front. Speed is the moat. + +
+ +
+ +To that end, we see MI355X fall well behind B200 in performance on FP4: + +
+ +In comparing DeepSeek R1 FP8 perf between H200 (SGLang) and MI325X (SGLang), not much has changed since our initial release of InferenceXv1 last October. The MI325X data was captured on Feb 12th, 2026 with SGLang 0.5.8 whereas the H200 data was captured Jan 23, 2026 with SGLang 0.5.7. + +One thing we note is the considerably smaller interactivity range for MI325X than H200, with H200 ranging from 30-90 tok/sec/user whereas MI325X ranges from only 13-35 tok/sec/user. This is problematic for providers who would like to serve users at a broader range of interactivity. + +
+ +## GPT-OSS 120B Single Node + +MI300X, MI325X, H200, and H100 group in the lower-left of the throughput vs interactivity plot, indicating broadly similar tradeoffs, with Nvidia generally holding a modest lead. The next step up is MI355X, which delivers roughly more than 2x higher token throughput per GPU at a given interactivity level, relative to that first group. Within MI355X, ATOM shifts the curve toward higher throughput at low interactivity, suggesting it prioritizes peak throughput over per-user responsiveness. + +Above that tier sits NVIDIA’s B200 and GB200, which outperform MI355X across the frontier. While B200 and GB200 share the same Blackwell compute die, GB200 achieves a higher throughput–interactivity curve because the platform and serving stack reduce non-compute bottlenecks at scale (interconnect/topology, CPU-GPU coupling, and runtime scheduling), translating into effective scale-out and less overhead per token. + +
+ +If we add cost into the equation, MI355X becomes more competitive, beating B200 at high throughputs. However, GB200 still takes the cake for being the cheapest choice. + +
+ +Turning again to the comparison between B200 and GB200 NVL72, it is obvious the impact NVL72 has. We discussed the impact of the GB200 NVL72’s larger 72 GPU scale-up world size vs the B200’s 8 GPU scale-up world size earlier in this article. The output token throughput per GPU more than doubles in the ~100 tok/s/user interactivity range, showing the impact of the NVL72’s larger scale up domain. + +
+ +## Core InferenceX Repo Updates + +We have made a few core architectural changes to the InferenceX repository to make it easier to understand and reproduce benchmarks. Additionally, we have fully subscribed to AI usage to maximize productivity and increase developer velocity. + +## Core Changes Since InferenceXv1 + +One of the main changes we have made since v1 is the cadence with which we perform sweeps. Previously, we were jestermaxing and performed a full sweep over each configuration nightly. However, as we added more chips, disaggregated prefill, wide EP, and other features, we realized that running every single night was way too time consuming and wasteful. Moreover, it’s just not necessary – benchmarks only really need to be re-run when recipes change or a new software version is released. + +We now trigger sweeps based on additions to a [changelog](https://github.com/InferenceMAX/InferenceMAX/blob/main/perf-changelog.yaml) at the root of the repo. When a developer makes a performance-impacting change to a given config, they add an entry to the changelog listing the affected config along with a brief description of the change. All configs are defined in a [master configuration YAML file](https://github.com/InferenceMAX/InferenceMAX/blob/main/.github/configs/nvidia-master.yaml), which serves as the stateful representation of every data point to be swept, including core settings like ISL/OSL, EP, TP, DP, MTP, and so on. When a PR containing a changelog addition is merged, a workflow parses the referenced config keys, pulls the corresponding sweep definitions from the master config, and fans them out as individual GitHub Actions jobs. The jobs collect all data points for the full sweep and upload the results as artifacts. + +Below is a high-level diagram of how InferenceX launches jobs. + +
+ +## Klaud Cold AI Usage + +Shortly after the release of InferenceX v1, we realized how much developer throughput was being left on the table by not utilizing AI more in our InferenceX development. So, we rolled our sleeves up and decided to embrace Claude Code and begin absorbing intelligence, one token at a time to the point that we are currently spending at a $6,000/day run rate. If you want to contribute towards our KPI of absorbing an annualized $3 million worth of Claude intelligence, [apply here to join the mission.](https://app.dover.com/apply/semianalysis/2a9c8da5-6d59-4ac8-8302-3877345dbce1) We started our enlightenment journey when we realized the GitHub Copilot agent was free – at first we couldn’t believe this feature came at no cost! We soon realized that Copilot is terrible and it became apparent why GitHub was giving it away for free. You probably would have had to *pay us* to keep using it. + +We had been using Claude Code locally ever since it was released. But recently, we have integrated Claude Code into InferenceX development, using it for the usual tasks such as reviewing PRs, but we also have given it the ability to perform sweeps on clusters. With the workflows we set up, Claude can manually initiate runs, view the results, and iterate. This has enabled us to deploy quick fixes easily on the go via the GitHub app. + +Another cool use case is using Claude to find recipes for new vLLM/SGLang images. When a new image is released, recipes sometimes need to be updated to achieve optimal performance (new environment variables, modified engine arguments, etc.). With our Claude Code integration, we simply open an issue and ask Claude to search through all commits in the image changelog to find necessary changes to be added to the recipe. This works quite well, and although it’s not _perfect_, it often gives a good starting point. 
+ +## GitHub Actions + +In the spirit of open source, all runs occur on GitHub Actions, so benchmark results are verifiable, transparent, and reproducible. However, GitHub outages have been a constant obstacle to our goals recently. [We have seen more unicorns lately than any other animal](https://github.com/503.html)! But maybe it’s time for us to touch some grass. + +Microsoft/GitHub themselves are aware of this and have stopped updating their status page with aggregate uptime numbers and are down to a single 9: 97.36% over the past 90 days. The problem doesn’t seem to go away if you choose to ignore it... + +
+ +
+ +All in all, GitHub Actions is just alright. It provides a painfully average experience for developers. It is certainly not meant for launching thousands of jobs across a fleet of hundreds of GPUs. Nevertheless, we have worked closely with some GitHub Actions engineers since our launch to better meet the needs of InferenceX, and we can confidently say they have been a pleasure to work with. Moreover, one of our direct asks was to implement lazy loading for jobs when clicking on a workflow run and, while it did take them a while, [they eventually implemented the feature.](http://github.blog/changelog/2025-12-22-improved-performance-for-github-actions-workflows-page/) + +## Future of InferenceX + +Since the initial release of InferenceX in early October 2025, we have worked hard to continuously improve InferenceX. After release, we spent some time refactoring the codebase to make it more scalable, such that new models and inference techniques can now be added in a “plug and play” fashion. These changes enabled us to seamlessly integrate PD-disagg benchmarks for H100, H200, B200, B300, GB200, GB300, and MI355X. We also added accuracy evaluations to our default benchmark pipeline to ensure visibility into model performance across all configurations. + +Although we have made many improvements since our release, there is still much work to be done to achieve the north star goal of providing the most real-world inference benchmarks possible. To achieve this goal, we plan to benchmark on real datasets, add an agentic coding performance benchmark, include more SOTA inference optimizations, benchmark more models, and so much more. + +## Migration to Real Multi-Turn Chat and Agentic Coding Datasets + +Currently, InferenceX uses completely random tokens as input for benchmarking. We then vary the ISL/OSL uniformly subject to the distribution [ISL*0.8, ISL], similarly for OSL. 
Because of the random data, we disable prefix caching in all our benchmarks, as the expected value of a prefix cache hit rate on completely random data is 0%. Furthermore, all the random data is single-turn, meaning each conversation contains only one prompt and one response. While this provides a good baseline Pareto frontier, it is not a practical benchmark setup that mimics real-world production inference workloads. + +In the near term, we will create a basic multi-turn benchmark with a dataset like [allenai/WildChat-4.8M](https://huggingface.co/datasets/allenai/WildChat-4.8M), which captures real users’ multi-turn conversations. In addition to enabling prefix caching on all scenarios, we will enable KV cache CPU offloading, as this is what we see being done in production workloads. This will more accurately evaluate the strengths and weaknesses of each chip. For instance, MI355X has 288GB HBM3e versus B200’s 192GB. Therefore, we expect MI355X to perform better in high-concurrency multi-turn scenarios as more memory can be allocated to the KV cache. On the other hand, in scenarios where the GPU KV cache is stressed and blocks are offloaded to the CPU, we expect the GBs to excel as these chips have 900GB/s bidirectional CPU-GPU bandwidth, compared to 128GB/s / 256GB/s on HGX with PCIe 5.0 and 6.0, respectively. Moreover, currently we see AMD’s software for CPU offloading is poor, which may negatively affect performance in the same scenarios. + +The point is: real-world multiturn datasets test more SOTA inference engine features and can capture more nuanced and robust performance data across all chips. + +With the rise of Claude Code, Codex, and Kimi, it is becoming increasingly important to benchmark performance in agentic coding scenarios. Like above, these scenarios are multi-turn but also include extremely long context conversations as well as tool use. 
In the next few months, we plan on creating a benchmark suite that will most accurately capture the performance of open models in these agentic coding scenarios across all chips. + +## Adding TPU, Trainium and More Models + +Currently, we continuously benchmark DeepSeek R1 and GPT OSS 120B (previously Llama 3.1 70B as well). To keep up with the newest model architectures, we plan on adding DeepSeek V3.2 (w/ DSA), DeepSeek V4 on Day 0, Kimi K2.5, Qwen3, GLM5, and many more over the course of the next few months. We will also eventually add multi-modal models and be using EPD & CFD (invented by TogetherAI) optimization too. + +In addition to new models, we are actively working on adding both TPU and Trainium. + +## Total Cost of Ownership (NVL72, Blackwell, Blackwell Ultra, MI355, Hopper, MI325, MI300) + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + + + +--- + +_This article continues on our Substack. 
[Subscribe to SemiAnalysis](https://newsletter.semianalysis.com/subscribe) to read the complete article._ diff --git a/packages/app/cypress/e2e/blog.cy.ts b/packages/app/cypress/e2e/blog.cy.ts new file mode 100644 index 00000000..f8f8a523 --- /dev/null +++ b/packages/app/cypress/e2e/blog.cy.ts @@ -0,0 +1,52 @@ +describe('Blog', () => { + describe('Blog listing page', () => { + before(() => { + cy.visit('/blog'); + }); + + it('renders the blog page with heading', () => { + cy.get('h2').should('contain.text', 'Articles'); + }); + + it('displays at least one blog post card', () => { + cy.get('article').should('have.length.gte', 1); + }); + + it('post cards have titles and excerpts', () => { + cy.get('article') + .first() + .within(() => { + cy.get('h2').should('exist').and('not.be.empty'); + cy.get('p').should('exist'); + }); + }); + + it('post cards link to individual posts', () => { + cy.get('a[href^="/blog/"]').should('have.length.gte', 1); + }); + }); + + describe('Blog post page', () => { + before(() => { + cy.visit('/blog/inferencemax-open-source-inference-benchmarking'); + }); + + it('renders the post title', () => { + cy.get('h2').should('contain.text', 'InferenceMAX'); + }); + + it('displays post metadata', () => { + cy.contains('SemiAnalysis').should('exist'); + cy.contains('min read').should('exist'); + }); + + it('renders the article content', () => { + cy.get('article.prose').should('exist'); + cy.get('article.prose').should('contain.text', 'InferenceMAX'); + }); + + it('has a back link to the blog listing', () => { + cy.get('a[href="/blog"]').should('exist'); + }); + }); +}); diff --git a/packages/app/cypress/e2e/gpu-specs.cy.ts b/packages/app/cypress/e2e/gpu-specs.cy.ts index 61ad29e7..9d78bd19 100644 --- a/packages/app/cypress/e2e/gpu-specs.cy.ts +++ b/packages/app/cypress/e2e/gpu-specs.cy.ts @@ -165,7 +165,7 @@ describe('GPU Specs Tab', () => { .within(() => { cy.get('defs pattern[id^="logo-scaleout-"]').should('exist'); cy.get('defs 
pattern[id^="logo-scaleout-"] image') - .should('have.attr', 'href', '/logo.png') + .should('have.attr', 'href', '/brand/logo-color.png') .and('have.attr', 'opacity', '0.1'); }); }); @@ -176,7 +176,7 @@ describe('GPU Specs Tab', () => { .within(() => { cy.get('defs pattern[id^="logo-scaleup-sw-"]').should('exist'); cy.get('defs pattern[id^="logo-scaleup-sw-"] image') - .should('have.attr', 'href', '/logo.png') + .should('have.attr', 'href', '/brand/logo-color.png') .and('have.attr', 'opacity', '0.1'); }); }); @@ -187,7 +187,7 @@ describe('GPU Specs Tab', () => { .within(() => { cy.get('defs pattern[id^="logo-scaleup-mesh-"]').should('exist'); cy.get('defs pattern[id^="logo-scaleup-mesh-"] image') - .should('have.attr', 'href', '/logo.png') + .should('have.attr', 'href', '/brand/logo-color.png') .and('have.attr', 'opacity', '0.1'); }); }); @@ -198,7 +198,7 @@ describe('GPU Specs Tab', () => { .within(() => { cy.get('defs pattern[id^="logo-scaleup-nvl72-"]').should('exist'); cy.get('defs pattern[id^="logo-scaleup-nvl72-"] image') - .should('have.attr', 'href', '/logo.png') + .should('have.attr', 'href', '/brand/logo-color.png') .and('have.attr', 'opacity', '0.1'); }); }); @@ -353,7 +353,7 @@ describe('GPU Specs Radar Chart View', () => { .within(() => { cy.get('defs pattern#logo-pattern').should('exist'); cy.get('defs pattern#logo-pattern image') - .should('have.attr', 'href', '/logo.png') + .should('have.attr', 'href', '/brand/logo-color.png') .and('have.attr', 'opacity', '0.1'); }); }); diff --git a/packages/app/next.config.ts b/packages/app/next.config.ts index c5ee5988..0e558aaf 100644 --- a/packages/app/next.config.ts +++ b/packages/app/next.config.ts @@ -2,6 +2,13 @@ import type { NextConfig } from 'next'; const nextConfig: NextConfig = { transpilePackages: ['@semianalysisai/inferencex-constants'], + serverExternalPackages: ['shiki'], + images: { + remotePatterns: [ + { hostname: 'placehold.co' }, + { hostname: 'substack-post-media.s3.amazonaws.com' }, + ], + 
}, }; export default nextConfig; diff --git a/packages/app/package.json b/packages/app/package.json index 747a59f3..cb46bcb6 100644 --- a/packages/app/package.json +++ b/packages/app/package.json @@ -36,6 +36,8 @@ "@radix-ui/react-tooltip": "^1.2.8", "@semianalysisai/inferencex-constants": "workspace:*", "@semianalysisai/inferencex-db": "workspace:*", + "@shikijs/rehype": "^4.0.2", + "@tailwindcss/typography": "^0.5.19", "@tanstack/react-query": "^5.95.2", "@vercel/analytics": "^2.0.1", "@vercel/blob": "^2.3.1", @@ -44,13 +46,17 @@ "class-variance-authority": "^0.7.1", "clsx": "^2.1.1", "d3": "^7.9.0", + "gray-matter": "^4.0.3", "lodash-es": "^4.17.23", "lucide-react": "^1.0.1", "next": "16.2.1", + "next-mdx-remote": "^6.0.0", "next-themes": "^0.4.6", "posthog-js": "^1.363.3", "react": "19.2.4", "react-dom": "19.2.4", + "remark-gfm": "^4.0.1", + "shiki": "^4.0.2", "tailwind-merge": "^3.5.0" }, "devDependencies": { @@ -59,6 +65,7 @@ "@types/adm-zip": "^0.5.8", "@types/d3": "^7.4.3", "@types/lodash-es": "^4.17.12", + "@types/mdast": "^4.0.4", "@types/node": "^25.5.0", "@types/react": "^19.2.14", "@types/react-dom": "^19.2.3", diff --git a/packages/app/public/left-pattern-full.svg b/packages/app/public/brand/left-pattern-full.svg similarity index 100% rename from packages/app/public/left-pattern-full.svg rename to packages/app/public/brand/left-pattern-full.svg diff --git a/packages/app/public/brand/logo-color.png b/packages/app/public/brand/logo-color.png new file mode 100644 index 00000000..0ca97f5d Binary files /dev/null and b/packages/app/public/brand/logo-color.png differ diff --git a/packages/app/public/logo.webp b/packages/app/public/brand/logo-color.webp similarity index 100% rename from packages/app/public/logo.webp rename to packages/app/public/brand/logo-color.webp diff --git a/packages/app/public/brand/logo-white.png b/packages/app/public/brand/logo-white.png new file mode 100755 index 00000000..805544b9 Binary files /dev/null and 
b/packages/app/public/brand/logo-white.png differ diff --git a/packages/app/public/brand/og-tiles/gold-circuit.png b/packages/app/public/brand/og-tiles/gold-circuit.png new file mode 100644 index 00000000..7879748c Binary files /dev/null and b/packages/app/public/brand/og-tiles/gold-circuit.png differ diff --git a/packages/app/public/brand/og-tiles/gold-diagonal.png b/packages/app/public/brand/og-tiles/gold-diagonal.png new file mode 100644 index 00000000..9998a87e Binary files /dev/null and b/packages/app/public/brand/og-tiles/gold-diagonal.png differ diff --git a/packages/app/public/brand/og-tiles/gold-wavy.png b/packages/app/public/brand/og-tiles/gold-wavy.png new file mode 100644 index 00000000..8b6ceb85 Binary files /dev/null and b/packages/app/public/brand/og-tiles/gold-wavy.png differ diff --git a/packages/app/public/brand/og-tiles/teal-chevron.png b/packages/app/public/brand/og-tiles/teal-chevron.png new file mode 100644 index 00000000..65c2bc13 Binary files /dev/null and b/packages/app/public/brand/og-tiles/teal-chevron.png differ diff --git a/packages/app/public/brand/og-tiles/teal-chip.png b/packages/app/public/brand/og-tiles/teal-chip.png new file mode 100644 index 00000000..5d9a165f Binary files /dev/null and b/packages/app/public/brand/og-tiles/teal-chip.png differ diff --git a/packages/app/public/brand/og-tiles/teal-circuit.png b/packages/app/public/brand/og-tiles/teal-circuit.png new file mode 100644 index 00000000..d8f21df7 Binary files /dev/null and b/packages/app/public/brand/og-tiles/teal-circuit.png differ diff --git a/packages/app/public/brand/og-tiles/teal-organic.png b/packages/app/public/brand/og-tiles/teal-organic.png new file mode 100644 index 00000000..6b722bf9 Binary files /dev/null and b/packages/app/public/brand/og-tiles/teal-organic.png differ diff --git a/packages/app/public/brand/og-tiles/teal-topo.png b/packages/app/public/brand/og-tiles/teal-topo.png new file mode 100644 index 00000000..7ba281f0 Binary files /dev/null and 
b/packages/app/public/brand/og-tiles/teal-topo.png differ diff --git a/packages/app/public/right-pattern-full.svg b/packages/app/public/brand/right-pattern-full.svg similarity index 100% rename from packages/app/public/right-pattern-full.svg rename to packages/app/public/brand/right-pattern-full.svg diff --git a/packages/app/public/logo-black.png b/packages/app/public/logo-black.png deleted file mode 100644 index 48c7e4ca..00000000 Binary files a/packages/app/public/logo-black.png and /dev/null differ diff --git a/packages/app/public/logo.png b/packages/app/public/logo.png deleted file mode 100644 index 69343020..00000000 Binary files a/packages/app/public/logo.png and /dev/null differ diff --git a/packages/app/src/app/blog/[slug]/og-image-render.tsx b/packages/app/src/app/blog/[slug]/og-image-render.tsx new file mode 100644 index 00000000..06ce2aaf --- /dev/null +++ b/packages/app/src/app/blog/[slug]/og-image-render.tsx @@ -0,0 +1,189 @@ +/** + * Blog OG image — circuit tile sidebar with content panel. + */ +import { readFile } from 'node:fs/promises'; +import { join } from 'node:path'; + +import { ImageResponse } from 'next/og'; + +import type { BlogPostMeta } from '@/lib/blog'; + +export const size = { width: 1200, height: 630 }; + +const BLUE = '#0B86D1'; +const BG = '#131416'; +const PANEL_BG = '#0F1214'; + +// Tile grid layout (row-major, 2 cols). rotate: degrees to apply at render time. 
+const TILE_GRID: ({ file: string; rotate?: number } | null)[] = [ + { file: 'teal-chevron.png', rotate: 180 }, // r0c0 + { file: 'gold-diagonal.png' }, // r0c1 + { file: 'teal-circuit.png' }, // r1c0 + null, // r1c1 + { file: 'gold-wavy.png' }, // r2c0 + { file: 'teal-chip.png' }, // r2c1 + { file: 'teal-chevron.png', rotate: 90 }, // r3c0 + { file: 'teal-organic.png' }, // r3c1 + null, // r4c0 + { file: 'gold-circuit.png' }, // r4c1 + { file: 'teal-circuit.png', rotate: 180 }, // r5c0 + { file: 'teal-organic.png', rotate: 180 }, // r5c1 +]; + +async function loadTile(name: string): Promise { + const buf = await readFile(join(process.cwd(), 'public/brand/og-tiles', name)); + return `data:image/png;base64,${buf.toString('base64')}`; +} + +// Dedupe file loads — same file used multiple times only loads once +async function loadTiles() { + const uniqueFiles = [...new Set(TILE_GRID.filter(Boolean).map((t) => t!.file))]; + const loaded = await Promise.all(uniqueFiles.map(async (f) => [f, await loadTile(f)] as const)); + const cache = Object.fromEntries(loaded); + return TILE_GRID.map((t) => (t ? { src: cache[t.file], rotate: t.rotate } : null)); +} + +export async function renderOgImage(meta: BlogPostMeta) { + const [logoSrc, tiles] = await Promise.all([ + readFile(join(process.cwd(), 'public/brand/logo-color.png')).then( + (buf) => `data:image/png;base64,${buf.toString('base64')}`, + ), + loadTiles(), + ]); + const titleSize = meta.title.length > 50 ? 56 : meta.title.length > 35 ? 
64 : 72; + + // Estimate how much subtitle fits after the title + const contentWidth = 895; // 1200 - 195 panel - 55*2 padding + const textBoxHeight = 482; // 630 - 48 top - 20 bottom - 80 logo row + const titleLineH = Math.ceil(titleSize * 1.2); + const charsPerTitleLine = Math.floor(contentWidth / (titleSize * 0.55)); + const titleLines = Math.ceil(meta.title.length / charsPerTitleLine); + const titleHeight = titleLines * titleLineH + 18; // +18 for gap + const subtitleLineH = Math.ceil(42 * 1.4); + const subtitleSpace = textBoxHeight - titleHeight; + const maxSubtitleLines = Math.max(0, Math.floor(subtitleSpace / subtitleLineH)); + const charsPerSubtitleLine = Math.floor(contentWidth / (42 * 0.52)); + const maxSubtitleChars = maxSubtitleLines * charsPerSubtitleLine; + + let subtitle = meta.subtitle; + if (subtitle.length > maxSubtitleChars && maxSubtitleChars > 0) { + subtitle = subtitle.slice(0, maxSubtitleChars).replace(/\s\S*$/, '') + '…'; + } else if (maxSubtitleChars <= 0) { + subtitle = ''; + } + + return new ImageResponse( +
+ {/* Left panel */} +
+ {/* Grid of circuit tile images */} + {tiles.map((tile, i) => { + if (!tile) return null; + const row = Math.floor(i / 2); + const col = i % 2; + return ( + + ); + })} + {/* Blue accent bar */} +
+
+ + {/* Content */} +
+
+
+ {meta.title} +
+
+ {subtitle} +
+
+ +
+ + {new Date(meta.date + 'T00:00:00Z').toLocaleDateString('en-US', { + year: 'numeric', + month: 'long', + day: 'numeric', + timeZone: 'UTC', + })} + + +
+
+
, + size, + ); +} diff --git a/packages/app/src/app/blog/[slug]/opengraph-image.tsx b/packages/app/src/app/blog/[slug]/opengraph-image.tsx new file mode 100644 index 00000000..92f7349b --- /dev/null +++ b/packages/app/src/app/blog/[slug]/opengraph-image.tsx @@ -0,0 +1,41 @@ +import { ImageResponse } from 'next/og'; + +import { getAllPosts, getPostBySlug } from '@/lib/blog'; + +import { renderOgImage, size } from './og-image-render'; + +export const alt = 'InferenceX Articles'; +export { size }; +export const contentType = 'image/png'; + +export function generateStaticParams() { + return getAllPosts().map((post) => ({ slug: post.slug })); +} + +export default async function OgImage({ params }: { params: Promise<{ slug: string }> }) { + const { slug } = await params; + const result = getPostBySlug(slug); + + if (!result) { + return new ImageResponse( +
+ InferenceX Articles +
, + size, + ); + } + + return await renderOgImage(result.meta); +} diff --git a/packages/app/src/app/blog/[slug]/page.tsx b/packages/app/src/app/blog/[slug]/page.tsx new file mode 100644 index 00000000..4de8d8dd --- /dev/null +++ b/packages/app/src/app/blog/[slug]/page.tsx @@ -0,0 +1,199 @@ +import type { Metadata } from 'next'; +import { notFound } from 'next/navigation'; +import { compileMDX } from 'next-mdx-remote/rsc'; +import rehypeShikiFromHighlighter from '@shikijs/rehype/core'; +import remarkGfm from 'remark-gfm'; +import { createHighlighterCore } from 'shiki/core'; +import { createOnigurumaEngine } from 'shiki/engine/oniguruma'; + +import { BlogBackLink } from '@/components/blog/blog-back-link'; +import { BlogPostNav } from '@/components/blog/blog-post-nav'; +import { BlogToc } from '@/components/blog/blog-toc'; +import { HashScroll } from '@/components/blog/hash-scroll'; +import { createMdxComponents } from '@/components/blog/mdx-components'; +import { ReadingProgressBar } from '@/components/blog/reading-progress-bar'; +import { ShareTwitterButton, ShareLinkedInButton } from '@/components/share-buttons'; +import { Card } from '@/components/ui/card'; +import { getAllPosts, getAdjacentPosts, extractHeadings, getPostBySlug } from '@/lib/blog'; +import { + AUTHOR_HANDLE, + AUTHOR_NAME, + SITE_NAME, + SITE_URL, +} from '@semianalysisai/inferencex-constants'; + +interface Props { + params: Promise<{ slug: string }>; +} + +export function generateStaticParams() { + return getAllPosts().map((post) => ({ slug: post.slug })); +} + +export async function generateMetadata({ params }: Props): Promise { + const { slug } = await params; + const result = getPostBySlug(slug); + if (!result) return {}; + const { meta } = result; + + return { + title: meta.title, + description: meta.subtitle, + keywords: meta.tags, + authors: [{ name: AUTHOR_NAME }], + alternates: { canonical: `${SITE_URL}/blog/${slug}` }, + openGraph: { + title: `${meta.title} | ${SITE_NAME}`, + 
description: meta.subtitle, + url: `${SITE_URL}/blog/${slug}`, + type: 'article', + publishedTime: `${meta.date}T00:00:00Z`, + ...(meta.modifiedDate && { modifiedTime: `${meta.modifiedDate}T00:00:00Z` }), + authors: [AUTHOR_NAME], + tags: meta.tags, + }, + twitter: { + card: 'summary_large_image', + title: meta.title, + description: meta.subtitle, + site: AUTHOR_HANDLE, + creator: AUTHOR_HANDLE, + }, + }; +} + +let highlighterPromise: ReturnType | null = null; + +function getHighlighter() { + if (!highlighterPromise) { + highlighterPromise = createHighlighterCore({ + themes: [import('shiki/themes/github-dark.mjs'), import('shiki/themes/github-light.mjs')], + langs: [ + import('shiki/langs/typescript.mjs'), + import('shiki/langs/javascript.mjs'), + import('shiki/langs/python.mjs'), + import('shiki/langs/bash.mjs'), + import('shiki/langs/json.mjs'), + import('shiki/langs/yaml.mjs'), + import('shiki/langs/css.mjs'), + import('shiki/langs/html.mjs'), + import('shiki/langs/tsx.mjs'), + import('shiki/langs/jsx.mjs'), + import('shiki/langs/sql.mjs'), + import('shiki/langs/go.mjs'), + import('shiki/langs/rust.mjs'), + ], + engine: createOnigurumaEngine(import('shiki/wasm')), + }); + } + return highlighterPromise; +} + +export default async function BlogPostPage({ params }: Props) { + const { slug } = await params; + const result = getPostBySlug(slug); + if (!result) notFound(); + + const { meta, raw } = result; + const adjacent = getAdjacentPosts(slug); + const headings = extractHeadings(raw); + const highlighter = await getHighlighter(); + + const { content } = await compileMDX({ + source: raw, + components: createMdxComponents(), + options: { + mdxOptions: { + remarkPlugins: [remarkGfm], + rehypePlugins: [ + [ + rehypeShikiFromHighlighter, + highlighter, + { + themes: { dark: 'github-dark', light: 'github-light' }, + defaultColor: false, + }, + ], + ], + }, + }, + }); + + const jsonLd = { + '@context': 'https://schema.org', + '@type': 'BlogPosting', + headline: 
meta.title, + author: { '@type': 'Person', name: AUTHOR_NAME }, + publisher: { '@type': 'Organization', name: AUTHOR_NAME }, + datePublished: `${meta.date}T00:00:00Z`, + ...(meta.modifiedDate && { dateModified: `${meta.modifiedDate}T00:00:00Z` }), + description: meta.subtitle, + url: `${SITE_URL}/blog/${slug}`, + wordCount: raw.trim().split(/\s+/).length, + timeRequired: `PT${meta.readingTime}M`, + }; + + return ( +
+ + + +
+
+ + +
+

{meta.title}

+

{meta.subtitle}

+
+ {AUTHOR_NAME} + · + + · + {meta.readingTime} min read + {meta.tags && meta.tags.length > 0 && ( + <> + · + {meta.tags.map((tag) => ( + + {tag} + + ))} + + )} +
+
+ + +
+
+
+ {headings.length > 0 && } + +
+ {content} +

+ All articles and posts are © SemiAnalysis. All rights reserved. The AGPL-3.0 + license covering the application source code does not apply to article content. +

+
+
+ +
+
+
+ ); +} diff --git a/packages/app/src/app/blog/page.tsx b/packages/app/src/app/blog/page.tsx new file mode 100644 index 00000000..502ef2bc --- /dev/null +++ b/packages/app/src/app/blog/page.tsx @@ -0,0 +1,118 @@ +import type { Metadata } from 'next'; +import Link from 'next/link'; + +import { BlogPostCard } from '@/components/blog/blog-post-card'; +import { BlogTagLink } from '@/components/blog/blog-tag-link'; +import { Card } from '@/components/ui/card'; +import { getAllPosts } from '@/lib/blog'; +import { SITE_URL, SITE_NAME, AUTHOR_NAME } from '@semianalysisai/inferencex-constants'; + +export const metadata: Metadata = { + title: 'Articles', + description: `Technical articles from ${SITE_NAME} by ${AUTHOR_NAME} — AI inference benchmarking, GPU performance analysis, and ML infrastructure insights.`, + alternates: { canonical: `${SITE_URL}/blog` }, + openGraph: { + title: `Articles | ${SITE_NAME} by ${AUTHOR_NAME}`, + description: 'AI inference benchmarking insights and GPU performance analysis.', + url: `${SITE_URL}/blog`, + }, +}; + +const jsonLd = { + '@context': 'https://schema.org', + '@type': 'Blog', + name: `${SITE_NAME} Articles`, + url: `${SITE_URL}/blog`, + publisher: { + '@type': 'Organization', + name: AUTHOR_NAME, + }, +}; + +export default async function BlogPage({ + searchParams, +}: { + searchParams: Promise<{ tag?: string }>; +}) { + const { tag: activeTag } = await searchParams; + const posts = getAllPosts(); + const allTags = [...new Set(posts.flatMap((p) => p.tags ?? []))].sort(); + const filtered = activeTag ? posts.filter((p) => p.tags?.includes(activeTag)) : posts; + + return ( +
+ +
+
+ +

Articles

+

+ Insights on AI inference benchmarking, GPU performance, and ML infrastructure. +

+ {allTags.length > 0 && ( +
+ + All + + {allTags.map((tag) => ( + + ))} +
+ )} +
+ + {filtered.length === 0 ? ( +

+ {activeTag ? `No articles tagged "${activeTag}".` : 'Coming soon.'} +

+ ) : ( +
+ {filtered.map((post) => ( + +
+
+ + · + {post.readingTime} min read +
+

+ {post.title} +

+

{post.subtitle}

+ {post.tags && post.tags.length > 0 && ( +
+ {post.tags.map((tag) => ( + + {tag} + + ))} +
+ )} +
+
+ ))} +
+ )} +
+
+
+
+ ); +} diff --git a/packages/app/src/app/feed.xml/route.test.ts b/packages/app/src/app/feed.xml/route.test.ts new file mode 100644 index 00000000..2168a29f --- /dev/null +++ b/packages/app/src/app/feed.xml/route.test.ts @@ -0,0 +1,50 @@ +import { describe, expect, it, vi } from 'vitest'; + +import type { BlogPostMeta } from '@/lib/blog'; + +const FAKE_POSTS: BlogPostMeta[] = [ + { + title: 'Test Post', + subtitle: 'A test subtitle', + date: '2026-01-15', + slug: 'test-post', + readingTime: 1, + tags: ['testing'], + }, +]; + +vi.mock('@/lib/blog', () => ({ + getAllPosts: () => FAKE_POSTS, +})); + +const { GET } = await import('./route'); + +describe('RSS feed route', () => { + it('returns valid RSS XML with correct content type', async () => { + const response = await GET(); + + expect(response.headers.get('Content-Type')).toBe('application/rss+xml; charset=utf-8'); + + const body = await response.text(); + expect(body).toContain(''); + expect(body).toContain(''); + }); + + it('includes blog posts in the feed', async () => { + const response = await GET(); + const body = await response.text(); + + expect(body).toContain(''); + expect(body).toContain('Test Post'); + }); + + it('includes required RSS namespaces', async () => { + const response = await GET(); + const body = await response.text(); + + expect(body).toContain('xmlns:dc='); + expect(body).toContain('xmlns:atom='); + }); +}); diff --git a/packages/app/src/app/feed.xml/route.ts b/packages/app/src/app/feed.xml/route.ts new file mode 100644 index 00000000..1ecb24ed --- /dev/null +++ b/packages/app/src/app/feed.xml/route.ts @@ -0,0 +1,55 @@ +import { getAllPosts } from '@/lib/blog'; +import { AUTHOR_NAME, SITE_NAME, SITE_URL } from '@semianalysisai/inferencex-constants'; + +function escapeXml(s: string): string { + return s + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"') + .replace(/'/g, '''); +} + +export async function GET() { + const posts = getAllPosts(); + const now = new 
Date().toUTCString(); + + const items = posts + .map( + (post) => ` + ${escapeXml(post.title)} + ${SITE_URL}/blog/${post.slug} + ${SITE_URL}/blog/${post.slug} + ${escapeXml(post.subtitle)} + ${escapeXml(AUTHOR_NAME)} + ${new Date(post.date + 'T00:00:00Z').toUTCString()}${ + post.tags + ? post.tags.map((tag) => `\n ${escapeXml(tag)}`).join('') + : '' + } + `, + ) + .join('\n'); + + const xml = ` + + + ${escapeXml(SITE_NAME)} Articles + Technical articles from ${escapeXml(SITE_NAME)} by ${escapeXml(AUTHOR_NAME)} + ${SITE_URL}/blog + ${now} + +${items} + +`; + + return new Response(xml, { + headers: { + 'Content-Type': 'application/rss+xml; charset=utf-8', + 'Cache-Control': 'public, max-age=3600, s-maxage=3600', + }, + }); +} diff --git a/packages/app/src/app/globals.css b/packages/app/src/app/globals.css index 55c11214..8ab14ee3 100644 --- a/packages/app/src/app/globals.css +++ b/packages/app/src/app/globals.css @@ -1,8 +1,40 @@ @import 'tailwindcss'; @import 'tw-animate-css'; +@plugin '@tailwindcss/typography'; @custom-variant dark (&:is(.dark *)); +/* Allow hash navigation to clear the fixed navbar */ +.blog-prose h2[id], +.blog-prose h3[id] { + scroll-margin-top: 6rem; +} + +/* Allow long inline code to wrap on mobile */ +.blog-prose code { + overflow-wrap: break-word; + word-break: break-all; +} + +/* Remove auto-inserted curly quotes from blockquotes in blog prose */ +.blog-prose blockquote p:first-of-type::before, +.blog-prose blockquote p:last-of-type::after { + content: none; +} + +/* Shiki dual-theme: light by default, dark when .dark class is active */ +.shiki, +.shiki span { + color: var(--shiki-light); + background-color: var(--shiki-light-bg); +} + +.dark .shiki, +.dark .shiki span { + color: var(--shiki-dark); + background-color: var(--shiki-dark-bg); +} + @theme inline { /* Radius */ --radius-sm: calc(var(--radius) - 4px); diff --git a/packages/app/src/app/layout.tsx b/packages/app/src/app/layout.tsx index ad753cf1..dc0a0009 100644 --- 
a/packages/app/src/app/layout.tsx +++ b/packages/app/src/app/layout.tsx @@ -76,6 +76,9 @@ export const metadata: Metadata = { }, alternates: { canonical: SITE_URL, + types: { + 'application/rss+xml': `${SITE_URL}/feed.xml`, + }, }, icons: { icon: [ @@ -158,13 +161,12 @@ export default function RootLayout({ + + - { + const result = getPostBySlug(post.slug); + if (!result) return ''; + + return [ + `# ${post.title}`, + '', + `> ${post.subtitle}`, + '', + `- **Author**: ${AUTHOR_NAME}`, + `- **Date**: ${post.date}`, + `- **URL**: ${SITE_URL}/blog/${post.slug}`, + ...(post.tags ? [`- **Tags**: ${post.tags.join(', ')}`] : []), + `- **Reading time**: ${post.readingTime} min`, + '', + result.raw, + ].join('\n'); + }); + + const body = [ + `# ${SITE_NAME} Articles — Full Content`, + `> By ${AUTHOR_NAME}`, + '', + `This file contains the full text of all articles from ${SITE_NAME} (${SITE_URL}/blog).`, + `It is intended for consumption by large language models and AI assistants.`, + '', + '---', + '', + ...sections.flatMap((s) => [s, '', '---', '']), + ].join('\n'); + + return new Response(body, { + headers: { + 'Content-Type': 'text/plain; charset=utf-8', + 'Cache-Control': 'public, max-age=3600, s-maxage=3600', + }, + }); +} diff --git a/packages/app/src/app/llms.txt/route.ts b/packages/app/src/app/llms.txt/route.ts new file mode 100644 index 00000000..5196e1e1 --- /dev/null +++ b/packages/app/src/app/llms.txt/route.ts @@ -0,0 +1,31 @@ +import { getAllPosts } from '@/lib/blog'; +import { AUTHOR_NAME, SITE_NAME, SITE_URL } from '@semianalysisai/inferencex-constants'; + +export async function GET() { + const posts = getAllPosts(); + + const lines = [ + `# ${SITE_NAME} by ${AUTHOR_NAME}`, + '', + `> ${SITE_NAME} is the open-source AI inference benchmark dashboard. 
We compare GPU performance for LLM inference across NVIDIA GB200, H100, AMD MI355X, and more.`, + '', + `## Links`, + '', + `- [Dashboard](${SITE_URL})`, + `- [Articles](${SITE_URL}/blog)`, + `- [RSS Feed](${SITE_URL}/feed.xml)`, + `- [Full content for LLMs](${SITE_URL}/llms-full.txt)`, + `- [GitHub](https://github.com/SemiAnalysisAI/InferenceX)`, + '', + `## Articles`, + '', + ...posts.map((post) => `- [${post.title}](${SITE_URL}/blog/${post.slug}): ${post.subtitle}`), + ]; + + return new Response(lines.join('\n'), { + headers: { + 'Content-Type': 'text/plain; charset=utf-8', + 'Cache-Control': 'public, max-age=3600, s-maxage=3600', + }, + }); +} diff --git a/packages/app/src/app/sitemap.ts b/packages/app/src/app/sitemap.ts index e6920a3b..69461dfe 100644 --- a/packages/app/src/app/sitemap.ts +++ b/packages/app/src/app/sitemap.ts @@ -1,5 +1,6 @@ import type { MetadataRoute } from 'next'; +import { getAllPosts } from '@/lib/blog'; import { SITE_URL as BASE_URL } from '@semianalysisai/inferencex-constants'; const TABS = [ @@ -39,5 +40,17 @@ export default function sitemap(): MetadataRoute.Sitemap { changeFrequency: 'monthly', priority: 0.6, }, + { + url: `${BASE_URL}/blog`, + lastModified: now, + changeFrequency: 'weekly', + priority: 0.8, + }, + ...getAllPosts().map((post) => ({ + url: `${BASE_URL}/blog/${post.slug}`, + lastModified: new Date((post.modifiedDate ?? 
post.date) + 'T00:00:00Z').toISOString(), + changeFrequency: 'monthly' as const, + priority: 0.7, + })), ]; } diff --git a/packages/app/src/components/blog/blog-back-link.tsx b/packages/app/src/components/blog/blog-back-link.tsx new file mode 100644 index 00000000..e895da59 --- /dev/null +++ b/packages/app/src/components/blog/blog-back-link.tsx @@ -0,0 +1,18 @@ +'use client'; + +import Link from 'next/link'; +import { track } from '@/lib/analytics'; + +export function BlogBackLink() { + return ( + + ); +} diff --git a/packages/app/src/components/blog/blog-post-card.tsx b/packages/app/src/components/blog/blog-post-card.tsx new file mode 100644 index 00000000..686d7bb8 --- /dev/null +++ b/packages/app/src/components/blog/blog-post-card.tsx @@ -0,0 +1,23 @@ +'use client'; + +import Link from 'next/link'; +import type { ReactNode } from 'react'; +import { track } from '@/lib/analytics'; + +interface BlogPostCardProps { + slug: string; + title: string; + children: ReactNode; +} + +export function BlogPostCard({ slug, title, children }: BlogPostCardProps) { + return ( + track('blog_post_clicked', { slug, title })} + > + {children} + + ); +} diff --git a/packages/app/src/components/blog/blog-post-nav.tsx b/packages/app/src/components/blog/blog-post-nav.tsx new file mode 100644 index 00000000..c006e494 --- /dev/null +++ b/packages/app/src/components/blog/blog-post-nav.tsx @@ -0,0 +1,54 @@ +'use client'; + +import Link from 'next/link'; +import { ChevronLeft, ChevronRight } from 'lucide-react'; +import { track } from '@/lib/analytics'; + +interface PostLink { + slug: string; + title: string; +} + +interface BlogPostNavProps { + prev: PostLink | null; + next: PostLink | null; +} + +export function BlogPostNav({ prev, next }: BlogPostNavProps) { + if (!prev && !next) return null; + + return ( + + ); +} diff --git a/packages/app/src/components/blog/blog-tag-link.tsx b/packages/app/src/components/blog/blog-tag-link.tsx new file mode 100644 index 00000000..c7ae87c7 --- /dev/null 
+++ b/packages/app/src/components/blog/blog-tag-link.tsx @@ -0,0 +1,28 @@ +'use client'; + +import Link from 'next/link'; +import { track } from '@/lib/analytics'; + +interface BlogTagLinkProps { + tag: string; + active?: boolean; +} + +export function BlogTagLink({ tag, active }: BlogTagLinkProps) { + return ( + { + e.stopPropagation(); + track('blog_tag_filtered', { tag }); + }} + > + {tag} + + ); +} diff --git a/packages/app/src/components/blog/blog-toc.tsx b/packages/app/src/components/blog/blog-toc.tsx new file mode 100644 index 00000000..11a8534f --- /dev/null +++ b/packages/app/src/components/blog/blog-toc.tsx @@ -0,0 +1,170 @@ +'use client'; + +import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { track } from '@/lib/analytics'; +import { Card } from '@/components/ui/card'; +import type { TocHeading } from '@/lib/blog'; + +interface BlogTocProps { + headings: TocHeading[]; +} + +export function BlogToc({ headings }: BlogTocProps) { + const [activeId, setActiveId] = useState(''); + const [showSidebar, setShowSidebar] = useState(false); + const observerRef = useRef(null); + const sectionRef = useRef(null); + const sidebarRef = useRef(null); + const sectionTopRef = useRef(0); + const sidebarLeftRef = useRef(0); + + const updateLayout = useCallback(() => { + const section = sectionRef.current ?? 
document.querySelector('[data-blog-section]'); + if (!section) return; + sectionRef.current = section as HTMLElement; + const rect = section.getBoundingClientRect(); + const rightEdge = rect.right; + sectionTopRef.current = rect.top + window.scrollY; + sidebarLeftRef.current = rightEdge + 32; + const fits = window.innerWidth - rightEdge >= 240; + setShowSidebar(fits); + if (fits && sidebarRef.current) { + sidebarRef.current.style.left = `${rightEdge + 32}px`; + const top = Math.max(32, sectionTopRef.current - window.scrollY); + sidebarRef.current.style.top = `${top}px`; + } + }, []); + + useEffect(() => { + const raf = requestAnimationFrame(updateLayout); + window.addEventListener('resize', updateLayout); + + function onScroll() { + if (!sidebarRef.current) return; + const top = Math.max(32, sectionTopRef.current - window.scrollY); + sidebarRef.current.style.top = `${top}px`; + } + + window.addEventListener('scroll', onScroll, { passive: true }); + return () => { + cancelAnimationFrame(raf); + window.removeEventListener('resize', updateLayout); + window.removeEventListener('scroll', onScroll); + }; + }, [updateLayout]); + + useEffect(() => { + const elements = headings + .map((h) => document.getElementById(h.id)) + .filter(Boolean) as HTMLElement[]; + + if (elements.length === 0) return; + + observerRef.current = new IntersectionObserver( + (entries) => { + for (const entry of entries) { + if (entry.isIntersecting) { + setActiveId(entry.target.id); + break; + } + } + }, + { rootMargin: '0px 0px -80% 0px', threshold: 0 }, + ); + + for (const el of elements) { + observerRef.current.observe(el); + } + + function onScrollEnd() { + const atBottom = window.innerHeight + window.scrollY >= document.body.scrollHeight - 50; + if (atBottom && headings.length > 0) { + setActiveId(headings[headings.length - 1].id); + } + } + + window.addEventListener('scroll', onScrollEnd, { passive: true }); + return () => { + observerRef.current?.disconnect(); + 
window.removeEventListener('scroll', onScrollEnd); + }; + }, [headings]); + + const activeIndex = useMemo( + () => headings.findIndex((h) => h.id === activeId), + [headings, activeId], + ); + + // Auto-scroll the sidebar TOC to keep the active item visible + const activeItemRef = useRef(null); + useEffect(() => { + if (!showSidebar || !activeItemRef.current) return; + activeItemRef.current.scrollIntoView({ block: 'nearest', behavior: 'smooth' }); + }, [activeId, showSidebar]); + + if (headings.length === 0) return null; + + function handleClick(heading: TocHeading) { + track('blog_toc_clicked', { heading: heading.text }); + const el = document.getElementById(heading.id); + if (!el) return; + const top = el.getBoundingClientRect().top + window.scrollY - 32; + window.scrollTo({ top, behavior: 'smooth' }); + } + + function itemClass(h: TocHeading, index: number): string { + const indent = h.level === 2 ? 'pl-3' : h.level === 3 ? 'pl-6' : ''; + if (activeId === h.id) return `${indent} text-brand font-medium`; + if (activeIndex >= 0 && index < activeIndex) return `${indent} text-muted-foreground/50`; + return `${indent} text-muted-foreground hover:text-foreground`; + } + + const list = ( +
    + {headings.map((h, i) => ( +
  • + +
  • + ))} +
+ ); + + return ( + <> + {/* Inline: when sidebar doesn't fit */} + {!showSidebar && ( + +
+ + On this page{' '} + (click to expand) + +
{list}
+
+
+ )} + + {/* Sidebar: aligned with title, follows scroll */} + {showSidebar && ( + + )} + + ); +} diff --git a/packages/app/src/components/blog/hash-scroll.tsx b/packages/app/src/components/blog/hash-scroll.tsx new file mode 100644 index 00000000..39db4193 --- /dev/null +++ b/packages/app/src/components/blog/hash-scroll.tsx @@ -0,0 +1,18 @@ +'use client'; + +import { useEffect } from 'react'; + +/** Scrolls to the element matching the URL hash after hydration. */ +export function HashScroll() { + useEffect(() => { + const hash = window.location.hash.slice(1); + if (!hash) return; + // Small delay to let the page finish rendering + const timer = setTimeout(() => { + document.getElementById(hash)?.scrollIntoView({ behavior: 'smooth' }); + }, 100); + return () => clearTimeout(timer); + }, []); + + return null; +} diff --git a/packages/app/src/components/blog/heading-link.tsx b/packages/app/src/components/blog/heading-link.tsx new file mode 100644 index 00000000..c72296e1 --- /dev/null +++ b/packages/app/src/components/blog/heading-link.tsx @@ -0,0 +1,49 @@ +'use client'; + +import { useCallback, useRef, useState } from 'react'; +import { LinkIcon } from 'lucide-react'; +import { track } from '@/lib/analytics'; + +export function HeadingLink({ id }: { id: string }) { + const [state, setState] = useState<'idle' | 'copied' | 'fading'>('idle'); + const timerRef = useRef>(undefined); + + const handleClick = useCallback( + (e: React.MouseEvent) => { + e.preventDefault(); + clearTimeout(timerRef.current); + const url = `${window.location.origin}${window.location.pathname}#${id}`; + navigator.clipboard.writeText(url).then( + () => { + track('blog_heading_link_copied', { id }); + setState('copied'); + timerRef.current = setTimeout(() => { + setState('fading'); + timerRef.current = setTimeout(() => setState('idle'), 300); + }, 2000); + }, + () => { + /* clipboard denied — silent fallback */ + }, + ); + }, + [id], + ); + + const visible = state !== 'idle'; + + return ( + + 
{state === 'idle' ? ( + + ) : ( + Link copied + )} + + ); +} diff --git a/packages/app/src/components/blog/mdx-components.tsx b/packages/app/src/components/blog/mdx-components.tsx new file mode 100644 index 00000000..da4b1ef7 --- /dev/null +++ b/packages/app/src/components/blog/mdx-components.tsx @@ -0,0 +1,122 @@ +import type { ReactNode } from 'react'; +import Image from 'next/image'; +import Link from 'next/link'; +import { slugify } from '@/lib/blog'; +import { HeadingLink } from '@/components/blog/heading-link'; + +function childrenToText(children: ReactNode): string { + if (typeof children === 'string') return children; + if (typeof children === 'number') return String(children); + if (Array.isArray(children)) return children.map(childrenToText).join(''); + if (children && typeof children === 'object' && 'props' in children) { + return childrenToText((children as { props: { children?: ReactNode } }).props.children); + } + return ''; +} + +function CustomLink(props: React.AnchorHTMLAttributes) { + const { href, children, ...rest } = props; + if (href?.startsWith('/')) { + return ( + + {children} + + ); + } + return ( + + {children} + + ); +} + +function CustomImage(props: React.ImgHTMLAttributes) { + const { src, alt, width, height } = props; + if (!src || typeof src !== 'string') return null; + return ( + {alt + ); +} + +function Blur(props: { children?: ReactNode }) { + return
{props.children}
; +} + +/** Creates a fresh set of MDX components with clean heading dedup state per render. */ +export function createMdxComponents(): Record> { + const seen = new Set(); + const parents: string[] = []; + let figureCount = 0; + + function uniqueId(text: string, level: number): string { + const base = slugify(text); + parents[level] = base; + let id = base; + if (seen.has(id)) { + const parent = parents.slice(1, level).findLast((p) => p); + id = parent ? `${parent}-${base}` : `${base}-${level}`; + } + seen.add(id); + return id; + } + + return { + h1: (props: React.HTMLAttributes) => { + const id = uniqueId(childrenToText(props.children), 1); + return ( +

+ {props.children} + +

+ ); + }, + h2: (props: React.HTMLAttributes) => { + const id = uniqueId(childrenToText(props.children), 2); + return ( +

+ {props.children} + +

+ ); + }, + h3: (props: React.HTMLAttributes) => { + const id = uniqueId(childrenToText(props.children), 3); + return ( +

+ {props.children} + +

+ ); + }, + a: CustomLink, + img: CustomImage, + Figure: (props: { src: string; alt?: string; caption?: string }) => { + const isFirst = figureCount === 0; + figureCount++; + return ( +
+ {/* eslint-disable-next-line @next/next/no-img-element */} + {props.alt + {props.caption && ( +
+ {props.caption} +
+ )} +
+ ); + }, + Blur, + }; +} diff --git a/packages/app/src/components/blog/reading-progress-bar.tsx b/packages/app/src/components/blog/reading-progress-bar.tsx new file mode 100644 index 00000000..0f7252f6 --- /dev/null +++ b/packages/app/src/components/blog/reading-progress-bar.tsx @@ -0,0 +1,52 @@ +'use client'; + +import { useEffect, useRef, useState } from 'react'; +import { track } from '@/lib/analytics'; + +export function ReadingProgressBar({ slug }: { slug: string }) { + const [progress, setProgress] = useState(0); + const firedRef = useRef>(new Set()); + const rafRef = useRef(0); + + useEffect(() => { + function onScroll() { + cancelAnimationFrame(rafRef.current); + rafRef.current = requestAnimationFrame(() => { + const article = document.querySelector('[data-blog-article]'); + if (!article) return; + + const rect = article.getBoundingClientRect(); + const viewportHeight = window.innerHeight; + // 0 when article top is at viewport top, 1 when article bottom reaches viewport bottom + const totalDistance = rect.height - viewportHeight; + const p = totalDistance <= 0 ? 1 : Math.min(1, Math.max(0, -rect.top / totalDistance)); + + setProgress(p); + + for (const milestone of [25, 50, 75, 100]) { + if (p * 100 >= milestone && !firedRef.current.has(milestone)) { + firedRef.current.add(milestone); + track('blog_read_milestone', { milestone, slug }); + } + } + }); + } + + window.addEventListener('scroll', onScroll, { passive: true }); + return () => { + window.removeEventListener('scroll', onScroll); + cancelAnimationFrame(rafRef.current); + }; + }, [slug]); + + if (progress <= 0) return null; + + return ( +
+
+
+ ); +} diff --git a/packages/app/src/components/calculator/ThroughputCalculatorDisplay.tsx b/packages/app/src/components/calculator/ThroughputCalculatorDisplay.tsx index dcde7d9e..eca1af01 100644 --- a/packages/app/src/components/calculator/ThroughputCalculatorDisplay.tsx +++ b/packages/app/src/components/calculator/ThroughputCalculatorDisplay.tsx @@ -883,7 +883,7 @@ export default function ThroughputCalculatorDisplay() { className="absolute inset-0 pointer-events-none flex items-center justify-center" aria-hidden="true" > - +
diff --git a/packages/app/src/components/footer/footer.tsx b/packages/app/src/components/footer/footer.tsx index 37f8f31e..37833757 100644 --- a/packages/app/src/components/footer/footer.tsx +++ b/packages/app/src/components/footer/footer.tsx @@ -12,7 +12,7 @@ export const Footer = () => {
{ 'before:w-1/2', 'before:h-[200%]', 'before:right-0', - "before:mask-[url('/left-pattern-full.svg')]", + "before:mask-[url('/brand/left-pattern-full.svg')]", 'before:mask-no-repeat', 'before:mask-position-[top_right]', 'before:mask-size-[100%]', @@ -44,20 +44,7 @@ export const Footer = () => { href="https://semianalysis.com/" className="inline-block w-35 h-14.5" > - - +

{ + const pathname = usePathname() ?? '/'; + return (

{ 'before:w-1/2', 'before:h-full', 'before:left-0', - "before:mask-[url('/left-pattern-full.svg')]", + "before:mask-[url('/brand/left-pattern-full.svg')]", 'before:mask-no-repeat', 'before:mask-position-[top_right]', 'before:mask-size-[100%]', @@ -45,7 +82,7 @@ export const Header = () => { By {
- track('header_dashboard_clicked')} - > - Dashboard - - track('header_media_clicked')} - > - Media - - track('header_supporters_clicked')} - > - Supporters - + {NAV_LINKS.map(({ href, label, testId, event }) => ( + track(event)} + > + {label} + + ))}
- track('header_dashboard_clicked')} - > - Dashboard - - track('header_media_clicked')} - > - Media - - track('header_supporters_clicked')} - > - Supporters - + {NAV_LINKS.map(({ href, label, event }) => ( + track(event)} + > + {label} + + ))}
diff --git a/packages/app/src/components/share-buttons.tsx b/packages/app/src/components/share-buttons.tsx index 4e54c3d2..3d73b788 100644 --- a/packages/app/src/components/share-buttons.tsx +++ b/packages/app/src/components/share-buttons.tsx @@ -13,7 +13,7 @@ function getShareUrl(): string { return window.location.href; } -export function ShareTwitterButton() { +export function ShareTwitterButton({ text }: { text?: string }) { return (