Skip to content

Commit 2190a6b

Browse files
LostBeard and claude committed
FP8 capability flags: AcceleratorRequirements.RequiresFloat8E4M3 / RequiresFloat8E5M2
Completes the FP8 story in the selection gate. The two flags mirror the existing always-true RequiresBFloat16: FP8 (like bf16/Half) is supported on every backend (1-byte storage, f32-register compute, portable conversion incl. CUDA), so they're no-op documentation filters that never rule out a backend - a consumer declares "my kernel uses FP8" for intent/symmetry, and the selection path stays consistent with the rest of the matrix. No change to the fork's Capabilities.xml is needed (there's no native-vs-emulated split to expose, same as bf16). Also corrected the RequiresBFloat16 doc comment that still claimed native sm_80 cvt (it's portable bit-manip as of 4.13.0). Gate: AcceleratorRequirementsTests.Satisfies_LowPrecisionFloats_AllDevicesPass (every device satisfies Half+bf16+E4M3+E5M2 combined); PMT_FILTER=AcceleratorRequirements 19/0/1. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent bd55818 commit 2190a6b

2 files changed

Lines changed: 53 additions & 5 deletions

File tree

SpawnDev.ILGPU.DemoConsole/AcceleratorRequirementsTests.cs

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,29 @@ public Task Satisfies_Atomics_PassesOnCpu()
5555
return Task.CompletedTask;
5656
}
5757

58+
[TestMethod]
59+
public Task Satisfies_LowPrecisionFloats_AllDevicesPass()
60+
{
61+
using var context = Context.CreateDefault();
62+
// Half, bf16 and both FP8 formats are supported on every backend (bf16/FP8 always emulated),
63+
// so these are no-op documentation filters - every device must satisfy them, including combined.
64+
var req = new AcceleratorRequirements
65+
{
66+
RequiresFloat16 = true,
67+
RequiresBFloat16 = true,
68+
RequiresFloat8E4M3 = true,
69+
RequiresFloat8E5M2 = true,
70+
};
71+
foreach (var device in context.Devices)
72+
{
73+
if (!device.Satisfies(req))
74+
throw new Exception(
75+
$"Device {device.AcceleratorType} (name={device.Name}) failed the low-precision-float " +
76+
$"requirements - Half/bf16/FP8 are supported on every backend and must never filter.");
77+
}
78+
return Task.CompletedTask;
79+
}
80+
5881
[TestMethod]
5982
public Task CreatePreferredAccelerator_NoRequirements_Returns()
6083
{

SpawnDev.ILGPU/AcceleratorRequirements.cs

Lines changed: 30 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,30 @@ public sealed class AcceleratorRequirements
7272

7373
/// <summary>
7474
/// Kernel uses BFloat16 (<c>ILGPU.BFloat16</c>, "brain float"). Every backend supports it -
75-
/// always emulated (top-16-bits-of-fp32 + round-to-nearest-even), with native <c>cvt.*.bf16</c>
76-
/// on CUDA sm_80+ used only at the load/store boundary. There is no native-vs-emulated split
77-
/// (no hardware has native bf16 arithmetic), so this is a no-op documentation filter like
75+
/// always emulated (top-16-bits-of-fp32 + round-to-nearest-even). The conversion is portable
76+
/// bit-manipulation on every backend including CUDA (4.13.0+ uses basic integer ops, NOT the
77+
/// sm_80+ native <c>cvt.*.bf16</c>, so bf16 runs on pre-Ampere cards too). There is no
78+
/// native-vs-emulated split, so this is a no-op documentation filter like
7879
/// <see cref="RequiresFloat16"/> - it never rules out a backend.
7980
/// </summary>
8081
public bool RequiresBFloat16 { get; init; }
8182

83+
/// <summary>
84+
/// Kernel uses the 8-bit float <c>ILGPU.Float8E4M3</c> (E4M3FN: 1/4/3, bias 7, no Inf, saturates
85+
/// to +-448 - the FP8 forward/inference format). Every backend supports it - always emulated
86+
/// (1-byte storage, f32-register compute, portable conversion on every backend incl. CUDA). Like
87+
/// <see cref="RequiresBFloat16"/> this is a no-op documentation filter - it never rules out a backend.
88+
/// </summary>
89+
public bool RequiresFloat8E4M3 { get; init; }
90+
91+
/// <summary>
92+
/// Kernel uses the 8-bit float <c>ILGPU.Float8E5M2</c> (1/5/2, bias 15, IEEE Inf/NaN - the FP8
93+
/// backward/gradient format). Every backend supports it - always emulated (1-byte storage,
94+
/// f32-register compute, portable conversion on every backend incl. CUDA). Like
95+
/// <see cref="RequiresBFloat16"/> this is a no-op documentation filter - it never rules out a backend.
96+
/// </summary>
97+
public bool RequiresFloat8E5M2 { get; init; }
98+
8299
/// <summary>
83100
/// Kernel uses Float64 (<c>double</c>). True is compatible with every backend - WebGPU
84101
/// and WebGL run Float64 through Dekker emulation (see <c>CLAUDE.md</c>). Use
@@ -233,6 +250,8 @@ public static bool Satisfies(this Device device, AcceleratorRequirements require
233250
if (requirements.RequiresFloat16 && !HasFloat16(device)) return false;
234251
if (requirements.RequiresFloat16Native && !HasFloat16Native(device)) return false;
235252
if (requirements.RequiresBFloat16 && !HasBFloat16(device)) return false;
253+
if (requirements.RequiresFloat8E4M3 && !HasFloat8(device)) return false;
254+
if (requirements.RequiresFloat8E5M2 && !HasFloat8(device)) return false;
236255
if (requirements.RequiresFloat64 && !HasFloat64(device)) return false;
237256
if (requirements.RequiresFloat64Native && !HasFloat64Native(device)) return false;
238257
if (requirements.RequiresFloat64Strict && !HasFloat64Strict(device)) return false;
@@ -280,10 +299,14 @@ public static bool Satisfies(this Device device, AcceleratorRequirements require
280299
private static bool HasFloat16(Device device)
281300
=> device.Capabilities?.Float16 ?? true; // every shipped backend supports Float16 via emulation
282301

283-
// Every backend supports BFloat16 (always emulated; native cvt only at the load/store
284-
// boundary on CUDA). No native-vs-emulated split, so this is always true.
302+
// Every backend supports BFloat16 (always emulated; portable bit-manip conversion on every
303+
// backend incl. CUDA). No native-vs-emulated split, so this is always true.
285304
private static bool HasBFloat16(Device device) => true;
286305

306+
// Every backend supports FP8 (Float8E4M3 + Float8E5M2) - always emulated (1-byte storage,
307+
// f32-register compute, portable conversion on every backend incl. CUDA). Always true.
308+
private static bool HasFloat8(Device device) => true;
309+
287310
private static bool HasFloat16Native(Device device)
288311
{
289312
// Native-or-nothing: CUDA (SM_53+), OpenCL w/ cl_khr_fp16. WebGPU with shader-f16
@@ -363,6 +386,8 @@ public static string Describe(this AcceleratorRequirements r)
363386
if (r.RequiresFloat16) flags.Add("Float16");
364387
if (r.RequiresFloat16Native) flags.Add("Float16Native");
365388
if (r.RequiresBFloat16) flags.Add("BFloat16");
389+
if (r.RequiresFloat8E4M3) flags.Add("Float8E4M3");
390+
if (r.RequiresFloat8E5M2) flags.Add("Float8E5M2");
366391
if (r.RequiresFloat64) flags.Add("Float64");
367392
if (r.RequiresFloat64Native) flags.Add("Float64Native");
368393
if (r.RequiresFloat64Strict) flags.Add("Float64Strict");

0 commit comments

Comments
 (0)