FP8 capability flags: AcceleratorRequirements.RequiresFloat8E4M3 / RequiresFloat8E5M2

LostBeard · claude · LostBeard · commit 2190a6bd41c9 · 2026-06-16T20:04:33.000-04:00
Completes the FP8 story in the selection gate. The two flags mirror the existing always-true
RequiresBFloat16: FP8 (like bf16/Half) is supported on every backend (1-byte storage, f32-register
compute, portable conversion incl. CUDA), so they're no-op documentation filters that never rule out
a backend - a consumer declares "my kernel uses FP8" for intent/symmetry, and the selection path
stays consistent with the rest of the matrix. No fork Capabilities.xml change needed (there's no
native-vs-emulated split to expose, same as bf16). Also corrected the RequiresBFloat16 doc comment
that still claimed native sm_80 cvt (it's portable bit-manip as of 4.13.0).

Gate: AcceleratorRequirementsTests.Satisfies_LowPrecisionFloats_AllDevicesPass
(every device satisfies Half+bf16+E4M3+E5M2 combined); PMT_FILTER=AcceleratorRequirements 19/0/1.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/SpawnDev.ILGPU.DemoConsole/AcceleratorRequirementsTests.cs b/SpawnDev.ILGPU.DemoConsole/AcceleratorRequirementsTests.cs
@@ -55,6 +55,29 @@ public Task Satisfies_Atomics_PassesOnCpu()
         return Task.CompletedTask;
     }
 
+    [TestMethod]
+    public Task Satisfies_LowPrecisionFloats_AllDevicesPass()
+    {
+        using var context = Context.CreateDefault();
+        // Half, bf16 and both FP8 formats are expected on every backend (bf16/FP8 unconditionally,
+        // Half via the device's Float16 capability), so the combined request must not filter any device.
+        var req = new AcceleratorRequirements
+        {
+            RequiresFloat16 = true,
+            RequiresBFloat16 = true,
+            RequiresFloat8E4M3 = true,
+            RequiresFloat8E5M2 = true,
+        };
+        foreach (var device in context.Devices)
+        {
+            if (!device.Satisfies(req))
+                throw new Exception(
+                    $"Device {device.AcceleratorType} (name={device.Name}) failed the low-precision-float " +
+                    $"requirements - Half/bf16/FP8 are supported on every backend and must never filter.");
+        }
+        return Task.CompletedTask;
+    }
+
     [TestMethod]
     public Task CreatePreferredAccelerator_NoRequirements_Returns()
     {
diff --git a/SpawnDev.ILGPU/AcceleratorRequirements.cs b/SpawnDev.ILGPU/AcceleratorRequirements.cs
@@ -72,13 +72,30 @@ public sealed class AcceleratorRequirements
 
     /// <summary>
     /// Kernel uses BFloat16 (<c>ILGPU.BFloat16</c>, "brain float"). Every backend supports it -
-    /// always emulated (top-16-bits-of-fp32 + round-to-nearest-even), with native <c>cvt.*.bf16</c>
-    /// on CUDA sm_80+ used only at the load/store boundary. There is no native-vs-emulated split
-    /// (no hardware has native bf16 arithmetic), so this is a no-op documentation filter like
+    /// always emulated (top-16-bits-of-fp32 + round-to-nearest-even). The conversion is portable
+    /// bit-manipulation on every backend including CUDA (4.13.0+ uses basic integer ops, NOT the
+    /// sm_80+ native <c>cvt.*.bf16</c>, so bf16 runs on pre-Ampere cards too). There is no
+    /// native-vs-emulated split, so this is a no-op documentation filter like
     /// <see cref="RequiresFloat16"/> - it never rules out a backend.
     /// </summary>
     public bool RequiresBFloat16 { get; init; }
 
+    /// <summary>
+    /// Kernel uses the 8-bit float <c>ILGPU.Float8E4M3</c> (E4M3FN: 1/4/3, bias 7, no Inf, saturates
+    /// to +-448 - the FP8 forward/inference format). Every backend supports it - always emulated
+    /// (1-byte storage, f32-register compute, portable conversion on every backend incl. CUDA). Like
+    /// <see cref="RequiresBFloat16"/> this is a no-op documentation filter - it never rules out a backend.
+    /// </summary>
+    public bool RequiresFloat8E4M3 { get; init; }
+
+    /// <summary>
+    /// Kernel uses the 8-bit float <c>ILGPU.Float8E5M2</c> (1/5/2, bias 15, IEEE Inf/NaN - the FP8
+    /// backward/gradient format). Every backend supports it - always emulated (1-byte storage,
+    /// f32-register compute, portable conversion on every backend incl. CUDA). Like
+    /// <see cref="RequiresBFloat16"/> this is a no-op documentation filter - it never rules out a backend.
+    /// </summary>
+    public bool RequiresFloat8E5M2 { get; init; }
+
     /// <summary>
     /// Kernel uses Float64 (<c>double</c>). True is compatible with every backend - WebGPU
     /// and WebGL run Float64 through Dekker emulation (see <c>CLAUDE.md</c>). Use
@@ -233,6 +250,8 @@ public static bool Satisfies(this Device device, AcceleratorRequirements require
         if (requirements.RequiresFloat16 && !HasFloat16(device)) return false;
         if (requirements.RequiresFloat16Native && !HasFloat16Native(device)) return false;
         if (requirements.RequiresBFloat16 && !HasBFloat16(device)) return false;
+        if (requirements.RequiresFloat8E4M3 && !HasFloat8(device)) return false;
+        if (requirements.RequiresFloat8E5M2 && !HasFloat8(device)) return false;
         if (requirements.RequiresFloat64 && !HasFloat64(device)) return false;
         if (requirements.RequiresFloat64Native && !HasFloat64Native(device)) return false;
         if (requirements.RequiresFloat64Strict && !HasFloat64Strict(device)) return false;
@@ -280,10 +299,14 @@ public static bool Satisfies(this Device device, AcceleratorRequirements require
     private static bool HasFloat16(Device device)
         => device.Capabilities?.Float16 ?? true; // every shipped backend supports Float16 via emulation
 
-    // Every backend supports BFloat16 (always emulated; native cvt only at the load/store
-    // boundary on CUDA). No native-vs-emulated split, so this is always true.
+    // Every backend supports BFloat16 (always emulated; portable bit-manip conversion on every
+    // backend incl. CUDA). No native-vs-emulated split, so this is always true.
     private static bool HasBFloat16(Device device) => true;
 
+    // Shared by RequiresFloat8E4M3 and RequiresFloat8E5M2. Every backend supports both FP8 formats -
+    // always emulated (1-byte storage, f32-register compute, portable conversion incl. CUDA). Always true.
+    private static bool HasFloat8(Device device) => true;
+
     private static bool HasFloat16Native(Device device)
     {
         // Native-or-nothing: CUDA (SM_53+), OpenCL w/ cl_khr_fp16. WebGPU with shader-f16
@@ -363,6 +386,8 @@ public static string Describe(this AcceleratorRequirements r)
         if (r.RequiresFloat16) flags.Add("Float16");
         if (r.RequiresFloat16Native) flags.Add("Float16Native");
         if (r.RequiresBFloat16) flags.Add("BFloat16");
+        if (r.RequiresFloat8E4M3) flags.Add("Float8E4M3");
+        if (r.RequiresFloat8E5M2) flags.Add("Float8E5M2");
         if (r.RequiresFloat64) flags.Add("Float64");
         if (r.RequiresFloat64Native) flags.Add("Float64Native");
         if (r.RequiresFloat64Strict) flags.Add("Float64Strict");