FP8 Phase 3 (PTX/CUDA) COMPLETE: portable bit-manip - FP8 now on ALL 6 BACKENDS

LostBeard · claude · LostBeard · commit 1dd4d5b98856 · 2026-06-16T18:45:03.000-04:00
Wires the portable FP8<->f32 PTX conversion (EmitFP8BitsToF32 / EmitF32ToFP8Bits, committed unwired in 3bd846d) into the CUDA load/store/convert/scalar-param paths. FP8 has no native PTX cvt on the cards we target, so - exactly like the bf16 pre-Ampere fix - it uses only basic integer ops (branchless setp/selp, unrolled subnormal-normalize), working on EVERY CUDA arch incl. the 1080 (sm_61) / 2060 (sm_75). - ConvertValue: FP8<->f32 (and FP8<->FP8) is a register no-op (f32-register model, like bf16) - this is what makes PrecisionConvert.ConvertToSingle/ConvertFromSingle<FP8> lower to nothing on PTX, and closes the earlier "Float32 -> Float8E4M3 does not have an intrinsic implementation" error on CUDA. - Load: ArrayView<FP8> -> ld.u8 -> EmitFP8BitsToF32. Store: EmitF32ToFP8Bits -> st.u8 (keyed off the target buffer element type, like bf16). - FP8 scalar param: .b8 param declaration (AppendParamDeclaration) + ld.param.u8 + widen (BindParameters) - the f32-register model, so the host's 1-byte pack lines up (the value was previously arriving as 0). VERIFIED on the 4070 (`DemoConsole -- fp8-verify`): CPU + OpenCL + CUDA all E4M3 257/257 + E5M2 257/257 (the relu(x*scale+bias) generic kernel through the FP8 load/store/arith/scalar/convert paths). Basic-ops-only => 4070-correct implies 1080/2060-correct (universal instruction set). Test skip removed - FP8 now runs on all 6 backends; full PMT round-trip next. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
diff --git a/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs b/ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs
@@ -241,6 +241,25 @@ public void GenerateCode(ConvertValue value)
                 }
             }
 
+            // FP8 uses the SAME f32-register model: the FP8 value lives as f32 in-register and is
+            // rounded to the 1-byte FP8 grid only at the store boundary (EmitF32ToFP8Bits). So an
+            // FP8<->f32 (or FP8<->FP8) ConvertValue is a register no-op here - this is what makes
+            // PrecisionConvert.ConvertToSingle/ConvertFromSingle<FP8> lower to nothing on PTX.
+            bool srcFp8 = sourceType == ArithmeticBasicValueType.Float8E4M3
+                || sourceType == ArithmeticBasicValueType.Float8E5M2;
+            bool dstFp8 = targetType == ArithmeticBasicValueType.Float8E4M3
+                || targetType == ArithmeticBasicValueType.Float8E5M2;
+            if (srcFp8 || dstFp8)
+            {
+                if (srcFp8) sourceType = ArithmeticBasicValueType.Float32;
+                if (dstFp8) targetType = ArithmeticBasicValueType.Float32;
+                if (sourceType == targetType)
+                {
+                    Alias(value, value.Value);
+                    return;
+                }
+            }
+
             var sourceValue = LoadPrimitive(value.Value);
 
             var convertOperation = PTXInstructions.GetConvertOperation(
@@ -938,6 +957,27 @@ public void GenerateCode(Load load)
                 return;
             }
 
+            if (load.Type.BasicValueType == BasicValueType.Float8E4M3 ||
+                load.Type.BasicValueType == BasicValueType.Float8E5M2)
+            {
+                // FP8 storage is a packed 1-byte value; load it into a temp .b16 register, then
+                // widen to the f32 value register via portable bit-manip (every CUDA arch - FP8
+                // has no native PTX cvt on the cards we target). f32-register model like bf16.
+                bool isE4M3 = load.Type.BasicValueType == BasicValueType.Float8E4M3;
+                var fp8Target = AllocateHardware(load);
+                var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
+                using (var cmd = BeginCommand(PTXInstructions.LoadOperation))
+                {
+                    cmd.AppendAddressSpace(sourceType.AddressSpace);
+                    cmd.AppendSuffix("u8");
+                    cmd.AppendArgument(rawReg);
+                    cmd.AppendArgumentValue(address, 0);
+                }
+                EmitFP8BitsToF32(rawReg, fp8Target, isE4M3);
+                FreeRegister(rawReg);
+                return;
+            }
+
             var targetRegister = Allocate(load);
 
             EmitVectorizedCommand(
@@ -1072,6 +1112,27 @@ public void GenerateCode(Store store)
                 return;
             }
 
+            if (targetType.ElementType.BasicValueType == BasicValueType.Float8E4M3 ||
+                targetType.ElementType.BasicValueType == BasicValueType.Float8E5M2)
+            {
+                // FP8 store: round the f32 value register to the 1-byte FP8 pattern via portable
+                // bit-manip (EmitF32ToFP8Bits - every CUDA arch) into a temp .b16 register, then
+                // write the low byte. Keyed off the TARGET BUFFER element type (same reason as bf16).
+                bool isE4M3 = targetType.ElementType.BasicValueType == BasicValueType.Float8E4M3;
+                var valueReg = EnsureHardwareRegister(value.AsNotNullCast<PrimitiveRegister>());
+                var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
+                EmitF32ToFP8Bits(valueReg, rawReg, isE4M3);
+                using (var cmd = BeginCommand(PTXInstructions.StoreOperation))
+                {
+                    cmd.AppendAddressSpace(targetType.AddressSpace);
+                    cmd.AppendSuffix("u8");
+                    cmd.AppendArgumentValue(address, 0);
+                    cmd.AppendArgument(rawReg);
+                }
+                FreeRegister(rawReg);
+                return;
+            }
+
             // A bf16-TYPED value stored to a NON-bf16 buffer (the target-bf16 case was handled above).
             // bf16 is held in an f32 register and the `(float)bf16` widening Convert is a no-op alias
             // that preserves the bf16 IR type, so `floatBuf[i] = (float)bf16Buf[i]` reaches here with a
diff --git a/ILGPU/Backends/PTX/PTXCodeGenerator.cs b/ILGPU/Backends/PTX/PTXCodeGenerator.cs
@@ -1014,6 +1014,25 @@ internal void BindParameters(List<MappedParameter> parameters)
                     continue;
                 }
 
+                // FP8 scalar param: declared at 1-byte .b8 storage (AppendParamDeclaration). Load the
+                // raw byte (.u8) into a temp register, then widen to the f32 VALUE register via
+                // EmitFP8BitsToF32 - the same storage->compute conversion an FP8 buffer load uses.
+                var fp8ParamBvt = mappedParameter.Parameter.Type.BasicValueType;
+                if ((fp8ParamBvt == BasicValueType.Float8E4M3 || fp8ParamBvt == BasicValueType.Float8E5M2) &&
+                    mappedParameter.Register is HardwareRegister fp8ValueRegister)
+                {
+                    var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
+                    using (var cmd = BeginCommand(PTXInstructions.LoadParamOperation))
+                    {
+                        cmd.AppendSuffix("u8");
+                        cmd.AppendArgument(rawReg);
+                        cmd.AppendRawValue(mappedParameter.PTXName, 0);
+                    }
+                    EmitFP8BitsToF32(rawReg, fp8ValueRegister, fp8ParamBvt == BasicValueType.Float8E4M3);
+                    FreeRegister(rawReg);
+                    continue;
+                }
+
                 EmitLoadParam(
                     mappedParameter.PTXName,
                     mappedParameter.Register,
@@ -1107,6 +1126,16 @@ protected void AppendParamDeclaration(
                     targetBuilder.Append("b16 ");
                     targetBuilder.Append(paramName);
                     break;
+                case PrimitiveType primFp8
+                    when primFp8.BasicValueType == BasicValueType.Float8E4M3
+                        || primFp8.BasicValueType == BasicValueType.Float8E5M2:
+                    // FP8 computes in an f32 register but its STORAGE (host-packed scalar arg / buffer
+                    // element) is the 1-byte FP8 pattern. Declare the param at 1-byte .b8 storage so the
+                    // host's 1-byte pack lines up; BindParameters loads it (ld.param.u8) and widens to f32 via
+                    // EmitFP8BitsToF32. Same fix as bf16's .b16 (declaring .f32 made the 1 byte arrive 0).
+                    targetBuilder.Append("b8 ");
+                    targetBuilder.Append(paramName);
+                    break;
                 case PrimitiveType _:
                 case StringType _:
                 case PointerType _:
diff --git a/SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.GenericPrecision.cs b/SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.GenericPrecision.cs
@@ -79,16 +79,10 @@ public async Task PrecisionConvert_Float8E5M2_RoundTripBitExact() =>
 
         private async Task RunFP8RoundTrip<T>(Func<float, T> toT, Func<T, float> toF)
             where T : unmanaged, INumber<T>
+            // FP8 (Float8E4M3 + Float8E5M2) codegen is wired on ALL 6 backends
+            // (CPU, OpenCL, WebGPU, WebGL, Wasm, CUDA) - no skip needed.
             => await RunTest(async accelerator =>
-        {
-            // FP8 codegen wired on CPU/OpenCL/WebGPU/WebGL/Wasm so far; skip PTX (CUDA) until done.
-            var at = accelerator.AcceleratorType;
-            if (at != AcceleratorType.CPU && at != AcceleratorType.OpenCL &&
-                at != AcceleratorType.WebGPU && at != AcceleratorType.WebGL &&
-                at != AcceleratorType.Wasm)
-                return;
-            await RunPrecisionRoundTripCore<T>(accelerator, toT, toF);
-        });
+                await RunPrecisionRoundTripCore<T>(accelerator, toT, toF));
 
         private async Task RunPrecisionRoundTrip<T>(Func<float, T> toT, Func<T, float> toF)
             where T : unmanaged, INumber<T>