Skip to content

Commit 1dd4d5b

Browse files
LostBeard and claude committed
FP8 Phase 3 (PTX/CUDA) COMPLETE: portable bit-manip - FP8 now on ALL 6 BACKENDS
Wires the portable FP8<->f32 PTX conversion (EmitFP8BitsToF32 / EmitF32ToFP8Bits, committed unwired in 3bd846d) into the CUDA load/store/convert/scalar-param paths. FP8 has no native PTX cvt on the cards we target, so - exactly like the bf16 pre-Ampere fix - it uses only basic integer ops (branchless setp/selp, unrolled subnormal-normalize), working on EVERY CUDA arch incl. the 1080 (sm_61) / 2060 (sm_75).

- ConvertValue: FP8<->f32 (and FP8<->FP8) is a register no-op (f32-register model, like bf16) - this is what makes PrecisionConvert.ConvertToSingle/ConvertFromSingle<FP8> lower to nothing on PTX, and closes the earlier "Float32 -> Float8E4M3 does not have an intrinsic implementation" on CUDA.
- Load: ArrayView<FP8> -> ld.u8 -> EmitFP8BitsToF32. Store: EmitF32ToFP8Bits -> st.u8 (keyed off the target buffer element type, like bf16).
- FP8 scalar param: .b8 param declaration (AppendParamDeclaration) + ld.param.u8 + widen (BindParameters) - the f32-register model, so the host's 1-byte pack lines up (the value was arriving as 0).

VERIFIED on the 4070 (`DemoConsole -- fp8-verify`): CPU + OpenCL + CUDA all E4M3 257/257 + E5M2 257/257 (the relu(x*scale+bias) generic kernel through the FP8 load/store/arith/scalar/convert paths). Basic-ops-only => 4070-correct implies 1080/2060-correct (universal instruction set). Test skip removed - FP8 now runs on all 6 backends; full PMT round-trip next.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
1 parent 3bd846d commit 1dd4d5b

3 files changed

Lines changed: 93 additions & 9 deletions

File tree

ILGPU/Backends/PTX/PTXCodeGenerator.Values.cs

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,25 @@ public void GenerateCode(ConvertValue value)
241241
}
242242
}
243243

244+
// FP8 uses the SAME f32-register model: the FP8 value lives as f32 in-register and is
245+
// rounded to the 1-byte FP8 grid only at the store boundary (EmitF32ToFP8Bits). So an
246+
// FP8<->f32 (or FP8<->FP8) ConvertValue is a register no-op here - this is what makes
247+
// PrecisionConvert.ConvertToSingle/ConvertFromSingle<FP8> lower to nothing on PTX.
248+
bool srcFp8 = sourceType == ArithmeticBasicValueType.Float8E4M3
249+
|| sourceType == ArithmeticBasicValueType.Float8E5M2;
250+
bool dstFp8 = targetType == ArithmeticBasicValueType.Float8E4M3
251+
|| targetType == ArithmeticBasicValueType.Float8E5M2;
252+
if (srcFp8 || dstFp8)
253+
{
254+
if (srcFp8) sourceType = ArithmeticBasicValueType.Float32;
255+
if (dstFp8) targetType = ArithmeticBasicValueType.Float32;
256+
if (sourceType == targetType)
257+
{
258+
Alias(value, value.Value);
259+
return;
260+
}
261+
}
262+
244263
var sourceValue = LoadPrimitive(value.Value);
245264

246265
var convertOperation = PTXInstructions.GetConvertOperation(
@@ -938,6 +957,27 @@ public void GenerateCode(Load load)
938957
return;
939958
}
940959

960+
if (load.Type.BasicValueType == BasicValueType.Float8E4M3 ||
961+
load.Type.BasicValueType == BasicValueType.Float8E5M2)
962+
{
963+
// FP8 storage is a packed 1-byte value; load it into a temp .b16 register, then
964+
// widen to the f32 value register via portable bit-manip (every CUDA arch - FP8
965+
// has no native PTX cvt on the cards we target). f32-register model like bf16.
966+
bool isE4M3 = load.Type.BasicValueType == BasicValueType.Float8E4M3;
967+
var fp8Target = AllocateHardware(load);
968+
var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
969+
using (var cmd = BeginCommand(PTXInstructions.LoadOperation))
970+
{
971+
cmd.AppendAddressSpace(sourceType.AddressSpace);
972+
cmd.AppendSuffix("u8");
973+
cmd.AppendArgument(rawReg);
974+
cmd.AppendArgumentValue(address, 0);
975+
}
976+
EmitFP8BitsToF32(rawReg, fp8Target, isE4M3);
977+
FreeRegister(rawReg);
978+
return;
979+
}
980+
941981
var targetRegister = Allocate(load);
942982

943983
EmitVectorizedCommand(
@@ -1072,6 +1112,27 @@ public void GenerateCode(Store store)
10721112
return;
10731113
}
10741114

1115+
if (targetType.ElementType.BasicValueType == BasicValueType.Float8E4M3 ||
1116+
targetType.ElementType.BasicValueType == BasicValueType.Float8E5M2)
1117+
{
1118+
// FP8 store: round the f32 value register to the 1-byte FP8 pattern via portable
1119+
// bit-manip (EmitF32ToFP8Bits - every CUDA arch) into a temp .b16 register, then
1120+
// write the low byte. Keyed off the TARGET BUFFER element type (same reason as bf16).
1121+
bool isE4M3 = targetType.ElementType.BasicValueType == BasicValueType.Float8E4M3;
1122+
var valueReg = EnsureHardwareRegister(value.AsNotNullCast<PrimitiveRegister>());
1123+
var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
1124+
EmitF32ToFP8Bits(valueReg, rawReg, isE4M3);
1125+
using (var cmd = BeginCommand(PTXInstructions.StoreOperation))
1126+
{
1127+
cmd.AppendAddressSpace(targetType.AddressSpace);
1128+
cmd.AppendSuffix("u8");
1129+
cmd.AppendArgumentValue(address, 0);
1130+
cmd.AppendArgument(rawReg);
1131+
}
1132+
FreeRegister(rawReg);
1133+
return;
1134+
}
1135+
10751136
// A bf16-TYPED value stored to a NON-bf16 buffer (the target-bf16 case was handled above).
10761137
// bf16 is held in an f32 register and the `(float)bf16` widening Convert is a no-op alias
10771138
// that preserves the bf16 IR type, so `floatBuf[i] = (float)bf16Buf[i]` reaches here with a

ILGPU/Backends/PTX/PTXCodeGenerator.cs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,25 @@ internal void BindParameters(List<MappedParameter> parameters)
10141014
continue;
10151015
}
10161016

1017+
// FP8 scalar param: declared at 1-byte .b8 storage (AppendParamDeclaration). Load the
1018+
// raw byte (.u8) into a temp register, then widen to the f32 VALUE register via
1019+
// EmitFP8BitsToF32 - the same storage->compute conversion an FP8 buffer load uses.
1020+
var fp8ParamBvt = mappedParameter.Parameter.Type.BasicValueType;
1021+
if ((fp8ParamBvt == BasicValueType.Float8E4M3 || fp8ParamBvt == BasicValueType.Float8E5M2) &&
1022+
mappedParameter.Register is HardwareRegister fp8ValueRegister)
1023+
{
1024+
var rawReg = AllocateRegister(BasicValueType.Int16, PTXRegisterKind.Int16);
1025+
using (var cmd = BeginCommand(PTXInstructions.LoadParamOperation))
1026+
{
1027+
cmd.AppendSuffix("u8");
1028+
cmd.AppendArgument(rawReg);
1029+
cmd.AppendRawValue(mappedParameter.PTXName, 0);
1030+
}
1031+
EmitFP8BitsToF32(rawReg, fp8ValueRegister, fp8ParamBvt == BasicValueType.Float8E4M3);
1032+
FreeRegister(rawReg);
1033+
continue;
1034+
}
1035+
10171036
EmitLoadParam(
10181037
mappedParameter.PTXName,
10191038
mappedParameter.Register,
@@ -1107,6 +1126,16 @@ protected void AppendParamDeclaration(
11071126
targetBuilder.Append("b16 ");
11081127
targetBuilder.Append(paramName);
11091128
break;
1129+
case PrimitiveType primFp8
1130+
when primFp8.BasicValueType == BasicValueType.Float8E4M3
1131+
|| primFp8.BasicValueType == BasicValueType.Float8E5M2:
1132+
// FP8 computes in an f32 register but its STORAGE (host-packed scalar arg / buffer
1133+
// element) is the 1-byte FP8 pattern. Declare the param at 1-byte .b8 storage so the
1134+
// host's 1-byte pack lines up; BindParameters loads it (ld.param.u8) and widens to f32 via
1135+
// EmitFP8BitsToF32. Same fix as bf16's .b16 (declaring .f32 made the 1 byte arrive 0).
1136+
targetBuilder.Append("b8 ");
1137+
targetBuilder.Append(paramName);
1138+
break;
11101139
case PrimitiveType _:
11111140
case StringType _:
11121141
case PointerType _:

SpawnDev.ILGPU.Demo.Shared/UnitTests/BackendTestBase.GenericPrecision.cs

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -79,16 +79,10 @@ public async Task PrecisionConvert_Float8E5M2_RoundTripBitExact() =>
7979

8080
private async Task RunFP8RoundTrip<T>(Func<float, T> toT, Func<T, float> toF)
8181
where T : unmanaged, INumber<T>
82+
// FP8 (Float8E4M3 + Float8E5M2) codegen is wired on ALL 6 backends
83+
// (CPU, OpenCL, WebGPU, WebGL, Wasm, CUDA) - no skip needed.
8284
=> await RunTest(async accelerator =>
83-
{
84-
// FP8 codegen wired on CPU/OpenCL/WebGPU/WebGL/Wasm so far; skip PTX (CUDA) until done.
85-
var at = accelerator.AcceleratorType;
86-
if (at != AcceleratorType.CPU && at != AcceleratorType.OpenCL &&
87-
at != AcceleratorType.WebGPU && at != AcceleratorType.WebGL &&
88-
at != AcceleratorType.Wasm)
89-
return;
90-
await RunPrecisionRoundTripCore<T>(accelerator, toT, toF);
91-
});
85+
await RunPrecisionRoundTripCore<T>(accelerator, toT, toF));
9286

9387
private async Task RunPrecisionRoundTrip<T>(Func<float, T> toT, Func<T, float> toF)
9488
where T : unmanaged, INumber<T>

0 commit comments

Comments
 (0)