LostBeard
diff --git a/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎ILGPU.Algorithms/ILGPU.Algorithms.csproj‎
Lines changed: 1 addition & 1 deletion b/‎ILGPU.Algorithms/ILGPU.Algorithms.csproj‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎ILGPU.Algorithms/RadixSortExtensions.Float4E2M1.cs‎
Lines changed: 183 additions & 0 deletions b/‎ILGPU.Algorithms/RadixSortExtensions.Float4E2M1.cs‎
Lines changed: 183 additions & 0 deletions
diff --git a/‎ILGPU.Algorithms/RadixSortExtensions.cs‎
Lines changed: 29 additions & 0 deletions b/‎ILGPU.Algorithms/RadixSortExtensions.cs‎
Lines changed: 29 additions & 0 deletions
@@ -2,6 +2,15 @@
 
 This file tracks notable changes per release. The README's "Recent Highlights" section links here for the full version history.
 
+## 4.14.0-local.6 (2026-06-17) - `Float4E2M1` radix-sort keys on all 6 backends + a latent PTX struct-field IO bug fix
+
+Closes the FP4 follow-up from local.5: `Float4E2M1` arrays can now be radix-sorted (keys-only + pairs, ascending + descending) on CPU/CUDA/OpenCL/WebGPU/WebGL/Wasm. Forks bump to `2.0.33`.
+
+- **`Ascending/DescendingFloat4E2M1` radix operations.** The same sign-flip + ones-complement key transform Half/bf16/FP8 use, adapted to the 4-bit E2M1 layout (sign at **bit 3**, not the top bit; magnitude in the low 3 bits). FP4 is stored as a 1-byte element (value in the low nibble), so it sorts as a native 1-byte key (NumBits=8, the key is 0..15 so it stays monotonic) on 5 backends; WebGL uses the unpacked-f32 working representation (the whole-texel scatter can't move a 1-byte sub-word), like Half/bf16/FP8.
+- **Per-backend `FloatAsInt(Float4E2M1)` radix codegen** completed: the PTX + Wasm `FloatAsIntCast`/`IntAsFloatCast` for FP4 (the convert release wired OpenCL/WGSL/GLSL; PTX/Wasm got it here) recover the 4-bit pattern via the portable bit-manip helpers.
+- **Fixed a real latent PTX struct-field IO bug (Rule 2a).** The PTX `EmitIOLoad`/`EmitIOStore` (the path the `RadixSortPairs` kernel uses to bundle the 1-byte key with the value) handled bf16 + FP8 but not FP4, so an FP4 key field was stored as the f32 register's raw low byte (= 0 for most values) → CUDA FP4 pairs returned **all-zero keys**. Added FP4 to both (round f32 → the 4-bit pattern via `EmitF32ToFP4Bits` on store, widen via `EmitFP4BitsToF32` on load). Root-caused with a desktop repro printing the actual CUDA sorted output (keys-only worked; only the key-bundle pairs path was wrong).
+- Gates: PMT `Fp4Radix` (ExtractBits GPU-vs-CPU + KeysDescending + PairsAscending) **23/0 all 6 backends**; no regression (`Fp8Radix` 44/0, `Float4E2M1` convert 23/0).
+
 ## 4.14.0-local.5 (2026-06-17) - New 4-bit float type `Float4E2M1` (NVFP4/MXFP4 element format) on all 6 backends + a latent low-precision store-widening bug fix
 
 Adds `ILGPU.Float4E2M1`, the OCP **E2M1FN** 4-bit float (the element format of NVFP4 / MXFP4): 1 sign / 2 exp / 1 mantissa, bias 1, **16 finite codes (no Inf, no NaN)**, magnitudes `{0,.5,1,1.5,2,3,4,6}`, max 6, finite overflow + ±Inf saturate to ±6, NaN→-0. 1-byte storage (value in the low nibble), f32-register compute. Forks bump to `2.0.32`. Bit-exact to `ml_dtypes.float4_e2m1fn` (PyTorch/JAX share it).
 
@@ -12,7 +12,7 @@
          SpawnDev.ILGPU.Fork* PackageReference Versions inside SpawnDev.ILGPU.csproj.
          Run `_check-fork-version-sync.bat` at repo root. See the banner comment in
          SpawnDev.ILGPU.csproj for the full procedure. -->
-    <Version>2.0.32</Version>
+    <Version>2.0.33</Version>
     <IsPackable>true</IsPackable>
     <GeneratePackageOnBuild>true</GeneratePackageOnBuild>
   </PropertyGroup>
 
@@ -0,0 +1,183 @@
+// ---------------------------------------------------------------------------------------
+//                                   ILGPU Algorithms
+//                        Copyright (c) 2019-2023 ILGPU Project
+//                                    www.ilgpu.net
+//
+// File: RadixSortExtensions.Float4E2M1.cs
+//
+// This file is part of ILGPU and is distributed under the University of Illinois Open
+// Source License. See LICENSE.txt for details.
+// ---------------------------------------------------------------------------------------
+
+using ILGPU.Algorithms.RadixSortOperations;
+using ILGPU.Algorithms.ScanReduceOperations;
+using ILGPU.Runtime;
+using System.Runtime.CompilerServices;
+
+namespace ILGPU.Algorithms
+{
+    // WebGL Float4E2M1-key radix sort. FP4 is a 1-byte sub-word key (value in the low
+    // nibble); the WebGL render-to-texture scatter writes WHOLE 32-bit texels and cannot
+    // move a sub-texel value, so - exactly like Half, BFloat16 and FP8 (RadixSortExtensions.cs
+    // / RadixSortExtensions.BFloat16.cs / RadixSortExtensions.Float8E4M3.cs) - FP4 sorts via
+    // an UNPACKED f32 working representation: copy-in widens each Float4E2M1 to f32 (lossless:
+    // each of the 16 finite FP4 codes is exactly representable in f32), the radix bit is
+    // derived by narrowing back to Float4E2M1 and calling the canonical ExtractRadixBits, and
+    // copy-out narrows the sorted f32 back to Float4E2M1 (exact round-trip for any value that
+    // began as a Float4E2M1). Mirrors the FP8 path one-for-one; the per-bit loop runs one pass
+    // per key bit, taking the pass count from the operation's NumBits.
+    // NOTE(review): this comment previously stated NumBits == 4 (low nibble only), while the
+    // release notes describe FP4 as a native 1-byte key with NumBits=8 - confirm which value
+    // the Float4E2M1 radix operations actually report.
+    static partial class RadixSortExtensions
+    {
+        // Copy-in kernel: reads the Float4E2M1 element at this thread's index, widens it
+        // to f32 and writes it to the same index of the dense f32 working buffer.
+        private static void WebGLScatterRadixCopyInFloat4E2M1<TStride>(
+            Index1D index,
+            ArrayView1D<Float4E2M1, TStride> input,
+            ArrayView1D<float, Stride1D.Dense> output)
+            where TStride : struct, IStride1D
+        {
+            var i = index.X;
+            float widened = (float)input[i];
+            output[i] = widened;
+        }
+
+        // Copy-out kernel: reads the f32 working value at this thread's index, narrows it
+        // back to Float4E2M1 and writes it to the same index of the caller's view.
+        private static void WebGLScatterRadixCopyOutFloat4E2M1<TStride>(
+            Index1D index,
+            ArrayView1D<float, Stride1D.Dense> input,
+            ArrayView1D<Float4E2M1, TStride> output)
+            where TStride : struct, IStride1D
+        {
+            var i = index.X;
+            Float4E2M1 narrowed = (Float4E2M1)input[i];
+            output[i] = narrowed;
+        }
+
+        // Flag kernel: narrows the f32 working key at this thread's index back to
+        // Float4E2M1, asks the sort operation for its radix bits at position `bit`
+        // (last argument 1 - presumably a single-bit mask; confirm against
+        // IRadixSortOperation.ExtractRadixBits) and stores the result in `flags`.
+        private static void WebGLScatterRadixExtractBitFloat4E2M1<TRadixSortOperation>(
+            Index1D index,
+            ArrayView1D<float, Stride1D.Dense> keys,
+            ArrayView1D<int, Stride1D.Dense> flags,
+            int bit)
+            where TRadixSortOperation : struct, IRadixSortOperation<Float4E2M1>
+        {
+            var i = index.X;
+            var narrowedKey = (Float4E2M1)keys[i];
+            var sortOperation = default(TRadixSortOperation);
+            flags[i] = sortOperation.ExtractRadixBits(narrowedKey, bit, 1);
+        }
+
+        // Builds the keys-only Float4E2M1 sort handler for the WebGL scatter path. It is
+        // resolved via reflection (GetMethod + MakeGenericMethod) by
+        // CreateWebGLScatterRadixSortDispatch, because that caller is generic on T and
+        // cannot statically bind the IRadixSortOperation<Float4E2M1> constraint. Each
+        // invocation loads the kernels once and returns a delegate that captures them.
+        private static RadixSort<Float4E2M1, TStride> CreateWebGLScatterRadixSortFloat4E2M1<
+            TStride, TRadixSortOperation>(Accelerator accelerator, IScatterProvider scatter)
+            where TStride : struct, IStride1D
+            where TRadixSortOperation : struct, IRadixSortOperation<Float4E2M1>
+        {
+            // FP4 -> f32 widening into the dense working buffer.
+            var widenKeys = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<Float4E2M1, TStride>, ArrayView1D<float, Stride1D.Dense>>(
+                WebGLScatterRadixCopyInFloat4E2M1<TStride>);
+            // f32 -> FP4 narrowing back into the caller's view.
+            var narrowKeys = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<float, Stride1D.Dense>, ArrayView1D<Float4E2M1, TStride>>(
+                WebGLScatterRadixCopyOutFloat4E2M1<TStride>);
+            // Per-element radix flag for the current pass.
+            var flagKernel = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<float, Stride1D.Dense>, ArrayView1D<int, Stride1D.Dense>, int>(
+                WebGLScatterRadixExtractBitFloat4E2M1<TRadixSortOperation>);
+            // Destination-index kernel shared with the other WebGL scatter sorts.
+            var destinationKernel = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<int, Stride1D.Dense>, ArrayView1D<int, Stride1D.Dense>,
+                ArrayView1D<int, Stride1D.Dense>, int>(WebGLScatterRadixComputeDest);
+            var prefixScan = accelerator.CreateScan<
+                int, Stride1D.Dense, Stride1D.Dense, AddInt32>(ScanKind.Exclusive);
+
+            // The pass count is whatever the operation reports; it is not hard-coded here.
+            int passCount = default(TRadixSortOperation).NumBits;
+
+            return (stream, view, unusedTemp) =>
+            {
+                // The caller-supplied temp view is not used; scratch buffers are allocated
+                // per call below and released when the delegate returns.
+                int count = (int)view.Length;
+                if (count <= 1)
+                    return;
+
+                using var bufferA = accelerator.Allocate1D<float>(count);
+                using var bufferB = accelerator.Allocate1D<float>(count);
+                using var flagBuffer = accelerator.Allocate1D<int>(count);
+                using var prefixBuffer = accelerator.Allocate1D<int>(count);
+                using var destBuffer = accelerator.Allocate1D<int>(count);
+                using var scanScratch = accelerator.Allocate1D<int>(1);
+
+                widenKeys(stream, count, view, bufferA.View);
+
+                // Ping-pong between the two key buffers, one scatter per radix bit.
+                var current = bufferA;
+                var next = bufferB;
+                for (int pass = 0; pass < passCount; ++pass)
+                {
+                    flagKernel(stream, count, current.View, flagBuffer.View, pass);
+                    prefixScan(stream, flagBuffer.View, prefixBuffer.View, scanScratch.View);
+                    destinationKernel(
+                        stream, count, flagBuffer.View, prefixBuffer.View, destBuffer.View, count);
+                    scatter.Scatter(next.View, current.View, destBuffer.View, count, "float");
+                    (current, next) = (next, current);
+                }
+
+                // After the final swap `current` holds the fully sorted keys.
+                narrowKeys(stream, count, current.View, view);
+            };
+        }
+
+
+        // Builds the key/value sort handler for Float4E2M1 keys on the WebGL scatter path.
+        // The dispatch that resolves this method via reflection only selects it for 4- or
+        // 8-byte value types other than Float4E2M1. Keys travel through the unpacked f32
+        // working buffers; values are moved with the scatter type/cpe reported by
+        // WebGLScatterValueType / WebGLScatterCpe for TValue.
+        private static RadixSortPairs<Float4E2M1, TKeyStride, TValue, TValueStride>
+            CreateWebGLScatterRadixSortPairsFloat4E2M1Key<
+                TKeyStride, TValue, TValueStride, TRadixSortOperation>(
+            Accelerator accelerator, IScatterProvider scatter)
+            where TKeyStride : struct, IStride1D
+            where TValue : unmanaged
+            where TValueStride : struct, IStride1D
+            where TRadixSortOperation : struct, IRadixSortOperation<Float4E2M1>
+        {
+            // Key kernels: FP4 <-> f32 conversion around the sort.
+            var widenKeys = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<Float4E2M1, TKeyStride>, ArrayView1D<float, Stride1D.Dense>>(
+                WebGLScatterRadixCopyInFloat4E2M1<TKeyStride>);
+            var narrowKeys = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<float, Stride1D.Dense>, ArrayView1D<Float4E2M1, TKeyStride>>(
+                WebGLScatterRadixCopyOutFloat4E2M1<TKeyStride>);
+            // Value kernels: plain copies between the caller's view and dense buffers.
+            var loadValues = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<TValue, TValueStride>, ArrayView1D<TValue, Stride1D.Dense>>(
+                WebGLScatterRadixCopyIn<TValue, TValueStride>);
+            var storeValues = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<TValue, Stride1D.Dense>, ArrayView1D<TValue, TValueStride>>(
+                WebGLScatterRadixCopyOut<TValue, TValueStride>);
+            var flagKernel = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<float, Stride1D.Dense>, ArrayView1D<int, Stride1D.Dense>, int>(
+                WebGLScatterRadixExtractBitFloat4E2M1<TRadixSortOperation>);
+            var destinationKernel = accelerator.LoadAutoGroupedKernel<
+                Index1D, ArrayView1D<int, Stride1D.Dense>, ArrayView1D<int, Stride1D.Dense>,
+                ArrayView1D<int, Stride1D.Dense>, int>(WebGLScatterRadixComputeDest);
+            var prefixScan = accelerator.CreateScan<
+                int, Stride1D.Dense, Stride1D.Dense, AddInt32>(ScanKind.Exclusive);
+
+            // The pass count is whatever the operation reports; it is not hard-coded here.
+            int passCount = default(TRadixSortOperation).NumBits;
+            // Resolved once per handler and reused for every value scatter.
+            string valueScatterType = WebGLScatterValueType<TValue>();
+            int valueScatterCpe = WebGLScatterCpe<TValue>();
+
+            return (stream, keys, values, unusedTemp) =>
+            {
+                // The element count is taken from the keys only.
+                // NOTE(review): assumes values has at least keys.Length elements - confirm
+                // that callers validate this.
+                int count = (int)keys.Length;
+                if (count <= 1)
+                    return;
+
+                using var keyBufferA = accelerator.Allocate1D<float>(count);
+                using var keyBufferB = accelerator.Allocate1D<float>(count);
+                using var valueBufferA = accelerator.Allocate1D<TValue>(count);
+                using var valueBufferB = accelerator.Allocate1D<TValue>(count);
+                using var flagBuffer = accelerator.Allocate1D<int>(count);
+                using var prefixBuffer = accelerator.Allocate1D<int>(count);
+                using var destBuffer = accelerator.Allocate1D<int>(count);
+                using var scanScratch = accelerator.Allocate1D<int>(1);
+
+                widenKeys(stream, count, keys, keyBufferA.View);
+                loadValues(stream, count, values, valueBufferA.View);
+
+                // Keys and values ping-pong in lockstep so every pass applies the same
+                // destination indices to both.
+                var currentKeys = keyBufferA;
+                var nextKeys = keyBufferB;
+                var currentValues = valueBufferA;
+                var nextValues = valueBufferB;
+                for (int pass = 0; pass < passCount; ++pass)
+                {
+                    flagKernel(stream, count, currentKeys.View, flagBuffer.View, pass);
+                    prefixScan(stream, flagBuffer.View, prefixBuffer.View, scanScratch.View);
+                    destinationKernel(
+                        stream, count, flagBuffer.View, prefixBuffer.View, destBuffer.View, count);
+                    scatter.Scatter(
+                        nextKeys.View, currentKeys.View, destBuffer.View, count, "float", 1);
+                    scatter.Scatter(
+                        nextValues.View, currentValues.View, destBuffer.View, count,
+                        valueScatterType, valueScatterCpe);
+                    (currentKeys, nextKeys) = (nextKeys, currentKeys);
+                    (currentValues, nextValues) = (nextValues, currentValues);
+                }
+
+                // After the final swap the `current*` buffers hold the sorted pairs.
+                narrowKeys(stream, count, currentKeys.View, keys);
+                storeValues(stream, count, currentValues.View, values);
+            };
+        }
+    }
+}
@@ -1210,6 +1210,23 @@ accelerator is IScatterProvider scatterProviderE5M2Key &&
                 return (RadixSortPairs<TKey, TKeyStride, TValue, TValueStride>)handler;
             }
 
+            // Float4E2M1 KEY (+ any 4/8-byte non-FP4 value): same sub-word unpacked-f32 path.
+            if (accelerator.AcceleratorType == AcceleratorType.WebGL &&
+                accelerator is IScatterProvider scatterProviderE2M1Key &&
+                typeof(TKey) == typeof(Float4E2M1) &&
+                (Interop.SizeOf<TValue>() == 4 || Interop.SizeOf<TValue>() == 8) &&
+                typeof(TValue) != typeof(Float4E2M1))
+            {
+                var handler = typeof(RadixSortExtensions)
+                    .GetMethod(nameof(CreateWebGLScatterRadixSortPairsFloat4E2M1Key),
+                        BindingFlags.NonPublic | BindingFlags.Static)!
+                    .MakeGenericMethod(
+                        typeof(TKeyStride), typeof(TValue), typeof(TValueStride),
+                        typeof(TRadixSortOperation))
+                    .Invoke(null, new object[] { accelerator, scatterProviderE2M1Key })!;
+                return (RadixSortPairs<TKey, TKeyStride, TValue, TValueStride>)handler;
+            }
+
             if (accelerator.AcceleratorType == AcceleratorType.WebGL &&
                 accelerator is IScatterProvider scatterProviderPairs &&
                 (Interop.SizeOf<TKey>() == 4 || Interop.SizeOf<TKey>() == 8) &&
@@ -1564,6 +1581,18 @@ private static RadixSort<T, TStride> CreateWebGLScatterRadixSortDispatch<
                     .Invoke(null, new object[] { accelerator, scatterProvider })!;
                 return (RadixSort<T, TStride>)handler;
             }
+            // FP4 (E2M1FN) is a 1-byte sub-word key (value in the low nibble) - same
+            // unpacked-f32 working representation as Half/bf16/FP8 (each of the 16
+            // finite FP4 codes is exactly representable in f32).
+            if (typeof(T) == typeof(Float4E2M1))
+            {
+                var handler = typeof(RadixSortExtensions)
+                    .GetMethod(nameof(CreateWebGLScatterRadixSortFloat4E2M1),
+                        BindingFlags.NonPublic | BindingFlags.Static)!
+                    .MakeGenericMethod(typeof(TStride), typeof(TRadixSortOperation))
+                    .Invoke(null, new object[] { accelerator, scatterProvider })!;
+                return (RadixSort<T, TStride>)handler;
+            }
             return CreateWebGLScatterRadixSort<T, TStride, TRadixSortOperation>(
                 accelerator, scatterProvider);
         }