@@ -1713,6 +1713,22 @@ struct block_mxfp4
17131713#define A_TYPE block_mxfp4
17141714#endif
17151715
// NVFP4 block geometry: each block holds QUANT_K_NVFP4 quantized elements.
// NOTE(review): QUANT_R_NVFP4 is consumed outside this excerpt; its meaning is not visible here.
#define QUANT_K_NVFP4 64
#define QUANT_R_NVFP4 1

// Storage layout of one NVFP4 block: 4 bytes of `d` followed by 32 bytes of `qs`
// when QUANT_K_NVFP4 is 64.
struct block_nvfp4 {
    // One byte per 16 elements; presumably the UE4M3 scale code of each
    // 16-element group -- confirm against the dequantization code.
    uint8_t d[QUANT_K_NVFP4 / 16];

    // Half a byte per element; presumably two 4-bit codes packed per byte -- confirm.
    uint8_t qs[QUANT_K_NVFP4 / 2];
};
1724+
// When DATA_A_NVFP4 is defined, bind the generic A_TYPE / QUANT_* macros to the NVFP4 block layout.
#ifdef DATA_A_NVFP4
#define A_TYPE     block_nvfp4
#define QUANT_K    QUANT_K_NVFP4
#define QUANT_R    QUANT_R_NVFP4
// NOTE(review): QUANT_AUXF is consumed outside this excerpt; its meaning is not visible here.
#define QUANT_AUXF 1
#endif
1731+
17161732#if defined(DATA_A_IQ4_NL) || defined(DATA_A_IQ4_XS)
17171733const int8_t kvalues_iq4nl_const[16 ] = {
17181734 int8_t(- 127 ), int8_t(- 104 ), int8_t(- 83 ), int8_t(- 65 ), int8_t(- 49 ), int8_t(- 35 ), int8_t(- 22 ), int8_t(- 10 ),
@@ -1732,21 +1748,44 @@ void init_iq_shmem(uvec3 wgsize)
17321748}
17331749#endif
17341750
1735- #if defined(DATA_A_MXFP4)
1751+ #if defined(DATA_A_MXFP4) || defined(DATA_A_NVFP4)
// 16-entry value table, compiled for both MXFP4 and NVFP4 (presumably indexed by the
// 4-bit quantized code -- confirm against the dequantization code).
// Entries 0..7 are the non-negative values; entries 8..15 repeat them negated
// (entry 8 is therefore 0 again).
const int8_t kvalues_mxfp4_const[16] = {
    int8_t(0), int8_t(1),  int8_t(2),  int8_t(3),
    int8_t(4), int8_t(6),  int8_t(8),  int8_t(12),
    int8_t(0), int8_t(-1), int8_t(-2), int8_t(-3),
    int8_t(-4), int8_t(-6), int8_t(-8), int8_t(-12),
};

// Shared-memory copy of the table above; filled by init_iq_shmem().
shared int8_t kvalues_mxfp4[16];
17421758
#if defined(DATA_A_NVFP4)
// UE4M3 scales in NVFP4 blocks use only 7 bits (the sign bit, bit 7, is always zero),
// so a 128-entry table covers every code. It is filled by init_iq_shmem().
shared float ue4m3_fp32_lut[128];

// Decodes one UE4M3 code (4 exponent bits above 3 mantissa bits) to fp32.
// Used to build ue4m3_fp32_lut.
//   u : the code; bits above bit 6 are ignored by the field extraction below.
// Returns 0.0 for code 0 and for code 127 (all exponent and mantissa bits set).
float ue4m3_to_fp32_build(uint u) {
    const uint mant_bits = u & 7u;
    const uint exp_bits  = (u >> 3) & 15u;

    if (u == 0u || u == 127u) {
        return 0.0;
    }

    if (exp_bits != 0u) {
        // Normal value: re-bias the exponent by +120 into the fp32 exponent field
        // (120 = 127 - 7, i.e. assuming a source exponent bias of 7) and place the
        // 3 mantissa bits at the top of the 23-bit fp32 fraction.
        return uintBitsToFloat(((exp_bits + 120u) << 23) | (mant_bits << 20));
    }

    // Zero exponent field: value is mantissa * 2^-9.
    return float(mant_bits) * (1.0 / 512.0);
}
#endif
1776+
#define NEEDS_INIT_IQ_SHMEM
// Fills the shared lookup tables and synchronizes before returning.
//   wgsize : only .x is read, as the stride between the entries one invocation
//            handles (presumably the workgroup size -- confirm with callers).
void init_iq_shmem(uvec3 wgsize)
{
    const uint first  = gl_LocalInvocationIndex.x;
    const uint stride = wgsize.x;

    // Each invocation copies entries first, first + stride, ... of the constant table.
    for (uint k = first; k < kvalues_mxfp4.length(); k += stride) {
        kvalues_mxfp4[k] = kvalues_mxfp4_const[k];
    }
#if defined(DATA_A_NVFP4)
    // Decode every 7-bit UE4M3 code once so later lookups are a single table read.
    for (uint code = first; code < 128u; code += stride) {
        ue4m3_fp32_lut[code] = ue4m3_to_fp32_build(code);
    }
#endif
    // All invocations must finish writing before any of them reads the tables.
    barrier();
}
17521791#endif
@@ -1783,6 +1822,12 @@ float e8m0_to_fp32(uint8_t x) {
17831822 return uintBitsToFloat(bits);
17841823}
17851824
#if defined(DATA_A_NVFP4)
// Converts a UE4M3 scale byte to fp32 by looking it up in ue4m3_fp32_lut
// (128 entries, filled by init_iq_shmem()).
//   x : the scale byte. Only the low 7 bits select an entry: the table has no
//       slots for codes >= 128, so a byte with bit 7 set (not expected in valid
//       data) is masked instead of indexing past the end of the table.
// Returns the decoded fp32 scale.
float ue4m3_to_fp32(uint8_t x) {
    return ue4m3_fp32_lut[uint(x) & 127u];
}
#endif
1830+
17861831#if BDA
17871832
17881833#extension GL_EXT_buffer_reference : enable
0 commit comments