Skip to content

Commit 36794ad

Browse files
committed
fix(format): correct fp16 subnormal decode bias (113-shifts, not 112-exp)
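The fp16 subnormal decoder computed the f32 exponent incorrectly. When decoding subnormal values (fp16 exponent field = 0), the biased exponent was computed as (112 - exp), where exp is a u32 that starts at 0 and is decremented once per normalization shift, so it wraps around and the expression evaluates to 112 + shifts instead of 113 - shifts. The result is too large by a factor of 2^(2*shifts - 1), e.g. 131072x (2^17) for fp16(0x0002), which needs 9 shifts. Fix: use (113 - shifts), where shifts counts the normalization steps. Before: fp16(0x0002) = 1.5625e-2 (wrong). After: fp16(0x0002) = 1.1921e-7 (correct: 2/1024 * 2^(-14)). Closes #63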
1 parent ae8ffa2 commit 36794ad

1 file changed

Lines changed: 41 additions & 3 deletions

File tree

src/formats/formats_root.zig

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -152,11 +152,12 @@ fn fp16ToF32(x: u16) f32 {
152152
if (e == 0) {
153153
if (m == 0) return @bitCast(sign);
154154
var mant = @as(u32, m) << 13;
155-
var exp: u32 = 0;
156-
while ((mant & 0x00800000) == 0) : (exp -= 1) {
155+
var shifts: u32 = 0;
156+
while ((mant & 0x00800000) == 0) : (shifts += 1) {
157157
mant <<= 1;
158158
}
159-
const f32_bits = sign | ((112 - exp) << 23) | (mant & 0x7FFFFF);
159+
const biased_exp: u32 = 113 - shifts;
160+
const f32_bits = sign | (biased_exp << 23) | (mant & 0x7FFFFF);
160161
return @bitCast(f32_bits);
161162
}
162163
if (e == 0x1F) {
@@ -647,3 +648,40 @@ test "BF16: quantizeValue roundtrip all formats" {
647648
try std.testing.expect(@abs(bf16_round - test_val) / test_val < 0.05);
648649
try std.testing.expect(@abs(fp16_round - test_val) / test_val < 0.05);
649650
}
651+
652+
test "FP16: subnormal decode mantissa=1" {
653+
const bits: u16 = 0x0001;
654+
const val = fp16ToF32(bits);
655+
const expected: f32 = 5.960464e-8;
656+
try std.testing.expectApproxEqAbs(expected, val, 1e-14);
657+
}
658+
659+
test "FP16: subnormal decode mantissa=2" {
660+
const bits: u16 = 0x0002;
661+
const val = fp16ToF32(bits);
662+
const expected: f32 = 1.192093e-7;
663+
try std.testing.expectApproxEqAbs(expected, val, 1e-14);
664+
}
665+
666+
test "FP16: subnormal decode mantissa=1023 (max)" {
667+
const bits: u16 = 0x03FF;
668+
const val = fp16ToF32(bits);
669+
try std.testing.expect(val > 0.0);
670+
try std.testing.expect(val < 6.1e-5);
671+
}
672+
673+
test "FP16: quantizeValue small values preserve sign" {
674+
const pos = quantizeValue(0.003, .fp16);
675+
const neg = quantizeValue(-0.003, .fp16);
676+
try std.testing.expect(pos > 0.0);
677+
try std.testing.expect(neg < 0.0);
678+
}
679+
680+
test "FP16: subnormal roundtrip accuracy" {
681+
const vals = [_]f32{ 1e-5, 5e-5, 1e-4, 5e-4 };
682+
for (vals) |v| {
683+
const q = quantizeValue(v, .fp16);
684+
const rel_err = @abs(q - v) / v;
685+
try std.testing.expect(rel_err < 0.1);
686+
}
687+
}

0 commit comments

Comments
 (0)