|
| 1 | +/* |
| 2 | + * Copyright 2026 Diligent Graphics LLC |
| 3 | + * |
| 4 | + * Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 | + * you may not use this file except in compliance with the License. |
| 6 | + * You may obtain a copy of the License at |
| 7 | + * |
| 8 | + * http://www.apache.org/licenses/LICENSE-2.0 |
| 9 | + * |
| 10 | + * Unless required by applicable law or agreed to in writing, software |
| 11 | + * distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | + * See the License for the specific language governing permissions and |
| 14 | + * limitations under the License. |
| 15 | + * |
| 16 | + * In no event and under no legal theory, whether in tort (including negligence), |
| 17 | + * contract, or otherwise, unless required by applicable law (such as deliberate |
| 18 | + * and grossly negligent acts) or agreed to in writing, shall any Contributor be |
| 19 | + * liable for any damages, including any direct, indirect, special, incidental, |
| 20 | + * or consequential damages of any character arising as a result of this License or |
| 21 | + * out of the use or inability to use the software (including but not limited to damages |
| 22 | + * for loss of goodwill, work stoppage, computer failure or malfunction, or any and |
| 23 | + * all other commercial damages or losses), even if such Contributor has been advised |
| 24 | + * of the possibility of such damages. |
| 25 | + */ |
| 26 | + |
| 27 | +#pragma once |
| 28 | + |
| 29 | +#include <cstdint> |
| 30 | +#include <cstring> |
| 31 | +#include <cmath> |
| 32 | +#include <limits> |
| 33 | +#include <type_traits> |
| 34 | + |
| 35 | +namespace Diligent |
| 36 | +{ |
| 37 | + |
/// 16-bit floating-point value stored as a raw IEEE 754 binary16 bit pattern
/// (1 sign bit, 5 exponent bits with bias 15, 10 mantissa bits).
///
/// All conversions are implemented with integer bit manipulation, so they do not
/// depend on hardware half-float support. Conversions *to* half use
/// round-to-nearest-even and are rounded exactly once.
class Float16
{
public:
    /// Constructs positive zero.
    constexpr Float16() noexcept = default;

    /// Wraps an existing binary16 bit pattern without any conversion.
    constexpr explicit Float16(uint16_t Bits) noexcept :
        m_Bits{Bits}
    {}

    /// Converts a 32-bit float to half (round-to-nearest-even, overflow -> Inf).
    explicit Float16(float f) noexcept :
        m_Bits(FloatToHalfBits(f))
    {}

    /// Converts a 64-bit double to half (round-to-nearest-even, overflow -> Inf).
    explicit Float16(double d) noexcept :
        m_Bits(DoubleToHalfBits(d))
    {}

    /// Converts a 32-bit signed integer to half by going through float.
    /// Integers too large for float to hold exactly are far beyond the half
    /// range and become +/-Inf regardless of how the float step rounds them.
    explicit Float16(int32_t i) noexcept :
        m_Bits(FloatToHalfBits(static_cast<float>(i)))
    {}

    /// Widens to float; exact for every half value.
    explicit operator float() const noexcept
    {
        return HalfBitsToFloat(m_Bits);
    }

    /// Widens to double; exact for every half value.
    explicit operator double() const noexcept
    {
        return static_cast<double>(HalfBitsToFloat(m_Bits));
    }

    /// Int32 conversion: truncates toward 0, saturates on overflow, NaN -> 0.
    explicit operator int32_t() const noexcept
    {
        const float f = HalfBitsToFloat(m_Bits);

        if (std::isnan(f)) return 0;
        // The explicit range checks keep the final cast well-defined for +/-Inf.
        if (f >= static_cast<float>(std::numeric_limits<int32_t>::max()))
            return std::numeric_limits<int32_t>::max();
        if (f <= static_cast<float>(std::numeric_limits<int32_t>::min()))
            return std::numeric_limits<int32_t>::min();

        return static_cast<int32_t>(f); // C++ truncates toward 0
    }

    /// Returns true for both +0 and -0 (the sign bit is ignored).
    constexpr bool IsZero() const noexcept { return (m_Bits & 0x7FFFu) == 0; }

    /// Returns true when the sign bit is set (this includes -0 and negative NaNs).
    constexpr bool Sign() const noexcept { return (m_Bits >> 15) != 0; }

    /// Returns the raw binary16 bit pattern.
    constexpr uint16_t Raw() const noexcept { return m_Bits; }


    /// half (binary16) bits -> float. The conversion is exact.
    /// NaN inputs always produce a quiet float NaN; the payload is shifted into
    /// the top mantissa bits.
    static float HalfBitsToFloat(uint16_t h) noexcept
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be a 32-bit type");

        const uint32_t sign = (uint32_t(h) & 0x8000u) << 16;
        const uint32_t exp  = (h >> 10) & 0x1Fu;
        const uint32_t mant = h & 0x03FFu;

        uint32_t fbits = 0;

        if (exp == 0)
        {
            if (mant == 0)
            {
                // +/-0
                fbits = sign;
            }
            else
            {
                // Subnormal half: value = mant * 2^-24. Every such value is a normal
                // float, so shift the mantissa left until its leading 1 reaches the
                // implicit-bit position (bit 10), adjusting the exponent as we go.
                uint32_t m = mant;
                int      e = -14;
                while ((m & 0x0400u) == 0)
                {
                    m <<= 1;
                    --e;
                }
                m &= 0x03FFu; // drop the now-implicit leading 1
                const uint32_t exp_f = uint32_t(e + 127);
                fbits                = sign | (exp_f << 23) | (m << 13);
            }
        }
        else if (exp == 0x1F)
        {
            // Inf/NaN
            fbits = sign | 0x7F800000u | (mant << 13);
            if (mant != 0) fbits |= 0x00400000u; // Make sure it's a quiet NaN in float
        }
        else
        {
            // Normal: re-bias the exponent (15 -> 127) and widen the mantissa
            const uint32_t exp_f = exp + (127 - 15);
            fbits                = sign | (exp_f << 23) | (mant << 13);
        }

        float out;
        std::memcpy(&out, &fbits, sizeof(out));
        return out;
    }

    /// double -> half (binary16), round-to-nearest-even.
    ///
    /// The conversion works directly on the binary64 bit pattern. Narrowing to
    /// float first would round twice: a double lying just above/below a half-way
    /// point between two halves can collapse onto the exact tie when rounded to
    /// float, and the tie is then resolved to even, yielding a result that is one
    /// ULP off. Working on the bits also avoids converting doubles that are
    /// outside the float range.
    static uint16_t DoubleToHalfBits(double d) noexcept
    {
        static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be a 64-bit type");

        uint64_t x = 0;
        std::memcpy(&x, &d, sizeof(x));

        const uint32_t sign = static_cast<uint32_t>(x >> 48) & 0x8000u;
        const uint32_t exp  = static_cast<uint32_t>(x >> 52) & 0x7FFu;
        const uint64_t mant = x & 0x000FFFFFFFFFFFFFull;

        // Number of double mantissa bits that do not fit into the half mantissa (52 - 10)
        constexpr int DroppedBits = 42;

        // NaN/Inf
        if (exp == 0x7FFu)
        {
            if (mant == 0) return static_cast<uint16_t>(sign | 0x7C00u); // Inf
            // Keep the top payload bits and force the quiet bit so the result is never Inf
            const uint32_t payload = static_cast<uint32_t>(mant >> DroppedBits);
            return static_cast<uint16_t>(sign | 0x7C00u | 0x0200u | payload);
        }

        // Unbias exponent from double, then bias to half.
        // Zero and double subnormals end up far below -10 and flush to signed zero.
        const int32_t e = static_cast<int32_t>(exp) - 1023 + 15;

        // Overflow -> Inf
        if (e >= 31)
        {
            return static_cast<uint16_t>(sign | 0x7C00u);
        }

        // Result is a half subnormal (or underflows to zero)
        if (e <= 0)
        {
            if (e < -10)
            {
                // Below half of the smallest subnormal -> signed zero
                return static_cast<uint16_t>(sign);
            }

            // Make implicit leading 1 explicit
            const uint64_t full = mant | (uint64_t{1} << 52);

            // Extra right shift needed to denormalize: (1 - e) is 1..11, so drop is 43..53
            const int drop = (1 - e) + DroppedBits;

            uint32_t       q       = static_cast<uint32_t>(full >> drop);
            const uint64_t rem     = full & ((uint64_t{1} << drop) - 1u);
            const uint64_t halfway = uint64_t{1} << (drop - 1);

            // Rounding up from 0x3FF yields 0x400, which is exactly the smallest normal
            if (rem > halfway || (rem == halfway && (q & 1u) != 0))
                ++q;

            return static_cast<uint16_t>(sign | q);
        }

        // Normal case: round mantissa from 52 to 10 bits
        uint32_t       bits    = (static_cast<uint32_t>(e) << 10) | static_cast<uint32_t>(mant >> DroppedBits);
        const uint64_t rem     = mant & ((uint64_t{1} << DroppedBits) - 1u);
        const uint64_t halfway = uint64_t{1} << (DroppedBits - 1);

        // A mantissa overflow carries into the exponent field; if the exponent
        // was 30 the result becomes 0x7C00, i.e. Inf, which is the correct outcome.
        if (rem > halfway || (rem == halfway && (bits & 1u) != 0))
            ++bits;

        return static_cast<uint16_t>(sign | bits);
    }

    /// float -> half (binary16), round-to-nearest-even.
    /// Values too large for half become +/-Inf, values below half of the smallest
    /// half subnormal become signed zero, NaNs stay (quiet) NaNs.
    static uint16_t FloatToHalfBits(float f) noexcept
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be a 32-bit type");

        uint32_t x;
        std::memcpy(&x, &f, sizeof(x));

        const uint32_t sign = (x >> 16) & 0x8000u;
        uint32_t       exp  = (x >> 23) & 0xFFu;
        uint32_t       mant = x & 0x007FFFFFu;

        // NaN/Inf
        if (exp == 0xFFu)
        {
            if (mant == 0) return static_cast<uint16_t>(sign | 0x7C00u); // Inf
            // Preserve some payload; ensure qNaN
            uint16_t payload = static_cast<uint16_t>(mant >> 13);
            if (payload == 0) payload = 1;
            return static_cast<uint16_t>(sign | 0x7C00u | payload | 0x0200u);
        }

        // Unbias exponent from float, then bias to half
        int32_t e = static_cast<int32_t>(exp) - 127 + 15;

        // Handle subnormals/underflow
        if (e <= 0)
        {
            if (e < -10)
            {
                // Too small -> signed zero
                return static_cast<uint16_t>(sign);
            }

            // Make implicit leading 1 explicit
            mant |= 0x00800000u;

            // Shift to subnormal half mantissa position
            const int shift        = 1 - e; // 1..11
            uint32_t  mant_shifted = mant >> (shift + 13);

            // Round-to-nearest-even using the bits we threw away
            const uint32_t round_mask = (1u << (shift + 13)) - 1u;
            const uint32_t round_bits = mant & round_mask;
            const uint32_t halfway    = 1u << (shift + 12);

            // Rounding up from 0x3FF yields 0x400, which is exactly the smallest normal
            if (round_bits > halfway || (round_bits == halfway && (mant_shifted & 1u)))
                mant_shifted++;

            return static_cast<uint16_t>(sign | static_cast<uint16_t>(mant_shifted));
        }

        // Overflow -> Inf
        if (e >= 31)
        {
            return static_cast<uint16_t>(sign | 0x7C00u);
        }

        // Normal case: round mantissa from 23 to 10 bits
        uint32_t       mant_half  = mant >> 13;
        const uint32_t round_bits = mant & 0x1FFFu; // lower 13 bits

        // Round-to-nearest-even
        if (round_bits > 0x1000u || (round_bits == 0x1000u && (mant_half & 1u)))
        {
            mant_half++;
            if (mant_half == 0x0400u) // mantissa overflow
            {
                mant_half = 0;
                e++;
                if (e >= 31) return static_cast<uint16_t>(sign | 0x7C00u);
            }
        }

        return static_cast<uint16_t>(sign | (static_cast<uint16_t>(e) << 10) | static_cast<uint16_t>(mant_half));
    }

private:
    uint16_t m_Bits{0};
};
| 221 | + |
| 222 | +} // namespace Diligent |
0 commit comments