Skip to content

Commit df4eb09

Browse files
Common: add Float16 type + conversions
1 parent b318706 commit df4eb09

3 files changed

Lines changed: 614 additions & 0 deletions

File tree

Common/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ set(INTERFACE
1212
interface/Array2DTools.hpp
1313
interface/AsyncInitializer.hpp
1414
interface/BasicMath.hpp
15+
interface/Float16.hpp
1516
interface/BasicFileStream.hpp
1617
interface/DataBlobImpl.hpp
1718
interface/DefaultRawMemoryAllocator.hpp

Common/interface/Float16.hpp

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
/*
2+
* Copyright 2026 Diligent Graphics LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* In no event and under no legal theory, whether in tort (including negligence),
17+
* contract, or otherwise, unless required by applicable law (such as deliberate
18+
* and grossly negligent acts) or agreed to in writing, shall any Contributor be
19+
* liable for any damages, including any direct, indirect, special, incidental,
20+
* or consequential damages of any character arising as a result of this License or
21+
* out of the use or inability to use the software (including but not limited to damages
22+
* for loss of goodwill, work stoppage, computer failure or malfunction, or any and
23+
* all other commercial damages or losses), even if such Contributor has been advised
24+
* of the possibility of such damages.
25+
*/
26+
27+
#pragma once
28+
29+
#include <cstdint>
30+
#include <cstring>
31+
#include <cmath>
32+
#include <limits>
33+
#include <type_traits>
34+
35+
namespace Diligent
36+
{
37+
38+
/// 16-bit floating-point value stored as an IEEE 754 binary16 bit pattern
/// (1 sign bit, 5 exponent bits, 10 mantissa bits).
///
/// The class only stores and converts values; it provides no arithmetic.
/// All narrowing conversions round to nearest, ties to even, and are performed
/// with integer bit manipulation so they do not depend on the FPU rounding mode.
class Float16
{
public:
    /// Constructs positive zero.
    constexpr Float16() noexcept = default;

    /// Wraps a raw binary16 bit pattern; no numeric conversion is performed.
    ///
    /// \note Only an argument of type uint16_t selects this overload. On platforms
    ///       where int32_t is `int`, a plain integer literal such as `Float16(0x3C00)`
    ///       selects the int32_t constructor and is converted by value instead.
    constexpr explicit Float16(uint16_t Bits) noexcept :
        m_Bits{Bits}
    {}

    /// Converts a single-precision value (round-to-nearest-even, overflow -> Inf).
    explicit Float16(float f) noexcept :
        m_Bits(FloatToHalfBits(f))
    {}

    /// Converts a double-precision value (round-to-nearest-even, overflow -> Inf).
    explicit Float16(double d) noexcept :
        m_Bits(DoubleToHalfBits(d))
    {}

    /// Converts an integer by value; it is first converted to float and then narrowed.
    explicit Float16(int32_t i) noexcept :
        m_Bits(FloatToHalfBits(static_cast<float>(i)))
    {}

    /// Exact widening conversion to float.
    explicit operator float() const noexcept
    {
        return HalfBitsToFloat(m_Bits);
    }

    /// Exact widening conversion to double.
    explicit operator double() const noexcept
    {
        return static_cast<double>(HalfBitsToFloat(m_Bits));
    }

    /// Int32 conversion: truncates toward 0, saturates on overflow, NaN -> 0.
    explicit operator int32_t() const noexcept
    {
        const float f = HalfBitsToFloat(m_Bits);

        if (std::isnan(f)) return 0;
        // The comparisons also catch +/-Inf, which must not reach the cast below.
        if (f >= static_cast<float>(std::numeric_limits<int32_t>::max()))
            return std::numeric_limits<int32_t>::max();
        if (f <= static_cast<float>(std::numeric_limits<int32_t>::min()))
            return std::numeric_limits<int32_t>::min();

        return static_cast<int32_t>(f); // C++ truncates toward 0
    }

    /// Returns true for both +0 and -0 (the sign bit is ignored).
    constexpr bool IsZero() const noexcept { return (m_Bits & 0x7FFFu) == 0; }

    /// Returns true when the sign bit is set (this includes -0 and NaNs with the sign bit set).
    constexpr bool Sign() const noexcept { return (m_Bits >> 15) != 0; }

    /// Returns the raw binary16 bit pattern.
    constexpr uint16_t Raw() const noexcept { return m_Bits; }


    /// Expands a binary16 bit pattern to float. The conversion is exact for every
    /// finite value; NaNs are returned as quiet NaNs with the payload shifted into
    /// the upper float mantissa bits.
    static float HalfBitsToFloat(uint16_t h) noexcept
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

        const uint32_t sign = (uint32_t(h) & 0x8000u) << 16;
        const uint32_t exp  = (h >> 10) & 0x1Fu;
        const uint32_t mant = h & 0x03FFu;

        uint32_t fbits = 0;

        if (exp == 0)
        {
            if (mant == 0)
            {
                // +/-0
                fbits = sign;
            }
            else
            {
                // Subnormal half: value = mant * 2^-24. Every such value is a normal float,
                // so shift the mantissa left until its leading 1 reaches the implicit-bit
                // position (bit 10), lowering the exponent by one for each step.
                uint32_t m = mant;
                int      e = -14;
                while ((m & 0x0400u) == 0)
                {
                    m <<= 1;
                    --e;
                }
                m &= 0x03FFu; // drop the now-implicit leading 1
                const uint32_t exp_f = uint32_t(e + 127);
                fbits                = sign | (exp_f << 23) | (m << 13);
            }
        }
        else if (exp == 0x1F)
        {
            // Inf/NaN
            fbits = sign | 0x7F800000u | (mant << 13);
            if (mant != 0) fbits |= 0x00400000u; // Make sure it's a quiet NaN in float
        }
        else
        {
            // Normal: re-bias the exponent (15 -> 127) and widen the mantissa
            const uint32_t exp_f = exp + (127 - 15);
            fbits                = sign | (exp_f << 23) | (mant << 13);
        }

        float out;
        std::memcpy(&out, &fbits, sizeof(out));
        return out;
    }

    /// double -> half (binary16), round-to-nearest-even.
    ///
    /// The value is narrowed directly from its 64-bit representation. Going through
    /// float first would round twice: a double lying just above a half-precision tie
    /// (e.g. 1 + 2^-11 + 2^-40) collapses onto the tie in float and is then rounded
    /// the wrong way by ties-to-even.
    static uint16_t DoubleToHalfBits(double d) noexcept
    {
        static_assert(sizeof(double) == sizeof(uint64_t), "double is expected to be 64 bits wide");

        uint64_t x;
        std::memcpy(&x, &d, sizeof(x));
        return IEEEBitsToHalfBits<uint64_t, 52, 11>(x);
    }

    /// float -> half (binary16), round-to-nearest-even.
    static uint16_t FloatToHalfBits(float f) noexcept
    {
        static_assert(sizeof(float) == sizeof(uint32_t), "float is expected to be 32 bits wide");

        uint32_t x;
        std::memcpy(&x, &f, sizeof(x));
        return IEEEBitsToHalfBits<uint32_t, 23, 8>(x);
    }

private:
    /// Narrows an IEEE 754 binary interchange bit pattern to binary16 with a single
    /// round-to-nearest-even step.
    ///
    /// \tparam BitsType - unsigned integer type that holds the source bit pattern.
    /// \tparam MantBits - number of explicit mantissa bits in the source format.
    /// \tparam ExpBits  - number of exponent bits in the source format.
    ///
    /// - Inf stays Inf; NaN becomes a quiet NaN that keeps the top 10 payload bits.
    /// - Magnitudes too large for half become Inf.
    /// - Magnitudes below 2^-25 (including all source subnormals) become signed zero.
    template <typename BitsType, int MantBits, int ExpBits>
    static uint16_t IEEEBitsToHalfBits(BitsType Bits) noexcept
    {
        constexpr int     DropBits = MantBits - 10;              // mantissa bits that do not fit into half
        constexpr int32_t ExpMax   = (1 << ExpBits) - 1;         // all-ones exponent: Inf/NaN
        constexpr int32_t ExpBias  = (1 << (ExpBits - 1)) - 1;   // 127 for float, 1023 for double

        const BitsType One         = 1;
        const BitsType ImplicitOne = One << MantBits;

        // Move the source sign bit to bit 15.
        const uint32_t sign = static_cast<uint32_t>(Bits >> (MantBits + ExpBits - 15)) & 0x8000u;
        const int32_t  exp  = static_cast<int32_t>((Bits >> MantBits) & static_cast<BitsType>(ExpMax));
        BitsType       mant = Bits & (ImplicitOne - 1);

        // NaN/Inf
        if (exp == ExpMax)
        {
            if (mant == 0) return static_cast<uint16_t>(sign | 0x7C00u); // Inf
            // Preserve the most significant payload bits; force the quiet bit so that
            // the result can never degenerate into the Inf encoding.
            uint32_t payload = static_cast<uint32_t>(mant >> DropBits);
            if (payload == 0) payload = 1;
            return static_cast<uint16_t>(sign | 0x7C00u | payload | 0x0200u);
        }

        // Unbias the source exponent, then bias it for half
        int32_t e = exp - ExpBias + 15;

        // Result is subnormal in half, or underflows
        if (e <= 0)
        {
            if (e < -10)
            {
                // Below half of the smallest subnormal -> signed zero
                return static_cast<uint16_t>(sign);
            }

            // Make implicit leading 1 explicit
            mant |= ImplicitOne;

            // 1 - e is in the range 1..11. At 11 every mantissa bit is shifted out and
            // only rounding decides between 0 and the smallest subnormal.
            const int shift     = (1 - e) + DropBits;
            uint32_t  half_mant = static_cast<uint32_t>(mant >> shift);

            // Round-to-nearest-even using the bits we threw away
            const BitsType remainder = mant & ((One << shift) - 1);
            const BitsType halfway   = One << (shift - 1);

            // A carry out of the 10-bit mantissa yields 0x0400, which is exactly
            // the encoding of the smallest normal half, so no fix-up is needed.
            if (remainder > halfway || (remainder == halfway && (half_mant & 1u)))
                ++half_mant;

            return static_cast<uint16_t>(sign | half_mant);
        }

        // Overflow -> Inf
        if (e >= 31)
        {
            return static_cast<uint16_t>(sign | 0x7C00u);
        }

        // Normal case: round the mantissa down to 10 bits
        uint32_t       half_mant = static_cast<uint32_t>(mant >> DropBits);
        const BitsType remainder = mant & ((One << DropBits) - 1);
        const BitsType halfway   = One << (DropBits - 1);

        // Round-to-nearest-even
        if (remainder > halfway || (remainder == halfway && (half_mant & 1u)))
        {
            ++half_mant;
            if (half_mant == 0x0400u) // mantissa overflow: bump the exponent
            {
                half_mant = 0;
                ++e;
                if (e >= 31) return static_cast<uint16_t>(sign | 0x7C00u);
            }
        }

        return static_cast<uint16_t>(sign | (static_cast<uint32_t>(e) << 10) | half_mant);
    }

    uint16_t m_Bits{0};
};
221+
222+
} // namespace Diligent

0 commit comments

Comments
 (0)