@@ -105,6 +105,46 @@ typedef uint64_t uint64;
105105typedef const char * str;
106106
107107
108+ // Float-to-unsigned conversions: cast through int64 to avoid C++ UB
109+ // (C++ 7.3.11: float -> unsigned is UB when truncated value is negative)
// Saturating float -> int64 conversion with fully defined behavior.
// NaN maps to 0; inputs below -2^63 clamp to INT64_MIN and inputs at or
// above 2^63 clamp to INT64_MAX, so the final static_cast never hits the
// C++ UB case of truncating a value outside the destination range.
// Both bounds (-2^63 and 2^63) are exactly representable in float and double.
template <typename F> CUDA_CALLABLE inline int64 safe_float_to_int64(F x)
{
    // NaN is the only value that compares unequal to itself.
    if (!(x == x))
        return 0;

    const F lo = static_cast<F>(-9223372036854775808.0); // -2^63
    const F hi = static_cast<F>(9223372036854775808.0);  //  2^63 (first overflowing value)

    if (x < lo)
        return -9223372036854775807LL - 1LL; // INT64_MIN, spelled to avoid a literal-overflow warning
    if (x >= hi)
        return 9223372036854775807LL;        // INT64_MAX

    return static_cast<int64>(x); // in range: conversion is well defined
}
122+
// Float -> uint64 conversion with fully defined behavior.
// NaN maps to 0.  Non-positive inputs are routed through the int64 path and
// then reinterpreted as uint64 (int64 -> uint64 is defined modulo 2^64, i.e.
// two's-complement wrap).  Inputs >= 2^64 saturate to UINT64_MAX; inputs in
// [2^63, 2^64) are rebased below 2^63, converted, and the offset is restored
// with unsigned arithmetic.
template <typename F> CUDA_CALLABLE inline uint64 safe_float_to_uint64(F x)
{
    if (!(x == x)) // NaN check: NaN != NaN
        return 0;

    if (x <= 0.0)
        return static_cast<uint64>(safe_float_to_int64(x));

    const F two_pow_63 = static_cast<F>(9223372036854775808.0);  // 2^63
    const F two_pow_64 = static_cast<F>(18446744073709551616.0); // 2^64

    if (x >= two_pow_64)
        return 18446744073709551615ULL; // UINT64_MAX

    if (x >= two_pow_63)
    {
        // Shift into the int64-safe range, convert, then add 2^63 back.
        uint64 low = static_cast<uint64>(safe_float_to_int64(x - two_pow_63));
        return low + 9223372036854775808ULL;
    }

    return static_cast<uint64>(safe_float_to_int64(x));
}
137+
// Narrow float -> unsigned casts.  Each saturates into the int64 range first
// (defined behavior for the float -> integer step), then truncates to the
// destination width; the int64 -> uintN cast is defined modulo 2^N.
CUDA_CALLABLE inline uint8 float32_to_uint8(float32 x)
{
    return static_cast<uint8>(safe_float_to_int64(x));
}

CUDA_CALLABLE inline uint8 float64_to_uint8(float64 x)
{
    return static_cast<uint8>(safe_float_to_int64(x));
}

CUDA_CALLABLE inline uint16 float32_to_uint16(float32 x)
{
    return static_cast<uint16>(safe_float_to_int64(x));
}

CUDA_CALLABLE inline uint16 float64_to_uint16(float64 x)
{
    return static_cast<uint16>(safe_float_to_int64(x));
}

CUDA_CALLABLE inline uint32 float32_to_uint32(float32 x)
{
    return static_cast<uint32>(safe_float_to_int64(x));
}

CUDA_CALLABLE inline uint32 float64_to_uint32(float64 x)
{
    return static_cast<uint32>(safe_float_to_int64(x));
}

// Full-width casts go through the dedicated uint64 helper so the whole
// [0, 2^64) range is reachable.
CUDA_CALLABLE inline uint64 float32_to_uint64(float32 x)
{
    return safe_float_to_uint64(x);
}

CUDA_CALLABLE inline uint64 float64_to_uint64(float64 x)
{
    return safe_float_to_uint64(x);
}
146+
147+
108148struct half ;
109149
110150CUDA_CALLABLE half float_to_half (float x);
@@ -182,6 +222,12 @@ static_assert(sizeof(half) == 2, "Size of half / float16 type must be 2-bytes");
182222
183223typedef half float16;
184224
225+ // Handle float16 source
// float16 sources: widen to float32 first, then reuse the float32 cast paths
// (defined above), so all float16 conversions share the same saturation and
// wrapping semantics.
CUDA_CALLABLE inline uint8 float16_to_uint8(float16 x)
{
    return float32_to_uint8(float32(x));
}

CUDA_CALLABLE inline uint16 float16_to_uint16(float16 x)
{
    return float32_to_uint16(float32(x));
}

CUDA_CALLABLE inline uint32 float16_to_uint32(float16 x)
{
    return float32_to_uint32(float32(x));
}

CUDA_CALLABLE inline uint64 float16_to_uint64(float16 x)
{
    return float32_to_uint64(float32(x));
}
230+
185231// Approximate division/reciprocal intrinsics
186232#if defined(__CUDA_ARCH__)
187233
@@ -337,6 +383,19 @@ template <typename T> CUDA_CALLABLE inline void adj_float16(T x, T& adj_x, float
// Adjoints for casts to float32/float64: the cast is value-preserving, so the
// incoming adjoint is accumulated straight into the source's adjoint.
template <typename T> CUDA_CALLABLE inline void adj_float32(T x, T& adj_x, float32 adj_ret)
{
    adj_x += T(adj_ret);
}

template <typename T> CUDA_CALLABLE inline void adj_float64(T x, T& adj_x, float64 adj_ret)
{
    adj_x += T(adj_ret);
}
339385
386+ // Adjoint stubs for safe float-to-unsigned casts (no-op since they are cast functions)
// Adjoints for the float -> unsigned cast helpers.  Truncating casts carry no
// gradient, so these are intentional no-ops; they exist only so the code
// generator's adj_* lookup resolves for every forward cast defined above.
template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint8(T /*x*/, T& /*adj_x*/, uint8 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint8(T /*x*/, T& /*adj_x*/, uint8 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint8(T /*x*/, T& /*adj_x*/, uint8 /*adj_ret*/) {}

template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint16(T /*x*/, T& /*adj_x*/, uint16 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint16(T /*x*/, T& /*adj_x*/, uint16 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint16(T /*x*/, T& /*adj_x*/, uint16 /*adj_ret*/) {}

template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint32(T /*x*/, T& /*adj_x*/, uint32 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint32(T /*x*/, T& /*adj_x*/, uint32 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint32(T /*x*/, T& /*adj_x*/, uint32 /*adj_ret*/) {}

template <typename T> CUDA_CALLABLE inline void adj_float32_to_uint64(T /*x*/, T& /*adj_x*/, uint64 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float64_to_uint64(T /*x*/, T& /*adj_x*/, uint64 /*adj_ret*/) {}
template <typename T> CUDA_CALLABLE inline void adj_float16_to_uint64(T /*x*/, T& /*adj_x*/, uint64 /*adj_ret*/) {}
340399
341400#define kEps 0 .0f
342401
0 commit comments