Skip to content

Commit 5c5ba90

Browse files
committed
feat(ustring): ustring hash collision protection
The gist is that ustring::strhash(str) is modified to strip the MSB from the result of Strutil::strhash. The rep entry is filed in the ustring table based on this hash, so the computed hash is effectively 63 bits, not 64. In the rep->hashed field, the lower 63 bits hold the computed hash and the MSB indicates whether this is the 2nd (or later) entry in the table with the same 63-bit hash. ustring::hash() is then modified as follows: if the MSB is 0, the computed hash is the hash. If the MSB is 1, though, we DON'T use that hash; instead we use the pointer to the unique characters, with the MSB set (which is an invalid address by itself). Note that computed hashes never have the MSB set and char*+MSB values always have it set, so ustring::hash() will never return the same value for two different ustrings. But -- please note! -- ustring::strhash(str) and ustring(str).hash() will only match (and be the same value on every execution) if the ustring is the first to receive that hash, which should be approximately always. Probably always, in practice. But in the very improbable case of a hash collision, one of them (the second to be turned into a ustring) will use the alternate hash based on the character address, which is neither the same as ustring::strhash(chars) nor expected to be the same constant on every program execution. Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent 5439e0f commit 5c5ba90

3 files changed

Lines changed: 174 additions & 169 deletions

File tree

src/include/OpenImageIO/ustring.h

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ OIIO_NAMESPACE_BEGIN
3232
#define OIIO_USTRING_HAS_CTR_FROM_USTRINGHASH 1
3333
#define OIIO_USTRING_HAS_STDHASH 1
3434
#define OIIO_HAS_USTRINGHASH_FORMATTER 1
35+
#define OIIO_USTRING_SAFE_HASH 1
3536

3637

3738
class ustringhash; // forward declaration
@@ -123,6 +124,16 @@ class ustringhash; // forward declaration
123124
/// - if you don't need to do a lot of string assignment or equality
124125
/// testing, but lots of more complex string manipulation.
125126
///
127+
/// The ustring implementation guarantees that no two ustrings return the same
128+
/// value for hash() (despite the slim probability that two strings could
129+
/// numerically hash to the same value). For the first ustring added with a
130+
/// given hash, u.hash() will be the same value as ustring::strhash(chars),
131+
/// and will deterministically be the same on every execution. In the very
132+
/// improbable case of a hash collision, subsequent ustrings with the same
133+
/// numeric hash will use an alternate hash based on the character address,
134+
/// which is neither the same as ustring::strhash(chars) nor expected
135+
/// to be the same constant on every program execution.
136+
126137
class OIIO_UTIL_API ustring {
127138
public:
128139
using rep_t = const char*; ///< The underlying representation type
@@ -288,11 +299,7 @@ class OIIO_UTIL_API ustring {
288299
/// Return a C++ std::string representation of a ustring.
289300
const std::string& string() const noexcept
290301
{
291-
if (m_chars) {
292-
const TableRep* rep = (const TableRep*)m_chars - 1;
293-
return rep->str;
294-
} else
295-
return empty_std_string;
302+
return m_chars ? rep()->str : empty_std_string;
296303
}
297304

298305
/// Reset to an empty string.
@@ -303,17 +310,27 @@ class OIIO_UTIL_API ustring {
303310
{
304311
if (!m_chars)
305312
return 0;
306-
const TableRep* rep = ((const TableRep*)m_chars) - 1;
307-
return rep->length;
313+
return rep()->length;
308314
}
309315

310-
/// Return a hashed version of the string
316+
/// ustring::strhash() uses Strutil::strhash but clears the MSB, which is
/// reserved as the "duplicate" flag, so the result is effectively 63 bits.
317+
static OIIO_HOSTDEVICE constexpr hash_t strhash(string_view str)
318+
{
319+
return Strutil::strhash(str) & hash_mask;
320+
}
321+
322+
/// Return a hashed version of the string. To guarantee unique hashes,
323+
/// we check if the "duplicate bit" of the hash is set. If not, then
324+
/// we just return the hash which we know is unique. But if that bit
325+
/// is set, we utilize the unique character address.
311326
hash_t hash() const noexcept
312327
{
313328
if (!m_chars)
314329
return 0;
315-
const TableRep* rep = ((const TableRep*)m_chars) - 1;
316-
return rep->hashed;
330+
hash_t h = rep()->hashed;
331+
return OIIO_LIKELY((h & duplicate_bit) == 0)
332+
? h
333+
: hash_t(m_chars) | duplicate_bit;
317334
}
318335

319336
/// Return a hashed version of the string
@@ -749,6 +766,8 @@ class OIIO_UTIL_API ustring {
749766
// if you know the rep, the chars are at (char *)(rep+1), and if you
750767
// know the chars, the rep is at ((TableRep *)chars - 1).
751768
struct TableRep {
769+
// hashed has the MSB set if and only if this is the second or
770+
// later ustring to have the same hash.
752771
hash_t hashed; // precomputed Hash value
753772
std::string str; // String representation
754773
size_t length; // Length of the string; must be right before cap
@@ -757,10 +776,29 @@ class OIIO_UTIL_API ustring {
757776
TableRep(string_view strref, hash_t hash);
758777
~TableRep();
759778
const char* c_str() const noexcept { return (const char*)(this + 1); }
779+
constexpr bool unique_hash() const
780+
{
781+
return (hashed & duplicate_bit) == 0;
782+
}
760783
};
761784

785+
// duplicate_bit is a 1 in the MSB, which if set indicates a hash that
786+
// is a duplicate.
787+
static constexpr hash_t duplicate_bit = hash_t(1) << 63;
788+
// hash_mask is what you `&` with hashed to get the real hash (clearing
789+
// the duplicate bit).
790+
#if 1
791+
static constexpr hash_t hash_mask = ~duplicate_bit;
792+
#else
793+
// Alternate to force lots of hash collisions for testing purposes:
794+
static constexpr hash_t hash_mask = ~duplicate_bit & 0xffff;
795+
#endif
796+
bool has_unique_hash() const { return rep()->unique_hash(); }
797+
762798
private:
763799
static std::string empty_std_string;
800+
801+
// Locate the table entry for this string: the characters are stored
// immediately after their TableRep, so step back one TableRep from
// m_chars.  Only meaningful when m_chars is non-null.
const TableRep* rep() const
{
    return reinterpret_cast<const TableRep*>(m_chars) - 1;
}
764802
};
765803

766804

@@ -811,7 +849,7 @@ class OIIO_UTIL_API ustringhash {
811849
OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str)
812850
#ifdef __CUDA_ARCH__
813851
// GPU: just compute the hash. This can be constexpr!
814-
: m_hash(Strutil::strhash(str))
852+
: m_hash(ustring::strhash(str))
815853
#else
816854
// CPU: make ustring, get its hash. Note that ustring ctr can't be
817855
// constexpr because it has to modify the internal ustring table.
@@ -823,7 +861,7 @@ class OIIO_UTIL_API ustringhash {
823861
OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str, size_t len)
824862
#ifdef __CUDA_ARCH__
825863
// GPU: just compute the hash. This can be constexpr!
826-
: m_hash(Strutil::strhash(len, str))
864+
: m_hash(ustring::strhash(len, str))
827865
#else
828866
// CPU: make ustring, get its hash. Note that ustring ctr can't be
829867
// constexpr because it has to modify the internal ustring table.
@@ -837,7 +875,7 @@ class OIIO_UTIL_API ustringhash {
837875
OIIO_DEVICE_CONSTEXPR explicit ustringhash(string_view str)
838876
#ifdef __CUDA_ARCH__
839877
// GPU: just compute the hash. This can be constexpr!
840-
: m_hash(Strutil::strhash(str))
878+
: m_hash(ustring::strhash(str))
841879
#else
842880
// CPU: make ustring, get its hash. Note that ustring ctr can't be
843881
// constexpr because it has to modify the internal ustring table.
@@ -931,13 +969,13 @@ class OIIO_UTIL_API ustringhash {
931969
/// Test for equality with a char*.
932970
constexpr bool operator==(const char* str) const noexcept
933971
{
934-
return m_hash == Strutil::strhash(str);
972+
return m_hash == ustring::strhash(str);
935973
}
936974

937975
/// Test for inequality with a char*.
938976
constexpr bool operator!=(const char* str) const noexcept
939977
{
940-
return m_hash != Strutil::strhash(str);
978+
return m_hash != ustring::strhash(str);
941979
}
942980

943981
#ifndef __CUDA_ARCH__

0 commit comments

Comments
 (0)