int: Remove deprecated std::codecvt from strutil.cpp internals (#5107)

lgritz · web-flow · commit 201fecbff6ef · 2026-04-10T21:16:22.000-07:00
For the sake of Windows' need for wstring filenames, we do some
utf8<->utf16 conversion in strutil.cpp. That has always used the
std::codecvt set of functions, but they have been deprecated beginning
in C++17 and will be removed entirely in C++26, so their days are
numbered and we already have to suppress warnings to get the compiler to
shut up about it.

Assisted-by: Claude Code / Opus 4.6

Signed-off-by: Larry Gritz <lg@larrygritz.com>
diff --git a/src/libutil/strutil.cpp b/src/libutil/strutil.cpp
@@ -5,13 +5,7 @@
 
 #include <OpenImageIO/platform.h>
 
-// Special dance to disable warnings in the included files related to
-// the deprecation of unicode conversion functions.
-OIIO_PRAGMA_WARNING_PUSH
-OIIO_CLANG_PRAGMA(clang diagnostic ignored "-Wdeprecated-declarations")
-#include <codecvt>
 #include <locale>
-OIIO_PRAGMA_WARNING_POP
 
 #include <algorithm>
 #include <cmath>
@@ -958,62 +952,113 @@ Strutil::replace(string_view str, string_view pattern, string_view replacement,
 
 
 
-// Conversion functions between UTF-8 and UTF-16 for windows.
+// UTF-8 <-> UTF-16 conversion utilities.
 //
-// For historical reasons, the standard encoding for strings on windows is
-// UTF-16, whereas the unix world seems to have settled on UTF-8.  These two
-// encodings can be stored in std::string and std::wstring respectively, with
-// the caveat that they're both variable-width encodings, so not all the
-// standard string methods will make sense (for example std::string::size()
-// won't return the number of glyphs in a UTF-8 string, unless it happens to
-// be made up of only the 7-bit ASCII subset).
+// OIIO uses UTF-8 for all string/path handling. On Windows, OS APIs require
+// UTF-16 (wchar_t*), so we convert at API boundaries. Some non-Windows uses
+// also exist (e.g., parsing UTF-16 ICC profile metadata).
 //
-// The standard windows API functions usually have two versions, a UTF-16
-// version with a 'W' suffix (using wchar_t* strings), and an ANSI version
-// with a 'A' suffix (using char* strings) which uses the current windows
-// code page to define the encoding.  (To make matters more confusing there is
-// also a further "TCHAR" version which is #defined to the UTF-16 or ANSI
-// version, depending on whether UNICODE is defined during compilation.
-// This is meant to make it possible to support compiling libraries in
-// either unicode or ansi mode from the same codebase.)
+// On Windows, we use the native MultiByteToWideChar/WideCharToMultiByte APIs.
+// On other platforms, we use hand-rolled UTF-8/UTF-16 codec functions below,
+// replacing the deprecated std::codecvt_utf8_utf16 (removed in C++26).
 //
-// Using std::string as the string container (as in OIIO) implies that we
-// can't use UTF-16.  It also means we need a variable-width encoding to
-// represent characters in non-Latin alphabets in an unambiguous way; the
-// obvious candidate is UTF-8.  File paths in OIIO are considered to be
-// represented in UTF-8, and must be converted to UTF-16 before passing to
-// windows API file opening functions.
-//
-// On the other hand, the encoding used for the ANSI versions of the windows
-// API is the current windows code page.  This is more compatible with the
-// default setup of the standard windows command prompt, and may be more
-// appropriate for error messages.
+// Note: wchar_t is 16-bit on Windows (natural UTF-16) but 32-bit on
+// macOS/Linux. The non-Windows path still produces UTF-16 encoding in
+// wchar_t units (with surrogate pairs) to match the expected semantics of
+// utf8_to_utf16wstring().
+
+// Decode one UTF-8 sequence starting at `src[pos]` (requires `pos < len`),
+// advance `pos` past it, and return the codepoint. Malformed input yields
+// 0xFFFD and still advances `pos` by at least one byte.
+static uint32_t
+decode_utf8(const char* src, size_t len, size_t& pos)
+{
+    auto byte    = [&](size_t i) -> uint8_t { return uint8_t(src[i]); };
+    auto is_cont = [](uint8_t b) { return (b & 0xC0) == 0x80; };
+    uint8_t b0   = byte(pos);
+    if (b0 < 0x80) {
+        pos += 1;
+        return b0;
+    } else if ((b0 & 0xE0) == 0xC0 && pos + 1 < len && is_cont(byte(pos + 1))) {
+        uint32_t cp = (uint32_t(b0 & 0x1F) << 6)
+                      | uint32_t(byte(pos + 1) & 0x3F);
+        pos += 2;
+        return cp >= 0x80 ? cp : 0xFFFD;  // reject overlong
+    } else if ((b0 & 0xF0) == 0xE0 && pos + 2 < len && is_cont(byte(pos + 1))
+               && is_cont(byte(pos + 2))) {
+        uint32_t cp = (uint32_t(b0 & 0x0F) << 12)
+                      | (uint32_t(byte(pos + 1) & 0x3F) << 6)
+                      | uint32_t(byte(pos + 2) & 0x3F);
+        pos += 3;
+        if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF))
+            return 0xFFFD;  // overlong or surrogate
+        return cp;
+    } else if ((b0 & 0xF8) == 0xF0 && pos + 3 < len && is_cont(byte(pos + 1))
+               && is_cont(byte(pos + 2)) && is_cont(byte(pos + 3))) {
+        uint32_t cp = (uint32_t(b0 & 0x07) << 18)
+                      | (uint32_t(byte(pos + 1) & 0x3F) << 12)
+                      | (uint32_t(byte(pos + 2) & 0x3F) << 6)
+                      | uint32_t(byte(pos + 3) & 0x3F);
+        pos += 4;
+        return (cp >= 0x10000 && cp <= 0x10FFFF) ? cp : 0xFFFD;
+    }
+    pos += 1;  // skip bad byte
+    return 0xFFFD;
+}
+
+
+// Append the UTF-8 encoding of `cp` to `out`; cp > 0x10FFFF appends nothing.
+static void
+encode_utf8(uint32_t cp, std::string& out)
+{
+    if (cp < 0x80) {
+        out += char(cp);
+    } else if (cp < 0x800) {
+        out += char(0xC0 | (cp >> 6));
+        out += char(0x80 | (cp & 0x3F));
+    } else if (cp < 0x10000) {
+        out += char(0xE0 | (cp >> 12));
+        out += char(0x80 | ((cp >> 6) & 0x3F));
+        out += char(0x80 | (cp & 0x3F));
+    } else if (cp <= 0x10FFFF) {
+        out += char(0xF0 | (cp >> 18));
+        out += char(0x80 | ((cp >> 12) & 0x3F));
+        out += char(0x80 | ((cp >> 6) & 0x3F));
+        out += char(0x80 | (cp & 0x3F));
+    }
+}
+
 
 std::wstring
 Strutil::utf8_to_utf16wstring(string_view str) noexcept
 {
 #ifdef _WIN32
     // UTF8<->UTF16 conversions are primarily needed on Windows, so use the
-    // fastest option (C++11 <codecvt> is many times slower due to locale
-    // access overhead, and is deprecated starting with C++17).
+    // fastest option.
     std::wstring result;
     result.resize(
         MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), NULL, 0));
     MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), result.data(),
                         (int)result.size());
     return result;
 #else
-    try {
-        OIIO_PRAGMA_WARNING_PUSH
-#    if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
-        OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
-#    endif
-        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv;
-        OIIO_PRAGMA_WARNING_POP
-        return conv.from_bytes(str.data(), str.data() + str.size());
-    } catch (const std::exception&) {
-        return std::wstring();
+    // Decode UTF-8 into codepoints and encode as UTF-16 stored in wchar_t
+    // units (matching the behavior of the now-deprecated codecvt_utf8_utf16).
+    std::wstring result;
+    result.reserve(str.size());
+    size_t pos = 0;
+    while (pos < str.size()) {
+        uint32_t cp = decode_utf8(str.data(), str.size(), pos);
+        if (cp < 0x10000) {
+            result += wchar_t(cp);
+        } else {
+            // Encode as surrogate pair in wchar_t units
+            cp -= 0x10000;
+            result += wchar_t(0xD800 + (cp >> 10));
+            result += wchar_t(0xDC00 + (cp & 0x3FF));
+        }
     }
+    return result;
 #endif
 }
 
@@ -1024,26 +1069,41 @@ Strutil::utf16_to_utf8(const std::wstring& str) noexcept
 {
 #ifdef _WIN32
     // UTF8<->UTF16 conversions are primarily needed on Windows, so use the
-    // fastest option (C++11 <codecvt> is many times slower due to locale
-    // access overhead, and is deprecated starting with C++17).
+    // fastest option.
     std::string result;
     result.resize(WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(),
                                       NULL, 0, NULL, NULL));
     WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(), &result[0],
                         (int)result.size(), NULL, NULL);
     return result;
 #else
-    try {
-        OIIO_PRAGMA_WARNING_PUSH
-#    if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
-        OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
-#    endif
-        std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv;
-        OIIO_PRAGMA_WARNING_POP
-        return conv.to_bytes(str);
-    } catch (const std::exception&) {
-        return std::string();
+    // Decode UTF-16 stored in wchar_t units (matching the behavior of the
+    // now-deprecated codecvt_utf8_utf16) and encode as UTF-8.
+    std::string result;
+    result.reserve(str.size() * 2);
+    size_t i = 0;
+    while (i < str.size()) {
+        uint32_t w = uint32_t(str[i]);
+        uint32_t cp;
+        if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size()) {
+            uint32_t w2 = uint32_t(str[i + 1]);
+            if (w2 >= 0xDC00 && w2 <= 0xDFFF) {
+                cp = 0x10000 + ((w - 0xD800) << 10) + (w2 - 0xDC00);
+                i += 2;
+            } else {
+                cp = 0xFFFD;  // unpaired high surrogate
+                i += 1;
+            }
+        } else if (w >= 0xDC00 && w <= 0xDFFF) {
+            cp = 0xFFFD;  // unpaired low surrogate
+            i += 1;
+        } else {
+            cp = w;
+            i += 1;
+        }
+        encode_utf8(cp, result);
     }
+    return result;
 #endif
 }
 
@@ -1060,17 +1120,32 @@ Strutil::utf16_to_utf8(const std::u16string& str) noexcept
                         &result[0], (int)result.size(), NULL, NULL);
     return result;
 #else
-    try {
-        OIIO_PRAGMA_WARNING_PUSH
-#    if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
-        OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
-#    endif
-        std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
-        return conv.to_bytes(str);
-        OIIO_PRAGMA_WARNING_POP
-    } catch (const std::exception&) {
-        return std::string();
+    // Decode UTF-16 from char16_t units and encode as UTF-8.
+    std::string result;
+    result.reserve(str.size() * 2);
+    size_t i = 0;
+    while (i < str.size()) {
+        uint32_t w = uint32_t(str[i]);
+        uint32_t cp;
+        if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size()) {
+            uint32_t w2 = uint32_t(str[i + 1]);
+            if (w2 >= 0xDC00 && w2 <= 0xDFFF) {
+                cp = 0x10000 + ((w - 0xD800) << 10) + (w2 - 0xDC00);
+                i += 2;
+            } else {
+                cp = 0xFFFD;
+                i += 1;
+            }
+        } else if (w >= 0xDC00 && w <= 0xDFFF) {
+            cp = 0xFFFD;
+            i += 1;
+        } else {
+            cp = w;
+            i += 1;
+        }
+        encode_utf8(cp, result);
     }
+    return result;
 #endif
 }
 
diff --git a/src/libutil/strutil_test.cpp b/src/libutil/strutil_test.cpp
@@ -1759,6 +1759,108 @@ getargs(int argc, char* argv[])
 
 
 
+void
+test_utf_conversions()
+{
+    std::cout << "Testing utf8_to_utf16wstring / utf16_to_utf8\n";
+
+    // ASCII round-trip
+    {
+        std::string ascii = "hello";
+        std::wstring w    = Strutil::utf8_to_utf16wstring(ascii);
+        OIIO_CHECK_EQUAL(w.size(), 5);
+        OIIO_CHECK_EQUAL((int)w[0], (int)L'h');
+        OIIO_CHECK_EQUAL((int)w[4], (int)L'o');
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, ascii);
+    }
+
+    // Empty string
+    {
+        std::wstring w = Strutil::utf8_to_utf16wstring("");
+        OIIO_CHECK_EQUAL(w.size(), 0);
+        std::string s = Strutil::utf16_to_utf8(std::wstring());
+        OIIO_CHECK_EQUAL(s.size(), 0);
+        std::string s2 = Strutil::utf16_to_utf8(std::u16string());
+        OIIO_CHECK_EQUAL(s2.size(), 0);
+    }
+
+    // 2-byte UTF-8 (Latin/Greek/Cyrillic, U+0080..U+07FF)
+    // "café" = U+0063 U+0061 U+0066 U+00E9
+    {
+        std::string utf8 = "caf\xc3\xa9";  // café in UTF-8
+        std::wstring w   = Strutil::utf8_to_utf16wstring(utf8);
+        OIIO_CHECK_EQUAL(w.size(), 4);
+        OIIO_CHECK_EQUAL((int)w[3], 0x00E9);
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, utf8);
+    }
+
+    // 3-byte UTF-8 (CJK, U+0800..U+FFFF)
+    // U+6620 U+753B = "映画" (movie in Japanese)
+    {
+        std::string utf8 = "\xe6\x98\xa0\xe7\x94\xbb";
+        std::wstring w   = Strutil::utf8_to_utf16wstring(utf8);
+        OIIO_CHECK_EQUAL(w.size(), 2);
+        OIIO_CHECK_EQUAL((int)w[0], 0x6620);
+        OIIO_CHECK_EQUAL((int)w[1], 0x753B);
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, utf8);
+    }
+
+    // 4-byte UTF-8 / surrogate pairs (U+10000..U+10FFFF)
+    // U+1F600 (grinning face emoji)
+    {
+        std::string utf8 = "\xf0\x9f\x98\x80";
+        std::wstring w   = Strutil::utf8_to_utf16wstring(utf8);
+        // Should be encoded as surrogate pair: 0xD83D 0xDE00
+        OIIO_CHECK_EQUAL(w.size(), 2);
+        OIIO_CHECK_EQUAL((int)w[0], 0xD83D);
+        OIIO_CHECK_EQUAL((int)w[1], 0xDE00);
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, utf8);
+    }
+
+    // Mixed ASCII + multibyte round-trip
+    {
+        // "Ñoño" = U+00D1 U+006F U+00F1 U+006F
+        std::string utf8 = "\xc3\x91o\xc3\xb1o";
+        std::wstring w   = Strutil::utf8_to_utf16wstring(utf8);
+        OIIO_CHECK_EQUAL(w.size(), 4);
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, utf8);
+    }
+
+    // utf16_to_utf8 with u16string variant
+    {
+        // Basic Multilingual Plane: U+0041 U+00E9 U+6620
+        std::u16string u16 = { char16_t(0x0041), char16_t(0x00E9),
+                               char16_t(0x6620) };
+        std::string utf8   = Strutil::utf16_to_utf8(u16);
+        OIIO_CHECK_EQUAL(utf8, "A\xc3\xa9\xe6\x98\xa0");
+    }
+
+    // utf16_to_utf8 u16string with surrogate pair
+    {
+        // U+1F600 as surrogate pair: 0xD83D 0xDE00
+        std::u16string u16 = { char16_t(0xD83D), char16_t(0xDE00) };
+        std::string utf8   = Strutil::utf16_to_utf8(u16);
+        OIIO_CHECK_EQUAL(utf8, "\xf0\x9f\x98\x80");
+    }
+
+    // Round-trip through wstring for supplementary plane
+    {
+        // U+1D11E (musical symbol G clef)
+        std::string utf8 = "\xf0\x9d\x84\x9e";
+        std::wstring w   = Strutil::utf8_to_utf16wstring(utf8);
+        OIIO_CHECK_EQUAL(w.size(), 2);  // surrogate pair
+        std::string back = Strutil::utf16_to_utf8(w);
+        OIIO_CHECK_EQUAL(back, utf8);
+    }
+}
+
+
+
 int
 main(int argc, char* argv[])
 {
@@ -1810,6 +1912,7 @@ main(int argc, char* argv[])
     test_edit_distance();
     test_base64_encode();
     test_eval_as_bool();
+    test_utf_conversions();
 
     Strutil::debug("debug message\n");