55
66#include < OpenImageIO/platform.h>
77
8- // Special dance to disable warnings in the included files related to
9- // the deprecation of unicode conversion functions.
10- OIIO_PRAGMA_WARNING_PUSH
11- OIIO_CLANG_PRAGMA (clang diagnostic ignored " -Wdeprecated-declarations" )
12- #include < codecvt>
138#include < locale>
14- OIIO_PRAGMA_WARNING_POP
159
1610#include < algorithm>
1711#include < cmath>
@@ -958,62 +952,113 @@ Strutil::replace(string_view str, string_view pattern, string_view replacement,
958952
959953
960954
961- // Conversion functions between UTF-8 and UTF-16 for windows .
955+ // UTF-8 <-> UTF-16 conversion utilities .
962956//
963- // For historical reasons, the standard encoding for strings on windows is
964- // UTF-16, whereas the unix world seems to have settled on UTF-8. These two
965- // encodings can be stored in std::string and std::wstring respectively, with
966- // the caveat that they're both variable-width encodings, so not all the
967- // standard string methods will make sense (for example std::string::size()
968- // won't return the number of glyphs in a UTF-8 string, unless it happens to
969- // be made up of only the 7-bit ASCII subset).
957+ // OIIO uses UTF-8 for all string/path handling. On Windows, OS APIs require
958+ // UTF-16 (wchar_t*), so we convert at API boundaries. Some non-Windows uses
959+ // also exist (e.g., parsing UTF-16 ICC profile metadata).
970960//
971- // The standard windows API functions usually have two versions, a UTF-16
972- // version with a 'W' suffix (using wchar_t* strings), and an ANSI version
973- // with a 'A' suffix (using char* strings) which uses the current windows
974- // code page to define the encoding. (To make matters more confusing there is
975- // also a further "TCHAR" version which is #defined to the UTF-16 or ANSI
976- // version, depending on whether UNICODE is defined during compilation.
977- // This is meant to make it possible to support compiling libraries in
978- // either unicode or ansi mode from the same codebase.)
961+ // On Windows, we use the native MultiByteToWideChar/WideCharToMultiByte APIs.
962+ // On other platforms, we use hand-rolled UTF-8/UTF-16 codec functions below,
963+ // replacing the deprecated std::codecvt_utf8_utf16 (removed in C++26).
979964//
980- // Using std::string as the string container (as in OIIO) implies that we
981- // can't use UTF-16. It also means we need a variable-width encoding to
982- // represent characters in non-Latin alphabets in an unambiguous way; the
983- // obvious candidate is UTF-8. File paths in OIIO are considered to be
984- // represented in UTF-8, and must be converted to UTF-16 before passing to
985- // windows API file opening functions.
986- //
987- // On the other hand, the encoding used for the ANSI versions of the windows
988- // API is the current windows code page. This is more compatible with the
989- // default setup of the standard windows command prompt, and may be more
990- // appropriate for error messages.
965+ // Note: wchar_t is 16-bit on Windows (natural UTF-16) but 32-bit on
966+ // macOS/Linux. The non-Windows path still produces UTF-16 encoding in
967+ // wchar_t units (with surrogate pairs) to match the expected semantics of
968+ // utf8_to_utf16wstring().
969+
// Decode one UTF-8 sequence from the buffer `src` of `len` bytes, starting at
// `src[pos]`, and return its Unicode codepoint.
//
// On success `pos` is advanced past the whole sequence. Malformed input
// yields U+FFFD (the replacement character):
//   * overlong encodings, encoded surrogates (U+D800..U+DFFF) and values
//     above U+10FFFF consume the whole (structurally valid) sequence;
//   * a stray continuation byte, an invalid lead byte, or a sequence that is
//     truncated or has a bad continuation byte consumes exactly one byte.
// If `pos` is already at or past `len` there is nothing to read: U+FFFD is
// returned and `pos` is left unchanged.
static uint32_t
decode_utf8(const char* src, size_t len, size_t& pos)
{
    constexpr uint32_t replacement = 0xFFFD;

    // Never touch memory outside [src, src + len); this also covers an
    // empty buffer (including src == nullptr with len == 0).
    if (pos >= len)
        return replacement;

    auto byte    = [&](size_t i) -> uint8_t { return uint8_t(src[i]); };
    auto is_cont = [](uint8_t b) { return (b & 0xC0) == 0x80; };

    const uint8_t b0 = byte(pos);
    if (b0 < 0x80) {
        // 1 byte: plain ASCII.
        pos += 1;
        return b0;
    } else if ((b0 & 0xE0) == 0xC0 && pos + 1 < len && is_cont(byte(pos + 1))) {
        // 2 bytes: 110xxxxx 10xxxxxx
        const uint32_t cp = (uint32_t(b0 & 0x1F) << 6)
                            | uint32_t(byte(pos + 1) & 0x3F);
        pos += 2;
        return cp >= 0x80 ? cp : replacement;  // reject overlong
    } else if ((b0 & 0xF0) == 0xE0 && pos + 2 < len && is_cont(byte(pos + 1))
               && is_cont(byte(pos + 2))) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        const uint32_t cp = (uint32_t(b0 & 0x0F) << 12)
                            | (uint32_t(byte(pos + 1) & 0x3F) << 6)
                            | uint32_t(byte(pos + 2) & 0x3F);
        pos += 3;
        if (cp < 0x800 || (cp >= 0xD800 && cp <= 0xDFFF))
            return replacement;  // overlong or surrogate
        return cp;
    } else if ((b0 & 0xF8) == 0xF0 && pos + 3 < len && is_cont(byte(pos + 1))
               && is_cont(byte(pos + 2)) && is_cont(byte(pos + 3))) {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        const uint32_t cp = (uint32_t(b0 & 0x07) << 18)
                            | (uint32_t(byte(pos + 1) & 0x3F) << 12)
                            | (uint32_t(byte(pos + 2) & 0x3F) << 6)
                            | uint32_t(byte(pos + 3) & 0x3F);
        pos += 4;
        // Reject overlong forms and anything beyond the Unicode range.
        return (cp >= 0x10000 && cp <= 0x10FFFF) ? cp : replacement;
    }

    // Invalid lead byte, stray continuation byte, or truncated sequence:
    // skip a single byte so the caller can resynchronize on the next one.
    pos += 1;
    return replacement;
}
1008+
1009+
// Encode the Unicode codepoint `cp` as UTF-8 and append the bytes to `out`.
//
// Values that are not Unicode scalar values -- surrogates (U+D800..U+DFFF)
// and anything above U+10FFFF -- have no well-formed UTF-8 representation.
// They are emitted as U+FFFD (the replacement character) so that bad input
// stays visible in the output instead of vanishing or producing an
// ill-formed byte sequence.
static void
encode_utf8(uint32_t cp, std::string& out)
{
    if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF))
        cp = 0xFFFD;

    if (cp < 0x80) {
        // 1 byte: 0xxxxxxx
        out += char(cp);
    } else if (cp < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        out += char(0xC0 | (cp >> 6));
        out += char(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        out += char(0xE0 | (cp >> 12));
        out += char(0x80 | ((cp >> 6) & 0x3F));
        out += char(0x80 | (cp & 0x3F));
    } else {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        // (cp <= 0x10FFFF is guaranteed by the substitution above).
        out += char(0xF0 | (cp >> 18));
        out += char(0x80 | ((cp >> 12) & 0x3F));
        out += char(0x80 | ((cp >> 6) & 0x3F));
        out += char(0x80 | (cp & 0x3F));
    }
}
1030+
9911031
9921032std::wstring
9931033Strutil::utf8_to_utf16wstring (string_view str) noexcept
9941034{
9951035#ifdef _WIN32
9961036 // UTF8<->UTF16 conversions are primarily needed on Windows, so use the
997- // fastest option (C++11 <codecvt> is many times slower due to locale
998- // access overhead, and is deprecated starting with C++17).
1037+ // fastest option.
9991038 std::wstring result;
10001039 result.resize (
10011040 MultiByteToWideChar (CP_UTF8, 0 , str.data (), str.length (), NULL , 0 ));
10021041 MultiByteToWideChar (CP_UTF8, 0 , str.data (), str.length (), result.data (),
10031042 (int )result.size ());
10041043 return result;
10051044#else
1006- try {
1007- OIIO_PRAGMA_WARNING_PUSH
1008- # if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1009- OIIO_GCC_PRAGMA (GCC diagnostic ignored " -Wdeprecated-declarations" )
1010- # endif
1011- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t >, wchar_t > conv;
1012- OIIO_PRAGMA_WARNING_POP
1013- return conv.from_bytes (str.data (), str.data () + str.size ());
1014- } catch (const std::exception&) {
1015- return std::wstring ();
1045+ // Decode UTF-8 into codepoints and encode as UTF-16 stored in wchar_t
1046+ // units (matching the behavior of the now-deprecated codecvt_utf8_utf16).
1047+ std::wstring result;
1048+ result.reserve (str.size ());
1049+ size_t pos = 0 ;
1050+ while (pos < str.size ()) {
1051+ uint32_t cp = decode_utf8 (str.data (), str.size (), pos);
1052+ if (cp < 0x10000 ) {
1053+ result += wchar_t (cp);
1054+ } else {
1055+ // Encode as surrogate pair in wchar_t units
1056+ cp -= 0x10000 ;
1057+ result += wchar_t (0xD800 + (cp >> 10 ));
1058+ result += wchar_t (0xDC00 + (cp & 0x3FF ));
1059+ }
10161060 }
1061+ return result;
10171062#endif
10181063}
10191064
@@ -1024,26 +1069,41 @@ Strutil::utf16_to_utf8(const std::wstring& str) noexcept
10241069{
10251070#ifdef _WIN32
10261071 // UTF8<->UTF16 conversions are primarily needed on Windows, so use the
1027- // fastest option (C++11 <codecvt> is many times slower due to locale
1028- // access overhead, and is deprecated starting with C++17).
1072+ // fastest option.
10291073 std::string result;
10301074 result.resize (WideCharToMultiByte (CP_UTF8, 0 , str.data (), str.length (),
10311075 NULL , 0 , NULL , NULL ));
10321076 WideCharToMultiByte (CP_UTF8, 0 , str.data (), str.length (), &result[0 ],
10331077 (int )result.size (), NULL , NULL );
10341078 return result;
10351079#else
1036- try {
1037- OIIO_PRAGMA_WARNING_PUSH
1038- # if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1039- OIIO_GCC_PRAGMA (GCC diagnostic ignored " -Wdeprecated-declarations" )
1040- # endif
1041- std::wstring_convert<std::codecvt_utf8_utf16<wchar_t >, wchar_t > conv;
1042- OIIO_PRAGMA_WARNING_POP
1043- return conv.to_bytes (str);
1044- } catch (const std::exception&) {
1045- return std::string ();
1080+ // Decode UTF-16 stored in wchar_t units (matching the behavior of the
1081+ // now-deprecated codecvt_utf8_utf16) and encode as UTF-8.
1082+ std::string result;
1083+ result.reserve (str.size () * 2 );
1084+ size_t i = 0 ;
1085+ while (i < str.size ()) {
1086+ uint32_t w = uint32_t (str[i]);
1087+ uint32_t cp;
1088+ if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size ()) {
1089+ uint32_t w2 = uint32_t (str[i + 1 ]);
1090+ if (w2 >= 0xDC00 && w2 <= 0xDFFF ) {
1091+ cp = 0x10000 + ((w - 0xD800 ) << 10 ) + (w2 - 0xDC00 );
1092+ i += 2 ;
1093+ } else {
1094+ cp = 0xFFFD ; // unpaired high surrogate
1095+ i += 1 ;
1096+ }
1097+ } else if (w >= 0xDC00 && w <= 0xDFFF ) {
1098+ cp = 0xFFFD ; // unpaired low surrogate
1099+ i += 1 ;
1100+ } else {
1101+ cp = w;
1102+ i += 1 ;
1103+ }
1104+ encode_utf8 (cp, result);
10461105 }
1106+ return result;
10471107#endif
10481108}
10491109
@@ -1060,17 +1120,32 @@ Strutil::utf16_to_utf8(const std::u16string& str) noexcept
10601120 &result[0 ], (int )result.size (), NULL , NULL );
10611121 return result;
10621122#else
1063- try {
1064- OIIO_PRAGMA_WARNING_PUSH
1065- # if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1066- OIIO_GCC_PRAGMA (GCC diagnostic ignored " -Wdeprecated-declarations" )
1067- # endif
1068- std::wstring_convert<std::codecvt_utf8_utf16<char16_t >, char16_t > conv;
1069- return conv.to_bytes (str);
1070- OIIO_PRAGMA_WARNING_POP
1071- } catch (const std::exception&) {
1072- return std::string ();
1123+ // Decode UTF-16 from char16_t units and encode as UTF-8.
1124+ std::string result;
1125+ result.reserve (str.size () * 2 );
1126+ size_t i = 0 ;
1127+ while (i < str.size ()) {
1128+ uint32_t w = uint32_t (str[i]);
1129+ uint32_t cp;
1130+ if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size ()) {
1131+ uint32_t w2 = uint32_t (str[i + 1 ]);
1132+ if (w2 >= 0xDC00 && w2 <= 0xDFFF ) {
1133+ cp = 0x10000 + ((w - 0xD800 ) << 10 ) + (w2 - 0xDC00 );
1134+ i += 2 ;
1135+ } else {
1136+ cp = 0xFFFD ;
1137+ i += 1 ;
1138+ }
1139+ } else if (w >= 0xDC00 && w <= 0xDFFF ) {
1140+ cp = 0xFFFD ;
1141+ i += 1 ;
1142+ } else {
1143+ cp = w;
1144+ i += 1 ;
1145+ }
1146+ encode_utf8 (cp, result);
10731147 }
1148+ return result;
10741149#endif
10751150}
10761151
0 commit comments