Skip to content

Commit 201fecb

Browse files
authored
int: Remove deprecated std::codecvt from strutil.cpp internals (#5107)
For the sake of Windows' need for wstring filenames, we do some utf8<->utf16 conversion in strutil.cpp. That has always used the std::codecvt set of functions, but they have been deprecated beginning in C++17 and will be removed entirely in C++26, so their days are numbered and we already have to suppress warnings to get the compiler to shut up about it. Assisted-by: Claude Code / Opus 4.6 Signed-off-by: Larry Gritz <lg@larrygritz.com>
1 parent 5b481fd commit 201fecb

File tree

2 files changed

+245
-67
lines changed

2 files changed

+245
-67
lines changed

src/libutil/strutil.cpp

Lines changed: 142 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,7 @@
55

66
#include <OpenImageIO/platform.h>
77

8-
// Special dance to disable warnings in the included files related to
9-
// the deprecation of unicode conversion functions.
10-
OIIO_PRAGMA_WARNING_PUSH
11-
OIIO_CLANG_PRAGMA(clang diagnostic ignored "-Wdeprecated-declarations")
12-
#include <codecvt>
138
#include <locale>
14-
OIIO_PRAGMA_WARNING_POP
159

1610
#include <algorithm>
1711
#include <cmath>
@@ -958,62 +952,113 @@ Strutil::replace(string_view str, string_view pattern, string_view replacement,
958952

959953

960954

961-
// Conversion functions between UTF-8 and UTF-16 for windows.
955+
// UTF-8 <-> UTF-16 conversion utilities.
962956
//
963-
// For historical reasons, the standard encoding for strings on windows is
964-
// UTF-16, whereas the unix world seems to have settled on UTF-8. These two
965-
// encodings can be stored in std::string and std::wstring respectively, with
966-
// the caveat that they're both variable-width encodings, so not all the
967-
// standard string methods will make sense (for example std::string::size()
968-
// won't return the number of glyphs in a UTF-8 string, unless it happens to
969-
// be made up of only the 7-bit ASCII subset).
957+
// OIIO uses UTF-8 for all string/path handling. On Windows, OS APIs require
958+
// UTF-16 (wchar_t*), so we convert at API boundaries. Some non-Windows uses
959+
// also exist (e.g., parsing UTF-16 ICC profile metadata).
970960
//
971-
// The standard windows API functions usually have two versions, a UTF-16
972-
// version with a 'W' suffix (using wchar_t* strings), and an ANSI version
973-
// with a 'A' suffix (using char* strings) which uses the current windows
974-
// code page to define the encoding. (To make matters more confusing there is
975-
// also a further "TCHAR" version which is #defined to the UTF-16 or ANSI
976-
// version, depending on whether UNICODE is defined during compilation.
977-
// This is meant to make it possible to support compiling libraries in
978-
// either unicode or ansi mode from the same codebase.)
961+
// On Windows, we use the native MultiByteToWideChar/WideCharToMultiByte APIs.
962+
// On other platforms, we use hand-rolled UTF-8/UTF-16 codec functions below,
963+
// replacing the deprecated std::codecvt_utf8_utf16 (removed in C++26).
979964
//
980-
// Using std::string as the string container (as in OIIO) implies that we
981-
// can't use UTF-16. It also means we need a variable-width encoding to
982-
// represent characters in non-Latin alphabets in an unambiguous way; the
983-
// obvious candidate is UTF-8. File paths in OIIO are considered to be
984-
// represented in UTF-8, and must be converted to UTF-16 before passing to
985-
// windows API file opening functions.
986-
//
987-
// On the other hand, the encoding used for the ANSI versions of the windows
988-
// API is the current windows code page. This is more compatible with the
989-
// default setup of the standard windows command prompt, and may be more
990-
// appropriate for error messages.
965+
// Note: wchar_t is 16-bit on Windows (natural UTF-16) but 32-bit on
966+
// macOS/Linux. The non-Windows path still produces UTF-16 encoding in
967+
// wchar_t units (with surrogate pairs) to match the expected semantics of
968+
// utf8_to_utf16wstring().
969+
970+
// Decode one UTF-8 sequence starting at `src[pos]`, advance `pos` past it,
// and return the decoded code point.
//
// Parameters:
//   src  Pointer to the UTF-8 bytes (need not be NUL-terminated).
//   len  Number of valid bytes in `src`.
//   pos  In/out byte offset of the sequence to decode.
//
// Returns the code point, or U+FFFD (REPLACEMENT CHARACTER) on malformed
// input. How far `pos` moves on malformed input:
//   * invalid lead byte, stray continuation byte, or a sequence that is
//     truncated / has a non-continuation trail byte: one byte is skipped;
//   * structurally complete sequence that decodes to an overlong form, a
//     surrogate (U+D800..U+DFFF), or a value above U+10FFFF: the whole
//     sequence is skipped.
// If there is nothing to read (`src` is null or `pos >= len`), U+FFFD is
// returned and `pos` is left unchanged, rather than reading out of bounds.
static uint32_t
decode_utf8(const char* src, size_t len, size_t& pos)
{
    constexpr uint32_t replacement = 0xFFFD;

    // Guard against empty input / a position at or past the end.
    if (src == nullptr || pos >= len)
        return replacement;

    const uint32_t lead = static_cast<uint8_t>(src[pos]);
    if (lead < 0x80) {  // 7-bit ASCII, the common case
        ++pos;
        return lead;
    }

    // Classify the lead byte: how many continuation bytes follow, which
    // payload bits the lead byte contributes, and the smallest code point
    // that legitimately needs a sequence of this length (anything smaller
    // is an overlong encoding).
    size_t ntrail   = 0;
    uint32_t cp     = 0;
    uint32_t min_cp = 0;
    if ((lead & 0xE0) == 0xC0) {
        ntrail = 1;
        cp     = lead & 0x1F;
        min_cp = 0x80;
    } else if ((lead & 0xF0) == 0xE0) {
        ntrail = 2;
        cp     = lead & 0x0F;
        min_cp = 0x800;
    } else if ((lead & 0xF8) == 0xF0) {
        ntrail = 3;
        cp     = lead & 0x07;
        min_cp = 0x10000;
    } else {
        ++pos;  // stray continuation byte or invalid lead (0xF8..0xFF)
        return replacement;
    }

    // Truncated sequence: not enough bytes left for all the trail bytes.
    if (len - pos <= ntrail) {
        ++pos;
        return replacement;
    }

    // Accumulate 6 payload bits from each continuation byte (10xxxxxx).
    for (size_t k = 1; k <= ntrail; ++k) {
        const uint32_t trail = static_cast<uint8_t>(src[pos + k]);
        if ((trail & 0xC0) != 0x80) {
            ++pos;  // skip only the lead byte; resynchronize on the next one
            return replacement;
        }
        cp = (cp << 6) | (trail & 0x3F);
    }
    pos += ntrail + 1;

    // Structurally valid, but reject values UTF-8 must never carry.
    const bool overlong     = cp < min_cp;
    const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    const bool out_of_range = cp > 0x10FFFF;
    return (overlong || is_surrogate || out_of_range) ? replacement : cp;
}
1008+
1009+
1010+
// Encode the Unicode code point `cp` as UTF-8, appending the bytes to `out`.
//
// Only Unicode scalar values (U+0000..U+D7FF and U+E000..U+10FFFF) have a
// well-formed UTF-8 encoding. Any other value -- a surrogate code point
// (U+D800..U+DFFF, e.g. an unpaired UTF-16 surrogate handed over by a
// caller) or a value above U+10FFFF (possible when the input came from a
// 32-bit wchar_t) -- is emitted as U+FFFD (REPLACEMENT CHARACTER) instead,
// so the output is always valid UTF-8 and no input unit silently vanishes.
static void
encode_utf8(uint32_t cp, std::string& out)
{
    const bool is_surrogate = cp >= 0xD800 && cp <= 0xDFFF;
    if (is_surrogate || cp > 0x10FFFF)
        cp = 0xFFFD;

    if (cp < 0x80) {
        // 0xxxxxxx
        out += static_cast<char>(cp);
    } else if (cp < 0x800) {
        // 110xxxxx 10xxxxxx
        out += static_cast<char>(0xC0 | (cp >> 6));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else if (cp < 0x10000) {
        // 1110xxxx 10xxxxxx 10xxxxxx
        out += static_cast<char>(0xE0 | (cp >> 12));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    } else {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (cp <= 0x10FFFF is guaranteed
        // by the replacement step above)
        out += static_cast<char>(0xF0 | (cp >> 18));
        out += static_cast<char>(0x80 | ((cp >> 12) & 0x3F));
        out += static_cast<char>(0x80 | ((cp >> 6) & 0x3F));
        out += static_cast<char>(0x80 | (cp & 0x3F));
    }
}
1030+
9911031

9921032
std::wstring
9931033
Strutil::utf8_to_utf16wstring(string_view str) noexcept
9941034
{
9951035
#ifdef _WIN32
9961036
// UTF8<->UTF16 conversions are primarily needed on Windows, so use the
997-
// fastest option (C++11 <codecvt> is many times slower due to locale
998-
// access overhead, and is deprecated starting with C++17).
1037+
// fastest option.
9991038
std::wstring result;
10001039
result.resize(
10011040
MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), NULL, 0));
10021041
MultiByteToWideChar(CP_UTF8, 0, str.data(), str.length(), result.data(),
10031042
(int)result.size());
10041043
return result;
10051044
#else
1006-
try {
1007-
OIIO_PRAGMA_WARNING_PUSH
1008-
# if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1009-
OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
1010-
# endif
1011-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv;
1012-
OIIO_PRAGMA_WARNING_POP
1013-
return conv.from_bytes(str.data(), str.data() + str.size());
1014-
} catch (const std::exception&) {
1015-
return std::wstring();
1045+
// Decode UTF-8 into codepoints and encode as UTF-16 stored in wchar_t
1046+
// units (matching the behavior of the now-deprecated codecvt_utf8_utf16).
1047+
std::wstring result;
1048+
result.reserve(str.size());
1049+
size_t pos = 0;
1050+
while (pos < str.size()) {
1051+
uint32_t cp = decode_utf8(str.data(), str.size(), pos);
1052+
if (cp < 0x10000) {
1053+
result += wchar_t(cp);
1054+
} else {
1055+
// Encode as surrogate pair in wchar_t units
1056+
cp -= 0x10000;
1057+
result += wchar_t(0xD800 + (cp >> 10));
1058+
result += wchar_t(0xDC00 + (cp & 0x3FF));
1059+
}
10161060
}
1061+
return result;
10171062
#endif
10181063
}
10191064

@@ -1024,26 +1069,41 @@ Strutil::utf16_to_utf8(const std::wstring& str) noexcept
10241069
{
10251070
#ifdef _WIN32
10261071
// UTF8<->UTF16 conversions are primarily needed on Windows, so use the
1027-
// fastest option (C++11 <codecvt> is many times slower due to locale
1028-
// access overhead, and is deprecated starting with C++17).
1072+
// fastest option.
10291073
std::string result;
10301074
result.resize(WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(),
10311075
NULL, 0, NULL, NULL));
10321076
WideCharToMultiByte(CP_UTF8, 0, str.data(), str.length(), &result[0],
10331077
(int)result.size(), NULL, NULL);
10341078
return result;
10351079
#else
1036-
try {
1037-
OIIO_PRAGMA_WARNING_PUSH
1038-
# if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1039-
OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
1040-
# endif
1041-
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> conv;
1042-
OIIO_PRAGMA_WARNING_POP
1043-
return conv.to_bytes(str);
1044-
} catch (const std::exception&) {
1045-
return std::string();
1080+
// Decode UTF-16 stored in wchar_t units (matching the behavior of the
1081+
// now-deprecated codecvt_utf8_utf16) and encode as UTF-8.
1082+
std::string result;
1083+
result.reserve(str.size() * 2);
1084+
size_t i = 0;
1085+
while (i < str.size()) {
1086+
uint32_t w = uint32_t(str[i]);
1087+
uint32_t cp;
1088+
if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size()) {
1089+
uint32_t w2 = uint32_t(str[i + 1]);
1090+
if (w2 >= 0xDC00 && w2 <= 0xDFFF) {
1091+
cp = 0x10000 + ((w - 0xD800) << 10) + (w2 - 0xDC00);
1092+
i += 2;
1093+
} else {
1094+
cp = 0xFFFD; // unpaired high surrogate
1095+
i += 1;
1096+
}
1097+
} else if (w >= 0xDC00 && w <= 0xDFFF) {
1098+
cp = 0xFFFD; // unpaired low surrogate
1099+
i += 1;
1100+
} else {
1101+
cp = w;
1102+
i += 1;
1103+
}
1104+
encode_utf8(cp, result);
10461105
}
1106+
return result;
10471107
#endif
10481108
}
10491109

@@ -1060,17 +1120,32 @@ Strutil::utf16_to_utf8(const std::u16string& str) noexcept
10601120
&result[0], (int)result.size(), NULL, NULL);
10611121
return result;
10621122
#else
1063-
try {
1064-
OIIO_PRAGMA_WARNING_PUSH
1065-
# if defined(__clang__) || OIIO_GNUC_VERSION >= 150000
1066-
OIIO_GCC_PRAGMA(GCC diagnostic ignored "-Wdeprecated-declarations")
1067-
# endif
1068-
std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
1069-
return conv.to_bytes(str);
1070-
OIIO_PRAGMA_WARNING_POP
1071-
} catch (const std::exception&) {
1072-
return std::string();
1123+
// Decode UTF-16 from char16_t units and encode as UTF-8.
1124+
std::string result;
1125+
result.reserve(str.size() * 2);
1126+
size_t i = 0;
1127+
while (i < str.size()) {
1128+
uint32_t w = uint32_t(str[i]);
1129+
uint32_t cp;
1130+
if (w >= 0xD800 && w <= 0xDBFF && i + 1 < str.size()) {
1131+
uint32_t w2 = uint32_t(str[i + 1]);
1132+
if (w2 >= 0xDC00 && w2 <= 0xDFFF) {
1133+
cp = 0x10000 + ((w - 0xD800) << 10) + (w2 - 0xDC00);
1134+
i += 2;
1135+
} else {
1136+
cp = 0xFFFD;
1137+
i += 1;
1138+
}
1139+
} else if (w >= 0xDC00 && w <= 0xDFFF) {
1140+
cp = 0xFFFD;
1141+
i += 1;
1142+
} else {
1143+
cp = w;
1144+
i += 1;
1145+
}
1146+
encode_utf8(cp, result);
10731147
}
1148+
return result;
10741149
#endif
10751150
}
10761151

src/libutil/strutil_test.cpp

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,6 +1759,108 @@ getargs(int argc, char* argv[])
17591759

17601760

17611761

1762+
void
1763+
test_utf_conversions()
1764+
{
1765+
std::cout << "Testing utf8_to_utf16wstring / utf16_to_utf8\n";
1766+
1767+
// ASCII round-trip
1768+
{
1769+
std::string ascii = "hello";
1770+
std::wstring w = Strutil::utf8_to_utf16wstring(ascii);
1771+
OIIO_CHECK_EQUAL(w.size(), 5);
1772+
OIIO_CHECK_EQUAL((int)w[0], (int)L'h');
1773+
OIIO_CHECK_EQUAL((int)w[4], (int)L'o');
1774+
std::string back = Strutil::utf16_to_utf8(w);
1775+
OIIO_CHECK_EQUAL(back, ascii);
1776+
}
1777+
1778+
// Empty string
1779+
{
1780+
std::wstring w = Strutil::utf8_to_utf16wstring("");
1781+
OIIO_CHECK_EQUAL(w.size(), 0);
1782+
std::string s = Strutil::utf16_to_utf8(std::wstring());
1783+
OIIO_CHECK_EQUAL(s.size(), 0);
1784+
std::string s2 = Strutil::utf16_to_utf8(std::u16string());
1785+
OIIO_CHECK_EQUAL(s2.size(), 0);
1786+
}
1787+
1788+
// 2-byte UTF-8 (Latin/Greek/Cyrillic, U+0080..U+07FF)
1789+
// "café" = U+0063 U+0061 U+0066 U+00E9
1790+
{
1791+
std::string utf8 = "caf\xc3\xa9"; // café in UTF-8
1792+
std::wstring w = Strutil::utf8_to_utf16wstring(utf8);
1793+
OIIO_CHECK_EQUAL(w.size(), 4);
1794+
OIIO_CHECK_EQUAL((int)w[3], 0x00E9);
1795+
std::string back = Strutil::utf16_to_utf8(w);
1796+
OIIO_CHECK_EQUAL(back, utf8);
1797+
}
1798+
1799+
// 3-byte UTF-8 (CJK, U+0800..U+FFFF)
1800+
// U+6620 U+753B = "映画" (movie in Japanese)
1801+
{
1802+
std::string utf8 = "\xe6\x98\xa0\xe7\x94\xbb";
1803+
std::wstring w = Strutil::utf8_to_utf16wstring(utf8);
1804+
OIIO_CHECK_EQUAL(w.size(), 2);
1805+
OIIO_CHECK_EQUAL((int)w[0], 0x6620);
1806+
OIIO_CHECK_EQUAL((int)w[1], 0x753B);
1807+
std::string back = Strutil::utf16_to_utf8(w);
1808+
OIIO_CHECK_EQUAL(back, utf8);
1809+
}
1810+
1811+
// 4-byte UTF-8 / surrogate pairs (U+10000..U+10FFFF)
1812+
// U+1F600 (grinning face emoji)
1813+
{
1814+
std::string utf8 = "\xf0\x9f\x98\x80";
1815+
std::wstring w = Strutil::utf8_to_utf16wstring(utf8);
1816+
// Should be encoded as surrogate pair: 0xD83D 0xDE00
1817+
OIIO_CHECK_EQUAL(w.size(), 2);
1818+
OIIO_CHECK_EQUAL((int)w[0], 0xD83D);
1819+
OIIO_CHECK_EQUAL((int)w[1], 0xDE00);
1820+
std::string back = Strutil::utf16_to_utf8(w);
1821+
OIIO_CHECK_EQUAL(back, utf8);
1822+
}
1823+
1824+
// Mixed ASCII + multibyte round-trip
1825+
{
1826+
// "Ñoño" = U+00D1 U+006F U+00F1 U+006F
1827+
std::string utf8 = "\xc3\x91o\xc3\xb1o";
1828+
std::wstring w = Strutil::utf8_to_utf16wstring(utf8);
1829+
OIIO_CHECK_EQUAL(w.size(), 4);
1830+
std::string back = Strutil::utf16_to_utf8(w);
1831+
OIIO_CHECK_EQUAL(back, utf8);
1832+
}
1833+
1834+
// utf16_to_utf8 with u16string variant
1835+
{
1836+
// Basic Multilingual Plane: U+0041 U+00E9 U+6620
1837+
std::u16string u16 = { char16_t(0x0041), char16_t(0x00E9),
1838+
char16_t(0x6620) };
1839+
std::string utf8 = Strutil::utf16_to_utf8(u16);
1840+
OIIO_CHECK_EQUAL(utf8, "A\xc3\xa9\xe6\x98\xa0");
1841+
}
1842+
1843+
// utf16_to_utf8 u16string with surrogate pair
1844+
{
1845+
// U+1F600 as surrogate pair: 0xD83D 0xDE00
1846+
std::u16string u16 = { char16_t(0xD83D), char16_t(0xDE00) };
1847+
std::string utf8 = Strutil::utf16_to_utf8(u16);
1848+
OIIO_CHECK_EQUAL(utf8, "\xf0\x9f\x98\x80");
1849+
}
1850+
1851+
// Round-trip through wstring for another supplementary-plane character
1852+
{
1853+
// U+1D11E (musical symbol G clef)
1854+
std::string utf8 = "\xf0\x9d\x84\x9e";
1855+
std::wstring w = Strutil::utf8_to_utf16wstring(utf8);
1856+
OIIO_CHECK_EQUAL(w.size(), 2); // surrogate pair
1857+
std::string back = Strutil::utf16_to_utf8(w);
1858+
OIIO_CHECK_EQUAL(back, utf8);
1859+
}
1860+
}
1861+
1862+
1863+
17621864
int
17631865
main(int argc, char* argv[])
17641866
{
@@ -1810,6 +1912,7 @@ main(int argc, char* argv[])
18101912
test_edit_distance();
18111913
test_base64_encode();
18121914
test_eval_as_bool();
1915+
test_utf_conversions();
18131916

18141917
Strutil::debug("debug message\n");
18151918

0 commit comments

Comments
 (0)