Use byte-level widening/narrowing on Linux SQLWCHAR paths

fdcastel · fdcastel · commit bea6403a6522 · 2026-04-19T16:10:57.000-03:00
Follow-up to the ConvertingString discussion in #289.

The 8-byte floor in Alloc() introduced by the previous commit was a crutch: it avoided the strcpy heap overflow for the SQL-state case but left the destructor's mbstowcs((wchar_t*)unicodeString, ..., lengthString) writing 4-byte wchar_t units into a caller buffer that is SQLWCHAR-sized (2 bytes). Even when that did not overflow, the data was wrong — 'HY000' became 'H' after the client reinterpreted the bytes as UCS-2.

Replace the Linux non-connection paths in both directions with the same byte-widening / byte-narrowing loop that unixODBC itself uses internally (ansi_to_unicode_copy / unicode_to_ansi_copy). This is correct for the ASCII-only error/state strings that reach this code; non-ASCII handling remains the subject of the broader rewrite tracked as Tier 9.1 in #287.

Changes in MainUnicode.cpp:
- lengthString now uses sizeof(SQLWCHAR) again (correct SQLWCHAR count).
- Destructor Linux branch widens bytes into SQLWCHAR units.
- convUnicodeToString Linux branch narrows SQLWCHAR to low bytes.
- Temporary NUL is written as a SQLWCHAR-sized zero (not wchar_t).
- Remove the Alloc() floor — no longer needed.
- Add sqlwcharLen() helper; wcslen() on SQLWCHAR data is unsafe on Linux.
diff --git a/MainUnicode.cpp b/MainUnicode.cpp
@@ -38,6 +38,21 @@
 extern FILE	*logFile;
 using namespace OdbcJdbcLibrary;
 
+#ifndef _WINDOWS
+// Returns the number of SQLWCHAR units that precede the terminating zero unit in `s`; a null `s` yields 0.
+// Compiled only when _WINDOWS is not defined. Needed on Linux, where sizeof(wchar_t) != sizeof(SQLWCHAR).
+// Do NOT use wcslen() on SQLWCHAR data on Linux — it reads two SQLWCHARs per wchar_t and can run off the end.
+static size_t sqlwcharLen( const SQLWCHAR *s )
+{
+	size_t n = 0;
+	if ( !s )	// a null pointer is treated as an empty string
+		return 0;
+	while ( s[n] )	// advance until the first zero SQLWCHAR unit
+		++n;
+	return n;
+}
+#endif
+
 #ifdef _WINDOWS
 extern UINT codePage; // from Main.cpp
 #endif
@@ -85,7 +100,7 @@ class ConvertingString
 			if ( length == SQL_NTS )
 				lengthString = 0;
 			else if ( retCountOfBytes )
-				lengthString = length / sizeof(wchar_t);
+				lengthString = length / sizeof(SQLWCHAR);
 			else
 				lengthString = length;
 		}
@@ -135,13 +150,33 @@ class ConvertingString
 					if ( len > 0 )
 						len--;
 #else
-					len = mbstowcs( (wchar_t*)unicodeString, (const char*)byteString, lengthString );
+					// SQLWCHAR is 2 bytes on Linux (unixODBC defines it as unsigned short),
+					// but wchar_t is 4 bytes, so mbstowcs((wchar_t*)unicodeString, ...)
+					// both corrupts the output and risks overflowing the caller's buffer.
+					// Widen byte-by-byte into SQLWCHAR units, matching what unixODBC's
+					// ansi_to_unicode_copy() does internally. This is correct for the
+					// ASCII-only error/state strings that reach this code path; non-ASCII
+					// input will be handled by the broader ConvertingString rewrite tracked
+					// in issue #287 (Tier 9.1).
+					{
+						const SQLCHAR *src = byteString;
+						size_t i = 0;
+						while ( i < (size_t)lengthString && src[i] != 0 )
+						{
+							unicodeString[i] = (SQLWCHAR)( src[i] & 0xFF );
+							++i;
+						}
+						len = i;
+					}
 #endif
 				}
 
 				if ( len > 0 )
 				{
-					*(LPWSTR)(unicodeString + len) = L'\0';
+					// NUL-terminate in SQLWCHAR units. LPWSTR assignment of L'\0' writes
+					// sizeof(wchar_t) bytes, which overruns the output buffer by 2 bytes
+					// on Linux.
+					unicodeString[len] = 0;
 
 					if ( realLength )
 					{
@@ -170,12 +205,18 @@ class ConvertingString
 		wchar_t saveWC;
 
 		if ( length == SQL_NTS )
+#ifdef _WINDOWS
 			length = (int)wcslen( (const wchar_t*)wcString );
-		else if ( wcString[length] != L'\0' )
+#else
+			length = (int)sqlwcharLen( wcString );
+#endif
+		else if ( wcString[length] != 0 )
 		{
 			ptEndWC = (wchar_t*)&wcString[length];
 			saveWC = *ptEndWC;
-			*ptEndWC = L'\0';
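+			// Write a SQLWCHAR-sized NUL so this store does not overrun the input by 2 bytes on Linux (wchar_t is 4 bytes there).
+			// NOTE(review): `saveWC = *ptEndWC` just above still reads through a wchar_t*, i.e. 4 bytes on Linux — TODO make the save SQLWCHAR-sized too.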
+			wcString[length] = 0;
 		}
 
 		if ( connection )
@@ -185,7 +226,10 @@ class ConvertingString
 #ifdef _WINDOWS
 			bytesNeeded = WideCharToMultiByte( codePage, (DWORD)0, wcString, length, NULL, (int)0, NULL, NULL );
 #else
-			bytesNeeded = wcstombs( NULL, (const wchar_t*)wcString, length );
+			// See the symmetric comment in the destructor above: wcstombs assumes
+			// wchar_t-sized input, which corrupts SQLWCHAR data on Linux. The
+			// byte-narrowing loop below produces at most `length` output bytes (it also stops at a zero unit).
+			bytesNeeded = (size_t)length;
 #endif
 		}
 
@@ -198,7 +242,15 @@ class ConvertingString
 #ifdef _WINDOWS
 			bytesNeeded = WideCharToMultiByte( codePage, 0, wcString, length, (LPSTR)byteString, (int)bytesNeeded, NULL, NULL );
 #else
-			bytesNeeded = wcstombs( (char *)byteString, (const wchar_t*)wcString, bytesNeeded );
+			{
+				size_t i = 0;
+				while ( i < (size_t)length && wcString[i] != 0 )
+				{
+					byteString[i] = (SQLCHAR)( wcString[i] & 0xFF );
+					++i;
+				}
+				bytesNeeded = i;
+			}
 #endif
 		}
 
@@ -219,16 +271,8 @@ class ConvertingString
 		case BYTESCHARS:
 			if ( lengthString )
 			{
-				// Floor the internal buffer at 8 bytes so that callers which pass a
-				// small SQLWCHAR output buffer (e.g. SQLGetDiagRecW with a 12-byte
-				// SQL state, yielding lengthString=3 on Linux where sizeof(wchar_t)=4)
-				// still have room for the 6-byte SQL state ("HY000\0") that
-				// OdbcError::sqlGetDiagRec strcpy's into this buffer. Keeping
-				// lengthString itself unchanged preserves the mbstowcs writeback
-				// bound and avoids smashing the caller's stack buffer.
-				const size_t bufSize = (lengthString + 2 < 8) ? 8 : (size_t)lengthString + 2;
-				byteString = new SQLCHAR[ bufSize ];
-				memset(byteString, 0, bufSize);
+				byteString = new SQLCHAR[ lengthString + 2 ];
+				memset( byteString, 0, lengthString + 2 );
 			}
 			else
 				byteString = NULL;