Skip to content

Commit a260ca6

Browse files
committed
fix(transport): decode BOM-less utf-16 response bodies as big-endian
When a response declares Content-Type charset=utf-16 with no byte-order mark, the SDKs disagreed on byte order — several decoded little-endian — so identical bytes produced different strings across languages. These are HTTP clients decoding by the IANA/MIME charset registration, and RFC 2781 specifies UTF-16 with no BOM defaults to big-endian. Standardize on big-endian for a BOM-less utf-16 label across all transports (csharp, go, python, ruby, rust, swift, node were little-endian; dart, java, kotlin, php, elixir already big-endian). An explicit utf-16le/utf-16be label and a present BOM are still honored. All 12 SDKs gain a parity test decoding the BOM-less UTF-16BE bytes of "Pet" and asserting the result, plus that the little-endian interpretation differs (so the big-endian choice is load-bearing). Also drops a stray HTML entity (&gt;) from the C# concurrent-redirect test's doc comment, restoring CSharpFormattingSpec.
1 parent 043a8f2 commit a260ca6

40 files changed

Lines changed: 913 additions & 14 deletions

File tree

src/main/resources/templates/csharp/default_api_client.mustache

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1033,7 +1033,20 @@ public sealed class DefaultApiClient : IApiClient, IDisposable
10331033
System.Net.Mime.ContentType media = new(contentType);
10341034
if (!string.IsNullOrEmpty(media.CharSet))
10351035
{
1036-
return Encoding.GetEncoding(media.CharSet);
1036+
string charset = media.CharSet.Trim();
1037+
/* A bare "utf-16" / "utf16" label (no endianness suffix, no BOM)
1038+
must decode big-endian per RFC 2781: a UTF-16 stream lacking a
1039+
BOM defaults to big-endian. .NET's Encoding.GetEncoding("utf-16")
1040+
returns a little-endian UnicodeEncoding, which would disagree
1041+
with the other SDKs, so resolve bare utf-16 to UTF-16BE here.
1042+
Explicit "utf-16le" / "utf-16be" labels fall through to
1043+
Encoding.GetEncoding so their declared byte order is honored. */
1044+
if (string.Equals(charset, "utf-16", StringComparison.OrdinalIgnoreCase)
1045+
|| string.Equals(charset, "utf16", StringComparison.OrdinalIgnoreCase))
1046+
{
1047+
return new UnicodeEncoding(bigEndian: true, byteOrderMark: false);
1048+
}
1049+
return Encoding.GetEncoding(charset);
10371050
}
10381051
}
10391052
catch (FormatException)

src/main/resources/templates/csharp/test/DefaultApiClientTest.mustache

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -700,7 +700,7 @@ public class DefaultApiClientTest
700700
}
701701
702702
/// <summary>
703-
/// Stateless redirect-chain handler: a request to /chain/{n} with n &gt; 0
703+
/// Stateless redirect-chain handler: a request to /chain/{n} with n above 0
704704
/// returns a 302 pointing at /chain/{n-1}; /chain/0 returns 200 OK. The
705705
/// chain length lives entirely in the URL, so the handler holds no
706706
/// per-request state and can serve many concurrent chains of different

src/main/resources/templates/csharp/test/DefaultApiClientUnitTest.mustache

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,31 @@ public class DefaultApiClientUnitTest
598598
Assert.Equal("héllo", response.Body);
599599
}
600600
601+
[Fact]
602+
public async Task DecodesBomlessUtf16BodyAsBigEndian()
603+
{
604+
// RFC 2781: a UTF-16 stream with no BOM defaults to big-endian. A bare
605+
// "charset=utf-16" (no endianness suffix, no BOM) must therefore decode
606+
// big-endian, matching the other SDKs. .NET's Encoding.GetEncoding(
607+
// "utf-16") is little-endian, so this would have decoded to garbage
608+
// before the fix.
609+
byte[] utf16BeBytes = { 0x00, 0x50, 0x00, 0x65, 0x00, 0x74 }; // "Pet" UTF-16BE
610+
var handler = new RawByteHandler(
611+
HttpStatusCode.OK,
612+
utf16BeBytes,
613+
"text/plain; charset=utf-16");
614+
var client = new DefaultApiClient(new HttpClient(handler));
615+
var response = await client.SendRequestAsync(
616+
"GET", new Uri("http://example.com/utf16-no-bom"),
617+
new Dictionary<string, string>(), null);
618+
Assert.Equal("Pet", response.Body);
619+
620+
// Sanity check that the choice is load-bearing: interpreting the very
621+
// same bytes little-endian yields a different (non-"Pet") string, so a
622+
// passing assertion above can only mean big-endian decoding was used.
623+
Assert.NotEqual("Pet", Encoding.Unicode.GetString(utf16BeBytes));
624+
}
625+
601626
// ---- 3.1: SensitiveHeaderNames default contents ----
602627

603628
[Fact]

src/main/resources/templates/dart/test/default_api_client_unit_test.mustache

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -843,6 +843,47 @@ void main() {
843843
}
844844
});
845845

846+
/* utf16-no-bom-big-endian: a charset=utf-16 response body WITHOUT a
847+
* byte-order mark must default to big-endian (RFC 2781), uniformly
848+
* with the other 11 SDKs. The bytes 00 50 00 65 00 74 are "Pet" when
849+
* read big-endian; read little-endian they would decode to U+5000
850+
* U+6500 U+7400 instead, so a correct decode proves the byte order. */
851+
test('decodes a BOM-less charset=utf-16 body as big-endian', () async {
852+
final server = await HttpServer.bind(InternetAddress.loopbackIPv4, 0);
853+
// UTF-16BE for "Pet", no BOM.
854+
final payload = <int>[
855+
0x00, 0x50, // P
856+
0x00, 0x65, // e
857+
0x00, 0x74, // t
858+
];
859+
server.listen((request) {
860+
request.response
861+
..statusCode = 200
862+
..headers.set('Content-Type', 'text/plain; charset=utf-16')
863+
..add(payload)
864+
..close();
865+
});
866+
867+
try {
868+
final client = DefaultApiClient();
869+
final resp = await client.sendRequest(
870+
'GET',
871+
'http://127.0.0.1:${server.port}/utf16-nobom',
872+
{},
873+
null,
874+
);
875+
expect(resp.statusCode, equals(200));
876+
expect(resp.body, equals('Pet'));
877+
// Prove the big-endian choice: the same bytes read little-endian
878+
// would not yield "Pet".
879+
expect(resp.body, isNot(equals(String.fromCharCodes(
880+
<int>[0x5000, 0x6500, 0x7400],
881+
))));
882+
} finally {
883+
await server.close();
884+
}
885+
});
886+
846887
/* dart-charset-utf16-mojibake (lenient fallback): an unrecognised
847888
* charset with non-UTF-8 bytes must fall back to a lossy UTF-8
848889
* decode rather than throwing a FormatException. */

src/main/resources/templates/elixir/test/base_api_test.mustache

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -770,6 +770,32 @@ defmodule {{moduleName}}.Api.BaseApiTest do
770770
assert result == bytes
771771
end
772772

773+
# WAVE F: a BOM-less utf-16 response body must decode as big-endian per
774+
# RFC 2781. The bytes 00 50 00 65 00 74 are "Pet" in UTF-16BE; the same
775+
# bytes read little-endian would be a different (CJK) string, so decoding
776+
# to exactly "Pet" proves the big-endian default. Erlang's bare :utf16
777+
# encoding atom is {:utf16, :big}, so this SDK already honours the rule.
778+
test "decode_text_body decodes BOM-less utf-16 as big-endian (RFC 2781)" do
779+
bytes = <<0x00, 0x50, 0x00, 0x65, 0x00, 0x74>>
780+
781+
result =
782+
{{moduleName}}.DefaultApiClient.decode_text_body(
783+
bytes,
784+
"text/plain; charset=utf-16"
785+
)
786+
787+
assert result == "Pet"
788+
789+
# Prove the choice: the same bytes read little-endian are NOT "Pet".
790+
little =
791+
{{moduleName}}.DefaultApiClient.decode_text_body(
792+
bytes,
793+
"text/plain; charset=utf-16le"
794+
)
795+
796+
refute little == "Pet"
797+
end
798+
773799
test "empty JSON object body includes Content-Type" do
774800
{:ok, name} = CapturingHeadersApiClient.start()
775801
config = {{moduleName}}.Configuration.new(base_url: "http://localhost")

src/main/resources/templates/go/default_api_client.mustache

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -455,9 +455,10 @@ func decodeBodyByCharset(body []byte, contentType string) string {
455455
return string(runes)
456456
case "windows-1252", "cp1252":
457457
return decodeWindows1252(body)
458-
case "utf-16", "utf16", "utf-16le", "utf16le":
458+
case "utf-16le", "utf16le":
459459
return decodeUtf16(body, false)
460-
case "utf-16be", "utf16be":
460+
case "utf-16", "utf16", "utf-16be", "utf16be":
461+
// A BOM-less utf-16 body defaults to big-endian per RFC 2781.
461462
return decodeUtf16(body, true)
462463
default:
463464
return string(body)

src/main/resources/templates/go/test/default_api_client_unit_test.mustache

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,34 @@ func TestDefaultApiClient_DecodesIso88591ResponseBody(t *testing.T) {
433433
}
434434
}
435435

436+
func TestDefaultApiClient_DecodesBomlessUtf16AsBigEndian(t *testing.T) {
437+
t.Parallel()
438+
/* A BOM-less utf-16 body must decode as big-endian per RFC 2781. The
439+
* bytes below are "Pet" in UTF-16BE with no byte-order mark; decoded
440+
* little-endian they would yield CJK code points, not "Pet". */
441+
bigEndian := []byte{0x00, 0x50, 0x00, 0x65, 0x00, 0x74}
442+
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
443+
w.Header().Set("Content-Type", "text/plain; charset=utf-16")
444+
w.WriteHeader(200)
445+
_, _ = w.Write(bigEndian)
446+
}))
447+
defer server.Close()
448+
449+
client := NewDefaultApiClient(nil)
450+
resp, err := client.SendRequest("GET", server.URL+"/utf16-no-bom", map[string]string{}, nil)
451+
if err != nil {
452+
t.Fatalf("unexpected error: %v", err)
453+
}
454+
if resp.Body != "Pet" {
455+
t.Errorf("expected BOM-less utf-16 to decode big-endian to \"Pet\", got %q", resp.Body)
456+
}
457+
/* Prove the byte order matters: the same bytes read little-endian must
458+
* not equal "Pet", so a default of utf-16le would fail this test. */
459+
if le := decodeUtf16(bigEndian, false); le == "Pet" {
460+
t.Errorf("expected little-endian decode of big-endian bytes to differ from \"Pet\", got %q", le)
461+
}
462+
}
463+
436464
func TestDefaultApiClient_DefaultsToUtf8WhenCharsetMissing(t *testing.T) {
437465
t.Parallel()
438466
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {

src/main/resources/templates/java/test/DefaultApiClientUnitTest.mustache

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,18 @@ class DefaultApiClientUnitTest {
144144
os.write(body);
145145
}
146146
});
147+
server.createContext(
148+
"/utf16-no-bom",
149+
exchange -> {
150+
// RFC 2781: a UTF-16 body with NO byte-order mark defaults to
151+
// big-endian. "Pet" as BOM-less UTF-16BE is 00 50 00 65 00 74.
152+
byte[] body = new byte[] {0x00, 0x50, 0x00, 0x65, 0x00, 0x74};
153+
exchange.getResponseHeaders().add("Content-Type", "text/plain; charset=utf-16");
154+
exchange.sendResponseHeaders(200, body.length);
155+
try (OutputStream os = exchange.getResponseBody()) {
156+
os.write(body);
157+
}
158+
});
147159
server.createContext(
148160
"/content-encoding-lie",
149161
exchange -> {
@@ -838,6 +850,27 @@ class DefaultApiClientUnitTest {
838850
assertEquals("héllo", response.body());
839851
}
840852

853+
{{! -- WAVE F: a BOM-less utf-16 response body must decode as big-endian -- }}
854+
{{! -- (RFC 2781). The bytes 00 50 00 65 00 74 are "Pet" in UTF-16BE; the -- }}
855+
{{! -- same bytes read little-endian would be a different (CJK) string, so -- }}
856+
{{! -- decoding to exactly "Pet" proves the big-endian default. Java's -- }}
857+
{{! -- UTF-16 charset already honours this, so this guard is green here. -- }}
858+
859+
@Test
860+
void decodesBomlessUtf16AsBigEndian() throws Exception {
861+
DefaultApiClient client = new DefaultApiClient();
862+
ApiHttpResponse response = client.sendRequest("GET", baseUrl + "/utf16-no-bom", Map.of(), null);
863+
assertEquals(200, response.statusCode());
864+
assertEquals("Pet", response.body(),
865+
"a BOM-less utf-16 body must decode as big-endian per RFC 2781");
866+
// Prove the choice: the same bytes read little-endian are NOT "Pet".
867+
byte[] sameBytes = new byte[] {0x00, 0x50, 0x00, 0x65, 0x00, 0x74};
868+
assertNotEquals(
869+
"Pet",
870+
new String(sameBytes, java.nio.charset.StandardCharsets.UTF_16LE),
871+
"little-endian interpretation of the same bytes must NOT equal \"Pet\"");
872+
}
873+
841874
{{! -- Gap AL: Content-Encoding lie (server claims gzip, sends plaintext) -- }}
842875
843876
@Test

src/main/resources/templates/kotlin/test/DefaultApiClientUnitTest.mustache

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,40 @@ class DefaultApiClientUnitTest {
493493
assertEquals("é", response.body, "ISO-8859-1 body must decode using the declared charset")
494494
}
495495
496+
@Test
497+
@DisplayName("decodes BOM-less utf-16 response body as big-endian (RFC 2781)")
498+
fun decodesBomlessUtf16AsBigEndian() {
499+
// A response declaring charset=utf-16 with NO byte-order mark must
500+
// decode as UTF-16 BIG-ENDIAN per RFC 2781. The bytes 00 50 00 65
501+
// 00 74 are "Pet" in UTF-16BE; the same bytes read little-endian
502+
// would yield the wrong string, proving the byte-order choice.
503+
val bigEndianNoBom = byteArrayOf(0x00, 0x50, 0x00, 0x65, 0x00, 0x74)
504+
val engine = MockEngine { _ ->
505+
respond(
506+
content = bigEndianNoBom,
507+
status = HttpStatusCode.OK,
508+
headers = headersOf("Content-Type", "text/plain; charset=utf-16"),
509+
)
510+
}
511+
val apiClient = DefaultApiClient(HttpClient(engine) { followRedirects = false })
512+
val response = runBlocking {
513+
apiClient.sendRequest("GET", "http://localhost/utf16-no-bom", emptyMap(), null)
514+
}
515+
assertEquals(
516+
"Pet",
517+
response.body,
518+
"BOM-less utf-16 must decode as big-endian",
519+
)
520+
// Same bytes interpreted little-endian must NOT equal "Pet",
521+
// confirming the big-endian default is what produced the match.
522+
val asLittleEndian = String(bigEndianNoBom, Charsets.UTF_16LE)
523+
assertNotEquals(
524+
"Pet",
525+
asLittleEndian,
526+
"little-endian interpretation of the same bytes must differ",
527+
)
528+
}
529+
496530
@Test
497531
@DisplayName("defaults to UTF-8 when Content-Type has no charset")
498532
fun defaultsToUtf8WhenNoCharset() {

src/main/resources/templates/node/abstract_api_client.mustache

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -540,13 +540,23 @@ export abstract class AbstractApiClient implements ApiClient {
540540
* present, and falls back to UTF-8 if the declared charset is not
541541
* recognised by the runtime's `TextDecoder`.
542542
*
543+
* A bare `utf-16`/`utf16` label (no explicit byte order) is decoded as
544+
* big-endian. Node's `TextDecoder` maps the WHATWG `'utf-16'` label to
545+
* little-endian, but for an HTTP client resolving the IANA/MIME charset
546+
* `UTF-16`, RFC 2781 specifies big-endian as the default when no BOM is
547+
* present. A leading BOM is still honoured by `TextDecoder` and overrides
548+
* the byte order; explicit `utf-16le`/`utf-16be` labels are left untouched.
549+
*
543550
* @param buf the raw response bytes
544551
* @param contentType the response Content-Type header value
545552
* @returns the decoded text
546553
*/
547554
static decodeBody(buf: Buffer, contentType?: string): string {
548555
const m = /charset=([^;]+)/i.exec(contentType ?? '');
549-
const cs = m ? m[1].trim().toLowerCase().replace(/^["']|["']$/g, '') : 'utf-8';
556+
let cs = m ? m[1].trim().toLowerCase().replace(/^["']|["']$/g, '') : 'utf-8';
557+
if (cs === 'utf-16' || cs === 'utf16') {
558+
cs = 'utf-16be';
559+
}
550560
try {
551561
return new TextDecoder(cs, { fatal: false }).decode(buf);
552562
} catch {

0 commit comments

Comments
 (0)