feat: decode declared wide-charset bodies in previews; tighten BodyPreview seam

OmarAlJarrah · OmarAlJarrah · commit d96831c7fccc · 2026-06-16T22:26:32.000+03:00
The binary heuristic treated any NUL byte as a definitive binary signal and
ran before the declared charset was consulted, so a body declared as a wide
charset (UTF-16/UTF-32) was rendered as [binary N bytes] even though its
NUL padding is legitimate text. render() now consults the declared charset
first: when the MediaType names a resolvable charset that encodes ASCII
with NUL bytes, the body is decoded with that charset and the NUL-based
heuristic is skipped. Charsets that encode ASCII without NUL bytes
(US-ASCII, ISO-8859-1, UTF-8) and bodies with no declared charset still run
the heuristic.

Document the wide-charset handling and the best-effort nature of the
content sniff in the BodyPreview KDoc. Mark previewText and DEFAULT_CHARSET
private so the shared/tested seam is exactly render + isProbablyText. Add an
async end-to-end test mirroring the sync ISO-8859-1 response-preview
assertion so the async step's mediaType() pass-through is covered.
diff --git a/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt b/sdk-core/src/main/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreview.kt
@@ -30,12 +30,32 @@ import java.nio.charset.Charset
  * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
  * bounded capture) still yields a usable preview rather than crashing the log line.
  *
+ * ## Wide charsets and the binary heuristic
+ * The text/binary heuristic ([isProbablyText]) normally treats a NUL byte as a definitive binary
+ * signal. That rule misfires for a wide charset such as UTF-16 or UTF-32, where ASCII content
+ * is padded with NUL bytes (`'A'` is `0x00 0x41` in UTF-16BE). To avoid rendering legitimate
+ * wide-charset text as `[binary N bytes]`, [render] consults the declared charset first: when
+ * the [MediaType] explicitly declares a *resolvable* charset that encodes ASCII with NUL bytes
+ * (UTF-16/UTF-32 and friends), the body is decoded with that charset and the NUL-based heuristic
+ * is skipped. When no charset is declared, or the declared one encodes ASCII without NUL bytes
+ * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is
+ * summarised rather than decoded into noise.
+ *
+ * ## Best-effort detection
+ * The heuristic is **best-effort**, not a guarantee. It samples only the first
+ * [SNIFF_SAMPLE_BYTES] and counts C0 control bytes; bytes `>= 0x80` are not counted (they are
+ * legitimate in UTF-8 and single-byte charsets). A binary payload whose leading bytes happen to
+ * contain no NUL and few control bytes — e.g. a region dominated by high bytes — can therefore
+ * slip through as text and decode to replacement-character noise. Most real binary formats carry
+ * a NUL or control run early, so the practical miss rate is low, but callers should treat the
+ * preview as a diagnostic aid rather than a reliable content-type classifier.
+ *
  * This is an `internal` seam shared by [DefaultInstrumentationStep] and
  * [DefaultAsyncInstrumentationStep]; it has no public API surface.
  */
 internal object BodyPreview {
     /** Charset used when a text body declares no charset, or names an unknown one. */
-    internal val DEFAULT_CHARSET: Charset = Charsets.UTF_8
+    private val DEFAULT_CHARSET: Charset = Charsets.UTF_8
 
     /**
      * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the
@@ -64,34 +84,53 @@ internal object BodyPreview {
     /**
      * Renders [bytes] as a preview string.
      *
-     * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered
-     * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset
-     * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown).
+     * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable
+     * charset that encodes ASCII with NUL bytes (UTF-16/UTF-32 and friends), the bytes are
+     * decoded with that charset and the NUL-based binary heuristic is skipped — see the
+     * [BodyPreview] KDoc. Otherwise a body that does not pass [isProbablyText] is rendered as a
+     * size-only `[binary N bytes]` summary, and a body that does is decoded with the charset
+     * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown).
      */
     internal fun render(
         bytes: ByteArray,
         mediaType: MediaType?,
     ): String {
         if (bytes.isEmpty()) return ""
+        val declared = mediaType?.charset
+        if (declared != null && encodesAsciiWithNul(declared)) {
+            // A declared wide charset (UTF-16/UTF-32) pads ASCII with NUL bytes, so the NUL-based
+            // heuristic would misclassify it as binary. Trust the explicit declaration and decode.
+            return previewText(bytes, declared)
+        }
         if (!isProbablyText(bytes)) return binarySummary(bytes.size)
-        return previewText(bytes, mediaType)
+        return previewText(bytes, declared ?: DEFAULT_CHARSET)
     }
 
     /**
-     * Decodes [bytes] using the charset declared on [mediaType], falling back to
-     * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the
-     * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte
-     * sequences are replaced rather than throwing.
+     * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing.
      */
-    internal fun previewText(
+    private fun previewText(
         bytes: ByteArray,
-        mediaType: MediaType?,
+        charset: Charset,
     ): String {
         if (bytes.isEmpty()) return ""
-        val charset = mediaType?.charset ?: DEFAULT_CHARSET
         return String(bytes, charset)
     }
 
+    /**
+     * True when [charset] encodes the ASCII letter `'A'` to bytes that include a NUL — the
+     * signature of a wide charset such as UTF-16 or UTF-32 (`0x00 0x41` in UTF-16BE). US-ASCII,
+     * ISO-8859-1 and UTF-8 encode ASCII without NUL padding and return false. The probe uses the
+     * charset's own encoder rather than a hard-coded name list; if encoding throws, the charset
+     * is treated as not NUL-padding (false) so the regular heuristic still applies.
+     */
+    private fun encodesAsciiWithNul(charset: Charset): Boolean =
+        try {
+            "A".toByteArray(charset).any { it.toInt() == NUL }
+        } catch (_: Exception) {
+            false
+        }
+
     /**
      * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]:
      * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/AsyncInstrumentationStepTest.kt
@@ -131,6 +131,47 @@ class AsyncInstrumentationStepTest {
         response.close()
     }
 
+    @Test
+    fun `response body preview honours the declared ISO-8859-1 charset`() {
+        // Mirrors the sync InstrumentationStepTest assertion: 0xE9 is 'é' in ISO-8859-1; decoded
+        // as UTF-8 (the old hardcoded assumption) it would be U+FFFD. This pins that the async
+        // step passes mediaType() through to BodyPreview rather than regressing to UTF-8.
+        val fakeSlf4j = FakeSlf4jLogger("test.async.instrumentation.charset")
+        val clientLogger = ClientLogger.forTesting(fakeSlf4j)
+        val latin1 = MediaType.parse("text/plain;charset=ISO-8859-1")
+        val bytes = "café".toByteArray(Charsets.ISO_8859_1)
+        val fakeAsync =
+            AsyncHttpClient { request ->
+                CompletableFuture.completedFuture(
+                    Response.builder()
+                        .request(request)
+                        .protocol(Protocol.HTTP_1_1)
+                        .status(Status.OK)
+                        .body(ResponseBody.create(Io.provider.source(bytes), latin1, bytes.size.toLong()))
+                        .build(),
+                )
+            }
+        val pipeline =
+            AsyncHttpPipelineBuilder(fakeAsync)
+                .append(
+                    DefaultAsyncInstrumentationStep(
+                        options = HttpInstrumentationOptions(logLevel = HttpLogLevel.BODY_AND_HEADERS),
+                        logger = clientLogger,
+                    ),
+                )
+                .build()
+
+        val response = pipeline.sendAsync(getRequest("https://api.example.com/x")).join()
+        response.close()
+
+        val responseRecord =
+            fakeSlf4j.records.first { rec ->
+                rec.keyValues.any { it.key == "event" && it.value == "http.response" }
+            }
+        val preview = responseRecord.keyValues.first { it.key == "response.body.preview" }.value
+        assertEquals("café", preview)
+    }
+
     @Test
     fun `known-length response body is wrapped and bounded to the preview cap`() {
         // A known-length body larger than bodyPreviewMaxBytes is wrapped: the capture is bounded
diff --git a/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt b/sdk-core/src/test/kotlin/org/dexpace/sdk/core/http/pipeline/steps/BodyPreviewTest.kt
@@ -68,6 +68,22 @@ class BodyPreviewTest {
         assertEquals("[binary ${bytes.size} bytes]", preview)
     }
 
+    @Test
+    fun `text body declared as UTF-16 is decoded despite NUL padding`() {
+        // UTF-16BE pads ASCII with NUL bytes ('A' = 0x00 0x41), which the NUL heuristic would
+        // otherwise treat as binary. An explicitly declared wide charset must still decode.
+        val bytes = "café".toByteArray(StandardCharsets.UTF_16BE)
+        val preview = BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16BE"))
+        assertEquals("café", preview)
+        assertFalse(preview.contains("[binary"), "declared UTF-16 text must not be summarised as binary")
+    }
+
+    @Test
+    fun `text body declared as UTF-16LE is decoded despite NUL padding`() {
+        val bytes = "hello".toByteArray(StandardCharsets.UTF_16LE)
+        assertEquals("hello", BodyPreview.render(bytes, MediaType.parse("text/plain;charset=UTF-16LE")))
+    }
+
     @Test
     fun `isProbablyText accepts plain ASCII and latin-1 high bytes`() {
         assertTrue(BodyPreview.isProbablyText("plain ascii text".toByteArray(StandardCharsets.US_ASCII)))