@@ -30,12 +30,32 @@ import java.nio.charset.Charset
3030 * for malformed input, so a snapshot that ends mid-multibyte-sequence (a real possibility on a
3131 * bounded capture) still yields a usable preview rather than crashing the log line.
3232 *
33+ * ## Multi-byte charsets and the binary heuristic
34+ * The text/binary heuristic ([isProbablyText]) normally treats a NUL byte as a definitive binary
35+ * signal. That rule misfires for a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
36+ * ASCII content is padded with NUL bytes (`'A'` is `0x00 0x41` in UTF-16BE). To avoid rendering
37+ * legitimate wide-charset text as `[binary N bytes]`, [render] consults the declared charset
38+ * first: when the [MediaType] explicitly declares a *resolvable* charset whose encoding is not
39+ * NUL-free for ASCII (UTF-16/UTF-32 and friends), the body is decoded with that charset and the
40+ * NUL-based heuristic is skipped. When no charset is declared, or the declared one is single-byte
41+ * (US-ASCII, ISO-8859-1, UTF-8), the heuristic still runs so a genuinely binary body is summarised
42+ * rather than decoded into noise.
43+ *
44+ * ## Best-effort detection
45+ * The heuristic is **best-effort**, not a guarantee. It samples only the first
46+ * [SNIFF_SAMPLE_BYTES] and counts C0 control bytes; bytes `>= 0x80` are not counted (they are
47+ * legitimate in UTF-8 and single-byte charsets). A binary payload whose leading bytes happen to
48+ * contain no NUL and few control bytes — e.g. a region dominated by high bytes — can therefore
49+ * slip through as text and decode to replacement-character noise. Most real binary formats carry
50+ * a NUL or control run early, so the practical miss rate is low, but callers should treat the
51+ * preview as a diagnostic aid rather than a reliable content-type classifier.
52+ *
3353 * This is an `internal` seam shared by [DefaultInstrumentationStep] and
3454 * [DefaultAsyncInstrumentationStep]; it has no public API surface.
3555 */
3656internal object BodyPreview {
3757 /* * Charset used when a text body declares no charset, or names an unknown one. */
38- internal val DEFAULT_CHARSET : Charset = Charsets .UTF_8
58+ private val DEFAULT_CHARSET : Charset = Charsets .UTF_8
3959
4060 /* *
4161 * Number of leading bytes inspected by [isProbablyText]. A small fixed sample keeps the
@@ -64,34 +84,53 @@ internal object BodyPreview {
6484 /* *
6585 * Renders [bytes] as a preview string.
6686 *
67- * Empty input yields the empty string. A body that does not pass [isProbablyText] is rendered
68- * as a size-only `[binary N bytes]` summary. Otherwise the bytes are decoded with the charset
69- * from [mediaType] (or [DEFAULT_CHARSET] when absent/unknown).
87+ * Empty input yields the empty string. When [mediaType] explicitly declares a resolvable
88+ * multi-byte charset (UTF-16/UTF-32 and friends), the bytes are decoded with that charset and
89+ * the NUL-based binary heuristic is skipped — see the class KDoc. Otherwise a body that does
90+ * not pass [isProbablyText] is rendered as a size-only `[binary N bytes]` summary, and a body
91+ * that does is decoded with the charset from [mediaType] (or [DEFAULT_CHARSET] when
92+ * absent/unknown).
7093 */
7194 internal fun render (
7295 bytes : ByteArray ,
7396 mediaType : MediaType ? ,
7497 ): String {
7598 if (bytes.isEmpty()) return " "
99+ val declared = mediaType?.charset
100+ if (declared != null && encodesAsciiWithNul(declared)) {
101+ // A declared wide charset (UTF-16/UTF-32) pads ASCII with NUL bytes, so the NUL-based
102+ // heuristic would misclassify it as binary. Trust the explicit declaration and decode.
103+ return previewText(bytes, declared)
104+ }
76105 if (! isProbablyText(bytes)) return binarySummary(bytes.size)
77- return previewText(bytes, mediaType )
106+ return previewText(bytes, declared ? : DEFAULT_CHARSET )
78107 }
79108
80109 /* *
81- * Decodes [bytes] using the charset declared on [mediaType], falling back to
82- * [DEFAULT_CHARSET] when the media type is null, declares no charset, or names a charset the
83- * JVM cannot resolve ([MediaType.charset] returns null in the latter two cases). Invalid byte
84- * sequences are replaced rather than throwing.
110+ * Decodes [bytes] using [charset]. Invalid byte sequences are replaced rather than throwing.
85111 */
86- internal fun previewText (
112+ private fun previewText (
87113 bytes : ByteArray ,
88- mediaType : MediaType ? ,
114+ charset : Charset ,
89115 ): String {
90116 if (bytes.isEmpty()) return " "
91- val charset = mediaType?.charset ? : DEFAULT_CHARSET
92117 return String (bytes, charset)
93118 }
94119
120+ /* *
121+ * True when [charset] encodes a plain ASCII character to a byte sequence that contains a NUL
122+ * byte — the signature of a fixed-width multi-byte charset such as UTF-16 or UTF-32, where
123+ * `'A'` becomes e.g. `0x00 0x41`. Single-byte charsets (US-ASCII, ISO-8859-1) and UTF-8
124+ * encode ASCII without NUL padding and return false. The probe is computed from the charset's
125+ * own encoder, so it covers any such charset the JVM knows, not a hard-coded name list.
126+ */
127+ private fun encodesAsciiWithNul (charset : Charset ): Boolean =
128+ try {
129+ " A" .toByteArray(charset).any { it.toInt() == NUL }
130+ } catch (_: Exception ) {
131+ false
132+ }
133+
95134 /* *
96135 * Heuristically decides whether [bytes] is text. Samples the first [SNIFF_SAMPLE_BYTES]:
97136 * a single NUL byte is treated as a strong binary signal, and a control-byte ratio above
0 commit comments