From da6f9a5adb4dd03a49bde56bc63a998bd53f8195 Mon Sep 17 00:00:00 2001 From: kaberingo Date: Thu, 16 Apr 2026 23:08:23 +0900 Subject: [PATCH] fix(mail): improve Japanese charset decoding reliability on Android - Normalize Shift-JIS aliases (shift-jis, sjis, ms932, windows-31j, x-sjis, x-ms-cp932) to shift_jis for consistent handling - Add EUC-JP fallback aliases (x-euc-jp, euc_jp) to the charset fallback map - Always use Iso2022JpToShiftJisInputStream for iso-2022-jp decoding to bypass Android ICU4J's unreliable QP-decoded byte sequence handling, which could cause ESC bytes and "$B"/"(B" escape remnants to appear as literal text - Auto-detect ISO-2022-JP when Content-Type has no charset parameter by scanning for ESC$B / ESC$@ escape sequences, fixing garbled output from Japanese feature phones and carrier webmail that omit the charset header - Implement getAddressFromReceivedHeader() in JisSupport to properly extract addresses from the "for" clause of Received headers (both angle-bracket and bare address forms), enabling correct JIS variant detection Co-Authored-By: Claude Sonnet 4.6 --- .../fsck/k9/mail/internet/CharsetSupport.java | 54 ++++++++++- .../com/fsck/k9/mail/internet/JisSupport.java | 29 +++++- .../k9/mail/internet/CharsetSupportTest.java | 66 ++++++++++++++ .../fsck/k9/mail/internet/JisSupportTest.java | 90 +++++++++++++++++++ .../mail/internet/MessageExtractorTest.java | 89 ++++++++++++++++++ 5 files changed, 324 insertions(+), 4 deletions(-) create mode 100644 mail/common/src/test/java/com/fsck/k9/mail/internet/JisSupportTest.java diff --git a/mail/common/src/main/java/com/fsck/k9/mail/internet/CharsetSupport.java b/mail/common/src/main/java/com/fsck/k9/mail/internet/CharsetSupport.java index f71ab7c91b8..fd34f2a0a2c 100644 --- a/mail/common/src/main/java/com/fsck/k9/mail/internet/CharsetSupport.java +++ b/mail/common/src/main/java/com/fsck/k9/mail/internet/CharsetSupport.java @@ -6,6 +6,7 @@ import org.apache.commons.io.IOUtils; +import 
java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -25,7 +26,10 @@ public class CharsetSupport { private static final String[][] CHARSET_FALLBACK_MAP = new String[][] { // Some Android versions don't support KOI8-U {"koi8-u", "koi8-r"}, - {"iso-2022-jp-[\\d]+", "iso-2022-jp"} + {"iso-2022-jp-[\\d]+", "iso-2022-jp"}, + // EUC-JP aliases that some mailers use + {"x-euc-jp", "euc-jp"}, + {"euc_jp", "euc-jp"}, }; @@ -34,8 +38,11 @@ static String fixupCharset(String charset, Message message) throws MessagingExce charset = DEFAULT_CHARSET; charset = charset.toLowerCase(Locale.US); - if (charset.equals("cp932")) + if (charset.equals("cp932") || charset.equals("shift-jis") || charset.equals("sjis") || + charset.equals("ms932") || charset.equals("windows-31j") || charset.equals("x-sjis") || + charset.equals("x-ms-cp932")) { charset = SHIFT_JIS; + } if (charset.equals(SHIFT_JIS) || charset.equals("iso-2022-jp")) { String variant = JisSupport.getJisVariantFromMessage(message); @@ -56,6 +63,15 @@ static String readToString(InputStream in, String charset) throws IOException { charset = "x-" + charset.substring(2, charset.length() - 17) + "-shift_jis-2007"; } + // Android's ICU4J ISO-2022-JP decoder is stricter than the JVM decoder and can silently fail on + // QP-decoded byte sequences, causing ESC bytes to appear as invisible control characters while + // $B and (B escape sequence remnants become visible literal text. + // Always use Iso2022JpToShiftJisInputStream for reliable decoding across all Android versions. + if (charset.equals("iso-2022-jp")) { + in = new Iso2022JpToShiftJisInputStream(in); + charset = SHIFT_JIS; + } + // shift_jis variants are supported by Eclair and later. 
if (JisSupport.isShiftJis(charset) && !Charset.isSupported(charset)) { // If the JIS variant is iPhone, map the Unicode private use area in iPhone to the one in Android after @@ -97,6 +113,21 @@ static String readToString(InputStream in, String charset) throws IOException { charset = DEFAULT_CHARSET; } + // When charset defaulted to US-ASCII (i.e., Content-Type had no charset parameter), + // auto-detect ISO-2022-JP by scanning for its 7-bit escape sequences (ESC$B or ESC$@). + // Japanese email clients (especially feature phones and carrier webmail) often omit the + // charset parameter for ISO-2022-JP bodies, causing garbled "$B..." output when decoded + // as US-ASCII. + if (charset.equalsIgnoreCase(DEFAULT_CHARSET)) { + byte[] bodyBytes = IOUtils.toByteArray(in); + if (hasIso2022JpEscapeSequence(bodyBytes)) { + in = new Iso2022JpToShiftJisInputStream(new ByteArrayInputStream(bodyBytes)); + charset = SHIFT_JIS; + } else { + in = new ByteArrayInputStream(bodyBytes); + } + } + /* * Convert and return as new String */ @@ -107,6 +138,25 @@ static String readToString(InputStream in, String charset) throws IOException { return str; } + /** + * Returns true if the byte array contains an ISO-2022-JP character-set designation + * escape sequence: ESC $ B (JIS X 0208-1983) or ESC $ @ (JIS X 0208-1978). + * + * Japanese email clients — especially feature phones and carrier webmail — often omit the + * charset parameter from Content-Type and send a raw 7-bit ISO-2022-JP body. Checking for + * these two sequences is sufficient to distinguish such content from ordinary US-ASCII text + * while keeping false-positive risk negligible. 
+ */ + static boolean hasIso2022JpEscapeSequence(byte[] data) { + for (int i = 0; i < data.length - 2; i++) { + if (data[i] == 0x1B && data[i + 1] == '$' + && (data[i + 2] == 'B' || data[i + 2] == '@')) { + return true; + } + } + return false; + } + private static String importStringFromIphone(String str) { StringBuilder buff = new StringBuilder(str.length()); for (int i = 0; i < str.length(); i = str.offsetByCodePoints(i, 1)) { diff --git a/mail/common/src/main/java/com/fsck/k9/mail/internet/JisSupport.java b/mail/common/src/main/java/com/fsck/k9/mail/internet/JisSupport.java index 38bf509011e..315398cb6bf 100644 --- a/mail/common/src/main/java/com/fsck/k9/mail/internet/JisSupport.java +++ b/mail/common/src/main/java/com/fsck/k9/mail/internet/JisSupport.java @@ -87,8 +87,33 @@ private static String getJisVariantFromReceivedHeaders(Part message) { } private static String getAddressFromReceivedHeader(String receivedHeader) { - // Not implemented yet! Extract an address from the FOR clause of the given Received header. - return null; + // Extract an address from the FOR clause of the given Received header. + // Example: "... for <user@docomo.ne.jp>;" or "... for user@docomo.ne.jp;" + int forIndex = receivedHeader.toLowerCase(java.util.Locale.US).indexOf(" for "); + if (forIndex == -1) { + return null; + } + String afterFor = receivedHeader.substring(forIndex + 5).trim(); + // Strip angle brackets if present + if (afterFor.startsWith("<")) { + int close = afterFor.indexOf('>'); + if (close == -1) { + return null; + } + afterFor = afterFor.substring(1, close); + } else { + // Address ends at the first whitespace or semicolon + int end = afterFor.length(); + for (int i = 0; i < afterFor.length(); i++) { + char c = afterFor.charAt(i); + if (c == ';' || c == ' ' || c == '\t' || c == '\r' || c == '\n') { + end = i; + break; + } + } + afterFor = afterFor.substring(0, end); + } + return afterFor.isEmpty() ? 
null : afterFor; } private static String getJisVariantFromFromHeaders(Message message) { diff --git a/mail/common/src/test/java/com/fsck/k9/mail/internet/CharsetSupportTest.java b/mail/common/src/test/java/com/fsck/k9/mail/internet/CharsetSupportTest.java index 845b1c1598a..98e4dc2bac1 100644 --- a/mail/common/src/test/java/com/fsck/k9/mail/internet/CharsetSupportTest.java +++ b/mail/common/src/test/java/com/fsck/k9/mail/internet/CharsetSupportTest.java @@ -8,6 +8,8 @@ import org.junit.Test; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; public class CharsetSupportTest { @@ -100,6 +102,32 @@ public void testFixupCharset() throws Exception { assertEquals(expect, CharsetSupport.fixupCharset(charsetOnMail, message)); } + @Test + public void testFixupCharset_shiftJisAliases() throws Exception { + MimeMessage message = new MimeMessage(); + assertEquals("shift_jis", CharsetSupport.fixupCharset("shift-jis", message)); + assertEquals("shift_jis", CharsetSupport.fixupCharset("sjis", message)); + assertEquals("shift_jis", CharsetSupport.fixupCharset("ms932", message)); + assertEquals("shift_jis", CharsetSupport.fixupCharset("windows-31j", message)); + assertEquals("shift_jis", CharsetSupport.fixupCharset("x-sjis", message)); + assertEquals("shift_jis", CharsetSupport.fixupCharset("x-ms-cp932", message)); + } + + @Test + public void readToString_withXEucJpAlias_shouldFallBackToEucJp() throws IOException { + // "test" in ASCII — just verifies the alias is recognized without throwing + InputStream inputStream = new ByteArrayInputStream("test".getBytes()); + String result = CharsetSupport.readToString(inputStream, "x-euc-jp"); + assertEquals("test", result); + } + + @Test + public void readToString_withEucJpUnderscoreAlias_shouldFallBackToEucJp() throws IOException { + InputStream inputStream = new ByteArrayInputStream("test".getBytes()); + String result = 
CharsetSupport.readToString(inputStream, "euc_jp"); + assertEquals("test", result); + } + @Test public void readToString_withUnsupportedCharset_shouldFallBackToAscii() throws IOException { InputStream inputStream = new ByteArrayInputStream("input".getBytes()); @@ -119,4 +147,42 @@ public void readToString_withInvalidCharset_shouldFallBackToAscii() throws IOExc assertEquals("input", result); } + + // hasIso2022JpEscapeSequence + + @Test + public void hasIso2022JpEscapeSequence_withEscDollarB_returnsTrue() { + byte[] data = {0x1B, '$', 'B', 0x25, 0x46}; + assertTrue(CharsetSupport.hasIso2022JpEscapeSequence(data)); + } + + @Test + public void hasIso2022JpEscapeSequence_withEscDollarAt_returnsTrue() { + byte[] data = {0x1B, '$', '@', 0x25, 0x46}; + assertTrue(CharsetSupport.hasIso2022JpEscapeSequence(data)); + } + + @Test + public void hasIso2022JpEscapeSequence_withNoEscSequence_returnsFalse() { + byte[] data = "Hello, world!".getBytes(java.nio.charset.StandardCharsets.US_ASCII); + assertFalse(CharsetSupport.hasIso2022JpEscapeSequence(data)); + } + + @Test + public void hasIso2022JpEscapeSequence_withEscOpenParenB_returnsFalse() { + // ESC ( B is the return-to-ASCII sequence; alone it should not trigger detection + byte[] data = {0x1B, '(', 'B'}; + assertFalse(CharsetSupport.hasIso2022JpEscapeSequence(data)); + } + + @Test + public void hasIso2022JpEscapeSequence_withEmptyArray_returnsFalse() { + assertFalse(CharsetSupport.hasIso2022JpEscapeSequence(new byte[0])); + } + + @Test + public void hasIso2022JpEscapeSequence_withTooShortForSequence_returnsFalse() { + byte[] data = {0x1B, '$'}; // only 2 bytes, need at least 3 + assertFalse(CharsetSupport.hasIso2022JpEscapeSequence(data)); + } } diff --git a/mail/common/src/test/java/com/fsck/k9/mail/internet/JisSupportTest.java b/mail/common/src/test/java/com/fsck/k9/mail/internet/JisSupportTest.java new file mode 100644 index 00000000000..bded8952f88 --- /dev/null +++ 
b/mail/common/src/test/java/com/fsck/k9/mail/internet/JisSupportTest.java @@ -0,0 +1,90 @@ +package com.fsck.k9.mail.internet; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + + +public class JisSupportTest { + + // getJisVariantFromMessage via From header + + @Test + public void getJisVariantFromMessage_docomoSender_returnsDocomo() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@docomo.ne.jp"); + assertEquals("docomo", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_softbankSender_returnsSoftbank() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@softbank.ne.jp"); + assertEquals("softbank", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_kddiSender_returnsKddi() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@ezweb.ne.jp"); + assertEquals("kddi", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_unknownSender_returnsNull() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + assertNull(JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_iPhoneMailer_returnsIphone() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("X-Mailer", "iPhone Mail A380"); + assertEquals("iphone", JisSupport.getJisVariantFromMessage(message)); + } + + // getJisVariantFromMessage via Received header FOR clause + + @Test + public void getJisVariantFromMessage_receivedForDocomoAngleBracket_returnsDocomo() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("Received", 
"from mail.example.com (mail.example.com [1.2.3.4]) for <user@docomo.ne.jp>;"); + assertEquals("docomo", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_receivedForDocomoNoAngleBracket_returnsDocomo() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("Received", "from mail.example.com (mail.example.com [1.2.3.4]) for user@docomo.ne.jp;"); + assertEquals("docomo", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_receivedForEzwebAddress_returnsKddi() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("Received", "from smtp.example.net for <user@ezweb.ne.jp>;"); + assertEquals("kddi", JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_receivedForUnknownAddress_returnsNull() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("Received", "from smtp.example.net for <user@example.com>;"); + assertNull(JisSupport.getJisVariantFromMessage(message)); + } + + @Test + public void getJisVariantFromMessage_receivedWithoutFor_returnsNull() throws Exception { + MimeMessage message = new MimeMessage(); + message.setHeader("From", "user@example.com"); + message.setHeader("Received", "from smtp.example.net by mx.example.com;"); + assertNull(JisSupport.getJisVariantFromMessage(message)); + } +} diff --git a/mail/common/src/test/java/com/fsck/k9/mail/internet/MessageExtractorTest.java b/mail/common/src/test/java/com/fsck/k9/mail/internet/MessageExtractorTest.java index 9a4a97bdf56..a12060bc8db 100644 --- a/mail/common/src/test/java/com/fsck/k9/mail/internet/MessageExtractorTest.java +++ b/mail/common/src/test/java/com/fsck/k9/mail/internet/MessageExtractorTest.java @@ -10,6 +10,8 @@ import org.junit.Before; import org.junit.Test; +import 
java.nio.charset.StandardCharsets; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; @@ -70,6 +72,50 @@ public void getTextFromPart_withExceptionThrownGettingInputStream_shouldReturnNu assertNull(result); } + /** + * Regression test: ISO-2022-JP body with QP encoding. + * + * "テスト" in ISO-2022-JP: + * ESC$B (0x1B 0x24 0x42) = switch to JIS X 0208 + * テ = 0x25 0x46, ス = 0x25 0x39, ト = 0x25 0x48 + * ESC(B (0x1B 0x28 0x42) = switch back to ASCII + * + * QP-encoded: ESC is written as =1B; $,B,(,% are printable ASCII and left as-is. + * Android's ICU4J ISO-2022-JP decoder can silently mishandle this, showing "$B" and "(B" + * as literal text. We bypass the platform decoder with Iso2022JpToShiftJisInputStream. + */ + @Test + public void getTextFromPart_withIso2022JpQuotedPrintable_shouldDecodeToJapanese() throws Exception { + // QP-encoded "テスト" in ISO-2022-JP: =1B$B%F%9%H=1B(B + byte[] qpBytes = "=1B$B%F%9%H=1B(B".getBytes(StandardCharsets.US_ASCII); + part.setHeader(MimeHeader.HEADER_CONTENT_TYPE, "text/plain; charset=iso-2022-jp"); + BinaryMemoryBody body = new BinaryMemoryBody(qpBytes, MimeUtil.ENC_QUOTED_PRINTABLE); + part.setBody(body); + + String result = MessageExtractor.getTextFromPart(part); + + assertEquals("テスト", result); + } + + /** + * Regression test: multi-line QP-encoded ISO-2022-JP body whose lines are separated by a hard CRLF line break. + * Each line re-emits ESC$B because hard line breaks reset to ASCII in ISO-2022-JP. + */ + @Test + public void getTextFromPart_withIso2022JpQuotedPrintableMultiLine_shouldDecodeToJapanese() throws Exception { + // Line 1: "テスト", Line 2: "テスト" — each line wraps with ESC(B ... 
ESC$B + String qpBody = "=1B$B%F%9%H=1B(B\r\n=1B$B%F%9%H=1B(B"; + byte[] qpBytes = qpBody.getBytes(StandardCharsets.US_ASCII); + part.setHeader(MimeHeader.HEADER_CONTENT_TYPE, "text/plain; charset=iso-2022-jp"); + BinaryMemoryBody body = new BinaryMemoryBody(qpBytes, MimeUtil.ENC_QUOTED_PRINTABLE); + part.setBody(body); + + String result = MessageExtractor.getTextFromPart(part); + + assertNotNull(result); + assertEquals("テスト\r\nテスト", result); + } + @Test public void getTextFromPart_withUnknownEncoding_shouldReturnUnmodifiedBodyContents() throws Exception { part.setHeader(MimeHeader.HEADER_CONTENT_TYPE, "text/plain"); @@ -105,6 +151,49 @@ public void getTextFromPart_withHtmlWithCharsetInContentTypeRawDataBody_shouldRe assertEquals("Sample text body", result); } + /** + * Regression test: forwarded ISO-2022-JP message with no charset in Content-Type. + * + * Japanese feature phones and carrier webmail systems often omit "charset=iso-2022-jp" + * from the Content-Type header. Without auto-detection the body defaults to US-ASCII and + * the ESC byte is silently dropped, leaving the literal "$B" escape sequence remnants + * visible (e.g. "$BJIC..."). + */ + @Test + public void getTextFromPart_withIso2022Jp7bitNoCharset_shouldAutoDetectAndDecode() throws Exception { + // Raw 7-bit ISO-2022-JP bytes for "テスト" — no QP encoding, no charset header + byte[] raw = new byte[] { + 0x1B, '$', 'B', // ESC $ B → switch to JIS X 0208 + 0x25, 0x46, // テ + 0x25, 0x39, // ス + 0x25, 0x48, // ト + 0x1B, '(', 'B' // ESC ( B → switch back to ASCII + }; + part.setHeader(MimeHeader.HEADER_CONTENT_TYPE, "text/plain"); // no charset! + BinaryMemoryBody body = new BinaryMemoryBody(raw, MimeUtil.ENC_7BIT); + part.setBody(body); + + String result = MessageExtractor.getTextFromPart(part); + + assertEquals("テスト", result); + } + + /** + * Regression test: forwarded ISO-2022-JP message with QP encoding and no charset. + * This is the most common form: the =1B escape is QP-encoded, charset is absent. 
+ */ + @Test + public void getTextFromPart_withIso2022JpQuotedPrintableNoCharset_shouldAutoDetectAndDecode() throws Exception { + byte[] qpBytes = "=1B$B%F%9%H=1B(B".getBytes(java.nio.charset.StandardCharsets.US_ASCII); + part.setHeader(MimeHeader.HEADER_CONTENT_TYPE, "text/plain"); // no charset! + BinaryMemoryBody body = new BinaryMemoryBody(qpBytes, MimeUtil.ENC_QUOTED_PRINTABLE); + part.setBody(body); + + String result = MessageExtractor.getTextFromPart(part); + + assertEquals("テスト", result); + } + @Test public void getTextFromPart_withHtmlWithCharsetInHtmlRawDataBody_shouldReturnHtmlText() throws Exception { String bodyText = "" +