Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions Sources/Tokenizers/Decoder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,14 @@ class WordPieceDecoder: Decoder {
}

func decode(tokens: [String]) -> [String] {
let firstToken = cleanup ? cleanUpTokenization(tokens.first!) : tokens.first!
// An empty token list can reach this decoder when the calling chain has
// filtered out every input — e.g. `decode(tokens:, skipSpecialTokens: true)`
// on an id sequence that consisted entirely of special tokens, or after a
// `compactMap { convertIdToToken($0) }` drops every entry as out-of-vocab.
// The reference Rust implementation no-ops in that case rather than
// unwrapping `tokens.first!` (which previously crashed).
guard let first = tokens.first else { return [] }
let firstToken = cleanup ? cleanUpTokenization(first) : first
return [firstToken]
+ tokens.dropFirst().map { token in
let token = token.hasPrefix(prefix) ? token.replacingCharacters(in: token.range(of: prefix)!, with: "") : " \(token)"
Expand Down Expand Up @@ -186,19 +193,26 @@ class ByteFallbackDecoder: Decoder {
return Int(token[startIndex..<endIndex], radix: 16)
}

func flushPendingBytes() {
guard !byteTokens.isEmpty else { return }
let codeUnits = byteTokens.map { UTF8.CodeUnit($0) }
newTokens.append(String(decoding: codeUnits, as: UTF8.self))
byteTokens.removeAll()
}

for token in tokens {
if let byte = parseByte(token) {
byteTokens.append(byte)
} else {
if !byteTokens.isEmpty {
// decode as utf8 and append
let codeUnits = byteTokens.map { UTF8.CodeUnit($0) }
newTokens.append(String(decoding: codeUnits, as: UTF8.self))
byteTokens.removeAll()
}
flushPendingBytes()
newTokens.append(token)
}
}
// Flush trailing byte tokens — when the input ends with a run of
// `<0xHH>` (e.g. multi-byte UTF-8 for a final emoji or CJK character),
// the previous implementation dropped them on the floor instead of
// appending the decoded UTF-8 string.
flushPendingBytes()
return newTokens
}
}
Expand Down
11 changes: 7 additions & 4 deletions Tests/TokenizersTests/ChatTemplateTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,13 @@ struct ChatTemplateTests {
messages: messages, chatTemplate: whitespaceSensitiveTemplate
)
let decoded = tokenizer.decode(tokens: encoded)
let expected = """
Describe the Swift programming language.
assistant
"""
// The template ends with a `{% endif %}` block on its own line, so the
// rendered text legitimately ends with "\n". Phi's tokenizer encodes that
// trailing newline as a `<0x0A>` byte-fallback token; before
// `ByteFallbackDecoder` was fixed to flush its pending byte buffer at
// end-of-input, the trailing newline was silently dropped and this test
// appeared to pass on the truncated string.
let expected = "Describe the Swift programming language.\nassistant\n"
#expect(decoded == expected)
#expect(!decoded.hasPrefix("\n"))
#expect(!decoded.contains("\n\n"))
Expand Down
37 changes: 37 additions & 0 deletions Tests/TokenizersTests/DecoderTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,43 @@ struct DecoderTests {
#expect(decoded.joined() == "How are you?")
}

/// Regression coverage: `WordPieceDecoder.decode(tokens: [])` used to trap with
/// `Fatal error: Unexpectedly found nil while unwrapping an Optional value`
/// via `tokens.first!`. An empty token list can legitimately reach the decoder
/// when `Tokenizer.decode(tokens:, skipSpecialTokens: true)` is invoked on an
/// id sequence whose non-special-token ids all miss the vocab (the upstream
/// `compactMap` drops them silently), so the decoder must no-op instead of
/// asserting.
@Test("WordPiece decoder no-ops on empty input")
func wordPieceDecoderEmptyInput() {
    let withCleanup = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": true]))
    let withoutCleanup = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": false]))
    // Both configurations must yield an empty result rather than crashing.
    #expect(withCleanup.decode(tokens: []) == [])
    #expect(withoutCleanup.decode(tokens: []) == [])
}

/// Regression coverage: `ByteFallbackDecoder.decode` buffered runs of `<0xHH>`
/// byte tokens and only flushed the buffer when a non-byte token followed.
/// Inputs ending in a multi-byte UTF-8 run (e.g. a trailing emoji or CJK
/// character whose final code point fell back to bytes) therefore had those
/// bytes silently dropped from the output.
@Test("ByteFallback decoder flushes trailing byte tokens")
func byteFallbackDecoderTrailingBytes() {
    let sut = ByteFallbackDecoder(config: Config(["type": "ByteFallback"]))

    // Trailing two-byte run: © is U+00A9 → UTF-8 0xC2 0xA9.
    #expect(sut.decode(tokens: ["<0xC2>", "<0xA9>"]) == ["©"])
    // Trailing bytes that follow a real token must still flush.
    #expect(sut.decode(tokens: ["abc", "<0xC2>", "<0xA9>"]) == ["abc", "©"])
    // Trailing four-byte run: 🚀 is U+1F680 → UTF-8 0xF0 0x9F 0x9A 0x80.
    #expect(sut.decode(tokens: ["hi", "<0xF0>", "<0x9F>", "<0x9A>", "<0x80>"]) == ["hi", "🚀"])

    // Pre-existing behavior must be untouched: the in-loop flush when a
    // non-byte token interrupts a run, empty input, and byte-free input.
    #expect(sut.decode(tokens: ["<0xC2>", "<0xA9>", "xyz"]) == ["©", "xyz"])
    #expect(sut.decode(tokens: []) == [])
    #expect(sut.decode(tokens: ["abc", "def"]) == ["abc", "def"])
}

@Test("WordPiece decoder with prefix and cleanup")
func wordPieceDecoder() {
let config = Config(["prefix": "##", "cleanup": true])
Expand Down