diff --git a/Sources/Tokenizers/Decoder.swift b/Sources/Tokenizers/Decoder.swift index 49021645..0cb3f626 100644 --- a/Sources/Tokenizers/Decoder.swift +++ b/Sources/Tokenizers/Decoder.swift @@ -84,7 +84,14 @@ class WordPieceDecoder: Decoder { } func decode(tokens: [String]) -> [String] { - let firstToken = cleanup ? cleanUpTokenization(tokens.first!) : tokens.first! + // An empty token list can reach this decoder when the calling chain has + // filtered out every input — e.g. `decode(tokens:, skipSpecialTokens: true)` + // on an id sequence that consisted entirely of special tokens, or after a + // `compactMap { convertIdToToken($0) }` drops every entry as out-of-vocab. + // The reference Rust implementation no-ops in that case, so return an empty + // result here instead of force-unwrapping `tokens.first!` (which previously crashed). + guard let first = tokens.first else { return [] } + let firstToken = cleanup ? cleanUpTokenization(first) : first return [firstToken] + tokens.dropFirst().map { token in let token = token.hasPrefix(prefix) ? token.replacingCharacters(in: token.range(of: prefix)!, with: "") : " \(token)" @@ -186,19 +193,26 @@ class ByteFallbackDecoder: Decoder { return Int(token[startIndex..` (e.g. multi-byte UTF-8 for a final emoji or CJK character), + // the previous implementation dropped them on the floor instead of + // appending the decoded UTF-8 string. + flushPendingBytes() return newTokens } } diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index 50f0117e..007d9331 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -181,10 +181,13 @@ struct ChatTemplateTests { messages: messages, chatTemplate: whitespaceSensitiveTemplate ) let decoded = tokenizer.decode(tokens: encoded) - let expected = """ - Describe the Swift programming language. 
- assistant - """ + // The template ends with a `{% endif %}` block on its own line, so the + // rendered text legitimately ends with "\n". Phi's tokenizer encodes that + // trailing newline as a `<0x0A>` byte-fallback token; before + // `ByteFallbackDecoder` was fixed to flush its pending byte buffer at + // end-of-input, the trailing newline was silently dropped and this test + // appeared to pass on the truncated string. + let expected = "Describe the Swift programming language.\nassistant\n" #expect(decoded == expected) #expect(!decoded.hasPrefix("\n")) #expect(!decoded.contains("\n\n")) diff --git a/Tests/TokenizersTests/DecoderTests.swift b/Tests/TokenizersTests/DecoderTests.swift index ddd0a735..7e2227c3 100644 --- a/Tests/TokenizersTests/DecoderTests.swift +++ b/Tests/TokenizersTests/DecoderTests.swift @@ -114,6 +114,43 @@ struct DecoderTests { #expect(decoded.joined() == "How are you?") } + /// Regression coverage: `WordPieceDecoder.decode(tokens: [])` previously crashed + /// with `Fatal error: Unexpectedly found nil while unwrapping an Optional value` + /// because of `tokens.first!`. An empty token list can legitimately reach the + /// decoder when `Tokenizer.decode(tokens:, skipSpecialTokens: true)` is called + /// on an id sequence whose non-special-token ids all fail vocab lookup (the + /// `compactMap` upstream drops them silently), so the decoder has to no-op + /// instead of asserting. + @Test("WordPiece decoder no-ops on empty input") + func wordPieceDecoderEmptyInput() { + let decoder = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": true])) + #expect(decoder.decode(tokens: []) == []) + let decoderNoCleanup = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": false])) + #expect(decoderNoCleanup.decode(tokens: []) == []) + } + + /// Regression coverage: `ByteFallbackDecoder.decode` accumulated runs of + /// `<0xHH>` byte tokens and flushed them only when a non-byte token followed. 
+ /// Inputs that ended with a multi-byte UTF-8 run (e.g. a trailing emoji or + /// CJK character whose final code point fell back to bytes) had those bytes + /// silently dropped from the output. + @Test("ByteFallback decoder flushes trailing byte tokens") + func byteFallbackDecoderTrailingBytes() { + let decoder = ByteFallbackDecoder(config: Config(["type": "ByteFallback"])) + // © is U+00A9 → UTF-8 0xC2 0xA9. + #expect(decoder.decode(tokens: ["<0xC2>", "<0xA9>"]) == ["©"]) + // Trailing bytes after a real token must still flush. + #expect(decoder.decode(tokens: ["abc", "<0xC2>", "<0xA9>"]) == ["abc", "©"]) + // 🚀 is U+1F680 → UTF-8 0xF0 0x9F 0x9A 0x80 (four-byte fallback at end). + #expect(decoder.decode(tokens: ["hi", "<0xF0>", "<0x9F>", "<0x9A>", "<0x80>"]) == ["hi", "🚀"]) + // Existing in-loop flush behavior is preserved. + #expect(decoder.decode(tokens: ["<0xC2>", "<0xA9>", "xyz"]) == ["©", "xyz"]) + // Empty input still returns empty. + #expect(decoder.decode(tokens: []) == []) + // Inputs that contain no byte-fallback tokens pass through unchanged. + #expect(decoder.decode(tokens: ["abc", "def"]) == ["abc", "def"]) + } + @Test("WordPiece decoder with prefix and cleanup") func wordPieceDecoder() { let config = Config(["prefix": "##", "cleanup": true])