Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 21 additions & 7 deletions Sources/Tokenizers/Decoder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,14 @@ class WordPieceDecoder: Decoder {
}

func decode(tokens: [String]) -> [String] {
let firstToken = cleanup ? cleanUpTokenization(tokens.first!) : tokens.first!
// An empty token list can reach this decoder when the calling chain has
// filtered out every input — e.g. `decode(tokens:, skipSpecialTokens: true)`
// on an id sequence that consisted entirely of special tokens, or after a
// `compactMap { convertIdToToken($0) }` drops every entry as out-of-vocab.
// The reference Rust implementation no-ops in that case rather than
// unwrapping `tokens.first!` (which previously crashed).
guard let first = tokens.first else { return [] }
let firstToken = cleanup ? cleanUpTokenization(first) : first
return [firstToken]
+ tokens.dropFirst().map { token in
let token = token.hasPrefix(prefix) ? token.replacingCharacters(in: token.range(of: prefix)!, with: "") : " \(token)"
Expand Down Expand Up @@ -186,19 +193,26 @@ class ByteFallbackDecoder: Decoder {
return Int(token[startIndex..<endIndex], radix: 16)
}

func flushPendingBytes() {
guard !byteTokens.isEmpty else { return }
let codeUnits = byteTokens.map { UTF8.CodeUnit($0) }
newTokens.append(String(decoding: codeUnits, as: UTF8.self))
byteTokens.removeAll()
}

for token in tokens {
if let byte = parseByte(token) {
byteTokens.append(byte)
} else {
if !byteTokens.isEmpty {
// decode as utf8 and append
let codeUnits = byteTokens.map { UTF8.CodeUnit($0) }
newTokens.append(String(decoding: codeUnits, as: UTF8.self))
byteTokens.removeAll()
}
flushPendingBytes()
newTokens.append(token)
}
}
// Flush trailing byte tokens — when the input ends with a run of
// `<0xHH>` (e.g. multi-byte UTF-8 for a final emoji or CJK character),
// the previous implementation dropped them on the floor instead of
// appending the decoded UTF-8 string.
flushPendingBytes()
return newTokens
}
}
Expand Down
11 changes: 7 additions & 4 deletions Tests/TokenizersTests/ChatTemplateTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -181,10 +181,13 @@ struct ChatTemplateTests {
messages: messages, chatTemplate: whitespaceSensitiveTemplate
)
let decoded = tokenizer.decode(tokens: encoded)
let expected = """
Describe the Swift programming language.
assistant
"""
// The template ends with a `{% endif %}` block on its own line, so the
// rendered text legitimately ends with "\n". Phi's tokenizer encodes that
// trailing newline as a `<0x0A>` byte-fallback token; before
// `ByteFallbackDecoder` was fixed to flush its pending byte buffer at
// end-of-input, the trailing newline was silently dropped and this test
// appeared to pass on the truncated string.
let expected = "Describe the Swift programming language.\nassistant\n"
#expect(decoded == expected)
#expect(!decoded.hasPrefix("\n"))
#expect(!decoded.contains("\n\n"))
Expand Down
37 changes: 37 additions & 0 deletions Tests/TokenizersTests/DecoderTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,43 @@ struct DecoderTests {
#expect(decoded.joined() == "How are you?")
}

/// Regression coverage: `WordPieceDecoder.decode(tokens: [])` used to trap with
/// `Fatal error: Unexpectedly found nil while unwrapping an Optional value`
/// via `tokens.first!`. An empty token list can legitimately reach the decoder
/// when `Tokenizer.decode(tokens:, skipSpecialTokens: true)` is invoked on an
/// id sequence whose non-special-token ids all miss the vocab (the upstream
/// `compactMap` drops them silently), so the decoder must no-op instead of
/// asserting.
@Test("WordPiece decoder no-ops on empty input")
func wordPieceDecoderEmptyInput() {
    let withCleanup = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": true]))
    let withoutCleanup = WordPieceDecoder(config: Config(["prefix": "##", "cleanup": false]))
    // Both configurations must yield an empty result rather than crashing.
    #expect(withCleanup.decode(tokens: []) == [])
    #expect(withoutCleanup.decode(tokens: []) == [])
}

/// Regression coverage: `ByteFallbackDecoder.decode` buffered runs of `<0xHH>`
/// byte tokens and only flushed the buffer when a non-byte token followed.
/// Inputs ending in a multi-byte UTF-8 run (e.g. a trailing emoji or CJK
/// character whose final code point fell back to bytes) therefore had those
/// bytes silently dropped from the output.
@Test("ByteFallback decoder flushes trailing byte tokens")
func byteFallbackDecoderTrailingBytes() {
    let sut = ByteFallbackDecoder(config: Config(["type": "ByteFallback"]))

    // Trailing two-byte run: © is U+00A9 → UTF-8 0xC2 0xA9.
    #expect(sut.decode(tokens: ["<0xC2>", "<0xA9>"]) == ["©"])
    // Trailing bytes that follow a real token must still flush.
    #expect(sut.decode(tokens: ["abc", "<0xC2>", "<0xA9>"]) == ["abc", "©"])
    // Trailing four-byte run: 🚀 is U+1F680 → UTF-8 0xF0 0x9F 0x9A 0x80.
    #expect(sut.decode(tokens: ["hi", "<0xF0>", "<0x9F>", "<0x9A>", "<0x80>"]) == ["hi", "🚀"])

    // Pre-existing behavior must be untouched: the in-loop flush when a
    // non-byte token interrupts a run, empty input, and byte-free input.
    #expect(sut.decode(tokens: ["<0xC2>", "<0xA9>", "xyz"]) == ["©", "xyz"])
    #expect(sut.decode(tokens: []) == [])
    #expect(sut.decode(tokens: ["abc", "def"]) == ["abc", "def"])
}

@Test("WordPiece decoder with prefix and cleanup")
func wordPieceDecoder() {
let config = Config(["prefix": "##", "cleanup": true])
Expand Down