Skip to content

Commit 2fa33e1

Browse files
apocryphx, claude, pcuenca
authored
Unigram lattice walks Unicode scalars (#352, Bug 3) (#356)
* Unigram lattice walks Unicode scalars, not grapheme clusters (#352, Bug 3)

  `UnigramTokenizer.tokenize(text:)` and `TokenLattice` indexed the input by Swift `Character` (extended grapheme clusters). SentencePiece Unigram vocabularies are scalar-indexed, so an input grapheme that spans multiple scalars never gets exposed as its constituent scalars to the trie walk and the vocab lookup. For example `"1️⃣"` is one grapheme but three scalars (digit `1`, VS-16, combining keycap U+20E3); HF Python emits `▁1 <unk> </s>` for it, while Swift previously returned `▁ <unk> </s>` — the digit was silently dropped because the entire keycap grapheme occupied a single lattice slot that didn't match any vocab key.

  Switch the iteration unit to `Unicode.Scalar`:

  - `UnigramTokenizer.trie` is now `Trie<Unicode.Scalar>` and is fed each vocab entry's `unicodeScalars` view.
  - `tokenize(text:)` walks `Array(text.unicodeScalars)` and reconstructs token strings from `String.UnicodeScalarView` slices.
  - `TokenLattice.chars` is `[Unicode.Scalar]`. The convenience `init(sentence:)` and the `piece(_:)` reconstruction follow the same convention.

  Lattice offsets and lengths are now in scalar units; the Viterbi algorithm is unchanged (it only sees integer positions).

  Adds a regression test using google-t5/t5-small over the keycap emoji.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Address review: tone down comments per @pcuenca

  Per @pcuenca's review on #356:

  - Replace the multi-line TokenLattice.chars docstring with the shorter three-line version @pcuenca suggested.
  - Remove the in-body comment blocks at the trie-build site in `UnigramTokenizer.init` and at the top of `UnigramTokenizer.tokenize(text:)`.

  The remaining TokenLattice docstring plus the regression test in TokenizerTests carry the explanation.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* I hate linters

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
1 parent eb08337 commit 2fa33e1

3 files changed

Lines changed: 30 additions & 13 deletions

File tree

Sources/Tokenizers/TokenLattice.swift

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,20 @@ struct TokenLattice {
1414
let bosTokenId: Int
1515
let eosTokenId: Int
1616

17-
/// `Character` view of the input String for performance.
18-
/// Lattice offsets and lengths are in `Character` units, so direct
19-
/// access through the array does not pay the cost of
20-
/// `String.index(_:offsetBy:)` traversal (O(N) per token, quadratic for sequences).
21-
private let chars: [Character]
17+
/// `Unicode.Scalar` view of the input String. Lattice offsets and lengths are in
18+
/// scalar units so multi-scalar grapheme clusters are addressable at the same
19+
/// granularity the SentencePiece vocab uses.
20+
private let chars: [Unicode.Scalar]
2221

2322
var nodes: [TokenLatticeNode] = []
2423
var beginNodes: [[TokenLatticeNode]]
2524
var endNodes: [[TokenLatticeNode]]
2625

2726
init(sentence: String, bosTokenId: Int, eosTokenId: Int) {
28-
self.init(chars: Array(sentence), bosTokenId: bosTokenId, eosTokenId: eosTokenId)
27+
self.init(chars: Array(sentence.unicodeScalars), bosTokenId: bosTokenId, eosTokenId: eosTokenId)
2928
}
3029

31-
init(chars: [Character], bosTokenId: Int, eosTokenId: Int) {
30+
init(chars: [Unicode.Scalar], bosTokenId: Int, eosTokenId: Int) {
3231
self.chars = chars
3332
self.bosTokenId = bosTokenId
3433
self.eosTokenId = eosTokenId
@@ -109,9 +108,9 @@ extension TokenLattice {
109108
///
110109
/// - Parameter node: The node defining the token to be extracted.
111110
///
112-
/// - Returns: A `String` reconstructed from the cached `Character` array.
111+
/// - Returns: A `String` reconstructed from the cached `Unicode.Scalar` array.
113112
func piece(_ node: TokenLatticeNode) -> any StringProtocol {
114-
String(chars[node.startOffset..<(node.startOffset + node.length)])
113+
String(String.UnicodeScalarView(chars[node.startOffset..<(node.startOffset + node.length)]))
115114
}
116115
}
117116

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
5757
/// Whether consecutive unknown tokens should be fused (always true for Unigram).
5858
let fuseUnknownTokens: Bool = true
5959

60-
private let trie: Trie<Character>
60+
private let trie: Trie<Unicode.Scalar>
6161

6262
/// Initializes a Unigram tokenizer from configuration data.
6363
///
@@ -108,7 +108,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
108108
eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
109109

110110
trie = Trie()
111-
trie.append(contentsOf: vocab.map { $0.token })
111+
trie.append(contentsOf: vocab.map { $0.token.unicodeScalars })
112112
}
113113

114114
/// Converts a token string to its corresponding numeric ID.
@@ -132,7 +132,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
132132
/// - Parameter text: The input text to tokenize
133133
/// - Returns: An array of token strings representing the most probable segmentation
134134
func tokenize(text: String) -> [String] {
135-
let chars = Array(text)
135+
let chars = Array(text.unicodeScalars)
136136
let charsCount = chars.count
137137
var lattice = TokenLattice(
138138
chars: chars,
@@ -148,7 +148,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
148148
let suffix = chars[beginPos...]
149149
for tokenChars in trie.commonPrefixSearchIterator(suffix) {
150150
let tokenLength = tokenChars.count
151-
let token = String(tokenChars)
151+
let token = String(String.UnicodeScalarView(tokenChars))
152152
guard let tokenId = tokensToIds[token as NSString] else { fatalError("Token not in vocab: \(token)") }
153153
let tokenScore = vocab[tokenId].score
154154
lattice.insert(startOffset: beginPos, length: tokenLength, score: tokenScore, tokenId: tokenId)

Tests/TokenizersTests/TokenizerTests.swift

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,24 @@ struct TokenizerTests {
224224
#expect(inputIds == [1, 29871, 6324])
225225
}
226226

227+
/// https://github.com/huggingface/swift-transformers/issues/352 (Bug 3)
228+
@Test
229+
func t5UnigramKeycapEmoji() async throws {
230+
// Keycap "1️⃣" is one grapheme cluster but three Unicode scalars:
231+
// U+0031 DIGIT ONE
232+
// U+FE0F VARIATION SELECTOR-16
233+
// U+20E3 COMBINING ENCLOSING KEYCAP
234+
// SentencePiece Unigram lookups operate per scalar, so HF Python tokenizes
235+
// this as ▁1 <unk> </s> — the digit `1` matches; the VS-16 + keycap tail UNKs.
236+
// Pre-fix, the Swift tokenizer iterated by `Character`, so the whole grapheme
237+
// became one lattice slot that never matched any vocab entry and the digit was
238+
// silently lost (▁ <unk> </s>).
239+
let tokenizerOpt = try await AutoTokenizer.from(pretrained: "google-t5/t5-small") as? PreTrainedTokenizer
240+
#expect(tokenizerOpt != nil)
241+
let tokenizer = tokenizerOpt!
242+
243+
#expect(tokenizer.encode(text: "1️⃣") == [209, 2, 1])
244+
}
227245
/// https://github.com/huggingface/swift-transformers/issues/352 (Bug 4)
228246
@Test
229247
func llama7bCombiningMarks() async throws {

0 commit comments

Comments
 (0)