Unigram lattice walks Unicode scalars (#352, Bug 3) (#356)

apocryphx · claude · pcuenca · web-flow · commit 2fa33e1f5e71 · 2026-05-16T13:25:34.000+02:00
* Unigram lattice walks Unicode scalars, not grapheme clusters (#352, Bug 3)

  `UnigramTokenizer.tokenize(text:)` and `TokenLattice` indexed the input by Swift `Character` (extended grapheme clusters). SentencePiece Unigram vocabularies are scalar-indexed, so an input grapheme that spans multiple scalars never gets exposed as its constituent scalars to the trie walk and the vocab lookup. For example `"1️⃣"` is one grapheme but three scalars (digit `1`, VS-16, combining keycap U+20E3); HF Python emits `▁1 <unk> </s>` for it, while Swift previously returned `▁ <unk> </s>` — the digit was silently dropped because the entire keycap grapheme occupied a single lattice slot that didn't match any vocab key.

  Switch the iteration unit to `Unicode.Scalar`:
  - `UnigramTokenizer.trie` is now `Trie<Unicode.Scalar>` and is fed each vocab entry's `unicodeScalars` view.
  - `tokenize(text:)` walks `Array(text.unicodeScalars)` and reconstructs token strings from `String.UnicodeScalarView` slices.
  - `TokenLattice.chars` is `[Unicode.Scalar]`. The convenience `init(sentence:)` and the `piece(_:)` reconstruction follow the same convention.

  Lattice offsets and lengths are now in scalar units; the Viterbi algorithm is unchanged (it only sees integer positions).

  Adds a regression test using google-t5/t5-small over the keycap emoji.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* Address review: tone down comments per @pcuenca

  Per @pcuenca's review on #356:
  - Replace the multi-line TokenLattice.chars docstring with the shorter three-line version @pcuenca suggested.
  - Remove the in-body comment blocks at the trie-build site in `UnigramTokenizer.init` and at the top of `UnigramTokenizer.tokenize(text:)`.

  The remaining TokenLattice docstring plus the regression test in TokenizerTests carry the explanation.

  Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

* I hate linters

---------

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Co-authored-by: Pedro Cuenca <pedro@huggingface.co>
diff --git a/Sources/Tokenizers/TokenLattice.swift b/Sources/Tokenizers/TokenLattice.swift
@@ -14,21 +14,20 @@ struct TokenLattice {
     let bosTokenId: Int
     let eosTokenId: Int
 
-    /// `Character` view of the input String for performance.
-    /// Lattice offsets and lengths are in `Character` units, so direct
-    /// access through the array does not pay the cost of
-    /// `String.index(_:offsetBy:)` traversal (O(N) per token, quadratic for sequences).
-    private let chars: [Character]
+    /// `Unicode.Scalar` view of the input String. Lattice offsets and lengths are in
+    /// scalar units so multi-scalar grapheme clusters are addressable at the same
+    /// granularity the SentencePiece vocab uses.
+    private let chars: [Unicode.Scalar]
 
     var nodes: [TokenLatticeNode] = []
     var beginNodes: [[TokenLatticeNode]]
     var endNodes: [[TokenLatticeNode]]
 
     init(sentence: String, bosTokenId: Int, eosTokenId: Int) {
-        self.init(chars: Array(sentence), bosTokenId: bosTokenId, eosTokenId: eosTokenId)
+        self.init(chars: Array(sentence.unicodeScalars), bosTokenId: bosTokenId, eosTokenId: eosTokenId)
     }
 
-    init(chars: [Character], bosTokenId: Int, eosTokenId: Int) {
+    init(chars: [Unicode.Scalar], bosTokenId: Int, eosTokenId: Int) {
         self.chars = chars
         self.bosTokenId = bosTokenId
         self.eosTokenId = eosTokenId
@@ -109,9 +108,9 @@ extension TokenLattice {
     ///
     /// - Parameter node: The node defining the token to be extracted.
     ///
-    /// - Returns: A `String` reconstructed from the cached `Character` array.
+    /// - Returns: A `String` reconstructed from the cached `Unicode.Scalar` array.
     func piece(_ node: TokenLatticeNode) -> any StringProtocol {
-        String(chars[node.startOffset..<(node.startOffset + node.length)])
+        String(String.UnicodeScalarView(chars[node.startOffset..<(node.startOffset + node.length)]))
     }
 }
 
diff --git a/Sources/Tokenizers/UnigramTokenizer.swift b/Sources/Tokenizers/UnigramTokenizer.swift
@@ -57,7 +57,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// Whether consecutive unknown tokens should be fused (always true for Unigram).
     let fuseUnknownTokens: Bool = true
 
-    private let trie: Trie<Character>
+    private let trie: Trie<Unicode.Scalar>
 
     /// Initializes a Unigram tokenizer from configuration data.
     ///
@@ -108,7 +108,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
         eosTokenId = eosToken == nil ? nil : tokensToIds[eosToken! as NSString]
 
         trie = Trie()
-        trie.append(contentsOf: vocab.map { $0.token })
+        trie.append(contentsOf: vocab.map { $0.token.unicodeScalars })
     }
 
     /// Converts a token string to its corresponding numeric ID.
@@ -132,7 +132,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// - Parameter text: The input text to tokenize
     /// - Returns: An array of token strings representing the most probable segmentation
     func tokenize(text: String) -> [String] {
-        let chars = Array(text)
+        let chars = Array(text.unicodeScalars)
         let charsCount = chars.count
         var lattice = TokenLattice(
             chars: chars,
@@ -148,7 +148,7 @@ class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
             let suffix = chars[beginPos...]
             for tokenChars in trie.commonPrefixSearchIterator(suffix) {
                 let tokenLength = tokenChars.count
-                let token = String(tokenChars)
+                let token = String(String.UnicodeScalarView(tokenChars))
                 guard let tokenId = tokensToIds[token as NSString] else { fatalError("Token not in vocab: \(token)") }
                 let tokenScore = vocab[tokenId].score
                 lattice.insert(startOffset: beginPos, length: tokenLength, score: tokenScore, tokenId: tokenId)
diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift
@@ -224,6 +224,24 @@ struct TokenizerTests {
         #expect(inputIds == [1, 29871, 6324])
     }
 
+    /// https://github.com/huggingface/swift-transformers/issues/352 (Bug 3)
+    @Test
+    func t5UnigramKeycapEmoji() async throws {
+        // Keycap "1️⃣" is one grapheme cluster but three Unicode scalars:
+        //   U+0031 DIGIT ONE
+        //   U+FE0F VARIATION SELECTOR-16
+        //   U+20E3 COMBINING ENCLOSING KEYCAP
+        // SentencePiece Unigram lookups operate per scalar, so HF Python tokenizes
+        // this as ▁1 <unk> </s>: `1` matches; VS-16 + keycap fuse into one <unk>.
+        // Pre-fix, the Swift tokenizer iterated by `Character`, so the whole grapheme
+        // became one lattice slot that never matched any vocab entry and the digit was
+        // silently lost (▁ <unk> </s>).
+        let tokenizerOpt = try await AutoTokenizer.from(pretrained: "google-t5/t5-small") as? PreTrainedTokenizer
+        // `#require` fails the test with a recorded issue instead of trapping on a force unwrap.
+        let tokenizer = try #require(tokenizerOpt)
+
+        #expect(tokenizer.encode(text: "1️⃣") == [209, 2, 1])
+    }
     /// https://github.com/huggingface/swift-transformers/issues/352 (Bug 4)
     @Test
     func llama7bCombiningMarks() async throws {