From 8f6a6c2fef438a2e6eee11d4b12ab8218dd3fe94 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 9 Mar 2026 04:53:25 -0700 Subject: [PATCH 1/4] Add view-based tokenizer offset mapping --- Sources/Tokenizers/PostProcessor.swift | 27 +++ Sources/Tokenizers/PreTokenizer.swift | 48 +++++ Sources/Tokenizers/Tokenizer.swift | 182 +++++++++++++++++- .../TokenizersTests/BertTokenizerTests.swift | 23 +++ Tests/TokenizersTests/TokenizerTests.swift | 6 + 5 files changed, 285 insertions(+), 1 deletion(-) diff --git a/Sources/Tokenizers/PostProcessor.swift b/Sources/Tokenizers/PostProcessor.swift index 1f6ce434..fa922fd7 100644 --- a/Sources/Tokenizers/PostProcessor.swift +++ b/Sources/Tokenizers/PostProcessor.swift @@ -212,3 +212,30 @@ class SequenceProcessing: PostProcessor { return currentTokens } } + +struct PostProcessedToken { + let text: String + let offset: Range? +} + +func postProcessWithOffsets(postProcessor: PostProcessor?, tokens: [PostProcessedToken], addSpecialTokens: Bool = true) -> [PostProcessedToken] { + guard let postProcessor else { return tokens } + + let tokenStrings = tokens.map(\.text) + let processedStrings = postProcessor.postProcess(tokens: tokenStrings, tokensPair: nil, addSpecialTokens: addSpecialTokens) + + var spanQueues: [String: [Range?]] = [:] + for token in tokens { + spanQueues[token.text, default: []].append(token.offset) + } + + return processedStrings.map { token in + if var queue = spanQueues[token], !queue.isEmpty { + let offset = queue.removeFirst() + spanQueues[token] = queue + return PostProcessedToken(text: token, offset: offset) + } + // Synthetic/special tokens added by post-processing have no source span. + return PostProcessedToken(text: token, offset: nil) + } +} diff --git a/Sources/Tokenizers/PreTokenizer.swift b/Sources/Tokenizers/PreTokenizer.swift index 07cf5ec7..304c8c77 100644 --- a/Sources/Tokenizers/PreTokenizer.swift +++ b/Sources/Tokenizers/PreTokenizer.swift @@ -286,3 +286,51 @@ class SplitPreTokenizer: PreTokenizer { return pattern.split(text, invert: invert) } } + +struct PreTokenizedText { + let text: String + let offset: Range? +} + +func preTokenizeWithOffsets(preTokenizer: PreTokenizer?, text: String, options: PreTokenizerOptions = [.firstSection], baseOffset: Int = 0) -> [PreTokenizedText] { + guard let preTokenizer else { + return [PreTokenizedText(text: text, offset: baseOffset..<(baseOffset + text.count))] + } + + if let sequence = preTokenizer as? PreTokenizerSequence { + var current = [PreTokenizedText(text: text, offset: baseOffset..<(baseOffset + text.count))] + for nextTokenizer in sequence.preTokenizers { + current = current.flatMap { item in + guard let offset = item.offset else { + return nextTokenizer.preTokenize(text: item.text, options: options).map { PreTokenizedText(text: $0, offset: nil) } + } + return preTokenizeWithOffsets(preTokenizer: nextTokenizer, text: item.text, options: options, baseOffset: offset.lowerBound) + } + } + return current + } + + let ranges: [Range]? + switch preTokenizer { + case let tokenizer as BertPreTokenizer: + ranges = text.ranges(of: tokenizer.re) + case let tokenizer as WhitespacePreTokenizer: + ranges = text.ranges(of: tokenizer.re) + case let tokenizer as PunctuationPreTokenizer: + ranges = text.ranges(of: tokenizer.re) + case let tokenizer as DigitsPreTokenizer: + ranges = text.ranges(of: tokenizer.re) + default: + ranges = nil + } + + if let ranges { + return ranges.map { range in + let lower = baseOffset + text.distance(from: text.startIndex, to: range.lowerBound) + let upper = baseOffset + text.distance(from: text.startIndex, to: range.upperBound) + return PreTokenizedText(text: String(text[range]), offset: lower..? + } + + /// The source text used for encoding. + public let text: String + private let storage: [Element] + + public var startIndex: Int { storage.startIndex } + public var endIndex: Int { storage.endIndex } + + public subscript(position: Int) -> Element { + storage[position] + } + + init(text: String, storage: [Element]) { + self.text = text + self.storage = storage + } +} + /// Errors that can occur during tokenizer operations. public enum TokenizerError: LocalizedError { case missingConfig @@ -125,6 +154,25 @@ func addedTokenAsString(_ addedToken: Config?) -> String? { return addedToken.content.string() } +private func splitByAddedTokensRegex(text: String, regex: NSRegularExpression) -> [(String, Range?)] { + let sections = text.split(by: regex) + var result: [(String, Range?)] = [] + var cursor = text.startIndex + + for section in sections { + if section.isEmpty { continue } + if let range = text.range(of: section, range: cursor.. [String] { tokenize(text: text) @@ -240,6 +288,14 @@ public protocol Tokenizer: Sendable { /// - Returns: An array of token IDs func encode(text: String, addSpecialTokens: Bool) -> [Int] + /// Encodes text into a view of token IDs, token strings, and source spans. + /// + /// - Parameters: + /// - text: The input text to encode + /// - addSpecialTokens: Whether to add special tokens (e.g., BOS, EOS) + /// - Returns: A token encoding view. Spans are `nil` for synthetic/special tokens. + func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView + /// Function call syntax for encoding text. /// /// - Parameters: @@ -424,6 +480,10 @@ public extension Tokenizer { encode(text: text, addSpecialTokens: addSpecialTokens) } + func encodeWithOffsets(text: String) -> TokenEncodingView { + encodeWithOffsets(text: text, addSpecialTokens: true) + } + func decode(tokens: [Int]) -> String { decode(tokens: tokens, skipSpecialTokens: false) } @@ -613,6 +673,109 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { return fused } + private struct TokenWithOffset { + let token: String + let offset: Range? + } + + private func fuseUnknown(_ tokens: [TokenWithOffset]) -> [TokenWithOffset] { + guard fuseUnknownTokens else { return tokens } + let (fused, _) = tokens.reduce((fused: [TokenWithOffset](), previousIsUnknown: false)) { result, token in + var (fused, previousIsUnknown) = result + let isUnknown = model.convertTokenToId(token.token) == model.unknownTokenId + if isUnknown { + if !previousIsUnknown { + fused.append(token) + } else if let last = fused.last { + let merged: Range? + switch (last.offset, token.offset) { + case let (.some(lhs), .some(rhs)): + merged = min(lhs.lowerBound, rhs.lowerBound).., text: String) -> Range? { + guard offset.lowerBound >= 0, offset.upperBound >= offset.lowerBound, offset.upperBound <= text.count else { + return nil + } + guard let lower = text.index(text.startIndex, offsetBy: offset.lowerBound, limitedBy: text.endIndex), + let upper = text.index(text.startIndex, offsetBy: offset.upperBound, limitedBy: text.endIndex) + else { + return nil + } + return lower.. [TokenWithOffset] { + let sections: [(String, Range?)] = + if let regex = addedTokensRegex { + splitByAddedTokensRegex(text: text, regex: regex) + } else { + [(text, 0.. [TokenWithOffset] in + let (sectionText, sectionRange) = item + if addedTokens.contains(sectionText) { + return [TokenWithOffset(token: sectionText, offset: sectionRange)] + } + + let normalized = normalize(sectionText) + let preTokenized: [PreTokenizedText] = + if let sectionRange, normalized.count == sectionText.count { + preTokenizeWithOffsets(preTokenizer: preTokenizer, text: normalized, options: section == 0 ? [.firstSection] : [], baseOffset: sectionRange.lowerBound) + } else { + preTokenize(normalized, options: section == 0 ? [.firstSection] : []).map { PreTokenizedText(text: $0, offset: nil) } + } + + return preTokenized.flatMap { item -> [TokenWithOffset] in + let subtokens = model(item.text) + guard let offset = item.offset else { + return subtokens.map { TokenWithOffset(token: $0, offset: nil) } + } + + if model is BertTokenizer { + return zip(subtokens, bertWordPieceOffsets(tokens: subtokens, tokenOffset: offset)).map { + TokenWithOffset(token: $0.0, offset: $0.1) + } + } + return subtokens.map { TokenWithOffset(token: $0, offset: offset) } + } + } + return fuseUnknown(tokens) + } + + private func bertWordPieceOffsets(tokens: [String], tokenOffset: Range) -> [Range?] { + var cursor = tokenOffset.lowerBound + var result: [Range?] = [] + for token in tokens { + if token == "[UNK]" { + result.append(cursor.. [Int] { - postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { model.convertTokenToId($0)! } + encodeWithOffsets(text: text, addSpecialTokens: addSpecialTokens).map(\.id) } /// Encodes input text into token IDs with special tokens included by default. @@ -651,6 +814,23 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { encode(text: text, addSpecialTokens: true) } + public func encodeWithOffsets(text: String, addSpecialTokens: Bool = true) -> TokenEncodingView { + let tokenized = tokenizeWithOffsets(text: text) + let processed = postProcessWithOffsets( + postProcessor: postProcessor, + tokens: tokenized.map { PostProcessedToken(text: $0.token, offset: $0.offset) }, + addSpecialTokens: addSpecialTokens + ) + let storage = processed.map { token in + TokenEncodingView.Element( + id: model.convertTokenToId(token.text)!, + token: token.text, + span: token.offset.flatMap { rangeInText(from: $0, text: text) } + ) + } + return TokenEncodingView(text: text, storage: storage) + } + /// Decodes token IDs back into human-readable text. /// /// - Parameters: diff --git a/Tests/TokenizersTests/BertTokenizerTests.swift b/Tests/TokenizersTests/BertTokenizerTests.swift index 33493b43..d6ffd3aa 100644 --- a/Tests/TokenizersTests/BertTokenizerTests.swift +++ b/Tests/TokenizersTests/BertTokenizerTests.swift @@ -251,4 +251,27 @@ struct BertTokenizerTests { #expect(decoded == String(expected)) } } + + @Test("BERT offset mapping aligns simple spans") + func bertOffsetMapping() async throws { + let tokenizerOpt = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased") as? PreTrainedTokenizer + #expect(tokenizerOpt != nil) + let tokenizer = tokenizerOpt! + + let input = "John Smith works at Google" + let encoding = tokenizer.encodeWithOffsets(text: input) + #expect(encoding.map(\.id) == tokenizer.encode(text: input)) + #expect(encoding.first?.token == "[CLS]") + #expect(encoding.first?.span == nil) + #expect(encoding.last?.token == "[SEP]") + #expect(encoding.last?.span == nil) + let sourceBacked = encoding.dropFirst().dropLast() + #expect(sourceBacked.allSatisfy { $0.span != nil }) + + if let johnSpan = encoding.first(where: { $0.token == "john" })?.span { + #expect(String(input[johnSpan]) == "John") + } else { + Issue.record("Expected token 'john' with a source span") + } + } } diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 1065a3d2..a7334e7b 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -114,6 +114,12 @@ struct TokenizerTests { #expect(tokenizer.tokenize(text: dataset.text) == dataset.bpe_tokens) #expect(tokenizer.encode(text: dataset.text) == dataset.token_ids) + let encoding = tokenizer.encodeWithOffsets(text: dataset.text) + #expect(encoding.map(\.id) == dataset.token_ids) + #expect(encoding.count == encoding.map(\.token).count) + if let span = encoding.first(where: { $0.span != nil })?.span { + #expect(!String(dataset.text[span]).isEmpty) + } #expect(tokenizer.decode(tokens: dataset.token_ids) == dataset.decoded_text) // Edge cases (if available) From 98089e669f0495a9ea13aab68717def5473448fc Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 9 Mar 2026 05:36:28 -0700 Subject: [PATCH 2/4] Incorporate feedback from review --- Sources/Tokenizers/PostProcessor.swift | 24 ++++++++----- Sources/Tokenizers/Tokenizer.swift | 40 ++++++++++++++++------ Tests/TokenizersTests/TokenizerTests.swift | 2 +- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/Sources/Tokenizers/PostProcessor.swift b/Sources/Tokenizers/PostProcessor.swift index fa922fd7..8cc88f0c 100644 --- a/Sources/Tokenizers/PostProcessor.swift +++ b/Sources/Tokenizers/PostProcessor.swift @@ -224,17 +224,23 @@ func postProcessWithOffsets(postProcessor: PostProcessor?, tokens: [PostProcesse let tokenStrings = tokens.map(\.text) let processedStrings = postProcessor.postProcess(tokens: tokenStrings, tokensPair: nil, addSpecialTokens: addSpecialTokens) - var spanQueues: [String: [Range?]] = [:] - for token in tokens { - spanQueues[token.text, default: []].append(token.offset) - } - + // Map offsets by source token position (not token text) to avoid collisions + // with inserted special tokens and to preserve order after post-processing. + var sourceIndex = 0 return processedStrings.map { token in - if var queue = spanQueues[token], !queue.isEmpty { - let offset = queue.removeFirst() - spanQueues[token] = queue - return PostProcessedToken(text: token, offset: offset) + guard sourceIndex < tokens.count else { + return PostProcessedToken(text: token, offset: nil) + } + + let sourceToken = tokens[sourceIndex] + let isDirectMatch = token == sourceToken.text + let isWhitespaceNormalizedMatch = token.trimmingCharacters(in: .whitespaces) == sourceToken.text.trimmingCharacters(in: .whitespaces) + + if isDirectMatch || isWhitespaceNormalizedMatch { + sourceIndex += 1 + return PostProcessedToken(text: token, offset: sourceToken.offset) } + // Synthetic/special tokens added by post-processing have no source span. return PostProcessedToken(text: token, offset: nil) } diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 117626fb..d066f5c0 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -173,6 +173,20 @@ private func splitByAddedTokensRegex(text: String, regex: NSRegularExpression) - return result } +private func canShareCharacterOffsets(original: String, normalized: String) -> Bool { + guard original.count == normalized.count else { return false } + for (lhs, rhs) in zip(original, normalized) { + if lhs == rhs { continue } + // Allow one-to-one case changes (e.g. "John" -> "john") while rejecting + // transformations that may change grapheme boundary alignment. + if String(lhs).lowercased() == String(rhs) || String(rhs).lowercased() == String(lhs) { + continue + } + return false + } + return true +} + public extension TokenizingModel { func callAsFunction(_ text: String) -> [String] { tokenize(text: text) @@ -708,16 +722,19 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { return fused } - private func rangeInText(from offset: Range, text: String) -> Range? { - guard offset.lowerBound >= 0, offset.upperBound >= offset.lowerBound, offset.upperBound <= text.count else { - return nil + private struct TextIndexLookup { + let indices: [String.Index] + + init(text: String) { + indices = Array(text.indices) + [text.endIndex] } - guard let lower = text.index(text.startIndex, offsetBy: offset.lowerBound, limitedBy: text.endIndex), - let upper = text.index(text.startIndex, offsetBy: offset.upperBound, limitedBy: text.endIndex) - else { - return nil + + func range(from offset: Range) -> Range? { + guard offset.lowerBound >= 0, offset.upperBound >= offset.lowerBound, offset.upperBound < indices.count else { + return nil + } + return indices[offset.lowerBound].. [TokenWithOffset] { @@ -736,7 +753,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { let normalized = normalize(sectionText) let preTokenized: [PreTokenizedText] = - if let sectionRange, normalized.count == sectionText.count { + if let sectionRange, canShareCharacterOffsets(original: sectionText, normalized: normalized) { preTokenizeWithOffsets(preTokenizer: preTokenizer, text: normalized, options: section == 0 ? [.firstSection] : [], baseOffset: sectionRange.lowerBound) } else { preTokenize(normalized, options: section == 0 ? [.firstSection] : []).map { PreTokenizedText(text: $0, offset: nil) } @@ -803,7 +820,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { /// - addSpecialTokens: Whether to add special tokens during post-processing /// - Returns: An array of token IDs public func encode(text: String, addSpecialTokens: Bool = true) -> [Int] { - encodeWithOffsets(text: text, addSpecialTokens: addSpecialTokens).map(\.id) + postProcess(tokenize(text: text), addSpecialTokens: addSpecialTokens).map { model.convertTokenToId($0)! } } /// Encodes input text into token IDs with special tokens included by default. @@ -815,6 +832,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { } public func encodeWithOffsets(text: String, addSpecialTokens: Bool = true) -> TokenEncodingView { + let indexLookup = TextIndexLookup(text: text) let tokenized = tokenizeWithOffsets(text: text) let processed = postProcessWithOffsets( postProcessor: postProcessor, @@ -825,7 +843,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { TokenEncodingView.Element( id: model.convertTokenToId(token.text)!, token: token.text, - span: token.offset.flatMap { rangeInText(from: $0, text: text) } + span: token.offset.flatMap { indexLookup.range(from: $0) } ) } return TokenEncodingView(text: text, storage: storage) diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index a7334e7b..761f5628 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -116,7 +116,7 @@ struct TokenizerTests { #expect(tokenizer.encode(text: dataset.text) == dataset.token_ids) let encoding = tokenizer.encodeWithOffsets(text: dataset.text) #expect(encoding.map(\.id) == dataset.token_ids) - #expect(encoding.count == encoding.map(\.token).count) + #expect(encoding.allSatisfy { tokenizer.convertTokenToId($0.token) == $0.id }) if let span = encoding.first(where: { $0.span != nil })?.span { #expect(!String(dataset.text[span]).isEmpty) } From a3eb39a600b05cd891661d24487b8680d7c0d509 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 9 Mar 2026 06:01:28 -0700 Subject: [PATCH 3/4] Incorporate feedback from second round of review --- Sources/Tokenizers/Tokenizer.swift | 14 ++++++++++++-- Tests/TokenizersTests/BertTokenizerTests.swift | 2 +- Tests/TokenizersTests/TokenizerTests.swift | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index d066f5c0..581fc659 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -23,7 +23,8 @@ public struct TokenEncodingView: RandomAccessCollection, Sendable { public let id: Int /// Token text. public let token: String - /// Span in the original input text. Special or synthetic tokens have `nil`. + /// Span in the original input text when available. + /// Special/synthetic tokens or tokens without reliable offset mapping have `nil`. public let span: Range? } @@ -307,7 +308,8 @@ public protocol Tokenizer: Sendable { /// - Parameters: /// - text: The input text to encode /// - addSpecialTokens: Whether to add special tokens (e.g., BOS, EOS) - /// - Returns: A token encoding view. Spans are `nil` for synthetic/special tokens. + /// - Returns: A token encoding view. Spans are `nil` for synthetic/special tokens + /// or when offset mapping is unavailable. func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView /// Function call syntax for encoding text. @@ -494,6 +496,14 @@ public extension Tokenizer { encode(text: text, addSpecialTokens: addSpecialTokens) } + func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView { + let ids = encode(text: text, addSpecialTokens: addSpecialTokens) + let elements = ids.map { id in + TokenEncodingView.Element(id: id, token: convertIdToToken(id) ?? "", span: nil) + } + return TokenEncodingView(text: text, storage: elements) + } + func encodeWithOffsets(text: String) -> TokenEncodingView { encodeWithOffsets(text: text, addSpecialTokens: true) } diff --git a/Tests/TokenizersTests/BertTokenizerTests.swift b/Tests/TokenizersTests/BertTokenizerTests.swift index d6ffd3aa..908ccf3e 100644 --- a/Tests/TokenizersTests/BertTokenizerTests.swift +++ b/Tests/TokenizersTests/BertTokenizerTests.swift @@ -269,7 +269,7 @@ struct BertTokenizerTests { #expect(sourceBacked.allSatisfy { $0.span != nil }) if let johnSpan = encoding.first(where: { $0.token == "john" })?.span { - #expect(String(input[johnSpan]) == "John") + #expect(String(encoding.text[johnSpan]) == "John") } else { Issue.record("Expected token 'john' with a source span") } diff --git a/Tests/TokenizersTests/TokenizerTests.swift b/Tests/TokenizersTests/TokenizerTests.swift index 761f5628..7f9ddce5 100644 --- a/Tests/TokenizersTests/TokenizerTests.swift +++ b/Tests/TokenizersTests/TokenizerTests.swift @@ -118,7 +118,7 @@ struct TokenizerTests { #expect(encoding.map(\.id) == dataset.token_ids) #expect(encoding.allSatisfy { tokenizer.convertTokenToId($0.token) == $0.id }) if let span = encoding.first(where: { $0.span != nil })?.span { - #expect(!String(dataset.text[span]).isEmpty) + #expect(!String(encoding.text[span]).isEmpty) } #expect(tokenizer.decode(tokens: dataset.token_ids) == dataset.decoded_text) From 95692c941a8edb91d99dc95f89aa3df842abb4f6 Mon Sep 17 00:00:00 2001 From: Mattt Zmuda Date: Mon, 9 Mar 2026 06:24:44 -0700 Subject: [PATCH 4/4] Replace fallback-with-failure approach with new OffsetMappingTokenizer protocol --- Sources/Tokenizers/Tokenizer.swift | 38 ++++++++++++------------------ 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 581fc659..6322b2d2 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -303,15 +303,6 @@ public protocol Tokenizer: Sendable { /// - Returns: An array of token IDs func encode(text: String, addSpecialTokens: Bool) -> [Int] - /// Encodes text into a view of token IDs, token strings, and source spans. - /// - /// - Parameters: - /// - text: The input text to encode - /// - addSpecialTokens: Whether to add special tokens (e.g., BOS, EOS) - /// - Returns: A token encoding view. Spans are `nil` for synthetic/special tokens - /// or when offset mapping is unavailable. - func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView - /// Function call syntax for encoding text. /// /// - Parameters: @@ -466,6 +457,19 @@ public protocol Tokenizer: Sendable { ) throws -> [Int] } +/// A tokenizer that can return source spans for encoded tokens. +public protocol OffsetMappingTokenizer: Tokenizer { + /// Encodes text into a view of token IDs, token strings, and source spans. + /// + /// - Parameters: + /// - text: The input text to encode + /// - addSpecialTokens: Whether to add special tokens (e.g., BOS, EOS) + /// - Returns: A token encoding view. + /// Spans are `nil` for synthetic/special tokens + /// or when offset mapping is unavailable. + func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView +} + extension Tokenizer { public var hasChatTemplate: Bool { false } @@ -496,18 +500,6 @@ public extension Tokenizer { encode(text: text, addSpecialTokens: addSpecialTokens) } - func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView { - let ids = encode(text: text, addSpecialTokens: addSpecialTokens) - let elements = ids.map { id in - TokenEncodingView.Element(id: id, token: convertIdToToken(id) ?? "", span: nil) - } - return TokenEncodingView(text: text, storage: elements) - } - - func encodeWithOffsets(text: String) -> TokenEncodingView { - encodeWithOffsets(text: text, addSpecialTokens: true) - } - func decode(tokens: [Int]) -> String { decode(tokens: tokens, skipSpecialTokens: false) } @@ -537,7 +529,7 @@ let specialTokenAttributes: [String] = [ /// This class provides a complete tokenizer implementation that can be initialized from /// Hugging Face Hub configuration files and supports all standard tokenization operations /// including chat template application, normalization, pre-tokenization, and post-processing. -public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { +public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer, OffsetMappingTokenizer { let model: TokenizingModel public var bosToken: String? { model.bosToken } @@ -780,7 +772,7 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer { TokenWithOffset(token: $0.0, offset: $0.1) } } - return subtokens.map { TokenWithOffset(token: $0, offset: offset) } + return subtokens.map { TokenWithOffset(token: $0, offset: nil) } } } return fuseUnknown(tokens)