33 changes: 33 additions & 0 deletions Sources/Tokenizers/PostProcessor.swift
@@ -212,3 +212,36 @@ class SequenceProcessing: PostProcessor
return currentTokens
}
}

struct PostProcessedToken {
let text: String
let offset: Range<Int>?
}

func postProcessWithOffsets(postProcessor: PostProcessor?, tokens: [PostProcessedToken], addSpecialTokens: Bool = true) -> [PostProcessedToken] {
guard let postProcessor else { return tokens }

let tokenStrings = tokens.map(\.text)
let processedStrings = postProcessor.postProcess(tokens: tokenStrings, tokensPair: nil, addSpecialTokens: addSpecialTokens)

// Walk the source tokens in order, consuming the next source offset whenever
// the processed token's text matches; tokens inserted by post-processing get no source span.
var sourceIndex = 0
return processedStrings.map { token in
guard sourceIndex < tokens.count else {
return PostProcessedToken(text: token, offset: nil)
}

let sourceToken = tokens[sourceIndex]
let isDirectMatch = token == sourceToken.text
let isWhitespaceNormalizedMatch = token.trimmingCharacters(in: .whitespaces) == sourceToken.text.trimmingCharacters(in: .whitespaces)

if isDirectMatch || isWhitespaceNormalizedMatch {
sourceIndex += 1
return PostProcessedToken(text: token, offset: sourceToken.offset)
}

// Synthetic/special tokens added by post-processing have no source span.
return PostProcessedToken(text: token, offset: nil)
}
Copilot AI commented on lines +221 to +246 (Mar 9, 2026); mattt marked this conversation as resolved:
postProcessWithOffsets advances sourceIndex by comparing token text (token == sourceToken.text / trimmed match). This can mis-assign spans when the post-processor inserts special tokens whose string happens to equal a real source token (e.g. user text contains "[CLS]" / "[SEP]" as added tokens). In that case the inserted special token can incorrectly consume the first source token’s offset and shift the rest. A more reliable approach is to drive offset assignment from the post-processor configuration/structure (e.g. for BertProcessing/RobertaProcessing insert known specials with nil offsets at fixed positions; for TemplateProcessing walk the template and consume offsets only for Sequence items).

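A minimal sketch of the template-driven alternative suggested above. The TemplatePiece enum and the walk below are hypothetical simplifications for illustration, not the library's actual TemplateProcessing internals:

// Hypothetical template representation: literal special tokens plus a
// placeholder for the user's token sequence.
enum TemplatePiece {
    case special(String) // e.g. "[CLS]", "[SEP]"
    case sequence
}

// Walking the template makes each token's provenance explicit: inserted
// specials always get nil offsets, so user text that happens to spell
// "[CLS]" cannot be mistaken for an inserted special token.
func assignOffsets(template: [TemplatePiece], tokens: [PostProcessedToken]) -> [PostProcessedToken] {
    var result: [PostProcessedToken] = []
    for piece in template {
        switch piece {
        case .special(let text):
            result.append(PostProcessedToken(text: text, offset: nil))
        case .sequence:
            result.append(contentsOf: tokens)
        }
    }
    return result
}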
}
48 changes: 48 additions & 0 deletions Sources/Tokenizers/PreTokenizer.swift
@@ -286,3 +286,51 @@ class SplitPreTokenizer: PreTokenizer {
return pattern.split(text, invert: invert)
}
}

struct PreTokenizedText {
let text: String
let offset: Range<Int>?
}

func preTokenizeWithOffsets(preTokenizer: PreTokenizer?, text: String, options: PreTokenizerOptions = [.firstSection], baseOffset: Int = 0) -> [PreTokenizedText] {
guard let preTokenizer else {
return [PreTokenizedText(text: text, offset: baseOffset..<(baseOffset + text.count))]
}

if let sequence = preTokenizer as? PreTokenizerSequence {
var current = [PreTokenizedText(text: text, offset: baseOffset..<(baseOffset + text.count))]
for nextTokenizer in sequence.preTokenizers {
current = current.flatMap { item in
guard let offset = item.offset else {
return nextTokenizer.preTokenize(text: item.text, options: options).map { PreTokenizedText(text: $0, offset: nil) }
}
return preTokenizeWithOffsets(preTokenizer: nextTokenizer, text: item.text, options: options, baseOffset: offset.lowerBound)
}
}
return current
}

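// Pre-tokenizers that expose their splitting regex can report exact
// source ranges for each match.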
let ranges: [Range<String.Index>]?
switch preTokenizer {
case let tokenizer as BertPreTokenizer:
ranges = text.ranges(of: tokenizer.re)
case let tokenizer as WhitespacePreTokenizer:
ranges = text.ranges(of: tokenizer.re)
case let tokenizer as PunctuationPreTokenizer:
ranges = text.ranges(of: tokenizer.re)
case let tokenizer as DigitsPreTokenizer:
ranges = text.ranges(of: tokenizer.re)
default:
ranges = nil
}

if let ranges {
return ranges.map { range in
let lower = baseOffset + text.distance(from: text.startIndex, to: range.lowerBound)
let upper = baseOffset + text.distance(from: text.startIndex, to: range.upperBound)
return PreTokenizedText(text: String(text[range]), offset: lower..<upper)
}
}

return preTokenizer.preTokenize(text: text, options: options).map { PreTokenizedText(text: $0, offset: nil) }
}
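For intuition, an assumed example (supposing a whitespace pre-tokenizer whose regex matches runs of non-whitespace characters):

// preTokenizeWithOffsets(preTokenizer: whitespace, text: "Hello world", baseOffset: 0)
// would yield:
//   PreTokenizedText(text: "Hello", offset: 0..<5)
//   PreTokenizedText(text: "world", offset: 6..<11)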
202 changes: 201 additions & 1 deletion Sources/Tokenizers/Tokenizer.swift
@@ -15,6 +15,36 @@ public typealias Message = [String: any Sendable]
/// A type alias for tool specifications used in chat templating.
public typealias ToolSpec = [String: any Sendable]

/// A view over encoded tokens and their source spans.
public struct TokenEncodingView: RandomAccessCollection, Sendable {
/// A token in an encoded sequence.
public struct Element: Sendable {
/// Token ID.
public let id: Int
/// Token text.
public let token: String
/// Span in the original input text when available.
/// Special/synthetic tokens or tokens without reliable offset mapping have `nil`.
public let span: Range<String.Index>?
}

/// The source text used for encoding.
public let text: String
private let storage: [Element]

public var startIndex: Int { storage.startIndex }
public var endIndex: Int { storage.endIndex }

public subscript(position: Int) -> Element {
storage[position]
}

init(text: String, storage: [Element]) {
self.text = text
self.storage = storage
}
}
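A usage sketch for the view (assuming a tokenizer value adopting the OffsetMappingTokenizer protocol introduced later in this diff):

func printSpans(using tokenizer: any OffsetMappingTokenizer) {
    let encoding = tokenizer.encodeWithOffsets(text: "John Smith works at Google", addSpecialTokens: true)
    for element in encoding {
        // Special tokens such as [CLS]/[SEP] carry no span.
        guard let span = element.span else { continue }
        print(element.id, element.token, encoding.text[span])
    }
}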

/// Errors that can occur during tokenizer operations.
public enum TokenizerError: LocalizedError {
case missingConfig
@@ -125,6 +155,39 @@ func addedTokenAsString(_ addedToken: Config?) -> String? {
return addedToken.content.string()
}

private func splitByAddedTokensRegex(text: String, regex: NSRegularExpression) -> [(String, Range<Int>?)] {
let sections = text.split(by: regex)
var result: [(String, Range<Int>?)] = []
var cursor = text.startIndex

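// Scan forward from the previous match so that repeated section
// strings map to successive occurrences in the source text.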
for section in sections {
if section.isEmpty { continue }
if let range = text.range(of: section, range: cursor..<text.endIndex) {
let lower = text.distance(from: text.startIndex, to: range.lowerBound)
let upper = text.distance(from: text.startIndex, to: range.upperBound)
result.append((section, lower..<upper))
cursor = range.upperBound
} else {
result.append((section, nil))
}
}
return result
}

private func canShareCharacterOffsets(original: String, normalized: String) -> Bool {
guard original.count == normalized.count else { return false }
for (lhs, rhs) in zip(original, normalized) {
if lhs == rhs { continue }
// Allow one-to-one case changes (e.g. "John" -> "john") while rejecting
// transformations that may change grapheme boundary alignment.
if String(lhs).lowercased() == String(rhs) || String(rhs).lowercased() == String(lhs) {
continue
}
return false
}
return true
}
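Illustrative results, assumed to be consistent with the checks above:

// canShareCharacterOffsets(original: "John Smith", normalized: "john smith") -> true (one-to-one case change)
// canShareCharacterOffsets(original: "café", normalized: "cafe") -> false (accent stripped; characters differ)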

public extension TokenizingModel {
func callAsFunction(_ text: String) -> [String] {
tokenize(text: text)
@@ -394,6 +457,19 @@ public protocol Tokenizer: Sendable {
) throws -> [Int]
}

/// A tokenizer that can return source spans for encoded tokens.
public protocol OffsetMappingTokenizer: Tokenizer {
/// Encodes text into a view of token IDs, token strings, and source spans.
///
/// - Parameters:
/// - text: The input text to encode
/// - addSpecialTokens: Whether to add special tokens (e.g., BOS, EOS)
/// - Returns: A token encoding view.
/// Spans are `nil` for synthetic/special tokens
/// or when offset mapping is unavailable.
func encodeWithOffsets(text: String, addSpecialTokens: Bool) -> TokenEncodingView
}
Copilot AI commented on lines +460 to +471 (Mar 9, 2026):
encodeWithOffsets is only available via OffsetMappingTokenizer, but AutoTokenizer.from(...) returns Tokenizer (see same file). This means most callers won’t be able to access offset mapping without a downcast, which undermines the public API goal from #307. Consider making encodeWithOffsets a requirement on Tokenizer (with a default implementation returning nil spans when unsupported), or change the AutoTokenizer factory return type to OffsetMappingTokenizer and/or add a bridging API so callers don’t need to know the concrete type. Also, since the protocol requirement lacks a default arg, any OffsetMappingTokenizer values can’t call encodeWithOffsets(text:) without passing addSpecialTokens; adding an extension overload with addSpecialTokens: Bool = true would improve ergonomics.

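A sketch of the ergonomics fix the comment proposes (an assumption, not part of this diff):

extension OffsetMappingTokenizer {
    /// Convenience overload so callers can omit addSpecialTokens.
    public func encodeWithOffsets(text: String) -> TokenEncodingView {
        encodeWithOffsets(text: text, addSpecialTokens: true)
    }
}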

extension Tokenizer {
public var hasChatTemplate: Bool { false }

@@ -453,7 +529,7 @@ let specialTokenAttributes: [String] = [
/// This class provides a complete tokenizer implementation that can be initialized from
/// Hugging Face Hub configuration files and supports all standard tokenization operations
/// including chat template application, normalization, pre-tokenization, and post-processing.
public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer, OffsetMappingTokenizer {
let model: TokenizingModel

public var bosToken: String? { model.bosToken }
@@ -613,6 +689,112 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
return fused
}

private struct TokenWithOffset {
let token: String
let offset: Range<Int>?
}

private func fuseUnknown(_ tokens: [TokenWithOffset]) -> [TokenWithOffset] {
guard fuseUnknownTokens else { return tokens }
let (fused, _) = tokens.reduce((fused: [TokenWithOffset](), previousIsUnknown: false)) { result, token in
var (fused, previousIsUnknown) = result
let isUnknown = model.convertTokenToId(token.token) == model.unknownTokenId
if isUnknown {
if !previousIsUnknown {
fused.append(token)
} else if let last = fused.last {
let merged: Range<Int>?
switch (last.offset, token.offset) {
case let (.some(lhs), .some(rhs)):
merged = min(lhs.lowerBound, rhs.lowerBound)..<max(lhs.upperBound, rhs.upperBound)
case let (.some(lhs), .none):
merged = lhs
case let (.none, .some(rhs)):
merged = rhs
case (.none, .none):
merged = nil
}
fused[fused.count - 1] = TokenWithOffset(token: last.token, offset: merged)
}
} else {
fused.append(token)
}
return (fused, isUnknown)
}
return fused
}
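// Illustration (assumed): adjacent unknown tokens with offsets 0..<3
// and 4..<7 collapse into the first token, with the merged offset 0..<7.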

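// Precomputing every String.Index once makes each offset-to-index
// conversion O(1) rather than O(n) per token.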
private struct TextIndexLookup {
let indices: [String.Index]

init(text: String) {
indices = Array(text.indices) + [text.endIndex]
}

func range(from offset: Range<Int>) -> Range<String.Index>? {
guard offset.lowerBound >= 0, offset.upperBound >= offset.lowerBound, offset.upperBound < indices.count else {
return nil
}
return indices[offset.lowerBound]..<indices[offset.upperBound]
}
}

private func tokenizeWithOffsets(text: String) -> [TokenWithOffset] {
let sections: [(String, Range<Int>?)] =
if let regex = addedTokensRegex {
splitByAddedTokensRegex(text: text, regex: regex)
} else {
[(text, 0..<text.count)]
}

let tokens = sections.enumerated().flatMap { section, item -> [TokenWithOffset] in
let (sectionText, sectionRange) = item
if addedTokens.contains(sectionText) {
return [TokenWithOffset(token: sectionText, offset: sectionRange)]
}

let normalized = normalize(sectionText)
let preTokenized: [PreTokenizedText] =
if let sectionRange, canShareCharacterOffsets(original: sectionText, normalized: normalized) {
preTokenizeWithOffsets(preTokenizer: preTokenizer, text: normalized, options: section == 0 ? [.firstSection] : [], baseOffset: sectionRange.lowerBound)
} else {
preTokenize(normalized, options: section == 0 ? [.firstSection] : []).map { PreTokenizedText(text: $0, offset: nil) }
}
mattt marked this conversation as resolved.

return preTokenized.flatMap { item -> [TokenWithOffset] in
let subtokens = model(item.text)
guard let offset = item.offset else {
return subtokens.map { TokenWithOffset(token: $0, offset: nil) }
}

if model is BertTokenizer {
return zip(subtokens, bertWordPieceOffsets(tokens: subtokens, tokenOffset: offset)).map {
TokenWithOffset(token: $0.0, offset: $0.1)
}
}
return subtokens.map { TokenWithOffset(token: $0, offset: nil) }
}
}
return fuseUnknown(tokens)
}

private func bertWordPieceOffsets(tokens: [String], tokenOffset: Range<Int>) -> [Range<Int>?] {
var cursor = tokenOffset.lowerBound
var result: [Range<Int>?] = []
for token in tokens {
if token == "[UNK]" {
result.append(cursor..<tokenOffset.upperBound)
cursor = tokenOffset.upperBound
continue
}
let raw = token.hasPrefix("##") ? String(token.dropFirst(2)) : token
let next = min(cursor + raw.count, tokenOffset.upperBound)
result.append(cursor..<next)
cursor = next
}
return result
}
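// Worked example (assumed): tokens ["play", "##ing"] with tokenOffset
// 0..<7 map to [0..<4, 4..<7]; an "[UNK]" consumes the remainder of
// the word's span.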

/// Tokenizes input text using the configured normalization and pre-tokenization steps.
///
/// - Parameter text: The input text to tokenize
@@ -651,6 +833,24 @@ public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
encode(text: text, addSpecialTokens: true)
}

public func encodeWithOffsets(text: String, addSpecialTokens: Bool = true) -> TokenEncodingView {
let indexLookup = TextIndexLookup(text: text)
let tokenized = tokenizeWithOffsets(text: text)
let processed = postProcessWithOffsets(
postProcessor: postProcessor,
tokens: tokenized.map { PostProcessedToken(text: $0.token, offset: $0.offset) },
addSpecialTokens: addSpecialTokens
)
let storage = processed.map { token in
TokenEncodingView.Element(
id: model.convertTokenToId(token.text)!,
token: token.text,
span: token.offset.flatMap { indexLookup.range(from: $0) }
)
}
return TokenEncodingView(text: text, storage: storage)
}

/// Decodes token IDs back into human-readable text.
///
/// - Parameters:
23 changes: 23 additions & 0 deletions Tests/TokenizersTests/BertTokenizerTests.swift
@@ -251,4 +251,27 @@ struct BertTokenizerTests {
#expect(decoded == String(expected))
}
}

@Test("BERT offset mapping aligns simple spans")
func bertOffsetMapping() async throws {
let anyTokenizer = try await AutoTokenizer.from(pretrained: "google-bert/bert-base-uncased")
let tokenizer = try #require(anyTokenizer as? PreTrainedTokenizer)

let input = "John Smith works at Google"
let encoding = tokenizer.encodeWithOffsets(text: input)
#expect(encoding.map(\.id) == tokenizer.encode(text: input))
#expect(encoding.first?.token == "[CLS]")
#expect(encoding.first?.span == nil)
#expect(encoding.last?.token == "[SEP]")
#expect(encoding.last?.span == nil)
let sourceBacked = encoding.dropFirst().dropLast()
#expect(sourceBacked.allSatisfy { $0.span != nil })

if let johnSpan = encoding.first(where: { $0.token == "john" })?.span {
#expect(String(encoding.text[johnSpan]) == "John")
} else {
Issue.record("Expected token 'john' with a source span")
}
}
}
6 changes: 6 additions & 0 deletions Tests/TokenizersTests/TokenizerTests.swift
@@ -114,6 +114,12 @@ struct TokenizerTests {

#expect(tokenizer.tokenize(text: dataset.text) == dataset.bpe_tokens)
#expect(tokenizer.encode(text: dataset.text) == dataset.token_ids)
let encoding = tokenizer.encodeWithOffsets(text: dataset.text)
#expect(encoding.map(\.id) == dataset.token_ids)
#expect(encoding.allSatisfy { tokenizer.convertTokenToId($0.token) == $0.id })
if let span = encoding.first(where: { $0.span != nil })?.span {
#expect(!String(encoding.text[span]).isEmpty)
}
mattt marked this conversation as resolved.
#expect(tokenizer.decode(tokens: dataset.token_ids) == dataset.decoded_text)

// Edge cases (if available)