diff --git a/.gitignore b/.gitignore index fe9534bc..739846c6 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ DerivedData/ .idea .index-build *.out +# Local Python venv used by Tools/generate_tokenizer_baselines.py +.venv-tokenizer-baselines/ + +Tools/__pycache__/ diff --git a/Tests/TokenizersTests/MultilingualConformanceTests.swift b/Tests/TokenizersTests/MultilingualConformanceTests.swift new file mode 100644 index 00000000..4caa021c --- /dev/null +++ b/Tests/TokenizersTests/MultilingualConformanceTests.swift @@ -0,0 +1,294 @@ +// +// MultilingualConformanceTests.swift +// +// Byte-identical parity tests against HuggingFace Python `transformers`. +// +// Baselines under `Resources/MultilingualConformance/baselines/` are produced +// by `Tools/generate_tokenizer_baselines.py` and treated as the authoritative +// reference. Each Swift tokenizer kernel is expected to produce identical +// `input_ids` for every input in the corpus. +// +// Inputs known to diverge today because of bugs being tracked upstream are +// enumerated in `expectedDivergences` below with a reference to the relevant +// issue or PR, so the target lands green while the work is in flight. Any +// divergence that isn't in that list is a hard failure (regression catch). +// Any input listed there that now matches Python emits a printed hint inviting +// removal of the entry — but doesn't fail the test, so the green CI signal +// isn't broken by an upstream improvement. +// +// Adding a model: append to `kernels`, append to `MODELS` in the Python +// script, and re-run it. Adding an input: append to `inputs.json` and re-run +// the script. +// + +import Foundation +import Testing + +@testable import Hub +@testable import Models +@testable import Tokenizers + +private let downloadDestination: URL = { + let base = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first! + return base.appending(component: "huggingface-tests") +}() + +private let hubApiForTests = HubApi(downloadBase: downloadDestination) + +// MARK: - Fixtures + +private struct CorpusInput: Decodable { + let id: String + let category: String + let text: String +} + +private struct Corpus: Decodable { + let schema_version: Int + let description: String + let inputs: [CorpusInput] +} + +private struct BaselineEntry: Decodable { + let input_ids: [Int] + let tokens: [String] + // The Python generator also emits `decoded_with_special` / `decoded_skip_special` + // for future use; they are intentionally not decoded here because decoder-side + // parity has its own failure modes that deserve a dedicated test (and at least + // one known-buggy path — see WordPieceDecoder's empty-tokens `tokens.first!`). +} + +private struct Baseline: Decodable { + let model_id: String + let transformers_version: String + let entries: [String: BaselineEntry] +} + +private enum FixtureError: Error, CustomStringConvertible { + case missingResource(String) + + var description: String { + switch self { + case .missingResource(let name): "missing resource: \(name)" + } + } +} + +// Resource lookup deliberately doesn't use the `subdirectory:` parameter of +// `Bundle.module.url(forResource:withExtension:subdirectory:)`. SPM's +// `.process("Resources")` does not always preserve the directory layout in a way +// that subdirectory lookup can rely on, but flat lookup by basename works +// because every fixture filename below is unique within the bundle (the corpus +// is named `inputs.json` and every baseline uses a slugified model id). 
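+//
+// For illustration, the slug mapping assumed by `loadBaseline` below (it mirrors
+// the baseline filenames added in this PR: `/` in the model id becomes `__`).
+// `baselineSlug(for:)` is only a sketch, not a helper this file defines:
+//
+//     func baselineSlug(for modelId: String) -> String {
+//         modelId.replacingOccurrences(of: "/", with: "__")
+//     }
+//
+//     baselineSlug(for: "BAAI/bge-small-en-v1.5")  // "BAAI__bge-small-en-v1.5"
+//     baselineSlug(for: "google-t5/t5-small")      // "google-t5__t5-small"
+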
+private func loadCorpus() throws -> Corpus {
+    guard let url = Bundle.module.url(forResource: "inputs", withExtension: "json") else {
+        throw FixtureError.missingResource("inputs.json")
+    }
+    return try JSONDecoder().decode(Corpus.self, from: try Data(contentsOf: url))
+}
+
+private func loadBaseline(_ slug: String) throws -> Baseline {
+    // Slugified model ids replace `/` with `__` so they're valid as filesystem and bundle names.
+    guard let url = Bundle.module.url(forResource: slug, withExtension: "json") else {
+        throw FixtureError.missingResource("\(slug).json")
+    }
+    return try JSONDecoder().decode(Baseline.self, from: try Data(contentsOf: url))
+}
+
+private func makeTokenizer(_ modelId: String) async throws -> Tokenizer {
+    let config = LanguageModelConfigurationFromHub(modelName: modelId, hubApi: hubApiForTests)
+    guard let tokenizerConfig = try await config.tokenizerConfig else {
+        Issue.record("Missing tokenizer config for \(modelId)")
+        throw FixtureError.missingResource("tokenizer_config.json for \(modelId)")
+    }
+    let tokenizerData = try await config.tokenizerData
+    return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
+}
+
+// MARK: - Diff formatting
+
+private func formatTokenDiff(
+    expected: [Int],
+    actual: [Int],
+    expectedTokens: [String],
+    actualTokens: [String]
+) -> String {
+    let common = min(expected.count, actual.count)
+    var firstDiff = common
+    for i in 0..<common where expected[i] != actual[i] {
+        firstDiff = i
+        break
+    }
+    // Show a short window around the first mismatch so failures stay readable
+    // even for long inputs.
+    let start = max(0, firstDiff - 3)
+    func window(_ ids: [Int], _ tokens: [String]) -> String {
+        let end = min(ids.count, tokens.count, firstDiff + 4)
+        guard start < end else { return "(exhausted)" }
+        return (start..<end).map { "\(ids[$0])='\(tokens[$0])'" }.joined(separator: " ")
+    }
+    return """
+    first mismatch at index \(firstDiff) (expected \(expected.count) ids, got \(actual.count))
+      expected: \(window(expected, expectedTokens))
+      actual:   \(window(actual, actualTokens))
+    """
+}
+
+// MARK: - Kernels
+
+/// A tokenizer under test: its Hub model id plus the basename of its baseline file.
+private struct Kernel: Sendable, CustomStringConvertible {
+    let modelId: String
+    let baselineSlug: String
+    var description: String { modelId }
+}
+
+/// Keep in sync with `MODELS` in `Tools/generate_tokenizer_baselines.py`.
+private let kernels: [Kernel] = [
+    Kernel(modelId: "BAAI/bge-small-en-v1.5", baselineSlug: "BAAI__bge-small-en-v1.5"),
+    Kernel(modelId: "Qwen/Qwen2.5-0.5B", baselineSlug: "Qwen__Qwen2.5-0.5B"),
+    Kernel(modelId: "TinyLlama/TinyLlama-1.1B-Chat-v1.0", baselineSlug: "TinyLlama__TinyLlama-1.1B-Chat-v1.0"),
+    Kernel(modelId: "google-t5/t5-small", baselineSlug: "google-t5__t5-small"),
+]
+
+/// Inputs known to diverge from Python today, keyed by model id. Each entry should
+/// carry a comment referencing the tracking issue or PR so it can be dropped once
+/// the upstream fix lands.
+private let expectedDivergences: [String: Set<String>] = [:]
+
+// MARK: - Tests
+
+struct MultilingualConformanceTests {
+    /// Byte-identical `input_ids` parity with the Python baseline for every corpus
+    /// input, modulo the divergences enumerated in `expectedDivergences`.
+    @Test("Tokenization matches Python baselines", arguments: kernels)
+    fileprivate func matchesPythonBaselines(kernel: Kernel) async throws {
+        let corpus = try loadCorpus()
+        let baseline = try loadBaseline(kernel.baselineSlug)
+        let tokenizer = try await makeTokenizer(kernel.modelId)
+        let expectedToDiverge = expectedDivergences[kernel.modelId] ?? []
+
+        var unexpectedDivergences: [(input: CorpusInput, message: String)] = []
+        var unexpectedMatches: [String] = []
+
+        for input in corpus.inputs {
+            // A missing baseline entry is reported by `baselinesCoverCorpus`; skip it here.
+            guard let expected = baseline.entries[input.id] else { continue }
+
+            let actualIds = tokenizer.encode(text: input.text)
+            if actualIds == expected.input_ids {
+                if expectedToDiverge.contains(input.id) {
+                    unexpectedMatches.append(input.id)
+                }
+                continue
+            }
+            if expectedToDiverge.contains(input.id) {
+                continue // Known divergence, tracked upstream.
+            }
+
+            let actualTokens = actualIds.map { tokenizer.convertIdToToken($0) ?? "<none>" }
+            let message = formatTokenDiff(
+                expected: expected.input_ids,
+                actual: actualIds,
+                expectedTokens: expected.tokens,
+                actualTokens: actualTokens
+            )
+            unexpectedDivergences.append((input, "[\(input.category)] id=\(input.id) text=\(input.text.debugDescription)\n\(message)"))
+        }
+
+        // Unexpected divergence is a hard failure: either swift-transformers regressed,
+        // or the corpus / baseline added a case that wasn't classified yet.
+        for failure in unexpectedDivergences {
+            Issue.record("\(failure.message)")
+        }
+        if !unexpectedDivergences.isEmpty {
+            Issue.record("\(kernel.modelId): \(unexpectedDivergences.count) unexpected divergence(s) from Python `transformers` \(baseline.transformers_version). Either swift-transformers regressed or `expectedDivergences` needs a new entry.")
+        }
+
+        // Unexpected match is informational: an upstream fix has landed and the entry
+        // should be dropped from `expectedDivergences`. Printed but does NOT fail the
+        // test, so freshly merged improvements don't break CI; the message surfaces
+        // when running locally and is the trigger to clean up the expected list.
+        if !unexpectedMatches.isEmpty {
+            print(
+                "[MultilingualConformance] \(kernel.modelId): \(unexpectedMatches.count) input(s) now match Python — " +
+                    "please remove from `expectedDivergences` in MultilingualConformanceTests.swift: " +
+                    unexpectedMatches.sorted().joined(separator: ", ")
+            )
+        }
+    }
+
+    /// Sanity check: the corpus itself should not regress in shape or schema between
+    /// edits. Caught here so a malformed inputs.json fails fast rather than silently
+    /// skipping cases inside the parity test.
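+    ///
+    /// For reference, the corpus shape the `Corpus` decodables above assume; the
+    /// values here are illustrative, not copied from the real `inputs.json`:
+    ///
+    ///     {
+    ///       "schema_version": 1,
+    ///       "description": "Multilingual tokenizer conformance corpus",
+    ///       "inputs": [
+    ///         { "id": "ascii_simple", "category": "ascii", "text": "The quick brown fox ..." }
+    ///       ]
+    ///     }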
+    @Test("Corpus is well-formed")
+    func corpusIsWellFormed() throws {
+        let corpus = try loadCorpus()
+        #expect(corpus.schema_version == 1)
+        #expect(!corpus.inputs.isEmpty)
+
+        var seen = Set<String>()
+        for input in corpus.inputs {
+            #expect(!input.id.isEmpty, "input id must not be empty")
+            #expect(!seen.contains(input.id), "duplicate input id: \(input.id)")
+            seen.insert(input.id)
+            #expect(!input.text.isEmpty, "input text must not be empty (id: \(input.id))")
+        }
+    }
+
+    /// Sanity check: every kernel must have a baseline file covering every input id.
+    @Test("Baselines cover the corpus", arguments: kernels)
+    fileprivate func baselinesCoverCorpus(kernel: Kernel) throws {
+        let corpus = try loadCorpus()
+        let baseline = try loadBaseline(kernel.baselineSlug)
+        let corpusIds = Set(corpus.inputs.map(\.id))
+        let baselineIds = Set(baseline.entries.keys)
+        let missing = corpusIds.subtracting(baselineIds)
+        let extra = baselineIds.subtracting(corpusIds)
+        #expect(missing.isEmpty, "baseline missing entries: \(missing.sorted())")
+        #expect(extra.isEmpty, "baseline has stale entries not in corpus: \(extra.sorted())")
+    }
+}
diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json
new file mode 100644
index 00000000..f781cc00
--- /dev/null
+++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json
@@ -0,0 +1,1474 @@
+{
+  "model_id": "BAAI/bge-small-en-v1.5",
+  "transformers_version": "4.57.1",
+  "entries": {
+    "ascii_simple": {
+      "input_ids": [
+        101,
+        1996,
+        4248,
+        2829,
+        4419,
+        14523,
+        2058,
+        1996,
+        13971,
+        3899,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "the",
+        "quick",
+        "brown",
+        "fox",
+        "jumps",
+        "over",
+        "the",
+        "lazy",
+        "dog",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] the quick brown fox jumps over the lazy dog. [SEP]",
+      "decoded_skip_special": "the quick brown fox jumps over the lazy dog."
+    },
+    "ascii_punct": {
+      "input_ids": [
+        101,
+        7592,
+        1010,
+        2088,
+        999,
+        2009,
+        1005,
+        1055,
+        2260,
+        1024,
+        4090,
+        1517,
+        1000,
+        14686,
+        1000,
+        1005,
+        9706,
+        14122,
+        18981,
+        5369,
+        1005,
+        1006,
+        11968,
+        2368,
+        1007,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "hello",
+        ",",
+        "world",
+        "!",
+        "it",
+        "'",
+        "s",
+        "12",
+        ":",
+        "34",
+        "—",
+        "\"",
+        "quote",
+        "\"",
+        "'",
+        "ap",
+        "##ost",
+        "##rop",
+        "##he",
+        "'",
+        "(",
+        "par",
+        "##en",
+        ")",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] hello, world! it's 12 : 34 — \" quote \"'apostrophe'( paren ). [SEP]",
+      "decoded_skip_special": "hello, world! it's 12 : 34 — \" quote \"'apostrophe'( paren )."
+    },
+    "ascii_numbers": {
+      "input_ids": [
+        101,
+        14255,
+        2003,
+        3155,
+        1017,
+        1012,
+        15471,
+        28154,
+        1025,
+        1041,
+        2003,
+        2055,
+        1016,
+        1012,
+        6390,
+        2620,
+        22407,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "pi",
+        "is",
+        "approximately",
+        "3",
+        ".",
+        "141",
+        "##59",
+        ";",
+        "e",
+        "is",
+        "about",
+        "2",
+        ".",
+        "71",
+        "##8",
+        "##28",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] pi is approximately 3. 14159 ; e is about 2. 71828. [SEP]",
+      "decoded_skip_special": "pi is approximately 3. 14159 ; e is about 2. 71828."
+ }, + "ascii_url": { + "input_ids": [ + 101, + 3942, + 16770, + 1024, + 1013, + 1013, + 17662, + 12172, + 1012, + 2522, + 1013, + 9986, + 2015, + 1998, + 5653, + 3960, + 1030, + 2742, + 1012, + 4012, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "visit", + "https", + ":", + "/", + "/", + "hugging", + "##face", + ".", + "co", + "/", + "doc", + "##s", + "and", + "mail", + "bob", + "@", + "example", + ".", + "com", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] visit https : / / huggingface. co / docs and mail bob @ example. com. [SEP]", + "decoded_skip_special": "visit https : / / huggingface. co / docs and mail bob @ example. com." + }, + "ascii_code": { + "input_ids": [ + 101, + 13366, + 5587, + 1006, + 1037, + 1024, + 20014, + 1010, + 1038, + 1024, + 20014, + 1007, + 1011, + 1028, + 20014, + 1024, + 2709, + 1037, + 1009, + 1038, + 102 + ], + "tokens": [ + "[CLS]", + "def", + "add", + "(", + "a", + ":", + "int", + ",", + "b", + ":", + "int", + ")", + "-", + ">", + "int", + ":", + "return", + "a", + "+", + "b", + "[SEP]" + ], + "decoded_with_special": "[CLS] def add ( a : int, b : int ) - > int : return a + b [SEP]", + "decoded_skip_special": "def add ( a : int, b : int ) - > int : return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 101, + 1646, + 30173, + 30174, + 30175, + 30176, + 30177, + 30178, + 30179, + 30180, + 30181, + 30182, + 30183, + 30184, + 30185, + 30186, + 102 + ], + "tokens": [ + "[CLS]", + "あ", + "##い", + "##う", + "##え", + "##お", + "##か", + "##き", + "##く", + "##け", + "##こ", + "##さ", + "##し", + "##す", + "##せ", + "##そ", + "[SEP]" + ], + "decoded_with_special": "[CLS] あいうえおかきくけこさしすせそ [SEP]", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 101, + 1651, + 30178, + 30179, + 30180, + 30181, + 30182, + 30183, + 30184, + 30185, + 30186, + 30187, + 30188, + 30190, + 30191, + 30192, + 102 + ], + "tokens": [ + "[CLS]", + "か", + "##き", + "##く", + "##け", + "##こ", + "##さ", + "##し", + "##す", + "##せ", + "##そ", + "##た", + "##ち", + "##つ", + "##て", + "##と", + "[SEP]" + ], + "decoded_with_special": "[CLS] かきくけこさしすせそたちつてと [SEP]", + "decoded_skip_special": "かきくけこさしすせそたちつてと" + }, + "ja_handakuten": { + "input_ids": [ + 101, + 1672, + 30199, + 30200, + 30201, + 30202, + 30244, + 30245, + 30246, + 30247, + 30248, + 102 + ], + "tokens": [ + "[CLS]", + "は", + "##ひ", + "##ふ", + "##へ", + "##ほ", + "##ハ", + "##ヒ", + "##フ", + "##ヘ", + "##ホ", + "[SEP]" + ], + "decoded_with_special": "[CLS] はひふへほハヒフヘホ [SEP]", + "decoded_skip_special": "はひふへほハヒフヘホ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 101, + 1864, + 1876, + 1950, + 1671, + 100, + 100, + 100, + 100, + 100, + 1672, + 100, + 1657, + 30173, + 100, + 100, + 1665, + 30184, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "日", + "本", + "語", + "の", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "は", + "[UNK]", + "し", + "##い", + "[UNK]", + "[UNK]", + "て", + "##す", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] 日 本 語 の [UNK] [UNK] [UNK] [UNK] [UNK] は [UNK] しい [UNK] [UNK] てす 。 [SEP]", + "decoded_skip_special": "日 本 語 の は しい てす 。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 101, + 9170, + 1665, + 8285, + 18715, + 18595, + 6290, + 1690, + 100, + 1658, + 1636, + 100, + 100, + 1672, + 7953, + 1035, + 8909, + 2015, + 1666, + 19204, + 2015, + 1671, + 1752, + 1664, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "swift", + "て", + "auto", + "##tok", + "##eni", + "##zer", + "を", + "[UNK]", + "す", + "。", + "[UNK]", + "[UNK]", + "は", + "input", + "_", + "id", + "##s", + "と", + "token", + "##s", + "の", + 
"二", + "つ", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] swift て autotokenizer を [UNK] す 。 [UNK] [UNK] は input _ ids と tokens の 二 つ 。 [SEP]", + "decoded_skip_special": "swift て autotokenizer を す 。 は input _ ids と tokens の 二 つ 。" + }, + "ja_long_sentence": { + "input_ids": [ + 101, + 1879, + 1755, + 100, + 100, + 100, + 100, + 100, + 1671, + 100, + 100, + 100, + 1967, + 1651, + 100, + 100, + 1690, + 1774, + 1657, + 30187, + 30211, + 30183, + 30173, + 30177, + 1635, + 1876, + 100, + 1661, + 30215, + 30174, + 30177, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "東", + "京", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "の", + "[UNK]", + "[UNK]", + "[UNK]", + "長", + "か", + "[UNK]", + "[UNK]", + "を", + "出", + "し", + "##た", + "##ら", + "##し", + "##い", + "##か", + "、", + "本", + "[UNK]", + "た", + "##ろ", + "##う", + "##か", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] 東 京 [UNK] [UNK] [UNK] [UNK] [UNK] の [UNK] [UNK] [UNK] 長 か [UNK] [UNK] を 出 したらしいか 、 本 [UNK] たろうか 。 [SEP]", + "decoded_skip_special": "東 京 の 長 か を 出 したらしいか 、 本 たろうか 。" + }, + "ko_hangul_simple": { + "input_ids": [ + 101, + 1463, + 30006, + 30021, + 29992, + 30010, + 30025, + 30005, + 30006, + 29997, + 30009, + 29999, + 30013, + 1012, + 1460, + 30006, + 30021, + 29991, + 30006, + 30024, + 29997, + 30017, + 30024, + 29992, + 30019, + 29993, + 30006, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ᄋ", + "##ᅡ", + "##ᆫ", + "##ᄂ", + "##ᅧ", + "##ᆼ", + "##ᄒ", + "##ᅡ", + "##ᄉ", + "##ᅦ", + "##ᄋ", + "##ᅭ", + ".", + "ᄇ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅡ", + "##ᆸ", + "##ᄉ", + "##ᅳ", + "##ᆸ", + "##ᄂ", + "##ᅵ", + "##ᄃ", + "##ᅡ", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] 안녕하세요. 반갑습니다. [SEP]", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 101, + 1469, + 30006, + 30021, + 29991, + 30017, + 30022, + 1464, + 30006, + 29995, + 30011, + 1460, + 30014, + 30021, + 29994, + 30019, + 1469, + 30010, + 30025, + 29997, + 30019, + 30020, + 29991, + 30012, + 1469, + 30006, + 30021, + 29991, + 30017, + 30022, + 1463, + 30017, + 30023, + 30000, + 30008, + 30022, + 1469, + 30010, + 30025, + 29997, + 30019, + 30020, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ᄒ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅳ", + "##ᆯ", + "ᄌ", + "##ᅡ", + "##ᄆ", + "##ᅩ", + "ᄇ", + "##ᅮ", + "##ᆫ", + "##ᄅ", + "##ᅵ", + "ᄒ", + "##ᅧ", + "##ᆼ", + "##ᄉ", + "##ᅵ", + "##ᆨ", + "##ᄀ", + "##ᅪ", + "ᄒ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅳ", + "##ᆯ", + "ᄋ", + "##ᅳ", + "##ᆷ", + "##ᄌ", + "##ᅥ", + "##ᆯ", + "ᄒ", + "##ᅧ", + "##ᆼ", + "##ᄉ", + "##ᅵ", + "##ᆨ", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] 한글 자모 분리 형식과 한글 음절 형식. [SEP]", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." 
+ }, + "zh_simplified": { + "input_ids": [ + 101, + 100, + 100, + 1817, + 100, + 100, + 1756, + 100, + 1869, + 100, + 1916, + 1740, + 100, + 100, + 100, + 1775, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[UNK]", + "学", + "[UNK]", + "[UNK]", + "人", + "[UNK]", + "智", + "[UNK]", + "的", + "一", + "[UNK]", + "[UNK]", + "[UNK]", + "分", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [UNK] 学 [UNK] [UNK] 人 [UNK] 智 [UNK] 的 一 [UNK] [UNK] [UNK] 分 [UNK] 。 [SEP]", + "decoded_skip_special": "学 人 智 的 一 分 。" + }, + "zh_traditional": { + "input_ids": [ + 101, + 100, + 100, + 100, + 100, + 100, + 1756, + 100, + 1869, + 100, + 1916, + 1740, + 100, + 100, + 100, + 1775, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "人", + "[UNK]", + "智", + "[UNK]", + "的", + "一", + "[UNK]", + "[UNK]", + "[UNK]", + "分", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [UNK] [UNK] [UNK] [UNK] 人 [UNK] 智 [UNK] 的 一 [UNK] [UNK] [UNK] 分 [UNK] 。 [SEP]", + "decoded_skip_special": "人 智 的 一 分 。" + }, + "zh_mixed_en": { + "input_ids": [ + 101, + 1052, + 22123, + 2953, + 2818, + 100, + 1740, + 100, + 2784, + 4083, + 100, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "p", + "##yt", + "##or", + "##ch", + "[UNK]", + "一", + "[UNK]", + "deep", + "learning", + "[UNK]", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] pytorch [UNK] 一 [UNK] deep learning [UNK] [UNK] 。 [SEP]", + "decoded_skip_special": "pytorch 一 deep learning 。" + }, + "ar_basic": { + "input_ids": [ + 101, + 1270, + 23673, + 23673, + 29831, + 19433, + 1270, + 23673, + 29830, + 17149, + 29816, + 14498, + 19433, + 1275, + 22192, + 14498, + 23673, + 19433, + 1275, + 15394, + 25573, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ا", + "##ل", + "##ل", + "##غ", + "##ة", + "ا", + "##ل", + "##ع", + "##ر", + "##ب", + "##ي", + "##ة", + "ج", + "##م", + "##ي", + "##ل", + "##ة", + "ج", + "##د", + "##ا", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] اللغة العربية جميلة جدا. [SEP]", + "decoded_skip_special": "اللغة العربية جميلة جدا." + }, + "ar_diacritics": { + "input_ids": [ + 101, + 1271, + 29824, + 22192, + 1270, + 23673, + 23673, + 14157, + 1270, + 23673, + 17149, + 29820, + 22192, + 15915, + 1270, + 23673, + 17149, + 29820, + 14498, + 22192, + 102 + ], + "tokens": [ + "[CLS]", + "ب", + "##س", + "##م", + "ا", + "##ل", + "##ل", + "##ه", + "ا", + "##ل", + "##ر", + "##ح", + "##م", + "##ن", + "ا", + "##ل", + "##ر", + "##ح", + "##ي", + "##م", + "[SEP]" + ], + "decoded_with_special": "[CLS] بسم الله الرحمن الرحيم [SEP]", + "decoded_skip_special": "بسم الله الرحمن الرحيم" + }, + "he_basic": { + "input_ids": [ + 101, + 1266, + 29799, + 29792, + 29800, + 1259, + 29792, + 29799, + 29800, + 1012, + 1247, + 29128, + 1249, + 29810, + 29804, + 29795, + 1259, + 29789, + 29811, + 29796, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ש", + "##ל", + "##ו", + "##ם", + "ע", + "##ו", + "##ל", + "##ם", + ".", + "ז", + "##ה", + "ט", + "##ק", + "##ס", + "##ט", + "ע", + "##ב", + "##ר", + "##י", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] שלום עולם. זה טקסט עברי. [SEP]", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." 
+ }, + "hi_devanagari": { + "input_ids": [ + 101, + 1339, + 29877, + 29863, + 29861, + 29878, + 1330, + 29876, + 29873, + 29876, + 1329, + 29875, + 29859, + 1338, + 29867, + 29861, + 29862, + 1339, + 1344, + 102 + ], + "tokens": [ + "[CLS]", + "ह", + "##ि", + "##न", + "##द", + "##ी", + "भ", + "##ा", + "##ष", + "##ा", + "ब", + "##ह", + "##त", + "स", + "##म", + "##द", + "##ध", + "ह", + "।", + "[SEP]" + ], + "decoded_with_special": "[CLS] हिनदी भाषा बहत समदध ह । [SEP]", + "decoded_skip_special": "हिनदी भाषा बहत समदध ह ।" + }, + "th_basic": { + "input_ids": [ + 101, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [SEP]", + "decoded_skip_special": "" + }, + "emoji_bmp": { + "input_ids": [ + 101, + 3103, + 100, + 4231, + 100, + 2732, + 1620, + 2540, + 1625, + 102 + ], + "tokens": [ + "[CLS]", + "sun", + "[UNK]", + "moon", + "[UNK]", + "star", + "★", + "heart", + "♥", + "[SEP]" + ], + "decoded_with_special": "[CLS] sun [UNK] moon [UNK] star ★ heart ♥ [SEP]", + "decoded_skip_special": "sun moon star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 101, + 100, + 2000, + 1996, + 4231, + 100, + 2007, + 1037, + 100, + 1998, + 1037, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "to", + "the", + "moon", + "[UNK]", + "with", + "a", + "[UNK]", + "and", + "a", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] to the moon [UNK] with a [UNK] and a [UNK] [SEP]", + "decoded_skip_special": "to the moon with a and a" + }, + "emoji_zwj_family": { + "input_ids": [ + 101, + 2155, + 1024, + 100, + 1998, + 5210, + 1024, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "family", + ":", + "[UNK]", + "and", + "flag", + ":", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] family : [UNK] and flag : [UNK] [SEP]", + "decoded_skip_special": "family : and flag :" + }, + "emoji_skin_tone": { + "input_ids": [ + 101, + 100, + 4400, + 2007, + 3096, + 12623, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "wave", + "with", + "skin", + "tones", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] wave with skin tones [SEP]", + "decoded_skip_special": "wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 101, + 7592, + 1745, + 100, + 1463, + 30006, + 30021, + 29992, + 30010, + 30025, + 1266, + 29799, + 29792, + 29800, + 1295, + 17149, + 29820, + 29816, + 25573, + 1327, + 29867, + 29874, + 29859, + 100, + 1655, + 30217, + 30194, + 30188, + 30198, + 102 + ], + "tokens": [ + "[CLS]", + "hello", + "世", + "[UNK]", + "ᄋ", + "##ᅡ", + "##ᆫ", + "##ᄂ", + "##ᅧ", + "##ᆼ", + "ש", + "##ל", + "##ו", + "##ם", + "م", + "##ر", + "##ح", + "##ب", + "##ا", + "न", + "##म", + "##स", + "##त", + "[UNK]", + "こ", + "##ん", + "##に", + "##ち", + "##は", + "[SEP]" + ], + "decoded_with_special": "[CLS] hello 世 [UNK] 안녕 שלום مرحبا नमसत [UNK] こんにちは [SEP]", + "decoded_skip_special": "hello 世 안녕 שלום مرحبا नमसत こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 101, + 1013, + 1013, + 1864, + 1876, + 1950, + 1704, + 30252, + 30263, + 30240, + 2292, + 14806, + 1027, + 1000, + 1655, + 30217, + 30194, + 30188, + 30198, + 1635, + 1745, + 100, + 999, + 1000, + 102 + ], + "tokens": [ + "[CLS]", + "/", + "/", + "日", + "本", + "語", + "コ", + "##メ", + "##ン", + "##ト", + "let", + "greeting", + "=", + "\"", + "こ", + "##ん", + "##に", + "##ち", + "##は", + "、", + "世", + "[UNK]", + "!", + "\"", + "[SEP]" + ], + "decoded_with_special": "[CLS] / / 日 本 語 コメント let greeting = \" こんにちは 、 世 [UNK]! \" [SEP]", + "decoded_skip_special": "/ / 日 本 語 コメント let greeting = \" こんにちは 、 世! 
\"" + }, + "ipa_basic": { + "input_ids": [ + 101, + 1996, + 24531, + 2005, + 1005, + 3869, + 1005, + 2003, + 1013, + 1042, + 29685, + 29696, + 1013, + 1998, + 2005, + 1005, + 2911, + 1005, + 2003, + 1013, + 1130, + 29685, + 2361, + 1013, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "the", + "ipa", + "for", + "'", + "fish", + "'", + "is", + "/", + "f", + "##ɪ", + "##ʃ", + "/", + "and", + "for", + "'", + "ship", + "'", + "is", + "/", + "ʃ", + "##ɪ", + "##p", + "/", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] the ipa for'fish'is / fɪʃ / and for'ship'is / ʃɪp /. [SEP]", + "decoded_skip_special": "the ipa for'fish'is / fɪʃ / and for'ship'is / ʃɪp /." + }, + "edge_combining": { + "input_ids": [ + 101, + 7668, + 1006, + 22309, + 1007, + 5443, + 7668, + 1006, + 1050, + 2546, + 2094, + 1007, + 1517, + 2168, + 2773, + 1010, + 2367, + 27507, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "cafe", + "(", + "nfc", + ")", + "vs", + "cafe", + "(", + "n", + "##f", + "##d", + ")", + "—", + "same", + "word", + ",", + "different", + "bytes", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] cafe ( nfc ) vs cafe ( nfd ) — same word, different bytes. [SEP]", + "decoded_skip_special": "cafe ( nfc ) vs cafe ( nfd ) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 101, + 13360, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 2050, + 22861, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 102 + ], + "tokens": [ + "[CLS]", + "aaa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##a", + "bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "[SEP]" + ], + "decoded_with_special": "[CLS] aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb [SEP]", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json new file mode 100644 index 00000000..dc2f41f5 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json @@ -0,0 +1,1230 @@ +{ + "model_id": "Qwen/Qwen2.5-0.5B", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 785, + 3974, + 13876, + 38835, + 34208, + 916, + 279, + 15678, + 5562, + 13 + ], + "tokens": [ + "The", + "Ġquick", + "Ġbrown", + "Ġfox", + "Ġjumps", + "Ġover", + "Ġthe", + "Ġlazy", + "Ġdog", + "." + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 9707, + 11, + 1879, + 0, + 1084, + 594, + 220, + 16, + 17, + 25, + 18, + 19, + 1959, + 330, + 2949, + 1, + 364, + 391, + 535, + 47883, + 6, + 320, + 41064, + 568 + ], + "tokens": [ + "Hello", + ",", + "Ġworld", + "!", + "ĠIt", + "'s", + "Ġ", + "1", + "2", + ":", + "3", + "4", + "ĠâĢĶ", + "Ġ\"", + "quote", + "\"", + "Ġ'", + "ap", + "ost", + "rophe", + "'", + "Ġ(", + "paren", + ")." + ], + "decoded_with_special": "Hello, world! 
It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 34767, + 374, + 13187, + 220, + 18, + 13, + 16, + 19, + 16, + 20, + 24, + 26, + 384, + 374, + 911, + 220, + 17, + 13, + 22, + 16, + 23, + 17, + 23, + 13 + ], + "tokens": [ + "Pi", + "Ġis", + "Ġapproximately", + "Ġ", + "3", + ".", + "1", + "4", + "1", + "5", + "9", + ";", + "Ġe", + "Ġis", + "Ġabout", + "Ġ", + "2", + ".", + "7", + "1", + "8", + "2", + "8", + "." + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 26218, + 3703, + 1110, + 71, + 35268, + 1564, + 6830, + 25967, + 323, + 8072, + 35192, + 35487, + 905, + 13 + ], + "tokens": [ + "Visit", + "Ġhttps", + "://", + "h", + "ugging", + "face", + ".co", + "/docs", + "Ġand", + "Ġmail", + "Ġbob", + "@example", + ".com", + "." + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 750, + 912, + 2877, + 25, + 526, + 11, + 293, + 25, + 526, + 8, + 1464, + 526, + 510, + 262, + 470, + 264, + 488, + 293 + ], + "tokens": [ + "def", + "Ġadd", + "(a", + ":", + "Ġint", + ",", + "Ġb", + ":", + "Ġint", + ")", + "Ġ->", + "Ġint", + ":Ċ", + "ĠĠĠ", + "Ġreturn", + "Ġa", + "Ġ+", + "Ġb" + ], + "decoded_with_special": "def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 29491, + 94504, + 57842, + 141940, + 49734, + 46784, + 75522, + 22168, + 29713, + 14682, + 17219, + 71242, + 26831 + ], + "tokens": [ + "ãģĤ", + "ãģĦãģĨ", + "ãģĪ", + "ãģĬãģĭ", + "ãģį", + "ãģı", + "ãģij", + "ãģĵ", + "ãģķ", + "ãģĹ", + "ãģĻ", + "ãģĽ", + "ãģĿ" + ], + "decoded_with_special": "あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 28195, + 124902, + 125161, + 124682, + 76021, + 99104, + 124145, + 125973, + 127264, + 35685, + 144635, + 125301, + 16161, + 66545 + ], + "tokens": [ + "ãģĮ", + "ãģİ", + "ãģIJ", + "ãģĴ", + "ãģĶãģĸ", + "ãģĺ", + "ãģļ", + "ãģľ", + "ãģŀ", + "ãģł", + "ãģ¢", + "ãģ¥", + "ãģ§", + "ãģ©" + ], + "decoded_with_special": "がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 144099, + 139813, + 143262, + 144184, + 142459, + 79705, + 69463, + 56226, + 98595, + 88054 + ], + "tokens": [ + "ãģ±", + "ãģ´", + "ãģ·", + "ãģº", + "ãģ½", + "ãĥij", + "ãĥĶ", + "ãĥĹ", + "ãĥļ", + "ãĥĿ" + ], + "decoded_with_special": "ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 101059, + 102819, + 15767, + 82699, + 101008, + 71138, + 106637, + 15322, + 133073, + 104832, + 37541, + 1773 + ], + "tokens": [ + "æĹ¥æľ¬", + "èªŀ", + "ãģ®", + "å½¢", + "æħĭ", + "ç´ł", + "è§£æŀIJ", + "ãģ¯", + "éĽ£ãģĹãģĦ", + "åķıé¡Į", + "ãģ§ãģĻ", + "ãĢĤ" + ], + "decoded_with_special": "日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 55336, + 220, + 16161, + 8979, + 37434, + 94271, + 102854, + 17219, + 1773, + 10236, + 113, + 238, + 27773, + 15322, + 1946, + 8077, + 220, + 19182, + 11211, + 96618, + 40820, + 58639, + 1773 + ], + "tokens": [ + "Swift", + "Ġ", + "ãģ§", + "ĠAuto", + "Tokenizer", + "ĠãĤĴ", + 
"試", + "ãģĻ", + "ãĢĤ", + "Ġç", + "µ", + "IJ", + "æŀľ", + "ãģ¯", + "Ġinput", + "_ids", + "Ġ", + "ãģ¨", + "Ġtokens", + "Ġãģ®", + "äºĮ", + "ãģ¤", + "ãĢĤ" + ], + "decoded_with_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 102356, + 46553, + 65278, + 100955, + 100955, + 30440, + 89977, + 15767, + 100955, + 30440, + 89977, + 100435, + 28195, + 100955, + 30440, + 29412, + 137246, + 127056, + 28195, + 5373, + 129790, + 129085, + 31049, + 1773 + ], + "tokens": [ + "æĿ±", + "京", + "çī¹", + "許", + "許", + "åı¯", + "å±Ģ", + "ãģ®", + "許", + "åı¯", + "å±Ģ", + "éķ·", + "ãģĮ", + "許", + "åı¯", + "ãĤĴ", + "åĩºãģĹãģŁ", + "ãĤīãģĹãģĦ", + "ãģĮ", + "ãĢģ", + "æľ¬å½ĵ", + "ãģłãĤįãģĨ", + "ãģĭ", + "ãĢĤ" + ], + "decoded_with_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 126246, + 144370, + 91145, + 13, + 63757, + 138685, + 38231, + 13 + ], + "tokens": [ + "ìķĪ", + "ëħķ", + "íķĺìĦ¸ìļĶ", + ".", + "Ġë°ĺ", + "ê°ij", + "ìĬµëĭĪëĭ¤", + "." + ], + "decoded_with_special": "안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 23573, + 83291, + 64577, + 129439, + 128618, + 28002, + 141965, + 76337, + 53680, + 61298, + 83291, + 16751, + 234, + 126550, + 141965, + 76337, + 13 + ], + "tokens": [ + "íķľ", + "ê¸Ģ", + "ĠìŀIJ", + "모", + "Ġë¶Ħ", + "리", + "Ġíĺķ", + "ìĭĿ", + "ê³¼", + "Ġíķľ", + "ê¸Ģ", + "ĠìĿ", + "Į", + "ìłĪ", + "Ġíĺķ", + "ìĭĿ", + "." + ], + "decoded_with_special": "한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." + }, + "zh_simplified": { + "input_ids": [ + 102182, + 100134, + 20412, + 104455, + 104111, + 99335, + 103799, + 1773 + ], + "tokens": [ + "æľºåύ", + "åŃ¦ä¹ł", + "æĺ¯", + "人工æĻºèĥ½", + "çļĦä¸Ģ个", + "éĩįè¦ģ", + "åĪĨæĶ¯", + "ãĢĤ" + ], + "decoded_with_special": "机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 100482, + 31548, + 106745, + 20412, + 102249, + 101934, + 99774, + 99542, + 99335, + 103799, + 1773 + ], + "tokens": [ + "æ©Ł", + "åύ", + "åѸç¿Ĵ", + "æĺ¯", + "人工", + "æĻºæħ§", + "çļĦä¸Ģ", + "åĢĭ", + "éĩįè¦ģ", + "åĪĨæĶ¯", + "ãĢĤ" + ], + "decoded_with_special": "機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 13828, + 51, + 21584, + 54851, + 46944, + 5538, + 6832, + 6567, + 94, + 228, + 99630, + 1773 + ], + "tokens": [ + "Py", + "T", + "orch", + "Ġæĺ¯", + "ä¸Ģ个", + "Ġdeep", + "Ġlearning", + "Ġæ", + "¡", + "Ĩ", + "æŀ¶", + "ãĢĤ" + ], + "decoded_with_special": "PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 31382, + 130353, + 25871, + 129071, + 138518, + 25871, + 127119, + 13 + ], + "tokens": [ + "اÙĦ", + "ÙĦغ", + "Ø©", + "ĠاÙĦعربÙĬØ©", + "ĠجÙħÙĬÙĦ", + "Ø©", + "Ġجدا", + "." + ], + "decoded_with_special": "اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." 
+ }, + "ar_diacritics": { + "input_ids": [ + 21360, + 52704, + 20064, + 59397, + 10176, + 52704, + 124478, + 27910, + 73771, + 16157, + 52704, + 124269, + 27910, + 73771, + 29825, + 59397, + 10176, + 27910, + 149, + 108, + 11798, + 52704, + 124269, + 27910, + 73771, + 29825, + 52704, + 124176, + 52704 + ], + "tokens": [ + "ب", + "ÙIJ", + "س", + "ÙĴ", + "Ùħ", + "ÙIJ", + "ĠاÙĦÙĦ", + "Ùİ", + "Ùij", + "Ùĩ", + "ÙIJ", + "ĠاÙĦر", + "Ùİ", + "Ùij", + "ØŃ", + "ÙĴ", + "Ùħ", + "Ùİ", + "Ù", + "°", + "ÙĨ", + "ÙIJ", + "ĠاÙĦر", + "Ùİ", + "Ùij", + "ØŃ", + "ÙIJ", + "ÙĬÙħ", + "ÙIJ" + ], + "decoded_with_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 126654, + 123855, + 124907, + 13, + 126197, + 124395, + 123792, + 125127, + 123855, + 129390, + 13 + ], + "tokens": [ + "ש׾×ķ×Ŀ", + "Ġ×¢", + "×ķ׾×Ŀ", + ".", + "Ġ×ĸ×Ķ", + "Ġ×ĺ", + "×§", + "ס×ĺ", + "Ġ×¢", + "×ijר×Ļ", + "." + ], + "decoded_with_special": "שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." + }, + "hi_devanagari": { + "input_ids": [ + 93948, + 42311, + 101, + 30484, + 99, + 43647, + 14925, + 255, + 31411, + 115, + 23868, + 14925, + 105, + 93948, + 72653, + 79238, + 68158, + 87244, + 12619, + 225, + 145256, + 30484, + 100, + 84310, + 12619, + 230, + 146031 + ], + "tokens": [ + "ह", + "िà¤", + "¨", + "à¥įà¤", + "¦", + "à¥Ģ", + "Ġà¤", + "Ń", + "ाà¤", + "·", + "ा", + "Ġà¤", + "¬", + "ह", + "à¥ģ", + "त", + "Ġस", + "म", + "à¥", + "ĥ", + "द", + "à¥įà¤", + "§", + "Ġह", + "à¥", + "Ī", + "।" + ], + "decoded_with_special": "हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 93874, + 123899, + 140235, + 124396, + 127382, + 125451, + 123885, + 83546, + 123885, + 125820, + 129674, + 86032, + 124342, + 28319, + 48120, + 124961, + 37213, + 123958, + 129778, + 123958, + 125506 + ], + "tokens": [ + "à¸ģาร", + "à¸Ľà¸£à¸°", + "มวล", + "à¸ľà¸¥", + "à¸łà¸²à¸©à¸²", + "à¹Ħà¸Ĺย", + "à¸ĭ", + "ัà¸ļ", + "à¸ĭ", + "à¹īà¸Ńà¸Ļ", + "à¹Ģà¸ŀราะ", + "à¹Ħม", + "à¹Īม", + "ี", + "à¸Ĭ", + "à¹Īà¸Ńà¸ĩ", + "ว", + "à¹Īาà¸ĩ", + "ระหว", + "à¹Īาà¸ĩ", + "à¸Ħำ" + ], + "decoded_with_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 30092, + 25125, + 222, + 17788, + 25125, + 122, + 6774, + 37234, + 4746, + 67579 + ], + "tokens": [ + "Sun", + "Ġâĺ", + "Ģ", + "Ġmoon", + "Ġâĺ", + "¾", + "Ġstar", + "Ġâĺħ", + "Ġheart", + "ĠâĻ¥" + ], + "decoded_with_special": "Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 145836, + 311, + 279, + 17788, + 11162, + 234, + 247, + 448, + 264, + 11162, + 238, + 109, + 323, + 264, + 11162, + 236, + 231 + ], + "tokens": [ + "ðŁļĢ", + "Ġto", + "Ġthe", + "Ġmoon", + "ĠðŁ", + "Į", + "Ļ", + "Ġwith", + "Ġa", + "ĠðŁ", + "IJ", + "±", + "Ġand", + "Ġa", + "ĠðŁ", + "İ", + "ī" + ], + "decoded_with_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 15192, + 25, + 61804, + 101, + 378, + 235, + 145233, + 378, + 235, + 145665, + 378, + 235, + 145988, + 323, + 5181, + 25, + 11162, + 229, + 107, + 145516, + 146035, + 145070, + 145793, + 145754 + ], + "tokens": [ + "Family", + ":", + "ĠðŁij", + "¨", + "âĢ", + "į", + "ðŁij©", + "âĢ", + "į", + "ðŁij§", + "âĢ", + "į", + 
"ðŁij¦", + "Ġand", + "Ġflag", + ":", + "ĠðŁ", + "ĩ", + "¯", + "ðŁĩµ", + "ðŁĩ°", + "ðŁĩ·", + "ðŁĩ¨", + "ðŁĩ³" + ], + "decoded_with_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 145707, + 144321, + 145707, + 145375, + 145707, + 146530, + 12060, + 448, + 6787, + 41976 + ], + "tokens": [ + "ðŁijĭ", + "ðŁı»", + "ðŁijĭ", + "ðŁı½", + "ðŁijĭ", + "ðŁı¿", + "Ġwave", + "Ġwith", + "Ġskin", + "Ġtones" + ], + "decoded_with_special": "👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 9707, + 220, + 99489, + 95170, + 144370, + 124756, + 123881, + 23364, + 126860, + 124671, + 14925, + 101, + 87244, + 78368, + 30484, + 97, + 34370, + 129328, + 37213, + 23271, + 125136, + 28319, + 220, + 89015 + ], + "tokens": [ + "Hello", + "Ġ", + "ä¸ĸçķĮ", + "ĠìķĪ", + "ëħķ", + "Ġש׾", + "×ķ×Ŀ", + "ĠÙħ", + "رØŃ", + "با", + "Ġà¤", + "¨", + "म", + "स", + "à¥įà¤", + "¤", + "à¥ĩ", + "Ġส", + "ว", + "ั", + "สà¸Ķ", + "ี", + "Ġ", + "ãģĵãĤĵãģ«ãģ¡ãģ¯" + ], + "decoded_with_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 322, + 75402, + 21894, + 102819, + 89078, + 198, + 1149, + 42113, + 284, + 330, + 89015, + 5373, + 99489, + 8958 + ], + "tokens": [ + "//", + "ĠæĹ¥", + "æľ¬", + "èªŀ", + "ãĤ³ãĥ¡ãĥ³ãĥĪ", + "Ċ", + "let", + "Ġgreeting", + "Ġ=", + "Ġ\"", + "ãģĵãĤĵãģ«ãģ¡ãģ¯", + "ãĢģ", + "ä¸ĸçķĮ", + "!\"" + ], + "decoded_with_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 785, + 55747, + 369, + 364, + 18170, + 6, + 374, + 608, + 69, + 145076, + 145388, + 14, + 323, + 369, + 364, + 5270, + 6, + 374, + 608, + 145388, + 145076, + 79, + 11930 + ], + "tokens": [ + "The", + "ĠIPA", + "Ġfor", + "Ġ'", + "fish", + "'", + "Ġis", + "Ġ/", + "f", + "ɪ", + "Êĥ", + "/", + "Ġand", + "Ġfor", + "Ġ'", + "ship", + "'", + "Ġis", + "Ġ/", + "Êĥ", + "ɪ", + "p", + "/." + ], + "decoded_with_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." + }, + "edge_combining": { + "input_ids": [ + 924, + 58858, + 320, + 45, + 6754, + 8, + 6165, + 51950, + 320, + 45, + 14596, + 8, + 1959, + 1852, + 3409, + 11, + 2155, + 5820, + 13 + ], + "tokens": [ + "ca", + "fé", + "Ġ(", + "N", + "FC", + ")", + "Ġvs", + "Ġcafé", + "Ġ(", + "N", + "FD", + ")", + "ĠâĢĶ", + "Ġsame", + "Ġword", + ",", + "Ġdifferent", + "Ġbytes", + "." + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." 
+ }, + "edge_long_repetition": { + "input_ids": [ + 69440, + 69440, + 69440, + 69440, + 293, + 87609, + 87609, + 87609, + 87609, + 87609, + 87609, + 87609, + 53151 + ], + "tokens": [ + "aaaaaaaa", + "aaaaaaaa", + "aaaaaaaa", + "aaaaaaaa", + "Ġb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbb" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json new file mode 100644 index 00000000..5212ab36 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json @@ -0,0 +1,1946 @@ +{ + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 1, + 450, + 4996, + 17354, + 1701, + 29916, + 432, + 17204, + 975, + 278, + 17366, + 11203, + 29889 + ], + "tokens": [ + "", + "▁The", + "▁quick", + "▁brown", + "▁fo", + "x", + "▁j", + "umps", + "▁over", + "▁the", + "▁lazy", + "▁dog", + "." + ], + "decoded_with_special": " The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 1, + 15043, + 29892, + 3186, + 29991, + 739, + 29915, + 29879, + 29871, + 29896, + 29906, + 29901, + 29941, + 29946, + 813, + 376, + 1396, + 29908, + 525, + 481, + 520, + 1336, + 354, + 29915, + 313, + 862, + 264, + 467 + ], + "tokens": [ + "", + "▁Hello", + ",", + "▁world", + "!", + "▁It", + "'", + "s", + "▁", + "1", + "2", + ":", + "3", + "4", + "▁—", + "▁\"", + "quote", + "\"", + "▁'", + "ap", + "ost", + "rop", + "he", + "'", + "▁(", + "par", + "en", + ")." + ], + "decoded_with_special": " Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 1, + 7362, + 338, + 14235, + 29871, + 29941, + 29889, + 29896, + 29946, + 29896, + 29945, + 29929, + 29936, + 321, + 338, + 1048, + 29871, + 29906, + 29889, + 29955, + 29896, + 29947, + 29906, + 29947, + 29889 + ], + "tokens": [ + "", + "▁Pi", + "▁is", + "▁approximately", + "▁", + "3", + ".", + "1", + "4", + "1", + "5", + "9", + ";", + "▁e", + "▁is", + "▁about", + "▁", + "2", + ".", + "7", + "1", + "8", + "2", + "8", + "." + ], + "decoded_with_special": " Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 1, + 5741, + 277, + 2045, + 597, + 29882, + 688, + 3460, + 2161, + 29889, + 1111, + 29914, + 2640, + 322, + 10524, + 289, + 711, + 29992, + 4773, + 29889, + 510, + 29889 + ], + "tokens": [ + "", + "▁Vis", + "it", + "▁https", + "://", + "h", + "ug", + "ging", + "face", + ".", + "co", + "/", + "docs", + "▁and", + "▁mail", + "▁b", + "ob", + "@", + "example", + ".", + "com", + "." + ], + "decoded_with_special": " Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." 
+ }, + "ascii_code": { + "input_ids": [ + 1, + 822, + 788, + 29898, + 29874, + 29901, + 938, + 29892, + 289, + 29901, + 938, + 29897, + 1599, + 938, + 29901, + 13, + 1678, + 736, + 263, + 718, + 289 + ], + "tokens": [ + "", + "▁def", + "▁add", + "(", + "a", + ":", + "▁int", + ",", + "▁b", + ":", + "▁int", + ")", + "▁->", + "▁int", + ":", + "<0x0A>", + "▁▁▁", + "▁return", + "▁a", + "▁+", + "▁b" + ], + "decoded_with_special": " def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 1, + 29871, + 30641, + 30298, + 30465, + 30914, + 30697, + 30412, + 30538, + 30568, + 30807, + 30589, + 30566, + 30326, + 30427, + 31095, + 31110 + ], + "tokens": [ + "", + "▁", + "あ", + "い", + "う", + "え", + "お", + "か", + "き", + "く", + "け", + "こ", + "さ", + "し", + "す", + "せ", + "そ" + ], + "decoded_with_special": " あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 1, + 29871, + 30458, + 230, + 132, + 145, + 31907, + 31991, + 31622, + 230, + 132, + 153, + 31115, + 31761, + 230, + 132, + 159, + 230, + 132, + 161, + 30955, + 230, + 132, + 165, + 230, + 132, + 168, + 30499, + 31250 + ], + "tokens": [ + "", + "▁", + "が", + "<0xE3>", + "<0x81>", + "<0x8E>", + "ぐ", + "げ", + "ご", + "<0xE3>", + "<0x81>", + "<0x96>", + "じ", + "ず", + "<0xE3>", + "<0x81>", + "<0x9C>", + "<0xE3>", + "<0x81>", + "<0x9E>", + "だ", + "<0xE3>", + "<0x81>", + "<0xA2>", + "<0xE3>", + "<0x81>", + "<0xA5>", + "で", + "ど" + ], + "decoded_with_special": " がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 1, + 29871, + 230, + 132, + 180, + 230, + 132, + 183, + 230, + 132, + 186, + 230, + 132, + 189, + 230, + 132, + 192, + 30715, + 31172, + 30605, + 31501, + 31205 + ], + "tokens": [ + "", + "▁", + "<0xE3>", + "<0x81>", + "<0xB1>", + "<0xE3>", + "<0x81>", + "<0xB4>", + "<0xE3>", + "<0x81>", + "<0xB7>", + "<0xE3>", + "<0x81>", + "<0xBA>", + "<0xE3>", + "<0x81>", + "<0xBD>", + "パ", + "ピ", + "プ", + "ペ", + "ポ" + ], + "decoded_with_special": " ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 1, + 29871, + 30325, + 30346, + 30968, + 30199, + 31305, + 233, + 136, + 142, + 31605, + 31201, + 233, + 161, + 147, + 30449, + 236, + 158, + 166, + 30326, + 30298, + 232, + 152, + 146, + 236, + 164, + 143, + 30499, + 30427, + 30267 + ], + "tokens": [ + "", + "▁", + "日", + "本", + "語", + "の", + "形", + "<0xE6>", + "<0x85>", + "<0x8B>", + "素", + "解", + "<0xE6>", + "<0x9E>", + "<0x90>", + "は", + "<0xE9>", + "<0x9B>", + "<0xA3>", + "し", + "い", + "<0xE5>", + "<0x95>", + "<0x8F>", + "<0xE9>", + "<0xA1>", + "<0x8C>", + "で", + "す", + "。" + ], + "decoded_with_special": " 日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 1, + 14156, + 29871, + 30499, + 11133, + 6066, + 3950, + 29871, + 30396, + 235, + 172, + 169, + 30427, + 30267, + 29871, + 31711, + 30801, + 30449, + 1881, + 29918, + 4841, + 29871, + 30364, + 18897, + 29871, + 30199, + 30685, + 30773, + 30267 + ], + "tokens": [ + "", + "▁Swift", + "▁", + "で", + "▁Auto", + "Token", + "izer", + "▁", + "を", + "<0xE8>", + "<0xA9>", + "<0xA6>", + "す", + "。", + "▁", + "結", + "果", + "は", + "▁input", + "_", + "ids", + "▁", + "と", + "▁tokens", + "▁", + "の", + "二", + "つ", + "。" + ], + "decoded_with_special": " Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer 
を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 1, + 29871, + 30591, + 30675, + 31141, + 235, + 171, + 180, + 235, + 171, + 180, + 30682, + 31655, + 30199, + 235, + 171, + 180, + 30682, + 31655, + 30899, + 30458, + 235, + 171, + 180, + 30682, + 30396, + 30544, + 30326, + 30366, + 30513, + 30326, + 30298, + 30458, + 30330, + 30346, + 30948, + 30955, + 31206, + 30465, + 30412, + 30267 + ], + "tokens": [ + "", + "▁", + "東", + "京", + "特", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "局", + "の", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "局", + "長", + "が", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "を", + "出", + "し", + "た", + "ら", + "し", + "い", + "が", + "、", + "本", + "当", + "だ", + "ろ", + "う", + "か", + "。" + ], + "decoded_with_special": " 東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 1, + 29871, + 31734, + 238, + 136, + 152, + 30944, + 31578, + 31527, + 29889, + 29871, + 238, + 179, + 155, + 237, + 179, + 148, + 239, + 141, + 184, + 31063, + 30709, + 29889 + ], + "tokens": [ + "", + "▁", + "안", + "<0xEB>", + "<0x85>", + "<0x95>", + "하", + "세", + "요", + ".", + "▁", + "<0xEB>", + "<0xB0>", + "<0x98>", + "<0xEA>", + "<0xB0>", + "<0x91>", + "<0xEC>", + "<0x8A>", + "<0xB5>", + "니", + "다", + "." + ], + "decoded_with_special": " 안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 1, + 29871, + 228, + 135, + 149, + 228, + 136, + 164, + 228, + 137, + 174, + 237, + 187, + 131, + 29871, + 31013, + 31962, + 29871, + 238, + 185, + 135, + 30826, + 29871, + 240, + 155, + 152, + 31895, + 31906, + 29871, + 30877, + 237, + 187, + 131, + 29871, + 31966, + 239, + 163, + 139, + 29871, + 240, + 155, + 152, + 31895, + 29889 + ], + "tokens": [ + "", + "▁", + "<0xE1>", + "<0x84>", + "<0x92>", + "<0xE1>", + "<0x85>", + "<0xA1>", + "<0xE1>", + "<0x86>", + "<0xAB>", + "<0xEA>", + "<0xB8>", + "<0x80>", + "▁", + "자", + "모", + "▁", + "<0xEB>", + "<0xB6>", + "<0x84>", + "리", + "▁", + "<0xED>", + "<0x98>", + "<0x95>", + "식", + "과", + "▁", + "한", + "<0xEA>", + "<0xB8>", + "<0x80>", + "▁", + "음", + "<0xEC>", + "<0xA0>", + "<0x88>", + "▁", + "<0xED>", + "<0x98>", + "<0x95>", + "식", + "." + ], + "decoded_with_special": " 한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." 
+ }, + "zh_simplified": { + "input_ids": [ + 1, + 29871, + 31429, + 30943, + 30415, + 231, + 188, + 163, + 30392, + 30313, + 31041, + 31676, + 30815, + 30210, + 30287, + 30502, + 30908, + 30698, + 30748, + 31541, + 30267 + ], + "tokens": [ + "", + "▁", + "机", + "器", + "学", + "<0xE4>", + "<0xB9>", + "<0xA0>", + "是", + "人", + "工", + "智", + "能", + "的", + "一", + "个", + "重", + "要", + "分", + "支", + "。" + ], + "decoded_with_special": " 机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 1, + 29871, + 31540, + 30943, + 31274, + 234, + 194, + 149, + 30392, + 30313, + 31041, + 31676, + 233, + 136, + 170, + 30210, + 30287, + 232, + 131, + 142, + 30908, + 30698, + 30748, + 31541, + 30267 + ], + "tokens": [ + "", + "▁", + "機", + "器", + "學", + "<0xE7>", + "<0xBF>", + "<0x92>", + "是", + "人", + "工", + "智", + "<0xE6>", + "<0x85>", + "<0xA7>", + "的", + "一", + "<0xE5>", + "<0x80>", + "<0x8B>", + "重", + "要", + "分", + "支", + "。" + ], + "decoded_with_special": " 機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 1, + 10772, + 29911, + 25350, + 29871, + 30392, + 30287, + 30502, + 6483, + 6509, + 29871, + 233, + 164, + 137, + 233, + 161, + 185, + 30267 + ], + "tokens": [ + "", + "▁Py", + "T", + "orch", + "▁", + "是", + "一", + "个", + "▁deep", + "▁learning", + "▁", + "<0xE6>", + "<0xA1>", + "<0x86>", + "<0xE6>", + "<0x9E>", + "<0xB6>", + "。" + ], + "decoded_with_special": " PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 1, + 24508, + 30138, + 30611, + 30242, + 24508, + 30218, + 30156, + 30177, + 30163, + 30242, + 29871, + 30270, + 30159, + 30163, + 30138, + 30242, + 29871, + 30270, + 30172, + 30112, + 29889 + ], + "tokens": [ + "", + "▁ال", + "ل", + "غ", + "ة", + "▁ال", + "ع", + "ر", + "ب", + "ي", + "ة", + "▁", + "ج", + "م", + "ي", + "ل", + "ة", + "▁", + "ج", + "د", + "ا", + "." + ], + "decoded_with_special": " اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." + }, + "ar_diacritics": { + "input_ids": [ + 1, + 29871, + 30177, + 30567, + 30198, + 30741, + 30159, + 30567, + 24508, + 30138, + 30323, + 30857, + 30204, + 30567, + 24508, + 30156, + 30323, + 30857, + 30240, + 30741, + 30159, + 30323, + 220, + 179, + 30162, + 30567, + 24508, + 30156, + 30323, + 30857, + 30240, + 30567, + 30163, + 30159, + 30567 + ], + "tokens": [ + "", + "▁", + "ب", + "ِ", + "س", + "ْ", + "م", + "ِ", + "▁ال", + "ل", + "َ", + "ّ", + "ه", + "ِ", + "▁ال", + "ر", + "َ", + "ّ", + "ح", + "ْ", + "م", + "َ", + "<0xD9>", + "<0xB0>", + "ن", + "ِ", + "▁ال", + "ر", + "َ", + "ّ", + "ح", + "ِ", + "ي", + "م", + "ِ" + ], + "decoded_with_special": " بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 1, + 29871, + 30294, + 30249, + 30205, + 30404, + 29871, + 30324, + 30205, + 30249, + 30404, + 29889, + 29871, + 30776, + 30235, + 29871, + 30639, + 30433, + 30504, + 30639, + 29871, + 30324, + 30276, + 30236, + 30196, + 29889 + ], + "tokens": [ + "", + "▁", + "ש", + "ל", + "ו", + "ם", + "▁", + "ע", + "ו", + "ל", + "ם", + ".", + "▁", + "ז", + "ה", + "▁", + "ט", + "ק", + "ס", + "ט", + "▁", + "ע", + "ב", + "ר", + "י", + "." + ], + "decoded_with_special": " שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." 
+ }, + "hi_devanagari": { + "input_ids": [ + 1, + 29871, + 30714, + 30436, + 30424, + 30296, + 30694, + 30580, + 29871, + 31380, + 30269, + 31330, + 30269, + 29871, + 31012, + 30714, + 30702, + 30475, + 29871, + 30489, + 30485, + 227, + 168, + 134, + 30694, + 30296, + 31437, + 29871, + 30714, + 31678, + 31776 + ], + "tokens": [ + "", + "▁", + "ह", + "ि", + "न", + "्", + "द", + "ी", + "▁", + "भ", + "ा", + "ष", + "ा", + "▁", + "ब", + "ह", + "ु", + "त", + "▁", + "स", + "म", + "<0xE0>", + "<0xA5>", + "<0x83>", + "द", + "्", + "ध", + "▁", + "ह", + "ै", + "।" + ], + "decoded_with_special": " हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 1, + 29871, + 30425, + 30289, + 30297, + 31010, + 30297, + 30823, + 30501, + 30492, + 30496, + 227, + 187, + 159, + 30496, + 31070, + 30289, + 31964, + 30289, + 31252, + 30595, + 30549, + 227, + 187, + 142, + 30510, + 30526, + 227, + 187, + 142, + 30652, + 30351, + 30348, + 30401, + 30727, + 30297, + 30289, + 30823, + 31252, + 30501, + 30543, + 30501, + 30691, + 30913, + 30543, + 30351, + 30398, + 30492, + 30543, + 30289, + 30398, + 30297, + 30823, + 30663, + 30492, + 30543, + 30289, + 30398, + 30759, + 30747 + ], + "tokens": [ + "", + "▁", + "ก", + "า", + "ร", + "ป", + "ร", + "ะ", + "ม", + "ว", + "ล", + "<0xE0>", + "<0xB8>", + "<0x9C>", + "ล", + "ภ", + "า", + "ษ", + "า", + "ไ", + "ท", + "ย", + "<0xE0>", + "<0xB8>", + "<0x8B>", + "ั", + "บ", + "<0xE0>", + "<0xB8>", + "<0x8B>", + "้", + "อ", + "น", + "เ", + "พ", + "ร", + "า", + "ะ", + "ไ", + "ม", + "่", + "ม", + "ี", + "ช", + "่", + "อ", + "ง", + "ว", + "่", + "า", + "ง", + "ร", + "ะ", + "ห", + "ว", + "่", + "า", + "ง", + "ค", + "ำ" + ], + "decoded_with_special": " การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 1, + 8991, + 29871, + 229, + 155, + 131, + 18786, + 29871, + 229, + 155, + 193, + 5810, + 29871, + 30950, + 5192, + 29871, + 30922 + ], + "tokens": [ + "", + "▁Sun", + "▁", + "<0xE2>", + "<0x98>", + "<0x80>", + "▁moon", + "▁", + "<0xE2>", + "<0x98>", + "<0xBE>", + "▁star", + "▁", + "★", + "▁heart", + "▁", + "♥" + ], + "decoded_with_special": " Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 1, + 29871, + 243, + 162, + 157, + 131, + 304, + 278, + 18786, + 29871, + 243, + 162, + 143, + 156, + 411, + 263, + 29871, + 243, + 162, + 147, + 180, + 322, + 263, + 29871, + 243, + 162, + 145, + 140 + ], + "tokens": [ + "", + "▁", + "<0xF0>", + "<0x9F>", + "<0x9A>", + "<0x80>", + "▁to", + "▁the", + "▁moon", + "▁", + "<0xF0>", + "<0x9F>", + "<0x8C>", + "<0x99>", + "▁with", + "▁a", + "▁", + "<0xF0>", + "<0x9F>", + "<0x90>", + "<0xB1>", + "▁and", + "▁a", + "▁", + "<0xF0>", + "<0x9F>", + "<0x8E>", + "<0x89>" + ], + "decoded_with_special": " 🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 1, + 14662, + 29901, + 29871, + 243, + 162, + 148, + 171, + 30722, + 243, + 162, + 148, + 172, + 30722, + 243, + 162, + 148, + 170, + 30722, + 243, + 162, + 148, + 169, + 322, + 7353, + 29901, + 29871, + 243, + 162, + 138, + 178, + 243, + 162, + 138, + 184, + 243, + 162, + 138, + 179, + 243, + 162, + 138, + 186, + 243, + 162, + 138, + 171, + 243, + 162, + 138, + 182 + ], + "tokens": [ + "", + "▁Family", + ":", + "▁", + "<0xF0>", + "<0x9F>", + "<0x91>", + 
"<0xA8>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA9>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA7>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA6>", + "▁and", + "▁flag", + ":", + "▁", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xAF>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB5>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB0>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB7>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xA8>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB3>" + ], + "decoded_with_special": " Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 1, + 29871, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 190, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 192, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 194, + 10742, + 411, + 19309, + 260, + 2873 + ], + "tokens": [ + "", + "▁", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBB>", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBD>", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBF>", + "▁wave", + "▁with", + "▁skin", + "▁t", + "ones" + ], + "decoded_with_special": " 👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 1, + 15043, + 29871, + 30793, + 30967, + 29871, + 31734, + 238, + 136, + 152, + 29871, + 30294, + 30249, + 30205, + 30404, + 29871, + 30159, + 30156, + 30240, + 30177, + 30112, + 29871, + 30424, + 30485, + 30489, + 30296, + 30475, + 30569, + 29871, + 30547, + 30492, + 30510, + 30547, + 30718, + 30691, + 29871, + 30589, + 30389, + 30353, + 30644, + 30449 + ], + "tokens": [ + "", + "▁Hello", + "▁", + "世", + "界", + "▁", + "안", + "<0xEB>", + "<0x85>", + "<0x95>", + "▁", + "ש", + "ל", + "ו", + "ם", + "▁", + "م", + "ر", + "ح", + "ب", + "ا", + "▁", + "न", + "म", + "स", + "्", + "त", + "े", + "▁", + "ส", + "ว", + "ั", + "ส", + "ด", + "ี", + "▁", + "こ", + "ん", + "に", + "ち", + "は" + ], + "decoded_with_special": " Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 1, + 849, + 29871, + 30325, + 30346, + 30968, + 30459, + 30604, + 30203, + 30279, + 13, + 1026, + 1395, + 15133, + 353, + 376, + 30589, + 30389, + 30353, + 30644, + 30449, + 30330, + 30793, + 30967, + 3850 + ], + "tokens": [ + "", + "▁//", + "▁", + "日", + "本", + "語", + "コ", + "メ", + "ン", + "ト", + "<0x0A>", + "let", + "▁gre", + "eting", + "▁=", + "▁\"", + "こ", + "ん", + "に", + "ち", + "は", + "、", + "世", + "界", + "!\"" + ], + "decoded_with_special": " // 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 1, + 450, + 5641, + 29909, + 363, + 525, + 15161, + 29915, + 338, + 847, + 29888, + 30312, + 30376, + 29914, + 322, + 363, + 525, + 3527, + 29915, + 338, + 847, + 30376, + 30312, + 29886, + 6294 + ], + "tokens": [ + "", + "▁The", + "▁IP", + "A", + "▁for", + "▁'", + "fish", + "'", + "▁is", + "▁/", + "f", + "ɪ", + "ʃ", + "/", + "▁and", + "▁for", + "▁'", + "ship", + "'", + "▁is", + "▁/", + "ʃ", + "ɪ", + "p", + "/." + ], + "decoded_with_special": " The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." 
+ }, + "edge_combining": { + "input_ids": [ + 1, + 274, + 28059, + 313, + 29940, + 8610, + 29897, + 7186, + 5777, + 1725, + 30103, + 313, + 22498, + 29928, + 29897, + 813, + 1021, + 1734, + 29892, + 1422, + 6262, + 29889 + ], + "tokens": [ + "", + "▁c", + "afé", + "▁(", + "N", + "FC", + ")", + "▁vs", + "▁ca", + "fe", + "́", + "▁(", + "NF", + "D", + ")", + "▁—", + "▁same", + "▁word", + ",", + "▁different", + "▁bytes", + "." + ], + "decoded_with_special": " café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 1, + 263, + 27137, + 27137, + 27137, + 27137, + 27137, + 27137, + 27137, + 7340, + 29874, + 289, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 29890 + ], + "tokens": [ + "", + "▁a", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aa", + "a", + "▁b", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "b" + ], + "decoded_with_special": " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json new file mode 100644 index 00000000..e063cb8d --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json @@ -0,0 +1,1108 @@ +{ + "model_id": "google-t5/t5-small", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 37, + 1704, + 4216, + 3, + 20400, + 4418, + 7, + 147, + 8, + 19743, + 1782, + 5, + 1 + ], + "tokens": [ + "▁The", + "▁quick", + "▁brown", + "▁", + "fox", + "▁jump", + "s", + "▁over", + "▁the", + "▁lazy", + "▁dog", + ".", + "" + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 8774, + 6, + 296, + 55, + 94, + 31, + 7, + 586, + 10, + 3710, + 3, + 318, + 96, + 8270, + 15, + 121, + 3, + 31, + 2521, + 14618, + 15, + 31, + 41, + 1893, + 35, + 137, + 1 + ], + "tokens": [ + "▁Hello", + ",", + "▁world", + "!", + "▁It", + "'", + "s", + "▁12", + ":", + "34", + "▁", + "—", + "▁\"", + "quot", + "e", + "\"", + "▁", + "'", + "apos", + "troph", + "e", + "'", + "▁(", + "par", + "en", + ").", + "" + ], + "decoded_with_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 2745, + 19, + 3241, + 1877, + 2534, + 27904, + 117, + 3, + 15, + 19, + 81, + 3, + 21280, + 2606, + 2577, + 5, + 1 + ], + "tokens": [ + "▁Pi", + "▁is", + "▁approximately", + "▁3.", + "14", + "159", + ";", + "▁", + "e", + "▁is", + "▁about", + "▁", + "2.7", + "18", + "28", + ".", + "" + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." 
+ }, + "ascii_url": { + "input_ids": [ + 4957, + 4893, + 1303, + 107, + 13917, + 53, + 4861, + 5, + 509, + 87, + 7171, + 7, + 11, + 4842, + 3, + 17396, + 1741, + 994, + 9, + 9208, + 5, + 287, + 5, + 1 + ], + "tokens": [ + "▁Visit", + "▁https", + "://", + "h", + "ugg", + "ing", + "face", + ".", + "co", + "/", + "doc", + "s", + "▁and", + "▁mail", + "▁", + "bob", + "@", + "ex", + "a", + "mple", + ".", + "com", + ".", + "" + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 20, + 89, + 617, + 599, + 9, + 10, + 16, + 17, + 6, + 3, + 115, + 10, + 16, + 17, + 61, + 3, + 13114, + 16, + 17, + 10, + 1205, + 3, + 9, + 1768, + 3, + 115, + 1 + ], + "tokens": [ + "▁de", + "f", + "▁add", + "(", + "a", + ":", + "▁in", + "t", + ",", + "▁", + "b", + ":", + "▁in", + "t", + ")", + "▁", + "->", + "▁in", + "t", + ":", + "▁return", + "▁", + "a", + "▁+", + "▁", + "b", + "" + ], + "decoded_with_special": "def add(a: int, b: int) -> int: return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int: return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_dakuten": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_handakuten": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_kanji_mixed": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_romaji_mixed": { + "input_ids": [ + 20477, + 3, + 2, + 2040, + 3696, + 2217, + 8585, + 3, + 2, + 3, + 2, + 3785, + 834, + 23, + 26, + 7, + 3, + 2, + 14145, + 7, + 3, + 2, + 1 + ], + "tokens": [ + "▁Swift", + "▁", + "", + "▁Auto", + "To", + "ken", + "izer", + "▁", + "", + "▁", + "", + "▁input", + "_", + "i", + "d", + "s", + "▁", + "", + "▁token", + "s", + "▁", + "", + "" + ], + "decoded_with_special": "Swift AutoTokenizer input_ids tokens ", + "decoded_skip_special": "Swift AutoTokenizer input_ids tokens " + }, + "ja_long_sentence": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ko_hangul_simple": { + "input_ids": [ + 3, + 2, + 5, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + ".", + "▁", + "", + ".", + "" + ], + "decoded_with_special": ". .", + "decoded_skip_special": ".." + }, + "ko_hangul_jamo": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " .", + "decoded_skip_special": " ." 
+ }, + "zh_simplified": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "zh_traditional": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "zh_mixed_en": { + "input_ids": [ + 12901, + 382, + 127, + 524, + 3, + 2, + 1659, + 1036, + 3, + 2, + 1 + ], + "tokens": [ + "▁Py", + "T", + "or", + "ch", + "▁", + "", + "▁deep", + "▁learning", + "▁", + "", + "" + ], + "decoded_with_special": "PyTorch deep learning ", + "decoded_skip_special": "PyTorch deep learning " + }, + "ar_basic": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " .", + "decoded_skip_special": " ." + }, + "ar_diacritics": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": " ", + "decoded_skip_special": " " + }, + "he_basic": { + "input_ids": [ + 3, + 2, + 3, + 2, + 5, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + ".", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " . .", + "decoded_skip_special": ". ." + }, + "hi_devanagari": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": " ", + "decoded_skip_special": " " + }, + "th_basic": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "emoji_bmp": { + "input_ids": [ + 3068, + 3, + 2, + 8114, + 3, + 2, + 2213, + 3, + 2, + 842, + 3, + 2, + 1 + ], + "tokens": [ + "▁Sun", + "▁", + "", + "▁moon", + "▁", + "", + "▁star", + "▁", + "", + "▁heart", + "▁", + "", + "" + ], + "decoded_with_special": "Sun moon star heart ", + "decoded_skip_special": "Sun moon star heart " + }, + "emoji_astral": { + "input_ids": [ + 3, + 2, + 12, + 8, + 8114, + 3, + 2, + 28, + 3, + 9, + 3, + 2, + 11, + 3, + 9, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁to", + "▁the", + "▁moon", + "▁", + "", + "▁with", + "▁", + "a", + "▁", + "", + "▁and", + "▁", + "a", + "▁", + "", + "" + ], + "decoded_with_special": " to the moon with a and a ", + "decoded_skip_special": " to the moon with a and a " + }, + "emoji_zwj_family": { + "input_ids": [ + 3712, + 10, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 11, + 5692, + 10, + 3, + 2, + 1 + ], + "tokens": [ + "▁Family", + ":", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁and", + "▁flag", + ":", + "▁", + "", + "" + ], + "decoded_with_special": "Family: and flag: ", + "decoded_skip_special": "Family: and flag: " + }, + "emoji_skin_tone": { + "input_ids": [ + 3, + 2, + 6772, + 28, + 1133, + 12, + 1496, + 1 + ], + "tokens": [ + "▁", + "", + "▁wave", + "▁with", + "▁skin", + "▁to", + "nes", + "" + ], + "decoded_with_special": " wave with skin tones", + "decoded_skip_special": " wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 8774, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁Hello", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": "Hello ", + "decoded_skip_special": "Hello " + }, + "mixed_code_jp": { + 
"input_ids": [ + 13751, + 3, + 2, + 752, + 18660, + 3274, + 96, + 2, + 4720, + 1 + ], + "tokens": [ + "▁//", + "▁", + "", + "▁let", + "▁greeting", + "▁=", + "▁\"", + "", + "!\"", + "" + ], + "decoded_with_special": "// let greeting = \"!\"", + "decoded_skip_special": "// let greeting = \"!\"" + }, + "ipa_basic": { + "input_ids": [ + 37, + 3, + 25981, + 21, + 3, + 31, + 6779, + 31, + 19, + 3, + 87, + 89, + 2, + 87, + 11, + 21, + 3, + 31, + 2009, + 31, + 19, + 3, + 87, + 2, + 102, + 87, + 5, + 1 + ], + "tokens": [ + "▁The", + "▁", + "IPA", + "▁for", + "▁", + "'", + "fish", + "'", + "▁is", + "▁", + "/", + "f", + "", + "/", + "▁and", + "▁for", + "▁", + "'", + "ship", + "'", + "▁is", + "▁", + "/", + "", + "p", + "/", + ".", + "" + ], + "decoded_with_special": "The IPA for 'fish' is /f/ and for'ship' is /p/.", + "decoded_skip_special": "The IPA for 'fish' is /f/ and for'ship' is /p/." + }, + "edge_combining": { + "input_ids": [ + 11949, + 41, + 567, + 5390, + 61, + 3, + 208, + 7, + 11949, + 41, + 12619, + 308, + 61, + 3, + 318, + 337, + 1448, + 6, + 315, + 57, + 1422, + 5, + 1 + ], + "tokens": [ + "▁café", + "▁(", + "N", + "FC", + ")", + "▁", + "v", + "s", + "▁café", + "▁(", + "NF", + "D", + ")", + "▁", + "—", + "▁same", + "▁word", + ",", + "▁different", + "▁by", + "tes", + ".", + "" + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 3, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 3, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 1 + ], + "tokens": [ + "▁", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "▁", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json new file mode 100644 index 00000000..3fc1760e --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json @@ -0,0 +1,2096 @@ +{ + "model_id": "openai-community/gpt2", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 464, + 2068, + 7586, + 21831, + 18045, + 625, + 262, + 16931, + 3290, + 13 + ], + "tokens": [ + "The", + "Ġquick", + "Ġbrown", + "Ġfox", + "Ġjumps", + "Ġover", + "Ġthe", + "Ġlazy", + "Ġdog", + "." + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." 
+ }, + "ascii_punct": { + "input_ids": [ + 15496, + 11, + 995, + 0, + 632, + 338, + 1105, + 25, + 2682, + 851, + 366, + 22708, + 1, + 705, + 499, + 455, + 22599, + 6, + 357, + 11730, + 737 + ], + "tokens": [ + "Hello", + ",", + "Ġworld", + "!", + "ĠIt", + "'s", + "Ġ12", + ":", + "34", + "ĠâĢĶ", + "Ġ\"", + "quote", + "\"", + "Ġ'", + "ap", + "ost", + "rophe", + "'", + "Ġ(", + "paren", + ")." + ], + "decoded_with_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 38729, + 318, + 6702, + 513, + 13, + 1415, + 19707, + 26, + 304, + 318, + 546, + 362, + 13, + 45720, + 2078, + 13 + ], + "tokens": [ + "Pi", + "Ġis", + "Ġapproximately", + "Ġ3", + ".", + "14", + "159", + ";", + "Ġe", + "Ġis", + "Ġabout", + "Ġ2", + ".", + "718", + "28", + "." + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 31141, + 3740, + 1378, + 71, + 1018, + 2667, + 2550, + 13, + 1073, + 14, + 31628, + 290, + 6920, + 29202, + 31, + 20688, + 13, + 785, + 13 + ], + "tokens": [ + "Visit", + "Ġhttps", + "://", + "h", + "ug", + "ging", + "face", + ".", + "co", + "/", + "docs", + "Ġand", + "Ġmail", + "Ġbob", + "@", + "example", + ".", + "com", + "." + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 4299, + 751, + 7, + 64, + 25, + 493, + 11, + 275, + 25, + 493, + 8, + 4613, + 493, + 25, + 198, + 220, + 220, + 220, + 1441, + 257, + 1343, + 275 + ], + "tokens": [ + "def", + "Ġadd", + "(", + "a", + ":", + "Ġint", + ",", + "Ġb", + ":", + "Ġint", + ")", + "Ġ->", + "Ġint", + ":", + "Ċ", + "Ġ", + "Ġ", + "Ġ", + "Ġreturn", + "Ġa", + "Ġ+", + "Ġb" + ], + "decoded_with_special": "def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 40948, + 18566, + 29557, + 2515, + 230, + 2515, + 232, + 27370, + 33778, + 31917, + 2515, + 239, + 46036, + 43357, + 22180, + 33623, + 2515, + 249, + 2515, + 251 + ], + "tokens": [ + "ãģĤ", + "ãģĦ", + "ãģĨ", + "ãģ", + "Ī", + "ãģ", + "Ĭ", + "ãģĭ", + "ãģį", + "ãģı", + "ãģ", + "ij", + "ãģĵ", + "ãģķ", + "ãģĹ", + "ãģĻ", + "ãģ", + "Ľ", + "ãģ", + "Ŀ" + ], + "decoded_with_special": "あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 35585, + 2515, + 236, + 2515, + 238, + 2515, + 240, + 2515, + 242, + 2515, + 244, + 2515, + 246, + 2515, + 248, + 2515, + 250, + 2515, + 252, + 46777, + 2515, + 95, + 2515, + 98, + 30640, + 2515, + 102 + ], + "tokens": [ + "ãģĮ", + "ãģ", + "İ", + "ãģ", + "IJ", + "ãģ", + "Ĵ", + "ãģ", + "Ķ", + "ãģ", + "ĸ", + "ãģ", + "ĺ", + "ãģ", + "ļ", + "ãģ", + "ľ", + "ãģ", + "ŀ", + "ãģł", + "ãģ", + "¢", + "ãģ", + "¥", + "ãģ§", + "ãģ", + "©" + ], + "decoded_with_special": "がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 2515, + 109, + 2515, + 112, + 2515, + 115, + 2515, + 118, + 2515, + 121, + 32546, + 1209, + 242, + 30965, + 1209, + 248, + 1209, + 251 + ], + "tokens": [ + "ãģ", + "±", + "ãģ", + "´", + "ãģ", + "·", + "ãģ", + "º", + "ãģ", + "½", + "ãĥij", + "ãĥ", + "Ķ", + "ãĥĹ", + "ãĥ", + "ļ", + "ãĥ", + "Ŀ" + ], + 
"decoded_with_special": "ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 33768, + 98, + 17312, + 105, + 45739, + 252, + 15474, + 121, + 95, + 162, + 26534, + 163, + 112, + 254, + 164, + 100, + 96, + 162, + 252, + 238, + 31676, + 37239, + 96, + 22180, + 18566, + 161, + 243, + 237, + 165, + 94, + 234, + 30640, + 33623, + 16764 + ], + "tokens": [ + "æĹ", + "¥", + "æľ", + "¬", + "èª", + "ŀ", + "ãģ®å", + "½", + "¢", + "æ", + "ħĭ", + "ç", + "´", + "ł", + "è", + "§", + "£", + "æ", + "ŀ", + "IJ", + "ãģ¯", + "éĽ", + "£", + "ãģĹ", + "ãģĦ", + "å", + "ķ", + "ı", + "é", + "¡", + "Į", + "ãģ§", + "ãģĻ", + "ãĢĤ" + ], + "decoded_with_special": "日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 10462, + 2135, + 23294, + 100, + 11160, + 30642, + 7509, + 17433, + 240, + 164, + 102, + 99, + 33623, + 16764, + 13328, + 113, + 238, + 162, + 252, + 250, + 31676, + 5128, + 62, + 2340, + 23294, + 101, + 16326, + 220, + 5641, + 12859, + 234, + 2515, + 97, + 16764 + ], + "tokens": [ + "Sw", + "ift", + "Ġãģ", + "§", + "ĠAuto", + "Token", + "izer", + "ĠãĤ", + "Ĵ", + "è", + "©", + "¦", + "ãģĻ", + "ãĢĤ", + "Ġç", + "µ", + "IJ", + "æ", + "ŀ", + "ľ", + "ãģ¯", + "Ġinput", + "_", + "ids", + "Ġãģ", + "¨", + "Ġtokens", + "Ġ", + "ãģ®", + "äº", + "Į", + "ãģ", + "¤", + "ãĢĤ" + ], + "decoded_with_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 30266, + 109, + 12859, + 105, + 31965, + 117, + 164, + 101, + 109, + 164, + 101, + 109, + 20998, + 107, + 161, + 109, + 222, + 5641, + 164, + 101, + 109, + 20998, + 107, + 161, + 109, + 222, + 165, + 243, + 115, + 35585, + 164, + 101, + 109, + 20998, + 107, + 31758, + 49035, + 118, + 22180, + 25224, + 36853, + 22180, + 18566, + 35585, + 23513, + 17312, + 105, + 37605, + 241, + 46777, + 1792, + 235, + 29557, + 27370, + 16764 + ], + "tokens": [ + "æĿ", + "±", + "äº", + "¬", + "çī", + "¹", + "è", + "¨", + "±", + "è", + "¨", + "±", + "åı", + "¯", + "å", + "±", + "Ģ", + "ãģ®", + "è", + "¨", + "±", + "åı", + "¯", + "å", + "±", + "Ģ", + "é", + "ķ", + "·", + "ãģĮ", + "è", + "¨", + "±", + "åı", + "¯", + "ãĤĴ", + "åĩ", + "º", + "ãģĹ", + "ãģŁ", + "ãĤī", + "ãģĹ", + "ãģĦ", + "ãģĮ", + "ãĢģ", + "æľ", + "¬", + "å½", + "ĵ", + "ãģł", + "ãĤ", + "į", + "ãģĨ", + "ãģĭ", + "ãĢĤ" + ], + "decoded_with_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 168, + 243, + 230, + 167, + 227, + 243, + 47991, + 246, + 168, + 226, + 116, + 168, + 248, + 242, + 13, + 31619, + 108, + 246, + 166, + 108, + 239, + 168, + 232, + 113, + 46695, + 230, + 46695, + 97, + 13 + ], + "tokens": [ + "ì", + "ķ", + "Ī", + "ë", + "ħ", + "ķ", + "íķ", + "ĺ", + "ì", + "Ħ", + "¸", + "ì", + "ļ", + "Ķ", + ".", + "Ġë", + "°", + "ĺ", + "ê", + "°", + "ij", + "ì", + "Ĭ", + "µ", + "ëĭ", + "Ī", + "ëĭ", + "¤", + "." + ], + "decoded_with_special": "안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." 
+ }, + "ko_hangul_jamo": { + "input_ids": [ + 157, + 226, + 240, + 157, + 227, + 94, + 157, + 228, + 104, + 166, + 116, + 222, + 23821, + 252, + 238, + 167, + 103, + 101, + 31619, + 114, + 226, + 167, + 99, + 105, + 220, + 169, + 246, + 243, + 168, + 233, + 251, + 166, + 111, + 120, + 220, + 47991, + 250, + 166, + 116, + 222, + 23821, + 251, + 234, + 168, + 254, + 230, + 220, + 169, + 246, + 243, + 168, + 233, + 251, + 13 + ], + "tokens": [ + "á", + "Ħ", + "Ĵ", + "á", + "ħ", + "¡", + "á", + "Ĩ", + "«", + "ê", + "¸", + "Ģ", + "Ġì", + "ŀ", + "IJ", + "ë", + "ª", + "¨", + "Ġë", + "¶", + "Ħ", + "ë", + "¦", + "¬", + "Ġ", + "í", + "ĺ", + "ķ", + "ì", + "ĭ", + "Ŀ", + "ê", + "³", + "¼", + "Ġ", + "íķ", + "ľ", + "ê", + "¸", + "Ģ", + "Ġì", + "Ŀ", + "Į", + "ì", + "ł", + "Ī", + "Ġ", + "í", + "ĺ", + "ķ", + "ì", + "ĭ", + "Ŀ", + "." + ], + "decoded_with_special": "한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." + }, + "zh_simplified": { + "input_ids": [ + 17312, + 118, + 161, + 247, + 101, + 27764, + 99, + 20046, + 254, + 42468, + 21689, + 32432, + 98, + 162, + 247, + 118, + 47797, + 121, + 21410, + 31660, + 10310, + 103, + 34932, + 235, + 17358, + 223, + 26344, + 228, + 162, + 242, + 107, + 16764 + ], + "tokens": [ + "æľ", + "º", + "å", + "Ļ", + "¨", + "åŃ", + "¦", + "ä¹", + "ł", + "æĺ¯", + "人", + "å·", + "¥", + "æ", + "Ļ", + "º", + "èĥ", + "½", + "çļĦ", + "ä¸Ģ", + "ä¸", + "ª", + "éĩ", + "į", + "è¦", + "ģ", + "åĪ", + "Ĩ", + "æ", + "Ķ", + "¯", + "ãĢĤ" + ], + "decoded_with_special": "机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 49960, + 161, + 247, + 101, + 27764, + 116, + 163, + 123, + 240, + 42468, + 21689, + 32432, + 98, + 162, + 247, + 118, + 162, + 227, + 100, + 21410, + 31660, + 161, + 222, + 233, + 34932, + 235, + 17358, + 223, + 26344, + 228, + 162, + 242, + 107, + 16764 + ], + "tokens": [ + "æ©Ł", + "å", + "Ļ", + "¨", + "åŃ", + "¸", + "ç", + "¿", + "Ĵ", + "æĺ¯", + "人", + "å·", + "¥", + "æ", + "Ļ", + "º", + "æ", + "ħ", + "§", + "çļĦ", + "ä¸Ģ", + "å", + "Ģ", + "ĭ", + "éĩ", + "į", + "è¦", + "ģ", + "åĪ", + "Ĩ", + "æ", + "Ķ", + "¯", + "ãĢĤ" + ], + "decoded_with_special": "機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 20519, + 15884, + 354, + 10545, + 246, + 107, + 31660, + 10310, + 103, + 2769, + 4673, + 10545, + 94, + 228, + 162, + 252, + 114, + 16764 + ], + "tokens": [ + "Py", + "Tor", + "ch", + "Ġæ", + "ĺ", + "¯", + "ä¸Ģ", + "ä¸", + "ª", + "Ġdeep", + "Ġlearning", + "Ġæ", + "¡", + "Ĩ", + "æ", + "ŀ", + "¶", + "ãĢĤ" + ], + "decoded_with_special": "PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 23525, + 13862, + 148, + 118, + 45632, + 28981, + 44690, + 26897, + 39848, + 22654, + 45632, + 17550, + 105, + 25405, + 22654, + 13862, + 45632, + 17550, + 105, + 38843, + 12919, + 13 + ], + "tokens": [ + "اÙĦ", + "ÙĦ", + "Ø", + "º", + "Ø©", + "ĠاÙĦ", + "ع", + "ر", + "ب", + "ÙĬ", + "Ø©", + "ĠØ", + "¬", + "Ùħ", + "ÙĬ", + "ÙĦ", + "Ø©", + "ĠØ", + "¬", + "د", + "ا", + "." + ], + "decoded_with_special": "اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." 
+ }, + "ar_diacritics": { + "input_ids": [ + 39848, + 44208, + 45692, + 48763, + 25405, + 44208, + 28981, + 13862, + 24333, + 149, + 239, + 29519, + 44208, + 28981, + 26897, + 24333, + 149, + 239, + 148, + 255, + 48763, + 25405, + 24333, + 149, + 108, + 23338, + 44208, + 28981, + 26897, + 24333, + 149, + 239, + 148, + 255, + 44208, + 22654, + 25405, + 44208 + ], + "tokens": [ + "ب", + "ÙIJ", + "س", + "ÙĴ", + "Ùħ", + "ÙIJ", + "ĠاÙĦ", + "ÙĦ", + "Ùİ", + "Ù", + "ij", + "Ùĩ", + "ÙIJ", + "ĠاÙĦ", + "ر", + "Ùİ", + "Ù", + "ij", + "Ø", + "Ń", + "ÙĴ", + "Ùħ", + "Ùİ", + "Ù", + "°", + "ÙĨ", + "ÙIJ", + "ĠاÙĦ", + "ر", + "Ùİ", + "Ù", + "ij", + "Ø", + "Ń", + "ÙIJ", + "ÙĬ", + "Ùħ", + "ÙIJ" + ], + "decoded_with_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 50227, + 40010, + 27072, + 147, + 251, + 14360, + 95, + 27072, + 40010, + 147, + 251, + 13, + 14360, + 244, + 38269, + 14360, + 246, + 147, + 100, + 147, + 94, + 147, + 246, + 14360, + 95, + 49603, + 37778, + 25529, + 13 + ], + "tokens": [ + "ש", + "׾", + "×ķ", + "×", + "Ŀ", + "Ġ×", + "¢", + "×ķ", + "׾", + "×", + "Ŀ", + ".", + "Ġ×", + "ĸ", + "×Ķ", + "Ġ×", + "ĺ", + "×", + "§", + "×", + "¡", + "×", + "ĺ", + "Ġ×", + "¢", + "×ij", + "ר", + "×Ļ", + "." + ], + "decoded_with_special": "שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." + }, + "hi_devanagari": { + "input_ids": [ + 11976, + 117, + 11976, + 123, + 11976, + 101, + 24231, + 235, + 11976, + 99, + 24231, + 222, + 28225, + 255, + 48077, + 11976, + 115, + 48077, + 28225, + 105, + 11976, + 117, + 24231, + 223, + 11976, + 97, + 28225, + 116, + 11976, + 106, + 24231, + 225, + 11976, + 99, + 24231, + 235, + 11976, + 100, + 28225, + 117, + 24231, + 230, + 24231, + 97 + ], + "tokens": [ + "à¤", + "¹", + "à¤", + "¿", + "à¤", + "¨", + "à¥", + "į", + "à¤", + "¦", + "à¥", + "Ģ", + "Ġà¤", + "Ń", + "ा", + "à¤", + "·", + "ा", + "Ġà¤", + "¬", + "à¤", + "¹", + "à¥", + "ģ", + "à¤", + "¤", + "Ġà¤", + "¸", + "à¤", + "®", + "à¥", + "ĥ", + "à¤", + "¦", + "à¥", + "į", + "à¤", + "§", + "Ġà¤", + "¹", + "à¥", + "Ī", + "à¥", + "¤" + ], + "decoded_with_special": "हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 19567, + 223, + 19567, + 110, + 19567, + 96, + 19567, + 249, + 19567, + 96, + 19567, + 108, + 19567, + 94, + 19567, + 100, + 19567, + 98, + 19567, + 250, + 19567, + 98, + 19567, + 254, + 19567, + 110, + 19567, + 102, + 19567, + 110, + 31479, + 226, + 19567, + 245, + 19567, + 95, + 19567, + 233, + 19567, + 109, + 19567, + 248, + 19567, + 233, + 31479, + 231, + 19567, + 255, + 19567, + 247, + 31479, + 222, + 19567, + 252, + 19567, + 96, + 19567, + 110, + 19567, + 108, + 31479, + 226, + 19567, + 94, + 31479, + 230, + 19567, + 94, + 19567, + 113, + 19567, + 232, + 31479, + 230, + 19567, + 255, + 19567, + 229, + 19567, + 100, + 31479, + 230, + 19567, + 110, + 19567, + 229, + 19567, + 96, + 19567, + 108, + 19567, + 104, + 19567, + 100, + 31479, + 230, + 19567, + 110, + 19567, + 229, + 19567, + 226, + 19567, + 111 + ], + "tokens": [ + "à¸", + "ģ", + "à¸", + "²", + "à¸", + "£", + "à¸", + "Ľ", + "à¸", + "£", + "à¸", + "°", + "à¸", + "¡", + "à¸", + "§", + "à¸", + "¥", + "à¸", + "ľ", + "à¸", + "¥", + "à¸", + "ł", + "à¸", + "²", + "à¸", + "©", + "à¸", + "²", + "à¹", + "Ħ", + "à¸", + "Ĺ", + "à¸", + "¢", + "à¸", + "ĭ", + "à¸", + "±", + "à¸", + "ļ", + "à¸", + "ĭ", + "à¹", + "ī", + "à¸", + "Ń", + "à¸", + "Ļ", + "à¹", + "Ģ", 
+ "à¸", + "ŀ", + "à¸", + "£", + "à¸", + "²", + "à¸", + "°", + "à¹", + "Ħ", + "à¸", + "¡", + "à¹", + "Ī", + "à¸", + "¡", + "à¸", + "µ", + "à¸", + "Ĭ", + "à¹", + "Ī", + "à¸", + "Ń", + "à¸", + "ĩ", + "à¸", + "§", + "à¹", + "Ī", + "à¸", + "²", + "à¸", + "ĩ", + "à¸", + "£", + "à¸", + "°", + "à¸", + "«", + "à¸", + "§", + "à¹", + "Ī", + "à¸", + "²", + "à¸", + "ĩ", + "à¸", + "Ħ", + "à¸", + "³" + ], + "decoded_with_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 16012, + 34719, + 222, + 8824, + 34719, + 122, + 3491, + 23883, + 2612, + 20724, + 98 + ], + "tokens": [ + "Sun", + "Ġâĺ", + "Ģ", + "Ġmoon", + "Ġâĺ", + "¾", + "Ġstar", + "Ġâĺħ", + "Ġheart", + "ĠâĻ", + "¥" + ], + "decoded_with_special": "Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 8582, + 248, + 222, + 284, + 262, + 8824, + 12520, + 234, + 247, + 351, + 257, + 12520, + 238, + 109, + 290, + 257, + 12520, + 236, + 231 + ], + "tokens": [ + "ðŁ", + "ļ", + "Ģ", + "Ġto", + "Ġthe", + "Ġmoon", + "ĠðŁ", + "Į", + "Ļ", + "Ġwith", + "Ġa", + "ĠðŁ", + "IJ", + "±", + "Ġand", + "Ġa", + "ĠðŁ", + "İ", + "ī" + ], + "decoded_with_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 24094, + 25, + 50169, + 101, + 447, + 235, + 41840, + 102, + 447, + 235, + 41840, + 100, + 447, + 235, + 41840, + 99, + 290, + 6056, + 25, + 12520, + 229, + 107, + 8582, + 229, + 113, + 8582, + 229, + 108, + 8582, + 229, + 115, + 8582, + 229, + 101, + 8582, + 229, + 111 + ], + "tokens": [ + "Family", + ":", + "ĠðŁij", + "¨", + "âĢ", + "į", + "ðŁij", + "©", + "âĢ", + "į", + "ðŁij", + "§", + "âĢ", + "į", + "ðŁij", + "¦", + "Ġand", + "Ġflag", + ":", + "ĠðŁ", + "ĩ", + "¯", + "ðŁ", + "ĩ", + "µ", + "ðŁ", + "ĩ", + "°", + "ðŁ", + "ĩ", + "·", + "ðŁ", + "ĩ", + "¨", + "ðŁ", + "ĩ", + "³" + ], + "decoded_with_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 41840, + 233, + 8582, + 237, + 119, + 41840, + 233, + 8582, + 237, + 121, + 41840, + 233, + 8582, + 237, + 123, + 6769, + 351, + 4168, + 23755 + ], + "tokens": [ + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "»", + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "½", + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "¿", + "Ġwave", + "Ġwith", + "Ġskin", + "Ġtones" + ], + "decoded_with_special": "👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 15496, + 220, + 10310, + 244, + 45911, + 234, + 23821, + 243, + 230, + 167, + 227, + 243, + 14360, + 102, + 40010, + 27072, + 147, + 251, + 47048, + 26897, + 148, + 255, + 39848, + 12919, + 28225, + 101, + 11976, + 106, + 11976, + 116, + 24231, + 235, + 11976, + 97, + 24231, + 229, + 220, + 19567, + 103, + 19567, + 100, + 19567, + 109, + 19567, + 103, + 19567, + 242, + 19567, + 113, + 23294, + 241, + 22174, + 28618, + 2515, + 94, + 31676 + ], + "tokens": [ + "Hello", + "Ġ", + "ä¸", + "ĸ", + "çķ", + "Į", + "Ġì", + "ķ", + "Ī", + "ë", + "ħ", + "ķ", + "Ġ×", + "©", + "׾", + "×ķ", + "×", + "Ŀ", + "ĠÙħ", + "ر", + "Ø", + "Ń", + "ب", + "ا", + "Ġà¤", + "¨", + "à¤", + "®", + "à¤", + "¸", + "à¥", + "į", + "à¤", + "¤", + "à¥", + "ĩ", + "Ġ", + "à¸", + "ª", + "à¸", + "§", + "à¸", + "±", + "à¸", + "ª", + "à¸", + "Ķ", + "à¸", + "µ", + "Ġãģ", + "ĵ", + "ãĤĵ", 
+ "ãģ«", + "ãģ", + "¡", + "ãģ¯" + ], + "decoded_with_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 1003, + 10545, + 245, + 98, + 17312, + 105, + 45739, + 252, + 24679, + 26998, + 6527, + 13298, + 198, + 1616, + 31933, + 796, + 366, + 46036, + 22174, + 28618, + 2515, + 94, + 31676, + 23513, + 10310, + 244, + 45911, + 234, + 2474 + ], + "tokens": [ + "//", + "Ġæ", + "Ĺ", + "¥", + "æľ", + "¬", + "èª", + "ŀ", + "ãĤ³", + "ãĥ¡", + "ãĥ³", + "ãĥĪ", + "Ċ", + "let", + "Ġgreeting", + "Ġ=", + "Ġ\"", + "ãģĵ", + "ãĤĵ", + "ãģ«", + "ãģ", + "¡", + "ãģ¯", + "ãĢģ", + "ä¸", + "ĸ", + "çķ", + "Į", + "!\"" + ], + "decoded_with_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 464, + 27966, + 329, + 705, + 11084, + 6, + 318, + 1220, + 69, + 133, + 103, + 134, + 225, + 14, + 290, + 329, + 705, + 6720, + 6, + 318, + 1220, + 134, + 225, + 133, + 103, + 79, + 11757 + ], + "tokens": [ + "The", + "ĠIPA", + "Ġfor", + "Ġ'", + "fish", + "'", + "Ġis", + "Ġ/", + "f", + "É", + "ª", + "Ê", + "ĥ", + "/", + "Ġand", + "Ġfor", + "Ġ'", + "ship", + "'", + "Ġis", + "Ġ/", + "Ê", + "ĥ", + "É", + "ª", + "p", + "/." + ], + "decoded_with_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." + }, + "edge_combining": { + "input_ids": [ + 66, + 1878, + 2634, + 357, + 45, + 4851, + 8, + 3691, + 26725, + 136, + 223, + 357, + 21870, + 35, + 8, + 851, + 976, + 1573, + 11, + 1180, + 9881, + 13 + ], + "tokens": [ + "c", + "af", + "é", + "Ġ(", + "N", + "FC", + ")", + "Ġvs", + "Ġcafe", + "Ì", + "ģ", + "Ġ(", + "NF", + "D", + ")", + "ĠâĢĶ", + "Ġsame", + "Ġword", + ",", + "Ġdifferent", + "Ġbytes", + "." + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 275, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 65 + ], + "tokens": [ + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "Ġb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "b" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json b/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json new file mode 100644 index 00000000..b5eda950 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json @@ -0,0 +1,36 @@ +{ + "schema_version": 1, + "description": "Multilingual stress corpus for byte-identical tokenization parity tests against HuggingFace Python `transformers`. Each entry is keyed by a stable id so baselines can be re-aligned across tokenizer kernels. 
Designed to exercise script boundaries that decoder-only English corpora miss: voiced-kana / dakuten, Hangul jamo composition, Han ideographs (simplified + traditional), RTL bidi, Indic + Thai, astral-plane glyphs, ZWJ grapheme clusters, and mixed-script code.", + "inputs": [ + { "id": "ascii_simple", "category": "ascii", "text": "The quick brown fox jumps over the lazy dog." }, + { "id": "ascii_punct", "category": "ascii", "text": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." }, + { "id": "ascii_numbers", "category": "ascii", "text": "Pi is approximately 3.14159; e is about 2.71828." }, + { "id": "ascii_url", "category": "ascii", "text": "Visit https://huggingface.co/docs and mail bob@example.com." }, + { "id": "ascii_code", "category": "code", "text": "def add(a: int, b: int) -> int:\n return a + b" }, + { "id": "ja_kana_basic", "category": "japanese", "text": "あいうえおかきくけこさしすせそ" }, + { "id": "ja_dakuten", "category": "japanese", "text": "がぎぐげござじずぜぞだぢづでど" }, + { "id": "ja_handakuten", "category": "japanese", "text": "ぱぴぷぺぽパピプペポ" }, + { "id": "ja_kanji_mixed", "category": "japanese", "text": "日本語の形態素解析は難しい問題です。" }, + { "id": "ja_romaji_mixed", "category": "japanese", "text": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" }, + { "id": "ja_long_sentence", "category": "japanese", "text": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" }, + { "id": "ko_hangul_simple", "category": "korean", "text": "안녕하세요. 반갑습니다." }, + { "id": "ko_hangul_jamo", "category": "korean", "text": "한글 자모 분리 형식과 한글 음절 형식." }, + { "id": "zh_simplified", "category": "chinese", "text": "机器学习是人工智能的一个重要分支。" }, + { "id": "zh_traditional", "category": "chinese", "text": "機器學習是人工智慧的一個重要分支。" }, + { "id": "zh_mixed_en", "category": "chinese", "text": "PyTorch 是一个 deep learning 框架。" }, + { "id": "ar_basic", "category": "arabic", "text": "اللغة العربية جميلة جدا." }, + { "id": "ar_diacritics", "category": "arabic", "text": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" }, + { "id": "he_basic", "category": "hebrew", "text": "שלום עולם. זה טקסט עברי." }, + { "id": "hi_devanagari", "category": "devanagari", "text": "हिन्दी भाषा बहुत समृद्ध है।" }, + { "id": "th_basic", "category": "thai", "text": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" }, + { "id": "emoji_bmp", "category": "emoji", "text": "Sun ☀ moon ☾ star ★ heart ♥" }, + { "id": "emoji_astral", "category": "emoji", "text": "🚀 to the moon 🌙 with a 🐱 and a 🎉" }, + { "id": "emoji_zwj_family", "category": "emoji", "text": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" }, + { "id": "emoji_skin_tone", "category": "emoji", "text": "👋🏻👋🏽👋🏿 wave with skin tones" }, + { "id": "mixed_polyglot", "category": "mixed", "text": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" }, + { "id": "mixed_code_jp", "category": "mixed", "text": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" }, + { "id": "ipa_basic", "category": "ipa", "text": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." }, + { "id": "edge_combining", "category": "edge", "text": "café (NFC) vs café (NFD) — same word, different bytes." }, + { "id": "edge_long_repetition", "category": "edge", "text": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" } + ] +} diff --git a/Tools/README.md b/Tools/README.md new file mode 100644 index 00000000..4e230d4c --- /dev/null +++ b/Tools/README.md @@ -0,0 +1,52 @@ +# Tools + +Repository-side scripts that produce or maintain fixtures used by the Swift test +suite. They run on macOS/Linux with a CPython interpreter and do not touch the +Swift build. 
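+
+The conformance fixtures live under
+`Tests/TokenizersTests/Resources/MultilingualConformance/`: one `inputs.json`
+corpus plus one baseline JSON file per model. A quick local consistency check,
+shown here as a minimal sketch run from the repository root (not a checked-in
+tool), is that every baseline covers every corpus input:
+
+```python
+import json
+from pathlib import Path
+
+base = Path("Tests/TokenizersTests/Resources/MultilingualConformance")
+
+# Every corpus input id should have a matching entry in every baseline file.
+corpus = json.loads((base / "inputs.json").read_text(encoding="utf-8"))["inputs"]
+corpus_ids = {item["id"] for item in corpus}
+for baseline_path in sorted((base / "baselines").glob("*.json")):
+    entries = json.loads(baseline_path.read_text(encoding="utf-8"))["entries"]
+    missing = corpus_ids - entries.keys()
+    if missing:
+        raise SystemExit(f"{baseline_path.name} is missing entries: {sorted(missing)}")
+```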
+ +## `generate_tokenizer_baselines.py` + +Regenerates the byte-identical reference values consumed by +`Tests/TokenizersTests/MultilingualConformanceTests.swift`. The Python +`transformers` library is treated as the authoritative reference; whenever +this script changes its output, the Swift parity tests are expected to be +re-validated against the new baselines. + +### Setup + +```sh +python3 -m venv .venv-tokenizer-baselines +.venv-tokenizer-baselines/bin/pip install -r Tools/requirements.txt +``` + +### Regenerate all kernels + +```sh +.venv-tokenizer-baselines/bin/python Tools/generate_tokenizer_baselines.py +``` + +This rewrites every `Tests/TokenizersTests/Resources/MultilingualConformance/baselines/*.json` +file in place. Commit the diffs together with the upstream `transformers` +version pinned in `Tools/requirements.txt` so the references are reproducible. + +### Regenerate a single kernel + +```sh +.venv-tokenizer-baselines/bin/python Tools/generate_tokenizer_baselines.py \ + --models BAAI/bge-small-en-v1.5 +``` + +### Adding a new kernel or input + +1. Append the model id to the `MODELS` list in + `generate_tokenizer_baselines.py`, or add an entry to `inputs.json`. +2. Rerun the script. The new baseline file appears under + `Tests/TokenizersTests/Resources/MultilingualConformance/baselines/`. +3. Mirror the kernel in `MultilingualConformanceTests.swift`'s `kernels` + array. +4. Run `swift test --filter MultilingualConformanceTests`. + +If the Swift tokenizer diverges from the new reference, add an entry to +`expectedDivergences` linking to the relevant upstream issue or PR. The test +target stays green while the divergence remains documented, and the test +prints a hint when an upstream fix lands and the entry can be removed. diff --git a/Tools/generate_tokenizer_baselines.py b/Tools/generate_tokenizer_baselines.py new file mode 100644 index 00000000..df97e29d --- /dev/null +++ b/Tools/generate_tokenizer_baselines.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Regenerate multilingual conformance baselines from HuggingFace Python `transformers`. + +This script is the single source of truth for the byte-identical reference values +that `MultilingualConformanceTests` compares Swift output against. To regenerate: + + pip install -r Tools/requirements.txt + python Tools/generate_tokenizer_baselines.py + +Each baseline file is a JSON dictionary keyed by input id, containing the +`input_ids`, the convert_ids_to_tokens result, and the decoded form (both with +and without special tokens). The values are produced by Python's +`AutoTokenizer.from_pretrained(model_id)`, which is treated as the authoritative +reference for byte-identical parity. + +When a Swift test fails against a baseline, regenerate locally with the same +`transformers` version listed in `requirements.txt` to confirm the divergence +isn't an upstream change. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +try: + import transformers + from transformers import AutoTokenizer +except ImportError: + sys.stderr.write( + "transformers is required. Install with: pip install -r Tools/requirements.txt\n" + ) + sys.exit(1) + + +# Model matrix is intentionally small and covers four distinct tokenizer kernels +# observed in production HuggingFace text models. Adding a new kernel is +# preferable to adding a near-duplicate of an existing one. 
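+# For reference, each model below becomes one baseline file named
+# `baselines/{slugify(model_id)}.json`, e.g. "google-t5/t5-small" is written to
+# `baselines/google-t5__t5-small.json`.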
+MODELS = [
+    # WordPiece (Bert family) — exercises BasicTokenizer pre-tokenization on
+    # CJK / dakuten / diacritics. BGE-small-en is a widely used encoder for
+    # embedding pipelines on Apple Silicon.
+    "BAAI/bge-small-en-v1.5",
+
+    # Unigram / SentencePiece — exercises Unigram lattice + Metaspace decoder
+    # on multi-codepoint graphemes. T5-small is the canonical Unigram model and
+    # ships the tokenizer.json required by swift-transformers.
+    "google-t5/t5-small",
+
+    # Byte-level BPE (GPT-2 family) — exercises ByteLevelPreTokenizer regex +
+    # byte encoding. Expected to be byte-identical with the Python reference
+    # across the entire corpus.
+    "openai-community/gpt2",
+
+    # Modern byte-level BPE (Qwen family) — exercises a more recent vocabulary
+    # and merge table while sharing the GPT-2 kernel.
+    "Qwen/Qwen2.5-0.5B",
+
+    # SentencePiece BPE with byte-fallback (Llama family) — exercises BPE merges
+    # on multi-codepoint graphemes. TinyLlama uses the standard Llama tokenizer
+    # without an auth gate.
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+
+def slugify(model_id: str) -> str:
+    """Filesystem-safe representation of a HuggingFace model id."""
+    return model_id.replace("/", "__")
+
+
+def encode_input(tokenizer: Any, text: str) -> dict[str, Any]:
+    """Produce a stable JSON-serializable view of how `tokenizer` handles `text`."""
+    # `add_special_tokens=True` matches what `tokenizer.encode(text)` and the
+    # default `tokenizer(text)` call produce.
+    input_ids = tokenizer.encode(text, add_special_tokens=True)
+    tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    decoded_with_special = tokenizer.decode(input_ids, skip_special_tokens=False)
+    decoded_skip_special = tokenizer.decode(input_ids, skip_special_tokens=True)
+    return {
+        "input_ids": list(input_ids),
+        "tokens": list(tokens),
+        "decoded_with_special": decoded_with_special,
+        "decoded_skip_special": decoded_skip_special,
+    }
+
+
+def generate(model_id: str, corpus: list[dict[str, Any]]) -> dict[str, Any]:
+    print(f"  loading tokenizer for {model_id}...", flush=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    # swift-transformers loads tokenizers via `tokenizer.json` (the Rust-backed
+    # fast format), so the Python reference has to be the matching fast
+    # tokenizer for parity to be meaningful. Slow tokenizers can silently
+    # produce different ids on multi-codepoint inputs.
+    if not getattr(tokenizer, "is_fast", False):
+        raise RuntimeError(
+            f"{model_id} resolved to a slow tokenizer; swift-transformers requires a "
+            "tokenizer.json (fast) reference. Either pick a model with tokenizer.json "
+            "published, or pre-convert one with `AutoTokenizer.save_pretrained` and "
+            "point this script at the local path."
+        )
+    entries: dict[str, Any] = {}
+    for item in corpus:
+        entries[item["id"]] = encode_input(tokenizer, item["text"])
+    return {
+        "model_id": model_id,
+        "transformers_version": transformers.__version__,
+        "entries": entries,
+    }
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--corpus",
+        type=Path,
+        default=None,
+        help="Path to inputs.json (defaults to Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Where to write the per-model baseline JSON files",
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        default=MODELS,
+        help="Override the model matrix (default: all 5 models)",
+    )
+    args = parser.parse_args()
+
+    repo_root = Path(__file__).resolve().parent.parent
+    base_dir = repo_root / "Tests" / "TokenizersTests" / "Resources" / "MultilingualConformance"
+    corpus_path = args.corpus or (base_dir / "inputs.json")
+    output_dir = args.output_dir or (base_dir / "baselines")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    corpus = json.loads(corpus_path.read_text(encoding="utf-8"))["inputs"]
+    print(f"loaded {len(corpus)} inputs from {_display_path(corpus_path, repo_root)}")
+
+    for model_id in args.models:
+        baseline = generate(model_id, corpus)
+        path = output_dir / f"{slugify(model_id)}.json"
+        path.write_text(
+            json.dumps(baseline, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+        print(f"  wrote {_display_path(path, repo_root)}")
+
+    return 0
+
+
+def _display_path(path: Path, repo_root: Path) -> str:
+    """Return `path` as repo-relative when it lives inside the repo, otherwise absolute."""
+    try:
+        return str(path.resolve().relative_to(repo_root))
+    except ValueError:
+        return str(path.resolve())
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/Tools/requirements.txt b/Tools/requirements.txt
new file mode 100644
index 00000000..88942cb6
--- /dev/null
+++ b/Tools/requirements.txt
@@ -0,0 +1,7 @@
+# Versions pinned for reproducible baselines. Bump the `transformers` pin in
+# the same commit that regenerates the bundled baselines so the parity values
+# stay aligned with a known reference.
+transformers==4.57.1
+tokenizers>=0.20,<0.23
+sentencepiece>=0.2
+protobuf>=4,<8
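+# tokenizers backs the fast (`tokenizer.json`) path; sentencepiece and protobuf
+# are what `AutoTokenizer` uses to convert SentencePiece checkpoints (T5,
+# TinyLlama) to that fast format when a repo ships no prebuilt tokenizer.json.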