diff --git a/.gitignore b/.gitignore index fe9534bc..739846c6 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ DerivedData/ .idea .index-build *.out +# Local Python venv used by Tools/generate_tokenizer_baselines.py +.venv-tokenizer-baselines/ + +Tools/__pycache__/ diff --git a/Tests/TokenizersTests/MultilingualConformanceTests.swift b/Tests/TokenizersTests/MultilingualConformanceTests.swift new file mode 100644 index 00000000..4caa021c --- /dev/null +++ b/Tests/TokenizersTests/MultilingualConformanceTests.swift @@ -0,0 +1,294 @@ +// +// MultilingualConformanceTests.swift +// +// Byte-identical parity tests against HuggingFace Python `transformers`. +// +// Baselines under `Resources/MultilingualConformance/baselines/` are produced +// by `Tools/generate_tokenizer_baselines.py` and treated as the authoritative +// reference. Each Swift tokenizer kernel is expected to produce identical +// `input_ids` for every input in the corpus. +// +// Inputs known to diverge today because of bugs being tracked upstream are +// enumerated in `expectedDivergences` below with a reference to the relevant +// issue or PR, so the target lands green while the work is in flight. Any +// divergence that isn't in that list is a hard failure (regression catch). +// Any input listed there that now matches Python emits a printed hint inviting +// removal of the entry — but doesn't fail the test, so the green CI signal +// isn't broken by an upstream improvement. +// +// Adding a model: append to `kernels`, append to `MODELS` in the Python +// script, and re-run it. Adding an input: append to `inputs.json` and re-run +// the script. +// + +import Foundation +import Testing + +@testable import Hub +@testable import Models +@testable import Tokenizers + +private let downloadDestination: URL = { + let base = FileManager.default.urls(for: .cachesDirectory, in: .userDomainMask).first! + return base.appending(component: "huggingface-tests") +}() + +private let hubApiForTests = HubApi(downloadBase: downloadDestination) + +// MARK: - Fixtures + +private struct CorpusInput: Decodable { + let id: String + let category: String + let text: String +} + +private struct Corpus: Decodable { + let schema_version: Int + let description: String + let inputs: [CorpusInput] +} + +private struct BaselineEntry: Decodable { + let input_ids: [Int] + let tokens: [String] + // The Python generator also emits `decoded_with_special` / `decoded_skip_special` + // for future use; they are intentionally not decoded here because decoder-side + // parity has its own failure modes that deserve a dedicated test (and at least + // one known-buggy path — see WordPieceDecoder's empty-tokens `tokens.first!`). +} + +private struct Baseline: Decodable { + let model_id: String + let transformers_version: String + let entries: [String: BaselineEntry] +} + +private enum FixtureError: Error, CustomStringConvertible { + case missingResource(String) + + var description: String { + switch self { + case .missingResource(let name): "missing resource: \(name)" + } + } +} + +// Resource lookup deliberately doesn't use the `subdirectory:` parameter of +// `Bundle.module.url(forResource:withExtension:subdirectory:)`. SPM's +// `.process("Resources")` does not always preserve the directory layout in a way +// that subdirectory lookup can rely on, but flat lookup by basename works +// because every fixture filename below is unique within the bundle (the corpus +// is named `inputs.json` and every baseline uses a slugified model id). 
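+//
+// For illustration, the slug mapping assumed by `loadBaseline` below (it mirrors
+// the baseline filenames added in this PR: `/` in the model id becomes `__`).
+// `baselineSlug(for:)` is only a sketch, not a helper this file defines:
+//
+//     func baselineSlug(for modelId: String) -> String {
+//         modelId.replacingOccurrences(of: "/", with: "__")
+//     }
+//
+//     baselineSlug(for: "BAAI/bge-small-en-v1.5")  // "BAAI__bge-small-en-v1.5"
+//     baselineSlug(for: "google-t5/t5-small")      // "google-t5__t5-small"
+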
+private func loadCorpus() throws -> Corpus {
+    guard let url = Bundle.module.url(forResource: "inputs", withExtension: "json") else {
+        throw FixtureError.missingResource("inputs.json")
+    }
+    return try JSONDecoder().decode(Corpus.self, from: try Data(contentsOf: url))
+}
+
+private func loadBaseline(_ slug: String) throws -> Baseline {
+    // Slugified model ids replace `/` with `__` so they're valid as filesystem and bundle names.
+    guard let url = Bundle.module.url(forResource: slug, withExtension: "json") else {
+        throw FixtureError.missingResource("\(slug).json")
+    }
+    return try JSONDecoder().decode(Baseline.self, from: try Data(contentsOf: url))
+}
+
+private func makeTokenizer(_ modelId: String) async throws -> Tokenizer {
+    let config = LanguageModelConfigurationFromHub(modelName: modelId, hubApi: hubApiForTests)
+    guard let tokenizerConfig = try await config.tokenizerConfig else {
+        Issue.record("Missing tokenizer config for \(modelId)")
+        throw FixtureError.missingResource("tokenizer_config.json for \(modelId)")
+    }
+    let tokenizerData = try await config.tokenizerData
+    return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData)
+}
+
+// MARK: - Diff formatting
+
+private func formatTokenDiff(
+    expected: [Int],
+    actual: [Int],
+    expectedTokens: [String],
+    actualTokens: [String]
+) -> String {
+    let common = min(expected.count, actual.count)
+    var firstDiff = common
+    for i in 0..<common where expected[i] != actual[i] {
+        firstDiff = i
+        break
+    }
+    // Show a short window around the first mismatch so failures stay readable
+    // even for long inputs.
+    let start = max(0, firstDiff - 3)
+    func window(_ ids: [Int], _ tokens: [String]) -> String {
+        let end = min(ids.count, tokens.count, firstDiff + 4)
+        guard start < end else { return "(exhausted)" }
+        return (start..<end).map { "\(ids[$0])='\(tokens[$0])'" }.joined(separator: " ")
+    }
+    return """
+    first mismatch at index \(firstDiff) (expected \(expected.count) ids, got \(actual.count))
+      expected: \(window(expected, expectedTokens))
+      actual:   \(window(actual, actualTokens))
+    """
+}
+
+// MARK: - Kernels
+
+/// A tokenizer under test: its Hub model id plus the basename of its baseline file.
+private struct Kernel: Sendable, CustomStringConvertible {
+    let modelId: String
+    let baselineSlug: String
+    var description: String { modelId }
+}
+
+/// Keep in sync with `MODELS` in `Tools/generate_tokenizer_baselines.py`.
+private let kernels: [Kernel] = [
+    Kernel(modelId: "BAAI/bge-small-en-v1.5", baselineSlug: "BAAI__bge-small-en-v1.5"),
+    Kernel(modelId: "Qwen/Qwen2.5-0.5B", baselineSlug: "Qwen__Qwen2.5-0.5B"),
+    Kernel(modelId: "TinyLlama/TinyLlama-1.1B-Chat-v1.0", baselineSlug: "TinyLlama__TinyLlama-1.1B-Chat-v1.0"),
+    Kernel(modelId: "google-t5/t5-small", baselineSlug: "google-t5__t5-small"),
+]
+
+/// Inputs known to diverge from Python today, keyed by model id. Each entry should
+/// carry a comment referencing the tracking issue or PR so it can be dropped once
+/// the upstream fix lands.
+private let expectedDivergences: [String: Set<String>] = [:]
+
+// MARK: - Tests
+
+struct MultilingualConformanceTests {
+    /// Byte-identical `input_ids` parity with the Python baseline for every corpus
+    /// input, modulo the divergences enumerated in `expectedDivergences`.
+    @Test("Tokenization matches Python baselines", arguments: kernels)
+    fileprivate func matchesPythonBaselines(kernel: Kernel) async throws {
+        let corpus = try loadCorpus()
+        let baseline = try loadBaseline(kernel.baselineSlug)
+        let tokenizer = try await makeTokenizer(kernel.modelId)
+        let expectedToDiverge = expectedDivergences[kernel.modelId] ?? []
+
+        var unexpectedDivergences: [(input: CorpusInput, message: String)] = []
+        var unexpectedMatches: [String] = []
+
+        for input in corpus.inputs {
+            // A missing baseline entry is reported by `baselinesCoverCorpus`; skip it here.
+            guard let expected = baseline.entries[input.id] else { continue }
+
+            let actualIds = tokenizer.encode(text: input.text)
+            if actualIds == expected.input_ids {
+                if expectedToDiverge.contains(input.id) {
+                    unexpectedMatches.append(input.id)
+                }
+                continue
+            }
+            if expectedToDiverge.contains(input.id) {
+                continue // Known divergence, tracked upstream.
+            }
+
+            let actualTokens = actualIds.map { tokenizer.convertIdToToken($0) ?? "<none>" }
+            let message = formatTokenDiff(
+                expected: expected.input_ids,
+                actual: actualIds,
+                expectedTokens: expected.tokens,
+                actualTokens: actualTokens
+            )
+            unexpectedDivergences.append((input, "[\(input.category)] id=\(input.id) text=\(input.text.debugDescription)\n\(message)"))
+        }
+
+        // Unexpected divergence is a hard failure: either swift-transformers regressed,
+        // or the corpus / baseline added a case that wasn't classified yet.
+        for failure in unexpectedDivergences {
+            Issue.record("\(failure.message)")
+        }
+        if !unexpectedDivergences.isEmpty {
+            Issue.record("\(kernel.modelId): \(unexpectedDivergences.count) unexpected divergence(s) from Python `transformers` \(baseline.transformers_version). Either swift-transformers regressed or `expectedDivergences` needs a new entry.")
+        }
+
+        // Unexpected match is informational: an upstream fix has landed and the entry
+        // should be dropped from `expectedDivergences`. Printed but does NOT fail the
+        // test, so freshly merged improvements don't break CI; the message surfaces
+        // when running locally and is the trigger to clean up the expected list.
+        if !unexpectedMatches.isEmpty {
+            print(
+                "[MultilingualConformance] \(kernel.modelId): \(unexpectedMatches.count) input(s) now match Python — " +
+                    "please remove from `expectedDivergences` in MultilingualConformanceTests.swift: " +
+                    unexpectedMatches.sorted().joined(separator: ", ")
+            )
+        }
+    }
+
+    /// Sanity check: the corpus itself should not regress in shape or schema between
+    /// edits. Caught here so a malformed inputs.json fails fast rather than silently
+    /// skipping cases inside the parity test.
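+    ///
+    /// For reference, the corpus shape the `Corpus` decodables above assume; the
+    /// values here are illustrative, not copied from the real `inputs.json`:
+    ///
+    ///     {
+    ///       "schema_version": 1,
+    ///       "description": "Multilingual tokenizer conformance corpus",
+    ///       "inputs": [
+    ///         { "id": "ascii_simple", "category": "ascii", "text": "The quick brown fox ..." }
+    ///       ]
+    ///     }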
+    @Test("Corpus is well-formed")
+    func corpusIsWellFormed() throws {
+        let corpus = try loadCorpus()
+        #expect(corpus.schema_version == 1)
+        #expect(!corpus.inputs.isEmpty)
+
+        var seen = Set<String>()
+        for input in corpus.inputs {
+            #expect(!input.id.isEmpty, "input id must not be empty")
+            #expect(!seen.contains(input.id), "duplicate input id: \(input.id)")
+            seen.insert(input.id)
+            #expect(!input.text.isEmpty, "input text must not be empty (id: \(input.id))")
+        }
+    }
+
+    /// Sanity check: every kernel must have a baseline file covering every input id.
+    @Test("Baselines cover the corpus", arguments: kernels)
+    fileprivate func baselinesCoverCorpus(kernel: Kernel) throws {
+        let corpus = try loadCorpus()
+        let baseline = try loadBaseline(kernel.baselineSlug)
+        let corpusIds = Set(corpus.inputs.map(\.id))
+        let baselineIds = Set(baseline.entries.keys)
+        let missing = corpusIds.subtracting(baselineIds)
+        let extra = baselineIds.subtracting(corpusIds)
+        #expect(missing.isEmpty, "baseline missing entries: \(missing.sorted())")
+        #expect(extra.isEmpty, "baseline has stale entries not in corpus: \(extra.sorted())")
+    }
+}
diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json
new file mode 100644
index 00000000..f781cc00
--- /dev/null
+++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/BAAI__bge-small-en-v1.5.json
@@ -0,0 +1,1474 @@
+{
+  "model_id": "BAAI/bge-small-en-v1.5",
+  "transformers_version": "4.57.1",
+  "entries": {
+    "ascii_simple": {
+      "input_ids": [
+        101,
+        1996,
+        4248,
+        2829,
+        4419,
+        14523,
+        2058,
+        1996,
+        13971,
+        3899,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "the",
+        "quick",
+        "brown",
+        "fox",
+        "jumps",
+        "over",
+        "the",
+        "lazy",
+        "dog",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] the quick brown fox jumps over the lazy dog. [SEP]",
+      "decoded_skip_special": "the quick brown fox jumps over the lazy dog."
+    },
+    "ascii_punct": {
+      "input_ids": [
+        101,
+        7592,
+        1010,
+        2088,
+        999,
+        2009,
+        1005,
+        1055,
+        2260,
+        1024,
+        4090,
+        1517,
+        1000,
+        14686,
+        1000,
+        1005,
+        9706,
+        14122,
+        18981,
+        5369,
+        1005,
+        1006,
+        11968,
+        2368,
+        1007,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "hello",
+        ",",
+        "world",
+        "!",
+        "it",
+        "'",
+        "s",
+        "12",
+        ":",
+        "34",
+        "—",
+        "\"",
+        "quote",
+        "\"",
+        "'",
+        "ap",
+        "##ost",
+        "##rop",
+        "##he",
+        "'",
+        "(",
+        "par",
+        "##en",
+        ")",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] hello, world! it's 12 : 34 — \" quote \"'apostrophe'( paren ). [SEP]",
+      "decoded_skip_special": "hello, world! it's 12 : 34 — \" quote \"'apostrophe'( paren )."
+    },
+    "ascii_numbers": {
+      "input_ids": [
+        101,
+        14255,
+        2003,
+        3155,
+        1017,
+        1012,
+        15471,
+        28154,
+        1025,
+        1041,
+        2003,
+        2055,
+        1016,
+        1012,
+        6390,
+        2620,
+        22407,
+        1012,
+        102
+      ],
+      "tokens": [
+        "[CLS]",
+        "pi",
+        "is",
+        "approximately",
+        "3",
+        ".",
+        "141",
+        "##59",
+        ";",
+        "e",
+        "is",
+        "about",
+        "2",
+        ".",
+        "71",
+        "##8",
+        "##28",
+        ".",
+        "[SEP]"
+      ],
+      "decoded_with_special": "[CLS] pi is approximately 3. 14159 ; e is about 2. 71828. [SEP]",
+      "decoded_skip_special": "pi is approximately 3. 14159 ; e is about 2. 71828."
+ }, + "ascii_url": { + "input_ids": [ + 101, + 3942, + 16770, + 1024, + 1013, + 1013, + 17662, + 12172, + 1012, + 2522, + 1013, + 9986, + 2015, + 1998, + 5653, + 3960, + 1030, + 2742, + 1012, + 4012, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "visit", + "https", + ":", + "/", + "/", + "hugging", + "##face", + ".", + "co", + "/", + "doc", + "##s", + "and", + "mail", + "bob", + "@", + "example", + ".", + "com", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] visit https : / / huggingface. co / docs and mail bob @ example. com. [SEP]", + "decoded_skip_special": "visit https : / / huggingface. co / docs and mail bob @ example. com." + }, + "ascii_code": { + "input_ids": [ + 101, + 13366, + 5587, + 1006, + 1037, + 1024, + 20014, + 1010, + 1038, + 1024, + 20014, + 1007, + 1011, + 1028, + 20014, + 1024, + 2709, + 1037, + 1009, + 1038, + 102 + ], + "tokens": [ + "[CLS]", + "def", + "add", + "(", + "a", + ":", + "int", + ",", + "b", + ":", + "int", + ")", + "-", + ">", + "int", + ":", + "return", + "a", + "+", + "b", + "[SEP]" + ], + "decoded_with_special": "[CLS] def add ( a : int, b : int ) - > int : return a + b [SEP]", + "decoded_skip_special": "def add ( a : int, b : int ) - > int : return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 101, + 1646, + 30173, + 30174, + 30175, + 30176, + 30177, + 30178, + 30179, + 30180, + 30181, + 30182, + 30183, + 30184, + 30185, + 30186, + 102 + ], + "tokens": [ + "[CLS]", + "あ", + "##い", + "##う", + "##え", + "##お", + "##か", + "##き", + "##く", + "##け", + "##こ", + "##さ", + "##し", + "##す", + "##せ", + "##そ", + "[SEP]" + ], + "decoded_with_special": "[CLS] あいうえおかきくけこさしすせそ [SEP]", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 101, + 1651, + 30178, + 30179, + 30180, + 30181, + 30182, + 30183, + 30184, + 30185, + 30186, + 30187, + 30188, + 30190, + 30191, + 30192, + 102 + ], + "tokens": [ + "[CLS]", + "か", + "##き", + "##く", + "##け", + "##こ", + "##さ", + "##し", + "##す", + "##せ", + "##そ", + "##た", + "##ち", + "##つ", + "##て", + "##と", + "[SEP]" + ], + "decoded_with_special": "[CLS] かきくけこさしすせそたちつてと [SEP]", + "decoded_skip_special": "かきくけこさしすせそたちつてと" + }, + "ja_handakuten": { + "input_ids": [ + 101, + 1672, + 30199, + 30200, + 30201, + 30202, + 30244, + 30245, + 30246, + 30247, + 30248, + 102 + ], + "tokens": [ + "[CLS]", + "は", + "##ひ", + "##ふ", + "##へ", + "##ほ", + "##ハ", + "##ヒ", + "##フ", + "##ヘ", + "##ホ", + "[SEP]" + ], + "decoded_with_special": "[CLS] はひふへほハヒフヘホ [SEP]", + "decoded_skip_special": "はひふへほハヒフヘホ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 101, + 1864, + 1876, + 1950, + 1671, + 100, + 100, + 100, + 100, + 100, + 1672, + 100, + 1657, + 30173, + 100, + 100, + 1665, + 30184, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "日", + "本", + "語", + "の", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "は", + "[UNK]", + "し", + "##い", + "[UNK]", + "[UNK]", + "て", + "##す", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] 日 本 語 の [UNK] [UNK] [UNK] [UNK] [UNK] は [UNK] しい [UNK] [UNK] てす 。 [SEP]", + "decoded_skip_special": "日 本 語 の は しい てす 。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 101, + 9170, + 1665, + 8285, + 18715, + 18595, + 6290, + 1690, + 100, + 1658, + 1636, + 100, + 100, + 1672, + 7953, + 1035, + 8909, + 2015, + 1666, + 19204, + 2015, + 1671, + 1752, + 1664, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "swift", + "て", + "auto", + "##tok", + "##eni", + "##zer", + "を", + "[UNK]", + "す", + "。", + "[UNK]", + "[UNK]", + "は", + "input", + "_", + "id", + "##s", + "と", + "token", + "##s", + "の", + 
"二", + "つ", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] swift て autotokenizer を [UNK] す 。 [UNK] [UNK] は input _ ids と tokens の 二 つ 。 [SEP]", + "decoded_skip_special": "swift て autotokenizer を す 。 は input _ ids と tokens の 二 つ 。" + }, + "ja_long_sentence": { + "input_ids": [ + 101, + 1879, + 1755, + 100, + 100, + 100, + 100, + 100, + 1671, + 100, + 100, + 100, + 1967, + 1651, + 100, + 100, + 1690, + 1774, + 1657, + 30187, + 30211, + 30183, + 30173, + 30177, + 1635, + 1876, + 100, + 1661, + 30215, + 30174, + 30177, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "東", + "京", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "の", + "[UNK]", + "[UNK]", + "[UNK]", + "長", + "か", + "[UNK]", + "[UNK]", + "を", + "出", + "し", + "##た", + "##ら", + "##し", + "##い", + "##か", + "、", + "本", + "[UNK]", + "た", + "##ろ", + "##う", + "##か", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] 東 京 [UNK] [UNK] [UNK] [UNK] [UNK] の [UNK] [UNK] [UNK] 長 か [UNK] [UNK] を 出 したらしいか 、 本 [UNK] たろうか 。 [SEP]", + "decoded_skip_special": "東 京 の 長 か を 出 したらしいか 、 本 たろうか 。" + }, + "ko_hangul_simple": { + "input_ids": [ + 101, + 1463, + 30006, + 30021, + 29992, + 30010, + 30025, + 30005, + 30006, + 29997, + 30009, + 29999, + 30013, + 1012, + 1460, + 30006, + 30021, + 29991, + 30006, + 30024, + 29997, + 30017, + 30024, + 29992, + 30019, + 29993, + 30006, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ᄋ", + "##ᅡ", + "##ᆫ", + "##ᄂ", + "##ᅧ", + "##ᆼ", + "##ᄒ", + "##ᅡ", + "##ᄉ", + "##ᅦ", + "##ᄋ", + "##ᅭ", + ".", + "ᄇ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅡ", + "##ᆸ", + "##ᄉ", + "##ᅳ", + "##ᆸ", + "##ᄂ", + "##ᅵ", + "##ᄃ", + "##ᅡ", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] 안녕하세요. 반갑습니다. [SEP]", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 101, + 1469, + 30006, + 30021, + 29991, + 30017, + 30022, + 1464, + 30006, + 29995, + 30011, + 1460, + 30014, + 30021, + 29994, + 30019, + 1469, + 30010, + 30025, + 29997, + 30019, + 30020, + 29991, + 30012, + 1469, + 30006, + 30021, + 29991, + 30017, + 30022, + 1463, + 30017, + 30023, + 30000, + 30008, + 30022, + 1469, + 30010, + 30025, + 29997, + 30019, + 30020, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ᄒ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅳ", + "##ᆯ", + "ᄌ", + "##ᅡ", + "##ᄆ", + "##ᅩ", + "ᄇ", + "##ᅮ", + "##ᆫ", + "##ᄅ", + "##ᅵ", + "ᄒ", + "##ᅧ", + "##ᆼ", + "##ᄉ", + "##ᅵ", + "##ᆨ", + "##ᄀ", + "##ᅪ", + "ᄒ", + "##ᅡ", + "##ᆫ", + "##ᄀ", + "##ᅳ", + "##ᆯ", + "ᄋ", + "##ᅳ", + "##ᆷ", + "##ᄌ", + "##ᅥ", + "##ᆯ", + "ᄒ", + "##ᅧ", + "##ᆼ", + "##ᄉ", + "##ᅵ", + "##ᆨ", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] 한글 자모 분리 형식과 한글 음절 형식. [SEP]", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." 
+ }, + "zh_simplified": { + "input_ids": [ + 101, + 100, + 100, + 1817, + 100, + 100, + 1756, + 100, + 1869, + 100, + 1916, + 1740, + 100, + 100, + 100, + 1775, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[UNK]", + "学", + "[UNK]", + "[UNK]", + "人", + "[UNK]", + "智", + "[UNK]", + "的", + "一", + "[UNK]", + "[UNK]", + "[UNK]", + "分", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [UNK] 学 [UNK] [UNK] 人 [UNK] 智 [UNK] 的 一 [UNK] [UNK] [UNK] 分 [UNK] 。 [SEP]", + "decoded_skip_special": "学 人 智 的 一 分 。" + }, + "zh_traditional": { + "input_ids": [ + 101, + 100, + 100, + 100, + 100, + 100, + 1756, + 100, + 1869, + 100, + 1916, + 1740, + 100, + 100, + 100, + 1775, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "[UNK]", + "人", + "[UNK]", + "智", + "[UNK]", + "的", + "一", + "[UNK]", + "[UNK]", + "[UNK]", + "分", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [UNK] [UNK] [UNK] [UNK] 人 [UNK] 智 [UNK] 的 一 [UNK] [UNK] [UNK] 分 [UNK] 。 [SEP]", + "decoded_skip_special": "人 智 的 一 分 。" + }, + "zh_mixed_en": { + "input_ids": [ + 101, + 1052, + 22123, + 2953, + 2818, + 100, + 1740, + 100, + 2784, + 4083, + 100, + 100, + 1636, + 102 + ], + "tokens": [ + "[CLS]", + "p", + "##yt", + "##or", + "##ch", + "[UNK]", + "一", + "[UNK]", + "deep", + "learning", + "[UNK]", + "[UNK]", + "。", + "[SEP]" + ], + "decoded_with_special": "[CLS] pytorch [UNK] 一 [UNK] deep learning [UNK] [UNK] 。 [SEP]", + "decoded_skip_special": "pytorch 一 deep learning 。" + }, + "ar_basic": { + "input_ids": [ + 101, + 1270, + 23673, + 23673, + 29831, + 19433, + 1270, + 23673, + 29830, + 17149, + 29816, + 14498, + 19433, + 1275, + 22192, + 14498, + 23673, + 19433, + 1275, + 15394, + 25573, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ا", + "##ل", + "##ل", + "##غ", + "##ة", + "ا", + "##ل", + "##ع", + "##ر", + "##ب", + "##ي", + "##ة", + "ج", + "##م", + "##ي", + "##ل", + "##ة", + "ج", + "##د", + "##ا", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] اللغة العربية جميلة جدا. [SEP]", + "decoded_skip_special": "اللغة العربية جميلة جدا." + }, + "ar_diacritics": { + "input_ids": [ + 101, + 1271, + 29824, + 22192, + 1270, + 23673, + 23673, + 14157, + 1270, + 23673, + 17149, + 29820, + 22192, + 15915, + 1270, + 23673, + 17149, + 29820, + 14498, + 22192, + 102 + ], + "tokens": [ + "[CLS]", + "ب", + "##س", + "##م", + "ا", + "##ل", + "##ل", + "##ه", + "ا", + "##ل", + "##ر", + "##ح", + "##م", + "##ن", + "ا", + "##ل", + "##ر", + "##ح", + "##ي", + "##م", + "[SEP]" + ], + "decoded_with_special": "[CLS] بسم الله الرحمن الرحيم [SEP]", + "decoded_skip_special": "بسم الله الرحمن الرحيم" + }, + "he_basic": { + "input_ids": [ + 101, + 1266, + 29799, + 29792, + 29800, + 1259, + 29792, + 29799, + 29800, + 1012, + 1247, + 29128, + 1249, + 29810, + 29804, + 29795, + 1259, + 29789, + 29811, + 29796, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "ש", + "##ל", + "##ו", + "##ם", + "ע", + "##ו", + "##ל", + "##ם", + ".", + "ז", + "##ה", + "ט", + "##ק", + "##ס", + "##ט", + "ע", + "##ב", + "##ר", + "##י", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] שלום עולם. זה טקסט עברי. [SEP]", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." 
+ }, + "hi_devanagari": { + "input_ids": [ + 101, + 1339, + 29877, + 29863, + 29861, + 29878, + 1330, + 29876, + 29873, + 29876, + 1329, + 29875, + 29859, + 1338, + 29867, + 29861, + 29862, + 1339, + 1344, + 102 + ], + "tokens": [ + "[CLS]", + "ह", + "##ि", + "##न", + "##द", + "##ी", + "भ", + "##ा", + "##ष", + "##ा", + "ब", + "##ह", + "##त", + "स", + "##म", + "##द", + "##ध", + "ह", + "।", + "[SEP]" + ], + "decoded_with_special": "[CLS] हिनदी भाषा बहत समदध ह । [SEP]", + "decoded_skip_special": "हिनदी भाषा बहत समदध ह ।" + }, + "th_basic": { + "input_ids": [ + 101, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] [SEP]", + "decoded_skip_special": "" + }, + "emoji_bmp": { + "input_ids": [ + 101, + 3103, + 100, + 4231, + 100, + 2732, + 1620, + 2540, + 1625, + 102 + ], + "tokens": [ + "[CLS]", + "sun", + "[UNK]", + "moon", + "[UNK]", + "star", + "★", + "heart", + "♥", + "[SEP]" + ], + "decoded_with_special": "[CLS] sun [UNK] moon [UNK] star ★ heart ♥ [SEP]", + "decoded_skip_special": "sun moon star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 101, + 100, + 2000, + 1996, + 4231, + 100, + 2007, + 1037, + 100, + 1998, + 1037, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "to", + "the", + "moon", + "[UNK]", + "with", + "a", + "[UNK]", + "and", + "a", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] to the moon [UNK] with a [UNK] and a [UNK] [SEP]", + "decoded_skip_special": "to the moon with a and a" + }, + "emoji_zwj_family": { + "input_ids": [ + 101, + 2155, + 1024, + 100, + 1998, + 5210, + 1024, + 100, + 102 + ], + "tokens": [ + "[CLS]", + "family", + ":", + "[UNK]", + "and", + "flag", + ":", + "[UNK]", + "[SEP]" + ], + "decoded_with_special": "[CLS] family : [UNK] and flag : [UNK] [SEP]", + "decoded_skip_special": "family : and flag :" + }, + "emoji_skin_tone": { + "input_ids": [ + 101, + 100, + 4400, + 2007, + 3096, + 12623, + 102 + ], + "tokens": [ + "[CLS]", + "[UNK]", + "wave", + "with", + "skin", + "tones", + "[SEP]" + ], + "decoded_with_special": "[CLS] [UNK] wave with skin tones [SEP]", + "decoded_skip_special": "wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 101, + 7592, + 1745, + 100, + 1463, + 30006, + 30021, + 29992, + 30010, + 30025, + 1266, + 29799, + 29792, + 29800, + 1295, + 17149, + 29820, + 29816, + 25573, + 1327, + 29867, + 29874, + 29859, + 100, + 1655, + 30217, + 30194, + 30188, + 30198, + 102 + ], + "tokens": [ + "[CLS]", + "hello", + "世", + "[UNK]", + "ᄋ", + "##ᅡ", + "##ᆫ", + "##ᄂ", + "##ᅧ", + "##ᆼ", + "ש", + "##ל", + "##ו", + "##ם", + "م", + "##ر", + "##ح", + "##ب", + "##ا", + "न", + "##म", + "##स", + "##त", + "[UNK]", + "こ", + "##ん", + "##に", + "##ち", + "##は", + "[SEP]" + ], + "decoded_with_special": "[CLS] hello 世 [UNK] 안녕 שלום مرحبا नमसत [UNK] こんにちは [SEP]", + "decoded_skip_special": "hello 世 안녕 שלום مرحبا नमसत こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 101, + 1013, + 1013, + 1864, + 1876, + 1950, + 1704, + 30252, + 30263, + 30240, + 2292, + 14806, + 1027, + 1000, + 1655, + 30217, + 30194, + 30188, + 30198, + 1635, + 1745, + 100, + 999, + 1000, + 102 + ], + "tokens": [ + "[CLS]", + "/", + "/", + "日", + "本", + "語", + "コ", + "##メ", + "##ン", + "##ト", + "let", + "greeting", + "=", + "\"", + "こ", + "##ん", + "##に", + "##ち", + "##は", + "、", + "世", + "[UNK]", + "!", + "\"", + "[SEP]" + ], + "decoded_with_special": "[CLS] / / 日 本 語 コメント let greeting = \" こんにちは 、 世 [UNK]! \" [SEP]", + "decoded_skip_special": "/ / 日 本 語 コメント let greeting = \" こんにちは 、 世! 
\"" + }, + "ipa_basic": { + "input_ids": [ + 101, + 1996, + 24531, + 2005, + 1005, + 3869, + 1005, + 2003, + 1013, + 1042, + 29685, + 29696, + 1013, + 1998, + 2005, + 1005, + 2911, + 1005, + 2003, + 1013, + 1130, + 29685, + 2361, + 1013, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "the", + "ipa", + "for", + "'", + "fish", + "'", + "is", + "/", + "f", + "##ɪ", + "##ʃ", + "/", + "and", + "for", + "'", + "ship", + "'", + "is", + "/", + "ʃ", + "##ɪ", + "##p", + "/", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] the ipa for'fish'is / fɪʃ / and for'ship'is / ʃɪp /. [SEP]", + "decoded_skip_special": "the ipa for'fish'is / fɪʃ / and for'ship'is / ʃɪp /." + }, + "edge_combining": { + "input_ids": [ + 101, + 7668, + 1006, + 22309, + 1007, + 5443, + 7668, + 1006, + 1050, + 2546, + 2094, + 1007, + 1517, + 2168, + 2773, + 1010, + 2367, + 27507, + 1012, + 102 + ], + "tokens": [ + "[CLS]", + "cafe", + "(", + "nfc", + ")", + "vs", + "cafe", + "(", + "n", + "##f", + "##d", + ")", + "—", + "same", + "word", + ",", + "different", + "bytes", + ".", + "[SEP]" + ], + "decoded_with_special": "[CLS] cafe ( nfc ) vs cafe ( nfd ) — same word, different bytes. [SEP]", + "decoded_skip_special": "cafe ( nfc ) vs cafe ( nfd ) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 101, + 13360, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 11057, + 2050, + 22861, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 10322, + 102 + ], + "tokens": [ + "[CLS]", + "aaa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##aa", + "##a", + "bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "##bb", + "[SEP]" + ], + "decoded_with_special": "[CLS] aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb [SEP]", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json new file mode 100644 index 00000000..dc2f41f5 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/Qwen__Qwen2.5-0.5B.json @@ -0,0 +1,1230 @@ +{ + "model_id": "Qwen/Qwen2.5-0.5B", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 785, + 3974, + 13876, + 38835, + 34208, + 916, + 279, + 15678, + 5562, + 13 + ], + "tokens": [ + "The", + "Ġquick", + "Ġbrown", + "Ġfox", + "Ġjumps", + "Ġover", + "Ġthe", + "Ġlazy", + "Ġdog", + "." + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 9707, + 11, + 1879, + 0, + 1084, + 594, + 220, + 16, + 17, + 25, + 18, + 19, + 1959, + 330, + 2949, + 1, + 364, + 391, + 535, + 47883, + 6, + 320, + 41064, + 568 + ], + "tokens": [ + "Hello", + ",", + "Ġworld", + "!", + "ĠIt", + "'s", + "Ġ", + "1", + "2", + ":", + "3", + "4", + "ĠâĢĶ", + "Ġ\"", + "quote", + "\"", + "Ġ'", + "ap", + "ost", + "rophe", + "'", + "Ġ(", + "paren", + ")." + ], + "decoded_with_special": "Hello, world! 
It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 34767, + 374, + 13187, + 220, + 18, + 13, + 16, + 19, + 16, + 20, + 24, + 26, + 384, + 374, + 911, + 220, + 17, + 13, + 22, + 16, + 23, + 17, + 23, + 13 + ], + "tokens": [ + "Pi", + "Ġis", + "Ġapproximately", + "Ġ", + "3", + ".", + "1", + "4", + "1", + "5", + "9", + ";", + "Ġe", + "Ġis", + "Ġabout", + "Ġ", + "2", + ".", + "7", + "1", + "8", + "2", + "8", + "." + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 26218, + 3703, + 1110, + 71, + 35268, + 1564, + 6830, + 25967, + 323, + 8072, + 35192, + 35487, + 905, + 13 + ], + "tokens": [ + "Visit", + "Ġhttps", + "://", + "h", + "ugging", + "face", + ".co", + "/docs", + "Ġand", + "Ġmail", + "Ġbob", + "@example", + ".com", + "." + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 750, + 912, + 2877, + 25, + 526, + 11, + 293, + 25, + 526, + 8, + 1464, + 526, + 510, + 262, + 470, + 264, + 488, + 293 + ], + "tokens": [ + "def", + "Ġadd", + "(a", + ":", + "Ġint", + ",", + "Ġb", + ":", + "Ġint", + ")", + "Ġ->", + "Ġint", + ":Ċ", + "ĠĠĠ", + "Ġreturn", + "Ġa", + "Ġ+", + "Ġb" + ], + "decoded_with_special": "def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 29491, + 94504, + 57842, + 141940, + 49734, + 46784, + 75522, + 22168, + 29713, + 14682, + 17219, + 71242, + 26831 + ], + "tokens": [ + "ãģĤ", + "ãģĦãģĨ", + "ãģĪ", + "ãģĬãģĭ", + "ãģį", + "ãģı", + "ãģij", + "ãģĵ", + "ãģķ", + "ãģĹ", + "ãģĻ", + "ãģĽ", + "ãģĿ" + ], + "decoded_with_special": "あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 28195, + 124902, + 125161, + 124682, + 76021, + 99104, + 124145, + 125973, + 127264, + 35685, + 144635, + 125301, + 16161, + 66545 + ], + "tokens": [ + "ãģĮ", + "ãģİ", + "ãģIJ", + "ãģĴ", + "ãģĶãģĸ", + "ãģĺ", + "ãģļ", + "ãģľ", + "ãģŀ", + "ãģł", + "ãģ¢", + "ãģ¥", + "ãģ§", + "ãģ©" + ], + "decoded_with_special": "がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 144099, + 139813, + 143262, + 144184, + 142459, + 79705, + 69463, + 56226, + 98595, + 88054 + ], + "tokens": [ + "ãģ±", + "ãģ´", + "ãģ·", + "ãģº", + "ãģ½", + "ãĥij", + "ãĥĶ", + "ãĥĹ", + "ãĥļ", + "ãĥĿ" + ], + "decoded_with_special": "ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 101059, + 102819, + 15767, + 82699, + 101008, + 71138, + 106637, + 15322, + 133073, + 104832, + 37541, + 1773 + ], + "tokens": [ + "æĹ¥æľ¬", + "èªŀ", + "ãģ®", + "å½¢", + "æħĭ", + "ç´ł", + "è§£æŀIJ", + "ãģ¯", + "éĽ£ãģĹãģĦ", + "åķıé¡Į", + "ãģ§ãģĻ", + "ãĢĤ" + ], + "decoded_with_special": "日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 55336, + 220, + 16161, + 8979, + 37434, + 94271, + 102854, + 17219, + 1773, + 10236, + 113, + 238, + 27773, + 15322, + 1946, + 8077, + 220, + 19182, + 11211, + 96618, + 40820, + 58639, + 1773 + ], + "tokens": [ + "Swift", + "Ġ", + "ãģ§", + "ĠAuto", + "Tokenizer", + "ĠãĤĴ", + 
"試", + "ãģĻ", + "ãĢĤ", + "Ġç", + "µ", + "IJ", + "æŀľ", + "ãģ¯", + "Ġinput", + "_ids", + "Ġ", + "ãģ¨", + "Ġtokens", + "Ġãģ®", + "äºĮ", + "ãģ¤", + "ãĢĤ" + ], + "decoded_with_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 102356, + 46553, + 65278, + 100955, + 100955, + 30440, + 89977, + 15767, + 100955, + 30440, + 89977, + 100435, + 28195, + 100955, + 30440, + 29412, + 137246, + 127056, + 28195, + 5373, + 129790, + 129085, + 31049, + 1773 + ], + "tokens": [ + "æĿ±", + "京", + "çī¹", + "許", + "許", + "åı¯", + "å±Ģ", + "ãģ®", + "許", + "åı¯", + "å±Ģ", + "éķ·", + "ãģĮ", + "許", + "åı¯", + "ãĤĴ", + "åĩºãģĹãģŁ", + "ãĤīãģĹãģĦ", + "ãģĮ", + "ãĢģ", + "æľ¬å½ĵ", + "ãģłãĤįãģĨ", + "ãģĭ", + "ãĢĤ" + ], + "decoded_with_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 126246, + 144370, + 91145, + 13, + 63757, + 138685, + 38231, + 13 + ], + "tokens": [ + "ìķĪ", + "ëħķ", + "íķĺìĦ¸ìļĶ", + ".", + "Ġë°ĺ", + "ê°ij", + "ìĬµëĭĪëĭ¤", + "." + ], + "decoded_with_special": "안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 23573, + 83291, + 64577, + 129439, + 128618, + 28002, + 141965, + 76337, + 53680, + 61298, + 83291, + 16751, + 234, + 126550, + 141965, + 76337, + 13 + ], + "tokens": [ + "íķľ", + "ê¸Ģ", + "ĠìŀIJ", + "모", + "Ġë¶Ħ", + "리", + "Ġíĺķ", + "ìĭĿ", + "ê³¼", + "Ġíķľ", + "ê¸Ģ", + "ĠìĿ", + "Į", + "ìłĪ", + "Ġíĺķ", + "ìĭĿ", + "." + ], + "decoded_with_special": "한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." + }, + "zh_simplified": { + "input_ids": [ + 102182, + 100134, + 20412, + 104455, + 104111, + 99335, + 103799, + 1773 + ], + "tokens": [ + "æľºåύ", + "åŃ¦ä¹ł", + "æĺ¯", + "人工æĻºèĥ½", + "çļĦä¸Ģ个", + "éĩįè¦ģ", + "åĪĨæĶ¯", + "ãĢĤ" + ], + "decoded_with_special": "机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 100482, + 31548, + 106745, + 20412, + 102249, + 101934, + 99774, + 99542, + 99335, + 103799, + 1773 + ], + "tokens": [ + "æ©Ł", + "åύ", + "åѸç¿Ĵ", + "æĺ¯", + "人工", + "æĻºæħ§", + "çļĦä¸Ģ", + "åĢĭ", + "éĩįè¦ģ", + "åĪĨæĶ¯", + "ãĢĤ" + ], + "decoded_with_special": "機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 13828, + 51, + 21584, + 54851, + 46944, + 5538, + 6832, + 6567, + 94, + 228, + 99630, + 1773 + ], + "tokens": [ + "Py", + "T", + "orch", + "Ġæĺ¯", + "ä¸Ģ个", + "Ġdeep", + "Ġlearning", + "Ġæ", + "¡", + "Ĩ", + "æŀ¶", + "ãĢĤ" + ], + "decoded_with_special": "PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 31382, + 130353, + 25871, + 129071, + 138518, + 25871, + 127119, + 13 + ], + "tokens": [ + "اÙĦ", + "ÙĦغ", + "Ø©", + "ĠاÙĦعربÙĬØ©", + "ĠجÙħÙĬÙĦ", + "Ø©", + "Ġجدا", + "." + ], + "decoded_with_special": "اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." 
+ }, + "ar_diacritics": { + "input_ids": [ + 21360, + 52704, + 20064, + 59397, + 10176, + 52704, + 124478, + 27910, + 73771, + 16157, + 52704, + 124269, + 27910, + 73771, + 29825, + 59397, + 10176, + 27910, + 149, + 108, + 11798, + 52704, + 124269, + 27910, + 73771, + 29825, + 52704, + 124176, + 52704 + ], + "tokens": [ + "ب", + "ÙIJ", + "س", + "ÙĴ", + "Ùħ", + "ÙIJ", + "ĠاÙĦÙĦ", + "Ùİ", + "Ùij", + "Ùĩ", + "ÙIJ", + "ĠاÙĦر", + "Ùİ", + "Ùij", + "ØŃ", + "ÙĴ", + "Ùħ", + "Ùİ", + "Ù", + "°", + "ÙĨ", + "ÙIJ", + "ĠاÙĦر", + "Ùİ", + "Ùij", + "ØŃ", + "ÙIJ", + "ÙĬÙħ", + "ÙIJ" + ], + "decoded_with_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 126654, + 123855, + 124907, + 13, + 126197, + 124395, + 123792, + 125127, + 123855, + 129390, + 13 + ], + "tokens": [ + "ש׾×ķ×Ŀ", + "Ġ×¢", + "×ķ׾×Ŀ", + ".", + "Ġ×ĸ×Ķ", + "Ġ×ĺ", + "×§", + "ס×ĺ", + "Ġ×¢", + "×ijר×Ļ", + "." + ], + "decoded_with_special": "שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." + }, + "hi_devanagari": { + "input_ids": [ + 93948, + 42311, + 101, + 30484, + 99, + 43647, + 14925, + 255, + 31411, + 115, + 23868, + 14925, + 105, + 93948, + 72653, + 79238, + 68158, + 87244, + 12619, + 225, + 145256, + 30484, + 100, + 84310, + 12619, + 230, + 146031 + ], + "tokens": [ + "ह", + "िà¤", + "¨", + "à¥įà¤", + "¦", + "à¥Ģ", + "Ġà¤", + "Ń", + "ाà¤", + "·", + "ा", + "Ġà¤", + "¬", + "ह", + "à¥ģ", + "त", + "Ġस", + "म", + "à¥", + "ĥ", + "द", + "à¥įà¤", + "§", + "Ġह", + "à¥", + "Ī", + "।" + ], + "decoded_with_special": "हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 93874, + 123899, + 140235, + 124396, + 127382, + 125451, + 123885, + 83546, + 123885, + 125820, + 129674, + 86032, + 124342, + 28319, + 48120, + 124961, + 37213, + 123958, + 129778, + 123958, + 125506 + ], + "tokens": [ + "à¸ģาร", + "à¸Ľà¸£à¸°", + "มวล", + "à¸ľà¸¥", + "à¸łà¸²à¸©à¸²", + "à¹Ħà¸Ĺย", + "à¸ĭ", + "ัà¸ļ", + "à¸ĭ", + "à¹īà¸Ńà¸Ļ", + "à¹Ģà¸ŀราะ", + "à¹Ħม", + "à¹Īม", + "ี", + "à¸Ĭ", + "à¹Īà¸Ńà¸ĩ", + "ว", + "à¹Īาà¸ĩ", + "ระหว", + "à¹Īาà¸ĩ", + "à¸Ħำ" + ], + "decoded_with_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 30092, + 25125, + 222, + 17788, + 25125, + 122, + 6774, + 37234, + 4746, + 67579 + ], + "tokens": [ + "Sun", + "Ġâĺ", + "Ģ", + "Ġmoon", + "Ġâĺ", + "¾", + "Ġstar", + "Ġâĺħ", + "Ġheart", + "ĠâĻ¥" + ], + "decoded_with_special": "Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 145836, + 311, + 279, + 17788, + 11162, + 234, + 247, + 448, + 264, + 11162, + 238, + 109, + 323, + 264, + 11162, + 236, + 231 + ], + "tokens": [ + "ðŁļĢ", + "Ġto", + "Ġthe", + "Ġmoon", + "ĠðŁ", + "Į", + "Ļ", + "Ġwith", + "Ġa", + "ĠðŁ", + "IJ", + "±", + "Ġand", + "Ġa", + "ĠðŁ", + "İ", + "ī" + ], + "decoded_with_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 15192, + 25, + 61804, + 101, + 378, + 235, + 145233, + 378, + 235, + 145665, + 378, + 235, + 145988, + 323, + 5181, + 25, + 11162, + 229, + 107, + 145516, + 146035, + 145070, + 145793, + 145754 + ], + "tokens": [ + "Family", + ":", + "ĠðŁij", + "¨", + "âĢ", + "į", + "ðŁij©", + "âĢ", + "į", + "ðŁij§", + "âĢ", + "į", + 
"ðŁij¦", + "Ġand", + "Ġflag", + ":", + "ĠðŁ", + "ĩ", + "¯", + "ðŁĩµ", + "ðŁĩ°", + "ðŁĩ·", + "ðŁĩ¨", + "ðŁĩ³" + ], + "decoded_with_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 145707, + 144321, + 145707, + 145375, + 145707, + 146530, + 12060, + 448, + 6787, + 41976 + ], + "tokens": [ + "ðŁijĭ", + "ðŁı»", + "ðŁijĭ", + "ðŁı½", + "ðŁijĭ", + "ðŁı¿", + "Ġwave", + "Ġwith", + "Ġskin", + "Ġtones" + ], + "decoded_with_special": "👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 9707, + 220, + 99489, + 95170, + 144370, + 124756, + 123881, + 23364, + 126860, + 124671, + 14925, + 101, + 87244, + 78368, + 30484, + 97, + 34370, + 129328, + 37213, + 23271, + 125136, + 28319, + 220, + 89015 + ], + "tokens": [ + "Hello", + "Ġ", + "ä¸ĸçķĮ", + "ĠìķĪ", + "ëħķ", + "Ġש׾", + "×ķ×Ŀ", + "ĠÙħ", + "رØŃ", + "با", + "Ġà¤", + "¨", + "म", + "स", + "à¥įà¤", + "¤", + "à¥ĩ", + "Ġส", + "ว", + "ั", + "สà¸Ķ", + "ี", + "Ġ", + "ãģĵãĤĵãģ«ãģ¡ãģ¯" + ], + "decoded_with_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 322, + 75402, + 21894, + 102819, + 89078, + 198, + 1149, + 42113, + 284, + 330, + 89015, + 5373, + 99489, + 8958 + ], + "tokens": [ + "//", + "ĠæĹ¥", + "æľ¬", + "èªŀ", + "ãĤ³ãĥ¡ãĥ³ãĥĪ", + "Ċ", + "let", + "Ġgreeting", + "Ġ=", + "Ġ\"", + "ãģĵãĤĵãģ«ãģ¡ãģ¯", + "ãĢģ", + "ä¸ĸçķĮ", + "!\"" + ], + "decoded_with_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 785, + 55747, + 369, + 364, + 18170, + 6, + 374, + 608, + 69, + 145076, + 145388, + 14, + 323, + 369, + 364, + 5270, + 6, + 374, + 608, + 145388, + 145076, + 79, + 11930 + ], + "tokens": [ + "The", + "ĠIPA", + "Ġfor", + "Ġ'", + "fish", + "'", + "Ġis", + "Ġ/", + "f", + "ɪ", + "Êĥ", + "/", + "Ġand", + "Ġfor", + "Ġ'", + "ship", + "'", + "Ġis", + "Ġ/", + "Êĥ", + "ɪ", + "p", + "/." + ], + "decoded_with_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." + }, + "edge_combining": { + "input_ids": [ + 924, + 58858, + 320, + 45, + 6754, + 8, + 6165, + 51950, + 320, + 45, + 14596, + 8, + 1959, + 1852, + 3409, + 11, + 2155, + 5820, + 13 + ], + "tokens": [ + "ca", + "fé", + "Ġ(", + "N", + "FC", + ")", + "Ġvs", + "Ġcafé", + "Ġ(", + "N", + "FD", + ")", + "ĠâĢĶ", + "Ġsame", + "Ġword", + ",", + "Ġdifferent", + "Ġbytes", + "." + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." 
+ }, + "edge_long_repetition": { + "input_ids": [ + 69440, + 69440, + 69440, + 69440, + 293, + 87609, + 87609, + 87609, + 87609, + 87609, + 87609, + 87609, + 53151 + ], + "tokens": [ + "aaaaaaaa", + "aaaaaaaa", + "aaaaaaaa", + "aaaaaaaa", + "Ġb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbbb", + "bbb" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json new file mode 100644 index 00000000..5212ab36 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/TinyLlama__TinyLlama-1.1B-Chat-v1.0.json @@ -0,0 +1,1946 @@ +{ + "model_id": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 1, + 450, + 4996, + 17354, + 1701, + 29916, + 432, + 17204, + 975, + 278, + 17366, + 11203, + 29889 + ], + "tokens": [ + "", + "▁The", + "▁quick", + "▁brown", + "▁fo", + "x", + "▁j", + "umps", + "▁over", + "▁the", + "▁lazy", + "▁dog", + "." + ], + "decoded_with_special": " The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 1, + 15043, + 29892, + 3186, + 29991, + 739, + 29915, + 29879, + 29871, + 29896, + 29906, + 29901, + 29941, + 29946, + 813, + 376, + 1396, + 29908, + 525, + 481, + 520, + 1336, + 354, + 29915, + 313, + 862, + 264, + 467 + ], + "tokens": [ + "", + "▁Hello", + ",", + "▁world", + "!", + "▁It", + "'", + "s", + "▁", + "1", + "2", + ":", + "3", + "4", + "▁—", + "▁\"", + "quote", + "\"", + "▁'", + "ap", + "ost", + "rop", + "he", + "'", + "▁(", + "par", + "en", + ")." + ], + "decoded_with_special": " Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 1, + 7362, + 338, + 14235, + 29871, + 29941, + 29889, + 29896, + 29946, + 29896, + 29945, + 29929, + 29936, + 321, + 338, + 1048, + 29871, + 29906, + 29889, + 29955, + 29896, + 29947, + 29906, + 29947, + 29889 + ], + "tokens": [ + "", + "▁Pi", + "▁is", + "▁approximately", + "▁", + "3", + ".", + "1", + "4", + "1", + "5", + "9", + ";", + "▁e", + "▁is", + "▁about", + "▁", + "2", + ".", + "7", + "1", + "8", + "2", + "8", + "." + ], + "decoded_with_special": " Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 1, + 5741, + 277, + 2045, + 597, + 29882, + 688, + 3460, + 2161, + 29889, + 1111, + 29914, + 2640, + 322, + 10524, + 289, + 711, + 29992, + 4773, + 29889, + 510, + 29889 + ], + "tokens": [ + "", + "▁Vis", + "it", + "▁https", + "://", + "h", + "ug", + "ging", + "face", + ".", + "co", + "/", + "docs", + "▁and", + "▁mail", + "▁b", + "ob", + "@", + "example", + ".", + "com", + "." + ], + "decoded_with_special": " Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." 
+ }, + "ascii_code": { + "input_ids": [ + 1, + 822, + 788, + 29898, + 29874, + 29901, + 938, + 29892, + 289, + 29901, + 938, + 29897, + 1599, + 938, + 29901, + 13, + 1678, + 736, + 263, + 718, + 289 + ], + "tokens": [ + "", + "▁def", + "▁add", + "(", + "a", + ":", + "▁int", + ",", + "▁b", + ":", + "▁int", + ")", + "▁->", + "▁int", + ":", + "<0x0A>", + "▁▁▁", + "▁return", + "▁a", + "▁+", + "▁b" + ], + "decoded_with_special": " def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 1, + 29871, + 30641, + 30298, + 30465, + 30914, + 30697, + 30412, + 30538, + 30568, + 30807, + 30589, + 30566, + 30326, + 30427, + 31095, + 31110 + ], + "tokens": [ + "", + "▁", + "あ", + "い", + "う", + "え", + "お", + "か", + "き", + "く", + "け", + "こ", + "さ", + "し", + "す", + "せ", + "そ" + ], + "decoded_with_special": " あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 1, + 29871, + 30458, + 230, + 132, + 145, + 31907, + 31991, + 31622, + 230, + 132, + 153, + 31115, + 31761, + 230, + 132, + 159, + 230, + 132, + 161, + 30955, + 230, + 132, + 165, + 230, + 132, + 168, + 30499, + 31250 + ], + "tokens": [ + "", + "▁", + "が", + "<0xE3>", + "<0x81>", + "<0x8E>", + "ぐ", + "げ", + "ご", + "<0xE3>", + "<0x81>", + "<0x96>", + "じ", + "ず", + "<0xE3>", + "<0x81>", + "<0x9C>", + "<0xE3>", + "<0x81>", + "<0x9E>", + "だ", + "<0xE3>", + "<0x81>", + "<0xA2>", + "<0xE3>", + "<0x81>", + "<0xA5>", + "で", + "ど" + ], + "decoded_with_special": " がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 1, + 29871, + 230, + 132, + 180, + 230, + 132, + 183, + 230, + 132, + 186, + 230, + 132, + 189, + 230, + 132, + 192, + 30715, + 31172, + 30605, + 31501, + 31205 + ], + "tokens": [ + "", + "▁", + "<0xE3>", + "<0x81>", + "<0xB1>", + "<0xE3>", + "<0x81>", + "<0xB4>", + "<0xE3>", + "<0x81>", + "<0xB7>", + "<0xE3>", + "<0x81>", + "<0xBA>", + "<0xE3>", + "<0x81>", + "<0xBD>", + "パ", + "ピ", + "プ", + "ペ", + "ポ" + ], + "decoded_with_special": " ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 1, + 29871, + 30325, + 30346, + 30968, + 30199, + 31305, + 233, + 136, + 142, + 31605, + 31201, + 233, + 161, + 147, + 30449, + 236, + 158, + 166, + 30326, + 30298, + 232, + 152, + 146, + 236, + 164, + 143, + 30499, + 30427, + 30267 + ], + "tokens": [ + "", + "▁", + "日", + "本", + "語", + "の", + "形", + "<0xE6>", + "<0x85>", + "<0x8B>", + "素", + "解", + "<0xE6>", + "<0x9E>", + "<0x90>", + "は", + "<0xE9>", + "<0x9B>", + "<0xA3>", + "し", + "い", + "<0xE5>", + "<0x95>", + "<0x8F>", + "<0xE9>", + "<0xA1>", + "<0x8C>", + "で", + "す", + "。" + ], + "decoded_with_special": " 日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 1, + 14156, + 29871, + 30499, + 11133, + 6066, + 3950, + 29871, + 30396, + 235, + 172, + 169, + 30427, + 30267, + 29871, + 31711, + 30801, + 30449, + 1881, + 29918, + 4841, + 29871, + 30364, + 18897, + 29871, + 30199, + 30685, + 30773, + 30267 + ], + "tokens": [ + "", + "▁Swift", + "▁", + "で", + "▁Auto", + "Token", + "izer", + "▁", + "を", + "<0xE8>", + "<0xA9>", + "<0xA6>", + "す", + "。", + "▁", + "結", + "果", + "は", + "▁input", + "_", + "ids", + "▁", + "と", + "▁tokens", + "▁", + "の", + "二", + "つ", + "。" + ], + "decoded_with_special": " Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer 
を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 1, + 29871, + 30591, + 30675, + 31141, + 235, + 171, + 180, + 235, + 171, + 180, + 30682, + 31655, + 30199, + 235, + 171, + 180, + 30682, + 31655, + 30899, + 30458, + 235, + 171, + 180, + 30682, + 30396, + 30544, + 30326, + 30366, + 30513, + 30326, + 30298, + 30458, + 30330, + 30346, + 30948, + 30955, + 31206, + 30465, + 30412, + 30267 + ], + "tokens": [ + "", + "▁", + "東", + "京", + "特", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "局", + "の", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "局", + "長", + "が", + "<0xE8>", + "<0xA8>", + "<0xB1>", + "可", + "を", + "出", + "し", + "た", + "ら", + "し", + "い", + "が", + "、", + "本", + "当", + "だ", + "ろ", + "う", + "か", + "。" + ], + "decoded_with_special": " 東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 1, + 29871, + 31734, + 238, + 136, + 152, + 30944, + 31578, + 31527, + 29889, + 29871, + 238, + 179, + 155, + 237, + 179, + 148, + 239, + 141, + 184, + 31063, + 30709, + 29889 + ], + "tokens": [ + "", + "▁", + "안", + "<0xEB>", + "<0x85>", + "<0x95>", + "하", + "세", + "요", + ".", + "▁", + "<0xEB>", + "<0xB0>", + "<0x98>", + "<0xEA>", + "<0xB0>", + "<0x91>", + "<0xEC>", + "<0x8A>", + "<0xB5>", + "니", + "다", + "." + ], + "decoded_with_special": " 안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." + }, + "ko_hangul_jamo": { + "input_ids": [ + 1, + 29871, + 228, + 135, + 149, + 228, + 136, + 164, + 228, + 137, + 174, + 237, + 187, + 131, + 29871, + 31013, + 31962, + 29871, + 238, + 185, + 135, + 30826, + 29871, + 240, + 155, + 152, + 31895, + 31906, + 29871, + 30877, + 237, + 187, + 131, + 29871, + 31966, + 239, + 163, + 139, + 29871, + 240, + 155, + 152, + 31895, + 29889 + ], + "tokens": [ + "", + "▁", + "<0xE1>", + "<0x84>", + "<0x92>", + "<0xE1>", + "<0x85>", + "<0xA1>", + "<0xE1>", + "<0x86>", + "<0xAB>", + "<0xEA>", + "<0xB8>", + "<0x80>", + "▁", + "자", + "모", + "▁", + "<0xEB>", + "<0xB6>", + "<0x84>", + "리", + "▁", + "<0xED>", + "<0x98>", + "<0x95>", + "식", + "과", + "▁", + "한", + "<0xEA>", + "<0xB8>", + "<0x80>", + "▁", + "음", + "<0xEC>", + "<0xA0>", + "<0x88>", + "▁", + "<0xED>", + "<0x98>", + "<0x95>", + "식", + "." + ], + "decoded_with_special": " 한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." 
+ }, + "zh_simplified": { + "input_ids": [ + 1, + 29871, + 31429, + 30943, + 30415, + 231, + 188, + 163, + 30392, + 30313, + 31041, + 31676, + 30815, + 30210, + 30287, + 30502, + 30908, + 30698, + 30748, + 31541, + 30267 + ], + "tokens": [ + "", + "▁", + "机", + "器", + "学", + "<0xE4>", + "<0xB9>", + "<0xA0>", + "是", + "人", + "工", + "智", + "能", + "的", + "一", + "个", + "重", + "要", + "分", + "支", + "。" + ], + "decoded_with_special": " 机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 1, + 29871, + 31540, + 30943, + 31274, + 234, + 194, + 149, + 30392, + 30313, + 31041, + 31676, + 233, + 136, + 170, + 30210, + 30287, + 232, + 131, + 142, + 30908, + 30698, + 30748, + 31541, + 30267 + ], + "tokens": [ + "", + "▁", + "機", + "器", + "學", + "<0xE7>", + "<0xBF>", + "<0x92>", + "是", + "人", + "工", + "智", + "<0xE6>", + "<0x85>", + "<0xA7>", + "的", + "一", + "<0xE5>", + "<0x80>", + "<0x8B>", + "重", + "要", + "分", + "支", + "。" + ], + "decoded_with_special": " 機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 1, + 10772, + 29911, + 25350, + 29871, + 30392, + 30287, + 30502, + 6483, + 6509, + 29871, + 233, + 164, + 137, + 233, + 161, + 185, + 30267 + ], + "tokens": [ + "", + "▁Py", + "T", + "orch", + "▁", + "是", + "一", + "个", + "▁deep", + "▁learning", + "▁", + "<0xE6>", + "<0xA1>", + "<0x86>", + "<0xE6>", + "<0x9E>", + "<0xB6>", + "。" + ], + "decoded_with_special": " PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 1, + 24508, + 30138, + 30611, + 30242, + 24508, + 30218, + 30156, + 30177, + 30163, + 30242, + 29871, + 30270, + 30159, + 30163, + 30138, + 30242, + 29871, + 30270, + 30172, + 30112, + 29889 + ], + "tokens": [ + "", + "▁ال", + "ل", + "غ", + "ة", + "▁ال", + "ع", + "ر", + "ب", + "ي", + "ة", + "▁", + "ج", + "م", + "ي", + "ل", + "ة", + "▁", + "ج", + "د", + "ا", + "." + ], + "decoded_with_special": " اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." + }, + "ar_diacritics": { + "input_ids": [ + 1, + 29871, + 30177, + 30567, + 30198, + 30741, + 30159, + 30567, + 24508, + 30138, + 30323, + 30857, + 30204, + 30567, + 24508, + 30156, + 30323, + 30857, + 30240, + 30741, + 30159, + 30323, + 220, + 179, + 30162, + 30567, + 24508, + 30156, + 30323, + 30857, + 30240, + 30567, + 30163, + 30159, + 30567 + ], + "tokens": [ + "", + "▁", + "ب", + "ِ", + "س", + "ْ", + "م", + "ِ", + "▁ال", + "ل", + "َ", + "ّ", + "ه", + "ِ", + "▁ال", + "ر", + "َ", + "ّ", + "ح", + "ْ", + "م", + "َ", + "<0xD9>", + "<0xB0>", + "ن", + "ِ", + "▁ال", + "ر", + "َ", + "ّ", + "ح", + "ِ", + "ي", + "م", + "ِ" + ], + "decoded_with_special": " بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 1, + 29871, + 30294, + 30249, + 30205, + 30404, + 29871, + 30324, + 30205, + 30249, + 30404, + 29889, + 29871, + 30776, + 30235, + 29871, + 30639, + 30433, + 30504, + 30639, + 29871, + 30324, + 30276, + 30236, + 30196, + 29889 + ], + "tokens": [ + "", + "▁", + "ש", + "ל", + "ו", + "ם", + "▁", + "ע", + "ו", + "ל", + "ם", + ".", + "▁", + "ז", + "ה", + "▁", + "ט", + "ק", + "ס", + "ט", + "▁", + "ע", + "ב", + "ר", + "י", + "." + ], + "decoded_with_special": " שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." 
+ }, + "hi_devanagari": { + "input_ids": [ + 1, + 29871, + 30714, + 30436, + 30424, + 30296, + 30694, + 30580, + 29871, + 31380, + 30269, + 31330, + 30269, + 29871, + 31012, + 30714, + 30702, + 30475, + 29871, + 30489, + 30485, + 227, + 168, + 134, + 30694, + 30296, + 31437, + 29871, + 30714, + 31678, + 31776 + ], + "tokens": [ + "", + "▁", + "ह", + "ि", + "न", + "्", + "द", + "ी", + "▁", + "भ", + "ा", + "ष", + "ा", + "▁", + "ब", + "ह", + "ु", + "त", + "▁", + "स", + "म", + "<0xE0>", + "<0xA5>", + "<0x83>", + "द", + "्", + "ध", + "▁", + "ह", + "ै", + "।" + ], + "decoded_with_special": " हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 1, + 29871, + 30425, + 30289, + 30297, + 31010, + 30297, + 30823, + 30501, + 30492, + 30496, + 227, + 187, + 159, + 30496, + 31070, + 30289, + 31964, + 30289, + 31252, + 30595, + 30549, + 227, + 187, + 142, + 30510, + 30526, + 227, + 187, + 142, + 30652, + 30351, + 30348, + 30401, + 30727, + 30297, + 30289, + 30823, + 31252, + 30501, + 30543, + 30501, + 30691, + 30913, + 30543, + 30351, + 30398, + 30492, + 30543, + 30289, + 30398, + 30297, + 30823, + 30663, + 30492, + 30543, + 30289, + 30398, + 30759, + 30747 + ], + "tokens": [ + "", + "▁", + "ก", + "า", + "ร", + "ป", + "ร", + "ะ", + "ม", + "ว", + "ล", + "<0xE0>", + "<0xB8>", + "<0x9C>", + "ล", + "ภ", + "า", + "ษ", + "า", + "ไ", + "ท", + "ย", + "<0xE0>", + "<0xB8>", + "<0x8B>", + "ั", + "บ", + "<0xE0>", + "<0xB8>", + "<0x8B>", + "้", + "อ", + "น", + "เ", + "พ", + "ร", + "า", + "ะ", + "ไ", + "ม", + "่", + "ม", + "ี", + "ช", + "่", + "อ", + "ง", + "ว", + "่", + "า", + "ง", + "ร", + "ะ", + "ห", + "ว", + "่", + "า", + "ง", + "ค", + "ำ" + ], + "decoded_with_special": " การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 1, + 8991, + 29871, + 229, + 155, + 131, + 18786, + 29871, + 229, + 155, + 193, + 5810, + 29871, + 30950, + 5192, + 29871, + 30922 + ], + "tokens": [ + "", + "▁Sun", + "▁", + "<0xE2>", + "<0x98>", + "<0x80>", + "▁moon", + "▁", + "<0xE2>", + "<0x98>", + "<0xBE>", + "▁star", + "▁", + "★", + "▁heart", + "▁", + "♥" + ], + "decoded_with_special": " Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 1, + 29871, + 243, + 162, + 157, + 131, + 304, + 278, + 18786, + 29871, + 243, + 162, + 143, + 156, + 411, + 263, + 29871, + 243, + 162, + 147, + 180, + 322, + 263, + 29871, + 243, + 162, + 145, + 140 + ], + "tokens": [ + "", + "▁", + "<0xF0>", + "<0x9F>", + "<0x9A>", + "<0x80>", + "▁to", + "▁the", + "▁moon", + "▁", + "<0xF0>", + "<0x9F>", + "<0x8C>", + "<0x99>", + "▁with", + "▁a", + "▁", + "<0xF0>", + "<0x9F>", + "<0x90>", + "<0xB1>", + "▁and", + "▁a", + "▁", + "<0xF0>", + "<0x9F>", + "<0x8E>", + "<0x89>" + ], + "decoded_with_special": " 🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 1, + 14662, + 29901, + 29871, + 243, + 162, + 148, + 171, + 30722, + 243, + 162, + 148, + 172, + 30722, + 243, + 162, + 148, + 170, + 30722, + 243, + 162, + 148, + 169, + 322, + 7353, + 29901, + 29871, + 243, + 162, + 138, + 178, + 243, + 162, + 138, + 184, + 243, + 162, + 138, + 179, + 243, + 162, + 138, + 186, + 243, + 162, + 138, + 171, + 243, + 162, + 138, + 182 + ], + "tokens": [ + "", + "▁Family", + ":", + "▁", + "<0xF0>", + "<0x9F>", + "<0x91>", + 
"<0xA8>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA9>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA7>", + "‍", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0xA6>", + "▁and", + "▁flag", + ":", + "▁", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xAF>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB5>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB0>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB7>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xA8>", + "<0xF0>", + "<0x9F>", + "<0x87>", + "<0xB3>" + ], + "decoded_with_special": " Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 1, + 29871, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 190, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 192, + 243, + 162, + 148, + 142, + 243, + 162, + 146, + 194, + 10742, + 411, + 19309, + 260, + 2873 + ], + "tokens": [ + "", + "▁", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBB>", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBD>", + "<0xF0>", + "<0x9F>", + "<0x91>", + "<0x8B>", + "<0xF0>", + "<0x9F>", + "<0x8F>", + "<0xBF>", + "▁wave", + "▁with", + "▁skin", + "▁t", + "ones" + ], + "decoded_with_special": " 👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 1, + 15043, + 29871, + 30793, + 30967, + 29871, + 31734, + 238, + 136, + 152, + 29871, + 30294, + 30249, + 30205, + 30404, + 29871, + 30159, + 30156, + 30240, + 30177, + 30112, + 29871, + 30424, + 30485, + 30489, + 30296, + 30475, + 30569, + 29871, + 30547, + 30492, + 30510, + 30547, + 30718, + 30691, + 29871, + 30589, + 30389, + 30353, + 30644, + 30449 + ], + "tokens": [ + "", + "▁Hello", + "▁", + "世", + "界", + "▁", + "안", + "<0xEB>", + "<0x85>", + "<0x95>", + "▁", + "ש", + "ל", + "ו", + "ם", + "▁", + "م", + "ر", + "ح", + "ب", + "ا", + "▁", + "न", + "म", + "स", + "्", + "त", + "े", + "▁", + "ส", + "ว", + "ั", + "ส", + "ด", + "ี", + "▁", + "こ", + "ん", + "に", + "ち", + "は" + ], + "decoded_with_special": " Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 1, + 849, + 29871, + 30325, + 30346, + 30968, + 30459, + 30604, + 30203, + 30279, + 13, + 1026, + 1395, + 15133, + 353, + 376, + 30589, + 30389, + 30353, + 30644, + 30449, + 30330, + 30793, + 30967, + 3850 + ], + "tokens": [ + "", + "▁//", + "▁", + "日", + "本", + "語", + "コ", + "メ", + "ン", + "ト", + "<0x0A>", + "let", + "▁gre", + "eting", + "▁=", + "▁\"", + "こ", + "ん", + "に", + "ち", + "は", + "、", + "世", + "界", + "!\"" + ], + "decoded_with_special": " // 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 1, + 450, + 5641, + 29909, + 363, + 525, + 15161, + 29915, + 338, + 847, + 29888, + 30312, + 30376, + 29914, + 322, + 363, + 525, + 3527, + 29915, + 338, + 847, + 30376, + 30312, + 29886, + 6294 + ], + "tokens": [ + "", + "▁The", + "▁IP", + "A", + "▁for", + "▁'", + "fish", + "'", + "▁is", + "▁/", + "f", + "ɪ", + "ʃ", + "/", + "▁and", + "▁for", + "▁'", + "ship", + "'", + "▁is", + "▁/", + "ʃ", + "ɪ", + "p", + "/." + ], + "decoded_with_special": " The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." 
+ }, + "edge_combining": { + "input_ids": [ + 1, + 274, + 28059, + 313, + 29940, + 8610, + 29897, + 7186, + 5777, + 1725, + 30103, + 313, + 22498, + 29928, + 29897, + 813, + 1021, + 1734, + 29892, + 1422, + 6262, + 29889 + ], + "tokens": [ + "", + "▁c", + "afé", + "▁(", + "N", + "FC", + ")", + "▁vs", + "▁ca", + "fe", + "́", + "▁(", + "NF", + "D", + ")", + "▁—", + "▁same", + "▁word", + ",", + "▁different", + "▁bytes", + "." + ], + "decoded_with_special": " café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 1, + 263, + 27137, + 27137, + 27137, + 27137, + 27137, + 27137, + 27137, + 7340, + 29874, + 289, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 1327, + 29890 + ], + "tokens": [ + "", + "▁a", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aa", + "a", + "▁b", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "b" + ], + "decoded_with_special": " aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json new file mode 100644 index 00000000..e063cb8d --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/google-t5__t5-small.json @@ -0,0 +1,1108 @@ +{ + "model_id": "google-t5/t5-small", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 37, + 1704, + 4216, + 3, + 20400, + 4418, + 7, + 147, + 8, + 19743, + 1782, + 5, + 1 + ], + "tokens": [ + "▁The", + "▁quick", + "▁brown", + "▁", + "fox", + "▁jump", + "s", + "▁over", + "▁the", + "▁lazy", + "▁dog", + ".", + "" + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." + }, + "ascii_punct": { + "input_ids": [ + 8774, + 6, + 296, + 55, + 94, + 31, + 7, + 586, + 10, + 3710, + 3, + 318, + 96, + 8270, + 15, + 121, + 3, + 31, + 2521, + 14618, + 15, + 31, + 41, + 1893, + 35, + 137, + 1 + ], + "tokens": [ + "▁Hello", + ",", + "▁world", + "!", + "▁It", + "'", + "s", + "▁12", + ":", + "34", + "▁", + "—", + "▁\"", + "quot", + "e", + "\"", + "▁", + "'", + "apos", + "troph", + "e", + "'", + "▁(", + "par", + "en", + ").", + "" + ], + "decoded_with_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 2745, + 19, + 3241, + 1877, + 2534, + 27904, + 117, + 3, + 15, + 19, + 81, + 3, + 21280, + 2606, + 2577, + 5, + 1 + ], + "tokens": [ + "▁Pi", + "▁is", + "▁approximately", + "▁3.", + "14", + "159", + ";", + "▁", + "e", + "▁is", + "▁about", + "▁", + "2.7", + "18", + "28", + ".", + "" + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." 
+ }, + "ascii_url": { + "input_ids": [ + 4957, + 4893, + 1303, + 107, + 13917, + 53, + 4861, + 5, + 509, + 87, + 7171, + 7, + 11, + 4842, + 3, + 17396, + 1741, + 994, + 9, + 9208, + 5, + 287, + 5, + 1 + ], + "tokens": [ + "▁Visit", + "▁https", + "://", + "h", + "ugg", + "ing", + "face", + ".", + "co", + "/", + "doc", + "s", + "▁and", + "▁mail", + "▁", + "bob", + "@", + "ex", + "a", + "mple", + ".", + "com", + ".", + "" + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 20, + 89, + 617, + 599, + 9, + 10, + 16, + 17, + 6, + 3, + 115, + 10, + 16, + 17, + 61, + 3, + 13114, + 16, + 17, + 10, + 1205, + 3, + 9, + 1768, + 3, + 115, + 1 + ], + "tokens": [ + "▁de", + "f", + "▁add", + "(", + "a", + ":", + "▁in", + "t", + ",", + "▁", + "b", + ":", + "▁in", + "t", + ")", + "▁", + "->", + "▁in", + "t", + ":", + "▁return", + "▁", + "a", + "▁+", + "▁", + "b", + "" + ], + "decoded_with_special": "def add(a: int, b: int) -> int: return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int: return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_dakuten": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_handakuten": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_kanji_mixed": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ja_romaji_mixed": { + "input_ids": [ + 20477, + 3, + 2, + 2040, + 3696, + 2217, + 8585, + 3, + 2, + 3, + 2, + 3785, + 834, + 23, + 26, + 7, + 3, + 2, + 14145, + 7, + 3, + 2, + 1 + ], + "tokens": [ + "▁Swift", + "▁", + "", + "▁Auto", + "To", + "ken", + "izer", + "▁", + "", + "▁", + "", + "▁input", + "_", + "i", + "d", + "s", + "▁", + "", + "▁token", + "s", + "▁", + "", + "" + ], + "decoded_with_special": "Swift AutoTokenizer input_ids tokens ", + "decoded_skip_special": "Swift AutoTokenizer input_ids tokens " + }, + "ja_long_sentence": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "ko_hangul_simple": { + "input_ids": [ + 3, + 2, + 5, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + ".", + "▁", + "", + ".", + "" + ], + "decoded_with_special": ". .", + "decoded_skip_special": ".." + }, + "ko_hangul_jamo": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " .", + "decoded_skip_special": " ." 
+ }, + "zh_simplified": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "zh_traditional": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "zh_mixed_en": { + "input_ids": [ + 12901, + 382, + 127, + 524, + 3, + 2, + 1659, + 1036, + 3, + 2, + 1 + ], + "tokens": [ + "▁Py", + "T", + "or", + "ch", + "▁", + "", + "▁deep", + "▁learning", + "▁", + "", + "" + ], + "decoded_with_special": "PyTorch deep learning ", + "decoded_skip_special": "PyTorch deep learning " + }, + "ar_basic": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " .", + "decoded_skip_special": " ." + }, + "ar_diacritics": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": " ", + "decoded_skip_special": " " + }, + "he_basic": { + "input_ids": [ + 3, + 2, + 3, + 2, + 5, + 3, + 2, + 3, + 2, + 3, + 2, + 5, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + ".", + "▁", + "", + "▁", + "", + "▁", + "", + ".", + "" + ], + "decoded_with_special": " . .", + "decoded_skip_special": ". ." + }, + "hi_devanagari": { + "input_ids": [ + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": " ", + "decoded_skip_special": " " + }, + "th_basic": { + "input_ids": [ + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "" + ], + "decoded_with_special": "", + "decoded_skip_special": "" + }, + "emoji_bmp": { + "input_ids": [ + 3068, + 3, + 2, + 8114, + 3, + 2, + 2213, + 3, + 2, + 842, + 3, + 2, + 1 + ], + "tokens": [ + "▁Sun", + "▁", + "", + "▁moon", + "▁", + "", + "▁star", + "▁", + "", + "▁heart", + "▁", + "", + "" + ], + "decoded_with_special": "Sun moon star heart ", + "decoded_skip_special": "Sun moon star heart " + }, + "emoji_astral": { + "input_ids": [ + 3, + 2, + 12, + 8, + 8114, + 3, + 2, + 28, + 3, + 9, + 3, + 2, + 11, + 3, + 9, + 3, + 2, + 1 + ], + "tokens": [ + "▁", + "", + "▁to", + "▁the", + "▁moon", + "▁", + "", + "▁with", + "▁", + "a", + "▁", + "", + "▁and", + "▁", + "a", + "▁", + "", + "" + ], + "decoded_with_special": " to the moon with a and a ", + "decoded_skip_special": " to the moon with a and a " + }, + "emoji_zwj_family": { + "input_ids": [ + 3712, + 10, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 11, + 5692, + 10, + 3, + 2, + 1 + ], + "tokens": [ + "▁Family", + ":", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁and", + "▁flag", + ":", + "▁", + "", + "" + ], + "decoded_with_special": "Family: and flag: ", + "decoded_skip_special": "Family: and flag: " + }, + "emoji_skin_tone": { + "input_ids": [ + 3, + 2, + 6772, + 28, + 1133, + 12, + 1496, + 1 + ], + "tokens": [ + "▁", + "", + "▁wave", + "▁with", + "▁skin", + "▁to", + "nes", + "" + ], + "decoded_with_special": " wave with skin tones", + "decoded_skip_special": " wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 8774, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 3, + 2, + 1 + ], + "tokens": [ + "▁Hello", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "▁", + "", + "" + ], + "decoded_with_special": "Hello ", + "decoded_skip_special": "Hello " + }, + "mixed_code_jp": { + 
"input_ids": [ + 13751, + 3, + 2, + 752, + 18660, + 3274, + 96, + 2, + 4720, + 1 + ], + "tokens": [ + "▁//", + "▁", + "", + "▁let", + "▁greeting", + "▁=", + "▁\"", + "", + "!\"", + "" + ], + "decoded_with_special": "// let greeting = \"!\"", + "decoded_skip_special": "// let greeting = \"!\"" + }, + "ipa_basic": { + "input_ids": [ + 37, + 3, + 25981, + 21, + 3, + 31, + 6779, + 31, + 19, + 3, + 87, + 89, + 2, + 87, + 11, + 21, + 3, + 31, + 2009, + 31, + 19, + 3, + 87, + 2, + 102, + 87, + 5, + 1 + ], + "tokens": [ + "▁The", + "▁", + "IPA", + "▁for", + "▁", + "'", + "fish", + "'", + "▁is", + "▁", + "/", + "f", + "", + "/", + "▁and", + "▁for", + "▁", + "'", + "ship", + "'", + "▁is", + "▁", + "/", + "", + "p", + "/", + ".", + "" + ], + "decoded_with_special": "The IPA for 'fish' is /f/ and for'ship' is /p/.", + "decoded_skip_special": "The IPA for 'fish' is /f/ and for'ship' is /p/." + }, + "edge_combining": { + "input_ids": [ + 11949, + 41, + 567, + 5390, + 61, + 3, + 208, + 7, + 11949, + 41, + 12619, + 308, + 61, + 3, + 318, + 337, + 1448, + 6, + 315, + 57, + 1422, + 5, + 1 + ], + "tokens": [ + "▁café", + "▁(", + "N", + "FC", + ")", + "▁", + "v", + "s", + "▁café", + "▁(", + "NF", + "D", + ")", + "▁", + "—", + "▁same", + "▁word", + ",", + "▁different", + "▁by", + "tes", + ".", + "" + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 3, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 9, + 3, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 115, + 1 + ], + "tokens": [ + "▁", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "a", + "▁", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "b", + "" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json new file mode 100644 index 00000000..3fc1760e --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/baselines/openai-community__gpt2.json @@ -0,0 +1,2096 @@ +{ + "model_id": "openai-community/gpt2", + "transformers_version": "4.57.1", + "entries": { + "ascii_simple": { + "input_ids": [ + 464, + 2068, + 7586, + 21831, + 18045, + 625, + 262, + 16931, + 3290, + 13 + ], + "tokens": [ + "The", + "Ġquick", + "Ġbrown", + "Ġfox", + "Ġjumps", + "Ġover", + "Ġthe", + "Ġlazy", + "Ġdog", + "." + ], + "decoded_with_special": "The quick brown fox jumps over the lazy dog.", + "decoded_skip_special": "The quick brown fox jumps over the lazy dog." 
+ }, + "ascii_punct": { + "input_ids": [ + 15496, + 11, + 995, + 0, + 632, + 338, + 1105, + 25, + 2682, + 851, + 366, + 22708, + 1, + 705, + 499, + 455, + 22599, + 6, + 357, + 11730, + 737 + ], + "tokens": [ + "Hello", + ",", + "Ġworld", + "!", + "ĠIt", + "'s", + "Ġ12", + ":", + "34", + "ĠâĢĶ", + "Ġ\"", + "quote", + "\"", + "Ġ'", + "ap", + "ost", + "rophe", + "'", + "Ġ(", + "paren", + ")." + ], + "decoded_with_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren).", + "decoded_skip_special": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." + }, + "ascii_numbers": { + "input_ids": [ + 38729, + 318, + 6702, + 513, + 13, + 1415, + 19707, + 26, + 304, + 318, + 546, + 362, + 13, + 45720, + 2078, + 13 + ], + "tokens": [ + "Pi", + "Ġis", + "Ġapproximately", + "Ġ3", + ".", + "14", + "159", + ";", + "Ġe", + "Ġis", + "Ġabout", + "Ġ2", + ".", + "718", + "28", + "." + ], + "decoded_with_special": "Pi is approximately 3.14159; e is about 2.71828.", + "decoded_skip_special": "Pi is approximately 3.14159; e is about 2.71828." + }, + "ascii_url": { + "input_ids": [ + 31141, + 3740, + 1378, + 71, + 1018, + 2667, + 2550, + 13, + 1073, + 14, + 31628, + 290, + 6920, + 29202, + 31, + 20688, + 13, + 785, + 13 + ], + "tokens": [ + "Visit", + "Ġhttps", + "://", + "h", + "ug", + "ging", + "face", + ".", + "co", + "/", + "docs", + "Ġand", + "Ġmail", + "Ġbob", + "@", + "example", + ".", + "com", + "." + ], + "decoded_with_special": "Visit https://huggingface.co/docs and mail bob@example.com.", + "decoded_skip_special": "Visit https://huggingface.co/docs and mail bob@example.com." + }, + "ascii_code": { + "input_ids": [ + 4299, + 751, + 7, + 64, + 25, + 493, + 11, + 275, + 25, + 493, + 8, + 4613, + 493, + 25, + 198, + 220, + 220, + 220, + 1441, + 257, + 1343, + 275 + ], + "tokens": [ + "def", + "Ġadd", + "(", + "a", + ":", + "Ġint", + ",", + "Ġb", + ":", + "Ġint", + ")", + "Ġ->", + "Ġint", + ":", + "Ċ", + "Ġ", + "Ġ", + "Ġ", + "Ġreturn", + "Ġa", + "Ġ+", + "Ġb" + ], + "decoded_with_special": "def add(a: int, b: int) -> int:\n return a + b", + "decoded_skip_special": "def add(a: int, b: int) -> int:\n return a + b" + }, + "ja_kana_basic": { + "input_ids": [ + 40948, + 18566, + 29557, + 2515, + 230, + 2515, + 232, + 27370, + 33778, + 31917, + 2515, + 239, + 46036, + 43357, + 22180, + 33623, + 2515, + 249, + 2515, + 251 + ], + "tokens": [ + "ãģĤ", + "ãģĦ", + "ãģĨ", + "ãģ", + "Ī", + "ãģ", + "Ĭ", + "ãģĭ", + "ãģį", + "ãģı", + "ãģ", + "ij", + "ãģĵ", + "ãģķ", + "ãģĹ", + "ãģĻ", + "ãģ", + "Ľ", + "ãģ", + "Ŀ" + ], + "decoded_with_special": "あいうえおかきくけこさしすせそ", + "decoded_skip_special": "あいうえおかきくけこさしすせそ" + }, + "ja_dakuten": { + "input_ids": [ + 35585, + 2515, + 236, + 2515, + 238, + 2515, + 240, + 2515, + 242, + 2515, + 244, + 2515, + 246, + 2515, + 248, + 2515, + 250, + 2515, + 252, + 46777, + 2515, + 95, + 2515, + 98, + 30640, + 2515, + 102 + ], + "tokens": [ + "ãģĮ", + "ãģ", + "İ", + "ãģ", + "IJ", + "ãģ", + "Ĵ", + "ãģ", + "Ķ", + "ãģ", + "ĸ", + "ãģ", + "ĺ", + "ãģ", + "ļ", + "ãģ", + "ľ", + "ãģ", + "ŀ", + "ãģł", + "ãģ", + "¢", + "ãģ", + "¥", + "ãģ§", + "ãģ", + "©" + ], + "decoded_with_special": "がぎぐげござじずぜぞだぢづでど", + "decoded_skip_special": "がぎぐげござじずぜぞだぢづでど" + }, + "ja_handakuten": { + "input_ids": [ + 2515, + 109, + 2515, + 112, + 2515, + 115, + 2515, + 118, + 2515, + 121, + 32546, + 1209, + 242, + 30965, + 1209, + 248, + 1209, + 251 + ], + "tokens": [ + "ãģ", + "±", + "ãģ", + "´", + "ãģ", + "·", + "ãģ", + "º", + "ãģ", + "½", + "ãĥij", + "ãĥ", + "Ķ", + "ãĥĹ", + "ãĥ", + "ļ", + "ãĥ", + "Ŀ" + ], + 
"decoded_with_special": "ぱぴぷぺぽパピプペポ", + "decoded_skip_special": "ぱぴぷぺぽパピプペポ" + }, + "ja_kanji_mixed": { + "input_ids": [ + 33768, + 98, + 17312, + 105, + 45739, + 252, + 15474, + 121, + 95, + 162, + 26534, + 163, + 112, + 254, + 164, + 100, + 96, + 162, + 252, + 238, + 31676, + 37239, + 96, + 22180, + 18566, + 161, + 243, + 237, + 165, + 94, + 234, + 30640, + 33623, + 16764 + ], + "tokens": [ + "æĹ", + "¥", + "æľ", + "¬", + "èª", + "ŀ", + "ãģ®å", + "½", + "¢", + "æ", + "ħĭ", + "ç", + "´", + "ł", + "è", + "§", + "£", + "æ", + "ŀ", + "IJ", + "ãģ¯", + "éĽ", + "£", + "ãģĹ", + "ãģĦ", + "å", + "ķ", + "ı", + "é", + "¡", + "Į", + "ãģ§", + "ãģĻ", + "ãĢĤ" + ], + "decoded_with_special": "日本語の形態素解析は難しい問題です。", + "decoded_skip_special": "日本語の形態素解析は難しい問題です。" + }, + "ja_romaji_mixed": { + "input_ids": [ + 10462, + 2135, + 23294, + 100, + 11160, + 30642, + 7509, + 17433, + 240, + 164, + 102, + 99, + 33623, + 16764, + 13328, + 113, + 238, + 162, + 252, + 250, + 31676, + 5128, + 62, + 2340, + 23294, + 101, + 16326, + 220, + 5641, + 12859, + 234, + 2515, + 97, + 16764 + ], + "tokens": [ + "Sw", + "ift", + "Ġãģ", + "§", + "ĠAuto", + "Token", + "izer", + "ĠãĤ", + "Ĵ", + "è", + "©", + "¦", + "ãģĻ", + "ãĢĤ", + "Ġç", + "µ", + "IJ", + "æ", + "ŀ", + "ľ", + "ãģ¯", + "Ġinput", + "_", + "ids", + "Ġãģ", + "¨", + "Ġtokens", + "Ġ", + "ãģ®", + "äº", + "Į", + "ãģ", + "¤", + "ãĢĤ" + ], + "decoded_with_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。", + "decoded_skip_special": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" + }, + "ja_long_sentence": { + "input_ids": [ + 30266, + 109, + 12859, + 105, + 31965, + 117, + 164, + 101, + 109, + 164, + 101, + 109, + 20998, + 107, + 161, + 109, + 222, + 5641, + 164, + 101, + 109, + 20998, + 107, + 161, + 109, + 222, + 165, + 243, + 115, + 35585, + 164, + 101, + 109, + 20998, + 107, + 31758, + 49035, + 118, + 22180, + 25224, + 36853, + 22180, + 18566, + 35585, + 23513, + 17312, + 105, + 37605, + 241, + 46777, + 1792, + 235, + 29557, + 27370, + 16764 + ], + "tokens": [ + "æĿ", + "±", + "äº", + "¬", + "çī", + "¹", + "è", + "¨", + "±", + "è", + "¨", + "±", + "åı", + "¯", + "å", + "±", + "Ģ", + "ãģ®", + "è", + "¨", + "±", + "åı", + "¯", + "å", + "±", + "Ģ", + "é", + "ķ", + "·", + "ãģĮ", + "è", + "¨", + "±", + "åı", + "¯", + "ãĤĴ", + "åĩ", + "º", + "ãģĹ", + "ãģŁ", + "ãĤī", + "ãģĹ", + "ãģĦ", + "ãģĮ", + "ãĢģ", + "æľ", + "¬", + "å½", + "ĵ", + "ãģł", + "ãĤ", + "į", + "ãģĨ", + "ãģĭ", + "ãĢĤ" + ], + "decoded_with_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。", + "decoded_skip_special": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" + }, + "ko_hangul_simple": { + "input_ids": [ + 168, + 243, + 230, + 167, + 227, + 243, + 47991, + 246, + 168, + 226, + 116, + 168, + 248, + 242, + 13, + 31619, + 108, + 246, + 166, + 108, + 239, + 168, + 232, + 113, + 46695, + 230, + 46695, + 97, + 13 + ], + "tokens": [ + "ì", + "ķ", + "Ī", + "ë", + "ħ", + "ķ", + "íķ", + "ĺ", + "ì", + "Ħ", + "¸", + "ì", + "ļ", + "Ķ", + ".", + "Ġë", + "°", + "ĺ", + "ê", + "°", + "ij", + "ì", + "Ĭ", + "µ", + "ëĭ", + "Ī", + "ëĭ", + "¤", + "." + ], + "decoded_with_special": "안녕하세요. 반갑습니다.", + "decoded_skip_special": "안녕하세요. 반갑습니다." 
+ }, + "ko_hangul_jamo": { + "input_ids": [ + 157, + 226, + 240, + 157, + 227, + 94, + 157, + 228, + 104, + 166, + 116, + 222, + 23821, + 252, + 238, + 167, + 103, + 101, + 31619, + 114, + 226, + 167, + 99, + 105, + 220, + 169, + 246, + 243, + 168, + 233, + 251, + 166, + 111, + 120, + 220, + 47991, + 250, + 166, + 116, + 222, + 23821, + 251, + 234, + 168, + 254, + 230, + 220, + 169, + 246, + 243, + 168, + 233, + 251, + 13 + ], + "tokens": [ + "á", + "Ħ", + "Ĵ", + "á", + "ħ", + "¡", + "á", + "Ĩ", + "«", + "ê", + "¸", + "Ģ", + "Ġì", + "ŀ", + "IJ", + "ë", + "ª", + "¨", + "Ġë", + "¶", + "Ħ", + "ë", + "¦", + "¬", + "Ġ", + "í", + "ĺ", + "ķ", + "ì", + "ĭ", + "Ŀ", + "ê", + "³", + "¼", + "Ġ", + "íķ", + "ľ", + "ê", + "¸", + "Ģ", + "Ġì", + "Ŀ", + "Į", + "ì", + "ł", + "Ī", + "Ġ", + "í", + "ĺ", + "ķ", + "ì", + "ĭ", + "Ŀ", + "." + ], + "decoded_with_special": "한글 자모 분리 형식과 한글 음절 형식.", + "decoded_skip_special": "한글 자모 분리 형식과 한글 음절 형식." + }, + "zh_simplified": { + "input_ids": [ + 17312, + 118, + 161, + 247, + 101, + 27764, + 99, + 20046, + 254, + 42468, + 21689, + 32432, + 98, + 162, + 247, + 118, + 47797, + 121, + 21410, + 31660, + 10310, + 103, + 34932, + 235, + 17358, + 223, + 26344, + 228, + 162, + 242, + 107, + 16764 + ], + "tokens": [ + "æľ", + "º", + "å", + "Ļ", + "¨", + "åŃ", + "¦", + "ä¹", + "ł", + "æĺ¯", + "人", + "å·", + "¥", + "æ", + "Ļ", + "º", + "èĥ", + "½", + "çļĦ", + "ä¸Ģ", + "ä¸", + "ª", + "éĩ", + "į", + "è¦", + "ģ", + "åĪ", + "Ĩ", + "æ", + "Ķ", + "¯", + "ãĢĤ" + ], + "decoded_with_special": "机器学习是人工智能的一个重要分支。", + "decoded_skip_special": "机器学习是人工智能的一个重要分支。" + }, + "zh_traditional": { + "input_ids": [ + 49960, + 161, + 247, + 101, + 27764, + 116, + 163, + 123, + 240, + 42468, + 21689, + 32432, + 98, + 162, + 247, + 118, + 162, + 227, + 100, + 21410, + 31660, + 161, + 222, + 233, + 34932, + 235, + 17358, + 223, + 26344, + 228, + 162, + 242, + 107, + 16764 + ], + "tokens": [ + "æ©Ł", + "å", + "Ļ", + "¨", + "åŃ", + "¸", + "ç", + "¿", + "Ĵ", + "æĺ¯", + "人", + "å·", + "¥", + "æ", + "Ļ", + "º", + "æ", + "ħ", + "§", + "çļĦ", + "ä¸Ģ", + "å", + "Ģ", + "ĭ", + "éĩ", + "į", + "è¦", + "ģ", + "åĪ", + "Ĩ", + "æ", + "Ķ", + "¯", + "ãĢĤ" + ], + "decoded_with_special": "機器學習是人工智慧的一個重要分支。", + "decoded_skip_special": "機器學習是人工智慧的一個重要分支。" + }, + "zh_mixed_en": { + "input_ids": [ + 20519, + 15884, + 354, + 10545, + 246, + 107, + 31660, + 10310, + 103, + 2769, + 4673, + 10545, + 94, + 228, + 162, + 252, + 114, + 16764 + ], + "tokens": [ + "Py", + "Tor", + "ch", + "Ġæ", + "ĺ", + "¯", + "ä¸Ģ", + "ä¸", + "ª", + "Ġdeep", + "Ġlearning", + "Ġæ", + "¡", + "Ĩ", + "æ", + "ŀ", + "¶", + "ãĢĤ" + ], + "decoded_with_special": "PyTorch 是一个 deep learning 框架。", + "decoded_skip_special": "PyTorch 是一个 deep learning 框架。" + }, + "ar_basic": { + "input_ids": [ + 23525, + 13862, + 148, + 118, + 45632, + 28981, + 44690, + 26897, + 39848, + 22654, + 45632, + 17550, + 105, + 25405, + 22654, + 13862, + 45632, + 17550, + 105, + 38843, + 12919, + 13 + ], + "tokens": [ + "اÙĦ", + "ÙĦ", + "Ø", + "º", + "Ø©", + "ĠاÙĦ", + "ع", + "ر", + "ب", + "ÙĬ", + "Ø©", + "ĠØ", + "¬", + "Ùħ", + "ÙĬ", + "ÙĦ", + "Ø©", + "ĠØ", + "¬", + "د", + "ا", + "." + ], + "decoded_with_special": "اللغة العربية جميلة جدا.", + "decoded_skip_special": "اللغة العربية جميلة جدا." 
+ }, + "ar_diacritics": { + "input_ids": [ + 39848, + 44208, + 45692, + 48763, + 25405, + 44208, + 28981, + 13862, + 24333, + 149, + 239, + 29519, + 44208, + 28981, + 26897, + 24333, + 149, + 239, + 148, + 255, + 48763, + 25405, + 24333, + 149, + 108, + 23338, + 44208, + 28981, + 26897, + 24333, + 149, + 239, + 148, + 255, + 44208, + 22654, + 25405, + 44208 + ], + "tokens": [ + "ب", + "ÙIJ", + "س", + "ÙĴ", + "Ùħ", + "ÙIJ", + "ĠاÙĦ", + "ÙĦ", + "Ùİ", + "Ù", + "ij", + "Ùĩ", + "ÙIJ", + "ĠاÙĦ", + "ر", + "Ùİ", + "Ù", + "ij", + "Ø", + "Ń", + "ÙĴ", + "Ùħ", + "Ùİ", + "Ù", + "°", + "ÙĨ", + "ÙIJ", + "ĠاÙĦ", + "ر", + "Ùİ", + "Ù", + "ij", + "Ø", + "Ń", + "ÙIJ", + "ÙĬ", + "Ùħ", + "ÙIJ" + ], + "decoded_with_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ", + "decoded_skip_special": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" + }, + "he_basic": { + "input_ids": [ + 50227, + 40010, + 27072, + 147, + 251, + 14360, + 95, + 27072, + 40010, + 147, + 251, + 13, + 14360, + 244, + 38269, + 14360, + 246, + 147, + 100, + 147, + 94, + 147, + 246, + 14360, + 95, + 49603, + 37778, + 25529, + 13 + ], + "tokens": [ + "ש", + "׾", + "×ķ", + "×", + "Ŀ", + "Ġ×", + "¢", + "×ķ", + "׾", + "×", + "Ŀ", + ".", + "Ġ×", + "ĸ", + "×Ķ", + "Ġ×", + "ĺ", + "×", + "§", + "×", + "¡", + "×", + "ĺ", + "Ġ×", + "¢", + "×ij", + "ר", + "×Ļ", + "." + ], + "decoded_with_special": "שלום עולם. זה טקסט עברי.", + "decoded_skip_special": "שלום עולם. זה טקסט עברי." + }, + "hi_devanagari": { + "input_ids": [ + 11976, + 117, + 11976, + 123, + 11976, + 101, + 24231, + 235, + 11976, + 99, + 24231, + 222, + 28225, + 255, + 48077, + 11976, + 115, + 48077, + 28225, + 105, + 11976, + 117, + 24231, + 223, + 11976, + 97, + 28225, + 116, + 11976, + 106, + 24231, + 225, + 11976, + 99, + 24231, + 235, + 11976, + 100, + 28225, + 117, + 24231, + 230, + 24231, + 97 + ], + "tokens": [ + "à¤", + "¹", + "à¤", + "¿", + "à¤", + "¨", + "à¥", + "į", + "à¤", + "¦", + "à¥", + "Ģ", + "Ġà¤", + "Ń", + "ा", + "à¤", + "·", + "ा", + "Ġà¤", + "¬", + "à¤", + "¹", + "à¥", + "ģ", + "à¤", + "¤", + "Ġà¤", + "¸", + "à¤", + "®", + "à¥", + "ĥ", + "à¤", + "¦", + "à¥", + "į", + "à¤", + "§", + "Ġà¤", + "¹", + "à¥", + "Ī", + "à¥", + "¤" + ], + "decoded_with_special": "हिन्दी भाषा बहुत समृद्ध है।", + "decoded_skip_special": "हिन्दी भाषा बहुत समृद्ध है।" + }, + "th_basic": { + "input_ids": [ + 19567, + 223, + 19567, + 110, + 19567, + 96, + 19567, + 249, + 19567, + 96, + 19567, + 108, + 19567, + 94, + 19567, + 100, + 19567, + 98, + 19567, + 250, + 19567, + 98, + 19567, + 254, + 19567, + 110, + 19567, + 102, + 19567, + 110, + 31479, + 226, + 19567, + 245, + 19567, + 95, + 19567, + 233, + 19567, + 109, + 19567, + 248, + 19567, + 233, + 31479, + 231, + 19567, + 255, + 19567, + 247, + 31479, + 222, + 19567, + 252, + 19567, + 96, + 19567, + 110, + 19567, + 108, + 31479, + 226, + 19567, + 94, + 31479, + 230, + 19567, + 94, + 19567, + 113, + 19567, + 232, + 31479, + 230, + 19567, + 255, + 19567, + 229, + 19567, + 100, + 31479, + 230, + 19567, + 110, + 19567, + 229, + 19567, + 96, + 19567, + 108, + 19567, + 104, + 19567, + 100, + 31479, + 230, + 19567, + 110, + 19567, + 229, + 19567, + 226, + 19567, + 111 + ], + "tokens": [ + "à¸", + "ģ", + "à¸", + "²", + "à¸", + "£", + "à¸", + "Ľ", + "à¸", + "£", + "à¸", + "°", + "à¸", + "¡", + "à¸", + "§", + "à¸", + "¥", + "à¸", + "ľ", + "à¸", + "¥", + "à¸", + "ł", + "à¸", + "²", + "à¸", + "©", + "à¸", + "²", + "à¹", + "Ħ", + "à¸", + "Ĺ", + "à¸", + "¢", + "à¸", + "ĭ", + "à¸", + "±", + "à¸", + "ļ", + "à¸", + "ĭ", + "à¹", + "ī", + "à¸", + "Ń", + "à¸", + "Ļ", + "à¹", + "Ģ", 
+ "à¸", + "ŀ", + "à¸", + "£", + "à¸", + "²", + "à¸", + "°", + "à¹", + "Ħ", + "à¸", + "¡", + "à¹", + "Ī", + "à¸", + "¡", + "à¸", + "µ", + "à¸", + "Ĭ", + "à¹", + "Ī", + "à¸", + "Ń", + "à¸", + "ĩ", + "à¸", + "§", + "à¹", + "Ī", + "à¸", + "²", + "à¸", + "ĩ", + "à¸", + "£", + "à¸", + "°", + "à¸", + "«", + "à¸", + "§", + "à¹", + "Ī", + "à¸", + "²", + "à¸", + "ĩ", + "à¸", + "Ħ", + "à¸", + "³" + ], + "decoded_with_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ", + "decoded_skip_special": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" + }, + "emoji_bmp": { + "input_ids": [ + 16012, + 34719, + 222, + 8824, + 34719, + 122, + 3491, + 23883, + 2612, + 20724, + 98 + ], + "tokens": [ + "Sun", + "Ġâĺ", + "Ģ", + "Ġmoon", + "Ġâĺ", + "¾", + "Ġstar", + "Ġâĺħ", + "Ġheart", + "ĠâĻ", + "¥" + ], + "decoded_with_special": "Sun ☀ moon ☾ star ★ heart ♥", + "decoded_skip_special": "Sun ☀ moon ☾ star ★ heart ♥" + }, + "emoji_astral": { + "input_ids": [ + 8582, + 248, + 222, + 284, + 262, + 8824, + 12520, + 234, + 247, + 351, + 257, + 12520, + 238, + 109, + 290, + 257, + 12520, + 236, + 231 + ], + "tokens": [ + "ðŁ", + "ļ", + "Ģ", + "Ġto", + "Ġthe", + "Ġmoon", + "ĠðŁ", + "Į", + "Ļ", + "Ġwith", + "Ġa", + "ĠðŁ", + "IJ", + "±", + "Ġand", + "Ġa", + "ĠðŁ", + "İ", + "ī" + ], + "decoded_with_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉", + "decoded_skip_special": "🚀 to the moon 🌙 with a 🐱 and a 🎉" + }, + "emoji_zwj_family": { + "input_ids": [ + 24094, + 25, + 50169, + 101, + 447, + 235, + 41840, + 102, + 447, + 235, + 41840, + 100, + 447, + 235, + 41840, + 99, + 290, + 6056, + 25, + 12520, + 229, + 107, + 8582, + 229, + 113, + 8582, + 229, + 108, + 8582, + 229, + 115, + 8582, + 229, + 101, + 8582, + 229, + 111 + ], + "tokens": [ + "Family", + ":", + "ĠðŁij", + "¨", + "âĢ", + "į", + "ðŁij", + "©", + "âĢ", + "į", + "ðŁij", + "§", + "âĢ", + "į", + "ðŁij", + "¦", + "Ġand", + "Ġflag", + ":", + "ĠðŁ", + "ĩ", + "¯", + "ðŁ", + "ĩ", + "µ", + "ðŁ", + "ĩ", + "°", + "ðŁ", + "ĩ", + "·", + "ðŁ", + "ĩ", + "¨", + "ðŁ", + "ĩ", + "³" + ], + "decoded_with_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳", + "decoded_skip_special": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" + }, + "emoji_skin_tone": { + "input_ids": [ + 41840, + 233, + 8582, + 237, + 119, + 41840, + 233, + 8582, + 237, + 121, + 41840, + 233, + 8582, + 237, + 123, + 6769, + 351, + 4168, + 23755 + ], + "tokens": [ + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "»", + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "½", + "ðŁij", + "ĭ", + "ðŁ", + "ı", + "¿", + "Ġwave", + "Ġwith", + "Ġskin", + "Ġtones" + ], + "decoded_with_special": "👋🏻👋🏽👋🏿 wave with skin tones", + "decoded_skip_special": "👋🏻👋🏽👋🏿 wave with skin tones" + }, + "mixed_polyglot": { + "input_ids": [ + 15496, + 220, + 10310, + 244, + 45911, + 234, + 23821, + 243, + 230, + 167, + 227, + 243, + 14360, + 102, + 40010, + 27072, + 147, + 251, + 47048, + 26897, + 148, + 255, + 39848, + 12919, + 28225, + 101, + 11976, + 106, + 11976, + 116, + 24231, + 235, + 11976, + 97, + 24231, + 229, + 220, + 19567, + 103, + 19567, + 100, + 19567, + 109, + 19567, + 103, + 19567, + 242, + 19567, + 113, + 23294, + 241, + 22174, + 28618, + 2515, + 94, + 31676 + ], + "tokens": [ + "Hello", + "Ġ", + "ä¸", + "ĸ", + "çķ", + "Į", + "Ġì", + "ķ", + "Ī", + "ë", + "ħ", + "ķ", + "Ġ×", + "©", + "׾", + "×ķ", + "×", + "Ŀ", + "ĠÙħ", + "ر", + "Ø", + "Ń", + "ب", + "ا", + "Ġà¤", + "¨", + "à¤", + "®", + "à¤", + "¸", + "à¥", + "į", + "à¤", + "¤", + "à¥", + "ĩ", + "Ġ", + "à¸", + "ª", + "à¸", + "§", + "à¸", + "±", + "à¸", + "ª", + "à¸", + "Ķ", + "à¸", + "µ", + "Ġãģ", + "ĵ", + "ãĤĵ", 
+ "ãģ«", + "ãģ", + "¡", + "ãģ¯" + ], + "decoded_with_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは", + "decoded_skip_special": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" + }, + "mixed_code_jp": { + "input_ids": [ + 1003, + 10545, + 245, + 98, + 17312, + 105, + 45739, + 252, + 24679, + 26998, + 6527, + 13298, + 198, + 1616, + 31933, + 796, + 366, + 46036, + 22174, + 28618, + 2515, + 94, + 31676, + 23513, + 10310, + 244, + 45911, + 234, + 2474 + ], + "tokens": [ + "//", + "Ġæ", + "Ĺ", + "¥", + "æľ", + "¬", + "èª", + "ŀ", + "ãĤ³", + "ãĥ¡", + "ãĥ³", + "ãĥĪ", + "Ċ", + "let", + "Ġgreeting", + "Ġ=", + "Ġ\"", + "ãģĵ", + "ãĤĵ", + "ãģ«", + "ãģ", + "¡", + "ãģ¯", + "ãĢģ", + "ä¸", + "ĸ", + "çķ", + "Į", + "!\"" + ], + "decoded_with_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"", + "decoded_skip_special": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" + }, + "ipa_basic": { + "input_ids": [ + 464, + 27966, + 329, + 705, + 11084, + 6, + 318, + 1220, + 69, + 133, + 103, + 134, + 225, + 14, + 290, + 329, + 705, + 6720, + 6, + 318, + 1220, + 134, + 225, + 133, + 103, + 79, + 11757 + ], + "tokens": [ + "The", + "ĠIPA", + "Ġfor", + "Ġ'", + "fish", + "'", + "Ġis", + "Ġ/", + "f", + "É", + "ª", + "Ê", + "ĥ", + "/", + "Ġand", + "Ġfor", + "Ġ'", + "ship", + "'", + "Ġis", + "Ġ/", + "Ê", + "ĥ", + "É", + "ª", + "p", + "/." + ], + "decoded_with_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/.", + "decoded_skip_special": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." + }, + "edge_combining": { + "input_ids": [ + 66, + 1878, + 2634, + 357, + 45, + 4851, + 8, + 3691, + 26725, + 136, + 223, + 357, + 21870, + 35, + 8, + 851, + 976, + 1573, + 11, + 1180, + 9881, + 13 + ], + "tokens": [ + "c", + "af", + "é", + "Ġ(", + "N", + "FC", + ")", + "Ġvs", + "Ġcafe", + "Ì", + "ģ", + "Ġ(", + "NF", + "D", + ")", + "ĠâĢĶ", + "Ġsame", + "Ġword", + ",", + "Ġdifferent", + "Ġbytes", + "." + ], + "decoded_with_special": "café (NFC) vs café (NFD) — same word, different bytes.", + "decoded_skip_special": "café (NFC) vs café (NFD) — same word, different bytes." + }, + "edge_long_repetition": { + "input_ids": [ + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 24794, + 275, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 11848, + 65 + ], + "tokens": [ + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "aaaa", + "Ġb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "bb", + "b" + ], + "decoded_with_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", + "decoded_skip_special": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" + } + } +} diff --git a/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json b/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json new file mode 100644 index 00000000..b5eda950 --- /dev/null +++ b/Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json @@ -0,0 +1,36 @@ +{ + "schema_version": 1, + "description": "Multilingual stress corpus for byte-identical tokenization parity tests against HuggingFace Python `transformers`. Each entry is keyed by a stable id so baselines can be re-aligned across tokenizer kernels. 
Designed to exercise script boundaries that decoder-only English corpora miss: voiced-kana / dakuten, Hangul jamo composition, Han ideographs (simplified + traditional), RTL bidi, Indic + Thai, astral-plane glyphs, ZWJ grapheme clusters, and mixed-script code.", + "inputs": [ + { "id": "ascii_simple", "category": "ascii", "text": "The quick brown fox jumps over the lazy dog." }, + { "id": "ascii_punct", "category": "ascii", "text": "Hello, world! It's 12:34 — \"quote\" 'apostrophe' (paren)." }, + { "id": "ascii_numbers", "category": "ascii", "text": "Pi is approximately 3.14159; e is about 2.71828." }, + { "id": "ascii_url", "category": "ascii", "text": "Visit https://huggingface.co/docs and mail bob@example.com." }, + { "id": "ascii_code", "category": "code", "text": "def add(a: int, b: int) -> int:\n return a + b" }, + { "id": "ja_kana_basic", "category": "japanese", "text": "あいうえおかきくけこさしすせそ" }, + { "id": "ja_dakuten", "category": "japanese", "text": "がぎぐげござじずぜぞだぢづでど" }, + { "id": "ja_handakuten", "category": "japanese", "text": "ぱぴぷぺぽパピプペポ" }, + { "id": "ja_kanji_mixed", "category": "japanese", "text": "日本語の形態素解析は難しい問題です。" }, + { "id": "ja_romaji_mixed", "category": "japanese", "text": "Swift で AutoTokenizer を試す。 結果は input_ids と tokens の二つ。" }, + { "id": "ja_long_sentence", "category": "japanese", "text": "東京特許許可局の許可局長が許可を出したらしいが、本当だろうか。" }, + { "id": "ko_hangul_simple", "category": "korean", "text": "안녕하세요. 반갑습니다." }, + { "id": "ko_hangul_jamo", "category": "korean", "text": "한글 자모 분리 형식과 한글 음절 형식." }, + { "id": "zh_simplified", "category": "chinese", "text": "机器学习是人工智能的一个重要分支。" }, + { "id": "zh_traditional", "category": "chinese", "text": "機器學習是人工智慧的一個重要分支。" }, + { "id": "zh_mixed_en", "category": "chinese", "text": "PyTorch 是一个 deep learning 框架。" }, + { "id": "ar_basic", "category": "arabic", "text": "اللغة العربية جميلة جدا." }, + { "id": "ar_diacritics", "category": "arabic", "text": "بِسْمِ اللَّهِ الرَّحْمَٰنِ الرَّحِيمِ" }, + { "id": "he_basic", "category": "hebrew", "text": "שלום עולם. זה טקסט עברי." }, + { "id": "hi_devanagari", "category": "devanagari", "text": "हिन्दी भाषा बहुत समृद्ध है।" }, + { "id": "th_basic", "category": "thai", "text": "การประมวลผลภาษาไทยซับซ้อนเพราะไม่มีช่องว่างระหว่างคำ" }, + { "id": "emoji_bmp", "category": "emoji", "text": "Sun ☀ moon ☾ star ★ heart ♥" }, + { "id": "emoji_astral", "category": "emoji", "text": "🚀 to the moon 🌙 with a 🐱 and a 🎉" }, + { "id": "emoji_zwj_family", "category": "emoji", "text": "Family: 👨‍👩‍👧‍👦 and flag: 🇯🇵🇰🇷🇨🇳" }, + { "id": "emoji_skin_tone", "category": "emoji", "text": "👋🏻👋🏽👋🏿 wave with skin tones" }, + { "id": "mixed_polyglot", "category": "mixed", "text": "Hello 世界 안녕 שלום مرحبا नमस्ते สวัสดี こんにちは" }, + { "id": "mixed_code_jp", "category": "mixed", "text": "// 日本語コメント\nlet greeting = \"こんにちは、世界!\"" }, + { "id": "ipa_basic", "category": "ipa", "text": "The IPA for 'fish' is /fɪʃ/ and for 'ship' is /ʃɪp/." }, + { "id": "edge_combining", "category": "edge", "text": "café (NFC) vs café (NFD) — same word, different bytes." }, + { "id": "edge_long_repetition", "category": "edge", "text": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb" } + ] +} diff --git a/Tools/README.md b/Tools/README.md new file mode 100644 index 00000000..4e230d4c --- /dev/null +++ b/Tools/README.md @@ -0,0 +1,52 @@ +# Tools + +Repository-side scripts that produce or maintain fixtures used by the Swift test +suite. They run on macOS/Linux with a CPython interpreter and do not touch the +Swift build. 
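+
+The conformance fixtures live under
+`Tests/TokenizersTests/Resources/MultilingualConformance/`: one `inputs.json`
+corpus plus one baseline JSON file per model. A quick local consistency check,
+shown here as a minimal sketch run from the repository root (not a checked-in
+tool), is that every baseline covers every corpus input:
+
+```python
+import json
+from pathlib import Path
+
+base = Path("Tests/TokenizersTests/Resources/MultilingualConformance")
+
+# Every corpus input id should have a matching entry in every baseline file.
+corpus = json.loads((base / "inputs.json").read_text(encoding="utf-8"))["inputs"]
+corpus_ids = {item["id"] for item in corpus}
+for baseline_path in sorted((base / "baselines").glob("*.json")):
+    entries = json.loads(baseline_path.read_text(encoding="utf-8"))["entries"]
+    missing = corpus_ids - entries.keys()
+    if missing:
+        raise SystemExit(f"{baseline_path.name} is missing entries: {sorted(missing)}")
+```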
+ +## `generate_tokenizer_baselines.py` + +Regenerates the byte-identical reference values consumed by +`Tests/TokenizersTests/MultilingualConformanceTests.swift`. The Python +`transformers` library is treated as the authoritative reference; whenever +this script changes its output, the Swift parity tests are expected to be +re-validated against the new baselines. + +### Setup + +```sh +python3 -m venv .venv-tokenizer-baselines +.venv-tokenizer-baselines/bin/pip install -r Tools/requirements.txt +``` + +### Regenerate all kernels + +```sh +.venv-tokenizer-baselines/bin/python Tools/generate_tokenizer_baselines.py +``` + +This rewrites every `Tests/TokenizersTests/Resources/MultilingualConformance/baselines/*.json` +file in place. Commit the diffs together with the upstream `transformers` +version pinned in `Tools/requirements.txt` so the references are reproducible. + +### Regenerate a single kernel + +```sh +.venv-tokenizer-baselines/bin/python Tools/generate_tokenizer_baselines.py \ + --models BAAI/bge-small-en-v1.5 +``` + +### Adding a new kernel or input + +1. Append the model id to the `MODELS` list in + `generate_tokenizer_baselines.py`, or add an entry to `inputs.json`. +2. Rerun the script. The new baseline file appears under + `Tests/TokenizersTests/Resources/MultilingualConformance/baselines/`. +3. Mirror the kernel in `MultilingualConformanceTests.swift`'s `kernels` + array. +4. Run `swift test --filter MultilingualConformanceTests`. + +If the Swift tokenizer diverges from the new reference, add an entry to +`expectedDivergences` linking to the relevant upstream issue or PR. The test +target stays green while the divergence remains documented, and the test +prints a hint when an upstream fix lands and the entry can be removed. diff --git a/Tools/generate_tokenizer_baselines.py b/Tools/generate_tokenizer_baselines.py new file mode 100644 index 00000000..df97e29d --- /dev/null +++ b/Tools/generate_tokenizer_baselines.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +"""Regenerate multilingual conformance baselines from HuggingFace Python `transformers`. + +This script is the single source of truth for the byte-identical reference values +that `MultilingualConformanceTests` compares Swift output against. To regenerate: + + pip install -r Tools/requirements.txt + python Tools/generate_tokenizer_baselines.py + +Each baseline file is a JSON dictionary keyed by input id, containing the +`input_ids`, the convert_ids_to_tokens result, and the decoded form (both with +and without special tokens). The values are produced by Python's +`AutoTokenizer.from_pretrained(model_id)`, which is treated as the authoritative +reference for byte-identical parity. + +When a Swift test fails against a baseline, regenerate locally with the same +`transformers` version listed in `requirements.txt` to confirm the divergence +isn't an upstream change. +""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +try: + import transformers + from transformers import AutoTokenizer +except ImportError: + sys.stderr.write( + "transformers is required. Install with: pip install -r Tools/requirements.txt\n" + ) + sys.exit(1) + + +# Model matrix is intentionally small and covers four distinct tokenizer kernels +# observed in production HuggingFace text models. Adding a new kernel is +# preferable to adding a near-duplicate of an existing one. 
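+# For reference, each model below becomes one baseline file named
+# `baselines/{slugify(model_id)}.json`, e.g. "google-t5/t5-small" is written to
+# `baselines/google-t5__t5-small.json`.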
+MODELS = [
+    # WordPiece (Bert family) — exercises BasicTokenizer pre-tokenization on
+    # CJK / dakuten / diacritics. BGE-small-en is a widely used encoder for
+    # embedding pipelines on Apple Silicon.
+    "BAAI/bge-small-en-v1.5",
+
+    # Unigram / SentencePiece — exercises Unigram lattice + Metaspace decoder
+    # on multi-codepoint graphemes. T5-small is the canonical Unigram model and
+    # ships the tokenizer.json required by swift-transformers.
+    "google-t5/t5-small",
+
+    # Byte-level BPE (GPT-2 family) — exercises ByteLevelPreTokenizer regex +
+    # byte encoding. Expected to be byte-identical with the Python reference
+    # across the entire corpus.
+    "openai-community/gpt2",
+
+    # Modern byte-level BPE (Qwen family) — exercises a more recent vocabulary
+    # and merge table while sharing the GPT-2 kernel.
+    "Qwen/Qwen2.5-0.5B",
+
+    # SentencePiece BPE with byte-fallback (Llama family) — exercises BPE merges
+    # on multi-codepoint graphemes. TinyLlama uses the standard Llama tokenizer
+    # without an auth gate.
+    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+]
+
+
+def slugify(model_id: str) -> str:
+    """Filesystem-safe representation of a HuggingFace model id."""
+    return model_id.replace("/", "__")
+
+
+def encode_input(tokenizer: Any, text: str) -> dict[str, Any]:
+    """Produce a stable JSON-serializable view of how `tokenizer` handles `text`."""
+    # `add_special_tokens=True` matches what `tokenizer.encode(text)` and the
+    # default `tokenizer(text)` call produce.
+    input_ids = tokenizer.encode(text, add_special_tokens=True)
+    tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    decoded_with_special = tokenizer.decode(input_ids, skip_special_tokens=False)
+    decoded_skip_special = tokenizer.decode(input_ids, skip_special_tokens=True)
+    return {
+        "input_ids": list(input_ids),
+        "tokens": list(tokens),
+        "decoded_with_special": decoded_with_special,
+        "decoded_skip_special": decoded_skip_special,
+    }
+
+
+def generate(model_id: str, corpus: list[dict[str, Any]]) -> dict[str, Any]:
+    print(f"  loading tokenizer for {model_id}...", flush=True)
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    # swift-transformers loads tokenizers via `tokenizer.json` (the Rust-backed
+    # fast format), so the Python reference has to be the matching fast
+    # tokenizer for parity to be meaningful. Slow tokenizers can silently
+    # produce different ids on multi-codepoint inputs.
+    if not getattr(tokenizer, "is_fast", False):
+        raise RuntimeError(
+            f"{model_id} resolved to a slow tokenizer; swift-transformers requires a "
+            "tokenizer.json (fast) reference. Either pick a model with tokenizer.json "
+            "published, or pre-convert one with `AutoTokenizer.save_pretrained` and "
+            "point this script at the local path."
+        )
+    entries: dict[str, Any] = {}
+    for item in corpus:
+        entries[item["id"]] = encode_input(tokenizer, item["text"])
+    return {
+        "model_id": model_id,
+        "transformers_version": transformers.__version__,
+        "entries": entries,
+    }
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--corpus",
+        type=Path,
+        default=None,
+        help="Path to inputs.json (defaults to Tests/TokenizersTests/Resources/MultilingualConformance/inputs.json)",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=Path,
+        default=None,
+        help="Where to write the per-model baseline JSON files",
+    )
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        default=MODELS,
+        help="Override the model matrix (default: all 5 models)",
+    )
+    args = parser.parse_args()
+
+    repo_root = Path(__file__).resolve().parent.parent
+    base_dir = repo_root / "Tests" / "TokenizersTests" / "Resources" / "MultilingualConformance"
+    corpus_path = args.corpus or (base_dir / "inputs.json")
+    output_dir = args.output_dir or (base_dir / "baselines")
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    corpus = json.loads(corpus_path.read_text(encoding="utf-8"))["inputs"]
+    print(f"loaded {len(corpus)} inputs from {_display_path(corpus_path, repo_root)}")
+
+    for model_id in args.models:
+        baseline = generate(model_id, corpus)
+        path = output_dir / f"{slugify(model_id)}.json"
+        path.write_text(
+            json.dumps(baseline, ensure_ascii=False, indent=2) + "\n",
+            encoding="utf-8",
+        )
+        print(f"  wrote {_display_path(path, repo_root)}")
+
+    return 0
+
+
+def _display_path(path: Path, repo_root: Path) -> str:
+    """Return `path` as repo-relative when it lives inside the repo, otherwise absolute."""
+    try:
+        return str(path.resolve().relative_to(repo_root))
+    except ValueError:
+        return str(path.resolve())
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/Tools/requirements.txt b/Tools/requirements.txt
new file mode 100644
index 00000000..88942cb6
--- /dev/null
+++ b/Tools/requirements.txt
@@ -0,0 +1,7 @@
+# Versions pinned for reproducible baselines. Bump the `transformers` pin in
+# the same commit that regenerates the bundled baselines so the parity values
+# stay aligned with a known reference.
+transformers==4.57.1
+tokenizers>=0.20,<0.23
+sentencepiece>=0.2
+protobuf>=4,<8
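+# tokenizers backs the fast (`tokenizer.json`) path; sentencepiece and protobuf
+# are what `AutoTokenizer` uses to convert SentencePiece checkpoints (T5,
+# TinyLlama) to that fast format when a repo ships no prebuilt tokenizer.json.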