Skip to content

Commit c7a7bf5

Browse files
authored
Classify Hugging Face tokenizer and runtime artifacts (#529)
fix: classify Hugging Face model artifacts

Signed-off-by: Zhao Chen <winters.zc@antgroup.com>
1 parent b03fdc8 commit c7a7bf5

3 files changed

Lines changed: 98 additions & 11 deletions

File tree

pkg/modelfile/constants.go

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,22 @@ var (
4848
"*.modelcard", // Model card metadata
4949
"*.meta", // Model metadata
5050
"*tokenizer.model*", // Tokenizer files (e.g., Mistral v3)
51-
"config.json.*", // Model configuration variants
52-
"*.hparams", // Hyperparameter files
53-
"*.params", // Parameter files
54-
"*.hyperparams", // Hyperparameter configuration
55-
"*.wandb", // Weights & Biases configuration
56-
"*.mlflow", // MLflow configuration
57-
"*.tensorboard", // TensorBoard configuration
51+
"*.tiktoken", // TikToken vocabulary files
52+
"vocab.txt", // Tokenizer vocabulary files
53+
"merges.txt", // Tokenizer merge rules
54+
"added_tokens.txt", // Additional tokenizer tokens
55+
"spiece.model", // SentencePiece tokenizer files
56+
"sentencepiece*.model",
57+
"sentencepiece*.vocab",
58+
"tiktoken.model",
59+
"chat_template.jinja",
60+
"config.json.*", // Model configuration variants
61+
"*.hparams", // Hyperparameter files
62+
"*.params", // Parameter files
63+
"*.hyperparams", // Hyperparameter configuration
64+
"*.wandb", // Weights & Biases configuration
65+
"*.mlflow", // MLflow configuration
66+
"*.tensorboard", // TensorBoard configuration
5867
}
5968

6069
// Model file patterns - supported model file extensions.
@@ -97,9 +106,10 @@ var (
97106
"*.f32", // GGML F32 format
98107

99108
// checkpoint formats.
100-
"*.ckpt", // Checkpoint format
101-
"*.checkpoint", // Checkpoint format (alternative extension)
102-
"*.dist_ckpt", // Distributed checkpoint format
109+
"*.ckpt", // Checkpoint format
110+
"*.checkpoint", // Checkpoint format (alternative extension)
111+
"*.dist_ckpt", // Distributed checkpoint format
112+
"tensor[0-9]*_[0-9]*", // Sharded checkpoint tensor files
103113

104114
// Semantics-specific formats
105115
"*.tensor", // Generic tensor format
@@ -113,6 +123,7 @@ var (
113123
"*.engine", // TensorRT format
114124
"*.trt", // TensorRT format (alternative extension)
115125
"*.onnx", // Open Neural Network Exchange format
126+
"*.onnx_data*", // ONNX external data files
116127
"*.msgpack", // MessagePack serialization
117128
"*.model", // Some NLP frameworks
118129
"*.pkl", // Pickle format
@@ -124,6 +135,7 @@ var (
124135
"*.nc", // NetCDF format
125136
"*.mlmodel", // Apple Core ML format
126137
"*.coreml", // Apple Core ML format (alternative)
138+
"*.mil", // Core ML intermediate language files
127139
"*.mleap", // MLeap format (Spark ML)
128140
"*.surml", // SurrealML format
129141
"*.llamafile", // Llamafile format

pkg/modelfile/constants_test.go

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,16 +49,21 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
4949
{"model.bin.part2", true},
5050
{"model.gguf.part1", true},
5151
{"model.gguf.00001-of-00003", true},
52+
{"model.onnx_data", true},
53+
{"model.onnx_data_1", true},
54+
{"ckpt-0/tensor00001_000", true},
5255
{"model.llamafile.zip", true},
5356
{"model.llamafile.gz", true},
5457

5558
// Existing patterns still work.
5659
{"model.safetensors", true},
5760
{"model.bin", true},
5861
{"model.gguf", true},
62+
{"model.mil", true},
5963
{"model.llamafile", true},
6064

6165
// Non-matching files.
66+
{"merges.txt", false},
6267
{"readme.txt", false},
6368
{"script.py", false},
6469
{"events.out.tfevents.1679012345.hostname", false}, // tfevents moved to DocFilePatterns
@@ -70,6 +75,30 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
7075
}
7176
}
7277

78+
func TestIsFileTypeConfigPatterns(t *testing.T) {
79+
testCases := []struct {
80+
filename string
81+
expected bool
82+
}{
83+
{"vocab.txt", true},
84+
{"merges.txt", true},
85+
{"added_tokens.txt", true},
86+
{"chat_template.jinja", true},
87+
{"tokenizer.tiktoken", true},
88+
{"spiece.model", true},
89+
{"sentencepiece.bpe.model", true},
90+
{"sentencepiece.bpe.vocab", true},
91+
{"tiktoken.model", true},
92+
{"weights.model", false},
93+
{"readme.txt", false},
94+
}
95+
96+
assert := assert.New(t)
97+
for _, tc := range testCases {
98+
assert.Equal(tc.expected, IsFileType(tc.filename, ConfigFilePatterns), "filename: %s", tc.filename)
99+
}
100+
}
101+
73102
func TestIsFileTypeDocPatternsTfevents(t *testing.T) {
74103
testCases := []struct {
75104
filename string
@@ -98,10 +127,20 @@ func TestInferFileType(t *testing.T) {
98127
{"config yaml", "settings.yaml", 1024, FileTypeConfig},
99128
{"model safetensors", "model.safetensors", 1024, FileTypeModel},
100129
{"model bin", "weights.bin", 1024, FileTypeModel},
130+
{"model onnx external data", "model.onnx_data_1", 1024, FileTypeModel},
131+
{"model coreml mil", "model.mil", 1024, FileTypeModel},
132+
{"checkpoint tensor shard", "ckpt-0/tensor00001_000", 1024, FileTypeModel},
101133
{"code python", "script.py", 1024, FileTypeCode},
102134
{"code go", "main.go", 1024, FileTypeCode},
103135
{"doc markdown", "README.md", 1024, FileTypeDoc},
104136
{"doc pdf", "guide.pdf", 1024, FileTypeDoc},
137+
{"tokenizer vocab txt", "vocab.txt", 1024, FileTypeConfig},
138+
{"tokenizer merges txt", "merges.txt", 1024, FileTypeConfig},
139+
{"tokenizer added tokens txt", "added_tokens.txt", 1024, FileTypeConfig},
140+
{"sentencepiece spiece model", "spiece.model", 1024, FileTypeConfig},
141+
{"sentencepiece bpe model", "sentencepiece.bpe.model", 1024, FileTypeConfig},
142+
{"tiktoken model", "tiktoken.model", 1024, FileTypeConfig},
143+
{"chat template jinja", "chat_template.jinja", 1024, FileTypeConfig},
105144

106145
// Dotfile with known secondary extension
107146
{".cache.json is config", ".cache.json", 1024, FileTypeConfig},

pkg/modelfile/modelfile_test.go

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
606606
"tokenizer.json",
607607
"special_tokens_map.json",
608608
"vocab.json",
609+
"merges.txt",
609610
},
610611
expectModels: []string{
611612
"pytorch_model.bin",
@@ -617,7 +618,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
617618
"scripts/convert_weights.py",
618619
"scripts/preprocessing/prep.py",
619620
},
620-
expectDocs: []string{"merges.txt", "README.md"},
621+
expectDocs: []string{"README.md"},
621622
expectName: "llama-7b",
622623
expectArch: "transformer",
623624
expectFamily: "llama",
@@ -1891,6 +1892,41 @@ func TestFileTypeClassification(t *testing.T) {
18911892
expectedCodes: []string{"script.py", "inference.py"},
18921893
expectedDocs: []string{"README.md", "LICENSE"},
18931894
},
1895+
{
1896+
name: "huggingface tokenizer and runtime artifacts",
1897+
files: map[string]int64{
1898+
"config.json": 1024,
1899+
"vocab.txt": 1024,
1900+
"merges.txt": 1024,
1901+
"added_tokens.txt": 1024,
1902+
"tokenizer/spiece.model": 1024,
1903+
"tokenizer/sentencepiece.bpe.model": 1024,
1904+
"tokenizer/tiktoken.model": 1024,
1905+
"chat_template.jinja": 1024,
1906+
"onnx/model.onnx_data_1": 1024,
1907+
"coreml/model.mil": 1024,
1908+
"ckpt-0/tensor00001_000": 1024,
1909+
"scripts/inference.py": 1024,
1910+
"README.md": 1024,
1911+
},
1912+
expectedConfigs: []string{
1913+
"config.json",
1914+
"vocab.txt",
1915+
"merges.txt",
1916+
"added_tokens.txt",
1917+
"tokenizer/spiece.model",
1918+
"tokenizer/sentencepiece.bpe.model",
1919+
"tokenizer/tiktoken.model",
1920+
"chat_template.jinja",
1921+
},
1922+
expectedModels: []string{
1923+
"onnx/model.onnx_data_1",
1924+
"coreml/model.mil",
1925+
"ckpt-0/tensor00001_000",
1926+
},
1927+
expectedCodes: []string{"scripts/inference.py"},
1928+
expectedDocs: []string{"README.md"},
1929+
},
18941930
{
18951931
name: "small unknown files treated as code files",
18961932
files: map[string]int64{

0 commit comments

Comments
 (0)