Classify Hugging Face tokenizer and runtime artifacts (#529)

aftersnow · web-flow · commit c7a7bf55e279 · 2026-05-06T10:41:53.000+08:00
fix: classify Hugging Face model artifacts

Signed-off-by: Zhao Chen <winters.zc@antgroup.com>
diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go
@@ -48,13 +48,22 @@ var (
 		"*.modelcard",       // Model card metadata
 		"*.meta",            // Model metadata
 		"*tokenizer.model*", // Tokenizer files (e.g., Mistral v3)
-		"config.json.*",     // Model configuration variants
-		"*.hparams",         // Hyperparameter files
-		"*.params",          // Parameter files
-		"*.hyperparams",     // Hyperparameter configuration
-		"*.wandb",           // Weights & Biases configuration
-		"*.mlflow",          // MLflow configuration
-		"*.tensorboard",     // TensorBoard configuration
+		"*.tiktoken",        // TikToken vocabulary files
+		"vocab.txt",         // Tokenizer vocabulary files
+		"merges.txt",        // Tokenizer merge rules
+		"added_tokens.txt",  // Additional tokenizer tokens
+		"spiece.model",      // SentencePiece tokenizer files
+		"sentencepiece*.model",
+		"sentencepiece*.vocab",
+		"tiktoken.model",
+		"chat_template.jinja",
+		"config.json.*", // Model configuration variants
+		"*.hparams",     // Hyperparameter files
+		"*.params",      // Parameter files
+		"*.hyperparams", // Hyperparameter configuration
+		"*.wandb",       // Weights & Biases configuration
+		"*.mlflow",      // MLflow configuration
+		"*.tensorboard", // TensorBoard configuration
 	}
 
 	// Model file patterns - supported model file extensions.
@@ -97,9 +106,10 @@ var (
 		"*.f32",    // GGML F32 format
 
 		// checkpoint formats.
-		"*.ckpt",       // Checkpoint format
-		"*.checkpoint", // Checkpoint format (alternative extension)
-		"*.dist_ckpt",  // Distributed checkpoint format
+		"*.ckpt",              // Checkpoint format
+		"*.checkpoint",        // Checkpoint format (alternative extension)
+		"*.dist_ckpt",         // Distributed checkpoint format
+		"tensor[0-9]*_[0-9]*", // Sharded checkpoint tensor files
 
 		// Semantics-specific formats
 		"*.tensor",    // Generic tensor format
@@ -113,6 +123,7 @@ var (
 		"*.engine",      // TensorRT format
 		"*.trt",         // TensorRT format (alternative extension)
 		"*.onnx",        // Open Neural Network Exchange format
+		"*.onnx_data*",  // ONNX external data files
 		"*.msgpack",     // MessagePack serialization
 		"*.model",       // Some NLP frameworks
 		"*.pkl",         // Pickle format
@@ -124,6 +135,7 @@ var (
 		"*.nc",          // NetCDF format
 		"*.mlmodel",     // Apple Core ML format
 		"*.coreml",      // Apple Core ML format (alternative)
+		"*.mil",         // Core ML intermediate language files
 		"*.mleap",       // MLeap format (Spark ML)
 		"*.surml",       // SurrealML format
 		"*.llamafile",   // Llamafile format
diff --git a/pkg/modelfile/constants_test.go b/pkg/modelfile/constants_test.go
@@ -49,16 +49,21 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
 		{"model.bin.part2", true},
 		{"model.gguf.part1", true},
 		{"model.gguf.00001-of-00003", true},
+		{"model.onnx_data", true},
+		{"model.onnx_data_1", true},
+		{"ckpt-0/tensor00001_000", true},
 		{"model.llamafile.zip", true},
 		{"model.llamafile.gz", true},
 
 		// Existing patterns still work.
 		{"model.safetensors", true},
 		{"model.bin", true},
 		{"model.gguf", true},
+		{"model.mil", true},
 		{"model.llamafile", true},
 
 		// Non-matching files.
+		{"merges.txt", false},
 		{"readme.txt", false},
 		{"script.py", false},
 		{"events.out.tfevents.1679012345.hostname", false}, // tfevents moved to DocFilePatterns
@@ -70,6 +75,30 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
 	}
 }
 
+func TestIsFileTypeConfigPatterns(t *testing.T) {
+	testCases := []struct {
+		filename string
+		expected bool
+	}{
+		{"vocab.txt", true},
+		{"merges.txt", true},
+		{"added_tokens.txt", true},
+		{"chat_template.jinja", true},
+		{"tokenizer.tiktoken", true},
+		{"spiece.model", true},
+		{"sentencepiece.bpe.model", true},
+		{"sentencepiece.bpe.vocab", true},
+		{"tiktoken.model", true},
+		{"weights.model", false},
+		{"readme.txt", false},
+	}
+
+	assert := assert.New(t)
+	for _, tc := range testCases {
+		assert.Equal(tc.expected, IsFileType(tc.filename, ConfigFilePatterns), "filename: %s", tc.filename)
+	}
+}
+
 func TestIsFileTypeDocPatternsTfevents(t *testing.T) {
 	testCases := []struct {
 		filename string
@@ -98,10 +127,20 @@ func TestInferFileType(t *testing.T) {
 		{"config yaml", "settings.yaml", 1024, FileTypeConfig},
 		{"model safetensors", "model.safetensors", 1024, FileTypeModel},
 		{"model bin", "weights.bin", 1024, FileTypeModel},
+		{"model onnx external data", "model.onnx_data_1", 1024, FileTypeModel},
+		{"model coreml mil", "model.mil", 1024, FileTypeModel},
+		{"checkpoint tensor shard", "ckpt-0/tensor00001_000", 1024, FileTypeModel},
 		{"code python", "script.py", 1024, FileTypeCode},
 		{"code go", "main.go", 1024, FileTypeCode},
 		{"doc markdown", "README.md", 1024, FileTypeDoc},
 		{"doc pdf", "guide.pdf", 1024, FileTypeDoc},
+		{"tokenizer vocab txt", "vocab.txt", 1024, FileTypeConfig},
+		{"tokenizer merges txt", "merges.txt", 1024, FileTypeConfig},
+		{"tokenizer added tokens txt", "added_tokens.txt", 1024, FileTypeConfig},
+		{"sentencepiece spiece model", "spiece.model", 1024, FileTypeConfig},
+		{"sentencepiece bpe model", "sentencepiece.bpe.model", 1024, FileTypeConfig},
+		{"tiktoken model", "tiktoken.model", 1024, FileTypeConfig},
+		{"chat template jinja", "chat_template.jinja", 1024, FileTypeConfig},
 
 		// Dotfile with known secondary extension
 		{".cache.json is config", ".cache.json", 1024, FileTypeConfig},
diff --git a/pkg/modelfile/modelfile_test.go b/pkg/modelfile/modelfile_test.go
@@ -606,6 +606,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
 				"tokenizer.json",
 				"special_tokens_map.json",
 				"vocab.json",
+				"merges.txt",
 			},
 			expectModels: []string{
 				"pytorch_model.bin",
@@ -617,7 +618,7 @@ func TestNewModelfileByWorkspace(t *testing.T) {
 				"scripts/convert_weights.py",
 				"scripts/preprocessing/prep.py",
 			},
-			expectDocs:      []string{"merges.txt", "README.md"},
+			expectDocs:      []string{"README.md"},
 			expectName:      "llama-7b",
 			expectArch:      "transformer",
 			expectFamily:    "llama",
@@ -1891,6 +1892,41 @@ func TestFileTypeClassification(t *testing.T) {
 			expectedCodes:   []string{"script.py", "inference.py"},
 			expectedDocs:    []string{"README.md", "LICENSE"},
 		},
+		{
+			name: "huggingface tokenizer and runtime artifacts",
+			files: map[string]int64{
+				"config.json":                       1024,
+				"vocab.txt":                         1024,
+				"merges.txt":                        1024,
+				"added_tokens.txt":                  1024,
+				"tokenizer/spiece.model":            1024,
+				"tokenizer/sentencepiece.bpe.model": 1024,
+				"tokenizer/tiktoken.model":          1024,
+				"chat_template.jinja":               1024,
+				"onnx/model.onnx_data_1":            1024,
+				"coreml/model.mil":                  1024,
+				"ckpt-0/tensor00001_000":            1024,
+				"scripts/inference.py":              1024,
+				"README.md":                         1024,
+			},
+			expectedConfigs: []string{
+				"config.json",
+				"vocab.txt",
+				"merges.txt",
+				"added_tokens.txt",
+				"tokenizer/spiece.model",
+				"tokenizer/sentencepiece.bpe.model",
+				"tokenizer/tiktoken.model",
+				"chat_template.jinja",
+			},
+			expectedModels: []string{
+				"onnx/model.onnx_data_1",
+				"coreml/model.mil",
+				"ckpt-0/tensor00001_000",
+			},
+			expectedCodes: []string{"scripts/inference.py"},
+			expectedDocs:  []string{"README.md"},
+		},
 		{
 			name: "small unknown files treated as code files",
 			files: map[string]int64{