@@ -49,16 +49,21 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
4949 {"model.bin.part2" , true },
5050 {"model.gguf.part1" , true },
5151 {"model.gguf.00001-of-00003" , true },
52+ {"model.onnx_data" , true },
53+ {"model.onnx_data_1" , true },
54+ {"ckpt-0/tensor00001_000" , true },
5255 {"model.llamafile.zip" , true },
5356 {"model.llamafile.gz" , true },
5457
5558 // Existing patterns still work.
5659 {"model.safetensors" , true },
5760 {"model.bin" , true },
5861 {"model.gguf" , true },
62+ {"model.mil" , true },
5963 {"model.llamafile" , true },
6064
6165 // Non-matching files.
66+ {"merges.txt" , false },
6267 {"readme.txt" , false },
6368 {"script.py" , false },
6469 {"events.out.tfevents.1679012345.hostname" , false }, // tfevents moved to DocFilePatterns
@@ -70,6 +75,30 @@ func TestIsFileTypeModelPatterns(t *testing.T) {
7075 }
7176}
7277
78+ func TestIsFileTypeConfigPatterns (t * testing.T ) {
79+ testCases := []struct {
80+ filename string
81+ expected bool
82+ }{
83+ {"vocab.txt" , true },
84+ {"merges.txt" , true },
85+ {"added_tokens.txt" , true },
86+ {"chat_template.jinja" , true },
87+ {"tokenizer.tiktoken" , true },
88+ {"spiece.model" , true },
89+ {"sentencepiece.bpe.model" , true },
90+ {"sentencepiece.bpe.vocab" , true },
91+ {"tiktoken.model" , true },
92+ {"weights.model" , false },
93+ {"readme.txt" , false },
94+ }
95+
96+ assert := assert .New (t )
97+ for _ , tc := range testCases {
98+ assert .Equal (tc .expected , IsFileType (tc .filename , ConfigFilePatterns ), "filename: %s" , tc .filename )
99+ }
100+ }
101+
73102func TestIsFileTypeDocPatternsTfevents (t * testing.T ) {
74103 testCases := []struct {
75104 filename string
@@ -98,10 +127,20 @@ func TestInferFileType(t *testing.T) {
98127 {"config yaml" , "settings.yaml" , 1024 , FileTypeConfig },
99128 {"model safetensors" , "model.safetensors" , 1024 , FileTypeModel },
100129 {"model bin" , "weights.bin" , 1024 , FileTypeModel },
130+ {"model onnx external data" , "model.onnx_data_1" , 1024 , FileTypeModel },
131+ {"model coreml mil" , "model.mil" , 1024 , FileTypeModel },
132+ {"checkpoint tensor shard" , "ckpt-0/tensor00001_000" , 1024 , FileTypeModel },
101133 {"code python" , "script.py" , 1024 , FileTypeCode },
102134 {"code go" , "main.go" , 1024 , FileTypeCode },
103135 {"doc markdown" , "README.md" , 1024 , FileTypeDoc },
104136 {"doc pdf" , "guide.pdf" , 1024 , FileTypeDoc },
137+ {"tokenizer vocab txt" , "vocab.txt" , 1024 , FileTypeConfig },
138+ {"tokenizer merges txt" , "merges.txt" , 1024 , FileTypeConfig },
139+ {"tokenizer added tokens txt" , "added_tokens.txt" , 1024 , FileTypeConfig },
140+ {"sentencepiece spiece model" , "spiece.model" , 1024 , FileTypeConfig },
141+ {"sentencepiece bpe model" , "sentencepiece.bpe.model" , 1024 , FileTypeConfig },
142+ {"tiktoken model" , "tiktoken.model" , 1024 , FileTypeConfig },
143+ {"chat template jinja" , "chat_template.jinja" , 1024 , FileTypeConfig },
105144
106145 // Dotfile with known secondary extension
107146 {".cache.json is config" , ".cache.json" , 1024 , FileTypeConfig },
0 commit comments