@@ -19,13 +19,17 @@ package modelfile
1919import (
2020 "path/filepath"
2121 "strings"
22+
23+ "github.com/dustin/go-humanize"
2224)
2325
2426var (
2527 // Config file patterns - supported configuration file extensions.
2628 ConfigFilePatterns = []string {
2729 "*.json" , // JSON configuration files
2830 "*.jsonl" , // JSON Lines format
31+ "*.json5" , // JSON5 files
32+ "*.jsonc" , // JSON with comments
2933 "*.yaml" , // YAML configuration files
3034 "*.yml" , // YAML alternative extension
3135 "*.toml" , // TOML configuration files
4549 "*.meta" , // Model metadata
4650 "*tokenizer.model*" , // Tokenizer files (e.g., Mistral v3)
4751 "config.json.*" , // Model configuration variants
52+ "*.hparams" , // Hyperparameter files
53+ "*.params" , // Parameter files
54+ "*.hyperparams" , // Hyperparameter configuration
55+ "*.wandb" , // Weights & Biases configuration
56+ "*.mlflow" , // MLflow configuration
57+ "*.tensorboard" , // TensorBoard configuration
4858 }
4959
5060 // Model file patterns - supported model file extensions.
@@ -56,29 +66,75 @@ var (
5666 "*.bin" , // General binary format
5767 "*.pt" , // PyTorch model
5868 "*.pth" , // PyTorch model (alternative extension)
69+ "*.mar" , // PyTorch Model Archive
70+ "*.pte" , // PyTorch ExecuTorch format
71+ "*.pt2" , // PyTorch 2.0 export format
72+ "*.ptl" , // PyTorch Mobile format
5973
6074 // TensorFlow formats.
6175 "*.tflite" , // TensorFlow Lite
6276 "*.h5" , // Keras HDF5 format
6377 "*.hdf" , // Hierarchical Data Format
6478 "*.hdf5" , // HDF5 (alternative extension)
79+ "*.pb" , // TensorFlow SavedModel/Frozen Graph
80+ "*.meta" , // TensorFlow checkpoint metadata
81+ "*.data-*" , // TensorFlow checkpoint data files
82+ "*.index" , // TensorFlow checkpoint index
83+
84+ // GGML formats.
85+ "*.gguf" , // GGML Universal Format
86+ "*.ggml" , // GGML format (legacy)
87+ "*.ggmf" , // GGMF format (deprecated)
88+ "*.ggjt" , // GGJT format (deprecated)
89+ "*.q4_0" , // GGML Q4_0 quantization
90+ "*.q4_1" , // GGML Q4_1 quantization
91+ "*.q5_0" , // GGML Q5_0 quantization
92+ "*.q5_1" , // GGML Q5_1 quantization
93+ "*.q8_0" , // GGML Q8_0 quantization
94+ "*.f16" , // GGML F16 format
95+ "*.f32" , // GGML F32 format
96+
97+ // checkpoint formats.
98+ "*.ckpt" , // Checkpoint format
99+ "*.checkpoint" , // Checkpoint format (alternative extension)
100+ "*.dist_ckpt" , // Distributed checkpoint format
101+
102+ // Semantics-specific formats
103+ "*.tensor" , // Generic tensor format
104+ "*.weights" , // Generic weights format
105+ "*.state" , // State files
106+ "*.embedding" , // Embedding files
107+ "*.vocab" , // Vocabulary files (when binary)
65108
66109 // Other ML frameworks.
67110 "*.ot" , // OpenVINO format
68111 "*.engine" , // TensorRT format
69112 "*.trt" , // TensorRT format (alternative extension)
70113 "*.onnx" , // Open Neural Network Exchange format
71- "*.gguf" , // GGML Universal Format
72114 "*.msgpack" , // MessagePack serialization
73115 "*.model" , // Some NLP frameworks
74116 "*.pkl" , // Pickle format
75117 "*.pickle" , // Pickle format (alternative extension)
76- "*.ckpt" , // Checkpoint format
77- "*.checkpoint" , // Checkpoint format (alternative extension)
118+ "*.keras" , // Keras native format
119+ "*.joblib" , // Joblib serialization (scikit-learn)
120+ "*.npy" , // NumPy array format
121+ "*.npz" , // NumPy compressed archive
122+ "*.nc" , // NetCDF format
123+ "*.mlmodel" , // Apple Core ML format
124+ "*.coreml" , // Apple Core ML format (alternative)
125+ "*.mleap" , // MLeap format (Spark ML)
126+ "*.surml" , // SurrealML format
127+ "*.llamafile" , // Llamafile format
128+ "*.caffemodel" , // Caffe model format
129+ "*.prototxt" , // Caffe model definition
130+ "*.dlc" , // Qualcomm Deep Learning Container
131+ "*.circle" , // Samsung Circle format
132+ "*.nb" , // Neural Network Binary format
78133 }
79134
80135 // Code file patterns - supported script and notebook files.
81136 CodeFilePatterns = []string {
137+ // language source files
82138 "*.py" , // Python source files
83139 "*.ipynb" , // Jupyter notebooks
84140 "*.sh" , // Shell scripts
@@ -88,42 +144,137 @@ var (
88144 "*.hxx" , // C++ header files
89145 "*.cpp" , // C++ source files
90146 "*.cc" , // C++ source files
147+ "*.cxx" , // C++ source files (alternative)
148+ "*.c++" , // C++ source files (alternative)
91149 "*.hpp" , // C++ header files
92150 "*.hh" , // C++ header files
151+ "*.h++" , // C++ header files (alternative)
93152 "*.java" , // Java source files
94153 "*.js" , // JavaScript source files
154+ "*.mjs" , // JavaScript ES6 modules
155+ "*.cjs" , // CommonJS modules
156+ "*.jsx" , // React JSX files
95157 "*.ts" , // TypeScript source files
158+ "*.tsx" , // TypeScript JSX files
96159 "*.go" , // Go source files
97160 "*.rs" , // Rust source files
98161 "*.swift" , // Swift source files
99162 "*.rb" , // Ruby source files
100163 "*.php" , // PHP source files
101164 "*.scala" , // Scala source files
102165 "*.kt" , // Kotlin source files
166+ "*.kts" , // Kotlin script files
103167 "*.r" , // R source files
168+ "*.R" , // R source files (alternative)
104169 "*.m" , // MATLAB/Objective-C source files
170+ "*.mm" , // Objective-C++ source files
105171 "*.f" , // Fortran source files
106172 "*.f90" , // Fortran 90 source files
173+ "*.f95" , // Fortran 95 source files
174+ "*.f03" , // Fortran 2003 source files
175+ "*.f08" , // Fortran 2008 source files
107176 "*.jl" , // Julia source files
108177 "*.lua" , // Lua source files
109178 "*.pl" , // Perl source files
179+ "*.pm" , // Perl modules
110180 "*.cs" , // C# source files
111181 "*.vb" , // Visual Basic source files
112182 "*.dart" , // Dart source files
113183 "*.groovy" , // Groovy source files
114184 "*.elm" , // Elm source files
115185 "*.erl" , // Erlang source files
186+ "*.hrl" , // Erlang header files
116187 "*.ex" , // Elixir source files
188+ "*.exs" , // Elixir script files
117189 "*.hs" , // Haskell source files
190+ "*.lhs" , // Literate Haskell source files
118191 "*.clj" , // Clojure source files
119192 "*.cljs" , // ClojureScript source files
120- "*.cljc" , // Clojure Common Lisp source files
193+ "*.cljc" , // Clojure Common source files
121194 "*.cl" , // Common Lisp source files
122195 "*.lisp" , // Lisp source files
196+ "*.lsp" , // Lisp source files (alternative)
123197 "*.scm" , // Scheme source files
198+ "*.ss" , // Scheme source files (alternative)
199+ "*.rkt" , // Racket source files
200+ "*.sql" , // SQL files
201+ "*.psql" , // PostgreSQL files
202+ "*.mysql" , // MySQL files
203+ "*.sqlite" , // SQLite files
204+ "*.zig" , // Zig source files
124205 "*.cu" , // CUDA source files
125206 "*.cuh" , // CUDA header files
126207
208+ // Scripting and automation
209+ "*.bash" , // Bash scripts
210+ "*.zsh" , // Zsh scripts
211+ "*.fish" , // Fish shell scripts
212+ "*.csh" , // C shell scripts
213+ "*.tcsh" , // TC shell scripts
214+ "*.ksh" , // Korn shell scripts
215+ "*.ps1" , // PowerShell scripts
216+ "*.psm1" , // PowerShell modules
217+ "*.psd1" , // PowerShell data files
218+ "*.bat" , // Windows batch files
219+ "*.cmd" , // Windows command files
220+ "*.vbs" , // VBScript files
221+ "*.wsf" , // Windows Script Files
222+ "*.applescript" , // AppleScript files
223+ "*.scpt" , // AppleScript compiled files
224+ "*.awk" , // AWK scripts
225+ "*.sed" , // sed scripts
226+ "*.expect" , // Expect scripts
227+
228+ // Build and project files
229+ "*.env" , // Environment variable files
230+ "*.env.*" , // Environment files with suffixes
231+ ".env*" , // Environment files (hidden)
232+ "Makefile*" , // Makefile variants
233+ "*.dockerfile" , // Dockerfile configurations
234+ "Dockerfile*" , // Dockerfile variants
235+ "*.mk" , // Make include files
236+ "*.cmake" , // CMake files
237+ "CMakeLists.txt" , // CMake configuration
238+ "*.gradle" , // Gradle build files
239+ "*.gradle.kts" , // Kotlin DSL Gradle files
240+ "build.gradle*" , // Gradle build files
241+ "settings.gradle*" , // Gradle settings files
242+ "*.sbt" , // SBT build files
243+ "*.mill" , // Mill build files
244+ "*.bazel" , // Bazel build files
245+ "*.bzl" , // Bazel extension files
246+ "BUILD*" , // Bazel BUILD files
247+ "WORKSPACE*" , // Bazel WORKSPACE files
248+ "*.buck" , // Buck build files
249+ "BUCK*" , // Buck BUILD files
250+ "*.ninja" , // Ninja build files
251+ "*.gyp" , // GYP build files
252+ "*.gypi" , // GYP include files
253+ "*.waf" , // Waf build files
254+ "wscript*" , // Waf build scripts
255+ "package.json" , // Node.js package file
256+ "package-lock.json" , // Node.js lock file
257+ "yarn.lock" , // Yarn lock file
258+ "pnpm-lock.yaml" , // PNPM lock file
259+ "requirements*.txt" , // Python requirements
260+ "Pipfile*" , // Python Pipenv files
261+ "pyproject.toml" , // Python project configuration
262+ "setup.cfg" , // Python setup configuration
263+ "tox.ini" , // Python tox configuration
264+ "poetry.lock" , // Python Poetry lock file
265+ "Cargo.toml" , // Rust package configuration
266+ "Cargo.lock" , // Rust lock file
267+ "go.mod" , // Go module file
268+ "go.sum" , // Go checksum file
269+ "composer.json" , // PHP Composer file
270+ "composer.lock" , // PHP Composer lock file
271+ "Gemfile*" , // Ruby Gemfile
272+ "*.gemspec" , // Ruby gem specification
273+ "mix.exs" , // Elixir Mix file
274+ "mix.lock" , // Elixir Mix lock file
275+ "rebar.config" , // Erlang Rebar config
276+ "rebar.lock" , // Erlang Rebar lock file
277+
127278 // Library files.
128279 "*.so" , // Shared object files
129280 "*.dll" , // Dynamic Link Library
@@ -144,6 +295,93 @@ var (
144295 "*requirements*" , // Dependency specifications
145296 "*.log" , // Log files
146297
298+ // Office documents
299+ "*.doc" , // Microsoft Word 97-2003 Document
300+ "*.docx" , // Microsoft Word Document
301+ "*.docm" , // Word Macro-Enabled Document
302+ "*.dot" , // Word 97-2003 Template
303+ "*.dotx" , // Word Template
304+ "*.dotm" , // Word Macro-Enabled Template
305+ "*.rtf" , // Rich Text Format
306+ "*.odt" , // OpenDocument Text
307+ "*.ott" , // OpenDocument Text Template
308+ "*.fodt" , // Flat OpenDocument Text
309+ "*.pages" , // Apple Pages document
310+ "*.wpd" , // WordPerfect document
311+
312+ // Spreadsheet documents
313+ "*.xls" , // Microsoft Excel 97-2003 Workbook
314+ "*.xlsx" , // Microsoft Excel Workbook
315+ "*.xlsm" , // Excel Macro-Enabled Workbook
316+ "*.xlsb" , // Excel Binary Workbook
317+ "*.xlt" , // Excel 97-2003 Template
318+ "*.xltx" , // Excel Template
319+ "*.xltm" , // Excel Macro-Enabled Template
320+ "*.ods" , // OpenDocument Spreadsheet
321+ "*.ots" , // OpenDocument Spreadsheet Template
322+ "*.fods" , // Flat OpenDocument Spreadsheet
323+ "*.numbers" , // Apple Numbers spreadsheet
324+ "*.csv" , // Comma-Separated Values
325+
326+ // Presentation documents
327+ "*.ppt" , // Microsoft PowerPoint 97-2003 Presentation
328+ "*.pptx" , // Microsoft PowerPoint Presentation
329+ "*.pptm" , // PowerPoint Macro-Enabled Presentation
330+ "*.pps" , // PowerPoint 97-2003 Show
331+ "*.ppsx" , // PowerPoint Show
332+ "*.ppsm" , // PowerPoint Macro-Enabled Show
333+ "*.pot" , // PowerPoint 97-2003 Template
334+ "*.potx" , // PowerPoint Template
335+ "*.potm" , // PowerPoint Macro-Enabled Template
336+ "*.odp" , // OpenDocument Presentation
337+ "*.otp" , // OpenDocument Presentation Template
338+ "*.fodp" , // Flat OpenDocument Presentation
339+ "*.key" , // Apple Keynote presentation
340+
341+ // eBook formats
342+ "*.epub" , // Electronic Publication
343+ "*.mobi" , // Mobipocket eBook
344+ "*.azw" , // Amazon Kindle eBook
345+ "*.azw3" , // Amazon Kindle eBook (KF8)
346+ "*.fb2" , // FictionBook 2.0
347+ "*.fb3" , // FictionBook 3.0
348+ "*.lit" , // Microsoft Literature
349+ "*.pdb" , // Palm Database/Document File
350+ "*.djvu" , // DjVu document
351+ "*.djv" , // DjVu document (alternative extension)
352+
353+ // Web and markup documents
354+ "*.html" , // HyperText Markup Language
355+ "*.htm" , // HyperText Markup Language (alternative)
356+ "*.xhtml" , // Extensible HyperText Markup Language
357+ "*.mhtml" , // MIME HTML (Web Archive)
358+ "*.mht" , // MIME HTML (Web Archive, alternative)
359+ "*.xml" , // eXtensible Markup Language
360+ "*.xsl" , // eXtensible Stylesheet Language
361+ "*.xslt" , // XSL Transformations
362+
363+ // Technical documentation formats
364+ "*.tex" , // LaTeX document
365+ "*.latex" , // LaTeX document (alternative)
366+ "*.ltx" , // LaTeX document (alternative)
367+ "*.bib" , // BibTeX bibliography
368+ "*.rst" , // reStructuredText
369+ "*.asciidoc" , // AsciiDoc
370+ "*.adoc" , // AsciiDoc (alternative)
371+ "*.textile" , // Textile markup
372+ "*.wiki" , // Wiki markup
373+ "*.mediawiki" , // MediaWiki markup
374+ "*.org" , // Org-mode document
375+ "*.texi" , // Texinfo document
376+ "*.texinfo" , // Texinfo document (alternative)
377+ "*.info" , // GNU Info document
378+ "*.man" , // Manual page
379+
380+ // Archive and compressed documents
381+ "*.chm" , // Compiled HTML Help
382+ "*.hlp" , // Windows Help File
383+ "*.xps" , // XML Paper Specification
384+
147385 // Image assets.
148386 "*.jpg" , // JPEG image format
149387 "*.jpeg" , // JPEG alternative extension
@@ -182,6 +420,14 @@ var (
182420 }
183421)
184422
423+ const (
424+ // File size thresholds and workspace limits
425+ WeightFileSizeThreshold int64 = 128 * humanize .MByte // 128MB - threshold for considering file as weight file
426+ MaxSingleFileSize int64 = 128 * humanize .GByte // 128GB - maximum size for a single file
427+ MaxWorkspaceFileCount int = 2048 // 2048 files - maximum number of files in workspace
428+ MaxTotalWorkspaceSize int64 = 8 * humanize .TByte // 8TB - maximum total workspace size
429+ )
430+
185431// IsFileType checks if the filename matches any of the given patterns
186432func IsFileType (filename string , patterns []string ) bool {
187433 // Convert filename to lowercase for case-insensitive comparison
@@ -216,3 +462,13 @@ func isSkippable(filename string) bool {
216462
217463 return false
218464}
465+
466+ // For large unknown file type, usually it is a weight file.
467+ func SizeShouldBeWeightFile (size int64 ) bool {
468+ return size > WeightFileSizeThreshold
469+ }
470+
471+ // formatBytes converts byte size to human-readable format using go-humanize
472+ func formatBytes (bytes int64 ) string {
473+ return humanize .Bytes (uint64 (bytes ))
474+ }
0 commit comments