Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cmd/modelfile/generate.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ func init() {
flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "ignore the unrecognized file types in the workspace")
flags.BoolVar(&generateConfig.Overwrite, "overwrite", false, "overwrite the existing modelfile")

// Mark the ignore-unrecognized-file-types flag as deprecated and hidden
flags.MarkDeprecated("ignore-unrecognized-file-types", "this flag will be removed in the next release")
flags.MarkHidden("ignore-unrecognized-file-types")

if err := viper.BindPFlags(flags); err != nil {
panic(fmt.Errorf("bind cache list flags to viper: %w", err))
}
Expand Down
2 changes: 1 addition & 1 deletion pkg/config/modelfile/modelfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ type GenerateConfig struct {
Name string
Version string
Output string
IgnoreUnrecognizedFileTypes bool
IgnoreUnrecognizedFileTypes bool // [deprecated] will be removed in the next release
Overwrite bool
Arch string
Family string
Expand Down
264 changes: 260 additions & 4 deletions pkg/modelfile/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,17 @@ package modelfile
import (
"path/filepath"
"strings"

"github.com/dustin/go-humanize"
)

var (
// Config file patterns - supported configuration file extensions.
ConfigFilePatterns = []string{
"*.json", // JSON configuration files
"*.jsonl", // JSON Lines format
"*.json5", // JSON5 files
"*.jsonc", // JSON with comments
"*.yaml", // YAML configuration files
"*.yml", // YAML alternative extension
"*.toml", // TOML configuration files
Expand All @@ -45,6 +49,12 @@ var (
"*.meta", // Model metadata
"*tokenizer.model*", // Tokenizer files (e.g., Mistral v3)
"config.json.*", // Model configuration variants
"*.hparams", // Hyperparameter files
"*.params", // Parameter files
"*.hyperparams", // Hyperparameter configuration
"*.wandb", // Weights & Biases configuration
"*.mlflow", // MLflow configuration
"*.tensorboard", // TensorBoard configuration
}

// Model file patterns - supported model file extensions.
Expand All @@ -56,29 +66,75 @@ var (
"*.bin", // General binary format
"*.pt", // PyTorch model
"*.pth", // PyTorch model (alternative extension)
"*.mar", // PyTorch Model Archive
"*.pte", // PyTorch ExecuTorch format
"*.pt2", // PyTorch 2.0 export format
"*.ptl", // PyTorch Mobile format

// TensorFlow formats.
"*.tflite", // TensorFlow Lite
"*.h5", // Keras HDF5 format
"*.hdf", // Hierarchical Data Format
"*.hdf5", // HDF5 (alternative extension)
"*.pb", // TensorFlow SavedModel/Frozen Graph
"*.meta", // TensorFlow checkpoint metadata
"*.data-*", // TensorFlow checkpoint data files
"*.index", // TensorFlow checkpoint index

// GGML formats.
"*.gguf", // GGML Universal Format
"*.ggml", // GGML format (legacy)
"*.ggmf", // GGMF format (deprecated)
"*.ggjt", // GGJT format (deprecated)
"*.q4_0", // GGML Q4_0 quantization
"*.q4_1", // GGML Q4_1 quantization
"*.q5_0", // GGML Q5_0 quantization
"*.q5_1", // GGML Q5_1 quantization
"*.q8_0", // GGML Q8_0 quantization
"*.f16", // GGML F16 format
"*.f32", // GGML F32 format

// checkpoint formats.
"*.ckpt", // Checkpoint format
"*.checkpoint", // Checkpoint format (alternative extension)
"*.dist_ckpt", // Distributed checkpoint format

// Semantics-specific formats
"*.tensor", // Generic tensor format
"*.weights", // Generic weights format
"*.state", // State files
"*.embedding", // Embedding files
"*.vocab", // Vocabulary files (when binary)

// Other ML frameworks.
"*.ot", // OpenVINO format
"*.engine", // TensorRT format
"*.trt", // TensorRT format (alternative extension)
"*.onnx", // Open Neural Network Exchange format
"*.gguf", // GGML Universal Format
"*.msgpack", // MessagePack serialization
"*.model", // Some NLP frameworks
"*.pkl", // Pickle format
"*.pickle", // Pickle format (alternative extension)
"*.ckpt", // Checkpoint format
"*.checkpoint", // Checkpoint format (alternative extension)
"*.keras", // Keras native format
"*.joblib", // Joblib serialization (scikit-learn)
"*.npy", // NumPy array format
"*.npz", // NumPy compressed archive
"*.nc", // NetCDF format
"*.mlmodel", // Apple Core ML format
"*.coreml", // Apple Core ML format (alternative)
"*.mleap", // MLeap format (Spark ML)
"*.surml", // SurrealML format
"*.llamafile", // Llamafile format
"*.caffemodel", // Caffe model format
"*.prototxt", // Caffe model definition
"*.dlc", // Qualcomm Deep Learning Container
"*.circle", // Samsung Circle format
"*.nb", // Neural Network Binary format
}

// Code file patterns - supported script and notebook files.
CodeFilePatterns = []string{
// language source files
"*.py", // Python source files
"*.ipynb", // Jupyter notebooks
"*.sh", // Shell scripts
Expand All @@ -88,42 +144,137 @@ var (
"*.hxx", // C++ header files
"*.cpp", // C++ source files
"*.cc", // C++ source files
"*.cxx", // C++ source files (alternative)
"*.c++", // C++ source files (alternative)
"*.hpp", // C++ header files
"*.hh", // C++ header files
"*.h++", // C++ header files (alternative)
"*.java", // Java source files
"*.js", // JavaScript source files
"*.mjs", // JavaScript ES6 modules
"*.cjs", // CommonJS modules
"*.jsx", // React JSX files
"*.ts", // TypeScript source files
"*.tsx", // TypeScript JSX files
"*.go", // Go source files
"*.rs", // Rust source files
"*.swift", // Swift source files
"*.rb", // Ruby source files
"*.php", // PHP source files
"*.scala", // Scala source files
"*.kt", // Kotlin source files
"*.kts", // Kotlin script files
"*.r", // R source files
"*.R", // R source files (alternative)
"*.m", // MATLAB/Objective-C source files
"*.mm", // Objective-C++ source files
"*.f", // Fortran source files
"*.f90", // Fortran 90 source files
"*.f95", // Fortran 95 source files
"*.f03", // Fortran 2003 source files
"*.f08", // Fortran 2008 source files
"*.jl", // Julia source files
"*.lua", // Lua source files
"*.pl", // Perl source files
"*.pm", // Perl modules
"*.cs", // C# source files
"*.vb", // Visual Basic source files
"*.dart", // Dart source files
"*.groovy", // Groovy source files
"*.elm", // Elm source files
"*.erl", // Erlang source files
"*.hrl", // Erlang header files
"*.ex", // Elixir source files
"*.exs", // Elixir script files
"*.hs", // Haskell source files
"*.lhs", // Literate Haskell source files
"*.clj", // Clojure source files
"*.cljs", // ClojureScript source files
"*.cljc", // Clojure Common Lisp source files
"*.cljc", // Clojure Common source files
"*.cl", // Common Lisp source files
"*.lisp", // Lisp source files
"*.lsp", // Lisp source files (alternative)
"*.scm", // Scheme source files
"*.ss", // Scheme source files (alternative)
"*.rkt", // Racket source files
"*.sql", // SQL files
"*.psql", // PostgreSQL files
"*.mysql", // MySQL files
"*.sqlite", // SQLite files
"*.zig", // Zig source files
"*.cu", // CUDA source files
"*.cuh", // CUDA header files

// Scripting and automation
"*.bash", // Bash scripts
"*.zsh", // Zsh scripts
"*.fish", // Fish shell scripts
"*.csh", // C shell scripts
"*.tcsh", // TC shell scripts
"*.ksh", // Korn shell scripts
"*.ps1", // PowerShell scripts
"*.psm1", // PowerShell modules
"*.psd1", // PowerShell data files
"*.bat", // Windows batch files
"*.cmd", // Windows command files
"*.vbs", // VBScript files
"*.wsf", // Windows Script Files
"*.applescript", // AppleScript files
"*.scpt", // AppleScript compiled files
"*.awk", // AWK scripts
"*.sed", // sed scripts
"*.expect", // Expect scripts

// Build and project files
"*.env", // Environment variable files
"*.env.*", // Environment files with suffixes
".env*", // Environment files (hidden)
"Makefile*", // Makefile variants
"*.dockerfile", // Dockerfile configurations
"Dockerfile*", // Dockerfile variants
"*.mk", // Make include files
"*.cmake", // CMake files
"CMakeLists.txt", // CMake configuration
"*.gradle", // Gradle build files
"*.gradle.kts", // Kotlin DSL Gradle files
"build.gradle*", // Gradle build files
"settings.gradle*", // Gradle settings files
"*.sbt", // SBT build files
"*.mill", // Mill build files
"*.bazel", // Bazel build files
"*.bzl", // Bazel extension files
"BUILD*", // Bazel BUILD files
"WORKSPACE*", // Bazel WORKSPACE files
"*.buck", // Buck build files
"BUCK*", // Buck BUILD files
"*.ninja", // Ninja build files
"*.gyp", // GYP build files
"*.gypi", // GYP include files
"*.waf", // Waf build files
"wscript*", // Waf build scripts
"package.json", // Node.js package file
"package-lock.json", // Node.js lock file
"yarn.lock", // Yarn lock file
"pnpm-lock.yaml", // PNPM lock file
"requirements*.txt", // Python requirements
"Pipfile*", // Python Pipenv files
"pyproject.toml", // Python project configuration
"setup.cfg", // Python setup configuration
"tox.ini", // Python tox configuration
"poetry.lock", // Python Poetry lock file
"Cargo.toml", // Rust package configuration
"Cargo.lock", // Rust lock file
"go.mod", // Go module file
"go.sum", // Go checksum file
"composer.json", // PHP Composer file
"composer.lock", // PHP Composer lock file
"Gemfile*", // Ruby Gemfile
"*.gemspec", // Ruby gem specification
"mix.exs", // Elixir Mix file
"mix.lock", // Elixir Mix lock file
"rebar.config", // Erlang Rebar config
"rebar.lock", // Erlang Rebar lock file

// Library files.
"*.so", // Shared object files
"*.dll", // Dynamic Link Library
Expand All @@ -144,6 +295,93 @@ var (
"*requirements*", // Dependency specifications
"*.log", // Log files

// Office documents
"*.doc", // Microsoft Word 97-2003 Document
"*.docx", // Microsoft Word Document
"*.docm", // Word Macro-Enabled Document
"*.dot", // Word 97-2003 Template
"*.dotx", // Word Template
"*.dotm", // Word Macro-Enabled Template
"*.rtf", // Rich Text Format
"*.odt", // OpenDocument Text
"*.ott", // OpenDocument Text Template
"*.fodt", // Flat OpenDocument Text
"*.pages", // Apple Pages document
"*.wpd", // WordPerfect document

// Spreadsheet documents
"*.xls", // Microsoft Excel 97-2003 Workbook
"*.xlsx", // Microsoft Excel Workbook
"*.xlsm", // Excel Macro-Enabled Workbook
"*.xlsb", // Excel Binary Workbook
"*.xlt", // Excel 97-2003 Template
"*.xltx", // Excel Template
"*.xltm", // Excel Macro-Enabled Template
"*.ods", // OpenDocument Spreadsheet
"*.ots", // OpenDocument Spreadsheet Template
"*.fods", // Flat OpenDocument Spreadsheet
"*.numbers", // Apple Numbers spreadsheet
"*.csv", // Comma-Separated Values

// Presentation documents
"*.ppt", // Microsoft PowerPoint 97-2003 Presentation
"*.pptx", // Microsoft PowerPoint Presentation
"*.pptm", // PowerPoint Macro-Enabled Presentation
"*.pps", // PowerPoint 97-2003 Show
"*.ppsx", // PowerPoint Show
"*.ppsm", // PowerPoint Macro-Enabled Show
"*.pot", // PowerPoint 97-2003 Template
"*.potx", // PowerPoint Template
"*.potm", // PowerPoint Macro-Enabled Template
"*.odp", // OpenDocument Presentation
"*.otp", // OpenDocument Presentation Template
"*.fodp", // Flat OpenDocument Presentation
"*.key", // Apple Keynote presentation

// eBook formats
"*.epub", // Electronic Publication
"*.mobi", // Mobipocket eBook
"*.azw", // Amazon Kindle eBook
"*.azw3", // Amazon Kindle eBook (KF8)
"*.fb2", // FictionBook 2.0
"*.fb3", // FictionBook 3.0
"*.lit", // Microsoft Literature
"*.pdb", // Palm Database/Document File
"*.djvu", // DjVu document
"*.djv", // DjVu document (alternative extension)

// Web and markup documents
"*.html", // HyperText Markup Language
"*.htm", // HyperText Markup Language (alternative)
"*.xhtml", // Extensible HyperText Markup Language
"*.mhtml", // MIME HTML (Web Archive)
"*.mht", // MIME HTML (Web Archive, alternative)
"*.xml", // eXtensible Markup Language
"*.xsl", // eXtensible Stylesheet Language
"*.xslt", // XSL Transformations

// Technical documentation formats
"*.tex", // LaTeX document
"*.latex", // LaTeX document (alternative)
"*.ltx", // LaTeX document (alternative)
"*.bib", // BibTeX bibliography
"*.rst", // reStructuredText
"*.asciidoc", // AsciiDoc
"*.adoc", // AsciiDoc (alternative)
"*.textile", // Textile markup
"*.wiki", // Wiki markup
"*.mediawiki", // MediaWiki markup
"*.org", // Org-mode document
"*.texi", // Texinfo document
"*.texinfo", // Texinfo document (alternative)
"*.info", // GNU Info document
"*.man", // Manual page

// Archive and compressed documents
"*.chm", // Compiled HTML Help
"*.hlp", // Windows Help File
"*.xps", // XML Paper Specification

// Image assets.
"*.jpg", // JPEG image format
"*.jpeg", // JPEG alternative extension
Expand Down Expand Up @@ -182,6 +420,14 @@ var (
}
)

// File size thresholds and workspace limits.
//
// Byte sizes are expressed with go-humanize's MByte/GByte/TByte constants,
// which are the decimal (SI, powers of 1000) units rather than the binary
// MiByte/GiByte/TiByte ones.
const (
	// WeightFileSizeThreshold is the size (128MB) above which a file is
	// considered a weight file.
	WeightFileSizeThreshold int64 = 128 * humanize.MByte

	// MaxSingleFileSize is the maximum size (128GB) for a single file.
	MaxSingleFileSize int64 = 128 * humanize.GByte

	// MaxTotalWorkspaceSize is the maximum total size (8TB) of a workspace.
	MaxTotalWorkspaceSize int64 = 8 * humanize.TByte

	// MaxWorkspaceFileCount is the maximum number of files (2048) in a
	// workspace.
	MaxWorkspaceFileCount int = 2048
)

// IsFileType checks if the filename matches any of the given patterns
func IsFileType(filename string, patterns []string) bool {
// Convert filename to lowercase for case-insensitive comparison
Expand Down Expand Up @@ -216,3 +462,13 @@ func isSkippable(filename string) bool {

return false
}

// SizeShouldBeWeightFile reports whether size is strictly greater than
// WeightFileSizeThreshold. A large file of an unknown type is usually a
// weight file, so callers can use this as a size-based heuristic.
func SizeShouldBeWeightFile(size int64) bool {
	exceedsThreshold := WeightFileSizeThreshold < size
	return exceedsThreshold
}

// formatBytes converts a byte count to a human-readable string using
// go-humanize (for example "128 MB").
//
// humanize.Bytes takes a uint64, so converting a negative count directly
// would wrap around to an enormous positive value. A negative input is
// therefore formatted by its magnitude with a leading minus sign.
func formatBytes(bytes int64) string {
	if bytes < 0 {
		// uint64(-bytes) is also the correct magnitude for math.MinInt64:
		// the negation overflows back to MinInt64, whose bit pattern
		// converts to 1<<63.
		return "-" + humanize.Bytes(uint64(-bytes))
	}
	return humanize.Bytes(uint64(bytes))
}
Loading