From 4a1468e03dabb1875d3fc811821becd9ccf97773 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 17:01:36 +0800 Subject: [PATCH 01/13] feat: support handle unknown file type Signed-off-by: Zhao Chen --- cmd/modelfile/generate.go | 2 +- pkg/config/modelfile/modelfile.go | 4 +- pkg/modelfile/constants.go | 8 + pkg/modelfile/modelfile.go | 52 +++++- pkg/modelfile/modelfile_test.go | 289 +++++++++++++++++++++--------- 5 files changed, 255 insertions(+), 100 deletions(-) diff --git a/cmd/modelfile/generate.go b/cmd/modelfile/generate.go index 985c88c7..3cad9361 100644 --- a/cmd/modelfile/generate.go +++ b/cmd/modelfile/generate.go @@ -62,7 +62,7 @@ func init() { flags.StringVar(&generateConfig.Precision, "precision", "", "specify model precision, such as bf16, fp16, int8, etc") flags.StringVar(&generateConfig.Quantization, "quantization", "", "specify model quantization, such as awq, gptq, etc") flags.StringVarP(&generateConfig.Output, "output", "O", ".", "specify the output path of modelfilem, must be a directory") - flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "ignore the unrecognized file types in the workspace") + flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "[deprecated] ignore the unrecognized file types in the workspace") flags.BoolVar(&generateConfig.Overwrite, "overwrite", false, "overwrite the existing modelfile") if err := viper.BindPFlags(flags); err != nil { diff --git a/pkg/config/modelfile/modelfile.go b/pkg/config/modelfile/modelfile.go index 2f1251a3..fe1c6f50 100644 --- a/pkg/config/modelfile/modelfile.go +++ b/pkg/config/modelfile/modelfile.go @@ -31,7 +31,7 @@ type GenerateConfig struct { Name string Version string Output string - IgnoreUnrecognizedFileTypes bool + IgnoreUnrecognizedFileTypes bool // [deprecated] will be removed in the next release Overwrite bool Arch string Family string @@ -83,7 +83,7 @@ func (g *GenerateConfig) Validate() error { // If the output path does not exist, we can create the modelfile. if _, err := os.Stat(g.Output); err == nil { if !g.Overwrite { - return fmt.Errorf("Modelfile already exists at %s - use --overwrite to overwrite", g.Output) + return fmt.Errorf("modelfile already exists at %s - use --overwrite to overwrite", g.Output) } } diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 92d07aca..86c9ee95 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -180,6 +180,9 @@ var ( "*.pyo", // Python optimized bytecode "*.pyd", // Python dynamic modules } + + // Large file size threshold + WeightFileSizeThreshold int64 = 128 * 1024 * 1024 ) // IsFileType checks if the filename matches any of the given patterns @@ -216,3 +219,8 @@ func isSkippable(filename string) bool { return false } + +// For large unknown file type, usually it is a weight file. +func SizeShouldBeWeightFile(size int64) bool { + return size > WeightFileSizeThreshold +} diff --git a/pkg/modelfile/modelfile.go b/pkg/modelfile/modelfile.go index edccb59d..a4273c90 100644 --- a/pkg/modelfile/modelfile.go +++ b/pkg/modelfile/modelfile.go @@ -205,7 +205,11 @@ func NewModelfileByWorkspace(workspace string, config *configmodelfile.GenerateC doc: hashset.New(), } - if err := mf.generateByWorkspace(config.IgnoreUnrecognizedFileTypes); err != nil { + if err := mf.validateWorkspace(); err != nil { + return nil, err + } + + if err := mf.generateByWorkspace(); err != nil { return nil, err } @@ -217,8 +221,38 @@ func NewModelfileByWorkspace(workspace string, config *configmodelfile.GenerateC return mf, nil } +// validateWorkspace validates the workspace directory +func (mf *modelfile) validateWorkspace() error { + // check if the workspace is a directory, symbolic link, or empty + info, err := os.Lstat(mf.workspace) + if err != nil { + return fmt.Errorf("access to workspace failed: %s", err) + } + + // check if the workspace is a symbolic link + if info.Mode()&os.ModeSymlink != 0 { + return fmt.Errorf("for simplicity, the workspace should not be a symbolic link: %s", mf.workspace) + } + + // check if the workspace is a directory + if !info.IsDir() { + return fmt.Errorf("the workspace is not a directory: %s", mf.workspace) + } + + // check if the workspace is empty by reading directory contents + entries, err := os.ReadDir(mf.workspace) + if err != nil { + return fmt.Errorf("failed to read workspace directory: %s", err) + } + if len(entries) == 0 { + return fmt.Errorf("the workspace is empty: %s", mf.workspace) + } + + return nil +} + // generateByWorkspace generates the modelfile by the workspace's files. -func (mf *modelfile) generateByWorkspace(ignoreUnrecognizedFileTypes bool) error { +func (mf *modelfile) generateByWorkspace() error { // Walk the path and get the files. if err := filepath.Walk(mf.workspace, func(path string, info os.FileInfo, err error) error { if err != nil { @@ -256,12 +290,14 @@ func (mf *modelfile) generateByWorkspace(ignoreUnrecognizedFileTypes bool) error case IsFileType(filename, DocFilePatterns): mf.doc.Add(relPath) default: - // Skip unrecognized files if IgnoreUnrecognizedFileTypes is true. - if ignoreUnrecognizedFileTypes { - return nil + // If the file is large, usually it is a weight file. + if SizeShouldBeWeightFile(info.Size()) { + mf.model.Add(relPath) + } else { + mf.code.Add(relPath) } - return fmt.Errorf("unknown file type: %s", filename) + return nil } return nil @@ -269,8 +305,8 @@ func (mf *modelfile) generateByWorkspace(ignoreUnrecognizedFileTypes bool) error return err } - if mf.model.Size() == 0 { - return fmt.Errorf("no recognized model files found in directory - you may need to edit the Modelfile manually") + if mf.model.Size() == 0 && mf.code.Size() == 0 && mf.dataset.Size() == 0 { + return fmt.Errorf("no model/code/dataset found - you have to create the Modelfile by yourself") } return nil diff --git a/pkg/modelfile/modelfile_test.go b/pkg/modelfile/modelfile_test.go index a4c9a971..657b8ee7 100644 --- a/pkg/modelfile/modelfile_test.go +++ b/pkg/modelfile/modelfile_test.go @@ -269,25 +269,24 @@ NAME bar func TestNewModelfileByWorkspace(t *testing.T) { testcases := []struct { - name string - setupFiles map[string]string - setupDirs []string - configJson map[string]interface{} - genConfigJson map[string]interface{} - config *configmodelfile.GenerateConfig - ignoreUnrecognizedFileType bool - expectError bool - expectConfigs []string - expectModels []string - expectCodes []string - expectDocs []string - expectName string - expectArch string - expectFamily string - expectFormat string - expectParamsize string - expectPrecision string - expectQuantization string + name string + setupFiles map[string]string + setupDirs []string + configJson map[string]interface{} + genConfigJson map[string]interface{} + config *configmodelfile.GenerateConfig + expectError bool + expectConfigs []string + expectModels []string + expectCodes []string + expectDocs []string + expectName string + expectArch string + expectFamily string + expectFormat string + expectParamsize string + expectPrecision string + expectQuantization string }{ { name: "basic case", @@ -302,13 +301,12 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "test-model", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json"}, - expectModels: []string{"model.bin"}, - expectCodes: []string{"model.py", "tokenizer.py"}, - expectDocs: []string{"README.md", "LICENSE"}, - expectName: "test-model", + expectError: false, + expectConfigs: []string{"config.json"}, + expectModels: []string{"model.bin"}, + expectCodes: []string{"model.py", "tokenizer.py"}, + expectDocs: []string{"README.md", "LICENSE"}, + expectName: "test-model", }, { name: "empty workspace", @@ -316,12 +314,11 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "empty-model", }, - ignoreUnrecognizedFileType: false, - expectError: true, - expectConfigs: []string{}, - expectModels: []string{}, - expectCodes: []string{}, - expectName: "empty-model", + expectError: true, + expectConfigs: []string{}, + expectModels: []string{}, + expectCodes: []string{}, + expectName: "empty-model", }, { name: "with config.json values", @@ -337,15 +334,14 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "config-model", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json"}, - expectModels: []string{"model.bin"}, - expectCodes: []string{}, - expectName: "config-model", - expectArch: "transformer", - expectFamily: "llama", - expectPrecision: "float16", + expectError: false, + expectConfigs: []string{"config.json"}, + expectModels: []string{"model.bin"}, + expectCodes: []string{}, + expectName: "config-model", + expectArch: "transformer", + expectFamily: "llama", + expectPrecision: "float16", }, { name: "nested directory structure", @@ -370,8 +366,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "nested-model", }, - ignoreUnrecognizedFileType: false, - expectError: false, + expectError: false, expectConfigs: []string{ "config.json", "docs/config/parameters.yaml", @@ -406,12 +401,11 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "deep-nested", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"level1/config.json"}, - expectModels: []string{"level1/level2/level3/model.bin"}, - expectCodes: []string{"level1/level2/level3/level4/code.py"}, - expectName: "deep-nested", + expectError: false, + expectConfigs: []string{"level1/config.json"}, + expectModels: []string{"level1/level2/level3/model.bin"}, + expectCodes: []string{"level1/level2/level3/level4/code.py"}, + expectName: "deep-nested", }, { name: "hidden files and directories", @@ -429,12 +423,11 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "hidden-test", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json"}, - expectModels: []string{"model.bin"}, - expectCodes: []string{}, - expectName: "hidden-test", + expectError: false, + expectConfigs: []string{"config.json"}, + expectModels: []string{"model.bin"}, + expectCodes: []string{}, + expectName: "hidden-test", }, { name: "multiple config files in directories", @@ -453,16 +446,15 @@ func TestNewModelfileByWorkspace(t *testing.T) { Name: "multi-config", Format: "pytorch", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json", "models/config.json", "models/gen_config.json"}, - expectModels: []string{"model.bin"}, - expectCodes: []string{}, - expectName: "multi-config", - expectArch: "transformer", - expectFamily: "gpt2", - expectFormat: "pytorch", - expectPrecision: "float32", + expectError: false, + expectConfigs: []string{"config.json", "models/config.json", "models/gen_config.json"}, + expectModels: []string{"model.bin"}, + expectCodes: []string{}, + expectName: "multi-config", + expectArch: "transformer", + expectFamily: "gpt2", + expectFormat: "pytorch", + expectPrecision: "float32", }, { name: "special filename characters", @@ -482,8 +474,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "special-chars", }, - ignoreUnrecognizedFileType: false, - expectError: false, + expectError: false, expectConfigs: []string{ "config with spaces.json", "dir-with-hyphens/config.json", @@ -520,8 +511,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "mixed-types", }, - ignoreUnrecognizedFileType: false, - expectError: false, + expectError: false, expectConfigs: []string{ "configs/main.json", "configs/params.yaml", @@ -554,8 +544,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "same-names", }, - ignoreUnrecognizedFileType: false, - expectError: false, + expectError: false, expectConfigs: []string{ "dir1/config.json", "dir2/config.json", @@ -604,8 +593,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { Name: "llama-7b", ParamSize: "7B", }, - ignoreUnrecognizedFileType: false, - expectError: false, + expectError: false, expectConfigs: []string{ "config.json", "generation_config.json", @@ -650,14 +638,13 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "conflict-test", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json", "generation_config.json"}, - expectModels: []string{"model.bin"}, - expectCodes: []string{}, - expectName: "conflict-test", - expectFamily: "llama", - expectPrecision: "float32", + expectError: false, + expectConfigs: []string{"config.json", "generation_config.json"}, + expectModels: []string{"model.bin"}, + expectCodes: []string{}, + expectName: "conflict-test", + expectFamily: "llama", + expectPrecision: "float32", }, { name: "skipping internal directories", @@ -679,12 +666,11 @@ func TestNewModelfileByWorkspace(t *testing.T) { config: &configmodelfile.GenerateConfig{ Name: "skip-test", }, - ignoreUnrecognizedFileType: false, - expectError: false, - expectConfigs: []string{"config.json"}, - expectModels: []string{"normal/model.bin"}, - expectCodes: []string{"valid_dir/model.py"}, - expectName: "skip-test", + expectError: false, + expectConfigs: []string{"config.json"}, + expectModels: []string{"normal/model.bin"}, + expectCodes: []string{"valid_dir/model.py"}, + expectName: "skip-test", }, } @@ -734,7 +720,7 @@ func TestNewModelfileByWorkspace(t *testing.T) { // Set workspace in config tc.config.Workspace = tempDir - tc.config.IgnoreUnrecognizedFileTypes = tc.ignoreUnrecognizedFileType + tc.config.IgnoreUnrecognizedFileTypes = false // Call the function being tested mf, err := NewModelfileByWorkspace(tempDir, tc.config) @@ -1202,6 +1188,131 @@ func createHashSet(items []string) *hashset.Set { for _, item := range items { set.Add(item) } - return set } + +func TestValidateWorkspace(t *testing.T) { + tests := []struct { + name string + setupFunc func() (string, func()) // returns workspace path and cleanup function + expectedError string + }{ + { + name: "valid_directory", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "validate_workspace_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create a test file to make directory non-empty + testFile := filepath.Join(tmpDir, "test.txt") + err = os.WriteFile(testFile, []byte("test content"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file: %v", err) + } + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "", + }, + { + name: "non_existent_directory", + setupFunc: func() (string, func()) { + return "/non/existent/path", func() {} + }, + expectedError: "access to workspace failed:", + }, + { + name: "file_instead_of_directory", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "validate_workspace_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + testFile := filepath.Join(tmpDir, "test.txt") + err = os.WriteFile(testFile, []byte("test content"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file: %v", err) + } + + return testFile, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "the workspace is not a directory:", + }, + { + name: "empty_directory", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "validate_workspace_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "the workspace is empty:", + }, + { + name: "symbolic_link", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "validate_workspace_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create target directory with content + targetDir := filepath.Join(tmpDir, "target") + err = os.Mkdir(targetDir, 0755) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create target dir: %v", err) + } + + testFile := filepath.Join(targetDir, "test.txt") + err = os.WriteFile(testFile, []byte("test content"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file: %v", err) + } + + // Create symbolic link + linkPath := filepath.Join(tmpDir, "link") + err = os.Symlink(targetDir, linkPath) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create symlink: %v", err) + } + + return linkPath, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "for simplicity, the workspace should not be a symbolic link:", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workspace, cleanup := tt.setupFunc() + defer cleanup() + + mf := &modelfile{ + workspace: workspace, + config: hashset.New(), + model: hashset.New(), + code: hashset.New(), + dataset: hashset.New(), + doc: hashset.New(), + } + + err := mf.validateWorkspace() + + if tt.expectedError == "" { + assert.NoError(t, err, "Expected no error for test case: %s", tt.name) + } else { + assert.Error(t, err, "Expected error for test case: %s", tt.name) + assert.Contains(t, err.Error(), tt.expectedError, "Error message should contain expected text for test case: %s", tt.name) + } + }) + } +} From 63c79e6047e00e62a00f1e414988f9611d297005 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 17:14:49 +0800 Subject: [PATCH 02/13] feat: add support for distributed checkpoint file format Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 86c9ee95..bcd886bc 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -75,6 +75,7 @@ var ( "*.pickle", // Pickle format (alternative extension) "*.ckpt", // Checkpoint format "*.checkpoint", // Checkpoint format (alternative extension) + "*.dist_ckpt", // Distributed checkpoint format } // Code file patterns - supported script and notebook files. From 8e9e0b5b495104a7f1dc92e739cd454933dfbba8 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 18:50:33 +0800 Subject: [PATCH 03/13] feat: expand supported file formats for config Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index bcd886bc..45568445 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -26,6 +26,8 @@ var ( ConfigFilePatterns = []string{ "*.json", // JSON configuration files "*.jsonl", // JSON Lines format + "*.json5", // JSON5 files + "*.jsonc", // JSON with comments "*.yaml", // YAML configuration files "*.yml", // YAML alternative extension "*.toml", // TOML configuration files @@ -45,6 +47,12 @@ var ( "*.meta", // Model metadata "*tokenizer.model*", // Tokenizer files (e.g., Mistral v3) "config.json.*", // Model configuration variants + "*.hparams", // Hyperparameter files + "*.params", // Parameter files + "*.hyperparams", // Hyperparameter configuration + "*.wandb", // Weights & Biases configuration + "*.mlflow", // MLflow configuration + "*.tensorboard", // TensorBoard configuration } // Model file patterns - supported model file extensions. From 92339ff9dd63b3e4cadd6682789eaa44c2287577 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 18:51:47 +0800 Subject: [PATCH 04/13] feat: expand supported file formats for weight Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 52 +++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 45568445..0b8dc003 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -64,26 +64,70 @@ var ( "*.bin", // General binary format "*.pt", // PyTorch model "*.pth", // PyTorch model (alternative extension) + "*.mar", // PyTorch Model Archive + "*.pte", // PyTorch ExecuTorch format + "*.pt2", // PyTorch 2.0 export format + "*.ptl", // PyTorch Mobile format // TensorFlow formats. "*.tflite", // TensorFlow Lite "*.h5", // Keras HDF5 format "*.hdf", // Hierarchical Data Format "*.hdf5", // HDF5 (alternative extension) + "*.pb", // TensorFlow SavedModel/Frozen Graph + "*.meta", // TensorFlow checkpoint metadata + "*.data-*", // TensorFlow checkpoint data files + "*.index", // TensorFlow checkpoint index + + // GGML formats. + "*.gguf", // GGML Universal Format + "*.ggml", // GGML format (legacy) + "*.ggmf", // GGMF format (deprecated) + "*.ggjt", // GGJT format (deprecated) + "*.q4_0", // GGML Q4_0 quantization + "*.q4_1", // GGML Q4_1 quantization + "*.q5_0", // GGML Q5_0 quantization + "*.q5_1", // GGML Q5_1 quantization + "*.q8_0", // GGML Q8_0 quantization + "*.f16", // GGML F16 format + "*.f32", // GGML F32 format + + // checkpoint formats. + "*.ckpt", // Checkpoint format + "*.checkpoint", // Checkpoint format (alternative extension) + "*.dist_ckpt", // Distributed checkpoint format + + // Semantics-specific formats + "*.tensor", // Generic tensor format + "*.weights", // Generic weights format + "*.state", // State files + "*.embedding", // Embedding files + "*.vocab", // Vocabulary files (when binary) // Other ML frameworks. "*.ot", // OpenVINO format "*.engine", // TensorRT format "*.trt", // TensorRT format (alternative extension) "*.onnx", // Open Neural Network Exchange format - "*.gguf", // GGML Universal Format "*.msgpack", // MessagePack serialization "*.model", // Some NLP frameworks "*.pkl", // Pickle format "*.pickle", // Pickle format (alternative extension) - "*.ckpt", // Checkpoint format - "*.checkpoint", // Checkpoint format (alternative extension) - "*.dist_ckpt", // Distributed checkpoint format + "*.keras", // Keras native format + "*.joblib", // Joblib serialization (scikit-learn) + "*.npy", // NumPy array format + "*.npz", // NumPy compressed archive + "*.nc", // NetCDF format + "*.mlmodel", // Apple Core ML format + "*.coreml", // Apple Core ML format (alternative) + "*.mleap", // MLeap format (Spark ML) + "*.surml", // SurrealML format + "*.llamafile", // Llamafile format + "*.caffemodel", // Caffe model format + "*.prototxt", // Caffe model definition + "*.dlc", // Qualcomm Deep Learning Container + "*.circle", // Samsung Circle format + "*.nb", // Neural Network Binary format } // Code file patterns - supported script and notebook files. From e32091c6e141e963edb1df5b204a8747b8fb3ef2 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 18:54:38 +0800 Subject: [PATCH 05/13] feat: expand supported file formats for code Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 98 +++++++++++++++++++++++++++++++++++++- 1 file changed, 97 insertions(+), 1 deletion(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 0b8dc003..1557a259 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -132,6 +132,7 @@ var ( // Code file patterns - supported script and notebook files. CodeFilePatterns = []string{ + // language source files "*.py", // Python source files "*.ipynb", // Jupyter notebooks "*.sh", // Shell scripts @@ -141,11 +142,18 @@ var ( "*.hxx", // C++ header files "*.cpp", // C++ source files "*.cc", // C++ source files + "*.cxx", // C++ source files (alternative) + "*.c++", // C++ source files (alternative) "*.hpp", // C++ header files "*.hh", // C++ header files + "*.h++", // C++ header files (alternative) "*.java", // Java source files "*.js", // JavaScript source files + "*.mjs", // JavaScript ES6 modules + "*.cjs", // CommonJS modules + "*.jsx", // React JSX files "*.ts", // TypeScript source files + "*.tsx", // TypeScript JSX files "*.go", // Go source files "*.rs", // Rust source files "*.swift", // Swift source files @@ -153,30 +161,118 @@ var ( "*.php", // PHP source files "*.scala", // Scala source files "*.kt", // Kotlin source files + "*.kts", // Kotlin script files "*.r", // R source files + "*.R", // R source files (alternative) "*.m", // MATLAB/Objective-C source files + "*.mm", // Objective-C++ source files "*.f", // Fortran source files "*.f90", // Fortran 90 source files + "*.f95", // Fortran 95 source files + "*.f03", // Fortran 2003 source files + "*.f08", // Fortran 2008 source files "*.jl", // Julia source files "*.lua", // Lua source files "*.pl", // Perl source files + "*.pm", // Perl modules "*.cs", // C# source files "*.vb", // Visual Basic source files "*.dart", // Dart source files "*.groovy", // Groovy source files "*.elm", // Elm source files "*.erl", // Erlang source files + "*.hrl", // Erlang header files "*.ex", // Elixir source files + "*.exs", // Elixir script files "*.hs", // Haskell source files + "*.lhs", // Literate Haskell source files "*.clj", // Clojure source files "*.cljs", // ClojureScript source files - "*.cljc", // Clojure Common Lisp source files + "*.cljc", // Clojure Common source files "*.cl", // Common Lisp source files "*.lisp", // Lisp source files + "*.lsp", // Lisp source files (alternative) "*.scm", // Scheme source files + "*.ss", // Scheme source files (alternative) + "*.rkt", // Racket source files + "*.sql", // SQL files + "*.psql", // PostgreSQL files + "*.mysql", // MySQL files + "*.sqlite", // SQLite files + "*.zig", // Zig source files "*.cu", // CUDA source files "*.cuh", // CUDA header files + // Scripting and automation + "*.bash", // Bash scripts + "*.zsh", // Zsh scripts + "*.fish", // Fish shell scripts + "*.csh", // C shell scripts + "*.tcsh", // TC shell scripts + "*.ksh", // Korn shell scripts + "*.ps1", // PowerShell scripts + "*.psm1", // PowerShell modules + "*.psd1", // PowerShell data files + "*.bat", // Windows batch files + "*.cmd", // Windows command files + "*.vbs", // VBScript files + "*.wsf", // Windows Script Files + "*.applescript", // AppleScript files + "*.scpt", // AppleScript compiled files + "*.awk", // AWK scripts + "*.sed", // sed scripts + "*.expect", // Expect scripts + + // Build and project files + "*.env", // Environment variable files + "*.env.*", // Environment files with suffixes + ".env*", // Environment files (hidden) + "Makefile*", // Makefile variants + "*.dockerfile", // Dockerfile configurations + "Dockerfile*", // Dockerfile variants + "*.mk", // Make include files + "*.cmake", // CMake files + "CMakeLists.txt", // CMake configuration + "*.gradle", // Gradle build files + "*.gradle.kts", // Kotlin DSL Gradle files + "build.gradle*", // Gradle build files + "settings.gradle*", // Gradle settings files + "*.sbt", // SBT build files + "*.mill", // Mill build files + "*.bazel", // Bazel build files + "*.bzl", // Bazel extension files + "BUILD*", // Bazel BUILD files + "WORKSPACE*", // Bazel WORKSPACE files + "*.buck", // Buck build files + "BUCK*", // Buck BUILD files + "*.ninja", // Ninja build files + "*.gyp", // GYP build files + "*.gypi", // GYP include files + "*.waf", // Waf build files + "wscript*", // Waf build scripts + "package.json", // Node.js package file + "package-lock.json", // Node.js lock file + "yarn.lock", // Yarn lock file + "pnpm-lock.yaml", // PNPM lock file + "requirements*.txt", // Python requirements + "Pipfile*", // Python Pipenv files + "pyproject.toml", // Python project configuration + "setup.cfg", // Python setup configuration + "tox.ini", // Python tox configuration + "poetry.lock", // Python Poetry lock file + "Cargo.toml", // Rust package configuration + "Cargo.lock", // Rust lock file + "go.mod", // Go module file + "go.sum", // Go checksum file + "composer.json", // PHP Composer file + "composer.lock", // PHP Composer lock file + "Gemfile*", // Ruby Gemfile + "*.gemspec", // Ruby gem specification + "mix.exs", // Elixir Mix file + "mix.lock", // Elixir Mix lock file + "rebar.config", // Erlang Rebar config + "rebar.lock", // Erlang Rebar lock file + // Library files. "*.so", // Shared object files "*.dll", // Dynamic Link Library From 0730333eadd603c890168a65a5544a0310fd7c98 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 18:54:58 +0800 Subject: [PATCH 06/13] feat: expand supported file formats for doc Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 87 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 1557a259..2bae4318 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -293,6 +293,93 @@ var ( "*requirements*", // Dependency specifications "*.log", // Log files + // Office documents + "*.doc", // Microsoft Word 97-2003 Document + "*.docx", // Microsoft Word Document + "*.docm", // Word Macro-Enabled Document + "*.dot", // Word 97-2003 Template + "*.dotx", // Word Template + "*.dotm", // Word Macro-Enabled Template + "*.rtf", // Rich Text Format + "*.odt", // OpenDocument Text + "*.ott", // OpenDocument Text Template + "*.fodt", // Flat OpenDocument Text + "*.pages", // Apple Pages document + "*.wpd", // WordPerfect document + + // Spreadsheet documents + "*.xls", // Microsoft Excel 97-2003 Workbook + "*.xlsx", // Microsoft Excel Workbook + "*.xlsm", // Excel Macro-Enabled Workbook + "*.xlsb", // Excel Binary Workbook + "*.xlt", // Excel 97-2003 Template + "*.xltx", // Excel Template + "*.xltm", // Excel Macro-Enabled Template + "*.ods", // OpenDocument Spreadsheet + "*.ots", // OpenDocument Spreadsheet Template + "*.fods", // Flat OpenDocument Spreadsheet + "*.numbers", // Apple Numbers spreadsheet + "*.csv", // Comma-Separated Values + + // Presentation documents + "*.ppt", // Microsoft PowerPoint 97-2003 Presentation + "*.pptx", // Microsoft PowerPoint Presentation + "*.pptm", // PowerPoint Macro-Enabled Presentation + "*.pps", // PowerPoint 97-2003 Show + "*.ppsx", // PowerPoint Show + "*.ppsm", // PowerPoint Macro-Enabled Show + "*.pot", // PowerPoint 97-2003 Template + "*.potx", // PowerPoint Template + "*.potm", // PowerPoint Macro-Enabled Template + "*.odp", // OpenDocument Presentation + "*.otp", // OpenDocument Presentation Template + "*.fodp", // Flat OpenDocument Presentation + "*.key", // Apple Keynote presentation + + // eBook formats + "*.epub", // Electronic Publication + "*.mobi", // Mobipocket eBook + "*.azw", // Amazon Kindle eBook + "*.azw3", // Amazon Kindle eBook (KF8) + "*.fb2", // FictionBook 2.0 + "*.fb3", // FictionBook 3.0 + "*.lit", // Microsoft Literature + "*.pdb", // Palm Database/Document File + "*.djvu", // DjVu document + "*.djv", // DjVu document (alternative extension) + + // Web and markup documents + "*.html", // HyperText Markup Language + "*.htm", // HyperText Markup Language (alternative) + "*.xhtml", // Extensible HyperText Markup Language + "*.mhtml", // MIME HTML (Web Archive) + "*.mht", // MIME HTML (Web Archive, alternative) + "*.xml", // eXtensible Markup Language + "*.xsl", // eXtensible Stylesheet Language + "*.xslt", // XSL Transformations + + // Technical documentation formats + "*.tex", // LaTeX document + "*.latex", // LaTeX document (alternative) + "*.ltx", // LaTeX document (alternative) + "*.bib", // BibTeX bibliography + "*.rst", // reStructuredText + "*.asciidoc", // AsciiDoc + "*.adoc", // AsciiDoc (alternative) + "*.textile", // Textile markup + "*.wiki", // Wiki markup + "*.mediawiki", // MediaWiki markup + "*.org", // Org-mode document + "*.texi", // Texinfo document + "*.texinfo", // Texinfo document (alternative) + "*.info", // GNU Info document + "*.man", // Manual page + + // Archive and compressed documents + "*.chm", // Compiled HTML Help + "*.hlp", // Windows Help File + "*.xps", // XML Paper Specification + // Image assets. "*.jpg", // JPEG image format "*.jpeg", // JPEG alternative extension From b8206938770fb1ae62db530e571b1a40cd8e8c2b Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Sun, 15 Jun 2025 19:19:09 +0800 Subject: [PATCH 07/13] feat: implement workspace limits for file size, count, and total size Added validation in the workspace generation process to enforce limits on single file size (128GB), maximum file count (1024), and total workspace size (8TB). Included unit tests to verify these constraints. Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 21 ++++ pkg/modelfile/modelfile.go | 24 ++++ pkg/modelfile/modelfile_test.go | 201 ++++++++++++++++++++++++++++++++ 3 files changed, 246 insertions(+) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 2bae4318..6cf001db 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -17,6 +17,7 @@ package modelfile import ( + "fmt" "path/filepath" "strings" ) @@ -419,6 +420,11 @@ var ( // Large file size threshold WeightFileSizeThreshold int64 = 128 * 1024 * 1024 + + // Workspace limits + MaxSingleFileSize int64 = 128 * 1024 * 1024 * 1024 // 128GB + MaxWorkspaceFileCount int = 1024 // 1024 files + MaxTotalWorkspaceSize int64 = 8 * 1024 * 1024 * 1024 * 1024 // 8TB ) // IsFileType checks if the filename matches any of the given patterns @@ -460,3 +466,18 @@ func isSkippable(filename string) bool { func SizeShouldBeWeightFile(size int64) bool { return size > WeightFileSizeThreshold } + +// formatBytes converts byte size to human-readable format +func formatBytes(bytes int64) string { + const unit = 1024 + if bytes < unit { + return fmt.Sprintf("%d B", bytes) + } + div, exp := int64(unit), 0 + for n := bytes / unit; n >= unit; n /= unit { + div *= unit + exp++ + } + units := []string{"B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"} + return fmt.Sprintf("%.1f%s", float64(bytes)/float64(div), units[exp+1]) +} diff --git a/pkg/modelfile/modelfile.go b/pkg/modelfile/modelfile.go index a4273c90..e5b15290 100644 --- a/pkg/modelfile/modelfile.go +++ b/pkg/modelfile/modelfile.go @@ -253,6 +253,10 @@ func (mf *modelfile) validateWorkspace() error { // generateByWorkspace generates the modelfile by the workspace's files. func (mf *modelfile) generateByWorkspace() error { + // Initialize counters for workspace limits validation + var fileCount int + var totalSize int64 + // Walk the path and get the files. if err := filepath.Walk(mf.workspace, func(path string, info os.FileInfo, err error) error { if err != nil { @@ -274,6 +278,26 @@ func (mf *modelfile) generateByWorkspace() error { return nil } + // Check workspace limits for regular files + fileCount++ + fileSize := info.Size() + totalSize += fileSize + + // Check single file size limit + if fileSize > MaxSingleFileSize { + return fmt.Errorf("file %s exceeds maximum single file size limit of %d bytes (%s)", path, MaxSingleFileSize, formatBytes(MaxSingleFileSize)) + } + + // Check file count limit + if fileCount > MaxWorkspaceFileCount { + return fmt.Errorf("workspace exceeds maximum file count limit of %d files", MaxWorkspaceFileCount) + } + + // Check total workspace size limit + if totalSize > MaxTotalWorkspaceSize { + return fmt.Errorf("workspace exceeds maximum total size limit of %d bytes (%s)", MaxTotalWorkspaceSize, formatBytes(MaxTotalWorkspaceSize)) + } + // Get relative path from the base directory. relPath, err := filepath.Rel(mf.workspace, path) if err != nil { diff --git a/pkg/modelfile/modelfile_test.go b/pkg/modelfile/modelfile_test.go index 657b8ee7..8b5eb12b 100644 --- a/pkg/modelfile/modelfile_test.go +++ b/pkg/modelfile/modelfile_test.go @@ -19,6 +19,7 @@ package modelfile import ( "encoding/json" "errors" + "fmt" "os" "path/filepath" "sort" @@ -1316,3 +1317,203 @@ func TestValidateWorkspace(t *testing.T) { }) } } + +func TestWorkspaceLimits(t *testing.T) { + tests := []struct { + name string + setupFunc func() (string, func()) // returns workspace path and cleanup function + expectedError string + }{ + { + name: "single_file_exceeds_128GB_limit", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "workspace_limits_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create a test file that simulates exceeding 128GB + // We'll use a sparse file to avoid actually creating 128GB+ of data + testFile := filepath.Join(tmpDir, "large_model.bin") + file, err := os.Create(testFile) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file: %v", err) + } + + // Seek to position that would make file appear larger than 128GB + largeSize := MaxSingleFileSize + 1 + _, err = file.Seek(largeSize-1, 0) + if err != nil { + file.Close() + os.RemoveAll(tmpDir) + t.Fatalf("Failed to seek in file: %v", err) + } + + // Write one byte at the end to make the file that size + _, err = file.Write([]byte{0}) + if err != nil { + file.Close() + os.RemoveAll(tmpDir) + t.Fatalf("Failed to write to file: %v", err) + } + file.Close() + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "exceeds maximum single file size limit", + }, + { + name: "file_count_exceeds_2048_limit", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "workspace_limits_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create more than 2048 files + for i := 0; i <= MaxWorkspaceFileCount; i++ { + testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.txt", i)) + err = os.WriteFile(testFile, []byte("test"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file %d: %v", i, err) + } + } + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "exceeds maximum file count limit", + }, + { + name: "total_workspace_size_exceeds_8TB_limit", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "workspace_limits_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create a few large files that together exceed 8TB + // Each file will be just under 128GB (single file limit) + // We'll create 70 files of ~120GB each to exceed 8TB total + fileSize := MaxSingleFileSize - (1024 * 1024 * 1024) // 127GB per file + numFiles := 70 // 70 * 127GB = ~8.9TB + + for i := 0; i < numFiles; i++ { + testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.bin", i)) + file, err := os.Create(testFile) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file %d: %v", i, err) + } + + // Use sparse file technique + _, err = file.Seek(fileSize-1, 0) + if err != nil { + file.Close() + os.RemoveAll(tmpDir) + t.Fatalf("Failed to seek in file %d: %v", i, err) + } + + _, err = file.Write([]byte{0}) + if err != nil { + file.Close() + os.RemoveAll(tmpDir) + t.Fatalf("Failed to write to file %d: %v", i, err) + } + file.Close() + } + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "exceeds maximum total size limit", + }, + { + name: "workspace_within_all_limits", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "workspace_limits_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create a reasonable number of small files + for i := 0; i < 10; i++ { + testFile := filepath.Join(tmpDir, fmt.Sprintf("small_file_%d.txt", i)) + err = os.WriteFile(testFile, []byte("small content"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file %d: %v", i, err) + } + } + + // Add a config file to make it a valid workspace + configFile := filepath.Join(tmpDir, "config.json") + err = os.WriteFile(configFile, []byte(`{"model_type": "test"}`), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create config file: %v", err) + } + + // Add a model file to make it a valid workspace + modelFile := filepath.Join(tmpDir, "model.safetensors") + err = os.WriteFile(modelFile, []byte("fake model data"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create model file: %v", err) + } + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "", + }, + { + name: "exactly_at_file_count_limit", + setupFunc: func() (string, func()) { + tmpDir, err := os.MkdirTemp("", "workspace_limits_test") + if err != nil { + t.Fatalf("Failed to create temp dir: %v", err) + } + + // Create exactly 2048 files (should be allowed) + // Include one model file to make it valid + modelFile := filepath.Join(tmpDir, "model.safetensors") + err = os.WriteFile(modelFile, []byte("fake model data"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create model file: %v", err) + } + + // Create the remaining files to reach exactly 2048 + for i := 1; i < MaxWorkspaceFileCount; i++ { + testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.txt", i)) + err = os.WriteFile(testFile, []byte("test"), 0644) + if err != nil { + os.RemoveAll(tmpDir) + t.Fatalf("Failed to create test file %d: %v", i, err) + } + } + + return tmpDir, func() { os.RemoveAll(tmpDir) } + }, + expectedError: "", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + workspace, cleanup := tt.setupFunc() + defer cleanup() + + // Create a modelfile instance and try to generate by workspace + config := &configmodelfile.GenerateConfig{} + _, err := NewModelfileByWorkspace(workspace, config) + + if tt.expectedError == "" { + assert.NoError(t, err, "Expected no error for test case: %s", tt.name) + } else { + assert.Error(t, err, "Expected error for test case: %s", tt.name) + assert.Contains(t, err.Error(), tt.expectedError, "Error message should contain expected text for test case: %s", tt.name) + } + }) + } +} From ded8ce4eee7c141e9ca1fe9331ee8fb625908856 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Wed, 18 Jun 2025 14:20:56 +0800 Subject: [PATCH 08/13] feat: add unit tests for model configuration generation and workspace validation Implemented comprehensive unit tests for the `generateByModelConfig` and `generateByConfig` methods, ensuring correct handling of various model configuration scenarios. Enhanced the `validateWorkspace` method tests to cover edge cases, including empty directories and symbolic links. Signed-off-by: Zhao Chen --- pkg/modelfile/modelfile_test.go | 796 ++++++++++++++++++++++---------- 1 file changed, 549 insertions(+), 247 deletions(-) diff --git a/pkg/modelfile/modelfile_test.go b/pkg/modelfile/modelfile_test.go index 8b5eb12b..d71ef187 100644 --- a/pkg/modelfile/modelfile_test.go +++ b/pkg/modelfile/modelfile_test.go @@ -23,11 +23,13 @@ import ( "os" "path/filepath" "sort" + "strings" "testing" configmodelfile "github.com/CloudNativeAI/modctl/pkg/config/modelfile" "github.com/emirpasic/gods/sets/hashset" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" ) func TestNewModelfile(t *testing.T) { @@ -1192,328 +1194,628 @@ func createHashSet(items []string) *hashset.Set { return set } -func TestValidateWorkspace(t *testing.T) { - tests := []struct { - name string - setupFunc func() (string, func()) // returns workspace path and cleanup function - expectedError string +// TestGenerateByModelConfig tests the generateByModelConfig method +func TestGenerateByModelConfig(t *testing.T) { + testcases := []struct { + name string + configFiles map[string]map[string]interface{} + expectedArch string + expectedFamily string + expectedPrecision string + expectError bool }{ { - name: "valid_directory", - setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "validate_workspace_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } + name: "config.json with all fields", + configFiles: map[string]map[string]interface{}{ + "config.json": { + "model_type": "llama", + "torch_dtype": "float16", + "transformers_version": "4.30.0", + }, + }, + expectedArch: "transformer", + expectedFamily: "llama", + expectedPrecision: "float16", + expectError: false, + }, + { + name: "generation_config.json overrides config.json", + configFiles: map[string]map[string]interface{}{ + "config.json": { + "model_type": "llama", + "torch_dtype": "float16", + }, + "generation_config.json": { + "model_type": "gpt2", + "torch_dtype": "float32", + }, + }, + expectedFamily: "gpt2", + expectedPrecision: "float32", + expectError: false, + }, + { + name: "invalid json file", + configFiles: map[string]map[string]interface{}{ + "config.json": nil, // This will create invalid JSON + }, + expectError: false, // Invalid JSON is silently ignored + }, + { + name: "no config files", + configFiles: map[string]map[string]interface{}{}, + expectError: false, + }, + { + name: "partial config", + configFiles: map[string]map[string]interface{}{ + "config.json": { + "model_type": "bert", + }, + }, + expectedFamily: "bert", + expectError: false, + }, + { + name: "transformers version only", + configFiles: map[string]map[string]interface{}{ + "config.json": { + "transformers_version": "4.25.1", + }, + }, + expectedArch: "transformer", + expectError: false, + }, + } - // Create a test file to make directory non-empty - testFile := filepath.Join(tmpDir, "test.txt") - err = os.WriteFile(testFile, []byte("test content"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file: %v", err) + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + tempDir, err := os.MkdirTemp("", "model-config-test-*") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + // Create config files + for filename, content := range tc.configFiles { + if content == nil { + // Create invalid JSON + err = os.WriteFile(filepath.Join(tempDir, filename), []byte("invalid json"), 0644) + } else { + data, err := json.Marshal(content) + require.NoError(t, err) + err = os.WriteFile(filepath.Join(tempDir, filename), data, 0644) } + require.NoError(t, err) + } + + mf := &modelfile{workspace: tempDir} + err = mf.generateByModelConfig() + + if tc.expectError { + assert.Error(err) + } else { + assert.NoError(err) + assert.Equal(tc.expectedArch, mf.arch) + assert.Equal(tc.expectedFamily, mf.family) + assert.Equal(tc.expectedPrecision, mf.precision) + } + }) + } +} + +// TestGenerateByConfig tests the generateByConfig method +func TestGenerateByConfig(t *testing.T) { + testcases := []struct { + name string + workspace string + config *configmodelfile.GenerateConfig + expectedName string + expectedArch string + expectedFamily string + expectedFormat string + expectedParamsize string + expectedPrecision string + expectedQuantization string + }{ + { + name: "default name from workspace", + workspace: "/path/to/my-model", + config: &configmodelfile.GenerateConfig{}, + expectedName: "my-model", + }, + { + name: "custom name overrides workspace", + workspace: "/path/to/workspace", + config: &configmodelfile.GenerateConfig{ + Name: "custom-model", + }, + expectedName: "custom-model", + }, + { + name: "all fields provided", + workspace: "/path/to/model", + config: &configmodelfile.GenerateConfig{ + Name: "test-model", + Arch: "transformer", + Family: "llama", + Format: "safetensors", + ParamSize: "7B", + Precision: "float16", + Quantization: "int8", + }, + expectedName: "test-model", + expectedArch: "transformer", + expectedFamily: "llama", + expectedFormat: "safetensors", + expectedParamsize: "7B", + expectedPrecision: "float16", + expectedQuantization: "int8", + }, + { + name: "empty config uses workspace name", + workspace: "/tmp/test-workspace", + config: &configmodelfile.GenerateConfig{}, + expectedName: "test-workspace", + }, + } + + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + mf := &modelfile{workspace: tc.workspace} + mf.generateByConfig(tc.config) + + assert.Equal(tc.expectedName, mf.name) + assert.Equal(tc.expectedArch, mf.arch) + assert.Equal(tc.expectedFamily, mf.family) + assert.Equal(tc.expectedFormat, mf.format) + assert.Equal(tc.expectedParamsize, mf.paramsize) + assert.Equal(tc.expectedPrecision, mf.precision) + assert.Equal(tc.expectedQuantization, mf.quantization) + }) + } +} - return tmpDir, func() { os.RemoveAll(tmpDir) } +// TestValidateWorkspace tests the validateWorkspace method specifically +func TestValidateWorkspace(t *testing.T) { + testcases := []struct { + name string + setupFunc func() (string, func()) + expectError bool + errorMsg string + }{ + { + name: "valid directory workspace", + setupFunc: func() (string, func()) { + tempDir, err := os.MkdirTemp("", "valid-workspace-*") + require.NoError(t, err) + // Create a file to make it non-empty + err = os.WriteFile(filepath.Join(tempDir, "test.txt"), []byte("test"), 0644) + require.NoError(t, err) + return tempDir, func() { os.RemoveAll(tempDir) } }, - expectedError: "", + expectError: false, }, { - name: "non_existent_directory", + name: "empty directory workspace", setupFunc: func() (string, func()) { - return "/non/existent/path", func() {} + tempDir, err := os.MkdirTemp("", "empty-workspace-*") + require.NoError(t, err) + return tempDir, func() { os.RemoveAll(tempDir) } }, - expectedError: "access to workspace failed:", + expectError: true, + errorMsg: "the workspace is empty", }, { - name: "file_instead_of_directory", + name: "non-existent workspace", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "validate_workspace_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } - - testFile := filepath.Join(tmpDir, "test.txt") - err = os.WriteFile(testFile, []byte("test content"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file: %v", err) - } - - return testFile, func() { os.RemoveAll(tmpDir) } + return "/non/existent/path", func() {} }, - expectedError: "the workspace is not a directory:", + expectError: true, + errorMsg: "access to workspace failed", }, { - name: "empty_directory", + name: "file instead of directory", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "validate_workspace_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } - return tmpDir, func() { os.RemoveAll(tmpDir) } + tempFile, err := os.CreateTemp("", "file-workspace-*") + require.NoError(t, err) + tempFile.Close() + return tempFile.Name(), func() { os.Remove(tempFile.Name()) } }, - expectedError: "the workspace is empty:", + expectError: true, + errorMsg: "the workspace is not a directory", }, { - name: "symbolic_link", + name: "symbolic link workspace", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "validate_workspace_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } - - // Create target directory with content - targetDir := filepath.Join(tmpDir, "target") - err = os.Mkdir(targetDir, 0755) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create target dir: %v", err) + tempDir, err := os.MkdirTemp("", "symlink-target-*") + require.NoError(t, err) + // Create a file in target + err = os.WriteFile(filepath.Join(tempDir, "test.txt"), []byte("test"), 0644) + require.NoError(t, err) + + symlinkPath := tempDir + "-symlink" + err = os.Symlink(tempDir, symlinkPath) + require.NoError(t, err) + + return symlinkPath, func() { + os.RemoveAll(tempDir) + os.Remove(symlinkPath) } - - testFile := filepath.Join(targetDir, "test.txt") - err = os.WriteFile(testFile, []byte("test content"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file: %v", err) - } - - // Create symbolic link - linkPath := filepath.Join(tmpDir, "link") - err = os.Symlink(targetDir, linkPath) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create symlink: %v", err) - } - - return linkPath, func() { os.RemoveAll(tmpDir) } }, - expectedError: "for simplicity, the workspace should not be a symbolic link:", + expectError: true, + errorMsg: "the workspace should not be a symbolic link", }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - workspace, cleanup := tt.setupFunc() + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + workspace, cleanup := tc.setupFunc() defer cleanup() - mf := &modelfile{ - workspace: workspace, - config: hashset.New(), - model: hashset.New(), - code: hashset.New(), - dataset: hashset.New(), - doc: hashset.New(), - } - + mf := &modelfile{workspace: workspace} err := mf.validateWorkspace() - if tt.expectedError == "" { - assert.NoError(t, err, "Expected no error for test case: %s", tt.name) + if tc.expectError { + assert.Error(err) + assert.Contains(err.Error(), tc.errorMsg) } else { - assert.Error(t, err, "Expected error for test case: %s", tt.name) - assert.Contains(t, err.Error(), tt.expectedError, "Error message should contain expected text for test case: %s", tt.name) + assert.NoError(err) } }) } } func TestWorkspaceLimits(t *testing.T) { - tests := []struct { - name string - setupFunc func() (string, func()) // returns workspace path and cleanup function - expectedError string + testcases := []struct { + name string + setupFunc func() (string, func()) + expectError bool + errorMsg string }{ { - name: "single_file_exceeds_128GB_limit", + name: "exceeds file count limit", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "workspace_limits_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } - - // Create a test file that simulates exceeding 128GB - // We'll use a sparse file to avoid actually creating 128GB+ of data - testFile := filepath.Join(tmpDir, "large_model.bin") - file, err := os.Create(testFile) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file: %v", err) + tempDir, err := os.MkdirTemp("", "file-count-test-*") + require.NoError(t, err) + + // Create more files than the limit (1024) + for i := 0; i < MaxWorkspaceFileCount+10; i++ { + filename := fmt.Sprintf("file_%d.txt", i) + err = os.WriteFile(filepath.Join(tempDir, filename), []byte("test"), 0644) + require.NoError(t, err) } - // Seek to position that would make file appear larger than 128GB - largeSize := MaxSingleFileSize + 1 - _, err = file.Seek(largeSize-1, 0) - if err != nil { - file.Close() - os.RemoveAll(tmpDir) - t.Fatalf("Failed to seek in file: %v", err) - } - - // Write one byte at the end to make the file that size - _, err = file.Write([]byte{0}) - if err != nil { - file.Close() - os.RemoveAll(tmpDir) - t.Fatalf("Failed to write to file: %v", err) - } - file.Close() - - return tmpDir, func() { os.RemoveAll(tmpDir) } + return tempDir, func() { os.RemoveAll(tempDir) } }, - expectedError: "exceeds maximum single file size limit", + expectError: true, + errorMsg: "exceeds maximum file count limit", }, { - name: "file_count_exceeds_2048_limit", + name: "normal sized files should pass", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "workspace_limits_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } + tempDir, err := os.MkdirTemp("", "normal-file-test-*") + require.NoError(t, err) - // Create more than 2048 files - for i := 0; i <= MaxWorkspaceFileCount; i++ { - testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.txt", i)) - err = os.WriteFile(testFile, []byte("test"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file %d: %v", i, err) - } - } + // Create normal sized files including a model file + normalPath := filepath.Join(tempDir, "model.bin") + err = os.WriteFile(normalPath, []byte("test model content"), 0644) + require.NoError(t, err) + + // Add a config file too + configPath := filepath.Join(tempDir, "config.json") + err = os.WriteFile(configPath, []byte(`{"model_type": "test"}`), 0644) + require.NoError(t, err) - return tmpDir, func() { os.RemoveAll(tmpDir) } + return tempDir, func() { os.RemoveAll(tempDir) } }, - expectedError: "exceeds maximum file count limit", + expectError: false, }, { - name: "total_workspace_size_exceeds_8TB_limit", + name: "within limits", setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "workspace_limits_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) + tempDir, err := os.MkdirTemp("", "within-limits-test-*") + require.NoError(t, err) + + // Create a reasonable number of files including valid model/code files + for i := 0; i < 8; i++ { + filename := fmt.Sprintf("file_%d.txt", i) + err = os.WriteFile(filepath.Join(tempDir, filename), []byte("test content"), 0644) + require.NoError(t, err) } - // Create a few large files that together exceed 8TB - // Each file will be just under 128GB (single file limit) - // We'll create 70 files of ~120GB each to exceed 8TB total - fileSize := MaxSingleFileSize - (1024 * 1024 * 1024) // 127GB per file - numFiles := 70 // 70 * 127GB = ~8.9TB - - for i := 0; i < numFiles; i++ { - testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.bin", i)) - file, err := os.Create(testFile) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file %d: %v", i, err) - } - - // Use sparse file technique - _, err = file.Seek(fileSize-1, 0) - if err != nil { - file.Close() - os.RemoveAll(tmpDir) - t.Fatalf("Failed to seek in file %d: %v", i, err) - } - - _, err = file.Write([]byte{0}) - if err != nil { - file.Close() - os.RemoveAll(tmpDir) - t.Fatalf("Failed to write to file %d: %v", i, err) - } - file.Close() - } + // Add valid model and config files + err = os.WriteFile(filepath.Join(tempDir, "model.bin"), []byte("model content"), 0644) + require.NoError(t, err) + err = os.WriteFile(filepath.Join(tempDir, "config.json"), []byte(`{"model_type": "test"}`), 0644) + require.NoError(t, err) - return tmpDir, func() { os.RemoveAll(tmpDir) } + return tempDir, func() { os.RemoveAll(tempDir) } }, - expectedError: "exceeds maximum total size limit", + expectError: false, }, - { - name: "workspace_within_all_limits", - setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "workspace_limits_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } + } - // Create a reasonable number of small files - for i := 0; i < 10; i++ { - testFile := filepath.Join(tmpDir, fmt.Sprintf("small_file_%d.txt", i)) - err = os.WriteFile(testFile, []byte("small content"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file %d: %v", i, err) - } - } + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + workspace, cleanup := tc.setupFunc() + defer cleanup() - // Add a config file to make it a valid workspace - configFile := filepath.Join(tmpDir, "config.json") - err = os.WriteFile(configFile, []byte(`{"model_type": "test"}`), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create config file: %v", err) - } + config := &configmodelfile.GenerateConfig{ + Name: "test-model", + } - // Add a model file to make it a valid workspace - modelFile := filepath.Join(tmpDir, "model.safetensors") - err = os.WriteFile(modelFile, []byte("fake model data"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create model file: %v", err) - } + _, err := NewModelfileByWorkspace(workspace, config) - return tmpDir, func() { os.RemoveAll(tmpDir) } - }, - expectedError: "", + if tc.expectError { + assert.Error(err) + assert.Contains(err.Error(), tc.errorMsg) + } else { + assert.NoError(err) + } + }) + } +} + +func TestFileTypeClassification(t *testing.T) { + testcases := []struct { + name string + files map[string]int64 // filename -> size + expectedConfigs []string + expectedModels []string + expectedCodes []string + expectedDocs []string + }{ + { + name: "various file types", + files: map[string]int64{ + "config.json": 1024, + "model.bin": 1024 * 1024 * 1024, // 1GB - large file + "script.py": 2048, + "README.md": 512, + "tokenizer.json": 1024, + "weights.safetensors": 2 * 1024 * 1024 * 1024, // 2GB - large file + "inference.py": 3072, + "LICENSE": 256, + }, + expectedConfigs: []string{"config.json", "tokenizer.json"}, + expectedModels: []string{"model.bin", "weights.safetensors"}, + expectedCodes: []string{"script.py", "inference.py"}, + expectedDocs: []string{"README.md", "LICENSE"}, }, { - name: "exactly_at_file_count_limit", - setupFunc: func() (string, func()) { - tmpDir, err := os.MkdirTemp("", "workspace_limits_test") - if err != nil { - t.Fatalf("Failed to create temp dir: %v", err) - } + name: "small unknown files treated as code files", + files: map[string]int64{ + "unknown_small_file": 1024, // 1KB - below threshold + "another_unknown.xyz": 50 * 1024, // 50KB - below threshold + "config.json": 1024, // Add a config to make workspace valid + }, + expectedConfigs: []string{"config.json"}, + expectedModels: []string{}, + expectedCodes: []string{"unknown_small_file", "another_unknown.xyz"}, + }, + { + name: "case insensitive file extensions", + files: map[string]int64{ + "CONFIG.JSON": 1024, + "Model.BIN": 1024, + "Script.PY": 1024, + "README.MD": 1024, + }, + expectedConfigs: []string{"CONFIG.JSON"}, + expectedModels: []string{"Model.BIN"}, + expectedCodes: []string{"Script.PY"}, + expectedDocs: []string{"README.MD"}, + }, + { + name: "nested directory files", + files: map[string]int64{ + "configs/model.json": 1024, + "models/pytorch_model.bin": 1024, + "src/utils.py": 1024, + "docs/guide.md": 1024, + }, + expectedConfigs: []string{"configs/model.json"}, + expectedModels: []string{"models/pytorch_model.bin"}, + expectedCodes: []string{"src/utils.py"}, + expectedDocs: []string{"docs/guide.md"}, + }, + } - // Create exactly 2048 files (should be allowed) - // Include one model file to make it valid - modelFile := filepath.Join(tmpDir, "model.safetensors") - err = os.WriteFile(modelFile, []byte("fake model data"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create model file: %v", err) + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + tempDir, err := os.MkdirTemp("", "file-type-test-*") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + // Create files with specified sizes + for filename, size := range tc.files { + fullPath := filepath.Join(tempDir, filename) + + // Create directory if needed + dir := filepath.Dir(fullPath) + if dir != tempDir { + err = os.MkdirAll(dir, 0755) + require.NoError(t, err) } - // Create the remaining files to reach exactly 2048 - for i := 1; i < MaxWorkspaceFileCount; i++ { - testFile := filepath.Join(tmpDir, fmt.Sprintf("file_%d.txt", i)) - err = os.WriteFile(testFile, []byte("test"), 0644) - if err != nil { - os.RemoveAll(tmpDir) - t.Fatalf("Failed to create test file %d: %v", i, err) - } + // Create file with specified size + file, err := os.Create(fullPath) + require.NoError(t, err) + + if size > 0 { + // For large files, we'll write a smaller amount and then seek to create the size + // For testing purposes, just write some content + content := strings.Repeat("x", int(min(size, 1024))) + _, err = file.WriteString(content) + require.NoError(t, err) } + file.Close() + } - return tmpDir, func() { os.RemoveAll(tmpDir) } - }, - expectedError: "", + config := &configmodelfile.GenerateConfig{ + Name: "test-classification", + } + + mf, err := NewModelfileByWorkspace(tempDir, config) + require.NoError(t, err) + + assert.ElementsMatch(tc.expectedConfigs, mf.GetConfigs()) + assert.ElementsMatch(tc.expectedModels, mf.GetModels()) + assert.ElementsMatch(tc.expectedCodes, mf.GetCodes()) + assert.ElementsMatch(tc.expectedDocs, mf.GetDocs()) + }) + } +} + +func TestSkippedFiles(t *testing.T) { + tempDir, err := os.MkdirTemp("", "skip-test-*") + require.NoError(t, err) + defer os.RemoveAll(tempDir) + + // Create various files and directories that should be skipped + filesToCreate := []string{ + ".hidden_file", + ".git/config", + "__pycache__/cache.pyc", + "model.pyo", + "script.pyd", + "modelfile", + "normal_file.txt", + "valid_model.bin", + } + + dirsToCreate := []string{ + ".git", + "__pycache__", + ".hidden_dir", + "normal_dir", + } + + for _, dir := range dirsToCreate { + err = os.MkdirAll(filepath.Join(tempDir, dir), 0755) + require.NoError(t, err) + } + + for _, file := range filesToCreate { + fullPath := filepath.Join(tempDir, file) + dir := filepath.Dir(fullPath) + if dir != tempDir { + err = os.MkdirAll(dir, 0755) + require.NoError(t, err) + } + err = os.WriteFile(fullPath, []byte("content"), 0644) + require.NoError(t, err) + } + + config := &configmodelfile.GenerateConfig{ + Name: "skip-test", + } + + mf, err := NewModelfileByWorkspace(tempDir, config) + require.NoError(t, err) + + // Only normal_file.txt and valid_model.bin should be included + allFiles := append(append(append(mf.GetConfigs(), mf.GetModels()...), mf.GetCodes()...), mf.GetDocs()...) + + assert := assert.New(t) + + // Check that skipped files are not included + for _, file := range allFiles { + assert.NotContains(file, ".hidden") + assert.NotContains(file, ".git") + assert.NotContains(file, "__pycache__") + assert.NotContains(file, ".pyc") + assert.NotContains(file, ".pyo") + assert.NotContains(file, ".pyd") + assert.NotEqual(file, "modelfile") + } + + // Check that normal files are included + expectedFiles := []string{"normal_file.txt", "valid_model.bin"} + for _, expectedFile := range expectedFiles { + found := false + for _, file := range allFiles { + if strings.Contains(file, expectedFile) { + found = true + break + } + } + assert.True(found, "Expected file %s should be included", expectedFile) + } +} + +func TestEmptyWorkspaceHandling(t *testing.T) { + testcases := []struct { + name string + files []string + expectError bool + errorMsg string + }{ + { + name: "only documentation files", + files: []string{"README.md", "LICENSE", "docs.txt"}, + expectError: true, + errorMsg: "no model/code/dataset found", + }, + { + name: "only configuration files", + files: []string{"config.json", "settings.yaml"}, + expectError: true, + errorMsg: "no model/code/dataset found", + }, + { + name: "has model files", + files: []string{"model.bin", "config.json"}, + expectError: false, + }, + { + name: "has code files", + files: []string{"script.py", "config.json"}, + expectError: false, + }, + { + name: "mixed valid files", + files: []string{"model.bin", "script.py", "README.md"}, + expectError: false, }, } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - workspace, cleanup := tt.setupFunc() - defer cleanup() + assert := assert.New(t) + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + tempDir, err := os.MkdirTemp("", "empty-workspace-test-*") + require.NoError(t, err) + defer os.RemoveAll(tempDir) - // Create a modelfile instance and try to generate by workspace - config := &configmodelfile.GenerateConfig{} - _, err := NewModelfileByWorkspace(workspace, config) + for _, filename := range tc.files { + err = os.WriteFile(filepath.Join(tempDir, filename), []byte("content"), 0644) + require.NoError(t, err) + } + + config := &configmodelfile.GenerateConfig{ + Name: "test-model", + } + + _, err = NewModelfileByWorkspace(tempDir, config) - if tt.expectedError == "" { - assert.NoError(t, err, "Expected no error for test case: %s", tt.name) + if tc.expectError { + assert.Error(err) + assert.Contains(err.Error(), tc.errorMsg) } else { - assert.Error(t, err, "Expected error for test case: %s", tt.name) - assert.Contains(t, err.Error(), tt.expectedError, "Error message should contain expected text for test case: %s", tt.name) + assert.NoError(err) } }) } } + +// min returns the minimum of two integers +func min(a, b int64) int64 { + if a < b { + return a + } + return b +} From 8ad59ee72260ea3d94ee641d52610344f54fd6be Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Thu, 19 Jun 2025 16:51:20 +0800 Subject: [PATCH 09/13] feat: deprecate and hide the ignore-unrecognized-file-types flag Updated the command-line flag for ignoring unrecognized file types to mark it as deprecated and hidden, indicating it will be removed in the next release. Signed-off-by: Zhao Chen --- cmd/modelfile/generate.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmd/modelfile/generate.go b/cmd/modelfile/generate.go index 3cad9361..4166879d 100644 --- a/cmd/modelfile/generate.go +++ b/cmd/modelfile/generate.go @@ -62,9 +62,13 @@ func init() { flags.StringVar(&generateConfig.Precision, "precision", "", "specify model precision, such as bf16, fp16, int8, etc") flags.StringVar(&generateConfig.Quantization, "quantization", "", "specify model quantization, such as awq, gptq, etc") flags.StringVarP(&generateConfig.Output, "output", "O", ".", "specify the output path of modelfilem, must be a directory") - flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "[deprecated] ignore the unrecognized file types in the workspace") + flags.BoolVar(&generateConfig.IgnoreUnrecognizedFileTypes, "ignore-unrecognized-file-types", false, "ignore the unrecognized file types in the workspace") flags.BoolVar(&generateConfig.Overwrite, "overwrite", false, "overwrite the existing modelfile") + // Mark the ignore-unrecognized-file-types flag as deprecated and hidden + flags.MarkDeprecated("ignore-unrecognized-file-types", "this flag will be removed in the next release") + flags.MarkHidden("ignore-unrecognized-file-types") + if err := viper.BindPFlags(flags); err != nil { panic(fmt.Errorf("bind cache list flags to viper: %w", err)) } From afc8204738eeddbf73872bacf7c22573c6b03af1 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Thu, 19 Jun 2025 16:57:08 +0800 Subject: [PATCH 10/13] refactor: reorganize constants for file size thresholds and workspace limits Consolidated file size thresholds and workspace limits into a single constants block for improved readability and maintainability. Updated comments for clarity on each constant's purpose. Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index 6cf001db..c60f3bb6 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -417,14 +417,14 @@ var ( "*.pyo", // Python optimized bytecode "*.pyd", // Python dynamic modules } +) - // Large file size threshold - WeightFileSizeThreshold int64 = 128 * 1024 * 1024 - - // Workspace limits - MaxSingleFileSize int64 = 128 * 1024 * 1024 * 1024 // 128GB - MaxWorkspaceFileCount int = 1024 // 1024 files - MaxTotalWorkspaceSize int64 = 8 * 1024 * 1024 * 1024 * 1024 // 8TB +const ( + // File size thresholds and workspace limits + WeightFileSizeThreshold int64 = 128 * 1024 * 1024 // 128MB - threshold for considering file as weight file + MaxSingleFileSize int64 = 128 * 1024 * 1024 * 1024 // 128GB - maximum size for a single file + MaxWorkspaceFileCount int = 1024 // 1024 files - maximum number of files in workspace + MaxTotalWorkspaceSize int64 = 8 * 1024 * 1024 * 1024 * 1024 // 8TB - maximum total workspace size ) // IsFileType checks if the filename matches any of the given patterns From 952380c8c8129d2fcdf9cae70376208f5c91b40e Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Thu, 19 Jun 2025 17:00:30 +0800 Subject: [PATCH 11/13] refactor: replace hardcoded byte size constants with humanize package Updated file size constants to utilize the go-humanize package for improved readability and maintainability. Adjusted the formatBytes function to leverage humanize.Bytes for converting byte sizes to a human-readable format. Signed-off-by: Zhao Chen --- pkg/modelfile/constants.go | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index c60f3bb6..e314b3a4 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -17,9 +17,10 @@ package modelfile import ( - "fmt" "path/filepath" "strings" + + "github.com/dustin/go-humanize" ) var ( @@ -421,10 +422,10 @@ var ( const ( // File size thresholds and workspace limits - WeightFileSizeThreshold int64 = 128 * 1024 * 1024 // 128MB - threshold for considering file as weight file - MaxSingleFileSize int64 = 128 * 1024 * 1024 * 1024 // 128GB - maximum size for a single file - MaxWorkspaceFileCount int = 1024 // 1024 files - maximum number of files in workspace - MaxTotalWorkspaceSize int64 = 8 * 1024 * 1024 * 1024 * 1024 // 8TB - maximum total workspace size + WeightFileSizeThreshold int64 = 128 * humanize.MByte // 128MB - threshold for considering file as weight file + MaxSingleFileSize int64 = 128 * humanize.GByte // 128GB - maximum size for a single file + MaxWorkspaceFileCount int = 1024 // 1024 files - maximum number of files in workspace + MaxTotalWorkspaceSize int64 = 8 * humanize.TByte // 8TB - maximum total workspace size ) // IsFileType checks if the filename matches any of the given patterns @@ -467,17 +468,7 @@ func SizeShouldBeWeightFile(size int64) bool { return size > WeightFileSizeThreshold } -// formatBytes converts byte size to human-readable format +// formatBytes converts byte size to human-readable format using go-humanize func formatBytes(bytes int64) string { - const unit = 1024 - if bytes < unit { - return fmt.Sprintf("%d B", bytes) - } - div, exp := int64(unit), 0 - for n := bytes / unit; n >= unit; n /= unit { - div *= unit - exp++ - } - units := []string{"B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"} - return fmt.Sprintf("%.1f%s", float64(bytes)/float64(div), units[exp+1]) + return humanize.Bytes(uint64(bytes)) } From 443874152f3d90b04343f3d4d7278a0951091cf0 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Thu, 19 Jun 2025 17:05:08 +0800 Subject: [PATCH 12/13] fix: revert error message for existing modelfile check --- pkg/config/modelfile/modelfile.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/config/modelfile/modelfile.go b/pkg/config/modelfile/modelfile.go index fe1c6f50..f2025052 100644 --- a/pkg/config/modelfile/modelfile.go +++ b/pkg/config/modelfile/modelfile.go @@ -83,7 +83,7 @@ func (g *GenerateConfig) Validate() error { // If the output path does not exist, we can create the modelfile. if _, err := os.Stat(g.Output); err == nil { if !g.Overwrite { - return fmt.Errorf("modelfile already exists at %s - use --overwrite to overwrite", g.Output) + return fmt.Errorf("Modelfile already exists at %s - use --overwrite to overwrite", g.Output) } } From a6e642484dcb9af53d96d3c0f29cef19592b3d68 Mon Sep 17 00:00:00 2001 From: Zhao Chen Date: Thu, 19 Jun 2025 17:12:44 +0800 Subject: [PATCH 13/13] chore: increase maximum workspace file count from 1024 to 2048 Updated the MaxWorkspaceFileCount constant to allow for a higher limit of files in the workspace, enhancing flexibility for users managing larger projects. --- pkg/modelfile/constants.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/modelfile/constants.go b/pkg/modelfile/constants.go index e314b3a4..ef221d7c 100644 --- a/pkg/modelfile/constants.go +++ b/pkg/modelfile/constants.go @@ -424,7 +424,7 @@ const ( // File size thresholds and workspace limits WeightFileSizeThreshold int64 = 128 * humanize.MByte // 128MB - threshold for considering file as weight file MaxSingleFileSize int64 = 128 * humanize.GByte // 128GB - maximum size for a single file - MaxWorkspaceFileCount int = 1024 // 1024 files - maximum number of files in workspace + MaxWorkspaceFileCount int = 2048 // 2048 files - maximum number of files in workspace MaxTotalWorkspaceSize int64 = 8 * humanize.TByte // 8TB - maximum total workspace size )