diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 2dba1d2a3c8..7c896941df4 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -4,7 +4,35 @@ reviews: profile: chill collapse_walkthrough: true poem: false + path_instructions: + - path: "modelopt/**/*.py" + instructions: &security_instructions | + Review all modelopt package and examples Python changes against the security coding practices in + SECURITY.md. Flag any of the following as CRITICAL security issues, + request changes, and fail the check if ANY are present: + 1. torch.load(..., weights_only=False) with no inline comment justifying why it is safe + (e.g. confirming the file is internally-generated and not user-supplied). + 2. numpy.load(..., allow_pickle=True) with no inline comment justifying why it is safe. + Should expose allow_pickle as a caller-configurable parameter defaulting to False, not hardcode True. + 3. trust_remote_code=True hardcoded for transformers model or tokenizer loading. + Code should expose it as a caller-configurable parameter defaulting to False, not hardcode True. + 4. eval() or exec() on any input that could originate from outside the process. + 5. Any use of "# nosec" comments to bypass Bandit security checks is not allowed. + If a security-sensitive pattern is genuinely necessary, the PR must be reviewed and approved + by @NVIDIA/modelopt-setup-codeowners with an explicit justification in the PR description. 
+ - path: "examples/**/*.py" + instructions: *security_instructions auto_review: - auto_incremental_review: false + auto_incremental_review: true drafts: false base_branches: ["main", "release/.*", "feature/.*"] + pre_merge_checks: + custom_checks: + - name: "Security anti-patterns" + mode: "error" + instructions: *security_instructions +knowledge_base: + code_guidelines: + filePatterns: + - "CONTRIBUTING.md" + - "SECURITY.md" diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e3c4dd45161..746f84ee6dc 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,27 +1,28 @@ -## What does this PR do? +### What does this PR do? -**Type of change:** ? +Type of change: ? -**Overview:** ? + -## Usage - +### Usage ```python # Add a code snippet demonstrating how to use this ``` -## Testing +### Testing -## Before your PR is "*Ready for review*" - +### Before your PR is "*Ready for review*" -- **Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md)** and your commits are signed. -- **Is this change backward compatible?**: Yes/No -- **Did you write any new necessary tests?**: Yes/No -- **Did you add or update any necessary documentation?**: Yes/No -- **Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?**: Yes/No +Make sure you read and follow [Contributor guidelines](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md) and your commits are signed (`git commit -s -S`). -## Additional Information +Make sure you read and follow the [Security Best Practices](https://github.com/NVIDIA/Model-Optimizer/blob/main/SECURITY.md#security-coding-practices-for-contributors) (e.g. avoiding hardcoded `trust_remote_code=True`, using `torch.load(..., weights_only=True)`, avoiding `pickle`, etc.). 
+ +- Is this change backward compatible?: ✅ / ❌ / N/A +- If you copied code from any other source, did you follow the IP policy in [CONTRIBUTING.md](https://github.com/NVIDIA/Model-Optimizer/blob/main/CONTRIBUTING.md#-copying-code-from-other-sources)?: ✅ / ❌ / N/A +- Did you write any new necessary tests?: ✅ / ❌ / N/A +- Did you update [Changelog](https://github.com/NVIDIA/Model-Optimizer/blob/main/CHANGELOG.rst)?: ✅ / ❌ / N/A + +### Additional Information diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3ace50ada39..eaffe813a51 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,36 +39,41 @@ To run the pre-commit hooks without committing, use: pre-commit run --all-files ``` -## 📝 Writing tests +## 🔒 Security coding practices -We use [pytest](https://docs.pytest.org/) for all tests. The tests are organized into the following directories: +All contributors must follow the security coding practices documented in the *Security Coding Practices for +Contributors* section of [SECURITY.md](./SECURITY.md#security-coding-practices-for-contributors). -- `tests/unit`: Fast cpu-based unit tests for the core ModelOpt library. They should not take more than a few seconds to run. -- `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run. -- `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details. +Any security-sensitive exception requires review and approval from `@NVIDIA/modelopt-setup-codeowners`. -Please refer to [tox.ini](./tox.ini) for more details on how to run the tests and their dependencies. +## 📋 Copying code from other sources -### Code Coverage +The utilization of third-party code requires authorization via the Open Source Review Board (OSRB) team and needs to follow proper guidance on contributing code. 
-For any new features / examples, make sure to they are covered by the tests and that the Codecov coverage check in your PR passes. +If you are an external contributor, seek guidance from `@NVIDIA/modelopt-setup-codeowners` for next steps. For internal contributors, follow the steps below: -## Submitting your code +- **File NVBug for use of open-source code:** + Clone NVBug 2885977 and add your use case. Copying code from permissive licensed repositories (e.g. MIT, Apache 2) is generally self-checkout but for other licenses, it is necessary to get expert guidance before merging your PR. +- **License header format:** The file which has code copied from another third-party GitHub repository should have the following in order: + 1. A reference link (with commit hash) to the source from which the code was copied. + 1. The original repository's Copyright / License. + 1. The NVIDIA Apache 2.0 Copyright / License header. -- If you are an external contributor, create a fork of the repository. -- Rebase (not merge) your code to the most recent commit of the `main` branch. We want to ensure a linear history; - see [Merge vs Rebase](https://www.atlassian.com/git/tutorials/merging-vs-rebasing). Remember to test again locally after rebasing to catch any new issues before pushing to your PR. + See [`modelopt/torch/speculative/eagle/utils.py`](./modelopt/torch/speculative/eagle/utils.py) + for an example of the correct license header format. +- **Exclude from license pre-commit hook:** Exclude copied files from the license pre-commit hook so it doesn't auto-add the NVIDIA Apache 2.0 license on top of the file. Add the file path to the `exclude` list in the `insert-license` hook in [`.pre-commit-config.yaml`](./.pre-commit-config.yaml). -```bash -git pull -git rebase origin/main -git push origin --force-with-lease -``` +## 📝 Writing tests -- When pushing the rebased (or any) branch, use `git push --force-with-lease` instead of `git push --force`. 
-- Submit a pull request and let auto-assigned reviewers (based on [CODEOWNERS](./.github/CODEOWNERS)) review your PR. -- If any CI/CD checks fail, fix the issues and push again. -- Once your PR is approved and all checks pass, one of the reviewers will merge the PR. +We use [pytest](https://docs.pytest.org/) for all tests. For any new features / examples, make sure to add tests and verify that the coverage check in your PR passes. The tests are organized into the following directories: + +- `tests/unit`: Fast CPU-based unit tests for the core ModelOpt library. They should not take more than a few seconds to run. +- `tests/gpu`: Fast GPU-based unit tests for the core ModelOpt library. In most cases, they should not take more than a few seconds to run. +- `tests/gpu_megatron`: Fast GPU-based unit tests for the core ModelOpt library for Megatron-Core features. In most cases, they should not take more than a few seconds to run. +- `tests/gpu_trtllm`: Fast GPU-based unit tests for the core ModelOpt library for TensorRT-LLM features. In most cases, they should not take more than a few seconds to run. +- `tests/examples`: Integration tests for ModelOpt examples. They should not take more than a few minutes to run. Please refer to [example test README](./tests/examples/README.md) for more details. + +Please refer to [tox.ini](./tox.ini) for more details on how to run the tests and their dependencies. ## ✍️ Signing your work @@ -135,3 +140,9 @@ git push origin --force-with-lease (d) I understand and agree that this project and the contribution are public and that a record of the contribution (including all personal information I submit with it, including my sign-off) is maintained indefinitely and may be redistributed consistent with this project or the open source license(s) involved. ``` + +## Submitting your code + +- Submit a pull request and let auto-assigned reviewers (based on [CODEOWNERS](./.github/CODEOWNERS)) review your PR. 
+- If any CI/CD checks fail, fix the issues and push again. +- Once your PR is approved and all checks pass, one of the reviewers will merge the PR. diff --git a/SECURITY.md b/SECURITY.md index bba6893c5e5..503e6e2b0f6 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -22,4 +22,150 @@ While NVIDIA currently does not have a bug bounty program, we do offer acknowled ## NVIDIA Product Security -For all security-related concerns, please visit NVIDIA's [Product Security portal](https://www.nvidia.com/en-us/security) +For all security-related concerns, please visit NVIDIA's [Product Security portal](https://www.nvidia.com/en-us/security). + +--- + +## Security Considerations + +### Overview + +NVIDIA Model Optimizer (ModelOpt) is a library used to optimize ML models and may load and process user-provided artifacts (models, weights, configs, calibration data) and their dependencies. Secure deployment depends on how you source artifacts, validate inputs, and harden the environment where ModelOpt runs. + +### What to Be Aware Of + +#### Untrusted model and data inputs + +- Models, weights, configs and data may be malicious or corrupted. + +#### Deserialization and code-execution risks + +- Unsafe deserialization can lead to arbitrary code execution if fed untrusted inputs. +- Avoid using serialization formats/settings that can deserialize arbitrary objects. + +#### Input validation and resource exhaustion + +- Large or malformed inputs can trigger crashes or excessive CPU/GPU/memory use. +- Missing size/type checks can increase DoS risk. + +#### Data in transit and at rest + +- If fetching models or dependencies over the network, insecure transport can enable tampering. +- Stored artifacts, logs, and caches may contain sensitive data. + +#### Logging and observability + +- Logs may inadvertently contain sensitive inputs, paths, tokens, or proprietary model details. +- Overly verbose logs can leak operational and security-relevant information. 
+ +#### Supply chain and third-party components + +- Dependencies may include known vulnerabilities or be compromised. +- Third-party plugins/components loaded at runtime may not have the same security assurances. + +### Example Security Approaches + +#### Artifact integrity + +- Only load artifacts from trusted sources. +- Prefer signed artifacts; verify signatures before loading. + +#### Safe parsing and deserialization + +- Prefer safer storage formats (avoid object deserialization for untrusted inputs). +- Avoid `pickle`, `torch.load()` with untrusted weights, or YAML `unsafe_load`. +- Treat any unverified artifact as untrusted and block/guard its loading. + +#### Hardening and least privilege + +- Run with least privilege and isolate workloads. + +#### Data protection + +- Encrypt sensitive data at rest; use TLS 1.3 for data in transit. +- Never hardcode or log credentials. + +#### Resilience + +- Validate inputs and enforce limits (file size, timeouts, quotas, etc.). +- Keep OS, containers, and dependencies patched; scan for known vulnerabilities. + +--- + +## Security Coding Practices for Contributors + +ModelOpt processes model checkpoints and weights from various sources. Contributors must avoid patterns that can introduce security vulnerabilities. These rules apply to all code except tests. These rules cover a few key security considerations as follows: + +### Deserializing untrusted data + +**Do not use `torch.load(..., weights_only=False)`** unless a documented exception is provided. It uses pickle under the hood and can execute arbitrary code from a malicious checkpoint. + +```python +# Bad — allows arbitrary code execution from the checkpoint file +state = torch.load(path, weights_only=False) + +# Good +state = torch.load(path, weights_only=True, map_location="cpu") + +# Acceptable only with an inline comment explaining why weights_only=False +# is required and confirming the file is internally-generated / trusted. 
+state = torch.load( + path, + weights_only=False, # loaded file is generated internally by ModelOpt and not supplied by the user + map_location="cpu", +) +``` + +**Do not use `numpy.load(..., allow_pickle=True)`** unless a documented exception is provided. It uses pickle under the hood and can execute arbitrary code from a malicious file. + +```python +# Bad — allows arbitrary code execution from the loaded file +state = numpy.load(path, allow_pickle=True) + +# Good — let the caller decide; default to False +def load_data(path: str, trust_data: bool = False): + return numpy.load(path, allow_pickle=trust_data) +``` + +**Do not use `yaml.load()`** — always use `yaml.safe_load()`. Unsafe loaders (e.g. `yaml.Loader`, `yaml.UnsafeLoader`) can construct arbitrary Python objects embedded in YAML, which can lead to arbitrary code execution. + +### Loading transformers models with `trust_remote_code` + +**Do not hardcode `trust_remote_code=True`.** This flag tells Transformers to execute arbitrary Python shipped with a checkpoint, which is an RCE vector if the model source is untrusted. + +```python +# Bad — silently opts every user into remote code execution +model = AutoModel.from_pretrained(name, trust_remote_code=True) + +# Good — let the caller decide; default to False +def load_model(name: str, trust_remote_code: bool = False): + return AutoModel.from_pretrained(name, trust_remote_code=trust_remote_code) +``` + +### Subprocess and shell commands + +**Never use `shell=True` with string interpolation or user-supplied input.** This is a command-injection vector. + +```python +# Bad — command injection if model_name contains shell metacharacters +subprocess.run(f"python convert.py --model {model_name}", shell=True) + +# Good — pass arguments as a list +subprocess.run(["python", "convert.py", "--model", model_name]) +``` + +### Other patterns to avoid + +- **`eval()` / `exec()`** on strings derived from external input. If you must generate and execute code dynamically, validate the input against an allowlist of safe patterns. 
+- **Hardcoded secrets or credentials** — never commit tokens, passwords, or API keys. Use environment variables or config files listed in `.gitignore`. + +### Bandit security checks + +Bandit is used as a pre-commit hook to check for security-sensitive patterns in the code. **`# nosec` comments are not allowed** as a bypass for security checks. + +### Creating a security exception + +If a security-sensitive pattern (e.g. `pickle`, `subprocess`) is genuinely required, the contributor must: + +1. **Add an inline comment** explaining *why* the pattern is necessary and *why* it is safe in this specific context (e.g. "loaded file is generated internally by ModelOpt"). +1. **Request review from [@NVIDIA/modelopt-setup-codeowners](https://github.com/orgs/NVIDIA/teams/modelopt-setup-codeowners)** and include a clear justification in the PR description. diff --git a/docs/source/guides/2_save_load.rst b/docs/source/guides/2_save_load.rst index e097e3f8067..9deb82f8db2 100644 --- a/docs/source/guides/2_save_load.rst +++ b/docs/source/guides/2_save_load.rst @@ -129,9 +129,7 @@ Here is the example workflow of restoring the ModelOpt-modified model architectu model = ... 
# Restore the model architecture using the saved `modelopt_state` - # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input - modelopt_state = torch.load("modelopt_state.pth", weights_only=False) - model = mto.restore_from_modelopt_state(model, modelopt_state) + model = mto.restore_from_modelopt_state(model, modelopt_state_path="modelopt_state.pth") # Load the model weights separately after restoring the model architecture custom_method_to_load_model_weights(model) diff --git a/docs/source/reference/2_security.rst b/docs/source/reference/2_security.rst deleted file mode 100644 index 5a6e37af0ec..00000000000 --- a/docs/source/reference/2_security.rst +++ /dev/null @@ -1,78 +0,0 @@ -Security Considerations -======================= - -Overview --------- - -NVIDIA Model Optimizer (ModelOpt) is a library used to optimize ML models and -may load and process user-provided artifacts (models, weights, configs, -calibration data) and their dependencies. Secure deployment depends on how you -source artifacts, validate inputs, and harden the environment where ModelOpt -runs. - -What to Be Aware Of -------------------- - -**Untrusted model and data inputs** - -- Models, weights, configs and data may be malicious or corrupted. - -**Deserialization and code-execution risks** - -- Unsafe deserialization can lead to arbitrary code execution if fed untrusted - inputs. -- Avoid using serialization formats/settings that can deserialize arbitrary - objects. - -**Input validation and resource exhaustion** - -- Large or malformed inputs can trigger crashes or excessive CPU/GPU/memory use. -- Missing size/type checks can increase DoS risk. - -**Data in transit and at rest** - -- If fetching models or dependencies over the network, insecure transport can - enable tampering. -- Stored artifacts, logs, and caches may contain sensitive data. 
- -**Logging and observability** - -- Logs may inadvertently contain sensitive inputs, paths, tokens, or proprietary - model details. -- Overly verbose logs can leak operational and security-relevant information. - -**Supply chain and third-party components** - -- Dependencies may include known vulnerabilities or be compromised. -- Third-party plugins/components loaded at runtime may not have the same - security assurances. - -Example Security Approaches ---------------------------- - -**Artifact integrity** - -- Only load artifacts from trusted sources. -- Prefer signed artifacts; verify signatures before loading. - -**Safe parsing and deserialization** - -- Prefer safer storage formats (avoid object deserialization for untrusted - inputs). -- Avoid ``pickle``, ``torch.load()`` with untrusted weights, or YAML - ``unsafe_load``. -- Treat any unverified artifact as untrusted and block/guard its loading. - -**Hardening and least privilege** - -- Run with least privilege and isolate workloads. - -**Data protection** - -- Encrypt sensitive data at rest; use TLS 1.3 for data in transit. -- Never hardcode or log credentials. - -**Resilience** - -- Validate inputs and enforce limits (file size, timeouts, quotas,..). -- Keep OS, containers, and dependencies patched; scan for known vulnerabilities. diff --git a/examples/diffusers/distillation/distillation_trainer.py b/examples/diffusers/distillation/distillation_trainer.py index d98278b9afb..9cd5c0d142f 100644 --- a/examples/diffusers/distillation/distillation_trainer.py +++ b/examples/diffusers/distillation/distillation_trainer.py @@ -591,10 +591,9 @@ def _apply_modelopt_quantization(self) -> None: f"Resuming: restoring quantization architecture from " f"{modelopt_state_path} (weights loaded later by accelerator)" ) - # Security NOTE: weights_only=False is used on ModelOpt-generated state, - # not on untrusted user input. 
- state = torch.load(modelopt_state_path, weights_only=False, map_location="cpu") - self._transformer = mto.restore_from_modelopt_state(self._transformer, state) + self._transformer = mto.restore_from_modelopt_state( + self._transformer, modelopt_state_path=modelopt_state_path + ) logger.info("Quantization architecture restored for resume") return else: diff --git a/examples/diffusers/requirements.txt b/examples/diffusers/requirements.txt index 3cdac70f8ca..b762ec314ee 100644 --- a/examples/diffusers/requirements.txt +++ b/examples/diffusers/requirements.txt @@ -1,4 +1,3 @@ cuda-python<13 nvtx opencv-python>=4.8.1.78,<4.12.0.88 -sentencepiece diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 62f1435f9b5..372fdbcc494 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -20,6 +20,7 @@ Performing finetuning with Quantization Aware Training solves these issues. The Install the necessary dependencies: ```bash +pip install -U nvidia-modelopt[hf] pip install -r requirements.txt ``` diff --git a/examples/gpt-oss/requirements.txt b/examples/gpt-oss/requirements.txt index 4d75b59c373..368097d3376 100644 --- a/examples/gpt-oss/requirements.txt +++ b/examples/gpt-oss/requirements.txt @@ -1,8 +1,4 @@ -accelerate -datasets -deepspeed kernels>=0.9.0 -peft>=0.17.0 torch>2.7.1 trackio transformers>=4.55.0 diff --git a/examples/llm_qat/README.md b/examples/llm_qat/README.md index c8d0a542afe..cd4b103f30c 100644 --- a/examples/llm_qat/README.md +++ b/examples/llm_qat/README.md @@ -81,7 +81,7 @@ torch.save(mto.modelopt_state(model), "modelopt_quantizer_states.pt") # To resume training from a checkpoint or load the final QAT model for evaluation, # load the quantizer states before loading the model weights -# mto.restore_from_modelopt_state(model, torch.load("modelopt_quantizer_states.pt", weights_only=False)) +# mto.restore_from_modelopt_state(model, modelopt_state_path="modelopt_quantizer_states.pt") # After loading the quantizer 
states, load the model weights # model.load_state_dict(state_dict_from_last_checkpoint) diff --git a/examples/llm_qat/export.py b/examples/llm_qat/export.py index 1c9e6f4b11d..f48e85c3ee4 100644 --- a/examples/llm_qat/export.py +++ b/examples/llm_qat/export.py @@ -18,7 +18,6 @@ import warnings from pathlib import Path -import torch from transformers import AutoModelForCausalLM, AutoTokenizer import modelopt.torch.opt as mto @@ -51,8 +50,7 @@ def get_model( # Restore modelopt state for LoRA models. For QAT/QAD models from_pretrained call handles this if hasattr(model, "peft_config"): - # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input - modelopt_state = torch.load(f"{ckpt_path}/modelopt_state_train.pth", weights_only=False) + modelopt_state = mto.load_modelopt_state(f"{ckpt_path}/modelopt_state_train.pth") restore_from_modelopt_state(model, modelopt_state) print_rank_0("Restored modelopt state") diff --git a/examples/llm_qat/requirements.txt b/examples/llm_qat/requirements.txt index 8d44913bd60..b8da4e088f5 100644 --- a/examples/llm_qat/requirements.txt +++ b/examples/llm_qat/requirements.txt @@ -1,5 +1,3 @@ flash-attn -peft py7zr -sentencepiece>=0.2.0 tensorboardX diff --git a/examples/llm_sparsity/weight_sparsity/README.md b/examples/llm_sparsity/weight_sparsity/README.md index ca4df236ffa..97563aff007 100644 --- a/examples/llm_sparsity/weight_sparsity/README.md +++ b/examples/llm_sparsity/weight_sparsity/README.md @@ -4,6 +4,17 @@ In this tutorial, we demonstrate how to use Nvidia Model Optimizer to perform Po To learn more about the sparsity feature, please refer to the [documentation](https://nvidia.github.io/Model-Optimizer/guides/6_sparsity.html). 
+## Pre-Requisites + +### Installation + +Install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example: + +```bash +pip install -U nvidia-modelopt[hf] +pip install -r requirements.txt +``` + ## Getting Started ### Post-Training Sparsification (PTS) for PyTorch models diff --git a/examples/llm_sparsity/weight_sparsity/requirements.txt b/examples/llm_sparsity/weight_sparsity/requirements.txt index e4d43ea0e70..e4021b0194f 100644 --- a/examples/llm_sparsity/weight_sparsity/requirements.txt +++ b/examples/llm_sparsity/weight_sparsity/requirements.txt @@ -1,3 +1,2 @@ flash-attn -sentencepiece>=0.2.0 tensorboardX diff --git a/examples/onnx_ptq/requirements.txt b/examples/onnx_ptq/requirements.txt index 01f7f6dd0ec..166c7675700 100644 --- a/examples/onnx_ptq/requirements.txt +++ b/examples/onnx_ptq/requirements.txt @@ -1,5 +1,5 @@ datasets>=2.14.4 optimum -sentencepiece +sentencepiece>=0.2.1 timm torchvision diff --git a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt index 73bb392b00a..4bdac071cf6 100644 --- a/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt +++ b/examples/windows/accuracy_benchmark/perplexity_metrics/requirements.txt @@ -5,8 +5,7 @@ datasets numpy onnxruntime-genai pandas -sentencepiece +sentencepiece>=0.2.1 tokenizers>=0.14.1 - -torch>=2.0.0 -transformers>=4.36 +torch>=2.6.0 +transformers>=4.53 diff --git a/modelopt/torch/opt/conversion.py b/modelopt/torch/opt/conversion.py index 874c51b5990..6ec7a172981 100644 --- a/modelopt/torch/opt/conversion.py +++ b/modelopt/torch/opt/conversion.py @@ -51,6 +51,7 @@ __all__ = [ "ModeloptStateManager", "apply_mode", + "load_modelopt_state", "modelopt_state", "restore", "restore_from_modelopt_state", @@ -512,7 +513,29 @@ def save(model: nn.Module, f: str | os.PathLike | BinaryIO, **kwargs) -> None: 
torch.save(ckpt_dict, f, **kwargs) -def restore_from_modelopt_state(model: ModelLike, modelopt_state: dict[str, Any]) -> nn.Module: +def load_modelopt_state(modelopt_state_path: str | os.PathLike, **kwargs) -> dict[str, Any]: + """Load the modelopt state from a file. + + Args: + modelopt_state_path: Target file location. + **kwargs: additional args for ``torch.load()``. + + Returns: + A modelopt state dictionary describing the modifications to the model. + """ + # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input + kwargs.setdefault("weights_only", False) + kwargs.setdefault("map_location", "cpu") + # TODO: Add some validation to ensure the file is a valid modelopt state file. + modelopt_state = torch.load(modelopt_state_path, **kwargs) + return modelopt_state + + +def restore_from_modelopt_state( + model: ModelLike, + modelopt_state: dict[str, Any] | None = None, + modelopt_state_path: str | os.PathLike | None = None, +) -> nn.Module: """Restore the model architecture from the modelopt state dictionary based on the user-provided model. This method does not restore the model parameters such as weights, biases and quantization scales. @@ -526,10 +549,7 @@ def restore_from_modelopt_state(model: ModelLike, modelopt_state: dict[str, Any] model = ... # Create the model-like object # Restore the previously saved modelopt state followed by model weights - # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input - mto.restore_from_modelopt_state( - model, torch.load("modelopt_state.pt", weights_only=False) - ) # Restore modelopt state + mto.restore_from_modelopt_state(model, modelopt_state_path="modelopt_state.pt") model.load_state_dict(torch.load("model_weights.pt"), ...) 
# Load the model weights If you want to restore the model weights and the modelopt state with saved scales, please use @@ -543,11 +563,21 @@ def restore_from_modelopt_state(model: ModelLike, modelopt_state: dict[str, Any] modelopt_state: The modelopt state dict describing the modelopt modifications to the model. The ``modelopt_state`` can be generated via :meth:`mto.modelopt_state()`. + Cannot be used with modelopt_state_path. + modelopt_state_path: The path to the modelopt state file. + Cannot be used with modelopt_state. Returns: A modified model architecture based on the restored modifications with the unmodified weights as stored in the provided ``model`` argument. """ + assert (modelopt_state is not None) != (modelopt_state_path is not None), ( + "Either modelopt_state or modelopt_state_path must be provided, but not both." + ) + if modelopt_state_path is not None: + modelopt_state = load_modelopt_state(modelopt_state_path) + assert modelopt_state, "modelopt_state is required!" + # initialize ModelLikeModule if needed. 
model = model if isinstance(model, nn.Module) else ModelLikeModule(model) diff --git a/modelopt/torch/opt/plugins/huggingface.py b/modelopt/torch/opt/plugins/huggingface.py index 99bab772576..8b6396f3e79 100644 --- a/modelopt/torch/opt/plugins/huggingface.py +++ b/modelopt/torch/opt/plugins/huggingface.py @@ -79,10 +79,8 @@ def new_init_fn(self, *args, **kwargs): modelopt_state_path = _get_modelopt_state_path(model_path) _original__init__(self, *args, **kwargs) if os.path.isfile(modelopt_state_path): - # Security NOTE: weights_only=False is used on ModelOpt-generated state_dict, not on untrusted user input - modelopt_state = torch.load(modelopt_state_path, map_location="cpu", weights_only=False) with extra_context() if extra_context else nullcontext(): - restore_from_modelopt_state(self, modelopt_state) + restore_from_modelopt_state(self, modelopt_state_path=modelopt_state_path) print_rank_0(f"Restored ModelOpt state from {modelopt_state_path}") diff --git a/modelopt/torch/opt/plugins/peft.py b/modelopt/torch/opt/plugins/peft.py index c3fd268a588..de1218917f9 100644 --- a/modelopt/torch/opt/plugins/peft.py +++ b/modelopt/torch/opt/plugins/peft.py @@ -72,10 +72,7 @@ def _new_load_adapter(self, model_id, adapter_name, *args, **kwargs): assert adapter_name in self.peft_config, ( f"ModelOpt modified model should have adapter_name={adapter_name} in peft_config" ) - # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input - restore_from_modelopt_state( - self, torch.load(modelopt_state_path, map_location="cpu", weights_only=False) - ) + restore_from_modelopt_state(self, modelopt_state_path=modelopt_state_path) outputs = self._modelopt_cache["load_adapter"](self, model_id, adapter_name, *args, **kwargs) diff --git a/modelopt/torch/quantization/plugins/transformers_trainer.py b/modelopt/torch/quantization/plugins/transformers_trainer.py index b92b240c0da..b0d27865095 100644 --- 
a/modelopt/torch/quantization/plugins/transformers_trainer.py +++ b/modelopt/torch/quantization/plugins/transformers_trainer.py @@ -28,7 +28,6 @@ import modelopt.torch.opt as mto import modelopt.torch.quantization as mtq from modelopt.torch.distill.plugins.huggingface import KDTrainer -from modelopt.torch.opt.conversion import restore_from_modelopt_state from modelopt.torch.opt.plugins import ModelOptHFTrainer from modelopt.torch.utils import print_rank_0 @@ -233,10 +232,9 @@ def _save_modelopt_state_with_weights(self): print_rank_0(f"Saved modelopt state to {self._modelopt_state_path}") def _restore_modelopt_state_with_weights(self): - # Security NOTE: weights_only=False is used here on ModelOpt-generated state_dict, not on untrusted user input - modelopt_state = torch.load(self._modelopt_state_path, weights_only=False) + modelopt_state = mto.load_modelopt_state(self._modelopt_state_path) modelopt_weights = modelopt_state.pop("modelopt_state_weights", None) - restore_from_modelopt_state(self.model, modelopt_state) + mto.restore_from_modelopt_state(self.model, modelopt_state) if modelopt_weights is not None: set_quantizer_state_dict(self.model, modelopt_weights) print_rank_0("Restored modelopt state with weights.") diff --git a/pyproject.toml b/pyproject.toml index 61319f8cf42..7cee60d309f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ hf = [ "huggingface_hub>=0.24.0", "nltk", "peft>=0.17.0", + "sentencepiece>=0.2.1", # Also implicitly used in test_unified_export_megatron, test_vllm_fakequant_megatron_export "transformers>=4.53,<5.0", # Should match modelopt/torch/__init__.py and tox.ini "wonderwords", ] @@ -98,7 +99,6 @@ dev-test = [ "pytest-cov", "pytest-instafail", "pytest-timeout", - "sentencepiece", "timm", "torchprofile>=0.0.4", # optional dependency for modelopt.torch "torchvision",