Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 59 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
Overview
---

Nemo AutoModel is a Pytorch DTensor‑native SPMD open-source training library under [NVIDIA NeMo Framework](https://github.com/NVIDIA-NeMo), designed to streamline and scale training and finetuning for LLMs and VLMs. Designed for flexibility, reproducibility, and scale, NeMo AutoModel enables both small-scale experiments and massive multi-GPU, multi-node deployments for fast experimentation in research and production environments.
NeMo AutoModel is a PyTorch DTensor‑native SPMD open-source training library under [NVIDIA NeMo Framework](https://github.com/NVIDIA-NeMo), designed to streamline and scale training and finetuning for LLMs, VLMs, and ASR models. Built for flexibility, reproducibility, and scale, NeMo AutoModel enables both small-scale experiments and massive multi-GPU, multi-node deployments for fast experimentation in research and production environments.
<p align="center">
<a href="https://github.com/NVIDIA-NeMo/Automodel"><picture>
<source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/NVIDIA-NeMo/Automodel/refs/heads/main/docs/automodel_diagram.png">
Expand Down Expand Up @@ -106,6 +106,9 @@ What you can expect:
- [VLM](#vlm-supervised-fine-tuning-sft)
- [Supervised Fine-Tuning (SFT)](#vlm-supervised-fine-tuning-sft)
- [Parameter-Efficient Fine-Tuning (PEFT)](#vlm-parameter-efficient-fine-tuning-peft)
- [ASR](#asr-fine-tuning)
- [Fine-Tuning](#asr-fine-tuning)
- [Parameter-Efficient Fine-Tuning (PEFT)](#asr-parameter-efficient-fine-tuning-peft)
- [Supported Models](#supported-models)
- [Performance](#performance)
- [Interoperability](#-interoperability)
Expand All @@ -130,6 +133,7 @@ What you can expect:
- ✅ **FP8 and mixed precision** - FP8 support with torchao, requires torch.compile-supported models.
- ✅ **DCP** - Distributed Checkpoint support with SafeTensors output.
- ✅ **VLM**: Support for finetuning VLMs (e.g., Qwen2-VL, Gemma-3-VL). More families to be included in the future.
- ✅ **ASR**: Support for finetuning ASR models (e.g., Whisper) with multimodal audio-text processing.
- ✅ **Extended MoE support** - GPT-OSS, Qwen3 (Coder-480B-A35B, etc), Qwen-next.

- 🔜 **Transformers v5 🤗** - Support for transformers v5 🤗 with device-mesh driven parallelism.
Expand Down Expand Up @@ -174,6 +178,9 @@ automodel examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag.yaml --nproc-per-
# VLM example: single-GPU fine-tuning (Gemma-3-VL) with LoRA
automodel examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2_peft.yaml

# ASR example: Whisper fine-tuning on LibriSpeech
automodel examples/asr_finetune/whisper/whisper_small_librispeech.yaml

# Both commands also work with uv run:
uv run automodel examples/llm_finetune/llama3_2/llama3_2_1b_hellaswag.yaml --nproc-per-node 8
```
Expand Down Expand Up @@ -263,6 +270,52 @@ automodel examples/vlm_finetune/gemma3/gemma3_vl_4b_medpix_peft.yaml --nproc-per
```


## ASR Fine-Tuning

NeMo AutoModel supports fine-tuning Automatic Speech Recognition (ASR) models with the same SPMD principles as LLMs and VLMs. ASR models process audio inputs and generate text transcriptions, supporting multilingual speech recognition and translation tasks.

### ASR Single GPU
```bash
# Fine-tune Whisper Small on LibriSpeech (1 GPU)
uv run examples/asr_finetune/finetune.py \
--config examples/asr_finetune/whisper/whisper_small_librispeech.yaml
```

### ASR Multi-GPU
```bash
# Fine-tune Whisper Medium on LibriSpeech (8 GPUs with TP=2)
uv run torchrun --nproc-per-node=8 \
examples/asr_finetune/finetune.py \
--config examples/asr_finetune/whisper/whisper_medium_librispeech.yaml
```

**Supported ASR Models:**
- **Parakeet CTC** (NVIDIA): Fast CTC-based speech recognition with LoRA support
- Models: parakeet-ctc-0.6b, parakeet-ctc-1.1b
- **Whisper** (OpenAI): Multilingual speech recognition and translation (99 languages) with LoRA support
- Models: whisper-tiny, small, medium, large-v3
- **Datasets**: LibriSpeech (readily available), Common Voice (via Mozilla Data Collective), custom audio datasets

See [ASR Fine-tuning Guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/README.md) for more details, dataset information, and advanced configurations.


## ASR Parameter-Efficient Fine-Tuning (PEFT)

```bash
# Whisper Small with LoRA (memory-efficient)
uv run examples/asr_finetune/finetune.py \
--config examples/asr_finetune/whisper/whisper_small_librispeech_peft.yaml

# Parakeet CTC with LoRA
uv run examples/asr_finetune/finetune.py \
--config examples/asr_finetune/parakeet/parakeet_ctc_0.6b_librispeech_peft.yaml
```

**Benefits**: 40-60% memory reduction, 10-30x smaller checkpoints, faster training with higher learning rates.

See [ASR Fine-tuning Guide](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/README.md#parameter-efficient-fine-tuning-peft) for details.


## Supported Models
NeMo AutoModel provides native support for a wide range of models available on the Hugging Face Hub, enabling efficient fine-tuning for various domains. Below is a small sample of ready-to-use families (train as-is or swap any compatible 🤗 causal LM); you can specify nearly any LLM/VLM model available on the 🤗 Hub:

Expand Down Expand Up @@ -293,9 +346,13 @@ NeMo AutoModel provides native support for a wide range of models available on t
| **LLM** | **Baichuan** | [`baichuan-inc/Baichuan2-7B-Chat`](https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/baichuan/baichuan_2_7b_squad.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/baichuan/baichuan_2_7b_squad_peft.yaml), [FP8](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune/baichuan/baichuan_2_7b_mock_fp8.yaml) |
| **VLM** | **Gemma** | [`google/gemma-3-4b-it`](https://huggingface.co/google/gemma-3-4b-it) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3/gemma3_vl_4b_cord_v2_peft.yaml) |
| | | [`google/gemma-3n-e4b-it`](https://huggingface.co/google/gemma-3n-e4b-it) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune/gemma3n/gemma3n_vl_4b_medpix_peft.yaml) |
| **ASR** | **Parakeet** | [`nvidia/parakeet-ctc-0.6b`](https://huggingface.co/nvidia/parakeet-ctc-0.6b) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/parakeet/parakeet_ctc_0.6b_librispeech.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/parakeet/parakeet_ctc_0.6b_librispeech_peft.yaml) |
| | | [`nvidia/parakeet-ctc-1.1b`](https://huggingface.co/nvidia/parakeet-ctc-1.1b) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/parakeet/parakeet_ctc_1.1b_librispeech.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/parakeet/parakeet_ctc_1.1b_librispeech_peft.yaml) |
| **ASR** | **Whisper** | [`openai/whisper-small`](https://huggingface.co/openai/whisper-small) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/whisper/whisper_small_librispeech.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/whisper/whisper_small_librispeech_peft.yaml) |
| | | [`openai/whisper-medium`](https://huggingface.co/openai/whisper-medium) | [SFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/whisper/whisper_medium_librispeech.yaml), [PEFT](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune/whisper/whisper_medium_librispeech_peft.yaml) |

> [!NOTE]
> Check out more [LLM](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune) and [VLM](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune) examples. Any causal LM on Hugging Face Hub can be used with the base recipe template, just overwrite `--model.pretrained_model_name_or_path <model-id>` in the CLI or in the YAML config.
> Check out more [LLM](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/llm_finetune), [VLM](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/vlm_finetune), and [ASR](https://github.com/NVIDIA-NeMo/Automodel/blob/main/examples/asr_finetune) examples. Any compatible model on Hugging Face Hub can be used with the base recipe template; just overwrite `--model.pretrained_model_name_or_path <model-id>` in the CLI or in the YAML config.


## Performance
Expand Down
12 changes: 8 additions & 4 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python-is-python3 \
curl \
git \
libopenmpi-dev && \
libopenmpi-dev \
ffmpeg \
libavcodec-dev \
libavformat-dev \
libavutil-dev && \
rm -rf /var/lib/apt/lists/*

FROM ${PYTORCH_IMAGE} AS pytorch
Expand Down Expand Up @@ -69,8 +73,8 @@ RUN if [ "$INSTALL_TE" = "True" ]; then \
git fetch origin $TE_COMMIT && \
git checkout FETCH_HEAD && \
git submodule init && git submodule update && \
pip install nvidia-mathdx==25.1.1 && \
env NVTE_CUDA_ARCHS="80;90;100;120" NVTE_BUILD_THREADS_PER_JOB=8 pip install --no-cache-dir --no-build-isolation -v . && \
uv pip install nvidia-mathdx==25.1.1 && \
env NVTE_CUDA_ARCHS="80;90;100;120" NVTE_BUILD_THREADS_PER_JOB=8 uv pip install --no-cache-dir --no-build-isolation -v . && \
cd ../ && rm -rf TransformerEngine; \
fi

Expand Down Expand Up @@ -124,7 +128,7 @@ RUN if [ "$INSTALL_UCCL_EP" = "True" ]; then \
fi

# Address base image CVE
RUN pip install "aiohttp>=3.13.3" \
RUN uv pip install "aiohttp>=3.13.3" \
"jaraco-context>=6.1.0" \
"nbconvert>=7.17.0" \
"pillow>=12.1.1" \
Expand Down
10 changes: 9 additions & 1 deletion docker/common/uv-pytorch.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3250,6 +3250,7 @@ all = [
{ name = "imageio" },
{ name = "imageio-ffmpeg" },
{ name = "kernels" },
{ name = "librosa" },
{ name = "mamba-ssm" },
{ name = "mistral-common", extra = ["opencv"] },
{ name = "numba", version = "0.53.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
Expand All @@ -3270,6 +3271,10 @@ all = [
{ name = "torchvision", marker = "sys_platform == 'never'" },
{ name = "transformer-engine", marker = "sys_platform == 'never'" },
]
asr = [
{ name = "librosa" },
{ name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin'" },
]
cli = [
{ name = "pyyaml" },
]
Expand Down Expand Up @@ -3382,11 +3387,13 @@ requires-dist = [
{ name = "imageio", marker = "extra == 'diffusion'" },
{ name = "imageio-ffmpeg", marker = "extra == 'diffusion'" },
{ name = "kernels", marker = "extra == 'diffusion'" },
{ name = "librosa", marker = "extra == 'asr'" },
{ name = "mamba-ssm", marker = "extra == 'cuda'" },
{ name = "megatron-fsdp", specifier = ">=0.2.3" },
{ name = "mistral-common", extras = ["audio", "hf-hub", "image", "sentencepiece"] },
{ name = "mistral-common", extras = ["opencv"], marker = "extra == 'vlm'", specifier = ">=1.9.0" },
{ name = "mlflow" },
{ name = "nemo-automodel", extras = ["asr"], marker = "extra == 'all'" },
{ name = "nemo-automodel", extras = ["cuda"], marker = "extra == 'all'" },
{ name = "nemo-automodel", extras = ["cuda"], marker = "extra == 'moe'" },
{ name = "nemo-automodel", extras = ["delta-databricks"], marker = "extra == 'all'" },
Expand All @@ -3413,6 +3420,7 @@ requires-dist = [
{ name = "torch", marker = "sys_platform == 'darwin'", specifier = ">=2.6.0,<=2.10.0", index = "https://pypi.org/simple" },
{ name = "torch", marker = "sys_platform == 'linux'", specifier = ">=2.6.0,<=2.10.0", index = "https://download.pytorch.org/whl/cu129" },
{ name = "torchao" },
{ name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'asr'" },
{ name = "torchcodec", marker = "platform_machine == 'x86_64' and sys_platform != 'darwin' and extra == 'vlm'" },
{ name = "torchdata" },
{ name = "torchvision", marker = "sys_platform == 'darwin' and extra == 'diffusion'", index = "https://pypi.org/simple" },
Expand All @@ -3422,7 +3430,7 @@ requires-dist = [
{ name = "transformers", specifier = ">=5.3.0,<5.4.0" },
{ name = "wandb" },
]
provides-extras = ["diffusion", "cuda", "cuda-source", "extra", "fa", "delta-databricks", "moe", "vlm", "cli", "all"]
provides-extras = ["diffusion", "cuda", "cuda-source", "extra", "fa", "delta-databricks", "moe", "vlm", "asr", "cli", "all"]

[package.metadata.requires-dev]
build = [
Expand Down
Loading
Loading