Skip to content

Commit 0baabd5

Browse files
authored
fix: add nvidia-resiliency-ext to default dependencies (#2228)
Signed-off-by: Terry Kong <terryk@nvidia.com>
1 parent c9277a3 commit 0baabd5

2 files changed

Lines changed: 20 additions & 12 deletions

File tree

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,7 @@ dependencies = [
5555
"cuda-bindings", # for non-colocated refit
5656
"pybase64", # for sglang refit
5757
"nvidia-cudnn-cu12==9.19.0.56", # for transformer-engine no build isolation
58+
"nvidia-resiliency-ext", # for ft_launcher (fault-tolerant training launcher)
5859
]
5960

6061
[project.optional-dependencies]
@@ -160,6 +161,7 @@ megatron-core = { workspace = true }
160161
nemo-automodel = { path = "3rdparty/Automodel-workspace/Automodel", editable = true }
161162
megatron-bridge = { path = "3rdparty/Megatron-Bridge-workspace", editable = true }
162163
nemo_gym = { workspace = true }
164+
nvidia-resiliency-ext = { index = "pypi" }
163165
nemo_run = { git = "https://github.com/NVIDIA-NeMo/Run", rev = "414f0077c648fde2c71bb1186e97ccbf96d6844c" }
164166
# torch/torchaudio/torchvision/triton all come from the torch index in order to pick up aarch64 wheels
165167
torch = [

uv.lock

Lines changed: 18 additions & 12 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)