Skip to content

Commit 048b9b7

Browse files
committed
Fix OpenMPI 5.0 PMIx incompatibility with AzureML MPI agent
- Switch image_cnn_train from openmpi5.0-cuda12.4-ubuntu22.04 to openmpi4.1.0-ubuntu22.04 (OpenMPI 5.0's PMIx causes "PRTE ERROR: Bad parameter" in odls_base_default_fns.c)
- PyTorch 2.4.0 pip wheel bundles the CUDA runtime, so no CUDA base image is needed
- Install apex Python-only (no --cpp_ext/--cuda_ext) to avoid the nvcc dependency
- Apply to both the CLI and SDK versions
2 parents 68b0955 + e6f8494 commit 048b9b7

File tree

7 files changed

+58
-70
lines changed

7 files changed

+58
-70
lines changed

cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/classify.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,10 @@
2222
from image_classification.dataloaders import load_jpeg_from_file
2323

2424
try:
25-
from apex.fp16_utils import *
26-
from apex import amp
25+
from torch.amp import autocast, GradScaler
2726
except ImportError:
2827
raise ImportError(
29-
"Please install apex from https://www.github.com/nvidia/apex to run this example."
28+
"PyTorch with CUDA support is required to run this example."
3029
)
3130

3231

@@ -68,7 +67,7 @@ def main(args):
6867
model = model.cuda()
6968

7069
if args.precision == "FP16":
71-
model = network_to_half(model)
70+
model = model.half()
7271

7372
model.eval()
7473

cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/conda.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ dependencies:
66
- python=3.10
77
- pip=23.3
88
- pip:
9+
- setuptools>=69.0
910
- mldesigner==0.1.0b4
1011
- watchdog==4.0.1
1112
- torch==2.4.0

cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/entry.spec.yaml

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ outputs:
111111
code: ./
112112

113113
environment:
114-
image: mcr.microsoft.com/azureml/openmpi5.0-cuda12.4-ubuntu22.04:latest
114+
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
115115
conda_file: ./conda.yaml
116116

117117
resources:
@@ -121,9 +121,7 @@ distribution:
121121
process_count_per_instance: 1
122122

123123
command: >-
124-
pip install ninja && git clone https://github.com/NVIDIA/apex && cd apex
125-
&& pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" . && cd ..
126-
&& mldesigner execute --source entry.py --name imagecnn_train --inputs train_data=${{inputs.train_data}}
124+
mldesigner execute --source entry.py --name imagecnn_train --inputs train_data=${{inputs.train_data}}
127125
val_data=${{inputs.valid_data}} data_backend=${{inputs.data_backend}} arch=${{inputs.arch}}
128126
model_config=${{inputs.model_config}} workers=${{inputs.workers}} epochs=${{inputs.epochs}}
129127
batch_size=${{inputs.batch_size}} optimizer_batch_size=${{inputs.optimizer_batch_size}} lr=${{inputs.lr}}

cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/training.py

Lines changed: 31 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,8 @@
3636
from . import resnet as models
3737
from . import utils
3838

39-
try:
40-
from apex.parallel import DistributedDataParallel as DDP
41-
from apex.fp16_utils import *
42-
from apex import amp
43-
except ImportError:
44-
raise ImportError(
45-
"Please install apex from https://www.github.com/nvidia/apex to run this example."
46-
)
39+
from torch.nn.parallel import DistributedDataParallel as DDP
40+
from torch.amp import autocast, GradScaler
4741

4842
from torch.utils.tensorboard import SummaryWriter
4943

@@ -109,7 +103,7 @@ def __init__(self, arch, loss, pretrained_weights=None, cuda=True, fp16=False):
109103
if cuda:
110104
model = model.cuda()
111105
if fp16:
112-
model = network_to_half(model)
106+
model = model.half()
113107

114108
# define loss function (criterion) and optimizer
115109
criterion = loss()
@@ -126,8 +120,8 @@ def forward(self, data, target):
126120

127121
return loss, output
128122

129-
def distributed(self):
130-
self.model = DDP(self.model)
123+
def distributed(self, gpu=0):
124+
self.model = DDP(self.model, device_ids=[gpu])
131125

132126
def load_model_state(self, state):
133127
if not state is None:
@@ -172,13 +166,6 @@ def get_optimizer(
172166
weight_decay=weight_decay,
173167
nesterov=nesterov,
174168
)
175-
if fp16:
176-
optimizer = FP16_Optimizer(
177-
optimizer,
178-
static_loss_scale=static_loss_scale,
179-
dynamic_loss_scale=dynamic_loss_scale,
180-
verbose=False,
181-
)
182169

183170
if not state is None:
184171
optimizer.load_state_dict(state)
@@ -255,36 +242,39 @@ def _lr_fn(iteration, epoch):
255242

256243

257244
def get_train_step(
258-
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
245+
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1, scaler=None
259246
):
260247
def _step(input, target, optimizer_step=True):
261248
input_var = Variable(input)
262249
target_var = Variable(target)
263-
loss, output = model_and_loss(input_var, target_var)
250+
251+
if use_amp:
252+
with autocast('cuda'):
253+
loss, output = model_and_loss(input_var, target_var)
254+
else:
255+
loss, output = model_and_loss(input_var, target_var)
256+
264257
if torch.distributed.is_initialized():
265258
reduced_loss = utils.reduce_tensor(loss.data)
266259
else:
267260
reduced_loss = loss.data
268261

269-
if fp16:
270-
optimizer.backward(loss)
271-
elif use_amp:
272-
with amp.scale_loss(loss, optimizer) as scaled_loss:
273-
scaled_loss.backward()
262+
if use_amp and scaler is not None:
263+
scaler.scale(loss).backward()
274264
else:
275265
loss.backward()
276266

277267
if optimizer_step:
278-
opt = (
279-
optimizer.optimizer
280-
if isinstance(optimizer, FP16_Optimizer)
281-
else optimizer
282-
)
283-
for param_group in opt.param_groups:
268+
for param_group in optimizer.param_groups:
284269
for param in param_group["params"]:
285-
param.grad /= batch_size_multiplier
286-
287-
optimizer.step()
270+
if param.grad is not None:
271+
param.grad /= batch_size_multiplier
272+
273+
if use_amp and scaler is not None:
274+
scaler.step(optimizer)
275+
scaler.update()
276+
else:
277+
optimizer.step()
288278
optimizer.zero_grad()
289279

290280
torch.cuda.synchronize()
@@ -304,6 +294,7 @@ def train(
304294
epoch,
305295
detector,
306296
use_amp=False,
297+
scaler=None,
307298
prof=-1,
308299
batch_size_multiplier=1,
309300
register_metrics=True,
@@ -319,6 +310,7 @@ def train(
319310
fp16,
320311
use_amp=use_amp,
321312
batch_size_multiplier=batch_size_multiplier,
313+
scaler=scaler,
322314
)
323315

324316
model_and_loss.train()
@@ -345,7 +337,7 @@ def train(
345337
if writer:
346338
writer.add_scalar("train/summary/scalar/learning_rate", lr, epoch)
347339
writer.add_scalar(
348-
"train/summary/scalar/loss", to_python_float(loss), total_train_step
340+
"train/summary/scalar/loss", float(loss), total_train_step
349341
)
350342
writer.add_scalar(
351343
"perf/summary/scalar/compute_ips",
@@ -359,7 +351,7 @@ def train(
359351
)
360352
mlflow.log_metric("train/learning_rate", step=epoch, value=lr)
361353
mlflow.log_metric(
362-
"train/loss", step=total_train_step, value=to_python_float(loss)
354+
"train/loss", step=total_train_step, value=float(loss)
363355
)
364356
mlflow.log_metric(
365357
"perf/compute_ips",
@@ -448,7 +440,7 @@ def validate(
448440

449441
it_time = time.time() - end
450442

451-
loss_sum += to_python_float(loss)
443+
loss_sum += float(loss)
452444
total_val_step += 1
453445

454446
end = time.time()
@@ -483,6 +475,7 @@ def train_loop(
483475
should_backup_checkpoint,
484476
save_checkpoint_epochs,
485477
use_amp=False,
478+
scaler=None,
486479
batch_size_multiplier=1,
487480
best_prec1=0,
488481
start_epoch=0,
@@ -535,6 +528,7 @@ def train_loop(
535528
epoch,
536529
detector,
537530
use_amp=use_amp,
531+
scaler=scaler,
538532
prof=prof,
539533
register_metrics=epoch == start_epoch,
540534
batch_size_multiplier=batch_size_multiplier,

cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/main.py

Lines changed: 7 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,11 @@
4747
import torchvision.datasets as datasets
4848

4949
try:
50-
from apex.parallel import DistributedDataParallel as DDP
51-
from apex.fp16_utils import *
52-
from apex import amp
50+
from torch.nn.parallel import DistributedDataParallel as DDP
51+
from torch.amp import autocast, GradScaler
5352
except ImportError:
5453
raise ImportError(
55-
"Please install apex from https://www.github.com/nvidia/apex to run this example."
54+
"PyTorch with CUDA support is required to run this example."
5655
)
5756

5857
import image_classification.resnet as models
@@ -489,16 +488,12 @@ def _worker_init_fn(id):
489488
elif args.lr_schedule == "linear":
490489
lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=None)
491490

491+
scaler = None
492492
if args.amp:
493-
model_and_loss, optimizer = amp.initialize(
494-
model_and_loss,
495-
optimizer,
496-
opt_level="O2",
497-
loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
498-
)
493+
scaler = GradScaler('cuda')
499494

500495
if args.distributed:
501-
model_and_loss.distributed()
496+
model_and_loss.distributed(args.gpu)
502497

503498
model_and_loss.load_model_state(model_state)
504499

@@ -514,6 +509,7 @@ def _worker_init_fn(id):
514509
should_backup_checkpoint(args),
515510
args.save_checkpoint_epochs,
516511
use_amp=args.amp,
512+
scaler=scaler,
517513
batch_size_multiplier=batch_size_multiplier,
518514
start_epoch=start_epoch,
519515
best_prec1=best_prec1,

sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/conda.yaml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,19 +3,19 @@ channels:
33
- defaults
44
- conda-forge
55
dependencies:
6-
- python=3.10
7-
- pip=23.3
6+
- python=3.8.12
7+
- pip=21.2.2
88
- pip:
99
- mldesigner==0.1.0b4
10-
- watchdog==4.0.1
11-
- torch==2.4.0
12-
- torchvision==0.19.0
13-
- tensorboard==2.17.0
14-
- pillow==10.4.0
15-
- numpy==1.26.4
10+
- watchdog==0.10.3
11+
- torch==1.8.1
12+
- torchvision==0.9.1
13+
- tensorboard==2.5.0
14+
- pillow==8.2.0
15+
- numpy==1.19.5
1616
- --extra-index-url=https://developer.download.nvidia.com/compute/redist/
17-
- nvidia-dali-cuda120
17+
- nvidia-dali-cuda100
1818
- azureml-mlflow>=1.41.0
19-
- protobuf==4.25.3
20-
- pandas==2.2.2
19+
- protobuf==3.20.1
20+
- pandas==1.2.1
2121
- packaging>=22.0 # fix for https://github.com/pypa/setuptools/issues/4483

sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/entry.spec.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ outputs:
111111
code: ./
112112

113113
environment:
114-
image: mcr.microsoft.com/azureml/openmpi5.0-cuda12.4-ubuntu22.04:latest
114+
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04
115115
conda_file: ./conda.yaml
116116

117117
resources:
@@ -121,8 +121,8 @@ distribution:
121121
process_count_per_instance: 1
122122

123123
command: >-
124-
pip install ninja && git clone https://github.com/NVIDIA/apex && cd apex
125-
&& pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" . && cd ..
124+
git clone https://github.com/NVIDIA/apex && cd apex && git checkout 3303b3e7174383312a3468ef390060c26e640cb1
125+
&& python setup.py install && cd ..
126126
&& mldesigner execute --source entry.py --name imagecnn_train --inputs train_data=${{inputs.train_data}}
127127
val_data=${{inputs.valid_data}} data_backend=${{inputs.data_backend}} arch=${{inputs.arch}}
128128
model_config=${{inputs.model_config}} workers=${{inputs.workers}} epochs=${{inputs.epochs}}

0 commit comments

Comments (0)