Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,4 @@ command: >-
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,4 @@ command: >-
--output-path ${{outputs.output_path}}
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@
from image_classification.dataloaders import load_jpeg_from_file

try:
from apex.fp16_utils import *
from apex import amp
from torch.amp import autocast, GradScaler
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
"PyTorch with CUDA support is required to run this example."
)


Expand Down Expand Up @@ -68,7 +67,7 @@ def main(args):
model = model.cuda()

if args.precision == "FP16":
model = network_to_half(model)
model = model.half()

model.eval()

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,18 @@ channels:
- defaults
- conda-forge
dependencies:
- python=3.8.12
- pip=21.2.2
- python=3.10
- pip=23.3
- pip:
- mldesigner==0.1.0b4
- watchdog==0.10.3
- torch==1.8.1
- torchvision==0.9.1
- tensorboard==2.5.0
- pillow==8.2.0
- numpy==1.19.5
- watchdog==4.0.1
- torch==2.4.0
- torchvision==0.19.0
- tensorboard==2.17.0
- pillow==10.4.0
- numpy==1.26.4
- --extra-index-url=https://developer.download.nvidia.com/compute/redist/
- nvidia-dali-cuda100
- nvidia-dali-cuda120
- azureml-mlflow>=1.41.0
- protobuf==3.20.1
- pandas==1.2.1
- packaging==22.0 # fix for https://github.com/pypa/setuptools/issues/4483
- protobuf==4.25.3
- pandas==2.2.2
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ outputs:
code: ./

environment:
image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04:latest
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
conda_file: ./conda.yaml

resources:
Expand All @@ -121,9 +121,8 @@ distribution:
process_count_per_instance: 1

command: >-
git clone https://github.com/NVIDIA/apex && cd apex && git checkout 3303b3e7174383312a3468ef390060c26e640cb1
&& python setup.py install && cd ..
&& mldesigner execute --source entry.py --name imagecnn_train --inputs train_data=${{inputs.train_data}}
pip install setuptools &&
mldesigner execute --source entry.py --name imagecnn_train --inputs train_data=${{inputs.train_data}}
val_data=${{inputs.valid_data}} data_backend=${{inputs.data_backend}} arch=${{inputs.arch}}
model_config=${{inputs.model_config}} workers=${{inputs.workers}} epochs=${{inputs.epochs}}
batch_size=${{inputs.batch_size}} optimizer_batch_size=${{inputs.optimizer_batch_size}} lr=${{inputs.lr}}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,8 @@
from . import resnet as models
from . import utils

try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
)
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.amp import autocast, GradScaler

from torch.utils.tensorboard import SummaryWriter

Expand Down Expand Up @@ -109,7 +103,7 @@ def __init__(self, arch, loss, pretrained_weights=None, cuda=True, fp16=False):
if cuda:
model = model.cuda()
if fp16:
model = network_to_half(model)
model = model.half()

# define loss function (criterion) and optimizer
criterion = loss()
Expand All @@ -126,8 +120,8 @@ def forward(self, data, target):

return loss, output

def distributed(self):
self.model = DDP(self.model)
def distributed(self, gpu=0):
self.model = DDP(self.model, device_ids=[gpu])

def load_model_state(self, state):
if not state is None:
Expand Down Expand Up @@ -172,13 +166,6 @@ def get_optimizer(
weight_decay=weight_decay,
nesterov=nesterov,
)
if fp16:
optimizer = FP16_Optimizer(
optimizer,
static_loss_scale=static_loss_scale,
dynamic_loss_scale=dynamic_loss_scale,
verbose=False,
)

if not state is None:
optimizer.load_state_dict(state)
Expand Down Expand Up @@ -255,36 +242,39 @@ def _lr_fn(iteration, epoch):


def get_train_step(
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1
model_and_loss, optimizer, fp16, use_amp=False, batch_size_multiplier=1, scaler=None
):
def _step(input, target, optimizer_step=True):
input_var = Variable(input)
target_var = Variable(target)
loss, output = model_and_loss(input_var, target_var)

if use_amp:
with autocast('cuda'):
loss, output = model_and_loss(input_var, target_var)
else:
loss, output = model_and_loss(input_var, target_var)

if torch.distributed.is_initialized():
reduced_loss = utils.reduce_tensor(loss.data)
else:
reduced_loss = loss.data

if fp16:
optimizer.backward(loss)
elif use_amp:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
if use_amp and scaler is not None:
scaler.scale(loss).backward()
else:
loss.backward()

if optimizer_step:
opt = (
optimizer.optimizer
if isinstance(optimizer, FP16_Optimizer)
else optimizer
)
for param_group in opt.param_groups:
for param_group in optimizer.param_groups:
for param in param_group["params"]:
param.grad /= batch_size_multiplier

optimizer.step()
if param.grad is not None:
param.grad /= batch_size_multiplier

if use_amp and scaler is not None:
scaler.step(optimizer)
scaler.update()
else:
optimizer.step()
optimizer.zero_grad()

torch.cuda.synchronize()
Expand All @@ -304,6 +294,7 @@ def train(
epoch,
detector,
use_amp=False,
scaler=None,
prof=-1,
batch_size_multiplier=1,
register_metrics=True,
Expand All @@ -319,6 +310,7 @@ def train(
fp16,
use_amp=use_amp,
batch_size_multiplier=batch_size_multiplier,
scaler=scaler,
)

model_and_loss.train()
Expand All @@ -345,7 +337,7 @@ def train(
if writer:
writer.add_scalar("train/summary/scalar/learning_rate", lr, epoch)
writer.add_scalar(
"train/summary/scalar/loss", to_python_float(loss), total_train_step
"train/summary/scalar/loss", float(loss), total_train_step
)
writer.add_scalar(
"perf/summary/scalar/compute_ips",
Expand All @@ -359,7 +351,7 @@ def train(
)
mlflow.log_metric("train/learning_rate", step=epoch, value=lr)
mlflow.log_metric(
"train/loss", step=total_train_step, value=to_python_float(loss)
"train/loss", step=total_train_step, value=float(loss)
)
mlflow.log_metric(
"perf/compute_ips",
Expand Down Expand Up @@ -448,7 +440,7 @@ def validate(

it_time = time.time() - end

loss_sum += to_python_float(loss)
loss_sum += float(loss)
total_val_step += 1

end = time.time()
Expand Down Expand Up @@ -483,6 +475,7 @@ def train_loop(
should_backup_checkpoint,
save_checkpoint_epochs,
use_amp=False,
scaler=None,
batch_size_multiplier=1,
best_prec1=0,
start_epoch=0,
Expand Down Expand Up @@ -535,6 +528,7 @@ def train_loop(
epoch,
detector,
use_amp=use_amp,
scaler=scaler,
prof=prof,
register_metrics=epoch == start_epoch,
batch_size_multiplier=batch_size_multiplier,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,11 @@
import torchvision.datasets as datasets

try:
from apex.parallel import DistributedDataParallel as DDP
from apex.fp16_utils import *
from apex import amp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.amp import autocast, GradScaler
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to run this example."
"PyTorch with CUDA support is required to run this example."
)

import image_classification.resnet as models
Expand Down Expand Up @@ -489,16 +488,12 @@ def _worker_init_fn(id):
elif args.lr_schedule == "linear":
lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=None)

scaler = None
if args.amp:
model_and_loss, optimizer = amp.initialize(
model_and_loss,
optimizer,
opt_level="O2",
loss_scale="dynamic" if args.dynamic_loss_scale else args.static_loss_scale,
)
scaler = GradScaler('cuda')

if args.distributed:
model_and_loss.distributed()
model_and_loss.distributed(args.gpu)

model_and_loss.load_model_state(model_state)

Expand All @@ -514,6 +509,7 @@ def _worker_init_fn(id):
should_backup_checkpoint(args),
args.save_checkpoint_epochs,
use_amp=args.amp,
scaler=scaler,
batch_size_multiplier=batch_size_multiplier,
start_epoch=start_epoch,
best_prec1=best_prec1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,4 @@ command: >-
environment:
conda_file: ./conda.yaml
image: mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.8-cudnn8-ubuntu22.04:latest
image: mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04:latest
Loading