diff --git a/image_classification/Multi_Node_Training/README.md b/image_classification/Multi_Node_Training/README.md new file mode 100644 index 00000000..61e17b1c --- /dev/null +++ b/image_classification/Multi_Node_Training/README.md @@ -0,0 +1,57 @@ +# Multiple Node Training +English | [简体中文](./README_cn.md) + +PaddleViT also supports multi-node distributed training under collective mode. + +Here we provide a simple tutorial on converting multi-GPU training scripts +into multi-node training scripts for any model in PaddleViT. + +This folder takes the ViT model as an example. + +## Tutorial +For any model in PaddleViT, one can implement multi-node training by modifying +`main_multi_gpu.py`. +1. Just add the argument `ips='[host ips]'` to `dist.spawn()`. +2. Then run the training script on every host. + +## Training example: ViT +Suppose you have 2 hosts (denoted as nodes) with 4 GPUs on each machine. +The nodes' IP addresses are `192.168.0.16` and `192.168.0.17`. + +1. Then modify some lines of `run_train_multi_node.sh`: + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 # ids of the gpus to use + + -ips='192.168.0.16, 192.168.0.17' # separated by comma + ``` +2. Run the training script on every host: + ```shell + sh run_train_multi_node.sh + ``` + +## Multi-node training with one host +It is possible to try multi-node training even when you have only one machine. + +1. Install docker and paddle. For more details, please refer to + [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/fromdocker.html). + +2. Create a network between docker containers. + ```shell + docker network create -d bridge paddle_net + ``` +3. Create multiple containers as virtual hosts/nodes. Suppose we create 2 containers +with 2 GPUs on each node.
+ ```shell + docker run --name paddle0 -it -d --gpus "device=0,1" --network paddle_net\ + paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash + docker run --name paddle1 -it -d --gpus "device=2,3" --network paddle_net\ + paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash + ``` + > Note: + > 1. One can assign the same GPU device to different containers, but this may cause OOM errors since multiple models will run on the same GPU. + > 2. One should use `-v` to mount the PaddleViT repository into the container. + +4. Modify `run_train_multi_node.sh` as described above and run the training script in every container. + + > Note: One can use the `ping` or `ip a` shell command to check the containers' IP addresses. + diff --git a/image_classification/Multi_Node_Training/README_cn.md b/image_classification/Multi_Node_Training/README_cn.md new file mode 100644 index 00000000..c879d46f --- /dev/null +++ b/image_classification/Multi_Node_Training/README_cn.md @@ -0,0 +1,45 @@ +# 多机多卡分布式训练 + +简体中文 | [English](./README.md) + +PaddleViT 同样支持Collective多机多卡分布式训练。 + +## 教程 +对于每一个模型,用户可以直接通过修改对应模型文件夹下的`main_multi_gpu.py` +以进行多机训练。 +1. 在`dist.spawn()`里加入`ips='[host ips]'`。 +2. 在每个主机上运行代码。 + +## 样例:ViT +这个文件夹提供了分布式训练ViT模型的代码和shell脚本。 +假设你有2台主机,每个主机上有4张显卡。主机的ip地址为`192.168.0.16`和`192.168.0.17`。 + +1. 修改shell脚本`run_train_multi_node.sh`的参数 + ```shell + CUDA_VISIBLE_DEVICES=0,1,2,3 # ids of the gpus to use + + -ips='192.168.0.16, 192.168.0.17' # separated by comma + ``` +2. 在每个主机上运行脚本代码。 + ```shell + sh run_train_multi_node.sh + ``` +## 单机上运行分布式训练 +如果仅有一台主机,同样可以通过docker实现单机上的分布式训练。 +1. 安装docker和paddle。[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/fromdocker.html) +可以下载paddlepaddle提供的docker镜像。 +2. 创建docker容器间网络。 + ```shell + docker network create -d bridge paddle_net + ``` +3.
创建多个docker容器作为虚拟主机,假设我们创建2个容器,并分别分配2个GPU。 + ```shell + docker run --name paddle0 -it -d --gpus "device=0,1" --network paddle_net\ + paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash + docker run --name paddle1 -it -d --gpus "device=2,3" --network paddle_net\ + paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash + ``` + > 注意: + > 1. 可以将同一个GPU同时分配给多个容器,但是这可能会产生OOM错误,因为多个模型将同时运行在这个GPU上。 + > 2. 使用`-v`挂载PaddleViT所在的目录。 + diff --git a/image_classification/Multi_Node_Training/config.py b/image_classification/Multi_Node_Training/config.py new file mode 100644 index 00000000..a7273699 --- /dev/null +++ b/image_classification/Multi_Node_Training/config.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Configuration + +Configuration for data, model architecture, and training, etc.
+Config can be set by .yaml file or by argparser(limited usage) + + +""" +import os +from yacs.config import CfgNode as CN +import yaml + +_C = CN() +_C.BASE = [''] + +# data settings +_C.DATA = CN() +_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU +_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU +_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset +_C.DATA.DATASET = 'imagenet2012' # dataset name +_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune +_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode +_C.DATA.NUM_WORKERS = 2 # number of data loading threads + +# model settings +_C.MODEL = CN() +_C.MODEL.TYPE = 'ViT' +_C.MODEL.NAME = 'ViT' +_C.MODEL.RESUME = None +_C.MODEL.PRETRAINED = None +_C.MODEL.NUM_CLASSES = 1000 +_C.MODEL.DROPOUT = 0.1 +_C.MODEL.DROPPATH = 0.1 +_C.MODEL.ATTENTION_DROPOUT = 0.1 + +# transformer settings +_C.MODEL.TRANS = CN() +_C.MODEL.TRANS.PATCH_SIZE = 32 +_C.MODEL.TRANS.EMBED_DIM = 768 +_C.MODEL.TRANS.MLP_RATIO= 4.0 +_C.MODEL.TRANS.NUM_HEADS = 12 +_C.MODEL.TRANS.DEPTH = 12 +_C.MODEL.TRANS.QKV_BIAS = True + +# training settings +_C.TRAIN = CN() +_C.TRAIN.LAST_EPOCH = 0 +_C.TRAIN.NUM_EPOCHS = 300 +_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size +_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune +_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune +_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0 +_C.TRAIN.END_LR = 5e-4 +_C.TRAIN.GRAD_CLIP = 1.0 +_C.TRAIN.ACCUM_ITER = 2 #1 + +_C.TRAIN.LR_SCHEDULER = CN() +_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine' +_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler +_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler + +_C.TRAIN.OPTIMIZER = CN() +_C.TRAIN.OPTIMIZER.NAME = 'AdamW' +_C.TRAIN.OPTIMIZER.EPS = 1e-8 +_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999) # for 
adamW +_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9 + +# misc +_C.SAVE = "./output" +_C.TAG = "default" +_C.SAVE_FREQ = 10 # freq to save chpt +_C.REPORT_FREQ = 100 # freq to logging info +_C.VALIDATE_FREQ = 100 # freq to do validation +_C.SEED = 0 +_C.EVAL = False # run evaluation only +_C.AMP = False # mix precision training +_C.LOCAL_RANK = 0 +_C.NGPUS = -1 + + +def _update_config_from_file(config, cfg_file): + config.defrost() + with open(cfg_file, 'r') as infile: + yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader) + for cfg in yaml_cfg.setdefault('BASE', ['']): + if cfg: + _update_config_from_file( + config, os.path.join(os.path.dirname(cfg_file), cfg) + ) + print('merging config from {}'.format(cfg_file)) + config.merge_from_file(cfg_file) + config.freeze() + +def update_config(config, args): + """Update config by ArgumentParser + Args: + args: ArgumentParser contains options + Return: + config: updated config + """ + if args.cfg: + _update_config_from_file(config, args.cfg) + config.defrost() + if args.dataset: + config.DATA.DATASET = args.dataset + if args.batch_size: + config.DATA.BATCH_SIZE = args.batch_size + if args.image_size: + config.DATA.IMAGE_SIZE = args.image_size + if args.data_path: + config.DATA.DATA_PATH = args.data_path + if args.ngpus: + config.NGPUS = args.ngpus + if args.eval: + config.EVAL = True + config.DATA.BATCH_SIZE_EVAL = args.batch_size + if args.pretrained: + config.MODEL.PRETRAINED = args.pretrained + if args.resume: + config.MODEL.RESUME = args.resume + if args.last_epoch: + config.TRAIN.LAST_EPOCH = args.last_epoch + if args.amp: # only during training + if config.EVAL is True: + config.AMP = False + else: + config.AMP = True + + #config.freeze() + return config + + +def get_config(cfg_file=None): + """Return a clone of config or load from yaml file""" + config = _C.clone() + if cfg_file: + _update_config_from_file(config, cfg_file) + return config diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml 
b/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml new file mode 100644 index 00000000..eff0fc29 --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml @@ -0,0 +1,21 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ViT + NAME: vit_base_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml new file mode 100644 index 00000000..04cdfaee --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_base_patch16_384 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true + diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml new file mode 100644 index 00000000..8b0516d2 --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml @@ -0,0 +1,21 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ViT + NAME: vit_base_patch32_224 + TRANS: + PATCH_SIZE: 32 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml new file mode 100644 index 00000000..5aa3e6f1 --- /dev/null +++ 
b/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml @@ -0,0 +1,21 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_base_patch32_384 + TRANS: + PATCH_SIZE: 32 + EMBED_DIM: 768 + MLP_RATIO: 4.0 + DEPTH: 12 + NUM_HEADS: 12 + QKV_BIAS: true +TRAIN: + NUM_EPOCHS: 300 + WARMUP_EPOCHS: 3 + WEIGHT_DECAY: 0.3 + BASE_LR: 0.003 + WARMUP_START_LR: 1e-6 + END_LR: 5e-4 + ACCUM_ITER: 2 diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml new file mode 100644 index 00000000..23ac9b37 --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 224 + CROP_PCT: 0.875 +MODEL: + TYPE: ViT + NAME: vit_large_patch16_224 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 1024 + MLP_RATIO: 4.0 + DEPTH: 24 + NUM_HEADS: 16 + QKV_BIAS: true + diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml new file mode 100644 index 00000000..c8c01a6a --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_large_patch16_384 + TRANS: + PATCH_SIZE: 16 + EMBED_DIM: 1024 + MLP_RATIO: 4.0 + DEPTH: 24 + NUM_HEADS: 16 + QKV_BIAS: true + diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml new file mode 100644 index 00000000..6b7f15aa --- /dev/null +++ b/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml @@ -0,0 +1,14 @@ +DATA: + IMAGE_SIZE: 384 + CROP_PCT: 1.0 +MODEL: + TYPE: ViT + NAME: vit_large_patch32_384 + TRANS: + PATCH_SIZE: 32 + EMBED_DIM: 1024 + MLP_RATIO: 4.0 + DEPTH: 24 + NUM_HEADS: 16 + QKV_BIAS: 
true + diff --git a/image_classification/Multi_Node_Training/datasets.py b/image_classification/Multi_Node_Training/datasets.py new file mode 100644 index 00000000..e207f9ba --- /dev/null +++ b/image_classification/Multi_Node_Training/datasets.py @@ -0,0 +1,187 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Dataset related classes and methods for ViT training and validation +Cifar10, Cifar100 and ImageNet2012 are supported +""" + +import os +import math +from paddle.io import Dataset, DataLoader, DistributedBatchSampler +from paddle.vision import transforms, datasets, image_load + +class ImageNet2012Dataset(Dataset): + """Build ImageNet2012 dataset + + This class gets train/val imagenet datasets, which loads transfomed data and labels. 
+ + Attributes: + file_folder: path where imagenet images are stored + transform: preprocessing ops to apply on image + img_path_list: list of full path of images in whole dataset + label_list: list of labels of whole dataset + """ + + def __init__(self, file_folder, mode="train", transform=None): + """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform""" + super(ImageNet2012Dataset, self).__init__() + assert mode in ["train", "val"] + self.file_folder = file_folder + self.transform = transform + self.img_path_list = [] + self.label_list = [] + + if mode == "train": + self.list_file = os.path.join(self.file_folder, "train_list.txt") + else: + self.list_file = os.path.join(self.file_folder, "val_list.txt") + + with open(self.list_file, 'r') as infile: + for line in infile: + img_path = line.strip().split()[0] + img_label = int(line.strip().split()[1]) + self.img_path_list.append(os.path.join(self.file_folder, img_path)) + self.label_list.append(img_label) + print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}') + + def __len__(self): + return len(self.label_list) + + def __getitem__(self, index): + data = image_load(self.img_path_list[index]).convert('RGB') + data = self.transform(data) + label = self.label_list[index] + + return data, label + + +def get_train_transforms(config): + """ Get training transforms + + For training, a RandomResizedCrop is applied, then normalization is applied with + [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.] 
+ Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + transforms_train = transforms.Compose([ + transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE), + scale=(0.05, 1.0)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_train + + +def get_val_transforms(config): + """ Get training transforms + + For validation, image is first Resize then CenterCrop to image_size. + Then normalization is applied with [0.5, 0.5, 0.5] mean and std. + The input pixel values must be rescaled to [0, 1.] + Outputs is converted to tensor + + Args: + config: configs contains IMAGE_SIZE, see config.py for details + Returns: + transforms_train: training transforms + """ + + scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT)) + transforms_val = transforms.Compose([ + transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image + transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)), + transforms.ToTensor(), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + return transforms_val + + +def get_dataset(config, mode='train'): + """ Get dataset from config and mode (train/val) + + Returns the related dataset object according to configs and mode(train/val) + + Args: + config: configs contains dataset related settings. 
see config.py for details + Returns: + dataset: dataset object + """ + assert mode in ['train', 'val'] + if config.DATA.DATASET == "cifar10": + if mode == 'train': + dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "cifar100": + if mode == 'train': + dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config)) + else: + mode = 'test' + dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config)) + elif config.DATA.DATASET == "imagenet2012": + if mode == 'train': + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_train_transforms(config)) + else: + dataset = ImageNet2012Dataset(config.DATA.DATA_PATH, + mode=mode, + transform=get_val_transforms(config)) + else: + raise NotImplementedError( + "[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now") + return dataset + + +def get_dataloader(config, dataset, mode='train', multi_process=False): + """Get dataloader with config, dataset, mode as input, allows multiGPU settings. + + Multi-GPU loader is implements as distributedBatchSampler. + + Args: + config: see config.py for details + dataset: paddle.io.dataset object + mode: train/val + multi_process: if True, use DistributedBatchSampler to support multi-processing + Returns: + dataloader: paddle.io.DataLoader object. 
+ """ + + if mode == 'train': + batch_size = config.DATA.BATCH_SIZE + else: + batch_size = config.DATA.BATCH_SIZE_EVAL + + if multi_process is True: + sampler = DistributedBatchSampler(dataset, + batch_size=batch_size, + shuffle=(mode == 'train')) + dataloader = DataLoader(dataset, + batch_sampler=sampler, + num_workers=config.DATA.NUM_WORKERS) + else: + dataloader = DataLoader(dataset, + batch_size=batch_size, + num_workers=config.DATA.NUM_WORKERS, + shuffle=(mode == 'train')) + return dataloader diff --git a/image_classification/Multi_Node_Training/droppath.py b/image_classification/Multi_Node_Training/droppath.py new file mode 100644 index 00000000..25b8d5ff --- /dev/null +++ b/image_classification/Multi_Node_Training/droppath.py @@ -0,0 +1,60 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth +""" + +import paddle +import paddle.nn as nn + + +class DropPath(nn.Layer): + """DropPath class""" + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def drop_path(self, inputs): + """drop path op + Args: + input: tensor with arbitrary shape + drop_prob: float number of drop path probability, default: 0.0 + training: bool, if current mode is training, default: False + Returns: + output: output tensor after drop path + """ + # if prob is 0 or eval mode, return original input + if self.drop_prob == 0. or not self.training: + return inputs + keep_prob = 1 - self.drop_prob + keep_prob = paddle.to_tensor(keep_prob, dtype='float32') + shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1) # shape=(N, 1, 1, 1) + random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype) + random_tensor = random_tensor.floor() # mask + output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation + return output + + def forward(self, inputs): + return self.drop_path(inputs) + + +#def main(): +# tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32') +# dp = DropPath(0.5) +# out = dp(tmp) +# print(out) +# +#if __name__ == "__main__": +# main() diff --git a/image_classification/Multi_Node_Training/main_multi_node.py b/image_classification/Multi_Node_Training/main_multi_node.py new file mode 100644 index 00000000..bb0462b2 --- /dev/null +++ b/image_classification/Multi_Node_Training/main_multi_node.py @@ -0,0 +1,379 @@ +# Copyright (c) 2021 PPViT Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ViT training/validation using multiple GPU """ + +import sys +import os +import time +import logging +import argparse +import random +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.distributed as dist +from datasets import get_dataloader +from datasets import get_dataset +from transformer import build_vit as build_model +from utils import AverageMeter +from utils import WarmupCosineScheduler +from config import get_config +from config import update_config + + +parser = argparse.ArgumentParser('ViT') +parser.add_argument('-cfg', type=str, default=None) +parser.add_argument('-dataset', type=str, default=None) +parser.add_argument('-batch_size', type=int, default=None) +parser.add_argument('-image_size', type=int, default=None) +parser.add_argument('-data_path', type=str, default=None) +parser.add_argument('-ngpus', type=int, default=None) +parser.add_argument('-pretrained', type=str, default=None) +parser.add_argument('-resume', type=str, default=None) +parser.add_argument('-last_epoch', type=int, default=None) +parser.add_argument('-eval', action='store_true') +parser.add_argument('-amp', action='store_true') +parser.add_argument('-ips', type=str, default="127.0.0.1") +arguments = parser.parse_args() + + +log_format = "%(asctime)s %(message)s" +logging.basicConfig(stream=sys.stdout, level=logging.INFO, + format=log_format, datefmt="%m%d %I:%M:%S %p") + +# get default config +config = get_config() +# update config by arguments +config = update_config(config, arguments) + +# set output folder 
+if not config.EVAL: + config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) +else: + config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S')) + +if not os.path.exists(config.SAVE): + os.makedirs(config.SAVE, exist_ok=True) + +# set logging format +logger = logging.getLogger() +fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt')) +fh.setFormatter(logging.Formatter(log_format)) +logger.addHandler(fh) +logger.info(f'config= {config}') + + +def train(dataloader, + model, + criterion, + optimizer, + epoch, + total_batch, + debug_steps=100, + accum_iter=1, + amp=False): + """Training for one epoch + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + epoch: int, current epoch + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info, default: 100 + accum_iter: int, num of iters for accumulating gradients, default: 1 + amp: bool, if True, use mix precision training, default: False + Returns: + train_loss_meter.avg + train_acc_meter.avg + train_time + """ + model.train() + train_loss_meter = AverageMeter() + train_acc_meter = AverageMeter() + if amp is True: + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + time_st = time.time() + + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + if amp is True: + with paddle.amp.auto_cast(): + output = model(image) + loss = criterion(output, label) + scaled = scaler.scale(loss) + scaled.backward() + + if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + else: + output = model(image) + loss = criterion(output, label) + #NOTE: division may be needed depending on the loss function + # Here no division is needed: + # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean' + # + #loss = loss / accum_iter + loss.backward() + + if 
((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)): + optimizer.step() + optimizer.clear_grad() + + pred = F.softmax(output) + acc = paddle.metric.accuracy(pred, label.unsqueeze(1)) + + batch_size = image.shape[0] + train_loss_meter.update(loss.numpy()[0], batch_size) + train_acc_meter.update(acc.numpy()[0], batch_size) + + if batch_id % debug_steps == 0: + logger.info( + f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {train_loss_meter.avg:.4f}, " + + f"Avg Acc: {train_acc_meter.avg:.4f}") + + train_time = time.time() - time_st + return train_loss_meter.avg, train_acc_meter.avg, train_time + + +def validate(dataloader, model, criterion, total_batch, debug_steps=100): + """Validation for whole dataset + Args: + dataloader: paddle.io.DataLoader, dataloader instance + model: nn.Layer, a ViT model + criterion: nn.criterion + total_epoch: int, total num of epoch, for logging + debug_steps: int, num of iters to log info + Returns: + val_loss_meter.avg + val_acc1_meter.avg + val_acc5_meter.avg + val_time + """ + model.eval() + val_loss_meter = AverageMeter() + val_acc1_meter = AverageMeter() + val_acc5_meter = AverageMeter() + time_st = time.time() + + with paddle.no_grad(): + for batch_id, data in enumerate(dataloader): + image = data[0] + label = data[1] + + output = model(image) + loss = criterion(output, label) + + pred = F.softmax(output) + acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1)) + acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5) + + dist.all_reduce(loss) + dist.all_reduce(acc1) + dist.all_reduce(acc5) + loss = loss / dist.get_world_size() + acc1 = acc1 / dist.get_world_size() + acc5 = acc5 / dist.get_world_size() + + batch_size = paddle.to_tensor(image.shape[0]) + dist.all_reduce(batch_size) + + val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0]) + val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0]) + 
val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0]) + + if batch_id % debug_steps == 0: + logger.info( + f"Val Step[{batch_id:04d}/{total_batch:04d}], " + + f"Avg Loss: {val_loss_meter.avg:.4f}, " + + f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " + + f"Avg Acc@5: {val_acc5_meter.avg:.4f}") + + val_time = time.time() - time_st + return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time + + +def main_worker(*args): + # 0. Preparation + dist.init_parallel_env() + last_epoch = config.TRAIN.LAST_EPOCH + world_size = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}') + seed = config.SEED + local_rank + paddle.seed(seed) + np.random.seed(seed) + random.seed(seed) + # 1. Create model + model = build_model(config) + model = paddle.DataParallel(model) + # 2. Create train and val dataloader + dataset_train, dataset_val = args[0], args[1] + dataloader_train = get_dataloader(config, dataset_train, 'train', True) + dataloader_val = get_dataloader(config, dataset_val, 'test', True) + total_batch_train = len(dataloader_train) + total_batch_val = len(dataloader_val) + logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}') + logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}') + # 3. Define criterion + criterion = nn.CrossEntropyLoss() + # 4. 
Define optimizer and lr_scheduler + scheduler = None + if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine": + scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR, + warmup_start_lr=config.TRAIN.WARMUP_START_LR, + start_lr=config.TRAIN.BASE_LR, + end_lr=config.TRAIN.END_LR, + warmup_epochs=config.TRAIN.WARMUP_EPOCHS, + total_epochs=config.TRAIN.NUM_EPOCHS, + last_epoch=config.TRAIN.LAST_EPOCH, + ) + elif config.TRAIN.LR_SCHEDULER.NAME == "cosine": + scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR, + T_max=config.TRAIN.NUM_EPOCHS, + last_epoch=last_epoch) + elif config.scheduler == "multi-step": + milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")] + scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR, + milestones=milestones, + gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE, + last_epoch=last_epoch) + else: + logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.") + + if config.TRAIN.OPTIMIZER.NAME == "SGD": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.Momentum( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + weight_decay=config.TRAIN.WEIGHT_DECAY, + momentum=config.TRAIN.OPTIMIZER.MOMENTUM, + grad_clip=clip) + elif config.TRAIN.OPTIMIZER.NAME == "AdamW": + if config.TRAIN.GRAD_CLIP: + clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP) + else: + clip = None + optimizer = paddle.optimizer.AdamW( + parameters=model.parameters(), + learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR, + beta1=config.TRAIN.OPTIMIZER.BETAS[0], + beta2=config.TRAIN.OPTIMIZER.BETAS[1], + weight_decay=config.TRAIN.WEIGHT_DECAY, + epsilon=config.TRAIN.OPTIMIZER.EPS, + grad_clip=clip, 
+ #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']), + ) + else: + logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.") + + # 5. Load pretrained model / load resumt model and optimizer states + if config.MODEL.PRETRAINED: + if (config.MODEL.PRETRAINED).endswith('.pdparams'): + raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams') + assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True + model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams') + model.set_dict(model_state) + logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}") + + if config.MODEL.RESUME: + assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True + assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True + model_state = paddle.load(config.MODEL.RESUME+'.pdparams') + model.set_dict(model_state) + opt_state = paddle.load(config.MODEL.RESUME+'.pdopt') + optimizer.set_state_dict(opt_state) + logger.info( + f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}") + + # 6. Validation + if config.EVAL: + logger.info('----- Start Validating') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + return + + # 6. Start training and validation + logging.info(f"Start training from epoch {last_epoch+1}.") + for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1): + # train + logging.info(f"Now training epoch {epoch}. 
LR={optimizer.get_lr():.6f}") + train_loss, train_acc, train_time = train(dataloader=dataloader_train, + model=model, + criterion=criterion, + optimizer=optimizer, + epoch=epoch, + total_batch=total_batch_train, + debug_steps=config.REPORT_FREQ, + accum_iter=config.TRAIN.ACCUM_ITER, + amp=config.AMP) + scheduler.step() + + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Train Loss: {train_loss:.4f}, " + + f"Train Acc: {train_acc:.4f}, " + + f"time: {train_time:.2f}") + # validation + if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + logger.info(f'----- Validation after Epoch: {epoch}') + val_loss, val_acc1, val_acc5, val_time = validate( + dataloader=dataloader_val, + model=model, + criterion=criterion, + total_batch=total_batch_val, + debug_steps=config.REPORT_FREQ) + logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " + + f"Validation Loss: {val_loss:.4f}, " + + f"Validation Acc@1: {val_acc1:.4f}, " + + f"Validation Acc@5: {val_acc5:.4f}, " + + f"time: {val_time:.2f}") + # model save + if local_rank == 0: + if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS: + model_path = os.path.join( + config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}") + paddle.save(model.state_dict(), model_path + '.pdparams') + paddle.save(optimizer.state_dict(), model_path + '.pdopt') + logger.info(f"----- Save model: {model_path}.pdparams") + logger.info(f"----- Save optim: {model_path}.pdopt") + + +def main(): + dataset_train = get_dataset(config, mode='train') + dataset_val = get_dataset(config, mode='val') + config.NGPUS = len(paddle.static.cuda_places()) if config.NGPUS == -1 else config.NGPUS + dist.spawn(main_worker, args=(dataset_train, dataset_val, ), nprocs=config.NGPUS, ips=arguments.ips) + + +if __name__ == "__main__": + main() diff --git a/image_classification/Multi_Node_Training/run_train_multi_node.sh 
# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Implement Transformer Class for ViT
"""

import copy
import paddle
import paddle.nn as nn
from droppath import DropPath
from config import get_config


class Identity(nn.Layer):
    """ Identity layer

    The output of this layer is the input without any change.
    Use this layer to avoid using 'if' condition in forward methods
    """
    def __init__(self):
        super(Identity, self).__init__()

    def forward(self, x):
        """Return x untouched."""
        return x


class PatchEmbedding(nn.Layer):
    """Patch Embedding and Position Embedding

    Apply patch embedding and position embedding on input images.

    Attributes:
        patch_embedding: impl using a patch_size x patch_size Conv2D operation
            whose stride equals patch_size
        position_embeddings: a parameter of shape
            [1, num_patches + 1 (for cls_token), embed_dim]
        cls_token: token inserted in front of the patch features for
            classification, shape [1, 1, embed_dim]
        dropout: dropout for embeddings
    """

    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 in_channels=3,
                 embed_dim=768,
                 dropout=0.):
        super().__init__()
        # square images and square patches are assumed here
        n_patches = (image_size // patch_size) * (image_size // patch_size)

        self.patch_embedding = nn.Conv2D(in_channels=in_channels,
                                         out_channels=embed_dim,
                                         kernel_size=patch_size,
                                         stride=patch_size)

        self.position_embeddings = paddle.create_parameter(
            shape=[1, n_patches+1, embed_dim],
            dtype='float32',
            default_initializer=paddle.nn.initializer.TruncatedNormal(std=.02))

        self.cls_token = paddle.create_parameter(
            shape=[1, 1, embed_dim],
            dtype='float32',
            default_initializer=paddle.nn.initializer.Constant(0))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Embed an image batch into a token sequence.

        The first axis of x is treated as the batch axis
        (presumably x is laid out as [N, C, H, W] -- TODO confirm with caller).
        """
        # one cls token per sample in the batch
        cls_tokens = self.cls_token.expand((x.shape[0], -1, -1))
        x = self.patch_embedding(x)
        x = x.flatten(2)                # merge the two trailing spatial axes
        x = x.transpose([0, 2, 1])      # put the patch axis before the channel axis
        x = paddle.concat((cls_tokens, x), axis=1)

        embeddings = x + self.position_embeddings  # tensor broadcast
        embeddings = self.dropout(embeddings)
        return embeddings


class Attention(nn.Layer):
    """ Attention module

    Attention module for ViT, here q, k, v are assumed the same.
    The qkv mappings are stored as one single param.

    Attributes:
        num_heads: number of heads
        attn_head_size: feature dim of single head
        all_head_size: feature dim of all heads
        qkv: a nn.Linear for q, k, v mapping
        scales: 1 / sqrt(single_head_feature_dim)
        out: projection of multi-head attention
        attn_dropout: dropout for attention
        proj_dropout: final dropout before output
        softmax: softmax op for attention
    """
    def __init__(self,
                 embed_dim,
                 num_heads,
                 qkv_bias=True,
                 dropout=0.,
                 attention_dropout=0.):
        super().__init__()
        self.num_heads = num_heads
        self.attn_head_size = int(embed_dim / self.num_heads)
        self.all_head_size = self.attn_head_size * self.num_heads

        w_attr_1, b_attr_1 = self._init_weights()
        self.qkv = nn.Linear(embed_dim,
                             self.all_head_size*3,  # weights for q, k, and v
                             weight_attr=w_attr_1,
                             bias_attr=b_attr_1 if qkv_bias else False)

        self.scales = self.attn_head_size ** -0.5

        w_attr_2, b_attr_2 = self._init_weights()
        # The projection consumes the concatenated heads, whose width is
        # all_head_size; that equals embed_dim only when embed_dim is
        # divisible by num_heads, so size the input by all_head_size.
        self.out = nn.Linear(self.all_head_size,
                             embed_dim,
                             weight_attr=w_attr_2,
                             bias_attr=b_attr_2)

        self.attn_dropout = nn.Dropout(attention_dropout)
        self.proj_dropout = nn.Dropout(dropout)
        self.softmax = nn.Softmax(axis=-1)

    def _init_weights(self):
        """Return (weight_attr, bias_attr), both using KaimingUniform init."""
        weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform())
        bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform())
        return weight_attr, bias_attr

    def transpose_multihead(self, x):
        """Split the last axis into (num_heads, attn_head_size) and swap axes 1 and 2."""
        new_shape = x.shape[:-1] + [self.num_heads, self.attn_head_size]
        x = x.reshape(new_shape)
        x = x.transpose([0, 2, 1, 3])
        return x

    def forward(self, x):
        """Apply multi-head self-attention.

        Returns:
            z: projected attention output (after proj_dropout)
            attn_weights: softmax attention weights taken before attn_dropout
        """
        qkv = self.qkv(x).chunk(3, axis=-1)
        q, k, v = map(self.transpose_multihead, qkv)

        attn = paddle.matmul(q, k, transpose_y=True)
        attn = attn * self.scales
        attn = self.softmax(attn)
        attn_weights = attn
        attn = self.attn_dropout(attn)

        z = paddle.matmul(attn, v)
        z = z.transpose([0, 2, 1, 3])
        # merge the two trailing axes (heads, head_size) back into one
        new_shape = z.shape[:-2] + [self.all_head_size]
        z = z.reshape(new_shape)
        z = self.out(z)
        z = self.proj_dropout(z)
        return z, attn_weights


class Mlp(nn.Layer):
    """ MLP module

    Impl using nn.Linear and activation is GELU, dropout is applied.
    Ops: fc -> act -> dropout -> fc -> dropout

    Attributes:
        fc1: nn.Linear
        fc2: nn.Linear
        act: GELU
        dropout1: dropout after fc1
        dropout2: dropout after fc2
    """
    def __init__(self,
                 embed_dim,
                 mlp_ratio,
                 dropout=0.):
        super().__init__()
        w_attr_1, b_attr_1 = self._init_weights()
        self.fc1 = nn.Linear(embed_dim,
                             int(embed_dim * mlp_ratio),
                             weight_attr=w_attr_1,
                             bias_attr=b_attr_1)

        w_attr_2, b_attr_2 = self._init_weights()
        self.fc2 = nn.Linear(int(embed_dim * mlp_ratio),
                             embed_dim,
                             weight_attr=w_attr_2,
                             bias_attr=b_attr_2)
        self.act = nn.GELU()
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def _init_weights(self):
        """Return (weight_attr, bias_attr): XavierUniform weights, Normal(std=1e-6) bias."""
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())  # default in pp: xavier
        bias_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.Normal(std=1e-6))  # default in pp: zero
        return weight_attr, bias_attr

    def forward(self, x):
        """Run fc1 -> GELU -> dropout -> fc2 -> dropout."""
        x = self.fc1(x)
        x = self.act(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.dropout2(x)
        return x


class EncoderLayer(nn.Layer):
    """Encoder Layer

    Encoder layer contains attention, norm, mlp and residual

    Attributes:
        attn_norm: nn.LayerNorm before attention
        attn: attention module
        drop_path: DropPath when droppath > 0, otherwise Identity
        mlp_norm: nn.LayerNorm before mlp
        mlp: mlp module
    """
    def __init__(self,
                 embed_dim,
                 num_heads,
                 qkv_bias=True,
                 mlp_ratio=4.,
                 dropout=0.,
                 attention_dropout=0.,
                 droppath=0.):
        super().__init__()
        w_attr_1, b_attr_1 = self._init_weights()
        self.attn_norm = nn.LayerNorm(embed_dim,
                                      weight_attr=w_attr_1,
                                      bias_attr=b_attr_1,
                                      epsilon=1e-6)

        self.attn = Attention(embed_dim,
                              num_heads,
                              qkv_bias,
                              dropout,
                              attention_dropout)
        self.drop_path = DropPath(droppath) if droppath > 0. else Identity()

        w_attr_2, b_attr_2 = self._init_weights()
        self.mlp_norm = nn.LayerNorm(embed_dim,
                                     weight_attr=w_attr_2,
                                     bias_attr=b_attr_2,
                                     epsilon=1e-6)

        self.mlp = Mlp(embed_dim, mlp_ratio, dropout)

    def _init_weights(self):
        """Return (weight_attr, bias_attr) for LayerNorm: scale 1, shift 0.

        A scale of 0 with a shift of 1 would make every freshly initialised
        LayerNorm emit a constant, discarding its input.
        """
        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0))
        bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
        return weight_attr, bias_attr

    def forward(self, x):
        """Run norm -> attention -> residual, then norm -> mlp -> residual.

        Returns:
            x: layer output
            attn: attention weights returned by the attention module
        """
        h = x
        x = self.attn_norm(x)
        x, attn = self.attn(x)
        x = self.drop_path(x)
        x = x + h

        h = x
        x = self.mlp_norm(x)
        x = self.mlp(x)
        x = self.drop_path(x)
        x = x + h

        return x, attn


class Encoder(nn.Layer):
    """Transformer encoder

    Encoder contains a list of EncoderLayer, and a LayerNorm.

    Attributes:
        layers: nn.LayerList contains multiple EncoderLayers
        encoder_norm: nn.LayerNorm which is applied after last encoder layer
    """
    def __init__(self,
                 embed_dim,
                 num_heads,
                 depth,
                 qkv_bias=True,
                 mlp_ratio=4.0,
                 dropout=0.,
                 attention_dropout=0.,
                 droppath=0.):
        super(Encoder, self).__init__()
        # stochastic depth decay: droppath rate grows linearly from 0 to `droppath`
        depth_decay = [x.item() for x in paddle.linspace(0, droppath, depth)]
        layer_list = []
        for i in range(depth):
            # forward the constructor arguments instead of hard-coded defaults,
            # so the configured qkv_bias / mlp_ratio / dropout rates take effect
            encoder_layer = EncoderLayer(embed_dim,
                                         num_heads,
                                         qkv_bias=qkv_bias,
                                         mlp_ratio=mlp_ratio,
                                         dropout=dropout,
                                         attention_dropout=attention_dropout,
                                         droppath=depth_decay[i])
            layer_list.append(copy.deepcopy(encoder_layer))
        self.layers = nn.LayerList(layer_list)

        w_attr_1, b_attr_1 = self._init_weights()
        self.encoder_norm = nn.LayerNorm(embed_dim,
                                         weight_attr=w_attr_1,
                                         bias_attr=b_attr_1,
                                         epsilon=1e-6)

    def _init_weights(self):
        """Return (weight_attr, bias_attr) for LayerNorm: scale 1, shift 0."""
        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0))
        bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
        return weight_attr, bias_attr

    def forward(self, x):
        """Run all encoder layers followed by the final LayerNorm.

        Returns:
            out: normalized output of the last layer
            self_attn: list with the attention weights of every layer
        """
        self_attn = []
        for layer in self.layers:
            x, attn = layer(x)
            self_attn.append(attn)
        out = self.encoder_norm(x)
        return out, self_attn


class VisualTransformer(nn.Layer):
    """ViT transformer

    ViT Transformer, classifier is a single Linear layer for finetune,
    For training from scratch, two layer mlp should be used.
    Classification is done using cls_token.

    Args:
        image_size: int, input image size, default: 224
        patch_size: int, patch size, default: 16
        in_channels: int, input image channels, default: 3
        num_classes: int, number of classes for classification, default: 1000
        embed_dim: int, embedding dimension (patch embed out dim), default: 768
        depth: int, number of transformer blocks, default: 12
        num_heads: int, number of attention heads, default: 12
        mlp_ratio: float, ratio of mlp hidden dim to embed dim(mlp in dim), default: 4
        qkv_bias: bool, If True, enable qkv(nn.Linear) layer with bias, default: True
        dropout: float, dropout rate for linear layers, default: 0.
        attention_dropout: float, dropout rate for attention layers default: 0.
        droppath: float, droppath rate for droppath layers, default: 0.
        train_from_scratch: bool, if True use the two-layer mlp head, default: False
        config: unused, kept so existing callers passing it keep working
    """
    def __init__(self,
                 image_size=224,
                 patch_size=16,
                 in_channels=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4,
                 qkv_bias=True,
                 dropout=0.,
                 attention_dropout=0.,
                 droppath=0.,
                 train_from_scratch=False,
                 config=None):
        super(VisualTransformer, self).__init__()
        # create patch embedding with positional embedding
        self.patch_embedding = PatchEmbedding(image_size,
                                              patch_size,
                                              in_channels,
                                              embed_dim,
                                              dropout)
        # create multi head self-attention layers
        self.encoder = Encoder(embed_dim,
                               num_heads,
                               depth,
                               qkv_bias,
                               mlp_ratio,
                               dropout,
                               attention_dropout,
                               droppath)

        # classifier head (for training from scratch)
        if train_from_scratch:
            # Sized from the constructor arguments: the head consumes the
            # encoder output (width embed_dim), and this also works when
            # config is left at its default of None.
            w_attr_1, b_attr_1 = self._init_weights()
            w_attr_2, b_attr_2 = self._init_weights()
            self.classifier = nn.Sequential(
                nn.Linear(embed_dim,
                          embed_dim,
                          weight_attr=w_attr_1,
                          bias_attr=b_attr_1),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(embed_dim,
                          num_classes,
                          weight_attr=w_attr_2,
                          bias_attr=b_attr_2),
                nn.Dropout(dropout),
            )
        else:
            # classifier head (for finetuning)
            w_attr_1, b_attr_1 = self._init_weights()
            self.classifier = nn.Linear(embed_dim,
                                        num_classes,
                                        weight_attr=w_attr_1,
                                        bias_attr=b_attr_1)

    def _init_weights(self):
        """Return (weight_attr, bias_attr), both using KaimingUniform init."""
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        bias_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.KaimingUniform())
        return weight_attr, bias_attr

    def forward(self, x):
        """Return classification logits; per-layer attention weights are discarded."""
        x = self.patch_embedding(x)
        x, attn = self.encoder(x)
        logits = self.classifier(x[:, 0])  # take only cls_token as classifier
        return logits


def build_vit(config):
    """Build a VisualTransformer (finetune head) from the fields of config."""
    model = VisualTransformer(image_size=config.DATA.IMAGE_SIZE,
                              patch_size=config.MODEL.TRANS.PATCH_SIZE,
                              in_channels=3,
                              num_classes=config.MODEL.NUM_CLASSES,
                              embed_dim=config.MODEL.TRANS.EMBED_DIM,
                              depth=config.MODEL.TRANS.DEPTH,
                              num_heads=config.MODEL.TRANS.NUM_HEADS,
                              mlp_ratio=config.MODEL.TRANS.MLP_RATIO,
                              qkv_bias=config.MODEL.TRANS.QKV_BIAS,
                              dropout=config.MODEL.DROPOUT,
                              attention_dropout=config.MODEL.ATTENTION_DROPOUT,
                              droppath=config.MODEL.DROPPATH,
                              train_from_scratch=False,
                              config=config)
    return model
"""utils for ViT

Contains AverageMeter for monitoring, get_exclude_from_weight_decay_fn for training
and WarmupCosineScheduler for training

"""

import math
from paddle.optimizer.lr import LRScheduler


class AverageMeter():
    """ Meter for monitoring losses

    Attributes:
        avg: running weighted average of all values seen since the last reset
        sum: weighted sum of all values seen since the last reset
        cnt: total weight (number of samples) seen since the last reset
    """
    def __init__(self):
        self.avg = 0
        self.sum = 0
        self.cnt = 0
        self.reset()

    def reset(self):
        """reset all values to zeros"""
        self.avg = 0
        self.sum = 0
        self.cnt = 0

    def update(self, val, n=1):
        """update avg by val and n, where val is the avg of n values"""
        self.sum += val * n
        self.cnt += n
        # keep the previous avg instead of dividing by zero when nothing
        # has been counted yet (e.g. update(val, n=0) on a fresh meter)
        if self.cnt != 0:
            self.avg = self.sum / self.cnt


def get_exclude_from_weight_decay_fn(exclude_list=None):
    """ Set params with no weight decay during the training

    For certain params, e.g., positional encoding in ViT, weight decay
    may not needed during the learning, this method is used to find
    these params.

    Args:
        exclude_list: a list of param name suffixes which need to be excluded
            from weight decay. None or an empty list means nothing is excluded.
    Returns:
        None if exclude_list is empty, otherwise a function taking a param
        name and returning False if the name ends with one of the entries in
        exclude_list (the param is excluded from weight decay) and True
        otherwise (weight decay is applied).
    """
    if not exclude_list:
        return None

    # str.endswith accepts a tuple of candidate suffixes
    suffixes = tuple(exclude_list)

    def exclude_fn(param):
        return not param.endswith(suffixes)

    return exclude_fn


class WarmupCosineScheduler(LRScheduler):
    """Warmup Cosine Scheduler

    First apply linear warmup, then apply cosine decay schedule.
    Linearly increase learning rate from "warmup_start_lr" to "start_lr" over "warmup_epochs"
    Cosinely decrease learning rate from "start_lr" to "end_lr" over remaining
    "total_epochs - warmup_epochs"

    Attributes:
        learning_rate: the starting learning rate (without warmup), not used here!
        warmup_start_lr: warmup starting learning rate
        start_lr: the starting learning rate (without warmup)
        end_lr: the ending learning rate after whole loop
        warmup_epochs: # of epochs for warmup
        total_epochs: # of total epochs (include warmup)
        cycles: number of cosine cycles over the decay phase, 0.5 gives a
            single half-cosine from start_lr down to end_lr
    """
    def __init__(self,
                 learning_rate,
                 warmup_start_lr,
                 start_lr,
                 end_lr,
                 warmup_epochs,
                 total_epochs,
                 cycles=0.5,
                 last_epoch=-1,
                 verbose=False):
        """init WarmupCosineScheduler """
        # These fields are read by get_lr(), so they are assigned before the
        # base initializer runs (NOTE(review): the base LRScheduler is
        # expected to call get_lr() during construction -- confirm).
        self.warmup_epochs = warmup_epochs
        self.total_epochs = total_epochs
        self.warmup_start_lr = warmup_start_lr
        self.start_lr = start_lr
        self.end_lr = end_lr
        self.cycles = cycles
        super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose)

    def get_lr(self):
        """ return lr value for the current self.last_epoch """
        if self.last_epoch < self.warmup_epochs:
            # linear interpolation between warmup_start_lr and start_lr
            val = (self.start_lr - self.warmup_start_lr) * float(
                self.last_epoch)/float(self.warmup_epochs) + self.warmup_start_lr
            return val

        # fraction of the decay phase already completed
        progress = float(self.last_epoch - self.warmup_epochs) / float(
            max(1, self.total_epochs - self.warmup_epochs))
        # cosine factor in [0, 1], then rescaled to [end_lr, start_lr]
        val = max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
        val = max(0.0, val * (self.start_lr - self.end_lr) + self.end_lr)
        return val