diff --git a/image_classification/Multi_Node_Training/README.md b/image_classification/Multi_Node_Training/README.md
new file mode 100644
index 00000000..61e17b1c
--- /dev/null
+++ b/image_classification/Multi_Node_Training/README.md
@@ -0,0 +1,57 @@
+# Multiple Node Training
+English | [简体中文](./README_cn.md)
+
+PaddleViT also supports multi-node distributed training under collective mode.
+
+Here we provide a simple tutorial on converting multi-GPU training scripts
+into multi-node training scripts for any model in PaddleViT.
+
+This folder takes ViT model as an example.
+
+## Tutorial
+For any model in PaddleViT, one can implement multi-node training by modifying
+`main_multi_gpu.py`.
+1. Add the argument `ips='[host ips]'` to the `dist.spawn()` call.
+2. Then run the training script on every host.
+
+## Training example: ViT
+Suppose you have 2 hosts (denoted as nodes) with 4 GPUs on each machine.
+The nodes' IP addresses are `192.168.0.16` and `192.168.0.17`.
+
+1. Then modify the following lines of `run_train_multi_node.sh`:
+    ```shell
+    CUDA_VISIBLE_DEVICES=0,1,2,3 # ids of the gpus to use on this host
+    
+    -ips='192.168.0.16,192.168.0.17' # separated by comma
+    ```
+2. Run the training script on every host:
+    ```shell
+    sh run_train_multi_node.sh
+    ```
+
+## Multi-node training with one host
+It is possible to try multi-node training even when you have only one machine.
+
+1. Install docker and paddle. For more details, please refer to the guide
+    [here](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/fromdocker.html).
+
+2. Create a network between docker containers.
+    ```shell
+    docker network create -d bridge paddle_net
+    ```
+3. Create multiple containers as virtual hosts/nodes. Suppose we create 2 containers
+with 2 GPUs on each node.
+    ```shell
+    docker run --name paddle0 -it -d --gpus "device=0,1" --network paddle_net\
+    paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash
+    docker run --name paddle1 -it -d --gpus "device=2,3" --network paddle_net\
+    paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash
+    ```
+    >   Note:
+    >   1. One can assign the same gpu device to different containers, but this may cause OOM errors since multiple models will run on the same gpu.
+    >   2. One should use `-v` to mount the PaddleViT repository into the container.
+
+4. Modify `run_train_multi_node.sh` as described above and run the training script on every container.
+   
+    >   Note: One can use the `ping` or `ip addr` command to check the containers' IP addresses.
+
diff --git a/image_classification/Multi_Node_Training/README_cn.md b/image_classification/Multi_Node_Training/README_cn.md
new file mode 100644
index 00000000..c879d46f
--- /dev/null
+++ b/image_classification/Multi_Node_Training/README_cn.md
@@ -0,0 +1,45 @@
+# 多机多卡分布式训练
+
+简体中文 | [English](./README.md)
+
+PaddleViT 同样支持Collective多机多卡分布式训练。
+
+## 教程
+对于每一个模型，用户可以直接通过修改对应模型文件夹下的`main_multi_gpu.py`
+以进行多机训练。
+1. 在`dist.spawn()`里加入`ips='[host ips]' `。
+2. 在每个主机上运行代码。
+
+## 样例：ViT
+这个文件夹提供了分布式训练ViT模型的代码和shell脚本。
+假设你有2台主机，每个主机上有4张显卡。主机的ip地址为`192.168.0.16`和`192.168.0.17`。
+
+1. 修改shell脚本`run_train_multi_node.sh`的参数
+   ```shell
+    CUDA_VISIBLE_DEVICES=0,1,2,3 # ids of the gpus to use on this host
+    
+    -ips='192.168.0.16,192.168.0.17' # separated by comma
+    ```
+2. 在每个主机上运行脚本代码。
+    ```shell
+    sh run_train_multi_node.sh
+    ```
+## 单机上运行分布式训练
+如果仅有一台主机，同样可以通过docker实现单机上的分布式训练。
+1. 安装docker和paddle。[这里](https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/fromdocker.html)
+可以下载paddlepaddle提供的docker镜像。
+2. 创建docker容器间网络。
+   ```shell
+    docker network create -d bridge paddle_net
+    ```
+3. 创建多个docker容器作为虚拟主机，假设我们创建2个容器，并分别分配2个GPU。
+    ```shell
+    docker run --name paddle0 -it -d --gpus "device=0,1" --network paddle_net\
+    paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash
+    docker run --name paddle1 -it -d --gpus "device=2,3" --network paddle_net\
+    paddlepaddle/paddle:2.2.0-gpu-cuda10.2-cudnn7 /bin/bash
+    ```
+    >   注意: 
+    >   1. 可以将同一个GPU同时分配给多个容器，但是这可能会产生OOM错误，因为多个模型将同时运行在这个GPU上。 
+    >   2. 使用`-v`挂载PaddleViT所在的目录。
+
diff --git a/image_classification/Multi_Node_Training/config.py b/image_classification/Multi_Node_Training/config.py
new file mode 100644
index 00000000..a7273699
--- /dev/null
+++ b/image_classification/Multi_Node_Training/config.py
@@ -0,0 +1,153 @@
+#   Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Configuration
+
+Configuration for data, model architecture, and training, etc.
+Config can be set by .yaml file or by argparser (limited usage)
+
+
+"""
+import os
+from yacs.config import CfgNode as CN
+import yaml
+
+_C = CN()
+_C.BASE = [''] # parent yaml files merged first, paths are relative to the including yaml file
+
+# data settings
+_C.DATA = CN()
+_C.DATA.BATCH_SIZE = 256 #256 # train batch_size for single GPU
+_C.DATA.BATCH_SIZE_EVAL = 8 #64 # val batch_size for single GPU
+_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset
+_C.DATA.DATASET = 'imagenet2012' # dataset name
+_C.DATA.IMAGE_SIZE = 224 # input image size: 224 for pretrain, 384 for finetune
+_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode
+_C.DATA.NUM_WORKERS = 2 # num_workers value passed to the data loader
+
+# model settings
+_C.MODEL = CN()
+_C.MODEL.TYPE = 'ViT'
+_C.MODEL.NAME = 'ViT'
+_C.MODEL.RESUME = None
+_C.MODEL.PRETRAINED = None
+_C.MODEL.NUM_CLASSES = 1000
+_C.MODEL.DROPOUT = 0.1
+_C.MODEL.DROPPATH = 0.1
+_C.MODEL.ATTENTION_DROPOUT = 0.1
+
+# transformer settings
+_C.MODEL.TRANS = CN()
+_C.MODEL.TRANS.PATCH_SIZE = 32
+_C.MODEL.TRANS.EMBED_DIM = 768
+_C.MODEL.TRANS.MLP_RATIO= 4.0
+_C.MODEL.TRANS.NUM_HEADS = 12
+_C.MODEL.TRANS.DEPTH = 12
+_C.MODEL.TRANS.QKV_BIAS = True
+
+# training settings
+_C.TRAIN = CN()
+_C.TRAIN.LAST_EPOCH = 0
+_C.TRAIN.NUM_EPOCHS = 300
+_C.TRAIN.WARMUP_EPOCHS = 3 #34 # ~ 10k steps for 4096 batch size
+_C.TRAIN.WEIGHT_DECAY = 0.05 #0.3 # 0.0 for finetune
+_C.TRAIN.BASE_LR = 0.001 #0.003 for pretrain # 0.03 for finetune
+_C.TRAIN.WARMUP_START_LR = 1e-6 #0.0
+_C.TRAIN.END_LR = 5e-4
+_C.TRAIN.GRAD_CLIP = 1.0
+_C.TRAIN.ACCUM_ITER = 2 #1
+
+_C.TRAIN.LR_SCHEDULER = CN()
+_C.TRAIN.LR_SCHEDULER.NAME = 'warmupcosine'
+_C.TRAIN.LR_SCHEDULER.MILESTONES = "30, 60, 90" # only used in StepLRScheduler
+_C.TRAIN.LR_SCHEDULER.DECAY_EPOCHS = 30 # only used in StepLRScheduler
+_C.TRAIN.LR_SCHEDULER.DECAY_RATE = 0.1 # only used in StepLRScheduler
+
+_C.TRAIN.OPTIMIZER = CN()
+_C.TRAIN.OPTIMIZER.NAME = 'AdamW'
+_C.TRAIN.OPTIMIZER.EPS = 1e-8
+_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)  # for adamW
+_C.TRAIN.OPTIMIZER.MOMENTUM = 0.9
+
+# misc
+_C.SAVE = "./output"
+_C.TAG = "default"
+_C.SAVE_FREQ = 10 # freq to save checkpoint
+_C.REPORT_FREQ = 100 # freq to log info
+_C.VALIDATE_FREQ = 100 # freq to do validation
+_C.SEED = 0
+_C.EVAL = False # run evaluation only
+_C.AMP = False # mixed precision training
+_C.LOCAL_RANK = 0
+_C.NGPUS = -1
+
+
+def _update_config_from_file(config, cfg_file):  # merge yaml cfg_file and its BASE files into config, in place
+    config.defrost()  # unlock the config for modification
+    with open(cfg_file, 'r') as infile:
+        yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader)  # parsed only to read the BASE list
+    for cfg in yaml_cfg.setdefault('BASE', ['']):  # BASE entries are relative to cfg_file's folder
+        if cfg:  # skip empty entries such as the default ['']
+            _update_config_from_file(
+                config, os.path.join(os.path.dirname(cfg_file), cfg)
+            )
+    print('merging config from {}'.format(cfg_file))
+    config.merge_from_file(cfg_file)  # merged after its BASE files, so this file's values win
+    config.freeze()  # lock the config again
+
+def update_config(config, args):
+    """Update config by ArgumentParser; options left unset (falsy) are ignored
+    Args:
+        config: CfgNode to update, modified in place
+        args: ArgumentParser contains options
+    Return:
+        config: updated config, left unfrozen
+    """
+    if args.cfg:
+        _update_config_from_file(config, args.cfg)
+    config.defrost()
+    if args.dataset:
+        config.DATA.DATASET = args.dataset
+    if args.batch_size:
+        config.DATA.BATCH_SIZE = args.batch_size
+    if args.image_size:
+        config.DATA.IMAGE_SIZE = args.image_size
+    if args.data_path:
+        config.DATA.DATA_PATH = args.data_path
+    if args.ngpus:
+        config.NGPUS = args.ngpus
+    if args.eval:
+        config.EVAL = True
+        if args.batch_size:  # without -batch_size keep the configured BATCH_SIZE_EVAL
+            config.DATA.BATCH_SIZE_EVAL = args.batch_size
+    if args.pretrained:
+        config.MODEL.PRETRAINED = args.pretrained
+    if args.resume:
+        config.MODEL.RESUME = args.resume
+    if args.last_epoch:
+        config.TRAIN.LAST_EPOCH = args.last_epoch
+    if args.amp:
+        # mixed precision is only used for training, never for evaluation
+        config.AMP = not config.EVAL
+
+    #config.freeze()
+    return config
+
+
+def get_config(cfg_file=None):
+    """Return a clone of the default config, updated from yaml `cfg_file` if one is given"""
+    config = _C.clone()  # work on a copy so the module-level defaults stay untouched
+    if cfg_file:
+        _update_config_from_file(config, cfg_file)
+    return config
diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml
new file mode 100644
index 00000000..eff0fc29
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_base_patch16_224.yaml
@@ -0,0 +1,21 @@
+DATA:
+    IMAGE_SIZE: 224
+    CROP_PCT: 0.875
+MODEL:
+    TYPE: ViT
+    NAME: vit_base_patch16_224
+    TRANS:
+        PATCH_SIZE: 16
+        EMBED_DIM: 768
+        MLP_RATIO: 4.0
+        DEPTH: 12
+        NUM_HEADS: 12
+        QKV_BIAS: true
+TRAIN:
+    NUM_EPOCHS: 300
+    WARMUP_EPOCHS: 3
+    WEIGHT_DECAY: 0.3
+    BASE_LR: 0.003
+    WARMUP_START_LR: 1e-6
+    END_LR: 5e-4
+    ACCUM_ITER: 2
diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml
new file mode 100644
index 00000000..04cdfaee
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_base_patch16_384.yaml
@@ -0,0 +1,14 @@
+DATA:
+    IMAGE_SIZE: 384
+    CROP_PCT: 1.0
+MODEL:
+    TYPE: ViT
+    NAME: vit_base_patch16_384
+    TRANS:
+        PATCH_SIZE: 16
+        EMBED_DIM: 768
+        MLP_RATIO: 4.0
+        DEPTH: 12
+        NUM_HEADS: 12
+        QKV_BIAS: true
+
diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml
new file mode 100644
index 00000000..8b0516d2
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_base_patch32_224.yaml
@@ -0,0 +1,21 @@
+DATA:
+    IMAGE_SIZE: 224
+    CROP_PCT: 0.875
+MODEL:
+    TYPE: ViT
+    NAME: vit_base_patch32_224
+    TRANS:
+        PATCH_SIZE: 32
+        EMBED_DIM: 768
+        MLP_RATIO: 4.0
+        DEPTH: 12
+        NUM_HEADS: 12
+        QKV_BIAS: true
+TRAIN:
+    NUM_EPOCHS: 300
+    WARMUP_EPOCHS: 3
+    WEIGHT_DECAY: 0.3
+    BASE_LR: 0.003
+    WARMUP_START_LR: 1e-6
+    END_LR: 5e-4
+    ACCUM_ITER: 2
diff --git a/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml b/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml
new file mode 100644
index 00000000..5aa3e6f1
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_base_patch32_384.yaml
@@ -0,0 +1,21 @@
+DATA:
+    IMAGE_SIZE: 384
+    CROP_PCT: 1.0
+MODEL:
+    TYPE: ViT
+    NAME: vit_base_patch32_384
+    TRANS:
+        PATCH_SIZE: 32
+        EMBED_DIM: 768
+        MLP_RATIO: 4.0
+        DEPTH: 12
+        NUM_HEADS: 12
+        QKV_BIAS: true
+TRAIN:
+    NUM_EPOCHS: 300
+    WARMUP_EPOCHS: 3
+    WEIGHT_DECAY: 0.3
+    BASE_LR: 0.003
+    WARMUP_START_LR: 1e-6
+    END_LR: 5e-4
+    ACCUM_ITER: 2
diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml
new file mode 100644
index 00000000..23ac9b37
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_large_patch16_224.yaml
@@ -0,0 +1,14 @@
+DATA:
+    IMAGE_SIZE: 224
+    CROP_PCT: 0.875
+MODEL:
+    TYPE: ViT
+    NAME: vit_large_patch16_224
+    TRANS:
+        PATCH_SIZE: 16
+        EMBED_DIM: 1024
+        MLP_RATIO: 4.0
+        DEPTH: 24
+        NUM_HEADS: 16
+        QKV_BIAS: true
+
diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml
new file mode 100644
index 00000000..c8c01a6a
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_large_patch16_384.yaml
@@ -0,0 +1,14 @@
+DATA:
+    IMAGE_SIZE: 384
+    CROP_PCT: 1.0
+MODEL:
+    TYPE: ViT
+    NAME: vit_large_patch16_384
+    TRANS:
+        PATCH_SIZE: 16
+        EMBED_DIM: 1024
+        MLP_RATIO: 4.0
+        DEPTH: 24
+        NUM_HEADS: 16
+        QKV_BIAS: true
+
diff --git a/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml b/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml
new file mode 100644
index 00000000..6b7f15aa
--- /dev/null
+++ b/image_classification/Multi_Node_Training/configs/vit_large_patch32_384.yaml
@@ -0,0 +1,14 @@
+DATA:
+    IMAGE_SIZE: 384
+    CROP_PCT: 1.0
+MODEL:
+    TYPE: ViT
+    NAME: vit_large_patch32_384
+    TRANS:
+        PATCH_SIZE: 32
+        EMBED_DIM: 1024
+        MLP_RATIO: 4.0
+        DEPTH: 24
+        NUM_HEADS: 16
+        QKV_BIAS: true
+
diff --git a/image_classification/Multi_Node_Training/datasets.py b/image_classification/Multi_Node_Training/datasets.py
new file mode 100644
index 00000000..e207f9ba
--- /dev/null
+++ b/image_classification/Multi_Node_Training/datasets.py
@@ -0,0 +1,187 @@
+# Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Dataset related classes and methods for ViT training and validation
+Cifar10, Cifar100 and ImageNet2012 are supported
+"""
+
+import os
+import math
+from paddle.io import Dataset, DataLoader, DistributedBatchSampler
+from paddle.vision import transforms, datasets, image_load
+
+class ImageNet2012Dataset(Dataset):
+    """Build ImageNet2012 dataset
+
+    This class gets train/val imagenet datasets, which loads transformed data and labels.
+
+    Attributes:
+        file_folder: path where imagenet images are stored
+        transform: preprocessing ops to apply on image, None returns the loaded RGB image as is
+        list_file: train_list.txt / val_list.txt in file_folder, one "image_path label" per line
+        img_path_list: list of full path of images in whole dataset
+        label_list: list of labels of whole dataset
+    """
+
+    def __init__(self, file_folder, mode="train", transform=None):
+        """Init ImageNet2012 Dataset with dataset file path, mode(train/val), and transform"""
+        super(ImageNet2012Dataset, self).__init__()
+        assert mode in ["train", "val"]
+        self.file_folder = file_folder
+        self.transform = transform
+        self.img_path_list = []
+        self.label_list = []
+        list_name = "train_list.txt" if mode == "train" else "val_list.txt"
+        self.list_file = os.path.join(self.file_folder, list_name)
+
+        with open(self.list_file, 'r') as infile:
+            for line in infile:
+                fields = line.split()
+                if not fields:  # skip blank lines instead of failing with IndexError
+                    continue
+                img_path, img_label = fields[0], int(fields[1])
+                self.img_path_list.append(os.path.join(self.file_folder, img_path))
+                self.label_list.append(img_label)
+        print(f'----- Imagenet2012 image {mode} list len = {len(self.label_list)}')
+
+    def __len__(self):
+        return len(self.label_list)
+
+    def __getitem__(self, index):
+        """Return (image, label) of sample `index`; the transform is applied only if one is set"""
+        data = image_load(self.img_path_list[index]).convert('RGB')
+        if self.transform is not None:
+            data = self.transform(data)
+        return data, self.label_list[index]
+
+
+def get_train_transforms(config):
+    """ Get training transforms
+
+    For training, a RandomResizedCrop (scale 0.05 to 1.0) to IMAGE_SIZE x IMAGE_SIZE is applied,
+    the result is converted to tensor, then normalization is applied with
+    [0.5, 0.5, 0.5] mean and std. The input pixel values must be rescaled to [0, 1.]
+
+    Args:
+        config: configs contains IMAGE_SIZE, see config.py for details
+    Returns:
+        transforms_train: training transforms
+    """
+
+    transforms_train = transforms.Compose([
+        transforms.RandomResizedCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE),
+                                     scale=(0.05, 1.0)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    return transforms_train
+
+
+def get_val_transforms(config):
+    """ Get validation transforms
+
+    For validation, image is first Resize then CenterCrop to image_size.
+    Then normalization is applied with [0.5, 0.5, 0.5] mean and std.
+    The input pixel values must be rescaled to [0, 1.]
+    Outputs is converted to tensor
+
+    Args:
+        config: configs contains IMAGE_SIZE and CROP_PCT, see config.py for details
+    Returns:
+        transforms_val: validation transforms
+    """
+
+    scale_size = int(math.floor(config.DATA.IMAGE_SIZE / config.DATA.CROP_PCT))
+    transforms_val = transforms.Compose([
+        transforms.Resize(scale_size, 'bicubic'), # single int for resize shorter side of image
+        transforms.CenterCrop((config.DATA.IMAGE_SIZE, config.DATA.IMAGE_SIZE)),
+        transforms.ToTensor(),
+        transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        #transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+    ])
+    return transforms_val
+
+
+def get_dataset(config, mode='train'):
+    """ Get the dataset object selected by config.DATA.DATASET and mode (train/val)
+    Args:
+        config: configs contains dataset related settings. see config.py for details
+        mode: 'train' or 'val'; for cifar10/cifar100 'val' loads the 'test' split
+    Returns:
+        dataset: dataset object
+    Raises:
+        NotImplementedError: if config.DATA.DATASET is not cifar10, cifar100 or imagenet2012
+    """
+    assert mode in ['train', 'val']
+    if config.DATA.DATASET == "cifar10":
+        if mode == 'train':
+            dataset = datasets.Cifar10(mode=mode, transform=get_train_transforms(config))
+        else:
+            mode = 'test'
+            dataset = datasets.Cifar10(mode=mode, transform=get_val_transforms(config))
+    elif config.DATA.DATASET == "cifar100":
+        if mode == 'train':
+            dataset = datasets.Cifar100(mode=mode, transform=get_train_transforms(config))
+        else:
+            mode = 'test'
+            dataset = datasets.Cifar100(mode=mode, transform=get_val_transforms(config))
+    elif config.DATA.DATASET == "imagenet2012":
+        if mode == 'train':
+            dataset = ImageNet2012Dataset(config.DATA.DATA_PATH,
+                                          mode=mode,
+                                          transform=get_train_transforms(config))
+        else:
+            dataset = ImageNet2012Dataset(config.DATA.DATA_PATH,
+                                          mode=mode,
+                                          transform=get_val_transforms(config))
+    else:
+        raise NotImplementedError(
+            f"[{config.DATA.DATASET}] Only cifar10, cifar100, imagenet2012 are supported now")
+    return dataset
+
+
+def get_dataloader(config, dataset, mode='train', multi_process=False):
+    """Get dataloader with config, dataset, mode as input, allows multiGPU settings.
+
+    Multi-GPU loading is implemented with DistributedBatchSampler.
+
+    Args:
+        config: see config.py for details
+        dataset: paddle.io.dataset object
+        mode: 'train' uses DATA.BATCH_SIZE with shuffle, any other value DATA.BATCH_SIZE_EVAL
+        multi_process: if True, use DistributedBatchSampler to support multi-processing
+    Returns:
+        dataloader: paddle.io.DataLoader object.
+    """
+
+    if mode == 'train':
+        batch_size = config.DATA.BATCH_SIZE
+    else:
+        batch_size = config.DATA.BATCH_SIZE_EVAL
+
+    if multi_process is True:
+        sampler = DistributedBatchSampler(dataset,
+                                          batch_size=batch_size,
+                                          shuffle=(mode == 'train'))
+        dataloader = DataLoader(dataset,
+                                batch_sampler=sampler,
+                                num_workers=config.DATA.NUM_WORKERS)
+    else:
+        dataloader = DataLoader(dataset,
+                                batch_size=batch_size,
+                                num_workers=config.DATA.NUM_WORKERS,
+                                shuffle=(mode == 'train'))
+    return dataloader
diff --git a/image_classification/Multi_Node_Training/droppath.py b/image_classification/Multi_Node_Training/droppath.py
new file mode 100644
index 00000000..25b8d5ff
--- /dev/null
+++ b/image_classification/Multi_Node_Training/droppath.py
@@ -0,0 +1,60 @@
+#   Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth
+"""
+
+import paddle
+import paddle.nn as nn
+
+
+class DropPath(nn.Layer):
+    """DropPath (stochastic depth): randomly drops whole samples of the input while training"""
+    def __init__(self, drop_prob=None):
+        super(DropPath, self).__init__()
+        self.drop_prob = drop_prob
+
+    def drop_path(self, inputs):
+        """drop path op
+        Args:
+            inputs: tensor with arbitrary shape, one drop decision is made per index of dim 0
+        Uses self.drop_prob (float drop path probability, None is treated as 0.0)
+        and self.training (drop path is only applied in training mode).
+        Returns:
+            output: output tensor after drop path, `inputs` itself if nothing is dropped
+        """
+        # if prob is None/0 or eval mode, return original input
+        if self.drop_prob is None or self.drop_prob == 0. or not self.training:
+            return inputs
+        keep_prob = 1 - self.drop_prob
+        keep_prob = paddle.to_tensor(keep_prob, dtype='float32')
+        shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1)  # shape=(N, 1, 1, 1)
+        random_tensor = keep_prob + paddle.rand(shape, dtype=inputs.dtype)
+        random_tensor = random_tensor.floor() # mask
+        output = inputs.divide(keep_prob) * random_tensor #divide is to keep same output expectation
+        return output
+
+    def forward(self, inputs):
+        return self.drop_path(inputs)
+
+
+#def main():
+#    tmp = paddle.to_tensor(np.random.rand(8, 16, 8, 8), dtype='float32')
+#    dp = DropPath(0.5)
+#    out = dp(tmp)
+#    print(out)
+#
+#if __name__ == "__main__":
+#    main()
diff --git a/image_classification/Multi_Node_Training/main_multi_node.py b/image_classification/Multi_Node_Training/main_multi_node.py
new file mode 100644
index 00000000..bb0462b2
--- /dev/null
+++ b/image_classification/Multi_Node_Training/main_multi_node.py
@@ -0,0 +1,379 @@
+#  Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""ViT training/validation using multiple GPU """
+
+import sys
+import os
+import time
+import logging
+import argparse
+import random
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+import paddle.distributed as dist
+from datasets import get_dataloader
+from datasets import get_dataset
+from transformer import build_vit as build_model
+from utils import AverageMeter
+from utils import WarmupCosineScheduler
+from config import get_config
+from config import update_config
+
+
+parser = argparse.ArgumentParser('ViT')
+parser.add_argument('-cfg', type=str, default=None)
+parser.add_argument('-dataset', type=str, default=None)
+parser.add_argument('-batch_size', type=int, default=None)
+parser.add_argument('-image_size', type=int, default=None)
+parser.add_argument('-data_path', type=str, default=None)
+parser.add_argument('-ngpus', type=int, default=None)
+parser.add_argument('-pretrained', type=str, default=None)
+parser.add_argument('-resume', type=str, default=None)
+parser.add_argument('-last_epoch', type=int, default=None)
+parser.add_argument('-eval', action='store_true')
+parser.add_argument('-amp', action='store_true')
+parser.add_argument('-ips', type=str, default="127.0.0.1") # comma separated host ips, see README
+arguments = parser.parse_args()
+
+
+log_format = "%(asctime)s %(message)s"
+logging.basicConfig(stream=sys.stdout, level=logging.INFO,
+                    format=log_format, datefmt="%m%d %I:%M:%S %p")
+
+# get default config
+config = get_config()
+# update config by arguments
+config = update_config(config, arguments)
+
+# set output folder: a timestamped train-* / eval-* sub-folder of config.SAVE
+if not config.EVAL:
+    config.SAVE = '{}/train-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
+else:
+    config.SAVE = '{}/eval-{}'.format(config.SAVE, time.strftime('%Y%m%d-%H-%M-%S'))
+
+if not os.path.exists(config.SAVE):
+    os.makedirs(config.SAVE, exist_ok=True)
+
+# also write the log to log.txt in the output folder
+logger = logging.getLogger()
+fh = logging.FileHandler(os.path.join(config.SAVE, 'log.txt'))
+fh.setFormatter(logging.Formatter(log_format))
+logger.addHandler(fh)
+logger.info(f'config= {config}')
+
+
+def train(dataloader,
+          model,
+          criterion,
+          optimizer,
+          epoch,
+          total_batch,
+          debug_steps=100,
+          accum_iter=1,
+          amp=False):
+    """Training for one epoch
+    Args:
+        dataloader: paddle.io.DataLoader, dataloader instance
+        model: nn.Layer, a ViT model
+        criterion: nn.criterion
+        optimizer: optimizer that is stepped and cleared every accum_iter batches
+        epoch: int, current epoch
+        total_batch: int, total num of batches in one epoch, for logging
+        debug_steps: int, num of iters to log info, default: 100
+        accum_iter: int, num of iters for accumulating gradients, default: 1
+        amp: bool, if True, use mix precision training, default: False
+    Returns:
+        train_loss_meter.avg
+        train_acc_meter.avg
+        train_time
+    """
+    model.train()
+    train_loss_meter = AverageMeter()
+    train_acc_meter = AverageMeter()
+    if amp is True:
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+    time_st = time.time()
+
+    for batch_id, data in enumerate(dataloader):
+        image = data[0]
+        label = data[1]
+
+        if amp is True:
+            with paddle.amp.auto_cast():
+                output = model(image)
+                loss = criterion(output, label)
+            scaled = scaler.scale(loss)
+            scaled.backward()
+
+            if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)):
+                scaler.minimize(optimizer, scaled)
+                optimizer.clear_grad()
+        else:
+            output = model(image)
+            loss = criterion(output, label)
+            #NOTE: division may be needed depending on the loss function
+            # Here no division is needed:
+            # default 'reduction' param in nn.CrossEntropyLoss is set to 'mean'
+            #loss =  loss / accum_iter
+            loss.backward()
+
+            if ((batch_id +1) % accum_iter == 0) or (batch_id + 1 == len(dataloader)):
+                optimizer.step()
+                optimizer.clear_grad()
+
+        pred = F.softmax(output)
+        acc = paddle.metric.accuracy(pred, label.unsqueeze(1))
+
+        batch_size = image.shape[0]
+        train_loss_meter.update(loss.numpy()[0], batch_size)
+        train_acc_meter.update(acc.numpy()[0], batch_size)
+
+        if batch_id % debug_steps == 0:
+            logger.info(
+                f"Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " +
+                f"Step[{batch_id:04d}/{total_batch:04d}], " +
+                f"Avg Loss: {train_loss_meter.avg:.4f}, " +
+                f"Avg Acc: {train_acc_meter.avg:.4f}")
+
+    train_time = time.time() - time_st
+    return train_loss_meter.avg, train_acc_meter.avg, train_time
+
+
+def validate(dataloader, model, criterion, total_batch, debug_steps=100):
+    """Validation for whole dataset
+    Args:
+        dataloader: paddle.io.DataLoader, dataloader instance
+        model: nn.Layer, a ViT model
+        criterion: nn.criterion
+        total_batch: int, total num of validation batches, for logging
+        debug_steps: int, num of iters to log info
+    Returns:
+        val_loss_meter.avg
+        val_acc1_meter.avg
+        val_acc5_meter.avg
+        val_time
+    """
+    model.eval()
+    val_loss_meter = AverageMeter()
+    val_acc1_meter = AverageMeter()
+    val_acc5_meter = AverageMeter()
+    time_st = time.time()
+
+    with paddle.no_grad():
+        for batch_id, data in enumerate(dataloader):
+            image = data[0]
+            label = data[1]
+
+            output = model(image)
+            loss = criterion(output, label)
+
+            pred = F.softmax(output)
+            acc1 = paddle.metric.accuracy(pred, label.unsqueeze(1))
+            acc5 = paddle.metric.accuracy(pred, label.unsqueeze(1), k=5)
+
+            # reduce the metrics over all processes, then divide by the number of processes
+            dist.all_reduce(loss)
+            dist.all_reduce(acc1)
+            dist.all_reduce(acc5)
+            loss = loss / dist.get_world_size()
+            acc1 = acc1 / dist.get_world_size()
+            acc5 = acc5 / dist.get_world_size()
+
+            batch_size = paddle.to_tensor(image.shape[0])
+            dist.all_reduce(batch_size)  # presumably the total batch size over all processes
+            val_loss_meter.update(loss.numpy()[0], batch_size.numpy()[0])
+            val_acc1_meter.update(acc1.numpy()[0], batch_size.numpy()[0])
+            val_acc5_meter.update(acc5.numpy()[0], batch_size.numpy()[0])
+
+            if batch_id % debug_steps == 0:
+                logger.info(
+                    f"Val Step[{batch_id:04d}/{total_batch:04d}], " +
+                    f"Avg Loss: {val_loss_meter.avg:.4f}, " +
+                    f"Avg Acc@1: {val_acc1_meter.avg:.4f}, " +
+                    f"Avg Acc@5: {val_acc5_meter.avg:.4f}")
+
+    val_time = time.time() - time_st
+    return val_loss_meter.avg, val_acc1_meter.avg, val_acc5_meter.avg, val_time
+
+
+def main_worker(*args):
+    """Per-process entry point: build model/optimizer, then evaluate or train.
+
+    Args:
+        *args: (dataset_train, dataset_val), as forwarded by dist.spawn.
+    """
+    # 0. Preparation
+    dist.init_parallel_env()
+    last_epoch = config.TRAIN.LAST_EPOCH
+    world_size = paddle.distributed.get_world_size()
+    local_rank = paddle.distributed.get_rank()
+    logger.info(f'----- world_size = {world_size}, local_rank = {local_rank}')
+    seed = config.SEED + local_rank  # a different seed for every process
+    paddle.seed(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    # 1. Create model
+    model = build_model(config)
+    model = paddle.DataParallel(model)
+    # 2. Create train and val dataloader
+    dataset_train, dataset_val = args[0], args[1]
+    dataloader_train = get_dataloader(config, dataset_train, 'train', True)
+    dataloader_val = get_dataloader(config, dataset_val, 'test', True)
+    total_batch_train = len(dataloader_train)
+    total_batch_val = len(dataloader_val)
+    logging.info(f'----- Total # of train batch (single gpu): {total_batch_train}')
+    logging.info(f'----- Total # of val batch (single gpu): {total_batch_val}')
+    # 3. Define criterion
+    criterion = nn.CrossEntropyLoss()
+    # 4. Define optimizer and lr_scheduler
+    scheduler = None
+    if config.TRAIN.LR_SCHEDULER.NAME == "warmupcosine":
+        scheduler = WarmupCosineScheduler(learning_rate=config.TRAIN.BASE_LR,
+                                          warmup_start_lr=config.TRAIN.WARMUP_START_LR,
+                                          start_lr=config.TRAIN.BASE_LR,
+                                          end_lr=config.TRAIN.END_LR,
+                                          warmup_epochs=config.TRAIN.WARMUP_EPOCHS,
+                                          total_epochs=config.TRAIN.NUM_EPOCHS,
+                                          last_epoch=last_epoch)
+    elif config.TRAIN.LR_SCHEDULER.NAME == "cosine":
+        scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=config.TRAIN.BASE_LR,
+                                                             T_max=config.TRAIN.NUM_EPOCHS,
+                                                             last_epoch=last_epoch)
+    elif config.TRAIN.LR_SCHEDULER.NAME == "multi-step":
+        milestones = [int(v.strip()) for v in config.TRAIN.LR_SCHEDULER.MILESTONES.split(",")]
+        scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=config.TRAIN.BASE_LR,
+                                                       milestones=milestones,
+                                                       gamma=config.TRAIN.LR_SCHEDULER.DECAY_RATE,
+                                                       last_epoch=last_epoch)
+    else:
+        logging.fatal(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.")
+        raise NotImplementedError(f"Unsupported Scheduler: {config.TRAIN.LR_SCHEDULER}.")
+
+    # gradient clipping is configured the same way for every optimizer
+    clip = None
+    if config.TRAIN.GRAD_CLIP:
+        clip = paddle.nn.ClipGradByGlobalNorm(config.TRAIN.GRAD_CLIP)
+    if config.TRAIN.OPTIMIZER.NAME == "SGD":
+        optimizer = paddle.optimizer.Momentum(
+            parameters=model.parameters(),
+            learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR,
+            weight_decay=config.TRAIN.WEIGHT_DECAY,
+            momentum=config.TRAIN.OPTIMIZER.MOMENTUM,
+            grad_clip=clip)
+    elif config.TRAIN.OPTIMIZER.NAME == "AdamW":
+        optimizer = paddle.optimizer.AdamW(
+            parameters=model.parameters(),
+            learning_rate=scheduler if scheduler is not None else config.TRAIN.BASE_LR,
+            beta1=config.TRAIN.OPTIMIZER.BETAS[0],
+            beta2=config.TRAIN.OPTIMIZER.BETAS[1],
+            weight_decay=config.TRAIN.WEIGHT_DECAY,
+            epsilon=config.TRAIN.OPTIMIZER.EPS,
+            grad_clip=clip,
+            #apply_decay_param_fun=get_exclude_from_weight_decay_fn(['pos_embed', 'cls_token']),
+            )
+    else:
+        logging.fatal(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.")
+        raise NotImplementedError(f"Unsupported Optimizer: {config.TRAIN.OPTIMIZER.NAME}.")
+
+    # 5. Load pretrained model / load resume model and optimizer states
+    if config.MODEL.PRETRAINED:
+        if (config.MODEL.PRETRAINED).endswith('.pdparams'):
+            raise ValueError(f'{config.MODEL.PRETRAINED} should not contain .pdparams')
+        assert os.path.isfile(config.MODEL.PRETRAINED + '.pdparams') is True
+        model_state = paddle.load(config.MODEL.PRETRAINED+'.pdparams')
+        model.set_dict(model_state)
+        logger.info(f"----- Pretrained: Load model state from {config.MODEL.PRETRAINED}")
+
+    if config.MODEL.RESUME:
+        assert os.path.isfile(config.MODEL.RESUME+'.pdparams') is True
+        assert os.path.isfile(config.MODEL.RESUME+'.pdopt') is True
+        model_state = paddle.load(config.MODEL.RESUME+'.pdparams')
+        model.set_dict(model_state)
+        opt_state = paddle.load(config.MODEL.RESUME+'.pdopt')
+        optimizer.set_state_dict(opt_state)
+        logger.info(
+            f"----- Resume Training: Load model and optmizer states from {config.MODEL.RESUME}")
+
+    # 6. Validation only: evaluate once and return without training
+    if config.EVAL:
+        logger.info('----- Start Validating')
+        val_loss, val_acc1, val_acc5, val_time = validate(
+            dataloader=dataloader_val,
+            model=model,
+            criterion=criterion,
+            total_batch=total_batch_val,
+            debug_steps=config.REPORT_FREQ)
+        logger.info(f"Validation Loss: {val_loss:.4f}, " +
+                    f"Validation Acc@1: {val_acc1:.4f}, " +
+                    f"Validation Acc@5: {val_acc5:.4f}, " +
+                    f"time: {val_time:.2f}")
+        return
+
+    # 7. Start training and validation
+    logging.info(f"Start training from epoch {last_epoch+1}.")
+    for epoch in range(last_epoch+1, config.TRAIN.NUM_EPOCHS+1):
+        # train
+        logging.info(f"Now training epoch {epoch}. LR={optimizer.get_lr():.6f}")
+        train_loss, train_acc, train_time = train(dataloader=dataloader_train,
+                                                  model=model,
+                                                  criterion=criterion,
+                                                  optimizer=optimizer,
+                                                  epoch=epoch,
+                                                  total_batch=total_batch_train,
+                                                  debug_steps=config.REPORT_FREQ,
+                                                  accum_iter=config.TRAIN.ACCUM_ITER,
+                                                  amp=config.AMP)
+        scheduler.step()
+
+        logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " +
+                    f"Train Loss: {train_loss:.4f}, " +
+                    f"Train Acc: {train_acc:.4f}, " +
+                    f"time: {train_time:.2f}")
+        # validation
+        if epoch % config.VALIDATE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS:
+            logger.info(f'----- Validation after Epoch: {epoch}')
+            val_loss, val_acc1, val_acc5, val_time = validate(
+                dataloader=dataloader_val,
+                model=model,
+                criterion=criterion,
+                total_batch=total_batch_val,
+                debug_steps=config.REPORT_FREQ)
+            logger.info(f"----- Epoch[{epoch:03d}/{config.TRAIN.NUM_EPOCHS:03d}], " +
+                        f"Validation Loss: {val_loss:.4f}, " +
+                        f"Validation Acc@1: {val_acc1:.4f}, " +
+                        f"Validation Acc@5: {val_acc5:.4f}, " +
+                        f"time: {val_time:.2f}")
+        # model save (only the process whose rank is 0 writes checkpoints)
+        if local_rank == 0:
+            if epoch % config.SAVE_FREQ == 0 or epoch == config.TRAIN.NUM_EPOCHS:
+                model_path = os.path.join(
+                    config.SAVE, f"{config.MODEL.TYPE}-Epoch-{epoch}-Loss-{train_loss}")
+                paddle.save(model.state_dict(), model_path + '.pdparams')
+                paddle.save(optimizer.state_dict(), model_path + '.pdopt')
+                logger.info(f"----- Save model: {model_path}.pdparams")
+                logger.info(f"----- Save optim: {model_path}.pdopt")
+
+
+def main():
+    """Load both datasets, resolve the GPU count and spawn config.NGPUS workers."""
+    datasets = (get_dataset(config, mode='train'), get_dataset(config, mode='val'))
+    config.NGPUS = config.NGPUS if config.NGPUS != -1 else len(paddle.static.cuda_places())
+    dist.spawn(main_worker, args=datasets, nprocs=config.NGPUS, ips=arguments.ips)
+
+
+if __name__ == "__main__":
+    main()  # script entry point: runs only when the file is executed directly
diff --git a/image_classification/Multi_Node_Training/run_train_multi_node.sh b/image_classification/Multi_Node_Training/run_train_multi_node.sh
new file mode 100644
index 00000000..bc3893be
--- /dev/null
+++ b/image_classification/Multi_Node_Training/run_train_multi_node.sh
@@ -0,0 +1,8 @@
+CUDA_VISIBLE_DEVICES=0,1 \
+python main_multi_node.py \
+-cfg='./configs/vit_base_patch16_224.yaml' \
+-dataset='imagenet2012' \
+-batch_size=32 \
+-data_path='./dataset/imagenet' \
+-ips='172.18.0.2, 172.18.0.3' # placeholder addresses: replace with the IPs of your own nodes
+#-amp
diff --git a/image_classification/Multi_Node_Training/transformer.py b/image_classification/Multi_Node_Training/transformer.py
new file mode 100644
index 00000000..384cacf5
--- /dev/null
+++ b/image_classification/Multi_Node_Training/transformer.py
@@ -0,0 +1,437 @@
+#   Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Implement Transformer Class for ViT
+"""
+
+import copy
+import paddle
+import paddle.nn as nn
+from droppath import DropPath
+from config import get_config
+
+
+class Identity(nn.Layer):
+    """Pass-through layer: forward() returns its input unchanged.
+
+    Lets an optional sub-layer be swapped out without an 'if' in forward().
+    """
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x):
+        return x
+
+
+class PatchEmbedding(nn.Layer):
+    """Turn an image into a sequence of patch tokens plus a class token.
+
+    Attributes:
+        patch_embedding: nn.Conv2D whose kernel size and stride both equal patch_size
+        position_embeddings: parameter of shape [1, n_patches + 1, embed_dim]
+        cls_token: zero-initialized parameter of shape [1, 1, embed_dim]
+        dropout: nn.Dropout applied to the summed embeddings
+    """
+
+    def __init__(self,
+                 image_size=224,
+                 patch_size=16,
+                 in_channels=3,
+                 embed_dim=768,
+                 dropout=0.):
+        super().__init__()
+        patches_per_side = image_size // patch_size
+        n_patches = patches_per_side * patches_per_side
+
+        # kernel_size == stride, so the conv reads non-overlapping patches
+        self.patch_embedding = nn.Conv2D(in_channels=in_channels,
+                                         out_channels=embed_dim,
+                                         kernel_size=patch_size,
+                                         stride=patch_size)
+
+        trunc_normal = paddle.nn.initializer.TruncatedNormal(std=.02)
+        self.position_embeddings = paddle.create_parameter(
+            shape=[1, n_patches + 1, embed_dim], dtype='float32',
+            default_initializer=trunc_normal)
+
+        self.cls_token = paddle.create_parameter(
+            shape=[1, 1, embed_dim], dtype='float32',
+            default_initializer=paddle.nn.initializer.Constant(0))
+
+        self.dropout = nn.Dropout(dropout)
+
+    def forward(self, x):
+        """Return dropout(concat(cls_token, patch tokens) + position_embeddings)."""
+        batch = x.shape[0]
+        patches = self.patch_embedding(x)
+        patches = patches.flatten(2).transpose([0, 2, 1])
+        cls_tokens = self.cls_token.expand((batch, -1, -1))
+        tokens = paddle.concat((cls_tokens, patches), axis=1)
+
+        # position_embeddings has a leading dim of 1 and broadcasts over the batch
+        tokens = tokens + self.position_embeddings
+        return self.dropout(tokens)
+
+
+class Attention(nn.Layer):
+    """Multi-head self-attention for ViT.
+
+    q, k and v are all projected from the same input by one fused nn.Linear.
+
+    Attributes:
+        num_heads: number of heads
+        attn_head_size: feature dim of a single head (embed_dim // num_heads)
+        all_head_size: feature dim of all heads (attn_head_size * num_heads)
+        qkv: nn.Linear producing q, k and v in one matmul
+        scales: attn_head_size ** -0.5
+        out: nn.Linear projecting all_head_size back to embed_dim
+        attn_dropout: dropout applied to the attention weights
+        proj_dropout: dropout applied to the projected output
+        softmax: softmax over the last axis of the attention scores
+    """
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 qkv_bias=True,
+                 dropout=0.,
+                 attention_dropout=0.):
+        super().__init__()
+        self.num_heads = num_heads
+        self.attn_head_size = embed_dim // num_heads
+        self.all_head_size = self.attn_head_size * self.num_heads
+
+        w_attr_1, b_attr_1 = self._init_weights()
+        self.qkv = nn.Linear(embed_dim,
+                             self.all_head_size*3, #weights for q, k, and v
+                             weight_attr=w_attr_1,
+                             bias_attr=b_attr_1 if qkv_bias else False)
+        self.scales = self.attn_head_size ** -0.5
+
+        w_attr_2, b_attr_2 = self._init_weights()
+        # forward() feeds `out` the merged heads, i.e. all_head_size features
+        self.out = nn.Linear(self.all_head_size,
+                             embed_dim,
+                             weight_attr=w_attr_2,
+                             bias_attr=b_attr_2)
+
+        self.attn_dropout = nn.Dropout(attention_dropout)
+        self.proj_dropout = nn.Dropout(dropout)
+        self.softmax = nn.Softmax(axis=-1)
+
+    def _init_weights(self):
+        weight_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform())
+        bias_attr = paddle.ParamAttr(initializer=nn.initializer.KaimingUniform())
+        return weight_attr, bias_attr
+
+    def transpose_multihead(self, x):
+        new_shape = x.shape[:-1] + [self.num_heads, self.attn_head_size]
+        x = x.reshape(new_shape)  # split the last axis into (heads, head_size)
+        x = x.transpose([0, 2, 1, 3])  # move the head axis in front of axis 1
+        return x
+
+    def forward(self, x):
+        """Return (attention output, post-softmax attention weights)."""
+        qkv = self.qkv(x).chunk(3, axis=-1)
+        q, k, v = map(self.transpose_multihead, qkv)
+
+        attn = paddle.matmul(q, k, transpose_y=True)
+        attn = attn * self.scales
+        attn = self.softmax(attn)
+        attn_weights = attn  # captured before dropout is applied
+        attn = self.attn_dropout(attn)
+
+        z = paddle.matmul(attn, v)
+        z = z.transpose([0, 2, 1, 3])
+        # merge the head axis back into the feature axis
+        new_shape = z.shape[:-2] + [self.all_head_size]
+        z = z.reshape(new_shape)
+        z = self.out(z)
+        z = self.proj_dropout(z)
+        return z, attn_weights
+
+
+class Mlp(nn.Layer):
+    """Feed-forward block of a transformer layer.
+
+    Data flow: fc1 -> GELU -> dropout1 -> fc2 -> dropout2.
+
+    Attributes:
+        fc1: nn.Linear from embed_dim to int(embed_dim * mlp_ratio)
+        fc2: nn.Linear from int(embed_dim * mlp_ratio) back to embed_dim
+        act: nn.GELU
+        dropout1: nn.Dropout applied after the activation
+        dropout2: nn.Dropout applied after fc2
+    """
+    def __init__(self,
+                 embed_dim,
+                 mlp_ratio,
+                 dropout=0.):
+        super().__init__()
+        hidden_dim = int(embed_dim * mlp_ratio)
+
+        fc1_weight, fc1_bias = self._init_weights()
+        self.fc1 = nn.Linear(embed_dim,
+                             hidden_dim,
+                             weight_attr=fc1_weight,
+                             bias_attr=fc1_bias)
+
+        fc2_weight, fc2_bias = self._init_weights()
+        self.fc2 = nn.Linear(hidden_dim,
+                             embed_dim,
+                             weight_attr=fc2_weight,
+                             bias_attr=fc2_bias)
+        self.act = nn.GELU()
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+
+    def _init_weights(self):
+        """Return (weight_attr, bias_attr): XavierUniform weights, Normal(std=1e-6) biases."""
+        xavier = paddle.nn.initializer.XavierUniform()
+        tiny_normal = paddle.nn.initializer.Normal(std=1e-6)
+        return paddle.ParamAttr(initializer=xavier), paddle.ParamAttr(initializer=tiny_normal)
+
+    def forward(self, x):
+        """Apply fc1 -> act -> dropout1 -> fc2 -> dropout2."""
+        x = self.act(self.fc1(x))
+        x = self.dropout1(x)
+        x = self.fc2(x)
+        x = self.dropout2(x)
+        return x
+
+
+class EncoderLayer(nn.Layer):
+    """Pre-norm transformer block.
+
+    Computes x + drop_path(attn(attn_norm(x))), then x + drop_path(mlp(mlp_norm(x))).
+
+    Attributes:
+        attn_norm: nn.LayerNorm before attention
+        attn: attention module
+        drop_path: DropPath when droppath > 0, otherwise Identity
+        mlp_norm: nn.LayerNorm before mlp
+        mlp: mlp module
+    """
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 qkv_bias=True,
+                 mlp_ratio=4.,
+                 dropout=0.,
+                 attention_dropout=0.,
+                 droppath=0.):
+        super().__init__()
+        w_attr_1, b_attr_1 = self._init_weights()
+        self.attn_norm = nn.LayerNorm(embed_dim,
+                                      weight_attr=w_attr_1,
+                                      bias_attr=b_attr_1,
+                                      epsilon=1e-6)
+
+        self.attn = Attention(embed_dim,
+                              num_heads,
+                              qkv_bias,
+                              dropout,
+                              attention_dropout)
+        self.drop_path = DropPath(droppath) if droppath > 0. else Identity()
+
+        w_attr_2, b_attr_2 = self._init_weights()
+        self.mlp_norm = nn.LayerNorm(embed_dim,
+                                     weight_attr=w_attr_2,
+                                     bias_attr=b_attr_2,
+                                     epsilon=1e-6)
+
+        self.mlp = Mlp(embed_dim, mlp_ratio, dropout)
+
+    def _init_weights(self):
+        """Return LayerNorm attrs: weight starts at 1 and bias at 0 (identity affine)."""
+        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0))
+        bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
+        return weight_attr, bias_attr
+
+    def forward(self, x):
+        h = x
+        x = self.attn_norm(x)
+        x, attn = self.attn(x)
+        x = self.drop_path(x)
+        x = x + h
+
+        h = x
+        x = self.mlp_norm(x)
+        x = self.mlp(x)
+        x = self.drop_path(x)
+        x = x + h
+        return x, attn
+
+
+class Encoder(nn.Layer):
+    """Transformer encoder
+
+    A stack of `depth` EncoderLayers followed by a final LayerNorm.
+
+    Attributes:
+        layers: nn.LayerList contains multiple EncoderLayers
+        encoder_norm: nn.LayerNorm which is applied after last encoder layer
+    """
+    def __init__(self,
+                 embed_dim,
+                 num_heads,
+                 depth,
+                 qkv_bias=True,
+                 mlp_ratio=4.0,
+                 dropout=0.,
+                 attention_dropout=0.,
+                 droppath=0.):
+        super(Encoder, self).__init__()
+        # stochastic depth decay: per-layer rate spaced linearly from 0 to droppath
+        depth_decay = [x.item() for x in paddle.linspace(0, droppath, depth)]
+        layer_list = []
+        for i in range(depth):
+            encoder_layer = EncoderLayer(embed_dim,
+                                         num_heads,
+                                         qkv_bias=qkv_bias,
+                                         mlp_ratio=mlp_ratio,
+                                         dropout=dropout,
+                                         attention_dropout=attention_dropout,
+                                         droppath=depth_decay[i])
+            layer_list.append(copy.deepcopy(encoder_layer))
+        self.layers = nn.LayerList(layer_list)
+
+        w_attr_1, b_attr_1 = self._init_weights()
+        self.encoder_norm = nn.LayerNorm(embed_dim,
+                                         weight_attr=w_attr_1,
+                                         bias_attr=b_attr_1,
+                                         epsilon=1e-6)
+
+    def _init_weights(self):
+        """Return LayerNorm attrs: weight starts at 1 and bias at 0 (identity affine)."""
+        weight_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(1.0))
+        bias_attr = paddle.ParamAttr(initializer=nn.initializer.Constant(0.0))
+        return weight_attr, bias_attr
+
+    def forward(self, x):
+        self_attn = []  # one attention-weight tensor per layer, in layer order
+        for layer in self.layers:
+            x, attn = layer(x)
+            self_attn.append(attn)
+        return self.encoder_norm(x), self_attn
+
+
+class VisualTransformer(nn.Layer):
+    """ViT transformer
+
+    ViT Transformer, classifier is a single Linear layer for finetune,
+    For training from scratch, two layer mlp should be used.
+    Classification is done using cls_token.
+
+    Args:
+        image_size: int, input image size, default: 224
+        patch_size: int, patch size, default: 16
+        in_channels: int, input image channels, default: 3
+        num_classes: int, number of classes for classification, default: 1000
+        embed_dim: int, embedding dimension (patch embed out dim), default: 768
+        depth: int, number of transformer blocks, default: 12
+        num_heads: int, number of attention heads, default: 12
+        mlp_ratio: float, ratio of mlp hidden dim to embed dim(mlp in dim), default: 4
+        qkv_bias: bool, If True, enable qkv(nn.Linear) layer with bias, default: True
+        dropout: float, dropout rate for linear layers, default: 0.
+        attention_dropout: float, dropout rate for attention layers default: 0.
+        droppath: float, droppath rate for droppath layers, default: 0.
+        train_from_scratch: bool, build the two-layer mlp head, default: False
+        config: kept for backward compatibility, no longer read, default: None
+    """
+    def __init__(self,
+                 image_size=224,
+                 patch_size=16,
+                 in_channels=3,
+                 num_classes=1000,
+                 embed_dim=768,
+                 depth=12,
+                 num_heads=12,
+                 mlp_ratio=4,
+                 qkv_bias=True,
+                 dropout=0.,
+                 attention_dropout=0.,
+                 droppath=0.,
+                 train_from_scratch=False,
+                 config=None):
+        super(VisualTransformer, self).__init__()
+        # create patch embedding with positional embedding
+        self.patch_embedding = PatchEmbedding(image_size,
+                                              patch_size,
+                                              in_channels,
+                                              embed_dim,
+                                              dropout)
+        # create multi head self-attention layers
+        self.encoder = Encoder(embed_dim,
+                               num_heads,
+                               depth,
+                               qkv_bias,
+                               mlp_ratio,
+                               dropout,
+                               attention_dropout,
+                               droppath)
+
+        # classifier head (for training from scratch); sized from the ctor arguments
+        if train_from_scratch:
+            w_attr_1, b_attr_1 = self._init_weights()
+            w_attr_2, b_attr_2 = self._init_weights()
+            self.classifier = nn.Sequential(
+                nn.Linear(embed_dim,
+                          embed_dim,
+                          weight_attr=w_attr_1,
+                          bias_attr=b_attr_1),
+                nn.ReLU(),
+                nn.Dropout(dropout),
+                nn.Linear(embed_dim,
+                          num_classes,
+                          weight_attr=w_attr_2,
+                          bias_attr=b_attr_2),
+                nn.Dropout(dropout))
+        else:  # classifier head (for finetuning)
+            w_attr_1, b_attr_1 = self._init_weights()
+            self.classifier = nn.Linear(embed_dim,
+                                        num_classes,
+                                        weight_attr=w_attr_1,
+                                        bias_attr=b_attr_1)
+
+    def _init_weights(self):
+        weight_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform())
+        bias_attr = paddle.ParamAttr(
+            initializer=paddle.nn.initializer.KaimingUniform())
+        return weight_attr, bias_attr
+
+    def forward(self, x):
+        x = self.patch_embedding(x)
+        x, attn = self.encoder(x)
+        logits = self.classifier(x[:, 0]) # take only cls_token as classifier
+        return logits
+
+
+def build_vit(config):
+    """Create a VisualTransformer (train_from_scratch=False) from the values in config."""
+    trans = config.MODEL.TRANS
+    return VisualTransformer(image_size=config.DATA.IMAGE_SIZE,
+                             patch_size=trans.PATCH_SIZE,
+                             in_channels=3,
+                             num_classes=config.MODEL.NUM_CLASSES,
+                             embed_dim=trans.EMBED_DIM,
+                             depth=trans.DEPTH,
+                             num_heads=trans.NUM_HEADS,
+                             mlp_ratio=trans.MLP_RATIO,
+                             qkv_bias=trans.QKV_BIAS,
+                             dropout=config.MODEL.DROPOUT,
+                             attention_dropout=config.MODEL.ATTENTION_DROPOUT,
+                             droppath=config.MODEL.DROPPATH,
+                             train_from_scratch=False, config=config)
diff --git a/image_classification/Multi_Node_Training/utils.py b/image_classification/Multi_Node_Training/utils.py
new file mode 100644
index 00000000..44800527
--- /dev/null
+++ b/image_classification/Multi_Node_Training/utils.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2021 PPViT Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""utils for ViT
+
+Contains AverageMeter for monitoring, get_exclude_from_decay_fn for training
+and WarmupCosineScheduler for training
+
+"""
+
+import math
+from paddle.optimizer.lr import LRScheduler
+
+
+class AverageMeter():
+    """Running weighted average, e.g. of per-batch losses.
+
+    Attributes:
+        avg: sum / cnt after the latest update (0 before any update)
+        sum, cnt: running totals of val * n and of n
+    """
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        """Zero avg, sum and cnt."""
+        self.avg = self.sum = self.cnt = 0
+
+    def update(self, val, n=1):
+        """Fold in val, the mean of n samples, and refresh avg."""
+        self.cnt += n
+        self.sum += val * n
+        self.avg = self.sum / self.cnt
+
+
+
+def get_exclude_from_weight_decay_fn(exclude_list=None):
+    """Build a filter that marks which params are excluded from weight decay.
+
+    For certain params, e.g., positional encoding in ViT, weight decay
+    may not be needed during training.
+
+    Args:
+        exclude_list: iterable of param-name suffixes to exclude from
+                      weight decay; None or empty means exclude nothing.
+    Returns:
+        None if exclude_list is empty, otherwise a function that takes a
+        param name and returns False when the name ends with an excluded
+        suffix and True otherwise.
+    """
+    if not exclude_list:
+        return None
+    # snapshot as a tuple: str.endswith accepts it directly, and later
+    # mutation of the caller's list cannot change the filter
+    suffixes = tuple(exclude_list)
+
+    def exclude_fn(param):
+        return not param.endswith(suffixes)
+
+    return exclude_fn
+
+
+class WarmupCosineScheduler(LRScheduler):
+    """Linear warmup followed by cosine decay.
+
+    The rate rises linearly from "warmup_start_lr" to "start_lr" during the
+    first "warmup_epochs" epochs, then follows a cosine curve from "start_lr"
+    towards "end_lr" over the remaining "total_epochs - warmup_epochs" epochs.
+
+    Attributes:
+        learning_rate: only forwarded to the base class, not used by get_lr
+        warmup_start_lr: learning rate returned when last_epoch is 0
+        start_lr: learning rate reached when warmup ends
+        end_lr: lower end of the cosine phase
+        warmup_epochs: # of epochs for warmup
+        total_epochs: # of total epochs (include warmup)
+        cycles: multiplier inside the cosine term, default 0.5
+    """
+    def __init__(self,
+                 learning_rate,
+                 warmup_start_lr,
+                 start_lr,
+                 end_lr,
+                 warmup_epochs,
+                 total_epochs,
+                 cycles=0.5,
+                 last_epoch=-1,
+                 verbose=False):
+        """init WarmupCosineScheduler """
+        # fields go first; presumably the base __init__ already calls get_lr() - TODO confirm
+        self.warmup_start_lr = warmup_start_lr
+        self.start_lr = start_lr
+        self.end_lr = end_lr
+        self.warmup_epochs = warmup_epochs
+        self.total_epochs = total_epochs
+        self.cycles = cycles
+        super(WarmupCosineScheduler, self).__init__(learning_rate, last_epoch, verbose)
+
+    def get_lr(self):
+        """ return lr value """
+        if self.last_epoch < self.warmup_epochs:
+            span = self.start_lr - self.warmup_start_lr
+            return span * float(self.last_epoch) / float(self.warmup_epochs) + self.warmup_start_lr
+
+        decay_epochs = max(1, self.total_epochs - self.warmup_epochs)
+        progress = float(self.last_epoch - self.warmup_epochs) / float(decay_epochs)
+        cosine = 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))
+        factor = max(0.0, cosine)
+        return max(0.0, factor * (self.start_lr - self.end_lr) + self.end_lr)