Merge pull request #14 from modelai/ymir-dev

yzbx · web-flow · commit 76d607697db0 · 2022-11-04T17:17:07.000+08:00
update for ymir2.0.0
diff --git a/README.MD b/README.MD
@@ -93,6 +93,8 @@ docker build -t ymir-executor/mmdet:cu111-tmi -f docker/Dockerfile.cuda111 .
 
 - [ymir-executor-sdk](https://github.com/modelai/ymir-executor-sdk) ymir-executor development SDK.
 
+  - [dataset convert](https://github.com/modelai/ymir-executor-sdk/blob/master/docs/dataset_convert.md)
+
 - [ymir-executor-verifer](https://github.com/modelai/ymir-executor-verifier) debug and check your ymir-executor
 
 ## how to import pretrained model weights
diff --git a/README_zh-CN.MD b/README_zh-CN.MD
@@ -109,6 +109,8 @@ docker build -t ymir-executor/live-code:mxnet-tmi -f mxnet.dockerfile
 
 - [ymir-executor-sdk](https://github.com/modelai/ymir-executor-sdk) ymir镜像开发辅助库
 
+  - [数据集转换](https://github.com/modelai/ymir-executor-sdk/blob/master/docs/dataset_convert.md)
+
 - [ymir-executor-verifer](https://github.com/modelai/ymir-executor-verifier) 调试与检测 ymir-executor
 
 ## 如何导入预训练模型
diff --git a/det-mmdetection-tmi/mining_base.py b/det-mmdetection-tmi/mining_base.py
@@ -2,7 +2,7 @@
 from typing import List
 
 import torch
-import torch.nn.functional as F
+import torch.nn.functional as F  # noqa
 from easydict import EasyDict as edict
 
 
diff --git a/det-mmdetection-tmi/start.py b/det-mmdetection-tmi/start.py
@@ -39,8 +39,8 @@ def _run_mining(cfg: edict) -> None:
     gpu_count = len(gpu_id.split(','))
     mining_algorithm: str = cfg.param.get('mining_algorithm', 'aldd')
 
-    supported_mining_algorithm = ['cald', 'aldd', 'random']
-    assert mining_algorithm in supported_mining_algorithm, f'unknown mining_algorithm {mining_algorithm}, not in {supported_mining_algorithm}'
+    supported_miner = ['cald', 'aldd', 'random', 'entropy']
+    assert mining_algorithm in supported_miner, f'unknown mining_algorithm {mining_algorithm}, not in {supported_miner}'
     if gpu_count <= 1:
         command = f'python3 ymir_mining_{mining_algorithm}.py'
     else:
@@ -67,7 +67,6 @@ def _run_infer() -> None:
 
     cfg = get_merged_config()
     os.environ.setdefault('YMIR_MODELS_DIR', cfg.ymir.output.models_dir)
-    os.environ.setdefault('COCO_EVAL_TMP_FILE', os.path.join(
-        cfg.ymir.output.root_dir, 'eval_tmp.json'))
+    os.environ.setdefault('COCO_EVAL_TMP_FILE', os.path.join(cfg.ymir.output.root_dir, 'eval_tmp.json'))
     os.environ.setdefault('PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION', 'python')
     sys.exit(start(cfg))
diff --git a/det-mmdetection-tmi/ymir_infer.py b/det-mmdetection-tmi/ymir_infer.py
@@ -12,7 +12,7 @@
 from mmdet.utils.util_ymir import get_best_weight_file
 from tqdm import tqdm
 from ymir_exc import dataset_reader as dr
-from ymir_exc import env, monitor
+from ymir_exc import env
 from ymir_exc import result_writer as rw
 from ymir_exc.util import YmirStage, get_merged_config, write_ymir_monitor_process
 
diff --git a/det-mmdetection-tmi/ymir_mining_cald.py b/det-mmdetection-tmi/ymir_mining_cald.py
@@ -11,14 +11,12 @@
 import numpy as np
 import torch
 import torch.distributed as dist
-from easydict import EasyDict as edict
 from mmcv.runner import init_dist
 from mmdet.apis.test import collect_results_gpu
 from mmdet.utils.util_ymir import BBOX, CV_IMAGE
 from nptyping import NDArray
 from scipy.stats import entropy
 from tqdm import tqdm
-from ymir_exc import monitor
 from ymir_exc import result_writer as rw
 from ymir_exc.util import YmirStage, get_merged_config, write_ymir_monitor_process
 from ymir_infer import YmirModel
@@ -250,11 +248,7 @@ def split_result(result: NDArray) -> Tuple[BBOX, NDArray, NDArray]:
     return bboxes, conf, class_id
 
 
-class YmirMining(YmirModel):
-
-    def __init__(self, cfg: edict):
-        super().__init__(cfg)
-
+class CALDMiner(YmirModel):
     def mining(self):
         with open(self.cfg.ymir.input.candidate_index_file, 'r') as f:
             images = [line.strip() for line in f.readlines()]
@@ -276,7 +270,7 @@ def mining(self):
         beta = 1.3
         mining_result = []
         for idx, asset_path in enumerate(tbar):
-            if idx % monitor_gap == 0:
+            if idx % monitor_gap == 0 and RANK in [0, -1]:
                 write_ymir_monitor_process(self.cfg, task='mining', naive_stage_percent=idx / N, stage=YmirStage.TASK)
 
             # batch-level sync, avoid 30min time-out error
@@ -380,7 +374,7 @@ def main():
         init_dist(launcher='pytorch', backend="nccl" if dist.is_nccl_available() else "gloo")
 
     cfg = get_merged_config()
-    miner = YmirMining(cfg)
+    miner = CALDMiner(cfg)
     gpu = max(0, LOCAL_RANK)
     device = torch.device('cuda', gpu)
     miner.model.to(device)
diff --git a/det-mmdetection-tmi/ymir_mining_entropy.py b/det-mmdetection-tmi/ymir_mining_entropy.py
@@ -0,0 +1,87 @@
+"""
+entropy mining
+"""
+import os
+import sys
+
+import cv2
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import init_dist
+from mmdet.apis.test import collect_results_gpu
+from tqdm import tqdm
+from ymir_exc import result_writer as rw
+from ymir_exc.util import YmirStage, get_merged_config, write_ymir_monitor_process
+from ymir_mining_cald import split_result, CALDMiner
+
+LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1))  # https://pytorch.org/docs/stable/elastic/run.html
+RANK = int(os.getenv('RANK', -1))  # falls back to -1 when the RANK env var is unset
+WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))  # falls back to 1 when the WORLD_SIZE env var is unset
+
+
+class EntropyMiner(CALDMiner):
+    """Miner that scores each candidate image by the entropy of its detection confidences."""
+
+    def mining(self):
+        """Return ``(asset_path, score)`` pairs; images without detections get the score -10.
+
+        Each rank handles ``images[RANK::WORLD_SIZE]``; with WORLD_SIZE > 1 results go through ``collect_results_gpu``.
+        """
+        with open(self.cfg.ymir.input.candidate_index_file, 'r') as f:
+            images = [line.strip() for line in f]
+
+        max_barrier_times = len(images) // WORLD_SIZE
+        images_rank = images if RANK == -1 else images[RANK::WORLD_SIZE]
+        N = len(images_rank)
+        # only the main process (RANK 0, or -1 when RANK is unset) shows a progress bar
+        tbar = tqdm(images_rank) if RANK in [0, -1] else images_rank
+
+        monitor_gap = max(1, N // 100)
+        mining_result = []
+        for idx, asset_path in enumerate(tbar):
+            if idx % monitor_gap == 0 and RANK in [0, -1]:
+                write_ymir_monitor_process(self.cfg, task='mining', naive_stage_percent=idx / N, stage=YmirStage.TASK)
+            # batch-level sync, avoid 30min time-out error
+            if WORLD_SIZE > 1 and idx < max_barrier_times:
+                dist.barrier()
+
+            # rows of xyxy,conf,cls
+            result = self.predict(cv2.imread(asset_path))
+            if len(result) == 0:
+                # nothing detected for this image: record the fixed sentinel score
+                mining_result.append((asset_path, -10))
+                continue
+            _, conf, _ = split_result(result)
+            # split_result is annotated to return NDArray, which has no .cpu(); accept tensors as well
+            conf = conf.detach().cpu().numpy() if torch.is_tensor(conf) else np.asarray(conf)
+            conf = conf[conf > 0]  # p * log2(p) -> 0 as p -> 0; avoids nan from log2(0)
+            mining_result.append((asset_path, -np.sum(conf * np.log2(conf))))
+
+        if WORLD_SIZE > 1:
+            mining_result = collect_results_gpu(mining_result, len(images))
+
+        return mining_result
+
+
+def main():
+    """Run entropy mining; the main process writes the result file. Always returns 0."""
+    if LOCAL_RANK != -1:
+        init_dist(launcher='pytorch', backend="nccl" if dist.is_nccl_available() else "gloo")
+
+    cfg = get_merged_config()
+    miner = EntropyMiner(cfg)
+    # LOCAL_RANK falls back to -1 when its env var is unset; use GPU 0 in that case
+    miner.model.to(torch.device('cuda', max(0, LOCAL_RANK)))
+    scores = miner.mining()
+
+    is_main_process = RANK in [0, -1]
+    if is_main_process:
+        rw.write_mining_result(mining_result=scores)
+        write_ymir_monitor_process(cfg, task='mining', naive_stage_percent=1, stage=YmirStage.POSTPROCESS)
+
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/det-yolov5-tmi/ymir/docker/cuda102.dockerfile b/det-yolov5-tmi/ymir/docker/cuda102.dockerfile
@@ -21,7 +21,7 @@ RUN	apt-get update && apt-get install -y gnupg2 git libglib2.0-0 \
     && rm -rf /var/lib/apt/lists/*
 
 # install ymir-exc sdk
-RUN pip install "git+https://github.com/modelai/ymir-executor-sdk.git@ymir1.0.0"
+RUN pip install "git+https://github.com/modelai/ymir-executor-sdk.git@ymir1.3.0"
 
 # Copy file from host to docker and install requirements
 COPY . /app
diff --git a/det-yolov5-tmi/ymir/docker/cuda111.dockerfile b/det-yolov5-tmi/ymir/docker/cuda111.dockerfile
@@ -24,7 +24,7 @@ RUN	apt-get update && apt-get install -y gnupg2 git libglib2.0-0 \
 
 COPY ./requirements.txt /workspace/
 # install ymir-exc sdk and requirements
-RUN pip install "git+https://github.com/modelai/ymir-executor-sdk.git@ymir1.0.0" \
+RUN pip install "git+https://github.com/modelai/ymir-executor-sdk.git@ymir1.3.0" \
     && pip install -r /workspace/requirements.txt
 
 # Copy file from host to docker and install requirements
diff --git a/det-yolov5-tmi/ymir/mining/data_augment.py b/det-yolov5-tmi/ymir/mining/data_augment.py
@@ -8,6 +8,7 @@
 import cv2
 import numpy as np
 from nptyping import NDArray
+
 from ymir.ymir_yolov5 import BBOX, CV_IMAGE
 
 
diff --git a/det-yolov5-tmi/ymir/mining/ymir_mining_cald.py b/det-yolov5-tmi/ymir/mining/ymir_mining_cald.py
@@ -74,7 +74,10 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
             pred = ymir_yolov5.forward(batch['image'].float().to(device), nms=True)
 
         if RANK in [-1, 0]:
-            write_ymir_monitor_process(ymir_cfg, task='mining', naive_stage_percent=0.3 * idx * batch_size_per_gpu / dataset_size, stage=YmirStage.TASK)
+            write_ymir_monitor_process(ymir_cfg,
+                                       task='mining',
+                                       naive_stage_percent=0.3 * idx * batch_size_per_gpu / dataset_size,
+                                       stage=YmirStage.TASK)
         preprocess_image_shape = batch['image'].shape[2:]
         for inner_idx, det in enumerate(pred):  # per image
             result_per_image = []
@@ -102,13 +105,15 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
                                        pin_memory=ymir_yolov5.pin_memory,
                                        drop_last=False)
 
-    # cannot sync here!!!
     dataset_size = len(results)
     monitor_gap = max(1, dataset_size // 1000 // batch_size_per_gpu)
     pbar = tqdm(aug_dataset_loader) if RANK == 0 else aug_dataset_loader
     for idx, batch in enumerate(pbar):
         if idx % monitor_gap == 0 and RANK in [-1, 0]:
-            write_ymir_monitor_process(ymir_cfg, task='mining', naive_stage_percent=0.3 + 0.7 * idx * batch_size_per_gpu / dataset_size, stage=YmirStage.TASK)
+            write_ymir_monitor_process(ymir_cfg,
+                                       task='mining',
+                                       naive_stage_percent=0.3 + 0.7 * idx * batch_size_per_gpu / dataset_size,
+                                       stage=YmirStage.TASK)
 
         batch_consistency = [0.0 for _ in range(len(batch['image_file']))]
         aug_keys = ['flip', 'cutout', 'rotate', 'resize']
diff --git a/det-yolov5-tmi/ymir/start.py b/det-yolov5-tmi/ymir/start.py
@@ -73,8 +73,8 @@ def _run_training(cfg: edict) -> None:
         str(batch_size), '--data', f'{out_dir}/data.yaml', '--project', project, '--cfg', f'models/{model}.yaml',
         '--name', name, '--weights', weights, '--img-size',
         str(img_size), '--save-period',
-        str(save_period), '--device', device,
-        '--workers', str(num_workers_per_gpu)
+        str(save_period), '--device', device, '--workers',
+        str(num_workers_per_gpu)
     ])
 
     if save_best_only:
diff --git a/docs/official-docker-image.md b/docs/official-docker-image.md
@@ -1,11 +1,11 @@
 # official docker image
 
-update: 2022/10/27
+update: 2022/11/01
 
 ## the hyper-parameters for ymir-executor
 
 | docker images | epochs/iters | model structure | image size | batch_size |
-| - | - | - |
+| - | - | - | - | - |
 | yolov5 | epochs | model | img_size | batch_size_per_gpu |
 | mmdetection | max_epochs | config_file | - | samples_per_gpu |
 | yolov4 | max_batches | - | image_height, image_width | batch |
@@ -16,10 +16,43 @@ update: 2022/10/27
 
 - epochs: such as `epochs` or `max_epochs`, control the time for training.
 - iters: such as `max_batches` or `max_iter`, control the time for training.
-- ymir_saved_file_patterns: save the file match one of the pattern. for example `best.pt, *.yaml` will save `best.pt` and all the `*.yaml` file in `/out/model` directory.
+- ymir_saved_file_patterns: save the files that match one of the patterns. For example, `best.pt, *.yaml` will save `best.pt` and all the `*.yaml` files in the `/out/models` directory.
 - export_format: the dataset format for ymir-executor in `/in`, support `ark:raw` and `voc:raw`
 - args_options/cfg_options: for yolov5, use it for other options, such as `--multi-scale --single-cls --optimizer SGD` and so on, view `train.py, parse_opt()` for detail. for mmdetection and detectron2, it provides methods to change other hyper-pameters not defined in `/img-man/training-template.yaml`
 
+## docker image format
+
+youdaoyzbx/ymir-executor:[ymir-version]-[repository]-[cuda version]-[ymir-executor function]
+
+- ymir-version
+    - ymir1.1.0
+    - ymir1.2.0
+    - ymir1.3.0
+    - ymir2.0.0
+
+- repository
+    - yolov4
+    - yolov5
+    - yolov7
+    - mmdet
+    - detectron2
+    - vidt
+    - nanodet
+
+- cuda version
+    - cu101: cuda 10.1
+    - cu102: cuda 10.2
+    - cu111: cuda 11.1
+    - cu112: cuda 11.2
+
+- ymir-executor function
+    - t: training
+    - m: mining
+    - i: infer
+    - d: deploy
+
+
+
 ## ymir2.0.0
 
 2022/10/26: support ymir1.1.0/1.2.0/1.3.0/2.0.0
@@ -30,6 +63,8 @@ youdaoyzbx/ymir-executor:ymir2.0.0-yolov7-cu111-tmi
 youdaoyzbx/ymir-executor:ymir2.0.0-mmdet-cu111-tmi
 youdaoyzbx/ymir-executor:ymir2.0.0-detectron2-cu111-tmi
 youdaoyzbx/ymir-executor:ymir2.0.0-vidt-cu111-tmi
+youdaoyzbx/ymir-executor:ymir2.0.0-nanodet-cu111-tmi
+youdaoyzbx/ymir-executor:ymir2.0.0-yolov5-cu111-tmid # support deploy
 youdaoyzbx/ymir-executor:ymir2.0.0-yolov4-cu111-tmi  # deprecated
 ```
 
diff --git a/docs/ymir-dataset-zh-CN.md b/docs/ymir-dataset-zh-CN.md