Skip to content

Commit 3464c51

Browse files
committed
update mmdet
1 parent 8de25b9 commit 3464c51

7 files changed

Lines changed: 66 additions & 19 deletions

File tree

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
shm_size: '128G'
2-
mining_algorithm: cald
2+
mining_algorithm: aldd
33
class_distribution_scores: '' # 1.0,1.0,0.1,0.2

det-mmdetection-tmi/start.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,15 @@ def _run_mining(cfg: edict) -> None:
5353

5454

5555
def _run_infer() -> None:
56-
command = 'python3 ymir_infer.py'
56+
gpu_id: str = str(cfg.param.get('gpu_id', '0'))
57+
gpu_count = len(gpu_id.split(','))
58+
59+
if gpu_count <= 1:
60+
command = 'python3 ymir_infer.py'
61+
else:
62+
port = find_free_port()
63+
command = f'python3 -m torch.distributed.launch --nproc_per_node {gpu_count} --master_port {port} ymir_infer.py' # noqa
64+
5765
logging.info(f'start infer: {command}')
5866
subprocess.run(command.split(), check=True)
5967
logging.info("infer finished")

det-mmdetection-tmi/ymir_infer.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,27 @@
11
import argparse
2+
import os
23
import os.path as osp
34
import sys
45
import warnings
56
from typing import Any, List
67

78
import cv2
89
import numpy as np
10+
import torch.distributed as dist
911
from easydict import EasyDict as edict
1012
from mmcv import DictAction
11-
from mmdet.apis import inference_detector, init_detector
12-
from mmdet.utils.util_ymir import get_best_weight_file
13+
from mmcv.runner import init_dist
1314
from tqdm import tqdm
14-
from ymir_exc import dataset_reader as dr
15-
from ymir_exc import env
1615
from ymir_exc import result_writer as rw
17-
from ymir_exc.util import YmirStage, get_merged_config, write_ymir_monitor_process
16+
from ymir_exc.util import (YmirStage, get_merged_config, write_ymir_monitor_process)
17+
18+
from mmdet.apis import inference_detector, init_detector
19+
from mmdet.apis.test import collect_results_gpu
20+
from mmdet.utils.util_ymir import get_best_weight_file
21+
22+
LOCAL_RANK = int(os.getenv('LOCAL_RANK', -1)) # https://pytorch.org/docs/stable/elastic/run.html
23+
RANK = int(os.getenv('RANK', -1))
24+
WORLD_SIZE = int(os.getenv('WORLD_SIZE', 1))
1825

1926

2027
def parse_option(cfg_options: str) -> dict:
@@ -80,8 +87,9 @@ def __init__(self, cfg: edict):
8087
cfg_options = parse_option(options) if options else None
8188

8289
# each process runs inference on a single gpu, selected by its rank
83-
gpu_ids = cfg.param.get('gpu_id', '0')
84-
gpu_id = gpu_ids.split(',')[0]
90+
# gpu_ids = cfg.param.get('gpu_id', '0')
91+
# gpu_id = gpu_ids.split(',')[0]
92+
gpu_id = max(0, RANK)
8593
# build the model from a config file and a checkpoint file
8694
self.model = init_detector(config_file, checkpoint_file, device=f'cuda:{gpu_id}', cfg_options=cfg_options)
8795

@@ -90,26 +98,47 @@ def infer(self, img):
9098

9199

92100
def main():
101+
if LOCAL_RANK != -1:
102+
init_dist(launcher='pytorch', backend="nccl" if dist.is_nccl_available() else "gloo")
103+
93104
cfg = get_merged_config()
94105

95-
N = dr.items_count(env.DatasetType.CANDIDATE)
106+
with open(cfg.ymir.input.candidate_index_file, 'r') as f:
107+
images = [line.strip() for line in f.readlines()]
108+
109+
max_barrier_times = len(images) // WORLD_SIZE
110+
if RANK == -1:
111+
N = len(images)
112+
tbar = tqdm(images)
113+
else:
114+
images_rank = images[RANK::WORLD_SIZE]
115+
N = len(images_rank)
116+
if RANK == 0:
117+
tbar = tqdm(images_rank)
118+
else:
119+
tbar = images_rank
96120
infer_result = dict()
97121
model = YmirModel(cfg)
98-
idx = -1
99122

100123
# write infer result
101124
monitor_gap = max(1, N // 100)
102125
conf_threshold = float(cfg.param.conf_threshold)
103-
for asset_path, _ in tqdm(dr.item_paths(dataset_type=env.DatasetType.CANDIDATE)):
126+
for idx, asset_path in enumerate(tbar):
104127
img = cv2.imread(asset_path)
105128
result = model.infer(img)
106129
raw_anns = mmdet_result_to_ymir(result, cfg.param.class_names)
107130

131+
# batch-level sync, avoid 30min time-out error
132+
if WORLD_SIZE > 1 and idx < max_barrier_times:
133+
dist.barrier()
134+
108135
infer_result[asset_path] = [ann for ann in raw_anns if ann.score >= conf_threshold]
109-
idx += 1
110136

111137
if idx % monitor_gap == 0:
112-
write_ymir_monitor_process(cfg, task='infer', naive_stage_percent=idx / N, stage = YmirStage.TASK)
138+
write_ymir_monitor_process(cfg, task='infer', naive_stage_percent=idx / N, stage=YmirStage.TASK)
139+
140+
if WORLD_SIZE > 1:
141+
infer_result = collect_results_gpu(infer_result, len(images))
113142

114143
rw.write_infer_result(infer_result=infer_result)
115144
write_ymir_monitor_process(cfg, task='infer', naive_stage_percent=1.0, stage=YmirStage.POSTPROCESS)

det-mmdetection-tmi/ymir_mining_aldd.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,13 @@
22

33
import torch
44
from easydict import EasyDict as edict
5-
from mining_base import ALDDMining
65
from mmcv.parallel import collate, scatter
6+
from ymir_exc.util import get_merged_config
7+
8+
from mining_base import ALDDMining
79
from mmdet.datasets import replace_ImageToTensor
810
from mmdet.datasets.pipelines import Compose
911
from mmdet.models.detectors import YOLOX
10-
from ymir_exc.util import get_merged_config
1112
from ymir_infer import YmirModel
1213
from ymir_mining_random import RandomMiner
1314

det-mmdetection-tmi/ymir_train.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@
55
import sys
66

77
from easydict import EasyDict as edict
8-
from mmdet.utils.util_ymir import get_best_weight_file, write_ymir_training_result
9-
from ymir_exc import monitor
10-
from ymir_exc.util import YmirStage, find_free_port, get_merged_config, write_ymir_monitor_process
8+
from ymir_exc.util import (YmirStage, find_free_port, get_merged_config,
9+
write_ymir_monitor_process)
10+
11+
from mmdet.utils.util_ymir import (get_best_weight_file,
12+
write_ymir_training_result)
1113

1214

1315
def main(cfg: edict) -> int:

docs/cn/docker_images/det-nanodet-tmi.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
## 镜像地址
1111
```
1212
youdaoyzbx/ymir-executor:ymir2.0.0-nanodet-cu111-tmi
13+
youdaoyzbx/ymir-executor:ymir2.0.2-nanodet-cu111-tmi
1314
```
1415

1516
## 性能说明

docs/cn/docker_images/det-yolov5-tmi.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# yolov5 镜像说明文档
22

3+
4+
## 仓库地址
5+
6+
> 参考[ultralytics/yolov5](https://github.com/ultralytics/yolov5)
7+
- [modelai/ymir-executor-fork](https://github.com/modelai/ymir-executor-fork/tree/master/det-yolov5-tmi)
8+
39
## 镜像地址
410

511
```

0 commit comments

Comments
 (0)