Skip to content

Commit ef09dcf

Browse files
committed
Fix 30-minute dist.barrier() time-out errors by adding a per-batch barrier to the mining/inference loops
1 parent 512194c commit ef09dcf

4 files changed

Lines changed: 21 additions & 1 deletion

File tree

det-mmdetection-tmi/ymir_mining.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -283,6 +283,10 @@ def mining(self):
283283
beta = 1.3
284284
mining_result = []
285285
for asset_path in tbar:
286+
# batch-level sync, avoid 30min time-out error
287+
if LOCAL_RANK != -1:
288+
dist.barrier()
289+
286290
img = cv2.imread(asset_path)
287291
# xyxy,conf,cls
288292
result = self.predict(img)

det-yolov5-tmi/mining/ymir_infer.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,10 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
5858
monitor_gap = max(1, dataset_size // 1000 // batch_size_per_gpu)
5959
pbar = tqdm(origin_dataset_loader) if RANK == 0 else origin_dataset_loader
6060
for idx, batch in enumerate(pbar):
61+
# batch-level sync, avoid 30min time-out error
62+
if LOCAL_RANK != -1:
63+
dist.barrier()
64+
6165
with torch.no_grad():
6266
pred = ymir_yolov5.forward(batch['image'].float().to(device), nms=True)
6367

det-yolov5-tmi/mining/ymir_mining_aldd.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,10 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
157157
pbar = tqdm(origin_dataset_loader) if RANK == 0 else origin_dataset_loader
158158
miner = ALDD(ymir_cfg)
159159
for idx, batch in enumerate(pbar):
160+
# batch-level sync, avoid 30min time-out error
161+
if LOCAL_RANK != -1:
162+
dist.barrier()
163+
160164
with torch.no_grad():
161165
featuremap_output = ymir_yolov5.model.model(batch['image'].float().to(device))[1]
162166
unc_scores = miner.compute_aldd_score(featuremap_output, ymir_yolov5.img_size)
@@ -200,7 +204,7 @@ def main() -> int:
200204

201205
if LOCAL_RANK != -1:
202206
print(f'rank: {RANK}, start destroy process group')
203-
dist.destroy_process_group()
207+
dist.destroy_process_group()
204208
return 0
205209

206210

det-yolov5-tmi/mining/ymir_mining_cald.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,10 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
6262
dataset_size = len(images_rank)
6363
pbar = tqdm(origin_dataset_loader) if RANK == 0 else origin_dataset_loader
6464
for idx, batch in enumerate(pbar):
65+
# batch-level sync, avoid 30min time-out error
66+
if LOCAL_RANK != -1:
67+
dist.barrier()
68+
6569
with torch.no_grad():
6670
pred = ymir_yolov5.forward(batch['image'].float().to(device), nms=True)
6771

@@ -98,6 +102,10 @@ def run(ymir_cfg: edict, ymir_yolov5: YmirYolov5):
98102
monitor_gap = max(1, dataset_size // 1000 // batch_size_per_gpu)
99103
pbar = tqdm(aug_dataset_loader) if RANK == 0 else aug_dataset_loader
100104
for idx, batch in enumerate(pbar):
105+
# batch-level sync, avoid 30min time-out error
106+
if LOCAL_RANK != -1:
107+
dist.barrier()
108+
101109
if idx % monitor_gap == 0 and RANK in [-1, 0]:
102110
ymir_yolov5.write_monitor_logger(stage=YmirStage.TASK, p=idx * batch_size_per_gpu / dataset_size)
103111

0 commit comments

Comments
 (0)