diff --git a/yolox/data/data_augment.py b/yolox/data/data_augment.py index 21cd7b56d8..99c1186bc1 100644 --- a/yolox/data/data_augment.py +++ b/yolox/data/data_augment.py @@ -19,6 +19,12 @@ def augment_hsv(img, hgain=5, sgain=30, vgain=30): + if img.ndim == 2 or img.shape[2] == 1: + raise ValueError( + "HSV augmentation requires 3-channel images. Set hsv_prob to 0.0 for grayscale. " \ + "Image shape: {}".format(img.shape) + ) + hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v hsv_augs = hsv_augs.astype(np.int16) @@ -132,7 +138,7 @@ def random_affine( def _mirror(image, boxes, prob=0.5): - _, width, _ = image.shape + _, width = image.shape[:2] if random.random() < prob: image = image[:, ::-1] boxes[:, 0::2] = width - boxes[:, 2::-2] @@ -153,6 +159,8 @@ def preproc(img, input_size, swap=(2, 0, 1)): ).astype(np.uint8) padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + if img.ndim == 2: + padded_img = padded_img[:, :, np.newaxis] padded_img = padded_img.transpose(swap) padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) return padded_img, r @@ -174,7 +182,7 @@ def __call__(self, image, targets, input_dim): image_o = image.copy() targets_o = targets.copy() - height_o, width_o, _ = image_o.shape + height_o, width_o = image_o.shape[:2] boxes_o = targets_o[:, :4] labels_o = targets_o[:, 4] # bbox_o: [xyxy] to [c_x,c_y,w,h] @@ -183,7 +191,7 @@ def __call__(self, image, targets, input_dim): if random.random() < self.hsv_prob: augment_hsv(image) image_t, boxes = _mirror(image, boxes, self.flip_prob) - height, width, _ = image_t.shape + height, width = image_t.shape[:2] image_t, r_ = preproc(image_t, input_dim) # boxes [xyxy] 2 [cx,cy,w,h] boxes = xyxy2cxcywh(boxes) diff --git a/yolox/data/datasets/coco.py b/yolox/data/datasets/coco.py index 8d19047a2b..6418c02044 100644 --- a/yolox/data/datasets/coco.py +++ b/yolox/data/datasets/coco.py @@ 
-19,7 +19,6 @@ def remove_useless_info(coco): """ if isinstance(coco, COCO): dataset = coco.dataset - dataset.pop("info", None) dataset.pop("licenses", None) for img in dataset["images"]: img.pop("license", None) @@ -45,6 +44,7 @@ def __init__( preproc=None, cache=False, cache_type="ram", + num_channels=3, ): """ COCO dataset initialization. Annotation data are read into memory by COCO API. @@ -71,6 +71,7 @@ def __init__( self.img_size = img_size self.preproc = preproc self.annotations = self._load_coco_annotations() + self.num_channels = num_channels path_filename = [os.path.join(name, anno[3]) for anno in self.annotations] super().__init__( @@ -145,7 +146,7 @@ def load_image(self, index): img_file = os.path.join(self.data_dir, self.name, file_name) - img = cv2.imread(img_file) + img = cv2.imread(img_file, cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE) assert img is not None, f"file named {img_file} not found" return img diff --git a/yolox/data/datasets/mosaicdetection.py b/yolox/data/datasets/mosaicdetection.py index 708babed55..0e07c36728 100644 --- a/yolox/data/datasets/mosaicdetection.py +++ b/yolox/data/datasets/mosaicdetection.py @@ -96,6 +96,8 @@ def __getitem__(self, idx): img = cv2.resize( img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR ) + if img.ndim == 2: + img = img[:, :, None] # generate output mosaic image (h, w, c) = img.shape[:3] if i_mosaic == 0: @@ -191,13 +193,18 @@ def mixup(self, origin_img, origin_labels, input_dim): cp_scale_ratio *= jit_factor if FLIP: - cp_img = cp_img[:, ::-1, :] + cp_img = cp_img[:, ::-1] origin_h, origin_w = cp_img.shape[:2] target_h, target_w = origin_img.shape[:2] - padded_img = np.zeros( - (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8 - ) + if len(cp_img.shape) == 3: + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8 + ) + else: + padded_img = np.zeros( + (max(origin_h, target_h), max(origin_w, 
target_w)), dtype=np.uint8 + ) padded_img[:origin_h, :origin_w] = cp_img x_offset, y_offset = 0, 0 diff --git a/yolox/data/datasets/voc.py b/yolox/data/datasets/voc.py index bdacd80191..e6cea728ca 100644 --- a/yolox/data/datasets/voc.py +++ b/yolox/data/datasets/voc.py @@ -108,6 +108,7 @@ def __init__( dataset_name="VOC0712", cache=False, cache_type="ram", + num_channels=3 ): self.root = data_dir self.image_set = image_sets @@ -131,6 +132,7 @@ def __init__( ): self.ids.append((rootpath, line.strip())) self.num_imgs = len(self.ids) + self.num_channels = num_channels self.annotations = self._load_coco_annotations() @@ -184,7 +186,7 @@ def load_resized_img(self, index): def load_image(self, index): img_id = self.ids[index] - img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) + img = cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE) assert img is not None, f"file named {self._imgpath % img_id} not found" return img diff --git a/yolox/exp/yolox_base.py b/yolox/exp/yolox_base.py index 82e93c21bd..b45e65b3f8 100644 --- a/yolox/exp/yolox_base.py +++ b/yolox/exp/yolox_base.py @@ -32,6 +32,7 @@ def __init__(self): # If your training process cost many memory, reduce this value. self.data_num_workers = 4 self.input_size = (640, 640) # (height, width) + self.num_channels = 3 # 3 for RGB/BGR, 1 for grayscale # Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32]. # To disable multiscale training, set the value to 0. 
self.multiscale_range = 5 @@ -45,6 +46,10 @@ def __init__(self): self.val_ann = "instances_val2017.json" # name of annotation file for testing self.test_ann = "instances_test2017.json" + # name of image folders for training, evaluation and testing + self.train_name = "train2017" + self.val_name = "val2017" + self.test_name = "test2017" # --------------- transform config ----------------- # # prob of applying mosaic aug @@ -52,7 +57,7 @@ def __init__(self): # prob of applying mixup aug self.mixup_prob = 1.0 # prob of applying hsv aug - self.hsv_prob = 1.0 + self.hsv_prob = 1.0 if self.num_channels == 3 else 0.0 # HSV not relevant for grayscale # prob of applying flip aug self.flip_prob = 0.5 # rotation angle range, for example, if set to 2, the true range is (-2, 2) @@ -119,7 +124,7 @@ def init_yolo(M): if getattr(self, "model", None) is None: in_channels = [256, 512, 1024] - backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act) + backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act, num_channels=self.num_channels) head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act) self.model = YOLOX(backbone, head) @@ -150,6 +155,8 @@ def get_dataset(self, cache: bool = False, cache_type: str = "ram"): ), cache=cache, cache_type=cache_type, + name=self.train_name, + num_channels=self.num_channels, ) def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None): @@ -304,9 +311,10 @@ def get_eval_dataset(self, **kwargs): return COCODataset( data_dir=self.data_dir, json_file=self.val_ann if not testdev else self.test_ann, - name="val2017" if not testdev else "test2017", + name=self.val_name if not testdev else self.test_name, img_size=self.test_size, preproc=ValTransform(legacy=legacy), + num_channels=self.num_channels, ) def get_eval_loader(self, batch_size, is_distributed, **kwargs): diff --git a/yolox/models/darknet.py b/yolox/models/darknet.py index 
b3e053f163..9332d06ee5 100644 --- a/yolox/models/darknet.py +++ b/yolox/models/darknet.py @@ -102,6 +102,7 @@ def __init__( out_features=("dark3", "dark4", "dark5"), depthwise=False, act="silu", + num_channels=3, ): super().__init__() assert out_features, "please provide output features of Darknet" @@ -112,7 +113,7 @@ def __init__( base_depth = max(round(dep_mul * 3), 1) # 3 # stem - self.stem = Focus(3, base_channels, ksize=3, act=act) + self.stem = Focus(num_channels, base_channels, ksize=3, act=act) # dark2 self.dark2 = nn.Sequential( diff --git a/yolox/models/yolo_pafpn.py b/yolox/models/yolo_pafpn.py index 4c4e18a5c3..0b04754d16 100644 --- a/yolox/models/yolo_pafpn.py +++ b/yolox/models/yolo_pafpn.py @@ -22,9 +22,10 @@ def __init__( in_channels=[256, 512, 1024], depthwise=False, act="silu", + num_channels=3, ): super().__init__() - self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act) + self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act, num_channels=num_channels) self.in_features = in_features self.in_channels = in_channels Conv = DWConv if depthwise else BaseConv diff --git a/yolox/utils/model_utils.py b/yolox/utils/model_utils.py index 3bc2d1ff7a..ad9793f7c5 100644 --- a/yolox/utils/model_utils.py +++ b/yolox/utils/model_utils.py @@ -23,7 +23,8 @@ def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str: from thop import profile stride = 64 - img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device) + in_channels = model.backbone.backbone.stem.conv.conv.in_channels // 4 + img = torch.zeros((1, in_channels, stride, stride), device=next(model.parameters()).device) flops, params = profile(deepcopy(model), inputs=(img,), verbose=False) params /= 1e6 flops /= 1e9