Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions yolox/data/data_augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@


def augment_hsv(img, hgain=5, sgain=30, vgain=30):
if img.ndim == 2 or img.shape[2] == 1:
raise ValueError(
"HSV augmentation requires 3-channel images. Set hsv_prob to 0.0 for grayscale. " \
"Image shape: {}".format(img.shape)
)

hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains
hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v
hsv_augs = hsv_augs.astype(np.int16)
Expand Down Expand Up @@ -132,7 +138,7 @@ def random_affine(


def _mirror(image, boxes, prob=0.5):
_, width, _ = image.shape
_, width = image.shape[:2]
if random.random() < prob:
image = image[:, ::-1]
boxes[:, 0::2] = width - boxes[:, 2::-2]
Expand All @@ -153,6 +159,8 @@ def preproc(img, input_size, swap=(2, 0, 1)):
).astype(np.uint8)
padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img

if img.ndim == 2:
padded_img = padded_img[:, :, np.newaxis]
padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
Expand All @@ -174,7 +182,7 @@ def __call__(self, image, targets, input_dim):

image_o = image.copy()
targets_o = targets.copy()
height_o, width_o, _ = image_o.shape
height_o, width_o = image_o.shape[:2]
boxes_o = targets_o[:, :4]
labels_o = targets_o[:, 4]
# bbox_o: [xyxy] to [c_x,c_y,w,h]
Expand All @@ -183,7 +191,7 @@ def __call__(self, image, targets, input_dim):
if random.random() < self.hsv_prob:
augment_hsv(image)
image_t, boxes = _mirror(image, boxes, self.flip_prob)
height, width, _ = image_t.shape
height, width = image_t.shape[:2]
image_t, r_ = preproc(image_t, input_dim)
# boxes [xyxy] 2 [cx,cy,w,h]
boxes = xyxy2cxcywh(boxes)
Expand Down
5 changes: 3 additions & 2 deletions yolox/data/datasets/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def remove_useless_info(coco):
"""
if isinstance(coco, COCO):
dataset = coco.dataset
dataset.pop("info", None)
dataset.pop("licenses", None)
for img in dataset["images"]:
img.pop("license", None)
Expand All @@ -45,6 +44,7 @@ def __init__(
preproc=None,
cache=False,
cache_type="ram",
num_channels=3,
):
"""
COCO dataset initialization. Annotation data are read into memory by COCO API.
Expand All @@ -71,6 +71,7 @@ def __init__(
self.img_size = img_size
self.preproc = preproc
self.annotations = self._load_coco_annotations()
self.num_channels = num_channels

path_filename = [os.path.join(name, anno[3]) for anno in self.annotations]
super().__init__(
Expand Down Expand Up @@ -145,7 +146,7 @@ def load_image(self, index):

img_file = os.path.join(self.data_dir, self.name, file_name)

img = cv2.imread(img_file)
img = cv2.imread(img_file, cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE)
assert img is not None, f"file named {img_file} not found"

return img
Expand Down
15 changes: 11 additions & 4 deletions yolox/data/datasets/mosaicdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def __getitem__(self, idx):
img = cv2.resize(
img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
)
if img.ndim == 2:
img = img[:, :, None]
# generate output mosaic image
(h, w, c) = img.shape[:3]
if i_mosaic == 0:
Expand Down Expand Up @@ -191,13 +193,18 @@ def mixup(self, origin_img, origin_labels, input_dim):
cp_scale_ratio *= jit_factor

if FLIP:
cp_img = cp_img[:, ::-1, :]
cp_img = cp_img[:, ::-1]

origin_h, origin_w = cp_img.shape[:2]
target_h, target_w = origin_img.shape[:2]
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
)
if len(img.shape) == 3:
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
)
else:
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w)), dtype=np.uint8
)
padded_img[:origin_h, :origin_w] = cp_img

x_offset, y_offset = 0, 0
Expand Down
4 changes: 3 additions & 1 deletion yolox/data/datasets/voc.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
dataset_name="VOC0712",
cache=False,
cache_type="ram",
num_channels=3
):
self.root = data_dir
self.image_set = image_sets
Expand All @@ -131,6 +132,7 @@ def __init__(
):
self.ids.append((rootpath, line.strip()))
self.num_imgs = len(self.ids)
self.num_channels = num_channels

self.annotations = self._load_coco_annotations()

Expand Down Expand Up @@ -184,7 +186,7 @@ def load_resized_img(self, index):

def load_image(self, index):
    """Read the raw image for dataset entry *index* from disk.

    Returns the ndarray produced by ``cv2.imread``: a BGR ``HxWx3`` array
    when ``self.num_channels == 3``, otherwise a single-channel ``HxW``
    grayscale array.

    Raises:
        AssertionError: if the image file cannot be read.
    """
    img_id = self.ids[index]
    # Pick the OpenCV read mode from the dataset's channel configuration so
    # grayscale datasets are not silently expanded to three channels.
    read_flag = cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE
    img = cv2.imread(self._imgpath % img_id, read_flag)
    assert img is not None, f"file named {self._imgpath % img_id} not found"

    return img
Expand Down
14 changes: 11 additions & 3 deletions yolox/exp/yolox_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(self):
# If your training process cost many memory, reduce this value.
self.data_num_workers = 4
self.input_size = (640, 640) # (height, width)
self.num_channels = 3 # 3 for RGB/BGR, 1 for grayscale
# Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32].
# To disable multiscale training, set the value to 0.
self.multiscale_range = 5
Expand All @@ -45,14 +46,18 @@ def __init__(self):
self.val_ann = "instances_val2017.json"
# name of annotation file for testing
self.test_ann = "instances_test2017.json"
# name of image folders for training, evaluation and testing
self.train_name = "train2017"
self.val_name = "val2017"
self.test_name = "test2017"

# --------------- transform config ----------------- #
# prob of applying mosaic aug
self.mosaic_prob = 1.0
# prob of applying mixup aug
self.mixup_prob = 1.0
# prob of applying hsv aug
self.hsv_prob = 1.0
self.hsv_prob = 1.0 if self.num_channels == 3 else 0.0 # HSV not relevant for grayscale
# prob of applying flip aug
self.flip_prob = 0.5
# rotation angle range, for example, if set to 2, the true range is (-2, 2)
Expand Down Expand Up @@ -119,7 +124,7 @@ def init_yolo(M):

if getattr(self, "model", None) is None:
in_channels = [256, 512, 1024]
backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act)
backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act, num_channels=self.num_channels)
head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act)
self.model = YOLOX(backbone, head)

Expand Down Expand Up @@ -150,6 +155,8 @@ def get_dataset(self, cache: bool = False, cache_type: str = "ram"):
),
cache=cache,
cache_type=cache_type,
name=self.train_name,
num_channels=self.num_channels,
)

def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None):
Expand Down Expand Up @@ -304,9 +311,10 @@ def get_eval_dataset(self, **kwargs):
return COCODataset(
data_dir=self.data_dir,
json_file=self.val_ann if not testdev else self.test_ann,
name="val2017" if not testdev else "test2017",
name=self.val_name if not testdev else self.test_name,
img_size=self.test_size,
preproc=ValTransform(legacy=legacy),
num_channels=self.num_channels,
)

def get_eval_loader(self, batch_size, is_distributed, **kwargs):
Expand Down
3 changes: 2 additions & 1 deletion yolox/models/darknet.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def __init__(
out_features=("dark3", "dark4", "dark5"),
depthwise=False,
act="silu",
num_channels=3,
):
super().__init__()
assert out_features, "please provide output features of Darknet"
Expand All @@ -112,7 +113,7 @@ def __init__(
base_depth = max(round(dep_mul * 3), 1) # 3

# stem
self.stem = Focus(3, base_channels, ksize=3, act=act)
self.stem = Focus(num_channels, base_channels, ksize=3, act=act)

# dark2
self.dark2 = nn.Sequential(
Expand Down
3 changes: 2 additions & 1 deletion yolox/models/yolo_pafpn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def __init__(
in_channels=[256, 512, 1024],
depthwise=False,
act="silu",
num_channels=3,
):
super().__init__()
self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act, num_channels=num_channels)
self.in_features = in_features
self.in_channels = in_channels
Conv = DWConv if depthwise else BaseConv
Expand Down
3 changes: 2 additions & 1 deletion yolox/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str:
from thop import profile

stride = 64
img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
in_channels = model.backbone.backbone.stem.conv.conv.in_channels // 4
img = torch.zeros((1, in_channels, stride, stride), device=next(model.parameters()).device)
flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
params /= 1e6
flops /= 1e9
Expand Down