Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions yolox/data/data_augment.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@


def augment_hsv(img, hgain=5, sgain=30, vgain=30):
if img.ndim == 2 or img.shape[2] == 1:
raise ValueError(
"HSV augmentation requires 3-channel images. Set hsv_prob to 0.0 for grayscale. " \
"Image shape: {}".format(img.shape)
)

hsv_augs = np.random.uniform(-1, 1, 3) * [hgain, sgain, vgain] # random gains
hsv_augs *= np.random.randint(0, 2, 3) # random selection of h, s, v
hsv_augs = hsv_augs.astype(np.int16)
Expand Down Expand Up @@ -132,7 +138,7 @@ def random_affine(


def _mirror(image, boxes, prob=0.5):
_, width, _ = image.shape
_, width = image.shape[:2]
if random.random() < prob:
image = image[:, ::-1]
boxes[:, 0::2] = width - boxes[:, 2::-2]
Expand All @@ -153,6 +159,8 @@ def preproc(img, input_size, swap=(2, 0, 1)):
).astype(np.uint8)
padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img

if img.ndim == 2:
padded_img = padded_img[:, :, np.newaxis]
padded_img = padded_img.transpose(swap)
padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
return padded_img, r
Expand All @@ -174,7 +182,7 @@ def __call__(self, image, targets, input_dim):

image_o = image.copy()
targets_o = targets.copy()
height_o, width_o, _ = image_o.shape
height_o, width_o = image_o.shape[:2]
boxes_o = targets_o[:, :4]
labels_o = targets_o[:, 4]
# bbox_o: [xyxy] to [c_x,c_y,w,h]
Expand All @@ -183,7 +191,7 @@ def __call__(self, image, targets, input_dim):
if random.random() < self.hsv_prob:
augment_hsv(image)
image_t, boxes = _mirror(image, boxes, self.flip_prob)
height, width, _ = image_t.shape
height, width = image_t.shape[:2]
image_t, r_ = preproc(image_t, input_dim)
# boxes [xyxy] 2 [cx,cy,w,h]
boxes = xyxy2cxcywh(boxes)
Expand Down
5 changes: 3 additions & 2 deletions yolox/data/datasets/coco.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ def remove_useless_info(coco):
"""
if isinstance(coco, COCO):
dataset = coco.dataset
dataset.pop("info", None)
dataset.pop("licenses", None)
for img in dataset["images"]:
img.pop("license", None)
Expand All @@ -45,6 +44,7 @@ def __init__(
preproc=None,
cache=False,
cache_type="ram",
num_channels=3,
):
"""
COCO dataset initialization. Annotation data are read into memory by COCO API.
Expand All @@ -71,6 +71,7 @@ def __init__(
self.img_size = img_size
self.preproc = preproc
self.annotations = self._load_coco_annotations()
self.num_channels = num_channels

path_filename = [os.path.join(name, anno[3]) for anno in self.annotations]
super().__init__(
Expand Down Expand Up @@ -145,7 +146,7 @@ def load_image(self, index):

img_file = os.path.join(self.data_dir, self.name, file_name)

img = cv2.imread(img_file)
img = cv2.imread(img_file, cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE)
assert img is not None, f"file named {img_file} not found"

return img
Expand Down
15 changes: 11 additions & 4 deletions yolox/data/datasets/mosaicdetection.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ def __getitem__(self, idx):
img = cv2.resize(
img, (int(w0 * scale), int(h0 * scale)), interpolation=cv2.INTER_LINEAR
)
if img.ndim == 2:
img = img[:, :, None]
# generate output mosaic image
(h, w, c) = img.shape[:3]
if i_mosaic == 0:
Expand Down Expand Up @@ -191,13 +193,18 @@ def mixup(self, origin_img, origin_labels, input_dim):
cp_scale_ratio *= jit_factor

if FLIP:
cp_img = cp_img[:, ::-1, :]
cp_img = cp_img[:, ::-1]

origin_h, origin_w = cp_img.shape[:2]
target_h, target_w = origin_img.shape[:2]
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
)
if len(img.shape) == 3:
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w), 3), dtype=np.uint8
)
else:
padded_img = np.zeros(
(max(origin_h, target_h), max(origin_w, target_w)), dtype=np.uint8
)
padded_img[:origin_h, :origin_w] = cp_img

x_offset, y_offset = 0, 0
Expand Down
4 changes: 3 additions & 1 deletion yolox/data/datasets/voc.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
dataset_name="VOC0712",
cache=False,
cache_type="ram",
num_channels=3
):
self.root = data_dir
self.image_set = image_sets
Expand All @@ -131,6 +132,7 @@ def __init__(
):
self.ids.append((rootpath, line.strip()))
self.num_imgs = len(self.ids)
self.num_channels = num_channels

self.annotations = self._load_coco_annotations()

Expand Down Expand Up @@ -184,7 +186,7 @@ def load_resized_img(self, index):

def load_image(self, index):
    """Read the raw image for dataset entry *index* from disk.

    Returns the ndarray produced by ``cv2.imread``: a BGR ``HxWx3`` array
    when ``self.num_channels == 3``, otherwise a single-channel ``HxW``
    grayscale array.

    Raises:
        AssertionError: if the image file cannot be read.
    """
    img_id = self.ids[index]
    # Pick the OpenCV read mode from the dataset's channel configuration so
    # grayscale datasets are not silently expanded to three channels.
    read_flag = cv2.IMREAD_COLOR if self.num_channels == 3 else cv2.IMREAD_GRAYSCALE
    img = cv2.imread(self._imgpath % img_id, read_flag)
    assert img is not None, f"file named {self._imgpath % img_id} not found"

    return img
Expand Down
14 changes: 11 additions & 3 deletions yolox/exp/yolox_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def __init__(self):
# If your training process cost many memory, reduce this value.
self.data_num_workers = 4
self.input_size = (640, 640) # (height, width)
self.num_channels = 3 # 3 for RGB/BGR, 1 for grayscale
# Actual multiscale ranges: [640 - 5 * 32, 640 + 5 * 32].
# To disable multiscale training, set the value to 0.
self.multiscale_range = 5
Expand All @@ -45,14 +46,18 @@ def __init__(self):
self.val_ann = "instances_val2017.json"
# name of annotation file for testing
self.test_ann = "instances_test2017.json"
# name of image folders for training, evaluation and testing
self.train_name = "train2017"
self.val_name = "val2017"
self.test_name = "test2017"

# --------------- transform config ----------------- #
# prob of applying mosaic aug
self.mosaic_prob = 1.0
# prob of applying mixup aug
self.mixup_prob = 1.0
# prob of applying hsv aug
self.hsv_prob = 1.0
self.hsv_prob = 1.0 if self.num_channels == 3 else 0.0 # HSV not relevant for grayscale
# prob of applying flip aug
self.flip_prob = 0.5
# rotation angle range, for example, if set to 2, the true range is (-2, 2)
Expand Down Expand Up @@ -119,7 +124,7 @@ def init_yolo(M):

if getattr(self, "model", None) is None:
in_channels = [256, 512, 1024]
backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act)
backbone = YOLOPAFPN(self.depth, self.width, in_channels=in_channels, act=self.act, num_channels=self.num_channels)
head = YOLOXHead(self.num_classes, self.width, in_channels=in_channels, act=self.act)
self.model = YOLOX(backbone, head)

Expand Down Expand Up @@ -150,6 +155,8 @@ def get_dataset(self, cache: bool = False, cache_type: str = "ram"):
),
cache=cache,
cache_type=cache_type,
name=self.train_name,
num_channels=self.num_channels,
)

def get_data_loader(self, batch_size, is_distributed, no_aug=False, cache_img: str = None):
Expand Down Expand Up @@ -304,9 +311,10 @@ def get_eval_dataset(self, **kwargs):
return COCODataset(
data_dir=self.data_dir,
json_file=self.val_ann if not testdev else self.test_ann,
name="val2017" if not testdev else "test2017",
name=self.val_name if not testdev else self.test_name,
img_size=self.test_size,
preproc=ValTransform(legacy=legacy),
num_channels=self.num_channels,
)

def get_eval_loader(self, batch_size, is_distributed, **kwargs):
Expand Down
3 changes: 2 additions & 1 deletion yolox/models/darknet.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ def __init__(
out_features=("dark3", "dark4", "dark5"),
depthwise=False,
act="silu",
num_channels=3,
):
super().__init__()
assert out_features, "please provide output features of Darknet"
Expand All @@ -112,7 +113,7 @@ def __init__(
base_depth = max(round(dep_mul * 3), 1) # 3

# stem
self.stem = Focus(3, base_channels, ksize=3, act=act)
self.stem = Focus(num_channels, base_channels, ksize=3, act=act)

# dark2
self.dark2 = nn.Sequential(
Expand Down
3 changes: 2 additions & 1 deletion yolox/models/yolo_pafpn.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,10 @@ def __init__(
in_channels=[256, 512, 1024],
depthwise=False,
act="silu",
num_channels=3,
):
super().__init__()
self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act)
self.backbone = CSPDarknet(depth, width, depthwise=depthwise, act=act, num_channels=num_channels)
self.in_features = in_features
self.in_channels = in_channels
Conv = DWConv if depthwise else BaseConv
Expand Down
3 changes: 2 additions & 1 deletion yolox/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ def get_model_info(model: nn.Module, tsize: Sequence[int]) -> str:
from thop import profile

stride = 64
img = torch.zeros((1, 3, stride, stride), device=next(model.parameters()).device)
in_channels = model.backbone.backbone.stem.conv.conv.in_channels // 4
img = torch.zeros((1, in_channels, stride, stride), device=next(model.parameters()).device)
flops, params = profile(deepcopy(model), inputs=(img,), verbose=False)
params /= 1e6
flops /= 1e9
Expand Down