fix: support full-image masks in instance segmentation postprocessing (#588)

kprokofi · web-flow · commit 1951d64e5c5c · 2026-05-20T13:10:06.000Z
* feat: add DETRInstanceSegmentation model wrapper for full-image masks

  DETR-family instance segmentation models (e.g. RF-DETR-Seg) output full-image masks at reduced resolution (input_size/4) rather than per-box crop masks (28x28) like Mask R-CNN.

  Add DETRInstanceSegmentation class (__model__ = "DETRInstSeg") that inherits from MaskRCNNModel and overrides postprocess() to resize masks to original image dimensions directly, instead of the box-crop placement logic used by MaskRCNNModel.

  This follows the same pattern as SSD vs YOLO for detection -- different architectures get different model wrappers, selected via model_type in the exported model's rt_info. MaskRCNNModel remains unchanged for backward compatibility.

  Resolves: open-edge-platform/geti#6488

* test: add unit tests for DETRInstanceSegmentation postprocess

  Tests cover:
    - _full_image_mask_postprocess: resize, threshold, dtype, spatial pattern preservation
    - Comparison between full-image and per-box-crop postprocessing approaches
    - DETRInstanceSegmentation.postprocess: basic flow, batch dim squeezing, confidence filtering, empty results, label increment, label names, mask positioning (verifies masks are NOT shifted to box position), multiple detections, class attributes, and inheritance

* refactor: extract InstanceSegmentationModel base class

  Introduce InstanceSegmentationModel as the common base for both MaskRCNNModel and DETRInstanceSegmentation. The base class contains all shared logic: initialization, output detection, preprocessing, box rescaling, confidence/area filtering, and NMS.

  Subclasses only need to implement _postprocess_single_mask():
    - MaskRCNNModel: per-box-crop postprocess (_segm_postprocess)
    - DETRInstanceSegmentation: full-image resize (_full_image_mask_postprocess)

  This eliminates the duplicated postprocess code and makes the hierarchy cleanly express the architectural difference between the two approaches.

  Also updates the tiler to use InstanceSegmentationModel for isinstance checks, and adds tests verifying the new hierarchy.

* fix: resolve ruff lint errors (import sorting, unused var, naming)
diff --git a/model_api/src/model_api/models/__init__.py b/model_api/src/model_api/models/__init__.py
@@ -8,7 +8,7 @@
 from .classification import ClassificationModel
 from .detection_model import DetectionModel
 from .image_model import ImageModel
-from .instance_segmentation import MaskRCNNModel
+from .instance_segmentation import DETRInstanceSegmentation, InstanceSegmentationModel, MaskRCNNModel
 from .keypoint_detection import KeypointDetectionModel, TopDownKeypointDetectionPipeline
 from .model import Model
 from .result import (
@@ -68,9 +68,11 @@
     "DetectedKeypoints",
     "DetectionModel",
     "DetectionResult",
+    "DETRInstanceSegmentation",
     "get_contours",
     "ImageModel",
     "ImageResultWithSoftPrediction",
+    "InstanceSegmentationModel",
     "InstanceSegmentationResult",
     "KeypointDetectionModel",
     "Label",
diff --git a/model_api/src/model_api/models/instance_segmentation.py b/model_api/src/model_api/models/instance_segmentation.py
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: Apache-2.0
 #
 
+from abc import abstractmethod
+
 import cv2
 import numpy as np
 
@@ -14,8 +16,13 @@
 from .utils import ResizeMetadata, calculate_nms, load_labels
 
 
-class MaskRCNNModel(ImageModel):
-    __model__ = "MaskRCNN"
+class InstanceSegmentationModel(ImageModel):
+    """Base class for instance segmentation models.
+
+    Handles common initialization, output detection, preprocessing, box rescaling,
+    confidence filtering, and NMS. Subclasses implement mask-specific postprocessing
+    via `_postprocess_single_mask`.
+    """
 
     def __init__(self, inference_adapter: InferenceAdapter, configuration: dict = {}, preload: bool = False) -> None:
         super().__init__(inference_adapter, configuration, preload)
@@ -107,6 +114,20 @@ def preprocess(self, dict_inputs: dict, meta: dict) -> tuple[dict, dict]:
             dict_inputs[self.image_info_blob_names[0]] = input_image_info
         return dict_inputs, meta
 
+    @abstractmethod
+    def _postprocess_single_mask(self, box: np.ndarray, raw_cls_mask: np.ndarray, im_h: int, im_w: int) -> np.ndarray:
+        """Process a single raw mask into a full-image binary mask.
+
+        Args:
+            box: Bounding box [x1, y1, x2, y2] in original image coordinates.
+            raw_cls_mask: Raw mask output from the model (2D array).
+            im_h, im_w: Original image height and width.
+
+        Returns:
+            Binary mask of shape (im_h, im_w) with dtype uint8.
+        """
+        raise NotImplementedError  # a missing override must fail loudly instead of returning None
+
     def postprocess(self, outputs: dict, meta: dict) -> InstanceSegmentationResult:
         if (
             outputs[self.output_blob_name["labels"]].ndim == 2
@@ -213,7 +234,7 @@ def postprocess(self, outputs: dict, meta: dict) -> InstanceSegmentationResult:
 
             raw_cls_mask = raw_mask[label_idx, ...] if self.is_segmentoly else raw_mask
             if self.params.postprocess_semantic_masks or has_feature_vector_name:
-                resized_mask = _segm_postprocess(
+                resized_mask = self._postprocess_single_mask(
                     box,
                     raw_cls_mask,
                     *meta["original_shape"][:-1],
@@ -226,18 +247,44 @@ def postprocess(self, outputs: dict, meta: dict) -> InstanceSegmentationResult:
             if has_feature_vector_name:
                 saliency_maps[label_idx - 1].append(resized_mask)
 
-        _masks = np.stack(resized_masks) if len(resized_masks) > 0 else np.empty((0, 16, 16), dtype=np.uint8)
+        result_masks = np.stack(resized_masks) if len(resized_masks) > 0 else np.empty((0, 16, 16), dtype=np.uint8)
         return InstanceSegmentationResult(
             bboxes=boxes,
             labels=labels,
             scores=scores,
-            masks=_masks,
+            masks=result_masks,
             label_names=label_names or None,
             saliency_map=_average_and_normalize(saliency_maps),
             feature_vector=outputs.get(_feature_vector_name, np.ndarray(0)),
         )
 
 
+class MaskRCNNModel(InstanceSegmentationModel):
+    """Wrapper for Mask R-CNN-style models that emit one small crop mask (e.g. 28x28) per box.
+
+    The crop mask is resized to the bounding box dimensions and placed at the box position.
+    """
+
+    __model__ = "MaskRCNN"
+
+    def _postprocess_single_mask(self, box: np.ndarray, raw_cls_mask: np.ndarray, im_h: int, im_w: int) -> np.ndarray:
+        """Delegate to `_segm_postprocess`, forwarding the box, the raw mask and the image size."""
+        return _segm_postprocess(box, raw_cls_mask, im_h, im_w)
+
+
+class DETRInstanceSegmentation(InstanceSegmentationModel):
+    """Wrapper for DETR-family models (e.g. RF-DETR-Seg) whose masks cover the entire image.
+
+    Each mask (e.g. 96x96) is resized to the original image dimensions and then thresholded.
+    """
+
+    __model__ = "DETRInstSeg"
+
+    def _postprocess_single_mask(self, box: np.ndarray, raw_cls_mask: np.ndarray, im_h: int, im_w: int) -> np.ndarray:
+        """Resize the full-image mask; `box` is part of the shared signature but is not used here."""
+        return _full_image_mask_postprocess(raw_cls_mask, im_h, im_w)
+
+
 def _average_and_normalize(saliency_maps: list) -> list:
     aggregated = []
     for per_object_maps in saliency_maps:
@@ -286,6 +333,12 @@ def _segm_postprocess(box: np.ndarray, raw_cls_mask: np.ndarray, im_h: int, im_w
     return im_mask
 
 
+def _full_image_mask_postprocess(raw_cls_mask: np.ndarray, im_h: int, im_w: int) -> np.ndarray:
+    """Scale a whole-image mask to ``(im_h, im_w)`` and binarize it at 0.5, returning uint8."""
+    upscaled = cv2.resize(raw_cls_mask.astype(np.float32), dsize=(im_w, im_h), interpolation=cv2.INTER_LINEAR)
+    return np.greater(upscaled, 0.5).astype(np.uint8)
+
+
 _saliency_map_name = "saliency_map"
 _feature_vector_name = "feature_vector"
 
diff --git a/model_api/src/model_api/models/model.py b/model_api/src/model_api/models/model.py
@@ -84,12 +84,14 @@ def __init__(self, inference_adapter: InferenceAdapter, configuration: dict = {}
             ONNXRuntimeAdapter,
         ) and self.__model__ not in {
             "Classification",
+            "DETRInstSeg",
             "MaskRCNN",
             "SSD",
             "Segmentation",
         }:
             self.raise_error(
-                "ONNXRuntimeAdapter is only supported for Classification, MaskRCNN, SSD, and Segmentation wrappers",
+                "ONNXRuntimeAdapter is only supported for Classification, DETRInstSeg, MaskRCNN, SSD,"
+                " and Segmentation wrappers",
             )
 
         self.inputs = self.inference_adapter.get_input_layers()
diff --git a/model_api/src/model_api/tilers/instance_segmentation.py b/model_api/src/model_api/tilers/instance_segmentation.py
@@ -9,7 +9,7 @@
 import numpy as np
 
 from model_api.models import InstanceSegmentationResult
-from model_api.models.instance_segmentation import MaskRCNNModel, _segm_postprocess
+from model_api.models.instance_segmentation import InstanceSegmentationModel, _segm_postprocess
 from model_api.models.utils import multiclass_nms
 
 from .detection import DetectionTiler
@@ -194,13 +194,13 @@ def __call__(self, inputs):
         @contextmanager
         def setup_maskrcnn(*args, **kwds):
             postprocess_state = None
-            if isinstance(self.model, MaskRCNNModel):
+            if isinstance(self.model, InstanceSegmentationModel):
                 postprocess_state = self.model.params.postprocess_semantic_masks
                 self.model._postprocess_semantic_masks = False  # noqa: SLF001
             try:
                 yield
             finally:
-                if isinstance(self.model, MaskRCNNModel):
+                if isinstance(self.model, InstanceSegmentationModel):
                     self.model._postprocess_semantic_masks = postprocess_state  # noqa: SLF001
 
         with setup_maskrcnn():
diff --git a/model_api/tests/unit/models/test_detr_instance_segmentation.py b/model_api/tests/unit/models/test_detr_instance_segmentation.py