@@ -322,16 +322,61 @@ def __call__(self, y_pred: torch.Tensor, y: torch.Tensor) -> torch.Tensor | tupl
322322 y_pred = torch .sigmoid (y_pred )
323323 y_pred = y_pred > 0.5
324324
325+ # Vectorized computation (replaces nested loops for better performance)
326+ batch_size = y_pred .shape [0 ]
327+ device = y_pred .device
328+
329+ # Convert to boolean for computation
330+ if y_pred .shape [1 ] == 1 and n_pred_ch > 1 :
331+ # Single-channel class indices: convert to one-hot
332+ y_pred_bool = torch .zeros (batch_size , n_pred_ch , * y_pred .shape [2 :], dtype = torch .bool , device = device )
333+ y_bool = torch .zeros (batch_size , n_pred_ch , * y .shape [2 :], dtype = torch .bool , device = device )
334+
335+ for c in range (n_pred_ch ):
336+ y_pred_bool [:, c ] = (y_pred [:, 0 ] == c )
337+ y_bool [:, c ] = (y [:, 0 ] == c )
338+ else :
339+ # One-hot format: cast to bool
340+ y_pred_bool = y_pred .bool ()
341+ if y .shape [1 ] == 1 and y_pred .shape [1 ] > 1 :
342+ # Expand y to match y_pred channels
343+ y_bool = (y == 1 ).expand (batch_size , n_pred_ch , * y .shape [2 :])
344+ else :
345+ y_bool = y .bool ()
346+
347+ # Flatten spatial dimensions for vectorized computation: (batch, channels, -1)
348+ y_pred_flat = y_pred_bool .reshape (batch_size , n_pred_ch , - 1 ).float ()
349+ y_flat = y_bool .reshape (batch_size , n_pred_ch , - 1 ).float ()
350+
351+ # Compute Dice per (batch, channel) vectorized: all reductions at once
352+ intersection = torch .sum (y_pred_flat * y_flat , dim = - 1 ) # (batch, n_pred_ch)
353+ pred_sum = torch .sum (y_pred_flat , dim = - 1 ) # (batch, n_pred_ch)
354+ y_sum = torch .sum (y_flat , dim = - 1 ) # (batch, n_pred_ch)
355+
356+ # Dice formula: 2 * intersection / (pred_sum + y_sum)
357+ union = pred_sum + y_sum
358+ dice = (2.0 * intersection ) / union # (batch, n_pred_ch)
359+
360+ # Handle empty ground truth cases
361+ if self .ignore_empty :
362+ # Set NaN where ground truth is empty
363+ dice = torch .where (y_sum > 0 , dice , torch .tensor (float ("nan" ), device = device , dtype = dice .dtype ))
364+ else :
365+ # Set 1.0 if both empty, 0.0 if only pred is non-empty
366+ empty_mask = y_sum == 0
367+ dice = torch .where (
368+ empty_mask ,
369+ torch .where (pred_sum == 0 , torch .tensor (1.0 , device = device , dtype = dice .dtype ),
370+ torch .tensor (0.0 , device = device , dtype = dice .dtype )),
371+ dice
372+ )
373+
374+ # Select channels: exclude background if requested
325375 first_ch = 0 if self .include_background else 1
326- data = []
327- for b in range (y_pred .shape [0 ]):
328- c_list = []
329- for c in range (first_ch , n_pred_ch ) if n_pred_ch > 1 else [1 ]:
330- x_pred = (y_pred [b , 0 ] == c ) if (y_pred .shape [1 ] == 1 ) else y_pred [b , c ].bool ()
331- x = (y [b , 0 ] == c ) if (y .shape [1 ] == 1 ) else y [b , c ]
332- c_list .append (self .compute_channel (x_pred , x ))
333- data .append (torch .stack (c_list ))
334- data = torch .stack (data , dim = 0 ).contiguous () # type: ignore
376+ if n_pred_ch > 1 :
377+ data = dice [:, first_ch :] # (batch, num_classes_selected)
378+ else :
379+ data = dice # (batch, 1)
335380
336381 f , not_nans = do_metric_reduction (data , self .reduction ) # type: ignore
337382 return (f , not_nans ) if self .get_not_nans else f