Optimize deformable conv bilinear interpolation with invariant-based clamps and weight masking

ShirasawaSama · ShirasawaSama · commit 7f4e63812064 · 2026-03-27T02:22:49.000+08:00
diff --git a/onnxruntime/core/providers/cuda/nn/deform_conv_impl.cu b/onnxruntime/core/providers/cuda/nn/deform_conv_impl.cu
@@ -188,33 +188,34 @@ __device__ __inline__ T BilinearInterpolate(
   CoordT hh = static_cast<CoordT>(1) - lh;
   CoordT hw = static_cast<CoordT>(1) - lw;
 
-  // [Optimization 4]: Branchless neighbor loads via "safe address + validity mask".
-  // 1) Clamp each coordinate to a legal address first (prevents illegal memory access).
-  // 2) Compute validity predicates for the true (possibly OOB) coordinates.
-  // 3) Always load from clamped address and mask invalid neighbors to zero.
-  // Modern CUDA compilers usually lower this to predicated/selp-style code without control-flow branches.
-  const int safe_h_low = max(0, min(h_low, height - 1));
-  const int safe_h_high = max(0, min(h_high, height - 1));
-  const int safe_w_low = max(0, min(w_low, width - 1));
-  const int safe_w_high = max(0, min(w_high, width - 1));
+  // [Optimization 3]: Branchless neighbor loads via "safe address + one-sided clamp".
+  // Given the early return above, coordinates lie in (-1, H) x (-1, W), so each index needs only a one-sided clamp:
+  //   h_low in [-1, H-1], h_high in [0, H], w_low in [-1, W-1], w_high in [0, W].
+  // Validity is fused into the 1D bilinear weights (hh/lh/hw/lw) below, so every load can use a legal address.
+  // CUDA compilers usually lower this to predicated/selp-style code without control-flow branches.
+  const int safe_h_low = max(0, h_low);
+  const int safe_h_high = min(h_high, height - 1);
+  const int safe_w_low = max(0, w_low);
+  const int safe_w_high = min(w_high, width - 1);
+
+  // [Optimization 4]: Mask validity into bilinear 1D weights.
+  // Reuse the invariant above: each weight needs only the single bound check that can still fail.
+  // Masking 1D weights is algebraically equivalent to masking each 2D neighbor contribution, since each 2D weight
+  // is a product of two 1D weights. Conditions go directly on the weights so the compiler can emit predicated selects.
+  // Type the zero as CoordT (the compute type) to avoid implicit T -> CoordT conversions for half/BFloat16.
+  const CoordT zero = static_cast<CoordT>(0);
+  hh = (h_low >= 0) ? hh : zero;
+  lh = (h_high < height) ? lh : zero;
+  hw = (w_low >= 0) ? hw : zero;
+  lw = (w_high < width) ? lw : zero;
 
   const int safe_base_low = safe_h_low * width;
   const int safe_base_high = safe_h_high * width;
 
-  const bool h_low_valid = (h_low >= 0 && h_low < height);
-  const bool h_high_valid = (h_high >= 0 && h_high < height);
-  const bool w_low_valid = (w_low >= 0 && w_low < width);
-  const bool w_high_valid = (w_high >= 0 && w_high < width);
-
-  const CoordT m1 = static_cast<CoordT>(h_low_valid && w_low_valid);
-  const CoordT m2 = static_cast<CoordT>(h_low_valid && w_high_valid);
-  const CoordT m3 = static_cast<CoordT>(h_high_valid && w_low_valid);
-  const CoordT m4 = static_cast<CoordT>(h_high_valid && w_high_valid);
-
-  const CoordT v1 = Traits::Load(in + safe_base_low + safe_w_low) * m1;
-  const CoordT v2 = Traits::Load(in + safe_base_low + safe_w_high) * m2;
-  const CoordT v3 = Traits::Load(in + safe_base_high + safe_w_low) * m3;
-  const CoordT v4 = Traits::Load(in + safe_base_high + safe_w_high) * m4;
+  const CoordT v1 = Traits::Load(in + safe_base_low + safe_w_low);
+  const CoordT v2 = Traits::Load(in + safe_base_low + safe_w_high);
+  const CoordT v3 = Traits::Load(in + safe_base_high + safe_w_low);
+  const CoordT v4 = Traits::Load(in + safe_base_high + safe_w_high);
 
   // [Optimization 5]: Factor bilinear into horizontal blends on two rows, then vertical blend.
   // Algebraically equivalent to w1*v1 + w2*v2 + w3*v3 + w4*v4 with w1..w4 from hh/hw/lh/lw;