Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
0ec9bb0
Remove DeformConvCopyGemmOutputRowMajorToNCHW
ShirasawaSama Mar 23, 2026
3b5087c
Adjust parallel cost for DeformableIm2col
ShirasawaSama Mar 23, 2026
dffbe68
Refactor deform conv bilinear with plan
ShirasawaSama Mar 23, 2026
d07e843
Simplify DeformConv im2col plan paths and fix mask indexing bug
ShirasawaSama Mar 23, 2026
7d5125d
Refactor deform conv im2col to use a unified tiled path with context …
ShirasawaSama Mar 23, 2026
435bada
Refactor DeformConv sampling plan to AoSoA layout and use Eigen for b…
ShirasawaSama Mar 23, 2026
c963bd9
Refine DeformConv naming clarity and avoid redundant workspace size r…
ShirasawaSama Mar 23, 2026
5bcb402
Optimize DeformConv by removing streaming plan logic and making bilin…
ShirasawaSama Mar 23, 2026
0c2a602
Refactor Deformconv cpu op
ShirasawaSama Mar 23, 2026
3f2fee9
Harden DeformConv integer bounds checks and streamline hot-path casts…
ShirasawaSama Mar 23, 2026
7ae47a4
Refactor DeformConv bounds validation
ShirasawaSama Mar 23, 2026
7c0d414
Add compute-time bounds checks with size_t-safe indexing
ShirasawaSama Mar 23, 2026
02f9e0c
Optimize CPU DeformConv plan generation with kernel meta precompute
ShirasawaSama Mar 23, 2026
083b33c
Refactor DeformConv kernel meta setup into a params-based cached
ShirasawaSama Mar 23, 2026
afe2dd1
Refactor CPU DeformConv bias add to avoid div/mod and extract DeformC…
ShirasawaSama Mar 24, 2026
d61d36c
Annotate DeformConv CPU bias/col paths with ORT_CPU_RESTRICT and forc…
ShirasawaSama Mar 24, 2026
baf51ac
CPU DeformConv bilinear sampling uses fast floor and inverted bounds …
ShirasawaSama Mar 24, 2026
43730c2
Flatten CPU DeformConv bilinear sampling plan build tasks across spat…
ShirasawaSama Mar 24, 2026
47bb183
Optimize CPU DeformConv sampling and bias parallelism with flattened …
ShirasawaSama Mar 24, 2026
3520625
Add detailed comments for DeformConv CPU implementation
ShirasawaSama Mar 24, 2026
052507e
Reformat codes
ShirasawaSama Mar 24, 2026
b92e8c8
Optimize DeformConv CPU kernel by removing mutex and heap allocations
ShirasawaSama Mar 24, 2026
a9e5cc7
Optimize CUDA DeformConv kernel with static mask branching and tuned …
ShirasawaSama Mar 24, 2026
6359225
CUDA DeformConv reduce 64 bit index pressure in im2col hot path
ShirasawaSama Mar 24, 2026
47ab139
Increase InlinedVector capacity in DeformConv for 7x7 kernels
ShirasawaSama Mar 25, 2026
f590281
Optimize DeformConv bias indexing with int32/int64 dispatch and clean…
ShirasawaSama Mar 25, 2026
b29da63
Optimize CUDA DeformConv bias add with 2D launch fast path and int32/…
ShirasawaSama Mar 25, 2026
2c5a52c
Optimize CUDA DeformConv by using 32-bit index arithmetic when safe a…
ShirasawaSama Mar 25, 2026
4579077
Refactor path indexing
ShirasawaSama Mar 25, 2026
cf21200
optimize deformconv bilinear sampling with interior fast path
ShirasawaSama Mar 25, 2026
97f2598
Rduce deformconv address math in dynamic im2col path
ShirasawaSama Mar 25, 2026
226d3ad
Tune deform conv im2col addressing and bilinear sampling
ShirasawaSama Mar 25, 2026
bdb90bc
Cuda deform conv replace 5x5 im2col launch specialization with 7x7
ShirasawaSama Mar 25, 2026
af3639e
Pick chunk size by min rounds then balanced ceil
ShirasawaSama Mar 25, 2026
f223c69
Fix CUDA DeformConv im2col mask stride unused-variable warning
ShirasawaSama Mar 25, 2026
9632a20
Document and tidy CUDA DeformConv
ShirasawaSama Mar 25, 2026
16e990c
Make deform conv bilinear sampling branchless with masked safe loads
ShirasawaSama Mar 25, 2026
e0558b4
Improve comments and code styles
ShirasawaSama Mar 26, 2026
d6ebfb9
Improve deform conv im2col load balance for offset_group=1
ShirasawaSama Mar 26, 2026
3831979
Harden DeformConv index-width guard and align mask test comment
ShirasawaSama Mar 26, 2026
bc4f9c8
Optimize BilinearInterpolate with one-sided bounds and float mask selp
ShirasawaSama Mar 26, 2026
2068b1a
Make deform_conv_attributes.h self-contained for numeric_limits
ShirasawaSama Mar 30, 2026
8c6ed74
Clarify bilinear index int32 safety comments
ShirasawaSama Mar 30, 2026
db3d449
Fix CeilDiv signed overflow in CUDA DeformConv chunk sizing
ShirasawaSama Apr 4, 2026
f9c1d8c
Rename offset_byte_offset to offset_elem_offset in CUDA DeformConv im…
ShirasawaSama Apr 4, 2026
1024172
Document heuristic threshold for DeformConv CUDA bias-add 2D launch path
ShirasawaSama Apr 4, 2026
6fb5f4f
Document CPU DeformConv sampling-plan tail invariants
ShirasawaSama Apr 4, 2026
394d676
Add test cases
ShirasawaSama Apr 4, 2026
8fea660
Add pointer restrict annotations to DeformConv CPU and CUDA
ShirasawaSama Apr 4, 2026
76dfdba
Fix DeformConv CUDA tail chunk col stride and add regression test
ShirasawaSama Apr 6, 2026
2574ee2
Document DeformConv aliasing assumptions for input and output buffers
ShirasawaSama Apr 6, 2026
cdb979c
Fix DeformConv CUDA grouped tail chunk col-buffer strides and add tai…
ShirasawaSama Apr 7, 2026
ad977c6
Clarify DeformConv CUDA tail-chunk stride comment for grouped GEMM
ShirasawaSama Apr 7, 2026
a767525
Reuse validated common dims for GetNParallelImgs to keep overflow che…
ShirasawaSama Apr 7, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
861 changes: 690 additions & 171 deletions onnxruntime/core/providers/cpu/nn/deform_conv.cc

Large diffs are not rendered by default.

44 changes: 40 additions & 4 deletions onnxruntime/core/providers/cpu/nn/deform_conv_attributes.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

#pragma once

#include <climits>
#include <limits>

#include "core/common/common.h"
#include "core/framework/op_kernel.h"
Expand Down Expand Up @@ -73,6 +73,42 @@ struct DeformConvParams {
bool use_mask{false}; // Whether optional mask input is provided
};

// Common derived dimensions used by both CPU and CUDA kernels.
struct DeformConvCommonDims {
int64_t kernel_size{0}; // kH * kW
int64_t output_image_size{0}; // out_h * out_w
int64_t input_image_size{0}; // H * W_in
int64_t kernel_dim{0}; // (C / group) * kernel_size
};

// Validates shared runtime bounds and computes common derived dimensions.
// This helper is backend-agnostic and intended to be reused by both CPU/CUDA
// after DeformConvValidateAndParse() succeeds.
inline Status DeformConvValidateAndComputeCommonDims(const DeformConvParams& params,
DeformConvCommonDims& dims) {
const int64_t int64_max = std::numeric_limits<int64_t>::max();
ORT_RETURN_IF_NOT(params.N > 0 && params.C > 0 && params.M > 0 &&
params.group > 0 && params.offset_group > 0 &&
Comment thread
ShirasawaSama marked this conversation as resolved.
params.kH > 0 && params.kW > 0 &&
params.H > 0 && params.W_in > 0 &&
params.out_h > 0 && params.out_w > 0,
"Invalid deform conv dimensions.");

ORT_RETURN_IF_NOT(params.kH <= int64_max / params.kW, "kernel_size overflows int64.");
dims.kernel_size = params.kH * params.kW;

ORT_RETURN_IF_NOT(params.out_h <= int64_max / params.out_w, "output_image_size overflows int64.");
dims.output_image_size = params.out_h * params.out_w;

ORT_RETURN_IF_NOT(params.H <= int64_max / params.W_in, "input_image_size overflows int64.");
dims.input_image_size = params.H * params.W_in;

ORT_RETURN_IF_NOT((params.C / params.group) <= int64_max / dims.kernel_size, "kernel_dim overflows int64.");
dims.kernel_dim = (params.C / params.group) * dims.kernel_size;

return Status::OK();
}

// Validates inputs and parses attributes into params.
// Returns Status::OK() on success; on failure, params may be partially filled.
inline Status DeformConvValidateAndParse(
Expand Down Expand Up @@ -159,10 +195,10 @@ inline Status DeformConvValidateAndParse(
params.out_w = (params.W_in + params.pad_w + params.pad_w_end - params.dilation_w * (params.kW - 1) - 1) / params.stride_w + 1;
ORT_RETURN_IF_NOT(params.out_h >= 0 && params.out_w >= 0, "Computed output spatial size must be non-negative.");

// CPU BilinearInterpolate uses int for indices (for performance optimization); W <= INT_MAX / (H+1) covers all index math.
// CPU BilinearInterpolate uses int for indices (for performance optimization); W <= int_max / (H+1) covers all index math.
ORT_RETURN_IF_NOT(params.H >= 0 && params.W_in >= 0, "Input spatial dimensions H and W must be non-negative.");
ORT_RETURN_IF_NOT(params.W_in <= static_cast<int64_t>(INT_MAX) / (params.H + 1),
"Input (H+1)*W must not exceed INT_MAX (for performance optimization).");
ORT_RETURN_IF_NOT(params.W_in <= static_cast<int64_t>(std::numeric_limits<int>::max()) / (params.H + 1),
"Input (H+1)*W must not exceed int max (for performance optimization).");

// Validate tensor shapes (use division to avoid int64 overflow in offset_group * 2 * kH * kW).
ORT_RETURN_IF_NOT(offset_shape[0] == params.N, "Offset batch size must match input batch size.");
Expand Down
155 changes: 81 additions & 74 deletions onnxruntime/core/providers/cuda/nn/deform_conv.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@
// Licensed under the MIT License.
//
// CUDA implementation of DeformConv (deformable convolution 2D).
// High-level pipeline matches CPU `nn/deform_conv.cc`: im2col then grouped GEMM then optional bias;
// this file hosts the EP and batch chunking; device kernels live in `deform_conv_impl.cu`.
//
// High-level pipeline (batch may be chunked for col_buffer memory; see GetNParallelImgs):
// (1) Deformable im2col per chunk: DeformConvIm2ColImpl launches GPU kernels that fill col_buffer
// (bilinear sampling + optional mask fused in threads; no separate sampling plan like CPU).
// (2) Grouped strided batched GEMM: Y = W * Col via cuBLAS (row-major vs column-major mapping in ComputeInternal).
// (3) Optional bias: add B[m] to each output channel map (DeformConvAddBiasImpl).
//
// Main difference vs CPU path: CPU builds an AoSoA bilinear plan once per image then reuses it across channels;
// CUDA recomputes bilinear samples in the im2col kernel while walking offset/mask tensors.

#include "core/providers/shared_library/provider_api.h"
#include "deform_conv.h"
Expand All @@ -21,31 +32,30 @@ namespace {

constexpr int kMaxParallelImgs = 32;

// Returns the greatest divisor of n that is <= bound. Used to choose uniform batch chunk sizes.
// Fast path: if n % bound == 0 (common for batch 32/64/128), return immediately.
// When n >= bound^2, linear scan from bound down is O(bound). Otherwise divisor enumeration
// from 1 to sqrt(n) is O(sqrt(n)). Uses integer comparison (no sqrt) for branch decision.
int GetGreatestDivisorBelowBound(int n, int bound) {
if (bound <= 0 || n <= 0) return 1;
if (n % bound == 0) return bound; // Fast path: batch is multiple of target

// n >= bound^2 <=> bound <= sqrt(n) => linear scan is cheaper
if (static_cast<int64_t>(n) >= static_cast<int64_t>(bound) * bound) {
for (int k = bound - 1; k > 1; --k) {
if (n % k == 0) return k;
}
} else {
// n < bound^2 <=> bound > sqrt(n) => divisor enumeration is cheaper
int best = 1;
for (int i = 1; static_cast<int64_t>(i) * i <= static_cast<int64_t>(n); ++i) {
if (n % i != 0) continue;
const int q = n / i;
if (q <= bound && q > best) best = q;
if (i <= bound && i > best) best = i;
}
return best;
}
return 1;
// ceil(numer / denom) for numer >= 0, denom > 0 (integer, no floating point).
// Avoid (numer + denom - 1) / denom: numer near INT_MAX overflows signed int (UB in C++).
inline int CeilDiv(int numer, int denom) {
return numer / denom + (numer % denom != 0 ? 1 : 0);
}

// Chooses DeformConv batch chunk size k (images per outer-loop iteration) given batch N and
// a hard cap T from temp-memory budget (target_parallel_imgs).
//
// Goals (in order):
// 1) Minimize the number of outer rounds I = ceil(N / k). Under k <= T, the minimum achievable
// I is I* = ceil(N / min(N, T)) — take the largest allowed step min(N, T), same as always
// using k = T when N > T, or one round when N <= T.
// 2) Among all k with ceil(N/k) == I*, pick k = ceil(N / I*) so chunk sizes are as balanced as
// possible (last chunk is only slightly smaller than full chunks). k need not divide N; choosing
// k = ceil(N / I*) instead of always k = T often shrinks col_buffer stride when a full-T last
// chunk would leave a much smaller tail.
//
// Closed form: k_cap = min(N, T), I = ceil(N / k_cap), return ceil(N / I).
inline int GetDeformConvParallelChunkSize(int N, int T) {
Comment thread
ShirasawaSama marked this conversation as resolved.
if (N <= 0 || T <= 0) return 1;
const int k_cap = std::min(N, T);
const int num_rounds = CeilDiv(N, k_cap);
return CeilDiv(N, num_rounds);
}

// Returns the maximum temp memory (bytes) allowed for DeformConv's im2col + GEMM buffers.
Expand Down Expand Up @@ -76,28 +86,25 @@ size_t GetDeformConvEffectiveMaxTempBytes(size_t total_global_mem) {
}

// Returns how many images to process in parallel per batch chunk for DeformConv.
// Chooses the largest divisor of batch size N that fits in the temp budget and does not
// exceed kMaxParallelImgs, so that batch dimension is split evenly (no remainder).
// Note: if N is prime and N > target_parallel_imgs, the greatest divisor <= target_parallel_imgs is 1,
// so batching is effectively disabled (single-image chunks).
//
// Temp budget → cap T (see below). Chunk size k = GetDeformConvParallelChunkSize(N, T): minimize
// outer-loop rounds first, then balance chunk sizes via ceil(N / ceil(N / min(N,T))).
// The host loop still uses cur_parallel = min(k, N - b), so k need not divide N.
//
// Formulas:
// kernel_size = kH * kW
// output_image_size = out_h * out_w
// bytes_per_image = output_image_size * (C * kernel_size + M / group) * sizeof(T)
// (temp bytes per image: im2col col buffer + GEMM output buffer per output position)
// kernel_size / output_image_size come from validated common dims
// bytes_per_image = output_image_size * C * kernel_size * sizeof(T)
// (temp bytes per image: im2col col buffer only; GEMM writes directly to Y)
// max_parallel_imgs_mem = max(1, floor(effective_max_temp / bytes_per_image))
// target_parallel_imgs = min(kMaxParallelImgs, max_parallel_imgs_mem)
// return GetGreatestDivisorBelowBound(N, target_parallel_imgs)
// target_parallel_imgs T = min(kMaxParallelImgs, max_parallel_imgs_mem)
// return GetDeformConvParallelChunkSize(N, T)
template <typename T>
int GetNParallelImgs(const DeformConvParams& params, size_t total_global_mem) {
int GetNParallelImgs(const DeformConvParams& params, int64_t kernel_size, int64_t output_image_size, size_t total_global_mem) {
const size_t effective_max_temp = GetDeformConvEffectiveMaxTempBytes(total_global_mem);
const int64_t kernel_size = params.kH * params.kW;
const int64_t output_image_size = params.out_h * params.out_w;
const size_t bytes_per_image = SafeInt<size_t>(output_image_size) * (params.C * kernel_size + params.M / params.group) * sizeof(T);
const size_t bytes_per_image = SafeInt<size_t>(output_image_size) * params.C * kernel_size * sizeof(T);
const int max_parallel_imgs_mem = std::max(1, static_cast<int>(effective_max_temp / std::max(size_t(1), bytes_per_image)));
const int target_parallel_imgs = std::min(kMaxParallelImgs, max_parallel_imgs_mem);
return GetGreatestDivisorBelowBound(static_cast<int>(params.N), target_parallel_imgs);
return GetDeformConvParallelChunkSize(narrow<int>(params.N), target_parallel_imgs);
}

} // namespace
Expand Down Expand Up @@ -146,21 +153,20 @@ Status DeformConv<T>::ComputeInternal(OpKernelContext* context) const {
return Status::OK();
}

const int n_parallel_imgs = GetNParallelImgs<T>(params, GetDeviceProp().totalGlobalMem);

const int64_t kernel_size = kH * kW;
const int64_t output_image_size = out_h * out_w;
const int64_t input_image_size = H * W_in;
const int64_t kernel_dim = (C / group) * kernel_size;
DeformConvCommonDims common_dims;
ORT_RETURN_IF_ERROR(DeformConvValidateAndComputeCommonDims(params, common_dims));
const int64_t kernel_size = common_dims.kernel_size;
const int64_t output_image_size = common_dims.output_image_size;
const int64_t input_image_size = common_dims.input_image_size;
const int64_t kernel_dim = common_dims.kernel_dim;
const int n_parallel_imgs = GetNParallelImgs<T>(params, kernel_size, output_image_size, GetDeviceProp().totalGlobalMem);

const int64_t col_stride = static_cast<int64_t>(n_parallel_imgs) * output_image_size;
const int64_t col_buffer_size = (C * kernel_size) * col_stride;

AllocatorPtr alloc;
ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&alloc));
auto col_buffer = IAllocator::MakeUniquePtr<T>(alloc, SafeInt<size_t>(col_buffer_size));
// Removed col_transposed allocation as we avoid physical transpose.
auto gemm_output_buffer = IAllocator::MakeUniquePtr<T>(alloc, SafeInt<size_t>((M / group) * col_stride));

const T* Xdata = X->Data<T>();
const T* Wdata = W->Data<T>();
Expand All @@ -180,6 +186,7 @@ Status DeformConv<T>::ComputeInternal(OpKernelContext* context) const {
const int64_t cur_out_size = static_cast<int64_t>(cur_parallel) * output_image_size;

const T* X_block = Xdata + b * (C * input_image_size);
// Stride per full image along N: offset [N, offset_group*2*kH*kW, OH, OW] -> offset_group * 2*kH*kW * OH*OW floats.
const T* offset_block = offset_data + b * (offset_group * 2 * kernel_size * output_image_size);
const T* mask_block = use_mask ? (mask_data + b * (offset_group * kernel_size * output_image_size)) : nullptr;

Expand Down Expand Up @@ -215,16 +222,18 @@ Status DeformConv<T>::ComputeInternal(OpKernelContext* context) const {
// - W (row [M/group, kernel_dim]) -> cuBLAS interprets as col-major [kernel_dim, M/group] = W^T
// - C = A*B = Col^T * W^T = (W*Col)^T = Y^T; C is col-major [cur_out_size, M/group] = Y in row-major
//
// m=cur_out_size, n=M/group, k=kernel_dim; lda=cur_out_size, ldb=kernel_dim, ldc=cur_out_size.
// Per batch image: m=output_image_size, n=M/group, k=kernel_dim; lda=cur_out_size, ldb=kernel_dim,
// ldc=output_image_size (row-major Y slice [M/group, OH*OW]).
//
// cur_parallel==1: cur_out_size==output_image_size, C layout (pos, channel) matches NCHW Y_g[0,ch,pos] -> write
// directly into Y_g. Use strided batched for all groups in one call.
// cur_parallel>1: layouts differ -> write to gemm_output_buffer, then DeformConvCopyGemmOutputRowMajorToNCHW.

const bool gemm_writes_directly = (cur_parallel == 1);
if (gemm_writes_directly) {
// Strided batched: one call for all groups. Strides between batches:
const int64_t stride_col = kernel_dim * col_stride; // = kernel_dim * output_image_size when cur_parallel==1
// cur_parallel==1: one strided-batched GEMM over all groups (single launch).
// cur_parallel>1: per group, strided-batched GEMM with batch_count=cur_parallel; each batch writes one image
// directly into NCHW Y (strideC = M * output_image_size), avoiding a temp buffer + scatter kernel.

if (cur_parallel == 1) {
// col_buffer is packed per iteration with the current chunk width (cur_out_size).
// Using outer-scope col_stride (based on n_parallel_imgs) breaks tail chunks where
// cur_out_size != col_stride (including one-image tails) when group > 1.
const int64_t stride_col = kernel_dim * cur_out_size;
const int64_t stride_weight = (M / group) * kernel_dim;
const int64_t stride_y = (M / group) * output_image_size;
CUBLAS_RETURN_IF_ERROR(cublasGemmStridedBatchedHelper(
Expand All @@ -249,44 +258,42 @@ Status DeformConv<T>::ComputeInternal(OpKernelContext* context) const {
device_prop,
UseTF32()));
} else {
// cur_parallel>1: GEMM output layout differs from NCHW; write to buffer then copy per group.
const int64_t stride_a_col = output_image_size;
const int64_t stride_b = 0;
const int64_t stride_c_y = M * output_image_size;
for (int64_t g = 0; g < group; ++g) {
const T* W_g = Wdata + g * (M / group) * kernel_dim;
const T* col_g = col_buffer.get() + g * kernel_dim * col_stride;
const T* col_g = col_buffer.get() + g * kernel_dim * cur_out_size;
T* Y_g = Ydata + b * M * output_image_size + g * (M / group) * output_image_size;

CUBLAS_RETURN_IF_ERROR((cublasGemmHelper(
CUBLAS_RETURN_IF_ERROR(cublasGemmStridedBatchedHelper(
cublas,
CUBLAS_OP_N,
CUBLAS_OP_N,
narrow<int>(cur_out_size),
narrow<int>(output_image_size),
narrow<int>(M / group),
narrow<int>(kernel_dim),
&alpha,
reinterpret_cast<const CudaT*>(col_g),
narrow<int>(cur_out_size),
stride_a_col,
reinterpret_cast<const CudaT*>(W_g),
narrow<int>(kernel_dim),
stride_b,
&beta,
reinterpret_cast<CudaT*>(gemm_output_buffer.get()),
narrow<int>(cur_out_size),
reinterpret_cast<CudaT*>(Y_g),
narrow<int>(output_image_size),
stride_c_y,
narrow<int>(cur_parallel),
device_prop,
UseTF32())));

ORT_RETURN_IF_ERROR(DeformConvCopyGemmOutputRowMajorToNCHW<T>(
stream,
gemm_output_buffer.get(),
Y_g,
M,
M / group,
output_image_size,
cur_parallel));
UseTF32()));
}
}
}

if (Bdata != nullptr) {
ORT_RETURN_IF_ERROR(DeformConvAddBiasImpl<T>(stream, Ydata, Bdata, N, M, out_h, out_w));
ORT_RETURN_IF_ERROR(DeformConvAddBiasImpl<T>(stream, Ydata, Bdata, N, M, out_h, out_w,
static_cast<int64_t>(device_prop.maxGridSize[1])));
}

return Status::OK();
Expand Down
Loading
Loading