8686#include " matx.h"
8787#include < cuda/std/complex>
8888#include < cuda/cmath>
89+ #include < algorithm>
90+ #include < charconv>
8991#include < cstdio>
9092#include < cstdlib>
9193#include < cstring>
@@ -100,6 +102,110 @@ using namespace matx;
100102
101103using complex_t = cuda::std::complex <float >;
102104
105+ // Use up to this fraction of L2 for range profiles and phase lookup table
106+ static constexpr double SARBP_AUTO_L2_TARGET_MULTIPLIER = 0.8 ;
107+ static constexpr index_t SARBP_AUTO_BLOCK_GRANULARITY = 256 ;
108+ static constexpr index_t SARBP_AUTO_MIN_BLOCK_SIZE = 256 ;
109+
110+ enum class BlockSizeMode {
111+ Auto,
112+ All,
113+ Manual
114+ };
115+
116+ struct BlockSizeSelection {
117+ BlockSizeMode mode{BlockSizeMode::Auto};
118+ index_t manual_size{0 };
119+ };
120+
121+ static bool parse_index_arg (const std::string &arg, index_t &value, index_t min_value)
122+ {
123+ index_t parsed{};
124+ const auto *begin = arg.data ();
125+ const auto *end = arg.data () + arg.size ();
126+ const auto [ptr, ec] = std::from_chars (begin, end, parsed);
127+ if (arg.empty () || ec != std::errc{} || ptr != end || parsed < min_value) {
128+ return false ;
129+ }
130+
131+ value = parsed;
132+ return true ;
133+ }
134+
135+ static bool parse_block_size_arg (const std::string &arg, BlockSizeSelection &selection)
136+ {
137+ if (arg == " auto" ) {
138+ selection = BlockSizeSelection{BlockSizeMode::Auto, 0 };
139+ return true ;
140+ }
141+ if (arg == " all" ) {
142+ selection = BlockSizeSelection{BlockSizeMode::All, 0 };
143+ return true ;
144+ }
145+
146+ index_t parsed{};
147+ if (!parse_index_arg (arg, parsed, 0 )) {
148+ return false ;
149+ }
150+
151+ if (parsed == 0 ) {
152+ // Preserve the old "-b 0" behavior as an alias for all pulses.
153+ selection = BlockSizeSelection{BlockSizeMode::All, 0 };
154+ } else {
155+ selection = BlockSizeSelection{BlockSizeMode::Manual, parsed};
156+ }
157+ return true ;
158+ }
159+
160+ static index_t round_down_to_multiple (index_t value, index_t multiple)
161+ {
162+ if (multiple <= 1 ) {
163+ return value;
164+ }
165+ return (value / multiple) * multiple;
166+ }
167+
168+ static size_t get_phase_lut_bytes (index_t output_range_bins, const SarBpParams ¶ms)
169+ {
170+ if (!has_feature (params.features , SarBpFeature::PhaseLUTOptimization)) {
171+ return 0 ;
172+ }
173+
174+ const size_t elem_size = (params.compute_type == SarBpComputeType::Double)
175+ ? sizeof (cuda::std::complex <double >)
176+ : sizeof (cuda::std::complex <float >);
177+ return static_cast <size_t >(output_range_bins) * elem_size;
178+ }
179+
180+ static index_t choose_auto_block_size (index_t num_pulses, index_t output_range_bins,
181+ const SarBpParams ¶ms,
182+ const cudaDeviceProp &device_prop)
183+ {
184+ const size_t profile_bytes_per_pulse =
185+ static_cast <size_t >(output_range_bins) * sizeof (complex_t );
186+ if (num_pulses <= 0 || profile_bytes_per_pulse == 0 ||
187+ device_prop.l2CacheSize <= 0 ) {
188+ return num_pulses;
189+ }
190+
191+ const size_t phase_lut_bytes = get_phase_lut_bytes (output_range_bins, params);
192+ const double l2_target_bytes =
193+ static_cast <double >(device_prop.l2CacheSize ) * SARBP_AUTO_L2_TARGET_MULTIPLIER;
194+ double profile_budget_bytes = l2_target_bytes - static_cast <double >(phase_lut_bytes);
195+ const double min_profile_budget =
196+ static_cast <double >(profile_bytes_per_pulse) *
197+ static_cast <double >(SARBP_AUTO_MIN_BLOCK_SIZE);
198+ if (profile_budget_bytes < min_profile_budget) {
199+ profile_budget_bytes = min_profile_budget;
200+ }
201+
202+ index_t block_size =
203+ static_cast <index_t >(profile_budget_bytes / static_cast <double >(profile_bytes_per_pulse));
204+ block_size = round_down_to_multiple (block_size, SARBP_AUTO_BLOCK_GRANULARITY);
205+ block_size = std::max (block_size, SARBP_AUTO_MIN_BLOCK_SIZE);
206+ return std::min (block_size, num_pulses);
207+ }
208+
103209// Aggregate of non-tensor state needed by run_bp_device(). Kept separate from
104210// the tensor and host-buffer parameters because most members are scalar
105211// configuration values whose call-site reads cleanly as a brace-initializer.
@@ -116,6 +222,7 @@ struct BpRunCtx {
116222 index_t ifft_shift;
117223 index_t image_width;
118224 index_t image_height;
225+ index_t image_tiles;
119226 bool is_fx_domain;
120227 bool is_int16_mode;
121228 bool apply_window;
@@ -199,6 +306,23 @@ static int run_bp_device(PosTensor blk_positions, RtmTensor blk_rtm,
199306 auto image = make_tensor<complex_t >({ctx.image_height , ctx.image_width }, matx::MATX_DEVICE_MEMORY);
200307 (image = matx::zeros<complex_t >({ctx.image_height , ctx.image_width })).run (ctx.exec );
201308
309+ auto run_bp_tiles = [&](auto &cur_profiles, auto &cur_positions, auto &cur_rtm) {
310+ for (index_t tile_y = 0 ; tile_y < ctx.image_tiles ; tile_y++) {
311+ const index_t y0 = (ctx.image_height * tile_y) / ctx.image_tiles ;
312+ const index_t y1 = (ctx.image_height * (tile_y + 1 )) / ctx.image_tiles ;
313+ for (index_t tile_x = 0 ; tile_x < ctx.image_tiles ; tile_x++) {
314+ const index_t x0 = (ctx.image_width * tile_x) / ctx.image_tiles ;
315+ const index_t x1 = (ctx.image_width * (tile_x + 1 )) / ctx.image_tiles ;
316+
317+ auto cur_image = matx::slice (image, {y0, x0}, {y1, x1});
318+ auto cur_voxel_locations = matx::slice (voxel_locations, {y0, x0}, {y1, x1});
319+ (cur_image = matx::experimental::sar_bp (
320+ cur_image, cur_profiles, cur_positions, cur_voxel_locations, cur_rtm, ctx.params ))
321+ .run (ctx.exec );
322+ }
323+ }
324+ };
325+
202326 // Warmup: run kernels with correct tensor sizes to initialize FFT plans,
203327 // load kernels, etc. so that the timed run reflects steady-state performance.
204328 if (ctx.do_warmup ) {
@@ -246,10 +370,7 @@ static int run_bp_device(PosTensor blk_positions, RtmTensor blk_rtm,
246370 (cur_profiles = matx::fftshift1D (cur_compressed)).run (ctx.exec );
247371 }
248372
249- (image = matx::experimental::sar_bp (
250- matx::zeros<complex_t >({ctx.image_height , ctx.image_width }),
251- cur_profiles, cur_positions, voxel_locations, cur_rtm, ctx.params ))
252- .run (ctx.exec );
373+ run_bp_tiles (cur_profiles, cur_positions, cur_rtm);
253374 };
254375
255376 // Warmup with primary block size
@@ -404,10 +525,7 @@ static int run_bp_device(PosTensor blk_positions, RtmTensor blk_rtm,
404525
405526 // Backprojection - accumulates this block's pulses into image
406527 MATX_CUDA_CHECK (cudaEventRecord (ev_bp_start[blk], ctx.stream ));
407- (image = matx::experimental::sar_bp (image, cur_profiles,
408- cur_positions, voxel_locations,
409- cur_rtm, ctx.params ))
410- .run (ctx.exec );
528+ run_bp_tiles (cur_profiles, cur_positions, cur_rtm);
411529 MATX_CUDA_CHECK (cudaEventRecord (ev_bp_stop[blk], ctx.stream ));
412530
413531 if (ctx.num_blocks > 1 ) {
@@ -553,7 +671,9 @@ int main(int argc, char **argv) {
553671 << " -o, --output <file> Output file (default: input path with .raw extension)\n "
554672 << " -u, --upsample <N> Range upsample factor via zero-padding (default: 1)\n "
555673 << " -w, --window <type> Window for range compression: hamming, none (default: hamming)\n "
556- << " -b, --block-size <N> Pulses per block for reduced GPU memory (default: all)\n "
674+ << " -b, --block-size <N|0|auto|all>\n "
675+ << " Pulses per block; 0/all use all pulses, auto uses an L2-cache heuristic (default: auto)\n "
676+ << " --image-tiles <N> Process image as N x N tiles (default: 1)\n "
557677 << " --warmup Warmup GPU kernels and FFT plans before timed run\n "
558678 << " --precision <type> Compute precision: double, float, fltflt, mixed (default: mixed)\n "
559679 << " -h, --help Print this help message and exit\n " ;
@@ -568,7 +688,8 @@ int main(int argc, char **argv) {
568688 std::string output_file;
569689 int upsample_factor = 1 ;
570690 std::string window_type = " hamming" ;
571- int block_size_arg = 0 ; // 0 = all pulses in one block
691+ std::string block_size_arg = " auto" ;
692+ index_t image_tiles = 1 ;
572693 bool do_warmup = false ;
573694 std::string precision_type = " mixed" ;
574695
@@ -596,7 +717,15 @@ int main(int argc, char **argv) {
596717 output_file = argv[++i];
597718 } else if (std::strcmp (argv[i], " -b" ) == 0 || std::strcmp (argv[i], " --block-size" ) == 0 ) {
598719 if (!needs_value (i)) return 1 ;
599- block_size_arg = std::atoi (argv[++i]);
720+ block_size_arg = argv[++i];
721+ } else if (std::strcmp (argv[i], " --image-tiles" ) == 0 ) {
722+ if (!needs_value (i)) return 1 ;
723+ if (!parse_index_arg (argv[++i], image_tiles, 1 )) {
724+ std::cerr << " ERROR: invalid image tile count '" << argv[i]
725+ << " ' (use a positive integer)" << std::endl;
726+ print_usage ();
727+ return 1 ;
728+ }
600729 } else if (std::strcmp (argv[i], " --warmup" ) == 0 ) {
601730 do_warmup = true ;
602731 } else if (std::strcmp (argv[i], " --precision" ) == 0 ) {
@@ -623,6 +752,14 @@ int main(int argc, char **argv) {
623752 return 1 ;
624753 }
625754
755+ BlockSizeSelection block_size_selection;
756+ if (!parse_block_size_arg (block_size_arg, block_size_selection)) {
757+ std::cerr << " ERROR: invalid block size '" << block_size_arg
758+ << " ' (use a positive integer, 0, auto, or all)" << std::endl;
759+ print_usage ();
760+ return 1 ;
761+ }
762+
626763 if (output_file.empty ()) {
627764 auto dot = input_file.rfind (' .' );
628765 output_file = (dot != std::string::npos ? input_file.substr (0 , dot) : input_file) + " .raw" ;
@@ -647,6 +784,13 @@ int main(int argc, char **argv) {
647784 const bool is_int16_mode = (hdr.flags & 0x2 ) != 0 ;
648785 const int sgn = hdr.sgn ;
649786
787+ if (image_tiles > image_width || image_tiles > image_height) {
788+ std::cerr << " ERROR: --image-tiles " << image_tiles
789+ << " exceeds image dimensions " << image_height << " x "
790+ << image_width << std::endl;
791+ return 1 ;
792+ }
793+
650794 std::cout << " Input file : " << input_file << std::endl;
651795 std::cout << " Pulses : " << num_pulses << std::endl;
652796 std::cout << " Sample format : " << (is_int16_mode ? " int16" : " complex64" ) << std::endl;
@@ -853,14 +997,46 @@ int main(int argc, char **argv) {
853997 // -------------------------------------------------------------------
854998 // Block processing: range compression (if FX) + backprojection
855999 // -------------------------------------------------------------------
856- const index_t block_size = (block_size_arg > 0 )
857- ? std::min (static_cast <index_t >(block_size_arg), num_pulses)
858- : num_pulses;
1000+ size_t l2_cache_bytes = 0 ;
1001+ const size_t profile_bytes_per_pulse =
1002+ static_cast <size_t >(output_range_bins) * sizeof (complex_t );
1003+ const size_t phase_lut_bytes = get_phase_lut_bytes (output_range_bins, params);
1004+
1005+ index_t block_size = num_pulses;
1006+ if (block_size_selection.mode == BlockSizeMode::Auto) {
1007+ int device = 0 ;
1008+ cudaDeviceProp device_prop{};
1009+ MATX_CUDA_CHECK (cudaGetDevice (&device));
1010+ MATX_CUDA_CHECK (cudaGetDeviceProperties (&device_prop, device));
1011+ l2_cache_bytes = static_cast <size_t >(device_prop.l2CacheSize );
1012+ block_size = choose_auto_block_size (num_pulses, output_range_bins, params, device_prop);
1013+ } else if (block_size_selection.mode == BlockSizeMode::Manual) {
1014+ block_size = std::min (block_size_selection.manual_size , num_pulses);
1015+ }
8591016 const index_t num_blocks = (num_pulses + block_size - 1 ) / block_size;
8601017
861- std::cout << " Block size : " << block_size << " pulses (" << num_blocks
862- << " block" << (num_blocks > 1 ? " s" : " " ) << " )" << std::endl;
1018+ std::cout << " Block size : " << block_size << " pulses " ;
1019+ if (block_size_selection.mode == BlockSizeMode::Auto) {
1020+ std::cout << " (auto, " ;
1021+ } else if (block_size_selection.mode == BlockSizeMode::All) {
1022+ std::cout << " (all, " ;
1023+ } else {
1024+ std::cout << " (manual, " ;
1025+ }
1026+ std::cout << num_blocks << " block" << (num_blocks > 1 ? " s" : " " ) << " )" << std::endl;
1027+ if (block_size_selection.mode == BlockSizeMode::Auto) {
1028+ std::cout << " Auto heuristic : L2 "
1029+ << static_cast <double >(l2_cache_bytes) / (1024.0 * 1024.0 )
1030+ << " MiB, target " << SARBP_AUTO_L2_TARGET_MULTIPLIER
1031+ << " x L2, profiles "
1032+ << static_cast <double >(profile_bytes_per_pulse) / 1024.0
1033+ << " KiB/pulse, phase LUT "
1034+ << static_cast <double >(phase_lut_bytes) / (1024.0 * 1024.0 )
1035+ << " MiB" << std::endl;
1036+ }
8631037 std::cout << " BP precision : " << precision_type << std::endl;
1038+ std::cout << " Image tiles : " << image_tiles << " x " << image_tiles
1039+ << std::endl;
8641040
8651041 cudaStream_t stream;
8661042 MATX_CUDA_CHECK (cudaStreamCreate (&stream));
@@ -893,6 +1069,7 @@ int main(int argc, char **argv) {
8931069 .ifft_shift = ifft_shift,
8941070 .image_width = image_width,
8951071 .image_height = image_height,
1072+ .image_tiles = image_tiles,
8961073 .is_fx_domain = is_fx_domain,
8971074 .is_int16_mode = is_int16_mode,
8981075 .apply_window = apply_window,
0 commit comments