Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 158 additions & 42 deletions examples/channelize_poly_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cassert>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <memory>
#include <fstream>
#include <istream>
Expand All @@ -51,8 +52,17 @@ constexpr int NUM_WARMUP_ITERATIONS = 2;
// Number of iterations per timed test. Iteration times are averaged in the report.
constexpr int NUM_ITERATIONS = 20;

template <typename InType, typename OutType>
void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop)
// Returns a human-readable name for the element types supported by this
// benchmark, used in the report header. Unrecognized types map to "unknown".
template <typename T>
const char *TypeName() {
  if constexpr (std::is_same_v<T, float>) return "float";
  else if constexpr (std::is_same_v<T, double>) return "double";
  else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) return "complex<float>";
  else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) return "complex<double>";
  else return "unknown";
}

// Benchmarks matx::channelize_poly for a fixed channel count and decimation
// factor across a table of (num_batches, filter_len_per_channel, input_len)
// test cases, printing one averaged-timing report line per case.
//
// InType/FilterType are the input/filter element types; OutType is the
// output element type. Cases whose input is shorter than 100 samples per
// channel are skipped as too small to time meaningfully.
template <typename InType, typename OutType, typename FilterType>
void ChannelizePolyBench(matx::index_t num_channels, matx::index_t decimation_factor)
{
  struct {
    matx::index_t num_batches;
    matx::index_t filter_len_per_channel;
    matx::index_t input_len;
  } test_cases[] = {
    { 1, 17, 256000 },
    { 42, 17, 256000 },
    { 128, 17, 256000 },
    { 1, 17, 8192*1024 },
    { 42, 17, 8192*1024 }
  };

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  // NOTE(review): exec is default-constructed while the timing events are
  // recorded on `stream` — confirm the executor and the event stream agree,
  // otherwise the measured interval may not bracket the benchmark kernels.
  cudaExecutor exec{};

  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
    const matx::index_t num_batches = test_cases[i].num_batches;
    const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels;
    const matx::index_t input_len = test_cases[i].input_len;
    // Ceil-divide: each channel gets every decimation_factor-th output sample.
    const matx::index_t output_len_per_channel = (input_len + decimation_factor - 1) / decimation_factor;

    // Skip cases with fewer than 100 samples per channel; they are too short
    // to produce a meaningful timing measurement.
    if (input_len < num_channels * 100) {
      continue;
    }

    auto input = matx::make_tensor<InType, 2>({num_batches, input_len});
    auto filter = matx::make_tensor<FilterType, 1>({filter_len});
    auto output = matx::make_tensor<OutType, 3>({num_batches, output_len_per_channel, num_channels});
    (input = static_cast<InType>(1)).run(exec);
    (filter = static_cast<FilterType>(1)).run(exec);

    // Warm up to exclude one-time costs (JIT, allocator, cache) from timing.
    for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) {
      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
    }

    exec.sync();

    float elapsed_ms = 0.0f;
    cudaEventRecord(start, stream);
    for (int k = 0; k < NUM_ITERATIONS; k++) {
      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
    }
    cudaEventRecord(stop, stream);
    exec.sync();
    MATX_CUDA_CHECK_LAST_ERROR();
    cudaEventElapsedTime(&elapsed_ms, start, stop);

    const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
    printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " Decimation: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT
      " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n",
      num_batches, num_channels, decimation_factor, filter_len, input_len, avg_elapsed_us,
      static_cast<double>(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6));
  }

  MATX_CUDA_CHECK_LAST_ERROR();

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
}

int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// Scalar precision of a benchmark element type.
enum class Precision { Float, Double };
// Whether a benchmark element type is real- or complex-valued.
enum class Domain { Real, Complex };

// Command-line-selectable benchmark configuration. Defaults: complex<float>
// input, real float filter, M = 10 channels, D resolved to M at startup.
struct BenchConfig {
  Precision input_prec = Precision::Float;   // input sample precision
  Domain input_domain = Domain::Complex;     // input real/complex selection
  Precision filter_prec = Precision::Float;  // filter tap precision
  Domain filter_domain = Domain::Real;       // filter real/complex selection
  matx::index_t M = 10; // number of channels
  matx::index_t D = -1; // decimation factor (-1 means D = M)
};

// Writes the benchmark's command-line help text to stdout.
// prog is the executable name to show in the usage line (typically argv[0]).
void PrintUsage(const char *prog) {
  printf("Usage: %s [options]\n", prog);
  fputs(" --input-type <type> Input type: float, double, cf, cd (default: cf)\n", stdout);
  fputs(" --filter-type <type> Filter type: float, double, cf, cd (default: float)\n", stdout);
  fputs(" -M <N> Number of channels (default: 10)\n", stdout);
  fputs(" -D <N> Decimation factor, 0 < D <= M (default: M)\n", stdout);
  fputs("\n", stdout);
  fputs("Type shorthands: float, double, cf (complex<float>), cd (complex<double>)\n", stdout);
}

// Translates a command-line type shorthand ("float", "double", "cf", "cd")
// into a (Precision, Domain) pair written through prec/dom. Returns false
// for an unrecognized string, leaving prec and dom untouched.
bool ParseType(const char *s, Precision &prec, Domain &dom) {
  struct Mapping { const char *key; Precision p; Domain d; };
  static const Mapping kMappings[] = {
    { "float",  Precision::Float,  Domain::Real    },
    { "double", Precision::Double, Domain::Real    },
    { "cf",     Precision::Float,  Domain::Complex },
    { "cd",     Precision::Double, Domain::Complex },
  };
  for (const Mapping &m : kMappings) {
    if (strcmp(s, m.key) == 0) {
      prec = m.p;
      dom = m.d;
      return true;
    }
  }
  return false;
}

// Maps a possibly-complex element type to its underlying scalar type:
// ScalarType<T>::type is T for real T, and the specialization below
// unwraps cuda::std::complex<T> to T.
template <typename T>
struct ScalarType { using type = T; };
// Partial specialization: strip the complex wrapper.
template <typename T>
struct ScalarType<cuda::std::complex<T>> { using type = T; };

// Prints the resolved type configuration and channel/decimation parameters,
// then runs the benchmark. The output element type is always complex with
// the input's scalar precision.
template <typename InType, typename FilterType>
void DispatchBench(const BenchConfig &cfg) {
  using scalar_t = typename ScalarType<InType>::type;
  using OutType = cuda::std::complex<scalar_t>;

  const char *in_name   = TypeName<InType>();
  const char *filt_name = TypeName<FilterType>();
  const char *out_name  = TypeName<OutType>();
  printf("Input: %-16s Filter: %-16s Output: %-16s\n", in_name, filt_name, out_name);
  printf("M: %" MATX_INDEX_T_FMT " D: %" MATX_INDEX_T_FMT "\n\n", cfg.M, cfg.D);

  ChannelizePolyBench<InType, OutType, FilterType>(cfg.M, cfg.D);
}

// Maps the runtime (precision, domain) selections in cfg onto concrete C++
// element types and invokes DispatchBench with the matching template
// arguments. The filter type is resolved first, then the input type; since
// each enum has exactly two values, the mapping is exhaustive.
void RunBench(const BenchConfig &cfg) {
  auto with_filter = [&](auto filt_tag) {
    auto launch = [&](auto in_tag) {
      DispatchBench<decltype(in_tag), decltype(filt_tag)>(cfg);
    };
    const bool real_input = (cfg.input_domain == Domain::Real);
    if (cfg.input_prec == Precision::Float) {
      if (real_input) launch(float{});
      else            launch(cuda::std::complex<float>{});
    } else {
      if (real_input) launch(double{});
      else            launch(cuda::std::complex<double>{});
    }
  };

  const bool real_filter = (cfg.filter_domain == Domain::Real);
  if (cfg.filter_prec == Precision::Float) {
    if (real_filter) with_filter(float{});
    else             with_filter(cuda::std::complex<float>{});
  } else {
    if (real_filter) with_filter(double{});
    else             with_filter(cuda::std::complex<double>{});
  }
}

int main(int argc, char **argv)
{
MATX_ENTER_HANDLER();

const matx::index_t channel_start = 3;
const matx::index_t channel_stop = 10;
BenchConfig cfg;

// printf("Benchmarking float -> complex<float>\n");
// ChannelizePolyBench<float,cuda::std::complex<float>>(channel_start, channel_stop);
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
PrintUsage(argv[0]);
return 0;
} else if (strcmp(argv[i], "--input-type") == 0 && i + 1 < argc) {
if (!ParseType(argv[++i], cfg.input_prec, cfg.input_domain)) {
fprintf(stderr, "Unknown input type: %s\n", argv[i]);
return 1;
}
} else if (strcmp(argv[i], "--filter-type") == 0 && i + 1 < argc) {
if (!ParseType(argv[++i], cfg.filter_prec, cfg.filter_domain)) {
fprintf(stderr, "Unknown filter type: %s\n", argv[i]);
return 1;
}
} else if (strcmp(argv[i], "-M") == 0 && i + 1 < argc) {
cfg.M = static_cast<matx::index_t>(atol(argv[++i]));
} else if (strcmp(argv[i], "-D") == 0 && i + 1 < argc) {
cfg.D = static_cast<matx::index_t>(atol(argv[++i]));
} else {
fprintf(stderr, "Unknown option: %s\n", argv[i]);
PrintUsage(argv[0]);
return 1;
}
}

printf("Benchmarking complex<float> -> complex<float>\n");
ChannelizePolyBench<cuda::std::complex<float>,cuda::std::complex<float>>(channel_start, channel_stop);
// Default D to M (maximally decimated) if not specified
if (cfg.D <= 0) {
cfg.D = cfg.M;
}

if (cfg.D <= 0 || cfg.D > cfg.M) {
fprintf(stderr, "Error: decimation factor D must satisfy 0 < D <= M (got M=%" MATX_INDEX_T_FMT ", D=%" MATX_INDEX_T_FMT ")\n",
cfg.M, cfg.D);
return 1;
}

// printf("Benchmarking double -> complex<double>\n");
// ChannelizePolyBench<double,cuda::std::complex<double>>(channel_start, channel_stop);
if (cfg.M < 2) {
fprintf(stderr, "Error: number of channels M must be >= 2 (got M=%" MATX_INDEX_T_FMT ")\n", cfg.M);
return 1;
}

// printf("Benchmarking complex<double> -> complex<double>\n");
// ChannelizePolyBench<cuda::std::complex<double>,cuda::std::complex<double>>(channel_start, channel_stop);
RunBench(cfg);

matx::ClearCachesAndAllocations();

Expand Down
Loading