diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu
index 49d20b1e..786ed5bd 100644
--- a/examples/channelize_poly_bench.cu
+++ b/examples/channelize_poly_bench.cu
@@ -34,6 +34,7 @@
 #include <cassert>
 #include <cstdio>
 #include <cmath>
+#include <cstring>
 #include <memory>
 #include <fstream>
 #include <istream>
@@ -51,8 +52,17 @@ constexpr int NUM_WARMUP_ITERATIONS = 2;
 // Number of iterations per timed test. Iteration times are averaged in the report.
 constexpr int NUM_ITERATIONS = 20;
 
-template <typename InType, typename OutType>
-void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop)
+template <typename T>
+const char *TypeName() {
+  if constexpr (std::is_same_v<T, float>) return "float";
+  else if constexpr (std::is_same_v<T, double>) return "double";
+  else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) return "complex<float>";
+  else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) return "complex<double>";
+  else return "unknown";
+}
+
+template <typename InType, typename OutType, typename FilterType>
+void ChannelizePolyBench(matx::index_t num_channels, matx::index_t decimation_factor)
 {
   struct {
     matx::index_t num_batches;
@@ -65,6 +75,8 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
     { 1, 17, 256000 },
     { 42, 17, 256000 },
     { 128, 17, 256000 },
+    { 1, 17, 8192*1024 },
+    { 42, 17, 8192*1024 }
   };
 
   cudaStream_t stream;
@@ -76,41 +88,42 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
   cudaExecutor exec{};
 
   for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
-    for (matx::index_t num_channels = channel_start; num_channels <= channel_stop; num_channels++) {
-      const matx::index_t num_batches = test_cases[i].num_batches;
-      const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels;
-      const matx::index_t input_len = test_cases[i].input_len;
-      const matx::index_t output_len_per_channel = (input_len + num_channels - 1) / num_channels;
+    const matx::index_t num_batches = test_cases[i].num_batches;
+    const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels;
+    const matx::index_t input_len = test_cases[i].input_len;
+    const matx::index_t output_len_per_channel = (input_len + decimation_factor - 1) / decimation_factor;
 
-      auto input = matx::make_tensor<InType, 2>({num_batches, input_len});
-      auto filter = matx::make_tensor<InType, 1>({filter_len});
-      auto output = matx::make_tensor<OutType, 3>({num_batches, output_len_per_channel, num_channels});
+    if (input_len < num_channels * 100) {
+      continue;
+    }
 
-      const matx::index_t decimation_factor = num_channels;
+    auto input = matx::make_tensor<InType, 2>({num_batches, input_len});
+    auto filter = matx::make_tensor<FilterType, 1>({filter_len});
+    auto output = matx::make_tensor<OutType, 3>({num_batches, output_len_per_channel, num_channels});
+    (input = static_cast<InType>(1)).run(exec);
+    (filter = static_cast<FilterType>(1)).run(exec);
 
-      for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) {
-        (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
-      }
+    for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) {
+      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
+    }
 
-      exec.sync();
+    exec.sync();
 
-      float elapsed_ms = 0.0f;
-      cudaEventRecord(start, stream);
-      for (int k = 0; k < NUM_ITERATIONS; k++) {
-        (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
-      }
-      cudaEventRecord(stop, stream);
-      exec.sync();
-      MATX_CUDA_CHECK_LAST_ERROR();
-      cudaEventElapsedTime(&elapsed_ms, start, stop);
-
-      const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
-      printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT
-        " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n",
-        num_batches, num_channels, filter_len, input_len, avg_elapsed_us,
-        static_cast<double>(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6));
+    float elapsed_ms = 0.0f;
+    cudaEventRecord(start, stream);
+    for (int k = 0; k < NUM_ITERATIONS; k++) {
+      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
     }
-    printf("\n");
+    cudaEventRecord(stop, stream);
+    exec.sync();
+    MATX_CUDA_CHECK_LAST_ERROR();
+    cudaEventElapsedTime(&elapsed_ms, start, stop);
+
+    const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
+    printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " Decimation: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT
+      " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n",
+      num_batches, num_channels, decimation_factor, filter_len, input_len, avg_elapsed_us,
+      static_cast<double>(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6));
   }
 
   MATX_CUDA_CHECK_LAST_ERROR();
@@ -120,24 +133,127 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop
   cudaStreamDestroy(stream);
 }
 
-int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
+enum class Precision { Float, Double };
+enum class Domain { Real, Complex };
+
+struct BenchConfig {
+  Precision input_prec   = Precision::Float;
+  Domain    input_domain = Domain::Complex;
+  Precision filter_prec  = Precision::Float;
+  Domain    filter_domain = Domain::Real;
+  matx::index_t M = 10;   // number of channels
+  matx::index_t D = -1;   // decimation factor (-1 means D = M)
+};
+
+void PrintUsage(const char *prog) {
+  printf("Usage: %s [options]\n", prog);
+  printf("  --input-type   <type>   Input type: float, double, cf, cd (default: cf)\n");
+  printf("  --filter-type  <type>   Filter type: float, double, cf, cd (default: float)\n");
+  printf("  -M <N>                  Number of channels (default: 10)\n");
+  printf("  -D <N>                  Decimation factor, 0 < D <= M (default: M)\n");
+  printf("\n");
+  printf("Type shorthands: float, double, cf (complex<float>), cd (complex<double>)\n");
+}
+
+bool ParseType(const char *s, Precision &prec, Domain &dom) {
+  if (strcmp(s, "float") == 0)       { prec = Precision::Float;  dom = Domain::Real;    return true; }
+  if (strcmp(s, "double") == 0)      { prec = Precision::Double; dom = Domain::Real;    return true; }
+  if (strcmp(s, "cf") == 0)          { prec = Precision::Float;  dom = Domain::Complex; return true; }
+  if (strcmp(s, "cd") == 0)          { prec = Precision::Double; dom = Domain::Complex; return true; }
+  return false;
+}
+
+template <typename T>
+struct ScalarType { using type = T; };
+template <typename T>
+struct ScalarType<cuda::std::complex<T>> { using type = T; };
+
+template <typename InType, typename FilterType>
+void DispatchBench(const BenchConfig &cfg) {
+  using in_scalar = typename ScalarType<InType>::type;
+  using OutType = cuda::std::complex<in_scalar>;
+
+  printf("Input: %-16s  Filter: %-16s  Output: %-16s\n",
+      TypeName<InType>(), TypeName<FilterType>(), TypeName<OutType>());
+  printf("M: %" MATX_INDEX_T_FMT "  D: %" MATX_INDEX_T_FMT "\n\n", cfg.M, cfg.D);
+
+  ChannelizePolyBench<InType, OutType, FilterType>(cfg.M, cfg.D);
+}
+
+void RunBench(const BenchConfig &cfg) {
+  auto go = [&](auto in_tag, auto filt_tag) {
+    DispatchBench<decltype(in_tag), decltype(filt_tag)>(cfg);
+  };
+
+  auto dispatch_filter = [&](auto in_tag) {
+    if (cfg.filter_prec == Precision::Float && cfg.filter_domain == Domain::Real)
+      go(in_tag, float{});
+    else if (cfg.filter_prec == Precision::Double && cfg.filter_domain == Domain::Real)
+      go(in_tag, double{});
+    else if (cfg.filter_prec == Precision::Float && cfg.filter_domain == Domain::Complex)
+      go(in_tag, cuda::std::complex<float>{});
+    else
+      go(in_tag, cuda::std::complex<double>{});
+  };
+
+  if (cfg.input_prec == Precision::Float && cfg.input_domain == Domain::Real)
+    dispatch_filter(float{});
+  else if (cfg.input_prec == Precision::Double && cfg.input_domain == Domain::Real)
+    dispatch_filter(double{});
+  else if (cfg.input_prec == Precision::Float && cfg.input_domain == Domain::Complex)
+    dispatch_filter(cuda::std::complex<float>{});
+  else
+    dispatch_filter(cuda::std::complex<double>{});
+}
+
+int main(int argc, char **argv)
 {
   MATX_ENTER_HANDLER();
 
-  const matx::index_t channel_start = 3;
-  const matx::index_t channel_stop = 10;
+  BenchConfig cfg;
 
-  // printf("Benchmarking float -> complex<float>\n");
-  // ChannelizePolyBench<float,cuda::std::complex<float>>(channel_start, channel_stop);
+  for (int i = 1; i < argc; i++) {
+    if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
+      PrintUsage(argv[0]);
+      return 0;
+    } else if (strcmp(argv[i], "--input-type") == 0 && i + 1 < argc) {
+      if (!ParseType(argv[++i], cfg.input_prec, cfg.input_domain)) {
+        fprintf(stderr, "Unknown input type: %s\n", argv[i]);
+        return 1;
+      }
+    } else if (strcmp(argv[i], "--filter-type") == 0 && i + 1 < argc) {
+      if (!ParseType(argv[++i], cfg.filter_prec, cfg.filter_domain)) {
+        fprintf(stderr, "Unknown filter type: %s\n", argv[i]);
+        return 1;
+      }
+    } else if (strcmp(argv[i], "-M") == 0 && i + 1 < argc) {
+      cfg.M = static_cast<matx::index_t>(atol(argv[++i]));
+    } else if (strcmp(argv[i], "-D") == 0 && i + 1 < argc) {
+      cfg.D = static_cast<matx::index_t>(atol(argv[++i]));
+    } else {
+      fprintf(stderr, "Unknown option: %s\n", argv[i]);
+      PrintUsage(argv[0]);
+      return 1;
+    }
+  }
 
-  printf("Benchmarking complex<float> -> complex<float>\n");
-  ChannelizePolyBench<cuda::std::complex<float>,cuda::std::complex<float>>(channel_start, channel_stop);
+  // Default D to M (maximally decimated) if not specified
+  if (cfg.D <= 0) {
+    cfg.D = cfg.M;
+  }
+
+  if (cfg.D <= 0 || cfg.D > cfg.M) {
+    fprintf(stderr, "Error: decimation factor D must satisfy 0 < D <= M (got M=%" MATX_INDEX_T_FMT ", D=%" MATX_INDEX_T_FMT ")\n",
+        cfg.M, cfg.D);
+    return 1;
+  }
 
-  // printf("Benchmarking double -> complex<double>\n");
-  // ChannelizePolyBench<double,cuda::std::complex<double>>(channel_start, channel_stop);
+  if (cfg.M < 2) {
+    fprintf(stderr, "Error: number of channels M must be >= 2 (got M=%" MATX_INDEX_T_FMT ")\n", cfg.M);
+    return 1;
+  }
 
-  // printf("Benchmarking complex<double> -> complex<double>\n");
-  // ChannelizePolyBench<cuda::std::complex<double>,cuda::std::complex<double>>(channel_start, channel_stop);
+  RunBench(cfg);
 
   matx::ClearCachesAndAllocations();
 
diff --git a/include/matx/kernels/channelize_poly.cuh b/include/matx/kernels/channelize_poly.cuh
index dc4dc74e..de62e321 100644
--- a/include/matx/kernels/channelize_poly.cuh
+++ b/include/matx/kernels/channelize_poly.cuh
@@ -47,12 +47,22 @@
 
 namespace matx {
 
-// Number of output elements generated per thread
-constexpr index_t CHANNELIZE_POLY1D_ELEMS_PER_THREAD = 1;
+// detail constants that require both host and device visibility at compile time
+namespace detail {
+    // Number of output elements generated per thread
+    constexpr index_t CHANNELIZE_POLY1D_ELEMS_PER_THREAD = 1;
+
+    // Maximum number of filter rotations per channel for the SmemTiled kernel.
+    // This is used to determine if the filter can be stored in shared memory. The
+    // number of rotations can exceed this value, but the filter will be read from
+    // global memory rather than cached in shared memory.
+    constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS = 32;
+}
 
 #ifdef __CUDACC__ 
 
 namespace detail {
+
 template <typename AccumT, typename FilterT>
 __MATX_DEVICE__ __MATX_INLINE__ auto channelize_cast_filter(FilterT v)
 {
@@ -80,11 +90,45 @@ __MATX_DEVICE__ __MATX_INLINE__ auto channelize_cast_input(InputT v)
         return static_cast<AccumT>(v);
     }
 }
+// Fused complex multiply-accumulate. Decomposes the operation into scalar
+// FMA instructions so the compiler emits 4 FFMA per complex tap instead of
+// ~8 mixed FMUL/FADD/FSUB. Falls back to the default operator* + operator+=
+// for real or mixed-precision types.
+template <typename AccumT, typename FilterValT, typename InputValT>
+__MATX_DEVICE__ __MATX_INLINE__ void channelize_cmac(
+    AccumT &accum, FilterValT hv, InputValT iv)
+{
+    if constexpr (is_complex_v<AccumT> && is_complex_v<FilterValT> && is_complex_v<InputValT>) {
+        auto h_re = hv.real(), h_im = hv.imag();
+        auto i_re = iv.real(), i_im = iv.imag();
+        auto a_re = accum.real(), a_im = accum.imag();
+        a_re = h_re * i_re + a_re;
+        a_re = -(h_im * i_im) + a_re;
+        a_im = h_re * i_im + a_im;
+        a_im = h_im * i_re + a_im;
+        accum = {a_re, a_im};
+    } else if constexpr (is_complex_v<AccumT> && !is_complex_v<FilterValT> && is_complex_v<InputValT>) {
+        // Real filter × complex input
+        auto a_re = accum.real(), a_im = accum.imag();
+        a_re = hv * iv.real() + a_re;
+        a_im = hv * iv.imag() + a_im;
+        accum = {a_re, a_im};
+    } else if constexpr (is_complex_v<AccumT> && is_complex_v<FilterValT> && !is_complex_v<InputValT>) {
+        // Complex filter × real input
+        auto a_re = accum.real(), a_im = accum.imag();
+        a_re = hv.real() * iv + a_re;
+        a_im = hv.imag() * iv + a_im;
+        accum = {a_re, a_im};
+    } else {
+        accum += hv * iv;
+    }
+}
+
 } // namespace detail
 
-template <int THREADS, typename OutType, typename InType, typename FilterType, typename AccumType>
+template <int THREADS, bool MaximallyDecimated, typename OutType, typename InType, typename FilterType, typename AccumType>
 __launch_bounds__(THREADS)
-__global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter)
+__global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter, index_t decimation_factor, uint32_t smem_filter_bytes)
 {
     using output_t = typename OutType::value_type;
     using input_t = typename InType::value_type;
@@ -94,15 +138,6 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
     // If the output is complex, then then accumulator is complex. Otherwise, the accumulator is real.
     using accum_t = cuda::std::conditional_t<is_complex_v<output_t>, typename detail::scalar_to_complex<AccumType>::ctype, AccumType>;
 
-    // Opportunistically store the filter taps in shared memory if the static shared memory
-    // size is sufficient. Otherwise, we will read directly from global memory on use.
-    const int SMEM_MAX_FILTER_TAPS = 128;
-    // Versions of CUDA prior to 11.8 do not allow static shared memory allocations of
-    // cuda::std::complex types due to it having no trivial constructor. This workaround
-    // prevents an 'initializer not allowed for __shared__ variable' error.
-    __align__(sizeof(filter_t)) __shared__ uint8_t smem_filter_workaround[sizeof(filter_t)*SMEM_MAX_FILTER_TAPS];
-    filter_t (&smem_filter)[SMEM_MAX_FILTER_TAPS] = reinterpret_cast<filter_t (&)[SMEM_MAX_FILTER_TAPS]>(smem_filter_workaround);
-
     constexpr int InRank = InType::Rank();
     constexpr int OutRank = OutType::Rank();
     constexpr int ChannelRank = OutRank-1;
@@ -118,93 +153,147 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
     const int channel = blockIdx.y;
     const int tid = threadIdx.x;
 
-    constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-    const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+    constexpr index_t ELEMS_PER_BLOCK = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+    const index_t first_out_elem = elem_block * detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
     const index_t last_out_elem = cuda::std::min(
         output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);
 
-    if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
-        for (index_t t = tid; t < filter_phase_len-1; t += THREADS) {
-            const index_t h_ind = channel + t * num_channels;
-            smem_filter[t] = filter.operator()(h_ind);
-        }
-        if (tid == THREADS-1) {
-            const index_t h_ind = channel + (filter_phase_len-1) * num_channels;
-            smem_filter[filter_phase_len-1] = (h_ind < filter_full_len) ?
-                filter.operator()(h_ind) : static_cast<filter_t>(0);
-        }
-
-        __syncthreads();
-    }
-
     auto indims = BlockToIdx(input, blockIdx.z, 1);
     auto outdims = BlockToIdx(output, blockIdx.z, 2);
     outdims[ChannelRank] = channel;
 
-    if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) {
-        for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-            const index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
-            accum_t accum {};
-            const filter_t *h = smem_filter;
-            // index_t in MatX should be signed (32 or 64 bit), so j-- below will not underflow
-            static_assert(std::is_signed_v<index_t>, "assumed signed index_t, but it is unsigned");
-            indims[InRank-1] = t * num_channels + (num_channels - 1 - channel);
-            index_t j_start = t;
-            if (indims[InRank-1] >= input_len) {
-                j_start--;
-                indims[InRank-1] -= num_channels;
-                h++;
+    if constexpr (MaximallyDecimated) {
+        // Maximally decimated (D == M) path: filter phase is fixed per channel
+        // Dynamic shared memory holds one phase's taps when the dispatch provides
+        // enough smem. When smem_bytes == 0 the filter is read from global memory.
+        // When D == M: phase = channel (fixed), A % M = channel,
+        // indims = s + t*M, causal_count = t + 1, and last_arrived >= s
+        // is always true for t >= 0.
+        const index_t s = num_channels - 1 - channel;
+
+        extern __shared__ __align__(16) uint8_t smem_filter_raw[];
+        filter_t *smem_filter = reinterpret_cast<filter_t *>(smem_filter_raw);
+        const bool use_smem_filter = (smem_filter_bytes >= sizeof(filter_t) * filter_phase_len);
+        if (use_smem_filter) {
+            for (index_t t = tid; t < filter_phase_len-1; t += THREADS) {
+                const index_t h_ind = channel + t * num_channels;
+                smem_filter[t] = filter.operator()(h_ind);
             }
-            // Because the filter must fit in shared memory, we know that the maximum
-            // number of iterations fits well within an int.
-            const int niter = static_cast<int>(j_start - first_ind + 1);
-            input_t in_val;
-            for (int i = 0; i < niter; i++) {
-                cuda::std::apply([&in_val, &input](auto &&...args) {                    
-                    in_val = input.operator()(args...);
-                }, indims);
-                accum += detail::channelize_cast_filter<accum_t>(*h) *
-                         detail::channelize_cast_input<accum_t>(in_val);
-                indims[InRank-1] -= num_channels;
-                h++;
+            if (tid == THREADS-1) {
+                const index_t h_ind = channel + (filter_phase_len-1) * num_channels;
+                smem_filter[filter_phase_len-1] = (h_ind < filter_full_len) ?
+                    filter.operator()(h_ind) : static_cast<filter_t>(0);
             }
-            outdims[OutElemRank] = t;
-            cuda::std::apply([accum, &output](auto &&...args) {
-                output.operator()(args...) = static_cast<output_t>(accum);
-            }, outdims);
+
+            __syncthreads();
         }
-    } else {
-        for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
-            index_t first_ind = cuda::std::max(static_cast<index_t>(0), t - filter_phase_len + 1);
-            // If we use the last filter tap for this phase (which is the first index because
-            // the filter is flipped), then it may be a padded zero. If so, increment first_ind
-            // by 1 to avoid using the zero. This prevents a bounds-check in the inner loop.
-            if (first_ind == (t - filter_phase_len + 1)) {
+
+        if (use_smem_filter) {
+            for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
+                accum_t accum {};
+                indims[InRank-1] = s + t * num_channels;
+                index_t h_skip = 0;
+                if (indims[InRank-1] >= input_len) {
+                    h_skip = 1;
+                    indims[InRank-1] -= num_channels;
+                }
+                const filter_t *h = smem_filter + h_skip;
+                int niter = static_cast<int>(cuda::std::min(filter_phase_len - h_skip, t + 1 - h_skip));
+                input_t in_val;
+                for (int i = 0; i < niter; i++) {
+                    cuda::std::apply([&in_val, &input](auto &&...args) {
+                        in_val = input.operator()(args...);
+                    }, indims);
+                    detail::channelize_cmac(accum,
+                            detail::channelize_cast_filter<accum_t>(*h),
+                            detail::channelize_cast_input<accum_t>(in_val));
+                    indims[InRank-1] -= num_channels;
+                    h++;
+                }
+                outdims[OutElemRank] = t;
+                cuda::std::apply([accum, &output](auto &&...args) {
+                    output.operator()(args...) = static_cast<output_t>(accum);
+                }, outdims);
+            }
+        } else {
+            index_t available_taps = filter_phase_len;
+            {
                 const bool h_is_padded = ((filter_phase_len-1) * num_channels + channel) >= filter_full_len;
                 if (h_is_padded) {
-                    first_ind++;
+                    available_taps--;
                 }
             }
-            indims[InRank-1] = t * num_channels + (num_channels - 1 - channel);
-            index_t j_start = t;
-            index_t h_ind { channel };
-            // If the last signal element is a zero-pad value, then skip it to prevent needing
-            // per-access bounds checking in the inner loop.
-            if (indims[InRank-1] >= input_len) {
-                j_start--;
-                indims[InRank-1] -= num_channels;
-                h_ind += num_channels;
+
+            for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
+                accum_t accum {};
+                indims[InRank-1] = s + t * num_channels;
+                index_t h_skip = 0;
+                if (indims[InRank-1] >= input_len) {
+                    h_skip = 1;
+                    indims[InRank-1] -= num_channels;
+                }
+                index_t h_ind = channel + h_skip * num_channels;
+                index_t niter = cuda::std::min(available_taps - h_skip, t + 1 - h_skip);
+                input_t in_val;
+                for (index_t i = 0; i < niter; i++) {
+                    cuda::std::apply([&in_val, &input](auto &&...args) {
+                        in_val = input.operator()(args...);
+                    }, indims);
+                    const filter_t h_val = filter.operator()(h_ind);
+                    detail::channelize_cmac(accum,
+                            detail::channelize_cast_filter<accum_t>(h_val),
+                            detail::channelize_cast_input<accum_t>(in_val));
+                    h_ind += num_channels;
+                    indims[InRank-1] -= num_channels;
+                }
+                outdims[OutElemRank] = t;
+                cuda::std::apply([accum, &output](auto &&...args) {
+                    output.operator()(args...) = static_cast<output_t>(accum);
+                }, outdims);
             }
-            const index_t niter = j_start - first_ind + 1;
+        }
+    } else {
+        // Oversampled (D < M) path: phase rotates per output step
+        // No shared memory. Reads filter from global/L2.
+        // Branch remap for Harris convention: r_remapped changes the input
+        // sample mapping so newest D samples land in branches D-1..0.
+        // Phase uses the original logical channel index (not remapped).
+        const index_t r_remapped = (channel + num_channels - decimation_factor) % num_channels;
+        const index_t s = num_channels - 1 - r_remapped;
+        for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) {
+            const index_t last_arrived = t * decimation_factor + decimation_factor - 1;
+            index_t niter = 0;
+            const index_t phase = (channel + t * decimation_factor) % num_channels;
+            index_t h_ind { phase };
             accum_t accum {};
+            if (last_arrived >= s) {
+                const index_t A = last_arrived - s;
+                indims[InRank-1] = last_arrived - (A % num_channels);
+                const index_t causal_count = A / num_channels + 1;
+                index_t h_skip = 0;
+                if (indims[InRank-1] >= input_len) {
+                    h_skip = 1;
+                    indims[InRank-1] -= num_channels;
+                }
+                h_ind = phase + h_skip * num_channels;
+                index_t available_taps = filter_phase_len;
+                {
+                    const bool h_is_padded = ((filter_phase_len-1) * num_channels + phase) >= filter_full_len;
+                    if (h_is_padded) {
+                        available_taps--;
+                    }
+                }
+                niter = cuda::std::min(available_taps - h_skip, causal_count - h_skip);
+            }
             input_t in_val;
             for (index_t i = 0; i < niter; i++) {
-                cuda::std::apply([&in_val, &input](auto &&...args) {                    
+                cuda::std::apply([&in_val, &input](auto &&...args) {
                     in_val = input.operator()(args...);
                 }, indims);
                 const filter_t h_val = filter.operator()(h_ind);
-                accum += detail::channelize_cast_filter<accum_t>(h_val) *
-                         detail::channelize_cast_input<accum_t>(in_val);
+                detail::channelize_cmac(accum,
+                        detail::channelize_cast_filter<accum_t>(h_val),
+                        detail::channelize_cast_input<accum_t>(in_val));
                 h_ind += num_channels;
                 indims[InRank-1] -= num_channels;
             }
@@ -216,6 +305,400 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter
     }
 }
 
+// Tiled shared-memory polyphase channelizer kernel.
+//
+// Tiles across channels so that only CTILE channels are processed per block,
+// removing the M<=256 constraint of ChannelizePoly1D_Smem while staging
+// input samples in shared memory. Supports both maximally decimated (D == M)
+// and oversampled (D < M) cases.
+//
+// Template parameters:
+//   FilterInSmem — when true, filter taps are cached in shared memory;
+//                  when false, filter taps are read from global/L2.
+//
+// Block: dim3(CTILE, NOUT)
+// Grid:  dim3(time_blocks, channel_tiles, batches)
+//
+// Shared memory layout (FilterInSmem = true):
+//   smem_filter: [P][CTILE] for D==M, or [CTILE][K][P] for D<M
+//   smem_input:  [height][CTILE] circular buffer, height = P + NOUT - 1
+//
+// Shared memory layout (FilterInSmem = false):
+//   smem_input:  [height][CTILE] circular buffer only
+//
+// Each column of smem_input is the circular buffer for one branch of the
+// commutator. Row r, column cx stores input[s(c) + r*M] where c = tile_base+cx
+// and s(c) = M-1-c.
+template <int CTILE, int NOUT, bool MaximallyDecimated, bool FilterInSmem,
+          typename IdxT,
+          typename OutType, typename InType, typename FilterType, typename AccumType>
+__launch_bounds__(CTILE * NOUT)
+__global__ void ChannelizePoly1D_SmemTiled(
+    OutType output, InType input, FilterType filter,
+    IdxT elems_per_channel_per_cta, IdxT decimation_factor, int32_t num_phases_per_channel)
+{
+    using output_t = typename OutType::value_type;
+    using input_t  = typename InType::value_type;
+    using filter_t = typename FilterType::value_type;
+    static_assert(!is_complex_v<AccumType>,
+        "channelize_poly: accumulator type must be real; it will be treated as complex when necessary");
+    using accum_t = cuda::std::conditional_t<is_complex_v<output_t>,
+        typename detail::scalar_to_complex<AccumType>::ctype, AccumType>;
+
+    extern __shared__ __align__(16) uint8_t smem_raw[];
+
+    constexpr int InRank  = InType::Rank();
+    constexpr int OutRank = OutType::Rank();
+    constexpr int ChannelRank = OutRank - 1;
+    constexpr int OutElemRank = OutRank - 2;
+
+    const IdxT input_len = static_cast<IdxT>(input.Size(InRank - 1));
+    const IdxT output_len_per_channel = static_cast<IdxT>(output.Size(OutElemRank));
+    const int32_t M = static_cast<int32_t>(output.Size(ChannelRank));
+    const int32_t filter_full_len = static_cast<int32_t>(filter.Size(0));
+    const int32_t P = static_cast<int32_t>((filter_full_len + M - 1) / M);
+    const int32_t K = num_phases_per_channel;
+
+    const int32_t cx = static_cast<int32_t>(threadIdx.x);
+    const int32_t ty = static_cast<int32_t>(threadIdx.y);
+    const int32_t tid = ty * CTILE + cx;
+    assert(blockDim.x * blockDim.y == CTILE * NOUT && blockDim.z == 1);
+    const int32_t nthreads = CTILE * NOUT;
+    const int32_t tile_base = static_cast<int32_t>(blockIdx.y) * CTILE;
+    const int32_t c = tile_base + cx;
+    const bool active = (c < M);
+
+    // Branch remap offset for Harris convention (oversampled only; 0 for D == M).
+    const int32_t L = MaximallyDecimated ? 0 : (M - static_cast<int32_t>(decimation_factor));
+
+    const int32_t filter_stride = K * P; // per-channel filter block size
+    const int32_t height = P + NOUT - 1;
+
+    filter_t *smem_filter_base = nullptr;
+    input_t  *smem_input = nullptr;
+    if constexpr (FilterInSmem) {
+        smem_filter_base = reinterpret_cast<filter_t *>(smem_raw);
+        const int32_t filter_elems = MaximallyDecimated ? (P * CTILE) : (CTILE * filter_stride);
+        size_t input_byte_offset = sizeof(filter_t) * filter_elems;
+        if (input_byte_offset % sizeof(input_t)) {
+            input_byte_offset += sizeof(input_t) - input_byte_offset % sizeof(input_t);
+        }
+        smem_input = reinterpret_cast<input_t *>(smem_raw + input_byte_offset);
+    } else {
+        smem_input = reinterpret_cast<input_t *>(smem_raw);
+    }
+
+    if constexpr (FilterInSmem) {
+        // Load filter into smem
+        if constexpr (MaximallyDecimated) {
+            for (int32_t i = tid; i < P * CTILE; i += nthreads) {
+                const int32_t p  = i / CTILE;
+                const int32_t local_channel = i % CTILE;
+                const int32_t global_channel = tile_base + local_channel;
+                const int32_t h_ind = global_channel + p * M;
+                smem_filter_base[i] = (global_channel < M && h_ind < filter_full_len)
+                    ? filter.operator()(static_cast<index_t>(h_ind))
+                    : static_cast<filter_t>(0);
+            }
+        } else {
+            // The dispatch must not select FilterInSmem when K exceeds the
+            // rotations[] array size. Assert this invariant at runtime.
+            assert(K <= detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS);
+            int32_t rotations[detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS];
+            for (int32_t k = 0; k < K; k++) {
+                rotations[k] = static_cast<int32_t>((static_cast<int64_t>(k) * decimation_factor) % M);
+            }
+            // TODO-PERF: Each channel rotates through K filter phases. Currently, we store all K phases
+            // per channel in shared memory. It is possible for K * CTILE to exceed the total number
+            // of channels, in which case we would be better off storing the full filter in shared memory.
+            // Furthermore, if there are fewer than K output points per channel generated in this CTA,
+            // then we could store only the required phases.
+            for (int32_t i = tid; i < CTILE * filter_stride; i += nthreads) {
+                const int32_t local_channel = i / filter_stride;
+                const int32_t kp = i % filter_stride;
+                const int32_t k  = kp / P;
+                const int32_t p  = kp % P;
+                const int32_t global_channel = tile_base + local_channel;
+                if (global_channel < M) {
+                    // Phase uses the original logical channel (not remapped)
+                    const int32_t phase = (global_channel + rotations[k]) % M;
+                    const int32_t h_ind = phase + p * M;
+                    smem_filter_base[i] = (h_ind < filter_full_len)
+                        ? filter.operator()(static_cast<index_t>(h_ind))
+                        : static_cast<filter_t>(0);
+                } else {
+                    smem_filter_base[i] = static_cast<filter_t>(0);
+                }
+            }
+        }
+    }
+
+    // r_remapped changes the input sample mapping to match the Harris convention. We conceptually populate commutator branches
+    // starting at M-1 and continuing counter-clockwise. That matches the Harris convention for the maximally decimated
+    // case where L == 0. For the oversampled case, Harris populates branches from M-1 to 0 for each M inputs, shifting
+    // older samples through the 2D filter bank. We stick with the M-1 to 0 convention, but then have to remap the
+    // branch indices to match the Harris convention.
+    // Phase and output channel use the original logical index c.
+    const int32_t r_remapped = (c + L) % M;
+    const int32_t s = active ? (M - 1 - r_remapped) : 0;
+    const IdxT start_elem = static_cast<IdxT>(blockIdx.x) * elems_per_channel_per_cta;
+    const IdxT last_elem = cuda::std::min(
+        output_len_per_channel - 1,
+        start_elem + elems_per_channel_per_cta - 1);
+
+    auto indims  = BlockToIdx(input,  blockIdx.z, 1);
+    auto outdims = BlockToIdx(output, blockIdx.z, 2);
+    outdims[ChannelRank] = c;
+
+    // Helper: load one input sample into smem at (buf_row, col)
+    auto load_smem_elem = [&](int32_t buf_row, int32_t col, IdxT global_row) {
+        const int32_t gc = tile_base + col;
+        const int32_t gc_remapped = MaximallyDecimated ? gc : ((gc + L) % M);
+        const int32_t branch_s = (gc < M) ? (M - 1 - gc_remapped) : 0;
+        const IdxT raw_idx = static_cast<IdxT>(branch_s) + global_row * M;
+        if (gc < M && global_row >= 0 && raw_idx >= 0 && raw_idx < input_len) {
+            indims[InRank - 1] = raw_idx;
+            input_t val;
+            cuda::std::apply([&val, &input](auto &&...args) {
+                val = input.operator()(args...);
+            }, indims);
+            smem_input[buf_row * CTILE + col] = val;
+        } else {
+            smem_input[buf_row * CTILE + col] = static_cast<input_t>(0);
+        }
+    };
+
+    auto max_bidx = [&](IdxT t) -> IdxT {
+        if constexpr (MaximallyDecimated) {
+            return t;
+        } else {
+            return ((t + 1) * decimation_factor - 1) / M;
+        }
+    };
+
+    // Initial fill of circular buffer
+    const IdxT first_iter_end = cuda::std::min(start_elem + static_cast<IdxT>(NOUT) - 1, last_elem);
+    IdxT loaded_up_to = max_bidx(first_iter_end);
+    const IdxT buf_base = loaded_up_to - (height - 1);
+
+    {
+        const int32_t first_row = tid / CTILE;
+        const int32_t row_stride = nthreads / CTILE;
+        int32_t buf_row = static_cast<int32_t>((buf_base + first_row) % height);
+        if (buf_row < 0) buf_row += height;
+        const int32_t buf_row_stride = row_stride % height;
+
+        for (int32_t i = tid; i < height * CTILE; i += nthreads) {
+            load_smem_elem(buf_row, i % CTILE, buf_base + (i / CTILE));
+            buf_row += buf_row_stride;
+            if (buf_row >= height) buf_row -= height;
+        }
+    }
+
+    __syncthreads();
+
+    if constexpr (MaximallyDecimated) {
+        // bidx = t, causal_count = t+1, last_arrived >= s always true.
+        // Track buf_row incrementally across iterations to avoid modulo.
+
+        // Seed buf_row for ty=0 at start_elem
+        int32_t buf_row_base = static_cast<int32_t>(start_elem % height);
+        // Per-iteration advance: NOUT output steps = NOUT buf_row advance. This is a defensive modulo
+        // so that we can keep buf_row_base in [0, height) with only a conditional subtraction.
+        const int32_t nout_wrap = NOUT % height;
+
+        const IdxT last_start = start_elem + ((last_elem - start_elem) / NOUT) * NOUT;
+        for (IdxT next_start = start_elem; next_start <= last_start; next_start += NOUT) {
+            const IdxT t = next_start + ty;
+            if (t <= last_elem && active) {
+                accum_t accum{};
+
+                // buf_row for this thread's output step
+                int32_t my_buf_row = buf_row_base + ty;
+                if (my_buf_row >= height) my_buf_row -= height;
+
+                const IdxT newest_raw = static_cast<IdxT>(s) + t * M;
+                int32_t h_skip = 0;
+                int32_t niter = static_cast<int32_t>(
+                    cuda::std::min(static_cast<IdxT>(P), t + 1));
+                if (newest_raw >= input_len) {
+                    h_skip = 1;
+                    niter = static_cast<int32_t>(
+                        cuda::std::min(static_cast<IdxT>(P - 1), t));
+                    if (--my_buf_row < 0) my_buf_row += height;
+                }
+
+                const int32_t prologue = cuda::std::min(my_buf_row + 1, niter);
+                const int32_t epilogue = niter - prologue;
+                int32_t h_ind = c + h_skip * M;
+                int32_t p = 0;
+                for (int32_t i = 0; i < prologue; i++, p++) {
+                    filter_t hv;
+                    if constexpr (FilterInSmem) {
+                        hv = smem_filter_base[(p + h_skip) * CTILE + cx];
+                    } else {
+                        hv = filter.operator()(static_cast<index_t>(h_ind));
+                    }
+                    const input_t iv = smem_input[my_buf_row * CTILE + cx];
+                    detail::channelize_cmac(accum,
+                            detail::channelize_cast_filter<accum_t>(hv),
+                            detail::channelize_cast_input<accum_t>(iv));
+                    my_buf_row--;
+                    h_ind += M;
+                }
+                my_buf_row = height - 1;
+                for (int32_t i = 0; i < epilogue; i++, p++) {
+                    filter_t hv;
+                    if constexpr (FilterInSmem) {
+                        hv = smem_filter_base[(p + h_skip) * CTILE + cx];
+                    } else {
+                        hv = filter.operator()(static_cast<index_t>(h_ind));
+                    }
+                    const input_t iv = smem_input[my_buf_row * CTILE + cx];
+                    detail::channelize_cmac(accum,
+                            detail::channelize_cast_filter<accum_t>(hv),
+                            detail::channelize_cast_input<accum_t>(iv));
+                    my_buf_row--;
+                    h_ind += M;
+                }
+
+                outdims[OutElemRank] = t;
+                cuda::std::apply([accum, &output](auto &&...args) {
+                    output.operator()(args...) = static_cast<output_t>(accum);
+                }, outdims);
+            }
+
+            if (next_start < last_start) {
+                // Incremental buf_row_base advance (no modulo)
+                buf_row_base += nout_wrap;
+                if (buf_row_base >= height) buf_row_base -= height;
+
+                // Ensure all threads have finished reading smem_input before overwriting
+                __syncthreads();
+
+                // Load NOUT new rows. For D==M, exactly NOUT new samples arrive.
+                // Each ty-lane loads one row (its cx column).
+                {
+                    const IdxT next_end = cuda::std::min(next_start + static_cast<IdxT>(2 * NOUT) - 1, last_elem);
+                    const int32_t new_rows = static_cast<int32_t>(max_bidx(next_end) - loaded_up_to);
+                    int32_t lr = static_cast<int32_t>((loaded_up_to + 1) % height);
+                    if (ty < new_rows) {
+                        int32_t my_lr = lr + ty;
+                        if (my_lr >= height) my_lr -= height;
+                        load_smem_elem(my_lr, cx, loaded_up_to + 1 + ty);
+                    }
+                    loaded_up_to += new_rows;
+                }
+
+                // Ensure new rows are visible before next iteration's compute
+                __syncthreads();
+            }
+        }
+    } else {
+        // D < M: oversampled path
+        const IdxT last_start = start_elem + ((last_elem - start_elem) / NOUT) * NOUT;
+        for (IdxT next_start = start_elem; next_start <= last_start; next_start += NOUT) {
+            const IdxT t = next_start + ty;
+            if (t <= last_elem && active) {
+                accum_t accum{};
+
+                const IdxT last_arrived = t * decimation_factor + decimation_factor - 1;
+                if (last_arrived >= s) {
+                    const IdxT A = last_arrived - s;
+                    const IdxT bidx = A / M;
+                    const IdxT causal_count = bidx + 1;
+
+                    const int32_t k = static_cast<int32_t>(t % K);
+                    // Phase uses the original logical channel (not remapped)
+                    const int32_t phase = static_cast<int32_t>(
+                        (c + t * decimation_factor) % M);
+
+                    int32_t available_taps = P;
+                    if (((P - 1) * M + phase) >= filter_full_len) {
+                        available_taps--;
+                    }
+
+                    IdxT newest_raw = last_arrived - (A % M);
+                    int32_t h_skip = 0;
+                    if (newest_raw >= input_len) {
+                        h_skip = 1;
+                    }
+
+                    const int32_t niter = static_cast<int32_t>(
+                        cuda::std::min(static_cast<IdxT>(available_taps - h_skip),
+                                       causal_count - h_skip));
+
+                    int32_t buf_row = static_cast<int32_t>((bidx - h_skip) % height);
+
+                    const int32_t prologue = cuda::std::min(buf_row + 1, niter);
+                    const int32_t epilogue = niter - prologue;
+                    int32_t h_ind = phase + h_skip * M;
+                    int32_t p = 0;
+                    for (int32_t i = 0; i < prologue; i++, p++) {
+                        filter_t hv;
+                        if constexpr (FilterInSmem) {
+                            hv = smem_filter_base[cx * filter_stride + k * P + (p + h_skip)];
+                        } else {
+                            hv = filter.operator()(static_cast<index_t>(h_ind));
+                        }
+                        const input_t iv = smem_input[buf_row * CTILE + cx];
+                        detail::channelize_cmac(accum,
+                                detail::channelize_cast_filter<accum_t>(hv),
+                                detail::channelize_cast_input<accum_t>(iv));
+                        buf_row--;
+                        h_ind += M;
+                    }
+                    buf_row = height - 1;
+                    for (int32_t i = 0; i < epilogue; i++, p++) {
+                        filter_t hv;
+                        if constexpr (FilterInSmem) {
+                            hv = smem_filter_base[cx * filter_stride + k * P + (p + h_skip)];
+                        } else {
+                            hv = filter.operator()(static_cast<index_t>(h_ind));
+                        }
+                        const input_t iv = smem_input[buf_row * CTILE + cx];
+                        detail::channelize_cmac(accum,
+                                detail::channelize_cast_filter<accum_t>(hv),
+                                detail::channelize_cast_input<accum_t>(iv));
+                        buf_row--;
+                        h_ind += M;
+                    }
+                }
+
+                outdims[OutElemRank] = t;
+                cuda::std::apply([accum, &output](auto &&...args) {
+                    output.operator()(args...) = static_cast<output_t>(accum);
+                }, outdims);
+            }
+
+            if (next_start < last_start) {
+                // Ensure all threads have finished reading smem_input before overwriting
+                __syncthreads();
+
+                // Load new rows for next iteration
+                const IdxT next_iter_end = cuda::std::min(next_start + static_cast<IdxT>(2 * NOUT) - 1, last_elem);
+                const IdxT needed_up_to = max_bidx(next_iter_end);
+                const int32_t new_rows = static_cast<int32_t>(needed_up_to - loaded_up_to);
+
+                // Each ty-lane loads one row (its cx column)
+                {
+                    int32_t lr = static_cast<int32_t>((loaded_up_to + 1) % height);
+                    if (ty < new_rows) {
+                        int32_t my_lr = lr + ty;
+                        if (my_lr >= height) my_lr -= height;
+                        load_smem_elem(my_lr, cx, loaded_up_to + 1 + ty);
+                    }
+                }
+
+                loaded_up_to = needed_up_to;
+
+                // Ensure new rows are visible before next iteration's compute
+                __syncthreads();
+            }
+        }
+    }
+}
+
 // This kernel works in cases where the full filter (with potentially some zero padding) and
 // the inputs required to compute elems_per_channel_per_cta outputs all fit into shared memory.
 template <typename OutType, typename InType, typename FilterType, typename AccumType>
@@ -226,10 +709,10 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
     using filter_t = typename FilterType::value_type;
     static_assert(! is_complex_v<AccumType>,
         "channelize_poly: accumulator type must be real; it will be treated as complex when necessary");
-    // If the output is complex, then then accumulator is complex. Otherwise, the accumulator is real.
+    // If the output is complex, then accumulator is complex. Otherwise, the accumulator is real.
     using accum_t = cuda::std::conditional_t<is_complex_v<output_t>, typename detail::scalar_to_complex<AccumType>::ctype, AccumType>;
 
-    extern __shared__ uint8_t __attribute((aligned(16))) smem_dyn_align16[];
+    extern __shared__ __align__(16) uint8_t smem_dyn_align16[];
 
     constexpr int InRank = InType::Rank();
     constexpr int OutRank = OutType::Rank();
@@ -289,7 +772,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
                 smem_input[smem_ind] = input.operator()(args...);
             }, indims);
         } else {
-            smem_input[smem_ind] = static_cast<filter_t>(0);
+            smem_input[smem_ind] = static_cast<input_t>(0);
         }
     }
 
@@ -313,7 +796,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
                     smem_input[smem_ind] = input.operator()(args...);
                 }, indims);
             } else {
-                smem_input[smem_ind] = static_cast<filter_t>(0);
+                smem_input[smem_ind] = static_cast<input_t>(0);
             }
         }
 
@@ -343,15 +826,17 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f
             const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan);
             // Apply the filter h in reverse order below to flip the filter for convolution
             for (int32_t k = 0; k < prologue_count; k++) {
-                accum += detail::channelize_cast_filter<accum_t>(*h) *
-                         detail::channelize_cast_input<accum_t>(*sample);
+                detail::channelize_cmac(accum,
+                        detail::channelize_cast_filter<accum_t>(*h),
+                        detail::channelize_cast_input<accum_t>(*sample));
                 sample += num_channels;
                 h -= num_channels;
             }
             sample = smem_input + (num_channels - 1 - chan);
             for (int32_t k = 0; k < epilogue_count; k++) {
-                accum += detail::channelize_cast_filter<accum_t>(*h) *
-                         detail::channelize_cast_input<accum_t>(*sample);
+                detail::channelize_cmac(accum,
+                        detail::channelize_cast_filter<accum_t>(*h),
+                        detail::channelize_cast_input<accum_t>(*sample));
                 sample += num_channels;
                 h -= num_channels;
             }
@@ -395,15 +880,15 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
     auto indims = BlockToIdx(input, blockIdx.z, 1);
     auto outdims = BlockToIdx(output, blockIdx.z, 2);
 
-    constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
-    const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+    constexpr index_t ELEMS_PER_BLOCK = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+    const index_t first_out_elem = elem_block * detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
     const index_t last_out_elem = cuda::std::min(
         output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1);
 
     // Versions of CUDA prior to 11.8 do not allow static shared memory allocations of
     // cuda::std::complex types due to it having no trivial constructor. This workaround
     // prevents an 'initializer not allowed for __shared__ variable' error.
-    __align__(sizeof(complex_accum_t)) __shared__ uint8_t smem_eij_workaround[sizeof(complex_accum_t)*NUM_CHAN*NUM_CHAN];
+    __shared__ __align__(16) uint8_t smem_eij_workaround[sizeof(complex_accum_t)*NUM_CHAN*NUM_CHAN];
     complex_accum_t (&smem_eij)[NUM_CHAN][NUM_CHAN] = reinterpret_cast<complex_accum_t (&)[NUM_CHAN][NUM_CHAN]>(smem_eij_workaround);
     // Pre-compute the DFT complex exponentials and store in shared memory
     for (int t = tid; t < NUM_CHAN*NUM_CHAN; t += THREADS) {
@@ -440,14 +925,15 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
             for (int chan = 0; chan < NUM_CHAN; chan++) {
                 const filter_t h_val = (h_ind < filter_full_len) ? filter.operator()(h_ind) : static_cast<filter_t>(0);
                 if (indims[InRank-1] < input_len) {
-                    cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) {                    
-                        accum[chan] += detail::channelize_cast_filter<filtering_accum_t>(h_val) *
-                                       detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...));
+                    cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) {
+                        detail::channelize_cmac(accum[chan],
+                                detail::channelize_cast_filter<filtering_accum_t>(h_val),
+                                detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...)));
                     }, indims);
                 }
                 h_ind++;
                 indims[InRank-1]--;
-            }        
+            }
         }
         niter--;
 
@@ -456,8 +942,9 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
             for (int chan = 0; chan < NUM_CHAN; chan++) {
                 const filter_t h_val = filter.operator()(h_ind);
                 cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) {                    
-                    accum[chan] += detail::channelize_cast_filter<filtering_accum_t>(h_val) *
-                                   detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...));
+                    detail::channelize_cmac(accum[chan],
+                            detail::channelize_cast_filter<filtering_accum_t>(h_val),
+                            detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...)));
                 }, indims);
                 h_ind++;
                 indims[InRank-1]--;
@@ -473,8 +960,9 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT
                 // const filter_t h_val = (h_ind < filter_full_len) ? filter.operator()(h_ind) : static_cast<filter_t>(0);
                 const filter_t h_val = filter.operator()(h_ind);
                 cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) {                    
-                    accum[chan] += detail::channelize_cast_filter<filtering_accum_t>(h_val) *
-                                   detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...));
+                    detail::channelize_cmac(accum[chan],
+                            detail::channelize_cast_filter<filtering_accum_t>(h_val),
+                            detail::channelize_cast_input<filtering_accum_t>(input.operator()(args...)));
                 }, indims);
                 h_ind++;
                 indims[InRank-1]--;
diff --git a/include/matx/operators/channelize_poly.h b/include/matx/operators/channelize_poly.h
index da18bd35..b88f1200 100644
--- a/include/matx/operators/channelize_poly.h
+++ b/include/matx/operators/channelize_poly.h
@@ -78,7 +78,7 @@ namespace detail {
           a_(a), f_(f), num_channels_(num_channels), decimation_factor_(decimation_factor)
       {
         MATX_LOG_TRACE("{} constructor: num_channels={}, decimation_factor={}", str(), num_channels, decimation_factor); 
-        const index_t b_len = (a_.Size(OpA::Rank() - 1) + num_channels - 1) / num_channels;
+        const index_t b_len = (a_.Size(OpA::Rank() - 1) + decimation_factor - 1) / decimation_factor;
 
         for (int r = 0; r < OpA::Rank()-1; r++) {
           out_dims_[r] = a_.Size(r);
@@ -201,16 +201,56 @@ namespace detail {
  * is assumed to contain a single input signal with all preceding dimensions being batch dimensions
  * that are channelized independently.
  * @param f Filter operator that represents the filter coefficients. This must be a 1D tensor.
- * @param num_channels Number of channels to create.
- * @param decimation_factor Factor by which to downsample the input signal into the channels. Currently,
- * the only supported value of decimation_factor is a value equal to num_channels. This corresponds to
- * the maximally decimated, or critically sampled, case. It is also possible for decimation_factor to
- * be less than num_channels, which corresponds to an oversampled case with overlapping channels, but
- * this implementation does not yet support oversampled cases.
+ * @param num_channels Number of channels (or frequency sub-bands) to create.
+ * @param decimation_factor Factor by which to downsample the input signal into the channels. When
+ * decimation_factor equals num_channels, this is the maximally decimated (critically sampled) case.
+ * When decimation_factor is less than num_channels, this is the oversampled case with overlapping
+ * channels. Both integer and rational oversampling ratios are supported.
  * 
  * @returns Operator representing the channelized signal. The output tensor rank is one higher than the
  * input tensor rank. The first Rank-2 dimensions are all batch dimensions. The second-to-last dimension
- * is the sample dimension and the last dimension is the channel dimension.
+ * is the sample dimension and the last dimension is the channel dimension. The per-channel output
+ * size is (in.Size(InType::Rank()-1) + decimation_factor - 1) / decimation_factor.
+ *
+ * Let the letters M and D denote the number of channels and the decimation factor,
+ * respectively. The case where M == D is the maximally decimated (critically sampled) case. In this case,
+ * the channelizer generates M output samples (one per channel) for each M input samples. The case where
+ * D < M is the oversampled case. In this case, the channelizer generates M output samples (one per channel)
+ * for each D input samples. Thus, the total output sample count is roughly M/D times the input sample count.
+ *
+ * Logically, the polyphase channelizer delivers samples to a set of M commutator branches,
+ * each representing a channel or frequency sub-band. For each output step, the samples in that commutator branch are convolved
+ * with a phase of the prototype filter. The outputs of these convolutions are then the inputs to an M-point
+ * IFFT. The results obtained depend on the order in which samples are delivered to the commutator branches
+ * and the filter phase used per branch for each output.
+ *
+ * In MatX, samples are always delivered to the commutator branches in counter-clockwise order. In the
+ * maximally decimated case, the sample-to-branch order is M-1, M-2, ..., 0 as in the
+ * paper by Harris et al [1]. The filter phases are fixed per channel, so channel 0 corresponds to phase 0,
+ * channel 1 corresponds to phase 1, etc. If comparing to another implementation that starts with a different
+ * commutator branch (say, branch 0) and then proceeds in the same order (counter-clockwise), one can either prime the
+ * other channelizer with a single zero input (discarding the outputs, if M outputs are generated after the
+ * first sample) or prime the MatX channelizer with an appropriate number of zeros (e.g., M-1 to match an
+ * implementation that starts at branch 0) to obtain equivalent results.
+ *
+ * The oversampled case, where D < M, is more complicated. The order
+ * in which samples are delivered to the commutator branches is still counter-clockwise, but the samples
+ * are logically delivered to branches D-1, D-2, ..., 0 for each set of D inputs. With each new set of D
+ * inputs, the previous samples are logically shifted through the 2D filter bank. As an implementation detail,
+ * rather than shifting samples, we rotate the filter phases applied to a given commutator branch. Each
+ * branch rotates through a set of K = M / gcd(M, D) phases, where gcd(M, D) is the greatest common divisor
+ * of M and D. Note that the maximally decimated case is a special case of the oversampled case where samples
+ * shift through the 2D filter bank in groups of M, thus maintaining the same filter phase for each channel/branch.
+ *
+ * Comparing the oversampled case to another channelizer with a different convention can be difficult due
+ * to the rotation of the filter phases. For example, for a channelizer that starts at branch 0 and delivers
+ * M outputs after the first sample, that first sample will effectively rotate the filter phases as a result
+ * of generating a set of M outputs. This is unlike the maximally decimated case where the filter phases are
+ * fixed per channel, so we cannot simply prime the other channelizer with a single zero input to obtain equivalent
+ * results. Instead, we need to provide enough zero inputs to the other channelizer so that the two channelizers
+ * will start with the same filter phases. For a channelizer that starts at branch 0 and delivers M outputs
+ * after a single sample, proceeding with the D-1, D-2, ..., 0 order thereafter, priming requires
+ * 1 + (K - 1) * D zero samples, where K = M / gcd(M, D) as before.
  *
  * The polyphase channelizer supports the following properties:
  * - PropAccum: Type of accumulator. This type should always be real, but it will be promoted to
@@ -219,7 +259,12 @@ namespace detail {
  *
  * The default accumulator type is the output type, but the output type is context-dependent. The
  * PropOutput property allows the user to explicitly set the output type and the PropAccum property
- * allows the user to explicitly set the accumulator type.
+ * allows the user to explicitly set the accumulator type. See the examples section for an example
+ * of the property syntax.
+ *
+ * [1] "Digital Receivers and Transmitters Using Polyphase Filter Banks for Wireless Communications",
+ * F. J. Harris, C. Dick, M. Rice, IEEE Transactions on Microwave Theory and Techniques,
+ * Vol. 51, No. 4, Apr. 2003.
  */
 template <typename InType, typename FilterType>
 inline auto channelize_poly(const InType &in, const FilterType &f, index_t num_channels, index_t decimation_factor) {
diff --git a/include/matx/transforms/channelize_poly.h b/include/matx/transforms/channelize_poly.h
index ece38718..70210bb4 100644
--- a/include/matx/transforms/channelize_poly.h
+++ b/include/matx/transforms/channelize_poly.h
@@ -35,6 +35,7 @@
 #include <cstdint>
 #include <cstdio>
 #include <type_traits>
+#include <numeric>
 
 #include "matx/core/error.h"
 #include "matx/core/nvtx.h"
@@ -58,9 +59,168 @@ constexpr index_t MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD = 6;
 // to balance occupancy and CTA size. For now, we choose a reasonable default.
 constexpr index_t MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER = 4;
 
+// Maximum dynamic shared memory (bytes) for caching filter taps in ChannelizePoly1D.
+constexpr size_t MATX_CHANNELIZE_POLY1D_MAX_FILTER_SMEM_BYTES = 6 * 1024;
+
+// Constants for the SmemTiled kernel.
+constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE = 64;
+constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT  = 4;
+constexpr size_t MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_BYTES = 48 * 1024;
+// Maximum filter smem budget. When the filter exceeds this, it is read from
+// global/L2 instead, freeing smem for better occupancy.
+constexpr size_t MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES = 2048;
+
+// Compute the shared memory footprint for the SmemTiled filter taps.
+// For D==M: one phase per tile channel → CTILE * P elements.
+// For D<M: K phases per tile channel → CTILE * K * P elements.
+// Returns a value exceeding the filter budget when K exceeds the rotation
+// limit, which causes FilterInSmem=false in the dispatch.
+template <typename FilterType>
+inline size_t matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes(
+    index_t num_channels, index_t filter_len, index_t decimation_factor)
+{
+  using filter_t = typename FilterType::value_type;
+  constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE;
+  const index_t P = (filter_len + num_channels - 1) / num_channels;
+  if (decimation_factor == num_channels) {
+    return static_cast<size_t>(CTILE) * P * sizeof(filter_t);
+  }
+  const index_t gcd_val = std::gcd(num_channels, decimation_factor);
+  const index_t K = num_channels / gcd_val;
+  if (K > detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS) {
+    return MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES + 1; // exceeds budget, so FilterInSmem=false
+  }
+  return static_cast<size_t>(CTILE) * K * P * sizeof(filter_t);
+}
+
+template <typename OutType, typename InType, typename FilterType>
+inline size_t matxChannelizePoly1DInternal_SmemTiledSizeBytes(
+    const OutType &o, const InType &, const FilterType &filter, index_t decimation_factor)
+{
+  using input_t  = typename InType::value_type;
+
+  constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE;
+  constexpr int NOUT  = MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT;
+
+  const index_t M = o.Size(OutType::Rank() - 1);
+  const index_t filter_len = filter.Size(FilterType::Rank() - 1);
+  const index_t P = (filter_len + M - 1) / M;
+  const size_t input_smem = static_cast<size_t>(P + NOUT - 1) * CTILE * sizeof(input_t);
+
+  const size_t filter_smem = matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes<FilterType>(M, filter_len, decimation_factor);
+  if (filter_smem > MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES) {
+    return input_smem;
+  }
+
+  const size_t filter_smem_aligned = filter_smem +
+      ((filter_smem % sizeof(input_t)) ? (sizeof(input_t) - filter_smem % sizeof(input_t)) : 0);
+  return filter_smem_aligned + input_smem;
+}
+
+template <typename OutType, typename InType, typename FilterType>
+inline bool matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel(
+    const OutType &o, const InType &in, const FilterType &filter, index_t decimation_factor)
+{
+  const index_t num_channels = o.Size(OutType::Rank() - 1);
+  // Skip SmemTiled for small channel counts where most of CTILE would be
+  // idle threads. The generic ChannelizePoly1D kernel is more efficient
+  // in this regime.
+  if (num_channels <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE * 3 / 4) {
+    return false;
+  }
+  // The input circular buffer must fit in smem. The filter may or may not
+  // be included (FilterInSmem is decided separately).
+  return matxChannelizePoly1DInternal_SmemTiledSizeBytes(o, in, filter, decimation_factor)
+      <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_BYTES;
+}
+
+template <typename OutType, typename InType, typename FilterType, typename AccumType>
+inline void matxChannelizePoly1DInternal_SmemTiled(
+    OutType o, const InType &i, const FilterType &filter,
+    index_t decimation_factor, cudaStream_t stream)
+{
+#ifdef __CUDACC__
+  MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
+
+  constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE;
+  constexpr int NOUT  = MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT;
+
+  const index_t num_channels = o.Size(OutType::Rank() - 1);
+  const index_t nout_per_channel = o.Size(OutType::Rank() - 2);
+  const int num_batches = static_cast<int>(TotalSize(i) / i.Size(i.Rank() - 1));
+
+  const index_t gcd_val = std::gcd(num_channels, decimation_factor);
+  const int32_t K = static_cast<int32_t>(num_channels / gcd_val);
+
+  const int channel_tiles = static_cast<int>((num_channels + CTILE - 1) / CTILE);
+  // Target ~1024 spatial blocks (time × channel tiles) to saturate the GPU.
+  // For large channel counts, fewer time blocks are needed.
+  const int target_time_blocks = cuda::std::max(1, (1024 + channel_tiles - 1) / channel_tiles);
+  const int elem_per_block = static_cast<int>(
+      (nout_per_channel + target_time_blocks - 1) / target_time_blocks);
+  const int time_blocks = static_cast<int>(
+      (nout_per_channel + elem_per_block - 1) / elem_per_block);
+
+  dim3 block(CTILE, NOUT);
+  dim3 grid(time_blocks, channel_tiles, num_batches);
+
+  const index_t filter_len = filter.Size(FilterType::Rank() - 1);
+  const bool filter_in_smem = matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes<FilterType>(
+      num_channels, filter_len, decimation_factor) <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES;
+  const size_t smem_size = matxChannelizePoly1DInternal_SmemTiledSizeBytes(o, i, filter, decimation_factor);
+
+  // Use int32_t for intra-kernel index arithmetic when all tensor dimensions
+  // fit, avoiding 64-bit IMAD.WIDE instructions in the inner loops.
+  const index_t input_len = i.Size(i.Rank() - 1);
+  const bool use_32bit = (sizeof(index_t) <= sizeof(int32_t)) ||
+      (static_cast<int64_t>(input_len) + num_channels <= std::numeric_limits<int32_t>::max() &&
+       nout_per_channel <= std::numeric_limits<int32_t>::max() &&
+       num_channels <= std::numeric_limits<int32_t>::max());
+
+  // Dispatch on MaximallyDecimated x FilterInSmem x IndexType
+  constexpr bool kMaxDec  = true;
+  constexpr bool kOversampled = false;
+  constexpr bool kFiltSmem = true;
+  constexpr bool kFiltGlobal = false;
+
+  auto launch = [&](auto idx_tag) {
+    using IdxT = decltype(idx_tag);
+    const IdxT epb = static_cast<IdxT>(elem_per_block);
+    const IdxT df  = static_cast<IdxT>(decimation_factor);
+    if (decimation_factor == num_channels) {
+      if (filter_in_smem) {
+        ChannelizePoly1D_SmemTiled<CTILE, NOUT, kMaxDec, kFiltSmem, IdxT, OutType, InType, FilterType, AccumType>
+            <<<grid, block, smem_size, stream>>>(o, i, filter, epb, df, K);
+      } else {
+        ChannelizePoly1D_SmemTiled<CTILE, NOUT, kMaxDec, kFiltGlobal, IdxT, OutType, InType, FilterType, AccumType>
+            <<<grid, block, smem_size, stream>>>(o, i, filter, epb, df, K);
+      }
+    } else {
+      if (filter_in_smem) {
+        ChannelizePoly1D_SmemTiled<CTILE, NOUT, kOversampled, kFiltSmem, IdxT, OutType, InType, FilterType, AccumType>
+            <<<grid, block, smem_size, stream>>>(o, i, filter, epb, df, K);
+      } else {
+        ChannelizePoly1D_SmemTiled<CTILE, NOUT, kOversampled, kFiltGlobal, IdxT, OutType, InType, FilterType, AccumType>
+            <<<grid, block, smem_size, stream>>>(o, i, filter, epb, df, K);
+      }
+    }
+  };
+
+  if constexpr (sizeof(index_t) <= sizeof(int32_t)) {
+    launch(index_t{});
+  } else {
+    if (use_32bit) {
+      launch(int32_t{});
+    } else {
+      launch(index_t{});
+    }
+  }
+#endif
+}
+
 template <typename OutType, typename InType, typename FilterType, typename AccumType>
 inline void matxChannelizePoly1DInternal(OutType o, const InType &i,
-                                     const FilterType &filter, cudaStream_t stream)
+                                     const FilterType &filter, index_t decimation_factor, cudaStream_t stream)
 {
 #ifdef __CUDACC__
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
@@ -69,13 +229,26 @@ inline void matxChannelizePoly1DInternal(OutType o, const InType &i,
   const index_t nout_per_channel = o.Size(OutType::Rank()-2);
   const int num_batches = static_cast<int>(TotalSize(i)/i.Size(i.Rank() - 1));
 
+  using filter_t = typename FilterType::value_type;
+  const index_t filter_len = filter.Size(FilterType::Rank()-1);
+
   const int THREADS = 256;
-  const index_t ELTS_PER_THREAD = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+  const index_t ELTS_PER_THREAD = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
   const int elem_blocks = static_cast<int>(
     (nout_per_channel + ELTS_PER_THREAD - 1) / ELTS_PER_THREAD);
   dim3 grid(elem_blocks, static_cast<int>(num_channels), num_batches);
-  ChannelizePoly1D<THREADS, OutType, InType, FilterType, AccumType><<<grid, THREADS, 0, stream>>>(
-      o, i, filter);
+  if (decimation_factor == num_channels) {
+    // For M == D, cache one filter phase in dynamic shared memory if it fits.
+    const index_t filter_phase_len = (filter_len + num_channels - 1) / num_channels;
+    const size_t smem_needed = static_cast<size_t>(filter_phase_len) * sizeof(filter_t);
+    const uint32_t smem_bytes = (smem_needed <= MATX_CHANNELIZE_POLY1D_MAX_FILTER_SMEM_BYTES)
+        ? static_cast<uint32_t>(smem_needed) : 0;
+    ChannelizePoly1D<THREADS, true, OutType, InType, FilterType, AccumType><<<grid, THREADS, smem_bytes, stream>>>(
+        o, i, filter, decimation_factor, smem_bytes);
+  } else {
+    ChannelizePoly1D<THREADS, false, OutType, InType, FilterType, AccumType><<<grid, THREADS, 0, stream>>>(
+        o, i, filter, decimation_factor, 0);
+  }
 #endif
 }
 
@@ -149,7 +322,7 @@ inline void matxChannelizePoly1DInternal_FusedChan(OutType o, const InType &i,
   const int num_batches = static_cast<int>(TotalSize(i)/i.Size(i.Rank() - 1));
 
   const int THREADS = 256;
-  const index_t ELTS_PER_THREAD = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
+  const index_t ELTS_PER_THREAD = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS;
   const int elem_blocks = static_cast<int>(
     (nout_per_channel + ELTS_PER_THREAD - 1) / ELTS_PER_THREAD);
   dim3 grid(elem_blocks, 1, num_batches);
@@ -195,9 +368,9 @@ inline void matxChannelizePoly1DUnpackInternal(DataType inout, cudaStream_t stre
 
 /**
  * @brief 1D polyphase channelizer. A channelizer separates an input signal into a set of
- * constituent channels, each corresponding to a band of the input signal bandwidth. The current
- * implementation only supports maximally decimated (i.e., critically sampled) channelizers wherein the
- * decimation factor is equivalent to the number of channels and the channels are non-overlapping.
+ * constituent channels, each corresponding to a band of the input signal bandwidth. Supports both
+ * maximally decimated (critically sampled, decimation_factor == num_channels) and oversampled
+ * (decimation_factor < num_channels) cases, including rational oversampling ratios.
  * 
  * @tparam OutType Type of output
  * @tparam InType Type of input
@@ -208,16 +381,16 @@ inline void matxChannelizePoly1DUnpackInternal(DataType inout, cudaStream_t stre
  * @param in Input operator
  * @param f Filter operator
  * @param num_channels Number of channels in which to separate the signal. Must be greater than 1.
- * @param decimation_factor Factor by which to downsample the input signal into the channels. Currently,
- * the only supported value of decimation_factor is a value equal to num_channels. This corresponds to
- * the maximally decimated, or critically sampled, case. It is also possible for decimation_factor to
- * be less than num_channels, which corresponds to an oversampled case with overlapping channels, but
- * this implementation does not yet support oversampled cases.
+ * @param decimation_factor Factor by which to downsample the input signal into the channels. When
+ * decimation_factor equals num_channels, this is the maximally decimated (critically sampled) case.
+ * When decimation_factor is less than num_channels, this is the oversampled case with overlapping
+ * channels. Both integer (num_channels % decimation_factor == 0) and rational oversampling ratios
+ * are supported.
  * @param stream CUDA stream on which to run the kernel(s)
  */
 template <typename OutType, typename InType, typename FilterType, typename AccumType>
 inline void channelize_poly_impl(OutType out, const InType &in, const FilterType &f,
-                   index_t num_channels, [[maybe_unused]] index_t decimation_factor, cudaStream_t stream = 0) {
+                   index_t num_channels, index_t decimation_factor, cudaStream_t stream = 0) {
   MATX_NVTX_START("", matx::MATX_NVTX_LOG_API)
   using OutputOp = std::remove_cv_t<std::remove_reference_t<OutType>>;
   using InputOp = std::remove_cv_t<std::remove_reference_t<InType>>;
@@ -243,14 +416,14 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
     "channelize_poly: num_channels must be greater than 1");
   MATX_ASSERT_STR(decimation_factor > 0, matxInvalidParameter,
     "channelize_poly: decimation_factor must be positive");
-  MATX_ASSERT_STR(num_channels == decimation_factor, matxInvalidParameter,
-    "channelize_poly: currently only support decimation_factor == num_channels");
+  MATX_ASSERT_STR(decimation_factor <= num_channels, matxInvalidParameter,
+    "channelize_poly: decimation_factor must be <= num_channels");
 
   for(int i = 0 ; i < IN_RANK-1; i++) {
     MATX_ASSERT_STR(out.Size(i) == in.Size(i), matxInvalidDim, "channelize_poly: input/output must have matched batch sizes");
   }
 
-  [[maybe_unused]] const index_t num_elem_per_channel = (in.Size(IN_RANK-1) + num_channels - 1) / num_channels;
+  [[maybe_unused]] const index_t num_elem_per_channel = (in.Size(IN_RANK-1) + decimation_factor - 1) / decimation_factor;
 
   MATX_ASSERT_STR(out.Size(OUT_RANK-1) == num_channels, matxInvalidDim,
     "channelize_poly: output size OUT_RANK-1 mismatch");
@@ -260,7 +433,8 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
   // If neither the input nor the filter is complex, then the filtered samples will be real-valued
   // and we will use an R2C transform. Otherwise, we will use a C2C transform.
   if constexpr (! is_complex_v<input_t> && ! is_complex_half_v<input_t> && ! is_complex_v<filter_t> && ! is_complex_half_v<filter_t>) {
-    if (num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) {
+    // The fused-DFT kernel only supports the maximally decimated case (D == M).
+    if (decimation_factor == num_channels && num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) {
       matxChannelizePoly1DInternal_FusedChan<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, stream);
     } else {
       index_t start_dims[OUT_RANK], stop_dims[OUT_RANK];
@@ -300,10 +474,12 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
         }
       }();
 
-      if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
+      if (decimation_factor == num_channels && matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
         matxChannelizePoly1DInternal_Smem<decltype(fft_in_slice), InputOp, FilterOp, AccumType>(fft_in_slice, in, f, stream);
+      } else if (matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel(out, in, f, decimation_factor)) {
+        matxChannelizePoly1DInternal_SmemTiled<decltype(fft_in_slice), InputOp, FilterOp, AccumType>(fft_in_slice, in, f, decimation_factor, stream);
       } else {
-        matxChannelizePoly1DInternal<decltype(fft_in_slice), InputOp, FilterOp, AccumType>(fft_in_slice, in, f, stream);
+        matxChannelizePoly1DInternal<decltype(fft_in_slice), InputOp, FilterOp, AccumType>(fft_in_slice, in, f, decimation_factor, stream);
       }
       stop_dims[OUT_RANK-1] = (num_channels/2) + 1;
       auto out_packed = slice<OUT_RANK>(out, start_dims, stop_dims);
@@ -311,13 +487,16 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType
       matxChannelizePoly1DUnpackInternal(out, stream);
     }
   } else {
-    if (num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) {
+    // The fused-DFT kernel only supports the maximally decimated case (D == M).
+    if (decimation_factor == num_channels && num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) {
       matxChannelizePoly1DInternal_FusedChan<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, stream);
     } else {
-      if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
+      if (decimation_factor == num_channels && matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) {
         matxChannelizePoly1DInternal_Smem<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, stream);
+      } else if (matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel(out, in, f, decimation_factor)) {
+        matxChannelizePoly1DInternal_SmemTiled<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, decimation_factor, stream);
       } else {
-        matxChannelizePoly1DInternal<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, stream);
+        matxChannelizePoly1DInternal<OutputOp, InputOp, FilterOp, AccumType>(out, in, f, decimation_factor, stream);
       }
       // Specify FORWARD here to prevent any normalization after the ifft. We do not
       // want any extra scaling on the output values.
diff --git a/test/00_transform/ChannelizePoly.cu b/test/00_transform/ChannelizePoly.cu
index 0ef0611c..f3cb4a7e 100644
--- a/test/00_transform/ChannelizePoly.cu
+++ b/test/00_transform/ChannelizePoly.cu
@@ -153,11 +153,10 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple)
     const index_t a_len = test_cases[i].a_len;
     const index_t f_len = test_cases[i].f_len;
     const index_t num_channels = test_cases[i].num_channels;
-    // Currently do not support oversampled channelization
     const index_t decimation_factor = num_channels;
     [[maybe_unused]] const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels;
     this->pb->template InitAndRunTVGenerator<TestType>(
-      "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels});
+      "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
 
     auto a = make_tensor<TestType>({a_len});
     auto f = make_tensor<TestType>({f_len});
@@ -201,7 +200,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, MixedPrecision)
   const double mixed_thresh = 1e-5;
 
   this->pb->template InitAndRunTVGenerator<double>(
-    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels});
+    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
   auto a64 = make_tensor<double>({a_len});
   auto f64 = make_tensor<double>({f_len});
   this->pb->NumpyToTensorView(a64, "a");
@@ -228,7 +227,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, MixedPrecision)
   }
 
   this->pb->template InitAndRunTVGenerator<cuda::std::complex<double>>(
-    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels});
+    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
   auto ac64 = make_tensor<cuda::std::complex<double>>({a_len});
   this->pb->NumpyToTensorView(ac64, "a");
   this->pb->NumpyToTensorView(f64, "filter_random_real");
@@ -274,7 +273,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, AccumProperty)
   const double mixed_thresh_complex_input = 1e-4;
 
   this->pb->template InitAndRunTVGenerator<double>(
-    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels});
+    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
   auto a64 = make_tensor<double>({a_len});
   auto f64 = make_tensor<double>({f_len});
   auto gold_output_hreal = make_tensor<cuda::std::complex<double>>({b_len_per_channel, num_channels});
@@ -323,7 +322,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, AccumProperty)
   }
 
   this->pb->template InitAndRunTVGenerator<cuda::std::complex<double>>(
-    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels});
+    "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
   auto ac64 = make_tensor<cuda::std::complex<double>>({a_len});
   this->pb->NumpyToTensorView(ac64, "a");
   this->pb->NumpyToTensorView(f64, "filter_random_real");
@@ -430,10 +429,9 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched)
     const index_t a_len = test_cases[i].a_len;
     const index_t f_len = test_cases[i].f_len;
     const index_t num_channels = test_cases[i].num_channels;
-    // Currently do not support oversampled channelization
     const index_t decimation_factor = num_channels;
     const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels;
-    std::vector<index_t> sizes = { a_len, f_len, num_channels };
+    std::vector<index_t> sizes = { a_len, f_len, num_channels, decimation_factor };
     sizes.insert(sizes.end(), test_cases[i].batch_dims.begin(), test_cases[i].batch_dims.end());
     this->pb->template InitAndRunTVGenerator<TestType>(
       "00_transforms", "channelize_poly_operators", "channelize", sizes);
@@ -485,7 +483,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, IdentityFilter)
   // cases (a single channel channelizer is just a convolution).
   const index_t num_channels = 2;
   this->pb->template InitAndRunTVGenerator<TestType>(
-    "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels });
+    "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels, num_channels });
 
   auto a = make_tensor<TestType>({a_len});
   auto f = make_tensor<TestType>({f_len});
@@ -536,7 +534,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators)
   [[maybe_unused]] const index_t f_len = 90;
   const index_t num_channels = 10;
   this->pb->template InitAndRunTVGenerator<TestType>(
-    "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels });
+    "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels, num_channels });
 
   auto a = make_tensor<TestType>({a_len});
   auto f = make_tensor<TestType>({f_len});
@@ -841,4 +839,594 @@ TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003)
   }
 
   MATX_EXIT_HANDLER();
-}
\ No newline at end of file
+}
+
+// Oversampled channelizer test based on Harris 2003 receiver_40z.m.
+// M=40, D=28, 600-tap filter designed with remez. Golden input and output
+// are from the reference implementation. This test validates the oversampled
+// phase rotation convention.
+TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003Oversampled)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = cuda::std::complex<TestType>;
+
+  const index_t input_len = 5600;
+  const index_t M = 40;
+  const index_t D = 28;
+  const index_t num_output = (input_len + D - 1) / D;
+
+  this->pb->template InitAndRunTVGenerator<ComplexType>(
+    "00_transforms", "harris2003_oversampled_operators", "channelize",
+    {input_len});
+
+  auto a = make_tensor<ComplexType>({input_len});
+  auto f = make_tensor<TestType>({600});
+  auto b = make_tensor<ComplexType>({num_output, M});
+
+  this->pb->NumpyToTensorView(a, "a");
+  this->pb->NumpyToTensorView(f, "filter");
+  (b = channelize_poly(a, f, M, D)).run(this->exec);
+
+  this->exec.sync();
+  MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_golden", this->thresh);
+
+  MATX_EXIT_HANDLER();
+}
+
+// Oversampled identity filter test: validates the causal commutator behavior.
+// With a single-tap identity filter (E[r,0] = 1 for all r), the filtering step
+// simply selects one input sample per branch. The causal commutator determines
+// which sample each branch sees at each output time:
+//   - Sample k goes to branch (M-1 - k%M)
+//   - At output time n, only samples x[0..n*D+D-1] have arrived
+//   - Branch r's newest sample: largest k <= n*D+D-1 where k%M == M-1-r
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledIdentityFilter)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using InnerType = typename test_types::inner_type<TestType>::type;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    { 20, 4, 2 },   // 2x oversampled, M=4
+    { 20, 4, 1 },   // 4x oversampled, M=4
+    { 30, 10, 3 },  // rational oversample, M=10 D=3
+    { 20, 4, 3 },   // rational oversample, M=4 D=3
+    { 20, 2, 1 },   // 2x oversampled, M=2
+    {  2, 4, 4 },   // maximally decimated, a_len < M
+    {  3, 4, 4 },   // maximally decimated, a_len < M
+    {  1, 4, 2 },   // oversampled, a_len < M
+    {  3, 4, 2 },   // oversampled, a_len < M
+    {  2, 6, 3 },   // oversampled rational, a_len < M
+  };
+
+  for (size_t tc = 0; tc < sizeof(test_cases)/sizeof(test_cases[0]); tc++) {
+    const index_t a_len = test_cases[tc].a_len;
+    const index_t M = test_cases[tc].num_channels;
+    const index_t D = test_cases[tc].decimation_factor;
+    const index_t f_len = M;  // identity filter: one tap per phase
+    const index_t num_output = (a_len + D - 1) / D;
+
+    // Use the pybind generator only for random input data and not for the expected output
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, M, D});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({num_output, M});
+
+    this->pb->NumpyToTensorView(a, "a");
+
+    // Identity filter: h = [1, 1, ..., 1] of length M
+    // Polyphase decomposition gives E[r, 0] = 1 for all r (single tap per phase)
+    for (index_t i = 0; i < M; i++) { f(i) = static_cast<TestType>(1.0); }
+
+    this->exec.sync();
+
+    (b = channelize_poly(a, f, M, D)).run(this->exec);
+
+    this->exec.sync();
+
+    // Compute expected output using the causal commutator model.
+    // With identity filter, g_r[n] = x[newest_r] if branch r has a sample, else 0.
+    // Then Y[n, :] = M * IFFT(g[n, :])
+    for (index_t n = 0; n < num_output; n++) {
+      const index_t last_arrived = n * D + D - 1;
+
+      // Compute the IDFT (with positive exponential) of the
+      // causal filtered values. This is M * IFFT.
+      for (index_t chan = 0; chan < M; chan++) {
+        ComplexType accum { 0 };
+        for (index_t r = 0; r < M; r++) {
+          // Branch remap for Harris convention input mapping (D < M only)
+          const index_t r_remapped = (D < M) ? ((r + M - D) % M) : r;
+          const index_t s = M - 1 - r_remapped;  // remapped for input access
+
+          // Causal commutator: find newest sample for branch r
+          TestType sample_val = static_cast<TestType>(0.0);
+          if (last_arrived >= s) {
+            const index_t newest = last_arrived - ((last_arrived - s) % M);
+            if (newest < a_len) {
+              sample_val = a(newest);
+            }
+          }
+
+          // IDFT with positive exponential (Harris convention)
+          const double arg = 2.0 * M_PI * static_cast<double>(r) * static_cast<double>(chan) / static_cast<double>(M);
+          double sinx, cosx;
+          sincos(arg, &sinx, &cosx);
+          ComplexType twiddle { static_cast<InnerType>(cosx), static_cast<InnerType>(sinx) };
+          accum += sample_val * twiddle;
+        }
+
+        ComplexType bval { b(n, chan) };
+        ASSERT_NEAR(accum.real(), bval.real(), this->thresh)
+          << "mismatch at n=" << n << " chan=" << chan
+          << " M=" << M << " D=" << D << " a_len=" << a_len;
+        ASSERT_NEAR(accum.imag(), bval.imag(), this->thresh)
+          << "mismatch at n=" << n << " chan=" << chan
+          << " M=" << M << " D=" << D << " a_len=" << a_len;
+      }
+    }
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// Oversampled channelizer tests: integer oversampling (D divides M evenly)
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledInteger)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // 2x oversampling (D = M/2)
+    { 2500, 170, 10, 5 },
+    { 2500, 187, 10, 5 },
+    { 1800, 120, 8, 4 },
+    { 37193, 41*8+4, 8, 4 },
+    // 5x oversampling (D = M/5)
+    { 2500, 170, 10, 2 },
+    // Large channel count
+    { 35000, 5*180, 180, 90 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// Oversampled channelizer tests: rational oversampling (D does not divide M evenly)
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledRational)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // M=10, D=3 (rational 10/3 oversampling)
+    { 2500, 170, 10, 3 },
+    { 2500, 187, 10, 3 },
+    // M=10, D=7
+    { 2500, 170, 10, 7 },
+    // M=9, D=4
+    { 1800, 120, 9, 4 },
+    // M=8, D=3
+    { 37193, 41*8+4, 8, 3 },
+    // M=11, D=4
+    { 2500, 187, 11, 4 },
+    // Large channel count, rational
+    { 35000, 5*181+17, 181, 60 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// Oversampled channelizer with batched inputs
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledBatched)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  constexpr int BATCH_DIMS = 2;
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+    cuda::std::array<index_t, BATCH_DIMS> batch_dims;
+  } test_cases[] = {
+    // Integer oversampling, batched
+    { 2500, 170, 10, 5, { 2, 3 } },
+    // Rational oversampling, batched
+    { 2500, 170, 10, 3, { 2, 3 } },
+    { 37193, 41*8+4, 8, 3, { 3, 1 } },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    std::vector<index_t> sizes = { a_len, f_len, num_channels, decimation_factor };
+    sizes.insert(sizes.end(), test_cases[i].batch_dims.begin(), test_cases[i].batch_dims.end());
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled", sizes);
+    const cuda::std::array<index_t, BATCH_DIMS+1> a_dims = {
+      test_cases[i].batch_dims[0], test_cases[i].batch_dims[1], a_len
+    };
+    const cuda::std::array<index_t, BATCH_DIMS+2> b_dims = {
+      test_cases[i].batch_dims[0], test_cases[i].batch_dims[1], b_len_per_channel, num_channels
+    };
+    auto a = make_tensor<TestType>(a_dims);
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>(b_dims);
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// SmemTiled kernel tests: maximally decimated with M > 256 (forces SmemTiled
+// because the existing _Smem kernel requires M <= 256)
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledMaximallyDecimated)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+  } test_cases[] = {
+    // M=512, P=10 — exercises MaximallyDecimated SmemTiled with multiple channel tiles
+    { 256000, 10*512, 512 },
+    // M=512, unaligned filter (not a multiple of M)
+    { 256000, 10*512+37, 512 },
+    // M=1024, P=4 — large channel count, small P
+    { 102400, 4*1024, 1024 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = num_channels;
+    const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// SmemTiled kernel tests: large-M oversampled cases that exercise multi-tile
+// channel tiling in the oversampled path
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledLargeM)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // M=1024, D=512, 2x integer oversampling, K=2
+    { 256000, 8*1024, 1024, 512 },
+    // M=512, D=256, 2x integer oversampling, K=2
+    { 128000, 10*512, 512, 256 },
+    // M=512, D=256, unaligned filter
+    { 128000, 10*512+19, 512, 256 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// SmemTiled kernel tests: rational oversampling with large K (many distinct
+// filter phases) to stress the K-phase filter storage
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledLargeK)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // M=128, D=97, K=128 (gcd=1, all phases distinct)
+    { 25600, 5*128, 128, 97 },
+    // M=64, D=37, K=64 (gcd=1)
+    { 12800, 8*64, 64, 37 },
+    // M=100, D=30, K=10 (gcd=10), moderate K
+    { 20000, 7*100, 100, 30 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// SmemTiled oversampled with FilterInSmem=true.
+// Requires CTILE * K * P * sizeof(filter_t) <= 2048.
+// With K=2 (2x integer oversample) and small P, the filter fits in smem.
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledFilterInSmem)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // M=1024, D=512, K=2, P=4: filter_smem = 64*2*4*sizeof(filter_t) = 2048 bytes (float)
+    { 102400, 4*1024, 1024, 512 },
+    // M=512, D=256, K=2, P=3: filter_smem = 64*2*3*sizeof(filter_t) = 1536 bytes (float)
+    { 51200, 3*512, 512, 256 },
+    // M=1024, D=512, K=2, P=2: filter_smem = 64*2*2*sizeof(filter_t) = 1024 bytes (float)
+    { 102400, 2*1024, 1024, 512 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// Complex filter tests to exercise the complex×complex channelize_cmac path.
+// Previous tests all use real filters.
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, ComplexFilter)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // Complex filter, D==M, small M (FusedChan path)
+    { 1800, 120, 4, 4 },
+    // Complex filter, D==M, medium M (_Smem path)
+    { 2500, 170, 10, 10 },
+    // Complex filter, D==M, large M (SmemTiled path)
+    { 128000, 10*512, 512, 512 },
+    // Complex filter, oversampled (SmemTiled oversampled path)
+    { 2500, 170, 10, 5 },
+    // Complex filter, oversampled, large M
+    { 128000, 10*512, 512, 256 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+
+    // Use complex input AND complex filter
+    this->pb->template InitAndRunTVGenerator<ComplexType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<ComplexType>({a_len});
+    auto f = make_tensor<ComplexType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
+
+// Oversampled with very large P so SmemTiled input buffer exceeds 48KB,
+// falling back to the generic ChannelizePoly1D (MaximallyDecimated=false) path.
+// For complex<float> input: (P+3) * 64 * 8 > 49152 → P > 93.
+TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, GenericOversampledFallback)
+{
+  MATX_ENTER_HANDLER();
+
+  using TestType = cuda::std::tuple_element_t<0, TypeParam>;
+  using ComplexType = typename test_types::complex_type<TestType>::type;
+
+  struct {
+    index_t a_len;
+    index_t f_len;
+    index_t num_channels;
+    index_t decimation_factor;
+  } test_cases[] = {
+    // M=8, D=4, P=100: input_smem = 103*64*8 = 52736 > 49152 for complex<float>
+    { 8000, 100*8, 8, 4 },
+    // M=10, D=5, P=200: clearly exceeds 48KB for any type
+    { 20000, 200*10, 10, 5 },
+  };
+
+  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
+    const index_t a_len = test_cases[i].a_len;
+    const index_t f_len = test_cases[i].f_len;
+    const index_t num_channels = test_cases[i].num_channels;
+    const index_t decimation_factor = test_cases[i].decimation_factor;
+    const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor;
+    this->pb->template InitAndRunTVGenerator<TestType>(
+      "00_transforms", "channelize_poly_operators", "channelize_oversampled",
+      {a_len, f_len, num_channels, decimation_factor});
+
+    auto a = make_tensor<TestType>({a_len});
+    auto f = make_tensor<TestType>({f_len});
+    auto b = make_tensor<ComplexType>({b_len_per_channel, num_channels});
+
+    this->pb->NumpyToTensorView(a, "a");
+    this->pb->NumpyToTensorView(f, "filter_random");
+    (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec);
+
+    this->exec.sync();
+    MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh);
+  }
+
+  MATX_EXIT_HANDLER();
+}
diff --git a/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin b/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin
new file mode 100755
index 00000000..d954ce67
Binary files /dev/null and b/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin differ
diff --git a/test/test_vectors/generators/00_transforms.py b/test/test_vectors/generators/00_transforms.py
index 555fc3b9..ff0a887e 100755
--- a/test/test_vectors/generators/00_transforms.py
+++ b/test/test_vectors/generators/00_transforms.py
@@ -175,9 +175,9 @@ def __init__(self, dtype: str, size: List[int]):
         signal_len = size[0]
         filter_len = size[1]
         num_channels = size[2]
-        # Remaining dimensions are batch dimensions
-        if len(size) > 3:
-            a_dims = size[3:]
+        # size[3] is decimation_factor; remaining dimensions are batch dimensions
+        if len(size) > 4:
+            a_dims = size[4:]
             a_dims = np.append(a_dims, signal_len)
         else:
             a_dims = [signal_len]
@@ -246,6 +246,157 @@ def idivup(a, b) -> int: return (a+b-1)//b
         self.res['b_random_hreal'] = out_hreal
         return self.res
 
+    def channelize_oversampled(self) -> Dict[str, np.ndarray]:
+        """General polyphase channelizer supporting arbitrary decimation factor D <= M.
+
+        Uses size[3] as decimation_factor (D). When D == M, this produces the same
+        result as channelize(). When D < M, this is the oversampled case.
+        """
+        def idivup(a, b) -> int: return (a+b-1)//b
+
+        h = self.res['filter_random']
+        num_channels = self.res['num_channels']
+        decimation_factor = self.size[3]
+        x = self.res['a']
+
+        M = num_channels
+        D = decimation_factor
+
+        # Polyphase decompose: E[r, q] = h[q*M + r], zero-pad h to multiple of M
+        num_taps_per_channel = idivup(h.size, M)
+        h_padded = h
+        if M * num_taps_per_channel > h.size:
+            h_padded = np.pad(h, (0, M * num_taps_per_channel - h.size))
+        E = np.reshape(h_padded, (M, num_taps_per_channel), order='F')
+
+        num_output_samples = idivup(x.shape[-1], D)
+        num_batches = x.size // x.shape[-1]
+        out = np.zeros((num_batches, num_output_samples, M), dtype=np.complex128)
+        out_hreal = np.zeros((num_batches, num_output_samples, M), dtype=np.complex128)
+        xr = np.reshape(x, (num_batches, x.shape[-1]))
+
+        E_real = np.real(E)
+        n_idx = np.arange(num_output_samples)
+
+        # Precompute per-(n, r) quantities as 2D arrays [num_output_samples, M]
+        # Branch remap: r_remapped = (r + M - D) % M changes the input sample
+        # mapping so newest D samples land in Harris convention branches.
+        # Phase uses the original logical branch index r (not remapped).
+        n_all = np.arange(num_output_samples)[:, np.newaxis]  # [nout, 1]
+        r_all = np.arange(M)[np.newaxis, :]                    # [1, M]
+        r_remapped = (r_all + M - D) % M                       # [1, M]
+        s_all = M - 1 - r_remapped                              # [1, M] (remapped for input access)
+        last_arrived = n_all * D + D - 1                        # [nout, 1] broadcast
+        valid = last_arrived >= s_all                            # [nout, M]
+        A = np.where(valid, last_arrived - s_all, 0)            # [nout, M]
+        newest = np.where(valid, last_arrived - (A % M), 0)     # [nout, M]
+        causal_count = np.where(valid, A // M + 1, 0)           # [nout, M]
+        phase = np.where(valid,
+            (r_all + (n_all * D) % M) % M,
+            0).astype(int)                                       # [nout, M] (original r, NOT remapped)
+
+        for batch_ind in range(num_batches):
+            xb = xr[batch_ind, :]
+            input_len = xb.shape[-1]
+            filtered = np.zeros((num_output_samples, M), dtype=np.complex128)
+            filtered_hreal = np.zeros((num_output_samples, M), dtype=np.complex128)
+
+            # Loop only over taps (P iterations, typically small)
+            for q in range(num_taps_per_channel):
+                idx = newest - q * M                             # [nout, M]
+                tap_valid = valid & (idx >= 0) & (idx < input_len) & (q < causal_count)
+                if not np.any(tap_valid):
+                    continue
+                # Gather input samples and filter taps for all valid (n, r) pairs
+                inp_vals = np.where(tap_valid, xb[np.clip(idx, 0, input_len - 1).astype(int)], 0)
+                h_vals = E[phase, q]                             # [nout, M]
+                h_real_vals = E_real[phase, q]
+                filtered += np.where(tap_valid, h_vals * inp_vals, 0)
+                filtered_hreal += np.where(tap_valid, h_real_vals * inp_vals, 0)
+            # IFFT across channels, scale by M
+            out[batch_ind, :, :] = ifft(filtered, n=M, axis=1) * M
+            out_hreal[batch_ind, :, :] = ifft(filtered_hreal, n=M, axis=1) * M
+
+        # Reshape output: (num_batches, num_output_samples, M) -> match MatX layout
+        # MatX output is (...batch_dims..., num_output_samples, num_channels)
+        if num_batches > 1:
+            s = list(x.shape)
+            s[-1] = num_output_samples
+            s = np.append(s, M)
+            out = np.reshape(out, s)
+            out_hreal = np.reshape(out_hreal, s)
+        else:
+            out = np.reshape(out, out.shape[1:])
+            out_hreal = np.reshape(out_hreal, out_hreal.shape[1:])
+
+        self.res['b_random'] = out
+        self.res['b_random_hreal'] = out_hreal
+        return self.res
+
+class harris2003_oversampled_operators:
+    """Test case based on the oversampled channelizer from Harris 2003
+    (receiver_40z.m): M=40, D=28, 600-tap filter designed with remez.
+
+    Golden input and output are read from a binary file. The filter is
+    designed using scipy.signal.remez with the same specification as the
+    reference implementation (verified to match within machine epsilon).
+    The binary file layout is:
+      [real(xx), 5600 doubles][imag(xx), 5600 doubles]
+      [real(yy), 40*200 doubles][imag(yy), 40*200 doubles]
+    where yy is [40, 200] stored column-major (channels × output steps)
+    with fftshift applied (DC centered).
+    """
+    def __init__(self, dtype: str, size: List[int]):
+        self.size = size
+        self.dtype = dtype
+
+    def channelize(self) -> Dict[str, np.ndarray]:
+        import os
+        import scipy.signal
+
+        M = 40
+        D = 28
+        input_len = 5600
+        num_output = 200
+
+        # Design the 600-tap prototype filter using Parks-McClellan (remez)
+        # with the same specification as the Harris 2003 reference.
+        freqs = np.array([0, 12, 17, 56, 57, 84, 85, 112, 113, 140,
+                          141, 168, 169, 196, 197, 224, 225, 252, 253, 280,
+                          281, 308, 309, 336, 337, 364, 365, 392, 393, 420,
+                          421, 448, 449, 476, 477, 504, 505, 532, 533, 560]) / 560.0
+        gains = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+        weights = [1, 6, 7, 9, 11, 13, 15, 17, 19, 21,
+                   23, 25, 27, 29, 31, 33, 35, 37, 39, 41]
+        h = scipy.signal.remez(600, freqs, gains, weight=weights, fs=2)
+        h = h.astype(np.float64)
+
+        # Read golden input and output from binary test vector file
+        tv_dir = os.path.join(os.path.dirname(__file__), '..', 'channelize_poly')
+        tv_path = os.path.join(tv_dir, 'harris2003_oversampled_testvector.bin')
+        data = np.fromfile(tv_path, dtype=np.float64)
+
+        offset = 0
+        xx_re = data[offset:offset + input_len]; offset += input_len
+        xx_im = data[offset:offset + input_len]; offset += input_len
+        a = (xx_re + 1j * xx_im).astype(np.complex128)
+
+        yy_re = data[offset:offset + M * num_output]; offset += M * num_output
+        yy_im = data[offset:offset + M * num_output]; offset += M * num_output
+        # The reference output uses fftshift (DC centered). Apply ifftshift
+        # to convert to our convention (DC at index 0).
+        from scipy.fft import ifftshift
+        yy = (yy_re + 1j * yy_im).reshape((M, num_output), order='F')
+        b = ifftshift(yy, axes=0).T
+
+        res = {
+            'a': a,
+            'filter': h,
+            'b_golden': b,
+        }
+        return res
+
+
 class fft_operators:
     def __init__(self, dtype: str, size: List[int]):
         self.size = size