diff --git a/examples/channelize_poly_bench.cu b/examples/channelize_poly_bench.cu index 49d20b1e..786ed5bd 100644 --- a/examples/channelize_poly_bench.cu +++ b/examples/channelize_poly_bench.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -51,8 +52,17 @@ constexpr int NUM_WARMUP_ITERATIONS = 2; // Number of iterations per timed test. Iteration times are averaged in the report. constexpr int NUM_ITERATIONS = 20; -template -void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop) +template +const char *TypeName() { + if constexpr (std::is_same_v) return "float"; + else if constexpr (std::is_same_v) return "double"; + else if constexpr (std::is_same_v>) return "complex"; + else if constexpr (std::is_same_v>) return "complex"; + else return "unknown"; +} + +template +void ChannelizePolyBench(matx::index_t num_channels, matx::index_t decimation_factor) { struct { matx::index_t num_batches; @@ -65,6 +75,8 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop { 1, 17, 256000 }, { 42, 17, 256000 }, { 128, 17, 256000 }, + { 1, 17, 8192*1024 }, + { 42, 17, 8192*1024 } }; cudaStream_t stream; @@ -76,41 +88,42 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop cudaExecutor exec{}; for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { - for (matx::index_t num_channels = channel_start; num_channels <= channel_stop; num_channels++) { - const matx::index_t num_batches = test_cases[i].num_batches; - const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels; - const matx::index_t input_len = test_cases[i].input_len; - const matx::index_t output_len_per_channel = (input_len + num_channels - 1) / num_channels; + const matx::index_t num_batches = test_cases[i].num_batches; + const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels; + const matx::index_t input_len = test_cases[i].input_len; + const matx::index_t output_len_per_channel = (input_len + decimation_factor - 1) / decimation_factor; - auto input = matx::make_tensor({num_batches, input_len}); - auto filter = matx::make_tensor({filter_len}); - auto output = matx::make_tensor({num_batches, output_len_per_channel, num_channels}); + if (input_len < num_channels * 100) { + continue; + } - const matx::index_t decimation_factor = num_channels; + auto input = matx::make_tensor({num_batches, input_len}); + auto filter = matx::make_tensor({filter_len}); + auto output = matx::make_tensor({num_batches, output_len_per_channel, num_channels}); + (input = static_cast(1)).run(exec); + (filter = static_cast(1)).run(exec); - for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); - } + for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) { + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); + } - exec.sync(); + exec.sync(); - float elapsed_ms = 0.0f; - cudaEventRecord(start, stream); - for (int k = 0; k < NUM_ITERATIONS; k++) { - (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); - } - cudaEventRecord(stop, stream); - exec.sync(); - MATX_CUDA_CHECK_LAST_ERROR(); - cudaEventElapsedTime(&elapsed_ms, start, stop); - - const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; - printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT - " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n", - num_batches, num_channels, filter_len, input_len, avg_elapsed_us, - static_cast(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6)); + float elapsed_ms = 0.0f; + cudaEventRecord(start, stream); + for (int k = 0; k < NUM_ITERATIONS; k++) { + (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec); } - printf("\n"); + cudaEventRecord(stop, stream); + exec.sync(); + MATX_CUDA_CHECK_LAST_ERROR(); + cudaEventElapsedTime(&elapsed_ms, start, stop); + + const double avg_elapsed_us = (static_cast(elapsed_ms)/NUM_ITERATIONS)*1.0e3; + printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " Decimation: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT + " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n", + num_batches, num_channels, decimation_factor, filter_len, input_len, avg_elapsed_us, + static_cast(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6)); } MATX_CUDA_CHECK_LAST_ERROR(); @@ -120,24 +133,127 @@ void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop cudaStreamDestroy(stream); } -int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv) +enum class Precision { Float, Double }; +enum class Domain { Real, Complex }; + +struct BenchConfig { + Precision input_prec = Precision::Float; + Domain input_domain = Domain::Complex; + Precision filter_prec = Precision::Float; + Domain filter_domain = Domain::Real; + matx::index_t M = 10; // number of channels + matx::index_t D = -1; // decimation factor (-1 means D = M) +}; + +void PrintUsage(const char *prog) { + printf("Usage: %s [options]\n", prog); + printf(" --input-type Input type: float, double, cf, cd (default: cf)\n"); + printf(" --filter-type Filter type: float, double, cf, cd (default: float)\n"); + printf(" -M Number of channels (default: 10)\n"); + printf(" -D Decimation factor, 0 < D <= M (default: M)\n"); + printf("\n"); + printf("Type shorthands: float, double, cf (complex), cd (complex)\n"); +} + +bool ParseType(const char *s, Precision &prec, Domain &dom) { + if (strcmp(s, "float") == 0) { prec = Precision::Float; dom = Domain::Real; return true; } + if (strcmp(s, "double") == 0) { prec = Precision::Double; dom = Domain::Real; return true; } + if (strcmp(s, "cf") == 0) { prec = Precision::Float; dom = Domain::Complex; return true; } + if (strcmp(s, "cd") == 0) { prec = Precision::Double; dom = Domain::Complex; return true; } + return false; +} + +template +struct ScalarType { using type = T; }; +template +struct ScalarType> { using type = T; }; + +template +void DispatchBench(const BenchConfig &cfg) { + using in_scalar = typename ScalarType::type; + using OutType = cuda::std::complex; + + printf("Input: %-16s Filter: %-16s Output: %-16s\n", + TypeName(), TypeName(), TypeName()); + printf("M: %" MATX_INDEX_T_FMT " D: %" MATX_INDEX_T_FMT "\n\n", cfg.M, cfg.D); + + ChannelizePolyBench(cfg.M, cfg.D); +} + +void RunBench(const BenchConfig &cfg) { + auto go = [&](auto in_tag, auto filt_tag) { + DispatchBench(cfg); + }; + + auto dispatch_filter = [&](auto in_tag) { + if (cfg.filter_prec == Precision::Float && cfg.filter_domain == Domain::Real) + go(in_tag, float{}); + else if (cfg.filter_prec == Precision::Double && cfg.filter_domain == Domain::Real) + go(in_tag, double{}); + else if (cfg.filter_prec == Precision::Float && cfg.filter_domain == Domain::Complex) + go(in_tag, cuda::std::complex{}); + else + go(in_tag, cuda::std::complex{}); + }; + + if (cfg.input_prec == Precision::Float && cfg.input_domain == Domain::Real) + dispatch_filter(float{}); + else if (cfg.input_prec == Precision::Double && cfg.input_domain == Domain::Real) + dispatch_filter(double{}); + else if (cfg.input_prec == Precision::Float && cfg.input_domain == Domain::Complex) + dispatch_filter(cuda::std::complex{}); + else + dispatch_filter(cuda::std::complex{}); +} + +int main(int argc, char **argv) { MATX_ENTER_HANDLER(); - const matx::index_t channel_start = 3; - const matx::index_t channel_stop = 10; + BenchConfig cfg; - // printf("Benchmarking float -> complex\n"); - // ChannelizePolyBench>(channel_start, channel_stop); + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + PrintUsage(argv[0]); + return 0; + } else if (strcmp(argv[i], "--input-type") == 0 && i + 1 < argc) { + if (!ParseType(argv[++i], cfg.input_prec, cfg.input_domain)) { + fprintf(stderr, "Unknown input type: %s\n", argv[i]); + return 1; + } + } else if (strcmp(argv[i], "--filter-type") == 0 && i + 1 < argc) { + if (!ParseType(argv[++i], cfg.filter_prec, cfg.filter_domain)) { + fprintf(stderr, "Unknown filter type: %s\n", argv[i]); + return 1; + } + } else if (strcmp(argv[i], "-M") == 0 && i + 1 < argc) { + cfg.M = static_cast(atol(argv[++i])); + } else if (strcmp(argv[i], "-D") == 0 && i + 1 < argc) { + cfg.D = static_cast(atol(argv[++i])); + } else { + fprintf(stderr, "Unknown option: %s\n", argv[i]); + PrintUsage(argv[0]); + return 1; + } + } - printf("Benchmarking complex -> complex\n"); - ChannelizePolyBench,cuda::std::complex>(channel_start, channel_stop); + // Default D to M (maximally decimated) if not specified + if (cfg.D <= 0) { + cfg.D = cfg.M; + } + + if (cfg.D <= 0 || cfg.D > cfg.M) { + fprintf(stderr, "Error: decimation factor D must satisfy 0 < D <= M (got M=%" MATX_INDEX_T_FMT ", D=%" MATX_INDEX_T_FMT ")\n", + cfg.M, cfg.D); + return 1; + } - // printf("Benchmarking double -> complex\n"); - // ChannelizePolyBench>(channel_start, channel_stop); + if (cfg.M < 2) { + fprintf(stderr, "Error: number of channels M must be >= 2 (got M=%" MATX_INDEX_T_FMT ")\n", cfg.M); + return 1; + } - // printf("Benchmarking complex -> complex\n"); - // ChannelizePolyBench,cuda::std::complex>(channel_start, channel_stop); + RunBench(cfg); matx::ClearCachesAndAllocations(); diff --git a/include/matx/kernels/channelize_poly.cuh b/include/matx/kernels/channelize_poly.cuh index dc4dc74e..de62e321 100644 --- a/include/matx/kernels/channelize_poly.cuh +++ b/include/matx/kernels/channelize_poly.cuh @@ -47,12 +47,22 @@ namespace matx { -// Number of output elements generated per thread -constexpr index_t CHANNELIZE_POLY1D_ELEMS_PER_THREAD = 1; +// detail constants that require both host and device visibility at compile time +namespace detail { + // Number of output elements generated per thread + constexpr index_t CHANNELIZE_POLY1D_ELEMS_PER_THREAD = 1; + + // Maximum number of filter rotations per channel for the SmemTiled kernel. + // This is used to determine if the filter can be stored in shared memory. The + // number of rotations can exceed this value, but the filter will be read from + // global memory rather than cached in shared memory. + constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS = 32; +} #ifdef __CUDACC__ namespace detail { + template __MATX_DEVICE__ __MATX_INLINE__ auto channelize_cast_filter(FilterT v) { @@ -80,11 +90,45 @@ __MATX_DEVICE__ __MATX_INLINE__ auto channelize_cast_input(InputT v) return static_cast(v); } } +// Fused complex multiply-accumulate. Decomposes the operation into scalar +// FMA instructions so the compiler emits 4 FFMA per complex tap instead of +// ~8 mixed FMUL/FADD/FSUB. Falls back to the default operator* + operator+= +// for real or mixed-precision types. +template +__MATX_DEVICE__ __MATX_INLINE__ void channelize_cmac( + AccumT &accum, FilterValT hv, InputValT iv) +{ + if constexpr (is_complex_v && is_complex_v && is_complex_v) { + auto h_re = hv.real(), h_im = hv.imag(); + auto i_re = iv.real(), i_im = iv.imag(); + auto a_re = accum.real(), a_im = accum.imag(); + a_re = h_re * i_re + a_re; + a_re = -(h_im * i_im) + a_re; + a_im = h_re * i_im + a_im; + a_im = h_im * i_re + a_im; + accum = {a_re, a_im}; + } else if constexpr (is_complex_v && !is_complex_v && is_complex_v) { + // Real filter × complex input + auto a_re = accum.real(), a_im = accum.imag(); + a_re = hv * iv.real() + a_re; + a_im = hv * iv.imag() + a_im; + accum = {a_re, a_im}; + } else if constexpr (is_complex_v && is_complex_v && !is_complex_v) { + // Complex filter × real input + auto a_re = accum.real(), a_im = accum.imag(); + a_re = hv.real() * iv + a_re; + a_im = hv.imag() * iv + a_im; + accum = {a_re, a_im}; + } else { + accum += hv * iv; + } +} + } // namespace detail -template +template __launch_bounds__(THREADS) -__global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter) +__global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter, index_t decimation_factor, uint32_t smem_filter_bytes) { using output_t = typename OutType::value_type; using input_t = typename InType::value_type; @@ -94,15 +138,6 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter // If the output is complex, then then accumulator is complex. Otherwise, the accumulator is real. using accum_t = cuda::std::conditional_t, typename detail::scalar_to_complex::ctype, AccumType>; - // Opportunistically store the filter taps in shared memory if the static shared memory - // size is sufficient. Otherwise, we will read directly from global memory on use. - const int SMEM_MAX_FILTER_TAPS = 128; - // Versions of CUDA prior to 11.8 do not allow static shared memory allocations of - // cuda::std::complex types due to it having no trivial constructor. This workaround - // prevents an 'initializer not allowed for __shared__ variable' error. - __align__(sizeof(filter_t)) __shared__ uint8_t smem_filter_workaround[sizeof(filter_t)*SMEM_MAX_FILTER_TAPS]; - filter_t (&smem_filter)[SMEM_MAX_FILTER_TAPS] = reinterpret_cast(smem_filter_workaround); - constexpr int InRank = InType::Rank(); constexpr int OutRank = OutType::Rank(); constexpr int ChannelRank = OutRank-1; @@ -118,93 +153,147 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter const int channel = blockIdx.y; const int tid = threadIdx.x; - constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; - const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + constexpr index_t ELEMS_PER_BLOCK = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + const index_t first_out_elem = elem_block * detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; const index_t last_out_elem = cuda::std::min( output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1); - if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) { - for (index_t t = tid; t < filter_phase_len-1; t += THREADS) { - const index_t h_ind = channel + t * num_channels; - smem_filter[t] = filter.operator()(h_ind); - } - if (tid == THREADS-1) { - const index_t h_ind = channel + (filter_phase_len-1) * num_channels; - smem_filter[filter_phase_len-1] = (h_ind < filter_full_len) ? - filter.operator()(h_ind) : static_cast(0); - } - - __syncthreads(); - } - auto indims = BlockToIdx(input, blockIdx.z, 1); auto outdims = BlockToIdx(output, blockIdx.z, 2); outdims[ChannelRank] = channel; - if (filter_phase_len <= SMEM_MAX_FILTER_TAPS) { - for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) { - const index_t first_ind = cuda::std::max(static_cast(0), t - filter_phase_len + 1); - accum_t accum {}; - const filter_t *h = smem_filter; - // index_t in MatX should be signed (32 or 64 bit), so j-- below will not underflow - static_assert(std::is_signed_v, "assumed signed index_t, but it is unsigned"); - indims[InRank-1] = t * num_channels + (num_channels - 1 - channel); - index_t j_start = t; - if (indims[InRank-1] >= input_len) { - j_start--; - indims[InRank-1] -= num_channels; - h++; + if constexpr (MaximallyDecimated) { + // Maximally decimated (D == M) path: filter phase is fixed per channel + // Dynamic shared memory holds one phase's taps when the dispatch provides + // enough smem. When smem_bytes == 0 the filter is read from global memory. + // When D == M: phase = channel (fixed), A % M = channel, + // indims = s + t*M, causal_count = t + 1, and last_arrived >= s + // is always true for t >= 0. + const index_t s = num_channels - 1 - channel; + + extern __shared__ __align__(16) uint8_t smem_filter_raw[]; + filter_t *smem_filter = reinterpret_cast(smem_filter_raw); + const bool use_smem_filter = (smem_filter_bytes >= sizeof(filter_t) * filter_phase_len); + if (use_smem_filter) { + for (index_t t = tid; t < filter_phase_len-1; t += THREADS) { + const index_t h_ind = channel + t * num_channels; + smem_filter[t] = filter.operator()(h_ind); } - // Because the filter must fit in shared memory, we know that the maximum - // number of iterations fits well within an int. - const int niter = static_cast(j_start - first_ind + 1); - input_t in_val; - for (int i = 0; i < niter; i++) { - cuda::std::apply([&in_val, &input](auto &&...args) { - in_val = input.operator()(args...); - }, indims); - accum += detail::channelize_cast_filter(*h) * - detail::channelize_cast_input(in_val); - indims[InRank-1] -= num_channels; - h++; + if (tid == THREADS-1) { + const index_t h_ind = channel + (filter_phase_len-1) * num_channels; + smem_filter[filter_phase_len-1] = (h_ind < filter_full_len) ? + filter.operator()(h_ind) : static_cast(0); } - outdims[OutElemRank] = t; - cuda::std::apply([accum, &output](auto &&...args) { - output.operator()(args...) = static_cast(accum); - }, outdims); + + __syncthreads(); } - } else { - for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) { - index_t first_ind = cuda::std::max(static_cast(0), t - filter_phase_len + 1); - // If we use the last filter tap for this phase (which is the first index because - // the filter is flipped), then it may be a padded zero. If so, increment first_ind - // by 1 to avoid using the zero. This prevents a bounds-check in the inner loop. - if (first_ind == (t - filter_phase_len + 1)) { + + if (use_smem_filter) { + for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) { + accum_t accum {}; + indims[InRank-1] = s + t * num_channels; + index_t h_skip = 0; + if (indims[InRank-1] >= input_len) { + h_skip = 1; + indims[InRank-1] -= num_channels; + } + const filter_t *h = smem_filter + h_skip; + int niter = static_cast(cuda::std::min(filter_phase_len - h_skip, t + 1 - h_skip)); + input_t in_val; + for (int i = 0; i < niter; i++) { + cuda::std::apply([&in_val, &input](auto &&...args) { + in_val = input.operator()(args...); + }, indims); + detail::channelize_cmac(accum, + detail::channelize_cast_filter(*h), + detail::channelize_cast_input(in_val)); + indims[InRank-1] -= num_channels; + h++; + } + outdims[OutElemRank] = t; + cuda::std::apply([accum, &output](auto &&...args) { + output.operator()(args...) = static_cast(accum); + }, outdims); + } + } else { + index_t available_taps = filter_phase_len; + { const bool h_is_padded = ((filter_phase_len-1) * num_channels + channel) >= filter_full_len; if (h_is_padded) { - first_ind++; + available_taps--; } } - indims[InRank-1] = t * num_channels + (num_channels - 1 - channel); - index_t j_start = t; - index_t h_ind { channel }; - // If the last signal element is a zero-pad value, then skip it to prevent needing - // per-access bounds checking in the inner loop. - if (indims[InRank-1] >= input_len) { - j_start--; - indims[InRank-1] -= num_channels; - h_ind += num_channels; + + for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) { + accum_t accum {}; + indims[InRank-1] = s + t * num_channels; + index_t h_skip = 0; + if (indims[InRank-1] >= input_len) { + h_skip = 1; + indims[InRank-1] -= num_channels; + } + index_t h_ind = channel + h_skip * num_channels; + index_t niter = cuda::std::min(available_taps - h_skip, t + 1 - h_skip); + input_t in_val; + for (index_t i = 0; i < niter; i++) { + cuda::std::apply([&in_val, &input](auto &&...args) { + in_val = input.operator()(args...); + }, indims); + const filter_t h_val = filter.operator()(h_ind); + detail::channelize_cmac(accum, + detail::channelize_cast_filter(h_val), + detail::channelize_cast_input(in_val)); + h_ind += num_channels; + indims[InRank-1] -= num_channels; + } + outdims[OutElemRank] = t; + cuda::std::apply([accum, &output](auto &&...args) { + output.operator()(args...) = static_cast(accum); + }, outdims); } - const index_t niter = j_start - first_ind + 1; + } + } else { + // Oversampled (D < M) path: phase rotates per output step + // No shared memory. Reads filter from global/L2. + // Branch remap for Harris convention: r_remapped changes the input + // sample mapping so newest D samples land in branches D-1..0. + // Phase uses the original logical channel index (not remapped). + const index_t r_remapped = (channel + num_channels - decimation_factor) % num_channels; + const index_t s = num_channels - 1 - r_remapped; + for (index_t t = first_out_elem+tid; t <= last_out_elem; t += THREADS) { + const index_t last_arrived = t * decimation_factor + decimation_factor - 1; + index_t niter = 0; + const index_t phase = (channel + t * decimation_factor) % num_channels; + index_t h_ind { phase }; accum_t accum {}; + if (last_arrived >= s) { + const index_t A = last_arrived - s; + indims[InRank-1] = last_arrived - (A % num_channels); + const index_t causal_count = A / num_channels + 1; + index_t h_skip = 0; + if (indims[InRank-1] >= input_len) { + h_skip = 1; + indims[InRank-1] -= num_channels; + } + h_ind = phase + h_skip * num_channels; + index_t available_taps = filter_phase_len; + { + const bool h_is_padded = ((filter_phase_len-1) * num_channels + phase) >= filter_full_len; + if (h_is_padded) { + available_taps--; + } + } + niter = cuda::std::min(available_taps - h_skip, causal_count - h_skip); + } input_t in_val; for (index_t i = 0; i < niter; i++) { - cuda::std::apply([&in_val, &input](auto &&...args) { + cuda::std::apply([&in_val, &input](auto &&...args) { in_val = input.operator()(args...); }, indims); const filter_t h_val = filter.operator()(h_ind); - accum += detail::channelize_cast_filter(h_val) * - detail::channelize_cast_input(in_val); + detail::channelize_cmac(accum, + detail::channelize_cast_filter(h_val), + detail::channelize_cast_input(in_val)); h_ind += num_channels; indims[InRank-1] -= num_channels; } @@ -216,6 +305,400 @@ __global__ void ChannelizePoly1D(OutType output, InType input, FilterType filter } } +// Tiled shared-memory polyphase channelizer kernel. +// +// Tiles across channels so that only CTILE channels are processed per block, +// removing the M<=256 constraint of ChannelizePoly1D_Smem while staging +// input samples in shared memory. Supports both maximally decimated (D == M) +// and oversampled (D < M) cases. +// +// Template parameters: +// FilterInSmem — when true, filter taps are cached in shared memory; +// when false, filter taps are read from global/L2. +// +// Block: dim3(CTILE, NOUT) +// Grid: dim3(time_blocks, channel_tiles, batches) +// +// Shared memory layout (FilterInSmem = true): +// smem_filter: [P][CTILE] for D==M, or [CTILE][K][P] for D +__launch_bounds__(CTILE * NOUT) +__global__ void ChannelizePoly1D_SmemTiled( + OutType output, InType input, FilterType filter, + IdxT elems_per_channel_per_cta, IdxT decimation_factor, int32_t num_phases_per_channel) +{ + using output_t = typename OutType::value_type; + using input_t = typename InType::value_type; + using filter_t = typename FilterType::value_type; + static_assert(!is_complex_v, + "channelize_poly: accumulator type must be real; it will be treated as complex when necessary"); + using accum_t = cuda::std::conditional_t, + typename detail::scalar_to_complex::ctype, AccumType>; + + extern __shared__ __align__(16) uint8_t smem_raw[]; + + constexpr int InRank = InType::Rank(); + constexpr int OutRank = OutType::Rank(); + constexpr int ChannelRank = OutRank - 1; + constexpr int OutElemRank = OutRank - 2; + + const IdxT input_len = static_cast(input.Size(InRank - 1)); + const IdxT output_len_per_channel = static_cast(output.Size(OutElemRank)); + const int32_t M = static_cast(output.Size(ChannelRank)); + const int32_t filter_full_len = static_cast(filter.Size(0)); + const int32_t P = static_cast((filter_full_len + M - 1) / M); + const int32_t K = num_phases_per_channel; + + const int32_t cx = static_cast(threadIdx.x); + const int32_t ty = static_cast(threadIdx.y); + const int32_t tid = ty * CTILE + cx; + assert(blockDim.x * blockDim.y == CTILE * NOUT && blockDim.z == 1); + const int32_t nthreads = CTILE * NOUT; + const int32_t tile_base = static_cast(blockIdx.y) * CTILE; + const int32_t c = tile_base + cx; + const bool active = (c < M); + + // Branch remap offset for Harris convention (oversampled only; 0 for D == M). + const int32_t L = MaximallyDecimated ? 0 : (M - static_cast(decimation_factor)); + + const int32_t filter_stride = K * P; // per-channel filter block size + const int32_t height = P + NOUT - 1; + + filter_t *smem_filter_base = nullptr; + input_t *smem_input = nullptr; + if constexpr (FilterInSmem) { + smem_filter_base = reinterpret_cast(smem_raw); + const int32_t filter_elems = MaximallyDecimated ? (P * CTILE) : (CTILE * filter_stride); + size_t input_byte_offset = sizeof(filter_t) * filter_elems; + if (input_byte_offset % sizeof(input_t)) { + input_byte_offset += sizeof(input_t) - input_byte_offset % sizeof(input_t); + } + smem_input = reinterpret_cast(smem_raw + input_byte_offset); + } else { + smem_input = reinterpret_cast(smem_raw); + } + + if constexpr (FilterInSmem) { + // Load filter into smem + if constexpr (MaximallyDecimated) { + for (int32_t i = tid; i < P * CTILE; i += nthreads) { + const int32_t p = i / CTILE; + const int32_t local_channel = i % CTILE; + const int32_t global_channel = tile_base + local_channel; + const int32_t h_ind = global_channel + p * M; + smem_filter_base[i] = (global_channel < M && h_ind < filter_full_len) + ? filter.operator()(static_cast(h_ind)) + : static_cast(0); + } + } else { + // The dispatch must not select FilterInSmem when K exceeds the + // rotations[] array size. Assert this invariant at runtime. + assert(K <= detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS); + int32_t rotations[detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS]; + for (int32_t k = 0; k < K; k++) { + rotations[k] = static_cast((static_cast(k) * decimation_factor) % M); + } + // TODO-PERF: Each channel rotates through K filter phases. Currently, we store all K phases + // per channel in shared memory. It is possible for K * CTILE to exceed the total number + // of channels, in which case we would be better off storing the full filter in shared memory. + // Furthermore, if there are fewer than K output points per channel generated in this CTA, + // then we could store only the required phases. + for (int32_t i = tid; i < CTILE * filter_stride; i += nthreads) { + const int32_t local_channel = i / filter_stride; + const int32_t kp = i % filter_stride; + const int32_t k = kp / P; + const int32_t p = kp % P; + const int32_t global_channel = tile_base + local_channel; + if (global_channel < M) { + // Phase uses the original logical channel (not remapped) + const int32_t phase = (global_channel + rotations[k]) % M; + const int32_t h_ind = phase + p * M; + smem_filter_base[i] = (h_ind < filter_full_len) + ? filter.operator()(static_cast(h_ind)) + : static_cast(0); + } else { + smem_filter_base[i] = static_cast(0); + } + } + } + } + + // r_remapped changes the input sample mapping to match the Harris convention. We conceptually populate commutator branches + // starting at M-1 and continuing counter-clockwise. That matches the Harris convention for the maximally decimated + // case where L == 0. For the oversampled case, Harris populates branches from M-1 to 0 for each M inputs, shifting + // older samples through the 2D filter bank. We stick with the M-1 to 0 convention, but then have to remap the + // branch indices to match the Harris convention. + // Phase and output channel use the original logical index c. + const int32_t r_remapped = (c + L) % M; + const int32_t s = active ? (M - 1 - r_remapped) : 0; + const IdxT start_elem = static_cast(blockIdx.x) * elems_per_channel_per_cta; + const IdxT last_elem = cuda::std::min( + output_len_per_channel - 1, + start_elem + elems_per_channel_per_cta - 1); + + auto indims = BlockToIdx(input, blockIdx.z, 1); + auto outdims = BlockToIdx(output, blockIdx.z, 2); + outdims[ChannelRank] = c; + + // Helper: load one input sample into smem at (buf_row, col) + auto load_smem_elem = [&](int32_t buf_row, int32_t col, IdxT global_row) { + const int32_t gc = tile_base + col; + const int32_t gc_remapped = MaximallyDecimated ? gc : ((gc + L) % M); + const int32_t branch_s = (gc < M) ? (M - 1 - gc_remapped) : 0; + const IdxT raw_idx = static_cast(branch_s) + global_row * M; + if (gc < M && global_row >= 0 && raw_idx >= 0 && raw_idx < input_len) { + indims[InRank - 1] = raw_idx; + input_t val; + cuda::std::apply([&val, &input](auto &&...args) { + val = input.operator()(args...); + }, indims); + smem_input[buf_row * CTILE + col] = val; + } else { + smem_input[buf_row * CTILE + col] = static_cast(0); + } + }; + + auto max_bidx = [&](IdxT t) -> IdxT { + if constexpr (MaximallyDecimated) { + return t; + } else { + return ((t + 1) * decimation_factor - 1) / M; + } + }; + + // Initial fill of circular buffer + const IdxT first_iter_end = cuda::std::min(start_elem + static_cast(NOUT) - 1, last_elem); + IdxT loaded_up_to = max_bidx(first_iter_end); + const IdxT buf_base = loaded_up_to - (height - 1); + + { + const int32_t first_row = tid / CTILE; + const int32_t row_stride = nthreads / CTILE; + int32_t buf_row = static_cast((buf_base + first_row) % height); + if (buf_row < 0) buf_row += height; + const int32_t buf_row_stride = row_stride % height; + + for (int32_t i = tid; i < height * CTILE; i += nthreads) { + load_smem_elem(buf_row, i % CTILE, buf_base + (i / CTILE)); + buf_row += buf_row_stride; + if (buf_row >= height) buf_row -= height; + } + } + + __syncthreads(); + + if constexpr (MaximallyDecimated) { + // bidx = t, causal_count = t+1, last_arrived >= s always true. + // Track buf_row incrementally across iterations to avoid modulo. + + // Seed buf_row for ty=0 at start_elem + int32_t buf_row_base = static_cast(start_elem % height); + // Per-iteration advance: NOUT output steps = NOUT buf_row advance. This is a defensive modulo + // so that we can keep buf_row_base in [0, height) with only a conditional subtraction. + const int32_t nout_wrap = NOUT % height; + + const IdxT last_start = start_elem + ((last_elem - start_elem) / NOUT) * NOUT; + for (IdxT next_start = start_elem; next_start <= last_start; next_start += NOUT) { + const IdxT t = next_start + ty; + if (t <= last_elem && active) { + accum_t accum{}; + + // buf_row for this thread's output step + int32_t my_buf_row = buf_row_base + ty; + if (my_buf_row >= height) my_buf_row -= height; + + const IdxT newest_raw = static_cast(s) + t * M; + int32_t h_skip = 0; + int32_t niter = static_cast( + cuda::std::min(static_cast(P), t + 1)); + if (newest_raw >= input_len) { + h_skip = 1; + niter = static_cast( + cuda::std::min(static_cast(P - 1), t)); + if (--my_buf_row < 0) my_buf_row += height; + } + + const int32_t prologue = cuda::std::min(my_buf_row + 1, niter); + const int32_t epilogue = niter - prologue; + int32_t h_ind = c + h_skip * M; + int32_t p = 0; + for (int32_t i = 0; i < prologue; i++, p++) { + filter_t hv; + if constexpr (FilterInSmem) { + hv = smem_filter_base[(p + h_skip) * CTILE + cx]; + } else { + hv = filter.operator()(static_cast(h_ind)); + } + const input_t iv = smem_input[my_buf_row * CTILE + cx]; + detail::channelize_cmac(accum, + detail::channelize_cast_filter(hv), + detail::channelize_cast_input(iv)); + my_buf_row--; + h_ind += M; + } + my_buf_row = height - 1; + for (int32_t i = 0; i < epilogue; i++, p++) { + filter_t hv; + if constexpr (FilterInSmem) { + hv = smem_filter_base[(p + h_skip) * CTILE + cx]; + } else { + hv = filter.operator()(static_cast(h_ind)); + } + const input_t iv = smem_input[my_buf_row * CTILE + cx]; + detail::channelize_cmac(accum, + detail::channelize_cast_filter(hv), + detail::channelize_cast_input(iv)); + my_buf_row--; + h_ind += M; + } + + outdims[OutElemRank] = t; + cuda::std::apply([accum, &output](auto &&...args) { + output.operator()(args...) = static_cast(accum); + }, outdims); + } + + if (next_start < last_start) { + // Incremental buf_row_base advance (no modulo) + buf_row_base += nout_wrap; + if (buf_row_base >= height) buf_row_base -= height; + + // Ensure all threads have finished reading smem_input before overwriting + __syncthreads(); + + // Load NOUT new rows. For D==M, exactly NOUT new samples arrive. + // Each ty-lane loads one row (its cx column). + { + const IdxT next_end = cuda::std::min(next_start + static_cast(2 * NOUT) - 1, last_elem); + const int32_t new_rows = static_cast(max_bidx(next_end) - loaded_up_to); + int32_t lr = static_cast((loaded_up_to + 1) % height); + if (ty < new_rows) { + int32_t my_lr = lr + ty; + if (my_lr >= height) my_lr -= height; + load_smem_elem(my_lr, cx, loaded_up_to + 1 + ty); + } + loaded_up_to += new_rows; + } + + // Ensure new rows are visible before next iteration's compute + __syncthreads(); + } + } + } else { + // D < M: oversampled path + const IdxT last_start = start_elem + ((last_elem - start_elem) / NOUT) * NOUT; + for (IdxT next_start = start_elem; next_start <= last_start; next_start += NOUT) { + const IdxT t = next_start + ty; + if (t <= last_elem && active) { + accum_t accum{}; + + const IdxT last_arrived = t * decimation_factor + decimation_factor - 1; + if (last_arrived >= s) { + const IdxT A = last_arrived - s; + const IdxT bidx = A / M; + const IdxT causal_count = bidx + 1; + + const int32_t k = static_cast(t % K); + // Phase uses the original logical channel (not remapped) + const int32_t phase = static_cast( + (c + t * decimation_factor) % M); + + int32_t available_taps = P; + if (((P - 1) * M + phase) >= filter_full_len) { + available_taps--; + } + + IdxT newest_raw = last_arrived - (A % M); + int32_t h_skip = 0; + if (newest_raw >= input_len) { + h_skip = 1; + } + + const int32_t niter = static_cast( + cuda::std::min(static_cast(available_taps - h_skip), + causal_count - h_skip)); + + int32_t buf_row = static_cast((bidx - h_skip) % height); + + const int32_t prologue = cuda::std::min(buf_row + 1, niter); + const int32_t epilogue = niter - prologue; + int32_t h_ind = phase + h_skip * M; + int32_t p = 0; + for (int32_t i = 0; i < prologue; i++, p++) { + filter_t hv; + if constexpr (FilterInSmem) { + hv = smem_filter_base[cx * filter_stride + k * P + (p + h_skip)]; + } else { + hv = filter.operator()(static_cast(h_ind)); + } + const input_t iv = smem_input[buf_row * CTILE + cx]; + detail::channelize_cmac(accum, + detail::channelize_cast_filter(hv), + detail::channelize_cast_input(iv)); + buf_row--; + h_ind += M; + } + buf_row = height - 1; + for (int32_t i = 0; i < epilogue; i++, p++) { + filter_t hv; + if constexpr (FilterInSmem) { + hv = smem_filter_base[cx * filter_stride + k * P + (p + h_skip)]; + } else { + hv = filter.operator()(static_cast(h_ind)); + } + const input_t iv = smem_input[buf_row * CTILE + cx]; + detail::channelize_cmac(accum, + detail::channelize_cast_filter(hv), + detail::channelize_cast_input(iv)); + buf_row--; + h_ind += M; + } + } + + outdims[OutElemRank] = t; + cuda::std::apply([accum, &output](auto &&...args) { + output.operator()(args...) = static_cast(accum); + }, outdims); + } + + if (next_start < last_start) { + // Ensure all threads have finished reading smem_input before overwriting + __syncthreads(); + + // Load new rows for next iteration + const IdxT next_iter_end = cuda::std::min(next_start + static_cast(2 * NOUT) - 1, last_elem); + const IdxT needed_up_to = max_bidx(next_iter_end); + const int32_t new_rows = static_cast(needed_up_to - loaded_up_to); + + // Each ty-lane loads one row (its cx column) + { + int32_t lr = static_cast((loaded_up_to + 1) % height); + if (ty < new_rows) { + int32_t my_lr = lr + ty; + if (my_lr >= height) my_lr -= height; + load_smem_elem(my_lr, cx, loaded_up_to + 1 + ty); + } + } + + loaded_up_to = needed_up_to; + + // Ensure new rows are visible before next iteration's compute + __syncthreads(); + } + } + } +} + // This kernel works in cases where the full filter (with potentially some zero padding) and // the inputs required to compute elems_per_channel_per_cta outputs all fit into shared memory. template @@ -226,10 +709,10 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f using filter_t = typename FilterType::value_type; static_assert(! is_complex_v, "channelize_poly: accumulator type must be real; it will be treated as complex when necessary"); - // If the output is complex, then then accumulator is complex. Otherwise, the accumulator is real. + // If the output is complex, then accumulator is complex. Otherwise, the accumulator is real. using accum_t = cuda::std::conditional_t, typename detail::scalar_to_complex::ctype, AccumType>; - extern __shared__ uint8_t __attribute((aligned(16))) smem_dyn_align16[]; + extern __shared__ __align__(16) uint8_t smem_dyn_align16[]; constexpr int InRank = InType::Rank(); constexpr int OutRank = OutType::Rank(); @@ -289,7 +772,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f smem_input[smem_ind] = input.operator()(args...); }, indims); } else { - smem_input[smem_ind] = static_cast(0); + smem_input[smem_ind] = static_cast(0); } } @@ -313,7 +796,7 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f smem_input[smem_ind] = input.operator()(args...); }, indims); } else { - smem_input[smem_ind] = static_cast(0); + smem_input[smem_ind] = static_cast(0); } } @@ -343,15 +826,17 @@ __global__ void ChannelizePoly1D_Smem(OutType output, InType input, FilterType f const input_t *sample = smem_input + cached_input_ind_tail * num_channels + (num_channels - 1 - chan); // Apply the filter h in reverse order below to flip the filter for convolution for (int32_t k = 0; k < prologue_count; k++) { - accum += detail::channelize_cast_filter(*h) * - detail::channelize_cast_input(*sample); + detail::channelize_cmac(accum, + detail::channelize_cast_filter(*h), + detail::channelize_cast_input(*sample)); sample += num_channels; h -= num_channels; } sample = smem_input + (num_channels - 1 - chan); for (int32_t k = 0; k < epilogue_count; k++) { - accum += detail::channelize_cast_filter(*h) * - detail::channelize_cast_input(*sample); + detail::channelize_cmac(accum, + detail::channelize_cast_filter(*h), + detail::channelize_cast_input(*sample)); sample += num_channels; h -= num_channels; } @@ -395,15 +880,15 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT auto indims = BlockToIdx(input, blockIdx.z, 1); auto outdims = BlockToIdx(output, blockIdx.z, 2); - constexpr index_t ELEMS_PER_BLOCK = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; - const index_t first_out_elem = elem_block * CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + constexpr index_t ELEMS_PER_BLOCK = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + const index_t first_out_elem = elem_block * detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; const index_t last_out_elem = cuda::std::min( output_len_per_channel - 1, first_out_elem + ELEMS_PER_BLOCK - 1); // Versions of CUDA prior to 11.8 do not allow static shared memory allocations of // cuda::std::complex types due to it having no trivial constructor. This workaround // prevents an 'initializer not allowed for __shared__ variable' error. - __align__(sizeof(complex_accum_t)) __shared__ uint8_t smem_eij_workaround[sizeof(complex_accum_t)*NUM_CHAN*NUM_CHAN]; + __shared__ __align__(16) uint8_t smem_eij_workaround[sizeof(complex_accum_t)*NUM_CHAN*NUM_CHAN]; complex_accum_t (&smem_eij)[NUM_CHAN][NUM_CHAN] = reinterpret_cast(smem_eij_workaround); // Pre-compute the DFT complex exponentials and store in shared memory for (int t = tid; t < NUM_CHAN*NUM_CHAN; t += THREADS) { @@ -440,14 +925,15 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT for (int chan = 0; chan < NUM_CHAN; chan++) { const filter_t h_val = (h_ind < filter_full_len) ? filter.operator()(h_ind) : static_cast(0); if (indims[InRank-1] < input_len) { - cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) { - accum[chan] += detail::channelize_cast_filter(h_val) * - detail::channelize_cast_input(input.operator()(args...)); + cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) { + detail::channelize_cmac(accum[chan], + detail::channelize_cast_filter(h_val), + detail::channelize_cast_input(input.operator()(args...))); }, indims); } h_ind++; indims[InRank-1]--; - } + } } niter--; @@ -456,8 +942,9 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT for (int chan = 0; chan < NUM_CHAN; chan++) { const filter_t h_val = filter.operator()(h_ind); cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) { - accum[chan] += detail::channelize_cast_filter(h_val) * - detail::channelize_cast_input(input.operator()(args...)); + detail::channelize_cmac(accum[chan], + detail::channelize_cast_filter(h_val), + detail::channelize_cast_input(input.operator()(args...))); }, indims); h_ind++; indims[InRank-1]--; @@ -473,8 +960,9 @@ __global__ void ChannelizePoly1D_FusedChan(OutType output, InType input, FilterT // const filter_t h_val = (h_ind < filter_full_len) ? filter.operator()(h_ind) : static_cast(0); const filter_t h_val = filter.operator()(h_ind); cuda::std::apply([&accum, chan, h_val, &input](auto &&...args) { - accum[chan] += detail::channelize_cast_filter(h_val) * - detail::channelize_cast_input(input.operator()(args...)); + detail::channelize_cmac(accum[chan], + detail::channelize_cast_filter(h_val), + detail::channelize_cast_input(input.operator()(args...))); }, indims); h_ind++; indims[InRank-1]--; diff --git a/include/matx/operators/channelize_poly.h b/include/matx/operators/channelize_poly.h index da18bd35..b88f1200 100644 --- a/include/matx/operators/channelize_poly.h +++ b/include/matx/operators/channelize_poly.h @@ -78,7 +78,7 @@ namespace detail { a_(a), f_(f), num_channels_(num_channels), decimation_factor_(decimation_factor) { MATX_LOG_TRACE("{} constructor: num_channels={}, decimation_factor={}", str(), num_channels, decimation_factor); - const index_t b_len = (a_.Size(OpA::Rank() - 1) + num_channels - 1) / num_channels; + const index_t b_len = (a_.Size(OpA::Rank() - 1) + decimation_factor - 1) / decimation_factor; for (int r = 0; r < OpA::Rank()-1; r++) { out_dims_[r] = a_.Size(r); @@ -201,16 +201,56 @@ namespace detail { * is assumed to contain a single input signal with all preceding dimensions being batch dimensions * that are channelized independently. * @param f Filter operator that represents the filter coefficients. This must be a 1D tensor. - * @param num_channels Number of channels to create. - * @param decimation_factor Factor by which to downsample the input signal into the channels. Currently, - * the only supported value of decimation_factor is a value equal to num_channels. This corresponds to - * the maximally decimated, or critically sampled, case. It is also possible for decimation_factor to - * be less than num_channels, which corresponds to an oversampled case with overlapping channels, but - * this implementation does not yet support oversampled cases. + * @param num_channels Number of channels (or frequency sub-bands) to create. + * @param decimation_factor Factor by which to downsample the input signal into the channels. When + * decimation_factor equals num_channels, this is the maximally decimated (critically sampled) case. + * When decimation_factor is less than num_channels, this is the oversampled case with overlapping + * channels. Both integer and rational oversampling ratios are supported. * * @returns Operator representing the channelized signal. The output tensor rank is one higher than the * input tensor rank. The first Rank-2 dimensions are all batch dimensions. The second-to-last dimension - * is the sample dimension and the last dimension is the channel dimension. + * is the sample dimension and the last dimension is the channel dimension. The per-channel output + * size is (in.Size(InType::Rank()-1) + decimation_factor - 1) / decimation_factor. + * + * Let the letters M and D denote the number of channels and the decimation factor, + * respectively. The case where M == D is the maximally decimated (critically sampled) case. In this case, + * the channelizer generates M output samples (one per channel) for each M input samples. The case where + * D < M is the oversampled case. In this case, the channelizer generates M output samples (one per channel) + * for each D input samples. Thus, the total output sample count is roughly M/D times the input sample count. + * + * Logically, the polyphase channelizer delivers samples to a set of M commutator branches, + * each representing a channel or frequency sub-band. For each output step, the samples in that commutator branch are convolved + * with a phase of the prototype filter. The outputs of these convolutions are then the inputs to an M-point + * IFFT. The results obtained depend on the order in which samples are delivered to the commutator branches + * and the filter phase used per branch for each output. + * + * In MatX, samples are always delivered to the commutator branches in counter-clockwise order. In the + * maximally decimated case, the sample-to-branch order is M-1, M-2, ..., 0 as in the + * paper by Harris et al [1]. The filter phases are fixed per channel, so channel 0 corresponds to phase 0, + * channel 1 corresponds to phase 1, etc. If comparing to another implementation that starts with a different + * commutator branch (say, branch 0) and then proceeds in the same order (counter-clockwise), one can either prime the + * other channelizer with a single zero input (discarding the outputs, if M outputs are generated after the + * first sample) or prime the MatX channelizer with an appropriate number of zeros (e.g., M-1 to match an + * implementation that starts at branch 0) to obtain equivalent results. + * + * The oversampled case, where D < M, is more complicated. The order + * in which samples are delivered to the commutator branches is still counter-clockwise, but the samples + * are logically delivered to branches D-1, D-2, ..., 0 for each set of D inputs. With each new set of D + * inputs, the previous samples are logically shifted through the 2D filter bank. As an implementation detail, + * rather than shifting samples, we rotate the filter phases applied to a given commutator branch. Each + * branch rotates through a set of K = M / gcd(M, D) phases, where gcd(M, D) is the greatest common divisor + * of M and D. Note that the maximally decimated case is a special case of the oversampled case where samples + * shift through the 2D filter bank in groups of M, thus maintaining the same filter phase for each channel/branch. + * + * Comparing the oversampled case to another channelizer with a different convention can be difficult due + * to the rotation of the filter phases. For example, for a channelizer that starts at branch 0 and delivers + * M outputs after the first sample, that first sample will effectively rotate the filter phases as a result + * of generating a set of M outputs. This is unlike the maximally decimated case where the filter phases are + * fixed per channel, so we cannot simply prime the other channelizer with a single zero input to obtain equivalent + * results. Instead, we need to provide enough zero inputs to the other channelizer so that the two channelizers + * will start with the same filter phases. For a channelizer that starts at branch 0 and delivers M outputs + * after a single sample, proceeding with the D-1, D-2, ..., 0 order thereafter, priming requires + * 1 + (K - 1) * D zero samples, where K = M / gcd(M, D) as before. * * The polyphase channelizer supports the following properties: * - PropAccum: Type of accumulator. This type should always be real, but it will be promoted to @@ -219,7 +259,12 @@ namespace detail { * * The default accumulator type is the output type, but the output type is context-dependent. The * PropOutput property allows the user to explicitly set the output type and the PropAccum property - * allows the user to explicitly set the accumulator type. + * allows the user to explicitly set the accumulator type. See the examples section for an example + * of the property syntax. + * + * [1] "Digital Receivers and Transmitters Using Polyphase Filter Banks for Wireless Communications", + * F. J. Harris, C. Dick, M. Rice, IEEE Transactions on Microwave Theory and Techniques, + * Vol. 51, No. 4, Apr. 2003. */ template inline auto channelize_poly(const InType &in, const FilterType &f, index_t num_channels, index_t decimation_factor) { diff --git a/include/matx/transforms/channelize_poly.h b/include/matx/transforms/channelize_poly.h index ece38718..70210bb4 100644 --- a/include/matx/transforms/channelize_poly.h +++ b/include/matx/transforms/channelize_poly.h @@ -35,6 +35,7 @@ #include #include #include +#include #include "matx/core/error.h" #include "matx/core/nvtx.h" @@ -58,9 +59,168 @@ constexpr index_t MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD = 6; // to balance occupancy and CTA size. For now, we choose a reasonable default. constexpr index_t MATX_CHANNELIZE_POLY1D_FULL_SMEM_KERNEL_NOUT_PER_ITER = 4; +// Maximum dynamic shared memory (bytes) for caching filter taps in ChannelizePoly1D. +constexpr size_t MATX_CHANNELIZE_POLY1D_MAX_FILTER_SMEM_BYTES = 6 * 1024; + +// Constants for the SmemTiled kernel. +constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE = 64; +constexpr int MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT = 4; +constexpr size_t MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_BYTES = 48 * 1024; +// Maximum filter smem budget. When the filter exceeds this, it is read from +// global/L2 instead, freeing smem for better occupancy. +constexpr size_t MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES = 2048; + +// Compute the shared memory footprint for the SmemTiled filter taps. +// For D==M: one phase per tile channel → CTILE * P elements. +// For D +inline size_t matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes( + index_t num_channels, index_t filter_len, index_t decimation_factor) +{ + using filter_t = typename FilterType::value_type; + constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE; + const index_t P = (filter_len + num_channels - 1) / num_channels; + if (decimation_factor == num_channels) { + return static_cast(CTILE) * P * sizeof(filter_t); + } + const index_t gcd_val = std::gcd(num_channels, decimation_factor); + const index_t K = num_channels / gcd_val; + if (K > detail::MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_ROTATIONS) { + return MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES + 1; // exceeds budget, so FilterInSmem=false + } + return static_cast(CTILE) * K * P * sizeof(filter_t); +} + +template +inline size_t matxChannelizePoly1DInternal_SmemTiledSizeBytes( + const OutType &o, const InType &, const FilterType &filter, index_t decimation_factor) +{ + using input_t = typename InType::value_type; + + constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE; + constexpr int NOUT = MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT; + + const index_t M = o.Size(OutType::Rank() - 1); + const index_t filter_len = filter.Size(FilterType::Rank() - 1); + const index_t P = (filter_len + M - 1) / M; + const size_t input_smem = static_cast(P + NOUT - 1) * CTILE * sizeof(input_t); + + const size_t filter_smem = matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes(M, filter_len, decimation_factor); + if (filter_smem > MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES) { + return input_smem; + } + + const size_t filter_smem_aligned = filter_smem + + ((filter_smem % sizeof(input_t)) ? (sizeof(input_t) - filter_smem % sizeof(input_t)) : 0); + return filter_smem_aligned + input_smem; +} + +template +inline bool matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel( + const OutType &o, const InType &in, const FilterType &filter, index_t decimation_factor) +{ + const index_t num_channels = o.Size(OutType::Rank() - 1); + // Skip SmemTiled for small channel counts where most of CTILE would be + // idle threads. The generic ChannelizePoly1D kernel is more efficient + // in this regime. + if (num_channels <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE * 3 / 4) { + return false; + } + // The input circular buffer must fit in smem. The filter may or may not + // be included (FilterInSmem is decided separately). + return matxChannelizePoly1DInternal_SmemTiledSizeBytes(o, in, filter, decimation_factor) + <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_BYTES; +} + +template +inline void matxChannelizePoly1DInternal_SmemTiled( + OutType o, const InType &i, const FilterType &filter, + index_t decimation_factor, cudaStream_t stream) +{ +#ifdef __CUDACC__ + MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL) + + constexpr int CTILE = MATX_CHANNELIZE_POLY1D_SMEM_TILED_CTILE; + constexpr int NOUT = MATX_CHANNELIZE_POLY1D_SMEM_TILED_NOUT; + + const index_t num_channels = o.Size(OutType::Rank() - 1); + const index_t nout_per_channel = o.Size(OutType::Rank() - 2); + const int num_batches = static_cast(TotalSize(i) / i.Size(i.Rank() - 1)); + + const index_t gcd_val = std::gcd(num_channels, decimation_factor); + const int32_t K = static_cast(num_channels / gcd_val); + + const int channel_tiles = static_cast((num_channels + CTILE - 1) / CTILE); + // Target ~1024 spatial blocks (time × channel tiles) to saturate the GPU. + // For large channel counts, fewer time blocks are needed. + const int target_time_blocks = cuda::std::max(1, (1024 + channel_tiles - 1) / channel_tiles); + const int elem_per_block = static_cast( + (nout_per_channel + target_time_blocks - 1) / target_time_blocks); + const int time_blocks = static_cast( + (nout_per_channel + elem_per_block - 1) / elem_per_block); + + dim3 block(CTILE, NOUT); + dim3 grid(time_blocks, channel_tiles, num_batches); + + const index_t filter_len = filter.Size(FilterType::Rank() - 1); + const bool filter_in_smem = matxChannelizePoly1DInternal_SmemTiledFilterSmemBytes( + num_channels, filter_len, decimation_factor) <= MATX_CHANNELIZE_POLY1D_SMEM_TILED_MAX_FILTER_BYTES; + const size_t smem_size = matxChannelizePoly1DInternal_SmemTiledSizeBytes(o, i, filter, decimation_factor); + + // Use int32_t for intra-kernel index arithmetic when all tensor dimensions + // fit, avoiding 64-bit IMAD.WIDE instructions in the inner loops. + const index_t input_len = i.Size(i.Rank() - 1); + const bool use_32bit = (sizeof(index_t) <= sizeof(int32_t)) || + (static_cast(input_len) + num_channels <= std::numeric_limits::max() && + nout_per_channel <= std::numeric_limits::max() && + num_channels <= std::numeric_limits::max()); + + // Dispatch on MaximallyDecimated x FilterInSmem x IndexType + constexpr bool kMaxDec = true; + constexpr bool kOversampled = false; + constexpr bool kFiltSmem = true; + constexpr bool kFiltGlobal = false; + + auto launch = [&](auto idx_tag) { + using IdxT = decltype(idx_tag); + const IdxT epb = static_cast(elem_per_block); + const IdxT df = static_cast(decimation_factor); + if (decimation_factor == num_channels) { + if (filter_in_smem) { + ChannelizePoly1D_SmemTiled + <<>>(o, i, filter, epb, df, K); + } else { + ChannelizePoly1D_SmemTiled + <<>>(o, i, filter, epb, df, K); + } + } else { + if (filter_in_smem) { + ChannelizePoly1D_SmemTiled + <<>>(o, i, filter, epb, df, K); + } else { + ChannelizePoly1D_SmemTiled + <<>>(o, i, filter, epb, df, K); + } + } + }; + + if constexpr (sizeof(index_t) <= sizeof(int32_t)) { + launch(index_t{}); + } else { + if (use_32bit) { + launch(int32_t{}); + } else { + launch(index_t{}); + } + } +#endif +} + template inline void matxChannelizePoly1DInternal(OutType o, const InType &i, - const FilterType &filter, cudaStream_t stream) + const FilterType &filter, index_t decimation_factor, cudaStream_t stream) { #ifdef __CUDACC__ MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL) @@ -69,13 +229,26 @@ inline void matxChannelizePoly1DInternal(OutType o, const InType &i, const index_t nout_per_channel = o.Size(OutType::Rank()-2); const int num_batches = static_cast(TotalSize(i)/i.Size(i.Rank() - 1)); + using filter_t = typename FilterType::value_type; + const index_t filter_len = filter.Size(FilterType::Rank()-1); + const int THREADS = 256; - const index_t ELTS_PER_THREAD = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + const index_t ELTS_PER_THREAD = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; const int elem_blocks = static_cast( (nout_per_channel + ELTS_PER_THREAD - 1) / ELTS_PER_THREAD); dim3 grid(elem_blocks, static_cast(num_channels), num_batches); - ChannelizePoly1D<<>>( - o, i, filter); + if (decimation_factor == num_channels) { + // For M == D, cache one filter phase in dynamic shared memory if it fits. + const index_t filter_phase_len = (filter_len + num_channels - 1) / num_channels; + const size_t smem_needed = static_cast(filter_phase_len) * sizeof(filter_t); + const uint32_t smem_bytes = (smem_needed <= MATX_CHANNELIZE_POLY1D_MAX_FILTER_SMEM_BYTES) + ? static_cast(smem_needed) : 0; + ChannelizePoly1D<<>>( + o, i, filter, decimation_factor, smem_bytes); + } else { + ChannelizePoly1D<<>>( + o, i, filter, decimation_factor, 0); + } #endif } @@ -149,7 +322,7 @@ inline void matxChannelizePoly1DInternal_FusedChan(OutType o, const InType &i, const int num_batches = static_cast(TotalSize(i)/i.Size(i.Rank() - 1)); const int THREADS = 256; - const index_t ELTS_PER_THREAD = CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; + const index_t ELTS_PER_THREAD = detail::CHANNELIZE_POLY1D_ELEMS_PER_THREAD * THREADS; const int elem_blocks = static_cast( (nout_per_channel + ELTS_PER_THREAD - 1) / ELTS_PER_THREAD); dim3 grid(elem_blocks, 1, num_batches); @@ -195,9 +368,9 @@ inline void matxChannelizePoly1DUnpackInternal(DataType inout, cudaStream_t stre /** * @brief 1D polyphase channelizer. A channelizer separates an input signal into a set of - * constituent channels, each corresponding to a band of the input signal bandwidth. The current - * implementation only supports maximally decimated (i.e., critically sampled) channelizers wherein the - * decimation factor is equivalent to the number of channels and the channels are non-overlapping. + * constituent channels, each corresponding to a band of the input signal bandwidth. Supports both + * maximally decimated (critically sampled, decimation_factor == num_channels) and oversampled + * (decimation_factor < num_channels) cases, including rational oversampling ratios. * * @tparam OutType Type of output * @tparam InType Type of input @@ -208,16 +381,16 @@ inline void matxChannelizePoly1DUnpackInternal(DataType inout, cudaStream_t stre * @param in Input operator * @param f Filter operator * @param num_channels Number of channels in which to separate the signal. Must be greater than 1. - * @param decimation_factor Factor by which to downsample the input signal into the channels. Currently, - * the only supported value of decimation_factor is a value equal to num_channels. This corresponds to - * the maximally decimated, or critically sampled, case. It is also possible for decimation_factor to - * be less than num_channels, which corresponds to an oversampled case with overlapping channels, but - * this implementation does not yet support oversampled cases. + * @param decimation_factor Factor by which to downsample the input signal into the channels. When + * decimation_factor equals num_channels, this is the maximally decimated (critically sampled) case. + * When decimation_factor is less than num_channels, this is the oversampled case with overlapping + * channels. Both integer (num_channels % decimation_factor == 0) and rational oversampling ratios + * are supported. * @param stream CUDA stream on which to run the kernel(s) */ template inline void channelize_poly_impl(OutType out, const InType &in, const FilterType &f, - index_t num_channels, [[maybe_unused]] index_t decimation_factor, cudaStream_t stream = 0) { + index_t num_channels, index_t decimation_factor, cudaStream_t stream = 0) { MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) using OutputOp = std::remove_cv_t>; using InputOp = std::remove_cv_t>; @@ -243,14 +416,14 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType "channelize_poly: num_channels must be greater than 1"); MATX_ASSERT_STR(decimation_factor > 0, matxInvalidParameter, "channelize_poly: decimation_factor must be positive"); - MATX_ASSERT_STR(num_channels == decimation_factor, matxInvalidParameter, - "channelize_poly: currently only support decimation_factor == num_channels"); + MATX_ASSERT_STR(decimation_factor <= num_channels, matxInvalidParameter, + "channelize_poly: decimation_factor must be <= num_channels"); for(int i = 0 ; i < IN_RANK-1; i++) { MATX_ASSERT_STR(out.Size(i) == in.Size(i), matxInvalidDim, "channelize_poly: input/output must have matched batch sizes"); } - [[maybe_unused]] const index_t num_elem_per_channel = (in.Size(IN_RANK-1) + num_channels - 1) / num_channels; + [[maybe_unused]] const index_t num_elem_per_channel = (in.Size(IN_RANK-1) + decimation_factor - 1) / decimation_factor; MATX_ASSERT_STR(out.Size(OUT_RANK-1) == num_channels, matxInvalidDim, "channelize_poly: output size OUT_RANK-1 mismatch"); @@ -260,7 +433,8 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType // If neither the input nor the filter is complex, then the filtered samples will be real-valued // and we will use an R2C transform. Otherwise, we will use a C2C transform. if constexpr (! is_complex_v && ! is_complex_half_v && ! is_complex_v && ! is_complex_half_v) { - if (num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) { + // The fused-DFT kernel only supports the maximally decimated case (D == M). + if (decimation_factor == num_channels && num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) { matxChannelizePoly1DInternal_FusedChan(out, in, f, stream); } else { index_t start_dims[OUT_RANK], stop_dims[OUT_RANK]; @@ -300,10 +474,12 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType } }(); - if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) { + if (decimation_factor == num_channels && matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) { matxChannelizePoly1DInternal_Smem(fft_in_slice, in, f, stream); + } else if (matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel(out, in, f, decimation_factor)) { + matxChannelizePoly1DInternal_SmemTiled(fft_in_slice, in, f, decimation_factor, stream); } else { - matxChannelizePoly1DInternal(fft_in_slice, in, f, stream); + matxChannelizePoly1DInternal(fft_in_slice, in, f, decimation_factor, stream); } stop_dims[OUT_RANK-1] = (num_channels/2) + 1; auto out_packed = slice(out, start_dims, stop_dims); @@ -311,13 +487,16 @@ inline void channelize_poly_impl(OutType out, const InType &in, const FilterType matxChannelizePoly1DUnpackInternal(out, stream); } } else { - if (num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) { + // The fused-DFT kernel only supports the maximally decimated case (D == M). + if (decimation_factor == num_channels && num_channels <= detail::MATX_CHANNELIZE_POLY1D_FUSED_CHAN_KERNEL_THRESHOLD) { matxChannelizePoly1DInternal_FusedChan(out, in, f, stream); } else { - if (matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) { + if (decimation_factor == num_channels && matxChannelizePoly1DInternal_ShouldUseSmemKernel(out, in, f)) { matxChannelizePoly1DInternal_Smem(out, in, f, stream); + } else if (matxChannelizePoly1DInternal_ShouldUseSmemTiledKernel(out, in, f, decimation_factor)) { + matxChannelizePoly1DInternal_SmemTiled(out, in, f, decimation_factor, stream); } else { - matxChannelizePoly1DInternal(out, in, f, stream); + matxChannelizePoly1DInternal(out, in, f, decimation_factor, stream); } // Specify FORWARD here to prevent any normalization after the ifft. We do not // want any extra scaling on the output values. diff --git a/test/00_transform/ChannelizePoly.cu b/test/00_transform/ChannelizePoly.cu index 0ef0611c..f3cb4a7e 100644 --- a/test/00_transform/ChannelizePoly.cu +++ b/test/00_transform/ChannelizePoly.cu @@ -153,11 +153,10 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Simple) const index_t a_len = test_cases[i].a_len; const index_t f_len = test_cases[i].f_len; const index_t num_channels = test_cases[i].num_channels; - // Currently do not support oversampled channelization const index_t decimation_factor = num_channels; [[maybe_unused]] const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels; this->pb->template InitAndRunTVGenerator( - "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels}); + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); auto a = make_tensor({a_len}); auto f = make_tensor({f_len}); @@ -201,7 +200,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, MixedPrecision) const double mixed_thresh = 1e-5; this->pb->template InitAndRunTVGenerator( - "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels}); + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); auto a64 = make_tensor({a_len}); auto f64 = make_tensor({f_len}); this->pb->NumpyToTensorView(a64, "a"); @@ -228,7 +227,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, MixedPrecision) } this->pb->template InitAndRunTVGenerator>( - "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels}); + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); auto ac64 = make_tensor>({a_len}); this->pb->NumpyToTensorView(ac64, "a"); this->pb->NumpyToTensorView(f64, "filter_random_real"); @@ -274,7 +273,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, AccumProperty) const double mixed_thresh_complex_input = 1e-4; this->pb->template InitAndRunTVGenerator( - "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels}); + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); auto a64 = make_tensor({a_len}); auto f64 = make_tensor({f_len}); auto gold_output_hreal = make_tensor>({b_len_per_channel, num_channels}); @@ -323,7 +322,7 @@ TYPED_TEST(ChannelizePolyTestDoubleType, AccumProperty) } this->pb->template InitAndRunTVGenerator>( - "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels}); + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); auto ac64 = make_tensor>({a_len}); this->pb->NumpyToTensorView(ac64, "a"); this->pb->NumpyToTensorView(f64, "filter_random_real"); @@ -430,10 +429,9 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Batched) const index_t a_len = test_cases[i].a_len; const index_t f_len = test_cases[i].f_len; const index_t num_channels = test_cases[i].num_channels; - // Currently do not support oversampled channelization const index_t decimation_factor = num_channels; const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels; - std::vector sizes = { a_len, f_len, num_channels }; + std::vector sizes = { a_len, f_len, num_channels, decimation_factor }; sizes.insert(sizes.end(), test_cases[i].batch_dims.begin(), test_cases[i].batch_dims.end()); this->pb->template InitAndRunTVGenerator( "00_transforms", "channelize_poly_operators", "channelize", sizes); @@ -485,7 +483,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, IdentityFilter) // cases (a single channel channelizer is just a convolution). const index_t num_channels = 2; this->pb->template InitAndRunTVGenerator( - "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels }); + "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels, num_channels }); auto a = make_tensor({a_len}); auto f = make_tensor({f_len}); @@ -536,7 +534,7 @@ TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, Operators) [[maybe_unused]] const index_t f_len = 90; const index_t num_channels = 10; this->pb->template InitAndRunTVGenerator( - "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels }); + "00_transforms", "channelize_poly_operators", "channelize", { a_len, f_len, num_channels, num_channels }); auto a = make_tensor({a_len}); auto f = make_tensor({f_len}); @@ -841,4 +839,594 @@ TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003) } MATX_EXIT_HANDLER(); -} \ No newline at end of file +} + +// Oversampled channelizer test based on Harris 2003 receiver_40z.m. +// M=40, D=28, 600-tap filter designed with remez. Golden input and output +// are from the reference implementation. This test validates the oversampled +// phase rotation convention. +TYPED_TEST(ChannelizePolyTestDoubleType, Harris2003Oversampled) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = cuda::std::complex; + + const index_t input_len = 5600; + const index_t M = 40; + const index_t D = 28; + const index_t num_output = (input_len + D - 1) / D; + + this->pb->template InitAndRunTVGenerator( + "00_transforms", "harris2003_oversampled_operators", "channelize", + {input_len}); + + auto a = make_tensor({input_len}); + auto f = make_tensor({600}); + auto b = make_tensor({num_output, M}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter"); + (b = channelize_poly(a, f, M, D)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_golden", this->thresh); + + MATX_EXIT_HANDLER(); +} + +// Oversampled identity filter test: validates the causal commutator behavior. +// With a single-tap identity filter (E[r,0] = 1 for all r), the filtering step +// simply selects one input sample per branch. The causal commutator determines +// which sample each branch sees at each output time: +// - Sample k goes to branch (M-1 - k%M) +// - At output time n, only samples x[0..n*D+D-1] have arrived +// - Branch r's newest sample: largest k <= n*D+D-1 where k%M == M-1-r +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledIdentityFilter) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using InnerType = typename test_types::inner_type::type; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + { 20, 4, 2 }, // 2x oversampled, M=4 + { 20, 4, 1 }, // 4x oversampled, M=4 + { 30, 10, 3 }, // rational oversample, M=10 D=3 + { 20, 4, 3 }, // rational oversample, M=4 D=3 + { 20, 2, 1 }, // 2x oversampled, M=2 + { 2, 4, 4 }, // maximally decimated, a_len < M + { 3, 4, 4 }, // maximally decimated, a_len < M + { 1, 4, 2 }, // oversampled, a_len < M + { 3, 4, 2 }, // oversampled, a_len < M + { 2, 6, 3 }, // oversampled rational, a_len < M + }; + + for (size_t tc = 0; tc < sizeof(test_cases)/sizeof(test_cases[0]); tc++) { + const index_t a_len = test_cases[tc].a_len; + const index_t M = test_cases[tc].num_channels; + const index_t D = test_cases[tc].decimation_factor; + const index_t f_len = M; // identity filter: one tap per phase + const index_t num_output = (a_len + D - 1) / D; + + // Use the pybind generator only for random input data and not for the expected output + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, M, D}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({num_output, M}); + + this->pb->NumpyToTensorView(a, "a"); + + // Identity filter: h = [1, 1, ..., 1] of length M + // Polyphase decomposition gives E[r, 0] = 1 for all r (single tap per phase) + for (index_t i = 0; i < M; i++) { f(i) = static_cast(1.0); } + + this->exec.sync(); + + (b = channelize_poly(a, f, M, D)).run(this->exec); + + this->exec.sync(); + + // Compute expected output using the causal commutator model. + // With identity filter, g_r[n] = x[newest_r] if branch r has a sample, else 0. + // Then Y[n, :] = M * IFFT(g[n, :]) + for (index_t n = 0; n < num_output; n++) { + const index_t last_arrived = n * D + D - 1; + + // Compute the IDFT (with positive exponential) of the + // causal filtered values. This is M * IFFT. + for (index_t chan = 0; chan < M; chan++) { + ComplexType accum { 0 }; + for (index_t r = 0; r < M; r++) { + // Branch remap for Harris convention input mapping (D < M only) + const index_t r_remapped = (D < M) ? ((r + M - D) % M) : r; + const index_t s = M - 1 - r_remapped; // remapped for input access + + // Causal commutator: find newest sample for branch r + TestType sample_val = static_cast(0.0); + if (last_arrived >= s) { + const index_t newest = last_arrived - ((last_arrived - s) % M); + if (newest < a_len) { + sample_val = a(newest); + } + } + + // IDFT with positive exponential (Harris convention) + const double arg = 2.0 * M_PI * static_cast(r) * static_cast(chan) / static_cast(M); + double sinx, cosx; + sincos(arg, &sinx, &cosx); + ComplexType twiddle { static_cast(cosx), static_cast(sinx) }; + accum += sample_val * twiddle; + } + + ComplexType bval { b(n, chan) }; + ASSERT_NEAR(accum.real(), bval.real(), this->thresh) + << "mismatch at n=" << n << " chan=" << chan + << " M=" << M << " D=" << D << " a_len=" << a_len; + ASSERT_NEAR(accum.imag(), bval.imag(), this->thresh) + << "mismatch at n=" << n << " chan=" << chan + << " M=" << M << " D=" << D << " a_len=" << a_len; + } + } + } + + MATX_EXIT_HANDLER(); +} + +// Oversampled channelizer tests: integer oversampling (D divides M evenly) +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledInteger) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // 2x oversampling (D = M/2) + { 2500, 170, 10, 5 }, + { 2500, 187, 10, 5 }, + { 1800, 120, 8, 4 }, + { 37193, 41*8+4, 8, 4 }, + // 5x oversampling (D = M/5) + { 2500, 170, 10, 2 }, + // Large channel count + { 35000, 5*180, 180, 90 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// Oversampled channelizer tests: rational oversampling (D does not divide M evenly) +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledRational) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // M=10, D=3 (rational 10/3 oversampling) + { 2500, 170, 10, 3 }, + { 2500, 187, 10, 3 }, + // M=10, D=7 + { 2500, 170, 10, 7 }, + // M=9, D=4 + { 1800, 120, 9, 4 }, + // M=8, D=3 + { 37193, 41*8+4, 8, 3 }, + // M=11, D=4 + { 2500, 187, 11, 4 }, + // Large channel count, rational + { 35000, 5*181+17, 181, 60 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// Oversampled channelizer with batched inputs +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, OversampledBatched) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + constexpr int BATCH_DIMS = 2; + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + cuda::std::array batch_dims; + } test_cases[] = { + // Integer oversampling, batched + { 2500, 170, 10, 5, { 2, 3 } }, + // Rational oversampling, batched + { 2500, 170, 10, 3, { 2, 3 } }, + { 37193, 41*8+4, 8, 3, { 3, 1 } }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + std::vector sizes = { a_len, f_len, num_channels, decimation_factor }; + sizes.insert(sizes.end(), test_cases[i].batch_dims.begin(), test_cases[i].batch_dims.end()); + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", sizes); + const cuda::std::array a_dims = { + test_cases[i].batch_dims[0], test_cases[i].batch_dims[1], a_len + }; + const cuda::std::array b_dims = { + test_cases[i].batch_dims[0], test_cases[i].batch_dims[1], b_len_per_channel, num_channels + }; + auto a = make_tensor(a_dims); + auto f = make_tensor({f_len}); + auto b = make_tensor(b_dims); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// SmemTiled kernel tests: maximally decimated with M > 256 (forces SmemTiled +// because the existing _Smem kernel requires M <= 256) +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledMaximallyDecimated) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + } test_cases[] = { + // M=512, P=10 — exercises MaximallyDecimated SmemTiled with multiple channel tiles + { 256000, 10*512, 512 }, + // M=512, unaligned filter (not a multiple of M) + { 256000, 10*512+37, 512 }, + // M=1024, P=4 — large channel count, small P + { 102400, 4*1024, 1024 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = num_channels; + const index_t b_len_per_channel = (a_len + num_channels - 1) / num_channels; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize", {a_len, f_len, num_channels, num_channels}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// SmemTiled kernel tests: large-M oversampled cases that exercise multi-tile +// channel tiling in the oversampled path +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledLargeM) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // M=1024, D=512, 2x integer oversampling, K=2 + { 256000, 8*1024, 1024, 512 }, + // M=512, D=256, 2x integer oversampling, K=2 + { 128000, 10*512, 512, 256 }, + // M=512, D=256, unaligned filter + { 128000, 10*512+19, 512, 256 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// SmemTiled kernel tests: rational oversampling with large K (many distinct +// filter phases) to stress the K-phase filter storage +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledLargeK) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // M=128, D=97, K=128 (gcd=1, all phases distinct) + { 25600, 5*128, 128, 97 }, + // M=64, D=37, K=64 (gcd=1) + { 12800, 8*64, 64, 37 }, + // M=100, D=30, K=10 (gcd=10), moderate K + { 20000, 7*100, 100, 30 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// SmemTiled oversampled with FilterInSmem=true. +// Requires CTILE * K * P * sizeof(filter_t) <= 2048. +// With K=2 (2x integer oversample) and small P, the filter fits in smem. +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, SmemTiledOversampledFilterInSmem) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // M=1024, D=512, K=2, P=4: filter_smem = 64*2*4*sizeof(filter_t) = 2048 bytes (float) + { 102400, 4*1024, 1024, 512 }, + // M=512, D=256, K=2, P=3: filter_smem = 64*2*3*sizeof(filter_t) = 1536 bytes (float) + { 51200, 3*512, 512, 256 }, + // M=1024, D=512, K=2, P=2: filter_smem = 64*2*2*sizeof(filter_t) = 1024 bytes (float) + { 102400, 2*1024, 1024, 512 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// Complex filter tests to exercise the complex×complex channelize_cmac path. +// Previous tests all use real filters. +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, ComplexFilter) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // Complex filter, D==M, small M (FusedChan path) + { 1800, 120, 4, 4 }, + // Complex filter, D==M, medium M (_Smem path) + { 2500, 170, 10, 10 }, + // Complex filter, D==M, large M (SmemTiled path) + { 128000, 10*512, 512, 512 }, + // Complex filter, oversampled (SmemTiled oversampled path) + { 2500, 170, 10, 5 }, + // Complex filter, oversampled, large M + { 128000, 10*512, 512, 256 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + + // Use complex input AND complex filter + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} + +// Oversampled with very large P so SmemTiled input buffer exceeds 48KB, +// falling back to the generic ChannelizePoly1D (MaximallyDecimated=false) path. +// For complex input: (P+3) * 64 * 8 > 49152 → P > 93. +TYPED_TEST(ChannelizePolyTestNonHalfFloatTypes, GenericOversampledFallback) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + using ComplexType = typename test_types::complex_type::type; + + struct { + index_t a_len; + index_t f_len; + index_t num_channels; + index_t decimation_factor; + } test_cases[] = { + // M=8, D=4, P=100: input_smem = 103*64*8 = 52736 > 49152 for complex + { 8000, 100*8, 8, 4 }, + // M=10, D=5, P=200: clearly exceeds 48KB for any type + { 20000, 200*10, 10, 5 }, + }; + + for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) { + const index_t a_len = test_cases[i].a_len; + const index_t f_len = test_cases[i].f_len; + const index_t num_channels = test_cases[i].num_channels; + const index_t decimation_factor = test_cases[i].decimation_factor; + const index_t b_len_per_channel = (a_len + decimation_factor - 1) / decimation_factor; + this->pb->template InitAndRunTVGenerator( + "00_transforms", "channelize_poly_operators", "channelize_oversampled", + {a_len, f_len, num_channels, decimation_factor}); + + auto a = make_tensor({a_len}); + auto f = make_tensor({f_len}); + auto b = make_tensor({b_len_per_channel, num_channels}); + + this->pb->NumpyToTensorView(a, "a"); + this->pb->NumpyToTensorView(f, "filter_random"); + (b = channelize_poly(a, f, num_channels, decimation_factor)).run(this->exec); + + this->exec.sync(); + MATX_TEST_ASSERT_COMPARE(this->pb, b, "b_random", this->thresh); + } + + MATX_EXIT_HANDLER(); +} diff --git a/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin b/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin new file mode 100755 index 00000000..d954ce67 Binary files /dev/null and b/test/test_vectors/channelize_poly/harris2003_oversampled_testvector.bin differ diff --git a/test/test_vectors/generators/00_transforms.py b/test/test_vectors/generators/00_transforms.py index 555fc3b9..ff0a887e 100755 --- a/test/test_vectors/generators/00_transforms.py +++ b/test/test_vectors/generators/00_transforms.py @@ -175,9 +175,9 @@ def __init__(self, dtype: str, size: List[int]): signal_len = size[0] filter_len = size[1] num_channels = size[2] - # Remaining dimensions are batch dimensions - if len(size) > 3: - a_dims = size[3:] + # size[3] is decimation_factor; remaining dimensions are batch dimensions + if len(size) > 4: + a_dims = size[4:] a_dims = np.append(a_dims, signal_len) else: a_dims = [signal_len] @@ -246,6 +246,157 @@ def idivup(a, b) -> int: return (a+b-1)//b self.res['b_random_hreal'] = out_hreal return self.res + def channelize_oversampled(self) -> Dict[str, np.ndarray]: + """General polyphase channelizer supporting arbitrary decimation factor D <= M. + + Uses size[3] as decimation_factor (D). When D == M, this produces the same + result as channelize(). When D < M, this is the oversampled case. + """ + def idivup(a, b) -> int: return (a+b-1)//b + + h = self.res['filter_random'] + num_channels = self.res['num_channels'] + decimation_factor = self.size[3] + x = self.res['a'] + + M = num_channels + D = decimation_factor + + # Polyphase decompose: E[r, q] = h[q*M + r], zero-pad h to multiple of M + num_taps_per_channel = idivup(h.size, M) + h_padded = h + if M * num_taps_per_channel > h.size: + h_padded = np.pad(h, (0, M * num_taps_per_channel - h.size)) + E = np.reshape(h_padded, (M, num_taps_per_channel), order='F') + + num_output_samples = idivup(x.shape[-1], D) + num_batches = x.size // x.shape[-1] + out = np.zeros((num_batches, num_output_samples, M), dtype=np.complex128) + out_hreal = np.zeros((num_batches, num_output_samples, M), dtype=np.complex128) + xr = np.reshape(x, (num_batches, x.shape[-1])) + + E_real = np.real(E) + n_idx = np.arange(num_output_samples) + + # Precompute per-(n, r) quantities as 2D arrays [num_output_samples, M] + # Branch remap: r_remapped = (r + M - D) % M changes the input sample + # mapping so newest D samples land in Harris convention branches. + # Phase uses the original logical branch index r (not remapped). + n_all = np.arange(num_output_samples)[:, np.newaxis] # [nout, 1] + r_all = np.arange(M)[np.newaxis, :] # [1, M] + r_remapped = (r_all + M - D) % M # [1, M] + s_all = M - 1 - r_remapped # [1, M] (remapped for input access) + last_arrived = n_all * D + D - 1 # [nout, 1] broadcast + valid = last_arrived >= s_all # [nout, M] + A = np.where(valid, last_arrived - s_all, 0) # [nout, M] + newest = np.where(valid, last_arrived - (A % M), 0) # [nout, M] + causal_count = np.where(valid, A // M + 1, 0) # [nout, M] + phase = np.where(valid, + (r_all + (n_all * D) % M) % M, + 0).astype(int) # [nout, M] (original r, NOT remapped) + + for batch_ind in range(num_batches): + xb = xr[batch_ind, :] + input_len = xb.shape[-1] + filtered = np.zeros((num_output_samples, M), dtype=np.complex128) + filtered_hreal = np.zeros((num_output_samples, M), dtype=np.complex128) + + # Loop only over taps (P iterations, typically small) + for q in range(num_taps_per_channel): + idx = newest - q * M # [nout, M] + tap_valid = valid & (idx >= 0) & (idx < input_len) & (q < causal_count) + if not np.any(tap_valid): + continue + # Gather input samples and filter taps for all valid (n, r) pairs + inp_vals = np.where(tap_valid, xb[np.clip(idx, 0, input_len - 1).astype(int)], 0) + h_vals = E[phase, q] # [nout, M] + h_real_vals = E_real[phase, q] + filtered += np.where(tap_valid, h_vals * inp_vals, 0) + filtered_hreal += np.where(tap_valid, h_real_vals * inp_vals, 0) + # IFFT across channels, scale by M + out[batch_ind, :, :] = ifft(filtered, n=M, axis=1) * M + out_hreal[batch_ind, :, :] = ifft(filtered_hreal, n=M, axis=1) * M + + # Reshape output: (num_batches, num_output_samples, M) -> match MatX layout + # MatX output is (...batch_dims..., num_output_samples, num_channels) + if num_batches > 1: + s = list(x.shape) + s[-1] = num_output_samples + s = np.append(s, M) + out = np.reshape(out, s) + out_hreal = np.reshape(out_hreal, s) + else: + out = np.reshape(out, out.shape[1:]) + out_hreal = np.reshape(out_hreal, out_hreal.shape[1:]) + + self.res['b_random'] = out + self.res['b_random_hreal'] = out_hreal + return self.res + +class harris2003_oversampled_operators: + """Test case based on the oversampled channelizer from Harris 2003 + (receiver_40z.m): M=40, D=28, 600-tap filter designed with remez. + + Golden input and output are read from a binary file. The filter is + designed using scipy.signal.remez with the same specification as the + reference implementation (verified to match within machine epsilon). + The binary file layout is: + [real(xx), 5600 doubles][imag(xx), 5600 doubles] + [real(yy), 40*200 doubles][imag(yy), 40*200 doubles] + where yy is [40, 200] stored column-major (channels × output steps) + with fftshift applied (DC centered). + """ + def __init__(self, dtype: str, size: List[int]): + self.size = size + self.dtype = dtype + + def channelize(self) -> Dict[str, np.ndarray]: + import os + import scipy.signal + + M = 40 + D = 28 + input_len = 5600 + num_output = 200 + + # Design the 600-tap prototype filter using Parks-McClellan (remez) + # with the same specification as the Harris 2003 reference. + freqs = np.array([0, 12, 17, 56, 57, 84, 85, 112, 113, 140, + 141, 168, 169, 196, 197, 224, 225, 252, 253, 280, + 281, 308, 309, 336, 337, 364, 365, 392, 393, 420, + 421, 448, 449, 476, 477, 504, 505, 532, 533, 560]) / 560.0 + gains = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + weights = [1, 6, 7, 9, 11, 13, 15, 17, 19, 21, + 23, 25, 27, 29, 31, 33, 35, 37, 39, 41] + h = scipy.signal.remez(600, freqs, gains, weight=weights, fs=2) + h = h.astype(np.float64) + + # Read golden input and output from binary test vector file + tv_dir = os.path.join(os.path.dirname(__file__), '..', 'channelize_poly') + tv_path = os.path.join(tv_dir, 'harris2003_oversampled_testvector.bin') + data = np.fromfile(tv_path, dtype=np.float64) + + offset = 0 + xx_re = data[offset:offset + input_len]; offset += input_len + xx_im = data[offset:offset + input_len]; offset += input_len + a = (xx_re + 1j * xx_im).astype(np.complex128) + + yy_re = data[offset:offset + M * num_output]; offset += M * num_output + yy_im = data[offset:offset + M * num_output]; offset += M * num_output + # The reference output uses fftshift (DC centered). Apply ifftshift + # to convert to our convention (DC at index 0). + from scipy.fft import ifftshift + yy = (yy_re + 1j * yy_im).reshape((M, num_output), order='F') + b = ifftshift(yy, axes=0).T + + res = { + 'a': a, + 'filter': h, + 'b_golden': b, + } + return res + + class fft_operators: def __init__(self, dtype: str, size: List[int]): self.size = size