Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 158 additions & 42 deletions examples/channelize_poly_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cassert>
#include <cstdio>
#include <cmath>
#include <cstring>
#include <memory>
#include <fstream>
#include <istream>
Expand All @@ -51,8 +52,17 @@ constexpr int NUM_WARMUP_ITERATIONS = 2;
// Number of iterations per timed test. Iteration times are averaged in the report.
constexpr int NUM_ITERATIONS = 20;

template <typename InType, typename OutType>
void ChannelizePolyBench(matx::index_t channel_start, matx::index_t channel_stop)
// Returns a human-readable name for the element types supported by this
// benchmark, used in the report header. Unrecognized types map to "unknown".
template <typename T>
const char *TypeName() {
  if constexpr (std::is_same_v<T, float>) return "float";
  else if constexpr (std::is_same_v<T, double>) return "double";
  else if constexpr (std::is_same_v<T, cuda::std::complex<float>>) return "complex<float>";
  else if constexpr (std::is_same_v<T, cuda::std::complex<double>>) return "complex<double>";
  else return "unknown";
}

// Benchmarks matx::channelize_poly for a fixed channel count and decimation
// factor across a table of (num_batches, filter_len_per_channel, input_len)
// test cases, printing one averaged-timing report line per case.
//
// InType/FilterType are the input/filter element types; OutType is the
// output element type. Cases whose input is shorter than 100 samples per
// channel are skipped as too small to time meaningfully.
template <typename InType, typename OutType, typename FilterType>
void ChannelizePolyBench(matx::index_t num_channels, matx::index_t decimation_factor)
{
  struct {
    matx::index_t num_batches;
    matx::index_t filter_len_per_channel;
    matx::index_t input_len;
  } test_cases[] = {
    { 1, 17, 256000 },
    { 42, 17, 256000 },
    { 128, 17, 256000 },
    { 1, 17, 8192*1024 },
    { 42, 17, 8192*1024 }
  };

  cudaStream_t stream;
  cudaStreamCreate(&stream);
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  // NOTE(review): exec is default-constructed while the timing events are
  // recorded on `stream` — confirm the executor and the event stream agree,
  // otherwise the measured interval may not bracket the benchmark kernels.
  cudaExecutor exec{};

  for (size_t i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); i++) {
    const matx::index_t num_batches = test_cases[i].num_batches;
    const matx::index_t filter_len = test_cases[i].filter_len_per_channel * num_channels;
    const matx::index_t input_len = test_cases[i].input_len;
    // Ceil-divide: each channel gets every decimation_factor-th output sample.
    const matx::index_t output_len_per_channel = (input_len + decimation_factor - 1) / decimation_factor;

    // Skip cases with fewer than 100 samples per channel; they are too short
    // to produce a meaningful timing measurement.
    if (input_len < num_channels * 100) {
      continue;
    }

    auto input = matx::make_tensor<InType, 2>({num_batches, input_len});
    auto filter = matx::make_tensor<FilterType, 1>({filter_len});
    auto output = matx::make_tensor<OutType, 3>({num_batches, output_len_per_channel, num_channels});
    (input = static_cast<InType>(1)).run(exec);
    (filter = static_cast<FilterType>(1)).run(exec);

    // Warm up to exclude one-time costs (JIT, allocator, cache) from timing.
    for (int k = 0; k < NUM_WARMUP_ITERATIONS; k++) {
      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
    }

    exec.sync();

    float elapsed_ms = 0.0f;
    cudaEventRecord(start, stream);
    for (int k = 0; k < NUM_ITERATIONS; k++) {
      (output = channelize_poly(input, filter, num_channels, decimation_factor)).run(exec);
    }
    cudaEventRecord(stop, stream);
    exec.sync();
    MATX_CUDA_CHECK_LAST_ERROR();
    cudaEventElapsedTime(&elapsed_ms, start, stop);

    const double avg_elapsed_us = (static_cast<double>(elapsed_ms)/NUM_ITERATIONS)*1.0e3;
    printf("Batches: %5" MATX_INDEX_T_FMT " Channels: %5" MATX_INDEX_T_FMT " Decimation: %5" MATX_INDEX_T_FMT " FilterLen: %5" MATX_INDEX_T_FMT
      " InputLen: %7" MATX_INDEX_T_FMT " Elapsed Usecs: %12.1f MPts/sec: %12.3f\n",
      num_batches, num_channels, decimation_factor, filter_len, input_len, avg_elapsed_us,
      static_cast<double>(num_batches*num_channels*output_len_per_channel)/1.0e6/(avg_elapsed_us/1.0e6));
  }

  MATX_CUDA_CHECK_LAST_ERROR();

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  cudaStreamDestroy(stream);
}

int main([[maybe_unused]] int argc, [[maybe_unused]] char **argv)
// Scalar precision of a benchmark element type.
enum class Precision { Float, Double };
// Whether a benchmark element type is real- or complex-valued.
enum class Domain { Real, Complex };

// Command-line-selectable benchmark configuration. Defaults: complex<float>
// input, real float filter, M = 10 channels, D resolved to M at startup.
struct BenchConfig {
  Precision input_prec = Precision::Float;   // input sample precision
  Domain input_domain = Domain::Complex;     // input real/complex selection
  Precision filter_prec = Precision::Float;  // filter tap precision
  Domain filter_domain = Domain::Real;       // filter real/complex selection
  matx::index_t M = 10; // number of channels
  matx::index_t D = -1; // decimation factor (-1 means D = M)
};

// Writes the benchmark's command-line help text to stdout.
// prog is the executable name to show in the usage line (typically argv[0]).
void PrintUsage(const char *prog) {
  printf("Usage: %s [options]\n", prog);
  fputs(" --input-type <type> Input type: float, double, cf, cd (default: cf)\n", stdout);
  fputs(" --filter-type <type> Filter type: float, double, cf, cd (default: float)\n", stdout);
  fputs(" -M <N> Number of channels (default: 10)\n", stdout);
  fputs(" -D <N> Decimation factor, 0 < D <= M (default: M)\n", stdout);
  fputs("\n", stdout);
  fputs("Type shorthands: float, double, cf (complex<float>), cd (complex<double>)\n", stdout);
}

// Translates a command-line type shorthand ("float", "double", "cf", "cd")
// into a (Precision, Domain) pair written through prec/dom. Returns false
// for an unrecognized string, leaving prec and dom untouched.
bool ParseType(const char *s, Precision &prec, Domain &dom) {
  struct Mapping { const char *key; Precision p; Domain d; };
  static const Mapping kMappings[] = {
    { "float",  Precision::Float,  Domain::Real    },
    { "double", Precision::Double, Domain::Real    },
    { "cf",     Precision::Float,  Domain::Complex },
    { "cd",     Precision::Double, Domain::Complex },
  };
  for (const Mapping &m : kMappings) {
    if (strcmp(s, m.key) == 0) {
      prec = m.p;
      dom = m.d;
      return true;
    }
  }
  return false;
}

// Maps a possibly-complex element type to its underlying scalar type:
// ScalarType<T>::type is T for real T, and the specialization below
// unwraps cuda::std::complex<T> to T.
template <typename T>
struct ScalarType { using type = T; };
// Partial specialization: strip the complex wrapper.
template <typename T>
struct ScalarType<cuda::std::complex<T>> { using type = T; };

// Prints the resolved type configuration and channel/decimation parameters,
// then runs the benchmark. The output element type is always complex with
// the input's scalar precision.
template <typename InType, typename FilterType>
void DispatchBench(const BenchConfig &cfg) {
  using scalar_t = typename ScalarType<InType>::type;
  using OutType = cuda::std::complex<scalar_t>;

  const char *in_name   = TypeName<InType>();
  const char *filt_name = TypeName<FilterType>();
  const char *out_name  = TypeName<OutType>();
  printf("Input: %-16s Filter: %-16s Output: %-16s\n", in_name, filt_name, out_name);
  printf("M: %" MATX_INDEX_T_FMT " D: %" MATX_INDEX_T_FMT "\n\n", cfg.M, cfg.D);

  ChannelizePolyBench<InType, OutType, FilterType>(cfg.M, cfg.D);
}

// Maps the runtime (precision, domain) selections in cfg onto concrete C++
// element types and invokes DispatchBench with the matching template
// arguments. The filter type is resolved first, then the input type; since
// each enum has exactly two values, the mapping is exhaustive.
void RunBench(const BenchConfig &cfg) {
  auto with_filter = [&](auto filt_tag) {
    auto launch = [&](auto in_tag) {
      DispatchBench<decltype(in_tag), decltype(filt_tag)>(cfg);
    };
    const bool real_input = (cfg.input_domain == Domain::Real);
    if (cfg.input_prec == Precision::Float) {
      if (real_input) launch(float{});
      else            launch(cuda::std::complex<float>{});
    } else {
      if (real_input) launch(double{});
      else            launch(cuda::std::complex<double>{});
    }
  };

  const bool real_filter = (cfg.filter_domain == Domain::Real);
  if (cfg.filter_prec == Precision::Float) {
    if (real_filter) with_filter(float{});
    else             with_filter(cuda::std::complex<float>{});
  } else {
    if (real_filter) with_filter(double{});
    else             with_filter(cuda::std::complex<double>{});
  }
}

int main(int argc, char **argv)
{
MATX_ENTER_HANDLER();

const matx::index_t channel_start = 3;
const matx::index_t channel_stop = 10;
BenchConfig cfg;

// printf("Benchmarking float -> complex<float>\n");
// ChannelizePolyBench<float,cuda::std::complex<float>>(channel_start, channel_stop);
for (int i = 1; i < argc; i++) {
if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
PrintUsage(argv[0]);
return 0;
} else if (strcmp(argv[i], "--input-type") == 0 && i + 1 < argc) {
if (!ParseType(argv[++i], cfg.input_prec, cfg.input_domain)) {
fprintf(stderr, "Unknown input type: %s\n", argv[i]);
return 1;
}
} else if (strcmp(argv[i], "--filter-type") == 0 && i + 1 < argc) {
if (!ParseType(argv[++i], cfg.filter_prec, cfg.filter_domain)) {
fprintf(stderr, "Unknown filter type: %s\n", argv[i]);
return 1;
}
} else if (strcmp(argv[i], "-M") == 0 && i + 1 < argc) {
cfg.M = static_cast<matx::index_t>(atol(argv[++i]));
} else if (strcmp(argv[i], "-D") == 0 && i + 1 < argc) {
cfg.D = static_cast<matx::index_t>(atol(argv[++i]));
} else {
fprintf(stderr, "Unknown option: %s\n", argv[i]);
PrintUsage(argv[0]);
return 1;
}
}

printf("Benchmarking complex<float> -> complex<float>\n");
ChannelizePolyBench<cuda::std::complex<float>,cuda::std::complex<float>>(channel_start, channel_stop);
// Default D to M (maximally decimated) if not specified
if (cfg.D <= 0) {
cfg.D = cfg.M;
}

if (cfg.D <= 0 || cfg.D > cfg.M) {
fprintf(stderr, "Error: decimation factor D must satisfy 0 < D <= M (got M=%" MATX_INDEX_T_FMT ", D=%" MATX_INDEX_T_FMT ")\n",
cfg.M, cfg.D);
return 1;
}

// printf("Benchmarking double -> complex<double>\n");
// ChannelizePolyBench<double,cuda::std::complex<double>>(channel_start, channel_stop);
if (cfg.M < 2) {
fprintf(stderr, "Error: number of channels M must be >= 2 (got M=%" MATX_INDEX_T_FMT ")\n", cfg.M);
return 1;
}

// printf("Benchmarking complex<double> -> complex<double>\n");
// ChannelizePolyBench<cuda::std::complex<double>,cuda::std::complex<double>>(channel_start, channel_stop);
RunBench(cfg);

matx::ClearCachesAndAllocations();

Expand Down
Loading