diff --git a/.gitignore b/.gitignore index b729af5d6..80ad9627a 100644 --- a/.gitignore +++ b/.gitignore @@ -24,4 +24,13 @@ tmp/ src/cuda/GPU_Microbenchmark/ubench/**/* !src/cuda/GPU_Microbenchmark/ubench/**/*/ !src/cuda/GPU_Microbenchmark/ubench/**/*.* -!src/cuda/GPU_Microbenchmark/ubench/**/Makefile \ No newline at end of file +!src/cuda/GPU_Microbenchmark/ubench/**/Makefile + +# Ignore compiled CUDA binaries +*.fatbin + +# Ignore VPI symlinks, but keep vpi_background_subtractor/main.cpp tracked +src/cuda/HPC/vpi/* +!src/cuda/HPC/vpi/vpi_background_subtractor/ +src/cuda/HPC/vpi/vpi_background_subtractor/* +!src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index a26c652f2..e48c7a85a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -11,3 +11,9 @@ path = src/cuda/pytorch_examples url = https://github.com/accel-sim/pytorch_examples.git branch = inference_accelsim_v2 +[submodule "src/cuda/HPC/external/cugraph"] + path = src/cuda/HPC/external/cugraph + url = https://github.com/rapidsai/cugraph.git +[submodule "src/cuda/HPC/external/newton"] + path = src/cuda/HPC/external/newton + url = https://github.com/newton-physics/newton.git diff --git a/README.md b/README.md index 09db7bfda..02a0fd9de 100644 --- a/README.md +++ b/README.md @@ -68,3 +68,16 @@ To compile everything above for AccelWattch: ``` make accelwattch -C ./src ``` + +## H100 Benchmark Suite + +The H100 suite contains 15 modern GPU workloads from H100 profiling and analysis: + +- **cuFFT** (2 apps): FFT operations using cuFFT library +- **cuSolver** (2 apps): Linear algebra using cuSolver library +- **Image Processing** (3 apps): Wavelet transform, Gaussian filter, FDTD3d +- **Graph Algorithms** (2 apps): BFS and MST using cuGraph (git submodule) +- **Physics Simulation** (3 apps): Newton physics engine benchmarks (git submodule) +- **Computer Vision** (3 apps): VPI-based vision processing (requires VPI 4.0) + +See [src/cuda/HPC/README.md](src/cuda/HPC/README.md) for details. 
diff --git a/get_data.sh b/get_data.sh index 77ce877ae..8654f9759 100755 --- a/get_data.sh +++ b/get_data.sh @@ -10,3 +10,17 @@ if [ ! -d $DATA_ROOT ]; then tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT rm all.gpgpu-sim-app-data.tgz fi + +echo "Generating HPC benchmark data..." +if [ -f $BASH_ROOT/src/cuda/HPC/get_graph_data.sh ]; then + bash $BASH_ROOT/src/cuda/HPC/get_graph_data.sh || echo "Warning: Graph data generation failed" +fi +if [ -f $BASH_ROOT/src/cuda/HPC/get_image_data.sh ]; then + bash $BASH_ROOT/src/cuda/HPC/get_image_data.sh || echo "Warning: Image data generation failed" +fi +if [ -f $BASH_ROOT/src/cuda/HPC/get_dwt_data.sh ]; then + bash $BASH_ROOT/src/cuda/HPC/get_dwt_data.sh || echo "Warning: DWT data generation failed" +fi +if [ -f $BASH_ROOT/src/cuda/HPC/get_vpi_data.sh ]; then + bash $BASH_ROOT/src/cuda/HPC/get_vpi_data.sh +fi diff --git a/src/Makefile b/src/Makefile index a890d58f7..acb019d08 100644 --- a/src/Makefile +++ b/src/Makefile @@ -4,7 +4,7 @@ $(error You must run "source setup_environment before calling make") endif ifeq ($(CUDA_GT_7), 1) -all: GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface # mlperf_training +all: GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface hpc # mlperf_training else ifeq ($(CUDA_GT_4), 1) all: pannotia rodinia_2.0-ft proxy-apps dragon-naive microbench rodinia-3.1 ispass-2009 dragon-cdp lonestargpu-2.0 polybench parboil shoc custom_apps @@ -19,7 +19,7 @@ accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw #Disable clean for now, It has a bug! 
# clean_dragon-naive clean_pannotia clean_proxy-apps -clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark +clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark clean_hpc clean_accelwattch: clean_rodinia-3.1 clean_parboil clean_cutlass clean_cuda_samples-11.0 clean_cuda_samples_hw_power clean_rodinia-3.1_hw_power clean_parboil_hw_power clean_accelwattch_ubench clean_data: @@ -543,6 +543,52 @@ clean_heterosync: clean_cutlass: rm -rf cuda/cutlass-bench/build +############################################################################### +# Modern HPC Benchmarks - (CUDA 11+, sm_75+) +############################################################################### +hpc: + mkdir -p $(BINDIR)/$(BINSUBDIR)/ + # Initialize and update submodules (like cutlass pattern) + # Clean up any corrupted submodule directories (check if .git exists) + @for dir in cuda/cuda-samples cuda/HPC/external/cugraph cuda/HPC/external/newton; do \ + if [ -d $$dir ] && [ ! 
-d $$dir/.git ]; then \ + echo "Removing corrupted submodule directory: $$dir"; \ + rm -rf $$dir; \ + fi; \ + done + git submodule sync && git submodule update --init --recursive + + # Build simple apps (cuFFT, cuSolver, image processing) + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC simple + + + + # Build Newton apps (Newton submodule) + cp -r cuda/HPC/newton $(BINDIR)/$(BINSUBDIR)/ + mkdir -p $(BINDIR)/$(BINSUBDIR)/external + cp -r cuda/HPC/external/newton $(BINDIR)/$(BINSUBDIR)/external/ + bash $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh + echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/diffsim_ball/example_diffsim_ball.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball + chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball + echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/robot_cartpole/example_robot_cartpole.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole + chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole + + # Build VPI apps from symlinked sources and copy binaries + $(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC vpi + cp cuda/HPC/vpi/vpi_background_subtractor/build/vpi_sample_14_background_subtractor $(BINDIR)/$(BINSUBDIR)/vpi_background_subtractor + cp cuda/HPC/vpi/vpi_orb_feature_detector/build/vpi_sample_18_orb_feature_detector $(BINDIR)/$(BINSUBDIR)/vpi_orb_feature_detector + cp cuda/HPC/vpi/vpi_stereo_disparity/build/vpi_sample_02_stereo_disparity $(BINDIR)/$(BINSUBDIR)/vpi_stereo_disparity + + # Build graph apps (cuGraph submodule) + -$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC graph && \ + cp cuda/HPC/graph/mst_standalone/build/mst_standalone $(BINDIR)/$(BINSUBDIR)/ + +clean_hpc: + $(SETENV) $(MAKE) -C cuda/HPC clean + rm -rf cuda/HPC/external/cugraph/build + rm -rf cuda/HPC/external/newton/build + rm -rf cuda/HPC/newton/newton_venv + # clean_deeplearning: # $(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean 
@@ -711,7 +757,7 @@ clean_pytorch_examples: rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae clean_cuda_samples: - $(MAKE) clean -C ./cuda/cuda-samples/build + -$(MAKE) clean -C ./cuda/cuda-samples/build 2>/dev/null || true clean_huggingface: rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface diff --git a/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h new file mode 100644 index 000000000..fd178eaef --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h @@ -0,0 +1,25 @@ + + +#ifndef BLACK_GB202_DEF_H +#define BLACK_GB202_DEF_H + +#include "./common/common.h" +#include "./common/deviceQuery.h" + +#define L1_SIZE (256 * 1024) // Max L1 size in bytes + +// #define CLK_FREQUENCY 2010 // frequency in MHz + +#define ISSUE_MODEL issue_model::single // single issue core or dual issue +#define CORE_MODEL core_model::subcore // subcore model or shared model +#define DRAM_MODEL dram_model::GDDR6 // memory type +#define WARP_SCHEDS_PER_SM 4 // number of warp schedulers per SM + + +#define SASS_hmma_per_PTX_wmma 2 + + +#define L2_BANKS_PER_MEM_CHANNEL 1 +#define L2_BANK_WIDTH_in_BYTE 64 + +#endif diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 0b0306596..4def21d8a 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -108,7 +108,9 @@ inline void parseGpuConfigArgs(int argc, char *argv[]) ++i; } config.MAX_WARPS_PER_SM = config.MAX_THREADS_PER_SM / config.WARP_SIZE; - config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f; + // Note: MEM_CLK_FREQUENCY is already in MHz from initializeDeviceProp (line 313) + // Do not convert to GHz - the bandwidth calculation expects MHz + // config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f; config.BLOCKS_PER_SM = config.MAX_THREADS_PER_SM / config.THREADS_PER_BLOCK; 
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK; config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; diff --git a/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h index 8d12b2456..227011654 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h @@ -14,6 +14,7 @@ // #include "volta_TITANV_hw_def.h" // #include "ampere_A100_hw_def.h" -#include "blackwell_B200_hw_def.h" +// #include "blackwell_B200_hw_def.h" +#include "blackwell_RTX5090_hw_def.h" #endif diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu index 7ef201c6e..b279f8b49 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu @@ -96,12 +96,13 @@ int main(int argc, char *argv[]) } } + config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM + config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Recalculate after changing BLOCKS_NUM + unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE; assert(ARRAY_SIZE * sizeof(float) < config.L2_SIZE); // Array size must not exceed L2 size - - config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM - + // config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM // Commented out - causes mismatch on GPUs with MAX_THREADS_PER_SM != 2048 uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t)); diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu index 841bdee17..78c130e22 100644 --- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu +++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu @@ -145,13 +145,32 @@ 
int main(int argc, char *argv[]) float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop); - unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types - float max_bw = (float)config.MEM_BITWIDTH * config.MEM_CLK_FREQUENCY * 2 / 1e3 / 8; - mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0])); - printf("Mem BW= %f (Byte/Clk)\n", mem_bw); - printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6); - printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw); - printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100); - - printf("Total Clk number = %u \n", stopClk[0] - startClk[0]); + // Find min and max clocks across all threads to get actual kernel execution time + uint32_t minStart = startClk[0], maxStop = stopClk[0]; + for (unsigned i = 1; i < config.TOTAL_THREADS; i++) { + if (startClk[i] < minStart) minStart = startClk[i]; + if (stopClk[i] > maxStop) maxStop = stopClk[i]; + } + uint32_t totalClocks = maxStop - minStart; + + unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats (5 reads + 1 write) + + // Theoretical max bandwidth in GB/s + // MEM_CLK_FREQUENCY is in MHz, MEM_BITWIDTH is in bits + // BW = (Bus Width / 8 bytes) × (Clock MHz) × 2 (DDR) / 1000 = GB/s + float max_bw = (float)config.MEM_BITWIDTH / 8 * config.MEM_CLK_FREQUENCY * 2 / 1000; + + // Achieved bandwidth from CUDA event timing (most accurate) + float achieved_bw_from_time = (float)N / milliseconds / 1e6; + + // Achieved bandwidth from cycle count (less accurate, single SM perspective) + float achieved_bw_from_cycles = ((float)N / totalClocks) * config.CLK_FREQUENCY / 1e3; + + printf("Mem BW= %f (Byte/Clk)\n", (float)N / totalClocks); + printf("Mem BW (from time)= %f (GB/sec)\n", achieved_bw_from_time); + printf("Mem BW (from cycles)= %f (GB/sec)\n", achieved_bw_from_cycles); + printf("Max Theoretical Mem BW= %f (GB/sec)\n", max_bw); + printf("Mem Efficiency = %f %%\n", (achieved_bw_from_time / max_bw) * 100); + + printf("Total Clk number = 
%u (min start: %u, max stop: %u)\n", totalClocks, minStart, maxStop); } diff --git a/src/cuda/HPC/Makefile b/src/cuda/HPC/Makefile new file mode 100644 index 000000000..6ff6bd438 --- /dev/null +++ b/src/cuda/HPC/Makefile @@ -0,0 +1,100 @@ +# H100 Benchmark Suite Makefile (following cutlass-bench pattern) +.PHONY: all simple graph newton vpi clean cufft cusolver image + +# Default: Build all apps (submodules initialized by parent Makefile) +all: simple graph newton vpi + + +# Simple apps (cuFFT, cuSolver, image processing - CUDA toolkit only) +simple: cufft cusolver image + +cufft: + $(MAKE) -C cufft/cufft_3d_c2c + $(MAKE) -C cufft/cufft_lto_r2c_c2r + +cusolver: + $(MAKE) -C cusolver/cusolver_ormqr + $(MAKE) -C cusolver/cusolver_Xgetrf + +image: + $(MAKE) -C image/dwtHaar1D + $(MAKE) -C image/recursiveGaussian + $(MAKE) -C image/FDTD3d + +# Graph apps (cuGraph submodule - built with CMake) +graph: + @echo "Building cuGraph library from submodule..." + @if [ -d "external/cugraph" ]; then \ + cd external/cugraph && \ + ./build.sh libcugraph --skip_cpp_tests && \ + echo "Building graph apps..." && \ + cd ../.. && \ + export CUGRAPH_ROOT="$(CURDIR)/external/cugraph" && \ + # mkdir -p graph/bfs_standalone/build && \ + # cd graph/bfs_standalone/build && \ + # cmake .. -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && \ + # $(MAKE) && \ + # cd ../../.. && \ + mkdir -p graph/mst_standalone/build && \ + cd graph/mst_standalone/build && \ + cmake .. -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && \ + $(MAKE); \ + else \ + echo "WARNING: cuGraph submodule not found - skipping graph apps"; \ + fi + +# Newton apps (Newton submodule - Python-based) +newton: + @echo "Newton apps ready (Python-based, no build needed)" + +# VPI apps (VPI library from system - required) +# Symlink sources from /opt/nvidia/vpi*/samples/ and build locally +vpi: + @echo "Searching for VPI installation..." 
+ @VPI_ROOT=$$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | head -1); \ + if [ -z "$$VPI_ROOT" ]; then \ + echo "ERROR: VPI not found in /opt/nvidia/"; \ + echo "Please install VPI from NVIDIA"; \ + exit 1; \ + fi; \ + echo "Found VPI at $$VPI_ROOT"; \ + VPI_SAMPLES=$$VPI_ROOT/samples; \ + echo "Creating symlinks to VPI sample sources..."; \ + mkdir -p vpi/vpi_orb_feature_detector vpi/vpi_stereo_disparity; \ + ln -sf $$VPI_SAMPLES/14-background_subtractor/CMakeLists.txt vpi/vpi_background_subtractor/CMakeLists.txt; \ + ln -sf $$VPI_SAMPLES/18-orb_feature_detector/main.cpp vpi/vpi_orb_feature_detector/main.cpp; \ + ln -sf $$VPI_SAMPLES/18-orb_feature_detector/CMakeLists.txt vpi/vpi_orb_feature_detector/CMakeLists.txt; \ + ln -sf $$VPI_SAMPLES/02-stereo_disparity/main.cpp vpi/vpi_stereo_disparity/main.cpp; \ + ln -sf $$VPI_SAMPLES/02-stereo_disparity/CMakeLists.txt vpi/vpi_stereo_disparity/CMakeLists.txt; \ + echo "Building VPI apps from symlinked sources..."; \ + mkdir -p vpi/vpi_background_subtractor/build && \ + cd vpi/vpi_background_subtractor/build && \ + cmake .. && $(MAKE) && \ + cd ../../.. && \ + mkdir -p vpi/vpi_orb_feature_detector/build && \ + cd vpi/vpi_orb_feature_detector/build && \ + cmake .. && $(MAKE) && \ + cd ../../.. && \ + mkdir -p vpi/vpi_stereo_disparity/build && \ + cd vpi/vpi_stereo_disparity/build && \ + cmake .. 
&& $(MAKE) + +clean: + # Clean bin directory + -rm -rf bin + # Clean simple apps + -$(MAKE) -C cufft/cufft_3d_c2c clean + -$(MAKE) -C cufft/cufft_lto_r2c_c2r clean + -$(MAKE) -C cusolver/cusolver_ormqr clean + -$(MAKE) -C cusolver/cusolver_Xgetrf clean + -$(MAKE) -C image/dwtHaar1D clean + -$(MAKE) -C image/recursiveGaussian clean + -$(MAKE) -C image/FDTD3d clean + # Clean graph apps and cuGraph build + -rm -rf graph/bfs_standalone/build + -rm -rf graph/mst_standalone/build + -rm -rf external/cugraph/cpp/build + # Clean Newton + -rm -rf newton/newton_venv + # Clean VPI app build directories + -rm -rf vpi/*/build diff --git a/src/cuda/HPC/README.md b/src/cuda/HPC/README.md new file mode 100644 index 000000000..3eedbeaab --- /dev/null +++ b/src/cuda/HPC/README.md @@ -0,0 +1,100 @@ +# H100 Benchmark Suite + +15 modern GPU workloads from H100 profiling and analysis. + +## Applications + +### cuFFT (2 apps) - FFT operations +- `cufft_3d_c2c_scalable` - 3D Complex-to-Complex FFT +- `cufft_lto_r2c_c2r_scalable` - Real↔Complex FFT with LTO callbacks + +### cuSolver (2 apps) - Linear algebra +- `cusolver_ormqr_scalable` - QR factorization +- `cusolver_Xgetrf_scalable` - LU factorization + +### Image Processing (3 apps) +- `dwtHaar1D` - Haar wavelet transform +- `recursiveGaussian` - Recursive Gaussian filter +- `FDTD3d` - Finite-Difference Time-Domain 3D simulation + +### Graph Algorithms (2 apps) +- `bfs_standalone` - Breadth-First Search (requires cuGraph submodule) +- `mst_standalone` - Minimum Spanning Tree (requires cuGraph submodule) + +### Physics Simulation (3 apps) +- `newton_diffsim_ball` - Differentiable simulation (requires Newton submodule) +- `newton_robot_cartpole` - Robotics simulation (requires Newton submodule) +- `newton_mpm_granular` - Material Point Method simulation (requires Newton submodule) + +### Computer Vision (3 apps) +- `vpi_background_subtractor` - Background subtraction (requires VPI 4.0) +- `vpi_orb_feature_detector` - ORB feature detection 
(requires VPI 4.0) +- `vpi_stereo_disparity` - Stereo disparity calculation (requires VPI 4.0) + +## Dependencies + +- **CUDA 11.0+** - Required (provides cuFFT, cuSolver, cuBLAS libraries) +- **cuGraph** - Git submodule (auto-initialized for graph apps) +- **Newton** - Git submodule (auto-initialized for physics apps) +- **VPI 4.0** - install from https://developer.nvidia.com/embedded/vpi + +## Build + +```bash +# From repository root +source src/setup_environment + +# Generate data files (standard workflow) +make data + +# Build all H100 apps +make -C src hpc + +# Or build everything with: +make all -i -j -C src +``` + +Binaries are output to `bin/*/release/H100-*` + +Newton apps are copied to `bin/*/release/newton/newton_*` + +## Running + +```bash +# cuFFT apps +bin/*/release/H100-cufft_3d_c2c small +bin/*/release/H100-cufft_lto_r2c_c2r medium + +# cuSolver apps +bin/*/release/H100-cusolver_ormqr large +bin/*/release/H100-cusolver_Xgetrf medium + +# Image apps +bin/*/release/H100-dwtHaar1D +bin/*/release/H100-recursiveGaussian +bin/*/release/H100-FDTD3d + +# Graph apps (with generated data) +bin/*/release/H100-bfs --file data_dirs/cuda/H100/graph/karate.mtx +bin/*/release/H100-mst --file data_dirs/cuda/H100/graph/netscience.mtx + +# Newton apps +bin/*/release/newton/newton_diffsim_ball +bin/*/release/newton/newton_robot_cartpole +bin/*/release/newton/newton_mpm_granular + +# VPI apps (if VPI installed) +bin/*/release/vpi_background_subtractor cuda +``` + +## GPU Support + +- Requires compute capability 7.5+ (Turing, Ampere, Hopper) +- Tested on: V100 (sm_70), A100 (sm_80), H100 (sm_90) + +## Notes + +- Simple apps (cuFFT, cuSolver, image) build on any system with CUDA 11+ +- Graph apps require cuGraph submodule (automatically handled by build system) +- Newton apps create Python virtual environment on first run +- VPI apps are optional and only build if VPI library is installed diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile 
b/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile new file mode 100644 index 000000000..23ec8b328 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile @@ -0,0 +1,9 @@ +EXECUTABLE := cufft_3d_c2c_scalable +CUFILES := cufft_3d_c2c_scalable.cu +CCFILES := +ADDITIONAL_LIBS := -lcufft +OMIT_CUTIL_LIB := 1 +OMIT_SHRUTIL_LIB := 1 +SM_VERSIONS := 75 80 90 + +include ../../../common/common.mk diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu new file mode 100644 index 000000000..4263035d6 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu @@ -0,0 +1,196 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cufft_utils.h" + +using dim_t = std::array; + +int main(int argc, char *argv[]) { + cufftHandle plan; + cudaStream_t stream = NULL; + + // Default values + int n = 16; + int batch_size = 4; + + // Parse named command-line arguments + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--n") == 0 || strcmp(argv[i], "-n") == 0) { + if (i + 1 < argc) { + n = atoi(argv[++i]); + } else { + std::printf("Error: %s requires a value\n", argv[i]); + std::printf("Usage: %s [--n|-n ] [--batch-size|-b ]\n", argv[0]); + std::printf(" or: %s \n", argv[0]); + return EXIT_FAILURE; + } + } else if (strcmp(argv[i], "--batch-size") == 0 || strcmp(argv[i], "-b") == 0) { + if (i + 1 < argc) { + batch_size = atoi(argv[++i]); + } else { + std::printf("Error: %s requires a value\n", argv[i]); + std::printf("Usage: %s [--n|-n ] [--batch-size|-b ]\n", argv[0]); + std::printf(" or: %s \n", argv[0]); + return EXIT_FAILURE; + } + } else if (strcmp(argv[i], "small") == 0) { + n = 16; // 16×16×16 = 4K elements + batch_size = 4; + } else if (strcmp(argv[i], "medium") == 0) { + n = 32; // 32×32×32 = 32K elements + batch_size = 8; + } else if (strcmp(argv[i], "large") == 0) { + n = 64; // 64×64×64 = 262K elements + batch_size = 8; + } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + std::printf("Usage: %s [OPTIONS]\n", argv[0]); + std::printf("\nOptions:\n"); + std::printf(" -n, --n 3D FFT dimension (n×n×n) (default: 16)\n"); + std::printf(" -b, --batch-size Number of batched 3D FFTs (default: 4)\n"); + std::printf("\nPresets:\n"); + std::printf(" small : n=16, batch_size=4 (16×16×16 = 4K elements)\n"); + std::printf(" medium : n=32, batch_size=8 (32×32×32 = 32K elements)\n"); + std::printf(" large : n=64, batch_size=8 (64×64×64 = 262K elements)\n"); + std::printf("\nExamples:\n"); + std::printf(" %s --n 64 --batch-size 16\n", argv[0]); + std::printf(" %s -n 128 -b 
4\n", argv[0]); + std::printf(" %s medium --batch-size 16\n", argv[0]); + std::printf(" %s large\n", argv[0]); + return EXIT_SUCCESS; + } else { + std::printf("Error: Unknown argument '%s'\n", argv[i]); + std::printf("Usage: %s [--n|-n ] [--batch-size|-b ]\n", argv[0]); + std::printf(" or: %s \n", argv[0]); + std::printf(" or: %s --help\n", argv[0]); + return EXIT_FAILURE; + } + } + + dim_t fft = {n, n, n}; + int fft_size = fft[0] * fft[1] * fft[2]; + + std::printf("==============================================\n"); + std::printf("cuFFT 3D C2C Example (Scalable)\n"); + std::printf("==============================================\n"); + std::printf("FFT dimension: %d×%d×%d\n", n, n, n); + std::printf("FFT size: %d\n", fft_size); + std::printf("Batch size: %d\n", batch_size); + std::printf("==============================================\n\n"); + + using scalar_type = float; + using data_type = std::complex; + + std::vector data(fft_size * batch_size); + + // Initialize with simple pattern + std::random_device rd; + std::mt19937 gen(42); // Fixed seed for reproducibility + std::uniform_real_distribution dist(0.0f, 1.0f); + + for (int i = 0; i < data.size(); i++) { + data[i] = data_type(dist(gen), dist(gen)); + } + + if (n <= 16) { + std::printf("Input array (first 8 elements):\n"); + for (int i = 0; i < std::min(8, (int)data.size()); i++) { + std::printf("%f + %fj\n", data[i].real(), data[i].imag()); + } + std::printf("=====\n"); + } + + cufftComplex *d_data = nullptr; + + // inembed/onembed being nullptr indicates contiguous data for each batch, then the stride and dist settings are ignored + CUFFT_CALL(cufftPlanMany(&plan, fft.size(), fft.data(), + nullptr, 1, 0, // *inembed, istride, idist + nullptr, 1, 0, // *onembed, ostride, odist + CUFFT_C2C, batch_size)); + + CUDA_RT_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUFFT_CALL(cufftSetStream(plan, stream)); + + // Create device data arrays + CUDA_RT_CALL(cudaMalloc(reinterpret_cast(&d_data), 
sizeof(data_type) * data.size())); + CUDA_RT_CALL(cudaMemcpyAsync(d_data, data.data(), sizeof(data_type) * data.size(), + cudaMemcpyHostToDevice, stream)); + + /* + * Note: + * Identical pointers to data and output arrays implies in-place transformation + */ + std::printf("Executing forward FFT...\n"); + CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD)); + CUDA_RT_CALL(cudaStreamSynchronize(stream)); + std::printf("Forward FFT complete.\n"); + + if (n <= 16) { + CUDA_RT_CALL(cudaMemcpyAsync(data.data(), d_data, sizeof(data_type) * data.size(), + cudaMemcpyDeviceToHost, stream)); + CUDA_RT_CALL(cudaStreamSynchronize(stream)); + std::printf("Output after Forward (first 8 elements):\n"); + for (int i = 0; i < std::min(8, (int)data.size()); i++) { + std::printf("%f + %fj\n", data[i].real(), data[i].imag()); + } + std::printf("=====\n"); + } + + // Normalize the data and inverse FFT + std::printf("Executing inverse FFT...\n"); + scaling_kernel<<<(data.size() + 127) / 128, 128, 0, stream>>>(d_data, data.size(), 1.f/fft_size); + CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE)); + CUDA_RT_CALL(cudaStreamSynchronize(stream)); + std::printf("Inverse FFT complete.\n"); + + if (n <= 16) { + CUDA_RT_CALL(cudaMemcpyAsync(data.data(), d_data, sizeof(data_type) * data.size(), + cudaMemcpyDeviceToHost, stream)); + CUDA_RT_CALL(cudaStreamSynchronize(stream)); + std::printf("Output after Inverse (first 8 elements):\n"); + for (int i = 0; i < std::min(8, (int)data.size()); i++) { + std::printf("%f + %fj\n", data[i].real(), data[i].imag()); + } + std::printf("=====\n"); + } + + + + /* free resources */ + CUDA_RT_CALL(cudaFree(d_data)); + + CUFFT_CALL(cufftDestroy(plan)); + + CUDA_RT_CALL(cudaStreamDestroy(stream)); + + CUDA_RT_CALL(cudaDeviceReset()); + + std::printf("\n==============================================\n"); + std::printf("SUCCESS: 3D C2C FFT completed\n"); + std::printf("==============================================\n"); + + return EXIT_SUCCESS; 
+} \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h new file mode 100644 index 000000000..846d63ab7 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h @@ -0,0 +1,64 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +// CUDA API error checking +#ifndef CUDA_RT_CALL +#define CUDA_RT_CALL( call ) \ + { \ + auto status = static_cast( call ); \ + if ( status != cudaSuccess ) \ + fprintf( stderr, \ + "ERROR: CUDA RT call \"%s\" in line %d of file %s failed " \ + "with " \ + "%s (%d).\n", \ + #call, \ + __LINE__, \ + __FILE__, \ + cudaGetErrorString( status ), \ + status ); \ + } +#endif // CUDA_RT_CALL + +// cufft API error chekcing +#ifndef CUFFT_CALL +#define CUFFT_CALL( call ) \ + { \ + auto status = static_cast( call ); \ + if ( status != CUFFT_SUCCESS ) \ + fprintf( stderr, \ + "ERROR: CUFFT call \"%s\" in line %d of file %s failed " \ + "with " \ + "code (%d).\n", \ + #call, \ + __LINE__, \ + __FILE__, \ + status ); \ + } +#endif // CUFFT_CALL + +__global__ +void scaling_kernel(cufftComplex* data, int element_count, float scale) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + const int stride = blockDim.x * gridDim.x; + for (auto i = tid; i $@ + +# Make cpp source depend on 
fatbin header so it gets built first +cufft_lto_r2c_c2r_scalable.cpp: $(FATBIN_HEADER) diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h new file mode 100644 index 000000000..55bd7a0cb --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _CALLBACK_PARAMS__H_ +#define _CALLBACK_PARAMS__H_ + +// Callback parameters structure +struct cb_params { + unsigned window_size; + unsigned signal_size; +}; + +// Problem input parameters (made non-const for runtime configuration) +extern unsigned batches; +extern unsigned signal_size; +extern unsigned window_size; +extern unsigned complex_signal_size; + +// Precision threshold +constexpr float threshold = 1e-6; + +#endif // _CALLBACK_PARAMS__H_ \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp new file mode 100644 index 000000000..ae6ffcdc7 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp @@ -0,0 +1,63 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#ifndef _COMMON__CPP_ +#define _COMMON__CPP_ + +#include +#include "common.h" + +// Wave parameters +constexpr unsigned waves = 12; +constexpr float signal_max_A = 20.; +constexpr float signal_max_f = 500.; +constexpr float sampling_dt = 1e-3; + +// Initialize the input signal as a composite of sine waves +// with random amplitudes and frequencies +void init_input_signals(unsigned batches, unsigned signal_size, float* signals) { + std::mt19937 e2(0); + + std::uniform_real_distribution<> A_dist(0., signal_max_A); + std::uniform_real_distribution<> f_dist(0., signal_max_f); + + const unsigned complex_signal_size = signal_size / 2 + 1; + + for(unsigned batch = 0; batch < batches; ++batch) { + std::vector wave_amplitudes; + std::vector wave_frequencies; + + // Generate the amplitudes and frequencies of the waves + for(unsigned i = 0; i < waves; ++i) { + wave_amplitudes.push_back(A_dist(e2)); + wave_frequencies.push_back(f_dist(e2)); + } + + // Compose the signal + float time = 0.; + for(unsigned s = 0; s < signal_size; ++s) { + for(unsigned i = 0; i < waves; ++i) { + unsigned idx = batch * (2 * complex_signal_size) + s; + signals[idx] += wave_amplitudes[i] * sin(2. 
* PI * wave_frequencies[i] * time) ; + } + time += sampling_dt; + } + } +} + +#endif // _COMMON__CPP_ \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h new file mode 100644 index 000000000..6bcbacccc --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h @@ -0,0 +1,68 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#ifndef _COMMON__H_ +#define _COMMON__H_ + +#include +#include +#include + +// Some helper definitions +#define ERROR_VALUE -1 +#define PASS_VALUE 0 +#define PI 3.1415926535897932 + +// Check CUDA API error +inline int checkErrors(cudaError_t error, int line_number) { + if (error != cudaSuccess) { + printf("Example failed in CUDA API on line %d with error %d\n", line_number, error); + return ERROR_VALUE; + } + return PASS_VALUE; +} + +// Check cuFFT API error +inline int checkErrors(cufftResult error, int line_number) { + if (error != CUFFT_SUCCESS) { + printf("Example failed in cuFFT API on line %d with error %d\n", line_number, error); + return ERROR_VALUE; + } + return PASS_VALUE; +} + +#define CHECK_ERROR(error) checkErrors(error, __LINE__) + +template +double compute_error(T* ref, T* out, unsigned batches, unsigned signal_size){ + double squared_diff = 0; + double squared_norm = 0; + const unsigned batch_offset = 2 * (signal_size / 2 + 1); + for (int b = 0; b < batches; b++) { + for (int i = 0; i < signal_size; i++) { + unsigned ref_idx = b * batch_offset + i; + squared_diff += std::norm(ref[ref_idx] - out[ref_idx]); // Note that std::norm(z) = z * conj(z), not the usual sqrt(z * conj(z)) + squared_norm += std::norm(ref[ref_idx]); + } + } + return std::sqrt(squared_diff / squared_norm); +} + +// Initialize input signals +void init_input_signals(unsigned batches, unsigned signal_size, float* signals); +#endif // _COMMON__H_ \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp new file mode 100644 index 000000000..97dd7f6f4 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp @@ -0,0 +1,240 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +/* + * Example showing the use of LTO callbacks with CUFFT to perform + * truncation with zero padding. + * +*/ + +#include +#include +#include +#include "r2c_c2r_reference.h" +#include "common.h" +#include "callback_params.h" + +// NOTE: Header containing the compiled LTO callback device function in a C array, generated with bin2c +#include "r2c_c2r_lto_callback_device_fatbin.h" + +// Note: Removed static_assert since window_size and signal_size are now runtime variables + +int test_r2c_window_c2r() { + + // Padded array for in-place transforms - use heap allocation for large sizes + const size_t array_size = batches * 2 * complex_signal_size; + float *input_signals = new float[array_size](); + float *output_signals = new float[array_size]; + float *reference = new float[array_size]; + + init_input_signals(batches, signal_size, input_signals); + + const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float); + + // Allocate and copy input from host to GPU + float *device_signals; + CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes)); + CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice)); + + // Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback + cufftHandle forward_plan, inverse_plan_cb; + size_t work_size; + + 
CHECK_ERROR(cufftCreate(&forward_plan)); + CHECK_ERROR(cufftCreate(&inverse_plan_cb)); + + // NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet) +#ifdef CB_USE_CONSTANT_MEMORY + cb_params *device_params = nullptr; + std::string callback_name = "windowing_constant_memory_callback"; +#else + // Define a structure used to pass in the window size + cb_params host_params; + host_params.window_size = window_size; + host_params.signal_size = complex_signal_size; + + // Allocate and copy callback parameters from host to GPU + cb_params *device_params; + CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params))); + CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice)); + + std::string callback_name = "windowing_callback"; +#endif + size_t lto_callback_fatbin_size = sizeof(window_callback); + printf("Setting up LTO callback '%s', fatbin size: %zu bytes\n", callback_name.c_str(), lto_callback_fatbin_size); + cufftResult cb_result = cufftXtSetJITCallback(inverse_plan_cb, + callback_name.c_str(), + (void*)window_callback, + lto_callback_fatbin_size, + CUFFT_CB_LD_COMPLEX, + (void **)&device_params); + printf("cufftXtSetJITCallback returned: %d\n", cb_result); + CHECK_ERROR(cb_result); + + printf("Creating forward plan (R2C): signal_size=%u, batches=%u\n", signal_size, batches); + CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size)); + printf("Creating inverse plan (C2R) with callback: signal_size=%u, batches=%u\n", signal_size, batches); + CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size)); + + // Transform signal forward + printf("Transforming signal cufftExecR2C\n"); + CHECK_ERROR(cufftExecR2C(forward_plan, (cufftReal *)device_signals, (cufftComplex *)device_signals)); + + // Apply window via load callback and inverse-transform the signal + printf("Transforming signal cufftExecC2R\n"); + 
CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals)); + + // Copy device memory to host + CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost)); + + // Destroy CUFFT context + CHECK_ERROR(cufftDestroy(forward_plan)); + CHECK_ERROR(cufftDestroy(inverse_plan_cb)); + + // Cleanup memory + CHECK_ERROR(cudaFree(device_signals)); + CHECK_ERROR(cudaFree(device_params)); + + // Compute reference + if(reference_r2c_window_c2r(batches, signal_size, window_size, input_signals, reference) != PASS_VALUE) { + printf("Failed to compute the reference"); + delete[] input_signals; + delete[] output_signals; + delete[] reference; + return ERROR_VALUE; + }; + + double l2_error = compute_error(reference, output_signals, batches, signal_size); + printf("L2 error: %e\n", l2_error); + + // Cleanup heap-allocated arrays + delete[] input_signals; + delete[] output_signals; + delete[] reference; + + return (l2_error < threshold) ? 
PASS_VALUE : ERROR_VALUE; +} + +// Define global variables for size configuration +unsigned batches = 100; +unsigned signal_size = 128; +unsigned window_size = 16; +unsigned complex_signal_size = signal_size / 2 + 1; + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + // Parse named command-line arguments + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--batches") == 0 || strcmp(argv[i], "-b") == 0) { + if (i + 1 < argc) { + batches = atoi(argv[++i]); + } else { + printf("Error: %s requires a value\n", argv[i]); + printf("Usage: %s [--batches|-b ] [--signal-size|-s ] [--window-size|-w ]\n", argv[0]); + printf(" or: %s \n", argv[0]); + return ERROR_VALUE; + } + } else if (strcmp(argv[i], "--signal-size") == 0 || strcmp(argv[i], "-s") == 0) { + if (i + 1 < argc) { + signal_size = atoi(argv[++i]); + } else { + printf("Error: %s requires a value\n", argv[i]); + printf("Usage: %s [--batches|-b ] [--signal-size|-s ] [--window-size|-w ]\n", argv[0]); + printf(" or: %s \n", argv[0]); + return ERROR_VALUE; + } + } else if (strcmp(argv[i], "--window-size") == 0 || strcmp(argv[i], "-w") == 0) { + if (i + 1 < argc) { + window_size = atoi(argv[++i]); + } else { + printf("Error: %s requires a value\n", argv[i]); + printf("Usage: %s [--batches|-b ] [--signal-size|-s ] [--window-size|-w ]\n", argv[0]); + printf(" or: %s \n", argv[0]); + return ERROR_VALUE; + } + } else if (strcmp(argv[i], "small") == 0) { + batches = 128; + signal_size = 64; + window_size = 16; + } else if (strcmp(argv[i], "medium") == 0) { + batches = 500; + signal_size = 256; + window_size = 32; + } else if (strcmp(argv[i], "large") == 0) { + batches = 2000; + signal_size = 32; + window_size = 8; + } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + printf("Usage: %s [OPTIONS]\n", argv[0]); + 
printf("\nOptions:\n"); + printf(" -b, --batches Number of FFT batches (default: 100)\n"); + printf(" -s, --signal-size Size of each signal (default: 128)\n"); + printf(" -w, --window-size Window size for truncation (default: 16)\n"); + printf("\nPresets:\n"); + printf(" small : batches=128, signal_size=64, window_size=16\n"); + printf(" medium : batches=500, signal_size=256, window_size=32\n"); + printf(" large : batches=2000, signal_size=32, window_size=8\n"); + printf("\nExamples:\n"); + printf(" %s --batches 1024 --signal-size 512 --window-size 32\n", argv[0]); + printf(" %s -b 1024 -s 512 -w 32\n", argv[0]); + printf(" %s medium --batches 1000\n", argv[0]); + printf(" %s small\n", argv[0]); + return PASS_VALUE; + } else { + printf("Error: Unknown argument '%s'\n", argv[i]); + printf("Usage: %s [--batches|-b ] [--signal-size|-s ] [--window-size|-w ]\n", argv[0]); + printf(" or: %s \n", argv[0]); + printf(" or: %s --help\n", argv[0]); + return ERROR_VALUE; + } + } + + complex_signal_size = signal_size / 2 + 1; + + printf("==============================================\n"); + printf("cuFFT LTO R2C:C2R Example (Scalable)\n"); + printf("==============================================\n"); + printf("Batches: %u\n", batches); + printf("Signal size: %u\n", signal_size); + printf("Window size: %u\n", window_size); + printf("==============================================\n\n"); + + struct cudaDeviceProp properties; + int device; + CHECK_ERROR(cudaGetDevice(&device)); + CHECK_ERROR(cudaGetDeviceProperties(&properties, device)); + if (!(properties.major >= 5)) { + printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n"); + return ERROR_VALUE; + } + + int result = test_r2c_window_c2r(); + + printf("\n==============================================\n"); + if (result == PASS_VALUE) { + printf("SUCCESS: LTO R2C:C2R completed\n"); + } else { + printf("FAILED: LTO R2C:C2R\n"); + } + printf("==============================================\n"); + + return result; +} \ 
No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h new file mode 100644 index 000000000..576c68378 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h @@ -0,0 +1,102 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* A simplified version of the NVRTC helper included with CUDA samples, + * targeting cuFFT LTO callbacks + */ + +#ifndef COMMON_NVRTC_HELPER_H_ +#define COMMON_NVRTC_HELPER_H_ + +#include +#include +#include +#include +#include +#include +#include + +#define NVRTC_SAFE_CALL(Name, x) \ + do { \ + nvrtcResult result = x; \ + if (result != NVRTC_SUCCESS) { \ + std::cerr << "\nerror: " << Name << " failed with error " \ + << nvrtcGetErrorString(result) << std::endl; \ + exit(1); \ + } \ + } while (0) + +#define STRINGIZE2(s) #s +#define STRINGIZE(s) STRINGIZE2(s) +#define INCLUDE_CUDA_PATH "-I" STRINGIZE(CUDA_PATH) "/include" +#define CUDA_ARCH_FLAG "-arch=compute_" STRINGIZE(CUDA_ARCH) +#define CALLBACK_CODE_PATH(name) STRINGIZE(SOURCE_PATH) "/" name + +void compile_file_to_lto(std::vector& cubin_result, const char *filename) { + std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate); + if (!inputFile.is_open()) { + std::cerr << "\nerror: unable to open " 
<< filename << " for reading!\n"; + exit(1); + } + + std::streampos pos = inputFile.tellg(); + size_t inputSize = (size_t)pos; + std::vector memBlock(inputSize + 1); + + inputFile.seekg(0, std::ios::beg); + inputFile.read(memBlock.data(), inputSize); + inputFile.close(); + memBlock[inputSize] = '\x0'; + + const int num_params = 6; + const char *compile_params[] = {INCLUDE_CUDA_PATH, + CUDA_ARCH_FLAG, + "--std=c++11", + "--relocatable-device-code=true", + "-default-device", + "-dlto"}; + + // Compile + nvrtcProgram prog; + NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, memBlock.data(), filename, 0, NULL, NULL)); + nvrtcResult res = nvrtcCompileProgram(prog, num_params, compile_params); + + // Print log + size_t logSize; + NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize)); + std::vector log(logSize + 1); + NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log.data())); + log[logSize] = '\x0'; + + if(log.size() > 2) { + std::cerr << "\n compilation log ---\n"; + std::string s(log.begin(), log.end()); + std::cerr << s; + std::cerr << "\n end log ---\n"; + } + + NVRTC_SAFE_CALL("nvrtcCompileProgram", res); + + size_t codeSize; + NVRTC_SAFE_CALL("nvrtcGetLTOIRSize", nvrtcGetLTOIRSize(prog, &codeSize)); + std::vector buffer(codeSize); + NVRTC_SAFE_CALL("nvrtcGetNVVM", nvrtcGetLTOIR(prog, buffer.data())); + cubin_result = buffer; +} + +#endif // COMMON_NVRTC_HELPER_H_ \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu new file mode 100644 index 000000000..7d838d028 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu @@ -0,0 +1,148 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of legacy (non-LTO) callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+// NOTE(review): the targets of the two includes below (and the template
+// arguments of the static_cast expressions further down) are empty in this
+// copy of the patch -- presumably lost in extraction; confirm against the
+// real file.
+#include
+#include
+#include "common.h"
+#include "r2c_c2r_reference.h"
+#include "callback_params.h"
+
+// This is the load callback routine (it is registered with CUFFT_CB_LD_COMPLEX
+// on the inverse plan). It filters high frequencies
+// based on a truncation window specified by the user
+// NOTE: unlike the LTO version, the callback function can have
+// any name
+//
+// NOTE(review): callback_params.h in this same change declares batches,
+// signal_size, window_size and complex_signal_size as non-const
+// `extern unsigned`. The __constant__ initializers, the static_assert and the
+// array bounds in this file all use them where a compile-time constant is
+// needed, and this file does not define them -- presumably it no longer
+// builds against that header; confirm whether it is still part of the build.
+__constant__ unsigned cmem_window_size = window_size;
+__constant__ unsigned cmem_signal_size = complex_signal_size;
+// Returns element `index` of the input unchanged when its position inside its
+// batch (index modulo the per-batch complex length) is below the window size,
+// and complex zero otherwise. With CB_USE_CONSTANT_MEMORY the sizes come from
+// the __constant__ variables above; otherwise from the cb_params that `info`
+// points to. `sharedmem` is unused.
+__device__ cufftComplex windowing_callback(void *input,
+                                           size_t index,
+                                           void *info,
+                                           void *sharedmem) {
+    cufftComplex* cb_output = static_cast(input);
+#ifdef CB_USE_CONSTANT_MEMORY
+    const unsigned sample = index % cmem_signal_size;
+    return (sample < cmem_window_size) ? cb_output[index] : cufftComplex{0.f, 0.f};
+#else
+    const cb_params* params = static_cast(info);
+    const unsigned sample = index % params->signal_size;
+    return (sample < params->window_size) ? cb_output[index] : cufftComplex{0.f, 0.f};
+#endif
+}
+
+// Device-side variable holding the callback's address; the host copies it out
+// with cudaMemcpyFromSymbol before registering the callback.
+__device__ cufftCallbackLoadC device_callback_ptr = windowing_callback;
+
+static_assert(window_size < (signal_size/2 + 1), "The window size must be smaller than the signal size in complex space");
+
+// Runs forward R2C, then inverse C2R with the windowing load callback, and
+// compares the result with the host reference. Returns PASS_VALUE when the L2
+// error is below `threshold`, ERROR_VALUE otherwise.
+// NOTE(review): the values returned by CHECK_ERROR are discarded throughout, so
+// a failing CUDA/cuFFT call is only printed and execution continues.
+int test_r2c_window_c2r() {
+
+    // Padded array for in-place transforms
+    // (stack-allocated; only input_signals is zero-initialized)
+    float input_signals[batches][2 * complex_signal_size] = {};
+    float output_signals[batches][2 * complex_signal_size];
+    float reference[batches][2 * complex_signal_size];
+
+    init_input_signals(batches, signal_size, &input_signals[0][0]);
+
+    const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float);
+
+    // Allocate and copy input from host to GPU
+    float *device_signals;
+    CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
+    CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));
+
+    // Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
+    cufftHandle forward_plan, inverse_plan_cb;
+    size_t work_size;
+
+    CHECK_ERROR(cufftCreate(&forward_plan));
+    CHECK_ERROR(cufftCreate(&inverse_plan_cb));
+
+    // Here the plans are made before the callback is attached (the LTO variant
+    // attaches the callback first).
+    CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
+    CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size));
+
+    // NOTE: The host needs to get a copy of the device pointer to the callback. Not required for LTO callback
+    cufftCallbackLoadC host_callback_ptr;
+    CHECK_ERROR(cudaMemcpyFromSymbol(&host_callback_ptr, device_callback_ptr, sizeof(host_callback_ptr)));
+
+#ifdef CB_USE_CONSTANT_MEMORY
+    // The callback reads its sizes from constant memory, so no info block is passed.
+    cb_params *device_params = nullptr;
+#else
+    // Define a structure used to pass in the window size
+    cb_params host_params;
+    host_params.window_size = window_size;
+    host_params.signal_size = complex_signal_size;
+
+    // Allocate and copy callback parameters from host to GPU
+    cb_params *device_params;
+    CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params)));
+    CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice));
+#endif
+    // Now associate the load callback with the plan.
+    CHECK_ERROR(cufftXtSetCallback(inverse_plan_cb, (void **)&host_callback_ptr, CUFFT_CB_LD_COMPLEX, (void **)&device_params));
+
+    // Transform signal forward
+    printf("Transforming signal cufftExecR2C\n");
+    CHECK_ERROR(cufftExecR2C(forward_plan, (cufftReal *)device_signals, (cufftComplex *)device_signals));
+
+    // Apply window via load callback and inverse-transform the signal
+    printf("Transforming signal cufftExecC2R\n");
+    CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals));
+
+    // Copy device memory to host
+    CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));
+
+    // Destroy CUFFT context
+    CHECK_ERROR(cufftDestroy(forward_plan));
+    CHECK_ERROR(cufftDestroy(inverse_plan_cb));
+
+    // Cleanup memory
+    CHECK_ERROR(cudaFree(device_signals));
+    CHECK_ERROR(cudaFree(device_params));
+
+    // Compute reference
+    if (reference_r2c_window_c2r(batches, signal_size, window_size, &input_signals[0][0], &reference[0][0]) != PASS_VALUE) {
+        printf("Failed to compute the reference");
+        return ERROR_VALUE;
+    }
+
+    double l2_error = compute_error(&reference[0][0], &output_signals[0][0], batches, signal_size);
+    printf("L2 error: %e\n", l2_error);
+
+    return (l2_error < threshold) ? PASS_VALUE : ERROR_VALUE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+// Checks that the current device has compute capability major >= 5, then runs
+// the test and returns its result as the process exit status.
+int main(int argc, char **argv) {
+    struct cudaDeviceProp properties;
+    int device;
+    CHECK_ERROR(cudaGetDevice(&device));
+    CHECK_ERROR(cudaGetDeviceProperties(&properties, device));
+    if (!(properties.major >= 5)) {
+        printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n");
+        return ERROR_VALUE;
+    }
+
+    return test_r2c_window_c2r();
+}
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu
new file mode 100644
index 000000000..1431e3c1b
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of LTO callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+#include
+#include "callback_params.h"
+
+// This is the load callback routine.
It filters high frequencies +// based on a truncation window specified by the user +__device__ cufftComplex windowing_callback(void* input, + unsigned long long idx, + void* info, + void* sharedmem) { + + const cb_params* params = static_cast(info); + cufftComplex* cb_output = static_cast(input); + + const unsigned sample = idx % params->signal_size; + + return (sample < params->window_size) ? cb_output[idx] : cufftComplex{0.f, 0.f}; +} + +// Note: Constant memory version removed for scalable implementation +// The callback now uses runtime parameters passed through info pointer \ No newline at end of file diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h new file mode 100644 index 000000000..4e97cd281 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h @@ -0,0 +1,92 @@ +#ifdef __cplusplus +extern "C" { +#endif + +unsigned long long window_callback[] = { +0x00100001ba55ed50ULL,0x00000000000009f8ULL,0x0000007801010008ULL,0x0000000000000980ULL, +0x000000400000097aULL,0x0000004b00010040ULL,0x0000000000000000ULL,0x0000000000012011ULL, +0x00000000861d559cULL,0x0000000000000b38ULL,0x0000002800000048ULL,0x2d20303d7a74662dULL, +0x7669645f63657270ULL,0x636572702d20313dULL,0x20313d747271735fULL,0x20313d64616d662dULL, +0x0000000000000000ULL,0xb3df19853aa79a4eULL,0x9ce55e973efaf7b9ULL,0xfc5d4b5fbd7f7656ULL, +0xa16dd934438c2971ULL,0x73d0c26bafc12e47ULL,0xc55e34b6edfe113aULL,0x6a89003a1b8764a5ULL, +0xf1da8d1ac6137429ULL,0x2056217118c8564cULL,0xd408cb5fa0a53a2bULL,0xcbe5acb9331f5bfaULL, +0x462be6b0cca5934dULL,0x9b9e7c1e0e218196ULL,0xd873ec4c22f37e49ULL,0x02ebbd52b2810818ULL, +0xec2e1d2a87e77e0dULL,0x4915b0a280110c87ULL,0x1489d837223481b0ULL,0xcb43c090d76cf5c6ULL, +0xb22f9e00526ba48dULL,0xc680d59356482921ULL,0x0b0aeb2ecdc91c3bULL,0x7d84415d3f51f0d7ULL, 
+0x989832ace4dc2e90ULL,0x9b734532920eeedeULL,0xf31d2a7716674f31ULL,0x4ef6f384005e385fULL, +0x15a46b421f64a904ULL,0xcf084f6bd9e1e817ULL,0x9204f365d20a2ac4ULL,0xba35c48d37212d76ULL, +0x1997f360b753f06cULL,0x2f1b5cfe7ebc4108ULL,0xb30a4f954c82ef29ULL,0x8b5b329795581519ULL, +0xe012c44eaf3d9c4bULL,0x19a9d60149758779ULL,0x61e73f3af4a79265ULL,0x0c41dde210f2040eULL, +0xf65b08c5efa40305ULL,0xde45eeb929bad831ULL,0xc62fac341327b9a9ULL,0x3dc6a75c639e59e0ULL, +0xf0e2bf18f990e7bdULL,0x9e07cea9c0a2b240ULL,0x403f7fdf59a9f22bULL,0xf533989bd7551679ULL, +0xb972ad03a62d611dULL,0x5814c8f1029f9063ULL,0x935cd7350fb780c2ULL,0x0480baf83650c1daULL, +0xae6e283c826dab1bULL,0x9403eeecba31977eULL,0x171516ce797d87e7ULL,0x978abc930e89a78cULL, +0x8aec5f230461ebb8ULL,0xcf51594c1dcc07e4ULL,0x4e7c1b94b7d479a4ULL,0xd23236cd4982d9baULL, +0x69793ccdd0946bcbULL,0x2f667e3119cee955ULL,0xfe38643d463ac0baULL,0xc4f5842cbb9645b0ULL, +0xedf1793fd1e1ec67ULL,0x09651a6bfbc63253ULL,0x17e852707a4e1b4aULL,0x8ca32cfb49e9c1f2ULL, +0xe561e27972e37105ULL,0x6b17da7adb5c6dbdULL,0x859c087b1b46341dULL,0xc78b3aee1de06653ULL, +0x2296d389a9abff58ULL,0x8d44a92ae716d279ULL,0xdabae2b9cffe9335ULL,0xdd57f3e3a54e3f9aULL, +0x6fa10d4a75988453ULL,0x44751e637287cc4fULL,0x3826e90044b88618ULL,0xabdc86d1a00c4864ULL, +0xac5154b5269750faULL,0x0b0c98d0a734e6aeULL,0x8a38ebbe6dfcd890ULL,0xf578bc7e9f63584eULL, +0x066a20a40c81aa79ULL,0xb1da95c681714f98ULL,0x95a7a8939262db69ULL,0xeb5d6d162db0c47dULL, +0xf3cbe8f708242112ULL,0xaec2f3d2534d3653ULL,0xe368cdcad21cc38fULL,0x6ddf47e86c29ae6bULL, +0x217c3065a6c9fc31ULL,0x8cc86fed4c910837ULL,0x734aa81b3a35bdb4ULL,0xdfab9d5e3944c522ULL, +0xba8ace5aef0d926eULL,0xbca8fb8a3dd0dd71ULL,0x8c6f9b2ea76144a1ULL,0x3ab7c46491c5d372ULL, +0xfdea9370de1a3944ULL,0xb21ede18c16ec2cfULL,0xa247d0c8f3581cc0ULL,0xfb8a09a13694e5e5ULL, +0x2e3a35c923afdfbaULL,0x538bf2b326b6461dULL,0x6c38602ae2353a33ULL,0x0616b90c76d45fd1ULL, +0x8ba1ab4e2e5418a2ULL,0x6b1b4cba49aab043ULL,0xf4b3807b6202b50eULL,0x30e460e644adc04aULL, 
+0x2177e70736c1fc36ULL,0x08bc8cf5c8b924a4ULL,0x511edf6f6b2cafe8ULL,0xd50990da78d708e6ULL, +0x39bd58ef4af9f7d6ULL,0xc8d83cb7c862305dULL,0x0b64be09f791357fULL,0x203298c8a390b840ULL, +0xe8655af21c86b93dULL,0x0e9e081bebcb83f9ULL,0x7d4540d9fdae1387ULL,0xdc7ea2fccf583b79ULL, +0x58948df65e916276ULL,0x4fa699acc456d34eULL,0xa9bd264cb8c624b1ULL,0x6186bb1f42e517c7ULL, +0xf4d0cecdb6c77c86ULL,0x91b62496e92d4007ULL,0xe6884e4a995fbfc8ULL,0x360fb1f078911dbaULL, +0xd2913b43fc8ed09aULL,0x4802ac43e9cce4cbULL,0x51792bb1d9b9f6a5ULL,0x1b80a0a7c4f6aa74ULL, +0xe9cfe89ebf0fd497ULL,0x954e2ca2a5e42f12ULL,0xfde393fba24954f2ULL,0x570e5780e2d3f9faULL, +0x8f1352dda1f7ae5eULL,0xf3656cb562fd1ab6ULL,0x89d52ee3498587efULL,0x08ec08a1a6be2e49ULL, +0x2d8820f34153deefULL,0x579b418c80f79846ULL,0x996b578d31659089ULL,0x85b836c947893a33ULL, +0xe72e090c3cb8f0beULL,0xa4c03328e3c94489ULL,0xf6b569ca5ca0c462ULL,0xdd2e775be2369af4ULL, +0x8524d124d8aa721eULL,0xc0a77bc694326c48ULL,0xb6c1afa438fd842bULL,0xd7f290a76df49a2cULL, +0x99ead5386853ed6fULL,0x799b53798b539b1cULL,0x5db2e52ee8e56ee0ULL,0xfe08d75aec95f6aeULL, +0xfedbea28c256805cULL,0xdfd5f2a84e6bb1b3ULL,0x998a35c7cc4c6ea4ULL,0xd811df87ac37b46bULL, +0xe1a8cd605a7ee7a2ULL,0x6e50e24c7ad14c0bULL,0xb0939d317d59fbafULL,0x634ed5336389b361ULL, +0x6ffd0cac1457b393ULL,0xb051e7a7daad179cULL,0xa8717200aa075d3fULL,0x41deb9cd9acd8a15ULL, +0xeb41d554ef55aebcULL,0xa0d2661239c800b3ULL,0xbf327a02cef15065ULL,0xfba11541537dfec0ULL, +0x537d8e14e95a8fccULL,0x17ce1cb6524b5280ULL,0x022e06d043aeeaa5ULL,0x7e702426c779b0e5ULL, +0x5f3e81b3ea707e5aULL,0x07cad8b5ce0d8817ULL,0xc87601c78dac0445ULL,0x777611b269475022ULL, +0x3e24e81e1350d045ULL,0x253ecf00d8fbc0b9ULL,0x1178a94ef37c250eULL,0x007f3d52ecf1fcd7ULL, +0xe40a338a55e96fe9ULL,0x65fe06013586f9f0ULL,0x39552ceeab4b2881ULL,0xee7f86bd4d92ba59ULL, +0x3bb8faa518e27aecULL,0x9a3741260e2efcb7ULL,0x8dfa691ddb3a3482ULL,0x8e1565f9c1426e43ULL, +0x279a9a74fe66c9d1ULL,0x13b9f8925cc24f4fULL,0x43bf58cead91cef4ULL,0xd6e2b4a88629424dULL, 
+0xeee159c966933528ULL,0x9c4fb5ad60dc37b5ULL,0x17e808902199a72fULL,0x370ad041076053d1ULL, +0xf75706a61ae5a5deULL,0xd6386efeede5af4eULL,0xd63aa08aeabeb868ULL,0x621b3cb0fdbd7244ULL, +0xb471a2fc0bec5acfULL,0x20c318ab7abff16eULL,0x6c70f515eb8270b6ULL,0x15a67ea5468a3b9fULL, +0xac806d321b5f7729ULL,0xb6e8d71b32633a44ULL,0x1d81a3fc2e385b6cULL,0x6bc1e3c198e7a0a2ULL, +0xc6da0604ab91d760ULL,0xf35056b67187bf6dULL,0xf9b6eb751b88af0bULL,0xef422c4a4f7d915eULL, +0xa8f8324f46270989ULL,0xdd52d4b110c83432ULL,0xa25a603ec3ec6d75ULL,0x4c9b33515196921cULL, +0x02276767300c71c5ULL,0x457700ffba63c7f6ULL,0xcf16b3113812082fULL,0xe930a3e7b8729bf8ULL, +0xa9c1d4ae2411747fULL,0x20a008371ed9d342ULL,0x9205463540dd2ca6ULL,0x0f4ff6d7db127aa1ULL, +0xbd854c29ce1afbfbULL,0x1e8231fdfde87d45ULL,0x28efc8ad57411175ULL,0xad634e05d1009295ULL, +0xd7073fdea7279cc9ULL,0x920c15ac2619c0a1ULL,0x34f3b62f46bb053aULL,0x47496dd4845c853fULL, +0x14b46f3bfde59fb2ULL,0x1560522a21ca7458ULL,0x855978202eee5338ULL,0x0650162f164f2ee3ULL, +0x5bb43d678278fe99ULL,0x00e41b9de79c21abULL,0xbb814528fa67f1daULL,0xd9ead75fbb0924daULL, +0x11bcc3bdf9f2c518ULL,0x20162ada6a99bda9ULL,0xddb6d248728f7fafULL,0x2af54679064abca0ULL, +0x61be9c5fc3359a21ULL,0x67bab1acb7e2a71eULL,0x98ffd55efb24b2abULL,0x7bce9f79ae365a19ULL, +0x6eed62d19aaf43dcULL,0x678fe63ccb7f2d30ULL,0xdcf998ca5ef7076dULL,0xbd75e71fd6515355ULL, +0x93df67d27ddebc8dULL,0xa331ef2c4e33c37fULL,0xa1668f9cc53cf9fbULL,0x54eeaba4cce0e139ULL, +0xdc713dc459e8f295ULL,0xb1f751a14e827a95ULL,0xdc4c6b72e04299e7ULL,0xa45f439b44e5bcb0ULL, +0x75d528261a9dd346ULL,0x60bbe86e87d440d6ULL,0x5507f0a0f2f9d449ULL,0x589ddbe8d38c87d4ULL, +0x6a9325188b1b4e13ULL,0x6debafe7543805f4ULL,0x3c59cad53035933bULL,0x5425f2c2d6a01b15ULL, +0xe9e0f53e2e548128ULL,0xd6b40f8b54205c19ULL,0x0528cd0440ae1a58ULL,0xaa8d82f522379d6eULL, +0x8e96f1d46b2078acULL,0x7805c441a20ac004ULL,0x5704b4b0a0caa41dULL,0xc255c06a850c0d62ULL, +0xae2a53f4c3edd4ddULL,0x3e52c89c671196c5ULL,0xff37ac769ca776bcULL,0xb49a880feb53a57fULL, 
+0x350459f19159481eULL,0xd9cb47e6b7eeb44dULL,0x448720ab781460cfULL,0x67033b8e51fa51e6ULL, +0x135f62109a1a3bebULL,0xdadd7ecbe4238ad5ULL,0xcefe8f02b9ba9165ULL,0x59999f0427393ea1ULL, +0x02582b0a20ebe3f2ULL,0xe26080ea0fe5f050ULL,0x00851a6d480e2f34ULL,0xdcea9e2bab8481deULL, +0x0ae74fed789fb981ULL,0xeb5fa5b0f630d9aaULL,0xeab795f79c0fc9e0ULL,0x74810dca5d08ee93ULL, +0x00000000000078fbULL +}; + +#ifdef __cplusplus +} +#endif + diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp new file mode 100644 index 000000000..4ec57e6bb --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp @@ -0,0 +1,132 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + +/* + * Example showing the use of LTO callbacks with CUFFT to perform + * truncation with zero padding. 
+ * +*/ + +#include +#include +#include "r2c_c2r_reference.h" +#include "common.h" +#include "nvrtc_helper.h" +#include "callback_params.h" +static_assert(window_size < (signal_size/2 + 1), "The window size must be smaller than the signal size in complex space"); + +int test_r2c_window_c2r() { + // Padded array for in-place transforms + float input_signals[batches][2 * complex_signal_size] = {}; + float output_signals[batches][2 * complex_signal_size]; + float reference[batches][2 * complex_signal_size]; + + init_input_signals(batches, signal_size, &input_signals[0][0]); + + const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float); + + // Allocate and copy input from host to GPU + float *device_signals; + CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes)); + CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice)); + + // NOTE: Use NVRTC to compile the callback function to LTO + std::vector callback_buffer; + compile_file_to_lto(callback_buffer, CALLBACK_CODE_PATH("r2c_c2r_lto_callback_device.cu")); + + // Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback + cufftHandle forward_plan, inverse_plan_cb; + size_t work_size; + + CHECK_ERROR(cufftCreate(&forward_plan)); + CHECK_ERROR(cufftCreate(&inverse_plan_cb)); + + // NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet) +#ifdef CB_USE_CONSTANT_MEMORY + cb_params *device_params = nullptr; + std::string callback_name = "windowing_constant_memory_callback"; +#else + // Define a structure used to pass in the window size + cb_params host_params; + host_params.window_size = window_size; + host_params.signal_size = complex_signal_size; + + // Allocate and copy callback parameters from host to GPU + cb_params *device_params; + CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params))); + CHECK_ERROR(cudaMemcpy(device_params, &host_params, 
sizeof(cb_params), cudaMemcpyHostToDevice)); + + std::string callback_name = "windowing_callback"; +#endif + CHECK_ERROR(cufftXtSetJITCallback(inverse_plan_cb, + callback_name.c_str(), + (void*)callback_buffer.data(), + callback_buffer.size(), + CUFFT_CB_LD_COMPLEX, + (void **)&device_params)); + + CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size)); + CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size)); + + // Transform signal forward + printf("Transforming signal cufftExecR2C\n"); + CHECK_ERROR(cufftExecR2C(forward_plan, (cufftReal *)device_signals, (cufftComplex *)device_signals)); + + // Apply window via load callback and inverse-transform the signal + printf("Transforming signal cufftExecC2R\n"); + CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals)); + + // Copy device memory to host + CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost)); + + // Destroy CUFFT context + CHECK_ERROR(cufftDestroy(forward_plan)); + CHECK_ERROR(cufftDestroy(inverse_plan_cb)); + + // Cleanup memory + CHECK_ERROR(cudaFree(device_signals)); + CHECK_ERROR(cudaFree(device_params)); + + // Compute reference + if (reference_r2c_window_c2r(batches, signal_size, window_size, input_signals[0], reference[0]) != PASS_VALUE) { + printf("Failed to compute the reference"); + return ERROR_VALUE; + } + + double l2_error = compute_error(reference[0], output_signals[0], batches, signal_size); + printf("L2 error: %e\n", l2_error); + + return (l2_error < threshold) ? 
PASS_VALUE : ERROR_VALUE; +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) { + struct cudaDeviceProp properties; + int device; + CHECK_ERROR(cudaGetDevice(&device)); + CHECK_ERROR(cudaGetDeviceProperties(&properties, device)); + if (!(properties.major >= 5)) { + printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n"); + return ERROR_VALUE; + } + + return test_r2c_window_c2r(); +} diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu new file mode 100644 index 000000000..b040bfd54 --- /dev/null +++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu @@ -0,0 +1,86 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Reference for the example of LTO callbacks, + * run the same plans but perform the windowing with + * a separate kernel. 
// NOTE(review): the names of the four system headers originally included here
// were lost in extraction; the headers below declare every non-project symbol
// this file uses — confirm against the upstream sample.
#include <cstddef>
#include <cstdio>

#include <cuda_runtime.h>
#include <cufft.h>

#include "r2c_c2r_reference.h"

/*
 * Zeroes every complex sample whose position inside its batch is at or beyond
 * window_size; samples inside the window are left untouched.
 *
 * One thread handles one float2 element of `buffer`, which holds
 * nbatches * complex_signal_size elements laid out batch after batch.
 */
__global__ void windowing(unsigned nbatches, unsigned complex_signal_size, unsigned window_size, float2* buffer)
{
    // Widen before multiplying so neither the index nor the bound can wrap in
    // 32-bit arithmetic.
    const size_t idx   = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t total = static_cast<size_t>(nbatches) * complex_signal_size;

    if (idx >= total) return;

    const unsigned sample = static_cast<unsigned>(idx % complex_signal_size);

    if (sample >= window_size) {
        buffer[idx].x = 0.f;
        buffer[idx].y = 0.f;
    }
}

/*
 * Reference for the LTO-callback example: the same R2C / C2R plans, but the
 * windowing is done by a separate kernel between the two transforms.
 *
 * input_signals  - host buffer read as batches * (signal_size/2 + 1) * 2 floats
 *                  (the padded in-place layout).
 * output_signals - host buffer that receives the same number of floats.
 *
 * Returns PASS_VALUE after all steps have run. What happens when a CUDA or
 * cuFFT call fails is decided by CHECK_ERROR, which is defined outside this
 * file.
 */
int reference_r2c_window_c2r(unsigned batches, unsigned signal_size, unsigned window_size, float* input_signals, float* output_signals) {
    const unsigned complex_signal_size = signal_size / 2 + 1;
    // Element and byte counts are computed in size_t so large batch counts do
    // not overflow unsigned arithmetic.
    const size_t complex_count      = static_cast<size_t>(batches) * complex_signal_size;
    const size_t complex_size_bytes = complex_count * 2 * sizeof(float);

    // Allocate and copy input from host to GPU
    float *device_signals;
    CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
    CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));

    // Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform
    cufftHandle forward_plan, inverse_plan;
    size_t work_size;

    CHECK_ERROR(cufftCreate(&forward_plan));
    CHECK_ERROR(cufftCreate(&inverse_plan));

    CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
    CHECK_ERROR(cufftMakePlan1d(inverse_plan, signal_size, CUFFT_C2R, batches, &work_size));

    // Transform signal forward
    printf("Transforming reference cufftExecR2C\n");
    CHECK_ERROR(cufftExecR2C(forward_plan, (cufftReal *)device_signals, (cufftComplex *)device_signals));

    // Apply window via separate kernel
    const unsigned threads_per_block = 256;
    const unsigned blocks =
        static_cast<unsigned>((complex_count + threads_per_block - 1) / threads_per_block);
    windowing<<<blocks, threads_per_block>>>(batches, complex_signal_size, window_size, (float2*) device_signals);
    CHECK_ERROR(cudaGetLastError());        // errors from the launch itself
    CHECK_ERROR(cudaDeviceSynchronize());   // errors raised while the kernel ran

    // Inverse-transform the signal
    printf("Transforming reference cufftExecC2R\n");
    CHECK_ERROR(cufftExecC2R(inverse_plan, (cufftComplex *)device_signals, (cufftReal *)device_signals));

    // Copy device memory to host
    CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));

    // Destroy CUFFT context
    CHECK_ERROR(cufftDestroy(forward_plan));
    CHECK_ERROR(cufftDestroy(inverse_plan));

    // Cleanup memory
    CHECK_ERROR(cudaFree(device_signals));

    return PASS_VALUE;
}
/*
 * Modified cuSOLVER Xgetrf example with scalable input sizes
 * Based on NVIDIA's cusolver_Xgetrf_example.cu
 *
 * Accepts command-line arguments for matrix size:
 *   small:  16x16
 *   medium: 128x128
 *   large:  512x512
 */

// NOTE(review): the original system header names were lost in extraction; the
// headers below declare every non-project symbol this file uses.
#include <algorithm>
#include <cerrno>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <random>
#include <stdexcept>
#include <vector>

#include <cuda_runtime.h>
#include <cusolverDn.h>

#include "cusolver_utils.h"

/* Prints the command-line help; prog_name is substituted into the examples. */
void print_usage(const char* prog_name) {
    printf("Usage: %s [OPTIONS]\n", prog_name);
    printf("\n");
    printf("Options:\n");
    printf(" -m, --m Matrix dimension (creates mxm matrix)\n");
    printf("\n");
    printf("Presets:\n");
    printf(" small 16x16 matrix\n");
    printf(" medium 128x128 matrix\n");
    printf(" large 512x512 matrix\n");
    printf("\n");
    printf("Examples:\n");
    printf(" %s --m 1024 # 1024x1024 matrix\n", prog_name);
    printf(" %s -m 2048 # 2048x2048 matrix\n", prog_name);
    printf(" %s small # 16x16 matrix\n", prog_name);
    printf(" %s medium # 128x128 matrix\n", prog_name);
    printf("\n");
}

/*
 * Parses `text` as a strictly positive decimal integer.
 * Returns true and stores the value in *out on success; returns false for
 * empty input, trailing characters, out-of-range values, zero or negatives.
 */
static bool parse_matrix_dim(const char *text, int64_t *out) {
    char *end = nullptr;
    errno = 0;
    const long long value = strtoll(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE || value <= 0) {
        return false;
    }
    *out = static_cast<int64_t>(value);
    return true;
}

/*
 * Factors a random m x m double-precision matrix with cusolverDnXgetrf
 * (partial pivoting) and solves A*X = B for one right-hand side with
 * cusolverDnXgetrs.
 *
 * Arguments: `-m N` / `--m N` for an explicit size, or one of the presets
 * small / medium / large; `-h` / `--help` prints usage. Later arguments
 * override earlier ones.
 *
 * Returns 0 on success or after printing help, 1 on a bad argument. Calls
 * exit(1) when cuSOLVER reports an invalid parameter. CUDA_CHECK and
 * CUSOLVER_CHECK (cusolver_utils.h) throw std::runtime_error on failure and
 * nothing here catches it.
 */
int main(int argc, char *argv[]) {
    // Parse command line arguments
    int64_t m = 16; // Default: small
    const char* size_name = "small";
    bool custom_m = false;

    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
            print_usage(argv[0]);
            return 0;
        } else if (strcmp(argv[i], "--m") == 0 || strcmp(argv[i], "-m") == 0) {
            if (i + 1 < argc) {
                const char *value = argv[++i];
                // atoll() silently turned garbage into 0 and accepted negatives,
                // which then failed later inside std::vector or cuSOLVER.
                if (!parse_matrix_dim(value, &m)) {
                    fprintf(stderr, "Error: invalid matrix dimension '%s'\n", value);
                    print_usage(argv[0]);
                    return 1;
                }
                custom_m = true;
                size_name = "custom";
            } else {
                fprintf(stderr, "Error: %s requires a value\n", argv[i]);
                print_usage(argv[0]);
                return 1;
            }
        } else if (strcmp(argv[i], "small") == 0) {
            m = 16;
            size_name = "small";
            custom_m = false;   // a preset given after -m replaces the custom size
        } else if (strcmp(argv[i], "medium") == 0) {
            m = 128;
            size_name = "medium";
            custom_m = false;
        } else if (strcmp(argv[i], "large") == 0) {
            m = 512;
            size_name = "large";
            custom_m = false;
        } else {
            fprintf(stderr, "Error: Unknown argument '%s'\n", argv[i]);
            print_usage(argv[0]);
            return 1;
        }
    }

    printf("==============================================\n");
    printf("cuSOLVER Xgetrf Example (Scalable)\n");
    printf("==============================================\n");
    // PRId64 instead of %ld: int64_t is not `long` on every platform.
    if (custom_m) {
        printf("Matrix size: %" PRId64 "x%" PRId64 "\n", m, m);
    } else {
        printf("Matrix size: %s (%" PRId64 "x%" PRId64 ")\n", size_name, m, m);
    }
    printf("Pivot: ON (compute P*A = L*U)\n");
    printf("==============================================\n\n");

    cusolverDnHandle_t cusolverH = NULL;
    cudaStream_t stream = NULL;

    using data_type = double;

    const int64_t lda = m;
    const int64_t ldb = m;
    const int64_t nrhs = 1; // number of right-hand sides

    // Generate random matrix A and vector B
    std::vector<data_type> A(m * m);
    std::vector<data_type> B(m);
    std::vector<data_type> X(m, 0);
    std::vector<data_type> LU(lda * m, 0);
    std::vector<int64_t> Ipiv(m, 0);
    int info = 0;

    // Fixed seed so every run factors the same matrix
    std::mt19937 gen(42);
    std::uniform_real_distribution<data_type> dist(0.0, 10.0);

    for (int64_t i = 0; i < m * m; i++) {
        A[i] = dist(gen);
    }

    for (int64_t i = 0; i < m; i++) {
        B[i] = dist(gen);
    }

    // For small matrices, print them (A is stored column-major)
    if (m <= 16) {
        std::printf("A = (matlab base-1)\n");
        for (int64_t row = 0; row < m; row++) {
            for (int64_t col = 0; col < m; col++) {
                printf("%.2f ", A[col * m + row]);
            }
            printf("\n");
        }
        std::printf("=====\n");
        std::printf("B = (matlab base-1)\n");
        for (int64_t i = 0; i < m; i++) {
            printf("%.2f ", B[i]);
        }
        printf("\n");
        std::printf("=====\n");
    }

    data_type *d_A = nullptr;  /* device copy of A */
    data_type *d_B = nullptr;  /* device copy of B */
    int64_t *d_Ipiv = nullptr; /* pivoting sequence */
    int *d_info = nullptr;     /* error info */

    size_t workspaceInBytesOnDevice = 0; /* size of workspace */
    void *d_work = nullptr;              /* device workspace for getrf */
    size_t workspaceInBytesOnHost = 0;   /* size of workspace */
    void *h_work = nullptr;              /* host workspace for getrf */

    printf("Using New Algo\n");

    /* step 1: create cusolver handle, bind a stream */
    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));

    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));

    /* step 2: copy A to device */
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(data_type) * B.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_Ipiv), sizeof(int64_t) * Ipiv.size()));
    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));

    CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(),
                               cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaMemcpyAsync(d_B, B.data(), sizeof(data_type) * B.size(),
                               cudaMemcpyHostToDevice, stream));

    /* step 3: query working space of Xgetrf */
    cusolverDnParams_t params;
    CUSOLVER_CHECK(cusolverDnCreateParams(&params));

    CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(cusolverH, params, m, m,
                                               CUDA_R_64F, d_A, lda,
                                               CUDA_R_64F, &workspaceInBytesOnDevice,
                                               &workspaceInBytesOnHost));

    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));

    if (workspaceInBytesOnHost > 0) {
        h_work = malloc(workspaceInBytesOnHost);
        if (h_work == nullptr) {
            throw std::runtime_error("Error: h_work not allocated.");
        }
    }

    printf("Starting LU factorization (Xgetrf)...\n");
    printf("Workspace: device=%zu bytes, host=%zu bytes\n",
           workspaceInBytesOnDevice, workspaceInBytesOnHost);

    /* step 4: LU factorization */
    CUSOLVER_CHECK(cusolverDnXgetrf(cusolverH, params, m, m,
                                    CUDA_R_64F, d_A, lda, d_Ipiv,
                                    CUDA_R_64F, d_work, workspaceInBytesOnDevice,
                                    h_work, workspaceInBytesOnHost, d_info));

    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(LU.data(), d_A, sizeof(data_type) * A.size(),
                               cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(Ipiv.data(), d_Ipiv, sizeof(int64_t) * Ipiv.size(),
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    std::printf("after Xgetrf: info = %d\n", info);
    if (0 > info) {
        std::printf("%d-th parameter is wrong \n", -info);
        exit(1);
    } else if (0 < info) {
        std::printf("WARNING: matrix is singular, U(%d,%d) = 0\n", info, info);
    }

    // Print the whole pivoting sequence for small matrices, the first 8 otherwise
    if (m <= 16) {
        printf("pivoting sequence, matlab base-1\n");
        for (int64_t i = 0; i < m; i++) {
            printf("Ipiv(%" PRId64 ") = %" PRId64 "\n", i + 1, Ipiv[i]);
        }
    } else {
        printf("pivoting sequence (first 8), matlab base-1\n");
        for (int i = 0; i < std::min((int64_t)8, m); i++) {
            printf("Ipiv(%d) = %" PRId64 "\n", i + 1, Ipiv[i]);
        }
    }

    // For small matrices, print L and U
    if (m <= 16) {
        printf("L and U = (matlab base-1)\n");
        for (int64_t row = 0; row < m; row++) {
            for (int64_t col = 0; col < m; col++) {
                printf("%.2f ", LU[col * m + row]);
            }
            printf("\n");
        }
        std::printf("=====\n");
    }

    printf("Starting solve (Xgetrs)...\n");

    /* step 5: solve A*X = B */
    CUSOLVER_CHECK(cusolverDnXgetrs(cusolverH, params, CUBLAS_OP_N, m, nrhs,
                                    CUDA_R_64F, d_A, lda, d_Ipiv,
                                    CUDA_R_64F, d_B, ldb, d_info));

    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
    CUDA_CHECK(cudaMemcpyAsync(X.data(), d_B, sizeof(data_type) * X.size(),
                               cudaMemcpyDeviceToHost, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));

    // The solve reports bad arguments through d_info as well; it was never read back.
    if (0 > info) {
        std::printf("%d-th parameter is wrong \n", -info);
        exit(1);
    }

    // Print solution
    if (m <= 16) {
        std::printf("X = (matlab base-1)\n");
        for (int64_t i = 0; i < m; i++) {
            printf("%.6f ", X[i]);
        }
        printf("\n");
    } else {
        std::printf("X = (first 8 elements)\n");
        for (int i = 0; i < std::min((int64_t)8, m); i++) {
            printf("%.6f ", X[i]);
        }
        printf("\n");
    }

    printf("\n==============================================\n");
    printf("SUCCESS: LU factorization and solve completed\n");
    printf("==============================================\n");

    /* free resources */
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_Ipiv));
    CUDA_CHECK(cudaFree(d_info));
    CUDA_CHECK(cudaFree(d_work));

    CUSOLVER_CHECK(cusolverDnDestroyParams(params));
    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));

    CUDA_CHECK(cudaStreamDestroy(stream));

    if (h_work) {
        free(h_work);
    }

    CUDA_CHECK(cudaDeviceReset());

    return EXIT_SUCCESS;
}
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +// CUDA API error checking +#define CUDA_CHECK(err) \ + do { \ + cudaError_t err_ = (err); \ + if (err_ != cudaSuccess) { \ + printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("CUDA error"); \ + } \ + } while (0) + +// cusolver API error checking +#define CUSOLVER_CHECK(err) \ + do { \ + cusolverStatus_t err_ = (err); \ + if (err_ != CUSOLVER_STATUS_SUCCESS) { \ + printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("cusolver error"); \ + } \ + } while (0) + +// cublas API error checking +#define CUBLAS_CHECK(err) \ + do { \ + cublasStatus_t err_ = (err); \ + if (err_ != CUBLAS_STATUS_SUCCESS) { \ + printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("cublas error"); \ + } \ + } while (0) + +// cublas API error checking +#define CUSPARSE_CHECK(err) \ + do { \ + cusparseStatus_t err_ = (err); \ + if (err_ != CUSPARSE_STATUS_SUCCESS) { \ + printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__); \ + throw std::runtime_error("cusparse error"); \ + } \ + } while (0) + +// memory alignment +#define ALIGN_TO(A, B) (((A + B - 1) / B) * B) + +// device memory pitch alignment +static const size_t device_alignment = 32; + +// type traits +template struct traits; + +template <> struct traits { + // scalar type + typedef float T; + typedef T S; + + static constexpr T zero = 0.f; + static 
constexpr cudaDataType cuda_data_type = CUDA_R_32F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F; +#endif + + inline static S abs(T val) { return fabs(val); } + + template inline static T rand(RNG &gen) { return (S)gen(); } + + inline static T add(T a, T b) { return a + b; } + + inline static T mul(T v, S f) { return v * f; } +}; + +template <> struct traits { + // scalar type + typedef double T; + typedef T S; + + static constexpr T zero = 0.; + static constexpr cudaDataType cuda_data_type = CUDA_R_64F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F; +#endif + + inline static S abs(T val) { return fabs(val); } + + template inline static T rand(RNG &gen) { return (S)gen(); } + + inline static T add(T a, T b) { return a + b; } + + inline static T mul(T v, S f) { return v * f; } +}; + +template <> struct traits { + // scalar type + typedef float S; + typedef cuFloatComplex T; + + static constexpr T zero = {0.f, 0.f}; + static constexpr cudaDataType cuda_data_type = CUDA_C_32F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F; +#endif + + inline static S abs(T val) { return cuCabsf(val); } + + template inline static T rand(RNG &gen) { + return make_cuFloatComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCaddf(a, b); } + inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); } + + inline static T mul(T v, S f) { return make_cuFloatComplex(v.x * f, v.y * f); } +}; + +template <> struct traits { + // scalar type + typedef double S; + typedef cuDoubleComplex T; + + static constexpr T zero = {0., 0.}; + static constexpr cudaDataType cuda_data_type = CUDA_C_64F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F; +#endif + + inline static S abs(T val) { return cuCabs(val); } + + 
template inline static T rand(RNG &gen) { + return make_cuDoubleComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCadd(a, b); } + inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); } + + inline static T mul(T v, S f) { return make_cuDoubleComplex(v.x * f, v.y * f); } +}; + +template void print_matrix(const int &m, const int &n, const T *A, const int &lda); + +template <> void print_matrix(const int &m, const int &n, const float *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const double *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template <> +void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template +void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution::S> dis(-1.0, 1.0); + auto rand_gen = std::bind(dis, gen); + + *lda = n; + + size_t matrix_mem_size = static_cast(*lda * m * sizeof(T)); + // suppress gcc 7 size warning + if (matrix_mem_size <= PTRDIFF_MAX) + *A = (T *)malloc(matrix_mem_size); + else + throw std::runtime_error("Memory allocation size is too large"); + + if (*A == NULL) + throw std::runtime_error("Unable to allocate host matrix"); + 
+ for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + T *A_col = (*A) + *lda * j; + A_col[i] = traits::rand(rand_gen); + } + } +} + +// Makes matrix A of size mxn and leading dimension lda diagonal dominant +template +void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) { + for (int j = 0; j < std::min(m, n); ++j) { + T *A_col = A + lda * j; + auto col_sum = traits::S>::zero; + for (int i = 0; i < m; ++i) { + col_sum += traits::abs(A_col[i]); + } + A_col[j] = traits::add(A_col[j], col_sum); + } +} + +// Returns cudaDataType value as defined in library_types.h for the string containing type name +cudaDataType get_cuda_library_type(std::string type_string) { + if (type_string.compare("CUDA_R_16F") == 0) + return CUDA_R_16F; + else if (type_string.compare("CUDA_C_16F") == 0) + return CUDA_C_16F; + else if (type_string.compare("CUDA_R_32F") == 0) + return CUDA_R_32F; + else if (type_string.compare("CUDA_C_32F") == 0) + return CUDA_C_32F; + else if (type_string.compare("CUDA_R_64F") == 0) + return CUDA_R_64F; + else if (type_string.compare("CUDA_C_64F") == 0) + return CUDA_C_64F; + else if (type_string.compare("CUDA_R_8I") == 0) + return CUDA_R_8I; + else if (type_string.compare("CUDA_C_8I") == 0) + return CUDA_C_8I; + else if (type_string.compare("CUDA_R_8U") == 0) + return CUDA_R_8U; + else if (type_string.compare("CUDA_C_8U") == 0) + return CUDA_C_8U; + else if (type_string.compare("CUDA_R_32I") == 0) + return CUDA_R_32I; + else if (type_string.compare("CUDA_C_32I") == 0) + return CUDA_C_32I; + else if (type_string.compare("CUDA_R_32U") == 0) + return CUDA_R_32U; + else if (type_string.compare("CUDA_C_32U") == 0) + return CUDA_C_32U; + else + throw std::runtime_error("Unknown CUDA datatype"); +} + +// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing +// solver name +cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) { + if 
(solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0) + return CUSOLVER_IRS_REFINE_NONE; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0) + return CUSOLVER_IRS_REFINE_CLASSICAL; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0) + return CUSOLVER_IRS_REFINE_GMRES; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0) + return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0) + return CUSOLVER_IRS_REFINE_GMRES_GMRES; + else + printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str()); + + return CUSOLVER_IRS_REFINE_NOT_SET; +} \ No newline at end of file diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile b/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile new file mode 100644 index 000000000..b7430b74e --- /dev/null +++ b/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile @@ -0,0 +1,9 @@ +EXECUTABLE := cusolver_ormqr_scalable +CUFILES := cusolver_ormqr_scalable.cu +CCFILES := +ADDITIONAL_LIBS := -lcusolver -lcublas +OMIT_CUTIL_LIB := 1 +OMIT_SHRUTIL_LIB := 1 +SM_VERSIONS := 75 80 90 + +include ../../../common/common.mk diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu new file mode 100644 index 000000000..e5aded469 --- /dev/null +++ b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu @@ -0,0 +1,253 @@ +/* + * Modified cuSOLVER ormqr example with scalable input sizes + * Based on NVIDIA's cusolver_ormqr_example.cu + * + * Accepts command-line arguments for matrix size: + * small: 16x16 + * medium: 256x256 + * large: 768x768 + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cusolver_utils.h" + +void print_usage(const char* prog_name) { + printf("Usage: %s [OPTIONS]\n", prog_name); + printf("\n"); + printf("Options:\n"); + printf(" -m, --m Matrix dimension (creates mxm 
matrix)\n"); + printf("\n"); + printf("Presets:\n"); + printf(" small 16x16 matrix\n"); + printf(" medium 256x256 matrix\n"); + printf(" large 768x768 matrix\n"); + printf("\n"); + printf("Examples:\n"); + printf(" %s --m 512 # 512x512 matrix\n", prog_name); + printf(" %s -m 1024 # 1024x1024 matrix\n", prog_name); + printf(" %s small # 16x16 matrix\n", prog_name); + printf(" %s medium # 256x256 matrix\n", prog_name); + printf("\n"); +} + +int main(int argc, char *argv[]) { + // Parse command line arguments + int m = 16; // Default: small + const char* size_name = "small"; + bool custom_m = false; + + for (int i = 1; i < argc; i++) { + if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) { + print_usage(argv[0]); + return 0; + } else if (strcmp(argv[i], "--m") == 0 || strcmp(argv[i], "-m") == 0) { + if (i + 1 < argc) { + m = atoi(argv[++i]); + custom_m = true; + size_name = "custom"; + } else { + fprintf(stderr, "Error: %s requires a value\n", argv[i]); + print_usage(argv[0]); + return 1; + } + } else if (strcmp(argv[i], "small") == 0) { + m = 16; + size_name = "small"; + } else if (strcmp(argv[i], "medium") == 0) { + m = 256; + size_name = "medium"; + } else if (strcmp(argv[i], "large") == 0) { + m = 768; + size_name = "large"; + } else { + fprintf(stderr, "Error: Unknown argument '%s'\n", argv[i]); + print_usage(argv[0]); + return 1; + } + } + + printf("==============================================\n"); + printf("cuSOLVER ormqr Example (Scalable)\n"); + printf("==============================================\n"); + if (custom_m) { + printf("Matrix size: %dx%d\n", m, m); + } else { + printf("Matrix size: %s (%dx%d)\n", size_name, m, m); + } + printf("==============================================\n\n"); + + cusolverDnHandle_t cusolverH = NULL; + cublasHandle_t cublasH = NULL; + cudaStream_t stream{}; + + const int lda = m; + const int ldb = m; + const int nrhs = 1; // number of right hand side vectors + + // Generate random matrix A and vector B + 
std::vector A(m * m); + std::vector B(m); + std::vector XC(ldb * nrhs, 0); // solution matrix from GPU + + // Initialize with random values for reproducibility + std::mt19937 gen(42); // Fixed seed for reproducibility + std::uniform_real_distribution dist(0.0, 10.0); + + for (int i = 0; i < m * m; i++) { + A[i] = dist(gen); + } + + for (int i = 0; i < m; i++) { + B[i] = dist(gen); + } + + // For small matrices, print them + if (m <= 16) { + std::printf("A = (first 8x8 block, matlab base-1)\n"); + int print_size = std::min(m, 8); + for (int row = 0; row < print_size; row++) { + for (int col = 0; col < print_size; col++) { + printf("%.2f ", A[col * m + row]); + } + printf("\n"); + } + std::printf("=====\n"); + std::printf("B = (first 8 elements, matlab base-1)\n"); + for (int i = 0; i < std::min(m, 8); i++) { + printf("%.2f ", B[i]); + } + printf("\n"); + std::printf("=====\n"); + } + + /* device memory */ + double *d_A = nullptr; + double *d_tau = nullptr; + double *d_B = nullptr; + int *d_info = nullptr; + double *d_work = nullptr; + + int lwork_geqrf = 0; + int lwork_ormqr = 0; + int lwork = 0; + int info = 0; + + const double one = 1; + + /* step 1: create cudense/cublas handle */ + CUSOLVER_CHECK(cusolverDnCreate(&cusolverH)); + CUBLAS_CHECK(cublasCreate(&cublasH)); + + CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); + CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream)); + CUBLAS_CHECK(cublasSetStream(cublasH, stream)); + + /* step 2: copy A and B to device */ + CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_A), sizeof(double) * A.size())); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_tau), sizeof(double) * m)); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_B), sizeof(double) * B.size())); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_info), sizeof(int))); + + CUDA_CHECK( + cudaMemcpyAsync(d_A, A.data(), sizeof(double) * A.size(), cudaMemcpyHostToDevice, stream)); + CUDA_CHECK( + cudaMemcpyAsync(d_B, B.data(), sizeof(double) * B.size(), 
cudaMemcpyHostToDevice, stream)); + + /* step 3: query working space of geqrf and ormqr */ + CUSOLVER_CHECK(cusolverDnDgeqrf_bufferSize(cusolverH, m, m, d_A, lda, &lwork_geqrf)); + + CUSOLVER_CHECK(cusolverDnDormqr_bufferSize(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, nrhs, m, + d_A, lda, d_tau, d_B, ldb, &lwork_ormqr)); + + lwork = std::max(lwork_geqrf, lwork_ormqr); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&d_work), sizeof(double) * lwork)); + + printf("Starting QR factorization (geqrf)...\n"); + + /* step 4: compute QR factorization */ + CUSOLVER_CHECK(cusolverDnDgeqrf(cusolverH, m, m, d_A, lda, d_tau, d_work, lwork, d_info)); + + /* check if QR is good or not */ + CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + std::printf("after geqrf: info = %d\n", info); + if (0 > info) { + std::printf("%d-th parameter is wrong \n", -info); + exit(1); + } + + printf("Starting ormqr (Q^T * B)...\n"); + + /* step 5: compute Q^T*B */ + CUSOLVER_CHECK(cusolverDnDormqr(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, nrhs, m, d_A, lda, + d_tau, d_B, ldb, d_work, lwork, d_info)); + + /* check if QR is good or not */ + CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + std::printf("after ormqr: info = %d\n", info); + if (0 > info) { + std::printf("%d-th parameter is wrong \n", -info); + exit(1); + } + + printf("Computing x = R \\ Q^T*B (triangular solve)...\n"); + + /* step 6: compute x = R \ Q^T*B */ + CUBLAS_CHECK(cublasDtrsm(cublasH, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N, + CUBLAS_DIAG_NON_UNIT, m, nrhs, &one, d_A, lda, d_B, ldb)); + + CUDA_CHECK(cudaMemcpyAsync(XC.data(), d_B, sizeof(double) * XC.size(), cudaMemcpyDeviceToHost, + stream)); + + CUDA_CHECK(cudaStreamSynchronize(stream)); + + // For small matrices, print solution + if (m <= 16) { + std::printf("X = (matlab base-1)\n"); + 
for (int i = 0; i < m; i++) { + printf("%.6f ", XC[i]); + } + printf("\n"); + } else { + std::printf("X = (first 8 elements)\n"); + for (int i = 0; i < std::min(m, 8); i++) { + printf("%.6f ", XC[i]); + } + printf("\n"); + } + + printf("\n==============================================\n"); + printf("SUCCESS: QR factorization and solve completed\n"); + printf("==============================================\n"); + + /* free resources */ + CUDA_CHECK(cudaFree(d_A)); + CUDA_CHECK(cudaFree(d_tau)); + CUDA_CHECK(cudaFree(d_B)); + CUDA_CHECK(cudaFree(d_info)); + CUDA_CHECK(cudaFree(d_work)); + + CUBLAS_CHECK(cublasDestroy(cublasH)); + CUSOLVER_CHECK(cusolverDnDestroy(cusolverH)); + + CUDA_CHECK(cudaStreamDestroy(stream)); + + CUDA_CHECK(cudaDeviceReset()); + + return EXIT_SUCCESS; +} diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h new file mode 100644 index 000000000..2d37d51c5 --- /dev/null +++ b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h @@ -0,0 +1,299 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// CUDA API error checking: prints the status code with file/line and throws.
#define CUDA_CHECK(err)                                                                            \
    do {                                                                                           \
        cudaError_t err_ = (err);                                                                  \
        if (err_ != cudaSuccess) {                                                                 \
            printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);                          \
            throw std::runtime_error("CUDA error");                                                \
        }                                                                                          \
    } while (0)

// cusolver API error checking: prints the status code with file/line and throws.
#define CUSOLVER_CHECK(err)                                                                        \
    do {                                                                                           \
        cusolverStatus_t err_ = (err);                                                             \
        if (err_ != CUSOLVER_STATUS_SUCCESS) {                                                     \
            printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
            throw std::runtime_error("cusolver error");                                            \
        }                                                                                          \
    } while (0)

// cublas API error checking: prints the status code with file/line and throws.
#define CUBLAS_CHECK(err)                                                                          \
    do {                                                                                           \
        cublasStatus_t err_ = (err);                                                               \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                                       \
            printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);                        \
            throw std::runtime_error("cublas error");                                              \
        }                                                                                          \
    } while (0)

// cusparse API error checking: prints the status code with file/line and throws.
#define CUSPARSE_CHECK(err)                                                                        \
    do {                                                                                           \
        cusparseStatus_t err_ = (err);                                                             \
        if (err_ != CUSPARSE_STATUS_SUCCESS) {                                                     \
            printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
            throw std::runtime_error("cusparse error");                                            \
        }                                                                                          \
    } while (0)

// memory alignment: rounds A up to the next multiple of B.
// Both arguments are parenthesized so that expression arguments such as
// ALIGN_TO(n + 1, x + y) expand with the intended precedence.
// Note: each argument is evaluated more than once; avoid side effects.
#define ALIGN_TO(A, B) ((((A) + (B) - 1) / (B)) * (B))

// device memory pitch alignment
static const size_t device_alignment = 32;
// scalar type + typedef double T; + typedef T S; + + static constexpr T zero = 0.; + static constexpr cudaDataType cuda_data_type = CUDA_R_64F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F; +#endif + + inline static S abs(T val) { return fabs(val); } + + template inline static T rand(RNG &gen) { return (S)gen(); } + + inline static T add(T a, T b) { return a + b; } + + inline static T mul(T v, S f) { return v * f; } +}; + +template <> struct traits { + // scalar type + typedef float S; + typedef cuFloatComplex T; + + static constexpr T zero = {0.f, 0.f}; + static constexpr cudaDataType cuda_data_type = CUDA_C_32F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F; +#endif + + inline static S abs(T val) { return cuCabsf(val); } + + template inline static T rand(RNG &gen) { + return make_cuFloatComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCaddf(a, b); } + inline static T add(T a, S b) { return cuCaddf(a, make_cuFloatComplex(b, 0.f)); } + + inline static T mul(T v, S f) { return make_cuFloatComplex(v.x * f, v.y * f); } +}; + +template <> struct traits { + // scalar type + typedef double S; + typedef cuDoubleComplex T; + + static constexpr T zero = {0., 0.}; + static constexpr cudaDataType cuda_data_type = CUDA_C_64F; +#if CUDART_VERSION >= 11000 + static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F; +#endif + + inline static S abs(T val) { return cuCabs(val); } + + template inline static T rand(RNG &gen) { + return make_cuDoubleComplex((S)gen(), (S)gen()); + } + + inline static T add(T a, T b) { return cuCadd(a, b); } + inline static T add(T a, S b) { return cuCadd(a, make_cuDoubleComplex(b, 0.)); } + + inline static T mul(T v, S f) { return make_cuDoubleComplex(v.x * f, v.y * f); } +}; + +template void print_matrix(const int &m, const int &n, const T *A, const int &lda); + +template <> void 
print_matrix(const int &m, const int &n, const float *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const double *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f ", A[j * lda + i]); + } + std::printf("\n"); + } +} + +template <> void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template <> +void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) { + for (int i = 0; i < m; i++) { + for (int j = 0; j < n; j++) { + std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y); + } + std::printf("\n"); + } +} + +template +void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) { + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution::S> dis(-1.0, 1.0); + auto rand_gen = std::bind(dis, gen); + + *lda = n; + + size_t matrix_mem_size = static_cast(*lda * m * sizeof(T)); + // suppress gcc 7 size warning + if (matrix_mem_size <= PTRDIFF_MAX) + *A = (T *)malloc(matrix_mem_size); + else + throw std::runtime_error("Memory allocation size is too large"); + + if (*A == NULL) + throw std::runtime_error("Unable to allocate host matrix"); + + for (int j = 0; j < n; ++j) { + for (int i = 0; i < m; ++i) { + T *A_col = (*A) + *lda * j; + A_col[i] = traits::rand(rand_gen); + } + } +} + +// Makes matrix A of size mxn and leading dimension lda diagonal dominant +template +void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) { + for (int j = 0; j < std::min(m, n); ++j) { + T *A_col = A + lda * j; + auto col_sum = traits::S>::zero; + for (int i = 
0; i < m; ++i) { + col_sum += traits::abs(A_col[i]); + } + A_col[j] = traits::add(A_col[j], col_sum); + } +} + +// Returns cudaDataType value as defined in library_types.h for the string containing type name +cudaDataType get_cuda_library_type(std::string type_string) { + if (type_string.compare("CUDA_R_16F") == 0) + return CUDA_R_16F; + else if (type_string.compare("CUDA_C_16F") == 0) + return CUDA_C_16F; + else if (type_string.compare("CUDA_R_32F") == 0) + return CUDA_R_32F; + else if (type_string.compare("CUDA_C_32F") == 0) + return CUDA_C_32F; + else if (type_string.compare("CUDA_R_64F") == 0) + return CUDA_R_64F; + else if (type_string.compare("CUDA_C_64F") == 0) + return CUDA_C_64F; + else if (type_string.compare("CUDA_R_8I") == 0) + return CUDA_R_8I; + else if (type_string.compare("CUDA_C_8I") == 0) + return CUDA_C_8I; + else if (type_string.compare("CUDA_R_8U") == 0) + return CUDA_R_8U; + else if (type_string.compare("CUDA_C_8U") == 0) + return CUDA_C_8U; + else if (type_string.compare("CUDA_R_32I") == 0) + return CUDA_R_32I; + else if (type_string.compare("CUDA_C_32I") == 0) + return CUDA_C_32I; + else if (type_string.compare("CUDA_R_32U") == 0) + return CUDA_R_32U; + else if (type_string.compare("CUDA_C_32U") == 0) + return CUDA_C_32U; + else + throw std::runtime_error("Unknown CUDA datatype"); +} + +// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing +// solver name +cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) { + if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0) + return CUSOLVER_IRS_REFINE_NONE; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0) + return CUSOLVER_IRS_REFINE_CLASSICAL; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0) + return CUSOLVER_IRS_REFINE_GMRES; + else if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0) + return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES; + else if 
#!/bin/bash
# Find the latest cuGraph tag that supports the given CUDA version.
#
# Usage: find_cugraph_tag.sh <cuda_version> <cugraph_dir>
#
# Prints exactly one tag name on stdout; diagnostics go to stderr so that callers
# capturing the output never receive an error message as a tag.
# Falls back to the newest v* tag when no tag mentions the CUDA version.
#
# Side effect: tags are inspected by checking them out, so the repository in
# <cugraph_dir> is left on the last tag that was examined.

CUDA_VERSION=$1
CUGRAPH_DIR=$2

if [ -z "$CUDA_VERSION" ] || [ -z "$CUGRAPH_DIR" ]; then
    echo "Usage: $0 <cuda_version> <cugraph_dir>" >&2
    exit 1
fi

cd "$CUGRAPH_DIR" || exit 1

# Fetch all tags (best effort: offline runs fall back to the tags already present)
git fetch --tags --quiet 2>/dev/null

# Get all tags sorted by version (newest first)
TAGS=$(git tag -l 'v*' | sort -V -r)

if [ -z "$TAGS" ]; then
    echo "Error: no v* tags found in $CUGRAPH_DIR" >&2
    exit 1
fi

# For each tag, check if it supports the CUDA version
for TAG in $TAGS; do
    # Checkout the tag quietly; skip tags that cannot be checked out
    git checkout "$TAG" --quiet 2>/dev/null || continue

    # Look for CUDA version specifications in the cmake files of this tag
    if [ -f "rapids-cmake/rapids-cuda/rapids_cuda_init_architectures.cmake" ]; then
        CUDA_FILE="rapids-cmake/rapids-cuda/rapids_cuda_init_architectures.cmake"
    elif [ -f "cpp/CMakeLists.txt" ]; then
        CUDA_FILE="cpp/CMakeLists.txt"
    else
        continue
    fi

    # Fixed-string match: as a regex the "." in "12.8" would also match "1228" or
    # "12x8". (The former second grep for "CUDA.*<version>" could only match lines
    # the first one already matched, so a single test is equivalent.)
    if grep -qF -- "$CUDA_VERSION" "$CUDA_FILE" 2>/dev/null; then
        echo "$TAG"
        exit 0
    fi
done

# If no tag found, return the latest tag
LATEST=$(printf '%s\n' "$TAGS" | head -1)
echo "$LATEST"
def save_graph_as_mtx(edges, n_vertices, output_file):
    """
    Save an undirected graph in Matrix Market (.mtx) coordinate format.

    Duplicate edges and opposite orientations of the same edge are merged, and the
    result is written as a `pattern symmetric` matrix with 1-based indices.

    Args:
        edges: Iterable of (u, v) tuples with 0-based vertex ids
        n_vertices: Number of vertices (matrix dimension)
        output_file: Output file path
    """
    print(f"Saving to {output_file}...")

    # Normalize every edge to (low, high) so both orientations collapse to one entry
    all_edges = set()
    for u, v in edges:
        if u > v:
            u, v = v, u
        all_edges.add((u, v))

    # Convert to sorted list
    all_edges = sorted(all_edges)

    # UTF-8 is requested explicitly: the header comment contains a non-ASCII
    # character and must not depend on the locale's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        # Write Matrix Market header
        f.write("%%MatrixMarket matrix coordinate pattern symmetric\n")
        f.write(f"% Barabási-Albert graph: {n_vertices} vertices, {len(all_edges)} edges\n")

        # Write dimensions: rows cols non-zeros
        f.write(f"{n_vertices} {n_vertices} {len(all_edges)}\n")

        # The Matrix Market format stores a symmetric matrix as its lower triangle
        # (row >= column), so the larger endpoint is written as the row.
        # Indices are 1-based in MTX format.
        for u, v in all_edges:
            f.write(f"{v+1} {u+1}\n")

    print(f"Successfully saved graph to {output_file}")

    # Print file size
    import os
    file_size = os.path.getsize(output_file)
    if file_size < 1024*1024:
        print(f"File size: {file_size/1024:.1f} KB")
    else:
        print(f"File size: {file_size/(1024*1024):.1f} MB")


def main():
    """Parse the command line, generate the graph, save it and print statistics."""
    parser = argparse.ArgumentParser(
        description='Generate a synthetic graph for benchmarking',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '-n', '--vertices',
        type=int,
        default=100000,
        help='Number of vertices'
    )
    parser.add_argument(
        '-m', '--edges-per-node',
        type=int,
        default=5,
        help='Number of edges each new node attaches to (controls graph density)'
    )
    parser.add_argument(
        '-o', '--output',
        type=str,
        default='synthetic_100k.mtx',
        help='Output file name'
    )
    parser.add_argument(
        '--seed',
        type=int,
        default=42,
        help='Random seed for reproducibility'
    )

    args = parser.parse_args()

    # Validate arguments
    if args.vertices < 1:
        print("Error: Number of vertices must be positive")
        sys.exit(1)

    if args.edges_per_node < 1 or args.edges_per_node >= args.vertices:
        print(f"Error: edges-per-node must be between 1 and {args.vertices-1}")
        sys.exit(1)

    # Generate graph
    edges, n_vertices = create_barabasi_albert_graph(
        args.vertices,
        args.edges_per_node,
        args.seed
    )

    # Save to MTX format
    save_graph_as_mtx(edges, n_vertices, args.output)

    n_edges = len(edges)
    print("\nGraph statistics:")
    print(f" Vertices: {n_vertices:,}")
    print(f" Edges: {n_edges:,}")
    print(f" Average degree: {2 * n_edges / n_vertices:.2f}")
    density = (2 * n_edges) / (n_vertices * (n_vertices - 1))
    print(f" Density: {density:.6f}")

    print(f"\nUsage:")
    print(f" BFS: cd bfs && ./run.sh --file ../{args.output}")
    print(f" MST: cd mst && ./run.sh --file ../{args.output}")


if __name__ == '__main__':
    main()
import sys
import random

def generate_ppm(width, height, filename):
    """Write a width x height PPM P6 (binary RGB) image containing a colour gradient.

    Red ramps along the columns, green along the rows and blue along the diagonal.
    The pixel data is produced and written in chunks of about 1 MB so that large
    images never have to be held in memory at once.
    """

    print(f"Generating {width}x{height} PPM image...")

    # Calculate sizes
    pixel_count = width * height
    rgb_data_size = pixel_count * 3

    with open(filename, 'wb') as f:
        # Write ASCII header
        header = f"P6\n# Generated test image for recursiveGaussian_hpc\n{width} {height}\n255\n"
        f.write(header.encode('ascii'))

        # Generate RGB data in chunks to avoid memory issues
        chunk_size = 1024 * 1024  # 1MB chunks
        bytes_written = 0

        print(f"Writing {rgb_data_size / (1024*1024):.1f} MB of RGB data...")

        while bytes_written < rgb_data_size:
            remaining = rgb_data_size - bytes_written
            current_chunk_size = min(chunk_size, remaining)

            # Only whole pixels are emitted per chunk, so bytes_written always stays
            # a multiple of 3 and pixel_idx below is exact.
            chunk_data = bytearray()
            for i in range(current_chunk_size // 3):
                pixel_idx = (bytes_written // 3) + i
                row = pixel_idx // width
                col = pixel_idx % width

                # Create a gradient pattern
                r = (col * 255 // width) & 0xFF
                g = (row * 255 // height) & 0xFF
                b = ((row + col) * 255 // (width + height)) & 0xFF

                chunk_data.extend([r, g, b])

            f.write(chunk_data)
            bytes_written += len(chunk_data)

            # Progress indicator
            progress = (bytes_written / rgb_data_size) * 100
            if bytes_written % (10 * 1024 * 1024) < chunk_size:  # Every ~10MB
                print(f" Progress: {progress:.1f}%")

    # Get file size
    import os
    file_size = os.path.getsize(filename)

    print(f"✓ Created {filename}")
    print(f" Size: {file_size / (1024*1024):.1f} MB")
    print(f" Dimensions: {width}x{height}")
    print()

if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Custom size from command line
        if len(sys.argv) != 4:
            print("Usage: generate_large_ppm.py <width> <height> <output_file>")
            print(" or: generate_large_ppm.py (generates standard sizes)")
            sys.exit(1)

        width = int(sys.argv[1])
        height = int(sys.argv[2])
        filename = sys.argv[3]
        generate_ppm(width, height, filename)
    else:
        # Generate standard test sizes
        import os
        script_dir = os.path.dirname(os.path.abspath(__file__))
        # Same directory that get_image_data.sh fills with the smaller teapot images
        # (recursiveGaussian_hpc); the images previously went to a separate
        # recursiveGaussian/ directory.
        base_dir = os.path.join(script_dir, "../../../data_dirs/cuda/HPC/recursiveGaussian_hpc/data")

        # Create directory if it doesn't exist
        os.makedirs(base_dir, exist_ok=True)

        sizes = [
            (768, 768, f"{base_dir}/teapot768.ppm"),
            (1024, 1024, f"{base_dir}/teapot1024.ppm"),
            # (2048, 2048, f"{base_dir}/teapot2048.ppm"),  # Uncomment for 4K
        ]

        for width, height, filename in sizes:
            generate_ppm(width, height, filename)

        print("All images generated successfully!")
def generate_signal(size_power, filename, epsilon=0.001):
    """
    Generate a signal file with 2^size_power elements.

    The file starts with a "# <epsilon>" header line, followed by one line of
    space-separated samples of a sum of three sine waves (5, 13 and 31 cycles over
    the whole signal), each formatted with six decimals.

    Args:
        size_power: Power of 2 (e.g., 24 for 2^24 = 16,777,216 elements)
        filename: Output filename
        epsilon: Epsilon value for header (precision parameter)

    Raises:
        ValueError: if size_power is negative (2^size_power would not be an
            integer element count)
    """
    if size_power < 0:
        raise ValueError(f"size_power must be non-negative, got {size_power}")

    num_elements = 2 ** size_power

    print(f"Generating signal file with {num_elements:,} elements (2^{size_power})...")
    print("This will take a few minutes...")

    with open(filename, 'w') as f:
        # Write header (epsilon value as comment)
        f.write(f"# {epsilon}\n")

        # Values are formatted and written chunk by chunk to bound memory use
        chunk_size = 100000
        values_written = 0

        while values_written < num_elements:
            chunk_values = []

            for i in range(min(chunk_size, num_elements - values_written)):
                idx = values_written + i

                # Normalized time 0 to 1 over the whole signal
                t = idx / num_elements

                # Multiple frequency components
                value = (
                    math.sin(2 * math.pi * 5 * t) * 0.5 +    # 5 Hz
                    math.sin(2 * math.pi * 13 * t) * 0.3 +   # 13 Hz
                    math.sin(2 * math.pi * 31 * t) * 0.2     # 31 Hz
                )

                chunk_values.append(f"{value:.6f}")

            # Write chunk (space-separated); the trailing space separates it from
            # the next chunk
            f.write(" ".join(chunk_values))
            f.write(" ")

            values_written += len(chunk_values)

            # Progress indicator
            progress = (values_written / num_elements) * 100
            if progress % 25 < (100 * chunk_size / num_elements):
                print(f"Progress: {progress:.1f}%")

        f.write("\n")

    # Get file size
    import os
    file_size = os.path.getsize(filename)

    print(f"\n✓ Generated {filename}")
    print(f" Elements: {num_elements:,} (2^{size_power})")
    print(f" File size: {file_size / (1024*1024):.1f} MB")
    print()

if __name__ == "__main__":
    if len(sys.argv) > 1:
        # Custom size from command line
        if len(sys.argv) != 3:
            print("Usage: generate_large_signal.py <size_power> <output_file>")
            print(" Example: generate_large_signal.py 24 signal_2_24.dat")
            print(" or: generate_large_signal.py (generates standard sizes)")
            sys.exit(1)

        size_power = int(sys.argv[1])
        filename = sys.argv[2]
        generate_signal(size_power, filename)
    else:
        # Generate standard test sizes
        import os
        script_dir = os.path.dirname(os.path.abspath(__file__))
        base_dir = os.path.join(script_dir, "../../../data_dirs/cuda/HPC/dwtHaar1D_hpc/data")

        # Create directory if it doesn't exist
        os.makedirs(base_dir, exist_ok=True)

        sizes = [
            (20, f"{base_dir}/signal_2_20.dat"),  # 2^20 = 1,048,576
            (22, f"{base_dir}/signal_2_22.dat"),  # 2^22 = 4,194,304
            (24, f"{base_dir}/signal_2_24.dat"),  # 2^24 = 16,777,216
        ]

        for power, filename in sizes:
            generate_signal(power, filename)

        print("All signal files generated successfully!")
#!/bin/bash
# Download standard graph datasets and generate synthetic graph
#
# Every step is attempted even if an earlier one fails, but the script exits
# non-zero when any of them failed so that callers can react to the failure
# (previously the exit status was always 0).

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
DATA_DIR="$SCRIPT_DIR/../../../data_dirs/cuda/HPC/graph/data/"
mkdir -p "$DATA_DIR" || exit 1

# Newman group of the SuiteSparse Matrix Collection
SUITESPARSE_URL="https://suitesparse-collection-website.herokuapp.com/MM/Newman"

# fetch_newman_graph NAME
# Downloads NAME.tar.gz and leaves NAME.mtx in DATA_DIR. Does nothing when the
# .mtx file is already present. Returns non-zero on download or extraction failure
# and removes the partial archive so that a later run starts clean.
fetch_newman_graph() {
    local name=$1
    local archive="$DATA_DIR/$name.tar.gz"

    if [ -f "$DATA_DIR/$name.mtx" ]; then
        return 0
    fi

    echo "Downloading $name.mtx..."
    if ! wget -O "$archive" "$SUITESPARSE_URL/$name.tar.gz"; then
        rm -f "$archive"
        return 1
    fi

    # Subshell: the cd must not change the working directory of the rest of the script
    if ! (cd "$DATA_DIR" && tar -xzf "$name.tar.gz" && mv "$name/$name.mtx" . \
            && rm -rf "$name" "$name.tar.gz"); then
        rm -rf "$DATA_DIR/$name" "$archive"
        return 1
    fi
}

status=0

# Download karate club (standard benchmark from SuiteSparse Matrix Collection)
fetch_newman_graph karate || status=1

# Download netscience (standard benchmark from SuiteSparse Matrix Collection)
fetch_newman_graph netscience || status=1

# Generate synthetic 100k vertex graph
if [ ! -f "$DATA_DIR/synthetic_100k.mtx" ]; then
    echo "Generating synthetic_100k.mtx..."
    if ! python3 "$SCRIPT_DIR/generate_graph.py" -n 100000 -m 5 -o "$DATA_DIR/synthetic_100k.mtx"; then
        # Do not leave a truncated file behind: it would be mistaken for valid data
        rm -f "$DATA_DIR/synthetic_100k.mtx"
        status=1
    fi
fi

if [ "$status" -ne 0 ]; then
    echo "Error: some graph data could not be prepared in $DATA_DIR" >&2
    exit 1
fi

echo "Graph data ready in $DATA_DIR"
#!/bin/bash
# Link VPI sample data from VPI installation
#
# Each VPI benchmark gets a data/ symlink that points at the sample assets shipped
# with the VPI installation found under /opt/nvidia.

here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
data_root="$here/../../../data_dirs/cuda/HPC"

# Print a message and stop with status 1
fail() {
    echo "$1"
    exit 1
}

# First vpi* directory directly below /opt/nvidia
vpi_root=$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | head -1)
[ -n "$vpi_root" ] || fail "ERROR: VPI installation not found in /opt/nvidia/"

assets_dir="$vpi_root/samples/assets"
[ -d "$assets_dir" ] || fail "ERROR: VPI sample assets not found at $assets_dir"

echo "Linking VPI sample data from $assets_dir..."

# (Re)create <app>/data as a symlink to the shared assets directory
link_app_data() {
    local app_dir="$data_root/$1"

    mkdir -p "$app_dir"
    # Drop whatever is there (old directory or stale link) before linking
    rm -rf "$app_dir/data"
    ln -sf "$assets_dir" "$app_dir/data"
    echo " $1/data -> $assets_dir"
}

vpi_apps=(vpi_background_subtractor vpi_orb_feature_detector vpi_stereo_disparity)
for app in "${vpi_apps[@]}"; do
    link_app_data "$app"
done

echo "VPI data linked successfully"
a/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu b/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu
new file mode 100644
index 000000000..1a651b3e0
--- /dev/null
+++ b/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu
@@ -0,0 +1,305 @@
+/*
+ * Standalone BFS kernel extracted from cugraph test suite
+ * Simplified to run without gtest framework
+ */
+
+// NOTE(review): header names are inferred from the symbols used below;
+// confirm them against the cuGraph/RAFT/RMM versions pinned by the submodule.
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+// Simple graph loader for MTX format
+template <typename vertex_t, typename edge_t>
+struct SimpleGraph {
+    std::vector<edge_t> offsets;
+    std::vector<vertex_t> indices;
+    vertex_t num_vertices;
+    edge_t num_edges;
+};
+
+// Reads a MatrixMarket coordinate file into a host-side CSR graph. Only the
+// first two columns (1-based row/col) of each entry are used, so pattern and
+// weighted files both load. Exits the process on malformed input.
+template <typename vertex_t, typename edge_t>
+SimpleGraph<vertex_t, edge_t> load_mtx_graph(const std::string& filename) {
+    SimpleGraph<vertex_t, edge_t> graph;
+
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file " << filename << std::endl;
+        exit(1);
+    }
+
+    // Check if graph is symmetric
+    bool is_symmetric = false;
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty()) continue;  // blank lines may precede the size line
+        if (line[0] != '%') break;
+        if (line.find("symmetric") != std::string::npos) {
+            is_symmetric = true;
+        }
+    }
+
+    // Read dimensions
+    vertex_t num_rows = 0, num_cols = 0;
+    edge_t num_entries = 0;
+    std::istringstream iss(line);
+    if (!(iss >> num_rows >> num_cols >> num_entries) || num_rows < 0 || num_cols < 0) {
+        std::cerr << "Error: Malformed MTX size line in " << filename << std::endl;
+        exit(1);
+    }
+
+    graph.num_vertices = std::max(num_rows, num_cols);
+
+    // Read edges one line at a time so that a trailing weight column is ignored
+    // instead of being consumed as the vertex ids of the next entry.
+    std::vector<std::pair<vertex_t, vertex_t>> edges;
+    while (std::getline(file, line)) {
+        std::istringstream entry(line);
+        vertex_t src, dst;
+        if (!(entry >> src >> dst)) continue;  // blank or non-numeric line
+        src--; dst--; // MTX is 1-indexed
+        if (src < 0 || src >= graph.num_vertices || dst < 0 || dst >= graph.num_vertices) {
+            std::cerr << "Error: Edge (" << src + 1 << ", " << dst + 1 << ") is out of range in "
+                      << filename << std::endl;
+            exit(1);
+        }
+        edges.push_back({src, dst});
+        // For symmetric graphs, add reverse edge if not a self-loop
+        if (is_symmetric && src != dst) {
+            edges.push_back({dst, src});
+        }
+    }
+
+    graph.num_edges = static_cast<edge_t>(edges.size());
+
+    // Convert to CSR format
+    graph.offsets.resize(graph.num_vertices + 1, 0);
+
+    // Count degree
+    for (const auto& edge : edges) {
+        graph.offsets[edge.first + 1]++;
+    }
+
+    // Prefix sum
+    for (vertex_t i = 0; i < graph.num_vertices; i++) {
+        graph.offsets[i + 1] += graph.offsets[i];
+    }
+
+    graph.indices.resize(edges.size());
+    std::vector<edge_t> current_pos = graph.offsets;
+
+    for (const auto& edge : edges) {
+        graph.indices[current_pos[edge.first]++] = edge.second;
+    }
+
+    file.close();
+    return graph;
+}
+
+int main(int argc, char** argv) {
+    using vertex_t = int32_t;
+    using edge_t = int32_t;
+
+    // Parse command line arguments
+    std::string graph_file = "karate.mtx";
+    vertex_t source = 0;
+    bool use_rmat = false;
+    int rmat_scale = 20;
+    int rmat_edge_factor = 16;
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "--file" && i + 1 < argc) {
+            graph_file = argv[++i];
+        } else if (arg == "--source" && i + 1 < argc) {
+            source = std::atoi(argv[++i]);
+        } else if (arg == "--rmat") {
+            use_rmat = true;
+        } else if (arg == "--scale" && i + 1 < argc) {
+            rmat_scale = std::atoi(argv[++i]);
+        } else if (arg == "--edge-factor" && i + 1 < argc) {
+            rmat_edge_factor = std::atoi(argv[++i]);
+        } else if (arg == "--help" || arg == "-h") {
+            std::cout << "Usage: " << argv[0] << " [options]\n"
+                      << "Options:\n"
+                      << " --file Graph file in MTX format\n"
+                      << " --source Source vertex for BFS (default: 0)\n"
+                      << " --rmat Use RMAT generated graph instead of file\n"
+                      << " --scale RMAT scale parameter (default: 20)\n"
+                      << " --edge-factor RMAT edge factor (default: 16)\n"
+                      << " --help, -h Show this help message\n";
+            return 0;
+        }
+    }
+
+    std::cout << "=== Standalone BFS Kernel ===" << std::endl;
+    std::cout << "Source vertex: " << source << std::endl;
+
+    if (use_rmat) {
+        std::cout << "RMAT generation not supported in standalone version.\n";
+        std::cout << "Please use --file option with an MTX graph file\n";
+        return 1;
+    }
+
+    std::cout << "Loading graph from: " << graph_file << std::endl;
+
+    // Initialize RMM memory pool
+    auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
+    auto pool_mr = std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>>(
+        cuda_mr.get(), 1024 * 1024 * 1024ULL); // 1GB initial pool
+    rmm::mr::set_current_device_resource(pool_mr.get());
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    // Load graph
+    auto h_graph = load_mtx_graph<vertex_t, edge_t>(graph_file);
+    std::cout << "Loaded graph: " << h_graph.num_vertices << " vertices, "
+              << h_graph.num_edges << " edges" << std::endl;
+
+    // Reject a source id that cannot name a vertex of the loaded graph
+    if (source < 0 || source >= h_graph.num_vertices) {
+        std::cerr << "Error: Source vertex " << source << " is outside [0, "
+                  << h_graph.num_vertices << ")" << std::endl;
+        return 1;
+    }
+
+    // Copy edges to device
+    rmm::device_uvector<vertex_t> d_src(h_graph.num_edges, handle.get_stream());
+    rmm::device_uvector<vertex_t> d_dst(h_graph.num_edges, handle.get_stream());
+
+    // Extract source and destination from indices/offsets
+    std::vector<vertex_t> h_src, h_dst;
+    for (vertex_t v = 0; v < h_graph.num_vertices; v++) {
+        for (edge_t e = h_graph.offsets[v]; e < h_graph.offsets[v + 1]; e++) {
+            h_src.push_back(v);
+            h_dst.push_back(h_graph.indices[e]);
+        }
+    }
+
+    RAFT_CUDA_TRY(cudaMemcpy(d_src.data(), h_src.data(), h_src.size() * sizeof(vertex_t),
+                             cudaMemcpyHostToDevice));
+    RAFT_CUDA_TRY(cudaMemcpy(d_dst.data(), h_dst.data(), h_dst.size() * sizeof(vertex_t),
+                             cudaMemcpyHostToDevice));
+
+    std::cout << "Constructing cugraph..." << std::endl;
+    hr_timer.start("Graph construction");
+
+    // Build graph from edge list
+    auto [graph, edge_properties, renumber_map] =
+        cugraph::create_graph_from_edgelist<vertex_t, edge_t, false, false>(
+            handle,
+            std::nullopt, // vertex list
+            std::move(d_src),
+            std::move(d_dst),
+            std::vector<cugraph::arithmetic_device_uvector_t>{}, // no edge properties
+            cugraph::graph_properties_t{true, false}, // undirected, no multi-edges
+            true); // renumber
+
+    auto graph_view = graph.view();
+
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    double graph_time = hr_timer.stop();
+    std::cout << "Graph construction: " << (graph_time * 1000.0) << " ms" << std::endl;
+
+    // The graph was built with renumbering, so ids inside cuGraph differ from the
+    // ids in the input file. Fetch the internal -> original id table to translate
+    // the requested source on the way in and the reported vertices on the way out.
+    std::vector<vertex_t> h_labels(graph_view.number_of_vertices());
+    if (renumber_map) {
+        RAFT_CUDA_TRY(cudaMemcpy(h_labels.data(), renumber_map->data(),
+                                 h_labels.size() * sizeof(vertex_t), cudaMemcpyDeviceToHost));
+    } else {
+        for (size_t v = 0; v < h_labels.size(); v++) h_labels[v] = static_cast<vertex_t>(v);
+    }
+    auto source_it = std::find(h_labels.begin(), h_labels.end(), source);
+    if (source_it == h_labels.end()) {
+        std::cerr << "Error: Source vertex " << source << " is not part of the graph" << std::endl;
+        return 1;
+    }
+    vertex_t const internal_source = static_cast<vertex_t>(source_it - h_labels.begin());
+
+    std::cout << "Running BFS from source " << source << "..." << std::endl;
+
+    // Allocate output
+    rmm::device_uvector<vertex_t> d_distances(graph_view.number_of_vertices(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_predecessors(graph_view.number_of_vertices(), handle.get_stream());
+
+    hr_timer.start("BFS");
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    rmm::device_scalar<vertex_t> const d_source(internal_source, handle.get_stream());
+
+    cugraph::bfs(handle,
+                 graph_view,
+                 d_distances.data(),
+                 d_predecessors.data(),
+                 d_source.data(),
+                 size_t{1},
+                 false, // direction optimizing (false for now)
+                 std::numeric_limits<vertex_t>::max());
+
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    double bfs_time = hr_timer.stop();
+    std::cout << "BFS execution: " << (bfs_time * 1000.0) << " ms" << std::endl;
+
+    // Copy results back
+    std::vector<vertex_t> h_distances(graph_view.number_of_vertices());
+    std::vector<vertex_t> h_predecessors(graph_view.number_of_vertices());
+
+    RAFT_CUDA_TRY(cudaMemcpy(h_distances.data(), d_distances.data(),
+                             h_distances.size() * sizeof(vertex_t), cudaMemcpyDeviceToHost));
+    RAFT_CUDA_TRY(cudaMemcpy(h_predecessors.data(), d_predecessors.data(),
+                             h_predecessors.size() * sizeof(vertex_t), cudaMemcpyDeviceToHost));
+
+    // Print results summary
+    std::cout << "\n=== BFS Results ===" << std::endl;
+    std::cout << "First 10 vertices:" << std::endl;
+    std::cout << "Vertex\tDistance\tPredecessor" << std::endl;
+    for (int i = 0; i < std::min(10, (int)h_distances.size()); i++) {
+        std::cout << h_labels[i] << "\t" << h_distances[i] << "\t\t";
+        if (h_predecessors[i] == cugraph::invalid_vertex_id<vertex_t>::value) {
+            std::cout << "None";
+        } else {
+            std::cout << h_labels[h_predecessors[i]];
+        }
+        std::cout << std::endl;
+    }
+
+    // Count reachable vertices
+    int reachable = 0;
+    for (auto d : h_distances) {
+        if (d != std::numeric_limits<vertex_t>::max()) reachable++;
+    }
+    std::cout << "\nReachable vertices: " << reachable << " / " << h_distances.size() << std::endl;
+
+    std::cout << "\n=== BFS Complete ===" << std::endl;
+
+    return 0;
+}
diff --git a/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt
b/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt
new file mode 100644
index 000000000..088a781ba
--- /dev/null
+++ b/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.18)
+project(mst_standalone CUDA CXX)
+
+find_package(CUDAToolkit REQUIRED)
+
+# Try to find cuGraph installation
+# First check environment variable, then common install locations
+if(DEFINED ENV{CUGRAPH_ROOT})
+    set(cugraph_ROOT $ENV{CUGRAPH_ROOT})
+elseif(EXISTS "$ENV{HOME}/cugraph")
+    set(cugraph_ROOT "$ENV{HOME}/cugraph")
+else()
+    set(cugraph_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../external/cugraph")
+endif()
+
+add_executable(mst_standalone mst_standalone.cu)
+target_compile_features(mst_standalone PRIVATE cxx_std_17)
+target_compile_options(mst_standalone PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+# Add include directories for cuGraph and dependencies
+target_include_directories(mst_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-build/include
+    ${cugraph_ROOT}/cpp/build/_deps/raft-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/raft-build/include
+    ${cugraph_ROOT}/cpp/build/_deps/cuvs-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/cuvs-build/include
+    ${cugraph_ROOT}/cpp/build/_deps/cccl-src/libcudacxx/include
+    ${cugraph_ROOT}/cpp/build/_deps/cccl-src/thrust
+    ${cugraph_ROOT}/cpp/build/_deps/cccl-src/cub
+    ${cugraph_ROOT}/cpp/build/_deps/rapids_logger-src/include
+)
+
+# Link libraries
+target_link_directories(mst_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/build
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-build
+    ${cugraph_ROOT}/cpp/build/_deps/raft-build
+    ${cugraph_ROOT}/cpp/build/_deps/cuvs-build
+    ${cugraph_ROOT}/cpp/build/_deps/rapids_logger-build
+)
+
+target_link_libraries(mst_standalone PRIVATE
+    CUDA::cudart
+    CUDA::cublas
+    cugraph
+    cugraph_c
+    rmm
+    rapids_logger
+)
diff --git a/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu
b/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu
new file mode 100644
index 000000000..97059096b
--- /dev/null
+++ b/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu
@@ -0,0 +1,269 @@
+/*
+ * Standalone MST (Minimum Spanning Tree) kernel
+ * Extracted and simplified from cugraph test suite
+ */
+
+// NOTE(review): header names are inferred from the symbols used below;
+// confirm them against the cuGraph/RAFT/RMM versions pinned by the submodule.
+#include <cugraph/algorithms.hpp>
+#include <cugraph/legacy/graph.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Simple graph loader for MTX format
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct SimpleWeightedGraph {
+    std::vector<vertex_t> row_indices;
+    std::vector<vertex_t> col_indices;
+    std::vector<weight_t> weights;
+    vertex_t num_vertices;
+    edge_t num_edges;
+};
+
+// Reads a MatrixMarket coordinate file into a host-side COO edge list.
+// Pattern files get weight 1.0 per edge; symmetric files get the reverse of
+// every non-self-loop edge added. Exits the process on malformed input.
+template <typename vertex_t, typename edge_t, typename weight_t>
+SimpleWeightedGraph<vertex_t, edge_t, weight_t> load_weighted_mtx_graph(const std::string& filename) {
+    SimpleWeightedGraph<vertex_t, edge_t, weight_t> graph;
+
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file " << filename << std::endl;
+        exit(1);
+    }
+
+    // Skip comments and read header
+    std::string line;
+    bool is_symmetric = false;
+    bool is_pattern = false;
+
+    while (std::getline(file, line)) {
+        if (line.empty()) continue;  // blank lines may precede the size line
+        if (line[0] != '%') break;
+        if (line.find("symmetric") != std::string::npos) {
+            is_symmetric = true;
+        }
+        if (line.find("pattern") != std::string::npos) {
+            is_pattern = true;
+        }
+    }
+
+    // Read dimensions from the first non-comment line
+    vertex_t num_rows = 0, num_cols = 0;
+    edge_t num_entries = 0;
+    std::istringstream iss(line);
+    if (!(iss >> num_rows >> num_cols >> num_entries) || num_rows < 0 || num_cols < 0) {
+        std::cerr << "Error: Malformed MTX size line in " << filename << std::endl;
+        exit(1);
+    }
+
+    graph.num_vertices = std::max(num_rows, num_cols);
+
+    // Read edges one line at a time so that one malformed entry cannot shift
+    // the columns of every entry that follows it.
+    while (std::getline(file, line)) {
+        std::istringstream entry(line);
+        vertex_t src, dst;
+        weight_t weight = 1.0; // Default weight for pattern matrices
+        if (!(entry >> src >> dst)) continue;  // blank or non-numeric line
+        src--; dst--; // MTX is 1-indexed
+
+        if (src < 0 || src >= graph.num_vertices || dst < 0 || dst >= graph.num_vertices) {
+            std::cerr << "Error: Edge (" << src + 1 << ", " << dst + 1 << ") is out of range in "
+                      << filename << std::endl;
+            exit(1);
+        }
+        if (!is_pattern && !(entry >> weight)) {
+            std::cerr << "Error: Missing weight for edge (" << src + 1 << ", " << dst + 1 << ") in "
+                      << filename << std::endl;
+            exit(1);
+        }
+
+        graph.row_indices.push_back(src);
+        graph.col_indices.push_back(dst);
+        graph.weights.push_back(weight);
+
+        // For symmetric graphs, add reverse edge if not a self-loop
+        if (is_symmetric && src != dst) {
+            graph.row_indices.push_back(dst);
+            graph.col_indices.push_back(src);
+            graph.weights.push_back(weight);
+        }
+    }
+
+    graph.num_edges = static_cast<edge_t>(graph.row_indices.size());
+
+    file.close();
+    return graph;
+}
+
+// Convert COO to CSR format
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct CSRGraph {
+    std::vector<edge_t> offsets;
+    std::vector<vertex_t> indices;
+    std::vector<weight_t> weights;
+    vertex_t num_vertices;
+    edge_t num_edges;
+};
+
+template <typename vertex_t, typename edge_t, typename weight_t>
+CSRGraph<vertex_t, edge_t, weight_t> coo_to_csr(const SimpleWeightedGraph<vertex_t, edge_t, weight_t>& coo) {
+    CSRGraph<vertex_t, edge_t, weight_t> csr;
+    csr.num_vertices = coo.num_vertices;
+    csr.num_edges = coo.num_edges;
+
+    // Initialize offsets
+    csr.offsets.resize(csr.num_vertices + 1, 0);
+
+    // Count degree
+    for (const auto& src : coo.row_indices) {
+        csr.offsets[src + 1]++;
+    }
+
+    // Prefix sum
+    for (vertex_t i = 0; i < csr.num_vertices; i++) {
+        csr.offsets[i + 1] += csr.offsets[i];
+    }
+
+    // Fill indices and weights
+    csr.indices.resize(coo.num_edges);
+    csr.weights.resize(coo.num_edges);
+    std::vector<edge_t> current_pos = csr.offsets;
+
+    for (size_t i = 0; i < coo.row_indices.size(); i++) {
+        vertex_t src = coo.row_indices[i];
+        edge_t pos = current_pos[src]++;
+        csr.indices[pos] = coo.col_indices[i];
+        csr.weights[pos] = coo.weights[i];
+    }
+
+    return csr;
+}
+
+int main(int argc, char** argv) {
+    using vertex_t = int32_t;
+    using edge_t = int32_t;
+    using weight_t = float;
+
+    // Parse command line arguments
+    std::string graph_file = "graphs/karate.mtx";
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "--file" && i + 1 < argc) {
+            graph_file = argv[++i];
+        } else if (arg == "--help" || arg == "-h") {
+            std::cout << "Usage: " << argv[0] << " [options]\n"
+                      << "Options:\n"
+                      << " --file Graph file in MTX format (default: graphs/karate.mtx)\n"
+                      << " --help, -h Show this help message\n";
+            return 0;
+        }
+    }
+
+    std::cout << "=== Standalone MST (Minimum Spanning Tree) Kernel ===" << std::endl;
+    std::cout << "Loading graph from: " << graph_file << std::endl;
+
+    // Load graph
+    auto coo_graph = load_weighted_mtx_graph<vertex_t, edge_t, weight_t>(graph_file);
+    std::cout << "Loaded graph: " << coo_graph.num_vertices << " vertices, "
+              << coo_graph.num_edges << " edges" << std::endl;
+
+    // Convert to CSR
+    std::cout << "Converting to CSR format..." << std::endl;
+    auto csr_graph = coo_to_csr(coo_graph);
+
+    // Initialize RMM memory resource
+    rmm::mr::cuda_memory_resource cuda_mr;
+    rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr(
+        &cuda_mr, 512 * 1024 * 1024ULL); // 512 MB pool
+    rmm::mr::set_current_device_resource(&pool_mr);
+
+    // Create RAFT handle
+    raft::handle_t handle;
+
+    // Copy graph to device
+    std::cout << "Copying graph to device..." << std::endl;
+    rmm::device_uvector<edge_t> d_offsets(csr_graph.offsets.size(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_indices(csr_graph.indices.size(), handle.get_stream());
+    rmm::device_uvector<weight_t> d_weights(csr_graph.weights.size(), handle.get_stream());
+
+    raft::update_device(d_offsets.data(), csr_graph.offsets.data(),
+                        csr_graph.offsets.size(), handle.get_stream());
+    raft::update_device(d_indices.data(), csr_graph.indices.data(),
+                        csr_graph.indices.size(), handle.get_stream());
+    raft::update_device(d_weights.data(), csr_graph.weights.data(),
+                        csr_graph.weights.size(), handle.get_stream());
+
+    // Create cuGraph CSR view
+    std::cout << "Constructing cugraph..." << std::endl;
+    cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> graph_view(
+        d_offsets.data(),
+        d_indices.data(),
+        d_weights.data(),
+        csr_graph.num_vertices,
+        csr_graph.num_edges);
+
+    handle.sync_stream();
+
+    // Run MST
+    std::cout << "Running MST algorithm..." << std::endl;
+
+    HighResTimer hr_timer{};
+    hr_timer.start("MST");
+
+    auto mst_edges = cugraph::minimum_spanning_tree(handle, graph_view);
+
+    handle.sync_stream();
+    hr_timer.stop();
+
+    std::cout << "\n=== MST Results ===" << std::endl;
+    hr_timer.display_and_clear(std::cout);
+
+    // Calculate MST weight
+    auto mst_weight = thrust::reduce(
+        thrust::device_pointer_cast(mst_edges->view().edge_data),
+        thrust::device_pointer_cast(mst_edges->view().edge_data) + mst_edges->view().number_of_edges);
+
+    // Sum over every stored edge; for symmetric input the loader stores both
+    // directions, so each undirected edge contributes twice to this total.
+    auto total_weight = thrust::reduce(
+        thrust::device_pointer_cast(d_weights.data()),
+        thrust::device_pointer_cast(d_weights.data()) + csr_graph.num_edges);
+
+    std::cout << "MST edges: " << mst_edges->view().number_of_edges << std::endl;
+    std::cout << "MST total weight: " << mst_weight << std::endl;
+    std::cout << "Original graph total weight: " << total_weight << std::endl;
+    // The ratio is undefined for an empty or zero-weight graph
+    if (total_weight != 0) {
+        std::cout << "MST weight ratio: " << (mst_weight / total_weight * 100.0) << "%" << std::endl;
+    } else {
+        std::cout << "MST weight ratio: n/a (total weight is zero)" << std::endl;
+    }
+    std::cout << "\nExpected MST edges for " << csr_graph.num_vertices
+              << " vertices: " << (csr_graph.num_vertices - 1) << std::endl;
+
+    std::cout << "\n=== MST Complete ===" << std::endl;
+
+    return 0;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp b/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp
new file mode 100644
index 000000000..d05094055
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp
@@ -0,0 +1,232 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "FDTD3d.h"
+
+#include <cstdlib> // NOTE(review): system header names inferred from usage; confirm
+#include "helper_functions.h"
+#include <iomanip>
+#include <iostream>
+#include <math.h>
+
+#include "FDTD3dGPU.h"
+#include "FDTD3dReference.h"
+
+#ifndef CLAMP
+#define CLAMP(a, min, max) (MIN(max, MAX(a, min)))
+#endif
+
+// Forward declarations
+bool runTest(int argc, const char **argv);
+void showHelp(const int argc, const char **argv);
+
+int main(int argc, char **argv)
+{
+    bool bTestResult = false;
+    // Start the log
+    printf("%s Starting...\n\n", argv[0]);
+
+    // Check help flag
+    if (checkCmdLineFlag(argc, (const char **)argv, "help")) {
+        printf("Displaying help on console\n");
+        showHelp(argc, (const char **)argv);
+        bTestResult = true;
+    }
+    else {
+        // Execute
+        bTestResult = runTest(argc, (const char **)argv);
+    }
+
+    // Finish
+    exit(bTestResult ? EXIT_SUCCESS : EXIT_FAILURE);
+}
+
+void showHelp(const int argc, const char **argv)
+{
+    if (argc > 0)
+        std::cout << std::endl << argv[0] << std::endl;
+
+    std::cout << std::endl << "Syntax:" << std::endl;
+    std::cout << std::left;
+    std::cout << " " << std::setw(20) << "--device="
+              << "Specify device to use for execution" << std::endl;
+    std::cout << " " << std::setw(20) << "--dimx="
+              << "Specify number of elements in x direction (excluding halo)" << std::endl;
+    std::cout << " " << std::setw(20) << "--dimy="
+              << "Specify number of elements in y direction (excluding halo)" << std::endl;
+    std::cout << " " << std::setw(20) << "--dimz="
+              << "Specify number of elements in z direction (excluding halo)" << std::endl;
+    std::cout << " " << std::setw(20) << "--radius="
+              << "Specify radius of stencil" << std::endl;
+    std::cout << " " << std::setw(20) << "--timesteps="
+              << "Specify number of timesteps" << std::endl;
+    std::cout << " " << std::setw(20) << "--block-size="
+              << "Specify number of threads per block" << std::endl;
+    std::cout << std::endl;
+    std::cout << " " << std::setw(20) << "--noprompt"
+              << "Skip prompt before exit" << std::endl;
+    std::cout << std::endl;
+}
+
+bool runTest(int argc, const char **argv)
+{
+    float *host_output;
+    float *device_output;
+    float *input;
+    float *coeff;
+
+    int defaultDim;
+    int dimx;
+    int dimy;
+    int dimz;
+    int outerDimx;
+    int outerDimy;
+    int outerDimz;
+    int radius;
+    int timesteps;
+    size_t volumeSize;
+    memsize_t memsize;
+
+    const float lowerBound = 0.0f;
+    const float upperBound = 1.0f;
+
+    // Determine default dimensions
+    printf("Set-up, based upon target device GMEM size...\n");
+    // Get the memory size of the target device
+    printf(" getTargetDeviceGlobalMemSize\n");
+    getTargetDeviceGlobalMemSize(&memsize, argc, argv);
+
+    // We can never use all the memory so to keep things simple we aim to
+    // use around half the total memory
+    memsize /= 2;
+
+    // Most of our memory use is taken up by the input and output buffers -
+    // two buffers of equal size - and for simplicity the volume is a cube:
+    // dim = floor( (N/2)^(1/3) )
+    defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0 / 3.0));
+
+    // By default, make the volume edge size an integer multiple of 128B to
+    // improve performance by coalescing memory accesses, in a real
+    // application it would make sense to pad the lines accordingly
+    int roundTarget = 128 / sizeof(float);
+    defaultDim = defaultDim / roundTarget * roundTarget;
+    defaultDim -= k_radius_default * 2;
+
+    // Check dimension is valid
+    if (defaultDim < k_dim_min) {
+        printf("insufficient device memory (maximum volume on device is %d, must be "
+               "between %d and %d).\n",
+               defaultDim,
+               k_dim_min,
+               k_dim_max);
+        exit(EXIT_FAILURE);
+    }
+    else if (defaultDim > k_dim_max) {
+        defaultDim = k_dim_max;
+    }
+
+    // For QA testing, override default volume size
+    if (checkCmdLineFlag(argc, argv, "qatest")) {
+        defaultDim = MIN(defaultDim, k_dim_qa);
+    }
+
+    // set default dim
+    dimx = defaultDim;
+    dimy = defaultDim;
+    dimz = defaultDim;
+    radius = k_radius_default;
+    timesteps = k_timesteps_default;
+
+    // Parse command line arguments
+    if (checkCmdLineFlag(argc, argv, "dimx")) {
+        dimx = CLAMP(getCmdLineArgumentInt(argc, argv, "dimx"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "dimy")) {
+        dimy = CLAMP(getCmdLineArgumentInt(argc, argv, "dimy"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "dimz")) {
+        dimz = CLAMP(getCmdLineArgumentInt(argc, argv, "dimz"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "radius")) {
+        radius = CLAMP(getCmdLineArgumentInt(argc, argv, "radius"), k_radius_min, k_radius_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "timesteps")) {
+        timesteps = CLAMP(getCmdLineArgumentInt(argc, argv, "timesteps"), k_timesteps_min, k_timesteps_max);
+    }
+
+    // Determine volume size
+    outerDimx = dimx + 2 * radius;
+    outerDimy = dimy + 2 * radius;
+    outerDimz = dimz + 2 * radius;
+    // Widen before multiplying: the element count overflows int for large volumes
+    volumeSize = (size_t)outerDimx * outerDimy * outerDimz;
+
+    // Allocate memory
+    host_output = (float *)calloc(volumeSize, sizeof(float));
+    device_output = (float *)calloc(volumeSize, sizeof(float));
+    input = (float *)malloc(volumeSize * sizeof(float));
+    coeff = (float *)malloc((radius + 1) * sizeof(float));
+    if (host_output == NULL || device_output == NULL || input == NULL || coeff == NULL) {
+        printf("host memory allocation failed\n");
+        exit(EXIT_FAILURE);
+    }
+
+    // Create coefficients
+    for (int i = 0; i <= radius; i++) {
+        coeff[i] = 0.1f;
+    }
+
+    // Generate data
+    printf(" generateRandomData\n\n");
+    generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound);
+    printf("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d "
+           "timesteps...\n\n",
+           dimx,
+           dimy,
+           dimz,
+           radius,
+           timesteps);
+
+    // Execute on the host
+    printf("fdtdReference...\n");
+    fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps);
+    printf("fdtdReference complete\n");
+
+    // Execute on the device
+    printf("fdtdGPU...\n");
+    fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv);
+    printf("fdtdGPU complete\n");
+
+    // Compare the results
+    float tolerance = 0.0001f;
+    printf("\nCompareData (tolerance %f)...\n", tolerance);
+    return compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance);
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3d.h b/src/cuda/HPC/image/FDTD3d/FDTD3d.h
new file mode 100644
index 000000000..f943ab1a7
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3d.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FDTD3D_H_
+#define _FDTD3D_H_
+
+// Lower bound, upper bound and QA cap for each volume dimension (halo
+// excluded). Large dimensions run for a long time; avoid the primary display.
+// NOTE: at k_dim_max the padded volume (8192 + 2 * 4)^3 exceeds INT_MAX, so
+// element counts derived from these values must be computed in size_t.
+#define k_dim_min 96
+#define k_dim_max 8192
+#define k_dim_qa 248
+
+// Note that the radius is defined here as exactly 4 since the
+// kernel code uses a constant. If you want a different radius
+// you must change the kernel accordingly.
+#define k_radius_min 4
+#define k_radius_max 4
+#define k_radius_default 4
+
+// The values are set to give reasonable runtimes, they can
+// be changed but note that running a very large number of
+// timesteps can take a very long time and you should avoid
+// running on your primary display in this case.
+#define k_timesteps_min 1
+#define k_timesteps_max 10
+#define k_timesteps_default 5
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu
new file mode 100644
index 000000000..6dc2caf35
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu
@@ -0,0 +1,260 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm> // NOTE(review): system header names inferred from usage; confirm
+#include "helper_cuda.h"
+#include "helper_functions.h"
+#include <iostream>
+
+#include "FDTD3dGPU.h"
+#include "FDTD3dGPUKernel.cuh"
+
+bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv)
+{
+    int deviceCount = 0;
+    int targetDevice = 0;
+    size_t memsize = 0;
+
+    // Get the number of CUDA enabled GPU devices
+    printf(" cudaGetDeviceCount\n");
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+
+    // Select target device (device 0 by default)
+    targetDevice = findCudaDevice(argc, (const char **)argv);
+
+    // Query target device for maximum memory allocation
+    printf(" cudaGetDeviceProperties\n");
+    struct cudaDeviceProp deviceProp;
+    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, targetDevice));
+
+    memsize = deviceProp.totalGlobalMem;
+
+    // Save the result
+    *result = (memsize_t)memsize;
+    return true;
+}
+
+bool fdtdGPU(float *output,
+             const float *input,
+             const float *coeff,
+             const int dimx,
+             const int dimy,
+             const int dimz,
+             const int radius,
+             const int timesteps,
+             const int argc,
+             const char **argv)
+{
+    const int outerDimx = dimx + 2 * radius;
+    const int outerDimy = dimy + 2 * radius;
+    const int outerDimz = dimz + 2 * radius;
+    const size_t volumeSize = (size_t)outerDimx * outerDimy * outerDimz; // widen first: int product overflows
+    int deviceCount = 0;
+    int targetDevice = 0;
+    float *bufferOut = 0;
+    float *bufferIn = 0;
+    dim3 dimBlock;
+    dim3 dimGrid;
+
+    // Ensure that the inner data starts on a 128B boundary
+    const int padding = (128 / sizeof(float)) - radius;
+    const size_t paddedVolumeSize = volumeSize + padding;
+
+#ifdef GPU_PROFILING
+    cudaEvent_t profileStart = 0;
+    cudaEvent_t profileEnd = 0;
+    const int profileTimesteps = timesteps - 1;
+
+    if (profileTimesteps < 1) {
+        printf(" cannot profile with fewer than two timesteps (timesteps=%d), "
+               "profiling is disabled.\n",
+               timesteps);
+    }
+
+#endif
+
+    // Check the radius is valid
+    if (radius != RADIUS) {
+        printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS);
+        exit(EXIT_FAILURE);
+    }
+
+    // Get the number of CUDA enabled GPU devices
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+
+    // Select target device (device 0 by default)
+    targetDevice = findCudaDevice(argc, (const char **)argv);
+
+    checkCudaErrors(cudaSetDevice(targetDevice));
+
+    // Allocate memory buffers
+    checkCudaErrors(cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float)));
+
+    // Check for a command-line specified block size
+    int userBlockSize;
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "block-size")) {
+        userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size");
+        // Constrain to a multiple of k_blockDimX
+        userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX);
+
+        // Constrain within allowed bounds
+        userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax);
+    }
+    else {
+        userBlockSize = k_blockSizeMax;
+    }
+
+    // Check the device limit on the number of threads
+    struct cudaFuncAttributes funcAttrib;
+    checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel));
+
+    userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock);
+
+    // Set the block size
+    dimBlock.x = k_blockDimX;
+    // Visual Studio 2005 does not like std::min
+    //    dimBlock.y = std::min(userBlockSize / k_blockDimX,
+    //    (size_t)k_blockDimMaxY);
+    dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY) ? (userBlockSize / k_blockDimX)
+                                                                          : (size_t)k_blockDimMaxY;
+    dimGrid.x = (unsigned int)ceil((float)dimx / dimBlock.x);
+    dimGrid.y = (unsigned int)ceil((float)dimy / dimBlock.y);
+    printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
+    printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);
+
+    // Check the block size is valid
+    if (dimBlock.x < RADIUS || dimBlock.y < RADIUS) {
+        printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n", dimBlock.x, dimBlock.y, RADIUS);
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the input to the device input buffer
+    checkCudaErrors(cudaMemcpy(bufferIn + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Copy the input to the device output buffer (actually only need the halo)
+    checkCudaErrors(cudaMemcpy(bufferOut + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Copy the coefficients to the device coefficient buffer
+    checkCudaErrors(cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));
+
+#ifdef GPU_PROFILING
+
+    // Create the events
+    checkCudaErrors(cudaEventCreate(&profileStart));
+    checkCudaErrors(cudaEventCreate(&profileEnd));
+
+#endif
+
+    // Execute the FDTD
+    float *bufferSrc = bufferIn + padding;
+    float *bufferDst = bufferOut + padding;
+    printf(" GPU FDTD loop\n");
+
+#ifdef GPU_PROFILING
+    // Enqueue start event
+    checkCudaErrors(cudaEventRecord(profileStart, 0));
+#endif
+
+    for (int it = 0; it < timesteps; it++) {
+        printf("\tt = %d ", it);
+
+        // Launch the kernel
+        printf("launch kernel\n");
+        FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx, dimy, dimz);
+        checkCudaErrors(cudaGetLastError()); // a rejected launch is reported here
+
+        // Toggle the buffers (manual swap kept from the original sample
+        // in place of std::swap)
+        float *tmp = bufferDst;
+        bufferDst = bufferSrc;
+        bufferSrc = tmp;
+    }
+
+    printf("\n");
+
+#ifdef GPU_PROFILING
+    // Enqueue end event
+    checkCudaErrors(cudaEventRecord(profileEnd, 0));
+#endif
+
+    // Wait for the kernel to complete
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    // Read the result back, result is in bufferSrc (after final toggle)
+    checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float), cudaMemcpyDeviceToHost));
+
+// Report time
+#ifdef GPU_PROFILING
+    float elapsedTimeMS = 0;
+
+    if (profileTimesteps > 0) {
+        checkCudaErrors(cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd));
+    }
+
+    if (profileTimesteps > 0) {
+        // Convert milliseconds to seconds
+        double elapsedTime = elapsedTimeMS * 1.0e-3;
+        double avgElapsedTime = elapsedTime / (double)profileTimesteps;
+        // Determine number of computations per timestep
+        size_t pointsComputed = (size_t)dimx * dimy * dimz;
+        // Determine throughput
+        double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime;
+        printf("FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %zu Points, "
+               "NumDevsUsed = %u, Blocksize = %u\n",
+               throughputM,
+               avgElapsedTime,
+               pointsComputed,
+               1,
+               dimBlock.x * dimBlock.y);
+    }
+
+#endif
+
+    // Cleanup
+    if (bufferIn) {
+        checkCudaErrors(cudaFree(bufferIn));
+    }
+
+    if (bufferOut) {
+        checkCudaErrors(cudaFree(bufferOut));
+    }
+
+#ifdef GPU_PROFILING
+
+    if (profileStart) {
+        checkCudaErrors(cudaEventDestroy(profileStart));
+    }
+
+    if (profileEnd) {
+        checkCudaErrors(cudaEventDestroy(profileEnd));
+    }
+
+#endif
+    return true;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h
new file mode 100644
index 000000000..463f7e1d9
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef _FDTD3DGPU_H_
+#define _FDTD3DGPU_H_
+
+#include <cstddef>
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || (defined(_WIN64) && defined(_MSC_VER))
+typedef unsigned __int64 memsize_t;
+#else
+#include <stdint.h>
+typedef uint64_t memsize_t;
+#endif
+// Block-shape limits shared by the host launcher and the kernel's shared-memory tile
+#define k_blockDimX 32
+#define k_blockDimMaxY 16
+#define k_blockSizeMin 128
+#define k_blockSizeMax (k_blockDimX * k_blockDimMaxY)
+
+bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv);
+bool fdtdGPU(float *output,
+             const float *input,
+             const float *coeff,
+             const int dimx,
+             const int dimy,
+             const int dimz,
+             const int radius,
+             const int timesteps,
+             const int argc,
+             const char **argv);
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh b/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh
new file mode 100644
index 000000000..c6a473559
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh
@@ -0,0 +1,160 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cooperative_groups.h>
+
+#include "FDTD3dGPU.h"
+
+namespace cg = cooperative_groups;
+
+// Note: If you change the RADIUS, you should also change the unrolling below
+#define RADIUS 4
+
+__constant__ float stencil[RADIUS + 1];
+// Each thread owns one (x, y) column and marches through z; the current xy-plane is staged in a __shared__ tile.
+__global__ void
+FiniteDifferencesKernel(float *output, const float *input, const int dimx, const int dimy, const int dimz)
+{
+    bool validr = true; // this thread may read its own column
+    bool validw = true; // this thread may write an output point
+    const int gtidx = blockIdx.x * blockDim.x + threadIdx.x;
+    const int gtidy = blockIdx.y * blockDim.y + threadIdx.y;
+    const int ltidx = threadIdx.x;
+    const int ltidy = threadIdx.y;
+    const int workx = blockDim.x;
+    const int worky = blockDim.y;
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ float tile[k_blockDimMaxY + 2 * RADIUS][k_blockDimX + 2 * RADIUS];
+
+    const int stride_y = dimx + 2 * RADIUS;
+    const int stride_z = stride_y * (dimy + 2 * RADIUS);
+
+    int inputIndex = 0;
+    int outputIndex = 0;
+
+    // Advance inputIndex to start of inner volume
+    inputIndex += RADIUS * stride_y + RADIUS;
+
+    // Advance inputIndex to target element
+    inputIndex += gtidy * stride_y + gtidx;
+
+    float infront[RADIUS];
+    float behind[RADIUS];
+    float current;
+
+    const int tx = ltidx + RADIUS;
+    const int ty = ltidy + RADIUS;
+
+    // Check in bounds
+    if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS))
+        validr = false;
+
+    if ((gtidx >= dimx) || (gtidy >= dimy))
+        validw = false;
+
+    // Preload the "infront" and "behind" data
+    for (int i = RADIUS - 2; i >= 0; i--) {
+        if (validr)
+            behind[i] = input[inputIndex];
+
+        inputIndex += stride_z;
+    }
+
+    if (validr)
+        current = input[inputIndex];
+
+    outputIndex = inputIndex;
+    inputIndex += stride_z;
+
+    for (int i = 0; i < RADIUS; i++) {
+        if (validr)
+            infront[i] = input[inputIndex];
+
+        inputIndex += stride_z;
+    }
+
+// Step through the xy-planes
+#pragma unroll 9
+
+    for (int iz = 0; iz < dimz; iz++) {
+        // Advance the slice (move the thread-front)
+        for (int i = RADIUS - 1; i > 0; i--)
+            behind[i] = behind[i - 1];
+
+        behind[0] = current;
+        current = infront[0];
+#pragma unroll 4
+
+        for (int i = 0; i < RADIUS - 1; i++)
+            infront[i] = infront[i + 1];
+
+        if (validr)
+            infront[RADIUS - 1] = input[inputIndex];
+
+        inputIndex += stride_z;
+        outputIndex += stride_z;
+        cg::sync(cta); // previous plane's tile reads must finish before the tile is overwritten
+
+        // Note that for the work items on the boundary of the problem, the
+        // supplied index when reading the halo (below) may wrap to the
+        // previous/next row or even the previous/next xy-plane. This is
+        // acceptable since a) we disable the output write for these work
+        // items and b) there is at least one xy-plane before/after the
+        // current plane, so the access will be within bounds.
+
+        // Update the data slice in the local tile
+        // Halo above & below
+        if (ltidy < RADIUS) {
+            tile[ltidy][tx] = input[outputIndex - RADIUS * stride_y];
+            tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y];
+        }
+
+        // Halo left & right
+        if (ltidx < RADIUS) {
+            tile[ty][ltidx] = input[outputIndex - RADIUS];
+            tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx];
+        }
+
+        tile[ty][tx] = current;
+        cg::sync(cta); // tile fully populated before any thread reads its neighbours
+
+        // Compute the output value
+        float value = stencil[0] * current;
+#pragma unroll 4
+
+        for (int i = 1; i <= RADIUS; i++) {
+            value += stencil[i]
+                   * (infront[i - 1] + behind[i - 1] + tile[ty - i][tx] + tile[ty + i][tx] + tile[ty][tx - i]
+                      + tile[ty][tx + i]);
+        }
+
+        // Store the output value
+        if (validw)
+            output[outputIndex] = value;
+    }
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp
new file mode 100644
index 000000000..6ece3469c
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "FDTD3dReference.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <stdio.h>
+// Fills dimx*dimy*dimz floats with rand()-derived values in [lowerBound, upperBound]; reseeds with srand(0).
+void generateRandomData(float *data,
+                        const int dimx,
+                        const int dimy,
+                        const int dimz,
+                        const float lowerBound,
+                        const float upperBound)
+{
+    srand(0);
+
+    for (int iz = 0; iz < dimz; iz++) {
+        for (int iy = 0; iy < dimy; iy++) {
+            for (int ix = 0; ix < dimx; ix++) {
+                *data = (float)(lowerBound + ((float)rand() / (float)RAND_MAX) * (upperBound - lowerBound));
+                ++data;
+            }
+        }
+    }
+}
+// Fills dimx*dimy*dimz floats with a ramp along z: every point of plane iz gets the same value.
+void generatePatternData(float *data,
+                         const int dimx,
+                         const int dimy,
+                         const int dimz,
+                         const float lowerBound,
+                         const float upperBound)
+{
+    for (int iz = 0; iz < dimz; iz++) {
+        for (int iy = 0; iy < dimy; iy++) {
+            for (int ix = 0; ix < dimx; ix++) {
+                *data = (float)(lowerBound + ((float)iz / (float)dimz) * (upperBound - lowerBound));
+                ++data;
+            }
+        }
+    }
+}
+// Host reference: applies the stencil for `timesteps` steps; returns false only if the scratch buffer cannot be allocated.
+bool fdtdReference(float *output,
+                   const float *input,
+                   const float *coeff,
+                   const int dimx,
+                   const int dimy,
+                   const int dimz,
+                   const int radius,
+                   const int timesteps)
+{
+    const int outerDimx = dimx + 2 * radius;
+    const int outerDimy = dimy + 2 * radius;
+    const int outerDimz = dimz + 2 * radius;
+    const size_t volumeSize = (size_t)outerDimx * outerDimy * outerDimz; // widen first: product must not be formed in int
+    const int stride_y = outerDimx;
+    const int stride_z = stride_y * outerDimy;
+    float *intermediate = 0;
+    const float *bufsrc = 0;
+    float *bufdst = 0;
+    float *bufdstnext = 0;
+
+    // Allocate temporary buffer
+    printf(" calloc intermediate\n");
+    intermediate = (float *)calloc(volumeSize, sizeof(float));
+    if (intermediate == 0) { printf(" calloc intermediate failed\n"); return false; }
+    // Decide which buffer to use first (result should end up in output)
+    if ((timesteps % 2) == 0) {
+        bufsrc = input;
+        bufdst = intermediate;
+        bufdstnext = output;
+    }
+    else {
+        bufsrc = input;
+        bufdst = output;
+        bufdstnext = intermediate;
+    }
+
+    // Run the FDTD (naive method)
+    printf(" Host FDTD loop\n");
+
+    for (int it = 0; it < timesteps; it++) {
+        printf("\tt = %d\n", it);
+        const float *src = bufsrc;
+        float *dst = bufdst;
+
+        for (int iz = -radius; iz < dimz + radius; iz++) {
+            for (int iy = -radius; iy < dimy + radius; iy++) {
+                for (int ix = -radius; ix < dimx + radius; ix++) {
+                    if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) {
+                        float value = (*src) * coeff[0];
+
+                        for (int ir = 1; ir <= radius; ir++) {
+                            value += coeff[ir] * (*(src + ir) + *(src - ir)); // horizontal
+                            value += coeff[ir] * (*(src + ir * stride_y) + *(src - ir * stride_y)); // vertical
+                            value += coeff[ir] * (*(src + ir * stride_z) + *(src - ir * stride_z)); // in front & behind
+                        }
+
+                        *dst = value;
+                    }
+                    else {
+                        *dst = *src; // halo points are copied through unchanged
+                    }
+
+                    ++dst;
+                    ++src;
+                }
+            }
+        }
+
+        // Rotate buffers
+        float *tmp = bufdst;
+        bufdst = bufdstnext;
+        bufdstnext = tmp;
+        bufsrc = (const float *)tmp;
+    }
+
+    printf("\n");
+
+    if (intermediate)
+        free(intermediate);
+
+    return true;
+}
+// Compares interior points only; returns false (and prints the point) at the first error above tolerance.
+bool compareData(const float *output,
+                 const float *reference,
+                 const int dimx,
+                 const int dimy,
+                 const int dimz,
+                 const int radius,
+                 const float tolerance)
+{
+    for (int iz = -radius; iz < dimz + radius; iz++) {
+        for (int iy = -radius; iy < dimy + radius; iy++) {
+            for (int ix = -radius; ix < dimx + radius; ix++) {
+                if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) {
+                    // Determine the absolute difference
+                    float difference = fabs(*reference - *output);
+                    float error;
+
+                    // Determine the relative error (magnitude of the reference, so negative values cannot hide a mismatch)
+                    if (*reference != 0)
+                        error = difference / fabs(*reference);
+                    else
+                        error = difference;
+
+                    // Check the error is within the tolerance
+                    if (error > tolerance) {
+                        printf("Data error at point (%d,%d,%d)\t%f instead of %f\n", ix, iy, iz, *output, *reference);
+                        return false;
+                    }
+                }
+
+                ++output;
+                ++reference;
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h
new file mode 100644
index 000000000..f39ad62be
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FDTD3DREFERENCE_H_
+#define _FDTD3DREFERENCE_H_
+
+void generateRandomData(float *data, // receives dimx*dimy*dimz values
+                        const int dimx,
+                        const int dimy,
+                        const int dimz,
+                        const float lowerBound,
+                        const float upperBound);
+void generatePatternData(float *data, // receives dimx*dimy*dimz values, constant within each z-plane
+                         const int dimx,
+                         const int dimy,
+                         const int dimz,
+                         const float lowerBound,
+                         const float upperBound);
+bool fdtdReference(float *output, // host reference result, halo included
+                   const float *input,
+                   const float *coeff, // coeff[0..radius] are read
+                   const int dimx,
+                   const int dimy,
+                   const int dimz,
+                   const int radius,
+                   const int timesteps);
+bool compareData(const float *output, // true when every interior point is within tolerance
+                 const float *reference,
+                 const int dimx,
+                 const int dimy,
+                 const int dimz,
+                 const int radius,
+                 const float tolerance = 0.0001f);
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/Makefile b/src/cuda/HPC/image/FDTD3d/Makefile
new file mode 100644
index 000000000..183bc8b4c
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/Makefile
@@ -0,0 +1,8 @@
+EXECUTABLE := FDTD3d_hpc
+CUFILES := FDTD3dGPU.cu
+CCFILES := FDTD3d.cpp FDTD3dReference.cpp
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/image/FDTD3d/exception.h b/src/cuda/HPC/image/FDTD3d/exception.h
new file mode 100644
index 000000000..ca8ac2525
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <stdlib.h>
+#include <string>
+
+//! Exception wrapper.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Always throws ( Located_Exception)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Always throws ( Located_Exception)
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quite heavy-weight but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_cuda.h b/src/cuda/HPC/image/FDTD3d/helper_cuda.h
new file mode 100644
index 000000000..723b28f7a
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_cuda.h
@@ -0,0 +1,988 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char 
*_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + + case CUFFT_MISSING_DEPENDENCY: + return "CUFFT_MISSING_DEPENDENCY"; + + case CUFFT_NVRTC_FAILURE: + return "CUFFT_NVRTC_FAILURE"; + + case CUFFT_NVJITLINK_FAILURE: + return "CUFFT_NVJITLINK_FAILURE"; + + case CUFFT_NVSHMEM_FAILURE: + return "CUFFT_NVSHMEM_FAILURE"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case 
CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case 
#ifdef NVJPEGAPI
// nvJPEG API errors
//
// Looks up the enumerator name of an nvjpegStatus_t value. A value that is
// not in the table produces an empty string.
static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
  struct StatusName {
    nvjpegStatus_t status;
    const char *text;
  };

  static const StatusName kStatusNames[] = {
      {NVJPEG_STATUS_SUCCESS, "NVJPEG_STATUS_SUCCESS"},
      {NVJPEG_STATUS_NOT_INITIALIZED, "NVJPEG_STATUS_NOT_INITIALIZED"},
      {NVJPEG_STATUS_INVALID_PARAMETER, "NVJPEG_STATUS_INVALID_PARAMETER"},
      {NVJPEG_STATUS_BAD_JPEG, "NVJPEG_STATUS_BAD_JPEG"},
      {NVJPEG_STATUS_JPEG_NOT_SUPPORTED, "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"},
      {NVJPEG_STATUS_ALLOCATOR_FAILURE, "NVJPEG_STATUS_ALLOCATOR_FAILURE"},
      {NVJPEG_STATUS_EXECUTION_FAILED, "NVJPEG_STATUS_EXECUTION_FAILED"},
      {NVJPEG_STATUS_ARCH_MISMATCH, "NVJPEG_STATUS_ARCH_MISMATCH"},
      {NVJPEG_STATUS_INTERNAL_ERROR, "NVJPEG_STATUS_INTERNAL_ERROR"},
  };

  const unsigned int entries = sizeof(kStatusNames) / sizeof(kStatusNames[0]);

  for (unsigned int slot = 0; slot < entries; ++slot) {
    if (kStatusNames[slot].status == error) {
      return kStatusNames[slot].text;
    }
  }

  return "";
}
#endif
"NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + 
return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return "NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case 
// Reports a failed CUDA-library call and terminates the process.
//
// result : status returned by the call; any value that converts to false is
//          treated as success and the function returns without output.
// func   : text of the expression that produced the status.
// file   : source file of the call site.
// line   : source line of the call site.
template <typename T>
void check(T result, char const *const func, const char *const file,
           int const line) {
  if (!result) {
    return;
  }

  const unsigned int code = static_cast<unsigned int>(result);
  const char *const name = _cudaGetErrorEnum(result);

  fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
          code, name, func);
  exit(EXIT_FAILURE);
}
#ifndef MAX
// Evaluates to the larger of a and b.
//
// Each argument and the whole expansion are parenthesised so that operands
// containing lower-precedence operators group as written, e.g.
// MAX(x | 1, y) or MAX(c ? 1 : 5, 3). The selected argument is still
// evaluated twice, so do not pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x89, 128}, + {0x90, 128}, + {0xa0, 128}, + {0xa1, 128}, + {0xa3, 128}, + {0xb0, 128}, + {0xc0, 128}, + {0xc1, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." 
+ " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x89, "Ada"}, + {0x90, "Hopper"}, + {0xa0, "Blackwell"}, + {0xa1, "Blackwell"}, + {0xa3, "Blackwell"}, + {0xb0, "Blackwell"}, + {0xc0, "Blackwell"}, + {0xc1, "Blackwell"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. 
<<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + if (computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != 
cudaComputeModeProhibited) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); + cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); + if (result != cudaSuccess) { + // If cudaDevAttrClockRate attribute is not supported we + // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. + if(result == cudaErrorInvalidValue) { + clockRate = 1; + } + else { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, + static_cast(result), _cudaGetErrorEnum(result)); + exit(EXIT_FAILURE); + } + } + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + int major = 0, minor = 0; + 
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + if (integrated && (computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaSetDevice(current_device)); + + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + 
checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/src/cuda/HPC/image/FDTD3d/helper_functions.h b/src/cuda/HPC/image/FDTD3d/helper_functions.h new file mode 100644 index 000000000..f71da5761 --- /dev/null +++ b/src/cuda/HPC/image/FDTD3d/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include "exception.h" +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include "helper_image.h" // helper functions for image compare, dump, data comparisons +#include "helper_string.h" // helper functions for string parsing +#include "helper_timer.h" // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/src/cuda/HPC/image/FDTD3d/helper_image.h b/src/cuda/HPC/image/FDTD3d/helper_image.h new file mode 100644 index 000000000..7e81b0cd8 --- /dev/null +++ b/src/cuda/HPC/image/FDTD3d/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
// Internal helpers for the image load/save routines in this header.
namespace helper_image_internal {
//! Size in bytes of the buffer used to read one PGM/PPM header line
const unsigned int PGMHeaderSize = 0x40;

// types

//! Converts an unsigned byte read from an image file to element type T
template <class T>
struct ConverterFromUByte;

//! Unsigned byte to unsigned byte: the value is passed through unchanged
template <>
struct ConverterFromUByte<unsigned char> {
  //! Conversion operator
  //! @return the input value (the return type stays float so existing
  //!         callers keep compiling; every byte value is exact in float)
  //! @param val value to convert
  float operator()(const unsigned char &val) const {
    return static_cast<unsigned char>(val);
  }
};

//! Unsigned byte to float: maps 0..255 onto 0.0f..1.0f
template <>
struct ConverterFromUByte<float> {
  //! Conversion operator
  //! @return val / 255.0f
  //! @param val value to convert
  float operator()(const unsigned char &val) const {
    return static_cast<float>(val) / 255.0f;
  }
};

//! Converts an element of type T to an unsigned byte for writing to a file
template <class T>
struct ConverterToUByte;

//! Unsigned byte to unsigned byte: the value is passed through unchanged
template <>
struct ConverterToUByte<unsigned char> {
  //! Conversion operator (a pass-through)
  //! @return the input value
  //! @param val value to convert
  unsigned char operator()(const unsigned char &val) const { return val; }
};

//! Float to unsigned byte: maps 0.0f..1.0f onto 0..255, truncating
template <>
struct ConverterToUByte<float> {
  //! Conversion operator
  //! @return val * 255.0f truncated to an integer; inputs at or below 0 and
  //!         NaN give 0, inputs at or above 1 give 255
  //! @param val value to convert
  unsigned char operator()(const float &val) const {
    // Converting an out-of-range or NaN float to an integer type is
    // undefined, so saturate first. !(val > 0) is also true for NaN.
    if (!(val > 0.0f)) {
      return 0;
    }

    if (val >= 1.0f) {
      return 255;
    }

    return static_cast<unsigned char>(val * 255.0f);
  }
};
}  // namespace helper_image_internal
// Writes raw 8-bit pixel data as a binary PGM (1 channel) or PPM
// (3 channels) file.
//
// file     : path of the file to create or overwrite.
// data     : w * h * channels bytes of pixel data; must not be NULL.
// w, h     : image width and height in pixels; both must be non-zero.
// channels : 1 writes a "P5" header, 3 writes a "P6" header.
//
// Returns true when the header and all pixel bytes were written, false when
// the channel count is unsupported, the file cannot be opened, or a write
// fails. An unsupported channel count is rejected before the file is opened,
// so an existing file is not truncated in that case.
inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
                      unsigned int h, unsigned int channels) {
  assert(NULL != data);
  assert(w > 0);
  assert(h > 0);

  if (channels != 1 && channels != 3) {
    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
    return false;
  }

  std::fstream fh(file, std::fstream::out | std::fstream::binary);

  // A failed open sets failbit, not badbit, so bad() cannot detect it.
  if (!fh.is_open()) {
    std::cerr << "__savePPM() : Opening file failed." << std::endl;
    return false;
  }

  fh << (channels == 1 ? "P5\n" : "P6\n");
  fh << w << "\n" << h << "\n" << 0xff << std::endl;

  // Widen before multiplying so large images do not wrap in 32 bits.
  const std::streamsize byte_count =
      static_cast<std::streamsize>(w) * h * channels;
  fh.write(reinterpret_cast<const char *>(data), byte_count);
  fh.flush();

  if (!fh) {
    std::cerr << "__savePPM() : Writing data failed." << std::endl;
    return false;
  }

  fh.close();

  return true;
}
//////////////////////////////////////////////////////////////////////////////
//! Read whitespace-separated numbers from a text file
//! @return true if the file was read, false if it could not be opened, if
//!         memory could not be allocated, or if a caller-supplied buffer
//!         does not match the number of values read
//! @param filename name of the source file
//! @param data  pointer to the destination buffer pointer; when *data is
//!              NULL a buffer is allocated with malloc() and stored here
//!              (release it with free()), otherwise the existing buffer is
//!              filled and must hold exactly *len elements
//! @param len  receives the number of elements read when the buffer is
//!             allocated here; otherwise the expected element count
//! @param verbose  unused, kept for interface compatibility
//! @note Each value is parsed as a float and converted to T. Reading stops
//!       at end of file or at the first token that is not a number.
//////////////////////////////////////////////////////////////////////////////
template <class T>
inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
                        bool verbose) {
  // check input arguments
  assert(NULL != filename);
  assert(NULL != len);
  (void)verbose;

  // intermediate storage for the data read
  std::vector<T> data_read;

  // Plain fopen(), as sdkReadFileBlocks() in this header already uses.
  FILE *fh = fopen(filename, "r");

  if (fh == NULL) {
    printf("Unable to open input file: %s\n", filename);
    return false;
  }

  // Loop on the conversion count instead of feof(): the end-of-file flag is
  // already set after the last value when the file has no trailing
  // whitespace, and it is never set when an unparsable token blocks input.
  float token = 0.0f;

  while (fscanf(fh, "%f", &token) == 1) {
    data_read.push_back(static_cast<T>(token));
  }

  fclose(fh);

  // check if the given handle is already initialized
  if (NULL != *data) {
    if (*len != data_read.size()) {
      std::cerr << "sdkReadFile() : Initialized memory given but "
                << "size mismatch with signal read "
                << "(data read / data init = " << (unsigned int)data_read.size()
                << " / " << *len << ")" << std::endl;

      return false;
    }
  } else {
    // allocate storage for the data read
    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));

    // malloc(0) may legitimately return NULL, so only treat NULL as an
    // error when there is something to store.
    if (NULL == *data && !data_read.empty()) {
      std::cerr << "sdkReadFile() : Memory allocation failed." << std::endl;
      return false;
    }

    // store signal size
    *len = static_cast<unsigned int>(data_read.size());
  }

  // copy data; front() must not be called on an empty vector
  if (!data_read.empty()) {
    memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
  }

  return true;
}
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." 
<< std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? 
true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) { + error_count++; + } + } + + if (threshold == 0.0f) { + if (error_count) { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? 
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
(!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return 
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/src/cuda/HPC/image/FDTD3d/helper_string.h b/src/cuda/HPC/image/FDTD3d/helper_string.h new file mode 100644 index 000000000..39a1b3805 --- /dev/null +++ b/src/cuda/HPC/image/FDTD3d/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if 
(string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // 
up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we 
strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/src/cuda/HPC/image/FDTD3d/helper_timer.h b/src/cuda/HPC/image/FDTD3d/helper_timer.h new file mode 100644 index 000000000..0614a7802 --- /dev/null +++ b/src/cuda/HPC/image/FDTD3d/helper_timer.h @@ -0,0 +1,465 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include "exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! 
Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! 
Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! 
Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? 
true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. 
+//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. 
+//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/Makefile b/src/cuda/HPC/image/dwtHaar1D/Makefile new file mode 100644 index 000000000..b34033e13 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/Makefile @@ -0,0 +1,8 @@ +EXECUTABLE := dwtHaar1D_hpc +CUFILES := dwtHaar1D.cu +CCFILES := +OMIT_CUTIL_LIB := 1 +OMIT_SHRUTIL_LIB := 1 +SM_VERSIONS := 75 80 90 + +include ../../../common/common.mk diff --git a/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu new file mode 100644 index 000000000..9a80481cc --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu @@ -0,0 +1,397 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +* 1D DWT for Haar wavelet and signals with a length which is a power of 2. +* The code reduces bank conflicts and non-coalesced reads / writes as +* appropriate but does not fully remove them because the computational +* overhead to achieve this would outweighs the benefit (see inline comments +* for more details). +* Large signals are subdivided into sub-signals with 512 elements and the +* wavelet transform for these is computed with one block over 10 decomposition +* levels. The resulting signal consisting of the approximation coefficients at +* level X is then processed in a subsequent step on the device. This requires +* interblock synchronization which is only possible on host side. +* Detail coefficients which have been computed are not further referenced +* during the decomposition so that they can be stored directly in their final +* position in global memory. The transform and its storing scheme preserve +* locality in the coefficients so that these writes are coalesced. +* Approximation coefficients are stored in shared memory because they are +* needed to compute the subsequent decomposition step. 
The top most +* approximation coefficient for a sub-signal processed by one block is stored +* in a special global memory location to simplify the processing after the +* interblock synchronization. +* Most books on wavelets explain the Haar wavelet decomposition. A good freely +* available resource is the Wavelet primer by Stollnitz et al. +* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf +* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf +* The basic of all Wavelet transforms is to decompose a signal into +* approximation (a) and detail (d) coefficients where the detail tends to be +* small or zero which allows / simplifies compression. The following "graphs" +* demonstrate the transform for a signal +* of length eight. The index always describes the decomposition level where +* a coefficient arises. The input signal is interpreted as approximation signal +* at level 0. The coefficients computed on the device are stored in the same +* scheme as in the example. This data structure is particularly well suited for +* compression and also preserves the hierarchical structure of the +decomposition. + +------------------------------------------------- +| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | +------------------------------------------------- + +------------------------------------------------- +| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 | +------------------------------------------------- + +------------------------------------------------- +| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 | +------------------------------------------------- + +------------------------------------------------- +| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 | +------------------------------------------------- + +* Host code. 
+*/ + +#ifdef _WIN32 +#define NOMINMAX +#endif + +// includes, system +#include +#include +#include +#include +#include + +// includes, project +#include "helper_cuda.h" +#include "helper_functions.h" + +// constants which are used in host and device code +#define INV_SQRT_2 0.70710678118654752440f; +const unsigned int LOG_NUM_BANKS = 4; +const unsigned int NUM_BANKS = 16; + +//////////////////////////////////////////////////////////////////////////////// +// includes, kernels +#include "dwtHaar1D_kernel.cuh" + +//////////////////////////////////////////////////////////////////////////////// +// declaration, forward +void runTest(int argc, char **argv); +bool getLevels(unsigned int len, unsigned int *levels); + +//////////////////////////////////////////////////////////////////////////////// +// Program main +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + // run test + runTest(argc, argv); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Perform the wavelet decomposition +//////////////////////////////////////////////////////////////////////////////// +void runTest(int argc, char **argv) +{ + bool bResult = false; // flag for final validation of the results + + char *s_fname = NULL, *r_gold_fname = NULL; + char r_fname[256]; + const char usage[] = {"\nUsage:\n" + " dwtHaar1D --signal= --result= " + "[--gold=]\n\n" + " Input file containing the signal\n" + " Output file storing the result of the wavelet " + "decomposition\n" + " (Optional) Input file containing the reference result of the " + "wavelet decomposition\n" + "\nExample:\n" + " ./dwtHaar1D\n" + " --signal=signal.dat\n" + " --result=result.dat\n" + " --gold=regression.gold.dat\n"}; + + printf("%s Starting...\n\n", argv[0]); + + // use command-line specified CUDA device, otherwise use device with highest + // Gflops/s + findCudaDevice(argc, (const char **)argv); + + // file names, either specified as cmd line args or use default + if (argc >= 3) { + char *tmp_sfname, *tmp_rfname, *tmp_goldfname; + + if ((getCmdLineArgumentString(argc, (const char **)argv, "signal", &tmp_sfname) != true) + || (getCmdLineArgumentString(argc, (const char **)argv, "result", &tmp_rfname) != true)) { + fprintf(stderr, "Invalid input syntax.\n%s", usage); + exit(EXIT_FAILURE); + } + + s_fname = sdkFindFilePath(tmp_sfname, argv[0]); + strcpy(r_fname, tmp_rfname); + + // Gold file is optional + if (getCmdLineArgumentString(argc, (const char **)argv, "gold", &tmp_goldfname) == true) { + r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]); + } + } + else { + s_fname = sdkFindFilePath("signal.dat", argv[0]); + r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]); + strcpy(r_fname, "result.dat"); + } + + printf("source file = \"%s\"\n", s_fname); + printf("reference file = \"%s\"\n", r_fname); + if (r_gold_fname != NULL) { + printf("gold file = \"%s\"\n", r_gold_fname); + } else { + printf("gold file = (none - validation skipped)\n"); + } + + // read in 
signal + unsigned int slength = 0; + float *signal = NULL; + + if (s_fname == NULL) { + fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage); + + exit(EXIT_FAILURE); + } + + if (sdkReadFile(s_fname, &signal, &slength, false) == true) { + printf("Reading signal from \"%s\"\n", s_fname); + } + else { + exit(EXIT_FAILURE); + } + + // get the number of decompositions necessary to perform a full decomposition + unsigned int dlevels_complete = 0; + + if (true != getLevels(slength, &dlevels_complete)) { + // error message + fprintf(stderr, "Signal length not supported.\n"); + // cleanup and abort + free(signal); + exit(EXIT_FAILURE); + } + + // device in data + float *d_idata = NULL; + // device out data + float *d_odata = NULL; + // device approx_final data + float *approx_final = NULL; + // The very final approximation coefficient has to be written to the output + // data, all others are reused as input data in the next global step and + // therefore have to be written to the input data again. 
+ // The following flag indicates where to copy approx_final data + // - 0 is input, 1 is output + int approx_is_input; + + // allocate device mem + const unsigned int smem_size = sizeof(float) * slength; + checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size)); + checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size)); + checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size)); + // copy input data to device + checkCudaErrors(cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice)); + + // total number of threads + // in the first decomposition step always one thread computes the average and + // detail signal for one pair of adjacent values + unsigned int num_threads_total_left = slength / 2; + // decomposition levels performed in the current / next step + unsigned int dlevels_step = dlevels_complete; + + // 1D signal so the arrangement of elements is also 1D + dim3 block_size; + dim3 grid_size; + + // number of decomposition levels left after one iteration on the device + unsigned int dlevels_left = dlevels_complete; + + // if less or equal 1k elements, then the data can be processed in one block, + // this avoids the Wait-For-Idle (WFI) on host side which is necessary if the + // computation is split across multiple SM's if enough input data + if (dlevels_complete <= 10) { + // decomposition can be performed at once + block_size.x = num_threads_total_left; + approx_is_input = 0; + } + else { + // 512 threads per block + grid_size.x = (num_threads_total_left / 512); + block_size.x = 512; + + // 512 threads corresponds to 10 decomposition steps + dlevels_step = 10; + dlevels_left -= 10; + + approx_is_input = 1; + } + + // Initialize d_odata to 0.0f + initValue<<>>(d_odata, 0.0f); + + // do until full decomposition is accomplished + while (0 != num_threads_total_left) { + // double the number of threads as bytes + unsigned int mem_shared = (2 * block_size.x) * sizeof(float); + // extra memory requirements to avoid bank conflicts + mem_shared += 
((2 * block_size.x) / NUM_BANKS) * sizeof(float); + + // run kernel + dwtHaar1D<<>>( + d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x); + + // Copy approx_final to appropriate location + if (approx_is_input) { + checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice)); + } + else { + checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * 4, cudaMemcpyDeviceToDevice)); + } + + // update level variables + if (dlevels_left < 10) { + // approx_final = d_odata; + approx_is_input = 0; + } + + // more global steps necessary + dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left; + dlevels_left -= 10; + + // after each step only half the threads are used any longer + // therefore after 10 steps 2^10 less threads + num_threads_total_left = num_threads_total_left >> 10; + + // update block and grid size + grid_size.x = (num_threads_total_left / 512) + (0 != (num_threads_total_left % 512)) ? 1 : 0; + + if (grid_size.x <= 1) { + block_size.x = num_threads_total_left; + } + } + + // get the result back from the server + // allocate mem for the result + float *odata = (float *)malloc(smem_size); + checkCudaErrors(cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost)); + + // post processing + // write file for regression test + if (r_fname == NULL) { + fprintf(stderr, + "Cannot write the output file storing the result of the wavelet " + "decomposition.\n%s", + usage); + exit(EXIT_FAILURE); + } + + if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) == true) { + printf("Writing result to \"%s\"\n", r_fname); + } + else { + exit(EXIT_FAILURE); + } + + // load the reference solution + unsigned int len_reference = 0; + float *reference = NULL; + + // Validation is optional - only run if gold file is provided + if (r_gold_fname != NULL) { + if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) == true) { + printf("Reading reference result from \"%s\"\n", 
r_gold_fname); + } + else { + fprintf(stderr, "Failed to read gold file \"%s\"\n", r_gold_fname); + exit(EXIT_FAILURE); + } + + assert(slength == len_reference); + + // compare the computed solution and the reference + bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f); + free(reference); + } else { + printf("No gold file provided - skipping validation\n"); + bResult = true; // Mark as success when validation is skipped + } + + // free allocated host and device memory + checkCudaErrors(cudaFree(d_odata)); + checkCudaErrors(cudaFree(d_idata)); + checkCudaErrors(cudaFree(approx_final)); + + free(signal); + free(odata); + free(s_fname); + if (r_gold_fname != NULL) { + free(r_gold_fname); + } + + if (r_gold_fname != NULL) { + printf(bResult ? "Test success!\n" : "Test failure!\n"); + } else { + printf("Processing complete (validation skipped)\n"); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Get number of decomposition levels to perform a full decomposition +//! Also check if the input signal size is suitable +//! @return true if the number of decomposition levels could be determined +//! and the signal length is supported by the implementation, +//! otherwise false +//! @param len length of input signal +//! @param levels number of decomposition levels necessary to perform a full +//! 
decomposition +//////////////////////////////////////////////////////////////////////////////// +bool getLevels(unsigned int len, unsigned int *levels) +{ + bool retval = false; + + // currently signals up to a length of 2^20 supported + for (unsigned int i = 0; i < 20; ++i) { + if (len == (1 << i)) { + *levels = i; + retval = true; + break; + } + } + + return retval; +} diff --git a/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh new file mode 100644 index 000000000..3bf1e6588 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh @@ -0,0 +1,248 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* +* 1D DWT for Haar wavelet and signals with a length which is a power of 2. +* The code reduces bank conflicts and non-coalesced reads / writes as +* appropriate but does not fully remove them because the computational +* overhead to achieve this would outweighs the benefit (see inline comments +* for more details). +* Large signals are subdivided into sub-signals with 512 elements and the +* wavelet transform for these is computed with one block over 10 decomposition +* levels. The resulting signal consisting of the approximation coefficients at +* level X is then processed in a subsequent step on the device. This requires +* interblock synchronization which is only possible on host side. +* Detail coefficients which have been computed are not further referenced +* during the decomposition so that they can be stored directly in their final +* position in global memory. The transform and its storing scheme preserve +* locality in the coefficients so that these writes are coalesced. +* Approximation coefficients are stored in shared memory because they are +* needed to compute the subsequent decomposition step. The top most +* approximation coefficient for a sub-signal processed by one block is stored +* in a special global memory location to simplify the processing after the +* interblock synchronization. +* Most books on wavelets explain the Haar wavelet decomposition. 
A good freely available resource is the Wavelet primer by Stollnitz et al.
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
* The basis of all wavelet transforms is to decompose a signal into
* approximation (a) and detail (d) coefficients where the detail tends to be
* small or zero, which allows / simplifies compression. The following "graphs"
* demonstrate the transform for a signal of length eight. The index always
* describes the decomposition level where a coefficient arises. The input
* signal is interpreted as the approximation signal at level 0. The
* coefficients computed on the device are stored in the same scheme as in the
* example. This data structure is particularly well suited for compression and
* also preserves the hierarchical structure of the decomposition.

-------------------------------------------------
| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
-------------------------------------------------

-------------------------------------------------
| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

-------------------------------------------------
| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

-------------------------------------------------
| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
-------------------------------------------------

* Device Code.
*/

#ifndef _DWTHAAR1D_KERNEL_H_
#define _DWTHAAR1D_KERNEL_H_

// Provides cg::thread_block, cg::this_thread_block() and cg::sync().
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

////////////////////////////////////////////////////////////////////////////////
//! Fill a global-memory array with a constant.
//! Every launched thread writes exactly one element, at index
//! blockIdx.x * blockDim.x + threadIdx.x. There is no bounds check, so the
//! launch must not contain more threads than od has elements.
//! @param od     output data (global memory)
//! @param value  value written to every addressed element
////////////////////////////////////////////////////////////////////////////////
__global__ void initValue(float *od, float value)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();
    // position of write into global memory
    unsigned int index = (blockIdx.x * blockDim.x) + threadIdx.x;

    od[index] = value;

    // block-wide barrier after the write
    cg::sync(cta);
}

////////////////////////////////////////////////////////////////////////////////
//! Compute a partial wavelet decomposition on the GPU using the Haar basis.
//! Each thread block loads 2 * bdim consecutive input values and computes
//! dlevels decomposition levels for that sub-signal. The per-block results
//! (one final approximation coefficient per block) still have to be combined
//! in a later step.
//! INV_SQRT_2 and LOG_NUM_BANKS are not defined in this header and must be
//! visible at the point of inclusion.
//! @param id                  input data
//! @param od                  output data (detail coefficients)
//! @param approx_final        place to store the final approximation
//!                            coefficient of each block's sub-signal; only
//!                            written when dlevels > 1
//! @param dlevels             number of decomposition levels for this
//!                            transform
//! @param slength_step_half   half signal length for the current decomposition
//!                            level (offset for storing the first-level detail
//!                            coefficients in global memory)
//! @param bdim                block dimension used for all index computations
//!                            (presumably equal to blockDim.x -- confirm
//!                            against the host launch code)
////////////////////////////////////////////////////////////////////////////////
__global__ void dwtHaar1D(float *id,
                          float *od,
                          float *approx_final,
                          const unsigned int dlevels,
                          const unsigned int slength_step_half,
                          const int bdim)
{
    // Handle to thread block group
    cg::thread_block cta = cg::this_thread_block();

    // dynamically sized shared memory holding this block's part of the signal
    extern __shared__ float shared[];

    // thread runtime environment, 1D parametrization
    const int gdim = gridDim.x;
    // const int bdim = blockDim.x;
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;

    // global thread id (w.r.t. the total data set)
    const int tid_global = (bid * bdim) + tid;
    unsigned int idata = (bid * (2 * bdim)) + tid;

    // read data from global memory: every thread loads two values that are
    // bdim elements apart
    shared[tid] = id[idata];
    shared[tid + bdim] = id[idata + bdim];
    cg::sync(cta);

    // this operation has a two-way bank conflict for all threads, which costs
    // two additional cycles for each warp -- all alternatives to avoid this
    // bank conflict are more expensive than the one cycle introduced by
    // serialization
    float data0 = shared[2 * tid];
    float data1 = shared[(2 * tid) + 1];
    cg::sync(cta);

    // detail coefficient, not further referenced so directly store in
    // global memory
    od[tid_global + slength_step_half] = (data0 - data1) * INV_SQRT_2;

    // offset to avoid bank conflicts
    // see the scan example for a more detailed description
    unsigned int atid = tid + (tid >> LOG_NUM_BANKS);

    // approximation coefficient
    // store in shared memory for further decomposition steps in this global
    // step
    shared[atid] = (data0 + data1) * INV_SQRT_2;

    // all threads have to write their approximation coefficient to shared
    // memory before the next steps can take place
    cg::sync(cta);

    // early out if possible
    // note: barriers in bodies of branches can lead to dead-locks unless
    // the condition evaluates the same way for ALL threads of a block, as in
    // this case (dlevels is a kernel argument)
    if (dlevels > 1) {
        // offset to the second element in shared memory which has to be used
        // for the decomposition, effectively 2^(i - 1)
        unsigned int offset_neighbor = 1;
        // number of active threads per decomposition level
        // identical to the offset for the detail coefficients
        unsigned int num_threads = bdim >> 1;

        // index for the first element of the pair to process
        // the representation is still compact (and therefore still tid * 2)
        // because the first step operated on registers and only the result
        // has been written to shared memory
        unsigned int idata0 = tid * 2;

        // the first level has been handled above, so start at level 1
        for (unsigned int i = 1; i < dlevels; ++i) {
            // Non-coalesced writes occur if the number of active threads
            // becomes less than 16 for a block because the start address for
            // the first block is not always aligned with 64 byte which is
            // necessary for coalesced access. However, the problem only occurs
            // at high levels with only a small number of active threads so
            // that the total number of non-coalesced accesses is rather small
            // and does not justify the computations which are necessary to
            // avoid these uncoalesced writes (this has been tested and
            // verified)
            if (tid < num_threads) {
                // update stride, with each decomposition level the stride
                // grows by a factor of 2
                unsigned int idata1 = idata0 + offset_neighbor;

                // position of write into global memory
                unsigned int g_wpos = (num_threads * gdim) + (bid * num_threads) + tid;

                // compute wavelet decomposition step

                // offset to avoid bank conflicts
                unsigned int c_idata0 = idata0 + (idata0 >> LOG_NUM_BANKS);
                unsigned int c_idata1 = idata1 + (idata1 >> LOG_NUM_BANKS);

                // detail coefficient, not further modified so directly store
                // in global memory
                od[g_wpos] = (shared[c_idata0] - shared[c_idata1]) * INV_SQRT_2;

                // approximation coefficient
                // note that the representation in shared memory becomes rather
                // sparse (with a lot of holes in between) but the storing
                // scheme in global memory guarantees that the common
                // representation (approx, detail_0, detail_1, ...) is achieved
                shared[c_idata0] = (shared[c_idata0] + shared[c_idata1]) * INV_SQRT_2;

                // update storage offset for details
                num_threads = num_threads >> 1; // div 2
                offset_neighbor <<= 1;          // mul 2
                idata0 = idata0 << 1;           // mul 2
            }

            // sync after each decomposition step
            cg::sync(cta);
        }

        // write the top most level element for the next decomposition steps
        // which are performed after an interblock synchronization on host side
        if (0 == tid) {
            approx_final[bid] = shared[0];
        }

    } // end early out if possible
}

#endif // #ifndef _DWTHAAR1D_KERNEL_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/exception.h b/src/cuda/HPC/image/dwtHaar1D/exception.h
new file mode 100644
index 000000000..ca8ac2525
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/exception.h
@@ -0,0 +1,151 @@
/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* CUda UTility Library */ +#ifndef COMMON_EXCEPTION_H_ +#define COMMON_EXCEPTION_H_ + +// includes, system +#include +#include +#include +#include +#include + +//! Exception wrapper. +//! @param Std_Exception Exception out of namespace std for easy typing. +template +class Exception : public Std_Exception { + public: + //! @brief Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const char *detailed = "-"); + + //! Static construction interface + //! @return Alwayss throws ( Located_Exception) + //! @param file file in which the Exception occurs + //! @param line line in which the Exception occurs + //! @param detailed details on the code fragment causing the Exception + static void throw_it(const char *file, const int line, + const std::string &detailed); + + //! Destructor + virtual ~Exception() throw(); + + private: + //! Constructor, default (private) + Exception(); + + //! Constructor, standard + //! @param str string returned by what() + explicit Exception(const std::string &str); +}; + +//////////////////////////////////////////////////////////////////////////////// +//! Exception handler function for arbitrary exceptions +//! 
@param ex exception to handle +//////////////////////////////////////////////////////////////////////////////// +template +inline void handleException(const Exception_Typ &ex) { + std::cerr << ex.what() << std::endl; + + exit(EXIT_FAILURE); +} + +//! Convenience macros + +//! Exception caused by dynamic program behavior, e.g. file does not exist +#define RUNTIME_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Logic exception in program, e.g. an assert failed +#define LOGIC_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//! Out of range exception +#define RANGE_EXCEPTION(msg) \ + Exception::throw_it(__FILE__, __LINE__, msg) + +//////////////////////////////////////////////////////////////////////////////// +//! Implementation + +// includes, system +#include + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const char *detailed) { + std::stringstream s; + + // Quiet heavy-weight but exceptions are not for + // performance / release versions + s << "Exception in file '" << file << "' in line " << line << "\n" + << "Detailed description: " << detailed << "\n"; + + throw Exception(s.str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! Static construction interface. +//! @param Exception causing code fragment (file and line) and detailed infos. +//////////////////////////////////////////////////////////////////////////////// +/*static*/ template +void Exception::throw_it(const char *file, const int line, + const std::string &msg) { + throw_it(file, line, msg.c_str()); +} + +//////////////////////////////////////////////////////////////////////////////// +//! 
Constructor, default (private). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception() : Std_Exception("Unknown Exception.\n") {} + +//////////////////////////////////////////////////////////////////////////////// +//! Constructor, standard (private). +//! String returned by what(). +//////////////////////////////////////////////////////////////////////////////// +template +Exception::Exception(const std::string &s) : Std_Exception(s) {} + +//////////////////////////////////////////////////////////////////////////////// +//! Destructor +//////////////////////////////////////////////////////////////////////////////// +template +Exception::~Exception() throw() {} + + // functions, exported + +#endif // COMMON_EXCEPTION_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h b/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h new file mode 100644 index 000000000..723b28f7a --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h @@ -0,0 +1,988 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +//////////////////////////////////////////////////////////////////////////////// +// These are CUDA Helper functions for initialization and error checking + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#pragma once + +#include +#include +#include +#include + +#include "helper_string.h" + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// Note, it is required that your SDK sample to include the proper header +// files, please refer the CUDA examples for examples of the needed CUDA +// headers, which may change depending on which CUDA functions are used. + +// CUDA Runtime error messages +#ifdef __DRIVER_TYPES_H__ +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} +#endif + +#ifdef CUDA_DRIVER_API +// CUDA Driver API errors +static const char *_cudaGetErrorEnum(CUresult error) { + static char unknown[] = ""; + const char *ret = NULL; + cuGetErrorName(error, &ret); + return ret ? 
ret : unknown; +} +#endif + +#ifdef CUBLAS_API_H_ +// cuBLAS API errors +static const char *_cudaGetErrorEnum(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; + + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; + + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; + } + + return ""; +} +#endif + +#ifdef _CUFFT_H_ +// cuFFT API errors +static const char *_cudaGetErrorEnum(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + + case CUFFT_MISSING_DEPENDENCY: + return "CUFFT_MISSING_DEPENDENCY"; + 
+ case CUFFT_NVRTC_FAILURE: + return "CUFFT_NVRTC_FAILURE"; + + case CUFFT_NVJITLINK_FAILURE: + return "CUFFT_NVJITLINK_FAILURE"; + + case CUFFT_NVSHMEM_FAILURE: + return "CUFFT_NVSHMEM_FAILURE"; + } + + return ""; +} +#endif + +#ifdef CUSPARSEAPI +// cuSPARSE API errors +static const char *_cudaGetErrorEnum(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + } + + return ""; +} +#endif + +#ifdef CUSOLVER_COMMON_H_ +// cuSOLVER API errors +static const char *_cudaGetErrorEnum(cusolverStatus_t error) { + switch (error) { + case CUSOLVER_STATUS_SUCCESS: + return "CUSOLVER_STATUS_SUCCESS"; + case CUSOLVER_STATUS_NOT_INITIALIZED: + return "CUSOLVER_STATUS_NOT_INITIALIZED"; + case CUSOLVER_STATUS_ALLOC_FAILED: + return "CUSOLVER_STATUS_ALLOC_FAILED"; + case CUSOLVER_STATUS_INVALID_VALUE: + return "CUSOLVER_STATUS_INVALID_VALUE"; + case CUSOLVER_STATUS_ARCH_MISMATCH: + return "CUSOLVER_STATUS_ARCH_MISMATCH"; + case CUSOLVER_STATUS_MAPPING_ERROR: + return "CUSOLVER_STATUS_MAPPING_ERROR"; + case CUSOLVER_STATUS_EXECUTION_FAILED: + return "CUSOLVER_STATUS_EXECUTION_FAILED"; + case CUSOLVER_STATUS_INTERNAL_ERROR: + return "CUSOLVER_STATUS_INTERNAL_ERROR"; + case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return 
"CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSOLVER_STATUS_NOT_SUPPORTED: + return "CUSOLVER_STATUS_NOT_SUPPORTED "; + case CUSOLVER_STATUS_ZERO_PIVOT: + return "CUSOLVER_STATUS_ZERO_PIVOT"; + case CUSOLVER_STATUS_INVALID_LICENSE: + return "CUSOLVER_STATUS_INVALID_LICENSE"; + } + + return ""; +} +#endif + +#ifdef CURAND_H_ +// cuRAND API errors +static const char *_cudaGetErrorEnum(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NVJPEGAPI +// nvJPEG API errors +static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { + switch (error) { + case NVJPEG_STATUS_SUCCESS: + return "NVJPEG_STATUS_SUCCESS"; + + case NVJPEG_STATUS_NOT_INITIALIZED: + return "NVJPEG_STATUS_NOT_INITIALIZED"; + + case NVJPEG_STATUS_INVALID_PARAMETER: + return "NVJPEG_STATUS_INVALID_PARAMETER"; + + case NVJPEG_STATUS_BAD_JPEG: + return "NVJPEG_STATUS_BAD_JPEG"; + + case 
NVJPEG_STATUS_JPEG_NOT_SUPPORTED: + return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; + + case NVJPEG_STATUS_ALLOCATOR_FAILURE: + return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; + + case NVJPEG_STATUS_EXECUTION_FAILED: + return "NVJPEG_STATUS_EXECUTION_FAILED"; + + case NVJPEG_STATUS_ARCH_MISMATCH: + return "NVJPEG_STATUS_ARCH_MISMATCH"; + + case NVJPEG_STATUS_INTERNAL_ERROR: + return "NVJPEG_STATUS_INTERNAL_ERROR"; + } + + return ""; +} +#endif + +#ifdef NV_NPPIDEFS_H +// NPP API errors +static const char *_cudaGetErrorEnum(NppStatus error) { + switch (error) { + case NPP_NOT_SUPPORTED_MODE_ERROR: + return "NPP_NOT_SUPPORTED_MODE_ERROR"; + + case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_RESIZE_NO_OPERATION_ERROR: + return "NPP_RESIZE_NO_OPERATION_ERROR"; + + case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: + return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_BAD_ARG_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFF_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECT_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUAD_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEM_ALLOC_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_INPUT: + return "NPP_INVALID_INPUT"; + + case NPP_POINTER_ERROR: + return "NPP_POINTER_ERROR"; + + case NPP_WARNING: + return "NPP_WARNING"; + + case NPP_ODD_ROI_WARNING: + return "NPP_ODD_ROI_WARNING"; +#else + + // These are for CUDA 5.5 or higher + case NPP_BAD_ARGUMENT_ERROR: + return "NPP_BAD_ARGUMENT_ERROR"; + + case NPP_COEFFICIENT_ERROR: + return "NPP_COEFFICIENT_ERROR"; + + case NPP_RECTANGLE_ERROR: + return "NPP_RECTANGLE_ERROR"; + + case NPP_QUADRANGLE_ERROR: + return "NPP_QUADRANGLE_ERROR"; + + case NPP_MEMORY_ALLOCATION_ERR: + return "NPP_MEMORY_ALLOCATION_ERROR"; + + case 
NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: + return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; + + case NPP_INVALID_HOST_POINTER_ERROR: + return "NPP_INVALID_HOST_POINTER_ERROR"; + + case NPP_INVALID_DEVICE_POINTER_ERROR: + return "NPP_INVALID_DEVICE_POINTER_ERROR"; +#endif + + case NPP_LUT_NUMBER_OF_LEVELS_ERROR: + return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; + + case NPP_TEXTURE_BIND_ERROR: + return "NPP_TEXTURE_BIND_ERROR"; + + case NPP_WRONG_INTERSECTION_ROI_ERROR: + return "NPP_WRONG_INTERSECTION_ROI_ERROR"; + + case NPP_NOT_EVEN_STEP_ERROR: + return "NPP_NOT_EVEN_STEP_ERROR"; + + case NPP_INTERPOLATION_ERROR: + return "NPP_INTERPOLATION_ERROR"; + + case NPP_RESIZE_FACTOR_ERROR: + return "NPP_RESIZE_FACTOR_ERROR"; + + case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: + return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 + + case NPP_MEMFREE_ERR: + return "NPP_MEMFREE_ERR"; + + case NPP_MEMSET_ERR: + return "NPP_MEMSET_ERR"; + + case NPP_MEMCPY_ERR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERR: + return "NPP_MIRROR_FLIP_ERR"; +#else + + case NPP_MEMFREE_ERROR: + return "NPP_MEMFREE_ERROR"; + + case NPP_MEMSET_ERROR: + return "NPP_MEMSET_ERROR"; + + case NPP_MEMCPY_ERROR: + return "NPP_MEMCPY_ERROR"; + + case NPP_MIRROR_FLIP_ERROR: + return "NPP_MIRROR_FLIP_ERROR"; +#endif + + case NPP_ALIGNMENT_ERROR: + return "NPP_ALIGNMENT_ERROR"; + + case NPP_STEP_ERROR: + return "NPP_STEP_ERROR"; + + case NPP_SIZE_ERROR: + return "NPP_SIZE_ERROR"; + + case NPP_NULL_POINTER_ERROR: + return "NPP_NULL_POINTER_ERROR"; + + case NPP_CUDA_KERNEL_EXECUTION_ERROR: + return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; + + case NPP_NOT_IMPLEMENTED_ERROR: + return "NPP_NOT_IMPLEMENTED_ERROR"; + + case NPP_ERROR: + return "NPP_ERROR"; + + case NPP_SUCCESS: + return "NPP_SUCCESS"; + + case NPP_WRONG_INTERSECTION_QUAD_WARNING: + return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; + + case NPP_MISALIGNED_DST_ROI_WARNING: + return 
"NPP_MISALIGNED_DST_ROI_WARNING"; + + case NPP_AFFINE_QUAD_INCORRECT_WARNING: + return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; + + case NPP_DOUBLE_SIZE_WARNING: + return "NPP_DOUBLE_SIZE_WARNING"; + + case NPP_WRONG_INTERSECTION_ROI_WARNING: + return "NPP_WRONG_INTERSECTION_ROI_WARNING"; + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 + /* These are 6.0 or higher */ + case NPP_LUT_PALETTE_BITSIZE_ERROR: + return "NPP_LUT_PALETTE_BITSIZE_ERROR"; + + case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: + return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; + + case NPP_QUALITY_INDEX_ERROR: + return "NPP_QUALITY_INDEX_ERROR"; + + case NPP_CHANNEL_ORDER_ERROR: + return "NPP_CHANNEL_ORDER_ERROR"; + + case NPP_ZERO_MASK_VALUE_ERROR: + return "NPP_ZERO_MASK_VALUE_ERROR"; + + case NPP_NUMBER_OF_CHANNELS_ERROR: + return "NPP_NUMBER_OF_CHANNELS_ERROR"; + + case NPP_COI_ERROR: + return "NPP_COI_ERROR"; + + case NPP_DIVISOR_ERROR: + return "NPP_DIVISOR_ERROR"; + + case NPP_CHANNEL_ERROR: + return "NPP_CHANNEL_ERROR"; + + case NPP_STRIDE_ERROR: + return "NPP_STRIDE_ERROR"; + + case NPP_ANCHOR_ERROR: + return "NPP_ANCHOR_ERROR"; + + case NPP_MASK_SIZE_ERROR: + return "NPP_MASK_SIZE_ERROR"; + + case NPP_MOMENT_00_ZERO_ERROR: + return "NPP_MOMENT_00_ZERO_ERROR"; + + case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: + return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; + + case NPP_THRESHOLD_ERROR: + return "NPP_THRESHOLD_ERROR"; + + case NPP_CONTEXT_MATCH_ERROR: + return "NPP_CONTEXT_MATCH_ERROR"; + + case NPP_FFT_FLAG_ERROR: + return "NPP_FFT_FLAG_ERROR"; + + case NPP_FFT_ORDER_ERROR: + return "NPP_FFT_ORDER_ERROR"; + + case NPP_SCALE_RANGE_ERROR: + return "NPP_SCALE_RANGE_ERROR"; + + case NPP_DATA_TYPE_ERROR: + return "NPP_DATA_TYPE_ERROR"; + + case NPP_OUT_OFF_RANGE_ERROR: + return "NPP_OUT_OFF_RANGE_ERROR"; + + case NPP_DIVIDE_BY_ZERO_ERROR: + return "NPP_DIVIDE_BY_ZERO_ERROR"; + + case NPP_RANGE_ERROR: + return "NPP_RANGE_ERROR"; + + case NPP_NO_MEMORY_ERROR: + return "NPP_NO_MEMORY_ERROR"; + + 
case NPP_ERROR_RESERVED: + return "NPP_ERROR_RESERVED"; + + case NPP_NO_OPERATION_WARNING: + return "NPP_NO_OPERATION_WARNING"; + + case NPP_DIVIDE_BY_ZERO_WARNING: + return "NPP_DIVIDE_BY_ZERO_WARNING"; +#endif + +#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 + /* These are 7.0 or higher */ + case NPP_OVERFLOW_ERROR: + return "NPP_OVERFLOW_ERROR"; + + case NPP_CORRUPTED_DATA_ERROR: + return "NPP_CORRUPTED_DATA_ERROR"; +#endif + } + + return ""; +} +#endif + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#ifdef __DRIVER_TYPES_H__ +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. 
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#endif + +#ifndef MAX +#define MAX(a, b) (a > b ? a : b) +#endif + +// Float To Int conversion +inline int ftoi(float value) { + return (value >= 0 ? static_cast(value + 0.5) + : static_cast(value - 0.5)); +} + +// Beginning of GPU Architecture definitions +inline int _ConvertSMVer2Cores(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + int Cores; + } sSMtoCores; + + sSMtoCores nGpuArchCoresPerSM[] = { + {0x30, 192}, + {0x32, 192}, + {0x35, 192}, + {0x37, 192}, + {0x50, 128}, + {0x52, 128}, + {0x53, 128}, + {0x60, 64}, + {0x61, 128}, + {0x62, 128}, + {0x70, 64}, + {0x72, 64}, + {0x75, 64}, + {0x80, 64}, + {0x86, 128}, + {0x87, 128}, + {0x89, 128}, + {0x90, 128}, + {0xa0, 128}, + {0xa1, 128}, + {0xa3, 128}, + {0xb0, 128}, + {0xc0, 128}, + {0xc1, 128}, + {-1, -1}}; + + int index = 0; + + while (nGpuArchCoresPerSM[index].SM != -1) { + if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { + return nGpuArchCoresPerSM[index].Cores; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoCores for SM %d.%d is undefined." 
+ " Default to use %d Cores/SM\n", + major, minor, nGpuArchCoresPerSM[index - 1].Cores); + return nGpuArchCoresPerSM[index - 1].Cores; +} + +inline const char* _ConvertSMVer2ArchName(int major, int minor) { + // Defines for GPU Architecture types (using the SM version to determine + // the GPU Arch name) + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, + // and m = SM minor version + const char* name; + } sSMtoArchName; + + sSMtoArchName nGpuArchNameSM[] = { + {0x30, "Kepler"}, + {0x32, "Kepler"}, + {0x35, "Kepler"}, + {0x37, "Kepler"}, + {0x50, "Maxwell"}, + {0x52, "Maxwell"}, + {0x53, "Maxwell"}, + {0x60, "Pascal"}, + {0x61, "Pascal"}, + {0x62, "Pascal"}, + {0x70, "Volta"}, + {0x72, "Xavier"}, + {0x75, "Turing"}, + {0x80, "Ampere"}, + {0x86, "Ampere"}, + {0x87, "Ampere"}, + {0x89, "Ada"}, + {0x90, "Hopper"}, + {0xa0, "Blackwell"}, + {0xa1, "Blackwell"}, + {0xa3, "Blackwell"}, + {0xb0, "Blackwell"}, + {0xc0, "Blackwell"}, + {0xc1, "Blackwell"}, + {-1, "Graphics Device"}}; + + int index = 0; + + while (nGpuArchNameSM[index].SM != -1) { + if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { + return nGpuArchNameSM[index].name; + } + + index++; + } + + // If we don't find the values, we default use the previous one + // to run properly + printf( + "MapSMtoArchName for SM %d.%d is undefined." + " Default to use %s\n", + major, minor, nGpuArchNameSM[index - 1].name); + return nGpuArchNameSM[index - 1].name; +} + // end of GPU Architecture definitions + +#ifdef __CUDA_RUNTIME_H__ +// General GPU Device CUDA Initialization +inline int gpuDeviceInit(int devID) { + int device_count; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuDeviceInit() CUDA error: " + "no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + if (devID < 0) { + devID = 0; + } + + if (devID > device_count - 1) { + fprintf(stderr, "\n"); + fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. 
<<\n", + device_count); + fprintf(stderr, + ">> gpuDeviceInit (-device=%d) is not a valid" + " GPU device. <<\n", + devID); + fprintf(stderr, "\n"); + return -devID; + } + + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + if (computeMode == cudaComputeModeProhibited) { + fprintf(stderr, + "Error: device is running in , no threads can use cudaSetDevice().\n"); + return -1; + } + + if (major < 1) { + fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); + exit(EXIT_FAILURE); + } + + checkCudaErrors(cudaSetDevice(devID)); + printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); + + return devID; +} + +// This function returns the best GPU (with maximum GFLOPS) +inline int gpuGetMaxGflopsDeviceId() { + int current_device = 0, sm_per_multiproc = 0; + int max_perf_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + uint64_t max_compute_perf = 0; + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the best CUDA capable GPU device + current_device = 0; + + while (current_device < device_count) { + int computeMode = -1, major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + + // If this GPU is not running on Compute Mode prohibited, + // then we can add it to the list + if (computeMode != 
cudaComputeModeProhibited) { + if (major == 9999 && minor == 9999) { + sm_per_multiproc = 1; + } else { + sm_per_multiproc = + _ConvertSMVer2Cores(major, minor); + } + int multiProcessorCount = 0, clockRate = 0; + checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); + cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device); + if (result != cudaSuccess) { + // If cudaDevAttrClockRate attribute is not supported we + // set clockRate as 1, to consider GPU with most SMs and CUDA Cores. + if(result == cudaErrorInvalidValue) { + clockRate = 1; + } + else { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__, + static_cast(result), _cudaGetErrorEnum(result)); + exit(EXIT_FAILURE); + } + } + uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; + + if (compute_perf > max_compute_perf) { + max_compute_perf = compute_perf; + max_perf_device = current_device; + } + } else { + devices_prohibited++; + } + + ++current_device; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "gpuGetMaxGflopsDeviceId() CUDA error:" + " all devices have compute mode prohibited.\n"); + exit(EXIT_FAILURE); + } + + return max_perf_device; +} + +// Initialization code to find the best CUDA Device +inline int findCudaDevice(int argc, const char **argv) { + int devID = 0; + + // If the command-line has a device number specified, use it + if (checkCmdLineFlag(argc, argv, "device")) { + devID = getCmdLineArgumentInt(argc, argv, "device="); + + if (devID < 0) { + printf("Invalid command line parameter\n "); + exit(EXIT_FAILURE); + } else { + devID = gpuDeviceInit(devID); + + if (devID < 0) { + printf("exiting...\n"); + exit(EXIT_FAILURE); + } + } + } else { + // Otherwise pick the device with highest Gflops/s + devID = gpuGetMaxGflopsDeviceId(); + checkCudaErrors(cudaSetDevice(devID)); + int major = 0, minor = 0; + 
checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + devID, _ConvertSMVer2ArchName(major, minor), major, minor); + + } + + return devID; +} + +inline int findIntegratedGPU() { + int current_device = 0; + int device_count = 0; + int devices_prohibited = 0; + + checkCudaErrors(cudaGetDeviceCount(&device_count)); + + if (device_count == 0) { + fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); + exit(EXIT_FAILURE); + } + + // Find the integrated GPU which is compute capable + while (current_device < device_count) { + int computeMode = -1, integrated = -1; + checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); + // If GPU is integrated and is not running on Compute Mode prohibited, + // then cuda can map to GLES resource + if (integrated && (computeMode != cudaComputeModeProhibited)) { + checkCudaErrors(cudaSetDevice(current_device)); + + int major = 0, minor = 0; + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); + printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", + current_device, _ConvertSMVer2ArchName(major, minor), major, minor); + + return current_device; + } else { + devices_prohibited++; + } + + current_device++; + } + + if (devices_prohibited == device_count) { + fprintf(stderr, + "CUDA error:" + " No GLES-CUDA Interop capable GPU found.\n"); + exit(EXIT_FAILURE); + } + + return -1; +} + +// General check for CUDA GPU SM Capabilities +inline bool checkCudaCapabilities(int major_version, int minor_version) { + int dev; + int major = 0, minor = 0; + + 
checkCudaErrors(cudaGetDevice(&dev)); + checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); + checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); + + if ((major > major_version) || + (major == major_version && + minor >= minor_version)) { + printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, + _ConvertSMVer2ArchName(major, minor), major, minor); + return true; + } else { + printf( + " No GPU device was found that can support " + "CUDA compute capability %d.%d.\n", + major_version, minor_version); + return false; + } +} +#endif + + // end of CUDA Helper Functions + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_functions.h b/src/cuda/HPC/image/dwtHaar1D/helper_functions.h new file mode 100644 index 000000000..f71da5761 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/helper_functions.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (string parsing, +// timers, image helpers, etc) +#ifndef COMMON_HELPER_FUNCTIONS_H_ +#define COMMON_HELPER_FUNCTIONS_H_ + +#ifdef WIN32 +#pragma warning(disable : 4996) +#endif + +// includes, project +#include +#include "exception.h" +#include +#include +#include + +#include +#include +#include +#include +#include + +// includes, timer, string parsing, image helpers +#include "helper_image.h" // helper functions for image compare, dump, data comparisons +#include "helper_string.h" // helper functions for string parsing +#include "helper_timer.h" // helper functions for timers + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#endif // COMMON_HELPER_FUNCTIONS_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_image.h b/src/cuda/HPC/image/dwtHaar1D/helper_image.h new file mode 100644 index 000000000..7e81b0cd8 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include "exception.h" +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include "helper_string.h" + +// namespace unnamed (internal) +namespace helper_image_internal { +//! size of PGM file header +const unsigned int PGMHeaderSize = 0x40; + +// types + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterFromUByte; + +//! 
Data converter from unsigned char / unsigned byte +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val); + } +}; + +//! Data converter from unsigned char / unsigned byte to float +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val) / 255.0f; + } +}; + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterToUByte; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char &val) { return val; } +}; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator + //! @return converted value + //! 
@param val value to convert + unsigned char operator()(const float &val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, + unsigned int *h, unsigned int *channels) { + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width = 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') { + continue; + } + + if (i == 0) { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } else if (i == 1) { + i += SSCANF(header, "%u %u", &height, 
&maxval); + } else if (i == 2) { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } else { + *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == + 0) { + std::cerr << "__LoadPPM() read data returned error." << std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) { + return false; + } + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) { + *data = reinterpret_cast(malloc(sizeof(T) * size)); + } + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + 
assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." << std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char *file, T *data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char *ndata = + (unsigned char *)malloc(sizeof(unsigned char) * size * 3); + unsigned char *ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE *fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) { + if (*len != data_read.size()) { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } else { + // allocate storage for the data read + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." 
<< std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? 
true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) { + error_count++; + } + } + + if (threshold == 0.0f) { + if (error_count) { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? 
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
(!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return 
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_string.h b/src/cuda/HPC/image/dwtHaar1D/helper_string.h new file mode 100644 index 000000000..39a1b3805 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/helper_string.h @@ -0,0 +1,428 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// These are helper functions for the SDK samples (string parsing, timers, etc) +#ifndef COMMON_HELPER_STRING_H_ +#define COMMON_HELPER_STRING_H_ + +#include +#include +#include +#include + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef _CRT_SECURE_NO_DEPRECATE +#define _CRT_SECURE_NO_DEPRECATE +#endif +#ifndef STRCASECMP +#define STRCASECMP _stricmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP _strnicmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#ifndef SPRINTF +#define SPRINTF sprintf_s +#endif +#else // Linux Includes +#include +#include + +#ifndef STRCASECMP +#define STRCASECMP strcasecmp +#endif +#ifndef STRNCASECMP +#define STRNCASECMP strncasecmp +#endif +#ifndef STRCPY +#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) +#endif + +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#ifndef SPRINTF +#define SPRINTF sprintf +#endif +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// CUDA Utility Helper Functions +inline int stringRemoveDelimiter(char delimiter, const char *string) { + int string_start = 0; + + while (string[string_start] == delimiter) { + string_start++; + } + + if (string_start >= static_cast(strlen(string) - 1)) { + return 0; + } + + return string_start; +} + +inline int getFileExtension(char *filename, char **extension) { + int string_length = static_cast(strlen(filename)); + + while (filename[string_length--] != '.') { + if (string_length == 0) break; + } + + if (string_length > 0) string_length += 2; + + if 
(string_length == 0) + *extension = NULL; + else + *extension = &filename[string_length]; + + return string_length; +} + +inline bool checkCmdLineFlag(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + + const char *equal_pos = strchr(string_argv, '='); + int argv_length = static_cast( + equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv); + + int length = static_cast(strlen(string_ref)); + + if (length == argv_length && + !STRNCASECMP(string_argv, string_ref, length)) { + bFound = true; + continue; + } + } + } + + return bFound; +} + +// This function wraps the CUDA Driver API into a template function +template +inline bool getCmdLineArgumentValue(const int argc, const char **argv, + const char *string_ref, T *value) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + *value = (T)atoi(&string_argv[length + auto_inc]); + } + + bFound = true; + i = argc; + } + } + } + + return bFound; +} + +inline int getCmdLineArgumentInt(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + int value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 
1 : 0; + value = atoi(&string_argv[length + auto_inc]); + } else { + value = 0; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline float getCmdLineArgumentFloat(const int argc, const char **argv, + const char *string_ref) { + bool bFound = false; + float value = -1; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + const char *string_argv = &argv[i][string_start]; + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + if (length + 1 <= static_cast(strlen(string_argv))) { + int auto_inc = (string_argv[length] == '=') ? 1 : 0; + value = static_cast(atof(&string_argv[length + auto_inc])); + } else { + value = 0.f; + } + + bFound = true; + continue; + } + } + } + + if (bFound) { + return value; + } else { + return 0; + } +} + +inline bool getCmdLineArgumentString(const int argc, const char **argv, + const char *string_ref, + char **string_retval) { + bool bFound = false; + + if (argc >= 1) { + for (int i = 1; i < argc; i++) { + int string_start = stringRemoveDelimiter('-', argv[i]); + char *string_argv = const_cast(&argv[i][string_start]); + int length = static_cast(strlen(string_ref)); + + if (!STRNCASECMP(string_argv, string_ref, length)) { + *string_retval = &string_argv[length + 1]; + bFound = true; + continue; + } + } + } + + if (!bFound) { + *string_retval = NULL; + } + + return bFound; +} + +////////////////////////////////////////////////////////////////////////////// +//! Find the path for a file assuming that +//! files are found in the searchPath. +//! +//! @return the path if succeeded, otherwise 0 +//! @param filename name of the file +//! 
@param executable_path optional absolute path of the executable +////////////////////////////////////////////////////////////////////////////// +inline char *sdkFindFilePath(const char *filename, + const char *executable_path) { + // defines a variable that is replaced with the name of the + // executable + + // Typical relative search paths to locate needed companion files (e.g. sample + // input data, or JIT source files) The origin for the relative search may be + // the .exe file, a .bat file launching an .exe, a browser .exe launching the + // .exe or .bat, etc + const char *searchPath[] = { + "./", // same dir + "./data/", // same dir + + "../../../../Samples//", // up 4 in tree + "../../../Samples//", // up 3 in tree + "../../Samples//", // up 2 in tree + + "../../../../Samples//data/", // up 4 in tree + "../../../Samples//data/", // up 3 in tree + "../../Samples//data/", // up 2 in tree + + "../../../../Samples/0_Introduction//", // up 4 in tree + "../../../Samples/0_Introduction//", // up 3 in tree + "../../Samples/0_Introduction//", // up 2 in tree + + "../../../../Samples/1_Utilities//", // up 4 in tree + "../../../Samples/1_Utilities//", // up 3 in tree + "../../Samples/1_Utilities//", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//", // up 4 in tree + "../../../Samples/3_CUDA_Features//", // up 3 in tree + "../../Samples/3_CUDA_Features//", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//", // up 3 in tree + "../../Samples/4_CUDA_Libraries//", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//", // up 4 in tree + "../../../Samples/5_Domain_Specific//", // up 3 in tree + "../../Samples/5_Domain_Specific//", // up 2 in tree + + "../../../../Samples/6_Performance//", // 
up 4 in tree + "../../../Samples/6_Performance//", // up 3 in tree + "../../Samples/6_Performance//", // up 2 in tree + + "../../../../Samples/0_Introduction//data/", // up 4 in tree + "../../../Samples/0_Introduction//data/", // up 3 in tree + "../../Samples/0_Introduction//data/", // up 2 in tree + + "../../../../Samples/1_Utilities//data/", // up 4 in tree + "../../../Samples/1_Utilities//data/", // up 3 in tree + "../../Samples/1_Utilities//data/", // up 2 in tree + + "../../../../Samples/2_Concepts_and_Techniques//data/", // up 4 in tree + "../../../Samples/2_Concepts_and_Techniques//data/", // up 3 in tree + "../../Samples/2_Concepts_and_Techniques//data/", // up 2 in tree + + "../../../../Samples/3_CUDA_Features//data/", // up 4 in tree + "../../../Samples/3_CUDA_Features//data/", // up 3 in tree + "../../Samples/3_CUDA_Features//data/", // up 2 in tree + + "../../../../Samples/4_CUDA_Libraries//data/", // up 4 in tree + "../../../Samples/4_CUDA_Libraries//data/", // up 3 in tree + "../../Samples/4_CUDA_Libraries//data/", // up 2 in tree + + "../../../../Samples/5_Domain_Specific//data/", // up 4 in tree + "../../../Samples/5_Domain_Specific//data/", // up 3 in tree + "../../Samples/5_Domain_Specific//data/", // up 2 in tree + + "../../../../Samples/6_Performance//data/", // up 4 in tree + "../../../Samples/6_Performance//data/", // up 3 in tree + "../../Samples/6_Performance//data/", // up 2 in tree + + "../../../../Common/data/", // up 4 in tree + "../../../Common/data/", // up 3 in tree + "../../Common/data/" // up 2 in tree + }; + + // Extract the executable name + std::string executable_name; + + if (executable_path != 0) { + executable_name = std::string(executable_path); + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + // Windows path delimiter + size_t delimiter_pos = executable_name.find_last_of('\\'); + executable_name.erase(0, delimiter_pos + 1); + + if (executable_name.rfind(".exe") != std::string::npos) { + // we 
strip .exe, only if the .exe is found + executable_name.resize(executable_name.size() - 4); + } + +#else + // Linux & OSX path delimiter + size_t delimiter_pos = executable_name.find_last_of('/'); + executable_name.erase(0, delimiter_pos + 1); +#endif + } + + // Loop over all search paths and return the first hit + for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { + std::string path(searchPath[i]); + size_t executable_name_pos = path.find(""); + + // If there is executable_name variable in the searchPath + // replace it with the value + if (executable_name_pos != std::string::npos) { + if (executable_path != 0) { + path.replace(executable_name_pos, strlen(""), + executable_name); + } else { + // Skip this path entry if no executable argument is given + continue; + } + } + +#ifdef _DEBUG + printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); +#endif + + // Test if the file exists + path.append(filename); + FILE *fp; + FOPEN(fp, path.c_str(), "rb"); + + if (fp != NULL) { + fclose(fp); + // File found + // returning an allocated array here for backwards compatibility reasons + char *file_path = reinterpret_cast(malloc(path.length() + 1)); + STRCPY(file_path, path.length() + 1, path.c_str()); + return file_path; + } + + if (fp) { + fclose(fp); + } + } + + // File not found + printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename); + return 0; +} + +#endif // COMMON_HELPER_STRING_H_ diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_timer.h b/src/cuda/HPC/image/dwtHaar1D/helper_timer.h new file mode 100644 index 000000000..0614a7802 --- /dev/null +++ b/src/cuda/HPC/image/dwtHaar1D/helper_timer.h @@ -0,0 +1,465 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include "exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! 
Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! 
Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! 
Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a time has been created, otherwise false +//! @param name of the new timer, 0 if the creation failed +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? 
true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return true if a time has been deleted, otherwise false +//! @param name of the timer to delete +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the time with name \a name +//! @param name name of the timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the time with name \a name. Does not reset. +//! @param name name of the timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param name name of the timer to reset. 
+//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer dividied by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param name name of the timer to return the time of +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param name name of the timer to obtain the value of. 
+//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/src/cuda/HPC/image/recursiveGaussian/Makefile b/src/cuda/HPC/image/recursiveGaussian/Makefile new file mode 100644 index 000000000..526fc27c9 --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/Makefile @@ -0,0 +1,12 @@ +EXECUTABLE := recursiveGaussian_hpc +CUFILES := recursiveGaussian_cuda.cu +CCFILES := recursiveGaussian.cpp +OMIT_CUTIL_LIB := 1 +OMIT_SHRUTIL_LIB := 1 +SM_VERSIONS := 75 80 90 + +# Build in benchmark mode without OpenGL +NVCCFLAGS += -DNO_OPENGL +CXXFLAGS += -DNO_OPENGL + +include ../../../common/common.mk diff --git a/src/cuda/HPC/image/recursiveGaussian/exception.h b/src/cuda/HPC/image/recursiveGaussian/exception.h new file mode 100644 index 000000000..ca8ac2525 --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/exception.h @@ -0,0 +1,151 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper that prefixes what() with the throwing file and line.
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Always throws (an Exception<Std_Exception>); never returns
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Always throws (an Exception<Std_Exception>); never returns
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Exception handler function for arbitrary exceptions:
+//! prints ex.what() to std::cerr and terminates with EXIT_FAILURE
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what() << std::endl;
+
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface.
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  std::stringstream s;
+
+  // Quite heavy-weight but exceptions are not for
+  // performance / release versions
+  s << "Exception in file '" << file << "' in line " << line << "\n"
+    << "Detailed description: " << detailed << "\n";
+
+  throw Exception(s.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface (std::string overload; forwards to the above).
+//! @param Exception causing code fragment (file and line) and detailed infos.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private).
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! String returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h b/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h
new file mode 100644
index 000000000..723b28f7a
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h
@@ -0,0 +1,988 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: the including sample must pull in the CUDA headers it needs BEFORE
+// this file; each _cudaGetErrorEnum overload below is compiled only when the
+// include-guard macro of the matching CUDA header is already defined.
+
+// CUDA Runtime error messages
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char unknown[] = "<unknown>";
+  const char *ret = NULL;
+  cuGetErrorName(error, &ret);
+  return ret ? ret : unknown;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+
+    case CUFFT_MISSING_DEPENDENCY:
+      return "CUFFT_MISSING_DEPENDENCY";
+
+    case CUFFT_NVRTC_FAILURE:
+      return "CUFFT_NVRTC_FAILURE";
+
+    case CUFFT_NVJITLINK_FAILURE:
+      return "CUFFT_NVJITLINK_FAILURE";
+
+    case CUFFT_NVSHMEM_FAILURE:
+      return "CUFFT_NVSHMEM_FAILURE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED ";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// Reports any non-zero status from a CUDA host call (with the call text,
+// file and line) on stderr and exits with EXIT_FAILURE
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program in case an error is detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))  // args parenthesized; the larger one is evaluated twice
+#endif
+
+// Float-to-int conversion: adds (or subtracts, for negatives) 0.5, then truncates
+inline int ftoi(float value) {
+  return (value >= 0 ? static_cast<int>(value + 0.5)
+                     : static_cast<int>(value - 0.5));
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the # of cores per SM
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    int Cores;
+  } sSMtoCores;
+
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xa3, 128},
+      {0xb0, 128},
+      {0xc0, 128},
+      {0xc1, 128},
+      {-1, -1}};
+
+  int index = 0;
+
+  while (nGpuArchCoresPerSM[index].SM != -1) {
+    if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchCoresPerSM[index].Cores;
+    }
+
+    index++;
+  }
+
+  // No entry matched: index now sits on the {-1, -1} sentinel, so fall back
+  // to the last real table entry (index - 1) to keep running
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      " Default to use %d Cores/SM\n",
+      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
+  return nGpuArchCoresPerSM[index - 1].Cores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Defines for GPU Architecture types (using the SM version to determine
+  // the GPU Arch name)
+  typedef struct {
+    int SM;  // 0xMm (hexadecimal notation), M = SM Major version,
+    // and m = SM minor version
+    const char* name;
+  } sSMtoArchName;
+
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"},
+      {0xa1, "Blackwell"},
+      {0xa3, "Blackwell"},
+      {0xb0, "Blackwell"},
+      {0xc0, "Blackwell"},
+      {0xc1, "Blackwell"},
+      {-1, "Graphics Device"}};
+
+  int index = 0;
+
+  while (nGpuArchNameSM[index].SM != -1) {
+    if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
+      return nGpuArchNameSM[index].name;
+    }
+
+    index++;
+  }
+
+  // No entry matched: index now sits on the -1 sentinel, so fall back
+  // to the last real table entry (index - 1) to keep running
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      " Default to use %s\n",
+      major, minor, nGpuArchNameSM[index - 1].name);
+  return nGpuArchNameSM[index - 1].name;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization; returns devID, or a negative value on failure
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// Returns the device maximizing SM count * cores per SM * clock rate (compute-prohibited devices skipped)
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then we can add it to the list
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major, minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
+      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if(result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        }
+        else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// Initialization code to find the best CUDA Device
+inline int findCudaDevice(int argc, const char **argv) {
+  int devID = 0;
+
+  // If the command-line has a device number specified, use it
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    devID = getCmdLineArgumentInt(argc, argv, "device=");
+
+    if (devID < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    } else {
+      devID = gpuDeviceInit(devID);
+
+      if (devID < 0) {
+        printf("exiting...\n");
+        exit(EXIT_FAILURE);
+      }
+    }
+  } else {
+    // Otherwise pick the device with highest Gflops/s
+    devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  }
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0, minor = 0;
+
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  if ((major > major_version) ||
+      (major == major_version &&
+       minor >= minor_version)) {
+    printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+           _ConvertSMVer2ArchName(major, minor), major, minor);
+    return true;
+  } else {
+    printf(
+        " No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_functions.h b/src/cuda/HPC/image/recursiveGaussian/helper_functions.h
new file mode 100644
index 000000000..f71da5761
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include "helper_image.h" // helper functions for image compare, dump, data comparisons
+#include "helper_string.h" // helper functions for string parsing
+#include "helper_timer.h" // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_gl.h b/src/cuda/HPC/image/recursiveGaussian/helper_gl.h
new file mode 100644
index 000000000..85c075595
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_gl.h
@@ -0,0 +1,267 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// These are helper functions for the SDK samples (OpenGL) +#ifndef HELPER_GL_H +#define HELPER_GL_H + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + #include +#endif + +#if defined(__APPLE__) || defined(MACOSX) + #include +#else + #include + #ifdef __linux__ + #include + #endif /* __linux__ */ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Prototypes */ +namespace __HelperGL { + static int isGLVersionSupported(unsigned reqMajor, unsigned reqMinor); + static int areGLExtensionsSupported(const std::string &); +#ifdef __linux__ + + #ifndef HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION + #define USE_GL_FUNC(name, proto) proto name = (proto) glXGetProcAddress ((const GLubyte *)#name) + #else + #define USE_GL_FUNC(name, proto) extern proto name + #endif + + USE_GL_FUNC(glBindBuffer, PFNGLBINDBUFFERPROC); + USE_GL_FUNC(glDeleteBuffers, PFNGLDELETEBUFFERSPROC); + USE_GL_FUNC(glBufferData, PFNGLBUFFERDATAPROC); + USE_GL_FUNC(glBufferSubData, PFNGLBUFFERSUBDATAPROC); + USE_GL_FUNC(glGenBuffers, PFNGLGENBUFFERSPROC); + USE_GL_FUNC(glCreateProgram, PFNGLCREATEPROGRAMPROC); + USE_GL_FUNC(glBindProgramARB, PFNGLBINDPROGRAMARBPROC); + USE_GL_FUNC(glGenProgramsARB, PFNGLGENPROGRAMSARBPROC); + USE_GL_FUNC(glDeleteProgramsARB, PFNGLDELETEPROGRAMSARBPROC); + USE_GL_FUNC(glDeleteProgram, PFNGLDELETEPROGRAMPROC); + USE_GL_FUNC(glGetProgramInfoLog, PFNGLGETPROGRAMINFOLOGPROC); + USE_GL_FUNC(glGetProgramiv, PFNGLGETPROGRAMIVPROC); + USE_GL_FUNC(glProgramParameteriEXT, PFNGLPROGRAMPARAMETERIEXTPROC); + USE_GL_FUNC(glProgramStringARB, PFNGLPROGRAMSTRINGARBPROC); + USE_GL_FUNC(glUnmapBuffer, PFNGLUNMAPBUFFERPROC); + USE_GL_FUNC(glMapBuffer, PFNGLMAPBUFFERPROC); + USE_GL_FUNC(glGetBufferParameteriv, PFNGLGETBUFFERPARAMETERIVPROC); + USE_GL_FUNC(glLinkProgram, PFNGLLINKPROGRAMPROC); + USE_GL_FUNC(glUseProgram, PFNGLUSEPROGRAMPROC); + USE_GL_FUNC(glAttachShader, PFNGLATTACHSHADERPROC); + 
USE_GL_FUNC(glCreateShader, PFNGLCREATESHADERPROC); + USE_GL_FUNC(glShaderSource, PFNGLSHADERSOURCEPROC); + USE_GL_FUNC(glCompileShader, PFNGLCOMPILESHADERPROC); + USE_GL_FUNC(glDeleteShader, PFNGLDELETESHADERPROC); + USE_GL_FUNC(glGetShaderInfoLog, PFNGLGETSHADERINFOLOGPROC); + USE_GL_FUNC(glGetShaderiv, PFNGLGETSHADERIVPROC); + USE_GL_FUNC(glUniform1i, PFNGLUNIFORM1IPROC); + USE_GL_FUNC(glUniform1f, PFNGLUNIFORM1FPROC); + USE_GL_FUNC(glUniform2f, PFNGLUNIFORM2FPROC); + USE_GL_FUNC(glUniform3f, PFNGLUNIFORM3FPROC); + USE_GL_FUNC(glUniform4f, PFNGLUNIFORM4FPROC); + USE_GL_FUNC(glUniform1fv, PFNGLUNIFORM1FVPROC); + USE_GL_FUNC(glUniform2fv, PFNGLUNIFORM2FVPROC); + USE_GL_FUNC(glUniform3fv, PFNGLUNIFORM3FVPROC); + USE_GL_FUNC(glUniform4fv, PFNGLUNIFORM4FVPROC); + USE_GL_FUNC(glUniformMatrix4fv, PFNGLUNIFORMMATRIX4FVPROC); + USE_GL_FUNC(glSecondaryColor3fv, PFNGLSECONDARYCOLOR3FVPROC); + USE_GL_FUNC(glGetUniformLocation, PFNGLGETUNIFORMLOCATIONPROC); + USE_GL_FUNC(glGenFramebuffersEXT, PFNGLGENFRAMEBUFFERSEXTPROC); + USE_GL_FUNC(glBindFramebufferEXT, PFNGLBINDFRAMEBUFFEREXTPROC); + USE_GL_FUNC(glDeleteFramebuffersEXT, PFNGLDELETEFRAMEBUFFERSEXTPROC); + USE_GL_FUNC(glCheckFramebufferStatusEXT, PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC); + USE_GL_FUNC(glGetFramebufferAttachmentParameterivEXT, PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC); + USE_GL_FUNC(glFramebufferTexture1DEXT, PFNGLFRAMEBUFFERTEXTURE1DEXTPROC); + USE_GL_FUNC(glFramebufferTexture2DEXT, PFNGLFRAMEBUFFERTEXTURE2DEXTPROC); + USE_GL_FUNC(glFramebufferTexture3DEXT, PFNGLFRAMEBUFFERTEXTURE3DEXTPROC); + USE_GL_FUNC(glGenerateMipmapEXT, PFNGLGENERATEMIPMAPEXTPROC); + USE_GL_FUNC(glGenRenderbuffersEXT, PFNGLGENRENDERBUFFERSEXTPROC); + USE_GL_FUNC(glDeleteRenderbuffersEXT, PFNGLDELETERENDERBUFFERSEXTPROC); + USE_GL_FUNC(glBindRenderbufferEXT, PFNGLBINDRENDERBUFFEREXTPROC); + USE_GL_FUNC(glRenderbufferStorageEXT, PFNGLRENDERBUFFERSTORAGEEXTPROC); + USE_GL_FUNC(glFramebufferRenderbufferEXT, 
PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC); + USE_GL_FUNC(glClampColorARB, PFNGLCLAMPCOLORARBPROC); + USE_GL_FUNC(glBindFragDataLocationEXT, PFNGLBINDFRAGDATALOCATIONEXTPROC); + +#if !defined(GLX_EXTENSION_NAME) || !defined(GL_VERSION_1_3) + USE_GL_FUNC(glActiveTexture, PFNGLACTIVETEXTUREPROC); + USE_GL_FUNC(glClientActiveTexture, PFNGLACTIVETEXTUREPROC); +#endif + + #undef USE_GL_FUNC +#endif /*__linux__ */ +} + + +namespace __HelperGL { + namespace __Int { + static std::vector split(const std::string &str) + { + std::istringstream ss(str); + std::istream_iterator it(ss); + return std::vector (it, std::istream_iterator()); + } + + /* Sort the vector passed by reference */ + template static inline void sort(std::vector &a) + { + std::sort(a.begin(), a.end()); + } + + /* Compare two vectors */ + template static int equals(std::vector a, std::vector b) + { + if (a.size() != b.size()) return 0; + sort(a); + sort(b); + + return std::equal(a.begin(), a.end(), b.begin()); + } + + template static std::vector getIntersection(std::vector a, std::vector b) + { + sort(a); + sort(b); + + std::vector rc; + std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), + std::back_inserter >(rc)); + return rc; + } + + static std::vector getGLExtensions() + { + std::string extensionsStr( (const char *)glGetString(GL_EXTENSIONS)); + return split (extensionsStr); + } + } + + static int areGLExtensionsSupported(const std::string &extensions) + { + std::vector all = __Int::getGLExtensions(); + + std::vector requested = __Int::split(extensions); + std::vector matched = __Int::getIntersection(all, requested); + + return __Int::equals(matched, requested); + } + + static int isGLVersionSupported(unsigned reqMajor, unsigned reqMinor) + { +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + if (glewInit() != GLEW_OK) + { + std::cerr << "glewInit() failed!" 
<< std::endl; + return 0; + } +#endif + std::string version ((const char *) glGetString (GL_VERSION)); + std::stringstream stream (version); + unsigned major, minor; + char dot; + + stream >> major >> dot >> minor; + + assert (dot == '.'); + return major > reqMajor || (major == reqMajor && minor >= reqMinor); + } + + static inline const char* glErrorToString(GLenum err) + { +#define CASE_RETURN_MACRO(arg) case arg: return #arg + switch(err) + { + CASE_RETURN_MACRO(GL_NO_ERROR); + CASE_RETURN_MACRO(GL_INVALID_ENUM); + CASE_RETURN_MACRO(GL_INVALID_VALUE); + CASE_RETURN_MACRO(GL_INVALID_OPERATION); + CASE_RETURN_MACRO(GL_OUT_OF_MEMORY); + CASE_RETURN_MACRO(GL_STACK_UNDERFLOW); + CASE_RETURN_MACRO(GL_STACK_OVERFLOW); +#ifdef GL_INVALID_FRAMEBUFFER_OPERATION + CASE_RETURN_MACRO(GL_INVALID_FRAMEBUFFER_OPERATION); +#endif + default: break; + } +#undef CASE_RETURN_MACRO + return "*UNKNOWN*"; + } + +//////////////////////////////////////////////////////////////////////////// +//! Check for OpenGL error +//! @return bool if no GL error has been encountered, otherwise 0 +//! @param file __FILE__ macro +//! @param line __LINE__ macro +//! @note The GL error is listed on stderr +//! @note This function should be used via the CHECK_ERROR_GL() macro +//////////////////////////////////////////////////////////////////////////// + inline bool sdkCheckErrorGL(const char *file, const int line) + { + bool ret_val = true; + + // check for error + GLenum gl_error = glGetError(); + + if (gl_error != GL_NO_ERROR) + { +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + char tmpStr[512]; + // NOTE: "%s(%i) : " allows Visual Studio to directly jump to the file at the right line + // when the user double clicks on the error line in the Output pane. Like any compile error. 
+ sprintf_s(tmpStr, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, glErrorToString(gl_error)); + fprintf(stderr, "%s", tmpStr); +#endif + fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line); + fprintf(stderr, "%s\n", glErrorToString(gl_error)); + ret_val = false; + } + + return ret_val; + } + +#define SDK_CHECK_ERROR_GL() \ + if( false == sdkCheckErrorGL( __FILE__, __LINE__)) { \ + exit(EXIT_FAILURE); \ + } + +} /* of namespace __HelperGL*/ + +using namespace __HelperGL; + +#endif /*HELPER_GL_H*/ diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_image.h b/src/cuda/HPC/image/recursiveGaussian/helper_image.h new file mode 100644 index 000000000..7e81b0cd8 --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/helper_image.h @@ -0,0 +1,1001 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +// These are helper functions for the SDK samples (image,bitmap) +#ifndef COMMON_HELPER_IMAGE_H_ +#define COMMON_HELPER_IMAGE_H_ + +#include +#include "exception.h" +#include +#include + +#include +#include +#include +#include +#include + +#ifndef MIN +#define MIN(a, b) ((a < b) ? a : b) +#endif +#ifndef MAX +#define MAX(a, b) ((a > b) ? a : b) +#endif + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#include "helper_string.h" + +// namespace unnamed (internal) +namespace helper_image_internal { +//! size of PGM file header +const unsigned int PGMHeaderSize = 0x40; + +// types + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterFromUByte; + +//! Data converter from unsigned char / unsigned byte +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val); + } +}; + +//! Data converter from unsigned char / unsigned byte to float +template <> +struct ConverterFromUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + float operator()(const unsigned char &val) { + return static_cast(val) / 255.0f; + } +}; + +//! Data converter from unsigned char / unsigned byte to type T +template +struct ConverterToUByte; + +//! 
Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator (essentially a passthru + //! @return converted value + //! @param val value to convert + unsigned char operator()(const unsigned char &val) { return val; } +}; + +//! Data converter from unsigned char / unsigned byte to unsigned int +template <> +struct ConverterToUByte { + //! Conversion operator + //! @return converted value + //! @param val value to convert + unsigned char operator()(const float &val) { + return static_cast(val * 255.0f); + } +}; +} // namespace helper_image_internal + +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result != 0) +#endif +#ifndef SSCANF +#define SSCANF sscanf_s +#endif +#else +#ifndef FOPEN +#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) +#endif +#ifndef FOPEN_FAIL +#define FOPEN_FAIL(result) (result == NULL) +#endif +#ifndef SSCANF +#define SSCANF sscanf +#endif +#endif + +inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w, + unsigned int *h, unsigned int *channels) { + FILE *fp = NULL; + + if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) { + std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl; + return false; + } + + // check header + char header[helper_image_internal::PGMHeaderSize]; + + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl; + return false; + } + + if (strncmp(header, "P5", 2) == 0) { + *channels = 1; + } else if (strncmp(header, "P6", 2) == 0) { + *channels = 3; + } else { + std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl; + *channels = 0; + return false; + } + + // parse header, read maxval, width and height + unsigned int width 
= 0; + unsigned int height = 0; + unsigned int maxval = 0; + unsigned int i = 0; + + while (i < 3) { + if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) { + std::cerr << "__LoadPPM() : reading PGM header returned NULL" + << std::endl; + return false; + } + + if (header[0] == '#') { + continue; + } + + if (i == 0) { + i += SSCANF(header, "%u %u %u", &width, &height, &maxval); + } else if (i == 1) { + i += SSCANF(header, "%u %u", &height, &maxval); + } else if (i == 2) { + i += SSCANF(header, "%u", &maxval); + } + } + + // check if given handle for the data is initialized + if (NULL != *data) { + if (*w != width || *h != height) { + std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl; + } + } else { + *data = (unsigned char *)malloc(sizeof(unsigned char) * width * height * + *channels); + *w = width; + *h = height; + } + + // read and close file + if (fread(*data, sizeof(unsigned char), width * height * *channels, fp) == + 0) { + std::cerr << "__LoadPPM() read data returned error." 
<< std::endl; + } + + fclose(fp); + + return true; +} + +template +inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = NULL; + unsigned int channels; + + if (true != __loadPPM(file, &idata, w, h, &channels)) { + return false; + } + + unsigned int size = *w * *h * channels; + + // initialize mem if necessary + // the correct size is checked / set in loadPGMc() + if (NULL == *data) { + *data = reinterpret_cast(malloc(sizeof(T) * size)); + } + + // copy and cast data + std::transform(idata, idata + size, *data, + helper_image_internal::ConverterFromUByte()); + + free(idata); + + return true; +} + +template +inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w, + unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = reinterpret_cast(malloc(sizeof(T) * size * 4)); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return true; + } else { + free(idata); + return false; + } +} + +inline bool __savePPM(const char *file, unsigned char *data, unsigned int w, + unsigned int h, unsigned int channels) { + assert(NULL != data); + assert(w > 0); + assert(h > 0); + + std::fstream fh(file, std::fstream::out | std::fstream::binary); + + if (fh.bad()) { + std::cerr << "__savePPM() : Opening file failed." << std::endl; + return false; + } + + if (channels == 1) { + fh << "P5\n"; + } else if (channels == 3) { + fh << "P6\n"; + } else { + std::cerr << "__savePPM() : Invalid number of channels." 
<< std::endl; + return false; + } + + fh << w << "\n" << h << "\n" << 0xff << std::endl; + + for (unsigned int i = 0; (i < (w * h * channels)) && fh.good(); ++i) { + fh << data[i]; + } + + fh.flush(); + + if (fh.bad()) { + std::cerr << "__savePPM() : Writing data failed." << std::endl; + return false; + } + + fh.close(); + + return true; +} + +template +inline bool sdkSavePGM(const char *file, T *data, unsigned int w, + unsigned int h) { + unsigned int size = w * h; + unsigned char *idata = (unsigned char *)malloc(sizeof(unsigned char) * size); + + std::transform(data, data + size, idata, + helper_image_internal::ConverterToUByte()); + + // write file + bool result = __savePPM(file, idata, w, h, 1); + + // cleanup + free(idata); + + return result; +} + +inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w, + unsigned int h) { + // strip 4th component + int size = w * h; + unsigned char *ndata = + (unsigned char *)malloc(sizeof(unsigned char) * size * 3); + unsigned char *ptr = ndata; + + for (int i = 0; i < size; i++) { + *ptr++ = *data++; + *ptr++ = *data++; + *ptr++ = *data++; + data++; + } + + bool result = __savePPM(file, ndata, w, h, 3); + free(ndata); + return result; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFile(const char *filename, T **data, unsigned int *len, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // intermediate storage for the data read + std::vector data_read; + + // open file for reading + FILE *fh = NULL; + + // check if filestream is valid + if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) { + printf("Unable to open input file: %s\n", filename); + return false; + } + + // read all data elements + T token; + + while (!feof(fh)) { + fscanf(fh, "%f", &token); + data_read.push_back(token); + } + + // the last element is read twice + data_read.pop_back(); + fclose(fh); + + // check if the given handle is already initialized + if (NULL != *data) { + if (*len != data_read.size()) { + std::cerr << "sdkReadFile() : Initialized memory given but " + << "size mismatch with signal read " + << "(data read / data init = " << (unsigned int)data_read.size() + << " / " << *len << ")" << std::endl; + + return false; + } + } else { + // allocate storage for the data read + *data = reinterpret_cast(malloc(sizeof(T) * data_read.size())); + // store signal size + *len = static_cast(data_read.size()); + } + + // copy data + memcpy(*data, &data_read.front(), sizeof(T) * data_read.size()); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Read file \filename and return the data +//! @return bool if reading the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data uninitialized pointer, returned initialized and pointing to +//! the data read +//! 
@param len number of data elements in data, -1 on error +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len, + unsigned int block_num, unsigned int block_size, + bool verbose) { + // check input arguments + assert(NULL != filename); + assert(NULL != len); + + // open file for reading + FILE *fh = fopen(filename, "rb"); + + if (fh == NULL && verbose) { + std::cerr << "sdkReadFile() : Opening file failed." << std::endl; + return false; + } + + // check if the given handle is already initialized + // allocate storage for the data read + data[block_num] = reinterpret_cast(malloc(block_size)); + + // read all data elements + fseek(fh, block_num * block_size, SEEK_SET); + *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh); + + fclose(fh); + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Write a data file \filename +//! @return true if writing the file succeeded, otherwise false +//! @param filename name of the source file +//! @param data data to write +//! @param len number of data elements in data, -1 on error +//! @param epsilon epsilon for comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len, + const S epsilon, bool verbose, bool append = false) { + assert(NULL != filename); + assert(NULL != data); + + // open file for writing + // if (append) { + std::fstream fh(filename, std::fstream::out | std::fstream::ate); + + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename + << " for write/append." << std::endl; + } + + /* } else { + std::fstream fh(filename, std::fstream::out); + if (verbose) { + std::cerr << "sdkWriteFile() : Open file " << filename << " for + write." 
<< std::endl; + } + } + */ + + // check if filestream is valid + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Opening file failed." << std::endl; + } + + return false; + } + + // first write epsilon + fh << "# " << epsilon << "\n"; + + // write data + for (unsigned int i = 0; (i < len) && (fh.good()); ++i) { + fh << data[i] << ' '; + } + + // Check if writing succeeded + if (!fh.good()) { + if (verbose) { + std::cerr << "sdkWriteFile() : Writing file failed." << std::endl; + } + + return false; + } + + // file ends with nl + fh << std::endl; + + return true; +} + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference timer_interface to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareData(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + bool result = true; + unsigned int error_count = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = static_cast(reference[i]) - static_cast(data[i]); + bool comp = (diff <= epsilon) && (diff >= -epsilon); + result &= comp; + + error_count += !comp; + +#if 0 + + if (!comp) { + std::cerr << "ERROR, i = " << i << ",\t " + << reference[i] << " / " + << data[i] + << " (reference / data)\n"; + } + +#endif + } + + if (threshold == 0.0f) { + return (result) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return (len * threshold > error_count) ? 
true : false; + } +} + +#ifndef __MIN_EPSILON_ERROR +#define __MIN_EPSILON_ERROR 1e-3f +#endif + +////////////////////////////////////////////////////////////////////////////// +//! Compare two arrays of arbitrary type +//! @return true if \a reference and \a data are identical, otherwise false +//! @param reference handle to the reference data / gold image +//! @param data handle to the computed data +//! @param len number of elements in reference and data +//! @param epsilon epsilon to use for the comparison +//! @param epsilon threshold % of (# of bytes) for pass/fail +////////////////////////////////////////////////////////////////////////////// +template +inline bool compareDataAsFloatThreshold(const T *reference, const T *data, + const unsigned int len, const S epsilon, + const float threshold) { + assert(epsilon >= 0); + + // If we set epsilon to be 0, let's set a minimum threshold + float max_error = MAX((float)epsilon, __MIN_EPSILON_ERROR); + int error_count = 0; + bool result = true; + + for (unsigned int i = 0; i < len; ++i) { + float diff = + fabs(static_cast(reference[i]) - static_cast(data[i])); + bool comp = (diff < max_error); + result &= comp; + + if (!comp) { + error_count++; + } + } + + if (threshold == 0.0f) { + if (error_count) { + printf("total # of errors = %d\n", error_count); + } + + return (error_count == 0) ? true : false; + } else { + if (error_count) { + printf("%4.2f(%%) of bytes mismatched (count=%d)\n", + static_cast(error_count) * 100 / static_cast(len), + error_count); + } + + return ((len * threshold > error_count) ? 
true : false); + } +} + +inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) { + printf("sdkDumpBin: <%s>\n", filename); + FILE *fp; + FOPEN(fp, filename, "wb"); + fwrite(data, bytes, 1, fp); + fflush(fp); + fclose(fp); +} + +inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + unsigned int *src_buffer, *ref_buffer; + FILE *src_fp = NULL, *ref_fp = NULL; + + uint64_t error_count = 0; + size_t fsize = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", + src_file); + error_count++; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", + ref_file, exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + ref_file); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf( + "compareBin2Bin " + " unable to open ref_file: %s\n", + ref_file_path); + error_count++; + } + + if (src_fp && ref_fp) { + src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int)); + + fsize = fread(src_buffer, nelements, sizeof(unsigned int), src_fp); + fsize = fread(ref_buffer, nelements, sizeof(unsigned int), ref_fp); + + printf( + "> compareBin2Bin nelements=%d," + " epsilon=%4.2f, threshold=%4.2f\n", + nelements, epsilon, threshold); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize)); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize)); + + if (!compareData(ref_buffer, src_buffer, nelements, + epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); 
+ fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file, + unsigned int nelements, const float epsilon, + const float threshold, char *exec_path) { + float *src_buffer = NULL, *ref_buffer = NULL; + FILE *src_fp = NULL, *ref_fp = NULL; + size_t fsize = 0; + + uint64_t error_count = 0; + + if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) { + printf("compareBin2Bin unable to open src_file: %s\n", src_file); + error_count = 1; + } + + char *ref_file_path = sdkFindFilePath(ref_file, exec_path); + + if (ref_file_path == NULL) { + printf("compareBin2Bin unable to find <%s> in <%s>\n", ref_file, + exec_path); + printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", + exec_path); + printf("Aborting comparison!\n"); + printf(" FAILED\n"); + error_count++; + + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } else { + if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) { + printf("compareBin2Bin unable to open ref_file: %s\n", + ref_file_path); + error_count = 1; + } + + if (src_fp && ref_fp) { + src_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + ref_buffer = reinterpret_cast(malloc(nelements * sizeof(float))); + + printf( + "> compareBin2Bin nelements=%d, epsilon=%4.2f," + " threshold=%4.2f\n", + nelements, epsilon, threshold); + fsize = fread(src_buffer, sizeof(float), nelements, src_fp); + printf(" src_file <%s>, size=%d bytes\n", src_file, + static_cast(fsize * sizeof(float))); + fsize = fread(ref_buffer, sizeof(float), nelements, ref_fp); + printf(" ref_file <%s>, size=%d bytes\n", ref_file_path, + static_cast(fsize * sizeof(float))); + + if 
(!compareDataAsFloatThreshold( + ref_buffer, src_buffer, nelements, epsilon, threshold)) { + error_count++; + } + + fclose(src_fp); + fclose(ref_fp); + + free(src_buffer); + free(ref_buffer); + } else { + if (src_fp) { + fclose(src_fp); + } + + if (ref_fp) { + fclose(ref_fp); + } + } + } + + if (error_count == 0) { + printf(" OK\n"); + } else { + printf(" FAILURE: %d errors...\n", (unsigned int)error_count); + } + + return (error_count == 0); // returns true if all pixels pass +} + +inline bool sdkCompareL2fe(const float *reference, const float *data, + const unsigned int len, const float epsilon) { + assert(epsilon >= 0); + + float error = 0; + float ref = 0; + + for (unsigned int i = 0; i < len; ++i) { + float diff = reference[i] - data[i]; + error += diff * diff; + ref += reference[i] * reference[i]; + } + + float normRef = sqrtf(ref); + + if (fabs(ref) < 1e-7) { +#ifdef _DEBUG + std::cerr << "ERROR, reference l2-norm is 0\n"; +#endif + return false; + } + + float normError = sqrtf(error); + error = normError / normRef; + bool result = error < epsilon; +#ifdef _DEBUG + + if (!result) { + std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon " + << epsilon << "\n"; + } + +#endif + + return result; +} + +inline bool sdkLoadPPMub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned int channels; + return __loadPPM(file, data, w, h, &channels); +} + +inline bool sdkLoadPPM4ub(const char *file, unsigned char **data, + unsigned int *w, unsigned int *h) { + unsigned char *idata = 0; + unsigned int channels; + + if (__loadPPM(file, &idata, w, h, &channels)) { + // pad 4th component + int size = *w * *h; + // keep the original pointer + unsigned char *idata_orig = idata; + *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4); + unsigned char *ptr = *data; + + for (int i = 0; i < size; i++) { + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = *idata++; + *ptr++ = 0; + } + + free(idata_orig); + return 
true; + } else { + free(idata); + return false; + } +} + +inline bool sdkComparePPM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data, *ref_data; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PPMvsPPM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) { + std::cerr << "PPMvsPPM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + } + + if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +inline bool sdkComparePGM(const char *src_file, const char *ref_file, + const float epsilon, const float threshold, + bool verboseErrors) { + unsigned char *src_data = 0, *ref_data = 0; + uint64_t error_count = 0; + unsigned int ref_width, ref_height; + unsigned int src_width, src_height; + + if (src_file == NULL || ref_file == NULL) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: src_file or ref_file is NULL." + " Aborting comparison\n"; + } + + return false; + } + + if (verboseErrors) { + std::cerr << "> Compare (a)rendered: <" << src_file << ">\n"; + std::cerr << "> (b)reference: <" << ref_file << ">\n"; + } + + if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file + << "\n"; + } + + return false; + } + + if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) { + std::cerr << "PGMvsPGM: unable to load src image file: " << src_file + << "\n"; + return false; + } + + if (src_height != ref_height || src_width != ref_width) { + if (verboseErrors) { + std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width + << "," << src_height << ")vs(" << ref_width << "," << ref_height + << ")\n"; + } + } + + if (verboseErrors) + std::cerr << "PGMvsPGM: comparing images size (" << src_width << "," + << src_height << ") epsilon(" << epsilon << "), threshold(" + << threshold * 100 << "%)\n"; + + if (compareData(ref_data, src_data, src_width * src_height, epsilon, + threshold) == false) { + error_count = 1; + } + + if (error_count == 0) { + if (verboseErrors) { + std::cerr << " OK\n\n"; + } + } else { + if (verboseErrors) { + std::cerr << " FAILURE! " << error_count << " errors...\n\n"; + } + } + + // returns true if all pixels pass + return (error_count == 0) ? 
true : false; +} + +#endif // COMMON_HELPER_IMAGE_H_ diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_math.h b/src/cuda/HPC/image/recursiveGaussian/helper_math.h new file mode 100644 index 000000000..6d2f63aee --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/helper_math.h @@ -0,0 +1,1469 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * This file implements common mathematical operations on vector types + * (float3, float4 etc.) 
since these are not provided as standard by CUDA. + * + * The syntax is modeled on the Cg standard library. + * + * This is part of the Helper library includes + * + * Thanks to Linh Hah for additions and fixes. + */ + +#ifndef HELPER_MATH_H +#define HELPER_MATH_H + +#include "cuda_runtime.h" + +typedef unsigned int uint; +typedef unsigned short ushort; + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +#ifndef __CUDACC__ +#include + +//////////////////////////////////////////////////////////////////////////////// +// host implementations of CUDA functions +//////////////////////////////////////////////////////////////////////////////// + +inline float fminf(float a, float b) +{ + return a < b ? a : b; +} + +inline float fmaxf(float a, float b) +{ + return a > b ? a : b; +} + +inline int max(int a, int b) +{ + return a > b ? a : b; +} + +inline int min(int a, int b) +{ + return a < b ? a : b; +} + +inline float rsqrtf(float x) +{ + return 1.0f / sqrtf(x); +} +#endif + +//////////////////////////////////////////////////////////////////////////////// +// constructors +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 make_float2(float s) +{ + return make_float2(s, s); +} +inline __host__ __device__ float2 make_float2(float3 a) +{ + return make_float2(a.x, a.y); +} +inline __host__ __device__ float2 make_float2(int2 a) +{ + return make_float2(float(a.x), float(a.y)); +} +inline __host__ __device__ float2 make_float2(uint2 a) +{ + return make_float2(float(a.x), float(a.y)); +} + +inline __host__ __device__ int2 make_int2(int s) +{ + return make_int2(s, s); +} +inline __host__ __device__ int2 make_int2(int3 a) +{ + return make_int2(a.x, a.y); +} +inline __host__ __device__ int2 make_int2(uint2 a) +{ + return make_int2(int(a.x), int(a.y)); +} +inline __host__ __device__ int2 make_int2(float2 a) +{ + return make_int2(int(a.x), int(a.y)); +} + +inline __host__ __device__ uint2 make_uint2(uint s) +{ + 
return make_uint2(s, s); +} +inline __host__ __device__ uint2 make_uint2(uint3 a) +{ + return make_uint2(a.x, a.y); +} +inline __host__ __device__ uint2 make_uint2(int2 a) +{ + return make_uint2(uint(a.x), uint(a.y)); +} + +inline __host__ __device__ float3 make_float3(float s) +{ + return make_float3(s, s, s); +} +inline __host__ __device__ float3 make_float3(float2 a) +{ + return make_float3(a.x, a.y, 0.0f); +} +inline __host__ __device__ float3 make_float3(float2 a, float s) +{ + return make_float3(a.x, a.y, s); +} +inline __host__ __device__ float3 make_float3(float4 a) +{ + return make_float3(a.x, a.y, a.z); +} +inline __host__ __device__ float3 make_float3(int3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} +inline __host__ __device__ float3 make_float3(uint3 a) +{ + return make_float3(float(a.x), float(a.y), float(a.z)); +} + +inline __host__ __device__ int3 make_int3(int s) +{ + return make_int3(s, s, s); +} +inline __host__ __device__ int3 make_int3(int2 a) +{ + return make_int3(a.x, a.y, 0); +} +inline __host__ __device__ int3 make_int3(int2 a, int s) +{ + return make_int3(a.x, a.y, s); +} +inline __host__ __device__ int3 make_int3(uint3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} +inline __host__ __device__ int3 make_int3(float3 a) +{ + return make_int3(int(a.x), int(a.y), int(a.z)); +} + +inline __host__ __device__ uint3 make_uint3(uint s) +{ + return make_uint3(s, s, s); +} +inline __host__ __device__ uint3 make_uint3(uint2 a) +{ + return make_uint3(a.x, a.y, 0); +} +inline __host__ __device__ uint3 make_uint3(uint2 a, uint s) +{ + return make_uint3(a.x, a.y, s); +} +inline __host__ __device__ uint3 make_uint3(uint4 a) +{ + return make_uint3(a.x, a.y, a.z); +} +inline __host__ __device__ uint3 make_uint3(int3 a) +{ + return make_uint3(uint(a.x), uint(a.y), uint(a.z)); +} + +inline __host__ __device__ float4 make_float4(float s) +{ + return make_float4(s, s, s, s); +} +inline __host__ __device__ float4 
make_float4(float3 a) +{ + return make_float4(a.x, a.y, a.z, 0.0f); +} +inline __host__ __device__ float4 make_float4(float3 a, float w) +{ + return make_float4(a.x, a.y, a.z, w); +} +inline __host__ __device__ float4 make_float4(int4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} +inline __host__ __device__ float4 make_float4(uint4 a) +{ + return make_float4(float(a.x), float(a.y), float(a.z), float(a.w)); +} + +inline __host__ __device__ int4 make_int4(int s) +{ + return make_int4(s, s, s, s); +} +inline __host__ __device__ int4 make_int4(int3 a) +{ + return make_int4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ int4 make_int4(int3 a, int w) +{ + return make_int4(a.x, a.y, a.z, w); +} +inline __host__ __device__ int4 make_int4(uint4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} +inline __host__ __device__ int4 make_int4(float4 a) +{ + return make_int4(int(a.x), int(a.y), int(a.z), int(a.w)); +} + + +inline __host__ __device__ uint4 make_uint4(uint s) +{ + return make_uint4(s, s, s, s); +} +inline __host__ __device__ uint4 make_uint4(uint3 a) +{ + return make_uint4(a.x, a.y, a.z, 0); +} +inline __host__ __device__ uint4 make_uint4(uint3 a, uint w) +{ + return make_uint4(a.x, a.y, a.z, w); +} +inline __host__ __device__ uint4 make_uint4(int4 a) +{ + return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// negate +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 &a) +{ + return make_float2(-a.x, -a.y); +} +inline __host__ __device__ int2 operator-(int2 &a) +{ + return make_int2(-a.x, -a.y); +} +inline __host__ __device__ float3 operator-(float3 &a) +{ + return make_float3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ int3 operator-(int3 &a) +{ + return make_int3(-a.x, -a.y, -a.z); +} +inline __host__ __device__ float4 
operator-(float4 &a) +{ + return make_float4(-a.x, -a.y, -a.z, -a.w); +} +inline __host__ __device__ int4 operator-(int4 &a) +{ + return make_int4(-a.x, -a.y, -a.z, -a.w); +} + +//////////////////////////////////////////////////////////////////////////////// +// addition +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator+(float2 a, float2 b) +{ + return make_float2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(float2 &a, float2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ float2 operator+(float2 a, float b) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ float2 operator+(float b, float2 a) +{ + return make_float2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(float2 &a, float b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ int2 operator+(int2 a, int2 b) +{ + return make_int2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(int2 &a, int2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ int2 operator+(int2 a, int b) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ int2 operator+(int b, int2 a) +{ + return make_int2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(int2 &a, int b) +{ + a.x += b; + a.y += b; +} + +inline __host__ __device__ uint2 operator+(uint2 a, uint2 b) +{ + return make_uint2(a.x + b.x, a.y + b.y); +} +inline __host__ __device__ void operator+=(uint2 &a, uint2 b) +{ + a.x += b.x; + a.y += b.y; +} +inline __host__ __device__ uint2 operator+(uint2 a, uint b) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ uint2 operator+(uint b, uint2 a) +{ + return make_uint2(a.x + b, a.y + b); +} +inline __host__ __device__ void operator+=(uint2 &a, uint b) +{ + a.x += b; + a.y += b; +} + + +inline __host__ __device__ float3 operator+(float3 a, float3 b) +{ + return make_float3(a.x + b.x, 
a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(float3 &a, float3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ float3 operator+(float3 a, float b) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(float3 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int3 a, int3 b) +{ + return make_int3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(int3 &a, int3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ int3 operator+(int3 a, int b) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(int3 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ uint3 operator+(uint3 a, uint3 b) +{ + return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z); +} +inline __host__ __device__ void operator+=(uint3 &a, uint3 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; +} +inline __host__ __device__ uint3 operator+(uint3 a, uint b) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ void operator+=(uint3 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; +} + +inline __host__ __device__ int3 operator+(int b, int3 a) +{ + return make_int3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ uint3 operator+(uint b, uint3 a) +{ + return make_uint3(a.x + b, a.y + b, a.z + b); +} +inline __host__ __device__ float3 operator+(float b, float3 a) +{ + return make_float3(a.x + b, a.y + b, a.z + b); +} + +inline __host__ __device__ float4 operator+(float4 a, float4 b) +{ + return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(float4 &a, float4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ float4 operator+(float4 a, float b) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); 
+} +inline __host__ __device__ float4 operator+(float b, float4 a) +{ + return make_float4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(float4 &a, float b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ int4 operator+(int4 a, int4 b) +{ + return make_int4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(int4 &a, int4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ int4 operator+(int4 a, int b) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ int4 operator+(int b, int4 a) +{ + return make_int4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(int4 &a, int b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +inline __host__ __device__ uint4 operator+(uint4 a, uint4 b) +{ + return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w); +} +inline __host__ __device__ void operator+=(uint4 &a, uint4 b) +{ + a.x += b.x; + a.y += b.y; + a.z += b.z; + a.w += b.w; +} +inline __host__ __device__ uint4 operator+(uint4 a, uint b) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ uint4 operator+(uint b, uint4 a) +{ + return make_uint4(a.x + b, a.y + b, a.z + b, a.w + b); +} +inline __host__ __device__ void operator+=(uint4 &a, uint b) +{ + a.x += b; + a.y += b; + a.z += b; + a.w += b; +} + +//////////////////////////////////////////////////////////////////////////////// +// subtract +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator-(float2 a, float2 b) +{ + return make_float2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(float2 &a, float2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ float2 operator-(float2 a, float b) +{ + return make_float2(a.x - b, a.y - b); +} +inline 
__host__ __device__ float2 operator-(float b, float2 a) +{ + return make_float2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(float2 &a, float b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ int2 operator-(int2 a, int2 b) +{ + return make_int2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(int2 &a, int2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ int2 operator-(int2 a, int b) +{ + return make_int2(a.x - b, a.y - b); +} +inline __host__ __device__ int2 operator-(int b, int2 a) +{ + return make_int2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(int2 &a, int b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) +{ + return make_uint2(a.x - b.x, a.y - b.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint2 b) +{ + a.x -= b.x; + a.y -= b.y; +} +inline __host__ __device__ uint2 operator-(uint2 a, uint b) +{ + return make_uint2(a.x - b, a.y - b); +} +inline __host__ __device__ uint2 operator-(uint b, uint2 a) +{ + return make_uint2(b - a.x, b - a.y); +} +inline __host__ __device__ void operator-=(uint2 &a, uint b) +{ + a.x -= b; + a.y -= b; +} + +inline __host__ __device__ float3 operator-(float3 a, float3 b) +{ + return make_float3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(float3 &a, float3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ float3 operator-(float3 a, float b) +{ + return make_float3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ float3 operator-(float b, float3 a) +{ + return make_float3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(float3 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ int3 operator-(int3 a, int3 b) +{ + return make_int3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(int3 &a, int3 b) +{ + a.x -= b.x; + a.y -= 
b.y; + a.z -= b.z; +} +inline __host__ __device__ int3 operator-(int3 a, int b) +{ + return make_int3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ int3 operator-(int b, int3 a) +{ + return make_int3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(int3 &a, int b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) +{ + return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint3 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; +} +inline __host__ __device__ uint3 operator-(uint3 a, uint b) +{ + return make_uint3(a.x - b, a.y - b, a.z - b); +} +inline __host__ __device__ uint3 operator-(uint b, uint3 a) +{ + return make_uint3(b - a.x, b - a.y, b - a.z); +} +inline __host__ __device__ void operator-=(uint3 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; +} + +inline __host__ __device__ float4 operator-(float4 a, float4 b) +{ + return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(float4 &a, float4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ float4 operator-(float4 a, float b) +{ + return make_float4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ void operator-=(float4 &a, float b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ int4 operator-(int4 a, int4 b) +{ + return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(int4 &a, int4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ int4 operator-(int4 a, int b) +{ + return make_int4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ int4 operator-(int b, int4 a) +{ + return make_int4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(int4 &a, int b) +{ + a.x -= b; + 
a.y -= b; + a.z -= b; + a.w -= b; +} + +inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) +{ + return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint4 b) +{ + a.x -= b.x; + a.y -= b.y; + a.z -= b.z; + a.w -= b.w; +} +inline __host__ __device__ uint4 operator-(uint4 a, uint b) +{ + return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b); +} +inline __host__ __device__ uint4 operator-(uint b, uint4 a) +{ + return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w); +} +inline __host__ __device__ void operator-=(uint4 &a, uint b) +{ + a.x -= b; + a.y -= b; + a.z -= b; + a.w -= b; +} + +//////////////////////////////////////////////////////////////////////////////// +// multiply +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator*(float2 a, float2 b) +{ + return make_float2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(float2 &a, float2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ float2 operator*(float2 a, float b) +{ + return make_float2(a.x * b, a.y * b); +} +inline __host__ __device__ float2 operator*(float b, float2 a) +{ + return make_float2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(float2 &a, float b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ int2 operator*(int2 a, int2 b) +{ + return make_int2(a.x * b.x, a.y * b.y); +} +inline __host__ __device__ void operator*=(int2 &a, int2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ int2 operator*(int2 a, int b) +{ + return make_int2(a.x * b, a.y * b); +} +inline __host__ __device__ int2 operator*(int b, int2 a) +{ + return make_int2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(int2 &a, int b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) +{ + return make_uint2(a.x * b.x, a.y * b.y); +} +inline __host__ 
__device__ void operator*=(uint2 &a, uint2 b) +{ + a.x *= b.x; + a.y *= b.y; +} +inline __host__ __device__ uint2 operator*(uint2 a, uint b) +{ + return make_uint2(a.x * b, a.y * b); +} +inline __host__ __device__ uint2 operator*(uint b, uint2 a) +{ + return make_uint2(b * a.x, b * a.y); +} +inline __host__ __device__ void operator*=(uint2 &a, uint b) +{ + a.x *= b; + a.y *= b; +} + +inline __host__ __device__ float3 operator*(float3 a, float3 b) +{ + return make_float3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(float3 &a, float3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ float3 operator*(float3 a, float b) +{ + return make_float3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ float3 operator*(float b, float3 a) +{ + return make_float3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(float3 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ int3 operator*(int3 a, int3 b) +{ + return make_int3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(int3 &a, int3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ int3 operator*(int3 a, int b) +{ + return make_int3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ int3 operator*(int b, int3 a) +{ + return make_int3(b * a.x, b * a.y, b * a.z); +} +inline __host__ __device__ void operator*=(int3 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) +{ + return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z); +} +inline __host__ __device__ void operator*=(uint3 &a, uint3 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; +} +inline __host__ __device__ uint3 operator*(uint3 a, uint b) +{ + return make_uint3(a.x * b, a.y * b, a.z * b); +} +inline __host__ __device__ uint3 operator*(uint b, uint3 a) +{ + return make_uint3(b * a.x, b * a.y, b * a.z); +} +inline 
__host__ __device__ void operator*=(uint3 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; +} + +inline __host__ __device__ float4 operator*(float4 a, float4 b) +{ + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(float4 &a, float4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ float4 operator*(float4 a, float b) +{ + return make_float4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ float4 operator*(float b, float4 a) +{ + return make_float4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(float4 &a, float b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ int4 operator*(int4 a, int4 b) +{ + return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(int4 &a, int4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ int4 operator*(int4 a, int b) +{ + return make_int4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ int4 operator*(int b, int4 a) +{ + return make_int4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(int4 &a, int b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + +inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) +{ + return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint4 b) +{ + a.x *= b.x; + a.y *= b.y; + a.z *= b.z; + a.w *= b.w; +} +inline __host__ __device__ uint4 operator*(uint4 a, uint b) +{ + return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b); +} +inline __host__ __device__ uint4 operator*(uint b, uint4 a) +{ + return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w); +} +inline __host__ __device__ void operator*=(uint4 &a, uint b) +{ + a.x *= b; + a.y *= b; + a.z *= b; + a.w *= b; +} + 
+//////////////////////////////////////////////////////////////////////////////// +// divide +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 operator/(float2 a, float2 b) +{ + return make_float2(a.x / b.x, a.y / b.y); +} +inline __host__ __device__ void operator/=(float2 &a, float2 b) +{ + a.x /= b.x; + a.y /= b.y; +} +inline __host__ __device__ float2 operator/(float2 a, float b) +{ + return make_float2(a.x / b, a.y / b); +} +inline __host__ __device__ void operator/=(float2 &a, float b) +{ + a.x /= b; + a.y /= b; +} +inline __host__ __device__ float2 operator/(float b, float2 a) +{ + return make_float2(b / a.x, b / a.y); +} + +inline __host__ __device__ float3 operator/(float3 a, float3 b) +{ + return make_float3(a.x / b.x, a.y / b.y, a.z / b.z); +} +inline __host__ __device__ void operator/=(float3 &a, float3 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; +} +inline __host__ __device__ float3 operator/(float3 a, float b) +{ + return make_float3(a.x / b, a.y / b, a.z / b); +} +inline __host__ __device__ void operator/=(float3 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; +} +inline __host__ __device__ float3 operator/(float b, float3 a) +{ + return make_float3(b / a.x, b / a.y, b / a.z); +} + +inline __host__ __device__ float4 operator/(float4 a, float4 b) +{ + return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w); +} +inline __host__ __device__ void operator/=(float4 &a, float4 b) +{ + a.x /= b.x; + a.y /= b.y; + a.z /= b.z; + a.w /= b.w; +} +inline __host__ __device__ float4 operator/(float4 a, float b) +{ + return make_float4(a.x / b, a.y / b, a.z / b, a.w / b); +} +inline __host__ __device__ void operator/=(float4 &a, float b) +{ + a.x /= b; + a.y /= b; + a.z /= b; + a.w /= b; +} +inline __host__ __device__ float4 operator/(float b, float4 a) +{ + return make_float4(b / a.x, b / a.y, b / a.z, b / a.w); +} + 
+//////////////////////////////////////////////////////////////////////////////// +// min +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fminf(float2 a, float2 b) +{ + return make_float2(fminf(a.x,b.x), fminf(a.y,b.y)); +} +inline __host__ __device__ float3 fminf(float3 a, float3 b) +{ + return make_float3(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z)); +} +inline __host__ __device__ float4 fminf(float4 a, float4 b) +{ + return make_float4(fminf(a.x,b.x), fminf(a.y,b.y), fminf(a.z,b.z), fminf(a.w,b.w)); +} + +inline __host__ __device__ int2 min(int2 a, int2 b) +{ + return make_int2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ int3 min(int3 a, int3 b) +{ + return make_int3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ int4 min(int4 a, int4 b) +{ + return make_int4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +inline __host__ __device__ uint2 min(uint2 a, uint2 b) +{ + return make_uint2(min(a.x,b.x), min(a.y,b.y)); +} +inline __host__ __device__ uint3 min(uint3 a, uint3 b) +{ + return make_uint3(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z)); +} +inline __host__ __device__ uint4 min(uint4 a, uint4 b) +{ + return make_uint4(min(a.x,b.x), min(a.y,b.y), min(a.z,b.z), min(a.w,b.w)); +} + +//////////////////////////////////////////////////////////////////////////////// +// max +//////////////////////////////////////////////////////////////////////////////// + +inline __host__ __device__ float2 fmaxf(float2 a, float2 b) +{ + return make_float2(fmaxf(a.x,b.x), fmaxf(a.y,b.y)); +} +inline __host__ __device__ float3 fmaxf(float3 a, float3 b) +{ + return make_float3(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z)); +} +inline __host__ __device__ float4 fmaxf(float4 a, float4 b) +{ + return make_float4(fmaxf(a.x,b.x), fmaxf(a.y,b.y), fmaxf(a.z,b.z), fmaxf(a.w,b.w)); +} + +inline __host__ __device__ int2 max(int2 a, int2 b) +{ + return 
make_int2(max(a.x,b.x), max(a.y,b.y));
+}
+// Component-wise maximum of two int3 vectors.
+inline __host__ __device__ int3 max(int3 a, int3 b) {
+    return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum of two int4 vectors.
+inline __host__ __device__ int4 max(int4 a, int4 b) {
+    return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z),
+                     max(a.w, b.w));
+}
+
+// Component-wise maximum of two uint2 vectors.
+inline __host__ __device__ uint2 max(uint2 a, uint2 b) {
+    return make_uint2(max(a.x, b.x), max(a.y, b.y));
+}
+// Component-wise maximum of two uint3 vectors.
+inline __host__ __device__ uint3 max(uint3 a, uint3 b) {
+    return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum of two uint4 vectors.
+inline __host__ __device__ uint4 max(uint4 a, uint4 b) {
+    return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z),
+                      max(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+//   (t == 0 yields a, t == 1 yields b; t is not clamped here)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float lerp(float a, float b, float t) {
+    return a + t * (b - a);
+}
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) {
+    return a + t * (b - a);
+}
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) {
+    return a + t * (b - a);
+}
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) {
+    return a + t * (b - a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+// - the upper bound b is applied first, then the lower bound a
+////////////////////////////////////////////////////////////////////////////////
+
+// Scalar overloads: every vector overload below is built from these.
+inline __device__ __host__ float clamp(float f, float a, float b) {
+    const float upper_bounded = fminf(f, b);
+    return fmaxf(a, upper_bounded);
+}
+inline __device__ __host__ int clamp(int f, int a, int b) {
+    const int upper_bounded = min(f, b);
+    return max(a, upper_bounded);
+}
+inline __device__ __host__ uint clamp(uint f, uint a, uint b) {
+    const uint upper_bounded = min(f, b);
+    return max(a, upper_bounded);
+}
+
+// float vectors: scalar bounds apply to every component, vector bounds
+// apply component by component.
+inline __device__ __host__ float2 clamp(float2 v, float a, float b) {
+    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) {
+    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ float3 clamp(float3 v, float a, float b) {
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) {
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                       clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ float4 clamp(float4 v, float a, float b) {
+    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b),
+                       clamp(v.w, a, b));
+}
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) {
+    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                       clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// int vectors
+inline __device__ __host__ int2 clamp(int2 v, int a, int b) {
+    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) {
+    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ int3 clamp(int3 v, int a, int b) {
+    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) {
+    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                     clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ int4 clamp(int4 v, int a, int b) {
+    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b),
+                     clamp(v.w, a, b));
+}
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) {
+    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                     clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// uint vectors
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) {
+    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) {
+    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) {
+    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) {
+    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                      clamp(v.z, a.z, b.z));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) {
+    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b),
+                      clamp(v.w, a, b));
+}
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) {
+    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y),
+                      clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+// - sum of the component-wise products, accumulated left to right
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float dot(float2 a, float2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ float dot(float3 a, float3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ float dot(float4 a, float4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ int dot(int2 a, int2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ int dot(int3 a, int3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ int dot(int4 a, int4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+inline __host__ __device__ uint dot(uint2 a, uint2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+inline __host__ __device__ uint dot(uint3 a, uint3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+inline __host__ __device__ uint dot(uint4 a, uint4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+// - Euclidean norm, computed as sqrtf(dot(v, v))
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float length(float2 v) {
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+inline __host__ __device__ float length(float3 v) {
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+inline __host__ __device__ float length(float4 v) {
+    const float squared = dot(v, v);
+    return sqrtf(squared);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+// - scales v by the reciprocal square root of dot(v, v)
+// - there is no guard for a zero-length input
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 normalize(float2 v) {
+    return v * rsqrtf(dot(v, v));
+}
+inline __host__ __device__ float3 normalize(float3 v) {
+    return v * rsqrtf(dot(v, v));
+}
+inline __host__ __device__ float4 normalize(float4 v) {
+    return v * rsqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+// - component-wise floorf
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 floorf(float2 v) {
+    return make_float2(floorf(v.x), floorf(v.y));
+}
+inline __host__ __device__ float3 floorf(float3 v) {
+    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+inline __host__ __device__ float4 floorf(float4 v) {
+    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// frac - returns the fractional portion of a scalar or each vector component
+// - computed as v - floorf(v)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float fracf(float v) {
+    const float whole = floorf(v);
+    return v - whole;
+}
+inline __host__ __device__ float2 fracf(float2 v) {
+    return make_float2(fracf(v.x), fracf(v.y));
+}
+inline __host__ __device__ float3 fracf(float3 v) {
+    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+inline __host__ __device__ float4 fracf(float4 v) {
+    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+// - component-wise fmodf(a, b)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fmodf(float2 a, float2 b) {
+    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+inline __host__ __device__ float3 fmodf(float3 a, float3 b) {
+    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+inline __host__ __device__ float4 fmodf(float4 a, float4 b) {
+    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z),
+                       fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+// - component-wise fabs for float vectors, abs for int vectors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 fabs(float2 v) {
+    return make_float2(fabs(v.x), fabs(v.y));
+}
+inline __host__ __device__ float3 fabs(float3 v) {
+    return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+inline __host__ __device__ float4 fabs(float4 v) {
+    return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+inline __host__ __device__ int2 abs(int2 v) {
+    return make_int2(abs(v.x), abs(v.y));
+}
+inline __host__ __device__ int3 abs(int3 v) {
+    return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+inline __host__ __device__ int4 abs(int4 v) {
+    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 reflect(float3 i, float3 n) {
+    return i - 2.0f * n * dot(n, i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float3 cross(float3 a, float3 b) {
+    return make_float3(a.y * b.z - a.z * b.y,
+                       a.z * b.x - a.x * b.z,
+                       a.x * b.y - a.y * b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+// - evaluates t*t*(3 - 2t) with t = clamp((x - a) / (b - a), 0, 1)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __device__ __host__ float smoothstep(float a, float b, float x) {
+    const float t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (3.0f - (2.0f * t));
+}
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) {
+    const float2 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float2(3.0f) - (make_float2(2.0f) * t));
+}
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) {
+    const float3 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float3(3.0f) - (make_float3(2.0f) * t));
+}
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) {
+    const float4 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return t * t * (make_float4(3.0f) - (make_float4(2.0f) * t));
+}
+
+#endif
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_string.h b/src/cuda/HPC/image/recursiveGaussian/helper_string.h
new file mode 100644
index 000000000..39a1b3805
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+
+//! Count the leading \a delimiter characters of \a string.
+//! @return index of the first non-delimiter character, or 0 when stripping
+//!         the delimiters would leave fewer than two characters
+//! @param delimiter  character to skip (the callers below pass '-')
+//! @param string     NUL-terminated string to inspect
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  // Do the subtraction in int so an empty string yields -1 instead of
+  // relying on size_t wrap-around followed by a narrowing cast.
+  const int last_index = static_cast<int>(strlen(string)) - 1;
+
+  if (string_start >= last_index) {
+    return 0;
+  }
+
+  return string_start;
+}
+
+//! Locate the extension of \a filename (the text after the last '.').
+//! @return index of the first extension character, or 0 when there is none
+//! @param filename   NUL-terminated file name; may be empty
+//! @param extension  receives a pointer into \a filename, or NULL when the
+//!                   name has no '.' or its only '.' is the first character
+inline int getFileExtension(char *filename, char **extension) {
+  // strrchr replaces a hand-written backwards scan that read filename[-1]
+  // for an empty string and skipped the '.' at index 1 (e.g. "a.b").
+  char *last_dot = strrchr(filename, '.');
+
+  if (last_dot == NULL || last_dot == filename) {
+    *extension = NULL;
+    return 0;
+  }
+
+  const int extension_start = static_cast<int>(last_dot - filename) + 1;
+  *extension = &filename[extension_start];
+  return extension_start;
+}
+
+//! Test whether the flag \a string_ref appears on the command line.
+//! Leading '-' characters and any "=value" suffix are ignored; the name is
+//! compared case-insensitively and must match in full.
+//! @return true if any argument after argv[0] names the flag
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+
+      // Only the part before '=' is the flag name.
+      const char *equal_pos = strchr(string_argv, '=');
+      int argv_length = static_cast<int>(
+          equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
+
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (length == argv_length &&
+          !STRNCASECMP(string_argv, string_ref, length)) {
+        bFound = true;
+      }
+    }
+  }
+
+  return bFound;
+}
+
+//! Parse the integer value of the first argument whose name starts with
+//! \a string_ref and store it, converted to T, in \a value.
+//! \a value is left untouched when the argument carries no text after the
+//! name.
+//! @return true if a matching argument was found
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          // Skip the '=' separator when present.
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          *value = (T)atoi(&string_argv[length + auto_inc]);
+        }
+
+        bFound = true;
+        break;  // first match wins
+      }
+    }
+  }
+
+  return bFound;
+}
+
+//! Parse the integer value of the argument named \a string_ref.
+//! When several arguments match, the last one wins.
+//! @return the parsed value, or 0 when the argument is absent or has no value
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  bool bFound = false;
+  int value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = atoi(&string_argv[length + auto_inc]);
+        } else {
+          value = 0;
+        }
+
+        bFound = true;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+//! Parse the floating-point value of the argument named \a string_ref.
+//! When several arguments match, the last one wins.
+//! @return the parsed value, or 0 when the argument is absent or has no value
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  bool bFound = false;
+  float value = -1;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      const char *string_argv = &argv[i][string_start];
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        if (length + 1 <= static_cast<int>(strlen(string_argv))) {
+          int auto_inc = (string_argv[length] == '=') ? 1 : 0;
+          value = static_cast<float>(atof(&string_argv[length + auto_inc]));
+        } else {
+          value = 0.f;
+        }
+
+        bFound = true;
+      }
+    }
+  }
+
+  if (bFound) {
+    return value;
+  } else {
+    return 0;
+  }
+}
+
+//! Fetch the string value of the argument named \a string_ref.
+//! When several arguments match, the last one wins.
+//! @return true if a matching argument was found
+//! @param string_retval  receives a pointer into argv just past the name and
+//!                       its one-character separator, an empty string when
+//!                       the argument has no value, or NULL when not found
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char *>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        // A bare flag ends right after the name; stepping over a separator
+        // there would point past the terminating NUL, so return the empty
+        // string at the terminator instead.
+        const int value_offset =
+            (string_argv[length] == '\0') ? length : length + 1;
+        *string_retval = &string_argv[value_offset];
+        bFound = true;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path for a file assuming that
+//! files are found in the searchPath.
+//!
+//! The caller owns the returned buffer and must release it with free().
+//!
+//! @return the path if succeeded, otherwise 0
+//! @param filename         name of the file
+//! @param executable_path  optional absolute path of the executable
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+  static const char kExecutableToken[] = "<executable_name>";
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",       // same dir
+      "./data/",  // same dir
+
+      "../../../../Samples/<executable_name>/",  // up 4 in tree
+      "../../../Samples/<executable_name>/",     // up 3 in tree
+      "../../Samples/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",  // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",     // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",  // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",     // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",  // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",     // up 2 in tree
+
+      "../../../../Common/data/",  // up 4 in tree
+      "../../../Common/data/",     // up 3 in tree
+      "../../Common/data/"         // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    if (executable_name.rfind(".exe") != std::string::npos) {
+      // we strip .exe, only if the .exe is found
+      executable_name.resize(executable_name.size() - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find(kExecutableToken);
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen(kExecutableToken),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp = NULL;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp == NULL) {
+      continue;  // not in this directory, try the next one
+    }
+
+    fclose(fp);
+
+    // File found
+    // returning an allocated array here for backwards compatibility reasons
+    char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+
+    if (file_path == NULL) {
+      printf("\nerror: sdkFindFilePath: out of memory!\n");
+      return 0;
+    }
+
+    STRCPY(file_path, path.length() + 1, path.c_str());
+    return file_path;
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_timer.h b/src/cuda/HPC/image/recursiveGaussian/helper_timer.h
new file mode 100644
index 000000000..0614a7802
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +// Helper Timing Functions +#ifndef COMMON_HELPER_TIMER_H_ +#define COMMON_HELPER_TIMER_H_ + +#ifndef EXIT_WAIVED +#define EXIT_WAIVED 2 +#endif + +// includes, system +#include + +// includes, project +#include "exception.h" + +// Definition of the StopWatch Interface, this is used if we don't want to use +// the CUT functions But rather in a self contained class interface +class StopWatchInterface { + public: + StopWatchInterface() {} + virtual ~StopWatchInterface() {} + + public: + //! Start time measurement + virtual void start() = 0; + + //! Stop time measurement + virtual void stop() = 0; + + //! Reset time counters to zero + virtual void reset() = 0; + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + virtual float getTime() = 0; + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + virtual float getAverageTime() = 0; +}; + +////////////////////////////////////////////////////////////////// +// Begin Stopwatch timer class definitions for all OS platforms // +////////////////////////////////////////////////////////////////// +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) +// includes, system +#define WINDOWS_LEAN_AND_MEAN +#include +#undef min +#undef max + +//! Windows specific implementation of StopWatch +class StopWatchWin : public StopWatchInterface { + public: + //! 
Constructor, default + StopWatchWin() + : start_time(), + end_time(), + diff_time(0.0f), + total_time(0.0f), + running(false), + clock_sessions(0), + freq(0), + freq_set(false) { + if (!freq_set) { + // helper variable + LARGE_INTEGER temp; + + // get the tick frequency from the OS + QueryPerformanceFrequency(reinterpret_cast(&temp)); + + // convert to type in which it is needed + freq = (static_cast(temp.QuadPart)) / 1000.0; + + // rememeber query + freq_set = true; + } + } + + // Destructor + ~StopWatchWin() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // member variables + + //! Start of measurement + LARGE_INTEGER start_time; + //! End of measurement + LARGE_INTEGER end_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; + + //! tick frequency + double freq; + + //! flag if the frequency has been set + bool freq_set; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! 
Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::start() { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::stop() { + QueryPerformanceCounter(reinterpret_cast(&end_time)); + diff_time = static_cast(((static_cast(end_time.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + + total_time += diff_time; + clock_sessions++; + running = false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchWin::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + QueryPerformanceCounter(reinterpret_cast(&start_time)); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getTime() { + // Return the TOTAL time to date + float retval = total_time; + + if (running) { + LARGE_INTEGER temp; + QueryPerformanceCounter(reinterpret_cast(&temp)); + retval += static_cast(((static_cast(temp.QuadPart) - + static_cast(start_time.QuadPart)) / + freq)); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. for a single run based on the total number of COMPLETED runs +//! and the total time. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchWin::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +#else +// Declarations for Stopwatch on Linux and Mac OSX +// includes, system +#include +#include + +//! Windows specific implementation of StopWatch +class StopWatchLinux : public StopWatchInterface { + public: + //! Constructor, default + StopWatchLinux() + : start_time(), + diff_time(0.0), + total_time(0.0), + running(false), + clock_sessions(0) {} + + // Destructor + virtual ~StopWatchLinux() {} + + public: + //! Start time measurement + inline void start(); + + //! Stop time measurement + inline void stop(); + + //! Reset time counters to zero + inline void reset(); + + //! Time in msec. after start. If the stop watch is still running (i.e. there + //! was no call to stop()) then the elapsed time is returned, otherwise the + //! time between the last start() and stop call is returned + inline float getTime(); + + //! Mean time to date based on the number of times the stopwatch has been + //! _stopped_ (ie finished sessions) and the current total time + inline float getAverageTime(); + + private: + // helper functions + + //! Get difference between start time and current time + inline float getDiffTime(); + + private: + // member variables + + //! 
Start of measurement + struct timeval start_time; + + //! Time difference between the last start and stop + float diff_time; + + //! TOTAL time difference between starts and stops + float total_time; + + //! flag if the stop watch is running + bool running; + + //! Number of times clock has been started + //! and stopped to allow averaging + int clock_sessions; +}; + +// functions, inlined + +//////////////////////////////////////////////////////////////////////////////// +//! Start time measurement +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::start() { + gettimeofday(&start_time, 0); + running = true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop time measurement and increment add to the current diff_time summation +//! variable. Also increment the number of times this clock has been run. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::stop() { + diff_time = getDiffTime(); + total_time += diff_time; + running = false; + clock_sessions++; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Reset the timer to 0. Does not change the timer running state but does +//! recapture this point in time as the current start time if it is running. +//////////////////////////////////////////////////////////////////////////////// +inline void StopWatchLinux::reset() { + diff_time = 0; + total_time = 0; + clock_sessions = 0; + + if (running) { + gettimeofday(&start_time, 0); + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. after start. If the stop watch is still running (i.e. there +//! was no call to stop()) then the elapsed time is returned added to the +//! current diff_time sum, otherwise the current summed time difference alone +//! is returned. 
+//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getTime() { + // Return the TOTAL time to date (plus the still-running interval, if any) + float retval = total_time; + + if (running) { + retval += getDiffTime(); + } + + return retval; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Average time in msec. per COMPLETED run (total_time / clock_sessions); +//! 0.0f is returned while no run has been completed. +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getAverageTime() { + return (clock_sessions > 0) ? (total_time / clock_sessions) : 0.0f; +} +//////////////////////////////////////////////////////////////////////////////// +//! Time in msec. elapsed between start_time and the current time +//////////////////////////////////////////////////////////////////////////////// +inline float StopWatchLinux::getDiffTime() { + struct timeval t_time; + gettimeofday(&t_time, 0); + + // time difference in milli-seconds (seconds * 1000 + microseconds / 1000) + return static_cast(1000.0 * (t_time.tv_sec - start_time.tv_sec) + + (0.001 * (t_time.tv_usec - start_time.tv_usec))); +} +#endif // WIN32 + +//////////////////////////////////////////////////////////////////////////////// +//! Timer functionality exported + +//////////////////////////////////////////////////////////////////////////////// +//! Create a new timer +//! @return true if a timer has been created, otherwise false +//! @param timer_interface receives the newly allocated platform-specific timer +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkCreateTimer(StopWatchInterface **timer_interface) { +// printf("sdkCreateTimer called object %08x\n", (void *)*timer_interface); +#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) + *timer_interface = reinterpret_cast(new StopWatchWin()); +#else + *timer_interface = + reinterpret_cast(new StopWatchLinux()); +#endif + return (*timer_interface != NULL) ? 
true : false; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Delete a timer +//! @return always true (a NULL timer is ignored) +//! @param timer_interface timer to delete; the pointer is reset to NULL +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) { + // printf("sdkDeleteTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + delete *timer_interface; + *timer_interface = NULL; + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Start the timer (no-op if *timer_interface is NULL); always returns true +//! @param timer_interface timer to start +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStartTimer(StopWatchInterface **timer_interface) { + // printf("sdkStartTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->start(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Stop the timer (no-op if *timer_interface is NULL). Does not reset. +//! @param timer_interface timer to stop +//////////////////////////////////////////////////////////////////////////////// +inline bool sdkStopTimer(StopWatchInterface **timer_interface) { + // printf("sdkStopTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->stop(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Resets the timer's counter. +//! @param timer_interface timer to reset. 
+//////////////////////////////////////////////////////////////////////////////// +inline bool sdkResetTimer(StopWatchInterface **timer_interface) { + // printf("sdkResetTimer called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + (*timer_interface)->reset(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +//! Return the average time for timer execution as the total time +//! for the timer divided by the number of completed (stopped) runs the timer +//! has made. +//! Excludes the current running time if the timer is currently running. +//! @param timer_interface timer to query (0.0f is returned if it is NULL) +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetAverageTimerValue called object %08x\n", (void + // *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getAverageTime(); + } else { + return 0.0f; + } +} + +//////////////////////////////////////////////////////////////////////////////// +//! Total execution time for the timer over all runs since the last reset +//! or timer creation. +//! @param timer_interface timer to query (0.0f is returned if it is NULL) +//////////////////////////////////////////////////////////////////////////////// +inline float sdkGetTimerValue(StopWatchInterface **timer_interface) { + // printf("sdkGetTimerValue called object %08x\n", (void *)*timer_interface); + if (*timer_interface) { + return (*timer_interface)->getTime(); + } else { + return 0.0f; + } +} + +#endif // COMMON_HELPER_TIMER_H_ diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp new file mode 100644 index 000000000..1a6172e0d --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp @@ -0,0 +1,587 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Recursive Gaussian filter + sgreen 8/1/08 + + This code sample implements a Gaussian blur using Deriche's recursive method: + http://citeseer.ist.psu.edu/deriche93recursively.html + + This is similar to the box filter sample in the SDK, but it uses the previous + outputs of the filter as well as the previous inputs. This is also known as an + IIR (infinite impulse response) filter, since its response to an input impulse + can last forever. 
+ + The main advantage of this method is that the execution time is independent of + the filter width. + + The GPU processes columns of the image in parallel. To avoid uncoalesced reads + for the row pass we transpose the image and then transpose it back again + afterwards. + + The implementation is based on code from the CImg library: + http://cimg.sourceforge.net/ + Thanks to David Tschumperlé and all the CImg contributors! +*/ + +#pragma warning(disable : 4819) + +#ifndef NO_OPENGL +// OpenGL Graphics includes +#include "helper_gl.h" +#if defined(__APPLE__) || defined(MACOSX) +#pragma clang diagnostic ignored "-Wdeprecated-declarations" +#include +#ifndef glutCloseFunc +#define glutCloseFunc glutWMCloseFunc +#endif +#else +#include +#endif + +// CUDA includes and interop headers +#include +#endif + +#include + +// CUDA utilities and system includes +#include "helper_cuda.h" // includes cuda.h and cuda_runtime_api.h +#include "helper_functions.h" + +// Includes +#include +#include +#include +#include + +#define MAX(a, b) ((a > b) ? 
a : b) + +#define USE_SIMPLE_FILTER 0 + +#define MAX_EPSILON_ERROR 5.0f +#define THRESHOLD 0.15f + +// Define the files that are to be saved and the reference images for validation +const char *sOriginal[] = {"teapot512_10.ppm", "teapot512_14.ppm", "teapot512_18.ppm", "teapot512_22.ppm", NULL}; + +const char *sReference[] = {"ref_10.ppm", "ref_14.ppm", "ref_18.ppm", "ref_22.ppm", NULL}; + +const char *image_filename = "teapot512.ppm"; +float sigma = 10.0f; +int order = 0; +int nthreads = 64; // number of threads per block + +unsigned int width, height; +unsigned int *h_img = NULL; +unsigned int *d_img = NULL; +unsigned int *d_temp = NULL; + +#ifndef NO_OPENGL +GLuint pbo = 0; // OpenGL pixel buffer object +GLuint texid = 0; // texture + +cudaGraphicsResource_t cuda_vbo_resource; +#endif + +StopWatchInterface *timer = 0; + +// Auto-Verification Code +const int frameCheckNumber = 4; +int fpsCount = 0; // FPS count for averaging +int fpsLimit = 1; // FPS limit for sampling +unsigned int frameCount = 0; + +int *pArgc = NULL; +char **pArgv = NULL; + +bool runBenchmark = false; + +const char *sSDKsample = "CUDA Recursive Gaussian"; + +extern "C" void transpose(unsigned int *d_src, unsigned int *d_dest, unsigned int width, int height); + +extern "C" void gaussianFilterRGBA(unsigned int *d_src, + unsigned int *d_dest, + unsigned int *d_temp, + int width, + int height, + float sigma, + int order, + int nthreads); + +void cleanup(); + +#ifndef NO_OPENGL +void computeFPS() +{ + frameCount++; + fpsCount++; + + if (fpsCount == fpsLimit) { + char fps[256]; + float ifps = 1.f / (sdkGetAverageTimerValue(&timer) / 1000.f); + sprintf(fps, "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps); + + glutSetWindowTitle(fps); + fpsCount = 0; + + fpsLimit = ftoi(MAX(ifps, 1.f)); + sdkResetTimer(&timer); + } +} + +// GLUT display callback: run the filter into the mapped PBO, draw it, update FPS +void display() +{ + sdkStartTimer(&timer); + + // execute filter, writing results to pbo + unsigned int *d_result; + 
checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0)); + size_t num_bytes; + checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_vbo_resource)); + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + + // unmap buffer object + checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0)); + + // load texture from pbo + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBindTexture(GL_TEXTURE_2D, texid); + glPixelStorei(GL_UNPACK_ALIGNMENT, 1); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + + // display results: draw the texture on a unit quad with the V coordinate flipped + glClear(GL_COLOR_BUFFER_BIT); + + glEnable(GL_TEXTURE_2D); + glDisable(GL_DEPTH_TEST); + + glBegin(GL_QUADS); + glTexCoord2f(0, 1); + glVertex2f(0, 0); + glTexCoord2f(1, 1); + glVertex2f(1, 0); + glTexCoord2f(1, 0); + glVertex2f(1, 1); + glTexCoord2f(0, 0); + glVertex2f(0, 1); + glEnd(); + + glDisable(GL_TEXTURE_2D); + glutSwapBuffers(); + + sdkStopTimer(&timer); + + computeFPS(); +} + +void idle() { glutPostRedisplay(); } +#endif + +void cleanup() +{ + sdkDeleteTimer(&timer); + + checkCudaErrors(cudaFree(d_img)); + checkCudaErrors(cudaFree(d_temp)); + +#ifndef NO_OPENGL + if (!runBenchmark) { + if (pbo) { + // unregister this buffer object with CUDA + checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource)); + glDeleteBuffers(1, &pbo); + } + + if (texid) { + glDeleteTextures(1, &texid); + } + } +#endif +} + +#ifndef NO_OPENGL +void keyboard(unsigned char key, int x, int y) +{ + switch (key) { + case 27: +#if defined(__APPLE__) || defined(MACOSX) + exit(EXIT_SUCCESS); +#else + glutDestroyWindow(glutGetWindow()); + return; +#endif + break; + + case '=': + sigma += 0.1f; + break; + + case '-': + sigma -= 0.1f; + + if (sigma < 0.0) { + sigma = 
0.0f; + } + + break; + + case '0': + order = 0; + break; + + case '1': + order = 1; + sigma = 0.5f; + break; + + case '2': + order = 2; + sigma = 0.5f; + break; + + default: + break; + } + + printf("sigma = %f\n", sigma); + glutPostRedisplay(); +} + +void reshape(int x, int y) +{ + glViewport(0, 0, x, y); + + glMatrixMode(GL_MODELVIEW); + glLoadIdentity(); + + glMatrixMode(GL_PROJECTION); + glLoadIdentity(); + glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0); +} + +void initGLBuffers() +{ + // create pixel buffer object to store final image (initialised from h_img) + glGenBuffers(1, &pbo); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo); + glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, h_img, GL_STREAM_DRAW_ARB); + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0); + checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, pbo, cudaGraphicsRegisterFlagsWriteDiscard)); + + // create texture for display + glGenTextures(1, &texid); + glBindTexture(GL_TEXTURE_2D, texid); + glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST); + glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST); + glBindTexture(GL_TEXTURE_2D, 0); +} + +void initGL(int *argc, char **argv) +{ + glutInit(argc, argv); + glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE); + glutInitWindowSize(width, height); + glutCreateWindow(sSDKsample); + glutDisplayFunc(display); + glutKeyboardFunc(keyboard); + glutReshapeFunc(reshape); + glutIdleFunc(idle); + +#if defined(__APPLE__) || defined(MACOSX) + atexit(cleanup); +#else + glutCloseFunc(cleanup); +#endif + + printf("Press '+' and '-' to change filter width\n"); + printf("0, 1, 2 - change filter order\n"); + + if (!isGLVersionSupported(2, 0) + || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) { + fprintf(stderr, "Required OpenGL extensions missing."); + exit(EXIT_FAILURE); + } +} +#endif + +void initCudaBuffers() 
+{ + unsigned int size = width * height * sizeof(unsigned int); + + // allocate device memory for the input image and a scratch buffer, then upload h_img + checkCudaErrors(cudaMalloc((void **)&d_img, size)); + checkCudaErrors(cudaMalloc((void **)&d_temp, size)); + + checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice)); + + sdkCreateTimer(&timer); +} + +void benchmark(int iterations) +{ + // allocate memory for result + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **)&d_result, size)); + + // warm-up run (completed before the timer is started) + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); + + // timed region: apply the filter `iterations` times + for (int i = 0; i < iterations; i++) { + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + } + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStopTimer(&timer); + + // check if kernel execution generated an error. NOTE(review): the throughput below multiplies width * height * iterations in unsigned int, which can wrap for large inputs + getLastCudaError("Kernel execution failed"); + + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height * iterations / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + + checkCudaErrors(cudaFree(d_result)); +} + +bool runSingleTest(const char *ref_file, const char *exec_path) +{ + // allocate memory for result + int nTotalErrors = 0; + unsigned int *d_result; + unsigned int size = width * height * sizeof(unsigned int); + checkCudaErrors(cudaMalloc((void **)&d_result, size)); + + // warm-up run (completed before the timer is started) + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + + checkCudaErrors(cudaDeviceSynchronize()); + sdkStartTimer(&timer); + + gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads); + checkCudaErrors(cudaDeviceSynchronize()); + getLastCudaError("Kernel execution failed"); + sdkStopTimer(&timer); + + unsigned char *h_result = (unsigned char *)malloc(width * height * 4); + 
checkCudaErrors(cudaMemcpy(h_result, d_result, width * height * 4, cudaMemcpyDeviceToHost)); + + char dump_file[1024]; + sprintf(dump_file, "teapot512_%02d.ppm", (int)sigma); + sdkSavePPM4ub(dump_file, h_result, width, height); + + if (!sdkComparePPM(dump_file, sdkFindFilePath(ref_file, exec_path), MAX_EPSILON_ERROR, THRESHOLD, false)) { + nTotalErrors++; + } + + printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); + printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6); + + checkCudaErrors(cudaFree(d_result)); + free(h_result); + + printf("Summary: %d errors!\n", nTotalErrors); + + printf(nTotalErrors == 0 ? "Test passed\n" : "Test failed!\n"); + return (nTotalErrors == 0); +} + +//////////////////////////////////////////////////////////////////////////////// +// Program main: parse flags, load the image, then run auto-test, benchmark or GUI +//////////////////////////////////////////////////////////////////////////////// +int main(int argc, char **argv) +{ + pArgc = &argc; + pArgv = argv; + char *ref_file = NULL; + +#if defined(__linux__) + setenv("DISPLAY", ":0", 0); +#endif + + printf("%s Starting...\n\n", sSDKsample); + + printf("NOTE: The CUDA Samples are not meant for performance measurements. 
" + "Results may vary when GPU Boost is enabled.\n\n"); + + // "file" flag: name of a reference image; selects automated-test mode and + // sets the FPS sampling limit to frameCheckNumber + if (argc > 1) { + if (checkCmdLineFlag(argc, (const char **)argv, "file")) { + getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file); + fpsLimit = frameCheckNumber; + } + } + + // Optional "image" flag overrides the default input image name + char *filename; + + if (getCmdLineArgumentString(argc, (const char **)argv, "image", &filename)) { + image_filename = filename; + } + + // load image + char *image_path = sdkFindFilePath(image_filename, argv[0]); + + if (image_path == NULL) { + fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename); + exit(EXIT_FAILURE); + } + + sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); + + if (!h_img) { + printf("Error unable to load PPM file: '%s'\n", image_path); + exit(EXIT_FAILURE); + } + + printf("Loaded '%s', %d x %d pixels\n", image_path, width, height); + + if (checkCmdLineFlag(argc, (const char **)argv, "threads")) { + nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads"); + } + + if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) { + sigma = getCmdLineArgumentFloat(argc, (const char **)argv, "sigma"); + } + + int iterations = 100; // Default iterations. NOTE(review): "threads" and "iterations" values are not range-checked here + if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) { + iterations = getCmdLineArgumentInt(argc, (const char **)argv, "iterations"); + } + + runBenchmark = checkCmdLineFlag(argc, (const char **)argv, "benchmark"); + +#ifdef NO_OPENGL + // Without OpenGL only benchmark mode can run: warn and force it on if it was not requested + if (!runBenchmark) { + fprintf(stderr, "\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, "WARNING: This binary was compiled without OpenGL support.\n"); + fprintf(stderr, "You must use the --benchmark flag to run this program.\n"); + fprintf(stderr, "========================================\n"); + fprintf(stderr, 
"\n"); + fprintf(stderr, "Auto-enabling benchmark mode...\n"); + fprintf(stderr, "\n"); + runBenchmark = true; + } +#endif + + int device; + struct cudaDeviceProp prop; + cudaGetDevice(&device); + cudaGetDeviceProperties(&prop, device); + + if (!strncmp("Tesla", prop.name, 5)) { + printf("Tesla card detected, running the test in benchmark mode (no OpenGL " + "display)\n"); + // device name starts with "Tesla": force benchmark mode + runBenchmark = true; + } + + // Benchmark or AutoTest mode detected, no OpenGL + if (runBenchmark == true || ref_file != NULL) { + findCudaDevice(argc, (const char **)argv); + } +#ifndef NO_OPENGL + else { + // First initialize OpenGL context, and then select CUDA device. + initGL(&argc, argv); + findCudaDevice(argc, (const char **)argv); + } +#else + else { + // This binary was compiled without OpenGL support. NOTE(review): looks unreachable, since NO_OPENGL builds force runBenchmark = true above + fprintf(stderr, "\n"); + fprintf(stderr, "ERROR: This binary was compiled without OpenGL support (NO_OPENGL defined).\n"); + fprintf(stderr, "You must run in benchmark mode using the --benchmark flag.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "Usage: %s --benchmark [--iterations N] [--image FILE]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "Example: %s --benchmark --iterations 1\n", argv[0]); + fprintf(stderr, "\n"); + exit(EXIT_FAILURE); + } +#endif + + initCudaBuffers(); + + if (ref_file) { + printf("(Automated Testing)\n"); + bool testPassed = runSingleTest(ref_file, argv[0]); + + cleanup(); + exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE); + } + + if (runBenchmark) { + printf("(Run Benchmark with %d iteration%s)\n", iterations, iterations == 1 ? 
"" : "s"); + benchmark(iterations); + + cleanup(); + exit(EXIT_SUCCESS); + } + +#ifndef NO_OPENGL + initGLBuffers(); + glutMainLoop(); +#else + // If we get here, something went wrong - we're not in benchmark mode + // but OpenGL is disabled + fprintf(stderr, "\n"); + fprintf(stderr, "ERROR: Reached interactive mode path, but this binary was compiled without OpenGL.\n"); + fprintf(stderr, "This should not happen. Please use --benchmark flag.\n"); + fprintf(stderr, "\n"); + cleanup(); + exit(EXIT_FAILURE); +#endif + + exit(EXIT_SUCCESS); +} diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu new file mode 100644 index 000000000..353c30019 --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu @@ -0,0 +1,155 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Recursive Gaussian filter + sgreen 8/1/08 + + This code sample implements a Gaussian blur using Deriche's recursive method: + http://citeseer.ist.psu.edu/deriche93recursively.html + + This is similar to the box filter sample in the SDK, but it uses the previous + outputs of the filter as well as the previous inputs. This is also known as an + IIR (infinite impulse response) filter, since its response to an input impulse + can last forever. + + The main advantage of this method is that the execution time is independent of + the filter width. + + The GPU processes columns of the image in parallel. To avoid uncoalesced reads + for the row pass we transpose the image and then transpose it back again + afterwards. + + The implementation is based on code from the CImg library: + http://cimg.sourceforge.net/ + Thanks to David Tschumperlé and all the CImg contributors! +*/ + +#include +#include "helper_cuda.h" +#include "helper_math.h" +#include +#include +#include + +#include "recursiveGaussian_kernel.cuh" + +#define USE_SIMPLE_FILTER 0 + +// Integer division a / b rounded up when there is a remainder +int iDivUp(int a, int b) { return (a % b != 0) ? 
(a / b + 1) : (a / b); } + +/* + Transpose a width x height 2D array from d_src into d_dest (see SDK transpose example) +*/ +extern "C" void transpose(uint *d_src, uint *d_dest, uint width, int height) +{ + dim3 grid(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); + dim3 threads(BLOCK_DIM, BLOCK_DIM, 1); + d_transpose<<>>(d_dest, d_src, width, height); + getLastCudaError("Kernel execution failed"); +} + +/* + Perform Gaussian filter on a 2D image using CUDA + + Parameters: + d_src - pointer to input image in device memory + d_dest - pointer to destination image in device memory + d_temp - pointer to temporary storage in device memory + width - image width + height - image height + sigma - sigma of Gaussian (values below 0.1 are clamped to 0.1) + order - filter order (0, 1 or 2); any other value prints an error and returns +*/ + +// 8-bit RGBA version +extern "C" void +gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, int width, int height, float sigma, int order, int nthreads) +{ + // compute filter coefficients + const float nsigma = sigma < 0.1f ? 0.1f : sigma, alpha = 1.695f / nsigma, ema = (float)std::exp(-alpha), + ema2 = (float)std::exp(-2 * alpha), b1 = -2 * ema, b2 = ema2; + + float a0 = 0, a1 = 0, a2 = 0, a3 = 0, coefp = 0, coefn = 0; + + switch (order) { + case 0: { + const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2); + a0 = k; + a1 = k * (alpha - 1) * ema; + a2 = k * (alpha + 1) * ema; + a3 = -k * ema2; + } break; + + case 1: { + const float k = (1 - ema) * (1 - ema) / ema; + a0 = k * ema; + a1 = a3 = 0; + a2 = -a0; + } break; + + case 2: { + const float ea = (float)std::exp(-alpha), k = -(ema2 - 1) / (2 * alpha * ema), + kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) / (3 * ea + 1 + 3 * ea * ea + ea * ea * ea)); + a0 = kn; + a1 = -kn * (1 + k * alpha) * ema; + a2 = kn * (1 - k * alpha) * ema; + a3 = -kn * ema2; + } break; + + default: + fprintf(stderr, "gaussianFilter: invalid order parameter!\n"); + return; + } + + coefp = (a0 + a1) / (1 + b1 + b2); + coefn = (a2 + a3) / (1 + b1 + b2); + +// process columns (d_src -> d_temp), then transpose d_temp into d_dest +#if 
USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<>>(d_src, d_temp, width, height, ema); +#else + d_recursiveGaussian_rgba<<>>( + d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, width, height); + getLastCudaError("transpose: Kernel execution failed"); + +// process rows (as columns of the transposed image, hence height and width are swapped) +#if USE_SIMPLE_FILTER + d_simpleRecursive_rgba<<>>(d_dest, d_temp, height, width, ema); +#else + d_recursiveGaussian_rgba<<>>( + d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn); +#endif + getLastCudaError("Kernel execution failed"); + + transpose(d_temp, d_dest, height, width); +} diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh new file mode 100644 index 000000000..4db2642c9 --- /dev/null +++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh @@ -0,0 +1,235 @@ +/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Recursive Gaussian filter +*/ + +#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_ +#define _RECURSIVEGAUSSIAN_KERNEL_CU_ + +#include +#include +#include +#include + +namespace cg = cooperative_groups; + +#include "helper_cuda.h" +#include "helper_math.h" + +#define BLOCK_DIM 16 +#define CLAMP_TO_EDGE 1 + +// Transpose kernel (see transpose CUDA Sample for details) +__global__ void d_transpose(uint *odata, uint *idata, int width, int height) +{ + // Handle to thread block group + cg::thread_block cta = cg::this_thread_block(); + + __shared__ uint block[BLOCK_DIM][BLOCK_DIM + 1]; + + // read the matrix tile into shared memory + unsigned int xIndex = blockIdx.x * BLOCK_DIM + threadIdx.x; + unsigned int yIndex = blockIdx.y * BLOCK_DIM + threadIdx.y; + + if ((xIndex < width) && (yIndex < height)) { + unsigned int index_in = yIndex * width + xIndex; + block[threadIdx.y][threadIdx.x] = idata[index_in]; + } + + cg::sync(cta); + + // write the transposed matrix tile to global memory + xIndex = blockIdx.y * BLOCK_DIM + threadIdx.x; + yIndex = blockIdx.x * BLOCK_DIM + threadIdx.y; + + if ((xIndex < height) && (yIndex < width)) { + unsigned int index_out = yIndex * height + xIndex; + odata[index_out] = block[threadIdx.x][threadIdx.y]; + } +} + +// RGBA version +// reads from 32-bit uint array holding 8-bit RGBA + +// convert floating point rgba color to 32-bit integer +__device__ uint rgbaFloatToInt(float4 rgba) +{ + rgba.x = 
__saturatef(rgba.x); // clamp to [0.0, 1.0] + rgba.y = __saturatef(rgba.y); + rgba.z = __saturatef(rgba.z); + rgba.w = __saturatef(rgba.w); + return (uint(rgba.w * 255) << 24) | (uint(rgba.z * 255) << 16) | (uint(rgba.y * 255) << 8) | uint(rgba.x * 255); +} + +// convert from 32-bit int to float4 +__device__ float4 rgbaIntToFloat(uint c) +{ + float4 rgba; + rgba.x = (c & 0xff) / 255.0f; + rgba.y = ((c >> 8) & 0xff) / 255.0f; + rgba.z = ((c >> 16) & 0xff) / 255.0f; + rgba.w = ((c >> 24) & 0xff) / 255.0f; + return rgba; +} + +/* + simple 1st order recursive filter + - processes one image column per thread + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a - blur parameter +*/ + +__global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + + if (x >= w) + return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 yp = rgbaIntToFloat(*id); // previous output + + for (int y = 0; y < h; y++) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a * (yp - xc); // simple lerp between current and previous value + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + yp = rgbaIntToFloat(*id); + + for (int y = h - 1; y >= 0; y--) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = xc + a * (yp - xc); + *od = rgbaFloatToInt((rgbaIntToFloat(*od) + yc) * 0.5f); + id -= w; + od -= w; // move to previous row + yp = yc; + } +} + +/* + recursive Gaussian filter + + parameters: + id - pointer to input data (RGBA image packed into 32-bit integers) + od - pointer to output data + w - image width + h - image height + a0-a3, b1, b2, coefp, coefn - filter parameters +*/ + +__global__ void 
d_recursiveGaussian_rgba(uint *id, + uint *od, + int w, + int h, + float a0, + float a1, + float a2, + float a3, + float b1, + float b2, + float coefp, + float coefn) +{ + unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; + + if (x >= w) + return; + + id += x; // advance pointers to correct column + od += x; + + // forward pass + float4 xp = make_float4(0.0f); // previous input + float4 yp = make_float4(0.0f); // previous output + float4 yb = make_float4(0.0f); // previous output by 2 +#if CLAMP_TO_EDGE + xp = rgbaIntToFloat(*id); + yb = coefp * xp; + yp = yb; +#endif + + for (int y = 0; y < h; y++) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a0 * xc + a1 * xp - b1 * yp - b2 * yb; + *od = rgbaFloatToInt(yc); + id += w; + od += w; // move to next row + xp = xc; + yb = yp; + yp = yc; + } + + // reset pointers to point to last element in column + id -= w; + od -= w; + + // reverse pass + // ensures response is symmetrical + float4 xn = make_float4(0.0f); + float4 xa = make_float4(0.0f); + float4 yn = make_float4(0.0f); + float4 ya = make_float4(0.0f); +#if CLAMP_TO_EDGE + xn = xa = rgbaIntToFloat(*id); + yn = coefn * xn; + ya = yn; +#endif + + for (int y = h - 1; y >= 0; y--) { + float4 xc = rgbaIntToFloat(*id); + float4 yc = a2 * xn + a3 * xa - b1 * yn - b2 * ya; + xa = xn; + xn = xc; + ya = yn; + yn = yc; + *od = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); + id -= w; + od -= w; // move to previous row + } +} + +#endif // #ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_ diff --git a/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py b/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py new file mode 100644 index 000000000..0ae69119e --- /dev/null +++ b/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py @@ -0,0 +1,284 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 The Newton Developers +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +########################################################################### +# Example Diffsim Ball +# +# Shows how to use Newton to optimize the initial velocity of a particle +# such that it bounces off the wall and floor in order to hit a target. +# +# This example uses the built-in wp.Tape() object to compute gradients of +# the distance to target (loss) w.r.t the initial velocity, followed by +# a simple gradient-descent optimization step. +# +# Command: python -m newton.examples diffsim_ball +# +########################################################################### +import numpy as np +import warp as wp + +import newton +import newton.examples +from newton.tests.unittest_utils import assert_np_equal +from newton.utils import bourke_color_map + + +@wp.kernel +def loss_kernel(pos: wp.array(dtype=wp.vec3), target: wp.vec3, loss: wp.array(dtype=float)): + # distance to target + delta = pos[0] - target + loss[0] = wp.dot(delta, delta) + + +@wp.kernel +def step_kernel(x: wp.array(dtype=wp.vec3), grad: wp.array(dtype=wp.vec3), alpha: float): + tid = wp.tid() + + # gradient descent step + x[tid] = x[tid] - grad[tid] * alpha + + +class Example: + def __init__(self, viewer, args): + # setup simulation parameters first + self.fps = 60 + self.frame = 0 + self.frame_dt = 1.0 / self.fps + self.sim_steps = 36 + self.sim_substeps = 8 + self.sim_dt = self.frame_dt / self.sim_substeps + + self.verbose = args.verbose + + self.train_iter = 0 + self.train_rate = 0.02 + self.target = (0.0, -2.0, 1.5) + self.loss = wp.zeros(1, dtype=wp.float32, requires_grad=True) + 
self.loss_history = [] + + self.viewer = viewer + self.viewer.show_particles = True + + # setup simulation scene + scene = newton.ModelBuilder(up_axis=newton.Axis.Z) + + scene.add_particle(pos=wp.vec3(0.0, -0.5, 1.0), vel=wp.vec3(0.0, 5.0, -5.0), mass=1.0) + + # add wall and ground plane + ke = 1.0e4 + kf = 0.0 + kd = 1.0e1 + mu = 0.2 + + scene.add_shape_box( + body=-1, + xform=wp.transform(wp.vec3(0.0, 2.0, 1.0), wp.quat_identity()), + hx=1.0, + hy=0.25, + hz=1.0, + cfg=newton.ModelBuilder.ShapeConfig(ke=ke, kf=kf, kd=kd, mu=mu), + ) + + scene.add_ground_plane(cfg=newton.ModelBuilder.ShapeConfig(ke=ke, kf=kf, kd=kd, mu=mu)) + + # finalize model + # use `requires_grad=True` to create a model for differentiable simulation + self.model = scene.finalize(requires_grad=True) + + self.model.soft_contact_ke = ke + self.model.soft_contact_kf = kf + self.model.soft_contact_kd = kd + self.model.soft_contact_mu = mu + self.model.soft_contact_restitution = 1.0 + + self.solver = newton.solvers.SolverSemiImplicit(self.model) + + # allocate sim states, initialize control and one-shot contacts (valid for simple collisions against constant plane) + self.states = [self.model.state() for _ in range(self.sim_steps * self.sim_substeps + 1)] + self.control = self.model.control() + + # Create collision pipeline (requires_grad for differentiable simulation) + self.collision_pipeline = newton.CollisionPipeline( + self.model, + broad_phase="explicit", + soft_contact_margin=10.0, + requires_grad=True, + ) + self.contacts = self.collision_pipeline.contacts() + self.collision_pipeline.collide(self.states[0], self.contacts) + + self.viewer.set_model(self.model) + + # capture forward/backward passes + self.capture() + + def capture(self): + # if wp.get_device().is_cuda: + # with wp.ScopedCapture() as capture: + # self.forward_backward() + # self.graph = capture.graph + # else: + self.graph = None + + def forward_backward(self): + self.tape = wp.Tape() + with self.tape: + self.forward() + 
self.tape.backward(self.loss) + + def forward(self): + # run simulation loop + for sim_step in range(self.sim_steps): + self.simulate(sim_step) + + # compute loss on final state + wp.launch(loss_kernel, dim=1, inputs=[self.states[-1].particle_q, self.target, self.loss]) + + return self.loss + + def simulate(self, sim_step): + for i in range(self.sim_substeps): + t = sim_step * self.sim_substeps + i + self.states[t].clear_forces() + self.solver.step(self.states[t], self.states[t + 1], self.control, self.contacts, self.sim_dt) + + def step(self): + if self.graph: + wp.capture_launch(self.graph) + else: + self.forward_backward() + + x = self.states[0].particle_qd + + if self.verbose: + print(f"Train iter: {self.train_iter} Loss: {self.loss}") + print(f" x: {x} g: {x.grad}") + + # gradient descent step + wp.launch(step_kernel, dim=len(x), inputs=[x, x.grad, self.train_rate]) + + # clear grads for next iteration + self.tape.zero() + + self.train_iter += 1 + self.loss_history.append(self.loss.numpy()[0]) + + def test_final(self): + x_grad_numeric, x_grad_analytic = self.check_grad() + assert_np_equal(x_grad_numeric, x_grad_analytic, tol=5e-2) + assert all(np.array(self.loss_history) < 10.0) + # skip the last loss because there could be some bouncing around the optimum + assert all(np.diff(self.loss_history[:-1]) < -1e-3) + + def render(self): + if self.viewer.is_paused(): + self.viewer.begin_frame(self.viewer.time) + self.viewer.end_frame() + return + + if self.frame > 0 and self.train_iter % 16 != 0: + return + + # draw trajectory + traj_verts = [self.states[0].particle_q.numpy()[0].tolist()] + + for i in range(self.sim_steps + 1): + state = self.states[i * self.sim_substeps] + traj_verts.append(state.particle_q.numpy()[0].tolist()) + + self.viewer.begin_frame(self.frame * self.frame_dt) + self.viewer.log_scalar("/loss", self.loss.numpy()[0]) + self.viewer.log_state(state) + self.viewer.log_contacts(self.contacts, state) + self.viewer.log_shapes( + "/target", + 
newton.GeoType.BOX, + (0.1, 0.1, 0.1), + wp.array([wp.transform(self.target, wp.quat_identity())], dtype=wp.transform), + wp.array([wp.vec3(0.0, 0.0, 0.0)], dtype=wp.vec3), + ) + self.viewer.log_lines( + f"/traj_{self.train_iter - 1}", + wp.array(traj_verts[0:-1], dtype=wp.vec3), + wp.array(traj_verts[1:], dtype=wp.vec3), + bourke_color_map(0.0, 7.0, self.loss.numpy()[0]), + ) + self.viewer.end_frame() + + self.frame += 1 + + def check_grad(self): + param = self.states[0].particle_qd + + # initial value + x_c = param.numpy().flatten() + + # compute numeric gradient + x_grad_numeric = np.zeros_like(x_c) + + for i in range(len(x_c)): + eps = 1.0e-3 + + step = np.zeros_like(x_c) + step[i] = eps + + x_1 = x_c + step + x_0 = x_c - step + + param.assign(x_1) + l_1 = self.forward().numpy()[0] + + param.assign(x_0) + l_0 = self.forward().numpy()[0] + + dldx = (l_1 - l_0) / (eps * 2.0) + + x_grad_numeric[i] = dldx + + # reset initial state + param.assign(x_c) + + # compute analytic gradient + tape = wp.Tape() + with tape: + l = self.forward() + + tape.backward(l) + + x_grad_analytic = param.grad.numpy()[0].copy() + + print(f"numeric grad: {x_grad_numeric}") + print(f"analytic grad: {x_grad_analytic}") + + tape.zero() + + return x_grad_numeric, x_grad_analytic + + @staticmethod + def create_parser(): + parser = newton.examples.create_parser() + parser.add_argument( + "--verbose", action="store_true", help="Print out additional status messages during execution." 
+ ) + return parser + + +if __name__ == "__main__": + parser = Example.create_parser() + viewer, args = newton.examples.init(parser) + + example = Example(viewer, args) + example.check_grad() + newton.examples.run(example, args) diff --git a/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py b/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py new file mode 100644 index 000000000..485c865f0 --- /dev/null +++ b/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py @@ -0,0 +1,206 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 The Newton Developers +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +########################################################################### +# Example Robot Cartpole +# +# Shows how to set up a simulation of a rigid-body cartpole articulation +# from a USD stage using newton.ModelBuilder.add_usd(). 
+# +# Command: python -m newton.examples robot_cartpole --world-count 100 +# +########################################################################### + +import warp as wp + +import newton +import newton.examples + + +class Example: + def __init__(self, viewer, args): + self.fps = 60 + self.frame_dt = 1.0 / self.fps + self.sim_time = 0.0 + self.sim_substeps = 10 + self.sim_dt = self.frame_dt / self.sim_substeps + + self.world_count = args.world_count + + self.viewer = viewer + + cartpole = newton.ModelBuilder() + newton.solvers.SolverMuJoCo.register_custom_attributes(cartpole) + cartpole.default_shape_cfg.density = 100.0 + cartpole.default_joint_cfg.armature = 0.1 + cartpole.default_body_armature = 0.1 + + cartpole.add_usd( + newton.examples.get_asset("cartpole.usda"), + enable_self_collisions=False, + collapse_fixed_joints=True, + ) + # set initial joint positions + cartpole.joint_q[-3:] = [0.0, 0.3, 0.0] + + builder = newton.ModelBuilder() + builder.replicate(cartpole, self.world_count, spacing=(1.0, 2.0, 0.0)) + + # finalize model + self.model = builder.finalize() + + self.solver = newton.solvers.SolverMuJoCo(self.model) + # self.solver = newton.solvers.SolverSemiImplicit(self.model, joint_attach_ke=1600.0, joint_attach_kd=20.0) + # self.solver = newton.solvers.SolverFeatherstone(self.model) + + self.state_0 = self.model.state() + self.state_1 = self.model.state() + self.control = self.model.control() + # we do not need to evaluate contacts for this example + self.contacts = None + + # Evaluating forward kinematics is needed only for maximal-coordinate solvers + newton.eval_fk(self.model, self.model.joint_q, self.model.joint_qd, self.state_0) + + self.viewer.set_model(self.model) + self.viewer.set_world_offsets((0.0, 0.0, 0.0)) + + # Set camera to view all the cartpoles + self.viewer.set_camera( + pos=wp.vec3(7.3, -14.0, 2.3), + pitch=-5.0, + yaw=-225.0, + ) + if hasattr(self.viewer, "camera") and hasattr(self.viewer.camera, "fov"): + self.viewer.camera.fov = 
90.0 + + self.capture() + + def capture(self): + self.graph = None + # if wp.get_device().is_cuda: + # with wp.ScopedCapture() as capture: + # self.simulate() + # self.graph = capture.graph + + def simulate(self): + for _ in range(self.sim_substeps): + self.state_0.clear_forces() + + # apply forces to the model for picking, wind, etc + self.viewer.apply_forces(self.state_0) + + self.solver.step(self.state_0, self.state_1, self.control, self.contacts, self.sim_dt) + + # swap states + self.state_0, self.state_1 = self.state_1, self.state_0 + + def step(self): + if self.graph: + wp.capture_launch(self.graph) + else: + self.simulate() + + self.sim_time += self.frame_dt + + def render(self): + self.viewer.begin_frame(self.sim_time) + self.viewer.log_state(self.state_0) + self.viewer.end_frame() + + def test_final(self): + num_bodies_per_world = self.model.body_count // self.world_count + newton.examples.test_body_state( + self.model, + self.state_0, + "cart is at ground level and has correct orientation", + lambda q, qd: q[2] == 0.0 and newton.math.vec_allclose(q.q, wp.quat_identity()), + indices=[i * num_bodies_per_world for i in range(self.world_count)], + ) + newton.examples.test_body_state( + self.model, + self.state_0, + "cart only moves along y direction", + lambda q, qd: qd[0] == 0.0 + and abs(qd[1]) > 0.05 + and qd[2] == 0.0 + and wp.length_sq(wp.spatial_bottom(qd)) == 0.0, + indices=[i * num_bodies_per_world for i in range(self.world_count)], + ) + newton.examples.test_body_state( + self.model, + self.state_0, + "pole1 only has y-axis linear velocity and x-axis angular velocity", + lambda q, qd: qd[0] == 0.0 + and abs(qd[1]) > 0.05 + and qd[2] == 0.0 + and abs(qd[3]) > 0.3 + and qd[4] == 0.0 + and qd[5] == 0.0, + indices=[i * num_bodies_per_world + 1 for i in range(self.world_count)], + ) + newton.examples.test_body_state( + self.model, + self.state_0, + "pole2 only has yz-plane linear velocity and x-axis angular velocity", + lambda q, qd: qd[0] == 0.0 + and 
abs(qd[1]) > 0.05 + and abs(qd[2]) > 0.05 + and abs(qd[3]) > 0.2 + and qd[4] == 0.0 + and qd[5] == 0.0, + indices=[i * num_bodies_per_world + 2 for i in range(self.world_count)], + ) + qd = self.state_0.body_qd.numpy() + world0_cart_vel = wp.spatial_vector(*qd[0]) + world0_pole1_vel = wp.spatial_vector(*qd[1]) + world0_pole2_vel = wp.spatial_vector(*qd[2]) + newton.examples.test_body_state( + self.model, + self.state_0, + "cart velocities match across worlds", + lambda q, qd: newton.math.vec_allclose(qd, world0_cart_vel), + indices=[i * num_bodies_per_world for i in range(self.world_count)], + ) + newton.examples.test_body_state( + self.model, + self.state_0, + "pole1 velocities match across worlds", + lambda q, qd: newton.math.vec_allclose(qd, world0_pole1_vel), + indices=[i * num_bodies_per_world + 1 for i in range(self.world_count)], + ) + newton.examples.test_body_state( + self.model, + self.state_0, + "pole2 velocities match across worlds", + lambda q, qd: newton.math.vec_allclose(qd, world0_pole2_vel, atol=1e-6), + indices=[i * num_bodies_per_world + 2 for i in range(self.world_count)], + ) + + @staticmethod + def create_parser(): + parser = newton.examples.create_parser() + newton.examples.add_world_count_arg(parser) + parser.set_defaults(world_count=100) + return parser + + +if __name__ == "__main__": + parser = Example.create_parser() + viewer, args = newton.examples.init(parser) + + example = Example(viewer, args) + + newton.examples.run(example, args) diff --git a/src/cuda/HPC/newton/setup_newton.sh b/src/cuda/HPC/newton/setup_newton.sh new file mode 100755 index 000000000..7ec1090f4 --- /dev/null +++ b/src/cuda/HPC/newton/setup_newton.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Setup Newton environment (similar to huggingface/setup_environment.sh) + +NEWTON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +NEWTON_SUBMODULE="$NEWTON_DIR/../external/newton" + +# Create Python virtual environment if it doesn't exist +if [ ! 
-d "$NEWTON_DIR/newton_venv" ]; then + echo "Creating Python virtual environment for Newton..." + python3 -m venv "$NEWTON_DIR/newton_venv" + source "$NEWTON_DIR/newton_venv/bin/activate" + + # Install Newton and dependencies + pip install --upgrade pip + if [ -d "$NEWTON_SUBMODULE" ]; then + echo "Installing Newton from submodule..." + pip install -e "$NEWTON_SUBMODULE" + # Install additional dependencies for robot and USD examples + pip install usd-core mujoco-warp numpy warp-lang + else + echo "WARNING: Newton submodule not found at $NEWTON_SUBMODULE" + echo "Run: git submodule update --init --recursive" + fi +else + source "$NEWTON_DIR/newton_venv/bin/activate" +fi + +export NEWTON_ENV="$NEWTON_DIR/newton_venv" +export PYTHONPATH="$NEWTON_SUBMODULE:$PYTHONPATH" diff --git a/src/cuda/HPC/setup_vpi.sh b/src/cuda/HPC/setup_vpi.sh new file mode 100755 index 000000000..1f38c4f52 --- /dev/null +++ b/src/cuda/HPC/setup_vpi.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# VPI Setup Script +# Installs VPI library if not already present +# Requires sudo for system-wide installation + +set -e + +echo "VPI Setup Script" +echo "================" + +# Check if VPI is already installed +if command -v vpi-config &> /dev/null; then + VPI_VERSION=$(vpi-config --version) + VPI_PATH=$(vpi-config --prefix) + echo "VPI already installed: version $VPI_VERSION at $VPI_PATH" + exit 0 +fi + +# Check if running on x86_64 Linux +if [[ "$(uname -m)" != "x86_64" ]] || [[ "$(uname -s)" != "Linux" ]]; then + echo "ERROR: VPI installation is only supported on Linux x86_64" + exit 1 +fi + +# Detect Ubuntu version +if [[ -f /etc/os-release ]]; then + . /etc/os-release + UBUNTU_VERSION=$VERSION_ID +else + echo "ERROR: Cannot detect Ubuntu version" + exit 1 +fi + +echo "Detected Ubuntu $UBUNTU_VERSION" + +# Check for sudo +if ! command -v sudo &> /dev/null; then + echo "ERROR: sudo is required for VPI installation" + exit 1 +fi + +echo "" +echo "Installing VPI via apt..." 
+echo "This requires sudo privileges and will install system packages." +echo "" + +# Install prerequisites (curl and ca-certificates are needed below to fetch the repository key over HTTPS) +sudo apt-get update +sudo apt-get install -y curl ca-certificates gnupg software-properties-common + +# Add NVIDIA repository key (modern method); --yes overwrites a keyring left by an earlier, partial run +sudo mkdir -p /etc/apt/keyrings +curl -fsSL https://repo.download.nvidia.com/jetson/jetson-ota-public.asc | sudo gpg --dearmor --yes -o /etc/apt/keyrings/nvidia-jetson.gpg + +# Add repository based on Ubuntu version +if [[ "$UBUNTU_VERSION" == "22.04" ]]; then + echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/jammy r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list +elif [[ "$UBUNTU_VERSION" == "24.04" ]]; then + echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/noble r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list +else + echo "WARNING: Ubuntu $UBUNTU_VERSION not officially supported. Trying jammy repository..." + echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/jammy r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list +fi + +# Install VPI packages +sudo apt-get update +sudo apt-get install -y libnvvpi4 vpi4-dev vpi4-samples + +# Detect Python version and install Python bindings +PYTHON_VERSION=$(python3 --version | awk '{print $2}' | cut -d. -f1,2) +if [[ "$PYTHON_VERSION" == "3.10" ]]; then + sudo apt-get install -y python3.10-vpi4 +elif [[ "$PYTHON_VERSION" == "3.12" ]]; then + sudo apt-get install -y python3.12-vpi4 +else + echo "WARNING: Python VPI bindings not available for Python $PYTHON_VERSION" + echo "Supported versions: 3.10, 3.12" +fi + +echo "" +echo "VPI installation complete!" 
+echo "VPI installed at: $(vpi-config --prefix)" +echo "VPI version: $(vpi-config --version)" diff --git a/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp b/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp new file mode 100644 index 000000000..9364f7eed --- /dev/null +++ b/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp @@ -0,0 +1,276 @@ +/* +* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +* +* Redistribution and use in source and binary forms, with or without +* modification, are permitted provided that the following conditions +* are met: +* * Redistributions of source code must retain the above copyright +* notice, this list of conditions and the following disclaimer. +* * Redistributions in binary form must reproduce the above copyright +* notice, this list of conditions and the following disclaimer in the +* documentation and/or other materials provided with the distribution. +* * Neither the name of NVIDIA CORPORATION nor the names of its +* contributors may be used to endorse or promote products derived +* from this software without specific prior written permission. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CHECK_STATUS(STMT) \ + do \ + { \ + VPIStatus status = (STMT); \ + if (status != VPI_SUCCESS) \ + { \ + char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH]; \ + vpiGetLastStatusMessage(buffer, sizeof(buffer)); \ + std::ostringstream ss; \ + ss << vpiStatusGetName(status) << ": " << buffer; \ + throw std::runtime_error(ss.str()); \ + } \ + } while (0) // no trailing ';' here: the caller's ';' completes the statement, keeping the macro safe in unbraced if/else + +int main(int argc, char *argv[]) +{ + // OpenCV image that will be wrapped by a VPIImage. + // Define it here so that it's destroyed *after* wrapper is destroyed + cv::Mat cvCurFrame; + + // VPI objects that will be used + VPIStream stream = NULL; + VPIImage imgCurFrame = NULL; + VPIImage bgimage = NULL; + VPIImage fgmask = NULL; + VPIPayload payload = NULL; + + int retval = 0; + + try + { + // Parse named arguments + std::string strBackend; + std::string strInputVideo; + int numFramesToProcess = -1; // -1 means process all frames + + for (int i = 1; i < argc; i++) + { + std::string arg = argv[i]; + + if (arg == "--backend" || arg == "-b") + { + if (i + 1 < argc) + { + strBackend = argv[++i]; + } + else + { + throw std::runtime_error("--backend requires a value"); + } + } + else if (arg == "--input" || arg == "-i") + { + if (i + 1 < argc) + { + strInputVideo = argv[++i]; + } + else + { + throw std::runtime_error("--input requires a value"); + } + } + else if (arg == "--num-frames" || arg == "-n") + { + if (i + 1 < argc) + { + numFramesToProcess = std::atoi(argv[++i]); + if (numFramesToProcess <= 0) + { + throw std::runtime_error("--num-frames must be a positive integer"); + } + } + else + { + throw std::runtime_error("--num-frames requires a value"); + } + } + else if (arg == "--help" || arg == "-h") + { + std::cout << "Usage: " << argv[0] << " [OPTIONS]\n" + << "Options:\n" + << " --backend, -b Backend to use (required)\n" + << " --input, -i Input video 
file (required)\n" + << " --num-frames, -n Number of frames to process (optional, default: all)\n" + << " --help, -h Show this help message\n"; + return 0; + } + else + { + throw std::runtime_error("Unknown argument: " + arg); + } + } + + // Validate required arguments + if (strBackend.empty()) + { + throw std::runtime_error("--backend is required\n\nUse --help for usage information"); + } + if (strInputVideo.empty()) + { + throw std::runtime_error("--input is required\n\nUse --help for usage information"); + } + + VPIBackend backend; + if (strBackend == "cpu") + { + backend = VPI_BACKEND_CPU; + } + else if (strBackend == "cuda") + { + backend = VPI_BACKEND_CUDA; + } + else + { + throw std::runtime_error("Backend '" + strBackend + "' not recognized."); + } + + // Load the input video + cv::VideoCapture invid; + if (!invid.open(strInputVideo)) + { + throw std::runtime_error("Can't open '" + strInputVideo + "'"); + } + + int32_t width = invid.get(cv::CAP_PROP_FRAME_WIDTH); + int32_t height = invid.get(cv::CAP_PROP_FRAME_HEIGHT); + + // Create the stream where processing will happen. We'll use user-provided backend. + CHECK_STATUS(vpiStreamCreate(backend, &stream)); + + // Create background subtractor payload to be executed on the given backend + // OpenCV delivers us BGR8 images, so the algorithm is configured to accept that. 
+ CHECK_STATUS(vpiCreateBackgroundSubtractor(backend, width, height, VPI_IMAGE_FORMAT_BGR8, &payload)); + + // Create foreground image + CHECK_STATUS(vpiImageCreate(width, height, VPI_IMAGE_FORMAT_U8, 0, &fgmask)); + + // Create background image + CHECK_STATUS(vpiImageCreate(width, height, VPI_IMAGE_FORMAT_BGR8, 0, &bgimage)); + + int fourcc = cv::VideoWriter::fourcc('M', 'P', 'E', 'G'); + double fps = invid.get(cv::CAP_PROP_FPS); + + cv::VideoWriter outVideo("fgmask_" + strBackend + ".mp4", fourcc, fps, cv::Size(width, height), false); + if (!outVideo.isOpened()) // foreground-mask writer (single-channel, hence isColor=false) + { + throw std::runtime_error("Can't create output video"); + } + + cv::VideoWriter bgimageVideo("bgimage_" + strBackend + ".mp4", fourcc, fps, cv::Size(width, height)); + if (!bgimageVideo.isOpened()) // check the background-image writer itself, not outVideo again + { + throw std::runtime_error("Can't create output video"); + } + + // Fetch a new frame until video ends or desired frame count is reached + int idxFrame = 1; + + while (invid.read(cvCurFrame)) + { + // Check if we've reached the desired number of frames + if (numFramesToProcess > 0 && idxFrame > numFramesToProcess) + { + printf("Processed %d frames (limit reached)\n", idxFrame - 1); + break; + } + + printf("Processing frame %d\n", idxFrame++); + // Wrap frame into a VPIImage + if (imgCurFrame == NULL) + { + CHECK_STATUS(vpiImageCreateWrapperOpenCVMat(cvCurFrame, 0, &imgCurFrame)); + } + else + { + CHECK_STATUS(vpiImageSetWrappedOpenCVMat(imgCurFrame, cvCurFrame)); + } + + VPIBackgroundSubtractorParams params; + CHECK_STATUS(vpiInitBackgroundSubtractorParams(&params)); // start from library defaults, then override + params.learningRate = 0.01; + + CHECK_STATUS( + vpiSubmitBackgroundSubtractor(stream, backend, payload, imgCurFrame, fgmask, bgimage, &params)); + + // Wait for processing to finish. 
+ CHECK_STATUS(vpiStreamSync(stream)); + + { + // Now add it to the output video stream + VPIImageData imgdata; + CHECK_STATUS(vpiImageLockData(fgmask, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata)); + + cv::Mat outFrame; + CHECK_STATUS(vpiImageDataExportOpenCVMat(imgdata, &outFrame)); + + outVideo << outFrame; + + CHECK_STATUS(vpiImageUnlock(fgmask)); + } + + { + VPIImageData bgdata; + CHECK_STATUS(vpiImageLockData(bgimage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &bgdata)); + + cv::Mat outFrame; + CHECK_STATUS(vpiImageDataExportOpenCVMat(bgdata, &outFrame)); + + bgimageVideo << outFrame; + + CHECK_STATUS(vpiImageUnlock(bgimage)); + } + } + } + catch (std::exception &e) + { + std::cerr << e.what() << std::endl; + retval = 1; + } + + // Destroy all resources used + vpiStreamDestroy(stream); + vpiPayloadDestroy(payload); + + vpiImageDestroy(imgCurFrame); + vpiImageDestroy(fgmask); + vpiImageDestroy(bgimage); + + return retval; +} diff --git a/src/cuda/common/common.mk b/src/cuda/common/common.mk index cd5f3cefc..14885532b 100644 --- a/src/cuda/common/common.mk +++ b/src/cuda/common/common.mk @@ -21,7 +21,9 @@ .SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx INCLUDES += -I$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/common/inc -ADDITIONAL_LIBS += -L$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/lib -lcutil_x86_64 +ifneq ($(OMIT_CUTIL_LIB),1) + ADDITIONAL_LIBS += -L$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/lib -lcutil_x86_64 +endif # Add new SM Versions here as devices with new Compute Capability are released SM_VERSIONS := 70 75 @@ -56,6 +58,9 @@ BINDIR ?= $(ROOTBINDIR)/$(OSLOWER) ROOTOBJDIR ?= obj LIBDIR ?= $(ROOTDIR)/../lib +# Set CUDA_VERSION_MAJOR if not already set +CUDA_VERSION_MAJOR ?= 12 + ifeq ($(shell test ${CUDA_VERSION_MAJOR} -lt 5; echo $$?), 0) LIBDIRSDK := $(NVIDIA_COMPUTE_SDK_LOCATION)/C/lib COMMONDIR := $(NVIDIA_COMPUTE_SDK_LOCATION)/C/common diff --git a/src/cuda/cutlass-bench 
b/src/cuda/cutlass-bench index e51efbfe1..291300fff 160000 --- a/src/cuda/cutlass-bench +++ b/src/cuda/cutlass-bench @@ -1 +1 @@ -Subproject commit e51efbfe18fe4f4cbb66ab814c55bf4aa0185491 +Subproject commit 291300ffffa3533a78ee104f08a8490a29ce9ccb diff --git a/src/cuda/pytorch_examples b/src/cuda/pytorch_examples index 30d1a73e5..1bef748fa 160000 --- a/src/cuda/pytorch_examples +++ b/src/cuda/pytorch_examples @@ -1 +1 @@ -Subproject commit 30d1a73e5ab3bb92e5834dc11805e53ee8aee2c7 +Subproject commit 1bef748fab064e2fc3beddcbda60fd51cb9612d2 diff --git a/src/setup_environment b/src/setup_environment index 21b42af72..29a1dbba4 100755 --- a/src/setup_environment +++ b/src/setup_environment @@ -208,4 +208,11 @@ else fi fi +# Check for VPI (optional for H100 VPI apps) +if command -v vpi-config &> /dev/null; then + export VPI_INSTALL_PATH=$(vpi-config --prefix) + export VPI_VERSION=$(vpi-config --version) + echo "VPI found: version $VPI_VERSION at $VPI_INSTALL_PATH" +fi + export GPUAPPS_SETUP_ENVIRONMENT_WAS_RUN=1