diff --git a/.gitignore b/.gitignore
index b729af5d6..80ad9627a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,15 @@ tmp/
 src/cuda/GPU_Microbenchmark/ubench/**/*
 !src/cuda/GPU_Microbenchmark/ubench/**/*/
 !src/cuda/GPU_Microbenchmark/ubench/**/*.*
-!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
\ No newline at end of file
+!src/cuda/GPU_Microbenchmark/ubench/**/Makefile
+
+# Ignore compiled CUDA binaries
+*.fatbin
+
+# Ignore VPI symlinks but keep vpi_background_subtractor/main.cpp visible to git.
+# Git cannot re-include a file whose parent directory is ignored, so the
+# directory is un-ignored first and its contents are then ignored again.
+src/cuda/HPC/vpi/*
+!src/cuda/HPC/vpi/vpi_background_subtractor/
+src/cuda/HPC/vpi/vpi_background_subtractor/*
+!src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index a26c652f2..e48c7a85a 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -11,3 +11,9 @@
 	path = src/cuda/pytorch_examples
 	url = https://github.com/accel-sim/pytorch_examples.git
 	branch = inference_accelsim_v2
+[submodule "src/cuda/HPC/external/cugraph"]
+	path = src/cuda/HPC/external/cugraph
+	url = https://github.com/rapidsai/cugraph.git
+[submodule "src/cuda/HPC/external/newton"]
+	path = src/cuda/HPC/external/newton
+	url = https://github.com/newton-physics/newton.git
diff --git a/README.md b/README.md
index 09db7bfda..02a0fd9de 100644
--- a/README.md
+++ b/README.md
@@ -68,3 +68,16 @@ To compile everything above for AccelWattch:
 ```
 make accelwattch -C ./src
 ```
+
+## H100 Benchmark Suite
+
+The H100 suite contains 15 modern GPU workloads from H100 profiling and analysis:
+
+- **cuFFT** (2 apps): FFT operations using cuFFT library
+- **cuSolver** (2 apps): Linear algebra using cuSolver library
+- **Image Processing** (3 apps): Wavelet transform, Gaussian filter, FDTD3d
+- **Graph Algorithms** (2 apps): BFS and MST using cuGraph (git submodule)
+- **Physics Simulation** (3 apps): Newton physics engine benchmarks (git submodule)
+- **Computer Vision** (3 apps): VPI-based vision processing (requires VPI 4.0)
+
+See [src/cuda/HPC/README.md](src/cuda/HPC/README.md) for details.
diff --git a/get_data.sh b/get_data.sh
index 77ce877ae..8654f9759 100755
--- a/get_data.sh
+++ b/get_data.sh
@@ -10,3 +10,19 @@ if [ ! -d $DATA_ROOT ]; then
     tar xzvf all.gpgpu-sim-app-data.tgz -C $BASH_ROOT
     rm all.gpgpu-sim-app-data.tgz
 fi
+
+# Generate HPC benchmark inputs. Every generator is best-effort: a missing
+# script is skipped and a failing one only prints a warning.
+echo "Generating HPC benchmark data..."
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_graph_data.sh" || echo "Warning: Graph data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_image_data.sh" || echo "Warning: Image data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_dwt_data.sh" || echo "Warning: DWT data generation failed"
+fi
+if [ -f "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" ]; then
+    bash "$BASH_ROOT/src/cuda/HPC/get_vpi_data.sh" || echo "Warning: VPI data generation failed"
+fi
diff --git a/src/Makefile b/src/Makefile
index a890d58f7..acb019d08 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -4,7 +4,7 @@ $(error You must run "source setup_environment before calling make")
 endif
 
 ifeq ($(CUDA_GT_7), 1)
-all:  GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface # mlperf_training
+all:  GPU_Microbenchmark microbench rodinia_2.0-ft cutlass rodinia-3.1 pannotia proxy-apps ispass-2009 lonestargpu-2.0 polybench custom_apps heterosync cuda_samples mlperf_inference vllm huggingface hpc # mlperf_training
 else
 ifeq ($(CUDA_GT_4), 1)
 all:   pannotia rodinia_2.0-ft proxy-apps dragon-naive microbench rodinia-3.1 ispass-2009 dragon-cdp lonestargpu-2.0 polybench parboil shoc custom_apps
@@ -19,7 +19,7 @@ accelwattch_hw_power: rodinia-3.1_hw_power parboil_hw_power cuda_samples-11.0_hw
 #Disable clean for now, It has a bug!
 # clean_dragon-naive clean_pannotia clean_proxy-apps
 
-clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp  clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark
+clean: clean_mlperf_inference clean_rodinia_2.0-ft clean_dragon-cdp  clean_ispass-2009 clean_lonestargpu-2.0 clean_custom_apps clean_parboil clean_cutlass clean_rodinia-3.1 clean_heterosync clean_UVMSmart_test clean_cuda_samples clean_huggingface clean_GPU_Microbenchmark clean_hpc
 clean_accelwattch: clean_rodinia-3.1 clean_parboil clean_cutlass clean_cuda_samples-11.0 clean_cuda_samples_hw_power clean_rodinia-3.1_hw_power clean_parboil_hw_power clean_accelwattch_ubench
 
 clean_data:
@@ -543,6 +543,52 @@ clean_heterosync:
 clean_cutlass:
 	rm -rf cuda/cutlass-bench/build
 
+###############################################################################
+# Modern HPC Benchmarks - (CUDA 11+, sm_75+)
+#
+# hpc: syncs/updates the git submodules, builds the simple, Newton, VPI and graph
+# apps, and installs the Newton launchers and VPI/graph binaries in $(BINDIR)/$(BINSUBDIR).
+###############################################################################
+hpc:
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/
+# Remove submodule directories that exist but carry no .git entry so that the
+# update below can recreate them. The test uses -e because a checked-out
+# submodule normally has a .git *file* (gitlink); -d would delete healthy ones.
+	@for dir in cuda/cuda-samples cuda/HPC/external/cugraph cuda/HPC/external/newton; do \
+		if [ -d "$$dir" ] && [ ! -e "$$dir/.git" ]; then \
+			echo "Removing corrupted submodule directory: $$dir"; \
+			rm -rf "$$dir"; \
+		fi; \
+	done
+	git submodule sync && git submodule update --init --recursive
+# Simple apps (cuFFT, cuSolver, image processing).
+	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC simple
+# Newton apps: copy the sources and the submodule next to the binaries, run the
+# setup script once, then write one launcher script per example.
+	cp -r cuda/HPC/newton $(BINDIR)/$(BINSUBDIR)/
+	mkdir -p $(BINDIR)/$(BINSUBDIR)/external
+	cp -r cuda/HPC/external/newton $(BINDIR)/$(BINSUBDIR)/external/
+	bash $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh
+	echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/diffsim_ball/example_diffsim_ball.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
+	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_diffsim_ball
+	echo 'source $(BINDIR)/$(BINSUBDIR)/newton/setup_newton.sh && python3 $(BINDIR)/$(BINSUBDIR)/newton/robot_cartpole/example_robot_cartpole.py "$$@"' > $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole
+	chmod u+x $(BINDIR)/$(BINSUBDIR)/newton_robot_cartpole
+# VPI apps: built in-tree from symlinked sample sources, then copied out.
+	$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC vpi
+	cp cuda/HPC/vpi/vpi_background_subtractor/build/vpi_sample_14_background_subtractor $(BINDIR)/$(BINSUBDIR)/vpi_background_subtractor
+	cp cuda/HPC/vpi/vpi_orb_feature_detector/build/vpi_sample_18_orb_feature_detector $(BINDIR)/$(BINSUBDIR)/vpi_orb_feature_detector
+	cp cuda/HPC/vpi/vpi_stereo_disparity/build/vpi_sample_02_stereo_disparity $(BINDIR)/$(BINSUBDIR)/vpi_stereo_disparity
+# Graph apps (cuGraph submodule). The leading '-' makes this step best-effort:
+# a failed cuGraph build does not fail the whole hpc target.
+	-$(SETENV) $(MAKE) $(MAKE_ARGS) -C cuda/HPC graph && \
+		cp cuda/HPC/graph/mst_standalone/build/mst_standalone $(BINDIR)/$(BINSUBDIR)/
+
+# Invoked by `clean` as clean_hpc; the clean_HPC spelling is kept as an alias.
+clean_hpc clean_HPC:
+	$(SETENV) $(MAKE) -C cuda/HPC clean
+	rm -rf cuda/HPC/external/cugraph/build cuda/HPC/external/newton/build
+	rm -rf cuda/HPC/newton/newton_venv
+
 # clean_deeplearning:
 # 	$(SETENV) $(MAKE) $(MAKE_ARGS) noinline=$(noinline) -C cuda/cudnn/mnist clean
 
@@ -711,7 +757,7 @@ clean_pytorch_examples:
 	rm -f $(BINDIR)/$(BINSUBDIR)/inference_vae
 
 clean_cuda_samples:
-	$(MAKE) clean -C ./cuda/cuda-samples/build
+	-$(MAKE) clean -C ./cuda/cuda-samples/build 2>/dev/null || true
 
 clean_huggingface:
 	rm -rf $(BINDIR)/$(BINSUBDIR)/huggingface
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h
new file mode 100644
index 000000000..fd178eaef
--- /dev/null
+++ b/src/cuda/GPU_Microbenchmark/hw_def/blackwell_RTX5090_hw_def.h
@@ -0,0 +1,25 @@
+// Hardware definition selected by hw_def.h for the Blackwell RTX 5090
+// microbenchmark configuration (the include guard refers to GB202).
+#ifndef BLACK_GB202_DEF_H
+#define BLACK_GB202_DEF_H
+
+#include "./common/common.h"
+#include "./common/deviceQuery.h"
+
+#define L1_SIZE (256 * 1024) // Max L1 size in bytes (NOTE(review): confirm this value for this GPU)
+
+// #define CLK_FREQUENCY 2010 // frequency in MHz
+
+#define ISSUE_MODEL issue_model::single // single issue core or dual issue
+#define CORE_MODEL core_model::subcore  // subcore model or shared model
+#define DRAM_MODEL dram_model::GDDR6      // memory type
+#define WARP_SCHEDS_PER_SM 4            // number of warp schedulers per SM
+// NOTE(review): RTX 5090 is marketed with GDDR7; GDDR6 above is presumably the
+// closest dram_model value available - confirm against the dram_model enum.
+#define SASS_hmma_per_PTX_wmma 2
+
+// L2 organisation: banks per memory channel and bank width in bytes.
+#define L2_BANKS_PER_MEM_CHANNEL 1
+#define L2_BANK_WIDTH_in_BYTE 64
+
+#endif
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
index 0b0306596..4def21d8a 100644
--- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
+++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -108,7 +108,9 @@ inline void parseGpuConfigArgs(int argc, char *argv[])
         ++i;
     }
     config.MAX_WARPS_PER_SM = config.MAX_THREADS_PER_SM / config.WARP_SIZE;
-    config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
+    // Note: MEM_CLK_FREQUENCY is already in MHz as set by initializeDeviceProp.
+    // Do not convert to GHz here - the bandwidth calculation expects MHz.
+    // config.MEM_CLK_FREQUENCY = config.MEM_CLK_FREQUENCY * 1e-3f;
     config.BLOCKS_PER_SM = config.MAX_THREADS_PER_SM / config.THREADS_PER_BLOCK;
     config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
     config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h b/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h
index 8d12b2456..227011654 100644
--- a/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h
+++ b/src/cuda/GPU_Microbenchmark/hw_def/hw_def.h
@@ -14,6 +14,7 @@
 // #include "volta_TITANV_hw_def.h"
 
 // #include "ampere_A100_hw_def.h"
-#include "blackwell_B200_hw_def.h"
+// #include "blackwell_B200_hw_def.h"
+#include "blackwell_RTX5090_hw_def.h"
 
 #endif
diff --git a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
index 7ef201c6e..b279f8b49 100644
--- a/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
+++ b/src/cuda/GPU_Microbenchmark/ubench/l2_cache/l2_bw_32f/l2_bw_32f.cu
@@ -96,12 +96,13 @@ int main(int argc, char *argv[])
     }
   }
 
+  config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM
+  config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM; // Recalculate after changing BLOCKS_NUM
+
   unsigned ARRAY_SIZE = config.TOTAL_THREADS + repeat_times * config.WARP_SIZE;
   assert(ARRAY_SIZE * sizeof(float) <
          config.L2_SIZE); // Array size must not exceed L2 size
-
-  config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM
-
+  // config.BLOCKS_NUM = config.SM_NUMBER * 2; // 2 blocks per SM // Commented out - causes mismatch on GPUs with MAX_THREADS_PER_SM != 2048
   uint64_t *startClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
   uint64_t *stopClk = (uint64_t *)malloc(config.TOTAL_THREADS * sizeof(uint64_t));
 
diff --git a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
index 841bdee17..78c130e22 100644
--- a/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
+++ b/src/cuda/GPU_Microbenchmark/ubench/mem/mem_bw/mem_bw.cu
@@ -145,13 +145,32 @@ int main(int argc, char *argv[])
   float milliseconds = 0;
   cudaEventElapsedTime(&milliseconds, start, stop);
 
-  unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats types
-  float max_bw = (float)config.MEM_BITWIDTH * config.MEM_CLK_FREQUENCY * 2 / 1e3 / 8;
-  mem_bw = (float)(N) / ((float)(stopClk[0] - startClk[0]));
-  printf("Mem BW= %f (Byte/Clk)\n", mem_bw);
-  printf("Mem BW= %f (GB/sec)\n", (float)N / milliseconds / 1e6);
-  printf("Max Theortical Mem BW= %f (GB/sec)\n", max_bw);
-  printf("Mem Efficiency = %f %%\n", (mem_bw / max_bw) * 100);
-
-  printf("Total Clk number = %u \n", stopClk[0] - startClk[0]);
+  // Find min and max clocks across all threads to get actual kernel execution time
+  uint32_t minStart = startClk[0], maxStop = stopClk[0];
+  for (unsigned i = 1; i < config.TOTAL_THREADS; i++) {
+    if (startClk[i] < minStart) minStart = startClk[i];
+    if (stopClk[i] > maxStop) maxStop = stopClk[i];
+  }
+  uint32_t totalClocks = maxStop - minStart;
+
+  unsigned N = ARRAY_SIZE * 6 * sizeof(float); // 6 arrays of floats (5 reads + 1 write)
+
+  // Theoretical max bandwidth in GB/s
+  // MEM_CLK_FREQUENCY is in MHz, MEM_BITWIDTH is in bits
+  // BW = (Bus Width / 8 bytes) × (Clock MHz) × 2 (DDR) / 1000 = GB/s
+  float max_bw = (float)config.MEM_BITWIDTH / 8 * config.MEM_CLK_FREQUENCY * 2 / 1000;
+  
+  // Achieved bandwidth from CUDA event timing (most accurate)
+  float achieved_bw_from_time = (float)N / milliseconds / 1e6;
+  
+  // Achieved bandwidth from cycle count (less accurate, single SM perspective)
+  float achieved_bw_from_cycles = ((float)N / totalClocks) * config.CLK_FREQUENCY / 1e3;
+  
+  printf("Mem BW= %f (Byte/Clk)\n", (float)N / totalClocks);
+  printf("Mem BW (from time)= %f (GB/sec)\n", achieved_bw_from_time);
+  printf("Mem BW (from cycles)= %f (GB/sec)\n", achieved_bw_from_cycles);
+  printf("Max Theoretical Mem BW= %f (GB/sec)\n", max_bw);
+  printf("Mem Efficiency = %f %%\n", (achieved_bw_from_time / max_bw) * 100);
+
+  printf("Total Clk number = %u (min start: %u, max stop: %u)\n", totalClocks, minStart, maxStop);
 }
diff --git a/src/cuda/HPC/Makefile b/src/cuda/HPC/Makefile
new file mode 100644
index 000000000..6ff6bd438
--- /dev/null
+++ b/src/cuda/HPC/Makefile
@@ -0,0 +1,100 @@
+# H100 Benchmark Suite Makefile (following cutlass-bench pattern)
+.PHONY: all simple graph newton vpi clean cufft cusolver image
+
+# Default: Build all apps (submodules initialized by parent Makefile)
+all: simple graph newton vpi
+
+# The cufft, cusolver and image rules recurse into each app directory with $(MAKE).
+# Simple apps (cuFFT, cuSolver, image processing - CUDA toolkit only)
+simple: cufft cusolver image
+
+cufft:
+	$(MAKE) -C cufft/cufft_3d_c2c
+	$(MAKE) -C cufft/cufft_lto_r2c_c2r
+
+cusolver:
+	$(MAKE) -C cusolver/cusolver_ormqr
+	$(MAKE) -C cusolver/cusolver_Xgetrf
+
+image:
+	$(MAKE) -C image/dwtHaar1D
+	$(MAKE) -C image/recursiveGaussian
+	$(MAKE) -C image/FDTD3d
+
+# Graph apps (cuGraph submodule - built with CMake)
+# Builds libcugraph via the submodule's build.sh, then configures and builds
+# graph/mst_standalone with CUGRAPH_ROOT exported. When external/cugraph is
+# absent the target only prints a warning and succeeds.
+# bfs_standalone is currently not built; it would follow the same
+# mkdir/cmake/$(MAKE) steps as mst_standalone.
+graph:
+	@echo "Building cuGraph library from submodule..."
+	@if [ -d "external/cugraph" ]; then \
+		cd external/cugraph && \
+		./build.sh libcugraph --skip_cpp_tests && \
+		echo "Building graph apps..." && \
+		cd ../.. && \
+		export CUGRAPH_ROOT="$(CURDIR)/external/cugraph" && \
+		mkdir -p graph/mst_standalone/build && \
+		cd graph/mst_standalone/build && \
+		cmake .. -DCMAKE_CUDA_COMPILER=$(CUDA_INSTALL_PATH)/bin/nvcc && \
+		$(MAKE); \
+	else \
+		echo "WARNING: cuGraph submodule not found - skipping graph apps"; \
+	fi
+
+# Newton apps (Newton submodule - Python-based)
+newton:
+	@echo "Newton apps ready (Python-based, no build needed)"
+
+# VPI apps (VPI library from system - required)
+# Symlink sources from /opt/nvidia/vpi*/samples/ and build locally
+#
+# Steps: pick the first /opt/nvidia/vpi* directory (exit 1 if none exists),
+# symlink the sample sources into vpi/<app>/, then run cmake and $(MAKE) in
+# vpi/<app>/build for each app, stopping at the first failure.
+# vpi_background_subtractor only links CMakeLists.txt; its main.cpp is
+# presumably kept in the repository (see the .gitignore exception) - confirm.
+# All app directories are created first so the links work on a fresh checkout.
+vpi:
+	@echo "Searching for VPI installation..."
+	@VPI_ROOT=$$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | head -1); \
+	if [ -z "$$VPI_ROOT" ]; then \
+		echo "ERROR: VPI not found in /opt/nvidia/"; \
+		echo "Please install VPI from NVIDIA"; \
+		exit 1; \
+	fi; \
+	echo "Found VPI at $$VPI_ROOT"; \
+	VPI_SAMPLES="$$VPI_ROOT/samples"; \
+	echo "Creating symlinks to VPI sample sources..."; \
+	mkdir -p vpi/vpi_background_subtractor vpi/vpi_orb_feature_detector vpi/vpi_stereo_disparity && \
+	ln -sf "$$VPI_SAMPLES/14-background_subtractor/CMakeLists.txt" vpi/vpi_background_subtractor/CMakeLists.txt && \
+	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/main.cpp" vpi/vpi_orb_feature_detector/main.cpp && \
+	ln -sf "$$VPI_SAMPLES/18-orb_feature_detector/CMakeLists.txt" vpi/vpi_orb_feature_detector/CMakeLists.txt && \
+	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/main.cpp" vpi/vpi_stereo_disparity/main.cpp && \
+	ln -sf "$$VPI_SAMPLES/02-stereo_disparity/CMakeLists.txt" vpi/vpi_stereo_disparity/CMakeLists.txt && \
+	echo "Building VPI apps from symlinked sources..." && \
+	for app in vpi_background_subtractor vpi_orb_feature_detector vpi_stereo_disparity; do \
+		mkdir -p vpi/$$app/build && \
+		(cd vpi/$$app/build && cmake .. && $(MAKE)) || exit 1; \
+	done
+
+# clean: remove build outputs. Every step is prefixed with '-' so that one
+# failing sub-make or missing directory does not abort the remaining steps.
+clean:
+	-rm -rf bin
+# Simple apps: cleaning is delegated to each app's own Makefile.
+	-$(MAKE) -C cufft/cufft_3d_c2c clean
+	-$(MAKE) -C cufft/cufft_lto_r2c_c2r clean
+	-$(MAKE) -C cusolver/cusolver_ormqr clean
+	-$(MAKE) -C cusolver/cusolver_Xgetrf clean
+	-$(MAKE) -C image/dwtHaar1D clean
+	-$(MAKE) -C image/recursiveGaussian clean
+	-$(MAKE) -C image/FDTD3d clean
+# Graph apps and the cuGraph library build tree.
+	-rm -rf graph/bfs_standalone/build graph/mst_standalone/build
+	-rm -rf external/cugraph/cpp/build
+	-rm -rf newton/newton_venv
+# VPI: remove the build directories and the symlinks created by the vpi target.
+	-rm -rf vpi/*/build
+	-find vpi -type l -exec rm -f {} +
diff --git a/src/cuda/HPC/README.md b/src/cuda/HPC/README.md
new file mode 100644
index 000000000..3eedbeaab
--- /dev/null
+++ b/src/cuda/HPC/README.md
@@ -0,0 +1,100 @@
+# H100 Benchmark Suite
+
+15 modern GPU workloads from H100 profiling and analysis.
+
+## Applications
+
+### cuFFT (2 apps) - FFT operations
+- `cufft_3d_c2c_scalable` - 3D Complex-to-Complex FFT
+- `cufft_lto_r2c_c2r_scalable` - Real↔Complex FFT with LTO callbacks
+
+### cuSolver (2 apps) - Linear algebra
+- `cusolver_ormqr_scalable` - QR factorization
+- `cusolver_Xgetrf_scalable` - LU factorization
+
+### Image Processing (3 apps)
+- `dwtHaar1D` - Haar wavelet transform
+- `recursiveGaussian` - Recursive Gaussian filter
+- `FDTD3d` - Finite-Difference Time-Domain 3D simulation
+
+### Graph Algorithms (2 apps)
+- `bfs_standalone` - Breadth-First Search (requires cuGraph submodule)
+- `mst_standalone` - Minimum Spanning Tree (requires cuGraph submodule)
+
+### Physics Simulation (3 apps)
+- `newton_diffsim_ball` - Differential simulation (requires Newton submodule)
+- `newton_robot_cartpole` - Robotics simulation (requires Newton submodule)
+- `newton_mpm_granular` - Material Point Method simulation (requires Newton submodule)
+
+### Computer Vision (3 apps)
+- `vpi_background_subtractor` - Background subtraction (requires VPI 4.0)
+- `vpi_orb_feature_detector` - ORB feature detection (requires VPI 4.0)
+- `vpi_stereo_disparity` - Stereo disparity calculation (requires VPI 4.0)
+
+## Dependencies
+
+- **CUDA 11.0+** - Required (provides cuFFT, cuSolver, cuBLAS libraries)
+- **cuGraph** - Git submodule (auto-initialized for graph apps)
+- **Newton** - Git submodule (auto-initialized for physics apps)
+- **VPI 4.0** - install from https://developer.nvidia.com/embedded/vpi
+
+## Build
+
+```bash
+# From repository root
+source src/setup_environment
+
+# Generate data files (standard workflow)
+make data
+
+# Build all H100 apps
+make -C src hpc
+
+# Or build everything with:
+make all -i -j -C src
+```
+
+Binaries are output to `bin/<cuda-version>/release/H100-*`
+
+Newton sources are copied to `bin/<cuda-version>/release/newton/`, and launcher scripts are written to `bin/<cuda-version>/release/newton_*`
+
+## Running
+
+```bash
+# cuFFT apps
+bin/*/release/H100-cufft_3d_c2c small
+bin/*/release/H100-cufft_lto_r2c_c2r medium
+
+# cuSolver apps
+bin/*/release/H100-cusolver_ormqr large
+bin/*/release/H100-cusolver_Xgetrf medium
+
+# Image apps
+bin/*/release/H100-dwtHaar1D
+bin/*/release/H100-recursiveGaussian
+bin/*/release/H100-FDTD3d
+
+# Graph apps (with generated data)
+bin/*/release/H100-bfs --file data_dirs/cuda/H100/graph/karate.mtx
+bin/*/release/H100-mst --file data_dirs/cuda/H100/graph/netscience.mtx
+
+# Newton apps
+bin/*/release/newton_diffsim_ball
+bin/*/release/newton_robot_cartpole
+bin/*/release/newton/newton_mpm_granular
+
+# VPI apps (if VPI installed)
+bin/*/release/vpi_background_subtractor cuda <video-file> <frames>
+```
+
+## GPU Support
+
+- Requires compute capability 7.5+ (Turing, Ampere, Hopper)
+- Tested on: V100 (sm_70), A100 (sm_80), H100 (sm_90)
+
+## Notes
+
+- Simple apps (cuFFT, cuSolver, image) build on any system with CUDA 11+
+- Graph apps require cuGraph submodule (automatically handled by build system)
+- Newton apps create Python virtual environment on first run
+- VPI apps require the VPI library under `/opt/nvidia/vpi*`; the `vpi` build step fails with an error if it is not found
diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile b/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile
new file mode 100644
index 000000000..23ec8b328
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_3d_c2c/Makefile
@@ -0,0 +1,9 @@
+EXECUTABLE := cufft_3d_c2c_scalable
+CUFILES := cufft_3d_c2c_scalable.cu
+CCFILES :=
+ADDITIONAL_LIBS := -lcufft
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu
new file mode 100644
index 000000000..4263035d6
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_3d_c2c_scalable.cu
@@ -0,0 +1,196 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include <array>
+#include <complex>
+#include <iostream>
+#include <random>
+#include <vector>
+#include <cstring>
+#include <cuda_runtime.h>
+#include <cufft.h>
+
+#include "cufft_utils.h"
+
+using dim_t = std::array<int, 3>;
+
+// Runs a batched n x n x n single-precision C2C FFT in place: forward transform,
+// 1/fft_size scaling, inverse transform. Sizes come from --n/--batch-size or a
+// small|medium|large preset; returns EXIT_FAILURE on bad arguments.
+int main(int argc, char *argv[]) {
+    cufftHandle plan;
+    cudaStream_t stream = NULL;
+
+    // Default values
+    int n = 16;
+    int batch_size = 4;
+
+    // Parse named command-line arguments
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--n") == 0 || strcmp(argv[i], "-n") == 0) {
+            if (i + 1 < argc) {
+                n = atoi(argv[++i]);
+            } else {
+                std::printf("Error: %s requires a value\n", argv[i]);
+                std::printf("Usage: %s [--n|-n <value>] [--batch-size|-b <value>]\n", argv[0]);
+                std::printf("   or: %s <small|medium|large>\n", argv[0]);
+                return EXIT_FAILURE;
+            }
+        } else if (strcmp(argv[i], "--batch-size") == 0 || strcmp(argv[i], "-b") == 0) {
+            if (i + 1 < argc) {
+                batch_size = atoi(argv[++i]);
+            } else {
+                std::printf("Error: %s requires a value\n", argv[i]);
+                std::printf("Usage: %s [--n|-n <value>] [--batch-size|-b <value>]\n", argv[0]);
+                std::printf("   or: %s <small|medium|large>\n", argv[0]);
+                return EXIT_FAILURE;
+            }
+        } else if (strcmp(argv[i], "small") == 0) {
+            n = 16;           // 16×16×16 = 4K elements
+            batch_size = 4;
+        } else if (strcmp(argv[i], "medium") == 0) {
+            n = 32;           // 32×32×32 = 32K elements
+            batch_size = 8;
+        } else if (strcmp(argv[i], "large") == 0) {
+            n = 64;          // 64×64×64 = 262K elements
+            batch_size = 8;
+        } else if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
+            std::printf("Usage: %s [OPTIONS]\n", argv[0]);
+            std::printf("\nOptions:\n");
+            std::printf("  -n, --n <value>            3D FFT dimension (n×n×n) (default: 16)\n");
+            std::printf("  -b, --batch-size <value>   Number of batched 3D FFTs (default: 4)\n");
+            std::printf("\nPresets:\n");
+            std::printf("  small   : n=16,  batch_size=4  (16×16×16 = 4K elements)\n");
+            std::printf("  medium  : n=32,  batch_size=8  (32×32×32 = 32K elements)\n");
+            std::printf("  large   : n=64,  batch_size=8  (64×64×64 = 262K elements)\n");
+            std::printf("\nExamples:\n");
+            std::printf("  %s --n 64 --batch-size 16\n", argv[0]);
+            std::printf("  %s -n 128 -b 4\n", argv[0]);
+            std::printf("  %s medium --batch-size 16\n", argv[0]);
+            std::printf("  %s large\n", argv[0]);
+            return EXIT_SUCCESS;
+        } else {
+            std::printf("Error: Unknown argument '%s'\n", argv[i]);
+            std::printf("Usage: %s [--n|-n <value>] [--batch-size|-b <value>]\n", argv[0]);
+            std::printf("   or: %s <small|medium|large>\n", argv[0]);
+            std::printf("   or: %s --help\n", argv[0]);
+            return EXIT_FAILURE;
+        }
+    }
+
+    // atoi() yields 0 for non-numeric text, and n*n*n*batch_size must fit in int
+    // (1290^3 is the largest cube below 2^31), so reject anything else up front.
+    if (n <= 0 || batch_size <= 0 || n > 1290 || batch_size > 2147483647 / (n * n * n)) {
+        std::printf("Error: n and batch size must be positive and n*n*n*batch_size must fit in a 32-bit int\n");
+        return EXIT_FAILURE;
+    }
+
+    dim_t fft = {n, n, n};
+    int fft_size = fft[0] * fft[1] * fft[2];
+
+    std::printf("==============================================\n");
+    std::printf("cuFFT 3D C2C Example (Scalable)\n");
+    std::printf("==============================================\n");
+    std::printf("FFT dimension: %d×%d×%d\n", n, n, n);
+    std::printf("FFT size: %d\n", fft_size);
+    std::printf("Batch size: %d\n", batch_size);
+    std::printf("==============================================\n\n");
+
+    using scalar_type = float;
+    using data_type = std::complex<scalar_type>;
+
+    std::vector<data_type> data(fft_size * batch_size);
+
+    // Initialize with pseudo-random values
+    std::mt19937 gen(42);  // Fixed seed for reproducibility
+    std::uniform_real_distribution<float> dist(0.0f, 1.0f);
+
+    for (auto &value : data) {
+        value = data_type(dist(gen), dist(gen));
+    }
+
+    if (n <= 16) {
+        std::printf("Input array (first 8 elements):\n");
+        for (int i = 0; i < std::min(8, (int)data.size()); i++) {
+            std::printf("%f + %fj\n", data[i].real(), data[i].imag());
+        }
+        std::printf("=====\n");
+    }
+
+    cufftComplex *d_data = nullptr;
+    // inembed/onembed being nullptr indicates contiguous data for each batch, then the stride and dist settings are ignored
+    CUFFT_CALL(cufftPlanMany(&plan, fft.size(), fft.data(),
+                             nullptr, 1, 0, // *inembed, istride, idist
+                             nullptr, 1, 0, // *onembed, ostride, odist
+                             CUFFT_C2C, batch_size));
+
+    CUDA_RT_CALL(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    CUFFT_CALL(cufftSetStream(plan, stream));
+
+    // Create device data arrays
+    CUDA_RT_CALL(cudaMalloc(reinterpret_cast<void **>(&d_data), sizeof(data_type) * data.size()));
+    CUDA_RT_CALL(cudaMemcpyAsync(d_data, data.data(), sizeof(data_type) * data.size(),
+                                 cudaMemcpyHostToDevice, stream));
+
+    // Note: identical input and output pointers make the transform in-place.
+    std::printf("Executing forward FFT...\n");
+    CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_FORWARD));
+    CUDA_RT_CALL(cudaStreamSynchronize(stream));
+    std::printf("Forward FFT complete.\n");
+
+    if (n <= 16) {
+        CUDA_RT_CALL(cudaMemcpyAsync(data.data(), d_data, sizeof(data_type) * data.size(),
+                                     cudaMemcpyDeviceToHost, stream));
+        CUDA_RT_CALL(cudaStreamSynchronize(stream));
+        std::printf("Output after Forward (first 8 elements):\n");
+        for (int i = 0; i < std::min(8, (int)data.size()); i++) {
+            std::printf("%f + %fj\n", data[i].real(), data[i].imag());
+        }
+        std::printf("=====\n");
+    }
+
+    // Normalize the data and inverse FFT
+    std::printf("Executing inverse FFT...\n");
+    scaling_kernel<<<(data.size() + 127) / 128, 128, 0, stream>>>(d_data, data.size(), 1.f/fft_size);
+    CUFFT_CALL(cufftExecC2C(plan, d_data, d_data, CUFFT_INVERSE));
+    CUDA_RT_CALL(cudaStreamSynchronize(stream));
+    std::printf("Inverse FFT complete.\n");
+
+    if (n <= 16) {
+        CUDA_RT_CALL(cudaMemcpyAsync(data.data(), d_data, sizeof(data_type) * data.size(),
+                                     cudaMemcpyDeviceToHost, stream));
+        CUDA_RT_CALL(cudaStreamSynchronize(stream));
+        std::printf("Output after Inverse (first 8 elements):\n");
+        for (int i = 0; i < std::min(8, (int)data.size()); i++) {
+            std::printf("%f + %fj\n", data[i].real(), data[i].imag());
+        }
+        std::printf("=====\n");
+    }
+
+    /* free resources */
+    CUDA_RT_CALL(cudaFree(d_data));
+    CUFFT_CALL(cufftDestroy(plan));
+    CUDA_RT_CALL(cudaStreamDestroy(stream));
+    CUDA_RT_CALL(cudaDeviceReset());
+
+    std::printf("\n==============================================\n");
+    std::printf("SUCCESS: 3D C2C FFT completed\n");
+    std::printf("==============================================\n");
+
+    return EXIT_SUCCESS;
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h
new file mode 100644
index 000000000..846d63ab7
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_3d_c2c/cufft_utils.h
@@ -0,0 +1,64 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2020 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+// CUDA API error checking
+#ifndef CUDA_RT_CALL
+#define CUDA_RT_CALL( call )                                                                                           \
+    {                                                                                                                  \
+        auto status = static_cast<cudaError_t>( call );                                                                \
+        if ( status != cudaSuccess )                                                                                   \
+            fprintf( stderr,                                                                                           \
+                     "ERROR: CUDA RT call \"%s\" in line %d of file %s failed "                                        \
+                     "with "                                                                                           \
+                     "%s (%d).\n",                                                                                     \
+                     #call,                                                                                            \
+                     __LINE__,                                                                                         \
+                     __FILE__,                                                                                         \
+                     cudaGetErrorString( status ),                                                                     \
+                     status );                                                                                         \
+    }
+#endif  // CUDA_RT_CALL
+
+// cufft API error checking
+#ifndef CUFFT_CALL
+#define CUFFT_CALL( call )                                                                                             \
+    {                                                                                                                  \
+        auto status = static_cast<cufftResult>( call );                                                                \
+        if ( status != CUFFT_SUCCESS )                                                                                 \
+            fprintf( stderr,                                                                                           \
+                     "ERROR: CUFFT call \"%s\" in line %d of file %s failed "                                          \
+                     "with "                                                                                           \
+                     "code (%d).\n",                                                                                   \
+                     #call,                                                                                            \
+                     __LINE__,                                                                                         \
+                     __FILE__,                                                                                         \
+                     status );                                                                                         \
+    }
+#endif  // CUFFT_CALL
+
+__global__
+void scaling_kernel(cufftComplex* data, int element_count, float scale) {
+    // Each thread scales elements idx, idx + step, idx + 2*step, ... where step is the launched thread count.
+    const int step = blockDim.x * gridDim.x;
+    for (int idx = threadIdx.x + blockIdx.x * blockDim.x; idx < element_count; idx += step) {
+        data[idx].x *= scale;
+        data[idx].y *= scale;
+    }
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/Makefile b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/Makefile
new file mode 100644
index 000000000..6a1997865
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/Makefile
@@ -0,0 +1,24 @@
+# Build description for the scalable cuFFT LTO-callback R2C/C2R example.
+# The variables below are consumed by the shared common.mk included further down.
+EXECUTABLE := cufft_lto_r2c_c2r_scalable
+CUFILES := r2c_c2r_reference.cu
+CCFILES := cufft_lto_r2c_c2r_scalable.cpp common.cpp
+ADDITIONAL_LIBS := -lcufft
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+
+# LTO fatbin generation variables
+# CUDA_LTO_ARCH and BIN2C use ?= so they can be overridden from the environment
+# or the command line, e.g. `make CUDA_LTO_ARCH=90`.
+CUDA_LTO_ARCH ?= 75
+BIN2C ?= $(CUDA_INSTALL_PATH)/bin/bin2c
+LTO_CALLBACK_SRC := r2c_c2r_lto_callback_device.cu
+LTO_CALLBACK_FATBIN := r2c_c2r_lto_callback_device.fatbin
+FATBIN_HEADER := r2c_c2r_lto_callback_device_fatbin.h
+# NOTE(review): presumably common.mk removes the files listed here on `make clean` - confirm.
+NVCC_KEEP_CLEAN := $(FATBIN_HEADER) $(LTO_CALLBACK_FATBIN)
+
+include ../../../common/common.mk
+
+# Generate fatbin header (custom rule added AFTER common.mk so that it does not
+# become the default goal).
+# The callback source includes callback_params.h, so a change to either file
+# must regenerate the header. $< is always the .cu file (first prerequisite).
+# The header is written to a temporary file and renamed only on success, so a
+# failed bin2c run cannot leave a truncated header that looks up to date.
+$(FATBIN_HEADER): $(LTO_CALLBACK_SRC) callback_params.h
+	@echo "Generating LTO fatbin header..."
+	@$(CUDA_INSTALL_PATH)/bin/nvcc --std=c++11 --generate-code arch=compute_$(CUDA_LTO_ARCH),code=lto_$(CUDA_LTO_ARCH) -dc -fatbin $< -o $(LTO_CALLBACK_FATBIN)
+	@$(BIN2C) --name window_callback --type longlong $(LTO_CALLBACK_FATBIN) > $@.tmp && mv -f $@.tmp $@ || { rm -f $@.tmp; exit 1; }
+
+# Make cpp source depend on fatbin header so it gets built first
+# (the rule has no recipe; it only orders the header before the compile step)
+cufft_lto_r2c_c2r_scalable.cpp: $(FATBIN_HEADER)
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h
new file mode 100644
index 000000000..55bd7a0cb
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/callback_params.h
@@ -0,0 +1,37 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _CALLBACK_PARAMS__H_
+#define _CALLBACK_PARAMS__H_
+
+// Callback parameters structure
+struct cb_params {
+	// Number of leading complex samples of each signal that the callback keeps
+	unsigned int window_size;
+	// Length of one signal in complex samples; the callback reduces its index modulo this value
+	unsigned int signal_size;
+};
+
+// Problem input parameters (made non-const for runtime configuration)
+extern unsigned batches;
+extern unsigned signal_size;
+extern unsigned window_size;
+extern unsigned complex_signal_size;
+
+// Precision threshold
+constexpr float threshold = 1e-6;
+
+#endif // _CALLBACK_PARAMS__H_
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp
new file mode 100644
index 000000000..ae6ffcdc7
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.cpp
@@ -0,0 +1,63 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _COMMON__CPP_
+#define _COMMON__CPP_
+
+#include <random>
+#include "common.h"
+
+// Wave parameters
+constexpr unsigned waves        =  12;
+constexpr float    signal_max_A =  20.;
+constexpr float    signal_max_f = 500.;
+constexpr float    sampling_dt  = 1e-3;
+
+// Initialize the input signal as a composite of sine waves
+// with random amplitudes and frequencies
+// Adds `waves` sine components to every signal in `signals`.
+// Each batch row is 2 * (signal_size / 2 + 1) floats long (padded in-place
+// layout); only the first signal_size entries of a row are written.
+// Samples are accumulated with +=, so the caller must pass a zero-initialised
+// buffer. The generator is seeded with 0, so the output is reproducible.
+void init_input_signals(unsigned batches, unsigned signal_size, float* signals) {
+	std::mt19937 rng(0);
+
+	std::uniform_real_distribution<> amplitude_dist(0., signal_max_A);
+	std::uniform_real_distribution<> frequency_dist(0., signal_max_f);
+
+	// Distance between the starts of two consecutive batch rows, in floats
+	const unsigned row_stride = 2 * (signal_size / 2 + 1);
+
+	for(unsigned batch = 0; batch < batches; ++batch) {
+		// Draw amplitude and frequency alternately, one pair per wave
+		std::vector<float> amplitudes(waves);
+		std::vector<float> frequencies(waves);
+		for(unsigned w = 0; w < waves; ++w) {
+			amplitudes[w]  = amplitude_dist(rng);
+			frequencies[w] = frequency_dist(rng);
+		}
+
+		// Sample the composite signal at multiples of sampling_dt
+		float time = 0.;
+		for(unsigned s = 0; s < signal_size; ++s) {
+			const unsigned idx = batch * row_stride + s;
+			for(unsigned w = 0; w < waves; ++w) {
+				signals[idx] += amplitudes[w] * sin(2. * PI * frequencies[w] * time) ;
+			}
+			time += sampling_dt;
+		}
+	}
+}
+
+#endif // _COMMON__CPP_
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h
new file mode 100644
index 000000000..6bcbacccc
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/common.h
@@ -0,0 +1,68 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef _COMMON__H_
+#define _COMMON__H_
+
+#include <cuda.h>
+#include <cufft.h>
+#include <complex>
+
+// Some helper definitions
+#define ERROR_VALUE -1
+#define PASS_VALUE   0
+#define PI 3.1415926535897932
+
+// Check CUDA API error
+// Returns PASS_VALUE for cudaSuccess; otherwise prints the line number and the
+// numeric error code to stdout and returns ERROR_VALUE.
+inline int checkErrors(cudaError_t error, int line_number) {
+	if (error == cudaSuccess) {
+		return PASS_VALUE;
+	}
+	printf("Example failed in CUDA API on line %d with error %d\n", line_number, error);
+	return ERROR_VALUE;
+}
+
+// Check cuFFT API error
+// Returns PASS_VALUE for CUFFT_SUCCESS; otherwise prints the line number and
+// the numeric cuFFT result code to stdout and returns ERROR_VALUE.
+inline int checkErrors(cufftResult error, int line_number) {
+	if (error == CUFFT_SUCCESS) {
+		return PASS_VALUE;
+	}
+	printf("Example failed in cuFFT API on line %d with error %d\n", line_number, error);
+	return ERROR_VALUE;
+}
+
+#define CHECK_ERROR(error) checkErrors(error, __LINE__)
+
+// Relative L2 error of `out` with respect to `ref`:
+//   sqrt( sum |ref - out|^2 / sum |ref|^2 )
+// Both buffers hold `batches` rows of 2 * (signal_size / 2 + 1) elements
+// (padded in-place layout); only the first signal_size elements of each row
+// take part in the comparison.
+// When the reference is identically zero the relative error is undefined, so
+// the absolute L2 error sqrt(sum |ref - out|^2) is returned instead of NaN.
+template<typename T>
+double compute_error(T* ref, T* out, unsigned batches, unsigned signal_size){
+    double squared_diff = 0;
+    double squared_norm = 0;
+    // Index arithmetic is done in size_t so large problem sizes cannot wrap at 32 bits
+    const size_t batch_offset = 2 * (static_cast<size_t>(signal_size) / 2 + 1);
+    for (unsigned b = 0; b < batches; b++) {
+        const size_t row_start = b * batch_offset;
+        for (unsigned i = 0; i < signal_size; i++) {
+            const size_t ref_idx = row_start + i;
+            squared_diff += std::norm(ref[ref_idx] - out[ref_idx]); // Note that std::norm(z) = z * conj(z), not the usual sqrt(z * conj(z))
+            squared_norm += std::norm(ref[ref_idx]);
+        }
+    }
+    if (squared_norm == 0) {
+        return std::sqrt(squared_diff);
+    }
+    return std::sqrt(squared_diff / squared_norm);
+}
+
+// Initialize input signals
+void init_input_signals(unsigned batches, unsigned signal_size, float* signals);
+#endif // _COMMON__H_
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp
new file mode 100644
index 000000000..97dd7f6f4
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/cufft_lto_r2c_c2r_scalable.cpp
@@ -0,0 +1,240 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of LTO callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+#include <cuda_runtime_api.h>
+#include <cufftXt.h>
+#include <cerrno>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+#include "r2c_c2r_reference.h"
+#include "common.h"
+#include "callback_params.h"
+
+// NOTE: Header containing the compiled LTO callback device function in a C array, generated with bin2c
+#include "r2c_c2r_lto_callback_device_fatbin.h"
+
+// Note: Removed static_assert since window_size and signal_size are now runtime variables
+
+// Runs the signals through an in-place batched R2C transform followed by a
+// C2R transform whose LTO load callback zeroes, within each signal, every
+// complex sample at or beyond window_size, then compares the GPU result with
+// reference_r2c_window_c2r().
+// Reads the globals batches, signal_size, window_size and complex_signal_size.
+// Returns PASS_VALUE when every CUDA/cuFFT call succeeded and the relative L2
+// error is below threshold; returns ERROR_VALUE otherwise.
+int test_r2c_window_c2r() {
+
+	// Padded array for in-place transforms - use heap allocation for large sizes.
+	// The element count is computed in size_t so the product cannot wrap at 32 bits.
+	const size_t array_size = static_cast<size_t>(batches) * 2 * complex_signal_size;
+	float *input_signals = new float[array_size]();
+	float *output_signals = new float[array_size]();
+	float *reference = new float[array_size]();
+
+	auto release_host_buffers = [&]() {
+		delete[] input_signals;
+		delete[] output_signals;
+		delete[] reference;
+	};
+
+	// Cleared by the first CUDA/cuFFT call that reports an error. Later calls are
+	// still issued (they fail cleanly on null handles/pointers), but the test then
+	// returns ERROR_VALUE instead of verifying data that never reached the GPU.
+	bool gpu_ok = true;
+	auto track = [&gpu_ok](int status) {
+		if (status != PASS_VALUE) {
+			gpu_ok = false;
+		}
+	};
+
+	init_input_signals(batches, signal_size, input_signals);
+
+	const size_t complex_size_bytes = array_size * sizeof(float);
+
+	// Allocate and copy input from host to GPU
+	float *device_signals = nullptr;
+	track(CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes)));
+	track(CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice)));
+
+	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
+	cufftHandle forward_plan, inverse_plan_cb;
+	size_t work_size;
+
+	track(CHECK_ERROR(cufftCreate(&forward_plan)));
+	track(CHECK_ERROR(cufftCreate(&inverse_plan_cb)));
+
+	// NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet)
+#ifdef CB_USE_CONSTANT_MEMORY
+	cb_params *device_params = nullptr;
+	std::string callback_name = "windowing_constant_memory_callback";
+#else
+	// Define a structure used to pass in the window size
+	cb_params host_params;
+	host_params.window_size = window_size;
+	host_params.signal_size = complex_signal_size;
+
+	// Allocate and copy callback parameters from host to GPU
+	cb_params *device_params = nullptr;
+	track(CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params))));
+	track(CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice)));
+
+	std::string callback_name = "windowing_callback";
+#endif
+	size_t lto_callback_fatbin_size = sizeof(window_callback);
+	printf("Setting up LTO callback '%s', fatbin size: %zu bytes\n", callback_name.c_str(), lto_callback_fatbin_size);
+	cufftResult cb_result = cufftXtSetJITCallback(inverse_plan_cb,
+                                      callback_name.c_str(),
+                                      (void*)window_callback,
+                                      lto_callback_fatbin_size,
+                                      CUFFT_CB_LD_COMPLEX,
+                                      (void **)&device_params);
+	printf("cufftXtSetJITCallback returned: %d\n", cb_result);
+	track(CHECK_ERROR(cb_result));
+
+	printf("Creating forward plan (R2C): signal_size=%u, batches=%u\n", signal_size, batches);
+	track(CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size)));
+	printf("Creating inverse plan (C2R) with callback: signal_size=%u, batches=%u\n", signal_size, batches);
+	track(CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size)));
+
+	// Transform signal forward
+	printf("Transforming signal cufftExecR2C\n");
+	track(CHECK_ERROR(cufftExecR2C(forward_plan,    (cufftReal *)device_signals, (cufftComplex *)device_signals)));
+
+	// Apply window via load callback and inverse-transform the signal
+	printf("Transforming signal cufftExecC2R\n");
+	track(CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals)));
+
+	// Copy device memory to host
+	track(CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost)));
+
+	// Destroy CUFFT context
+	track(CHECK_ERROR(cufftDestroy(forward_plan)));
+	track(CHECK_ERROR(cufftDestroy(inverse_plan_cb)));
+
+	// Cleanup memory
+	track(CHECK_ERROR(cudaFree(device_signals)));
+	track(CHECK_ERROR(cudaFree(device_params)));
+
+	// Each failing call has already been reported with its line number
+	if (!gpu_ok) {
+		printf("One or more CUDA/cuFFT calls failed; skipping verification\n");
+		release_host_buffers();
+		return ERROR_VALUE;
+	}
+
+	// Compute reference
+	if(reference_r2c_window_c2r(batches, signal_size, window_size, input_signals, reference) != PASS_VALUE) {
+		printf("Failed to compute the reference\n");
+		release_host_buffers();
+		return ERROR_VALUE;
+	}
+
+	double l2_error = compute_error<float>(reference, output_signals, batches, signal_size);
+	printf("L2 error: %e\n", l2_error);
+
+	// Cleanup heap-allocated arrays
+	release_host_buffers();
+
+	return (l2_error < threshold) ? PASS_VALUE : ERROR_VALUE;
+}
+
+// Define global variables for size configuration
+unsigned batches = 100;
+unsigned signal_size = 128;
+unsigned window_size = 16;
+unsigned complex_signal_size = signal_size / 2 + 1;
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+// Prints the short usage summary shared by every argument-error path.
+// mention_help adds the "--help" line used for unknown arguments.
+static void print_usage(const char *program, bool mention_help) {
+    printf("Usage: %s [--batches|-b <value>] [--signal-size|-s <value>] [--window-size|-w <value>]\n", program);
+    printf("   or: %s <small|medium|large>\n", program);
+    if (mention_help) {
+        printf("   or: %s --help\n", program);
+    }
+}
+
+// Parses text as a strictly positive base-10 integer that fits in unsigned.
+// Returns false, leaving *value untouched, for empty input, a minus sign,
+// trailing characters, zero, or a value that is out of range.
+static bool parse_positive_unsigned(const char *text, unsigned *value) {
+    // strtoul would silently negate "-1" into a huge value, so reject '-' up front
+    if (text[0] == '\0' || strchr(text, '-') != nullptr) {
+        return false;
+    }
+    errno = 0;
+    char *end = nullptr;
+    const unsigned long parsed = strtoul(text, &end, 10);
+    const unsigned narrowed = static_cast<unsigned>(parsed);
+    if (errno != 0 || *end != '\0' || parsed == 0 || narrowed != parsed) {
+        return false;
+    }
+    *value = narrowed;
+    return true;
+}
+
+// Parses the command line into the global size parameters, checks the device
+// and runs the test. Arguments are applied left to right, so a preset can be
+// refined by later options (e.g. "medium --batches 1000").
+// Returns PASS_VALUE on success (and for --help), ERROR_VALUE otherwise.
+int main(int argc, char **argv) {
+    // Options that take a value, and the global each one sets
+    const struct {
+        const char *long_flag;
+        const char *short_flag;
+        unsigned   *target;
+    } value_options[] = {
+        {"--batches",     "-b", &batches},
+        {"--signal-size", "-s", &signal_size},
+        {"--window-size", "-w", &window_size},
+    };
+
+    // Parse named command-line arguments
+    for (int i = 1; i < argc; i++) {
+        const char *arg = argv[i];
+
+        unsigned *target = nullptr;
+        for (const auto &option : value_options) {
+            if (strcmp(arg, option.long_flag) == 0 || strcmp(arg, option.short_flag) == 0) {
+                target = option.target;
+                break;
+            }
+        }
+
+        if (target != nullptr) {
+            if (i + 1 >= argc) {
+                printf("Error: %s requires a value\n", arg);
+                print_usage(argv[0], false);
+                return ERROR_VALUE;
+            }
+            const char *value_text = argv[++i];
+            if (!parse_positive_unsigned(value_text, target)) {
+                printf("Error: invalid value '%s' for %s (expected a positive integer)\n", value_text, arg);
+                print_usage(argv[0], false);
+                return ERROR_VALUE;
+            }
+        } else if (strcmp(arg, "small") == 0) {
+            batches = 128;
+            signal_size = 64;
+            window_size = 16;
+        } else if (strcmp(arg, "medium") == 0) {
+            batches = 500;
+            signal_size = 256;
+            window_size = 32;
+        } else if (strcmp(arg, "large") == 0) {
+            batches = 2000;
+            signal_size = 32;
+            window_size = 8;
+        } else if (strcmp(arg, "--help") == 0 || strcmp(arg, "-h") == 0) {
+            printf("Usage: %s [OPTIONS]\n", argv[0]);
+            printf("\nOptions:\n");
+            printf("  -b, --batches <value>      Number of FFT batches (default: 100)\n");
+            printf("  -s, --signal-size <value>  Size of each signal (default: 128)\n");
+            printf("  -w, --window-size <value>  Window size for truncation (default: 16)\n");
+            printf("\nPresets:\n");
+            printf("  small   : batches=128,  signal_size=64,  window_size=16\n");
+            printf("  medium  : batches=500,  signal_size=256, window_size=32\n");
+            printf("  large   : batches=2000, signal_size=32,  window_size=8\n");
+            printf("\nExamples:\n");
+            printf("  %s --batches 1024 --signal-size 512 --window-size 32\n", argv[0]);
+            printf("  %s -b 1024 -s 512 -w 32\n", argv[0]);
+            printf("  %s medium --batches 1000\n", argv[0]);
+            printf("  %s small\n", argv[0]);
+            return PASS_VALUE;
+        } else {
+            printf("Error: Unknown argument '%s'\n", arg);
+            print_usage(argv[0], true);
+            return ERROR_VALUE;
+        }
+    }
+
+    // Derived size: an R2C transform of signal_size reals yields signal_size / 2 + 1 complex values
+    complex_signal_size = signal_size / 2 + 1;
+
+    printf("==============================================\n");
+    printf("cuFFT LTO R2C:C2R Example (Scalable)\n");
+    printf("==============================================\n");
+    printf("Batches: %u\n", batches);
+    printf("Signal size: %u\n", signal_size);
+    printf("Window size: %u\n", window_size);
+    printf("==============================================\n\n");
+
+    // Stop here if the device cannot be queried; properties would be uninitialised
+    struct cudaDeviceProp properties;
+    int device = 0;
+    if (CHECK_ERROR(cudaGetDevice(&device)) != PASS_VALUE ||
+        CHECK_ERROR(cudaGetDeviceProperties(&properties, device)) != PASS_VALUE) {
+        return ERROR_VALUE;
+    }
+    if (!(properties.major >= 5)) {
+        printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n");
+        return ERROR_VALUE;
+    }
+
+    int result = test_r2c_window_c2r();
+
+    printf("\n==============================================\n");
+    if (result == PASS_VALUE) {
+        printf("SUCCESS: LTO R2C:C2R completed\n");
+    } else {
+        printf("FAILED: LTO R2C:C2R\n");
+    }
+    printf("==============================================\n");
+
+    return result;
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h
new file mode 100644
index 000000000..576c68378
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/nvrtc_helper.h
@@ -0,0 +1,102 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/* A simplified version of the NVRTC helper included with CUDA samples,
+ * targeting cuFFT LTO callbacks
+ */
+
+#ifndef COMMON_NVRTC_HELPER_H_
+#define COMMON_NVRTC_HELPER_H_
+
+#include <cuda.h>
+#include <nvrtc.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+// Evaluates the NVRTC expression x once. If the result is not NVRTC_SUCCESS,
+// prints Name together with nvrtcGetErrorString(result) to std::cerr and
+// terminates the process with exit(1).
+// Wrapped in do { } while (0) so it behaves as a single statement.
+#define NVRTC_SAFE_CALL(Name, x)                                \
+  do {                                                          \
+    nvrtcResult result = x;                                     \
+    if (result != NVRTC_SUCCESS) {                              \
+      std::cerr << "\nerror: " << Name << " failed with error " \
+                << nvrtcGetErrorString(result) << std::endl;    \
+      exit(1);                                                  \
+    }                                                           \
+  } while (0)
+
+#define STRINGIZE2(s) #s
+#define STRINGIZE(s) STRINGIZE2(s)
+#define INCLUDE_CUDA_PATH "-I" STRINGIZE(CUDA_PATH) "/include"
+#define CUDA_ARCH_FLAG "-arch=compute_" STRINGIZE(CUDA_ARCH)
+#define CALLBACK_CODE_PATH(name) STRINGIZE(SOURCE_PATH) "/" name
+
+// Compiles the CUDA source file `filename` with NVRTC using -dlto and stores
+// the resulting LTO-IR bytes in cubin_result (previous contents are replaced).
+// Any compilation log is echoed to std::cerr. Every failure (unreadable file,
+// compile error, NVRTC API error) prints a message and calls exit(1).
+// Declared inline because the definition lives in a header.
+inline void compile_file_to_lto(std::vector<char>& cubin_result, const char *filename) {
+  // Open at the end (ios::ate) so tellg() reports the file size
+  std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
+  if (!inputFile.is_open()) {
+    std::cerr << "\nerror: unable to open " << filename << " for reading!\n";
+    exit(1);
+  }
+
+  const std::streampos pos = inputFile.tellg();
+  if (static_cast<std::streamoff>(pos) < 0) {
+    std::cerr << "\nerror: unable to determine the size of " << filename << "!\n";
+    exit(1);
+  }
+  const size_t inputSize = (size_t)pos;
+  std::vector<char> memBlock(inputSize + 1);
+
+  inputFile.seekg(0, std::ios::beg);
+  inputFile.read(memBlock.data(), inputSize);
+  if (!inputFile) {
+    std::cerr << "\nerror: unable to read " << filename << "!\n";
+    exit(1);
+  }
+  inputFile.close();
+  // NVRTC expects a NUL-terminated source string
+  memBlock[inputSize] = '\x0';
+
+  const char *compile_params[] = {INCLUDE_CUDA_PATH,
+                                  CUDA_ARCH_FLAG,
+                                  "--std=c++11",
+                                  "--relocatable-device-code=true",
+                                  "-default-device",
+                                  "-dlto"};
+  // Derived from the array so the count cannot drift from the option list
+  const int num_params = static_cast<int>(sizeof(compile_params) / sizeof(compile_params[0]));
+
+  // Compile; the result is checked only after the log has been printed
+  nvrtcProgram prog;
+  NVRTC_SAFE_CALL("nvrtcCreateProgram", nvrtcCreateProgram(&prog, memBlock.data(), filename, 0, NULL, NULL));
+  nvrtcResult res = nvrtcCompileProgram(prog, num_params, compile_params);
+
+  // Print log
+  size_t logSize = 0;
+  NVRTC_SAFE_CALL("nvrtcGetProgramLogSize", nvrtcGetProgramLogSize(prog, &logSize));
+  std::vector<char> log(logSize + 1);
+  NVRTC_SAFE_CALL("nvrtcGetProgramLog", nvrtcGetProgramLog(prog, log.data()));
+  log[logSize] = '\x0';
+
+  if(log.size() > 2) {
+    std::cerr << "\n compilation log ---\n";
+    // Printed as a C string so the terminating NUL bytes are not written out
+    std::cerr << log.data();
+    std::cerr << "\n end log ---\n";
+  }
+
+  NVRTC_SAFE_CALL("nvrtcCompileProgram", res);
+
+  size_t codeSize = 0;
+  NVRTC_SAFE_CALL("nvrtcGetLTOIRSize", nvrtcGetLTOIRSize(prog, &codeSize));
+  std::vector<char> buffer(codeSize);
+  NVRTC_SAFE_CALL("nvrtcGetLTOIR", nvrtcGetLTOIR(prog, buffer.data()));
+
+  // Release the NVRTC program now that the LTO-IR has been copied out
+  NVRTC_SAFE_CALL("nvrtcDestroyProgram", nvrtcDestroyProgram(&prog));
+
+  cubin_result.swap(buffer);
+}
+
+#endif  // COMMON_NVRTC_HELPER_H_
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu
new file mode 100644
index 000000000..7d838d028
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_legacy_callback_example.cu
@@ -0,0 +1,148 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of legacy (non-LTO) callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+#include <cuda_runtime_api.h>
+#include <cufftXt.h>
+#include "common.h"
+#include "r2c_c2r_reference.h"
+#include "callback_params.h"
+
+// This is the load callback routine. It filters high frequencies
+// based on a truncation window specified by the user
+// NOTE: unlike the LTO version, the callback function can have
+// any name
+__constant__ unsigned cmem_window_size = window_size;
+__constant__ unsigned cmem_signal_size = complex_signal_size;
+// Load callback: returns input[index] when the sample's position within its
+// signal (index modulo the complex signal size) lies inside the window, and
+// zero otherwise. The sizes come from constant memory when
+// CB_USE_CONSTANT_MEMORY is defined, else from the cb_params that info points to.
+__device__ cufftComplex windowing_callback(void *input,
+                                           size_t index,
+                                           void *info,
+                                           void *sharedmem) {
+	const cufftComplex* spectrum = static_cast<const cufftComplex*>(input);
+#ifdef CB_USE_CONSTANT_MEMORY
+	const unsigned period = cmem_signal_size;
+	const unsigned keep   = cmem_window_size;
+#else
+	const cb_params* params = static_cast<const cb_params*>(info);
+	const unsigned period = params->signal_size;
+	const unsigned keep   = params->window_size;
+#endif
+	const unsigned sample = index % period;
+	if (sample < keep) {
+		return spectrum[index];
+	}
+	return cufftComplex{0.f, 0.f};
+}
+
+__device__ cufftCallbackLoadC device_callback_ptr = windowing_callback;
+
+static_assert(window_size < (signal_size/2 + 1), "The window size must be smaller than the signal size in complex space");
+
+// Legacy-callback variant of the round-trip test: in-place batched R2C, then
+// C2R with windowing_callback attached as a load callback through
+// cufftXtSetCallback, then comparison against reference_r2c_window_c2r().
+// Returns PASS_VALUE when the L2 error is below threshold, ERROR_VALUE otherwise.
+// CHECK_ERROR only prints; its return value is not inspected here, so the
+// function keeps going after a failed CUDA/cuFFT call.
+// NOTE(review): callback_params.h declares batches, signal_size, window_size
+// and complex_signal_size as non-const extern variables, so the array bounds
+// below are not compile-time constants - confirm this file still builds.
+int test_r2c_window_c2r() {
+
+	// Padded array for in-place transforms
+	// (input is zero-initialised because init_input_signals accumulates with +=)
+	float  input_signals[batches][2 * complex_signal_size] = {};
+	float output_signals[batches][2 * complex_signal_size];
+	float      reference[batches][2 * complex_signal_size];
+
+	init_input_signals(batches, signal_size, &input_signals[0][0]);
+
+	const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float);
+
+	// Allocate and copy input from host to GPU
+	float *device_signals;
+	CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
+	CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));
+
+	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
+	cufftHandle forward_plan, inverse_plan_cb;
+	size_t work_size;
+
+	CHECK_ERROR(cufftCreate(&forward_plan));
+	CHECK_ERROR(cufftCreate(&inverse_plan_cb));
+
+	// Unlike the LTO variant, the plans are made before the callback is attached
+	CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
+	CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size));
+
+	// NOTE: The host needs to get a copy of the device pointer to the callback. Not required for LTO callback
+	cufftCallbackLoadC host_callback_ptr;
+	CHECK_ERROR(cudaMemcpyFromSymbol(&host_callback_ptr, device_callback_ptr, sizeof(host_callback_ptr)));
+
+#ifdef CB_USE_CONSTANT_MEMORY
+	// Sizes are read from constant memory, so no per-call info block is passed
+	cb_params *device_params = nullptr;
+#else
+	// Define a structure used to pass in the window size
+	cb_params host_params;
+	host_params.window_size = window_size;
+	host_params.signal_size = complex_signal_size;
+
+	// Allocate and copy callback parameters from host to GPU
+	cb_params *device_params;
+	CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params)));
+	CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice));
+#endif
+	// Now associate the load callback with the plan.
+	CHECK_ERROR(cufftXtSetCallback(inverse_plan_cb, (void **)&host_callback_ptr, CUFFT_CB_LD_COMPLEX, (void **)&device_params));
+
+	// Transform signal forward
+	printf("Transforming signal cufftExecR2C\n");
+	CHECK_ERROR(cufftExecR2C(forward_plan,    (cufftReal *)device_signals, (cufftComplex *)device_signals));
+
+	// Apply window via load callback and inverse-transform the signal
+	printf("Transforming signal cufftExecC2R\n");
+	CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals));
+
+	// Copy device memory to host
+	CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));
+
+	// Destroy CUFFT context
+	CHECK_ERROR(cufftDestroy(forward_plan));
+	CHECK_ERROR(cufftDestroy(inverse_plan_cb));
+
+	// Cleanup memory
+	CHECK_ERROR(cudaFree(device_signals));
+	CHECK_ERROR(cudaFree(device_params));
+
+	// Compute reference
+	if (reference_r2c_window_c2r(batches, signal_size, window_size, &input_signals[0][0], &reference[0][0]) != PASS_VALUE) {
+		printf("Failed to compute the reference");
+		return ERROR_VALUE;
+	}
+
+	// Relative L2 error over the first signal_size samples of every batch row
+	double l2_error = compute_error<float>(&reference[0][0], &output_signals[0][0], batches, signal_size);
+	printf("L2 error: %e\n", l2_error);
+
+	return (l2_error < threshold) ? PASS_VALUE : ERROR_VALUE;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+// Queries the current device, rejects compute capability below 5.x and
+// otherwise returns the result of test_r2c_window_c2r().
+int main(int argc, char **argv) {
+	int device;
+	CHECK_ERROR(cudaGetDevice(&device));
+
+	struct cudaDeviceProp properties;
+	CHECK_ERROR(cudaGetDeviceProperties(&properties, device));
+
+	if (properties.major < 5) {
+		printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n");
+		return ERROR_VALUE;
+	}
+
+	return test_r2c_window_c2r();
+}
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu
new file mode 100644
index 000000000..1431e3c1b
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device.cu
@@ -0,0 +1,45 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of LTO callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+#include <cufftXt.h>
+#include "callback_params.h"
+
+// This is the load callback routine. It filters high frequencies
+// based on a truncation window specified by the user
+// Returns input[idx] when the sample's position within its signal
+// (idx modulo params->signal_size) is below params->window_size, and a zero
+// complex value otherwise. info must point to a cb_params structure.
+__device__ cufftComplex windowing_callback(void*              input,
+                                           unsigned long long idx,
+                                           void*              info,
+                                           void*              sharedmem) {
+
+	const cb_params*    window   = static_cast<const cb_params*>(info);
+	const cufftComplex* spectrum = static_cast<const cufftComplex*>(input);
+
+	// Position of this element inside its own signal
+	const unsigned sample = idx % window->signal_size;
+
+	if (sample < window->window_size) {
+		return spectrum[idx];
+	}
+	return cufftComplex{0.f, 0.f};
+}
+
+// Note: Constant memory version removed for scalable implementation
+// The callback now uses runtime parameters passed through info pointer
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h
new file mode 100644
index 000000000..4e97cd281
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_callback_device_fatbin.h
@@ -0,0 +1,92 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned long long window_callback[] = {
+0x00100001ba55ed50ULL,0x00000000000009f8ULL,0x0000007801010008ULL,0x0000000000000980ULL,
+0x000000400000097aULL,0x0000004b00010040ULL,0x0000000000000000ULL,0x0000000000012011ULL,
+0x00000000861d559cULL,0x0000000000000b38ULL,0x0000002800000048ULL,0x2d20303d7a74662dULL,
+0x7669645f63657270ULL,0x636572702d20313dULL,0x20313d747271735fULL,0x20313d64616d662dULL,
+0x0000000000000000ULL,0xb3df19853aa79a4eULL,0x9ce55e973efaf7b9ULL,0xfc5d4b5fbd7f7656ULL,
+0xa16dd934438c2971ULL,0x73d0c26bafc12e47ULL,0xc55e34b6edfe113aULL,0x6a89003a1b8764a5ULL,
+0xf1da8d1ac6137429ULL,0x2056217118c8564cULL,0xd408cb5fa0a53a2bULL,0xcbe5acb9331f5bfaULL,
+0x462be6b0cca5934dULL,0x9b9e7c1e0e218196ULL,0xd873ec4c22f37e49ULL,0x02ebbd52b2810818ULL,
+0xec2e1d2a87e77e0dULL,0x4915b0a280110c87ULL,0x1489d837223481b0ULL,0xcb43c090d76cf5c6ULL,
+0xb22f9e00526ba48dULL,0xc680d59356482921ULL,0x0b0aeb2ecdc91c3bULL,0x7d84415d3f51f0d7ULL,
+0x989832ace4dc2e90ULL,0x9b734532920eeedeULL,0xf31d2a7716674f31ULL,0x4ef6f384005e385fULL,
+0x15a46b421f64a904ULL,0xcf084f6bd9e1e817ULL,0x9204f365d20a2ac4ULL,0xba35c48d37212d76ULL,
+0x1997f360b753f06cULL,0x2f1b5cfe7ebc4108ULL,0xb30a4f954c82ef29ULL,0x8b5b329795581519ULL,
+0xe012c44eaf3d9c4bULL,0x19a9d60149758779ULL,0x61e73f3af4a79265ULL,0x0c41dde210f2040eULL,
+0xf65b08c5efa40305ULL,0xde45eeb929bad831ULL,0xc62fac341327b9a9ULL,0x3dc6a75c639e59e0ULL,
+0xf0e2bf18f990e7bdULL,0x9e07cea9c0a2b240ULL,0x403f7fdf59a9f22bULL,0xf533989bd7551679ULL,
+0xb972ad03a62d611dULL,0x5814c8f1029f9063ULL,0x935cd7350fb780c2ULL,0x0480baf83650c1daULL,
+0xae6e283c826dab1bULL,0x9403eeecba31977eULL,0x171516ce797d87e7ULL,0x978abc930e89a78cULL,
+0x8aec5f230461ebb8ULL,0xcf51594c1dcc07e4ULL,0x4e7c1b94b7d479a4ULL,0xd23236cd4982d9baULL,
+0x69793ccdd0946bcbULL,0x2f667e3119cee955ULL,0xfe38643d463ac0baULL,0xc4f5842cbb9645b0ULL,
+0xedf1793fd1e1ec67ULL,0x09651a6bfbc63253ULL,0x17e852707a4e1b4aULL,0x8ca32cfb49e9c1f2ULL,
+0xe561e27972e37105ULL,0x6b17da7adb5c6dbdULL,0x859c087b1b46341dULL,0xc78b3aee1de06653ULL,
+0x2296d389a9abff58ULL,0x8d44a92ae716d279ULL,0xdabae2b9cffe9335ULL,0xdd57f3e3a54e3f9aULL,
+0x6fa10d4a75988453ULL,0x44751e637287cc4fULL,0x3826e90044b88618ULL,0xabdc86d1a00c4864ULL,
+0xac5154b5269750faULL,0x0b0c98d0a734e6aeULL,0x8a38ebbe6dfcd890ULL,0xf578bc7e9f63584eULL,
+0x066a20a40c81aa79ULL,0xb1da95c681714f98ULL,0x95a7a8939262db69ULL,0xeb5d6d162db0c47dULL,
+0xf3cbe8f708242112ULL,0xaec2f3d2534d3653ULL,0xe368cdcad21cc38fULL,0x6ddf47e86c29ae6bULL,
+0x217c3065a6c9fc31ULL,0x8cc86fed4c910837ULL,0x734aa81b3a35bdb4ULL,0xdfab9d5e3944c522ULL,
+0xba8ace5aef0d926eULL,0xbca8fb8a3dd0dd71ULL,0x8c6f9b2ea76144a1ULL,0x3ab7c46491c5d372ULL,
+0xfdea9370de1a3944ULL,0xb21ede18c16ec2cfULL,0xa247d0c8f3581cc0ULL,0xfb8a09a13694e5e5ULL,
+0x2e3a35c923afdfbaULL,0x538bf2b326b6461dULL,0x6c38602ae2353a33ULL,0x0616b90c76d45fd1ULL,
+0x8ba1ab4e2e5418a2ULL,0x6b1b4cba49aab043ULL,0xf4b3807b6202b50eULL,0x30e460e644adc04aULL,
+0x2177e70736c1fc36ULL,0x08bc8cf5c8b924a4ULL,0x511edf6f6b2cafe8ULL,0xd50990da78d708e6ULL,
+0x39bd58ef4af9f7d6ULL,0xc8d83cb7c862305dULL,0x0b64be09f791357fULL,0x203298c8a390b840ULL,
+0xe8655af21c86b93dULL,0x0e9e081bebcb83f9ULL,0x7d4540d9fdae1387ULL,0xdc7ea2fccf583b79ULL,
+0x58948df65e916276ULL,0x4fa699acc456d34eULL,0xa9bd264cb8c624b1ULL,0x6186bb1f42e517c7ULL,
+0xf4d0cecdb6c77c86ULL,0x91b62496e92d4007ULL,0xe6884e4a995fbfc8ULL,0x360fb1f078911dbaULL,
+0xd2913b43fc8ed09aULL,0x4802ac43e9cce4cbULL,0x51792bb1d9b9f6a5ULL,0x1b80a0a7c4f6aa74ULL,
+0xe9cfe89ebf0fd497ULL,0x954e2ca2a5e42f12ULL,0xfde393fba24954f2ULL,0x570e5780e2d3f9faULL,
+0x8f1352dda1f7ae5eULL,0xf3656cb562fd1ab6ULL,0x89d52ee3498587efULL,0x08ec08a1a6be2e49ULL,
+0x2d8820f34153deefULL,0x579b418c80f79846ULL,0x996b578d31659089ULL,0x85b836c947893a33ULL,
+0xe72e090c3cb8f0beULL,0xa4c03328e3c94489ULL,0xf6b569ca5ca0c462ULL,0xdd2e775be2369af4ULL,
+0x8524d124d8aa721eULL,0xc0a77bc694326c48ULL,0xb6c1afa438fd842bULL,0xd7f290a76df49a2cULL,
+0x99ead5386853ed6fULL,0x799b53798b539b1cULL,0x5db2e52ee8e56ee0ULL,0xfe08d75aec95f6aeULL,
+0xfedbea28c256805cULL,0xdfd5f2a84e6bb1b3ULL,0x998a35c7cc4c6ea4ULL,0xd811df87ac37b46bULL,
+0xe1a8cd605a7ee7a2ULL,0x6e50e24c7ad14c0bULL,0xb0939d317d59fbafULL,0x634ed5336389b361ULL,
+0x6ffd0cac1457b393ULL,0xb051e7a7daad179cULL,0xa8717200aa075d3fULL,0x41deb9cd9acd8a15ULL,
+0xeb41d554ef55aebcULL,0xa0d2661239c800b3ULL,0xbf327a02cef15065ULL,0xfba11541537dfec0ULL,
+0x537d8e14e95a8fccULL,0x17ce1cb6524b5280ULL,0x022e06d043aeeaa5ULL,0x7e702426c779b0e5ULL,
+0x5f3e81b3ea707e5aULL,0x07cad8b5ce0d8817ULL,0xc87601c78dac0445ULL,0x777611b269475022ULL,
+0x3e24e81e1350d045ULL,0x253ecf00d8fbc0b9ULL,0x1178a94ef37c250eULL,0x007f3d52ecf1fcd7ULL,
+0xe40a338a55e96fe9ULL,0x65fe06013586f9f0ULL,0x39552ceeab4b2881ULL,0xee7f86bd4d92ba59ULL,
+0x3bb8faa518e27aecULL,0x9a3741260e2efcb7ULL,0x8dfa691ddb3a3482ULL,0x8e1565f9c1426e43ULL,
+0x279a9a74fe66c9d1ULL,0x13b9f8925cc24f4fULL,0x43bf58cead91cef4ULL,0xd6e2b4a88629424dULL,
+0xeee159c966933528ULL,0x9c4fb5ad60dc37b5ULL,0x17e808902199a72fULL,0x370ad041076053d1ULL,
+0xf75706a61ae5a5deULL,0xd6386efeede5af4eULL,0xd63aa08aeabeb868ULL,0x621b3cb0fdbd7244ULL,
+0xb471a2fc0bec5acfULL,0x20c318ab7abff16eULL,0x6c70f515eb8270b6ULL,0x15a67ea5468a3b9fULL,
+0xac806d321b5f7729ULL,0xb6e8d71b32633a44ULL,0x1d81a3fc2e385b6cULL,0x6bc1e3c198e7a0a2ULL,
+0xc6da0604ab91d760ULL,0xf35056b67187bf6dULL,0xf9b6eb751b88af0bULL,0xef422c4a4f7d915eULL,
+0xa8f8324f46270989ULL,0xdd52d4b110c83432ULL,0xa25a603ec3ec6d75ULL,0x4c9b33515196921cULL,
+0x02276767300c71c5ULL,0x457700ffba63c7f6ULL,0xcf16b3113812082fULL,0xe930a3e7b8729bf8ULL,
+0xa9c1d4ae2411747fULL,0x20a008371ed9d342ULL,0x9205463540dd2ca6ULL,0x0f4ff6d7db127aa1ULL,
+0xbd854c29ce1afbfbULL,0x1e8231fdfde87d45ULL,0x28efc8ad57411175ULL,0xad634e05d1009295ULL,
+0xd7073fdea7279cc9ULL,0x920c15ac2619c0a1ULL,0x34f3b62f46bb053aULL,0x47496dd4845c853fULL,
+0x14b46f3bfde59fb2ULL,0x1560522a21ca7458ULL,0x855978202eee5338ULL,0x0650162f164f2ee3ULL,
+0x5bb43d678278fe99ULL,0x00e41b9de79c21abULL,0xbb814528fa67f1daULL,0xd9ead75fbb0924daULL,
+0x11bcc3bdf9f2c518ULL,0x20162ada6a99bda9ULL,0xddb6d248728f7fafULL,0x2af54679064abca0ULL,
+0x61be9c5fc3359a21ULL,0x67bab1acb7e2a71eULL,0x98ffd55efb24b2abULL,0x7bce9f79ae365a19ULL,
+0x6eed62d19aaf43dcULL,0x678fe63ccb7f2d30ULL,0xdcf998ca5ef7076dULL,0xbd75e71fd6515355ULL,
+0x93df67d27ddebc8dULL,0xa331ef2c4e33c37fULL,0xa1668f9cc53cf9fbULL,0x54eeaba4cce0e139ULL,
+0xdc713dc459e8f295ULL,0xb1f751a14e827a95ULL,0xdc4c6b72e04299e7ULL,0xa45f439b44e5bcb0ULL,
+0x75d528261a9dd346ULL,0x60bbe86e87d440d6ULL,0x5507f0a0f2f9d449ULL,0x589ddbe8d38c87d4ULL,
+0x6a9325188b1b4e13ULL,0x6debafe7543805f4ULL,0x3c59cad53035933bULL,0x5425f2c2d6a01b15ULL,
+0xe9e0f53e2e548128ULL,0xd6b40f8b54205c19ULL,0x0528cd0440ae1a58ULL,0xaa8d82f522379d6eULL,
+0x8e96f1d46b2078acULL,0x7805c441a20ac004ULL,0x5704b4b0a0caa41dULL,0xc255c06a850c0d62ULL,
+0xae2a53f4c3edd4ddULL,0x3e52c89c671196c5ULL,0xff37ac769ca776bcULL,0xb49a880feb53a57fULL,
+0x350459f19159481eULL,0xd9cb47e6b7eeb44dULL,0x448720ab781460cfULL,0x67033b8e51fa51e6ULL,
+0x135f62109a1a3bebULL,0xdadd7ecbe4238ad5ULL,0xcefe8f02b9ba9165ULL,0x59999f0427393ea1ULL,
+0x02582b0a20ebe3f2ULL,0xe26080ea0fe5f050ULL,0x00851a6d480e2f34ULL,0xdcea9e2bab8481deULL,
+0x0ae74fed789fb981ULL,0xeb5fa5b0f630d9aaULL,0xeab795f79c0fc9e0ULL,0x74810dca5d08ee93ULL,
+0x00000000000078fbULL
+};
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp
new file mode 100644
index 000000000..4ec57e6bb
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_lto_nvrtc_callback_example.cpp
@@ -0,0 +1,132 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+
+/*
+ * Example showing the use of LTO callbacks with CUFFT to perform
+ * truncation with zero padding.
+ *
+*/
+
+#include <cuda_runtime_api.h>
+#include <cufftXt.h>
+#include "r2c_c2r_reference.h"
+#include "common.h"
+#include "nvrtc_helper.h"
+#include "callback_params.h"
+static_assert(window_size < (signal_size/2 + 1), "The window size must be smaller than the signal size in complex space");
+
+int test_r2c_window_c2r() {
+	// Runs R2C then a windowed C2R (NVRTC-compiled LTO load callback) and compares against the reference path.
+	float  input_signals[batches][2 * complex_signal_size] = {}; // padded rows for in-place transforms
+	float output_signals[batches][2 * complex_signal_size];
+	float      reference[batches][2 * complex_signal_size];
+
+	init_input_signals(batches, signal_size, &input_signals[0][0]);
+
+	const size_t complex_size_bytes = batches * complex_signal_size * 2 * sizeof(float);
+
+	// Allocate and copy input from host to GPU
+	float *device_signals;
+	CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
+	CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));
+
+	// NOTE: Use NVRTC to compile the callback function to LTO
+	std::vector<char> callback_buffer;
+	compile_file_to_lto(callback_buffer, CALLBACK_CODE_PATH("r2c_c2r_lto_callback_device.cu"));
+
+	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform with load callback
+	cufftHandle forward_plan, inverse_plan_cb;
+	size_t work_size;
+
+	CHECK_ERROR(cufftCreate(&forward_plan));
+	CHECK_ERROR(cufftCreate(&inverse_plan_cb));
+
+	// NOTE: LTO callbacks must be set before plan creation and cannot be unset (yet)
+#ifdef CB_USE_CONSTANT_MEMORY
+	cb_params *device_params  = nullptr;
+	std::string callback_name = "windowing_constant_memory_callback"; // NOTE(review): device source states this variant was removed - confirm
+#else
+	// Define a structure used to pass in the window size
+	cb_params host_params;
+	host_params.window_size = window_size;
+	host_params.signal_size = complex_signal_size;
+
+	// Allocate and copy callback parameters from host to GPU
+	cb_params *device_params;
+	CHECK_ERROR(cudaMalloc((void **)&device_params, sizeof(cb_params)));
+	CHECK_ERROR(cudaMemcpy(device_params, &host_params, sizeof(cb_params), cudaMemcpyHostToDevice));
+
+	std::string callback_name = "windowing_callback";
+#endif
+	CHECK_ERROR(cufftXtSetJITCallback(inverse_plan_cb,
+                                      callback_name.c_str(),
+                                      (void*)callback_buffer.data(),
+                                      callback_buffer.size(),
+                                      CUFFT_CB_LD_COMPLEX,
+                                      (void **)&device_params));
+
+	CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
+	CHECK_ERROR(cufftMakePlan1d(inverse_plan_cb, signal_size, CUFFT_C2R, batches, &work_size));
+
+	// Transform signal forward
+	printf("Transforming signal cufftExecR2C\n");
+	CHECK_ERROR(cufftExecR2C(forward_plan,    (cufftReal *)device_signals, (cufftComplex *)device_signals));
+
+	// Apply window via load callback and inverse-transform the signal
+	printf("Transforming signal cufftExecC2R\n");
+	CHECK_ERROR(cufftExecC2R(inverse_plan_cb, (cufftComplex *)device_signals, (cufftReal *)device_signals));
+
+	// Copy device memory to host
+	CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));
+
+	// Destroy CUFFT context
+	CHECK_ERROR(cufftDestroy(forward_plan));
+	CHECK_ERROR(cufftDestroy(inverse_plan_cb));
+
+	// Cleanup memory
+	CHECK_ERROR(cudaFree(device_signals));
+	CHECK_ERROR(cudaFree(device_params));
+
+	// Compute reference (message now ends with a newline like every other status line)
+	if (reference_r2c_window_c2r(batches, signal_size, window_size, input_signals[0], reference[0]) != PASS_VALUE) {
+		printf("Failed to compute the reference\n");
+		return ERROR_VALUE;
+	}
+
+	double l2_error = compute_error<float>(reference[0], output_signals[0], batches, signal_size);
+	printf("L2 error: %e\n", l2_error);
+
+	return (l2_error < threshold) ? PASS_VALUE : ERROR_VALUE; // pass only when the error is under the threshold
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv) {
+    // Look up the active device and its properties before running the test.
+    int current_device;
+    CHECK_ERROR(cudaGetDevice(&current_device));
+    cudaDeviceProp device_props;
+    CHECK_ERROR(cudaGetDeviceProperties(&device_props, current_device));
+    if (device_props.major < 5) {
+        printf("cuFFT with LTO requires CUDA architecture SM5.0 or higher\n");
+        return ERROR_VALUE;
+    }
+    return test_r2c_window_c2r(); // the test's result becomes the process exit status
+}
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu
new file mode 100644
index 000000000..b040bfd54
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.cu
@@ -0,0 +1,86 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Reference for the example of LTO callbacks,
+ * run the same plans but perform the windowing with
+ * a separate kernel.
+ *
+*/
+
+#include <cuda_runtime_api.h>
+#include <cufft.h>
+#include <stdio.h>
+#include <vector>
+#include "r2c_c2r_reference.h"
+
+__global__ void windowing(unsigned nbatches, unsigned complex_signal_size, unsigned window_size, float2* buffer)
+{
+	// Zeroes each complex element whose position within its signal is >= window_size; one thread per element.
+	// Index math is widened to size_t before multiplying so it cannot wrap at 32 bits.
+	const size_t idx   = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+	const size_t total = static_cast<size_t>(nbatches) * complex_signal_size;
+	if(idx >= total) return; // surplus threads of the last block have no element
+
+	const unsigned sample = idx % complex_signal_size;
+	if(sample >= window_size) buffer[idx] = make_float2(0.f, 0.f); // in-window elements are left as they are
+}
+
+int reference_r2c_window_c2r(unsigned batches, unsigned signal_size, unsigned window_size, float* input_signals, float* output_signals) {
+	// Reference path: R2C, windowing in a separate kernel, then C2R, all in place on one device buffer; returns PASS_VALUE.
+	const unsigned complex_signal_size = signal_size / 2 + 1;
+	const size_t total_elements        = static_cast<size_t>(batches) * complex_signal_size; // widened first: no 32-bit wrap
+	const size_t complex_size_bytes    = total_elements * 2 * sizeof(float);
+	// Allocate and copy input from host to GPU
+	float *device_signals;
+	CHECK_ERROR(cudaMalloc((void **)&device_signals, complex_size_bytes));
+	CHECK_ERROR(cudaMemcpy(device_signals, input_signals, complex_size_bytes, cudaMemcpyHostToDevice));
+
+	// Create a CUFFT plan for the forward transform, and a cuFFT plan for the inverse transform
+	cufftHandle forward_plan, inverse_plan;
+	size_t work_size;
+
+	CHECK_ERROR(cufftCreate(&forward_plan));
+	CHECK_ERROR(cufftCreate(&inverse_plan));
+	CHECK_ERROR(cufftMakePlan1d(forward_plan, signal_size, CUFFT_R2C, batches, &work_size));
+	CHECK_ERROR(cufftMakePlan1d(inverse_plan, signal_size, CUFFT_C2R, batches, &work_size));
+
+	// Transform signal forward
+	printf("Transforming reference cufftExecR2C\n");
+	CHECK_ERROR(cufftExecR2C(forward_plan, (cufftReal *)device_signals, (cufftComplex *)device_signals));
+
+	// Apply window via separate kernel: 256 threads per block, grid rounded up to cover every element
+	windowing<<<static_cast<unsigned>((total_elements + 255) / 256), 256>>>(batches, complex_signal_size, window_size, (float2*) device_signals);
+	CHECK_ERROR(cudaGetLastError());      // failures detected at launch
+	CHECK_ERROR(cudaDeviceSynchronize()); // failures raised while the kernel ran (previously ignored)
+
+	// Inverse-transform the signal
+	printf("Transforming reference cufftExecC2R\n");
+	CHECK_ERROR(cufftExecC2R(inverse_plan, (cufftComplex *)device_signals, (cufftReal *)device_signals));
+
+	// Copy device memory to host
+	CHECK_ERROR(cudaMemcpy(output_signals, device_signals, complex_size_bytes, cudaMemcpyDeviceToHost));
+
+	// Destroy CUFFT context
+	CHECK_ERROR(cufftDestroy(forward_plan));
+	CHECK_ERROR(cufftDestroy(inverse_plan));
+
+	// Cleanup memory
+	CHECK_ERROR(cudaFree(device_signals));
+
+	return PASS_VALUE;
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.h b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.h
new file mode 100644
index 000000000..9872fd98d
--- /dev/null
+++ b/src/cuda/HPC/cufft/cufft_lto_r2c_c2r/r2c_c2r_reference.h
@@ -0,0 +1,26 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#ifndef R2C_C2R_REFERENCE__H_
+#define R2C_C2R_REFERENCE__H_
+
+#include "common.h"
+// Reference path (windowing done by a separate kernel): reads host input_signals, fills host output_signals, returns PASS_VALUE on success.
+int reference_r2c_window_c2r(unsigned batches, unsigned signal_size, unsigned window_size, float* input_signals, float* output_signals);
+
+#endif // R2C_C2R_REFERENCE__H_
\ No newline at end of file
diff --git a/src/cuda/HPC/cusolver/cusolver_Xgetrf/Makefile b/src/cuda/HPC/cusolver/cusolver_Xgetrf/Makefile
new file mode 100644
index 000000000..8fdea2698
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_Xgetrf/Makefile
@@ -0,0 +1,9 @@
+EXECUTABLE := cusolver_Xgetrf_scalable
+CUFILES := cusolver_Xgetrf_scalable.cu
+CCFILES :=
+ADDITIONAL_LIBS := -lcusolver -lcublas
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+# NOTE(review): the settings above are presumably consumed by the shared rules included below - confirm against common.mk.
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_Xgetrf_scalable.cu b/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_Xgetrf_scalable.cu
new file mode 100644
index 000000000..5253a00ae
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_Xgetrf_scalable.cu
@@ -0,0 +1,286 @@
+/*
+ * Modified cuSOLVER Xgetrf example with scalable input sizes
+ * Based on NVIDIA's cusolver_Xgetrf_example.cu
+ *
+ * Accepts -m/--m <value> for an explicit dimension, or one of these size presets:
+ *   small:  16x16
+ *   medium: 128x128
+ *   large:  512x512
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <random>
+
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+#include "cusolver_utils.h"
+
+void print_usage(const char* prog_name) { // Writes the help text to stdout; prog_name fills the usage and example lines.
+    printf("Usage: %s [OPTIONS]\n", prog_name);
+    fputs("\n"
+          "Options:\n"
+          "  -m, --m <value>    Matrix dimension (creates mxm matrix)\n"
+          "\n"
+          "Presets:\n"
+          "  small              16x16 matrix\n"
+          "  medium             128x128 matrix\n"
+          "  large              512x512 matrix\n"
+          "\n"
+          "Examples:\n", stdout);
+    printf("  %s --m 1024        # 1024x1024 matrix\n"
+           "  %s -m 2048         # 2048x2048 matrix\n"
+           "  %s small           # 16x16 matrix\n"
+           "  %s medium          # 128x128 matrix\n"
+           "\n", prog_name, prog_name, prog_name, prog_name);
+}
+
+int main(int argc, char *argv[]) {
+    // Entry point: parse the size argument, LU-factorize a random m x m matrix (Xgetrf), then solve A*X = B (Xgetrs).
+    int64_t m = 16;  // Default: small
+    const char* size_name = "small";
+    bool custom_m = false;
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
+            print_usage(argv[0]);
+            return 0;
+        } else if (strcmp(argv[i], "--m") == 0 || strcmp(argv[i], "-m") == 0) {
+            if (i + 1 < argc) {
+                char* end = nullptr;
+                m = strtoll(argv[++i], &end, 10);  // unlike atoll, reports where parsing stopped
+                if (*end != '\0' || m <= 0) {      // non-numeric text, trailing garbage, or a non-positive size
+                    fprintf(stderr, "Error: matrix dimension must be a positive integer, got '%s'\n", argv[i]);
+                    return 1;
+                }
+                custom_m = true;
+                size_name = "custom";
+            } else {
+                fprintf(stderr, "Error: %s requires a value\n", argv[i]);
+                print_usage(argv[0]);
+                return 1;
+            }
+        } else if (strcmp(argv[i], "small") == 0) {
+            m = 16;
+            size_name = "small";
+        } else if (strcmp(argv[i], "medium") == 0) {
+            m = 128;
+            size_name = "medium";
+        } else if (strcmp(argv[i], "large") == 0) {
+            m = 512;
+            size_name = "large";
+        } else {
+            fprintf(stderr, "Error: Unknown argument '%s'\n", argv[i]);
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    printf("==============================================\n");
+    printf("cuSOLVER Xgetrf Example (Scalable)\n");
+    printf("==============================================\n");
+    if (custom_m) {
+        printf("Matrix size: %ldx%ld\n", m, m);
+    } else {
+        printf("Matrix size: %s (%ldx%ld)\n", size_name, m, m);
+    }
+    printf("Pivot: ON (compute P*A = L*U)\n");
+    printf("==============================================\n\n");
+
+    cusolverDnHandle_t cusolverH = NULL;
+    cudaStream_t stream = NULL;
+
+    using data_type = double;
+
+    const int64_t lda = m;
+    const int64_t ldb = m;
+    const int64_t nrhs = 1;  // number of right-hand sides
+
+    // Host storage: A and LU are indexed as [col * m + row] below; B is the right-hand side, X the solution
+    std::vector<data_type> A(m * m);
+    std::vector<data_type> B(m);
+    std::vector<data_type> X(m, 0);
+    std::vector<data_type> LU(lda * m, 0);
+    std::vector<int64_t> Ipiv(m, 0);
+    int info = 0;
+
+    // Fill A and B with pseudo-random values in [0, 10); the fixed seed keeps runs reproducible
+    std::mt19937 gen(42);  // Fixed seed for reproducibility
+    std::uniform_real_distribution<data_type> dist(0.0, 10.0);
+
+    for (int64_t i = 0; i < m * m; i++) {
+        A[i] = dist(gen);
+    }
+    for (int64_t i = 0; i < m; i++) {
+        B[i] = dist(gen);
+    }
+
+    // For small matrices, print them
+    if (m <= 16) {
+        std::printf("A = (matlab base-1)\n");
+        for (int64_t row = 0; row < m; row++) {
+            for (int64_t col = 0; col < m; col++) {
+                printf("%.2f ", A[col * m + row]);
+            }
+            printf("\n");
+        }
+        std::printf("=====\n");
+        std::printf("B = (matlab base-1)\n");
+        for (int64_t i = 0; i < m; i++) {
+            printf("%.2f ", B[i]);
+        }
+        printf("\n");
+        std::printf("=====\n");
+    }
+
+    data_type *d_A = nullptr;  /* device copy of A */
+    data_type *d_B = nullptr;  /* device copy of B */
+    int64_t *d_Ipiv = nullptr; /* pivoting sequence */
+    int *d_info = nullptr;     /* error info */
+
+    size_t workspaceInBytesOnDevice = 0; /* size of workspace */
+    void *d_work = nullptr;              /* device workspace for getrf */
+    size_t workspaceInBytesOnHost = 0;   /* size of workspace */
+    void *h_work = nullptr;              /* host workspace for getrf */
+
+    printf("Using New Algo\n");
+
+    /* step 1: create cusolver handle, bind a stream */
+    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
+    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
+
+    /* step 2: allocate device buffers, copy A and B to device */
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(data_type) * A.size()));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(data_type) * B.size()));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_Ipiv), sizeof(int64_t) * Ipiv.size()));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
+
+    CUDA_CHECK(cudaMemcpyAsync(d_A, A.data(), sizeof(data_type) * A.size(),
+                               cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(cudaMemcpyAsync(d_B, B.data(), sizeof(data_type) * B.size(),
+                               cudaMemcpyHostToDevice, stream));
+
+    /* step 3: query working space of Xgetrf */
+    cusolverDnParams_t params;
+    CUSOLVER_CHECK(cusolverDnCreateParams(&params));
+
+    CUSOLVER_CHECK(cusolverDnXgetrf_bufferSize(cusolverH, params, m, m,
+                                                CUDA_R_64F, d_A, lda,
+                                                CUDA_R_64F, &workspaceInBytesOnDevice,
+                                                &workspaceInBytesOnHost));
+
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), workspaceInBytesOnDevice));
+
+    if (workspaceInBytesOnHost > 0) {
+        h_work = malloc(workspaceInBytesOnHost);
+        if (h_work == nullptr) {
+            throw std::runtime_error("Error: h_work not allocated.");
+        }
+    }
+
+    printf("Starting LU factorization (Xgetrf)...\n");
+    printf("Workspace: device=%zu bytes, host=%zu bytes\n",
+           workspaceInBytesOnDevice, workspaceInBytesOnHost);
+
+    /* step 4: LU factorization */
+    CUSOLVER_CHECK(cusolverDnXgetrf(cusolverH, params, m, m,
+                                     CUDA_R_64F, d_A, lda, d_Ipiv,
+                                     CUDA_R_64F, d_work, workspaceInBytesOnDevice,
+                                     h_work, workspaceInBytesOnHost, d_info));
+
+    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(LU.data(), d_A, sizeof(data_type) * A.size(),
+                               cudaMemcpyDeviceToHost, stream));
+    CUDA_CHECK(cudaMemcpyAsync(Ipiv.data(), d_Ipiv, sizeof(int64_t) * Ipiv.size(),
+                               cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    std::printf("after Xgetrf: info = %d\n", info);
+    if (0 > info) {
+        std::printf("%d-th parameter is wrong \n", -info);
+        exit(1);
+    } else if (0 < info) {
+        std::printf("WARNING: matrix is singular, U(%d,%d) = 0\n", info, info);
+    }
+
+    // For small matrices, print the whole pivoting sequence; otherwise only the first 8 entries
+    if (m <= 16) {
+        printf("pivoting sequence, matlab base-1\n");
+        for (int64_t i = 0; i < m; i++) {
+            printf("Ipiv(%ld) = %ld\n", i + 1, Ipiv[i]);
+        }
+    } else {
+        printf("pivoting sequence (first 8), matlab base-1\n");
+        for (int i = 0; i < std::min((int64_t)8, m); i++) {
+            printf("Ipiv(%d) = %ld\n", i + 1, Ipiv[i]);
+        }
+    }
+
+    // For small matrices, print L and U
+    if (m <= 16) {
+        printf("L and U = (matlab base-1)\n");
+        for (int64_t row = 0; row < m; row++) {
+            for (int64_t col = 0; col < m; col++) {
+                printf("%.2f ", LU[col * m + row]);
+            }
+            printf("\n");
+        }
+        std::printf("=====\n");
+    }
+
+    printf("Starting solve (Xgetrs)...\n");
+
+    /* step 5: solve A*X = B */
+    CUSOLVER_CHECK(cusolverDnXgetrs(cusolverH, params, CUBLAS_OP_N, m, nrhs,
+                                     CUDA_R_64F, d_A, lda, d_Ipiv,
+                                     CUDA_R_64F, d_B, ldb, d_info));
+
+    CUDA_CHECK(cudaMemcpyAsync(X.data(), d_B, sizeof(data_type) * X.size(),
+                               cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    // Print solution (all of it for small systems, the first 8 entries otherwise)
+    if (m <= 16) {
+        std::printf("X = (matlab base-1)\n");
+        for (int64_t i = 0; i < m; i++) {
+            printf("%.6f ", X[i]);
+        }
+        printf("\n");
+    } else {
+        std::printf("X = (first 8 elements)\n");
+        for (int i = 0; i < std::min((int64_t)8, m); i++) {
+            printf("%.6f ", X[i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n==============================================\n");
+    printf("SUCCESS: LU factorization and solve completed\n");
+    printf("==============================================\n");
+
+    /* free resources */
+    CUDA_CHECK(cudaFree(d_A));
+    CUDA_CHECK(cudaFree(d_B));
+    CUDA_CHECK(cudaFree(d_Ipiv));
+    CUDA_CHECK(cudaFree(d_info));
+    CUDA_CHECK(cudaFree(d_work));
+
+    CUSOLVER_CHECK(cusolverDnDestroyParams(params));
+    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
+
+    CUDA_CHECK(cudaStreamDestroy(stream));
+
+    if (h_work) {
+        free(h_work);
+    }
+
+    CUDA_CHECK(cudaDeviceReset());
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_utils.h b/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_utils.h
new file mode 100644
index 000000000..2d37d51c5
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_Xgetrf/cusolver_utils.h
@@ -0,0 +1,299 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include <cuComplex.h>
+#include <cuda_runtime_api.h>
+#include <cublas_api.h>
+#include <cusolverDn.h>
+#include <library_types.h>
+
+// CUDA API error checking
+#define CUDA_CHECK(err)                                                                            \
+    do {                                                                                           \
+        cudaError_t err_ = (err);                                                                  \
+        if (err_ != cudaSuccess) {                                                                 \
+            printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);                          \
+            throw std::runtime_error("CUDA error");                                                \
+        }                                                                                          \
+    } while (0)
+
+// cusolver API error checking
+#define CUSOLVER_CHECK(err)                                                                        \
+    do {                                                                                           \
+        cusolverStatus_t err_ = (err);                                                             \
+        if (err_ != CUSOLVER_STATUS_SUCCESS) {                                                     \
+            printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
+            throw std::runtime_error("cusolver error");                                            \
+        }                                                                                          \
+    } while (0)
+
+// cublas API error checking
+#define CUBLAS_CHECK(err)                                                                          \
+    do {                                                                                           \
+        cublasStatus_t err_ = (err);                                                               \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                                       \
+            printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);                        \
+            throw std::runtime_error("cublas error");                                              \
+        }                                                                                          \
+    } while (0)
+
+// cublas API error checking
+#define CUSPARSE_CHECK(err)                                                                        \
+    do {                                                                                           \
+        cusparseStatus_t err_ = (err);                                                             \
+        if (err_ != CUSPARSE_STATUS_SUCCESS) {                                                     \
+            printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
+            throw std::runtime_error("cusparse error");                                            \
+        }                                                                                          \
+    } while (0)
+
+// memory alignment: rounds A up to a multiple of B; operands are parenthesized but evaluated more than once
+#define ALIGN_TO(A, B) ((((A) + (B) - 1) / (B)) * (B))
+
+// device memory pitch alignment (presumably in bytes; nothing in this header reads it -- verify at use sites)
+static const size_t device_alignment = 32;
+
+// type traits
+template <typename T> struct traits;
+
+template <> struct traits<float> {
+    // T: element type; S: matching real scalar type (the same type for real elements)
+    using T = float;
+    using S = T;
+
+    static constexpr T zero = 0.f;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_32F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F;
+#endif
+
+    static S abs(T x) { return fabs(x); } // magnitude of x
+
+    template <typename Gen> static T rand(Gen &gen) { return static_cast<S>(gen()); } // one draw
+
+    static T add(T lhs, T rhs) { return lhs + rhs; }
+
+    static T mul(T value, S factor) { return value * factor; }
+};
+
+template <> struct traits<double> {
+    // T: element type; S: matching real scalar type (the same type for real elements)
+    using T = double;
+    using S = T;
+
+    static constexpr T zero = 0.;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_64F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F;
+#endif
+
+    static S abs(T x) { return fabs(x); } // magnitude of x
+
+    template <typename Gen> static T rand(Gen &gen) { return static_cast<S>(gen()); } // one draw
+
+    static T add(T lhs, T rhs) { return lhs + rhs; }
+
+    static T mul(T value, S factor) { return value * factor; }
+};
+
+template <> struct traits<cuFloatComplex> {
+    // S: real scalar type of the components; T: the complex element type
+    using S = float;
+    using T = cuFloatComplex;
+
+    static constexpr T zero = {0.f, 0.f};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_32F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F;
+#endif
+
+    static S abs(T z) { return cuCabsf(z); }
+
+    template <typename Gen> static T rand(Gen &gen) {
+        return make_cuFloatComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static T add(T lhs, T rhs) { return cuCaddf(lhs, rhs); }
+    static T add(T z, S re) { return cuCaddf(z, make_cuFloatComplex(re, 0.f)); } // adds a real number
+
+    static T mul(T z, S factor) { return make_cuFloatComplex(z.x * factor, z.y * factor); }
+};
+
+template <> struct traits<cuDoubleComplex> {
+    // S: real scalar type of the components; T: the complex element type
+    using S = double;
+    using T = cuDoubleComplex;
+
+    static constexpr T zero = {0., 0.};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_64F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F;
+#endif
+
+    static S abs(T z) { return cuCabs(z); }
+
+    template <typename Gen> static T rand(Gen &gen) {
+        return make_cuDoubleComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static T add(T lhs, T rhs) { return cuCadd(lhs, rhs); }
+    static T add(T z, S re) { return cuCadd(z, make_cuDoubleComplex(re, 0.)); } // adds a real number
+
+    static T mul(T z, S factor) { return make_cuDoubleComplex(z.x * factor, z.y * factor); }
+};
+
+// Prints the leading m x n part of column-major matrix A (leading dimension lda), one row per line.
+template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda);
+// The specializations are defined in this header, so they are inline to keep multiple includes ODR-safe.
+template <> inline void print_matrix(const int &m, const int &n, const float *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const double *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+// Allocates with malloc an m x n column-major host matrix of uniform random values in [-1, 1),
+// returned through *A with its leading dimension in *lda; the caller owns (and frees) *A.
+// lda is the row count m: the former value n overran the buffer when m < n and overlapped
+// columns when m > n (square matrices are unaffected). Sizes are computed in size_t, not int.
+template <typename T>
+void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) {
+    if (m < 0 || n < 0)
+        throw std::runtime_error("Matrix dimensions must be non-negative");
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0);
+    auto rand_gen = std::bind(dis, gen);
+
+    *lda = m;
+    const size_t rows = static_cast<size_t>(m), cols = static_cast<size_t>(n);
+    if (rows != 0 && cols > PTRDIFF_MAX / sizeof(T) / rows) // byte count must fit in ptrdiff_t
+        throw std::runtime_error("Memory allocation size is too large");
+    if ((*A = static_cast<T *>(malloc(rows * cols * sizeof(T)))) == NULL)
+        throw std::runtime_error("Unable to allocate host matrix");
+
+    for (int j = 0; j < n; ++j) {
+        T *A_col = (*A) + rows * j;
+        for (int i = 0; i < m; ++i)
+            A_col[i] = traits<T>::rand(rand_gen);
+    }
+}
+
+// Adds to each diagonal entry of the m x n column-major matrix A the summed magnitudes of its column
+template <typename T>
+void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) {
+    const int diag_len = std::min(m, n);
+    for (int col = 0; col < diag_len; ++col) {
+        T *const column = A + lda * col;
+        auto magnitude_sum = traits<typename traits<T>::S>::zero;
+        for (int row = 0; row < m; ++row)
+            magnitude_sum += traits<T>::abs(column[row]);
+        column[col] = traits<T>::add(column[col], magnitude_sum);
+    }
+}
+
+// Returns cudaDataType value as defined in library_types.h for the string containing type name;
+// throws std::runtime_error("Unknown CUDA datatype") for any other string.
+// Declared inline because the definition lives in a header: a non-inline function would be
+// defined once per including translation unit and fail to link.
+inline cudaDataType get_cuda_library_type(std::string type_string) {
+    // Recognized names, in the order the comparisons have always been made.
+    static const struct {
+        const char *name;
+        cudaDataType value;
+    } known_types[] = {
+        {"CUDA_R_16F", CUDA_R_16F},
+        {"CUDA_C_16F", CUDA_C_16F},
+        {"CUDA_R_32F", CUDA_R_32F},
+        {"CUDA_C_32F", CUDA_C_32F},
+        {"CUDA_R_64F", CUDA_R_64F},
+        {"CUDA_C_64F", CUDA_C_64F},
+        {"CUDA_R_8I", CUDA_R_8I},
+        {"CUDA_C_8I", CUDA_C_8I},
+        {"CUDA_R_8U", CUDA_R_8U},
+        {"CUDA_C_8U", CUDA_C_8U},
+        {"CUDA_R_32I", CUDA_R_32I},
+        {"CUDA_C_32I", CUDA_C_32I},
+        {"CUDA_R_32U", CUDA_R_32U},
+        {"CUDA_C_32U", CUDA_C_32U},
+    };
+
+    for (const auto &entry : known_types) {
+        if (type_string.compare(entry.name) == 0)
+            return entry.value;
+    }
+
+    throw std::runtime_error("Unknown CUDA datatype");
+}
+
+// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing
+// solver name. An unknown name is reported on stdout and yields CUSOLVER_IRS_REFINE_NOT_SET.
+// Declared inline so that including this header from several translation units still links.
+inline cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) {
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0)
+        return CUSOLVER_IRS_REFINE_NONE;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0)
+        return CUSOLVER_IRS_REFINE_CLASSICAL;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_GMRES;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_GMRES_GMRES;
+
+    printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str());
+    return CUSOLVER_IRS_REFINE_NOT_SET;
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile b/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile
new file mode 100644
index 000000000..b7430b74e
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_ormqr/Makefile
@@ -0,0 +1,9 @@
+EXECUTABLE := cusolver_ormqr_scalable
+CUFILES := cusolver_ormqr_scalable.cu
+CCFILES :=
+ADDITIONAL_LIBS := -lcusolver -lcublas
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+# Settings above are presumably read by the shared rules included below -- TODO confirm in common.mk.
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu
new file mode 100644
index 000000000..e5aded469
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_ormqr_scalable.cu
@@ -0,0 +1,253 @@
+/*
+ * Modified cuSOLVER ormqr example with scalable input sizes
+ * Based on NVIDIA's cusolver_ormqr_example.cu
+ *
+ * Accepts command-line arguments for matrix size:
+ *   small:  16x16
+ *   medium: 256x256
+ *   large:  768x768
+ */
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+#include <random>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+#include "cusolver_utils.h"
+
+// Prints the command-line synopsis, options, size presets and examples to stdout.
+// prog_name is the program name shown in the synopsis and in every example line.
+void print_usage(const char* prog_name) {
+    printf("Usage: %s [OPTIONS]\n", prog_name);
+    printf("\nOptions:\n");
+    printf("  -m, --m <value>    Matrix dimension (creates mxm matrix)\n");
+    printf("  -h, --help         Show this help and exit\n");
+    printf("\nPresets:\n");
+    printf("  small              16x16 matrix\n");
+    printf("  medium             256x256 matrix\n");
+    printf("  large              768x768 matrix\n");
+    printf("\nExamples:\n");
+    printf("  %s --m 512         # 512x512 matrix\n", prog_name);
+    printf("  %s -m 1024         # 1024x1024 matrix\n", prog_name);
+    printf("  %s small           # 16x16 matrix\n", prog_name);
+    printf("  %s medium          # 256x256 matrix\n", prog_name);
+    printf("\n");
+}
+
+// Solves A*x = B for a random m x m system on the GPU: QR factorization (geqrf), Q^T*B (ormqr),
+// then a triangular solve (trsm). Exit status is EXIT_SUCCESS, or 1 for a bad command line.
+int main(int argc, char *argv[]) {
+    // Parse command line arguments
+    int m = 16;  // Default: small
+    const char* size_name = "small";
+
+    for (int i = 1; i < argc; i++) {
+        if (strcmp(argv[i], "--help") == 0 || strcmp(argv[i], "-h") == 0) {
+            print_usage(argv[0]);
+            return 0;
+        } else if (strcmp(argv[i], "--m") == 0 || strcmp(argv[i], "-m") == 0) {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "Error: %s requires a value\n", argv[i]);
+                print_usage(argv[0]);
+                return 1;
+            }
+            // strtol (unlike atoi) exposes non-numeric input. 46340 = floor(sqrt(2^31 - 1)) is the
+            // largest dimension whose element count m * m still fits in a 32-bit int.
+            char *end = nullptr;
+            const long value = strtol(argv[++i], &end, 10);
+            if (end == argv[i] || *end != '\0' || value < 1 || value > 46340) {
+                fprintf(stderr, "Error: invalid matrix dimension '%s'\n", argv[i]);
+                print_usage(argv[0]);
+                return 1;
+            }
+            m = static_cast<int>(value);
+            size_name = "custom";
+        } else if (strcmp(argv[i], "small") == 0) {
+            m = 16;
+            size_name = "small";
+        } else if (strcmp(argv[i], "medium") == 0) {
+            m = 256;
+            size_name = "medium";
+        } else if (strcmp(argv[i], "large") == 0) {
+            m = 768;
+            size_name = "large";
+        } else {
+            fprintf(stderr, "Error: Unknown argument '%s'\n", argv[i]);
+            print_usage(argv[0]);
+            return 1;
+        }
+    }
+
+    printf("==============================================\n");
+    printf("cuSOLVER ormqr Example (Scalable)\n");
+    printf("==============================================\n");
+    if (strcmp(size_name, "custom") == 0) {  // the last size argument decides the label
+        printf("Matrix size: %dx%d\n", m, m);
+    } else {
+        printf("Matrix size: %s (%dx%d)\n", size_name, m, m);
+    }
+    printf("==============================================\n\n");
+
+    cusolverDnHandle_t cusolverH = NULL;
+    cublasHandle_t cublasH = NULL;
+    cudaStream_t stream{};
+
+    const int lda = m;
+    const int ldb = m;
+    const int nrhs = 1; // number of right hand side vectors
+
+    // Host copies: column-major m x m matrix A, right-hand side B, and the solution from the GPU
+    std::vector<double> A(static_cast<size_t>(m) * m);
+    std::vector<double> B(m);
+    std::vector<double> XC(ldb * nrhs, 0); // solution matrix from GPU
+
+    // Fixed seed, so every run draws the same values: all of A first, then B
+    std::mt19937 gen(42);
+    std::uniform_real_distribution<double> dist(0.0, 10.0);
+    for (double &a : A) a = dist(gen);
+    for (double &b : B) b = dist(gen);
+
+    // For small matrices, print the leading block of the inputs
+    if (m <= 16) {
+        const int print_size = std::min(m, 8);
+        std::printf("A = (first 8x8 block, matlab base-1)\n");
+        for (int row = 0; row < print_size; row++) {
+            for (int col = 0; col < print_size; col++) {
+                printf("%.2f ", A[col * m + row]);
+            }
+            printf("\n");
+        }
+        std::printf("=====\n");
+        std::printf("B = (first 8 elements, matlab base-1)\n");
+        for (int i = 0; i < print_size; i++) {
+            printf("%.2f ", B[i]);
+        }
+        printf("\n");
+        std::printf("=====\n");
+    }
+
+    /* device memory */
+    double *d_A = nullptr;
+    double *d_tau = nullptr;
+    double *d_B = nullptr;
+    int *d_info = nullptr;
+    double *d_work = nullptr;
+
+    int lwork_geqrf = 0;
+    int lwork_ormqr = 0;
+    int lwork = 0;
+    int info = 0;
+
+    const double one = 1;
+
+    /* step 1: create cudense/cublas handle */
+    CUSOLVER_CHECK(cusolverDnCreate(&cusolverH));
+    CUBLAS_CHECK(cublasCreate(&cublasH));
+
+    CUDA_CHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+    CUSOLVER_CHECK(cusolverDnSetStream(cusolverH, stream));
+    CUBLAS_CHECK(cublasSetStream(cublasH, stream));
+
+    /* step 2: copy A and B to device */
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_A), sizeof(double) * A.size()));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_tau), sizeof(double) * m));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_B), sizeof(double) * B.size()));
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_info), sizeof(int)));
+
+    CUDA_CHECK(
+        cudaMemcpyAsync(d_A, A.data(), sizeof(double) * A.size(), cudaMemcpyHostToDevice, stream));
+    CUDA_CHECK(
+        cudaMemcpyAsync(d_B, B.data(), sizeof(double) * B.size(), cudaMemcpyHostToDevice, stream));
+
+    /* step 3: query working space of geqrf and ormqr */
+    CUSOLVER_CHECK(cusolverDnDgeqrf_bufferSize(cusolverH, m, m, d_A, lda, &lwork_geqrf));
+    CUSOLVER_CHECK(cusolverDnDormqr_bufferSize(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, nrhs, m,
+                                               d_A, lda, d_tau, d_B, ldb, &lwork_ormqr));
+    lwork = std::max(lwork_geqrf, lwork_ormqr); // one workspace is shared by both calls
+
+    CUDA_CHECK(cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(double) * lwork));
+
+    printf("Starting QR factorization (geqrf)...\n");
+
+    /* step 4: compute QR factorization */
+    CUSOLVER_CHECK(cusolverDnDgeqrf(cusolverH, m, m, d_A, lda, d_tau, d_work, lwork, d_info));
+
+    /* check if QR is good or not */
+    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    std::printf("after geqrf: info = %d\n", info);
+    if (0 > info) {
+        std::printf("%d-th parameter is wrong \n", -info);
+        exit(1);
+    }
+
+    printf("Starting ormqr (Q^T * B)...\n");
+
+    /* step 5: compute Q^T*B */
+    CUSOLVER_CHECK(cusolverDnDormqr(cusolverH, CUBLAS_SIDE_LEFT, CUBLAS_OP_T, m, nrhs, m, d_A, lda,
+                                    d_tau, d_B, ldb, d_work, lwork, d_info));
+
+    /* check the info value reported by ormqr */
+    CUDA_CHECK(cudaMemcpyAsync(&info, d_info, sizeof(int), cudaMemcpyDeviceToHost, stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    std::printf("after ormqr: info = %d\n", info);
+    if (0 > info) {
+        std::printf("%d-th parameter is wrong \n", -info);
+        exit(1);
+    }
+
+    printf("Computing x = R \\ Q^T*B (triangular solve)...\n");
+
+    /* step 6: compute x = R \ Q^T*B */
+    CUBLAS_CHECK(cublasDtrsm(cublasH, CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER, CUBLAS_OP_N,
+                             CUBLAS_DIAG_NON_UNIT, m, nrhs, &one, d_A, lda, d_B, ldb));
+
+    CUDA_CHECK(cudaMemcpyAsync(XC.data(), d_B, sizeof(double) * XC.size(), cudaMemcpyDeviceToHost,
+                               stream));
+
+    CUDA_CHECK(cudaStreamSynchronize(stream));
+
+    // For small matrices, print the whole solution; otherwise only its first 8 entries
+    if (m <= 16) {
+        std::printf("X = (matlab base-1)\n");
+        for (int i = 0; i < m; i++) {
+            printf("%.6f ", XC[i]);
+        }
+        printf("\n");
+    } else {
+        std::printf("X = (first 8 elements)\n");
+        for (int i = 0; i < std::min(m, 8); i++) {
+            printf("%.6f ", XC[i]);
+        }
+        printf("\n");
+    }
+
+    printf("\n==============================================\n");
+    printf("SUCCESS: QR factorization and solve completed\n");
+    printf("==============================================\n");
+
+    /* free resources */
+    CUDA_CHECK(cudaFree(d_A));
+    CUDA_CHECK(cudaFree(d_tau));
+    CUDA_CHECK(cudaFree(d_B));
+    CUDA_CHECK(cudaFree(d_info));
+    CUDA_CHECK(cudaFree(d_work));
+
+    CUBLAS_CHECK(cublasDestroy(cublasH));
+    CUSOLVER_CHECK(cusolverDnDestroy(cusolverH));
+
+    CUDA_CHECK(cudaStreamDestroy(stream));
+
+    CUDA_CHECK(cudaDeviceReset());
+
+    return EXIT_SUCCESS;
+}
diff --git a/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h
new file mode 100644
index 000000000..2d37d51c5
--- /dev/null
+++ b/src/cuda/HPC/cusolver/cusolver_ormqr/cusolver_utils.h
@@ -0,0 +1,299 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2019 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#pragma once
+
+#include <cmath>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <stdexcept>
+#include <string>
+
+#include <cuComplex.h>
+#include <cuda_runtime_api.h>
+#include <cublas_api.h>
+#include <cusolverDn.h>
+#include <library_types.h>
+
+// CUDA API error checking
+#define CUDA_CHECK(err)                                                                            \
+    do {                                                                                           \
+        cudaError_t err_ = (err);                                                                  \
+        if (err_ != cudaSuccess) {                                                                 \
+            printf("CUDA error %d at %s:%d\n", err_, __FILE__, __LINE__);                          \
+            throw std::runtime_error("CUDA error");                                                \
+        }                                                                                          \
+    } while (0)
+
+// cusolver API error checking
+#define CUSOLVER_CHECK(err)                                                                        \
+    do {                                                                                           \
+        cusolverStatus_t err_ = (err);                                                             \
+        if (err_ != CUSOLVER_STATUS_SUCCESS) {                                                     \
+            printf("cusolver error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
+            throw std::runtime_error("cusolver error");                                            \
+        }                                                                                          \
+    } while (0)
+
+// cublas API error checking
+#define CUBLAS_CHECK(err)                                                                          \
+    do {                                                                                           \
+        cublasStatus_t err_ = (err);                                                               \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                                       \
+            printf("cublas error %d at %s:%d\n", err_, __FILE__, __LINE__);                        \
+            throw std::runtime_error("cublas error");                                              \
+        }                                                                                          \
+    } while (0)
+
+// cublas API error checking
+#define CUSPARSE_CHECK(err)                                                                        \
+    do {                                                                                           \
+        cusparseStatus_t err_ = (err);                                                             \
+        if (err_ != CUSPARSE_STATUS_SUCCESS) {                                                     \
+            printf("cusparse error %d at %s:%d\n", err_, __FILE__, __LINE__);                      \
+            throw std::runtime_error("cusparse error");                                            \
+        }                                                                                          \
+    } while (0)
+
+// memory alignment: rounds A up to a multiple of B; operands are parenthesized but evaluated more than once
+#define ALIGN_TO(A, B) ((((A) + (B) - 1) / (B)) * (B))
+
+// device memory pitch alignment (presumably in bytes; nothing in this header reads it -- verify at use sites)
+static const size_t device_alignment = 32;
+
+// type traits
+template <typename T> struct traits;
+
+template <> struct traits<float> {
+    // T: element type; S: matching real scalar type (the same type for real elements)
+    using T = float;
+    using S = T;
+
+    static constexpr T zero = 0.f;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_32F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_32F;
+#endif
+
+    static S abs(T x) { return fabs(x); } // magnitude of x
+
+    template <typename Gen> static T rand(Gen &gen) { return static_cast<S>(gen()); } // one draw
+
+    static T add(T lhs, T rhs) { return lhs + rhs; }
+
+    static T mul(T value, S factor) { return value * factor; }
+};
+
+template <> struct traits<double> {
+    // T: element type; S: matching real scalar type (the same type for real elements)
+    using T = double;
+    using S = T;
+
+    static constexpr T zero = 0.;
+    static constexpr cudaDataType cuda_data_type = CUDA_R_64F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_R_64F;
+#endif
+
+    static S abs(T x) { return fabs(x); } // magnitude of x
+
+    template <typename Gen> static T rand(Gen &gen) { return static_cast<S>(gen()); } // one draw
+
+    static T add(T lhs, T rhs) { return lhs + rhs; }
+
+    static T mul(T value, S factor) { return value * factor; }
+};
+
+template <> struct traits<cuFloatComplex> {
+    // S: real scalar type of the components; T: the complex element type
+    using S = float;
+    using T = cuFloatComplex;
+
+    static constexpr T zero = {0.f, 0.f};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_32F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_32F;
+#endif
+
+    static S abs(T z) { return cuCabsf(z); }
+
+    template <typename Gen> static T rand(Gen &gen) {
+        return make_cuFloatComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static T add(T lhs, T rhs) { return cuCaddf(lhs, rhs); }
+    static T add(T z, S re) { return cuCaddf(z, make_cuFloatComplex(re, 0.f)); } // adds a real number
+
+    static T mul(T z, S factor) { return make_cuFloatComplex(z.x * factor, z.y * factor); }
+};
+
+template <> struct traits<cuDoubleComplex> {
+    // S: real scalar type of the components; T: the complex element type
+    using S = double;
+    using T = cuDoubleComplex;
+
+    static constexpr T zero = {0., 0.};
+    static constexpr cudaDataType cuda_data_type = CUDA_C_64F;
+#if CUDART_VERSION >= 11000
+    static constexpr cusolverPrecType_t cusolver_precision_type = CUSOLVER_C_64F;
+#endif
+
+    static S abs(T z) { return cuCabs(z); }
+
+    template <typename Gen> static T rand(Gen &gen) {
+        return make_cuDoubleComplex(static_cast<S>(gen()), static_cast<S>(gen()));
+    }
+
+    static T add(T lhs, T rhs) { return cuCadd(lhs, rhs); }
+    static T add(T z, S re) { return cuCadd(z, make_cuDoubleComplex(re, 0.)); } // adds a real number
+
+    static T mul(T z, S factor) { return make_cuDoubleComplex(z.x * factor, z.y * factor); }
+};
+
+// Prints the leading m x n part of column-major matrix A (leading dimension lda), one row per line.
+template <typename T> void print_matrix(const int &m, const int &n, const T *A, const int &lda);
+// The specializations are defined in this header, so they are inline to keep multiple includes ODR-safe.
+template <> inline void print_matrix(const int &m, const int &n, const float *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const double *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f ", A[j * lda + i]);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+template <> inline void print_matrix(const int &m, const int &n, const cuDoubleComplex *A, const int &lda) {
+    for (int i = 0; i < m; i++) {
+        for (int j = 0; j < n; j++) {
+            std::printf("%0.2f + %0.2fj ", A[j * lda + i].x, A[j * lda + i].y);
+        }
+        std::printf("\n");
+    }
+}
+
+// Allocates with malloc an m x n column-major host matrix of uniform random values in [-1, 1),
+// returned through *A with its leading dimension in *lda; the caller owns (and frees) *A.
+// lda is the row count m: the former value n overran the buffer when m < n and overlapped
+// columns when m > n (square matrices are unaffected). Sizes are computed in size_t, not int.
+template <typename T>
+void generate_random_matrix(cusolver_int_t m, cusolver_int_t n, T **A, int *lda) {
+    if (m < 0 || n < 0)
+        throw std::runtime_error("Matrix dimensions must be non-negative");
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<typename traits<T>::S> dis(-1.0, 1.0);
+    auto rand_gen = std::bind(dis, gen);
+
+    *lda = m;
+    const size_t rows = static_cast<size_t>(m), cols = static_cast<size_t>(n);
+    if (rows != 0 && cols > PTRDIFF_MAX / sizeof(T) / rows) // byte count must fit in ptrdiff_t
+        throw std::runtime_error("Memory allocation size is too large");
+    if ((*A = static_cast<T *>(malloc(rows * cols * sizeof(T)))) == NULL)
+        throw std::runtime_error("Unable to allocate host matrix");
+
+    for (int j = 0; j < n; ++j) {
+        T *A_col = (*A) + rows * j;
+        for (int i = 0; i < m; ++i)
+            A_col[i] = traits<T>::rand(rand_gen);
+    }
+}
+
+// Adds to each diagonal entry of the m x n column-major matrix A the summed magnitudes of its column
+template <typename T>
+void make_diag_dominant_matrix(cusolver_int_t m, cusolver_int_t n, T *A, int lda) {
+    const int diag_len = std::min(m, n);
+    for (int col = 0; col < diag_len; ++col) {
+        T *const column = A + lda * col;
+        auto magnitude_sum = traits<typename traits<T>::S>::zero;
+        for (int row = 0; row < m; ++row)
+            magnitude_sum += traits<T>::abs(column[row]);
+        column[col] = traits<T>::add(column[col], magnitude_sum);
+    }
+}
+
+// Returns cudaDataType value as defined in library_types.h for the string containing type name;
+// throws std::runtime_error("Unknown CUDA datatype") for any other string.
+// Declared inline because the definition lives in a header: a non-inline function would be
+// defined once per including translation unit and fail to link.
+inline cudaDataType get_cuda_library_type(std::string type_string) {
+    // Recognized names, in the order the comparisons have always been made.
+    static const struct {
+        const char *name;
+        cudaDataType value;
+    } known_types[] = {
+        {"CUDA_R_16F", CUDA_R_16F},
+        {"CUDA_C_16F", CUDA_C_16F},
+        {"CUDA_R_32F", CUDA_R_32F},
+        {"CUDA_C_32F", CUDA_C_32F},
+        {"CUDA_R_64F", CUDA_R_64F},
+        {"CUDA_C_64F", CUDA_C_64F},
+        {"CUDA_R_8I", CUDA_R_8I},
+        {"CUDA_C_8I", CUDA_C_8I},
+        {"CUDA_R_8U", CUDA_R_8U},
+        {"CUDA_C_8U", CUDA_C_8U},
+        {"CUDA_R_32I", CUDA_R_32I},
+        {"CUDA_C_32I", CUDA_C_32I},
+        {"CUDA_R_32U", CUDA_R_32U},
+        {"CUDA_C_32U", CUDA_C_32U},
+    };
+
+    for (const auto &entry : known_types) {
+        if (type_string.compare(entry.name) == 0)
+            return entry.value;
+    }
+
+    throw std::runtime_error("Unknown CUDA datatype");
+}
+
+// Returns cusolverIRSRefinement_t value as defined in cusolver_common.h for the string containing
+// solver name. An unknown name is reported on stdout and yields CUSOLVER_IRS_REFINE_NOT_SET.
+// Declared inline so that including this header from several translation units still links.
+inline cusolverIRSRefinement_t get_cusolver_refinement_solver(std::string solver_string) {
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_NONE") == 0)
+        return CUSOLVER_IRS_REFINE_NONE;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL") == 0)
+        return CUSOLVER_IRS_REFINE_CLASSICAL;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_GMRES;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_CLASSICAL_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_CLASSICAL_GMRES;
+    if (solver_string.compare("CUSOLVER_IRS_REFINE_GMRES_GMRES") == 0)
+        return CUSOLVER_IRS_REFINE_GMRES_GMRES;
+
+    printf("Unknown solver parameter: \"%s\"\n", solver_string.c_str());
+    return CUSOLVER_IRS_REFINE_NOT_SET;
+}
\ No newline at end of file
diff --git a/src/cuda/HPC/external/cugraph b/src/cuda/HPC/external/cugraph
new file mode 160000
index 000000000..15e04b307
--- /dev/null
+++ b/src/cuda/HPC/external/cugraph
@@ -0,0 +1 @@
+Subproject commit 15e04b3070dda0985b281f69bc7d37ad0a91f94d
diff --git a/src/cuda/HPC/external/newton b/src/cuda/HPC/external/newton
new file mode 160000
index 000000000..141baffff
--- /dev/null
+++ b/src/cuda/HPC/external/newton
@@ -0,0 +1 @@
+Subproject commit 141baffff9d6059e94fbc399d9404868169555a8
diff --git a/src/cuda/HPC/find_cugraph_tag.sh b/src/cuda/HPC/find_cugraph_tag.sh
new file mode 100755
index 000000000..971a79982
--- /dev/null
+++ b/src/cuda/HPC/find_cugraph_tag.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Print the newest cuGraph tag (v*) whose CMake files mention the given CUDA version.
+# Usage: find_cugraph_tag.sh <cuda_version> <cugraph_dir>   (the tag is written to stdout)
+CUDA_VERSION=$1
+CUGRAPH_DIR=$2
+
+if [ -z "$CUDA_VERSION" ] || [ -z "$CUGRAPH_DIR" ]; then
+    echo "Usage: $0 <cuda_version> <cugraph_dir>" >&2  # stdout is reserved for the tag
+    exit 1
+fi
+
+cd "$CUGRAPH_DIR" || exit 1
+
+# Refresh tags; best effort - a failed fetch is ignored and the tags already present are used
+git fetch --tags --quiet 2>/dev/null
+
+# Get all tags sorted by version (newest first)
+TAGS=$(git tag -l 'v*' | sort -V -r)
+
+# For each tag, check if it supports the CUDA version
+for TAG in $TAGS; do
+    # Check the tag out quietly. NOTE: the work tree is left at the last tag examined.
+    git checkout "$TAG" --quiet 2>/dev/null || continue
+
+    # Check rapids-cmake or CMakeLists.txt for CUDA version support
+    # Look for CUDA version specifications in cmake files
+    if [ -f "rapids-cmake/rapids-cuda/rapids_cuda_init_architectures.cmake" ]; then
+        CUDA_FILE="rapids-cmake/rapids-cuda/rapids_cuda_init_architectures.cmake"
+    elif [ -f "cpp/CMakeLists.txt" ]; then
+        CUDA_FILE="cpp/CMakeLists.txt"
+    else
+        continue
+    fi
+
+    # Match the version as a fixed string (-F): as a regex the "." in e.g. "12.8" matches any
+    # character, so "12.8" would also hit "1248". Any occurrence in the file counts as support,
+    # which makes a separate "CUDA ... <version>" pattern check redundant.
+    if grep -qF -- "$CUDA_VERSION" "$CUDA_FILE" 2>/dev/null; then
+        echo "$TAG"
+        exit 0
+    fi
+done
+
+# If no tag found, return the latest tag
+LATEST=$(git tag -l 'v*' | sort -V -r | head -1)
+echo "$LATEST"
diff --git a/src/cuda/HPC/generate_graph.py b/src/cuda/HPC/generate_graph.py
new file mode 100755
index 000000000..0bc623ce3
--- /dev/null
+++ b/src/cuda/HPC/generate_graph.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+"""
+Generate a synthetic graph with ~100K vertices for benchmarking.
+Creates a scale-free graph using the Barabási-Albert model.
+"""
+
+import sys
+import random
+import argparse
+
+
+def create_barabasi_albert_graph(n, m, seed=42):
+    """
+    Create a Barabási-Albert scale-free graph using optimized approach.
+
+    Args:
+        n: Number of vertices
+        m: Number of edges to attach from a new node to existing nodes
+        seed: Random seed for reproducibility (the module-level RNG is left untouched)
+
+    Returns:
+        Tuple (edges, n): the list of (u, v) edge tuples and the vertex count
+    """
+    rng = random.Random(seed)  # private generator: same sequence as random.seed(seed)
+
+    print(f"Generating Barabási-Albert graph with {n:,} vertices...")
+    print(f"Each new node connects to {m} existing nodes")
+
+    # Edge list
+    edges = []
+
+    # Targets for preferential attachment (repeating nodes based on degree)
+    # This allows O(1) random selection with degree-based probability
+    targets = []
+
+    # Start with a small complete graph, clamped to n so no vertex id >= n is ever emitted
+    initial_nodes = min(max(m, 2), n)
+    for i in range(initial_nodes):
+        for j in range(i + 1, initial_nodes):
+            edges.append((i, j))
+            targets.append(i)
+            targets.append(j)
+
+    # Add remaining nodes with preferential attachment
+    for new_node in range(initial_nodes, n):
+        # Draw from targets with replacement; the set keeps only distinct picks,
+        # so a node is never attached twice to the same neighbour
+        selected = set()
+
+        # Try to select m unique targets (bounded number of draws)
+        attempts = 0
+        while len(selected) < m and attempts < m * 20:
+            target = rng.choice(targets)
+            selected.add(target)
+            attempts += 1
+
+        # If we couldn't get m unique targets (very unlikely), fill with any nodes
+        if len(selected) < m:
+            available = set(range(new_node)) - selected
+            needed = m - len(selected)
+            selected.update(rng.sample(list(available), min(needed, len(available))))
+
+        # Add edges to selected nodes
+        for target in selected:
+            edges.append((new_node, target))
+            # Add both endpoints to targets for preferential attachment
+            targets.append(new_node)
+            targets.append(target)
+
+        # Progress indicator
+        if (new_node + 1) % 10000 == 0:
+            print(f"  Generated {new_node + 1:,} / {n:,} vertices...")
+
+    n_edges = len(edges)
+    print(f"Generated graph: {n:,} vertices, {n_edges:,} edges")
+    print(f"Average degree: {(2 * n_edges / n if n else 0.0):.2f}")
+
+    return edges, n
+
+
+def save_graph_as_mtx(edges, n_vertices, output_file):
+    """
+    Save graph in Matrix Market (.mtx) format (coordinate, pattern, symmetric).
+
+    Args:
+        edges: List of (u, v) tuples with 0-based vertex ids; direction and duplicates are ignored
+        n_vertices: Number of vertices
+        output_file: Output file path (written as UTF-8)
+    """
+    print(f"Saving to {output_file}...")
+
+    # Collapse (u, v) / (v, u) and repeated pairs into one canonical (low, high) entry
+    all_edges = set()
+    for u, v in edges:
+        # Ensure u < v for consistent ordering
+        if u > v:
+            u, v = v, u
+        all_edges.add((u, v))
+
+    # Convert to sorted list
+    all_edges = sorted(all_edges)
+    # Explicit UTF-8: the header comment contains "á", which a non-UTF-8 locale default may reject
+    with open(output_file, 'w', encoding='utf-8') as f:
+        # Write Matrix Market header
+        f.write("%%MatrixMarket matrix coordinate pattern symmetric\n")
+        f.write(f"% Barabási-Albert graph: {n_vertices} vertices, {len(all_edges)} edges\n")
+
+        # Write dimensions: rows cols non-zeros
+        # A symmetric file lists each undirected edge once, in the lower triangle (row > column)
+        f.write(f"{n_vertices} {n_vertices} {len(all_edges)}\n")
+
+        # Write edges 1-based, larger id first so that row > column
+        for u, v in all_edges:
+            f.write(f"{v+1} {u+1}\n")
+
+    print(f"Successfully saved graph to {output_file}")
+
+    # Print file size
+    import os
+    file_size = os.path.getsize(output_file)
+    if file_size < 1024*1024:
+        print(f"File size: {file_size/1024:.1f} KB")
+    else:
+        print(f"File size: {file_size/(1024*1024):.1f} MB")
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Parses and validates the options, builds the graph, writes it out as .mtx
+    and prints summary statistics followed by a usage hint.
+    """
+
+    cli = argparse.ArgumentParser(
+        description='Generate a synthetic graph for benchmarking',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    # (flags, keyword arguments) for every option, registered in declaration order
+    option_table = [
+        (('-n', '--vertices'),
+         dict(type=int, default=100000, help='Number of vertices')),
+        (('-m', '--edges-per-node'),
+         dict(type=int, default=5,
+              help='Number of edges each new node attaches to (controls graph density)')),
+        (('-o', '--output'),
+         dict(type=str, default='synthetic_100k.mtx', help='Output file name')),
+        (('--seed',),
+         dict(type=int, default=42, help='Random seed for reproducibility')),
+    ]
+    for flags, settings in option_table:
+        cli.add_argument(*flags, **settings)
+
+    opts = cli.parse_args()
+    vertex_count = opts.vertices
+    fanout = opts.edges_per_node
+
+    # A graph needs at least one vertex ...
+    if vertex_count < 1:
+        print("Error: Number of vertices must be positive")
+        sys.exit(1)
+
+    # ... and every new node must attach to between 1 and vertices - 1 existing nodes
+    if not (1 <= fanout < vertex_count):
+        print(f"Error: edges-per-node must be between 1 and {vertex_count-1}")
+        sys.exit(1)
+
+    # Build the edge list
+    edges, n_vertices = create_barabasi_albert_graph(vertex_count, fanout, opts.seed)
+
+    # Persist it in Matrix Market format
+    save_graph_as_mtx(edges, n_vertices, opts.output)
+
+    # Summary figures, all derived from the edge list handed back by the generator
+    edge_total = len(edges)
+    print("\nGraph statistics:")
+    print(f"  Vertices: {n_vertices:,}")
+    print(f"  Edges: {edge_total:,}")
+    print(f"  Average degree: {2 * edge_total / n_vertices:.2f}")
+    density = (2 * edge_total) / (n_vertices * (n_vertices - 1))
+    print(f"  Density: {density:.6f}")
+
+    # Show how the file is passed to each benchmark's run.sh
+    print("\nUsage:")
+    for label, folder in (("BFS", "bfs"), ("MST", "mst")):
+        print(f"  {label}: cd {folder} && ./run.sh --file ../{opts.output}")
+
+
+if __name__ == '__main__':
+    # Run the CLI only when executed as a script
+    main()
diff --git a/src/cuda/HPC/generate_large_ppm.py b/src/cuda/HPC/generate_large_ppm.py
new file mode 100755
index 000000000..cd9ed9b7c
--- /dev/null
+++ b/src/cuda/HPC/generate_large_ppm.py
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+Generate large PPM image files for recursiveGaussian benchmark.
+Creates synthetic test images at various resolutions.
+"""
+
+import sys
+import random
+
+def generate_ppm(width, height, filename):
+    """
+    Write a synthetic gradient image as a binary PPM (P6) file.
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        filename: Path of the file to create (opened with mode 'wb')
+    """
+
+    print(f"Generating {width}x{height} PPM image...")
+
+    # Three bytes (R, G, B) per pixel
+    total_bytes = width * height * 3
+    max_chunk = 1024 * 1024  # at most 1MB is buffered before each write
+
+    with open(filename, 'wb') as out:
+        # ASCII header: magic number, comment line, dimensions, maximum channel value
+        out.write(
+            f"P6\n# Generated test image for recursiveGaussian_hpc\n{width} {height}\n255\n"
+            .encode('ascii'))
+
+        print(f"Writing {total_bytes / (1024*1024):.1f} MB of RGB data...")
+
+        done = 0  # bytes written so far; grows by whole pixels only
+        while done < total_bytes:
+            # Number of whole pixels that fit into this chunk, and where they start
+            pixels_here = min(max_chunk, total_bytes - done) // 3
+            first_pixel = done // 3
+
+            buf = bytearray()
+            for pixel_idx in range(first_pixel, first_pixel + pixels_here):
+                row, col = divmod(pixel_idx, width)
+
+                # Red ramps with the column, green with the row, blue with their sum
+                buf.append((col * 255 // width) & 0xFF)
+                buf.append((row * 255 // height) & 0xFF)
+                buf.append(((row + col) * 255 // (width + height)) & 0xFF)
+
+            out.write(buf)
+            done += len(buf)
+
+            # Print a progress line when the running total is just past a 10MB multiple
+            percent = (done / total_bytes) * 100
+            if done % (10 * 1024 * 1024) < max_chunk:
+                print(f"  Progress: {percent:.1f}%")
+
+    # Report what ended up on disk
+    import os
+    size_on_disk = os.path.getsize(filename)
+
+    print(f"✓ Created {filename}")
+    print(f"  Size: {size_on_disk / (1024*1024):.1f} MB")
+    print(f"  Dimensions: {width}x{height}")
+    print()
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        # Custom size from command line: exactly <width> <height> <output_file>
+        if len(sys.argv) != 4:
+            print("Usage: generate_large_ppm.py <width> <height> <output_file>")
+            print("   or: generate_large_ppm.py (generates standard sizes)")
+            sys.exit(1)
+
+        width = int(sys.argv[1])
+        height = int(sys.argv[2])
+        filename = sys.argv[3]
+        generate_ppm(width, height, filename)
+    else:
+        # Standard sizes go to the recursiveGaussian_hpc data directory that get_image_data.sh uses
+        import os
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        base_dir = os.path.join(script_dir, "../../../data_dirs/cuda/HPC/recursiveGaussian_hpc/data")
+
+        # Create directory if it doesn't exist
+        os.makedirs(base_dir, exist_ok=True)
+
+        sizes = [
+            (768, 768, f"{base_dir}/teapot768.ppm"),
+            (1024, 1024, f"{base_dir}/teapot1024.ppm"),
+            # (2048, 2048, f"{base_dir}/teapot2048.ppm"),  # Uncomment to also generate 2048x2048
+        ]
+
+        for width, height, filename in sizes:
+            generate_ppm(width, height, filename)
+
+        print("All images generated successfully!")
diff --git a/src/cuda/HPC/generate_large_signal.py b/src/cuda/HPC/generate_large_signal.py
new file mode 100755
index 000000000..ac4798713
--- /dev/null
+++ b/src/cuda/HPC/generate_large_signal.py
@@ -0,0 +1,105 @@
+#!/usr/bin/env python3
+"""
+Generate large signal files for dwtHaar1D benchmark.
+Creates synthetic signal data at various sizes (powers of 2).
+"""
+
+import sys
+import math
+
+def generate_signal(size_power, filename, epsilon=0.001):
+    """
+    Write a text file holding 2**size_power samples of a three-tone sine mix.
+
+    The file is opened in 'w' mode, so existing content is replaced.
+
+    Args:
+        size_power: Exponent; the file receives 2**size_power values
+        filename: Path of the text file to create
+        epsilon: Value written after "# " on the first line of the file
+    """
+
+    sample_total = 2 ** size_power
+    batch_limit = 100000  # samples formatted per write
+
+    print(f"Generating signal file with {sample_total:,} elements (2^{size_power})...")
+    print("This will take a few minutes...")
+
+    with open(filename, 'w') as out:
+        # First line of the file: "# <epsilon>"
+        out.write(f"# {epsilon}\n")
+
+        # Emit the samples batch by batch so only one batch of strings is held at a time
+        emitted = 0
+        while emitted < sample_total:
+            batch_len = min(batch_limit, sample_total - emitted)
+            formatted = []
+
+            for idx in range(emitted, emitted + batch_len):
+                # Position of this sample within the file, in [0, 1)
+                t = idx / sample_total
+
+                # Deterministic mix of three sines completing 5, 13 and 31 cycles
+                # over the whole file, weighted 0.5, 0.3 and 0.2 (no random term)
+                value = (
+                    math.sin(2 * math.pi * 5 * t) * 0.5 +
+                    math.sin(2 * math.pi * 13 * t) * 0.3 +
+                    math.sin(2 * math.pi * 31 * t) * 0.2
+                )
+
+                # Fixed six-decimal text representation
+                formatted.append(f"{value:.6f}")
+
+            # Values are space-separated; each batch is followed by one extra space
+            out.write(" ".join(formatted) + " ")
+
+            emitted += len(formatted)
+
+            # Print progress when the percentage is just past a multiple of 25
+            progress = (emitted / sample_total) * 100
+            if progress % 25 < (100 * batch_limit / sample_total):
+                print(f"Progress: {progress:.1f}%")
+
+        # Terminate the single data line
+        out.write("\n")
+
+    # Report the size of the finished file
+    import os
+    file_size = os.path.getsize(filename)
+
+    print(f"\n✓ Generated {filename}")
+    print(f"  Elements: {sample_total:,} (2^{size_power})")
+    print(f"  File size: {file_size / (1024*1024):.1f} MB")
+    print()
+
+if __name__ == "__main__":
+    # Arguments after the script name
+    cli_args = sys.argv[1:]
+
+    if cli_args:
+        # Custom size from the command line: exactly <power_of_2> <output_file>
+        if len(cli_args) != 2:
+            print("Usage: generate_large_signal.py <power_of_2> <output_file>")
+            print("   Example: generate_large_signal.py 24 signal_2_24.dat")
+            print("   or: generate_large_signal.py (generates standard sizes)")
+            sys.exit(1)
+
+        # The exponent is converted first, so a non-integer aborts before any file is opened
+        exponent = int(cli_args[0])
+        destination = cli_args[1]
+        generate_signal(exponent, destination)
+    else:
+        # No arguments: write the standard files into data_dirs/cuda/HPC/dwtHaar1D_hpc/data
+        import os
+        here = os.path.dirname(os.path.abspath(__file__))
+        base_dir = os.path.join(here, "../../../data_dirs/cuda/HPC/dwtHaar1D_hpc/data")
+
+        # Make sure the target directory exists (no error if it already does)
+        os.makedirs(base_dir, exist_ok=True)
+
+        # One file per exponent: 2^20 = 1,048,576; 2^22 = 4,194,304; 2^24 = 16,777,216
+        for power in (20, 22, 24):
+            target = f"{base_dir}/signal_2_{power}.dat"
+            generate_signal(power, target)
+
+        print("All signal files generated successfully!")
diff --git a/src/cuda/HPC/get_dwt_data.sh b/src/cuda/HPC/get_dwt_data.sh
new file mode 100755
index 000000000..17c3e2373
--- /dev/null
+++ b/src/cuda/HPC/get_dwt_data.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+# Prepare dwtHaar1D input data: link the cuda-samples reference files, then generate signals
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATA_DIR="$SCRIPT_DIR/../../../data_dirs/cuda/HPC/dwtHaar1D_hpc/data/"
+mkdir -p "$DATA_DIR"
+
+# Link gold files and signal files from cuda-samples
+CUDA_SAMPLES_DWT="$SCRIPT_DIR/../cuda-samples/Samples/5_Domain_Specific/dwtHaar1D/"
+
+if [ -d "$CUDA_SAMPLES_DWT" ]; then
+    echo "Linking reference data from cuda-samples..."
+    # Link gold files (link names are identical to the cuda-samples file names)
+    ln -sf "$CUDA_SAMPLES_DWT/data/regression_2_18.gold.dat" "$DATA_DIR/regression_2_18.gold.dat"
+    ln -sf "$CUDA_SAMPLES_DWT/data/regression_2_14.gold.dat" "$DATA_DIR/regression_2_14.gold.dat"
+    ln -sf "$CUDA_SAMPLES_DWT/data/regression.gold.dat" "$DATA_DIR/regression.gold.dat"
+
+    # Link signal files from cuda-samples
+    ln -sf "$CUDA_SAMPLES_DWT/data/signal_2_18.dat" "$DATA_DIR/signal_2_18.dat"
+    ln -sf "$CUDA_SAMPLES_DWT/data/signal_2_14.dat" "$DATA_DIR/signal_2_14.dat"
+    ln -sf "$CUDA_SAMPLES_DWT/data/signal.dat" "$DATA_DIR/signal.dat"
+
+    echo "Linked reference data from cuda-samples"
+fi
+
+# Generate files of raw random bytes (4 per element) with dd; a failed dd is reported on stderr
+for size in 512 1024 4096 16384 65536; do
+    dd if=/dev/urandom of="$DATA_DIR/signal_${size}.dat" bs=4 count=$size status=none 2>/dev/null \
+        || { echo "ERROR: could not generate signal_${size}.dat" >&2; continue; }
+    echo "Generated signal_${size}.dat"
+done
+
+echo "DWT signal data ready"
+echo "DWT data ready in $DATA_DIR"
+
+# Generate large signal files using the Python script
+if [ -f "$SCRIPT_DIR/generate_large_signal.py" ]; then
+    echo "Generating large signal files..."
+    python3 "$SCRIPT_DIR/generate_large_signal.py"
+fi
diff --git a/src/cuda/HPC/get_graph_data.sh b/src/cuda/HPC/get_graph_data.sh
new file mode 100755
index 000000000..2df0cff25
--- /dev/null
+++ b/src/cuda/HPC/get_graph_data.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Download standard graph datasets and generate synthetic graph
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATA_DIR="$SCRIPT_DIR/../../../data_dirs/cuda/HPC/graph/data/"
+mkdir -p "$DATA_DIR"
+status=0  # set to 1 as soon as one dataset cannot be produced; used as the exit code
+# fetch_mtx <name>: fetch <name>.mtx from the SuiteSparse Newman group unless already present.
+# Subshell body: the cd does not leak; archive and unpack dir are removed on success or failure.
+fetch_mtx() (
+    name=$1
+    [ -f "$DATA_DIR/$name.mtx" ] && exit 0
+    echo "Downloading $name.mtx..."
+    cd "$DATA_DIR" || exit 1
+    wget -O "$name.tar.gz" "https://suitesparse-collection-website.herokuapp.com/MM/Newman/$name.tar.gz" \
+      && tar -xzf "$name.tar.gz" && mv "$name/$name.mtx" .
+    rc=$?; rm -rf "$name" "$name.tar.gz"
+    [ "$rc" -eq 0 ] || echo "ERROR: failed to fetch $name.mtx" >&2
+    exit "$rc"
+)
+fetch_mtx karate || status=1
+fetch_mtx netscience || status=1
+
+# Generate synthetic 100k vertex graph
+if [ ! -f "$DATA_DIR/synthetic_100k.mtx" ]; then
+    echo "Generating synthetic_100k.mtx..."
+    python3 "$SCRIPT_DIR/generate_graph.py" -n 100000 -m 5 -o "$DATA_DIR/synthetic_100k.mtx" || status=1
+fi
+[ "$status" -eq 0 ] && echo "Graph data ready in $DATA_DIR"
+exit "$status"
diff --git a/src/cuda/HPC/get_image_data.sh b/src/cuda/HPC/get_image_data.sh
new file mode 100755
index 000000000..6abea0061
--- /dev/null
+++ b/src/cuda/HPC/get_image_data.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Generate test images for recursiveGaussian
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATA_DIR="$SCRIPT_DIR/../../../data_dirs/cuda/HPC/recursiveGaussian_hpc/data/"
+mkdir -p "$DATA_DIR"
+
+# Write the small gradient images with an inline Python program; the quoted 'EOF'
+# delimiter keeps the shell from expanding anything inside it.
+python3 - "$DATA_DIR" << 'EOF'
+import os
+import sys
+
+SIZES = (128, 256, 512, 768, 1024)
+
+def create_gradient_ppm(filename, size):
+    """Write a size x size grey diagonal gradient as a binary (P6) PPM file"""
+    with open(filename, 'wb') as out:
+        # ASCII header first, raw pixel bytes afterwards
+        out.write(f"P6\n{size} {size}\n255\n".encode('ascii'))
+        for y in range(size):
+            row = bytearray()
+            for x in range(size):
+                # Brightness grows with x + y; the same value fills R, G and B
+                level = int((x + y) * 255 / (2 * size))
+                row += bytes([level]) * 3
+            out.write(row)
+
+data_dir = os.path.expanduser(sys.argv[1])
+# One teapot<N>.ppm per entry in SIZES
+for edge in SIZES:
+    create_gradient_ppm(f"{data_dir}/teapot{edge}.ppm", edge)
+print(f"Generated test images in {data_dir}")
+EOF
+
+echo "Image data ready in $DATA_DIR"
+
+# Generate large test images using the Python script
+if [ -f "$SCRIPT_DIR/generate_large_ppm.py" ]; then
+    echo "Generating large PPM images..."
+    python3 "$SCRIPT_DIR/generate_large_ppm.py"
+fi
diff --git a/src/cuda/HPC/get_vpi_data.sh b/src/cuda/HPC/get_vpi_data.sh
new file mode 100755
index 000000000..c5e7b1a7d
--- /dev/null
+++ b/src/cuda/HPC/get_vpi_data.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Link VPI sample data from VPI installation
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DATA_ROOT="$SCRIPT_DIR/../../../data_dirs/cuda/HPC"
+
+# Find VPI installation: when several /opt/nvidia/vpi* directories exist, take the highest version
+VPI_ROOT=$(find /opt/nvidia -maxdepth 1 -name "vpi*" -type d 2>/dev/null | sort -V | tail -1)
+
+if [ -z "$VPI_ROOT" ]; then
+    echo "ERROR: VPI installation not found in /opt/nvidia/" >&2
+    exit 1
+fi
+
+VPI_ASSETS="$VPI_ROOT/samples/assets"
+
+if [ ! -d "$VPI_ASSETS" ]; then
+    echo "ERROR: VPI sample assets not found at $VPI_ASSETS" >&2
+    exit 1
+fi
+
+echo "Linking VPI sample data from $VPI_ASSETS..."
+
+# Create data directory structure for each VPI app
+for app in vpi_background_subtractor vpi_orb_feature_detector vpi_stereo_disparity; do
+    # Create parent directory
+    mkdir -p "$DATA_ROOT/$app"
+    # Remove existing data directory/symlink if it exists (a real directory is deleted too)
+    rm -rf "$DATA_ROOT/$app/data"
+    # Create symbolic link to VPI assets
+    ln -sf "$VPI_ASSETS" "$DATA_ROOT/$app/data"
+    echo "  $app/data -> $VPI_ASSETS"
+done
+
+echo "VPI data linked successfully"
diff --git a/src/cuda/HPC/graph/bfs_standalone/CMakeLists.txt b/src/cuda/HPC/graph/bfs_standalone/CMakeLists.txt
new file mode 100644
index 000000000..478571f67
--- /dev/null
+++ b/src/cuda/HPC/graph/bfs_standalone/CMakeLists.txt
@@ -0,0 +1,41 @@
+cmake_minimum_required(VERSION 3.18)
+project(bfs_standalone CUDA CXX)
+
+find_package(CUDAToolkit REQUIRED)
+
+# Locate cuGraph: $CUGRAPH_ROOT wins, then ~/cugraph if it exists, else ../../external/cugraph
+if(DEFINED ENV{CUGRAPH_ROOT})
+    set(cugraph_ROOT $ENV{CUGRAPH_ROOT})
+elseif(EXISTS "$ENV{HOME}/cugraph")
+    set(cugraph_ROOT "$ENV{HOME}/cugraph")
+else()
+    set(cugraph_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../external/cugraph")
+endif()
+
+add_executable(bfs_standalone bfs_standalone.cu)
+target_compile_features(bfs_standalone PRIVATE cxx_std_17)
+
+# Include directories for cuGraph and its dependencies; rmm is listed with both header layouts (rmm-src/include and the rmm-src/cpp/include used by mst_standalone)
+target_include_directories(bfs_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-src/include
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/raft-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/cuvs-src/cpp/include
+    ${cugraph_ROOT}/cpp/build/_deps/rapids_logger-src/include
+)
+
+# Link libraries
+target_link_directories(bfs_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/build
+    ${cugraph_ROOT}/cpp/build/_deps/rmm-build
+    ${cugraph_ROOT}/cpp/build/_deps/raft-build
+    ${cugraph_ROOT}/cpp/build/_deps/cuvs-build
+)
+
+target_link_libraries(bfs_standalone PRIVATE
+    CUDA::cudart
+    CUDA::cublas
+    cugraph
+    cugraph_c
+)
diff --git a/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu b/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu
new file mode 100644
index 000000000..1a651b3e0
--- /dev/null
+++ b/src/cuda/HPC/graph/bfs_standalone/bfs_standalone.cu
@@ -0,0 +1,262 @@
+/*
+ * Standalone BFS kernel extracted from cugraph test suite
+ * Simplified to run without gtest framework
+ */
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/graph.hpp>
+#include <cugraph/graph_functions.hpp>
+#include <cugraph/graph_view.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_scalar.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/cuda_memory_resource.hpp>
+#include <rmm/mr/pool_memory_resource.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <cstdlib>
+#include <optional>
+#include <limits>
+#include <memory>
+
+// Host-side graph in CSR form, filled in by load_mtx_graph() below
+template <typename vertex_t, typename edge_t>
+struct SimpleGraph {
+    std::vector<edge_t> offsets;    // num_vertices + 1 row starts into indices
+    std::vector<vertex_t> indices;  // destination vertex of every stored edge
+    vertex_t num_vertices{};        // value-initialised: a default-constructed graph is empty
+    edge_t num_edges{};
+};
+
+// Reads a Matrix Market coordinate file into a SimpleGraph.
+//  - Only the first two integers of each entry line (row, column) are used, so files that
+//    carry a value per entry load the same way as "pattern" files.
+//  - If any '%' header line contains "symmetric", every off-diagonal entry is mirrored.
+//  - Exits with status 1 on an unreadable file, a bad size line or an out-of-range index.
+template <typename vertex_t, typename edge_t>
+SimpleGraph<vertex_t, edge_t> load_mtx_graph(const std::string& filename) {
+    SimpleGraph<vertex_t, edge_t> graph;
+
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file " << filename << std::endl;
+        exit(1);
+    }
+
+    // Skip the '%' header/comment lines; afterwards line holds the size line
+    bool is_symmetric = false;
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty() || line[0] != '%') break;
+        if (line.find("symmetric") != std::string::npos) {
+            is_symmetric = true;
+        }
+    }
+
+    // Read dimensions (rows, columns, entry count); all three must be present
+    vertex_t num_rows = 0, num_cols = 0;
+    edge_t num_entries = 0;
+    std::istringstream iss(line);
+    if (!(iss >> num_rows >> num_cols >> num_entries) || num_rows < 0 || num_cols < 0) {
+        std::cerr << "Error: Malformed size line in " << filename << std::endl;
+        exit(1);
+    }
+    graph.num_vertices = std::max(num_rows, num_cols);
+
+    // Read edges line by line so a trailing value column is never parsed as a vertex id
+    std::vector<std::pair<vertex_t, vertex_t>> edges;
+    while (std::getline(file, line)) {
+        std::istringstream entry(line);
+        vertex_t src, dst;
+        if (!(entry >> src >> dst)) continue;  // blank or non-numeric line
+        src--; dst--; // MTX is 1-indexed
+        if (src < 0 || src >= graph.num_vertices || dst < 0 || dst >= graph.num_vertices) {
+            std::cerr << "Error: Vertex index out of range in " << filename << std::endl;
+            exit(1);
+        }
+        edges.push_back({src, dst});
+        // For symmetric graphs, add reverse edge if not a self-loop
+        if (is_symmetric && src != dst) edges.push_back({dst, src});
+    }
+    graph.num_edges = edges.size();
+
+    // CSR step 1: count out-degrees one slot to the right, then prefix-sum into row starts
+    graph.offsets.resize(graph.num_vertices + 1, 0);
+    for (const auto& edge : edges) graph.offsets[edge.first + 1]++;
+    for (vertex_t i = 0; i < graph.num_vertices; i++) graph.offsets[i + 1] += graph.offsets[i];
+
+    // CSR step 2: scatter destinations, using a copy of the row starts as write cursors
+    graph.indices.resize(edges.size());
+    std::vector<edge_t> current_pos = graph.offsets;
+    for (const auto& edge : edges) graph.indices[current_pos[edge.first]++] = edge.second;
+
+    file.close();
+    return graph;
+}
+
+// Entry point: load an MTX graph on the host, build a cuGraph graph from it, run BFS, print a summary.
+int main(int argc, char** argv) {
+    using vertex_t = int32_t;
+    using edge_t = int32_t;
+
+    // Parse command line arguments (arguments that match no option are skipped silently)
+    std::string graph_file = "karate.mtx";
+    vertex_t source = 0;
+    bool use_rmat = false;
+    int rmat_scale = 20;        // accepted but unused: --rmat is rejected further down
+    int rmat_edge_factor = 16;  // accepted but unused, see above
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "--file" && i + 1 < argc) {
+            graph_file = argv[++i];
+        } else if (arg == "--source" && i + 1 < argc) {
+            source = std::atoi(argv[++i]);
+        } else if (arg == "--rmat") {
+            use_rmat = true;
+        } else if (arg == "--scale" && i + 1 < argc) {
+            rmat_scale = std::atoi(argv[++i]);
+        } else if (arg == "--edge-factor" && i + 1 < argc) {
+            rmat_edge_factor = std::atoi(argv[++i]);
+        } else if (arg == "--help" || arg == "-h") {
+            std::cout << "Usage: " << argv[0] << " [options]\n"
+                      << "Options:\n"
+                      << "  --file <path>       Graph file in MTX format\n"
+                      << "  --source <vertex>   Source vertex for BFS (default: 0)\n"
+                      << "  --rmat              Use RMAT generated graph instead of file\n"
+                      << "  --scale <n>         RMAT scale parameter (default: 20)\n"
+                      << "  --edge-factor <n>   RMAT edge factor (default: 16)\n"
+                      << "  --help, -h          Show this help message\n";
+            return 0;
+        }
+    }
+
+    std::cout << "=== Standalone BFS Kernel ===" << std::endl;
+    std::cout << "Source vertex: " << source << std::endl;
+
+    if (use_rmat) {
+        std::cout << "RMAT generation not supported in standalone version.\n";
+        std::cout << "Please use --file option with an MTX graph file\n";
+        return 1;
+    }
+
+    std::cout << "Loading graph from: " << graph_file << std::endl;
+
+    // Initialize RMM memory pool and make it the current device resource
+    auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
+    auto pool_mr = std::make_shared<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>>(
+        cuda_mr.get(), 1024 * 1024 * 1024ULL); // 1GB initial pool
+    rmm::mr::set_current_device_resource(pool_mr.get());
+
+    raft::handle_t handle{};
+    HighResTimer hr_timer{};
+
+    // Load graph (host-side CSR)
+    auto h_graph = load_mtx_graph<vertex_t, edge_t>(graph_file);
+    std::cout << "Loaded graph: " << h_graph.num_vertices << " vertices, "
+              << h_graph.num_edges << " edges" << std::endl;
+
+    // Device buffers for the edge list (one entry per stored edge)
+    rmm::device_uvector<vertex_t> d_src(h_graph.num_edges, handle.get_stream());
+    rmm::device_uvector<vertex_t> d_dst(h_graph.num_edges, handle.get_stream());
+
+    // Expand the CSR arrays back into parallel source/destination lists
+    std::vector<vertex_t> h_src, h_dst;
+    for (vertex_t v = 0; v < h_graph.num_vertices; v++) {
+        for (edge_t e = h_graph.offsets[v]; e < h_graph.offsets[v + 1]; e++) {
+            h_src.push_back(v);
+            h_dst.push_back(h_graph.indices[e]);
+        }
+    }
+
+    RAFT_CUDA_TRY(cudaMemcpy(d_src.data(), h_src.data(), h_src.size() * sizeof(vertex_t), cudaMemcpyHostToDevice));
+    RAFT_CUDA_TRY(cudaMemcpy(d_dst.data(), h_dst.data(), h_dst.size() * sizeof(vertex_t), cudaMemcpyHostToDevice));
+
+    std::cout << "Constructing cugraph..." << std::endl;
+    hr_timer.start("Graph construction");
+
+    // Build graph from edge list
+    auto [graph, edge_properties, renumber_map] =
+        cugraph::create_graph_from_edgelist<vertex_t, edge_t, false, false>(
+            handle,
+            std::nullopt,  // vertex list
+            std::move(d_src),
+            std::move(d_dst),
+            std::vector<cugraph::arithmetic_device_uvector_t>{},  // no edge properties
+            cugraph::graph_properties_t{true, false},  // undirected, no multi-edges
+            true);  // renumber - NOTE(review): source and printed ids are not mapped through renumber_map; confirm
+
+    auto graph_view = graph.view();
+
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    double graph_time = hr_timer.stop();
+    std::cout << "Graph construction: " << (graph_time * 1000.0) << " ms" << std::endl;
+
+    if (source < 0 || source >= graph_view.number_of_vertices()) {  // bfs receives the id as given
+        std::cerr << "Error: source vertex " << source << " is out of range" << std::endl;
+        return 1;
+    }
+    std::cout << "Running BFS from source " << source << "..." << std::endl;
+
+    // Allocate output (one distance and one predecessor per vertex)
+    rmm::device_uvector<vertex_t> d_distances(graph_view.number_of_vertices(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_predecessors(graph_view.number_of_vertices(), handle.get_stream());
+
+    hr_timer.start("BFS");
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+
+    rmm::device_scalar<vertex_t> const d_source(source, handle.get_stream());
+
+    cugraph::bfs(handle,
+                 graph_view,
+                 d_distances.data(),
+                 d_predecessors.data(),
+                 d_source.data(),
+                 size_t{1},
+                 false,  // direction optimizing (false for now)
+                 std::numeric_limits<vertex_t>::max());
+
+    RAFT_CUDA_TRY(cudaDeviceSynchronize());
+    double bfs_time = hr_timer.stop();
+    std::cout << "BFS execution: " << (bfs_time * 1000.0) << " ms" << std::endl;
+
+    // Copy results back to the host
+    std::vector<vertex_t> h_distances(graph_view.number_of_vertices());
+    std::vector<vertex_t> h_predecessors(graph_view.number_of_vertices());
+
+    RAFT_CUDA_TRY(cudaMemcpy(h_distances.data(), d_distances.data(),
+                             h_distances.size() * sizeof(vertex_t), cudaMemcpyDeviceToHost));
+    RAFT_CUDA_TRY(cudaMemcpy(h_predecessors.data(), d_predecessors.data(),
+                             h_predecessors.size() * sizeof(vertex_t), cudaMemcpyDeviceToHost));
+
+    // Print results summary (at most the first ten entries)
+    std::cout << "\n=== BFS Results ===" << std::endl;
+    std::cout << "First 10 vertices:" << std::endl;
+    std::cout << "Vertex\tDistance\tPredecessor" << std::endl;
+    for (int i = 0; i < std::min(10, (int)h_distances.size()); i++) {
+        std::cout << i << "\t" << h_distances[i] << "\t\t";
+        if (h_predecessors[i] == cugraph::invalid_vertex_id<vertex_t>::value) {
+            std::cout << "None";
+        } else {
+            std::cout << h_predecessors[i];
+        }
+        std::cout << std::endl;
+    }
+
+    // Count reachable vertices: every distance different from the vertex_t maximum
+    int reachable = 0;
+    for (auto d : h_distances) {
+        if (d != std::numeric_limits<vertex_t>::max()) reachable++;
+    }
+    std::cout << "\nReachable vertices: " << reachable << " / " << h_distances.size() << std::endl;
+    std::cout << "\n=== BFS Complete ===" << std::endl;
+    return 0;
+}
diff --git a/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt b/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt
new file mode 100644
index 000000000..088a781ba
--- /dev/null
+++ b/src/cuda/HPC/graph/mst_standalone/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.18)
+project(mst_standalone CUDA CXX)
+
+find_package(CUDAToolkit REQUIRED)
+
+# Locate a cuGraph source tree that was built in place (cpp/build).
+# Precedence: $CUGRAPH_ROOT, then ~/cugraph, then the bundled git submodule.
+if(DEFINED ENV{CUGRAPH_ROOT})
+    set(cugraph_ROOT "$ENV{CUGRAPH_ROOT}")
+elseif(EXISTS "$ENV{HOME}/cugraph")
+    set(cugraph_ROOT "$ENV{HOME}/cugraph")
+else()
+    set(cugraph_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../external/cugraph")
+endif()
+if(NOT EXISTS "${cugraph_ROOT}/cpp/include/cugraph")
+    message(FATAL_ERROR "cuGraph headers not found under '${cugraph_ROOT}'; set CUGRAPH_ROOT")
+endif()
+set(cugraph_DEPS "${cugraph_ROOT}/cpp/build/_deps")
+
+add_executable(mst_standalone mst_standalone.cu)
+target_compile_features(mst_standalone PRIVATE cxx_std_17)
+target_compile_options(mst_standalone PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
+
+# Headers of cuGraph and of the dependencies fetched by its build
+target_include_directories(mst_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/include
+    ${cugraph_DEPS}/rmm-src/cpp/include
+    ${cugraph_DEPS}/rmm-build/include
+    ${cugraph_DEPS}/raft-src/cpp/include
+    ${cugraph_DEPS}/raft-build/include
+    ${cugraph_DEPS}/cuvs-src/cpp/include
+    ${cugraph_DEPS}/cuvs-build/include
+    ${cugraph_DEPS}/cccl-src/libcudacxx/include
+    ${cugraph_DEPS}/cccl-src/thrust
+    ${cugraph_DEPS}/cccl-src/cub
+    ${cugraph_DEPS}/rapids_logger-src/include
+)
+
+# Library search paths inside the cuGraph build tree
+target_link_directories(mst_standalone PRIVATE
+    ${cugraph_ROOT}/cpp/build
+    ${cugraph_DEPS}/rmm-build
+    ${cugraph_DEPS}/raft-build
+    ${cugraph_DEPS}/cuvs-build
+    ${cugraph_DEPS}/rapids_logger-build
+)
+
+target_link_libraries(mst_standalone PRIVATE
+    CUDA::cudart CUDA::cublas
+    cugraph cugraph_c rmm rapids_logger
+)
diff --git a/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu b/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu
new file mode 100644
index 000000000..97059096b
--- /dev/null
+++ b/src/cuda/HPC/graph/mst_standalone/mst_standalone.cu
@@ -0,0 +1,247 @@
+/*
+ * Standalone MST (Minimum Spanning Tree) kernel
+ * Extracted and simplified from cugraph test suite
+ */
+
+#include <cugraph/algorithms.hpp>
+#include <cugraph/legacy/graph.hpp>
+#include <cugraph/utilities/high_res_timer.hpp>
+
+#include <raft/core/handle.hpp>
+#include <raft/util/cudart_utils.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/cuda_memory_resource.hpp>
+#include <rmm/mr/pool_memory_resource.hpp>
+
+#include <thrust/device_ptr.h>
+#include <thrust/reduce.h>
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <vector>
+#include <string>
+#include <cstdlib>
+#include <optional>
+#include <limits>
+#include <memory>
+
+// Edge list (COO) of a weighted graph as read from a Matrix Market (MTX) file
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct SimpleWeightedGraph {
+    std::vector<vertex_t> row_indices;  // source vertex of each edge (0-based)
+    std::vector<vertex_t> col_indices;  // destination vertex of each edge (0-based)
+    std::vector<weight_t> weights;      // weight of each edge
+    vertex_t num_vertices = 0;          // zero by default so a fresh object is an empty graph
+    edge_t num_edges = 0;               // number of stored edges
+};
+
+// Loads a Matrix Market coordinate file into a 0-based COO edge list ("pattern" files get
+// unit weights, "symmetric" files get mirrored edges). Calls exit(1) on a bad or missing file.
+template <typename vertex_t, typename edge_t, typename weight_t>
+SimpleWeightedGraph<vertex_t, edge_t, weight_t> load_weighted_mtx_graph(const std::string& filename) {
+    SimpleWeightedGraph<vertex_t, edge_t, weight_t> graph;
+
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        std::cerr << "Error: Cannot open file " << filename << std::endl;
+        exit(1);
+    }
+
+    // Skip '%' comment lines; only the banner (first line) carries the format flags,
+    // so free-text comments mentioning "symmetric" or "pattern" are not misread.
+    std::string line;
+    bool is_symmetric = false, is_pattern = false, have_size_line = false;
+    for (bool banner = true; std::getline(file, line); banner = false) {
+        if (line.empty()) continue;
+        if (line[0] != '%') { have_size_line = true; break; }
+        if (!banner) continue;
+        is_symmetric = line.find("symmetric") != std::string::npos;
+        is_pattern = line.find("pattern") != std::string::npos;
+    }
+
+    // The first non-comment line holds "rows cols entries"
+    vertex_t num_rows = 0, num_cols = 0;
+    edge_t num_entries = 0;
+    std::istringstream iss(line);
+    if (!have_size_line || !(iss >> num_rows >> num_cols >> num_entries) || num_rows < 0 || num_cols < 0) {
+        std::cerr << "Error: Missing or malformed size line in " << filename << std::endl;
+        exit(1);
+    }
+    graph.num_vertices = std::max(num_rows, num_cols);
+
+    // Read "src dst [weight]" entries until the stream stops yielding vertex pairs
+    vertex_t src, dst;
+    while (file >> src >> dst) {
+        src--; dst--; // MTX is 1-indexed
+
+        weight_t weight = 1.0; // Default weight for pattern matrices
+        if (!is_pattern && !(file >> weight)) {
+            std::cerr << "Error: Missing weight for an edge in " << filename << std::endl;
+            exit(1);
+        }
+        // Out-of-range ids would index past the CSR offsets built from this edge list
+        if (src < 0 || dst < 0 || src >= graph.num_vertices || dst >= graph.num_vertices) {
+            std::cerr << "Error: Vertex id out of range in " << filename << std::endl;
+            exit(1);
+        }
+        graph.row_indices.push_back(src);
+        graph.col_indices.push_back(dst);
+        graph.weights.push_back(weight);
+
+        // For symmetric graphs, add reverse edge if not a self-loop
+        if (is_symmetric && src != dst) {
+            graph.row_indices.push_back(dst);
+            graph.col_indices.push_back(src);
+            graph.weights.push_back(weight);
+        }
+    }
+    graph.num_edges = static_cast<edge_t>(graph.row_indices.size());
+    return graph;
+}
+
+// Compressed sparse row (CSR) adjacency of a weighted graph, as filled by coo_to_csr()
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct CSRGraph {
+    std::vector<edge_t> offsets;    // num_vertices + 1 entries; row v spans [offsets[v], offsets[v + 1])
+    std::vector<vertex_t> indices;  // destination vertex of each edge, grouped by source vertex
+    std::vector<weight_t> weights;  // weight of each edge, aligned with `indices`
+    vertex_t num_vertices = 0;      // zero by default so a fresh object is an empty graph
+    edge_t num_edges = 0;           // number of stored edges
+};
+
+// Builds the CSR form of a COO edge list with a counting sort on the source vertex.
+// Edges that share a source vertex keep their input order. Every entry of
+// coo.row_indices is used as an index and must lie in [0, coo.num_vertices).
+template <typename vertex_t, typename edge_t, typename weight_t>
+CSRGraph<vertex_t, edge_t, weight_t> coo_to_csr(const SimpleWeightedGraph<vertex_t, edge_t, weight_t>& coo) {
+    CSRGraph<vertex_t, edge_t, weight_t> result;
+    result.num_vertices = coo.num_vertices;
+    result.num_edges = coo.num_edges;
+
+    result.offsets.assign(result.num_vertices + 1, 0);
+    result.indices.resize(coo.num_edges);
+    result.weights.resize(coo.num_edges);
+
+    // Pass 1: offsets[v + 1] receives the out-degree of vertex v
+    for (size_t e = 0; e < coo.row_indices.size(); ++e) {
+        ++result.offsets[coo.row_indices[e] + 1];
+    }
+
+    // Pass 2: running sum, after which offsets[v] is the first slot of row v
+    for (vertex_t v = 1; v <= result.num_vertices; ++v) {
+        result.offsets[v] += result.offsets[v - 1];
+    }
+
+    // Pass 3: scatter every edge into the next free slot of its row
+    std::vector<edge_t> next_slot(result.offsets);
+    for (size_t e = 0; e < coo.row_indices.size(); ++e) {
+        const edge_t slot = next_slot[coo.row_indices[e]]++;
+        result.indices[slot] = coo.col_indices[e];
+        result.weights[slot] = coo.weights[e];
+    }
+
+    return result;
+}
+
+// Loads an MTX graph (--file <path>), copies it to the GPU in CSR form, runs
+// cuGraph's minimum_spanning_tree on it and prints timing and weight statistics.
+int main(int argc, char** argv) {
+    using vertex_t = int32_t;
+    using edge_t = int32_t;
+    using weight_t = float;
+
+    // Parse command line arguments
+    std::string graph_file = "graphs/karate.mtx";
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+        if (arg == "--file" && i + 1 < argc) {
+            graph_file = argv[++i];
+        } else if (arg == "--help" || arg == "-h") {
+            std::cout << "Usage: " << argv[0] << " [options]\n"
+                      << "Options:\n"
+                      << "  --file <path>    Graph file in MTX format (default: graphs/karate.mtx)\n"
+                      << "  --help, -h       Show this help message\n";
+            return 0;
+        } else {  // reject a typo or a missing value instead of silently using the default graph
+            std::cerr << "Error: unknown or incomplete option '" << arg << "' (see --help)" << std::endl;
+            return 1;
+        }
+    }
+
+    std::cout << "=== Standalone MST (Minimum Spanning Tree) Kernel ===" << std::endl;
+    std::cout << "Loading graph from: " << graph_file << std::endl;
+
+    // Load graph
+    auto coo_graph = load_weighted_mtx_graph<vertex_t, edge_t, weight_t>(graph_file);
+    std::cout << "Loaded graph: " << coo_graph.num_vertices << " vertices, "
+              << coo_graph.num_edges << " edges" << std::endl;
+
+    // Convert to CSR
+    std::cout << "Converting to CSR format..." << std::endl;
+    auto csr_graph = coo_to_csr(coo_graph);
+
+    // Initialize RMM memory resource
+    rmm::mr::cuda_memory_resource cuda_mr;
+    rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr(
+        &cuda_mr, 512 * 1024 * 1024ULL);  // 512 MB pool
+    rmm::mr::set_current_device_resource(&pool_mr);
+
+    // Create RAFT handle
+    raft::handle_t handle;
+
+    // Copy graph to device
+    std::cout << "Copying graph to device..." << std::endl;
+    rmm::device_uvector<edge_t> d_offsets(csr_graph.offsets.size(), handle.get_stream());
+    rmm::device_uvector<vertex_t> d_indices(csr_graph.indices.size(), handle.get_stream());
+    rmm::device_uvector<weight_t> d_weights(csr_graph.weights.size(), handle.get_stream());
+
+    raft::update_device(d_offsets.data(), csr_graph.offsets.data(),
+                       csr_graph.offsets.size(), handle.get_stream());
+    raft::update_device(d_indices.data(), csr_graph.indices.data(),
+                       csr_graph.indices.size(), handle.get_stream());
+    raft::update_device(d_weights.data(), csr_graph.weights.data(),
+                       csr_graph.weights.size(), handle.get_stream());
+
+    // Create cuGraph CSR view
+    std::cout << "Constructing cugraph..." << std::endl;
+    cugraph::legacy::GraphCSRView<vertex_t, edge_t, weight_t> graph_view(
+        d_offsets.data(),
+        d_indices.data(),
+        d_weights.data(),
+        csr_graph.num_vertices,
+        csr_graph.num_edges);
+
+    handle.sync_stream();
+
+    // Run MST
+    std::cout << "Running MST algorithm..." << std::endl;
+    HighResTimer hr_timer{};
+    hr_timer.start("MST");
+    auto mst_edges = cugraph::minimum_spanning_tree<vertex_t, edge_t, weight_t>(handle, graph_view);
+    handle.sync_stream();  // wait for the stream before stopping the timer
+    hr_timer.stop();
+
+    std::cout << "\n=== MST Results ===" << std::endl;
+    hr_timer.display_and_clear(std::cout);
+
+    // Calculate MST weight
+    auto mst_weight = thrust::reduce(
+        thrust::device_pointer_cast(mst_edges->view().edge_data),
+        thrust::device_pointer_cast(mst_edges->view().edge_data) + mst_edges->view().number_of_edges);
+    auto total_weight = thrust::reduce(
+        thrust::device_pointer_cast(d_weights.data()),
+        thrust::device_pointer_cast(d_weights.data()) + csr_graph.num_edges);
+
+    std::cout << "MST edges: " << mst_edges->view().number_of_edges << std::endl;
+    std::cout << "MST total weight: " << mst_weight << std::endl;
+    std::cout << "Original graph total weight: " << total_weight << std::endl;
+    if (total_weight != 0)  // the ratio is undefined for a graph of zero total weight
+        std::cout << "MST weight ratio: " << (mst_weight / total_weight * 100.0) << "%" << std::endl;
+    std::cout << "\nExpected MST edges for " << csr_graph.num_vertices
+              << " vertices: " << (csr_graph.num_vertices - 1) << std::endl;
+
+    std::cout << "\n=== MST Complete ===" << std::endl;
+    return 0;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp b/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp
new file mode 100644
index 000000000..d05094055
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3d.cpp
@@ -0,0 +1,232 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "FDTD3d.h"
+
+#include <assert.h>
+#include "helper_functions.h"
+#include <iomanip>
+#include <iostream>
+#include <math.h>
+
+#include "FDTD3dGPU.h"
+#include "FDTD3dReference.h"
+
+// Clamp `a` into the inclusive range [min, max] using the MIN/MAX helpers.
+// NOTE(review): if MIN/MAX are plain macros the arguments are evaluated more
+// than once -- avoid arguments with side effects.
+#ifndef CLAMP
+#define CLAMP(a, min, max) (MIN(max, MAX(a, min)))
+#endif
+
+// Forward declarations
+bool runTest(int argc, const char **argv);
+void showHelp(const int argc, const char **argv);
+
+// Entry point: prints the start banner, then either shows the usage text
+// (--help) or runs the FDTD test; the exit status reflects the outcome.
+int main(int argc, char **argv)
+{
+    const char **args = (const char **)argv;
+
+    printf("%s Starting...\n\n", argv[0]);
+
+    int status = EXIT_SUCCESS;
+    if (!checkCmdLineFlag(argc, args, "help")) {
+        // Normal run: a false result from runTest becomes a failing exit status
+        status = runTest(argc, args) ? EXIT_SUCCESS : EXIT_FAILURE;
+    }
+    else {
+        printf("Displaying help on console\n");
+        showHelp(argc, args);
+    }
+
+    exit(status);
+}
+
+// Prints the program name (when argc > 0) followed by the supported command-line
+// options, one per line, with the option text left-aligned in a 20-column field.
+void showHelp(const int argc, const char **argv)
+{
+    static const char *const kOptions[][2] = {
+        {"--device=<device>", "Specify device to use for execution"},
+        {"--dimx=<N>", "Specify number of elements in x direction (excluding halo)"},
+        {"--dimy=<N>", "Specify number of elements in y direction (excluding halo)"},
+        {"--dimz=<N>", "Specify number of elements in z direction (excluding halo)"},
+        {"--radius=<N>", "Specify radius of stencil"},
+        {"--timesteps=<N>", "Specify number of timesteps"},
+        {"--block-size=<N>", "Specify number of threads per block"},
+        {0, 0},  // null option: rendered as an empty separator line
+        {"--noprompt", "Skip prompt before exit"},
+    };
+
+    if (argc > 0)
+        std::cout << std::endl << argv[0] << std::endl;
+    std::cout << std::endl << "Syntax:" << std::endl << std::left;
+    for (unsigned int i = 0; i < sizeof(kOptions) / sizeof(kOptions[0]); ++i) {
+        if (kOptions[i][0])
+            std::cout << "    " << std::setw(20) << kOptions[i][0] << kOptions[i][1];
+        std::cout << std::endl;
+    }
+    std::cout << std::endl;
+}
+
+// Runs the FDTD stencil on the host (fdtdReference) and on the GPU (fdtdGPU) over
+// the same random input volume and returns the result of compareData() on the two
+// outputs. Dimensions, radius and timesteps come from the command line, clamped to
+// the k_* limits; the process exits if device or host memory is insufficient.
+bool runTest(int argc, const char **argv)
+{
+    float *host_output, *device_output, *input, *coeff;
+    int    defaultDim, dimx, dimy, dimz;
+    int    outerDimx, outerDimy, outerDimz;
+    int    radius, timesteps;
+    size_t    volumeSize;
+    memsize_t memsize;
+
+    const float lowerBound = 0.0f;
+    const float upperBound = 1.0f;
+
+    // Determine default dimensions
+    printf("Set-up, based upon target device GMEM size...\n");
+    // Get the memory size of the target device
+    printf(" getTargetDeviceGlobalMemSize\n");
+    getTargetDeviceGlobalMemSize(&memsize, argc, argv);
+
+    // We can never use all the memory so to keep things simple we aim to
+    // use around half the total memory
+    memsize /= 2;
+
+    // Most of our memory use is taken up by the input and output buffers -
+    // two buffers of equal size - and for simplicity the volume is a cube:
+    //   dim = floor( (N/2)^(1/3) )
+    defaultDim = (int)floor(pow((memsize / (2.0 * sizeof(float))), 1.0 / 3.0));
+
+    // By default, make the volume edge size an integer multiple of 128B to
+    // improve performance by coalescing memory accesses, in a real
+    // application it would make sense to pad the lines accordingly
+    int roundTarget = 128 / sizeof(float);
+    defaultDim      = defaultDim / roundTarget * roundTarget;
+    defaultDim -= k_radius_default * 2;
+
+    // Check dimension is valid
+    if (defaultDim < k_dim_min) {
+        printf("insufficient device memory (maximum volume on device is %d, must be "
+               "between %d and %d).\n",
+               defaultDim,
+               k_dim_min,
+               k_dim_max);
+        exit(EXIT_FAILURE);
+    }
+    else if (defaultDim > k_dim_max) {
+        defaultDim = k_dim_max;
+    }
+
+    // For QA testing, override default volume size
+    if (checkCmdLineFlag(argc, argv, "qatest")) {
+        defaultDim = MIN(defaultDim, k_dim_qa);
+    }
+
+    // set default dim
+    dimx      = defaultDim;
+    dimy      = defaultDim;
+    dimz      = defaultDim;
+    radius    = k_radius_default;
+    timesteps = k_timesteps_default;
+
+    // Parse command line arguments
+    if (checkCmdLineFlag(argc, argv, "dimx")) {
+        dimx = CLAMP(getCmdLineArgumentInt(argc, argv, "dimx"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "dimy")) {
+        dimy = CLAMP(getCmdLineArgumentInt(argc, argv, "dimy"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "dimz")) {
+        dimz = CLAMP(getCmdLineArgumentInt(argc, argv, "dimz"), k_dim_min, k_dim_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "radius")) {
+        radius = CLAMP(getCmdLineArgumentInt(argc, argv, "radius"), k_radius_min, k_radius_max);
+    }
+
+    if (checkCmdLineFlag(argc, argv, "timesteps")) {
+        timesteps = CLAMP(getCmdLineArgumentInt(argc, argv, "timesteps"), k_timesteps_min, k_timesteps_max);
+    }
+
+    // Determine volume size; widen before multiplying because the product of the
+    // three outer dimensions overflows an int for large volumes
+    outerDimx  = dimx + 2 * radius;
+    outerDimy  = dimy + 2 * radius;
+    outerDimz  = dimz + 2 * radius;
+    volumeSize = (size_t)outerDimx * outerDimy * outerDimz;
+
+    // Allocate memory
+    host_output   = (float *)calloc(volumeSize, sizeof(float));
+    device_output = (float *)calloc(volumeSize, sizeof(float));
+    input         = (float *)malloc(volumeSize * sizeof(float));
+    coeff         = (float *)malloc((radius + 1) * sizeof(float));
+    if (!host_output || !device_output || !input || !coeff) {
+        printf("insufficient host memory for a volume of %llu elements.\n", (unsigned long long)volumeSize);
+        exit(EXIT_FAILURE);
+    }
+
+    // Create coefficients
+    for (int i = 0; i <= radius; i++) {
+        coeff[i] = 0.1f;
+    }
+
+    // Generate data
+    printf(" generateRandomData\n\n");
+    generateRandomData(input, outerDimx, outerDimy, outerDimz, lowerBound, upperBound);
+    printf("FDTD on %d x %d x %d volume with symmetric filter radius %d for %d "
+           "timesteps...\n\n",
+           dimx, dimy, dimz, radius, timesteps);
+
+    // Execute on the host
+    printf("fdtdReference...\n");
+    fdtdReference(host_output, input, coeff, dimx, dimy, dimz, radius, timesteps);
+    printf("fdtdReference complete\n");
+
+    // Execute on the device
+    printf("fdtdGPU...\n");
+    fdtdGPU(device_output, input, coeff, dimx, dimy, dimz, radius, timesteps, argc, argv);
+    printf("fdtdGPU complete\n");
+
+    // Compare the results
+    float tolerance = 0.0001f;
+    printf("\nCompareData (tolerance %f)...\n", tolerance);
+    bool match = compareData(device_output, host_output, dimx, dimy, dimz, radius, tolerance);
+
+    // Release the host buffers before returning
+    free(coeff);
+    free(input);
+    free(device_output);
+    free(host_output);
+    return match;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3d.h b/src/cuda/HPC/image/FDTD3d/FDTD3d.h
new file mode 100644
index 000000000..f943ab1a7
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3d.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FDTD3D_H_
+#define _FDTD3D_H_
+
+// Limits for each volume dimension (elements, excluding the halo). The values
+// are set to give reasonable runtimes; they can be changed, but very large
+// dimensions can take a very long time, so avoid running on your primary
+// display in that case. k_dim_qa caps the default size when --qatest is given.
+#define k_dim_min 96
+#define k_dim_max 8192
+#define k_dim_qa  248
+
+// Note that the radius is defined here as exactly 4 since the
+// kernel code uses a constant. If you want a different radius
+// you must change the kernel accordingly.
+#define k_radius_min     4
+#define k_radius_max     4
+#define k_radius_default 4
+
+// Limits and default for --timesteps. The values are set to give reasonable
+// runtimes; they can be changed, but a very large number of timesteps can
+// take a very long time, so avoid running on your primary display in that
+// case.
+#define k_timesteps_min     1
+#define k_timesteps_max     10
+#define k_timesteps_default 5
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu
new file mode 100644
index 000000000..6dc2caf35
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.cu
@@ -0,0 +1,260 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <algorithm>
+#include "helper_cuda.h"
+#include "helper_functions.h"
+#include <iostream>
+
+#include "FDTD3dGPU.h"
+#include "FDTD3dGPUKernel.cuh"
+
+// Reports the total global memory of the device chosen by findCudaDevice().
+//   result     - receives deviceProp.totalGlobalMem of that device
+//   argc, argv - command line, forwarded to findCudaDevice()
+// Always returns true; CUDA API results are passed to checkCudaErrors.
+bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv)
+{
+    // The count itself is not used; the call is still made and checked
+    int numDevices = 0;
+    printf(" cudaGetDeviceCount\n");
+    checkCudaErrors(cudaGetDeviceCount(&numDevices));
+
+    // Select the target device
+    const int device = findCudaDevice(argc, (const char **)argv);
+
+    printf(" cudaGetDeviceProperties\n");
+    struct cudaDeviceProp props;
+    checkCudaErrors(cudaGetDeviceProperties(&props, device));
+
+    // Hand the byte count back to the caller as memsize_t
+    const size_t totalBytes = props.totalGlobalMem;
+    *result = (memsize_t)totalBytes;
+
+    return true;
+}
+
+// Runs `timesteps` launches of FiniteDifferencesKernel, alternating between two device
+// buffers, and copies the final volume (halo included) back into `output`.
+// `radius` must equal RADIUS. Returns true; invalid configurations exit the process.
+bool fdtdGPU(float       *output,
+             const float *input,
+             const float *coeff,
+             const int    dimx,
+             const int    dimy,
+             const int    dimz,
+             const int    radius,
+             const int    timesteps,
+             const int    argc,
+             const char **argv)
+{
+    const int    outerDimx    = dimx + 2 * radius;
+    const int    outerDimy    = dimy + 2 * radius;
+    const int    outerDimz    = dimz + 2 * radius;
+    const size_t volumeSize   = (size_t)outerDimx * outerDimy * outerDimz;  // widened: overflows int
+    int          deviceCount  = 0;
+    int          targetDevice = 0;
+    float       *bufferOut    = 0;
+    float       *bufferIn     = 0;
+    dim3         dimBlock;
+    dim3         dimGrid;
+
+    // Ensure that the inner data starts on a 128B boundary
+    const int    padding          = (128 / sizeof(float)) - radius;
+    const size_t paddedVolumeSize = volumeSize + padding;
+
+#ifdef GPU_PROFILING
+    cudaEvent_t profileStart     = 0;
+    cudaEvent_t profileEnd       = 0;
+    const int   profileTimesteps = timesteps - 1;
+
+    if (profileTimesteps < 1) {
+        printf(" cannot profile with fewer than two timesteps (timesteps=%d), "
+               "profiling is disabled.\n",
+               timesteps);
+    }
+
+#endif
+
+    // Check the radius is valid
+    if (radius != RADIUS) {
+        printf("radius is invalid, must be %d - see kernel for details.\n", RADIUS);
+        exit(EXIT_FAILURE);
+    }
+
+    // Get the number of CUDA enabled GPU devices
+    checkCudaErrors(cudaGetDeviceCount(&deviceCount));
+
+    // Select target device (device 0 by default)
+    targetDevice = findCudaDevice(argc, (const char **)argv);
+
+    checkCudaErrors(cudaSetDevice(targetDevice));
+
+    // Allocate memory buffers
+    checkCudaErrors(cudaMalloc((void **)&bufferOut, paddedVolumeSize * sizeof(float)));
+    checkCudaErrors(cudaMalloc((void **)&bufferIn, paddedVolumeSize * sizeof(float)));
+
+    // Check for a command-line specified block size
+    int userBlockSize;
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "block-size")) {
+        userBlockSize = getCmdLineArgumentInt(argc, argv, "block-size");
+        // Constrain to a multiple of k_blockDimX
+        userBlockSize = (userBlockSize / k_blockDimX * k_blockDimX);
+
+        // Constrain within allowed bounds
+        userBlockSize = MIN(MAX(userBlockSize, k_blockSizeMin), k_blockSizeMax);
+    }
+    else {
+        userBlockSize = k_blockSizeMax;
+    }
+
+    // Check the device limit on the number of threads
+    struct cudaFuncAttributes funcAttrib;
+    checkCudaErrors(cudaFuncGetAttributes(&funcAttrib, FiniteDifferencesKernel));
+
+    userBlockSize = MIN(userBlockSize, funcAttrib.maxThreadsPerBlock);
+
+    // Set the block size
+    dimBlock.x = k_blockDimX;
+    // Cap the block height at k_blockDimMaxY rows
+    dimBlock.y = ((userBlockSize / k_blockDimX) < (size_t)k_blockDimMaxY) ? (userBlockSize / k_blockDimX)
+                                                                          : (size_t)k_blockDimMaxY;
+    dimGrid.x  = (unsigned int)ceil((float)dimx / dimBlock.x);
+    dimGrid.y  = (unsigned int)ceil((float)dimy / dimBlock.y);
+    printf(" set block size to %dx%d\n", dimBlock.x, dimBlock.y);
+    printf(" set grid size to %dx%d\n", dimGrid.x, dimGrid.y);
+
+    // Check the block size is valid
+    if (dimBlock.x < RADIUS || dimBlock.y < RADIUS) {
+        printf("invalid block size, x (%d) and y (%d) must be >= radius (%d).\n", dimBlock.x, dimBlock.y, RADIUS);
+        exit(EXIT_FAILURE);
+    }
+
+    // Copy the input to the device input buffer
+    checkCudaErrors(cudaMemcpy(bufferIn + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Copy the input to the device output buffer (actually only need the halo)
+    checkCudaErrors(cudaMemcpy(bufferOut + padding, input, volumeSize * sizeof(float), cudaMemcpyHostToDevice));
+
+    // Copy the coefficients to the device coefficient buffer
+    checkCudaErrors(cudaMemcpyToSymbol(stencil, (void *)coeff, (radius + 1) * sizeof(float)));
+
+#ifdef GPU_PROFILING
+
+    // Create the events
+    checkCudaErrors(cudaEventCreate(&profileStart));
+    checkCudaErrors(cudaEventCreate(&profileEnd));
+
+#endif
+
+    // Execute the FDTD
+    float *bufferSrc = bufferIn + padding;
+    float *bufferDst = bufferOut + padding;
+    printf(" GPU FDTD loop\n");
+
+#ifdef GPU_PROFILING
+    // Enqueue start event
+    checkCudaErrors(cudaEventRecord(profileStart, 0));
+#endif
+
+    for (int it = 0; it < timesteps; it++) {
+        printf("\tt = %d ", it);
+
+        // Launch the kernel
+        printf("launch kernel\n");
+        FiniteDifferencesKernel<<<dimGrid, dimBlock>>>(bufferDst, bufferSrc, dimx, dimy, dimz);
+        checkCudaErrors(cudaGetLastError());  // report an invalid launch right away
+
+        // Toggle the buffers
+        float *tmp = bufferDst;
+        bufferDst  = bufferSrc;
+        bufferSrc  = tmp;
+    }
+
+    printf("\n");
+
+#ifdef GPU_PROFILING
+    // Enqueue end event
+    checkCudaErrors(cudaEventRecord(profileEnd, 0));
+#endif
+
+    // Wait for the kernel to complete
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    // Read the result back, result is in bufferSrc (after final toggle)
+    checkCudaErrors(cudaMemcpy(output, bufferSrc, volumeSize * sizeof(float), cudaMemcpyDeviceToHost));
+
+// Report time
+#ifdef GPU_PROFILING
+    float elapsedTimeMS = 0;
+
+    if (profileTimesteps > 0) {
+        checkCudaErrors(cudaEventElapsedTime(&elapsedTimeMS, profileStart, profileEnd));
+    }
+
+    if (profileTimesteps > 0) {
+        // Convert milliseconds to seconds
+        double elapsedTime    = elapsedTimeMS * 1.0e-3;
+        double avgElapsedTime = elapsedTime / (double)profileTimesteps;
+        // Determine number of computations per timestep (widened: overflows int)
+        size_t pointsComputed = (size_t)dimx * dimy * dimz;
+        // Determine throughput
+        double throughputM = 1.0e-6 * (double)pointsComputed / avgElapsedTime;
+        printf("FDTD3d, Throughput = %.4f MPoints/s, Time = %.5f s, Size = %llu Points, "
+               "NumDevsUsed = %u, Blocksize = %u\n",
+               throughputM,
+               avgElapsedTime,
+               (unsigned long long)pointsComputed,
+               1,
+               dimBlock.x * dimBlock.y);
+    }
+
+#endif
+
+    // Cleanup
+    if (bufferIn) {
+        checkCudaErrors(cudaFree(bufferIn));
+    }
+
+    if (bufferOut) {
+        checkCudaErrors(cudaFree(bufferOut));
+    }
+
+#ifdef GPU_PROFILING
+
+    if (profileStart) {
+        checkCudaErrors(cudaEventDestroy(profileStart));
+    }
+
+    if (profileEnd) {
+        checkCudaErrors(cudaEventDestroy(profileEnd));
+    }
+
+#endif
+    return true;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h
new file mode 100644
index 000000000..463f7e1d9
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPU.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FDTD3DGPU_H_
+#define _FDTD3DGPU_H_
+
+#include <cstddef>
+#if (defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)) && defined(_MSC_VER)
+typedef unsigned __int64 memsize_t; // MSVC-specific spelling of a 64-bit unsigned integer
+#else // every other toolchain, including non-MSVC compilers on Windows
+#include <stdint.h>
+typedef uint64_t memsize_t; // 64-bit unsigned integer used for memory sizes
+#endif // Windows + MSVC
+
+#define k_blockDimX    32 // x extent (without halo) of the shared-memory tile declared in FiniteDifferencesKernel
+#define k_blockDimMaxY 16 // y extent (without halo) of that tile
+#define k_blockSizeMin 128 // NOTE(review): presumably the smallest threads-per-block the host code accepts - confirm in FDTD3dGPU.cu
+#define k_blockSizeMax (k_blockDimX * k_blockDimMaxY) // thread count of a full k_blockDimX x k_blockDimMaxY block
+
+bool getTargetDeviceGlobalMemSize(memsize_t *result, const int argc, const char **argv); // NOTE(review): presumably writes the selected device's global memory size to *result - confirm
+bool fdtdGPU(float       *output, // host buffer that receives the final volume
+             const float *input,
+             const float *coeff,
+             const int    dimx,
+             const int    dimy,
+             const int    dimz,
+             const int    radius,
+             const int    timesteps,
+             const int    argc,
+             const char **argv);
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh b/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh
new file mode 100644
index 000000000..c6a473559
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dGPUKernel.cuh
@@ -0,0 +1,160 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <cooperative_groups.h>
+
+#include "FDTD3dGPU.h"
+
+namespace cg = cooperative_groups;
+
+// Note: If you change the RADIUS, you should also change the unrolling below
+#define RADIUS 4 // stencil half-width: number of neighbours used on each side of a point, per axis
+
+__constant__ float stencil[RADIUS + 1]; // [0] weights the centre point, [i] the six neighbours at distance i
+// One stencil pass over the volume: each thread owns one (x, y) column and steps through it along z.
+__global__ void
+FiniteDifferencesKernel(float *output, const float *input, const int dimx, const int dimy, const int dimz)
+{
+    bool      validr = true; // cleared when the column lies RADIUS or more points past the inner volume in x or y
+    bool      validw = true; // cleared when the column lies outside the inner dimx x dimy area
+    const int gtidx  = blockIdx.x * blockDim.x + threadIdx.x;
+    const int gtidy  = blockIdx.y * blockDim.y + threadIdx.y;
+    const int ltidx  = threadIdx.x;
+    const int ltidy  = threadIdx.y;
+    const int workx  = blockDim.x;
+    const int worky  = blockDim.y;
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+    __shared__ float tile[k_blockDimMaxY + 2 * RADIUS][k_blockDimX + 2 * RADIUS]; // one xy-plane of the block plus a RADIUS-wide halo
+
+    const int stride_y = dimx + 2 * RADIUS; // elements per row, RADIUS extra points on each side
+    const int stride_z = stride_y * (dimy + 2 * RADIUS); // elements per xy-plane
+
+    int inputIndex  = 0;
+    int outputIndex = 0;
+
+    // Advance inputIndex to start of inner volume
+    inputIndex += RADIUS * stride_y + RADIUS;
+
+    // Advance inputIndex to target element
+    inputIndex += gtidy * stride_y + gtidx;
+
+    float infront[RADIUS]; // this column's values in the next RADIUS planes, nearest first
+    float behind[RADIUS]; // this column's values in the previous RADIUS planes, nearest first
+    float current; // this column's value in the plane being computed
+
+    const int tx = ltidx + RADIUS; // position of this thread inside tile, past the halo
+    const int ty = ltidy + RADIUS;
+
+    // Check in bounds
+    if ((gtidx >= dimx + RADIUS) || (gtidy >= dimy + RADIUS))
+        validr = false;
+
+    if ((gtidx >= dimx) || (gtidy >= dimy))
+        validw = false;
+
+    // Preload the "infront" and "behind" data
+    for (int i = RADIUS - 2; i >= 0; i--) { // behind[RADIUS - 1] is filled by the first shift in the main loop
+        if (validr)
+            behind[i] = input[inputIndex];
+
+        inputIndex += stride_z;
+    }
+
+    if (validr)
+        current = input[inputIndex];
+
+    outputIndex = inputIndex; // trails inputIndex; both advance by one plane per iteration below
+    inputIndex += stride_z;
+
+    for (int i = 0; i < RADIUS; i++) {
+        if (validr)
+            infront[i] = input[inputIndex];
+
+        inputIndex += stride_z;
+    }
+
+// Step through the xy-planes
+#pragma unroll 9
+
+    for (int iz = 0; iz < dimz; iz++) {
+        // Advance the slice (move the thread-front)
+        for (int i = RADIUS - 1; i > 0; i--)
+            behind[i] = behind[i - 1];
+
+        behind[0] = current;
+        current   = infront[0];
+#pragma unroll 4
+
+        for (int i = 0; i < RADIUS - 1; i++)
+            infront[i] = infront[i + 1];
+
+        if (validr)
+            infront[RADIUS - 1] = input[inputIndex];
+
+        inputIndex += stride_z;
+        outputIndex += stride_z;
+        cg::sync(cta); // every thread has finished reading the tile of the previous plane
+
+        // Note that for the work items on the boundary of the problem, the
+        // supplied index when reading the halo (below) may wrap to the
+        // previous/next row or even the previous/next xy-plane. This is
+        // acceptable since a) we disable the output write for these work
+        // items and b) there is at least one xy-plane before/after the
+        // current plane, so the access will be within bounds.
+
+        // Update the data slice in the local tile
+        // Halo above & below (loaded by the first RADIUS rows of threads)
+        if (ltidy < RADIUS) {
+            tile[ltidy][tx]                  = input[outputIndex - RADIUS * stride_y];
+            tile[ltidy + worky + RADIUS][tx] = input[outputIndex + worky * stride_y];
+        }
+
+        // Halo left & right (loaded by the first RADIUS columns of threads)
+        if (ltidx < RADIUS) {
+            tile[ty][ltidx]                  = input[outputIndex - RADIUS];
+            tile[ty][ltidx + workx + RADIUS] = input[outputIndex + workx];
+        }
+
+        tile[ty][tx] = current;
+        cg::sync(cta); // the tile for this plane is completely written before anyone reads it
+
+        // Compute the output value
+        float value = stencil[0] * current;
+#pragma unroll 4
+
+        for (int i = 1; i <= RADIUS; i++) {
+            value += stencil[i]
+                   * (infront[i - 1] + behind[i - 1] + tile[ty - i][tx] + tile[ty + i][tx] + tile[ty][tx - i]
+                      + tile[ty][tx + i]);
+        }
+
+        // Store the output value
+        if (validw)
+            output[outputIndex] = value;
+    }
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp
new file mode 100644
index 000000000..6ece3469c
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.cpp
@@ -0,0 +1,191 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "FDTD3dReference.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <iomanip>
+#include <iostream>
+#include <stdio.h>
+
+// Fills data with dimx*dimy*dimz pseudo-random floats in [lowerBound, upperBound]; srand(0) makes runs repeatable.
+void generateRandomData(float      *data,
+                        const int   dimx,
+                        const int   dimy,
+                        const int   dimz,
+                        const float lowerBound,
+                        const float upperBound)
+{
+    srand(0);
+    for (int plane = 0; plane < dimz; ++plane) {
+        for (int row = 0; row < dimy; ++row) {
+            for (int col = 0; col < dimx; ++col) {
+                const float unit = (float)rand() / (float)RAND_MAX;
+                *data++ = (float)(lowerBound + unit * (upperBound - lowerBound));
+            }
+        }
+    }
+}
+
+// Fills data with dimx*dimy*dimz floats that ramp from lowerBound towards upperBound with the z index.
+void generatePatternData(float      *data,
+                         const int   dimx,
+                         const int   dimy,
+                         const int   dimz,
+                         const float lowerBound,
+                         const float upperBound)
+{
+    for (int plane = 0; plane < dimz; ++plane) {
+        // Every point of one xy-plane receives the same value.
+        const float planeValue = (float)(lowerBound + ((float)plane / (float)dimz) * (upperBound - lowerBound));
+        for (int row = 0; row < dimy; ++row)
+            for (int col = 0; col < dimx; ++col)
+                *data++ = planeValue;
+    }
+}
+
+bool fdtdReference(float       *output,
+                   const float *input,
+                   const float *coeff,
+                   const int    dimx,
+                   const int    dimy,
+                   const int    dimz,
+                   const int    radius,
+                   const int    timesteps)
+{
+    // CPU reference: applies `timesteps` stencil passes to input; for timesteps > 0 the result ends in output.
+    // Both volumes are padded by `radius` on every side. Returns false only if the scratch volume cannot be allocated.
+    const int outerDimx = dimx + 2 * radius;
+    const int outerDimy = dimy + 2 * radius;
+    const int outerDimz = dimz + 2 * radius;
+    // Widen before multiplying so that large volumes do not overflow int arithmetic
+    const size_t volumeSize   = (size_t)outerDimx * (size_t)outerDimy * (size_t)outerDimz;
+    const int    stride_y     = outerDimx;
+    const int    stride_z     = stride_y * outerDimy;
+    const float *bufsrc       = input;
+    float       *bufdst       = 0;
+    float       *bufdstnext   = 0;
+
+    // Allocate temporary buffer (a null result is only an error for a non-empty volume)
+    printf(" calloc intermediate\n");
+    float *intermediate = (float *)calloc(volumeSize, sizeof(float));
+    if (intermediate == 0 && volumeSize != 0) {
+        printf(" calloc intermediate failed\n");
+        return false;
+    }
+
+    // Decide which buffer to use first (result should end up in output)
+    if ((timesteps % 2) == 0) {
+        bufdst     = intermediate;
+        bufdstnext = output;
+    }
+    else {
+        bufdst     = output;
+        bufdstnext = intermediate;
+    }
+
+    // Run the FDTD (naive method)
+    printf(" Host FDTD loop\n");
+    for (int it = 0; it < timesteps; it++) {
+        printf("\tt = %d\n", it);
+        const float *src = bufsrc;
+        float       *dst = bufdst;
+
+        for (int iz = -radius; iz < dimz + radius; iz++) {
+            for (int iy = -radius; iy < dimy + radius; iy++) {
+                for (int ix = -radius; ix < dimx + radius; ix++) {
+                    if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) {
+                        float value = (*src) * coeff[0];
+
+                        for (int ir = 1; ir <= radius; ir++) {
+                            value += coeff[ir] * (*(src + ir) + *(src - ir));                       // horizontal
+                            value += coeff[ir] * (*(src + ir * stride_y) + *(src - ir * stride_y)); // vertical
+                            value += coeff[ir] * (*(src + ir * stride_z) + *(src - ir * stride_z)); // in front & behind
+                        }
+
+                        *dst = value;
+                    }
+                    else {
+                        *dst = *src;
+                    }
+
+                    ++dst;
+                    ++src;
+                }
+            }
+        }
+
+        // Rotate buffers
+        float *tmp = bufdst;
+        bufdst     = bufdstnext;
+        bufdstnext = tmp;
+        bufsrc     = (const float *)tmp;
+    }
+
+    printf("\n");
+    free(intermediate);
+    return true;
+}
+
+bool compareData(const float *output,
+                 const float *reference,
+                 const int    dimx,
+                 const int    dimy,
+                 const int    dimz,
+                 const int    radius,
+                 const float  tolerance)
+{
+    // Compares the inner dimx*dimy*dimz points of two volumes padded by `radius` on every side (halo is skipped).
+    // Returns false, after printing the first offending point, as soon as an error exceeds tolerance.
+    for (int iz = -radius; iz < dimz + radius; iz++) {
+        for (int iy = -radius; iy < dimy + radius; iy++) {
+            for (int ix = -radius; ix < dimx + radius; ix++) {
+                if (ix >= 0 && ix < dimx && iy >= 0 && iy < dimy && iz >= 0 && iz < dimz) {
+                    // Determine the absolute difference
+                    float difference = fabs(*reference - *output);
+                    float error;
+                    // Determine the relative error; divide by the magnitude so that a negative
+                    // reference cannot produce a negative error that passes every tolerance
+                    if (*reference != 0)
+                        error = difference / fabs(*reference);
+                    else
+                        error = difference;
+                    // Negated test so that a NaN error is reported as a mismatch as well
+                    if (!(error <= tolerance)) {
+                        printf("Data error at point (%d,%d,%d)\t%f instead of %f\n", ix, iy, iz, *output, *reference);
+                        return false;
+                    }
+                }
+                ++output;
+                ++reference;
+            }
+        }
+    }
+
+    return true;
+}
diff --git a/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h
new file mode 100644
index 000000000..f39ad62be
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/FDTD3dReference.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _FDTD3DREFERENCE_H_
+#define _FDTD3DREFERENCE_H_
+
+void generateRandomData(float      *data, // receives dimx*dimy*dimz floats; the generator is seeded with srand(0)
+                        const int   dimx,
+                        const int   dimy,
+                        const int   dimz,
+                        const float lowerBound, // smallest value produced
+                        const float upperBound); // largest value produced
+void generatePatternData(float      *data, // receives dimx*dimy*dimz floats; the value depends only on the z index
+                         const int   dimx,
+                         const int   dimy,
+                         const int   dimz,
+                         const float lowerBound, // value written for z index 0
+                         const float upperBound); // the ramp approaches this value as the z index grows
+bool fdtdReference(float       *output, // padded volume; receives the result when timesteps > 0
+                   const float *input, // padded volume: (dimx+2*radius)*(dimy+2*radius)*(dimz+2*radius) floats
+                   const float *coeff, // radius+1 weights; coeff[0] applies to the centre point
+                   const int    dimx,
+                   const int    dimy,
+                   const int    dimz,
+                   const int    radius,
+                   const int    timesteps); // number of stencil passes to apply
+bool compareData(const float *output, // padded volume under test; only inner points are checked
+                 const float *reference, // padded volume holding the expected values
+                 const int    dimx,
+                 const int    dimy,
+                 const int    dimz,
+                 const int    radius,
+                 const float  tolerance = 0.0001f); // largest accepted relative error (absolute when the reference is 0)
+
+#endif
diff --git a/src/cuda/HPC/image/FDTD3d/Makefile b/src/cuda/HPC/image/FDTD3d/Makefile
new file mode 100644
index 000000000..183bc8b4c
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/Makefile
@@ -0,0 +1,8 @@
+EXECUTABLE := FDTD3d_hpc
+CUFILES := FDTD3dGPU.cu
+CCFILES := FDTD3d.cpp FDTD3dReference.cpp
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+# Shared build rules; presumably they consume the variables set above (common.mk is not shown here).
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/image/FDTD3d/exception.h b/src/cuda/HPC/image/FDTD3d/exception.h
new file mode 100644
index 000000000..ca8ac2525
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper: derives from a std exception type and is only created through throw_it().
+//! @param Std_Exception std exception class to derive from, e.g. std::runtime_error.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface
+  //! @return Never returns normally: always throws an Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Static construction interface
+  //! @return Never returns normally: always throws an Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard (private)
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reports an arbitrary exception on stderr and ends the process with EXIT_FAILURE
+//! @param ex exception to report; only its what() member is used
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  // Newline followed by an explicit flush, then terminate.
+  std::cerr << ex.what() << '\n' << std::flush;
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros: each one throws an Exception<...> tagged with __FILE__ and __LINE__
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception (thrown as std::range_error)
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Formats the source location and description into a message and throws it.
+//! @param file, line, detailed  location and details that end up in what().
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  // A string stream is quite heavy-weight, but exceptions are not meant for
+  // performance-critical paths.
+  std::stringstream message;
+  message << "Exception in file '" << file << "' in line " << line << "\n";
+  message << "Detailed description: " << detailed << "\n";
+
+  const std::string text = message.str();
+  throw Exception(text);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! std::string convenience overload: forwards to the const char* version.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &details) {
+  const char *const text = details.c_str();
+  throw_it(file, line, text);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private): passes "Unknown Exception.\n" to the base.
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! @param s string handed to the base class, i.e. the text returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor (empty; declared non-throwing with throw())
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_cuda.h b/src/cuda/HPC/image/FDTD3d/helper_cuda.h
new file mode 100644
index 000000000..723b28f7a
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_cuda.h
@@ -0,0 +1,988 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample must include the proper CUDA header files itself.
+// Please refer to the CUDA examples for the headers that are needed; they
+// may change depending on which CUDA functions are used.
+
+// CUDA Runtime API errors: the name lookup is delegated to cudaGetErrorName()
+#ifdef __DRIVER_TYPES_H__ // NOTE(review): presumably the include guard of the CUDA runtime's driver_types.h - confirm
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors: the name comes from cuGetErrorName(), "<unknown>" when it provides none
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char fallbackName[] = "<unknown>";
+  const char *enumName = NULL;
+  cuGetErrorName(error, &enumName);
+  return (enumName != NULL) ? enumName : fallbackName;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors: returns the enumerator's name, or "<unknown>" for a value not listed below
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors: returns the enumerator's name, or "<unknown>" for a value not listed below
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+
+    case CUFFT_MISSING_DEPENDENCY:
+      return "CUFFT_MISSING_DEPENDENCY";
+
+    case CUFFT_NVRTC_FAILURE:
+      return "CUFFT_NVRTC_FAILURE";
+
+    case CUFFT_NVJITLINK_FAILURE:
+      return "CUFFT_NVJITLINK_FAILURE";
+
+    case CUFFT_NVSHMEM_FAILURE:
+      return "CUFFT_NVSHMEM_FAILURE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors: returns the enumerator's name, or "<unknown>" for a value not listed below
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors
+// Translates a cusolverStatus_t value into the spelling of its enumerator.
+// Any value without a case below is reported as "<unknown>".
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      // No trailing blank: the text must match the enumerator name exactly,
+      // like every other entry in this table.
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors
+// Translates a curandStatus_t value into the spelling of its enumerator.
+// Any value without a case below is reported as "<unknown>".
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  const char *name = "<unknown>";
+
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      name = "CURAND_STATUS_SUCCESS";
+      break;
+    case CURAND_STATUS_VERSION_MISMATCH:
+      name = "CURAND_STATUS_VERSION_MISMATCH";
+      break;
+    case CURAND_STATUS_NOT_INITIALIZED:
+      name = "CURAND_STATUS_NOT_INITIALIZED";
+      break;
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      name = "CURAND_STATUS_ALLOCATION_FAILED";
+      break;
+    case CURAND_STATUS_TYPE_ERROR:
+      name = "CURAND_STATUS_TYPE_ERROR";
+      break;
+    case CURAND_STATUS_OUT_OF_RANGE:
+      name = "CURAND_STATUS_OUT_OF_RANGE";
+      break;
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      name = "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+      break;
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      name = "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+      break;
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      name = "CURAND_STATUS_LAUNCH_FAILURE";
+      break;
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      name = "CURAND_STATUS_PREEXISTING_FAILURE";
+      break;
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      name = "CURAND_STATUS_INITIALIZATION_FAILED";
+      break;
+    case CURAND_STATUS_ARCH_MISMATCH:
+      name = "CURAND_STATUS_ARCH_MISMATCH";
+      break;
+    case CURAND_STATUS_INTERNAL_ERROR:
+      name = "CURAND_STATUS_INTERNAL_ERROR";
+      break;
+  }
+
+  return name;
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors
+// Translates an nvjpegStatus_t value into the spelling of its enumerator.
+// Any value without a case below is reported as "<unknown>".
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  const char *name = "<unknown>";
+
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      name = "NVJPEG_STATUS_SUCCESS";
+      break;
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      name = "NVJPEG_STATUS_NOT_INITIALIZED";
+      break;
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      name = "NVJPEG_STATUS_INVALID_PARAMETER";
+      break;
+    case NVJPEG_STATUS_BAD_JPEG:
+      name = "NVJPEG_STATUS_BAD_JPEG";
+      break;
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      name = "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+      break;
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      name = "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+      break;
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      name = "NVJPEG_STATUS_EXECUTION_FAILED";
+      break;
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      name = "NVJPEG_STATUS_ARCH_MISMATCH";
+      break;
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      name = "NVJPEG_STATUS_INTERNAL_ERROR";
+      break;
+  }
+
+  return name;
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors
+// Translates an NppStatus value into a printable name; any value without a
+// case below is reported as "<unknown>".
+//
+// Which enumerators are listed depends on the NPP version, encoded as
+// (NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4):
+//   * <= 0x5000 selects the older, abbreviated enumerator spellings,
+//   * otherwise the longer spellings are used,
+//   * >= 0x6000 and >= 0x7000 each add a further group of codes.
+// Note that in the <= 0x5000 branches several returned strings use the newer
+// spelling rather than the literal enumerator name (for example
+// NPP_BAD_ARG_ERROR is reported as "NPP_BAD_ARGUMENT_ERROR").
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+    // Older headers (version value <= 0x5000): abbreviated enumerator names.
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    // Codes listed regardless of the NPP version.
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+    // Memory-operation codes: "_ERR" suffix in older headers, "_ERROR" after.
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  // Reached for any status value not covered by the cases compiled in above.
+  return "<unknown>";
+}
+#endif
+
+// Aborts the program when an API call reports a failure.
+//
+// result  status value returned by the call; any value that converts to
+//         true is treated as an error
+// func    source text of the checked expression (printed verbatim)
+// file    source file of the call site
+// line    source line of the call site
+//
+// On error a diagnostic is written to stderr and the process exits with
+// EXIT_FAILURE; otherwise the function returns normally.
+template <typename T>
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {
+    // The numeric code is printed with %d, so it is converted to int (not
+    // unsigned int): the argument type then matches the conversion specifier
+    // even for negative status values.
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+}
+
+// This will only print the proper error string when calling cudaGetLastError;
+// it does not exit the program when an error is detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  cudaError_t err = cudaGetLastError();
+
+  if (cudaSuccess != err) {
+    fprintf(stderr,
+            "%s(%i) : getLastCudaError() CUDA error :"
+            " %s : (%d) %s.\n",
+            file, line, errorMessage, static_cast<int>(err),
+            cudaGetErrorString(err));
+  }
+}
+#endif
+
+// Larger of two values. Every use of an argument is parenthesised so that
+// operands containing lower-precedence operators (e.g. ?:) are compared as a
+// whole. Each argument may still be evaluated twice, so avoid side effects.
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+// Float To Int conversion
+// Adds 0.5 to non-negative inputs and subtracts 0.5 from negative ones, then
+// truncates toward zero, i.e. rounds half away from zero.
+inline int ftoi(float value) {
+  if (value >= 0) {
+    return static_cast<int>(value + 0.5);
+  }
+
+  return static_cast<int>(value - 0.5);
+}
+
+// Beginning of GPU Architecture definitions
+// Looks up the number of cores per SM for compute capability major.minor.
+// If the version is not in the table, a notice is printed to stdout and the
+// value of the last real table row is returned instead.
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // One row per known SM version.
+  struct SmCores {
+    int SM;     // 0xMm (hexadecimal notation): M = SM major, m = SM minor
+    int Cores;  // cores per SM for that version
+  };
+
+  // The {-1, -1} row is a terminator, not a real entry.
+  const SmCores table[] = {
+      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192},
+      {0x50, 128}, {0x52, 128}, {0x53, 128},
+      {0x60,  64}, {0x61, 128}, {0x62, 128},
+      {0x70,  64}, {0x72,  64}, {0x75,  64},
+      {0x80,  64}, {0x86, 128}, {0x87, 128}, {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128}, {0xa1, 128}, {0xa3, 128},
+      {0xb0, 128},
+      {0xc0, 128}, {0xc1, 128},
+      {-1, -1}};
+
+  const int wanted = (major << 4) + minor;
+  const SmCores *row = table;
+
+  for (; row->SM != -1; ++row) {
+    if (row->SM == wanted) {
+      return row->Cores;
+    }
+  }
+
+  // No match: step back from the terminator to the last real row and use it.
+  const int fallback = (row - 1)->Cores;
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, fallback);
+  return fallback;
+}
+
+// Looks up the architecture name for compute capability major.minor.
+// If the version is not in the table, a notice is printed to stdout and the
+// name of the last real table row is returned instead.
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // One row per known SM version.
+  struct SmArchName {
+    int SM;            // 0xMm (hexadecimal notation): M = SM major, m = minor
+    const char* name;  // architecture name for that version
+  };
+
+  // The row with SM == -1 is a terminator, not a real entry.
+  const SmArchName table[] = {
+      {0x30, "Kepler"},    {0x32, "Kepler"},    {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},   {0x52, "Maxwell"},   {0x53, "Maxwell"},
+      {0x60, "Pascal"},    {0x61, "Pascal"},    {0x62, "Pascal"},
+      {0x70, "Volta"},     {0x72, "Xavier"},    {0x75, "Turing"},
+      {0x80, "Ampere"},    {0x86, "Ampere"},    {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"}, {0xa1, "Blackwell"}, {0xa3, "Blackwell"},
+      {0xb0, "Blackwell"},
+      {0xc0, "Blackwell"}, {0xc1, "Blackwell"},
+      {-1, "Graphics Device"}};
+
+  const int wanted = (major << 4) + minor;
+  const SmArchName *row = table;
+
+  for (; row->SM != -1; ++row) {
+    if (row->SM == wanted) {
+      return row->name;
+    }
+  }
+
+  // No match: step back from the terminator to the last real row and use it.
+  const char *fallback = (row - 1)->name;
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, fallback);
+  return fallback;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization
+//
+// Makes device devID the current CUDA device after validating it.
+//
+// devID   requested device index; negative values are treated as 0
+//
+// Returns devID on success, -devID if the index is beyond the number of
+// detected devices, or -1 if the device is in prohibited compute mode.
+// Exits with EXIT_FAILURE if no device is present, if the reported major
+// compute capability is below 1, or if a checked CUDA call fails.
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    // devID >= 1 here (device_count >= 1), so the result is negative.
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  // The architecture name is wrapped in a matching pair of quotes.
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// This function returns the best GPU (with maximum GFLOPS)
+//
+// Each device not in prohibited compute mode is scored as
+//   multiprocessor count * cores per SM * clock rate
+// and the index of the highest-scoring device is returned. Exits with
+// EXIT_FAILURE if there are no devices, if every device is in prohibited
+// compute mode, or if a CUDA query fails.
+inline int gpuGetMaxGflopsDeviceId() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int best_device = 0;
+  int prohibited_count = 0;
+  uint64_t best_score = 0;
+
+  // Find the best CUDA capable GPU device
+  for (int dev = 0; dev < device_count; ++dev) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+    // Devices in prohibited compute mode are counted but never scored.
+    if (computeMode == cudaComputeModeProhibited) {
+      ++prohibited_count;
+      continue;
+    }
+
+    // A reported capability of 9999.9999 is scored with one core per SM.
+    int cores_per_sm = 1;
+    if (!(major == 9999 && minor == 9999)) {
+      cores_per_sm = _ConvertSMVer2Cores(major, minor);
+    }
+
+    int multiProcessorCount = 0, clockRate = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, dev));
+
+    const cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev);
+    if (result == cudaErrorInvalidValue) {
+      // If cudaDevAttrClockRate attribute is not supported we
+      // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+      clockRate = 1;
+    } else if (result != cudaSuccess) {
+      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+        static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+      exit(EXIT_FAILURE);
+    }
+
+    const uint64_t score = (uint64_t)multiProcessorCount * cores_per_sm * clockRate;
+
+    if (score > best_score) {
+      best_score = score;
+      best_device = dev;
+    }
+  }
+
+  if (prohibited_count == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return best_device;
+}
+
+// Initialization code to find the best CUDA Device
+//
+// With a "device" flag on the command line, the requested index is passed to
+// gpuDeviceInit(); a negative request or a negative result from
+// gpuDeviceInit() terminates the program. Without the flag, the device chosen
+// by gpuGetMaxGflopsDeviceId() is made current and described on stdout.
+// Returns the index of the selected device.
+inline int findCudaDevice(int argc, const char **argv) {
+  if (!checkCmdLineFlag(argc, argv, "device")) {
+    // No explicit request: pick the device with highest Gflops/s
+    const int best = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(best));
+
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, best));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, best));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           best, _ConvertSMVer2ArchName(major, minor), major, minor);
+    return best;
+  }
+
+  // The command line names a device number: use it
+  const int requested = getCmdLineArgumentInt(argc, argv, "device=");
+
+  if (requested < 0) {
+    printf("Invalid command line parameter\n ");
+    exit(EXIT_FAILURE);
+  }
+
+  const int selected = gpuDeviceInit(requested);
+
+  if (selected < 0) {
+    printf("exiting...\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return selected;
+}
+
+// Selects the first device that reports itself as integrated and is not in
+// prohibited compute mode, makes it current, prints a description and returns
+// its index. Exits with EXIT_FAILURE if there are no devices or if every
+// device was rejected.
+inline int findIntegratedGPU() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Counts every rejected device: prohibited ones and non-integrated ones.
+  int rejected = 0;
+
+  // Find the integrated GPU which is compute capable
+  for (int dev = 0; dev < device_count; ++dev) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, dev));
+
+    if (!integrated || computeMode == cudaComputeModeProhibited) {
+      ++rejected;
+      continue;
+    }
+
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    checkCudaErrors(cudaSetDevice(dev));
+
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           dev, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+    return dev;
+  }
+
+  if (rejected == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// General check for CUDA GPU SM Capabilities
+//
+// Compares the compute capability of the current device against
+// major_version.minor_version. Prints the outcome to stdout and returns true
+// when the device's capability is at least the requested one.
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  checkCudaErrors(cudaGetDevice(&dev));
+
+  int major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  const bool newer_major = major > major_version;
+  const bool same_major_enough_minor =
+      (major == major_version) && (minor >= minor_version);
+
+  if (!(newer_major || same_major_enough_minor)) {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+
+  printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+         _ConvertSMVer2ArchName(major, minor), major, minor);
+  return true;
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_functions.h b/src/cuda/HPC/image/FDTD3d/helper_functions.h
new file mode 100644
index 000000000..f71da5761
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include "helper_image.h"  // helper functions for image compare, dump, data comparisons
+#include "helper_string.h"  // helper functions for string parsing
+#include "helper_timer.h"   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_image.h b/src/cuda/HPC/image/FDTD3d/helper_image.h
new file mode 100644
index 000000000..7e81b0cd8
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_image.h
@@ -0,0 +1,1001 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Smaller / larger of two values. Every use of an argument is parenthesised
+// so that operands containing lower-precedence operators (e.g. ?:) are
+// compared as a whole. Each argument may still be evaluated twice, so avoid
+// side effects.
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include "helper_string.h"
+
+// Implementation details of the image helpers (not part of the public API).
+namespace helper_image_internal {
+//! Size in bytes of the buffer used to read PGM/PPM header lines
+const unsigned int PGMHeaderSize = 0x40;
+
+// Converters between raw bytes and the element type T of an image buffer
+
+//! Functor converting an unsigned byte into type T (primary template;
+//! only the specializations below are defined)
+template <class T>
+struct ConverterFromUByte;
+
+//! Unsigned byte to unsigned byte: the value is passed through unchanged
+template <>
+struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator
+  //! @return the input value (the declared return type is float)
+  //! @param  byte  value to convert
+  float operator()(const unsigned char &byte) {
+    const unsigned char same = byte;
+    return same;
+  }
+};
+
+//! Unsigned byte to float: the value is divided by 255
+template <>
+struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return byte / 255 as a float
+  //! @param  byte  value to convert
+  float operator()(const unsigned char &byte) {
+    const float as_float = static_cast<float>(byte);
+    return as_float / 255.0f;
+  }
+};
+
+//! Functor converting type T into an unsigned byte (primary template;
+//! only the specializations below are defined)
+template <class T>
+struct ConverterToUByte;
+
+//! Unsigned byte to unsigned byte: the value is passed through unchanged
+template <>
+struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (a passthrough)
+  //! @return the input value
+  //! @param  byte  value to convert
+  unsigned char operator()(const unsigned char &byte) {
+    return byte;
+  }
+};
+
+//! Float to unsigned byte: the value is multiplied by 255 and truncated
+template <>
+struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return sample * 255 converted to unsigned char
+  //! @param  sample  value to convert
+  unsigned char operator()(const float &sample) {
+    const float scaled = sample * 255.0f;
+    return static_cast<unsigned char>(scaled);
+  }
+};
+}  // namespace helper_image_internal
+
+// Portability wrappers for opening files and parsing strings. Each macro is
+// only defined here if it has not been defined already.
+//   FOPEN(fHandle, filename, mode)  opens a file and stores the handle in
+//                                   fHandle
+//   FOPEN_FAIL(result)              true when the value produced by FOPEN
+//                                   indicates failure
+//   SSCANF                          the string-scanning function to call
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// Windows: fopen_s yields an error code, so non-zero means failure.
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else
+// Elsewhere: FOPEN evaluates to the assigned handle, so NULL means failure.
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif
+
+//! Load a binary PGM ("P5") or PPM ("P6") image from a file
+//! @return true if the header was parsed and all pixel bytes were read,
+//!         false otherwise (a message is written to std::cerr)
+//! @param file      path of the image file
+//! @param data      in/out: if *data is NULL a buffer is allocated with
+//!                  malloc() and stored here (the caller frees it); otherwise
+//!                  the existing buffer is filled and must hold
+//!                  *w * *h * *channels bytes
+//! @param w         width; written when the buffer is allocated here,
+//!                  otherwise it must already equal the image width
+//! @param h         height; same convention as w
+//! @param channels  out: 1 for "P5", 3 for "P6", 0 for any other magic
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+  FILE *fp = NULL;
+
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    fclose(fp);
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    fclose(fp);
+    return false;
+  }
+
+  // parse header, read maxval, width and height
+  unsigned int width = 0;
+  unsigned int height = 0;
+  unsigned int maxval = 0;
+  unsigned int i = 0;  // number of header fields parsed so far
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      fclose(fp);
+      return false;
+    }
+
+    if (header[0] == '#') {
+      continue;
+    }
+
+    int parsed = 0;
+
+    if (i == 0) {
+      parsed = SSCANF(header, "%u %u %u", &width, &height, &maxval);
+    } else if (i == 1) {
+      parsed = SSCANF(header, "%u %u", &height, &maxval);
+    } else if (i == 2) {
+      parsed = SSCANF(header, "%u", &maxval);
+    }
+
+    // The scan result is negative (EOF) for a line with nothing to convert;
+    // adding that to the unsigned counter would wrap it past 3 and end the
+    // loop with fields still unparsed. Only count real conversions.
+    if (parsed > 0) {
+      i += static_cast<unsigned int>(parsed);
+    }
+  }
+
+  if (width == 0 || height == 0) {
+    std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+    fclose(fp);
+    return false;
+  }
+
+  // Computed in size_t so the product is not truncated to unsigned int.
+  const size_t count = static_cast<size_t>(width) * height * *channels;
+  bool allocated_here = false;
+
+  // check if given handle for the data is initialized
+  if (NULL != *data) {
+    if (*w != width || *h != height) {
+      // Reading into a buffer sized for other dimensions could overrun it.
+      std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+      fclose(fp);
+      return false;
+    }
+  } else {
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * count);
+
+    if (NULL == *data) {
+      std::cerr << "__LoadPPM() : Failed to allocate image buffer."
+                << std::endl;
+      fclose(fp);
+      return false;
+    }
+
+    allocated_here = true;
+    *w = width;
+    *h = height;
+  }
+
+  // read and close file
+  const size_t got = fread(*data, sizeof(unsigned char), count, fp);
+  fclose(fp);
+
+  if (got != count) {
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+
+    // Only release memory this call allocated; a caller-owned buffer stays.
+    if (allocated_here) {
+      free(*data);
+      *data = NULL;
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+// Reads a PGM/PPM file via __loadPPM() and converts every byte to T.
+// Allocates *data with malloc when the caller passes *data == NULL.
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+  unsigned int nChannels;
+  unsigned char *rawBytes = NULL;
+  const bool loaded = __loadPPM(file, &rawBytes, w, h, &nChannels);
+  if (!loaded) {
+    return false;
+  }
+
+  const unsigned int count = *w * *h * nChannels;
+
+  // allocate the destination only when the caller did not supply one
+  if (*data == NULL) {
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * count));
+  }
+
+  // element-wise conversion from unsigned char to T
+  std::transform(rawBytes, rawBytes + count, *data,
+                 helper_image_internal::ConverterFromUByte<T>());
+
+  free(rawBytes);
+  return true;
+}
+
+// Loads a 3-channel PPM as 4 bytes per pixel (4th byte 0) into a malloc'ed
+// *data; fails for other channel counts, which the 3-byte copy would over-read
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  if (!__loadPPM(file, &idata, w, h, &channels) || channels != 3) {
+    free(idata);
+    return false;
+  }
+
+  const size_t size = static_cast<size_t>(*w) * *h;
+  *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+  if (*data == NULL) {
+    free(idata);
+    return false;
+  }
+  // `src` walks the input so that `idata` stays valid for free()
+  const unsigned char *src = idata;
+  unsigned char *ptr = *data;
+  for (size_t i = 0; i < size; i++) {
+    for (int c = 0; c < 3; c++) *ptr++ = *src++;
+    *ptr++ = 0;  // pad 4th component
+  }
+  free(idata);
+  return true;
+}
+
+// Writes `data` as a binary PGM (channels == 1) or PPM (channels == 3) file.
+// @param data      w * h * channels bytes of pixel data, must not be NULL
+// @param channels  1 or 3; any other value is rejected with an error message
+// @return true if the file was opened and completely written
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+  // a failed open sets failbit, not badbit, so bad() alone never reports it
+  if (!fh.is_open()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+  fh.write(reinterpret_cast<const char *>(data),
+           static_cast<std::streamsize>(w) * h * channels);
+  fh.flush();
+
+  if (fh.fail()) {
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+
+  fh.close();
+  return true;
+}
+
+// Converts w*h elements of type T to bytes and stores them as a PGM file.
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+  const unsigned int pixelCount = w * h;
+  // temporary 8-bit copy of the image
+  unsigned char *bytes =
+      static_cast<unsigned char *>(malloc(sizeof(unsigned char) * pixelCount));
+  T *const last = data + pixelCount;
+  std::transform(data, last, bytes,
+                 helper_image_internal::ConverterToUByte<T>());
+
+  // single channel => PGM
+  const bool written = __savePPM(file, bytes, w, h, 1);
+  free(bytes);
+  return written;
+}
+
+// Drops the 4th byte of every pixel and writes the result as a 3-channel PPM.
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+  const int pixelCount = w * h;
+  unsigned char *rgb = static_cast<unsigned char *>(
+      malloc(sizeof(unsigned char) * pixelCount * 3));
+  // copy three components per pixel, skip the fourth
+  unsigned char *out = rgb;
+  const unsigned char *in = data;
+  for (int p = 0; p < pixelCount; ++p, in += 4) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out += 3;
+  }
+  const bool written = __savePPM(file, rgb, w, h, 3);
+  free(rgb);
+  return written;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read whitespace-separated values from text file \filename
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  in/out: if *data is NULL it is returned pointing to a
+//!        malloc'ed copy of the values; otherwise it must hold *len elements
+//! @param len  in/out: number of elements in *data (set when allocated here)
+//! @param verbose  unused
+//! @note values are parsed with "%f", so T is expected to be float
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+  // open file for reading
+  FILE *fh = NULL;
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // read all data elements; stop at the first failed conversion so that a
+  // missing trailing newline cannot drop the last value and unparsable text
+  // cannot spin forever (both happened with the feof()-driven loop)
+  T token;
+  while (fscanf(fh, "%f", &token) == 1) {
+    data_read.push_back(token);
+  }
+
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+    if (NULL == *data && !data_read.empty()) return false;  // out of memory
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  // copy data; front() must not be called on an empty vector
+  if (!data_read.empty() && NULL != *data) {
+    memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+  }
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read block \a block_num (\a block_size bytes) of binary file \filename
+//! @return true if the file could be opened and the block buffer allocated
+//! @param data  array of block pointers; data[block_num] receives a newly
+//!        malloc'ed buffer of block_size bytes holding the data read
+//! @param len  number of elements of type T actually read
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+  // always bail out on a failed open; `verbose` only controls the message
+  FILE *fh = fopen(filename, "rb");
+  if (fh == NULL) {
+    if (verbose) {
+      std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    }
+    return false;
+  }
+
+  // allocate storage for the data read
+  data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+  if (data[block_num] == NULL && block_size > 0) {
+    fclose(fh);
+    return false;
+  }
+
+  // read all data elements
+  fseek(fh, block_num * block_size, SEEK_SET);
+  *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+  fclose(fh);
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the destination file
+//! @param data  data to write
+//! @param len  number of data elements in data
+//! @param epsilon  epsilon for comparison, written first as "# <epsilon>"
+//! @param verbose  print progress / error messages to stderr
+//! @param append  add to the end of an existing file instead of replacing it
+//! @note the values follow on one line, separated by single spaces,
+//!       and the file ends with a newline
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // open file for writing; `out | ate` (used before) truncates exactly like
+  // plain `out`, so honouring `append` requires `app`
+  // (`ate` only seeks to the end after the file has already been opened)
+  const std::ios_base::openmode mode =
+      append ? (std::fstream::out | std::fstream::app)
+             : (std::fstream::out | std::fstream::trunc);
+  std::fstream fh(filename, mode);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+    fh << data[i] << ' ';
+  }
+
+  // file ends with nl; std::endl also flushes, so the check below sees
+  // errors that only surface when the buffer is written out
+  fh << std::endl;
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Element-wise comparison of two arrays after conversion to float
+//! @return  with threshold == 0: true if every element differs from its
+//!          reference by at most \a epsilon; otherwise true if the number
+//!          of mismatching elements is below len * threshold
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    maximum absolute difference tolerated per element
+//! @param threshold  tolerated fraction of mismatching elements
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  unsigned int mismatches = 0;
+  bool allWithinEpsilon = true;
+
+  // count the elements whose difference lies outside [-epsilon, epsilon]
+  for (unsigned int idx = 0; idx != len; ++idx) {
+    const float expected = static_cast<float>(reference[idx]);
+    const float actual = static_cast<float>(data[idx]);
+    const float delta = expected - actual;
+
+    // a NaN delta fails both comparisons and therefore counts as mismatch
+    const bool inRange = (delta >= -epsilon) && (delta <= epsilon);
+    if (!inRange) {
+      allWithinEpsilon = false;
+      ++mismatches;
+    }
+  }
+
+  // exact mode: every element has to be within epsilon
+  if (threshold == 0.0f) {
+    return allWithinEpsilon;
+  }
+
+  // threshold mode: report and tolerate a limited share of mismatches
+  if (mismatches != 0) {
+    printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+           static_cast<float>(mismatches) * 100 / static_cast<float>(len),
+           mismatches);
+  }
+
+  return len * threshold > mismatches;
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays element-wise as floats, with a lower bound on epsilon
+//! @return  with threshold == 0: true if no element mismatches; otherwise
+//!          true if the mismatch count is below len * threshold
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison (values smaller than
+//!                   __MIN_EPSILON_ERROR are raised to it)
+//! @param threshold  threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // tolerance actually applied: never below __MIN_EPSILON_ERROR
+  const float tolerance = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+
+  int mismatches = 0;
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    const float expected = static_cast<float>(reference[idx]);
+    const float actual = static_cast<float>(data[idx]);
+    if (!(fabs(expected - actual) < tolerance)) {
+      ++mismatches;
+    }
+  }
+
+  const bool strict = (threshold == 0.0f);
+  if (strict) {
+    // no mismatch tolerated at all
+    if (mismatches != 0) {
+      printf("total # of errors = %d\n", mismatches);
+    }
+    return mismatches == 0;
+  }
+
+  // threshold mode: report and tolerate a limited share of mismatches
+  if (mismatches != 0) {
+    printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+           static_cast<float>(mismatches) * 100 / static_cast<float>(len),
+           mismatches);
+  }
+
+  return len * threshold > mismatches;
+}
+// Dumps `bytes` bytes of `data` to `filename`; a failed open is reported.
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+  FILE *fp = NULL;
+  const bool opened = !FOPEN_FAIL(FOPEN(fp, filename, "wb"));
+  printf("sdkDumpBin: <%s>%s\n", filename, opened ? "" : " open failed");
+  if (!opened) return;  // fwrite/fclose on a NULL stream would crash
+  fwrite(data, bytes, 1, fp);
+  fclose(fp);  // also flushes
+}
+
+// Compares the first `nelements` unsigned ints of binary file `src_file` with
+// those of `ref_file` (located through sdkFindFilePath and `exec_path`).
+// @param epsilon    per-element tolerance passed to compareData()
+// @param threshold  tolerated fraction of mismatches passed to compareData()
+// @return true if both files could be read completely and compare equal
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+  unsigned int *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  // NOTE(review): ref_file_path is never freed here - confirm who owns it
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf(
+          "compareBin2Bin <unsigned int>"
+          " unable to open ref_file: %s\n",
+          ref_file_path);
+      error_count++;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+      ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+      // element size first, count second: fread then returns the number of
+      // elements read, which is what the size report and the check need
+      size_t src_read = 0, ref_read = 0;
+      if (src_buffer && ref_buffer) {
+        src_read = fread(src_buffer, sizeof(unsigned int), nelements, src_fp);
+        ref_read = fread(ref_buffer, sizeof(unsigned int), nelements, ref_fp);
+      }
+
+      printf(
+          "> compareBin2Bin <unsigned int> nelements=%d,"
+          " epsilon=%4.2f, threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(src_read * sizeof(unsigned int)));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(ref_read * sizeof(unsigned int)));
+
+      // a short read would leave part of a buffer uninitialized
+      if (src_read != nelements || ref_read != nelements ||
+          !compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold)) {
+        error_count++;
+      }
+
+      free(src_buffer);
+      free(ref_buffer);
+    }
+  }
+
+  // close whatever was opened, on every path
+  if (src_fp) {
+    fclose(src_fp);
+  }
+  if (ref_fp) {
+    fclose(ref_fp);
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Compares the first `nelements` floats of binary file `src_file` with those
+// of `ref_file` (located through sdkFindFilePath and `exec_path`).
+// @param epsilon, threshold  forwarded to compareDataAsFloatThreshold()
+// @return true if both files could be read completely and compare equal
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+  float *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count = 1;
+  }
+
+  // NOTE(review): ref_file_path is never freed here - confirm who owns it
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+             ref_file_path);
+      error_count = 1;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+      ref_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+
+      printf(
+          "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+          " threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+
+      size_t src_read = 0, ref_read = 0;
+      if (src_buffer && ref_buffer) {
+        src_read = fread(src_buffer, sizeof(float), nelements, src_fp);
+        ref_read = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+      }
+
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(src_read * sizeof(float)));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(ref_read * sizeof(float)));
+
+      // a short read would leave part of a buffer uninitialized, so it is a
+      // failure by itself and the comparison is skipped
+      if (src_read != nelements || ref_read != nelements ||
+          !compareDataAsFloatThreshold<float, float>(
+              ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+        error_count++;
+      }
+
+      free(src_buffer);
+      free(ref_buffer);
+    }
+  }
+
+  // close whatever was opened, on every path
+  if (src_fp) {
+    fclose(src_fp);
+  }
+  if (ref_fp) {
+    fclose(ref_fp);
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Relative L2-norm comparison: passes when ||reference - data|| divided by
+// ||reference|| is below `epsilon`. A (near) zero reference norm fails.
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+  // accumulate squared error and squared reference magnitude
+  float sumSqDiff = 0;
+  float sumSqRef = 0;
+  const float *refIt = reference;
+  const float *datIt = data;
+  for (unsigned int remaining = len; remaining > 0; --remaining) {
+    const float r = *refIt++;
+    const float d = r - *datIt++;
+    sumSqDiff += d * d;
+    sumSqRef += r * r;
+  }
+
+  // relative error is undefined for a vanishing reference
+  if (fabs(sumSqRef) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  const float relError = sqrtf(sumSqDiff) / sqrtf(sumSqRef);
+  if (relError < epsilon) {
+    return true;
+  }
+
+#ifdef _DEBUG
+  std::cerr << "ERROR, l2-norm error " << relError
+            << " is greater than epsilon " << epsilon << "\n";
+#endif
+  return false;
+}
+// Byte-image loader: forwards to __loadPPM(), discarding the channel count.
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+  unsigned int ignoredChannels;
+  return __loadPPM(file, data, w, h, &ignoredChannels);
+}
+
+// Loads a 3-channel PPM as 4 bytes per pixel (4th byte 0) into a malloc'ed
+// *data; fails for other channel counts, which the 3-byte copy would over-read
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  if (!__loadPPM(file, &idata, w, h, &channels) || channels != 3) {
+    free(idata);
+    return false;
+  }
+
+  const size_t size = static_cast<size_t>(*w) * *h;
+  *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+  if (*data == NULL) {
+    free(idata);
+    return false;
+  }
+  // `src` walks the input so that `idata` stays valid for free()
+  const unsigned char *src = idata;
+  unsigned char *ptr = *data;
+  for (size_t i = 0; i < size; i++) {
+    for (int c = 0; c < 3; c++) *ptr++ = *src++;
+    *ptr++ = 0;  // pad 4th component
+  }
+  free(idata);
+  return true;
+}
+
+// Compares two PPM images, loaded as 4 bytes per pixel, with compareData().
+// @param epsilon        per-byte tolerance
+// @param threshold      tolerated fraction of mismatching bytes (0 = none)
+// @param verboseErrors  print progress and diagnostics to stderr
+// @return true if both images load, have equal size and compare equal
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = NULL, *ref_data = NULL;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // the reference image was already loaded
+    return false;
+  }
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // comparing src-sized data against a smaller reference would read out
+    // of bounds, so differently sized images simply do not match
+    error_count = 1;
+  } else {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+                << src_height << ") epsilon(" << epsilon << "), threshold("
+                << threshold * 100 << "%)\n";
+    }
+    if (compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                    threshold) == false) {
+      error_count = 1;
+    }
+  }
+
+  // both buffers were malloc'ed by sdkLoadPPM4ub() and are owned here
+  free(src_data);
+  free(ref_data);
+  if (verboseErrors) {
+    if (error_count == 0) {
+      std::cerr << "    OK\n\n";
+    } else {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+  return error_count == 0;  // true if all pixels pass
+}
+
+// Compares two images loaded as raw bytes via sdkLoadPPMub() with compareData().
+// @param epsilon        per-byte tolerance
+// @param threshold      tolerated fraction of mismatching bytes (0 = none)
+// @param verboseErrors  print progress and diagnostics to stderr
+// @return true if both images load, have equal size and compare equal
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height;
+  unsigned int src_width, src_height;
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // the reference image was already loaded
+    return false;
+  }
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // comparing src-sized data against a smaller reference would read out
+    // of bounds, so differently sized images simply do not match
+    error_count = 1;
+  } else {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+                << src_height << ") epsilon(" << epsilon << "), threshold("
+                << threshold * 100 << "%)\n";
+    }
+    if (compareData(ref_data, src_data, src_width * src_height, epsilon,
+                    threshold) == false) {
+      error_count = 1;
+    }
+  }
+  // both pointers started as NULL, so the loader allocated them for us
+  free(src_data);
+  free(ref_data);
+  if (verboseErrors) {
+    if (error_count == 0) {
+      std::cerr << "    OK\n\n";
+    } else {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+  return error_count == 0;  // true if all pixels pass
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_string.h b/src/cuda/HPC/image/FDTD3d/helper_string.h
new file mode 100644
index 000000000..39a1b3805
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+// Returns the index of the first character of `string` that is not
+// `delimiter`, or 0 when that index is at or beyond the last character.
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  // skip the leading run of delimiters
+  const char *cursor = string;
+  for (; *cursor == delimiter; ++cursor) {
+  }
+  const int skipped = static_cast<int>(cursor - string);
+
+  // nothing useful remains if only the final character (or less) is left
+  const int lastIndex = static_cast<int>(strlen(string) - 1);
+  return (skipped < lastIndex) ? skipped : 0;
+}
+// Locates the text after the last '.' of `filename` (the file extension).
+inline int getFileExtension(char *filename, char **extension) {
+  int string_length = static_cast<int>(strlen(filename));
+  // scan backwards, starting at the terminating NUL, until a '.' is seen
+  while (filename[string_length--] != '.') {  // NOTE(review): "" reads [-1]
+    if (string_length == 0) break;  // stop before index 0 is inspected
+  }
+  // the index is now one below the '.' (if found); move it just past the dot
+  if (string_length > 0) string_length += 2;
+  // NOTE(review): a '.' at index 0 or 1 leaves 0 here and is not reported
+  if (string_length == 0)
+    *extension = NULL;
+  else
+    *extension = &filename[string_length];
+  // index of the extension within filename, or 0 when none is reported
+  return string_length;
+}
+
+// Returns true if any argument after argv[0], with its leading '-' characters
+// removed and cut at the first '=', equals `string_ref` (case-insensitive).
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  bool found = false;
+
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *name = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    // the flag name ends at '=' if present, else at the end of the argument
+    const char *assign = strchr(name, '=');
+    const int nameLength = (assign != NULL)
+                               ? static_cast<int>(assign - name)
+                               : static_cast<int>(strlen(name));
+    const int refLength = static_cast<int>(strlen(string_ref));
+
+    if (nameLength != refLength) {
+      continue;
+    }
+    if (STRNCASECMP(name, string_ref, refLength) == 0) {
+      found = true;
+    }
+  }
+
+  return found;
+}
+
+// Parses the integer value of the first argument whose name starts with
+// `string_ref` ("name=value" or "namevalue") into *value.
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *text = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int refLength = static_cast<int>(strlen(string_ref));
+    if (STRNCASECMP(text, string_ref, refLength) != 0) {
+      continue;  // different option
+    }
+
+    // only convert when something follows the option name; a '=' directly
+    // after the name is skipped
+    if (refLength + 1 <= static_cast<int>(strlen(text))) {
+      const char *digits = text + refLength;
+      if (*digits == '=') {
+        ++digits;
+      }
+      *value = (T)atoi(digits);
+    }
+    return true;  // the first match wins
+  }
+
+  return false;
+}
+
+// Returns the integer value of option `string_ref`; the last matching
+// argument wins. Yields 0 if the option is absent or carries no value.
+// Prefix matching: the argument only has to start with `string_ref`.
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  int result = 0;  // value reported when the option never appears
+
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *text = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int refLength = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(text, string_ref, refLength) != 0) {
+      continue;  // different option
+    }
+
+    const bool hasValue = refLength + 1 <= static_cast<int>(strlen(text));
+    if (!hasValue) {
+      result = 0;  // bare option without a value
+      continue;
+    }
+
+    // skip an optional '=' between name and value
+    const char *digits = text + refLength;
+    if (*digits == '=') {
+      ++digits;
+    }
+    result = atoi(digits);
+  }
+
+  return result;
+}
+
+// Returns the float value of option `string_ref`; the last matching
+// argument wins. Yields 0 if the option is absent or carries no value.
+// Prefix matching: the argument only has to start with `string_ref`.
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  float result = 0;  // value reported when the option never appears
+
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *text = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int refLength = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(text, string_ref, refLength) != 0) {
+      continue;  // different option
+    }
+
+    const bool hasValue = refLength + 1 <= static_cast<int>(strlen(text));
+    if (!hasValue) {
+      result = 0.f;  // bare option without a value
+      continue;
+    }
+
+    // skip an optional '=' between name and value
+    const char *number = text + refLength;
+    if (*number == '=') {
+      ++number;
+    }
+    result = static_cast<float>(atof(number));
+  }
+
+  return result;
+}
+
+// Looks up option `string_ref` and returns, through *string_retval, a pointer
+// into argv to the text following "<name>=" (the last match wins).
+// *string_retval is NULL when the option is absent and "" when nothing
+// follows the name, where `length + 1` would point past the terminator.
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+  for (int i = 1; i < argc; i++) {
+    int string_start = stringRemoveDelimiter('-', argv[i]);
+    char *string_argv = const_cast<char *>(&argv[i][string_start]);
+    int length = static_cast<int>(strlen(string_ref));
+    if (!STRNCASECMP(string_argv, string_ref, length)) {
+      // skip the separator only if there is one to skip
+      int skip = (string_argv[length] != '\0') ? 1 : 0;
+      *string_retval = &string_argv[length + skip];
+      bFound = true;
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Locate a file by probing a fixed list of relative search directories.
+//!
+//! @return malloc()-allocated path of the first hit (the caller releases it
+//!         with free()), or 0 if no candidate could be opened
+//! @param filename         name of the file
+//! @param executable_path  optional path of the executable; may be 0
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> is a placeholder that is replaced with the name of the
+  // executable before a directory is probed
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",                                           // same dir
+      "./data/",                                      // same dir
+
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    // Strip the extension only when the name really ends in ".exe"
+    const size_t name_len = executable_name.size();
+    if (name_len >= 4 &&
+        executable_name.compare(name_len - 4, 4, ".exe") == 0) {
+      executable_name.resize(name_len - 4);
+    }
+
+#else
+    // Linux & OSX path delimiter
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // Substitute the <executable_name> placeholder, if this entry has one
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path == 0) {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+      path.replace(executable_name_pos, strlen("<executable_name>"),
+                   executable_name);
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp = NULL;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found: hand back a heap copy (kept for backwards compatibility)
+      char *file_path = static_cast<char *>(malloc(path.length() + 1));
+      if (file_path == NULL) {
+        // Out of memory: report it instead of copying through a null pointer
+        printf("\nerror: sdkFindFilePath: out of memory for <%s>\n", filename);
+        return 0;
+      }
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/cuda/HPC/image/FDTD3d/helper_timer.h b/src/cuda/HPC/image/FDTD3d/helper_timer.h
new file mode 100644
index 000000000..0614a7802
--- /dev/null
+++ b/src/cuda/HPC/image/FDTD3d/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include "exception.h"
+
+// Abstract stopwatch contract. The platform specific timers in this header
+// derive from it and are driven through the sdk*Timer() helper functions.
+class StopWatchInterface {
+ public:
+  // Construction and destruction are trivial; the destructor is virtual so
+  // a timer can be destroyed through a pointer to this interface.
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+  //! Begin a timing session
+  virtual void start() = 0;
+
+  //! End the current timing session
+  virtual void stop() = 0;
+
+  //! Clear the accumulated time and the session count
+  virtual void reset() = 0;
+
+  //! Time in msec. The implementations in this header report the time
+  //! accumulated over all finished sessions, plus the time elapsed in the
+  //! current session when the stop watch is still running
+  virtual float getTime() = 0;
+
+  //! Mean time per finished session, in msec.
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! StopWatchInterface implementation for Windows, based on the
+//! QueryPerformanceCounter() / QueryPerformanceFrequency() calls
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Default constructor: clears all counters and caches the tick frequency
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    // freq_set has just been initialised to false, so the frequency is
+    // queried once for every constructed object.
+    LARGE_INTEGER ticks_per_second;
+    QueryPerformanceFrequency(&ticks_per_second);
+
+    // Keep it as ticks per millisecond: tick deltas then divide to msec.
+    freq = static_cast<double>(ticks_per_second.QuadPart) / 1000.0;
+
+    // remember that the query has been made
+    freq_set = true;
+  }
+
+  //! Destructor, there is nothing to release
+  ~StopWatchWin() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero; a running timer restarts its measurement
+  //! from the moment of the call
+  inline void reset();
+
+  //! Time in msec. accumulated over all completed sessions; while the stop
+  //! watch is still running (i.e. there was no call to stop()) the time
+  //! elapsed in the current session is included as well
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables
+
+  //! Counter value captured by the most recent start(), or by reset() while
+  //! the timer was running
+  LARGE_INTEGER start_time;
+
+  //! Counter value captured by the most recent stop()
+  LARGE_INTEGER end_time;
+
+  //! Length of the last completed session in msec.; cleared again by
+  //! reset()
+  float diff_time;
+
+  //! Sum of the lengths of all completed sessions in msec.
+  float total_time;
+
+  //! true between start() and stop()
+  bool running;
+
+  //! Number of completed start()/stop() sessions, used for averaging
+  int clock_sessions;
+
+  //! Tick frequency in ticks per millisecond
+  double freq;
+
+  //! Set once freq holds a queried value
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Record the current counter value as the session start and mark running
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  QueryPerformanceCounter(&start_time);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement: store the length of this session in diff_time, add
+//! it to total_time and count one more completed session.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(&end_time);
+  // Tick delta divided by freq (ticks per msec.) gives the session length.
+  const double ticks = static_cast<double>(end_time.QuadPart) -
+                       static_cast<double>(start_time.QuadPart);
+  diff_time = static_cast<float>(ticks / freq);
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Zero the accumulated times and the session count. The running state is
+//! kept; a timer that is running restarts its measurement from now.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+
+  if (running) {
+    QueryPerformanceCounter(&start_time);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated over all completed sessions. If the stop watch
+//! is still running (i.e. there was no call to stop()) the time elapsed in
+//! the current session is added to the returned value; total_time itself is
+//! left unchanged.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  if (!running) {
+    // Only completed sessions contribute
+    return total_time;
+  }
+
+  // Measure the open session up to this moment
+  LARGE_INTEGER now;
+  QueryPerformanceCounter(&now);
+
+  const double ticks = static_cast<double>(now.QuadPart) -
+                       static_cast<double>(start_time.QuadPart);
+  return total_time + static_cast<float>(ticks / freq);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Mean length in msec. of the completed sessions; 0 if there are none
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  if (clock_sessions <= 0) return 0.0f;
+  return total_time / clock_sessions;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! StopWatchInterface implementation for Linux and Mac OSX, based on
+//! gettimeofday()
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Default constructor: the timer starts out stopped with zeroed counters
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0) {}
+
+  //! Destructor, there is nothing to release
+  virtual ~StopWatchLinux() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero; a running timer restarts its measurement
+  //! from the moment of the call
+  inline void reset();
+
+  //! Time in msec. accumulated over all completed sessions; while the stop
+  //! watch is still running (i.e. there was no call to stop()) the time
+  //! elapsed in the current session is included as well
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // helper functions
+
+  //! Time in msec. elapsed between start_time and the moment of the call
+  inline float getDiffTime();
+
+  // member variables
+
+  //! Time of day captured by the most recent start(), or by reset() while
+  //! the timer was running
+  struct timeval start_time;
+
+  //! Length of the last completed session in msec.
+  float diff_time;
+
+  //! Sum of the lengths of all completed sessions in msec.
+  float total_time;
+
+  //! true between start() and stop()
+  bool running;
+
+  //! Number of completed start()/stop() sessions, used for averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Record the current time of day as the session start and mark running
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  gettimeofday(&start_time, NULL);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Close the session: update diff_time, total_time and the session count
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  const float session_msec = getDiffTime();
+  diff_time = session_msec;
+  total_time += session_msec;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Zero the accumulated times and the session count. The running state is
+//! kept; a timer that is running restarts its measurement from now.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+
+  if (running) {
+    gettimeofday(&start_time, NULL);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated over all completed sessions. If the stop watch
+//! is still running (i.e. there was no call to stop()) the time elapsed in
+//! the current session is added to the returned value; total_time itself is
+//! left unchanged.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  if (!running) {
+    // Only completed sessions contribute
+    return total_time;
+  }
+
+  // A session is still open: add the time that has elapsed in it so far
+  // on top of the completed sessions
+  return total_time + getDiffTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Mean length in msec. of the completed sessions; 0 if there are none
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  if (clock_sessions <= 0) return 0.0f;
+  return total_time / clock_sessions;
+}
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. elapsed between start_time and the moment of the call
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  // whole seconds and the microsecond remainder, both scaled to msec.
+  const double from_seconds = 1000.0 * (now.tv_sec - start_time.tv_sec);
+  const double from_usec = 0.001 * (now.tv_usec - start_time.tv_usec);
+  return static_cast<float>(from_seconds + from_usec);
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true once *timer_interface holds a new timer, false if
+//! @param  timer_interface  (out, freed by sdkDeleteTimer) is itself NULL
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+  if (timer_interface == NULL) return false;  // nowhere to store the timer
+  // Derived-to-base conversion is implicit; reinterpret_cast is not needed.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = new StopWatchWin();
+#else
+  *timer_interface = new StopWatchLinux();
+#endif
+  return *timer_interface != NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer and reset the caller's pointer to NULL
+//! @return always true, also when there was no timer to delete
+//! @param  timer_interface  address of the timer pointer
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    delete timer;
+    *timer_interface = NULL;
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the timer held in *timer_interface; a NULL timer is left alone
+//! @return always true
+//! @param timer_interface  address of the timer pointer
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->start();
+  }
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the timer held in *timer_interface without resetting it
+//! @return always true
+//! @param timer_interface  address of the timer pointer; NULL timer ignored
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->stop();
+  }
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the counters of the timer held in *timer_interface
+//! @return always true
+//! @param timer_interface  address of the timer pointer; NULL timer ignored
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->reset();
+  }
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer divided by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param timer_interface  address of the timer pointer; 0 for a NULL timer
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer == NULL) {
+    // no timer, nothing to average
+    return 0.0f;
+  }
+
+  return timer->getAverageTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation.
+//! @param timer_interface  address of the timer pointer; 0 for a NULL timer
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer == NULL) {
+    return 0.0f;  // no timer to query
+  }
+
+  return timer->getTime();
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/Makefile b/src/cuda/HPC/image/dwtHaar1D/Makefile
new file mode 100644
index 000000000..b34033e13
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/Makefile
@@ -0,0 +1,8 @@
+# dwtHaar1D_hpc build settings, assigned before common.mk is included below
+EXECUTABLE := dwtHaar1D_hpc
+CUFILES := dwtHaar1D.cu
+CCFILES :=
+SM_VERSIONS := 75 80 90
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu
new file mode 100644
index 000000000..9a80481cc
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D.cu
@@ -0,0 +1,397 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+* 1D DWT for Haar wavelet and signals with a length which is a power of 2.
+* The code reduces bank conflicts and non-coalesced reads / writes as
+* appropriate but does not fully remove them because the computational
+* overhead to achieve this would outweigh the benefit (see inline comments
+* for more details).
+* Large signals are subdivided into sub-signals with 1024 elements and the
+* wavelet transform for these is computed with one block over 10 decomposition
+* levels. The resulting signal consisting of the approximation coefficients at
+* level X is then processed in a subsequent step on the device. This requires
+* interblock synchronization which is only possible on host side.
+* Detail coefficients which have been computed are not further referenced
+* during the decomposition so that they can be stored directly in their final
+* position in global memory. The transform and its storing scheme preserve
+* locality in the coefficients so that these writes are coalesced.
+* Approximation coefficients are stored in shared memory because they are
+* needed to compute the subsequent decomposition step. The top most
+* approximation coefficient for a sub-signal processed by one block is stored
+* in a special global memory location to simplify the processing after the
+* interblock synchronization.
+* Most books on wavelets explain the Haar wavelet decomposition. A good freely
+* available resource is the Wavelet primer by Stollnitz et al.
+* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
+* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
+* The basis of all wavelet transforms is to decompose a signal into
+* approximation (a) and detail (d) coefficients where the detail tends to be
+* small or zero which allows / simplifies compression. The following "graphs"
+* demonstrate the transform for a signal
+* of length eight. The index always describes the decomposition level where
+* a coefficient arises. The input signal is interpreted as approximation signal
+* at level 0. The coefficients computed on the device are stored in the same
+* scheme as in the example. This data structure is particularly well suited for
+* compression and also preserves the hierarchical structure of the
+decomposition.
+
+-------------------------------------------------
+| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+* Host code.
+*/
+
+#ifdef _WIN32
+#define NOMINMAX
+#endif
+
+// includes, system
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+// includes, project
+#include "helper_cuda.h"
+#include "helper_functions.h"
+
+// constants which are used in host and device code (macro body has no ';')
+#define INV_SQRT_2 0.70710678118654752440f
+const unsigned int LOG_NUM_BANKS = 4;
+const unsigned int NUM_BANKS     = 16;
+
+////////////////////////////////////////////////////////////////////////////////
+// includes, kernels
+#include "dwtHaar1D_kernel.cuh"
+
+////////////////////////////////////////////////////////////////////////////////
+// declaration, forward
+void runTest(int argc, char **argv);
+bool getLevels(unsigned int len, unsigned int *levels);
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+    runTest(argc, argv); // reads the arguments and performs the whole test
+    return 0;            // same status the implicit return produced
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Perform the wavelet decomposition: read the signal, run the Haar DWT on the
+//! device, write the result file and, if a gold file is given, compare the
+//! result against it. Exits the process when a file cannot be read or written.
+//! @param argc  number of command line arguments
+//! @param argv  command line arguments (--signal=, --result=, --gold=)
+////////////////////////////////////////////////////////////////////////////////
+void runTest(int argc, char **argv)
+{
+    bool bResult = false; // flag for final validation of the results
+
+    char      *s_fname = NULL, *r_gold_fname = NULL;
+    char       r_fname[256];
+    const char usage[] = {"\nUsage:\n"
+                          "  dwtHaar1D --signal=<signal_file> --result=<result_file> "
+                          "[--gold=<gold_file>]\n\n"
+                          "  <signal_file> Input file containing the signal\n"
+                          "  <result_file> Output file storing the result of the wavelet "
+                          "decomposition\n"
+                          "  <gold_file>   (Optional) Input file containing the reference result of the "
+                          "wavelet decomposition\n"
+                          "\nExample:\n"
+                          "  ./dwtHaar1D\n"
+                          "       --signal=signal.dat\n"
+                          "       --result=result.dat\n"
+                          "       --gold=regression.gold.dat\n"};
+
+    printf("%s Starting...\n\n", argv[0]);
+
+    // use command-line specified CUDA device, otherwise the highest Gflops/s one
+    findCudaDevice(argc, (const char **)argv);
+
+    // file names, either specified as cmd line args or use default
+    if (argc >= 3) {
+        char *tmp_sfname, *tmp_rfname, *tmp_goldfname;
+
+        if ((getCmdLineArgumentString(argc, (const char **)argv, "signal", &tmp_sfname) != true)
+            || (getCmdLineArgumentString(argc, (const char **)argv, "result", &tmp_rfname) != true)) {
+            fprintf(stderr, "Invalid input syntax.\n%s", usage);
+            exit(EXIT_FAILURE);
+        }
+
+        s_fname = sdkFindFilePath(tmp_sfname, argv[0]);
+        // r_fname is a fixed-size buffer: reject a name that would overflow it
+        if (strlen(tmp_rfname) >= sizeof(r_fname)) {
+            fprintf(stderr, "Result file name is too long.\n%s", usage);
+            exit(EXIT_FAILURE);
+        }
+        strcpy(r_fname, tmp_rfname);
+
+        // Gold file is optional
+        if (getCmdLineArgumentString(argc, (const char **)argv, "gold", &tmp_goldfname) == true) {
+            r_gold_fname = sdkFindFilePath(tmp_goldfname, argv[0]);
+        }
+    }
+    else {
+        s_fname      = sdkFindFilePath("signal.dat", argv[0]);
+        r_gold_fname = sdkFindFilePath("regression.gold.dat", argv[0]);
+        strcpy(r_fname, "result.dat");
+    }
+
+    // check s_fname before it is printed or opened; NULL must not reach "%s"
+    if (s_fname == NULL) {
+        fprintf(stderr, "Cannot find the file containing the signal.\n%s", usage);
+        exit(EXIT_FAILURE);
+    }
+
+    printf("source file    = \"%s\"\n", s_fname);
+    printf("reference file = \"%s\"\n", r_fname);
+    if (r_gold_fname != NULL) {
+        printf("gold file      = \"%s\"\n", r_gold_fname);
+    } else {
+        printf("gold file      = (none - validation skipped)\n");
+    }
+
+    // read in signal
+    unsigned int slength = 0;
+    float       *signal  = NULL;
+
+    if (sdkReadFile(s_fname, &signal, &slength, false) != true) {
+        exit(EXIT_FAILURE);
+    }
+    printf("Reading signal from \"%s\"\n", s_fname);
+
+    // get the number of decompositions necessary to perform a full decomposition
+    unsigned int dlevels_complete = 0;
+
+    if (true != getLevels(slength, &dlevels_complete)) {
+        fprintf(stderr, "Signal length not supported.\n");
+        // cleanup and abort
+        free(signal);
+        exit(EXIT_FAILURE);
+    }
+
+    // device in data
+    float *d_idata = NULL;
+    // device out data
+    float *d_odata = NULL;
+    // device approx_final data
+    float *approx_final = NULL;
+    // The very final approximation coefficient has to be written to the output
+    // data, all others are reused as input data in the next global step and
+    // therefore have to be written to the input data again. This flag selects
+    // where approx_final is copied: non-zero is input (d_idata), 0 is output
+    int approx_is_input;
+
+    // allocate device mem
+    const unsigned int smem_size = sizeof(float) * slength;
+    checkCudaErrors(cudaMalloc((void **)&d_idata, smem_size));
+    checkCudaErrors(cudaMalloc((void **)&d_odata, smem_size));
+    checkCudaErrors(cudaMalloc((void **)&approx_final, smem_size));
+    // copy input data to device
+    checkCudaErrors(cudaMemcpy(d_idata, signal, smem_size, cudaMemcpyHostToDevice));
+
+    // total number of threads: in the first decomposition step one thread
+    // computes the average and detail signal for one pair of adjacent values
+    unsigned int num_threads_total_left = slength / 2;
+    // decomposition levels performed in the current / next step
+    unsigned int dlevels_step = dlevels_complete;
+
+    // 1D signal so the arrangement of elements is also 1D
+    dim3 block_size;
+    dim3 grid_size;
+
+    // number of decomposition levels left after one iteration on the device
+    unsigned int dlevels_left = dlevels_complete;
+
+    // up to 1k elements can be processed in one block, which avoids the
+    // Wait-For-Idle (WFI) on host side needed when the work spans several blocks
+    if (dlevels_complete <= 10) {
+        // decomposition can be performed at once
+        block_size.x    = num_threads_total_left;
+        approx_is_input = 0;
+    }
+    else {
+        // 512 threads per block
+        grid_size.x  = (num_threads_total_left / 512);
+        block_size.x = 512;
+
+        // 512 threads corresponds to 10 decomposition steps
+        dlevels_step = 10;
+        dlevels_left -= 10;
+
+        approx_is_input = 1;
+    }
+
+    // Initialize d_odata to 0.0f
+    initValue<<<grid_size, block_size>>>(d_odata, 0.0f);
+    checkCudaErrors(cudaGetLastError()); // report a rejected launch
+
+    // do until full decomposition is accomplished
+    while (0 != num_threads_total_left) {
+        // double the number of threads as bytes
+        unsigned int mem_shared = (2 * block_size.x) * sizeof(float);
+        // extra memory requirements to avoid bank conflicts
+        mem_shared += ((2 * block_size.x) / NUM_BANKS) * sizeof(float);
+
+        // run kernel
+        dwtHaar1D<<<grid_size, block_size, mem_shared>>>(
+            d_idata, d_odata, approx_final, dlevels_step, num_threads_total_left, block_size.x);
+        checkCudaErrors(cudaGetLastError()); // report a rejected launch
+
+        // Copy approx_final to appropriate location
+        if (approx_is_input) {
+            checkCudaErrors(cudaMemcpy(d_idata, approx_final, grid_size.x * sizeof(float), cudaMemcpyDeviceToDevice));
+        }
+        else {
+            checkCudaErrors(cudaMemcpy(d_odata, approx_final, grid_size.x * sizeof(float), cudaMemcpyDeviceToDevice));
+        }
+
+        // update level variables
+        if (dlevels_left < 10) {
+            approx_is_input = 0;
+        }
+
+        // more global steps necessary
+        dlevels_step = (dlevels_left > 10) ? dlevels_left - 10 : dlevels_left;
+        dlevels_left -= 10;
+
+        // each step halves the threads in use, so 10 steps leave 2^10 fewer
+        num_threads_total_left = num_threads_total_left >> 10;
+
+        // update block and grid size; ?: binds looser than +, hence the parentheses
+        grid_size.x = (num_threads_total_left / 512) + ((0 != (num_threads_total_left % 512)) ? 1 : 0);
+
+        if (grid_size.x <= 1) {
+            block_size.x = num_threads_total_left;
+        }
+    }
+
+    // get the result back from the device: allocate host memory and copy
+    float *odata = (float *)malloc(smem_size);
+    checkCudaErrors(cudaMemcpy(odata, d_odata, smem_size, cudaMemcpyDeviceToHost));
+
+    // post processing
+    // write file for regression test
+    if (r_fname[0] == '\0') {
+        fprintf(stderr,
+                "Cannot write the output file storing the result of the wavelet "
+                "decomposition.\n%s",
+                usage);
+        exit(EXIT_FAILURE);
+    }
+
+    if (sdkWriteFile(r_fname, odata, slength, 0.001f, false) != true) {
+        exit(EXIT_FAILURE);
+    }
+    printf("Writing result to \"%s\"\n", r_fname);
+
+    // load the reference solution
+    unsigned int len_reference = 0;
+    float       *reference     = NULL;
+
+    // Validation is optional - only run if gold file is provided
+    if (r_gold_fname != NULL) {
+        if (sdkReadFile(r_gold_fname, &reference, &len_reference, false) != true) {
+            fprintf(stderr, "Failed to read gold file \"%s\"\n", r_gold_fname);
+            exit(EXIT_FAILURE);
+        }
+        printf("Reading reference result from \"%s\"\n", r_gold_fname);
+
+        // compare only equally long data; a mismatch stays a failure (bResult false)
+        if (slength == len_reference) {
+            bResult = (bool)sdkCompareL2fe(reference, odata, slength, 0.001f);
+        }
+        else {
+            fprintf(stderr, "Gold file length does not match the signal length.\n");
+        }
+        free(reference);
+    } else {
+        printf("No gold file provided - skipping validation\n");
+        bResult = true; // Mark as success when validation is skipped
+    }
+
+    // free allocated host and device memory
+    checkCudaErrors(cudaFree(d_odata));
+    checkCudaErrors(cudaFree(d_idata));
+    checkCudaErrors(cudaFree(approx_final));
+
+    free(signal);
+    free(odata);
+    free(s_fname);
+    // note whether validation ran before the gold file name is released
+    const bool validated = (r_gold_fname != NULL);
+    free(r_gold_fname); // free(NULL) is a no-op
+
+    if (validated) {
+        printf(bResult ? "Test success!\n" : "Test failure!\n");
+    } else {
+        printf("Processing complete (validation skipped)\n");
+    }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Get number of decomposition levels to perform a full decomposition
+//! Also check if the input signal size is suitable
+//! @return  true if the number of decomposition levels could be determined
+//!          and the signal length is supported by the implementation,
+//!          otherwise false
+//! @param   len  length of input signal
+//! @param   levels  number of decomposition levels necessary to perform a full
+//!           decomposition
+////////////////////////////////////////////////////////////////////////////////
+bool getLevels(unsigned int len, unsigned int *levels)
+{
+    // only exact powers of two from 2^0 through 2^19 are accepted: the
+    // exponent is tested against an exclusive upper bound of 20
+    const unsigned int max_exponent = 20;
+    for (unsigned int exponent = 0; exponent < max_exponent; ++exponent) {
+        if ((1u << exponent) == len) {
+            *levels = exponent;
+            return true;
+        }
+    }
+
+    // unsupported length: *levels is left untouched
+    return false;
+}
diff --git a/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh
new file mode 100644
index 000000000..3bf1e6588
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/dwtHaar1D_kernel.cuh
@@ -0,0 +1,248 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+* 1D DWT for Haar wavelet and signals with a length which is a power of 2.
+* The code reduces bank conflicts and non-coalesced reads / writes as
+* appropriate but does not fully remove them because the computational
+* overhead to achieve this would outweigh the benefit (see inline comments
+* for more details).
+* Large signals are subdivided into sub-signals with 1024 elements and the
+* wavelet transform for these is computed with one block over 10 decomposition
+* levels. The resulting signal consisting of the approximation coefficients at
+* level X is then processed in a subsequent step on the device. This requires
+* interblock synchronization which is only possible on host side.
+* Detail coefficients which have been computed are not further referenced
+* during the decomposition so that they can be stored directly in their final
+* position in global memory. The transform and its storing scheme preserve
+* locality in the coefficients so that these writes are coalesced.
+* Approximation coefficients are stored in shared memory because they are
+* needed to compute the subsequent decomposition step. The top most
+* approximation coefficient for a sub-signal processed by one block is stored
+* in a special global memory location to simplify the processing after the
+* interblock synchronization.
+* Most books on wavelets explain the Haar wavelet decomposition. A good freely
+* available resource is the Wavelet primer by Stollnitz et al.
+* http://grail.cs.washington.edu/projects/wavelets/article/wavelet1.pdf
+* http://grail.cs.washington.edu/projects/wavelets/article/wavelet2.pdf
+* The basis of all wavelet transforms is to decompose a signal into
+* approximation (a) and detail (d) coefficients where the detail tends to be
+* small or zero which allows / simplifies compression. The following "graphs"
+* demonstrate the transform for a signal
+* of length eight. The index always describes the decomposition level where
+* a coefficient arises. The input signal is interpreted as approximation signal
+* at level 0. The coefficients computed on the device are stored in the same
+* scheme as in the example. This data structure is particularly well suited for
+* compression and also preserves the hierarchical structure of the
+decomposition.
+
+-------------------------------------------------
+| a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 | a_0 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_1 | a_1 | a_1 | a_1 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_2 | a_2 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+-------------------------------------------------
+| a_3 | d_3 | d_2 | d_2 | d_1 | d_1 | d_1 | d_1 |
+-------------------------------------------------
+
+* Device Code.
+*/
+
+#ifndef _DWTHAAR1D_KERNEL_H_
+#define _DWTHAAR1D_KERNEL_H_
+
+#include <cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+////////////////////////////////////////////////////////////////////////////////
+//! Store one value per launched thread into od.
+//! @param od  output data, indexed by the global 1D thread index
+//! @param value  value to store
+////////////////////////////////////////////////////////////////////////////////
+__global__ void initValue(float *od, float value)
+{
+    // each thread writes the element at its global 1D thread index
+    const unsigned int pos = threadIdx.x + (blockDim.x * blockIdx.x);
+    od[pos] = value;
+
+    // Handle to thread block group
+    cg::thread_block block = cg::this_thread_block();
+    // block-wide barrier kept from the original code; no thread of this
+    // kernel reads an element written by another thread
+    block.sync();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Compute partial wavelet decomposition on the GPU using Haar basis
+//! For each thread block the full decomposition is computed but these results
+//! have to be combined
+//! Each thread first processes one pair of adjacent input values
+//! @param id  input data
+//! @param od  output data
+//! @param approx_final  place to store the final approximation coefficient for
+//!                      the subsignal (one entry per block, for any dlevels)
+//! @param dlevels  number of decomposition levels for this transform
+//! @param slength_step_half   half signal length for current decomposition
+//!                            level (offset for storing detail coefficients in
+//!                            global memory)
+//! @param bdim  block dimension
+////////////////////////////////////////////////////////////////////////////////
+__global__ void dwtHaar1D(float             *id,
+                          float             *od,
+                          float             *approx_final,
+                          const unsigned int dlevels,
+                          const unsigned int slength_step_half,
+                          const int          bdim)
+{
+    // Handle to thread block group
+    cg::thread_block cta = cg::this_thread_block();
+
+    // shared memory for part of the signal
+    extern __shared__ float shared[];
+
+    // thread runtime environment, 1D parametrization
+    const int gdim = gridDim.x;
+    // const int bdim = blockDim.x;
+    const int bid = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    // global thread id (w.r.t. to total data set)
+    const int    tid_global = (bid * bdim) + tid;
+    unsigned int idata      = (bid * (2 * bdim)) + tid;
+
+    // read data from global memory
+    shared[tid]        = id[idata];
+    shared[tid + bdim] = id[idata + bdim];
+    cg::sync(cta);
+
+    // this operation has a two way bank conflicts for all threads, this are two
+    // additional cycles for each warp -- all alternatives to avoid this bank
+    // conflict are more expensive than the one cycle introduced by serialization
+    float data0 = shared[2 * tid];
+    float data1 = shared[(2 * tid) + 1];
+    cg::sync(cta);
+
+    // detail coefficient, not further referenced so directly store in
+    // global memory
+    od[tid_global + slength_step_half] = (data0 - data1) * INV_SQRT_2;
+
+    // offset to avoid bank conflicts
+    // see the scan example for a more detailed description
+    unsigned int atid = tid + (tid >> LOG_NUM_BANKS);
+
+    // approximation coefficient
+    // store in shared memory for further decomposition steps in this global step
+    shared[atid] = (data0 + data1) * INV_SQRT_2;
+
+    // all threads have to write approximation coefficient to shared memory before
+    // next steps can take place
+    cg::sync(cta);
+
+    // early out if possible
+    // the compiler removes this part from the source because dlevels is
+    // a constant shader input
+    // note: syncthreads in bodies of branches can lead to dead-locks unless
+    // the condition evaluates the same way for ALL threads of a block, as in
+    // this case
+    if (dlevels > 1) {
+        // offset to second element in shared element which has to be used for the
+        // decomposition, effectively 2^(i - 1)
+        unsigned int offset_neighbor = 1;
+        // number of active threads per decomposition level
+        // identical to the offset for the detail coefficients
+        unsigned int num_threads = bdim >> 1;
+
+        // index for the first element of the pair to process
+        // the representation is still compact (and therefore still tid * 2)
+        // because the first step operated on registers and only the result has been
+        // written to shared memory
+        unsigned int idata0 = tid * 2;
+
+        // offset levels to make the loop more efficient
+        for (unsigned int i = 1; i < dlevels; ++i) {
+            // Non-coalesced writes occur if the number of active threads becomes
+            // less than 16 for a block because the start address for the first
+            // block is not always aligned with 64 byte which is necessary for
+            // coalesced access. However, the problem only occurs at high levels
+            // with only a small number of active threads so that the total number of
+            // non-coalesced access is rather small and does not justify the
+            // computations which are necessary to avoid these uncoalesced writes
+            // (this has been tested and verified)
+            if (tid < num_threads) {
+                // update stride, with each decomposition level the stride grows by a
+                // factor of 2
+                unsigned int idata1 = idata0 + offset_neighbor;
+
+                // position of write into global memory
+                unsigned int g_wpos = (num_threads * gdim) + (bid * num_threads) + tid;
+
+                // compute wavelet decomposition step
+
+                // offset to avoid bank conflicts
+                unsigned int c_idata0 = idata0 + (idata0 >> LOG_NUM_BANKS);
+                unsigned int c_idata1 = idata1 + (idata1 >> LOG_NUM_BANKS);
+
+                // detail coefficient, not further modified so directly store
+                // in global memory
+                od[g_wpos] = (shared[c_idata0] - shared[c_idata1]) * INV_SQRT_2;
+
+                // approximation coefficient
+                // note that the representation in shared memory becomes rather sparse
+                // (with a lot of holes inbetween) but the storing scheme in global
+                // memory guarantees that the common representation (approx, detail_0,
+                // detail_1, ...)
+                // is achieved
+                shared[c_idata0] = (shared[c_idata0] + shared[c_idata1]) * INV_SQRT_2;
+
+                // update storage offset for details
+                num_threads = num_threads >> 1; // div 2
+                offset_neighbor <<= 1;          // mul 2
+                idata0 = idata0 << 1;           // mul 2
+            }
+
+            // sync after each decomposition step
+            cg::sync(cta);
+        }
+    } // end early out if possible
+
+    // write the top most level element for the next decomposition steps
+    // which are performed after an interlock synchronization on host side;
+    // done for dlevels == 1 too, where shared[0] was set by the first step
+    if (0 == tid) {
+        approx_final[bid] = shared[0];
+    }
+}
+
+#endif // #ifndef _DWTHAAR1D_KERNEL_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/exception.h b/src/cuda/HPC/image/dwtHaar1D/exception.h
new file mode 100644
index 000000000..ca8ac2525
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Wrapper deriving from a standard exception type; its constructors are
+//! private and it is raised via throw_it(). @param Std_Exception  std base class
+template <typename Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! Build the message from the location and description, then throw.
+  //! @return never returns normally: always throws Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! Overload of throw_it() taking the description as a std::string.
+  //! @return never returns normally: always throws Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor (declared non-throwing)
+  virtual ~Exception() throw();
+
+ private:
+  //! Default constructor (private)
+  Exception();
+
+  //! Constructor taking the message text (private)
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Report an exception on stderr and terminate the process with EXIT_FAILURE.
+//! @param ex  exception to report; only its what() member is used
+////////////////////////////////////////////////////////////////////////////////
+template <typename Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  // print the message (std::endl also flushes) before the process ends
+  std::cerr << ex.what() << std::endl;
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros: each one calls throw_it() of the matching Exception<>
+//! instantiation with the __FILE__ and __LINE__ of the place it is used.
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Out of range exception
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Format the location and description into a message and throw it.
+//! @param file, line  location of the failing code; @param detailed  details
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <typename Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  // Quite heavy-weight, but exceptions are not for
+  // performance / release versions
+  std::ostringstream message;
+  message << "Exception in file '" << file << "' in line " << line << "\n";
+  message << "Detailed description: " << detailed << "\n";
+
+  const std::string text = message.str();
+  throw Exception(text);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! std::string overload: forwards to the const char * version of throw_it().
+//! @param file, line  location of the failing code; @param msg  details
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <typename Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  Exception<Std_Exception>::throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Default constructor (private): the base class gets a fixed placeholder text.
+////////////////////////////////////////////////////////////////////////////////
+template <typename Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor (private) taking the message text; called by throw_it().
+//! @param s  string passed to the base class, i.e. the text of what()
+////////////////////////////////////////////////////////////////////////////////
+template <typename Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor: empty body, declared non-throwing with throw().
+////////////////////////////////////////////////////////////////////////////////
+template <typename Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h b/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h
new file mode 100644
index 000000000..723b28f7a
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/helper_cuda.h
@@ -0,0 +1,988 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample is required to include the proper header files.
+// Please refer to the CUDA examples for the CUDA headers that are needed,
+// which may change depending on which CUDA functions are used.
+
+// CUDA Runtime API errors: the name comes straight from cudaGetErrorName()
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors: returns the name filled in by cuGetErrorName()
+static const char *_cudaGetErrorEnum(CUresult error) {
+  // Substitute a placeholder when no name was written back.
+  const char *name = NULL;
+  cuGetErrorName(error, &name);
+  return (name != NULL) ? name : "<unknown>";
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+
+    case CUFFT_MISSING_DEPENDENCY:
+      return "CUFFT_MISSING_DEPENDENCY";
+
+    case CUFFT_NVRTC_FAILURE:
+      return "CUFFT_NVRTC_FAILURE";
+
+    case CUFFT_NVJITLINK_FAILURE:
+      return "CUFFT_NVJITLINK_FAILURE";
+
+    case CUFFT_NVSHMEM_FAILURE:
+      return "CUFFT_NVSHMEM_FAILURE";
+  }
+
+  return "<unknown>";  // result value not listed in the switch above
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";  // no trailing space, like the other names
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors: returns the enumerator's name as a string literal
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors: returns a name string; the case list is gated on NPP_VERSION_*
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";  // status value not listed in the switch above
+}
+#endif
+
+template <typename T>  // T: a status type that has a _cudaGetErrorEnum() overload
+void check(T result, char const *const func, const char *const file,
+           int const line) {
+  if (result) {  // any non-zero status is reported and terminates the program
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<int>(result), _cudaGetErrorEnum(result), func);  // int matches %d
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  // Fetch the last runtime error; nothing to report on success.
+  const cudaError_t lastError = cudaGetLastError();
+  if (lastError == cudaSuccess) {
+    return;
+  }
+  // Report where the check was made and what failed, then terminate.
+  fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(lastError),
+          cudaGetErrorString(lastError));
+  exit(EXIT_FAILURE);
+}
+
+// This only prints the proper error string when calling cudaGetLastError;
+// it does not exit the program in case an error is detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  // Fetch the last runtime error; stay silent on success.
+  const cudaError_t lastError = cudaGetLastError();
+  if (lastError == cudaSuccess) {
+    return;
+  }
+  // Report only: no exit() here, so the caller keeps running.
+  fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(lastError),
+          cudaGetErrorString(lastError));
+}
+#endif
+
+#ifndef MAX  // note: arguments are evaluated more than once; avoid side effects
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+// Float to int conversion: shift by half a unit away from zero, then truncate
+inline int ftoi(float value) {
+  const double bias = (value >= 0) ? 0.5 : -0.5;
+  return static_cast<int>(value + bias);
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Lookup table keyed by SM version, giving the number of cores per SM.
+  // Keys are encoded as 0xMm: M = SM major version, m = SM minor version.
+  typedef struct {
+    int SM;
+    int Cores;
+  } sSMtoCores;
+
+  // The {-1, -1} entry terminates the table.
+  sSMtoCores nGpuArchCoresPerSM[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60,  64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70,  64},
+      {0x72,  64},
+      {0x75,  64},
+      {0x80,  64},
+      {0x86, 128},
+      {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xa3, 128},
+      {0xb0, 128},
+      {0xc0, 128},
+      {0xc1, 128},
+      {-1, -1}};
+
+  const int smKey = (major << 4) + minor;
+  int idx;
+
+  for (idx = 0; nGpuArchCoresPerSM[idx].SM != -1; ++idx) {
+    if (nGpuArchCoresPerSM[idx].SM == smKey) {
+      return nGpuArchCoresPerSM[idx].Cores;
+    }
+  }
+
+  // Unknown SM version: idx now points at the terminator, so fall back to
+  // the entry just before it (the last real row of the table) and say so.
+  const int fallbackCores = nGpuArchCoresPerSM[idx - 1].Cores;
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, fallbackCores);
+  return fallbackCores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Lookup table keyed by SM version, giving the GPU architecture name.
+  // Keys are encoded as 0xMm: M = SM major version, m = SM minor version.
+  typedef struct {
+    int SM;
+    const char* name;
+  } sSMtoArchName;
+
+  // The entry with SM == -1 terminates the table.
+  sSMtoArchName nGpuArchNameSM[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"},
+      {0xa1, "Blackwell"},
+      {0xa3, "Blackwell"},
+      {0xb0, "Blackwell"},
+      {0xc0, "Blackwell"},
+      {0xc1, "Blackwell"},
+      {-1, "Graphics Device"}};
+
+  const int smKey = (major << 4) + minor;
+  int idx;
+
+  for (idx = 0; nGpuArchNameSM[idx].SM != -1; ++idx) {
+    if (nGpuArchNameSM[idx].SM == smKey) {
+      return nGpuArchNameSM[idx].name;
+    }
+  }
+
+  // Unknown SM version: idx now points at the terminator, so fall back to
+  // the entry just before it (the last real row of the table) and say so.
+  const char* fallbackName = nGpuArchNameSM[idx - 1].name;
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, fallbackName);
+  return fallbackName;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// Validates devID, makes it the current CUDA device and returns it (< 0 on failure)
+inline int gpuDeviceInit(int devID) {
+  int device_count;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  if (devID < 0) {
+    devID = 0;  // negative requests fall back to device 0
+  }
+
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;  // devID >= 1 on this path, so the result is negative
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, _ConvertSMVer2ArchName(major, minor));
+
+  return devID;
+}
+
+// Returns the ID of the non-prohibited GPU with the largest SMs * cores/SM * clock
+inline int gpuGetMaxGflopsDeviceId() {
+  int current_device = 0, sm_per_multiproc = 0;
+  int max_perf_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  uint64_t max_compute_perf = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the best CUDA capable GPU device
+  current_device = 0;
+
+  while (current_device < device_count) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // If this GPU is not running on Compute Mode prohibited,
+    // then rank it by its estimated compute performance
+    if (computeMode != cudaComputeModeProhibited) {
+      if (major == 9999 && minor == 9999) {
+        sm_per_multiproc = 1;
+      } else {
+        sm_per_multiproc =
+            _ConvertSMVer2Cores(major,  minor);
+      }
+      int multiProcessorCount = 0, clockRate = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
+      cudaError_t result = cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device);
+      if (result != cudaSuccess) {
+        // If cudaDevAttrClockRate attribute is not supported we
+        // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+        if(result == cudaErrorInvalidValue) {
+          clockRate = 1;
+        }
+        else {
+          fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+            static_cast<unsigned int>(result), _cudaGetErrorEnum(result));
+          exit(EXIT_FAILURE);
+        }
+      }
+      uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
+
+      if (compute_perf > max_compute_perf) {  // strict '>': the first device wins a tie
+        max_compute_perf = compute_perf;
+        max_perf_device = current_device;
+      }
+    } else {
+      devices_prohibited++;
+    }
+
+    ++current_device;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return max_perf_device;
+}
+
+// Select the CUDA device to run on: the one named by the "device" command-line
+// option when present, otherwise the one ranked best by gpuGetMaxGflopsDeviceId().
+inline int findCudaDevice(int argc, const char **argv) {
+  if (checkCmdLineFlag(argc, argv, "device")) {
+    // Explicit request: reject negative indices, then initialize that device.
+    const int requested = getCmdLineArgumentInt(argc, argv, "device=");
+    if (requested < 0) {
+      printf("Invalid command line parameter\n ");
+      exit(EXIT_FAILURE);
+    }
+
+    const int initialized = gpuDeviceInit(requested);
+    if (initialized < 0) {
+      printf("exiting...\n");
+      exit(EXIT_FAILURE);
+    }
+    return initialized;
+  }
+
+  // No explicit request: pick the device with the highest Gflops/s estimate
+  // and make it the current device.
+  const int devID = gpuGetMaxGflopsDeviceId();
+  checkCudaErrors(cudaSetDevice(devID));
+  int major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+
+  // Announce the selection together with its compute capability.
+  printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+         devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+  return devID;
+}
+
+inline int findIntegratedGPU() {  // selects and returns the first usable integrated GPU
+  int current_device = 0;
+  int device_count = 0;
+  int devices_prohibited = 0;
+
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  // Find the integrated GPU which is compute capable
+  while (current_device < device_count) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+    // If GPU is integrated and is not running on Compute Mode prohibited,
+    // then cuda can map to GLES resource
+    if (integrated && (computeMode != cudaComputeModeProhibited)) {
+      checkCudaErrors(cudaSetDevice(current_device));
+
+      int major = 0, minor = 0;
+      checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+      checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+      printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+             current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+      return current_device;
+    } else {
+      devices_prohibited++;  // also counts devices that are simply not integrated
+    }
+
+    current_device++;
+  }
+
+  if (devices_prohibited == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;  // NOTE(review): looks unreachable, every skipped device is counted above
+}
+
+// Report whether the current device's SM version is at least the requested one
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  int major = 0;
+  int minor = 0;
+  checkCudaErrors(cudaGetDevice(&dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  // Compare (major, minor) against the requirement, major version first.
+  const bool newerMajor = major > major_version;
+  const bool sameMajorEnoughMinor =
+      (major == major_version) && (minor >= minor_version);
+  if (!(newerMajor || sameMajorEnoughMinor)) {
+    printf("  No GPU device was found that can support CUDA compute capability %d.%d.\n",
+           major_version, minor_version);
+    return false;
+  }
+
+  printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+         _ConvertSMVer2ArchName(major, minor), major, minor);
+  return true;
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_functions.h b/src/cuda/HPC/image/dwtHaar1D/helper_functions.h
new file mode 100644
index 000000000..f71da5761
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include "helper_image.h"  // helper functions for image compare, dump, data comparisons
+#include "helper_string.h"  // helper functions for string parsing
+#include "helper_timer.h"   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_image.h b/src/cuda/HPC/image/dwtHaar1D/helper_image.h
new file mode 100644
index 000000000..7e81b0cd8
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/helper_image.h
@@ -0,0 +1,1001 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) ((a < b) ? a : b)
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a > b) ? a : b)
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include "helper_string.h"
+
+// Internal helpers (a named namespace, not part of the public sdk* API)
+namespace helper_image_internal {
+//! Size in bytes of the line buffer used while parsing PGM/PPM headers
+const unsigned int PGMHeaderSize = 0x40;
+
+//! Converts an unsigned byte sample to type T (specialised per T below)
+template <class T>
+struct ConverterFromUByte;
+
+//! unsigned byte -> unsigned byte (identity)
+template <>
+struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator (pass-through; previously detoured through float,
+  //! which represents every byte value exactly, so results are unchanged)
+  //! @return val unchanged
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) const { return val; }
+};
+
+//! unsigned byte -> float, mapping 0..255 onto 0.0f..1.0f
+template <>
+struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return val / 255.0f
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) const {
+    return static_cast<float>(val) / 255.0f;
+  }
+};
+
+//! Converts a sample of type T to an unsigned byte (specialised per T below)
+template <class T>
+struct ConverterToUByte;
+
+//! unsigned byte -> unsigned byte (identity)
+template <>
+struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (pass-through)
+  //! @return val unchanged
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) const { return val; }
+};
+
+//! float -> unsigned byte, mapping 0.0f..1.0f onto 0..255 (truncating)
+template <>
+struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return val * 255 truncated to a byte; out-of-range input saturates
+  //! @param  val  value to convert
+  unsigned char operator()(const float &val) const {
+    const float scaled = val * 255.0f;
+    // Clamp first: converting an out-of-range float to unsigned char is UB.
+    if (!(scaled > 0.0f)) return 0;  // also catches NaN
+    return scaled >= 255.0f ? 255 : static_cast<unsigned char>(scaled);
+  }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef FOPEN  // here FOPEN(...) evaluates to the value fopen_s() returns
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL  // takes the value of FOPEN(...); nonzero means failure
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else  // every other platform: plain fopen() / sscanf()
+#ifndef FOPEN  // here FOPEN(...) evaluates to the FILE * stored in fHandle
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL  // takes the value of FOPEN(...); NULL means failure
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif  // Windows vs. other platforms
+
+// Load a binary PGM (P5) / PPM (P6) file as interleaved unsigned bytes.
+// If *data is NULL a buffer is malloc()'ed (caller frees) and *w / *h are
+// set; otherwise *w x *h must match the file. *channels becomes 1 (P5),
+// 3 (P6) or 0 (unknown magic). Returns false on open/parse/size/read errors.
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+  FILE *fp = NULL;
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    fclose(fp);  // every exit from here on must release the handle
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    fclose(fp);
+    return false;
+  }
+
+  // parse header: width, height and maxval may be split across lines
+  unsigned int width = 0, height = 0, maxval = 0;
+  unsigned int i = 0;  // number of header fields parsed so far
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      fclose(fp);
+      return false;
+    }
+    if (header[0] == '#') continue;  // comment line
+
+    int got = (i == 0)   ? SSCANF(header, "%u %u %u", &width, &height, &maxval)
+              : (i == 1) ? SSCANF(header, "%u %u", &height, &maxval)
+                         : SSCANF(header, "%u", &maxval);
+    // EOF (-1) from a blank line must not be added to the unsigned counter
+    if (got > 0) i += got;
+  }
+
+  const size_t size = static_cast<size_t>(width) * height * *channels;
+  const bool allocated = (NULL == *data);
+  bool ok = true;
+
+  if (allocated) {
+    *data = (unsigned char *)malloc(sizeof(unsigned char) * size);
+    *w = width;
+    *h = height;
+    ok = (*data != NULL);
+  } else if (*w != width || *h != height) {
+    // caller's buffer is sized for *w x *h: reading would overrun it
+    std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+    ok = false;
+  }
+
+  // read and close file; a short read means the pixel data is truncated
+  if (ok && fread(*data, sizeof(unsigned char), size, fp) != size) {
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+    ok = false;
+  }
+  fclose(fp);
+  if (!ok && allocated) { free(*data); *data = NULL; }
+  return ok;
+}
+
+// Load a PGM/PPM file and convert every byte sample to T with
+// ConverterFromUByte<T>. A NULL *data is malloc()'ed here (caller frees);
+// *w / *h receive the image size. Returns false if loading or malloc fails.
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+  unsigned char *idata = NULL;
+  unsigned int channels;
+  if (true != __loadPPM(file, &idata, w, h, &channels)) {
+    return false;
+  }
+  unsigned int size = *w * *h * channels;
+
+  // allocate the output only when the caller did not supply a buffer
+  if (NULL == *data) {
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * size));
+  }
+
+  const bool ok = (NULL != *data);  // malloc above may have failed
+  if (ok) {  // copy and cast data
+    std::transform(idata, idata + size, *data,
+                   helper_image_internal::ConverterFromUByte<T>());
+  }
+  free(idata);
+  return ok;
+}
+
+// Load a 3-channel (P6) image and pad it to 4 samples per pixel (4th = 0).
+// *data is always malloc()'ed here (caller frees); false on any failure.
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  if (!__loadPPM(file, &idata, w, h, &channels)) {
+    free(idata);
+    return false;
+  }
+  const int size = *w * *h;
+  T *out = NULL;
+  if (channels == 3) {  // a 1-channel image would be over-read below
+    out = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+  }
+  // Written through T *: the old unsigned char * alias of *data compiled
+  // only for T = unsigned char, where the stored bytes are the same.
+  for (int i = 0; out != NULL && i < size; i++) {
+    out[4 * i + 0] = static_cast<T>(idata[3 * i + 0]);
+    out[4 * i + 1] = static_cast<T>(idata[3 * i + 1]);
+    out[4 * i + 2] = static_cast<T>(idata[3 * i + 2]);
+    out[4 * i + 3] = static_cast<T>(0);  // pad 4th component
+  }
+  free(idata);
+  if (out != NULL) *data = out;
+  return out != NULL;
+}
+
+// Write `data` (w * h * channels bytes) as a binary PGM (channels == 1) or
+// PPM (channels == 3) file. Returns false when the file cannot be opened,
+// the channel count is unsupported, or the write fails.
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+  // A failed open sets failbit, not badbit, so bad() never noticed it.
+  if (!fh.is_open()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+  // one unformatted bulk write instead of a formatted insertion per byte
+  fh.write(reinterpret_cast<const char *>(data),
+           static_cast<std::streamsize>(w) * h * channels);
+  fh.flush();
+
+  if (!fh) {  // true for failbit as well as badbit
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+  fh.close();
+  return true;
+}
+
+// Save a w x h single-channel image as a binary PGM file; every sample is
+// converted to a byte with ConverterToUByte<T>. Returns __savePPM()'s result.
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+  const unsigned int count = w * h;
+  unsigned char *bytes =
+      static_cast<unsigned char *>(malloc(sizeof(unsigned char) * count));
+  helper_image_internal::ConverterToUByte<T> to_byte;
+  for (unsigned int i = 0; i < count; ++i) {
+    bytes[i] = to_byte(data[i]);
+  }
+
+  const bool saved = __savePPM(file, bytes, w, h, 1);
+  free(bytes);
+  return saved;
+}
+
+// Save a w x h image stored with 4 bytes per pixel as a binary PPM file;
+// the 4th byte of every pixel is dropped. Returns __savePPM()'s result.
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+  const int pixel_count = w * h;
+  unsigned char *rgb = static_cast<unsigned char *>(
+      malloc(sizeof(unsigned char) * pixel_count * 3));
+
+  for (int px = 0; px < pixel_count; ++px) {
+    const unsigned char *src = data + static_cast<size_t>(px) * 4;
+    unsigned char *dst = rgb + static_cast<size_t>(px) * 3;
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+  }
+  const bool saved = __savePPM(file, rgb, w, h, 3);
+  free(rgb);
+  return saved;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read whitespace-separated numbers from text file \filename
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  if *data is NULL it is malloc()'ed here (caller frees) and
+//!        filled; otherwise it must already hold exactly *len elements
+//! @param len  number of data elements in data (set when *data was NULL)
+//! @param verbose  currently unused
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+
+  // open file for reading
+  FILE *fh = NULL;
+
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // Loop on fscanf()'s result instead of feof(): this stops at malformed
+  // input rather than spinning forever, and keeps the last value of a file
+  // without a trailing newline. "%f" needs a float *, so parse into a float
+  // and convert to T afterwards.
+  float token = 0.0f;
+
+  while (fscanf(fh, "%f", &token) == 1) {
+    data_read.push_back(static_cast<T>(token));
+  }
+
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  if (data_read.empty()) return true;  // nothing to copy; front() is invalid
+  if (NULL == *data) return false;     // allocation failed
+  memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read one block of binary file \filename into a freshly allocated buffer
+//! @return true if the file was opened and the buffer allocated, else false
+//! @param filename name of the source file
+//! @param data  data[block_num] is malloc()'ed here (block_size bytes,
+//!        caller frees) and receives the block
+//! @param len  number of elements of type T actually read
+//! @param block_num  block index; reading starts at block_num * block_size
+//! @param block_size  size of one block in bytes
+//! @param verbose  print a message when the file cannot be opened
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  FILE *fh = fopen(filename, "rb");
+  // Fail whenever the open fails; `verbose` only gates the message. The old
+  // `fh == NULL && verbose` test passed a NULL stream on when not verbose.
+  if (fh == NULL) {
+    if (verbose) {
+      std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    }
+    return false;
+  }
+  data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+  const bool ok = (data[block_num] != NULL);
+  if (ok) {
+    fseek(fh, block_num * block_size, SEEK_SET);
+    *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+  }
+  fclose(fh);
+  return ok;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! The file holds a "# <epsilon>" header line followed by one line with all
+//! elements, each followed by a single space.
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the destination file
+//! @param data  data to write
+//! @param len  number of data elements in data
+//! @param epsilon  epsilon for comparison, written first as "# <epsilon>"
+//! @param verbose  report progress and failures on std::cerr
+//! @param append  add to the end of an existing file instead of replacing it
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // `append` used to be ignored: out | ate truncates an existing file just
+  // like plain out does. Select app explicitly so the flag does what it
+  // says; the default (false) still replaces the file as before.
+  std::ios_base::openmode mode = std::fstream::out | std::fstream::trunc;
+  if (append) mode = std::fstream::out | std::fstream::app;
+
+  // open file for writing
+  std::fstream fh(filename, mode);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+    fh << data[i] << ' ';
+  }
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // file ends with nl
+  fh << std::endl;
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type
+//! Each element pair is converted to float and matches when
+//! |reference[i] - data[i]| <= epsilon. When threshold != 0 and at least
+//! one element mismatches, the mismatch percentage is printed to stdout.
+//! @return  threshold == 0: true if every element matches;
+//!          otherwise: true if the mismatch count is below len * threshold
+//! @note    len == 0 with threshold != 0 yields false (0 > 0 is false)
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison (bounds inclusive)
+//! @param threshold  tolerated fraction of mismatching elements; 0 demands
+//!                   that every element matches
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  unsigned int error_count = 0;
+
+  // count the element pairs whose difference exceeds epsilon
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = static_cast<float>(reference[i]) - static_cast<float>(data[i]);
+    bool comp = (diff <= epsilon) && (diff >= -epsilon);
+
+    // a NaN difference fails both tests and is counted as a mismatch
+    error_count += !comp;
+  }
+
+  if (threshold == 0.0f) {
+    // strict mode: a single mismatch fails the comparison
+    return error_count == 0;
+  }
+
+  if (error_count) {
+    // %u because error_count is unsigned; the old %d did not match the
+    // argument type
+    printf("%4.2f(%%) of bytes mismatched (count=%u)\n",
+           static_cast<float>(error_count) * 100 / static_cast<float>(len),
+           error_count);
+  }
+
+  // tolerant mode: pass while mismatches stay below the allowed share
+  return len * threshold > error_count;
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays of arbitrary type after converting elements to float
+//! @return  threshold == 0: true if no element mismatches; otherwise true
+//!          if the mismatch count is below len * threshold
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison; values below
+//!                   __MIN_EPSILON_ERROR are raised to it
+//! @param threshold  threshold % of (# of bytes) for pass/fail
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // An epsilon of 0 would demand bit-exact floats, so enforce a floor.
+  const float tolerance = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+  int mismatches = 0;
+
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    const float ref_value = static_cast<float>(reference[idx]);
+    const float got_value = static_cast<float>(data[idx]);
+
+    // a NaN difference fails this test and is counted as a mismatch
+    if (!(fabs(ref_value - got_value) < tolerance)) {
+      ++mismatches;
+    }
+  }
+
+  if (threshold != 0.0f) {
+    if (mismatches) {
+      printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+             static_cast<float>(mismatches) * 100 / static_cast<float>(len),
+             mismatches);
+    }
+
+    return len * threshold > mismatches;
+  }
+
+  if (mismatches) {
+    printf("total # of errors = %d\n", mismatches);
+  }
+
+  return mismatches == 0;
+}
+
+// Dump `bytes` bytes at `data` to file `filename` (perror() on open failure).
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+  printf("sdkDumpBin: <%s>\n", filename);
+  FILE *fp = NULL;
+  if (FOPEN_FAIL(FOPEN(fp, filename, "wb"))) { perror(filename); return; }
+  fwrite(data, bytes, 1, fp);
+  fclose(fp);  // fclose() flushes, so the separate fflush() was redundant
+}
+
+// Compare `nelements` unsigned ints read from binary file src_file with the
+// same number read from ref_file, which is first located through
+// sdkFindFilePath(ref_file, exec_path). Elements are checked by compareData()
+// with the given epsilon / threshold and a report is printed to stdout.
+// @return true only if both files could be opened and the comparison passed
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+  unsigned int *src_buffer, *ref_buffer;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  uint64_t error_count = 0;
+  size_t fsize = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+    // ref_fp is never opened on this path, so only src_fp can need closing
+    if (src_fp) {
+      fclose(src_fp);
+    }
+  } else {
+    if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+      printf(
+          "compareBin2Bin <unsigned int>"
+          " unable to open ref_file: %s\n",
+          ref_file_path);
+      error_count++;
+    }
+
+    if (src_fp && ref_fp) {
+      src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+      ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+      printf(
+          "> compareBin2Bin <unsigned int> nelements=%d,"
+          " epsilon=%4.2f, threshold=%4.2f\n",
+          nelements, epsilon, threshold);
+      // fread(ptr, size, count, fp): with the element size second (as it
+      // used to be) the result was a count of nelements-byte chunks, and the
+      // src value was overwritten before being printed. Read and report each
+      // file separately, as sdkCompareBin2BinFloat() does.
+      fsize = fread(src_buffer, sizeof(unsigned int), nelements, src_fp);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(fsize * sizeof(unsigned int)));
+      fsize = fread(ref_buffer, sizeof(unsigned int), nelements, ref_fp);
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(fsize * sizeof(unsigned int)));
+
+      if (!compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold)) {
+        error_count++;
+      }
+      fclose(src_fp);
+      fclose(ref_fp);
+      free(src_buffer);
+      free(ref_buffer);
+    } else {
+      if (src_fp) {
+        fclose(src_fp);
+      }
+      if (ref_fp) {
+        fclose(ref_fp);
+      }
+    }
+  }
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Compare `nelements` floats read from binary file src_file with the same
+// number read from ref_file, which is first located through
+// sdkFindFilePath(ref_file, exec_path). Elements are checked by
+// compareDataAsFloatThreshold() with the given epsilon / threshold, and a
+// report is printed to stdout.
+// epsilon and threshold are forwarded unchanged to the comparison;
+// exec_path is forwarded to sdkFindFilePath().
+// @return true only if no error was counted
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+  uint64_t error_count = 0;
+
+  // Step 1: open the file under test.
+  FILE *src_fp = NULL;
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count = 1;
+  }
+
+  // Step 2: locate and open the reference file.
+  FILE *ref_fp = NULL;
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+  } else if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+    printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+           ref_file_path);
+    error_count = 1;
+  }
+
+  // Step 3: compare only when both streams are available.
+  if (src_fp && ref_fp) {
+    const size_t buffer_bytes = nelements * sizeof(float);
+    float *src_buffer = reinterpret_cast<float *>(malloc(buffer_bytes));
+    float *ref_buffer = reinterpret_cast<float *>(malloc(buffer_bytes));
+
+    printf(
+        "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+        " threshold=%4.2f\n",
+        nelements, epsilon, threshold);
+
+    // the reported sizes are the byte counts actually read from each file
+    const size_t src_read = fread(src_buffer, sizeof(float), nelements, src_fp);
+    printf("   src_file <%s>, size=%d bytes\n", src_file,
+           static_cast<int>(src_read * sizeof(float)));
+    const size_t ref_read = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+    printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+           static_cast<int>(ref_read * sizeof(float)));
+
+    const bool matched = compareDataAsFloatThreshold<float, float>(
+        ref_buffer, src_buffer, nelements, epsilon, threshold);
+    if (!matched) {
+      error_count++;
+    }
+    free(src_buffer);
+    free(ref_buffer);
+  }
+
+  // Step 4: release whichever streams were opened.
+  if (src_fp) {
+    fclose(src_fp);
+  }
+  if (ref_fp) {
+    fclose(ref_fp);
+  }
+
+  // Step 5: report.
+  if (error_count != 0) {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  } else {
+    printf("  OK\n");
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Relative L2-norm comparison of two float arrays.
+// Computes ||reference - data|| / ||reference|| and returns true when that
+// ratio is strictly below epsilon. Returns false when the sum of squares of
+// the reference is (nearly) zero, since the ratio is then undefined.
+// Diagnostics go to std::cerr only in _DEBUG builds.
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float sum_sq_diff = 0;
+  float sum_sq_ref = 0;
+
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    const float delta = reference[idx] - data[idx];
+    sum_sq_diff += delta * delta;
+    sum_sq_ref += reference[idx] * reference[idx];
+  }
+
+  if (fabs(sum_sq_ref) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  const float relative_error = sqrtf(sum_sq_diff) / sqrtf(sum_sq_ref);
+
+#ifdef _DEBUG
+  if (!(relative_error < epsilon)) {
+    std::cerr << "ERROR, l2-norm error " << relative_error
+              << " is greater than epsilon " << epsilon << "\n";
+  }
+#endif
+
+  return relative_error < epsilon;
+}
+
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+  unsigned int dropped;  // channel count from __loadPPM(); not passed on
+  return __loadPPM(file, data, w, h, &dropped);  // the load result itself
+}
+
+// Load a 3-channel (P6) image and pad it to 4 bytes per pixel (4th = 0).
+// *data is always malloc()'ed here (caller frees); false on any failure.
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  if (!__loadPPM(file, &idata, w, h, &channels)) {
+    free(idata);
+    return false;
+  }
+
+  const int size = *w * *h;
+  unsigned char *out = NULL;
+  if (channels == 3) {  // a 1-channel image would be over-read below
+    out = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+  }
+  for (int i = 0; out != NULL && i < size; i++) {
+    out[4 * i + 0] = idata[3 * i + 0];
+    out[4 * i + 1] = idata[3 * i + 1];
+    out[4 * i + 2] = idata[3 * i + 2];
+    out[4 * i + 3] = 0;  // pad 4th component
+  }
+  free(idata);
+  if (out == NULL) return false;
+  *data = out;
+  return true;
+}
+
+// Compare two PPM images (each expanded to 4 bytes per pixel) using
+// compareData(epsilon, threshold); verboseErrors enables std::cerr output.
+// Returns true only if both load, have equal size and the data matches.
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = NULL, *ref_data = NULL;
+  unsigned int ref_width, ref_height, src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // the reference image was already loaded
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // the comparison below would otherwise read past the smaller buffer
+    free(src_data);
+    free(ref_data);
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+  }
+
+  const uint64_t error_count =
+      compareData(ref_data, src_data, src_width * src_height * 4, epsilon,
+                  threshold) ? 0 : 1;
+  free(src_data);  // both buffers come from sdkLoadPPM4ub(); they used to
+  free(ref_data);  // be leaked on every call
+  if (verboseErrors) {
+    if (error_count == 0) {
+      std::cerr << "    OK\n\n";
+    } else {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+  return error_count == 0;  // true if all pixels pass
+}
+
+// Compare two PGM images loaded byte-wise through sdkLoadPPMub() using
+// compareData(epsilon, threshold); verboseErrors enables std::cerr output.
+// Returns true only if both load, have equal size and the data matches.
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  unsigned int ref_width, ref_height, src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // the reference image was already loaded
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // the comparison below would otherwise read past the smaller buffer
+    free(src_data);
+    free(ref_data);
+    return false;
+  }
+
+  if (verboseErrors)
+    std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+              << src_height << ") epsilon(" << epsilon << "), threshold("
+              << threshold * 100 << "%)\n";
+
+  const uint64_t error_count =
+      compareData(ref_data, src_data, src_width * src_height, epsilon,
+                  threshold) ? 0 : 1;
+  free(src_data);  // both buffers come from sdkLoadPPMub(); they used to
+  free(ref_data);  // be leaked on every call
+  if (verboseErrors) {
+    if (error_count == 0) {
+      std::cerr << "    OK\n\n";
+    } else {
+      std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+    }
+  }
+  return error_count == 0;  // true if all pixels pass
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_string.h b/src/cuda/HPC/image/dwtHaar1D/helper_string.h
new file mode 100644
index 000000000..39a1b3805
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+// Portability shims: each helper name below maps to the "secure" CRT variant
+// on Windows and to the plain C library function elsewhere. Every macro is
+// guarded by #ifndef, so a definition made before this header is included
+// takes precedence.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+// Case-insensitive comparisons.
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+// Bounded copy: nLength is forwarded to strcpy_s on this branch.
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+// FOPEN stores the stream in fHandle; on this branch the expression itself
+// evaluates to the fopen_s return code, which is what FOPEN_FAIL tests.
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+// Case-insensitive comparisons.
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+// NOTE: nLength is ignored on this branch; the copy is an unbounded strcpy,
+// so the caller must guarantee the destination is large enough.
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+// FOPEN assigns the FILE* to fHandle; on this branch the expression evaluates
+// to that pointer, so FOPEN_FAIL compares it against NULL.
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+// Exit status 2. NOTE(review): presumably returned by samples that skip their
+// test on unsupported hardware -- confirm against the callers.
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+//! Count the leading \a delimiter characters of \a string.
+//!
+//! @param delimiter  character to skip (callers in this header pass '-')
+//! @param string     NUL-terminated text to inspect; must not be NULL
+//! @return index of the first character that is not \a delimiter, or 0 when
+//!         the string is empty or consists of delimiters only
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  int string_start = 0;
+
+  while (string[string_start] == delimiter) {
+    string_start++;
+  }
+
+  // Only delimiters (or nothing at all): there is no payload to skip to.
+  // Testing the terminator directly replaces the old
+  // `string_start >= strlen(string) - 1` check, which was off by one (it
+  // returned 0 for "-h", so single-letter flags kept their dash) and relied
+  // on unsigned wrap-around for the empty string.
+  if (string[string_start] == '\0') {
+    return 0;
+  }
+
+  return string_start;
+}
+
+//! Locate the extension of \a filename, i.e. the text after its last '.'.
+//!
+//! @param filename   NUL-terminated file name; it is not modified
+//! @param extension  receives a pointer into \a filename just past the last
+//!                   '.', or NULL when no extension is found
+//! @return index of the first extension character within \a filename, or 0
+//!         when no extension is found
+inline int getFileExtension(char *filename, char **extension) {
+  // strrchr handles empty names and a dot at any position. The previous
+  // hand-written backwards scan read before the start of the buffer for ""
+  // and reported "no extension" for names such as "a.b".
+  char *last_dot = strrchr(filename, '.');
+
+  // No dot at all, or the only dot is the very first character (".profile"):
+  // treated as having no extension, as before.
+  if (last_dot == NULL || last_dot == filename) {
+    *extension = NULL;
+    return 0;
+  }
+
+  *extension = last_dot + 1;
+  return static_cast<int>(*extension - filename);
+}
+
+//! Report whether the flag \a string_ref was given on the command line.
+//!
+//! For each argument after argv[0], leading '-' characters are skipped (via
+//! stringRemoveDelimiter) and everything from the first '=' onward is
+//! ignored; what remains must equal \a string_ref, compared without regard
+//! to case.
+//!
+//! @return true if at least one argument names the flag
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *candidate = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+
+    // Only the text in front of an optional "=value" is the flag name.
+    const char *equals = strchr(candidate, '=');
+    int name_length;
+    if (equals == NULL) {
+      name_length = static_cast<int>(strlen(candidate));
+    } else {
+      name_length = static_cast<int>(equals - candidate);
+    }
+
+    const int ref_length = static_cast<int>(strlen(string_ref));
+
+    // Lengths must agree so that "foo" does not match "foobar".
+    if (name_length == ref_length &&
+        STRNCASECMP(candidate, string_ref, ref_length) == 0) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Generic variant of getCmdLineArgumentInt: parses the integer attached to the
+// first argument that starts with `string_ref` and stores it, cast to T, in
+// `*value`.
+//
+// Returns true when such an argument exists. `*value` is left untouched when
+// the argument carries no text after the option name, or when no argument
+// matches.
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *candidate = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int ref_length = static_cast<int>(strlen(string_ref));
+
+    // Prefix comparison: the value may be glued to the name ("n5") or follow
+    // an '=' ("n=5").
+    if (STRNCASECMP(candidate, string_ref, ref_length) != 0) {
+      continue;
+    }
+
+    if (ref_length + 1 <= static_cast<int>(strlen(candidate))) {
+      const char *number = candidate + ref_length;
+      if (*number == '=') {
+        ++number;
+      }
+      *value = (T)atoi(number);
+    }
+
+    // The first matching argument ends the search.
+    return true;
+  }
+
+  return false;
+}
+
+//! Return the integer attached to the option \a string_ref.
+//!
+//! Every argument after argv[0] is examined; if several start with
+//! \a string_ref, the last one determines the result. The value may follow
+//! the name directly or after an '='.
+//!
+//! @return the parsed value (atoi semantics), or 0 when the option is absent
+//!         or carries no text after its name
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  int result = 0;  // also the answer when no argument matches
+
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *candidate = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int ref_length = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(candidate, string_ref, ref_length) != 0) {
+      continue;
+    }
+
+    if (ref_length + 1 <= static_cast<int>(strlen(candidate))) {
+      const char *number = candidate + ref_length;
+      if (*number == '=') {
+        ++number;
+      }
+      result = atoi(number);
+    } else {
+      // Bare option without a value.
+      result = 0;
+    }
+  }
+
+  return result;
+}
+
+//! Return the floating-point number attached to the option \a string_ref.
+//!
+//! Every argument after argv[0] is examined; if several start with
+//! \a string_ref, the last one determines the result. The value may follow
+//! the name directly or after an '='.
+//!
+//! @return the parsed value (atof semantics, narrowed to float), or 0 when
+//!         the option is absent or carries no text after its name
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  float result = 0;  // also the answer when no argument matches
+
+  for (int arg = 1; arg < argc; ++arg) {
+    const char *candidate = argv[arg] + stringRemoveDelimiter('-', argv[arg]);
+    const int ref_length = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(candidate, string_ref, ref_length) != 0) {
+      continue;
+    }
+
+    if (ref_length + 1 <= static_cast<int>(strlen(candidate))) {
+      const char *number = candidate + ref_length;
+      if (*number == '=') {
+        ++number;
+      }
+      result = static_cast<float>(atof(number));
+    } else {
+      // Bare option without a value.
+      result = 0.f;
+    }
+  }
+
+  return result;
+}
+
+//! Fetch the text attached to the option \a string_ref.
+//!
+//! Every argument after argv[0] is examined; if several start with
+//! \a string_ref, the last one wins. The returned pointer aliases argv, so
+//! it must not be freed.
+//!
+//! @param string_retval  receives a pointer to the text after the separator
+//!                       character that follows the option name, an empty
+//!                       string when the argument ends right after the name,
+//!                       or NULL when the option is absent
+//! @return true if the option was found
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+
+  if (argc >= 1) {
+    for (int i = 1; i < argc; i++) {
+      int string_start = stringRemoveDelimiter('-', argv[i]);
+      char *string_argv = const_cast<char *>(&argv[i][string_start]);
+      int length = static_cast<int>(strlen(string_ref));
+
+      if (!STRNCASECMP(string_argv, string_ref, length)) {
+        // Skip the separator after the name. If the argument stops at the
+        // name there is no separator; stepping over it would leave the
+        // result one past the terminating NUL, so point at the NUL instead.
+        if (string_argv[length] != '\0') {
+          *string_retval = &string_argv[length + 1];
+        } else {
+          *string_retval = &string_argv[length];
+        }
+        bFound = true;
+        continue;
+      }
+    }
+  }
+
+  if (!bFound) {
+    *string_retval = NULL;
+  }
+
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Search a fixed list of relative directories for a file and return the
+//! first location at which it can be opened for reading.
+//!
+//! @return malloc()-allocated path the caller must free(), or 0 if the file
+//!         was not found in any search directory or memory ran out
+//! @param filename         name of the file
+//! @param executable_path  optional path of the executable; its base name
+//!                         replaces <executable_name> in the search list.
+//!                         When 0, entries containing that placeholder are
+//!                         skipped
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",                                           // same dir
+      "./data/",                                      // data subdir
+
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter. When no delimiter is present find_last_of
+    // returns npos, npos + 1 wraps to 0 and nothing is erased.
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    // Strip ".exe" only when it really is the suffix; merely containing the
+    // text somewhere (e.g. "tool.exe.bak") must not chop four characters off
+    // the end of the name.
+    const std::string exe_suffix(".exe");
+    if (executable_name.size() >= exe_suffix.size() &&
+        executable_name.compare(executable_name.size() - exe_suffix.size(),
+                                exe_suffix.size(), exe_suffix) == 0) {
+      executable_name.resize(executable_name.size() - exe_suffix.size());
+    }
+
+#else
+    // Linux & OSX path delimiter (same npos handling as above)
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists by trying to open it for reading.
+    path.append(filename);
+    FILE *fp = NULL;  // stays NULL should the open helper leave it untouched
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp == NULL) {
+      // Not here; try the next search directory.
+      continue;
+    }
+
+    fclose(fp);
+
+    // File found
+    // returning an allocated array here for backwards compatibility reasons
+    char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
+
+    if (file_path == NULL) {
+      // Copying into a failed allocation would write through a null pointer.
+      printf("\nerror: sdkFindFilePath: out of memory for <%s>\n", filename);
+      return 0;
+    }
+
+    STRCPY(file_path, path.length() + 1, path.c_str());
+    return file_path;
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/cuda/HPC/image/dwtHaar1D/helper_timer.h b/src/cuda/HPC/image/dwtHaar1D/helper_timer.h
new file mode 100644
index 000000000..0614a7802
--- /dev/null
+++ b/src/cuda/HPC/image/dwtHaar1D/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include "exception.h"
+
+// Definition of the StopWatch interface. It is used when we do not want to
+// rely on the CUT functions but rather on a self-contained class interface.
+class StopWatchInterface {
+ public:
+  StopWatchInterface() {}
+  virtual ~StopWatchInterface() {}
+
+  //! Begin a timing session.
+  virtual void start() = 0;
+
+  //! End the current timing session.
+  virtual void stop() = 0;
+
+  //! Clear the accumulated time and the session count.
+  virtual void reset() = 0;
+
+  //! Elapsed time in msec. The implementations in this header return the time
+  //! accumulated over all completed start()/stop() sessions since the last
+  //! reset(), plus the time since the last start() while the watch is still
+  //! running.
+  virtual float getTime() = 0;
+
+  //! Mean time in msec per _stopped_ (i.e. finished) session, based on the
+  //! current total time.
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system
+// WIN32_LEAN_AND_MEAN is the macro <windows.h> honours to leave out rarely
+// used APIs; the previous spelling, WINDOWS_LEAN_AND_MEAN, is not recognised
+// by the header and therefore had no effect.
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+// <windows.h> defines min/max as macros, which clash with std::min/std::max.
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Default constructor: zeroes all counters and caches the performance
+  //! counter frequency.
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    // Query the tick frequency from the OS once, at construction.
+    LARGE_INTEGER ticks_per_second;
+    QueryPerformanceFrequency(&ticks_per_second);
+
+    // Keep it divided by 1000 so that dividing a tick delta by freq yields
+    // milliseconds.
+    freq = static_cast<double>(ticks_per_second.QuadPart) / 1000.0;
+
+    // Remember that the query has been made.
+    freq_set = true;
+  }
+
+  // Destructor
+  ~StopWatchWin() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec accumulated over all finished sessions, plus the time since
+  //! the last start() if the watch is still running
+  inline float getTime();
+
+  //! Mean time per _stopped_ (i.e. finished) session, based on the current
+  //! total time
+  inline float getAverageTime();
+
+ private:
+  //! Counter value captured by the last start() (or by reset() while running)
+  LARGE_INTEGER start_time;
+  //! Counter value captured by the last stop()
+  LARGE_INTEGER end_time;
+
+  //! Duration of the most recent start()/stop() session, in msec
+  float diff_time;
+
+  //! Sum of all session durations since the last reset(), in msec
+  float total_time;
+
+  //! True between start() and stop()
+  bool running;
+
+  //! Number of completed start()/stop() sessions; the divisor for averaging
+  int clock_sessions;
+
+  //! Performance counter ticks per millisecond
+  double freq;
+
+  //! True once freq has been initialised
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  // start_time already is a LARGE_INTEGER, so its address is passed as is.
+  QueryPerformanceCounter(&start_time);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and add the elapsed interval to the running total.
+//! Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(&end_time);
+
+  // Tick delta of this session; freq is in ticks per millisecond, so the
+  // quotient below is in msec.
+  const double elapsed_ticks = static_cast<double>(end_time.QuadPart) -
+                               static_cast<double>(start_time.QuadPart);
+  diff_time = static_cast<float>(elapsed_ticks / freq);
+
+  // Book the finished session.
+  total_time += diff_time;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  // Forget everything measured so far.
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+
+  // A running watch keeps running, but counts from this moment on.
+  if (running) {
+    QueryPerformanceCounter(&start_time);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  // A stopped watch reports just the total of its finished sessions.
+  if (!running) {
+    return total_time;
+  }
+
+  // Still running: add the time since the last start() to that total.
+  LARGE_INTEGER now;
+  QueryPerformanceCounter(&now);
+  const double elapsed_ticks = static_cast<double>(now.QuadPart) -
+                               static_cast<double>(start_time.QuadPart);
+
+  return total_time + static_cast<float>(elapsed_ticks / freq);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  // No finished session yet: report 0 rather than dividing by zero.
+  if (clock_sessions <= 0) {
+    return 0.0f;
+  }
+
+  return total_time / clock_sessions;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! StopWatch implementation for Linux and Mac OSX, built on gettimeofday()
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Default constructor: a stopped watch with all counters at zero
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0),
+        total_time(0.0),
+        running(false),
+        clock_sessions(0) {}
+
+  // Destructor
+  virtual ~StopWatchLinux() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Time in msec accumulated over all finished sessions, plus the time since
+  //! the last start() if the watch is still running
+  inline float getTime();
+
+  //! Mean time per _stopped_ (i.e. finished) session, based on the current
+  //! total time
+  inline float getAverageTime();
+
+ private:
+  //! Milliseconds elapsed between start_time and now
+  inline float getDiffTime();
+
+  //! Time captured by the last start() (or by reset() while running)
+  struct timeval start_time;
+
+  //! Duration of the most recent start()/stop() session, in msec
+  float diff_time;
+
+  //! Sum of all session durations since the last reset(), in msec
+  float total_time;
+
+  //! True between start() and stop()
+  bool running;
+
+  //! Number of completed start()/stop() sessions; the divisor for averaging
+  int clock_sessions;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  // Record the starting point; no time zone information is requested.
+  gettimeofday(&start_time, NULL);
+  running = true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement and add the elapsed interval to the running total.
+//! Also increment the number of times this clock has been run.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  // Length of the session that is ending, measured from the last start().
+  const float session_ms = getDiffTime();
+
+  diff_time = session_ms;
+  total_time += session_ms;
+  clock_sessions++;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. Does not change the timer running state but does
+//! recapture this point in time as the current start time if it is running.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  // Forget everything measured so far.
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+
+  // A running watch keeps running, but counts from this moment on.
+  if (running) {
+    gettimeofday(&start_time, NULL);
+  }
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. after start. If the stop watch is still running (i.e. there
+//! was no call to stop()) then the elapsed time is returned added to the
+//! current diff_time sum, otherwise the current summed time difference alone
+//! is returned.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  // A stopped watch reports just the total of its finished sessions.
+  if (!running) {
+    return total_time;
+  }
+
+  // Still running: add the time since the last start() to that total.
+  return total_time + getDiffTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run based on the total number of COMPLETED runs
+//! and the total time.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  // No finished session yet: report 0 rather than dividing by zero.
+  if (clock_sessions <= 0) {
+    return 0.0f;
+  }
+
+  return total_time / clock_sessions;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval now;
+  gettimeofday(&now, 0);
+
+  // Convert both timeval fields to milliseconds: whole seconds are scaled
+  // up by 1000, microseconds are scaled down by 1000.
+  const double seconds_part_ms = 1000.0 * (now.tv_sec - start_time.tv_sec);
+  const double micros_part_ms = 0.001 * (now.tv_usec - start_time.tv_usec);
+
+  return static_cast<float>(seconds_part_ms + micros_part_ms);
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a timer has been created, otherwise false
+//! @param  timer_interface  receives the new timer, which must later be
+//!                          released with sdkDeleteTimer(); passing NULL makes
+//!                          the call fail
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+  // Without somewhere to store the timer nothing can be created.
+  if (timer_interface == NULL) {
+    return false;
+  }
+
+  // The implicit derived-to-base conversion is the correct way to obtain the
+  // interface pointer; reinterpret_cast merely reuses the address and does
+  // not perform that conversion.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = new StopWatchWin();
+#else
+  *timer_interface = new StopWatchLinux();
+#endif
+
+  // Plain `new` reports failure by throwing, so this check only matters if
+  // the program is built with a non-throwing allocator.
+  return *timer_interface != NULL;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer
+//! @return always true
+//! @param  timer_interface  address of the timer pointer; the timer it points
+//!                          to (if any) is destroyed and the pointer is set
+//!                          to NULL
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  // Deleting a null pointer is a no-op, so no explicit check is needed.
+  delete *timer_interface;
+  *timer_interface = NULL;
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start a timer
+//! @return always true
+//! @param timer_interface  address of the timer pointer; nothing happens if
+//!                         the pointer it holds is NULL
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+
+  if (timer != NULL) {
+    timer->start();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop a timer. Does not reset it.
+//! @return always true
+//! @param timer_interface  address of the timer pointer; nothing happens if
+//!                         the pointer it holds is NULL
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+
+  if (timer != NULL) {
+    timer->stop();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset a timer's counters.
+//! @return always true
+//! @param timer_interface  address of the timer pointer; nothing happens if
+//!                         the pointer it holds is NULL
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+
+  if (timer != NULL) {
+    timer->reset();
+  }
+
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the timer's getAverageTime() result, i.e. the total time divided
+//! by the number of completed (stopped) runs for the stopwatch classes in
+//! this header.
+//! @return the average time, or 0 if the pointer held by \a timer_interface
+//!         is NULL
+//! @param timer_interface  address of the timer pointer
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+
+  return (timer != NULL) ? timer->getAverageTime() : 0.0f;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the timer's getTime() result, i.e. the total execution time over
+//! all runs since the last reset or timer creation for the stopwatch classes
+//! in this header.
+//! @return the time, or 0 if the pointer held by \a timer_interface is NULL
+//! @param timer_interface  address of the timer pointer
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+
+  return (timer != NULL) ? timer->getTime() : 0.0f;
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/Makefile b/src/cuda/HPC/image/recursiveGaussian/Makefile
new file mode 100644
index 000000000..526fc27c9
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/Makefile
@@ -0,0 +1,12 @@
+EXECUTABLE := recursiveGaussian_hpc
+CUFILES := recursiveGaussian_cuda.cu
+CCFILES := recursiveGaussian.cpp
+OMIT_CUTIL_LIB := 1
+OMIT_SHRUTIL_LIB := 1
+SM_VERSIONS := 75 80 90
+
+# Benchmark mode: define NO_OPENGL in both the NVCC and C++ compiler flags
+NVCCFLAGS += -DNO_OPENGL
+CXXFLAGS += -DNO_OPENGL
+# Shared rules; presumably they consume the variables set above (TODO confirm)
+include ../../../common/common.mk
diff --git a/src/cuda/HPC/image/recursiveGaussian/exception.h b/src/cuda/HPC/image/recursiveGaussian/exception.h
new file mode 100644
index 000000000..ca8ac2525
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/exception.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* CUda UTility Library */
+#ifndef COMMON_EXCEPTION_H_
+#define COMMON_EXCEPTION_H_
+
+// includes, system
+#include <stdlib.h>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+
+//! Exception wrapper: derives from Std_Exception, built only via throw_it().
+//! @param Std_Exception Exception out of namespace std for easy typing.
+template <class Std_Exception>
+class Exception : public Std_Exception {
+ public:
+  //! @brief Static construction interface (detailed defaults to "-")
+  //! @return Never returns: always throws an Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const char *detailed = "-");
+
+  //! @brief Static construction interface taking the details as a std::string
+  //! @return Never returns: always throws an Exception<Std_Exception>
+  //! @param file file in which the Exception occurs
+  //! @param line line in which the Exception occurs
+  //! @param detailed details on the code fragment causing the Exception
+  static void throw_it(const char *file, const int line,
+                       const std::string &detailed);
+
+  //! Destructor (declared non-throwing)
+  virtual ~Exception() throw();
+
+ private:
+  //! Constructor, default (private)
+  Exception();
+
+  //! Constructor, standard (private; used by throw_it)
+  //! @param str string returned by what()
+  explicit Exception(const std::string &str);
+};
+
+////////////////////////////////////////////////////////////////////////////////
+//! Prints ex.what() to std::cerr, then exits with EXIT_FAILURE.
+//! @param ex exception to handle
+////////////////////////////////////////////////////////////////////////////////
+template <class Exception_Typ>
+inline void handleException(const Exception_Typ &ex) {
+  std::cerr << ex.what();
+  std::cerr << std::endl;
+  exit(EXIT_FAILURE);
+}
+
+//! Convenience macros: throw an Exception tagged with __FILE__ and __LINE__
+
+//! Exception caused by dynamic program behavior, e.g. file does not exist
+#define RUNTIME_EXCEPTION(msg) \
+  Exception<std::runtime_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Logic exception in program, e.g. an assert failed
+#define LOGIC_EXCEPTION(msg) \
+  Exception<std::logic_error>::throw_it(__FILE__, __LINE__, msg)
+
+//! Range exception (wraps std::range_error)
+#define RANGE_EXCEPTION(msg) \
+  Exception<std::range_error>::throw_it(__FILE__, __LINE__, msg)
+
+////////////////////////////////////////////////////////////////////////////////
+//! Implementation
+
+// includes, system
+#include <sstream>
+
+////////////////////////////////////////////////////////////////////////////////
+//! Builds the what() message from the source location and details, then
+//! throws an Exception carrying it. Never returns normally.
+//! @param file      file in which the Exception occurs
+//! @param line      line in which the Exception occurs
+//! @param detailed  details on the code fragment causing the Exception
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const char *detailed) {
+  // Stream formatting is heavy-weight, but this path only runs on errors.
+  std::ostringstream message;
+  message << "Exception in file '" << file << "' in line " << line << "\n";
+  message << "Detailed description: " << detailed << "\n";
+  throw Exception(message.str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Static construction interface taking the details as a std::string.
+//! Forwards to the const char* overload, so it always throws.
+////////////////////////////////////////////////////////////////////////////////
+/*static*/ template <class Std_Exception>
+void Exception<Std_Exception>::throw_it(const char *file, const int line,
+                                        const std::string &msg) {
+  Exception<Std_Exception>::throw_it(file, line, msg.c_str());
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, default (private): base message is "Unknown Exception.\n".
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception() : Std_Exception("Unknown Exception.\n") {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Constructor, standard (private).
+//! @param s string handed to the Std_Exception base, returned by what().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::Exception(const std::string &s) : Std_Exception(s) {}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Destructor; empty, declared non-throwing via throw().
+////////////////////////////////////////////////////////////////////////////////
+template <class Std_Exception>
+Exception<Std_Exception>::~Exception() throw() {}
+
+  // functions, exported
+
+#endif  // COMMON_EXCEPTION_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h b/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h
new file mode 100644
index 000000000..723b28f7a
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_cuda.h
@@ -0,0 +1,988 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+////////////////////////////////////////////////////////////////////////////////
+// These are CUDA Helper functions for initialization and error checking
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#pragma once
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "helper_string.h"
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// Note: your SDK sample must include the proper CUDA header files itself.
+// Refer to the CUDA examples for the headers needed, which may change
+// depending on which CUDA functions are used.
+
+// CUDA Runtime API errors: the name comes straight from cudaGetErrorName().
+#ifdef __DRIVER_TYPES_H__
+static const char *_cudaGetErrorEnum(cudaError_t error) {
+  return cudaGetErrorName(error);
+}
+#endif
+
+#ifdef CUDA_DRIVER_API
+// CUDA Driver API errors: asks cuGetErrorName() for the name of the result.
+static const char *_cudaGetErrorEnum(CUresult error) {
+  static char fallback[] = "<unknown>";
+  const char *name = NULL;
+  cuGetErrorName(error, &name);
+  return (name != NULL) ? name : fallback;
+}
+#endif
+
+#ifdef CUBLAS_API_H_
+// cuBLAS API errors: maps a status to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(cublasStatus_t error) {
+  switch (error) {
+    case CUBLAS_STATUS_SUCCESS:
+      return "CUBLAS_STATUS_SUCCESS";
+
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      return "CUBLAS_STATUS_NOT_INITIALIZED";
+
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      return "CUBLAS_STATUS_ALLOC_FAILED";
+
+    case CUBLAS_STATUS_INVALID_VALUE:
+      return "CUBLAS_STATUS_INVALID_VALUE";
+
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      return "CUBLAS_STATUS_ARCH_MISMATCH";
+
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      return "CUBLAS_STATUS_MAPPING_ERROR";
+
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      return "CUBLAS_STATUS_EXECUTION_FAILED";
+
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      return "CUBLAS_STATUS_INTERNAL_ERROR";
+
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      return "CUBLAS_STATUS_NOT_SUPPORTED";
+
+    case CUBLAS_STATUS_LICENSE_ERROR:
+      return "CUBLAS_STATUS_LICENSE_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef _CUFFT_H_
+// cuFFT API errors: maps a result to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(cufftResult error) {
+  switch (error) {
+    case CUFFT_SUCCESS:
+      return "CUFFT_SUCCESS";
+
+    case CUFFT_INVALID_PLAN:
+      return "CUFFT_INVALID_PLAN";
+
+    case CUFFT_ALLOC_FAILED:
+      return "CUFFT_ALLOC_FAILED";
+
+    case CUFFT_INVALID_TYPE:
+      return "CUFFT_INVALID_TYPE";
+
+    case CUFFT_INVALID_VALUE:
+      return "CUFFT_INVALID_VALUE";
+
+    case CUFFT_INTERNAL_ERROR:
+      return "CUFFT_INTERNAL_ERROR";
+
+    case CUFFT_EXEC_FAILED:
+      return "CUFFT_EXEC_FAILED";
+
+    case CUFFT_SETUP_FAILED:
+      return "CUFFT_SETUP_FAILED";
+
+    case CUFFT_INVALID_SIZE:
+      return "CUFFT_INVALID_SIZE";
+
+    case CUFFT_UNALIGNED_DATA:
+      return "CUFFT_UNALIGNED_DATA";
+
+    case CUFFT_INVALID_DEVICE:
+      return "CUFFT_INVALID_DEVICE";
+
+    case CUFFT_NO_WORKSPACE:
+      return "CUFFT_NO_WORKSPACE";
+
+    case CUFFT_NOT_IMPLEMENTED:
+      return "CUFFT_NOT_IMPLEMENTED";
+
+    case CUFFT_NOT_SUPPORTED:
+      return "CUFFT_NOT_SUPPORTED";
+
+    case CUFFT_MISSING_DEPENDENCY:
+      return "CUFFT_MISSING_DEPENDENCY";
+
+    case CUFFT_NVRTC_FAILURE:
+      return "CUFFT_NVRTC_FAILURE";
+
+    case CUFFT_NVJITLINK_FAILURE:
+      return "CUFFT_NVJITLINK_FAILURE";
+
+    case CUFFT_NVSHMEM_FAILURE:
+      return "CUFFT_NVSHMEM_FAILURE";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSPARSEAPI
+// cuSPARSE API errors: maps a status to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
+  switch (error) {
+    case CUSPARSE_STATUS_SUCCESS:
+      return "CUSPARSE_STATUS_SUCCESS";
+
+    case CUSPARSE_STATUS_NOT_INITIALIZED:
+      return "CUSPARSE_STATUS_NOT_INITIALIZED";
+
+    case CUSPARSE_STATUS_ALLOC_FAILED:
+      return "CUSPARSE_STATUS_ALLOC_FAILED";
+
+    case CUSPARSE_STATUS_INVALID_VALUE:
+      return "CUSPARSE_STATUS_INVALID_VALUE";
+
+    case CUSPARSE_STATUS_ARCH_MISMATCH:
+      return "CUSPARSE_STATUS_ARCH_MISMATCH";
+
+    case CUSPARSE_STATUS_MAPPING_ERROR:
+      return "CUSPARSE_STATUS_MAPPING_ERROR";
+
+    case CUSPARSE_STATUS_EXECUTION_FAILED:
+      return "CUSPARSE_STATUS_EXECUTION_FAILED";
+
+    case CUSPARSE_STATUS_INTERNAL_ERROR:
+      return "CUSPARSE_STATUS_INTERNAL_ERROR";
+
+    case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef CUSOLVER_COMMON_H_
+// cuSOLVER API errors: maps a status to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
+  switch (error) {
+    case CUSOLVER_STATUS_SUCCESS:
+      return "CUSOLVER_STATUS_SUCCESS";
+    case CUSOLVER_STATUS_NOT_INITIALIZED:
+      return "CUSOLVER_STATUS_NOT_INITIALIZED";
+    case CUSOLVER_STATUS_ALLOC_FAILED:
+      return "CUSOLVER_STATUS_ALLOC_FAILED";
+    case CUSOLVER_STATUS_INVALID_VALUE:
+      return "CUSOLVER_STATUS_INVALID_VALUE";
+    case CUSOLVER_STATUS_ARCH_MISMATCH:
+      return "CUSOLVER_STATUS_ARCH_MISMATCH";
+    case CUSOLVER_STATUS_MAPPING_ERROR:
+      return "CUSOLVER_STATUS_MAPPING_ERROR";
+    case CUSOLVER_STATUS_EXECUTION_FAILED:
+      return "CUSOLVER_STATUS_EXECUTION_FAILED";
+    case CUSOLVER_STATUS_INTERNAL_ERROR:
+      return "CUSOLVER_STATUS_INTERNAL_ERROR";
+    case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_NOT_SUPPORTED:
+      return "CUSOLVER_STATUS_NOT_SUPPORTED";
+    case CUSOLVER_STATUS_ZERO_PIVOT:
+      return "CUSOLVER_STATUS_ZERO_PIVOT";
+    case CUSOLVER_STATUS_INVALID_LICENSE:
+      return "CUSOLVER_STATUS_INVALID_LICENSE";
+  }
+  // Reached for any status value not listed above.
+  return "<unknown>";
+}
+#endif
+
+#ifdef CURAND_H_
+// cuRAND API errors: maps a status to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(curandStatus_t error) {
+  switch (error) {
+    case CURAND_STATUS_SUCCESS:
+      return "CURAND_STATUS_SUCCESS";
+
+    case CURAND_STATUS_VERSION_MISMATCH:
+      return "CURAND_STATUS_VERSION_MISMATCH";
+
+    case CURAND_STATUS_NOT_INITIALIZED:
+      return "CURAND_STATUS_NOT_INITIALIZED";
+
+    case CURAND_STATUS_ALLOCATION_FAILED:
+      return "CURAND_STATUS_ALLOCATION_FAILED";
+
+    case CURAND_STATUS_TYPE_ERROR:
+      return "CURAND_STATUS_TYPE_ERROR";
+
+    case CURAND_STATUS_OUT_OF_RANGE:
+      return "CURAND_STATUS_OUT_OF_RANGE";
+
+    case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
+      return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
+
+    case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
+      return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
+
+    case CURAND_STATUS_LAUNCH_FAILURE:
+      return "CURAND_STATUS_LAUNCH_FAILURE";
+
+    case CURAND_STATUS_PREEXISTING_FAILURE:
+      return "CURAND_STATUS_PREEXISTING_FAILURE";
+
+    case CURAND_STATUS_INITIALIZATION_FAILED:
+      return "CURAND_STATUS_INITIALIZATION_FAILED";
+
+    case CURAND_STATUS_ARCH_MISMATCH:
+      return "CURAND_STATUS_ARCH_MISMATCH";
+
+    case CURAND_STATUS_INTERNAL_ERROR:
+      return "CURAND_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NVJPEGAPI
+// nvJPEG API errors: maps a status to its enumerator name, else "<unknown>".
+static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
+  switch (error) {
+    case NVJPEG_STATUS_SUCCESS:
+      return "NVJPEG_STATUS_SUCCESS";
+
+    case NVJPEG_STATUS_NOT_INITIALIZED:
+      return "NVJPEG_STATUS_NOT_INITIALIZED";
+
+    case NVJPEG_STATUS_INVALID_PARAMETER:
+      return "NVJPEG_STATUS_INVALID_PARAMETER";
+
+    case NVJPEG_STATUS_BAD_JPEG:
+      return "NVJPEG_STATUS_BAD_JPEG";
+
+    case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
+      return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
+
+    case NVJPEG_STATUS_ALLOCATOR_FAILURE:
+      return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
+
+    case NVJPEG_STATUS_EXECUTION_FAILED:
+      return "NVJPEG_STATUS_EXECUTION_FAILED";
+
+    case NVJPEG_STATUS_ARCH_MISMATCH:
+      return "NVJPEG_STATUS_ARCH_MISMATCH";
+
+    case NVJPEG_STATUS_INTERNAL_ERROR:
+      return "NVJPEG_STATUS_INTERNAL_ERROR";
+  }
+
+  return "<unknown>";
+}
+#endif
+
+#ifdef NV_NPPIDEFS_H
+// NPP API errors: maps a status to a name string, else "<unknown>".
+static const char *_cudaGetErrorEnum(NppStatus error) {
+  switch (error) {
+    case NPP_NOT_SUPPORTED_MODE_ERROR:
+      return "NPP_NOT_SUPPORTED_MODE_ERROR";
+
+    case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_RESIZE_NO_OPERATION_ERROR:
+      return "NPP_RESIZE_NO_OPERATION_ERROR";
+
+    case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
+      return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_BAD_ARG_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFF_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECT_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUAD_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+
+    case NPP_MEM_ALLOC_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_INPUT:
+      return "NPP_INVALID_INPUT";
+
+    case NPP_POINTER_ERROR:
+      return "NPP_POINTER_ERROR";
+
+    case NPP_WARNING:
+      return "NPP_WARNING";
+
+    case NPP_ODD_ROI_WARNING:
+      return "NPP_ODD_ROI_WARNING";
+#else
+
+    // These are for CUDA 5.5 or higher
+    case NPP_BAD_ARGUMENT_ERROR:
+      return "NPP_BAD_ARGUMENT_ERROR";
+
+    case NPP_COEFFICIENT_ERROR:
+      return "NPP_COEFFICIENT_ERROR";
+
+    case NPP_RECTANGLE_ERROR:
+      return "NPP_RECTANGLE_ERROR";
+
+    case NPP_QUADRANGLE_ERROR:
+      return "NPP_QUADRANGLE_ERROR";
+    // NOTE(review): enumerator ends in _ERR but the string in _ERROR - confirm.
+    case NPP_MEMORY_ALLOCATION_ERR:
+      return "NPP_MEMORY_ALLOCATION_ERROR";
+
+    case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_INVALID_HOST_POINTER_ERROR:
+      return "NPP_INVALID_HOST_POINTER_ERROR";
+
+    case NPP_INVALID_DEVICE_POINTER_ERROR:
+      return "NPP_INVALID_DEVICE_POINTER_ERROR";
+#endif
+
+    case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
+      return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
+
+    case NPP_TEXTURE_BIND_ERROR:
+      return "NPP_TEXTURE_BIND_ERROR";
+
+    case NPP_WRONG_INTERSECTION_ROI_ERROR:
+      return "NPP_WRONG_INTERSECTION_ROI_ERROR";
+
+    case NPP_NOT_EVEN_STEP_ERROR:
+      return "NPP_NOT_EVEN_STEP_ERROR";
+
+    case NPP_INTERPOLATION_ERROR:
+      return "NPP_INTERPOLATION_ERROR";
+
+    case NPP_RESIZE_FACTOR_ERROR:
+      return "NPP_RESIZE_FACTOR_ERROR";
+
+    case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
+      return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
+
+    case NPP_MEMFREE_ERR:
+      return "NPP_MEMFREE_ERR";
+
+    case NPP_MEMSET_ERR:
+      return "NPP_MEMSET_ERR";
+    // NOTE(review): unlike its _ERR siblings this returns the _ERROR spelling.
+    case NPP_MEMCPY_ERR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERR:
+      return "NPP_MIRROR_FLIP_ERR";
+#else
+
+    case NPP_MEMFREE_ERROR:
+      return "NPP_MEMFREE_ERROR";
+
+    case NPP_MEMSET_ERROR:
+      return "NPP_MEMSET_ERROR";
+
+    case NPP_MEMCPY_ERROR:
+      return "NPP_MEMCPY_ERROR";
+
+    case NPP_MIRROR_FLIP_ERROR:
+      return "NPP_MIRROR_FLIP_ERROR";
+#endif
+
+    case NPP_ALIGNMENT_ERROR:
+      return "NPP_ALIGNMENT_ERROR";
+
+    case NPP_STEP_ERROR:
+      return "NPP_STEP_ERROR";
+
+    case NPP_SIZE_ERROR:
+      return "NPP_SIZE_ERROR";
+
+    case NPP_NULL_POINTER_ERROR:
+      return "NPP_NULL_POINTER_ERROR";
+
+    case NPP_CUDA_KERNEL_EXECUTION_ERROR:
+      return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
+
+    case NPP_NOT_IMPLEMENTED_ERROR:
+      return "NPP_NOT_IMPLEMENTED_ERROR";
+
+    case NPP_ERROR:
+      return "NPP_ERROR";
+
+    case NPP_SUCCESS:
+      return "NPP_SUCCESS";
+
+    case NPP_WRONG_INTERSECTION_QUAD_WARNING:
+      return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
+
+    case NPP_MISALIGNED_DST_ROI_WARNING:
+      return "NPP_MISALIGNED_DST_ROI_WARNING";
+
+    case NPP_AFFINE_QUAD_INCORRECT_WARNING:
+      return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
+
+    case NPP_DOUBLE_SIZE_WARNING:
+      return "NPP_DOUBLE_SIZE_WARNING";
+
+    case NPP_WRONG_INTERSECTION_ROI_WARNING:
+      return "NPP_WRONG_INTERSECTION_ROI_WARNING";
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
+    /* These are 6.0 or higher */
+    case NPP_LUT_PALETTE_BITSIZE_ERROR:
+      return "NPP_LUT_PALETTE_BITSIZE_ERROR";
+
+    case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
+      return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
+
+    case NPP_QUALITY_INDEX_ERROR:
+      return "NPP_QUALITY_INDEX_ERROR";
+
+    case NPP_CHANNEL_ORDER_ERROR:
+      return "NPP_CHANNEL_ORDER_ERROR";
+
+    case NPP_ZERO_MASK_VALUE_ERROR:
+      return "NPP_ZERO_MASK_VALUE_ERROR";
+
+    case NPP_NUMBER_OF_CHANNELS_ERROR:
+      return "NPP_NUMBER_OF_CHANNELS_ERROR";
+
+    case NPP_COI_ERROR:
+      return "NPP_COI_ERROR";
+
+    case NPP_DIVISOR_ERROR:
+      return "NPP_DIVISOR_ERROR";
+
+    case NPP_CHANNEL_ERROR:
+      return "NPP_CHANNEL_ERROR";
+
+    case NPP_STRIDE_ERROR:
+      return "NPP_STRIDE_ERROR";
+
+    case NPP_ANCHOR_ERROR:
+      return "NPP_ANCHOR_ERROR";
+
+    case NPP_MASK_SIZE_ERROR:
+      return "NPP_MASK_SIZE_ERROR";
+
+    case NPP_MOMENT_00_ZERO_ERROR:
+      return "NPP_MOMENT_00_ZERO_ERROR";
+
+    case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
+      return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
+
+    case NPP_THRESHOLD_ERROR:
+      return "NPP_THRESHOLD_ERROR";
+
+    case NPP_CONTEXT_MATCH_ERROR:
+      return "NPP_CONTEXT_MATCH_ERROR";
+
+    case NPP_FFT_FLAG_ERROR:
+      return "NPP_FFT_FLAG_ERROR";
+
+    case NPP_FFT_ORDER_ERROR:
+      return "NPP_FFT_ORDER_ERROR";
+
+    case NPP_SCALE_RANGE_ERROR:
+      return "NPP_SCALE_RANGE_ERROR";
+
+    case NPP_DATA_TYPE_ERROR:
+      return "NPP_DATA_TYPE_ERROR";
+
+    case NPP_OUT_OFF_RANGE_ERROR:
+      return "NPP_OUT_OFF_RANGE_ERROR";
+
+    case NPP_DIVIDE_BY_ZERO_ERROR:
+      return "NPP_DIVIDE_BY_ZERO_ERROR";
+
+    case NPP_RANGE_ERROR:
+      return "NPP_RANGE_ERROR";
+
+    case NPP_NO_MEMORY_ERROR:
+      return "NPP_NO_MEMORY_ERROR";
+
+    case NPP_ERROR_RESERVED:
+      return "NPP_ERROR_RESERVED";
+
+    case NPP_NO_OPERATION_WARNING:
+      return "NPP_NO_OPERATION_WARNING";
+
+    case NPP_DIVIDE_BY_ZERO_WARNING:
+      return "NPP_DIVIDE_BY_ZERO_WARNING";
+#endif
+
+#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
+    /* These are 7.0 or higher */
+    case NPP_OVERFLOW_ERROR:
+      return "NPP_OVERFLOW_ERROR";
+
+    case NPP_CORRUPTED_DATA_ERROR:
+      return "NPP_CORRUPTED_DATA_ERROR";
+#endif
+  }
+
+  return "<unknown>";
+}
+#endif
+// Exits with a diagnostic when `result` is non-zero; `func` is the text of the
+// checked expression and `file`/`line` give the call site.
+template <typename T> void check(T result, char const *const func,
+                                 const char *const file, int const line) {
+  if (result) {
+    fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
+            static_cast<int>(result), _cudaGetErrorEnum(result), func);
+    exit(EXIT_FAILURE);
+  }
+}
+
+#ifdef __DRIVER_TYPES_H__
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+// This will output the proper error string when calling cudaGetLastError
+#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __getLastCudaError(const char *errorMessage, const char *file,
+                               const int line) {
+  // Nothing to do when cudaGetLastError() reports cudaSuccess.
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess) {
+    return;
+  }
+
+  fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(status),
+          cudaGetErrorString(status));
+  exit(EXIT_FAILURE);
+}
+
+// This will only print the proper error string when calling cudaGetLastError
+// but not exit program incase error detected.
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
+
+inline void __printLastCudaError(const char *errorMessage, const char *file,
+                                 const int line) {
+  // Report-only variant: logs the error and lets the program continue.
+  const cudaError_t status = cudaGetLastError();
+  if (status == cudaSuccess) {
+    return;
+  }
+
+  fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
+          file, line, errorMessage, static_cast<int>(status),
+          cudaGetErrorString(status));
+}
+#endif
+// MAX: larger of two values; arguments are parenthesized but evaluated twice.
+#ifndef MAX
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#endif
+
+// Rounds a float to the nearest int, with halfway cases going away from zero.
+inline int ftoi(float value) {
+  const double bias = (value >= 0) ? 0.5 : -0.5;
+  return static_cast<int>(value + bias);
+}
+
+// Beginning of GPU Architecture definitions
+inline int _ConvertSMVer2Cores(int major, int minor) {
+  // Lookup row: an SM version packed as 0xMm (hexadecimal, M = SM major
+  // version, m = SM minor version) and the cores per SM for that version.
+  struct SmCoreRow {
+    int sm;
+    int cores;
+  };
+
+  // The trailing {-1, -1} row is a sentinel marking the end of the table.
+  const SmCoreRow coreTable[] = {
+      {0x30, 192},
+      {0x32, 192},
+      {0x35, 192},
+      {0x37, 192},
+      {0x50, 128},
+      {0x52, 128},
+      {0x53, 128},
+      {0x60, 64},
+      {0x61, 128},
+      {0x62, 128},
+      {0x70, 64},
+      {0x72, 64},
+      {0x75, 64},
+      {0x80, 64},
+      {0x86, 128},
+      {0x87, 128},
+      {0x89, 128},
+      {0x90, 128},
+      {0xa0, 128},
+      {0xa1, 128},
+      {0xa3, 128},
+      {0xb0, 128},
+      {0xc0, 128},
+      {0xc1, 128},
+      {-1, -1}};
+
+  // Scan rows in order; `row` is left on the sentinel when nothing matches.
+  const int wanted = (major << 4) + minor;
+  int row = 0;
+  for (; coreTable[row].sm != -1; ++row) {
+    if (coreTable[row].sm == wanted) {
+      return coreTable[row].cores;
+    }
+  }
+
+  // Unknown SM version: report it and fall back to the row just before the
+  // sentinel, i.e. the last real entry of the table.
+  const int fallbackCores = coreTable[row - 1].cores;
+  printf(
+      "MapSMtoCores for SM %d.%d is undefined."
+      "  Default to use %d Cores/SM\n",
+      major, minor, fallbackCores);
+  return fallbackCores;
+}
+
+inline const char* _ConvertSMVer2ArchName(int major, int minor) {
+  // Lookup row: an SM version packed as 0xMm (hexadecimal, M = SM major
+  // version, m = SM minor version) and the architecture name for it.
+  struct SmNameRow {
+    int sm;
+    const char* name;
+  };
+
+  // The trailing row with sm == -1 is a sentinel marking the end of the table.
+  const SmNameRow nameTable[] = {
+      {0x30, "Kepler"},
+      {0x32, "Kepler"},
+      {0x35, "Kepler"},
+      {0x37, "Kepler"},
+      {0x50, "Maxwell"},
+      {0x52, "Maxwell"},
+      {0x53, "Maxwell"},
+      {0x60, "Pascal"},
+      {0x61, "Pascal"},
+      {0x62, "Pascal"},
+      {0x70, "Volta"},
+      {0x72, "Xavier"},
+      {0x75, "Turing"},
+      {0x80, "Ampere"},
+      {0x86, "Ampere"},
+      {0x87, "Ampere"},
+      {0x89, "Ada"},
+      {0x90, "Hopper"},
+      {0xa0, "Blackwell"},
+      {0xa1, "Blackwell"},
+      {0xa3, "Blackwell"},
+      {0xb0, "Blackwell"},
+      {0xc0, "Blackwell"},
+      {0xc1, "Blackwell"},
+      {-1, "Graphics Device"}};
+
+  // Scan rows in order; `row` is left on the sentinel when nothing matches.
+  const int wanted = (major << 4) + minor;
+  int row = 0;
+  for (; nameTable[row].sm != -1; ++row) {
+    if (nameTable[row].sm == wanted) {
+      return nameTable[row].name;
+    }
+  }
+
+  // Unknown SM version: report it and fall back to the row just before the
+  // sentinel. NOTE(review): the sentinel's "Graphics Device" is never returned.
+  const char* fallbackName = nameTable[row - 1].name;
+  printf(
+      "MapSMtoArchName for SM %d.%d is undefined."
+      "  Default to use %s\n",
+      major, minor, fallbackName);
+  return fallbackName;
+}
+  // end of GPU Architecture definitions
+
+#ifdef __CUDA_RUNTIME_H__
+// General GPU Device CUDA Initialization: makes `devID` the current device.
+// @param devID  requested device index; a negative value is treated as 0.
+// @return devID on success; -devID if it is past the last device index; -1 if
+//         the device is in prohibited compute mode. Exits the process when no
+//         device exists, the device reports major < 1, or a CUDA call fails.
+inline int gpuDeviceInit(int devID) {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuDeviceInit() CUDA error: "
+            "no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+  if (devID < 0) {
+    devID = 0;
+  }
+  if (devID > device_count - 1) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
+            device_count);
+    fprintf(stderr,
+            ">> gpuDeviceInit (-device=%d) is not a valid"
+            " GPU device. <<\n",
+            devID);
+    fprintf(stderr, "\n");
+    return -devID;
+  }
+
+  int computeMode = -1, major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+  if (computeMode == cudaComputeModeProhibited) {
+    fprintf(stderr,
+            "Error: device is running in <Compute Mode "
+            "Prohibited>, no threads can use cudaSetDevice().\n");
+    return -1;
+  }
+  if (major < 1) {
+    fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  checkCudaErrors(cudaSetDevice(devID));
+  // The architecture name is printed inside a matched pair of quotes.
+  printf("gpuDeviceInit() CUDA Device [%d]: \"%s\"\n", devID, _ConvertSMVer2ArchName(major, minor));
+  return devID;
+}
+
+// Returns the index of the device with the largest estimated throughput
+// (SM count * cores per SM * clock rate), ignoring devices whose compute mode
+// is prohibited. Exits when no device exists or every device is prohibited.
+inline int gpuGetMaxGflopsDeviceId() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+  if (device_count == 0) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int bestDevice = 0;
+  uint64_t bestScore = 0;
+  int prohibitedCount = 0;
+
+  // Score devices in index order; the strict '>' below keeps the first best.
+  for (int current_device = 0; current_device < device_count;
+       ++current_device) {
+    int computeMode = -1, major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode,
+        cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&major,
+        cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor,
+        cudaDevAttrComputeCapabilityMinor, current_device));
+
+    // A device in prohibited compute mode is counted but never scored.
+    if (computeMode == cudaComputeModeProhibited) {
+      ++prohibitedCount;
+      continue;
+    }
+
+    // Compute capability 9999.9999 gets one core per SM without consulting
+    // the SM-version table.
+    const bool placeholderVersion = (major == 9999 && minor == 9999);
+    const int coresPerSm =
+        placeholderVersion ? 1 : _ConvertSMVer2Cores(major, minor);
+
+    int multiProcessorCount = 0, clockRate = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&multiProcessorCount,
+        cudaDevAttrMultiProcessorCount, current_device));
+    const cudaError_t clockStatus = cudaDeviceGetAttribute(
+        &clockRate, cudaDevAttrClockRate, current_device);
+    if (clockStatus == cudaErrorInvalidValue) {
+      // If cudaDevAttrClockRate attribute is not supported we
+      // set clockRate as 1, to consider GPU with most SMs and CUDA Cores.
+      clockRate = 1;
+    } else if (clockStatus != cudaSuccess) {
+      fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \n", __FILE__, __LINE__,
+        static_cast<unsigned int>(clockStatus), _cudaGetErrorEnum(clockStatus));
+      exit(EXIT_FAILURE);
+    }
+
+    const uint64_t score =
+        (uint64_t)multiProcessorCount * coresPerSm * clockRate;
+    if (score > bestScore) {
+      bestScore = score;
+      bestDevice = current_device;
+    }
+  }
+
+  if (prohibitedCount == device_count) {
+    fprintf(stderr,
+            "gpuGetMaxGflopsDeviceId() CUDA error:"
+            " all devices have compute mode prohibited.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return bestDevice;
+}
+
+// Selects and initializes the CUDA device for a sample: the id given with
+// "device=" when the "device" flag is present, else the max-GFLOPS device.
+inline int findCudaDevice(int argc, const char **argv) {
+  if (!checkCmdLineFlag(argc, argv, "device")) {
+    // No explicit request: pick the device with highest Gflops/s
+    const int devID = gpuGetMaxGflopsDeviceId();
+    checkCudaErrors(cudaSetDevice(devID));
+
+    // Announce the chosen device and its compute capability
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           devID, _ConvertSMVer2ArchName(major, minor), major, minor);
+    return devID;
+  }
+
+  // A device number was specified on the command line; it must be >= 0
+  int devID = getCmdLineArgumentInt(argc, argv, "device=");
+  if (devID < 0) {
+    printf("Invalid command line parameter\n ");
+    exit(EXIT_FAILURE);
+  }
+
+  // A negative id coming back from gpuDeviceInit is treated as failure
+  devID = gpuDeviceInit(devID);
+  if (devID < 0) {
+    printf("exiting...\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return devID;
+}
+
+// Makes the first integrated, non-prohibited CUDA device current and returns it.
+inline int findIntegratedGPU() {
+  int device_count = 0;
+  checkCudaErrors(cudaGetDeviceCount(&device_count));
+
+  if (device_count == 0) {
+    fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  int num_rejected = 0;
+  for (int current_device = 0; current_device < device_count; ++current_device) {
+    int computeMode = -1, integrated = -1;
+    checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
+
+    // Count and skip devices that are not integrated or are compute-prohibited
+    const bool usable = integrated && (computeMode != cudaComputeModeProhibited);
+    if (!usable) {
+      ++num_rejected;
+      continue;
+    }
+
+    // First usable device wins: make it current and report it
+    checkCudaErrors(cudaSetDevice(current_device));
+
+    int major = 0, minor = 0;
+    checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
+    checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
+    printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
+           current_device, _ConvertSMVer2ArchName(major, minor), major, minor);
+
+    return current_device;
+  }
+
+  // Every device was rejected: no candidate for GLES-CUDA interop
+  if (num_rejected == device_count) {
+    fprintf(stderr,
+            "CUDA error:"
+            " No GLES-CUDA Interop capable GPU found.\n");
+    exit(EXIT_FAILURE);
+  }
+
+  return -1;
+}
+
+// Checks that the current device's compute capability is at least
+// major_version.minor_version, printing the result either way.
+inline bool checkCudaCapabilities(int major_version, int minor_version) {
+  int dev;
+  checkCudaErrors(cudaGetDevice(&dev));
+
+  int major = 0, minor = 0;
+  checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
+  checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
+
+  const bool newer_major = (major > major_version);
+  const bool same_major_ok = (major == major_version) && (minor >= minor_version);
+  if (!newer_major && !same_major_ok) {
+    printf(
+        "  No GPU device was found that can support "
+        "CUDA compute capability %d.%d.\n",
+        major_version, minor_version);
+    return false;
+  }
+  printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
+         _ConvertSMVer2ArchName(major, minor), major, minor);
+  return true;
+}
+#endif
+
+  // end of CUDA Helper Functions
+
+#endif  // COMMON_HELPER_CUDA_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_functions.h b/src/cuda/HPC/image/recursiveGaussian/helper_functions.h
new file mode 100644
index 000000000..f71da5761
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_functions.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing,
+// timers, image helpers, etc)
+#ifndef COMMON_HELPER_FUNCTIONS_H_
+#define COMMON_HELPER_FUNCTIONS_H_
+
+#ifdef WIN32
+#pragma warning(disable : 4996)
+#endif
+
+// includes, project
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// includes, timer, string parsing, image helpers
+#include "helper_image.h"  // helper functions for image compare, dump, data comparisons
+#include "helper_string.h"  // helper functions for string parsing
+#include "helper_timer.h"   // helper functions for timers
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#endif  // COMMON_HELPER_FUNCTIONS_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_gl.h b/src/cuda/HPC/image/recursiveGaussian/helper_gl.h
new file mode 100644
index 000000000..85c075595
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_gl.h
@@ -0,0 +1,267 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (OpenGL)
+#ifndef HELPER_GL_H
+#define HELPER_GL_H
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    #include <GL/glew.h>
+#endif
+
+#if defined(__APPLE__) || defined(MACOSX)
+    #include <OpenGL/gl.h>
+#else
+    #include <GL/gl.h>
+    #ifdef __linux__
+    #include <GL/glx.h>
+    #endif /* __linux__ */
+#endif
+
+#include <iostream>
+#include <cstdio>
+#include <string>
+#include <sstream>
+#include <algorithm>
+#include <iterator>
+#include <vector>
+#include <assert.h>
+
+
+/* Prototypes, plus (Linux only) GL entry points looked up by name via glXGetProcAddress */
+namespace __HelperGL {
+    static int isGLVersionSupported(unsigned reqMajor, unsigned reqMinor);
+    static int areGLExtensionsSupported(const std::string &);
+#ifdef __linux__
+
+    #ifndef HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION /* default: define each pointer here, initialized by name lookup */
+    #define USE_GL_FUNC(name, proto) proto name = (proto) glXGetProcAddress ((const GLubyte *)#name)
+    #else /* HELPERGL_EXTERN_GL_FUNC_IMPLEMENTATION defined: only declare the pointers extern */
+    #define USE_GL_FUNC(name, proto) extern proto name
+    #endif
+
+    USE_GL_FUNC(glBindBuffer, PFNGLBINDBUFFERPROC);
+    USE_GL_FUNC(glDeleteBuffers, PFNGLDELETEBUFFERSPROC);
+    USE_GL_FUNC(glBufferData, PFNGLBUFFERDATAPROC);
+    USE_GL_FUNC(glBufferSubData, PFNGLBUFFERSUBDATAPROC);
+    USE_GL_FUNC(glGenBuffers, PFNGLGENBUFFERSPROC);
+    USE_GL_FUNC(glCreateProgram, PFNGLCREATEPROGRAMPROC);
+    USE_GL_FUNC(glBindProgramARB, PFNGLBINDPROGRAMARBPROC);
+    USE_GL_FUNC(glGenProgramsARB, PFNGLGENPROGRAMSARBPROC);
+    USE_GL_FUNC(glDeleteProgramsARB, PFNGLDELETEPROGRAMSARBPROC);
+    USE_GL_FUNC(glDeleteProgram, PFNGLDELETEPROGRAMPROC);
+    USE_GL_FUNC(glGetProgramInfoLog, PFNGLGETPROGRAMINFOLOGPROC);
+    USE_GL_FUNC(glGetProgramiv, PFNGLGETPROGRAMIVPROC);
+    USE_GL_FUNC(glProgramParameteriEXT, PFNGLPROGRAMPARAMETERIEXTPROC);
+    USE_GL_FUNC(glProgramStringARB, PFNGLPROGRAMSTRINGARBPROC);
+    USE_GL_FUNC(glUnmapBuffer, PFNGLUNMAPBUFFERPROC);
+    USE_GL_FUNC(glMapBuffer, PFNGLMAPBUFFERPROC);
+    USE_GL_FUNC(glGetBufferParameteriv, PFNGLGETBUFFERPARAMETERIVPROC);
+    USE_GL_FUNC(glLinkProgram, PFNGLLINKPROGRAMPROC);
+    USE_GL_FUNC(glUseProgram, PFNGLUSEPROGRAMPROC);
+    USE_GL_FUNC(glAttachShader, PFNGLATTACHSHADERPROC);
+    USE_GL_FUNC(glCreateShader, PFNGLCREATESHADERPROC);
+    USE_GL_FUNC(glShaderSource, PFNGLSHADERSOURCEPROC);
+    USE_GL_FUNC(glCompileShader, PFNGLCOMPILESHADERPROC);
+    USE_GL_FUNC(glDeleteShader, PFNGLDELETESHADERPROC);
+    USE_GL_FUNC(glGetShaderInfoLog, PFNGLGETSHADERINFOLOGPROC);
+    USE_GL_FUNC(glGetShaderiv, PFNGLGETSHADERIVPROC);
+    USE_GL_FUNC(glUniform1i, PFNGLUNIFORM1IPROC);
+    USE_GL_FUNC(glUniform1f, PFNGLUNIFORM1FPROC);
+    USE_GL_FUNC(glUniform2f, PFNGLUNIFORM2FPROC);
+    USE_GL_FUNC(glUniform3f, PFNGLUNIFORM3FPROC);
+    USE_GL_FUNC(glUniform4f, PFNGLUNIFORM4FPROC);
+    USE_GL_FUNC(glUniform1fv, PFNGLUNIFORM1FVPROC);
+    USE_GL_FUNC(glUniform2fv, PFNGLUNIFORM2FVPROC);
+    USE_GL_FUNC(glUniform3fv, PFNGLUNIFORM3FVPROC);
+    USE_GL_FUNC(glUniform4fv, PFNGLUNIFORM4FVPROC);
+    USE_GL_FUNC(glUniformMatrix4fv, PFNGLUNIFORMMATRIX4FVPROC);
+    USE_GL_FUNC(glSecondaryColor3fv, PFNGLSECONDARYCOLOR3FVPROC);
+    USE_GL_FUNC(glGetUniformLocation, PFNGLGETUNIFORMLOCATIONPROC);
+    USE_GL_FUNC(glGenFramebuffersEXT, PFNGLGENFRAMEBUFFERSEXTPROC);
+    USE_GL_FUNC(glBindFramebufferEXT, PFNGLBINDFRAMEBUFFEREXTPROC);
+    USE_GL_FUNC(glDeleteFramebuffersEXT, PFNGLDELETEFRAMEBUFFERSEXTPROC);
+    USE_GL_FUNC(glCheckFramebufferStatusEXT, PFNGLCHECKFRAMEBUFFERSTATUSEXTPROC);
+    USE_GL_FUNC(glGetFramebufferAttachmentParameterivEXT, PFNGLGETFRAMEBUFFERATTACHMENTPARAMETERIVEXTPROC);
+    USE_GL_FUNC(glFramebufferTexture1DEXT, PFNGLFRAMEBUFFERTEXTURE1DEXTPROC);
+    USE_GL_FUNC(glFramebufferTexture2DEXT, PFNGLFRAMEBUFFERTEXTURE2DEXTPROC);
+    USE_GL_FUNC(glFramebufferTexture3DEXT, PFNGLFRAMEBUFFERTEXTURE3DEXTPROC);
+    USE_GL_FUNC(glGenerateMipmapEXT, PFNGLGENERATEMIPMAPEXTPROC);
+    USE_GL_FUNC(glGenRenderbuffersEXT, PFNGLGENRENDERBUFFERSEXTPROC);
+    USE_GL_FUNC(glDeleteRenderbuffersEXT, PFNGLDELETERENDERBUFFERSEXTPROC);
+    USE_GL_FUNC(glBindRenderbufferEXT, PFNGLBINDRENDERBUFFEREXTPROC);
+    USE_GL_FUNC(glRenderbufferStorageEXT, PFNGLRENDERBUFFERSTORAGEEXTPROC);
+    USE_GL_FUNC(glFramebufferRenderbufferEXT, PFNGLFRAMEBUFFERRENDERBUFFEREXTPROC);
+    USE_GL_FUNC(glClampColorARB, PFNGLCLAMPCOLORARBPROC);
+    USE_GL_FUNC(glBindFragDataLocationEXT, PFNGLBINDFRAGDATALOCATIONEXTPROC);
+
+#if !defined(GLX_EXTENSION_NAME) || !defined(GL_VERSION_1_3)
+    USE_GL_FUNC(glActiveTexture, PFNGLACTIVETEXTUREPROC);
+    USE_GL_FUNC(glClientActiveTexture, PFNGLACTIVETEXTUREPROC); /* NOTE(review): reuses the glActiveTexture pointer type - confirm this is intended */
+#endif
+
+    #undef USE_GL_FUNC
+#endif /*__linux__ */
+}
+
+
+namespace __HelperGL {
+    namespace __Int {
+        /* Split a string into its whitespace-separated tokens */
+        static std::vector<std::string> split(const std::string &str)
+        {
+            std::istringstream ss(str);
+            std::istream_iterator<std::string> it(ss);
+            return std::vector<std::string> (it, std::istream_iterator<std::string>());
+        }
+
+        /* Sort the vector passed by reference */
+        template<typename T> static inline void sort(std::vector<T> &a)
+        {
+            std::sort(a.begin(), a.end());
+        }
+
+        /* Order-insensitive comparison: nonzero if both vectors hold the same elements, else 0 */
+        template<typename T> static int equals(std::vector<T> a, std::vector<T> b)
+        {
+            if (a.size() != b.size()) return 0;
+            sort(a);
+            sort(b);
+            return std::equal(a.begin(), a.end(), b.begin());
+        }
+
+        /* Elements present in both vectors, sorted; the inserter is deduced from rc so any T works */
+        template<typename T> static std::vector<T> getIntersection(std::vector<T> a, std::vector<T> b)
+        {
+            sort(a);
+            sort(b);
+            std::vector<T> rc;
+            std::set_intersection(a.begin(), a.end(), b.begin(), b.end(), std::back_inserter(rc));
+            return rc;
+        }
+
+        /* Tokens of glGetString(GL_EXTENSIONS); an empty list when that call returns NULL */
+        static std::vector<std::string> getGLExtensions()
+        {
+            const char *extensionsStr = (const char *)glGetString(GL_EXTENSIONS);
+            return extensionsStr ? split (extensionsStr) : std::vector<std::string>();
+        }
+    }
+
+    /* Nonzero when every whitespace-separated name in `extensions` is among the reported GL extensions */
+    static int areGLExtensionsSupported(const std::string &extensions)
+    {
+        const std::vector<std::string> available = __Int::getGLExtensions();
+        const std::vector<std::string> wanted = __Int::split(extensions);
+
+        /* All requested names are supported iff their intersection with `available` equals the request */
+        return __Int::equals(__Int::getIntersection(available, wanted), wanted);
+    }
+
+    /* Nonzero when GL_VERSION reports at least reqMajor.reqMinor; 0 if the string is missing or unparsable */
+    static int isGLVersionSupported(unsigned reqMajor, unsigned reqMinor)
+    {
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        if (glewInit() != GLEW_OK)
+        {
+            std::cerr << "glewInit() failed!" << std::endl;
+            return 0;
+        }
+#endif
+        const char *versionStr = (const char *) glGetString (GL_VERSION);
+        if (versionStr == NULL) return 0;  /* constructing std::string from NULL is undefined */
+        std::stringstream stream ((std::string (versionStr)));
+        unsigned major = 0, minor = 0;
+        char dot = 0;
+        /* Expect "<major>.<minor>"; a failed extraction or wrong separator means unsupported */
+        if (!(stream >> major >> dot >> minor) || dot != '.') return 0;
+        return major > reqMajor || (major == reqMajor && minor >= reqMinor);
+    }
+
+    /* Name of the GL error constant matching err, or "*UNKNOWN*" for any other value */
+    static inline const char* glErrorToString(GLenum err)
+    {
+        /* #code stringizes the constant's spelling; the comparison uses its value */
+#define GL_ERROR_NAME_IF(code) if (err == code) return #code
+
+        GL_ERROR_NAME_IF(GL_NO_ERROR);
+        GL_ERROR_NAME_IF(GL_INVALID_ENUM);
+        GL_ERROR_NAME_IF(GL_INVALID_VALUE);
+        GL_ERROR_NAME_IF(GL_INVALID_OPERATION);
+        GL_ERROR_NAME_IF(GL_OUT_OF_MEMORY);
+        GL_ERROR_NAME_IF(GL_STACK_UNDERFLOW);
+        GL_ERROR_NAME_IF(GL_STACK_OVERFLOW);
+#ifdef GL_INVALID_FRAMEBUFFER_OPERATION
+        GL_ERROR_NAME_IF(GL_INVALID_FRAMEBUFFER_OPERATION);
+#endif
+
+#undef GL_ERROR_NAME_IF
+        return "*UNKNOWN*";
+    }
+
+////////////////////////////////////////////////////////////////////////////
+//! Query glGetError() once and report a pending error on stderr
+//! @return true if no GL error was pending, otherwise false
+//! @param file  file name to print, normally the __FILE__ macro
+//! @param line  line number to print, normally the __LINE__ macro
+//! @note The error name comes from glErrorToString()
+//! @note This function should be used via the SDK_CHECK_ERROR_GL() macro
+////////////////////////////////////////////////////////////////////////////
+    inline bool sdkCheckErrorGL(const char *file, const int line)
+    {
+        // a single query; GL_NO_ERROR means there is nothing to report
+        const GLenum pending = glGetError();
+        if (pending == GL_NO_ERROR)
+        {
+            return true;
+        }
+
+        // resolve the name once; every message below prints it
+        const char *name = glErrorToString(pending);
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+        // NOTE: "%s(%i) : " lets Visual Studio jump straight to the file and line
+        // when the user double clicks the message in the Output pane.
+        char vsMessage[512];
+        sprintf_s(vsMessage, 255, "\n%s(%i) : GL Error : %s\n\n", file, line, name);
+        fprintf(stderr, "%s", vsMessage);
+#endif
+        fprintf(stderr, "GL Error in file '%s' in line %d :\n", file, line);
+        fprintf(stderr, "%s\n", name);
+
+        return false;
+    }
+
+#define SDK_CHECK_ERROR_GL()                                              \
+    do { /* one statement, so it is safe inside an unbraced if/else */    \
+        if (false == sdkCheckErrorGL( __FILE__, __LINE__)) exit(EXIT_FAILURE); \
+    } while (0)
+
+} /* of namespace __HelperGL*/
+
+using namespace __HelperGL;
+
+#endif /*HELPER_GL_H*/
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_image.h b/src/cuda/HPC/image/recursiveGaussian/helper_image.h
new file mode 100644
index 000000000..7e81b0cd8
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_image.h
@@ -0,0 +1,1001 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (image,bitmap)
+#ifndef COMMON_HELPER_IMAGE_H_
+#define COMMON_HELPER_IMAGE_H_
+
+#include <assert.h>
+#include "exception.h"
+#include <math.h>
+#include <stdint.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#ifndef MIN
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))  // operands parenthesized so expression arguments group correctly
+#endif
+#ifndef MAX
+#define MAX(a, b) (((a) > (b)) ? (a) : (b))  // note: the selected argument is still evaluated twice
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#include "helper_string.h"
+
+// Internal helpers for image I/O (a named namespace, not an anonymous one)
+namespace helper_image_internal {
+//! size of the buffer used to read each PGM/PPM header line
+const unsigned int PGMHeaderSize = 0x40;
+
+// types
+
+//! Data converter from unsigned char / unsigned byte to type T
+template <class T>
+struct ConverterFromUByte;
+
+//! Data converter from unsigned char / unsigned byte to unsigned char
+template <>
+struct ConverterFromUByte<unsigned char> {
+  //! Conversion operator (a passthru; returns the byte type, not float)
+  //! @return the input value, unchanged
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) {
+    return val;
+  }
+};
+
+//! Data converter from unsigned char / unsigned byte to float
+template <>
+struct ConverterFromUByte<float> {
+  //! Conversion operator
+  //! @return val divided by 255, i.e. [0, 255] mapped to [0.0, 1.0]
+  //! @param  val  value to convert
+  float operator()(const unsigned char &val) {
+    return static_cast<float>(val) / 255.0f;
+  }
+};
+
+//! Data converter from type T to unsigned char / unsigned byte
+template <class T>
+struct ConverterToUByte;
+
+//! Data converter from unsigned char to unsigned char / unsigned byte
+template <>
+struct ConverterToUByte<unsigned char> {
+  //! Conversion operator (essentially a passthru)
+  //! @return converted value
+  //! @param  val  value to convert
+  unsigned char operator()(const unsigned char &val) { return val; }
+};
+
+//! Data converter from float to unsigned char / unsigned byte
+template <>
+struct ConverterToUByte<float> {
+  //! Conversion operator
+  //! @return val * 255 cast to a byte (no clamping or rounding is applied)
+  //! @param  val  value to convert
+  unsigned char operator()(const float &val) {
+    return static_cast<unsigned char>(val * 255.0f);
+  }
+};
+}  // namespace helper_image_internal
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)  // Windows: wrap the *_s CRT variants
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)  // evaluates to fopen_s's return value
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)  // a nonzero FOPEN result is treated as failure
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#else  // other platforms: wrap standard fopen / sscanf
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))  // evaluates to the assigned FILE pointer
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)  // a NULL FOPEN result is treated as failure
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#endif  // FOPEN / FOPEN_FAIL / SSCANF platform wrappers
+
+//! Load a binary PGM ("P5") or PPM ("P6") file as 8-bit samples. If *data is NULL a buffer is
+//! malloc'ed and *w / *h are set; otherwise *w and *h must match the file. False on any failure.
+inline bool __loadPPM(const char *file, unsigned char **data, unsigned int *w,
+                      unsigned int *h, unsigned int *channels) {
+  FILE *fp = NULL;
+
+  if (FOPEN_FAIL(FOPEN(fp, file, "rb"))) {
+    std::cerr << "__LoadPPM() : Failed to open file: " << file << std::endl;
+    return false;
+  }
+
+  // check header
+  char header[helper_image_internal::PGMHeaderSize];
+  if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+    std::cerr << "__LoadPPM() : reading PGM header returned NULL" << std::endl;
+    fclose(fp);
+    return false;
+  }
+
+  if (strncmp(header, "P5", 2) == 0) {
+    *channels = 1;
+  } else if (strncmp(header, "P6", 2) == 0) {
+    *channels = 3;
+  } else {
+    std::cerr << "__LoadPPM() : File is not a PPM or PGM image" << std::endl;
+    *channels = 0;
+    fclose(fp);
+    return false;
+  }
+
+  // parse header, read width, height and maxval (i counts values parsed)
+  unsigned int width = 0, height = 0, maxval = 0;
+  unsigned int i = 0;
+
+  while (i < 3) {
+    if (fgets(header, helper_image_internal::PGMHeaderSize, fp) == NULL) {
+      std::cerr << "__LoadPPM() : reading PGM header returned NULL"
+                << std::endl;
+      fclose(fp);
+      return false;
+    }
+    if (header[0] == '#') continue;  // comment line
+    int parsed = 0;
+    if (i == 0) {
+      parsed = SSCANF(header, "%u %u %u", &width, &height, &maxval);
+    } else if (i == 1) {
+      parsed = SSCANF(header, "%u %u", &height, &maxval);
+    } else {
+      parsed = SSCANF(header, "%u", &maxval);
+    }
+    if (parsed > 0) i += parsed;  // never add a negative (EOF) result to the unsigned counter
+  }
+
+  const size_t size = (size_t)width * height * *channels;
+  const bool allocated_here = (NULL == *data);
+
+  if (allocated_here) {
+    *data = (unsigned char *)malloc(size);
+    *w = width;
+    *h = height;
+  } else if (*w != width || *h != height) {
+    // a caller-supplied buffer with other dimensions must not be written to
+    std::cerr << "__LoadPPM() : Invalid image dimensions." << std::endl;
+    fclose(fp);
+    return false;
+  }
+
+  // read and close file; a short read means the pixel data is truncated
+  const bool ok = (*data != NULL) && fread(*data, 1, size, fp) == size;
+  fclose(fp);
+  if (!ok) {
+    std::cerr << "__LoadPPM() read data returned error." << std::endl;
+    if (allocated_here) { free(*data); *data = NULL; }  // do not hand back a partial buffer
+  }
+  return ok;
+}
+
+//! Load an image file through __loadPPM and convert each byte to T.
+//! *data is malloc'ed here when it is NULL; otherwise it is written in place.
+template <class T>
+inline bool sdkLoadPGM(const char *file, T **data, unsigned int *w,
+                       unsigned int *h) {
+  unsigned int channels;
+  unsigned char *raw = NULL;
+
+  const bool loaded = __loadPPM(file, &raw, w, h, &channels);
+  if (!loaded) {
+    return false;
+  }
+
+  const unsigned int count = *w * *h * channels;
+
+  // allocate the destination only when the caller did not supply one
+  if (*data == NULL) {
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * count));
+  }
+
+  // element-wise conversion from bytes to T, then drop the byte buffer
+  helper_image_internal::ConverterFromUByte<T> convert;
+  std::transform(raw, raw + count, *data, convert);
+  free(raw);
+  return true;
+}
+
+//! Load a 3-channel PPM file and expand each pixel to 4 bytes (4th byte = 0).
+//! Returns false if loading fails, the file is not 3-channel, or malloc fails.
+template <class T>
+inline bool sdkLoadPPM4(const char *file, T **data, unsigned int *w,
+                        unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  // the copy loop reads 3 bytes per pixel, so any other layout is rejected
+  if (!__loadPPM(file, &idata, w, h, &channels) || channels != 3) {
+    free(idata);
+    return false;
+  }
+
+  // pad 4th component
+  const size_t size = (size_t)*w * *h;
+  *data = reinterpret_cast<T *>(malloc(sizeof(T) * size * 4));
+  if (*data == NULL) { free(idata); return false; }
+  unsigned char *ptr = *data;
+  const unsigned char *src = idata;
+  for (size_t i = 0; i < size; i++) {
+    *ptr++ = *src++;
+    *ptr++ = *src++;
+    *ptr++ = *src++;
+    *ptr++ = 0;
+  }
+  free(idata);
+  return true;
+}
+
+//! Write 8-bit data as a binary PGM (channels == 1) or PPM (channels == 3) file.
+//! @return false for any other channel count, or if opening or writing fails
+inline bool __savePPM(const char *file, unsigned char *data, unsigned int w,
+                      unsigned int h, unsigned int channels) {
+  assert(NULL != data);
+  assert(w > 0);
+  assert(h > 0);
+
+  std::fstream fh(file, std::fstream::out | std::fstream::binary);
+
+  // a failed open sets failbit, which bad() does not report
+  if (!fh.is_open()) {
+    std::cerr << "__savePPM() : Opening file failed." << std::endl;
+    return false;
+  }
+
+  if (channels == 1) {
+    fh << "P5\n";
+  } else if (channels == 3) {
+    fh << "P6\n";
+  } else {
+    std::cerr << "__savePPM() : Invalid number of channels." << std::endl;
+    return false;
+  }
+
+  fh << w << "\n" << h << "\n" << 0xff << std::endl;
+
+  // one block write of the w * h * channels sample bytes
+  fh.write(reinterpret_cast<const char *>(data),
+           static_cast<std::streamsize>(w) * h * channels);
+  fh.flush();
+
+  if (fh.fail()) {
+    std::cerr << "__savePPM() : Writing data failed." << std::endl;
+    return false;
+  }
+  fh.close();
+  return true;
+}
+
+//! Convert w * h elements of type T to bytes and write them as a PGM file.
+template <class T>
+inline bool sdkSavePGM(const char *file, T *data, unsigned int w,
+                       unsigned int h) {
+  const unsigned int count = w * h;
+  unsigned char *bytes =
+      (unsigned char *)malloc(sizeof(unsigned char) * count);
+
+  // element-wise conversion, then a single-channel save
+  helper_image_internal::ConverterToUByte<T> convert;
+  std::transform(data, data + count, bytes, convert);
+  const bool saved = __savePPM(file, bytes, w, h, 1);
+
+  // the temporary byte buffer is released whether or not the save worked
+  free(bytes);
+  return saved;
+}
+
+//! Save a 4-byte-per-pixel image as a 3-channel PPM, dropping every 4th byte.
+inline bool sdkSavePPM4ub(const char *file, unsigned char *data, unsigned int w,
+                          unsigned int h) {
+  const int pixels = w * h;
+  unsigned char *packed =
+      (unsigned char *)malloc(sizeof(unsigned char) * pixels * 3);
+  // copy bytes 0..2 of each 4-byte pixel; byte 3 is skipped
+  const unsigned char *src = data;
+  unsigned char *dst = packed;
+  for (int p = 0; p < pixels; ++p, src += 4, dst += 3) {
+    dst[0] = src[0];
+    dst[1] = src[1];
+    dst[2] = src[2];
+  }
+
+  const bool saved = __savePPM(file, packed, w, h, 3);
+  free(packed);
+  return saved;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read whitespace-separated values from text file \filename
+//! @return true if reading the file succeeded, otherwise false
+//! @param filename name of the source file
+//! @param data  *data == NULL: a buffer is malloc'ed and returned here;
+//!        otherwise the existing buffer is filled and *len must match
+//! @param len  number of data elements in data
+//! @param verbose  not used by this function
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFile(const char *filename, T **data, unsigned int *len,
+                        bool verbose) {
+  // check input arguments
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // intermediate storage for the data read
+  std::vector<T> data_read;
+
+  // open file for reading
+  FILE *fh = NULL;
+
+  // check if filestream is valid
+  if (FOPEN_FAIL(FOPEN(fh, filename, "r"))) {
+    printf("Unable to open input file: %s\n", filename);
+    return false;
+  }
+
+  // read all data elements; the loop ends at EOF or at the first token that
+  // does not parse. NOTE(review): "%f" stores a float, so presumably T is float
+  T token;
+  while (fscanf(fh, "%f", &token) == 1) {
+    data_read.push_back(token);
+  }
+
+  fclose(fh);
+
+  // check if the given handle is already initialized
+  if (NULL != *data) {
+    if (*len != data_read.size()) {
+      std::cerr << "sdkReadFile() : Initialized memory given but "
+                << "size  mismatch with signal read "
+                << "(data read / data init = " << (unsigned int)data_read.size()
+                << " / " << *len << ")" << std::endl;
+
+      return false;
+    }
+  } else {
+    // allocate storage for the data read
+    *data = reinterpret_cast<T *>(malloc(sizeof(T) * data_read.size()));
+    // store signal size
+    *len = static_cast<unsigned int>(data_read.size());
+  }
+
+  // copy data; front() is only valid on a non-empty vector
+  if (!data_read.empty()) {
+    memcpy(*data, &data_read.front(), sizeof(T) * data_read.size());
+  }
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Read one block of binary file \filename into a newly malloc'ed buffer
+//! @return true if the file could be opened, otherwise false
+//! @param filename name of the source file
+//! @param data  array of block pointers; data[block_num] receives the buffer
+//! @param len  set to the number of T elements read
+//! @param block_num / block_size  block index and block size in bytes
+//! @param verbose  print a message when the file cannot be opened
+//////////////////////////////////////////////////////////////////////////////
+template <class T>
+inline bool sdkReadFileBlocks(const char *filename, T **data, unsigned int *len,
+                              unsigned int block_num, unsigned int block_size,
+                              bool verbose) {
+  assert(NULL != filename);
+  assert(NULL != len);
+
+  // open file for reading
+  FILE *fh = fopen(filename, "rb");
+
+  // a failed open is always an error; verbose only controls the message
+  if (fh == NULL) {
+    if (verbose) {
+      std::cerr << "sdkReadFile() : Opening file failed." << std::endl;
+    }
+    return false;
+  }
+
+  // allocate storage for the data read
+  data[block_num] = reinterpret_cast<T *>(malloc(block_size));
+
+  // read all data elements, starting at byte offset block_num * block_size
+  fseek(fh, (long)block_num * block_size, SEEK_SET);
+  *len = fread(data[block_num], sizeof(T), block_size / sizeof(T), fh);
+  fclose(fh);
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Write a data file \filename
+//! @return true if writing the file succeeded, otherwise false
+//! @param filename name of the destination file
+//! @param data     elements to write, each followed by a single space
+//! @param len      number of data elements in data
+//! @param epsilon  epsilon for comparison; written first, as a line of the
+//!                 form "# <epsilon>"
+//! @param verbose  if true, report progress and failures on std::cerr
+//! @param append   if true, add to the end of an existing file; if false
+//!                 (default), replace its contents
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool sdkWriteFile(const char *filename, const T *data, unsigned int len,
+                         const S epsilon, bool verbose, bool append = false) {
+  assert(NULL != filename);
+  assert(NULL != data);
+
+  // The original ignored `append`: it always opened with out|ate, which
+  // truncates an existing file even though the log line below says
+  // "write/append". Pick the mode from the flag instead; the default
+  // (append == false) still replaces the file exactly as before.
+  const std::ios_base::openmode mode =
+      std::fstream::out | (append ? std::fstream::app : std::fstream::trunc);
+  std::fstream fh(filename, mode);
+
+  if (verbose) {
+    std::cerr << "sdkWriteFile() : Open file " << filename
+              << " for write/append." << std::endl;
+  }
+
+  // check if filestream is valid
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Opening file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  // first write epsilon
+  fh << "# " << epsilon << "\n";
+
+  // write data
+  for (unsigned int i = 0; (i < len) && (fh.good()); ++i) {
+    fh << data[i] << ' ';
+  }
+
+  // File ends with nl. std::endl also flushes, so doing it before the check
+  // lets errors that only show up when the buffer is written be reported.
+  fh << std::endl;
+
+  // Check if writing succeeded
+  if (!fh.good()) {
+    if (verbose) {
+      std::cerr << "sdkWriteFile() : Writing file failed." << std::endl;
+    }
+
+    return false;
+  }
+
+  return true;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Element-wise comparison of two arrays within a tolerance
+//! Each pair is converted to float and must differ by no more than epsilon.
+//! @return  if threshold == 0: true only when no pair is out of tolerance;
+//!          otherwise: true when len * threshold is greater than the number
+//!          of out-of-tolerance pairs
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    largest accepted absolute difference (must be >= 0)
+//! @param threshold  accepted fraction of out-of-tolerance pairs; 0 means
+//!                   that none is accepted
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareData(const T *reference, const T *data,
+                        const unsigned int len, const S epsilon,
+                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // number of pairs whose difference lies outside [-epsilon, epsilon]
+  unsigned int mismatches = 0;
+
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    // the difference is formed in float, whatever T is
+    const float delta =
+        static_cast<float>(reference[idx]) - static_cast<float>(data[idx]);
+    const bool within = (delta >= -epsilon) && (delta <= epsilon);
+
+    if (!within) {
+      ++mismatches;
+    }
+  }
+
+  // Strict mode: a single out-of-tolerance pair fails the comparison.
+  if (threshold == 0.0f) {
+    return mismatches == 0;
+  }
+
+  // report the mismatch rate before deciding
+  if (mismatches != 0) {
+    printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+           static_cast<float>(mismatches) * 100 / static_cast<float>(len),
+           mismatches);
+  }
+
+  // Tolerant mode: pass while the mismatch count stays below len * threshold.
+  return len * threshold > mismatches;
+}
+
+#ifndef __MIN_EPSILON_ERROR
+#define __MIN_EPSILON_ERROR 1e-3f
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//! Compare two arrays element-wise as floats, with a minimum tolerance
+//! @return  if threshold == 0: true only when no pair is out of tolerance;
+//!          otherwise: true when len * threshold is greater than the number
+//!          of out-of-tolerance pairs
+//! @param reference  handle to the reference data / gold image
+//! @param data       handle to the computed data
+//! @param len        number of elements in reference and data
+//! @param epsilon    epsilon to use for the comparison; values below
+//!                   __MIN_EPSILON_ERROR are raised to it
+//! @param threshold  accepted fraction of out-of-tolerance pairs, 0 for none
+//////////////////////////////////////////////////////////////////////////////
+template <class T, class S>
+inline bool compareDataAsFloatThreshold(const T *reference, const T *data,
+                                        const unsigned int len, const S epsilon,
+                                        const float threshold) {
+  assert(epsilon >= 0);
+
+  // If we set epsilon to be 0, let's set a minimum threshold
+  const float tolerance = MAX((float)epsilon, __MIN_EPSILON_ERROR);
+  int mismatches = 0;
+
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    const float gap = fabs(static_cast<float>(reference[idx]) -
+                           static_cast<float>(data[idx]));
+    // strict "<": a difference equal to the tolerance counts as a mismatch
+    if (!(gap < tolerance)) {
+      ++mismatches;
+    }
+  }
+
+  if (threshold == 0.0f) {
+    if (mismatches != 0) {
+      printf("total # of errors = %d\n", mismatches);
+    }
+
+    return mismatches == 0;
+  }
+
+  if (mismatches != 0) {
+    printf("%4.2f(%%) of bytes mismatched (count=%d)\n",
+           static_cast<float>(mismatches) * 100 / static_cast<float>(len),
+           mismatches);
+  }
+
+  return len * threshold > mismatches;
+}
+
+// Dumps `bytes` raw bytes of `data` to `filename`; returns early on open error.
+inline void sdkDumpBin(void *data, unsigned int bytes, const char *filename) {
+  printf("sdkDumpBin: <%s>\n", filename);
+  FILE *fp = NULL;
+  if (FOPEN_FAIL(FOPEN(fp, filename, "wb"))) return;  // was: fwrite(NULL)
+  fwrite(data, bytes, 1, fp);
+  fclose(fp);  // fclose flushes buffered output, so no separate fflush
+}
+
+// Compare the first nelements unsigned ints of two binary files.
+// src_file is opened as given; ref_file is first resolved with
+// sdkFindFilePath(ref_file, exec_path). The data is compared by compareData()
+// with epsilon/threshold. Returns true only if both files were opened and
+// fully read and the comparison passed; progress is printed to stdout.
+// NOTE(review): ref_file_path is never released here - confirm whether
+// sdkFindFilePath() transfers ownership of the returned string.
+inline bool sdkCompareBin2BinUint(const char *src_file, const char *ref_file,
+                                  unsigned int nelements, const float epsilon,
+                                  const float threshold, char *exec_path) {
+  unsigned int *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <unsigned int> unable to open src_file: %s\n",
+           src_file);
+    error_count++;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <unsigned int>  unable to find <%s> in <%s>\n",
+           ref_file, exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           ref_file);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+  } else if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+    printf(
+        "compareBin2Bin <unsigned int>"
+        " unable to open ref_file: %s\n",
+        ref_file_path);
+    error_count++;
+  }
+
+  if (src_fp && ref_fp) {
+    src_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+    ref_buffer = (unsigned int *)malloc(nelements * sizeof(unsigned int));
+
+    printf(
+        "> compareBin2Bin <unsigned int> nelements=%d,"
+        " epsilon=%4.2f, threshold=%4.2f\n",
+        nelements, epsilon, threshold);
+
+    if (nelements != 0 && (src_buffer == NULL || ref_buffer == NULL)) {
+      printf("compareBin2Bin <unsigned int> unable to allocate buffers\n");
+      error_count++;
+    } else {
+      // fread(ptr, size, count, fp) returns the number of complete elements.
+      // The original swapped size and count, so the "size" it printed was at
+      // most sizeof(unsigned int), and it reported the reference file's value
+      // for both files.
+      size_t n_src = fread(src_buffer, sizeof(*src_buffer), nelements, src_fp);
+      size_t n_ref = fread(ref_buffer, sizeof(*ref_buffer), nelements, ref_fp);
+
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(n_src * sizeof(unsigned int)));
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(n_ref * sizeof(unsigned int)));
+
+      // A short read leaves the tail of a buffer uninitialized; comparing it
+      // would be meaningless, so it is counted as a failure instead.
+      if (n_src != nelements || n_ref != nelements ||
+          !compareData<unsigned int, float>(ref_buffer, src_buffer, nelements,
+                                            epsilon, threshold)) {
+        error_count++;
+      }
+    }
+
+    free(src_buffer);
+    free(ref_buffer);
+  }
+
+  if (src_fp) fclose(src_fp);
+  if (ref_fp) fclose(ref_fp);
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Compare the first nelements floats of two binary files.
+// src_file is opened as given; ref_file is first resolved with
+// sdkFindFilePath(ref_file, exec_path). The data is compared by
+// compareDataAsFloatThreshold() with epsilon/threshold. Returns true only if
+// both files were opened and fully read and the comparison passed.
+// NOTE(review): ref_file_path is never released here - confirm whether
+// sdkFindFilePath() transfers ownership of the returned string.
+inline bool sdkCompareBin2BinFloat(const char *src_file, const char *ref_file,
+                                   unsigned int nelements, const float epsilon,
+                                   const float threshold, char *exec_path) {
+  float *src_buffer = NULL, *ref_buffer = NULL;
+  FILE *src_fp = NULL, *ref_fp = NULL;
+
+  uint64_t error_count = 0;
+
+  if (FOPEN_FAIL(FOPEN(src_fp, src_file, "rb"))) {
+    printf("compareBin2Bin <float> unable to open src_file: %s\n", src_file);
+    error_count++;
+  }
+
+  char *ref_file_path = sdkFindFilePath(ref_file, exec_path);
+
+  if (ref_file_path == NULL) {
+    printf("compareBin2Bin <float> unable to find <%s> in <%s>\n", ref_file,
+           exec_path);
+    printf(">>> Check info.xml and [project//data] folder <%s> <<<\n",
+           exec_path);
+    printf("Aborting comparison!\n");
+    printf("  FAILED\n");
+    error_count++;
+  } else if (FOPEN_FAIL(FOPEN(ref_fp, ref_file_path, "rb"))) {
+    printf("compareBin2Bin <float> unable to open ref_file: %s\n",
+           ref_file_path);
+    // Count this on top of an earlier src_file failure; the original reset
+    // the counter to 1 here, under-reporting when both opens failed.
+    error_count++;
+  }
+
+  if (src_fp && ref_fp) {
+    src_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+    ref_buffer = reinterpret_cast<float *>(malloc(nelements * sizeof(float)));
+
+    printf(
+        "> compareBin2Bin <float> nelements=%d, epsilon=%4.2f,"
+        " threshold=%4.2f\n",
+        nelements, epsilon, threshold);
+
+    if (nelements != 0 && (src_buffer == NULL || ref_buffer == NULL)) {
+      printf("compareBin2Bin <float> unable to allocate buffers\n");
+      error_count++;
+    } else {
+      size_t n_src = fread(src_buffer, sizeof(float), nelements, src_fp);
+      printf("   src_file <%s>, size=%d bytes\n", src_file,
+             static_cast<int>(n_src * sizeof(float)));
+
+      size_t n_ref = fread(ref_buffer, sizeof(float), nelements, ref_fp);
+      printf("   ref_file <%s>, size=%d bytes\n", ref_file_path,
+             static_cast<int>(n_ref * sizeof(float)));
+
+      // A short read leaves the tail of a buffer uninitialized; comparing it
+      // would be meaningless, so it is counted as a failure instead.
+      if (n_src != nelements || n_ref != nelements ||
+          !compareDataAsFloatThreshold<float, float>(
+              ref_buffer, src_buffer, nelements, epsilon, threshold)) {
+        error_count++;
+      }
+    }
+
+    free(src_buffer);
+    free(ref_buffer);
+  }
+
+  if (src_fp) fclose(src_fp);
+  if (ref_fp) fclose(ref_fp);
+
+  if (error_count == 0) {
+    printf("  OK\n");
+  } else {
+    printf("  FAILURE: %d errors...\n", (unsigned int)error_count);
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Relative L2-norm check: computes ||reference - data|| / ||reference|| over
+// len elements and returns true when that ratio is below epsilon. Returns
+// false when the sum of squared reference values is below 1e-7.
+inline bool sdkCompareL2fe(const float *reference, const float *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float sumSqDiff = 0;
+  float sumSqRef = 0;
+
+  for (unsigned int idx = 0; idx < len; ++idx) {
+    const float delta = reference[idx] - data[idx];
+    sumSqDiff += delta * delta;
+    sumSqRef += reference[idx] * reference[idx];
+  }
+
+  // a (near) zero reference cannot be normalised against
+  if (fabs(sumSqRef) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  const float relError = sqrtf(sumSqDiff) / sqrtf(sumSqRef);
+  const bool withinTolerance = relError < epsilon;
+
+#ifdef _DEBUG
+  if (!withinTolerance) {
+    std::cerr << "ERROR, l2-norm error " << relError
+              << " is greater than epsilon " << epsilon << "\n";
+  }
+#endif
+
+  return withinTolerance;
+}
+
+inline bool sdkLoadPPMub(const char *file, unsigned char **data,
+                         unsigned int *w, unsigned int *h) {
+  unsigned int channels;  // out-parameter for __loadPPM; discarded here
+  return __loadPPM(file, data, w, h, &channels);  // result forwarded unchanged
+}
+
+// Load an image with __loadPPM() and expand it to 4 bytes per pixel (three
+// source bytes plus a zero) in a malloc'ed *data. Returns false if loading or
+// the allocation fails. NOTE(review): reads 3 bytes per pixel without
+// checking `channels` - confirm __loadPPM() cannot succeed with fewer.
+inline bool sdkLoadPPM4ub(const char *file, unsigned char **data,
+                          unsigned int *w, unsigned int *h) {
+  unsigned char *idata = 0;
+  unsigned int channels;
+  if (!__loadPPM(file, &idata, w, h, &channels)) {
+    free(idata);
+    return false;
+  }
+
+  // size_t count: the original int could overflow; malloc is now checked
+  const size_t size = static_cast<size_t>(*w) * *h;
+  *data = (unsigned char *)malloc(sizeof(unsigned char) * size * 4);
+  const bool ok = (*data != NULL) || (size == 0);
+
+  for (size_t i = 0; ok && i < size; i++) {
+    (*data)[4 * i + 0] = idata[3 * i + 0];
+    (*data)[4 * i + 1] = idata[3 * i + 1];
+    (*data)[4 * i + 2] = idata[3 * i + 2];
+    (*data)[4 * i + 3] = 0;  // pad 4th component
+  }
+  free(idata);
+  return ok;
+}
+
+// Compare two PPM files, loaded with sdkLoadPPM4ub() (4 bytes per pixel), via
+// compareData() with epsilon/threshold. Returns false if a name is NULL, an
+// image cannot be loaded, the sizes differ or the comparison fails.
+inline bool sdkComparePPM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = NULL, *ref_data = NULL;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height, src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPM4ub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+
+  if (sdkLoadPPM4ub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PPMvsPPM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // do not leak the reference image loaded above
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // Differently sized images cannot match; the original compared anyway and
+    // could read past the end of the smaller buffer.
+    error_count = 1;
+  } else {
+    if (verboseErrors) {
+      std::cerr << "PPMvsPPM: comparing images size (" << src_width << ","
+                << src_height << ") epsilon(" << epsilon << "), threshold("
+                << threshold * 100 << "%)\n";
+    }
+    error_count = compareData(ref_data, src_data, src_width * src_height * 4,
+                              epsilon, threshold) ? 0 : 1;
+  }
+
+  // both buffers were malloc'ed by sdkLoadPPM4ub(); the original leaked them
+  free(src_data);
+  free(ref_data);
+
+  if (verboseErrors && error_count == 0) {
+    std::cerr << "    OK\n\n";
+  } else if (verboseErrors) {
+    std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+// Compare two PGM files, loaded with sdkLoadPPMub(), via compareData() over
+// width * height bytes with epsilon/threshold. Returns false if a name is
+// NULL, an image cannot be loaded, the sizes differ or the comparison fails.
+inline bool sdkComparePGM(const char *src_file, const char *ref_file,
+                          const float epsilon, const float threshold,
+                          bool verboseErrors) {
+  unsigned char *src_data = 0, *ref_data = 0;
+  uint64_t error_count = 0;
+  unsigned int ref_width, ref_height, src_width, src_height;
+
+  if (src_file == NULL || ref_file == NULL) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: src_file or ref_file is NULL."
+                   "  Aborting comparison\n";
+    }
+    return false;
+  }
+
+  if (verboseErrors) {
+    std::cerr << "> Compare (a)rendered:  <" << src_file << ">\n";
+    std::cerr << ">         (b)reference: <" << ref_file << ">\n";
+  }
+
+  if (sdkLoadPPMub(ref_file, &ref_data, &ref_width, &ref_height) != true) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: unable to load ref image file: " << ref_file
+                << "\n";
+    }
+    return false;
+  }
+
+  if (sdkLoadPPMub(src_file, &src_data, &src_width, &src_height) != true) {
+    std::cerr << "PGMvsPGM: unable to load src image file: " << src_file
+              << "\n";
+    free(ref_data);  // do not leak the reference image loaded above
+    return false;
+  }
+
+  if (src_height != ref_height || src_width != ref_width) {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: source and ref size mismatch (" << src_width
+                << "," << src_height << ")vs(" << ref_width << "," << ref_height
+                << ")\n";
+    }
+    // Differently sized images cannot match; do not compare mismatched buffers.
+    error_count = 1;
+  } else {
+    if (verboseErrors) {
+      std::cerr << "PGMvsPGM: comparing images size (" << src_width << ","
+                << src_height << ") epsilon(" << epsilon << "), threshold("
+                << threshold * 100 << "%)\n";
+    }
+    error_count = compareData(ref_data, src_data, src_width * src_height,
+                              epsilon, threshold) ? 0 : 1;
+  }
+
+  // released with free(), as sdkLoadPPM4ub() does for __loadPPM() buffers
+  free(src_data);
+  free(ref_data);
+
+  if (verboseErrors && error_count == 0) {
+    std::cerr << "    OK\n\n";
+  } else if (verboseErrors) {
+    std::cerr << "    FAILURE!  " << error_count << " errors...\n\n";
+  }
+
+  return (error_count == 0);  // returns true if all pixels pass
+}
+
+#endif  // COMMON_HELPER_IMAGE_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_math.h b/src/cuda/HPC/image/recursiveGaussian/helper_math.h
new file mode 100644
index 000000000..6d2f63aee
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_math.h
@@ -0,0 +1,1469 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ *  This file implements common mathematical operations on vector types
+ *  (float3, float4 etc.) since these are not provided as standard by CUDA.
+ *
+ *  The syntax is modeled on the Cg standard library.
+ *
+ *  This is part of the Helper library includes
+ *
+ *    Thanks to Linh Hah for additions and fixes.
+ */
+
+#ifndef HELPER_MATH_H
+#define HELPER_MATH_H
+
+#include "cuda_runtime.h"
+
+typedef unsigned int uint;
+typedef unsigned short ushort;
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+#ifndef __CUDACC__  // everything up to the matching #endif is skipped otherwise
+#include <math.h>
+
+////////////////////////////////////////////////////////////////////////////////
+// host implementations of CUDA functions
+////////////////////////////////////////////////////////////////////////////////
+
+inline float fminf(float a, float b)  // smaller of a and b
+{
+    return a < b ? a : b;  // yields b whenever a < b is false (e.g. a is NaN)
+}
+
+inline float fmaxf(float a, float b)  // larger of a and b
+{
+    return a > b ? a : b;  // yields b whenever a > b is false (e.g. a is NaN)
+}
+
+inline int max(int a, int b)  // larger of two ints
+{
+    return a > b ? a : b;
+}
+
+inline int min(int a, int b)  // smaller of two ints
+{
+    return a < b ? a : b;
+}
+
+inline float rsqrtf(float x)  // reciprocal square root, 1 / sqrtf(x)
+{
+    return 1.0f / sqrtf(x);
+}
+#endif  // !__CUDACC__
+
+////////////////////////////////////////////////////////////////////////////////
+// constructors
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 make_float2(float s)  // s in both components
+{
+    return make_float2(s, s);
+}
+inline __host__ __device__ float2 make_float2(float3 a)  // keeps x, y; drops z
+{
+    return make_float2(a.x, a.y);
+}
+inline __host__ __device__ float2 make_float2(int2 a)  // int -> float per component
+{
+    return make_float2(float(a.x), float(a.y));
+}
+inline __host__ __device__ float2 make_float2(uint2 a)  // uint -> float per component
+{
+    return make_float2(float(a.x), float(a.y));
+}
+
+inline __host__ __device__ int2 make_int2(int s)  // s in both components
+{
+    return make_int2(s, s);
+}
+inline __host__ __device__ int2 make_int2(int3 a)  // keeps x, y; drops z
+{
+    return make_int2(a.x, a.y);
+}
+inline __host__ __device__ int2 make_int2(uint2 a)  // uint -> int per component
+{
+    return make_int2(int(a.x), int(a.y));
+}
+inline __host__ __device__ int2 make_int2(float2 a)  // int() drops the fraction
+{
+    return make_int2(int(a.x), int(a.y));
+}
+
+inline __host__ __device__ uint2 make_uint2(uint s)  // s in both components
+{
+    return make_uint2(s, s);
+}
+inline __host__ __device__ uint2 make_uint2(uint3 a)  // keeps x, y; drops z
+{
+    return make_uint2(a.x, a.y);
+}
+inline __host__ __device__ uint2 make_uint2(int2 a)  // int -> uint; negatives wrap
+{
+    return make_uint2(uint(a.x), uint(a.y));
+}
+
+inline __host__ __device__ float3 make_float3(float s)  // s in all components
+{
+    return make_float3(s, s, s);
+}
+inline __host__ __device__ float3 make_float3(float2 a)  // z is set to 0.0f
+{
+    return make_float3(a.x, a.y, 0.0f);
+}
+inline __host__ __device__ float3 make_float3(float2 a, float s)  // z is set to s
+{
+    return make_float3(a.x, a.y, s);
+}
+inline __host__ __device__ float3 make_float3(float4 a)  // keeps x, y, z; drops w
+{
+    return make_float3(a.x, a.y, a.z);
+}
+inline __host__ __device__ float3 make_float3(int3 a)  // int -> float per component
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+inline __host__ __device__ float3 make_float3(uint3 a)  // uint -> float per component
+{
+    return make_float3(float(a.x), float(a.y), float(a.z));
+}
+
+inline __host__ __device__ int3 make_int3(int s)  // s in all components
+{
+    return make_int3(s, s, s);
+}
+inline __host__ __device__ int3 make_int3(int2 a)  // z is set to 0
+{
+    return make_int3(a.x, a.y, 0);
+}
+inline __host__ __device__ int3 make_int3(int2 a, int s)  // z is set to s
+{
+    return make_int3(a.x, a.y, s);
+}
+inline __host__ __device__ int3 make_int3(uint3 a)  // uint -> int per component
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+inline __host__ __device__ int3 make_int3(float3 a)  // int() drops the fraction
+{
+    return make_int3(int(a.x), int(a.y), int(a.z));
+}
+
+inline __host__ __device__ uint3 make_uint3(uint s)  // s in all components
+{
+    return make_uint3(s, s, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a)  // z is set to 0
+{
+    return make_uint3(a.x, a.y, 0);
+}
+inline __host__ __device__ uint3 make_uint3(uint2 a, uint s)  // z is set to s
+{
+    return make_uint3(a.x, a.y, s);
+}
+inline __host__ __device__ uint3 make_uint3(uint4 a)  // keeps x, y, z; drops w
+{
+    return make_uint3(a.x, a.y, a.z);
+}
+inline __host__ __device__ uint3 make_uint3(int3 a)  // int -> uint; negatives wrap
+{
+    return make_uint3(uint(a.x), uint(a.y), uint(a.z));
+}
+
+inline __host__ __device__ float4 make_float4(float s)  // s in all components
+{
+    return make_float4(s, s, s, s);
+}
+inline __host__ __device__ float4 make_float4(float3 a)  // w is set to 0.0f
+{
+    return make_float4(a.x, a.y, a.z, 0.0f);
+}
+inline __host__ __device__ float4 make_float4(float3 a, float w)  // w as given
+{
+    return make_float4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ float4 make_float4(int4 a)  // int -> float per component
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+inline __host__ __device__ float4 make_float4(uint4 a)  // uint -> float per component
+{
+    return make_float4(float(a.x), float(a.y), float(a.z), float(a.w));
+}
+
+inline __host__ __device__ int4 make_int4(int s)  // s in all components
+{
+    return make_int4(s, s, s, s);
+}
+inline __host__ __device__ int4 make_int4(int3 a)  // w is set to 0
+{
+    return make_int4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ int4 make_int4(int3 a, int w)  // w as given
+{
+    return make_int4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ int4 make_int4(uint4 a)  // uint -> int per component
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+inline __host__ __device__ int4 make_int4(float4 a)  // int() drops the fraction
+{
+    return make_int4(int(a.x), int(a.y), int(a.z), int(a.w));
+}
+
+
+inline __host__ __device__ uint4 make_uint4(uint s)  // s in all components
+{
+    return make_uint4(s, s, s, s);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a)  // w is set to 0
+{
+    return make_uint4(a.x, a.y, a.z, 0);
+}
+inline __host__ __device__ uint4 make_uint4(uint3 a, uint w)  // w as given
+{
+    return make_uint4(a.x, a.y, a.z, w);
+}
+inline __host__ __device__ uint4 make_uint4(int4 a)  // int -> uint; negatives wrap
+{
+    return make_uint4(uint(a.x), uint(a.y), uint(a.z), uint(a.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// negate (const references, so temporaries and const vectors work as well)
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(const float2 &a)  // was float2 &:
+{                                         // rejected rvalues such as -(a + b)
+    return make_float2(-a.x, -a.y);
+}
+inline __host__ __device__ int2 operator-(const int2 &a)
+{
+    return make_int2(-a.x, -a.y);
+}
+inline __host__ __device__ float3 operator-(const float3 &a)
+{
+    return make_float3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ int3 operator-(const int3 &a)
+{
+    return make_int3(-a.x, -a.y, -a.z);
+}
+inline __host__ __device__ float4 operator-(const float4 &a)
+{
+    return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+inline __host__ __device__ int4 operator-(const int4 &a)
+{
+    return make_int4(-a.x, -a.y, -a.z, -a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// addition
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator+(float2 a, float2 b)  // component-wise
+{
+    return make_float2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(float2 &a, float2 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ float2 operator+(float2 a, float b)  // b added to x and y
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ float2 operator+(float b, float2 a)  // scalar-first form
+{
+    return make_float2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(float2 &a, float b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ int2 operator+(int2 a, int2 b)  // component-wise
+{
+    return make_int2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(int2 &a, int2 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ int2 operator+(int2 a, int b)  // b added to x and y
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ int2 operator+(int b, int2 a)  // scalar-first form
+{
+    return make_int2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(int2 &a, int b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+}
+
+inline __host__ __device__ uint2 operator+(uint2 a, uint2 b)  // component-wise
+{
+    return make_uint2(a.x + b.x, a.y + b.y);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint2 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+}
+inline __host__ __device__ uint2 operator+(uint2 a, uint b)  // b added to x and y
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ uint2 operator+(uint b, uint2 a)  // scalar-first form
+{
+    return make_uint2(a.x + b, a.y + b);
+}
+inline __host__ __device__ void operator+=(uint2 &a, uint b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+}
+
+
+inline __host__ __device__ float3 operator+(float3 a, float3 b)  // component-wise
+{
+    return make_float3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(float3 &a, float3 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ float3 operator+(float3 a, float b)  // b added to x, y, z
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(float3 &a, float b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int3 a, int3 b)  // component-wise
+{
+    return make_int3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(int3 &a, int3 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ int3 operator+(int3 a, int b)  // b added to x, y, z
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(int3 &a, int b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ uint3 operator+(uint3 a, uint3 b)  // component-wise
+{
+    return make_uint3(a.x + b.x, a.y + b.y, a.z + b.z);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint3 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+}
+inline __host__ __device__ uint3 operator+(uint3 a, uint b)  // b added to x, y, z
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ void operator+=(uint3 &a, uint b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+}
+
+inline __host__ __device__ int3 operator+(int b, int3 a)  // scalar-first form
+{
+    return make_int3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ uint3 operator+(uint b, uint3 a)  // scalar-first form
+{
+    return make_uint3(a.x + b, a.y + b, a.z + b);
+}
+inline __host__ __device__ float3 operator+(float b, float3 a)  // scalar-first form
+{
+    return make_float3(a.x + b, a.y + b, a.z + b);
+}
+
+inline __host__ __device__ float4 operator+(float4 a, float4 b)  // component-wise
+{
+    return make_float4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(float4 &a, float4 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ float4 operator+(float4 a, float b)  // b added to x, y, z, w
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ float4 operator+(float b, float4 a)  // scalar-first form
+{
+    return make_float4(a.x + b, a.y + b, a.z + b, a.w + b);
+}
+inline __host__ __device__ void operator+=(float4 &a, float b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ int4 operator+(int4 a, int4 b)  // component-wise
+{
+    return make_int4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(int4 &a, int4 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ int4 operator+(int4 a, int b)  // b added to x, y, z, w
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ int4 operator+(int b, int4 a)  // scalar-first form
+{
+    return make_int4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(int4 &a, int b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+inline __host__ __device__ uint4 operator+(uint4 a, uint4 b)  // component-wise
+{
+    return make_uint4(a.x + b.x, a.y + b.y, a.z + b.z,  a.w + b.w);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint4 b)  // in place; returns void
+{
+    a.x += b.x;
+    a.y += b.y;
+    a.z += b.z;
+    a.w += b.w;
+}
+inline __host__ __device__ uint4 operator+(uint4 a, uint b)  // b added to x, y, z, w
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ uint4 operator+(uint b, uint4 a)  // scalar-first form
+{
+    return make_uint4(a.x + b, a.y + b, a.z + b,  a.w + b);
+}
+inline __host__ __device__ void operator+=(uint4 &a, uint b)  // in place; returns void
+{
+    a.x += b;
+    a.y += b;
+    a.z += b;
+    a.w += b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// subtract
+////////////////////////////////////////////////////////////////////////////////
+
+inline __host__ __device__ float2 operator-(float2 a, float2 b)  // component-wise
+{
+    return make_float2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float2 b)  // in place; returns void
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ float2 operator-(float2 a, float b)  // b taken from x and y
+{
+    return make_float2(a.x - b, a.y - b);
+}
+inline __host__ __device__ float2 operator-(float b, float2 a)  // b minus each component
+{
+    return make_float2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(float2 &a, float b)  // in place; returns void
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+inline __host__ __device__ int2 operator-(int2 a, int2 b)  // component-wise
+{
+    return make_int2(a.x - b.x, a.y - b.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int2 b)  // in place; returns void
+{
+    a.x -= b.x;
+    a.y -= b.y;
+}
+inline __host__ __device__ int2 operator-(int2 a, int b)  // b taken from x and y
+{
+    return make_int2(a.x - b, a.y - b);
+}
+inline __host__ __device__ int2 operator-(int b, int2 a)  // b minus each component
+{
+    return make_int2(b - a.x, b - a.y);
+}
+inline __host__ __device__ void operator-=(int2 &a, int b)  // in place; returns void
+{
+    a.x -= b;
+    a.y -= b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ uint2 operator-(uint2 a, uint2 b) {
+    return make_uint2(a.x - b.x, a.y - b.y);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(uint2 &a, uint2 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ uint2 operator-(uint2 a, uint b) {
+    return make_uint2(a.x - b, a.y - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ uint2 operator-(uint b, uint2 a) {
+    return make_uint2(b - a.x, b - a.y);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(uint2 &a, uint b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ float3 operator-(float3 a, float3 b) {
+    return make_float3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(float3 &a, float3 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ float3 operator-(float3 a, float b) {
+    return make_float3(a.x - b, a.y - b, a.z - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ float3 operator-(float b, float3 a) {
+    return make_float3(b - a.x, b - a.y, b - a.z);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(float3 &a, float b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ int3 operator-(int3 a, int3 b) {
+    return make_int3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(int3 &a, int3 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ int3 operator-(int3 a, int b) {
+    return make_int3(a.x - b, a.y - b, a.z - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ int3 operator-(int b, int3 a) {
+    return make_int3(b - a.x, b - a.y, b - a.z);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(int3 &a, int b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ uint3 operator-(uint3 a, uint3 b) {
+    return make_uint3(a.x - b.x, a.y - b.y, a.z - b.z);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(uint3 &a, uint3 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ uint3 operator-(uint3 a, uint b) {
+    return make_uint3(a.x - b, a.y - b, a.z - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ uint3 operator-(uint b, uint3 a) {
+    return make_uint3(b - a.x, b - a.y, b - a.z);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(uint3 &a, uint b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ float4 operator-(float4 a, float4 b) {
+    return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(float4 &a, float4 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+    a.w = a.w - b.w;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ float4 operator-(float4 a, float b) {
+    return make_float4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(float4 &a, float b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+    a.w = a.w - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ int4 operator-(int4 a, int4 b) {
+    return make_int4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(int4 &a, int4 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+    a.w = a.w - b.w;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ int4 operator-(int4 a, int b) {
+    return make_int4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ int4 operator-(int b, int4 a) {
+    return make_int4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(int4 &a, int b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+    a.w = a.w - b;
+}
+
+// Component-wise difference a - b.
+inline __host__ __device__ uint4 operator-(uint4 a, uint4 b) {
+    return make_uint4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+// Subtracts each component of b from the matching component of a, in place.
+inline __host__ __device__ void operator-=(uint4 &a, uint4 b) {
+    a.x = a.x - b.x;
+    a.y = a.y - b.y;
+    a.z = a.z - b.z;
+    a.w = a.w - b.w;
+}
+// Subtracts the scalar b from every component of a.
+inline __host__ __device__ uint4 operator-(uint4 a, uint b) {
+    return make_uint4(a.x - b, a.y - b, a.z - b, a.w - b);
+}
+// Subtracts every component of a from the scalar b.
+inline __host__ __device__ uint4 operator-(uint b, uint4 a) {
+    return make_uint4(b - a.x, b - a.y, b - a.z, b - a.w);
+}
+// Subtracts the scalar b from every component of a, in place.
+inline __host__ __device__ void operator-=(uint4 &a, uint b) {
+    a.x = a.x - b;
+    a.y = a.y - b;
+    a.z = a.z - b;
+    a.w = a.w - b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// multiply
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise product a * b.
+inline __host__ __device__ float2 operator*(float2 a, float2 b) {
+    return make_float2(a.x * b.x, a.y * b.y);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(float2 &a, float2 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ float2 operator*(float2 a, float b) {
+    return make_float2(a.x * b, a.y * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ float2 operator*(float b, float2 a) {
+    return make_float2(b * a.x, b * a.y);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(float2 &a, float b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ int2 operator*(int2 a, int2 b) {
+    return make_int2(a.x * b.x, a.y * b.y);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(int2 &a, int2 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ int2 operator*(int2 a, int b) {
+    return make_int2(a.x * b, a.y * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ int2 operator*(int b, int2 a) {
+    return make_int2(b * a.x, b * a.y);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(int2 &a, int b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ uint2 operator*(uint2 a, uint2 b) {
+    return make_uint2(a.x * b.x, a.y * b.y);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(uint2 &a, uint2 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ uint2 operator*(uint2 a, uint b) {
+    return make_uint2(a.x * b, a.y * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ uint2 operator*(uint b, uint2 a) {
+    return make_uint2(b * a.x, b * a.y);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(uint2 &a, uint b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ float3 operator*(float3 a, float3 b) {
+    return make_float3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(float3 &a, float3 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ float3 operator*(float3 a, float b) {
+    return make_float3(a.x * b, a.y * b, a.z * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ float3 operator*(float b, float3 a) {
+    return make_float3(b * a.x, b * a.y, b * a.z);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(float3 &a, float b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ int3 operator*(int3 a, int3 b) {
+    return make_int3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(int3 &a, int3 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ int3 operator*(int3 a, int b) {
+    return make_int3(a.x * b, a.y * b, a.z * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ int3 operator*(int b, int3 a) {
+    return make_int3(b * a.x, b * a.y, b * a.z);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(int3 &a, int b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ uint3 operator*(uint3 a, uint3 b) {
+    return make_uint3(a.x * b.x, a.y * b.y, a.z * b.z);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(uint3 &a, uint3 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ uint3 operator*(uint3 a, uint b) {
+    return make_uint3(a.x * b, a.y * b, a.z * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ uint3 operator*(uint b, uint3 a) {
+    return make_uint3(b * a.x, b * a.y, b * a.z);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(uint3 &a, uint b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ float4 operator*(float4 a, float4 b) {
+    return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(float4 &a, float4 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+    a.w = a.w * b.w;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ float4 operator*(float4 a, float b) {
+    return make_float4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ float4 operator*(float b, float4 a) {
+    return make_float4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(float4 &a, float b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+    a.w = a.w * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ int4 operator*(int4 a, int4 b) {
+    return make_int4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(int4 &a, int4 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+    a.w = a.w * b.w;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ int4 operator*(int4 a, int b) {
+    return make_int4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ int4 operator*(int b, int4 a) {
+    return make_int4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(int4 &a, int b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+    a.w = a.w * b;
+}
+
+// Component-wise product a * b.
+inline __host__ __device__ uint4 operator*(uint4 a, uint4 b) {
+    return make_uint4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+// Multiplies each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator*=(uint4 &a, uint4 b) {
+    a.x = a.x * b.x;
+    a.y = a.y * b.y;
+    a.z = a.z * b.z;
+    a.w = a.w * b.w;
+}
+// Multiplies every component of a by the scalar b.
+inline __host__ __device__ uint4 operator*(uint4 a, uint b) {
+    return make_uint4(a.x * b, a.y * b, a.z * b, a.w * b);
+}
+// Multiplies the scalar b by every component of a (scalar written first).
+inline __host__ __device__ uint4 operator*(uint b, uint4 a) {
+    return make_uint4(b * a.x, b * a.y, b * a.z, b * a.w);
+}
+// Multiplies every component of a by the scalar b, in place.
+inline __host__ __device__ void operator*=(uint4 &a, uint b) {
+    a.x = a.x * b;
+    a.y = a.y * b;
+    a.z = a.z * b;
+    a.w = a.w * b;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// divide
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise quotient a / b.
+inline __host__ __device__ float2 operator/(float2 a, float2 b) {
+    return make_float2(a.x / b.x, a.y / b.y);
+}
+// Divides each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator/=(float2 &a, float2 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+}
+// Divides every component of a by the scalar b.
+inline __host__ __device__ float2 operator/(float2 a, float b) {
+    return make_float2(a.x / b, a.y / b);
+}
+// Divides every component of a by the scalar b, in place.
+inline __host__ __device__ void operator/=(float2 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+}
+// Divides the scalar b by every component of a.
+inline __host__ __device__ float2 operator/(float b, float2 a) {
+    return make_float2(b / a.x, b / a.y);
+}
+
+// Component-wise quotient a / b.
+inline __host__ __device__ float3 operator/(float3 a, float3 b) {
+    return make_float3(a.x / b.x, a.y / b.y, a.z / b.z);
+}
+// Divides each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator/=(float3 &a, float3 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+    a.z = a.z / b.z;
+}
+// Divides every component of a by the scalar b.
+inline __host__ __device__ float3 operator/(float3 a, float b) {
+    return make_float3(a.x / b, a.y / b, a.z / b);
+}
+// Divides every component of a by the scalar b, in place.
+inline __host__ __device__ void operator/=(float3 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+    a.z = a.z / b;
+}
+// Divides the scalar b by every component of a.
+inline __host__ __device__ float3 operator/(float b, float3 a) {
+    return make_float3(b / a.x, b / a.y, b / a.z);
+}
+
+// Component-wise quotient a / b.
+inline __host__ __device__ float4 operator/(float4 a, float4 b) {
+    return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+// Divides each component of a by the matching component of b, in place.
+inline __host__ __device__ void operator/=(float4 &a, float4 b) {
+    a.x = a.x / b.x;
+    a.y = a.y / b.y;
+    a.z = a.z / b.z;
+    a.w = a.w / b.w;
+}
+// Divides every component of a by the scalar b.
+inline __host__ __device__ float4 operator/(float4 a, float b) {
+    return make_float4(a.x / b, a.y / b, a.z / b, a.w / b);
+}
+// Divides every component of a by the scalar b, in place.
+inline __host__ __device__ void operator/=(float4 &a, float b) {
+    a.x = a.x / b;
+    a.y = a.y / b;
+    a.z = a.z / b;
+    a.w = a.w / b;
+}
+// Divides the scalar b by every component of a.
+inline __host__ __device__ float4 operator/(float b, float4 a) {
+    return make_float4(b / a.x, b / a.y, b / a.z, b / a.w);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// min
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise minimum: the scalar fminf applied to each pair of components.
+inline __host__ __device__ float2 fminf(float2 a, float2 b) {
+    return make_float2(fminf(a.x, b.x), fminf(a.y, b.y));
+}
+// Component-wise minimum: the scalar fminf applied to each pair of components.
+inline __host__ __device__ float3 fminf(float3 a, float3 b) {
+    return make_float3(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z));
+}
+// Component-wise minimum: the scalar fminf applied to each pair of components.
+inline __host__ __device__ float4 fminf(float4 a, float4 b) {
+    return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ int2 min(int2 a, int2 b) {
+    return make_int2(min(a.x, b.x), min(a.y, b.y));
+}
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ int3 min(int3 a, int3 b) {
+    return make_int3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+}
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ int4 min(int4 a, int4 b) {
+    return make_int4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ uint2 min(uint2 a, uint2 b) {
+    return make_uint2(min(a.x, b.x), min(a.y, b.y));
+}
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ uint3 min(uint3 a, uint3 b) {
+    return make_uint3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+}
+// Component-wise minimum: the scalar min applied to each pair of components.
+inline __host__ __device__ uint4 min(uint4 a, uint4 b) {
+    return make_uint4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// max
+////////////////////////////////////////////////////////////////////////////////
+
+// Component-wise maximum: the scalar fmaxf applied to each pair of components.
+inline __host__ __device__ float2 fmaxf(float2 a, float2 b) {
+    return make_float2(fmaxf(a.x, b.x), fmaxf(a.y, b.y));
+}
+// Component-wise maximum: the scalar fmaxf applied to each pair of components.
+inline __host__ __device__ float3 fmaxf(float3 a, float3 b) {
+    return make_float3(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z));
+}
+// Component-wise maximum: the scalar fmaxf applied to each pair of components.
+inline __host__ __device__ float4 fmaxf(float4 a, float4 b) {
+    return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ int2 max(int2 a, int2 b) {
+    return make_int2(max(a.x, b.x), max(a.y, b.y));
+}
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ int3 max(int3 a, int3 b) {
+    return make_int3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ int4 max(int4 a, int4 b) {
+    return make_int4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+}
+
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ uint2 max(uint2 a, uint2 b) {
+    return make_uint2(max(a.x, b.x), max(a.y, b.y));
+}
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ uint3 max(uint3 a, uint3 b) {
+    return make_uint3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+}
+// Component-wise maximum: the scalar max applied to each pair of components.
+inline __host__ __device__ uint4 max(uint4 a, uint4 b) {
+    return make_uint4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// lerp
+// - linear interpolation between a and b, based on value t in [0, 1] range
+////////////////////////////////////////////////////////////////////////////////
+
+// Linear interpolation between a and b, evaluated as a + t*(b-a).
+inline __device__ __host__ float lerp(float a, float b, float t) {
+    return a + t*(b-a);
+}
+// Same formula per component; the single weight t applies to both components.
+inline __device__ __host__ float2 lerp(float2 a, float2 b, float t) {
+    return a + t*(b-a);
+}
+// Same formula per component; the single weight t applies to all three.
+inline __device__ __host__ float3 lerp(float3 a, float3 b, float t) {
+    return a + t*(b-a);
+}
+// Same formula per component; the single weight t applies to all four.
+inline __device__ __host__ float4 lerp(float4 a, float4 b, float t) {
+    return a + t*(b-a);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// clamp
+// - clamp the value v to be in the range [a, b]
+////////////////////////////////////////////////////////////////////////////////
+
+// Limits f to [a, b]: first capped at b with fminf, then raised to a with fmaxf.
+inline __device__ __host__ float clamp(float f, float a, float b) {
+    return fmaxf(a, fminf(f, b));
+}
+// Limits f to [a, b]: first capped at b with min, then raised to a with max.
+inline __device__ __host__ int clamp(int f, int a, int b) {
+    return max(a, min(f, b));
+}
+// Limits f to [a, b]: first capped at b with min, then raised to a with max.
+inline __device__ __host__ uint clamp(uint f, uint a, uint b) {
+    return max(a, min(f, b));
+}
+
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ float2 clamp(float2 v, float a, float b) {
+    return make_float2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ float2 clamp(float2 v, float2 a, float2 b) {
+    return make_float2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ float3 clamp(float3 v, float a, float b) {
+    return make_float3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ float3 clamp(float3 v, float3 a, float3 b) {
+    return make_float3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ float4 clamp(float4 v, float a, float b) {
+    return make_float4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ float4 clamp(float4 v, float4 a, float4 b) {
+    return make_float4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ int2 clamp(int2 v, int a, int b) {
+    return make_int2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ int2 clamp(int2 v, int2 a, int2 b) {
+    return make_int2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ int3 clamp(int3 v, int a, int b) {
+    return make_int3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ int3 clamp(int3 v, int3 a, int3 b) {
+    return make_int3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ int4 clamp(int4 v, int a, int b) {
+    return make_int4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ int4 clamp(int4 v, int4 a, int4 b) {
+    return make_int4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ uint2 clamp(uint2 v, uint a, uint b) {
+    return make_uint2(clamp(v.x, a, b), clamp(v.y, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ uint2 clamp(uint2 v, uint2 a, uint2 b) {
+    return make_uint2(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ uint3 clamp(uint3 v, uint a, uint b) {
+    return make_uint3(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ uint3 clamp(uint3 v, uint3 a, uint3 b) {
+    return make_uint3(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z));
+}
+// Clamps every component of v to the scalar range [a, b].
+inline __device__ __host__ uint4 clamp(uint4 v, uint a, uint b) {
+    return make_uint4(clamp(v.x, a, b), clamp(v.y, a, b), clamp(v.z, a, b), clamp(v.w, a, b));
+}
+// Clamps each component of v between the matching components of a and b.
+inline __device__ __host__ uint4 clamp(uint4 v, uint4 a, uint4 b) {
+    return make_uint4(clamp(v.x, a.x, b.x), clamp(v.y, a.y, b.y), clamp(v.z, a.z, b.z), clamp(v.w, a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// dot product
+////////////////////////////////////////////////////////////////////////////////
+
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ float dot(float2 a, float2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ float dot(float3 a, float3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ float dot(float4 a, float4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ int dot(int2 a, int2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ int dot(int3 a, int3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ int dot(int4 a, int4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ uint dot(uint2 a, uint2 b) {
+    return a.x * b.x + a.y * b.y;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ uint dot(uint3 a, uint3 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z;
+}
+// Sum of the products of the matching components of a and b.
+inline __host__ __device__ uint dot(uint4 a, uint4 b) {
+    return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// length
+////////////////////////////////////////////////////////////////////////////////
+
+// Length of v, computed as sqrtf(dot(v, v)).
+inline __host__ __device__ float length(float2 v) {
+    return sqrtf(dot(v, v));
+}
+// Length of v, computed as sqrtf(dot(v, v)).
+inline __host__ __device__ float length(float3 v) {
+    return sqrtf(dot(v, v));
+}
+// Length of v, computed as sqrtf(dot(v, v)).
+inline __host__ __device__ float length(float4 v) {
+    return sqrtf(dot(v, v));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// normalize
+////////////////////////////////////////////////////////////////////////////////
+
+// Scales v by rsqrtf(dot(v, v)), the reciprocal square root of its squared length.
+inline __host__ __device__ float2 normalize(float2 v) {
+    const float inv_length = rsqrtf(dot(v, v));
+    return v * inv_length;
+}
+// Scales v by rsqrtf(dot(v, v)), the reciprocal square root of its squared length.
+inline __host__ __device__ float3 normalize(float3 v) {
+    const float inv_length = rsqrtf(dot(v, v));
+    return v * inv_length;
+}
+// Scales v by rsqrtf(dot(v, v)), the reciprocal square root of its squared length.
+inline __host__ __device__ float4 normalize(float4 v) {
+    const float inv_length = rsqrtf(dot(v, v));
+    return v * inv_length;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// floor
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies the scalar floorf to every component of v.
+inline __host__ __device__ float2 floorf(float2 v) {
+    return make_float2(floorf(v.x), floorf(v.y));
+}
+// Applies the scalar floorf to every component of v.
+inline __host__ __device__ float3 floorf(float3 v) {
+    return make_float3(floorf(v.x), floorf(v.y), floorf(v.z));
+}
+// Applies the scalar floorf to every component of v.
+inline __host__ __device__ float4 floorf(float4 v) {
+    return make_float4(floorf(v.x), floorf(v.y), floorf(v.z), floorf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fracf - returns the fractional portion of a scalar or each vector component
+////////////////////////////////////////////////////////////////////////////////
+
+// Fractional part of v, computed as v - floorf(v).
+inline __host__ __device__ float fracf(float v) {
+    return v - floorf(v);
+}
+// Applies the scalar fracf to every component of v.
+inline __host__ __device__ float2 fracf(float2 v) {
+    return make_float2(fracf(v.x), fracf(v.y));
+}
+// Applies the scalar fracf to every component of v.
+inline __host__ __device__ float3 fracf(float3 v) {
+    return make_float3(fracf(v.x), fracf(v.y), fracf(v.z));
+}
+// Applies the scalar fracf to every component of v.
+inline __host__ __device__ float4 fracf(float4 v) {
+    return make_float4(fracf(v.x), fracf(v.y), fracf(v.z), fracf(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// fmod
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies the scalar fmodf to each pair of matching components of a and b.
+inline __host__ __device__ float2 fmodf(float2 a, float2 b) {
+    return make_float2(fmodf(a.x, b.x), fmodf(a.y, b.y));
+}
+// Applies the scalar fmodf to each pair of matching components of a and b.
+inline __host__ __device__ float3 fmodf(float3 a, float3 b) {
+    return make_float3(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z));
+}
+// Applies the scalar fmodf to each pair of matching components of a and b.
+inline __host__ __device__ float4 fmodf(float4 a, float4 b) {
+    return make_float4(fmodf(a.x, b.x), fmodf(a.y, b.y), fmodf(a.z, b.z), fmodf(a.w, b.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// absolute value
+////////////////////////////////////////////////////////////////////////////////
+
+// Applies the scalar fabs to every component of v.
+inline __host__ __device__ float2 fabs(float2 v) {
+    return make_float2(fabs(v.x), fabs(v.y));
+}
+// Applies the scalar fabs to every component of v.
+inline __host__ __device__ float3 fabs(float3 v) {
+    return make_float3(fabs(v.x), fabs(v.y), fabs(v.z));
+}
+// Applies the scalar fabs to every component of v.
+inline __host__ __device__ float4 fabs(float4 v) {
+    return make_float4(fabs(v.x), fabs(v.y), fabs(v.z), fabs(v.w));
+}
+
+// Applies the scalar abs to every component of v.
+inline __host__ __device__ int2 abs(int2 v) {
+    return make_int2(abs(v.x), abs(v.y));
+}
+// Applies the scalar abs to every component of v.
+inline __host__ __device__ int3 abs(int3 v) {
+    return make_int3(abs(v.x), abs(v.y), abs(v.z));
+}
+// Applies the scalar abs to every component of v.
+inline __host__ __device__ int4 abs(int4 v) {
+    return make_int4(abs(v.x), abs(v.y), abs(v.z), abs(v.w));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// reflect
+// - returns reflection of incident ray I around surface normal N
+// - N should be normalized, reflected vector's length is equal to length of I
+////////////////////////////////////////////////////////////////////////////////
+
+// Reflects i about the normal n, evaluated as i - 2 * n * dot(n, i).
+inline __host__ __device__ float3 reflect(float3 i, float3 n) {
+    return i - 2.0f * n * dot(n,i);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// cross product
+////////////////////////////////////////////////////////////////////////////////
+
+// Cross product a x b of two 3-component vectors.
+inline __host__ __device__ float3 cross(float3 a, float3 b) {
+    return make_float3(a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// smoothstep
+// - returns 0 if x < a
+// - returns 1 if x > b
+// - otherwise returns smooth interpolation between 0 and 1 based on x
+////////////////////////////////////////////////////////////////////////////////
+
+// Clamps (x - a) / (b - a) to [0, 1] as t, then returns t*t*(3 - 2*t).
+inline __device__ __host__ float smoothstep(float a, float b, float x) {
+    const float t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t*t*(3.0f - (2.0f*t)));
+}
+// Per-component form: t = clamp((x - a) / (b - a), 0, 1), result t*t*(3 - 2*t).
+inline __device__ __host__ float2 smoothstep(float2 a, float2 b, float2 x) {
+    const float2 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t*t*(make_float2(3.0f) - (make_float2(2.0f)*t)));
+}
+// Per-component form: t = clamp((x - a) / (b - a), 0, 1), result t*t*(3 - 2*t).
+inline __device__ __host__ float3 smoothstep(float3 a, float3 b, float3 x) {
+    const float3 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t*t*(make_float3(3.0f) - (make_float3(2.0f)*t)));
+}
+// Per-component form: t = clamp((x - a) / (b - a), 0, 1), result t*t*(3 - 2*t).
+inline __device__ __host__ float4 smoothstep(float4 a, float4 b, float4 x) {
+    const float4 t = clamp((x - a) / (b - a), 0.0f, 1.0f);
+    return (t*t*(make_float4(3.0f) - (make_float4(2.0f)*t)));
+}
+
+#endif
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_string.h b/src/cuda/HPC/image/recursiveGaussian/helper_string.h
new file mode 100644
index 000000000..39a1b3805
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_string.h
@@ -0,0 +1,428 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// These are helper functions for the SDK samples (string parsing, timers, etc)
+#ifndef COMMON_HELPER_STRING_H_
+#define COMMON_HELPER_STRING_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <fstream>
+#include <string>
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+#ifndef _CRT_SECURE_NO_DEPRECATE
+#define _CRT_SECURE_NO_DEPRECATE
+#endif
+#ifndef STRCASECMP
+#define STRCASECMP _stricmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP _strnicmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result != 0)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf_s
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf_s
+#endif
+#else  // Linux Includes
+#include <string.h>
+#include <strings.h>
+
+#ifndef STRCASECMP
+#define STRCASECMP strcasecmp
+#endif
+#ifndef STRNCASECMP
+#define STRNCASECMP strncasecmp
+#endif
+#ifndef STRCPY
+#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
+#endif
+
+#ifndef FOPEN
+#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
+#endif
+#ifndef FOPEN_FAIL
+#define FOPEN_FAIL(result) (result == NULL)
+#endif
+#ifndef SSCANF
+#define SSCANF sscanf
+#endif
+#ifndef SPRINTF
+#define SPRINTF sprintf
+#endif
+#endif
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// CUDA Utility Helper Functions
+inline int stringRemoveDelimiter(char delimiter, const char *string) {
+  // Count the run of delimiter characters at the start of the string.
+  int skipped = 0;
+  for (; string[skipped] == delimiter; ++skipped) {
+  }
+
+  // Offset 0 is reported whenever the run reaches or passes the index of the
+  // last character, so such strings are left exactly as they were given.
+  const int last_index = static_cast<int>(strlen(string) - 1);
+  if (skipped >= last_index) return 0;
+
+  return skipped;
+}
+
+// Finds the extension of `filename` (the text after its last '.'). On success
+// *extension points into `filename` and that index is returned; with no '.',
+// or only a leading '.', *extension is NULL and 0 is returned.
+inline int getFileExtension(char *filename, char **extension) {
+  // Walk back to the last '.', never below index 0: the previous loop read
+  // filename[-1] for "" and overlooked a '.' at index 1 (e.g. "a.b").
+  int dot = static_cast<int>(strlen(filename)) - 1;
+  while (dot > 0 && filename[dot] != '.') dot--;
+
+  if (dot <= 0) {
+    *extension = NULL;
+    return 0;
+  }
+  *extension = &filename[dot + 1];
+  return dot + 1;
+}
+
+// Reports whether any of argv[1..argc-1] names the flag `string_ref`. The
+// offset returned by stringRemoveDelimiter('-', ...) is skipped, the name ends
+// at the first '=' if present, and the comparison ignores case.
+inline bool checkCmdLineFlag(const int argc, const char **argv,
+                             const char *string_ref) {
+  for (int i = 1; i < argc; i++) {
+    const char *arg = argv[i] + stringRemoveDelimiter('-', argv[i]);
+
+    // The flag name is the text before '=' or, without one, the whole text.
+    const char *equal_pos = strchr(arg, '=');
+    const int name_length = static_cast<int>(
+        equal_pos == 0 ? strlen(arg) : equal_pos - arg);
+
+    const int ref_length = static_cast<int>(strlen(string_ref));
+    if (name_length != ref_length) {
+      continue;
+    }
+
+    if (!STRNCASECMP(arg, string_ref, ref_length)) {
+      return true;  // one match is enough; later arguments cannot undo it
+    }
+  }
+
+  return false;
+}
+
+// Parses the value of the first argument whose text (after the offset from
+// stringRemoveDelimiter) starts with `string_ref`, ignoring case, and stores
+// it in *value via atoi. Returns true when such an argument exists.
+template <class T>
+inline bool getCmdLineArgumentValue(const int argc, const char **argv,
+                                    const char *string_ref, T *value) {
+  for (int i = 1; i < argc; i++) {
+    const char *arg = argv[i] + stringRemoveDelimiter('-', argv[i]);
+    const int name_length = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(arg, string_ref, name_length) != 0) {
+      continue;  // not the option being looked up
+    }
+
+    // Only text following the name carries a value; a bare name leaves
+    // *value untouched. A single '=' separator is skipped before atoi.
+    if (name_length + 1 <= static_cast<int>(strlen(arg))) {
+      const int value_start = name_length + (arg[name_length] == '=' ? 1 : 0);
+      *value = (T)atoi(arg + value_start);
+    }
+
+    return true;  // the first match wins
+  }
+
+  return false;
+}
+
+// Returns the integer attached to option `string_ref` in argv[1..argc-1], or
+// 0 when the option is absent or bare. An argument matches when its text,
+// after the offset from stringRemoveDelimiter, starts with `string_ref`
+// (case-insensitive); if several match, the last one decides the result.
+inline int getCmdLineArgumentInt(const int argc, const char **argv,
+                                 const char *string_ref) {
+  int result = 0;  // reported when no argument matches
+  for (int i = 1; i < argc; i++) {
+    const char *arg = argv[i] + stringRemoveDelimiter('-', argv[i]);
+    const int name_length = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(arg, string_ref, name_length) != 0) {
+      continue;  // a different option
+    }
+
+    const bool has_value = name_length + 1 <= static_cast<int>(strlen(arg));
+    if (!has_value) {
+      result = 0;  // bare option name, nothing to parse
+      continue;
+    }
+
+    // Skip a single '=' separator if present, then let atoi read the number.
+    if (arg[name_length] == '=') {
+      result = atoi(arg + name_length + 1);
+    } else {
+      result = atoi(arg + name_length);
+    }
+  }
+
+  return result;
+}
+
+// Returns the float attached to option `string_ref` in argv[1..argc-1], or 0
+// when the option is absent or bare. An argument matches when its text, after
+// the offset from stringRemoveDelimiter, starts with `string_ref`
+// (case-insensitive); if several match, the last one decides the result.
+inline float getCmdLineArgumentFloat(const int argc, const char **argv,
+                                     const char *string_ref) {
+  float result = 0;  // reported when no argument matches
+  for (int i = 1; i < argc; i++) {
+    const char *arg = argv[i] + stringRemoveDelimiter('-', argv[i]);
+    const int name_length = static_cast<int>(strlen(string_ref));
+
+    if (STRNCASECMP(arg, string_ref, name_length) != 0) {
+      continue;  // a different option
+    }
+
+    const bool has_value = name_length + 1 <= static_cast<int>(strlen(arg));
+    if (!has_value) {
+      result = 0.f;  // bare option name, nothing to parse
+      continue;
+    }
+
+    // Skip a single '=' separator if present, then let atof read the number.
+    if (arg[name_length] == '=') {
+      result = static_cast<float>(atof(arg + name_length + 1));
+    } else {
+      result = static_cast<float>(atof(arg + name_length));
+    }
+  }
+
+  return result;
+}
+
+// Retrieves the string value attached to the option named string_ref.
+// Arguments are compared with STRNCASECMP after skipping the offset reported by
+// stringRemoveDelimiter('-', ...); when several match, the last one wins.
+// On success *string_retval points into argv just past the single separator
+// character following the name; it is an empty string when the argument is the
+// bare name. When nothing matches *string_retval is NULL. Returns the match flag.
+inline bool getCmdLineArgumentString(const int argc, const char **argv,
+                                     const char *string_ref,
+                                     char **string_retval) {
+  bool bFound = false;
+  *string_retval = NULL;
+  for (int i = 1; i < argc; i++) {
+    int string_start = stringRemoveDelimiter('-', argv[i]);
+    char *string_argv = const_cast<char *>(&argv[i][string_start]);
+    int length = static_cast<int>(strlen(string_ref));
+
+    if (!STRNCASECMP(string_argv, string_ref, length)) {
+      // A bare option ends right after the name: stay on the terminator
+      // instead of stepping one past it, which would leave the buffer.
+      *string_retval = &string_argv[length + (string_argv[length] ? 1 : 0)];
+      bFound = true;
+    }
+  }
+  return bFound;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+//! Find the path of a file by probing each entry of the searchPath table.
+//!
+//! @return malloc'ed path of the first hit (the caller must free() it), or 0
+//!         when the file is not found or the copy cannot be allocated
+//! @param filename         name of the file
+//! @param executable_path  optional path of the executable, 0 if unknown
+//////////////////////////////////////////////////////////////////////////////
+inline char *sdkFindFilePath(const char *filename,
+                             const char *executable_path) {
+  // <executable_name> defines a variable that is replaced with the name of the
+  // executable
+
+  // Typical relative search paths to locate needed companion files (e.g. sample
+  // input data, or JIT source files) The origin for the relative search may be
+  // the .exe file, a .bat file launching an .exe, a browser .exe launching the
+  // .exe or .bat, etc
+  const char *searchPath[] = {
+      "./",                                           // same dir
+      "./data/",                                      // data subdirectory
+
+      "../../../../Samples/<executable_name>/",       // up 4 in tree
+      "../../../Samples/<executable_name>/",          // up 3 in tree
+      "../../Samples/<executable_name>/",             // up 2 in tree
+
+      "../../../../Samples/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/",        // up 2 in tree
+
+      "../../../../Samples/0_Introduction/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/0_Introduction/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/0_Introduction/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/1_Utilities/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/1_Utilities/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/1_Utilities/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/2_Concepts_and_Techniques/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/3_CUDA_Features/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/3_CUDA_Features/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/3_CUDA_Features/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/4_CUDA_Libraries/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/4_CUDA_Libraries/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/4_CUDA_Libraries/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/5_Domain_Specific/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/5_Domain_Specific/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/5_Domain_Specific/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Samples/6_Performance/<executable_name>/data/",  // up 4 in tree
+      "../../../Samples/6_Performance/<executable_name>/data/",     // up 3 in tree
+      "../../Samples/6_Performance/<executable_name>/data/",        // up 2 in tree
+
+      "../../../../Common/data/",                     // up 4 in tree
+      "../../../Common/data/",                        // up 3 in tree
+      "../../Common/data/"                            // up 2 in tree
+  };
+
+  // Extract the executable name
+  std::string executable_name;
+
+  if (executable_path != 0) {
+    executable_name = std::string(executable_path);
+
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+    // Windows path delimiter (npos + 1 wraps to 0: no delimiter, nothing erased)
+    size_t delimiter_pos = executable_name.find_last_of('\\');
+    executable_name.erase(0, delimiter_pos + 1);
+
+    // Strip ".exe" only when it really is the trailing extension
+    const size_t name_len = executable_name.size();
+    if (name_len >= 4 && executable_name.compare(name_len - 4, 4, ".exe") == 0) {
+      executable_name.resize(name_len - 4);
+    }
+#else
+    // Linux & OSX path delimiter (npos + 1 wraps to 0: no delimiter, nothing erased)
+    size_t delimiter_pos = executable_name.find_last_of('/');
+    executable_name.erase(0, delimiter_pos + 1);
+#endif
+  }
+
+  // Loop over all search paths and return the first hit
+  for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
+    std::string path(searchPath[i]);
+    size_t executable_name_pos = path.find("<executable_name>");
+
+    // If there is executable_name variable in the searchPath
+    // replace it with the value
+    if (executable_name_pos != std::string::npos) {
+      if (executable_path != 0) {
+        path.replace(executable_name_pos, strlen("<executable_name>"),
+                     executable_name);
+      } else {
+        // Skip this path entry if no executable argument is given
+        continue;
+      }
+    }
+
+#ifdef _DEBUG
+    printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
+#endif
+
+    // Test if the file exists
+    path.append(filename);
+    FILE *fp;
+    FOPEN(fp, path.c_str(), "rb");
+
+    if (fp != NULL) {
+      fclose(fp);
+      // File found; an allocated array is returned for backwards compatibility
+      char *file_path = static_cast<char *>(malloc(path.length() + 1));
+      if (file_path == NULL) {
+        // Out of memory: report failure rather than copying through NULL
+        printf("\nerror: sdkFindFilePath: out of memory for <%s>\n", filename);
+        return 0;
+      }
+      STRCPY(file_path, path.length() + 1, path.c_str());
+      return file_path;
+    }
+  }
+
+  // File not found
+  printf("\nerror: sdkFindFilePath: file <%s> not found!\n", filename);
+  return 0;
+}
+
+#endif  // COMMON_HELPER_STRING_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/helper_timer.h b/src/cuda/HPC/image/recursiveGaussian/helper_timer.h
new file mode 100644
index 000000000..0614a7802
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/helper_timer.h
@@ -0,0 +1,465 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+// Helper Timing Functions
+#ifndef COMMON_HELPER_TIMER_H_
+#define COMMON_HELPER_TIMER_H_
+
+#ifndef EXIT_WAIVED
+#define EXIT_WAIVED 2
+#endif
+
+// includes, system
+#include <vector>
+
+// includes, project
+#include "exception.h"
+
+// Definition of the StopWatch interface. It is used when we do not want to use
+// the CUT functions but rather a self-contained class interface.
+class StopWatchInterface {
+ public:
+  StopWatchInterface() {}
+  // virtual, so a timer can be deleted through an interface pointer
+  virtual ~StopWatchInterface() {}
+
+  //! Start time measurement
+  virtual void start() = 0;
+
+  //! Stop time measurement
+  virtual void stop() = 0;
+
+  //! Reset time counters to zero
+  virtual void reset() = 0;
+
+  //! Time in msec. The implementations in this header return the total over
+  //! all completed start()/stop() sessions and, while the stop watch is still
+  //! running, add the time elapsed since the last start()
+  virtual float getTime() = 0;
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  virtual float getAverageTime() = 0;
+};
+
+//////////////////////////////////////////////////////////////////
+// Begin Stopwatch timer class definitions for all OS platforms //
+//////////////////////////////////////////////////////////////////
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+// includes, system -- NOTE(review): the usual macro is WIN32_LEAN_AND_MEAN, confirm
+#define WINDOWS_LEAN_AND_MEAN
+#include <windows.h>
+#undef min
+#undef max
+
+//! Windows specific implementation of StopWatch, built on
+//! QueryPerformanceCounter / QueryPerformanceFrequency
+class StopWatchWin : public StopWatchInterface {
+ public:
+  //! Constructor, default: zeroes the counters and caches the tick frequency
+  StopWatchWin()
+      : start_time(),
+        end_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0),
+        freq(0),
+        freq_set(false) {
+    // freq_set has just been initialised to false by the initialiser list,
+    // so the frequency is queried unconditionally for every new stop watch;
+    // no test of the flag is needed here
+    LARGE_INTEGER os_frequency;
+    QueryPerformanceFrequency(&os_frequency);
+
+    // scaled by 1/1000 so that a tick difference divided by freq reads as
+    // msec., the unit the getters document
+    freq = static_cast<double>(os_frequency.QuadPart) / 1000.0;
+
+    // remember that the query has been made
+    freq_set = true;
+  }
+
+  //! Destructor
+  ~StopWatchWin() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Total time in msec. over all completed start()/stop() sessions; if the
+  //! stop watch is still running (i.e. there was no call to stop()) the time
+  //! elapsed since the last start() is added
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  // member variables, declared in the order the constructor initialises them
+
+  //! Start of measurement
+  LARGE_INTEGER start_time;
+  //! End of measurement
+  LARGE_INTEGER end_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+
+  //! tick frequency, stored by the constructor as the OS value divided by
+  //! 1000
+  double freq;
+
+  //! flag if the frequency has been set
+  bool freq_set;
+};
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement: mark the watch running and capture the counter
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::start() {
+  running = true;
+  QueryPerformanceCounter(&start_time);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop time measurement: store the time of this session in diff_time, add it
+//! to total_time and count one more completed session.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::stop() {
+  QueryPerformanceCounter(&end_time);
+  const double elapsed_ticks = static_cast<double>(end_time.QuadPart) -
+                               static_cast<double>(start_time.QuadPart);
+  diff_time = static_cast<float>(elapsed_ticks / freq);
+
+  total_time += diff_time;
+  ++clock_sessions;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. The running state is left as it is; a running timer
+//! takes this point in time as its new start time.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchWin::reset() {
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+  if (!running) {
+    return;
+  }
+  QueryPerformanceCounter(&start_time);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated so far.
+//! When the stop watch is stopped this is total_time, the sum over all
+//! completed start()/stop() sessions. When it is still running (i.e. there was
+//! no call to stop()) the time elapsed since the last start() is added to
+//! that sum.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getTime() {
+  if (!running) {
+    // stopped: only the completed sessions count
+    return total_time;
+  }
+
+  // running: add the time elapsed since the last start()
+  LARGE_INTEGER now;
+  QueryPerformanceCounter(&now);
+  const double elapsed_ticks = static_cast<double>(now.QuadPart) -
+                               static_cast<double>(start_time.QuadPart);
+  return total_time + static_cast<float>(elapsed_ticks / freq);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run: total time / number of COMPLETED runs.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchWin::getAverageTime() {
+  if (clock_sessions <= 0) return 0.0f;
+  return total_time / clock_sessions;
+}
+#else
+// Declarations for Stopwatch on Linux and Mac OSX
+// includes, system
+#include <sys/time.h>
+#include <ctime>
+
+//! Implementation of StopWatch for Linux and Mac OSX; the time is read with
+//! gettimeofday()
+class StopWatchLinux : public StopWatchInterface {
+ public:
+  //! Constructor, default: a stopped watch with every counter at zero
+  StopWatchLinux()
+      : start_time(),
+        diff_time(0.0f),
+        total_time(0.0f),
+        running(false),
+        clock_sessions(0) {}
+
+  //! Destructor
+  virtual ~StopWatchLinux() {}
+
+  //! Start time measurement
+  inline void start();
+
+  //! Stop time measurement
+  inline void stop();
+
+  //! Reset time counters to zero
+  inline void reset();
+
+  //! Total time in msec. over all completed start()/stop() sessions; if the
+  //! stop watch is still running (i.e. there was no call to stop()) the time
+  //! elapsed since the last start() is added
+  inline float getTime();
+
+  //! Mean time to date based on the number of times the stopwatch has been
+  //! _stopped_ (ie finished sessions) and the current total time
+  inline float getAverageTime();
+
+ private:
+  //! Helper: difference in msec. between start time and current time
+  inline float getDiffTime();
+
+  // member variables, declared in the order the constructor initialises
+  // them
+
+  //! Start of measurement
+  struct timeval start_time;
+
+  //! Time difference between the last start and stop
+  float diff_time;
+
+  //! TOTAL time difference between starts and stops
+  float total_time;
+
+  //! flag if the stop watch is running
+  bool running;
+
+  //! Number of times clock has been started
+  //! and stopped to allow averaging
+  int clock_sessions;
+};
+
+// (the inline member functions are defined below)
+
+// functions, inlined
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start time measurement: mark the watch running and record the current time
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::start() {
+  running = true;
+  gettimeofday(&start_time, 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop: record this session's time, add it to the total, count the session.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::stop() {
+  const float elapsed = getDiffTime();
+  diff_time = elapsed;
+  total_time += elapsed;
+  ++clock_sessions;
+  running = false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Reset the timer to 0. The running state is left as it is; a running timer
+//! takes this point in time as its new start time.
+////////////////////////////////////////////////////////////////////////////////
+inline void StopWatchLinux::reset() {
+  clock_sessions = 0;
+  total_time = 0;
+  diff_time = 0;
+  if (!running) {
+    return;
+  }
+  gettimeofday(&start_time, 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. accumulated so far.
+//! When the stop watch is stopped this is total_time, the sum over all
+//! completed start()/stop() sessions. When it is still running (i.e. there was
+//! no call to stop()) the time elapsed since the last start() is added to
+//! that sum.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getTime() {
+  if (!running) {
+    // stopped: only the completed sessions count
+    return total_time;
+  }
+
+  // running: add the time elapsed since the last start()
+  return total_time + getDiffTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Time in msec. for a single run: total time / number of COMPLETED runs.
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getAverageTime() {
+  if (clock_sessions <= 0) return 0.0f;
+  return total_time / clock_sessions;
+}
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+inline float StopWatchLinux::getDiffTime() {
+  struct timeval now;
+  gettimeofday(&now, 0);
+  // seconds and microseconds are both scaled to milli-seconds before summing
+  const double sec_part_ms = 1000.0 * (now.tv_sec - start_time.tv_sec);
+  const double usec_part_ms = 0.001 * (now.tv_usec - start_time.tv_usec);
+  return static_cast<float>(sec_part_ms + usec_part_ms);
+}
+#endif  // WIN32
+
+////////////////////////////////////////////////////////////////////////////////
+//! Timer functionality exported
+
+////////////////////////////////////////////////////////////////////////////////
+//! Create a new timer
+//! @return true if a timer has been created, otherwise false
+//! @param timer_interface  receives the pointer to the newly allocated timer
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkCreateTimer(StopWatchInterface **timer_interface) {
+  // A derived-to-base pointer conversion is implicit and always well-defined;
+  // reinterpret_cast would skip any pointer adjustment the upcast may need.
+#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
+  *timer_interface = new StopWatchWin();
+#else
+  *timer_interface = new StopWatchLinux();
+#endif
+  return (*timer_interface != NULL) ? true : false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Delete a timer and clear the caller's handle; a null timer is ignored
+//! @return always true
+//! @param timer_interface  address of the timer handle to delete
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkDeleteTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *&timer = *timer_interface;
+  if (timer == NULL) {
+    return true;  // nothing to delete
+  }
+  delete timer;
+  timer = NULL;
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Start the timer referenced by \a timer_interface; a null timer is ignored
+//! @param timer_interface  address of the timer handle to start
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStartTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->start();
+  }
+  // success is reported whether or not a timer was present
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Stop the timer referenced by \a timer_interface. Does not reset.
+//! @param timer_interface  address of the timer handle to stop
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkStopTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->stop();
+  }
+  // success is reported whether or not a timer was present
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Resets the counters of the timer referenced by \a timer_interface.
+//! @param timer_interface  address of the timer handle to reset
+////////////////////////////////////////////////////////////////////////////////
+inline bool sdkResetTimer(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer != NULL) {
+    timer->reset();
+  }
+  // success is reported whether or not a timer was present
+  return true;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Return the average time for timer execution as the total time
+//! for the timer divided by the number of completed (stopped) runs the timer
+//! has made.
+//! Excludes the current running time if the timer is currently running.
+//! @param timer_interface  address of the timer handle to query
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetAverageTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer == NULL) {
+    // no timer, nothing measured
+    return 0.0f;
+  }
+
+  return timer->getAverageTime();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+//! Total execution time for the timer over all runs since the last reset
+//! or timer creation; 0 when the handle holds no timer.
+//! @param timer_interface  address of the timer handle to query
+////////////////////////////////////////////////////////////////////////////////
+inline float sdkGetTimerValue(StopWatchInterface **timer_interface) {
+  StopWatchInterface *timer = *timer_interface;
+  if (timer == NULL) {
+    return 0.0f;
+  }
+  // the stop watches in this header include the session still in progress
+  return timer->getTime();
+}
+
+#endif  // COMMON_HELPER_TIMER_H_
diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp
new file mode 100644
index 000000000..1a6172e0d
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian.cpp
@@ -0,0 +1,587 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+  Recursive Gaussian filter
+  sgreen 8/1/08
+
+  This code sample implements a Gaussian blur using Deriche's recursive method:
+  http://citeseer.ist.psu.edu/deriche93recursively.html
+
+  This is similar to the box filter sample in the SDK, but it uses the previous
+  outputs of the filter as well as the previous inputs. This is also known as an
+  IIR (infinite impulse response) filter, since its response to an input impulse
+  can last forever.
+
+  The main advantage of this method is that the execution time is independent of
+  the filter width.
+
+  The GPU processes columns of the image in parallel. To avoid uncoalesced reads
+  for the row pass we transpose the image and then transpose it back again
+  afterwards.
+
+  The implementation is based on code from the CImg library:
+  http://cimg.sourceforge.net/
+  Thanks to David Tschumperlé and all the CImg contributors!
+*/
+
+#pragma warning(disable : 4819)
+
+#ifndef NO_OPENGL
+// OpenGL Graphics includes
+#include "helper_gl.h"
+#if defined(__APPLE__) || defined(MACOSX)
+#pragma clang diagnostic ignored "-Wdeprecated-declarations"
+#include <GLUT/glut.h>
+#ifndef glutCloseFunc
+#define glutCloseFunc glutWMCloseFunc
+#endif
+#else
+#include <GL/freeglut.h>
+#endif
+
+// CUDA includes and interop headers
+#include <cuda_gl_interop.h>
+#endif
+
+#include <cuda_runtime.h>
+
+// CUDA utilities and system includes
+#include "helper_cuda.h" // includes cuda.h and cuda_runtime_api.h
+#include "helper_functions.h"
+
+// Includes
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define MAX(a, b) (((a) > (b)) ? (a) : (b)) // arguments are parenthesized; each may still be evaluated twice
+
+#define USE_SIMPLE_FILTER 0
+
+#define MAX_EPSILON_ERROR 5.0f
+#define THRESHOLD         0.15f
+
+// Define the files that are to be saved and the reference images for validation
+const char *sOriginal[] = {"teapot512_10.ppm", "teapot512_14.ppm", "teapot512_18.ppm", "teapot512_22.ppm", NULL};
+
+const char *sReference[] = {"ref_10.ppm", "ref_14.ppm", "ref_18.ppm", "ref_22.ppm", NULL};
+
+const char *image_filename = "teapot512.ppm";
+float       sigma          = 10.0f;
+int         order          = 0;
+int         nthreads       = 64; // number of threads per block
+
+unsigned int  width, height;
+unsigned int *h_img  = NULL;
+unsigned int *d_img  = NULL;
+unsigned int *d_temp = NULL;
+
+#ifndef NO_OPENGL
+GLuint pbo   = 0; // OpenGL pixel buffer object
+GLuint texid = 0; // texture
+
+cudaGraphicsResource_t cuda_vbo_resource;
+#endif
+
+StopWatchInterface *timer = 0;
+
+// Auto-Verification Code
+const int    frameCheckNumber = 4;
+int          fpsCount         = 0; // FPS count for averaging
+int          fpsLimit         = 1; // FPS limit for sampling
+unsigned int frameCount       = 0;
+
+int   *pArgc = NULL;
+char **pArgv = NULL;
+
+bool runBenchmark = false;
+
+const char *sSDKsample = "CUDA Recursive Gaussian";
+
+extern "C" void transpose(unsigned int *d_src, unsigned int *d_dest, unsigned int width, int height);
+
+extern "C" void gaussianFilterRGBA(unsigned int *d_src,
+                                   unsigned int *d_dest,
+                                   unsigned int *d_temp,
+                                   int           width,
+                                   int           height,
+                                   float         sigma,
+                                   int           order,
+                                   int           nthreads);
+
+void cleanup();
+
+#ifndef NO_OPENGL
+// Counts a rendered frame and, once fpsLimit frames have accumulated, shows the measured rate in the title.
+void computeFPS()
+{
+    frameCount++;
+    if (++fpsCount < fpsLimit) {
+        return;
+    }
+    char fps[256];
+    // The timer value is treated as milliseconds; skip the division when it reads as zero.
+    const float avgMs = sdkGetAverageTimerValue(&timer);
+    const float ifps  = (avgMs > 0.f) ? 1000.f / avgMs : 0.f;
+    snprintf(fps, sizeof(fps), "%s (sigma=%4.2f): %3.1f fps", sSDKsample, sigma, ifps);
+    glutSetWindowTitle(fps);
+    fpsCount = 0;
+    fpsLimit = ftoi(MAX(ifps, 1.f)); // next update after about one second's worth of frames, at least one
+    sdkResetTimer(&timer);
+}
+
+// GLUT display callback: filters the image into the pbo, draws it as a textured quad and times the frame
+void display()
+{
+    sdkStartTimer(&timer);
+
+    // map the pbo for CUDA and execute the filter, writing results to it
+    unsigned int *d_result;
+    checkCudaErrors(cudaGraphicsMapResources(1, &cuda_vbo_resource, 0));
+    size_t num_bytes; // receives the mapped size; not used afterwards
+    checkCudaErrors(cudaGraphicsResourceGetMappedPointer((void **)&d_result, &num_bytes, cuda_vbo_resource));
+    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
+
+    // unmap buffer object before OpenGL uses it below
+    checkCudaErrors(cudaGraphicsUnmapResources(1, &cuda_vbo_resource, 0));
+
+    // load texture from pbo (the final 0 is the source offset within the bound buffer)
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+    glBindTexture(GL_TEXTURE_2D, texid);
+    glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, GL_RGBA, GL_UNSIGNED_BYTE, 0);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+
+    // display results
+    glClear(GL_COLOR_BUFFER_BIT);
+
+    glEnable(GL_TEXTURE_2D);
+    glDisable(GL_DEPTH_TEST);
+    // unit quad; the texture's vertical coordinate runs opposite to the vertex y coordinate
+    glBegin(GL_QUADS);
+    glTexCoord2f(0, 1);
+    glVertex2f(0, 0);
+    glTexCoord2f(1, 1);
+    glVertex2f(1, 0);
+    glTexCoord2f(1, 0);
+    glVertex2f(1, 1);
+    glTexCoord2f(0, 0);
+    glVertex2f(0, 1);
+    glEnd();
+
+    glDisable(GL_TEXTURE_2D);
+    glutSwapBuffers();
+
+    sdkStopTimer(&timer);
+
+    computeFPS();
+}
+
+void idle() { glutPostRedisplay(); } // GLUT idle callback (registered in initGL): requests another redraw
+#endif
+
+void cleanup()
+{
+    // Release what the run created: the timer, both device images and, outside benchmark mode, the GL objects.
+    sdkDeleteTimer(&timer);
+    checkCudaErrors(cudaFree(d_img));
+    checkCudaErrors(cudaFree(d_temp));
+
+#ifndef NO_OPENGL
+    if (!runBenchmark) {
+        if (pbo) { // pbo and texid start at 0 and are only assigned by initGLBuffers()
+            // unregister this buffer object with CUDA
+            checkCudaErrors(cudaGraphicsUnregisterResource(cuda_vbo_resource));
+            glDeleteBuffers(1, &pbo);
+        }
+
+        if (texid) {
+            glDeleteTextures(1, &texid);
+        }
+    }
+#endif
+}
+
+#ifndef NO_OPENGL
+void keyboard(unsigned char key, int x, int y) // GLUT keyboard callback; x and y are not used
+{
+    switch (key) {
+    case 27: // ASCII Esc: quit
+#if defined(__APPLE__) || defined(MACOSX)
+        exit(EXIT_SUCCESS);
+#else
+        glutDestroyWindow(glutGetWindow());
+        return; // skips the sigma printout and the redisplay request below
+#endif
+        break;
+
+    case '=': // fine step up
+        sigma += 0.1f;
+        break;
+
+    case '-': // fine step down, clamped at zero
+        sigma -= 0.1f;
+
+        if (sigma < 0.0) {
+            sigma = 0.0f;
+        }
+
+        break;
+
+    case '+': // coarse step up
+        sigma += 1.0f;
+        break;
+
+    case '_': // coarse step down, clamped at zero
+        sigma -= 1.0f;
+
+        if (sigma < 0.0) {
+            sigma = 0.0f;
+        }
+
+        break;
+
+    case '0': // order 0 keeps the current sigma
+        order = 0;
+        break;
+
+    case '1': // orders 1 and 2 also reset sigma to 0.5
+        order = 1;
+        sigma = 0.5f;
+        break;
+
+    case '2':
+        order = 2;
+        sigma = 0.5f;
+        break;
+
+    default:
+        break;
+    }
+
+    printf("sigma = %f\n", sigma);
+    glutPostRedisplay();
+}
+
+void reshape(int x, int y) // GLUT reshape callback; x and y are used as the new viewport width and height
+{
+    glViewport(0, 0, x, y);
+    // identity model-view transform
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+    // orthographic projection over the unit square, the range covered by the quad drawn in display()
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+    glOrtho(0.0, 1.0, 0.0, 1.0, 0.0, 1.0);
+}
+
+void initGLBuffers() // creates the pbo that the filter writes into and the texture used to draw it
+{
+    // create pixel buffer object to store final image (4 bytes per pixel, initial contents from h_img)
+    glGenBuffers(1, &pbo);
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, pbo);
+    glBufferData(GL_PIXEL_UNPACK_BUFFER_ARB, width * height * sizeof(GLubyte) * 4, h_img, GL_STREAM_DRAW_ARB);
+    // unbind, then register the buffer with CUDA so display() can map it as the filter destination
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER_ARB, 0);
+    checkCudaErrors(cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, pbo, cudaGraphicsRegisterFlagsWriteDiscard));
+
+    // create texture for display (a NULL data pointer is passed, so no pixels are supplied here)
+    glGenTextures(1, &texid);
+    glBindTexture(GL_TEXTURE_2D, texid);
+    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA8, width, height, 0, GL_RGBA, GL_UNSIGNED_BYTE, NULL);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+    glBindTexture(GL_TEXTURE_2D, 0);
+}
+
+void initGL(int *argc, char **argv) // creates the GLUT window, registers callbacks, checks GL support
+{
+    glutInit(argc, argv);
+    glutInitDisplayMode(GLUT_RGB | GLUT_DOUBLE);
+    glutInitWindowSize(width, height); // window size taken from the loaded image
+    glutCreateWindow(sSDKsample);
+    glutDisplayFunc(display);
+    glutKeyboardFunc(keyboard);
+    glutReshapeFunc(reshape);
+    glutIdleFunc(idle);
+    // run cleanup() on shutdown: at exit() on Apple, from the window-close callback elsewhere
+#if defined(__APPLE__) || defined(MACOSX)
+    atexit(cleanup);
+#else
+    glutCloseFunc(cleanup);
+#endif
+
+    printf("Press '+' and '-' to change filter width\n");
+    printf("0, 1, 2 - change filter order\n");
+    // exit with an error unless GL 2.0 and the buffer-object extensions used by initGLBuffers() exist
+    if (!isGLVersionSupported(2, 0)
+        || !areGLExtensionsSupported("GL_ARB_vertex_buffer_object GL_ARB_pixel_buffer_object")) {
+        fprintf(stderr, "Required OpenGL extensions missing.");
+        exit(EXIT_FAILURE);
+    }
+}
+#endif
+
+// Allocates the device input and scratch images, uploads the host image and creates the timer.
+void initCudaBuffers()
+{
+    // byte count computed in size_t so a large image cannot wrap 32-bit arithmetic
+    const size_t size = (size_t)width * height * sizeof(unsigned int);
+
+    // d_img receives the source image; d_temp is the scratch buffer handed to the filter
+    checkCudaErrors(cudaMalloc((void **)&d_img, size));
+    checkCudaErrors(cudaMalloc((void **)&d_temp, size));
+    checkCudaErrors(cudaMemcpy(d_img, h_img, size, cudaMemcpyHostToDevice));
+    sdkCreateTimer(&timer);
+}
+
+// Runs one untimed warm-up pass, then times `iterations` filter passes and prints time and throughput.
+void benchmark(int iterations)
+{
+    // allocate memory for result (byte count in size_t so a large image cannot wrap 32-bit arithmetic)
+    unsigned int *d_result;
+    const size_t  size = (size_t)width * height * sizeof(unsigned int);
+    checkCudaErrors(cudaMalloc((void **)&d_result, size));
+
+    // warm-up
+    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
+    checkCudaErrors(cudaDeviceSynchronize());
+
+    // timed region; the synchronize makes sure all queued work has finished before the timer stops
+    sdkStartTimer(&timer);
+    for (int i = 0; i < iterations; i++) {
+        gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
+    }
+    checkCudaErrors(cudaDeviceSynchronize());
+    sdkStopTimer(&timer);
+    // check if kernel execution generated an error
+    getLastCudaError("Kernel execution failed");
+
+    // pixel count in double: width * height * iterations can exceed the range of unsigned int
+    const float  elapsedMs = sdkGetTimerValue(&timer);
+    const double pixels    = (double)width * height * (iterations > 0 ? iterations : 0);
+    printf("Processing time: %f (ms)\n", elapsedMs);
+    printf("%.2f Mpixels/sec\n", (elapsedMs > 0.0f) ? pixels / (elapsedMs / 1000.0) / 1e6 : 0.0);
+    checkCudaErrors(cudaFree(d_result));
+}
+
+// One warm-up pass and one timed pass; the result is saved as a PPM and compared with the reference image.
+bool runSingleTest(const char *ref_file, const char *exec_path)
+{
+    int           nTotalErrors = 0;
+    unsigned int *d_result;
+    const size_t  size = (size_t)width * height * sizeof(unsigned int); // one 32-bit RGBA word per pixel
+    checkCudaErrors(cudaMalloc((void **)&d_result, size));
+
+    // warm-up
+    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
+    checkCudaErrors(cudaDeviceSynchronize());
+    sdkStartTimer(&timer);
+    gaussianFilterRGBA(d_img, d_result, d_temp, width, height, sigma, order, nthreads);
+    checkCudaErrors(cudaDeviceSynchronize());
+    getLastCudaError("Kernel execution failed");
+    sdkStopTimer(&timer);
+
+    unsigned char *h_result = (unsigned char *)malloc(size);
+    if (h_result == NULL) {
+        fprintf(stderr, "Error unable to allocate %zu bytes for the result image\n", size);
+        exit(EXIT_FAILURE);
+    }
+    checkCudaErrors(cudaMemcpy(h_result, d_result, size, cudaMemcpyDeviceToHost));
+
+    char dump_file[1024];
+    snprintf(dump_file, sizeof(dump_file), "teapot512_%02d.ppm", (int)sigma);
+    sdkSavePPM4ub(dump_file, h_result, width, height);
+
+    // sdkFindFilePath() can return NULL (main() checks for it); count that as a failed comparison
+    char *ref_path = sdkFindFilePath(ref_file, exec_path);
+    if (ref_path == NULL || !sdkComparePPM(dump_file, ref_path, MAX_EPSILON_ERROR, THRESHOLD, false)) {
+        nTotalErrors++;
+    }
+    printf("Processing time: %f (ms)\n", sdkGetTimerValue(&timer));
+    printf("%.2f Mpixels/sec\n", (width * height / (sdkGetTimerValue(&timer) / 1000.0f)) / 1e6);
+    checkCudaErrors(cudaFree(d_result));
+    free(h_result);
+    printf("Summary: %d errors!\n", nTotalErrors);
+    printf(nTotalErrors == 0 ? "Test passed\n" : "Test failed!\n");
+    return (nTotalErrors == 0);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Program main
+////////////////////////////////////////////////////////////////////////////////
+int main(int argc, char **argv)
+{
+    pArgc          = &argc;
+    pArgv          = argv;
+    char *ref_file = NULL;
+
+#if defined(__linux__)
+    setenv("DISPLAY", ":0", 0); // final 0: an already-set DISPLAY is not overwritten
+#endif
+
+    printf("%s Starting...\n\n", sSDKsample);
+
+    printf("NOTE: The CUDA Samples are not meant for performance measurements. "
+           "Results may vary when GPU Boost is enabled.\n\n");
+
+    // "file" names a reference image and selects automated-test mode: the filtered
+    // result is compared against it by runSingleTest() below
+    if (argc > 1) {
+        if (checkCmdLineFlag(argc, (const char **)argv, "file")) {
+            getCmdLineArgumentString(argc, (const char **)argv, "file", &ref_file);
+            fpsLimit = frameCheckNumber;
+        }
+    }
+
+    // optional "image" argument replaces the default input image name
+    char *filename;
+
+    if (getCmdLineArgumentString(argc, (const char **)argv, "image", &filename)) {
+        image_filename = filename;
+    }
+
+    // load image
+    char *image_path = sdkFindFilePath(image_filename, argv[0]);
+
+    if (image_path == NULL) {
+        fprintf(stderr, "Error unable to find and load image file: '%s'\n", image_filename);
+        exit(EXIT_FAILURE);
+    }
+
+    sdkLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height);
+
+    if (!h_img) {
+        printf("Error unable to load PPM file: '%s'\n", image_path);
+        exit(EXIT_FAILURE);
+    }
+
+    printf("Loaded '%s', %d x %d pixels\n", image_path, width, height);
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "threads")) {
+        nthreads = getCmdLineArgumentInt(argc, (const char **)argv, "threads");
+        // nthreads is the block size and a divisor when the launch grid is sized, so it must be positive
+        if (nthreads < 1) {
+            fprintf(stderr, "Error: threads must be a positive integer (got %d)\n", nthreads);
+            exit(EXIT_FAILURE);
+        }
+    }
+
+    if (checkCmdLineFlag(argc, (const char **)argv, "sigma")) {
+        sigma = getCmdLineArgumentFloat(argc, (const char **)argv, "sigma");
+    }
+
+    int iterations = 100;  // Default iterations
+    if (checkCmdLineFlag(argc, (const char **)argv, "iterations")) {
+        iterations = getCmdLineArgumentInt(argc, (const char **)argv, "iterations");
+    }
+
+    runBenchmark = checkCmdLineFlag(argc, (const char **)argv, "benchmark");
+
+#ifdef NO_OPENGL
+    // Without OpenGL there is no interactive mode: warn and switch benchmark mode on
+    if (!runBenchmark) {
+        fprintf(stderr, "\n========================================\n");
+        fprintf(stderr, "WARNING: This binary was compiled without OpenGL support.\n");
+        fprintf(stderr, "You must use the --benchmark flag to run this program.\n");
+        fprintf(stderr, "========================================\n\n");
+        fprintf(stderr, "Auto-enabling benchmark mode...\n\n");
+        runBenchmark = true;
+    }
+#endif
+
+    int                   device;
+    struct cudaDeviceProp prop;
+    checkCudaErrors(cudaGetDevice(&device)); // a failed query must not leave prop.name unset for the test below
+    checkCudaErrors(cudaGetDeviceProperties(&prop, device));
+
+    if (!strncmp("Tesla", prop.name, 5)) {
+        printf("Tesla card detected, running the test in benchmark mode (no OpenGL "
+               "display)\n");
+        runBenchmark = true;
+    }
+
+    // Benchmark or AutoTest mode detected, no OpenGL
+    if (runBenchmark == true || ref_file != NULL) {
+        findCudaDevice(argc, (const char **)argv);
+    }
+#ifndef NO_OPENGL
+    else {
+        // First initialize OpenGL context, and then select CUDA device.
+        initGL(&argc, argv);
+        findCudaDevice(argc, (const char **)argv);
+    }
+#else
+    else {
+        // Defensive only: under NO_OPENGL runBenchmark was forced to true above
+        fprintf(stderr, "\nERROR: This binary was compiled without OpenGL support (NO_OPENGL defined).\n");
+        fprintf(stderr, "You must run in benchmark mode using the --benchmark flag.\n");
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Usage: %s --benchmark [--iterations N] [--image FILE]\n", argv[0]);
+        fprintf(stderr, "\n");
+        fprintf(stderr, "Example: %s --benchmark --iterations 1\n", argv[0]);
+        fprintf(stderr, "\n");
+        exit(EXIT_FAILURE);
+    }
+#endif
+
+    initCudaBuffers();
+
+    if (ref_file) {
+        printf("(Automated Testing)\n");
+        bool testPassed = runSingleTest(ref_file, argv[0]);
+
+        cleanup();
+        exit(testPassed ? EXIT_SUCCESS : EXIT_FAILURE);
+    }
+
+    if (runBenchmark) {
+        printf("(Run Benchmark with %d iteration%s)\n", iterations, iterations == 1 ? "" : "s");
+        benchmark(iterations);
+
+        cleanup();
+        exit(EXIT_SUCCESS);
+    }
+
+#ifndef NO_OPENGL
+    initGLBuffers();
+    glutMainLoop();
+#else
+    // If we get here, something went wrong - we're not in benchmark mode
+    // but OpenGL is disabled
+    fprintf(stderr, "\n");
+    fprintf(stderr, "ERROR: Reached interactive mode path, but this binary was compiled without OpenGL.\n");
+    fprintf(stderr, "This should not happen. Please use --benchmark flag.\n");
+    fprintf(stderr, "\n");
+    cleanup();
+    exit(EXIT_FAILURE);
+#endif
+
+    exit(EXIT_SUCCESS);
+}
diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu
new file mode 100644
index 000000000..353c30019
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_cuda.cu
@@ -0,0 +1,155 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+  Recursive Gaussian filter
+  sgreen 8/1/08
+
+  This code sample implements a Gaussian blur using Deriche's recursive method:
+  http://citeseer.ist.psu.edu/deriche93recursively.html
+
+  This is similar to the box filter sample in the SDK, but it uses the previous
+  outputs of the filter as well as the previous inputs. This is also known as an
+  IIR (infinite impulse response) filter, since its response to an input impulse
+  can last forever.
+
+  The main advantage of this method is that the execution time is independent of
+  the filter width.
+
+  The GPU processes columns of the image in parallel. To avoid uncoalesced reads
+  for the row pass we transpose the image and then transpose it back again
+  afterwards.
+
+  The implementation is based on code from the CImg library:
+  http://cimg.sourceforge.net/
+  Thanks to David Tschumperlé and all the CImg contributors!
+*/
+
+#include <cuda_runtime.h>
+#include "helper_cuda.h"
+#include "helper_math.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "recursiveGaussian_kernel.cuh"
+
+#define USE_SIMPLE_FILTER 0
+
+// Ceiling division: the quotient a / b, plus one whenever the division leaves a remainder
+int iDivUp(int a, int b) { return a / b + ((a % b != 0) ? 1 : 0); }
+
+/*
+  Transpose a width x height 2D array from d_src into d_dest, one BLOCK_DIM x BLOCK_DIM tile per thread block
+*/
+extern "C" void transpose(uint *d_src, uint *d_dest, uint width, int height)
+{
+    const dim3 tile(BLOCK_DIM, BLOCK_DIM, 1);
+    const dim3 tiles(iDivUp(width, BLOCK_DIM), iDivUp(height, BLOCK_DIM), 1); // partial edge tiles round up
+    d_transpose<<<tiles, tile>>>(d_dest, d_src, width, height); // note: the kernel takes (output, input)
+    getLastCudaError("Kernel execution failed");
+}
+
+/*
+  Perform Gaussian filter on a 2D image using CUDA
+
+  Parameters:
+  d_src    - pointer to input image in device memory
+  d_dest   - pointer to destination image in device memory
+  d_temp   - pointer to temporary storage in device memory
+  width    - image width
+  height   - image height
+  sigma    - sigma of Gaussian (values below 0.1 are treated as 0.1)
+  order    - filter order (0, 1 or 2)
+  nthreads - threads per block for the filter kernels (one thread per image column)
+  An invalid order, nthreads, width or height prints a message to stderr and launches nothing.
+*/
+// 8-bit RGBA version
+extern "C" void
+gaussianFilterRGBA(uint *d_src, uint *d_dest, uint *d_temp, int width, int height, float sigma, int order, int nthreads)
+{
+    // nthreads is the divisor inside iDivUp() and the block size of every filter launch
+    if (nthreads <= 0 || width <= 0 || height <= 0) {
+        fprintf(stderr, "gaussianFilter: invalid nthreads or image size!\n");
+        return;
+    }
+
+    // compute filter coefficients
+    const float nsigma = sigma < 0.1f ? 0.1f : sigma, alpha = 1.695f / nsigma, ema = (float)std::exp(-alpha),
+                ema2 = (float)std::exp(-2 * alpha), b1 = -2 * ema, b2 = ema2;
+    float a0 = 0, a1 = 0, a2 = 0, a3 = 0, coefp = 0, coefn = 0;
+    switch (order) {
+    case 0: {
+        const float k = (1 - ema) * (1 - ema) / (1 + 2 * alpha * ema - ema2);
+        a0            = k;
+        a1            = k * (alpha - 1) * ema;
+        a2            = k * (alpha + 1) * ema;
+        a3            = -k * ema2;
+    } break;
+    case 1: {
+        const float k = (1 - ema) * (1 - ema) / ema;
+        a0            = k * ema;
+        a1 = a3 = 0;
+        a2      = -a0;
+    } break;
+    case 2: {
+        const float ea = (float)std::exp(-alpha), k = -(ema2 - 1) / (2 * alpha * ema),
+                    kn = (-2 * (-1 + 3 * ea - 3 * ea * ea + ea * ea * ea) / (3 * ea + 1 + 3 * ea * ea + ea * ea * ea));
+        a0             = kn;
+        a1             = -kn * (1 + k * alpha) * ema;
+        a2             = kn * (1 - k * alpha) * ema;
+        a3             = -kn * ema2;
+    } break;
+    default:
+        fprintf(stderr, "gaussianFilter: invalid order parameter!\n");
+        return;
+    }
+
+    coefp = (a0 + a1) / (1 + b1 + b2);
+    coefn = (a2 + a3) / (1 + b1 + b2);
+// process columns
+#if USE_SIMPLE_FILTER
+    d_simpleRecursive_rgba<<<iDivUp(width, nthreads), nthreads>>>(d_src, d_temp, width, height, ema);
+#else
+    d_recursiveGaussian_rgba<<<iDivUp(width, nthreads), nthreads>>>(
+        d_src, d_temp, width, height, a0, a1, a2, a3, b1, b2, coefp, coefn);
+#endif
+    getLastCudaError("Kernel execution failed");
+
+    transpose(d_temp, d_dest, width, height);
+    getLastCudaError("transpose: Kernel execution failed");
+
+// process rows
+#if USE_SIMPLE_FILTER
+    d_simpleRecursive_rgba<<<iDivUp(height, nthreads), nthreads>>>(d_dest, d_temp, height, width, ema);
+#else
+    d_recursiveGaussian_rgba<<<iDivUp(height, nthreads), nthreads>>>(
+        d_dest, d_temp, height, width, a0, a1, a2, a3, b1, b2, coefp, coefn);
+#endif
+    getLastCudaError("Kernel execution failed");
+    transpose(d_temp, d_dest, height, width);
+}
diff --git a/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh
new file mode 100644
index 000000000..4db2642c9
--- /dev/null
+++ b/src/cuda/HPC/image/recursiveGaussian/recursiveGaussian_kernel.cuh
@@ -0,0 +1,235 @@
+/* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *  * Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of NVIDIA CORPORATION nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+  Recursive Gaussian filter
+*/
+
+#ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
+#define _RECURSIVEGAUSSIAN_KERNEL_CU_
+
+#include <cooperative_groups.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+namespace cg = cooperative_groups;
+
+#include "helper_cuda.h"
+#include "helper_math.h"
+
+#define BLOCK_DIM     16
+#define CLAMP_TO_EDGE 1
+
+// Tiled transpose kernel: idata is read as width x height and written to odata as height x width
+__global__ void d_transpose(uint *odata, uint *idata, int width, int height)
+{
+    // Handle to thread block group, used for the block-wide barrier below
+    cg::thread_block cta = cg::this_thread_block();
+
+    // staging tile with one extra column per row (presumably padding against bank conflicts)
+    __shared__ uint block[BLOCK_DIM][BLOCK_DIM + 1];
+
+    // stage 1: each in-range thread copies one input element (column srcX, row srcY) into the tile
+    const unsigned int srcX = blockIdx.x * BLOCK_DIM + threadIdx.x;
+    const unsigned int srcY = blockIdx.y * BLOCK_DIM + threadIdx.y;
+
+    if ((srcX < width) && (srcY < height)) {
+        block[threadIdx.y][threadIdx.x] = idata[srcY * width + srcX];
+    }
+
+    // every thread of the block reaches this barrier, including those outside the image
+    cg::sync(cta);
+
+    // stage 2: the block indices swap roles and the tile is read with swapped thread indices
+    const unsigned int dstX = blockIdx.y * BLOCK_DIM + threadIdx.x;
+    const unsigned int dstY = blockIdx.x * BLOCK_DIM + threadIdx.y;
+
+    if ((dstX < height) && (dstY < width)) {
+        odata[dstY * height + dstX] = block[threadIdx.x][threadIdx.y];
+    }
+}
+
+// RGBA version
+// reads from 32-bit uint array holding 8-bit RGBA
+
+// pack a float4 colour into one 32-bit word: x in bits 0-7, y in 8-15, z in 16-23, w in 24-31
+__device__ uint rgbaFloatToInt(float4 rgba)
+{
+    const uint x8 = uint(__saturatef(rgba.x) * 255); // clamp to [0.0, 1.0], scale, truncate
+    const uint y8 = uint(__saturatef(rgba.y) * 255);
+    const uint z8 = uint(__saturatef(rgba.z) * 255);
+    const uint w8 = uint(__saturatef(rgba.w) * 255);
+    return (w8 << 24) | (z8 << 16) | (y8 << 8) | x8;
+}
+
+// unpack a 32-bit word into a float4: bytes 0..3 (low to high) become x, y, z, w, each divided by 255
+__device__ float4 rgbaIntToFloat(uint c)
+{
+    const uint byte0 = c & 0xff;
+    const uint byte1 = (c >> 8) & 0xff;
+    const uint byte2 = (c >> 16) & 0xff;
+    const uint byte3 = (c >> 24) & 0xff;
+
+    return make_float4(byte0 / 255.0f, byte1 / 255.0f, byte2 / 255.0f, byte3 / 255.0f);
+}
+
+/*
+  simple 1st order recursive filter
+  - processes one image column per thread
+  - a top-to-bottom sweep writes od, then a bottom-to-top sweep is averaged into it
+  parameters:
+  id - pointer to input data (RGBA image packed into 32-bit integers)
+  od - pointer to output data
+  w  - image width (also the element stride between consecutive rows)
+  h  - image height (NOTE(review): *id is read unconditionally, so h >= 1 looks assumed - confirm)
+  a  - blur parameter: weight of the previous output in the lerp (a = 0 passes the input through)
+*/
+
+__global__ void d_simpleRecursive_rgba(uint *id, uint *od, int w, int h, float a)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (x >= w) // threads beyond the last column have nothing to do
+        return;
+
+    id += x; // advance pointers to correct column
+    od += x;
+
+    // forward pass
+    float4 yp = rgbaIntToFloat(*id); // previous output, seeded with the first pixel of the column
+
+    for (int y = 0; y < h; y++) {
+        float4 xc = rgbaIntToFloat(*id);
+        float4 yc = xc + a * (yp - xc); // simple lerp between current and previous value
+        *od       = rgbaFloatToInt(yc);
+        id += w;
+        od += w; // move to next row
+        yp = yc;
+    }
+
+    // reset pointers to point to last element in column
+    id -= w;
+    od -= w;
+
+    // reverse pass
+    // ensures response is symmetrical
+    yp = rgbaIntToFloat(*id); // seeded with the last pixel of the column
+
+    for (int y = h - 1; y >= 0; y--) {
+        float4 xc = rgbaIntToFloat(*id);
+        float4 yc = xc + a * (yp - xc);
+        *od       = rgbaFloatToInt((rgbaIntToFloat(*od) + yc) * 0.5f); // average with the stored forward result
+        id -= w;
+        od -= w; // move to previous row
+        yp = yc;
+    }
+}
+
+/*
+  recursive Gaussian filter
+  - one image column per thread: a forward sweep writes od, a reverse sweep is added to it
+  parameters:
+  id - pointer to input data (RGBA image packed into 32-bit integers)
+  od - pointer to output data
+  w  - image width (also the element stride between consecutive rows)
+  h  - image height (NOTE(review): with CLAMP_TO_EDGE *id is read unconditionally, so h >= 1 looks assumed)
+  a0-a3, b1, b2, coefp, coefn - filter parameters (a0/a1 forward taps, a2/a3 reverse taps, b1/b2 feedback)
+*/
+
+__global__ void d_recursiveGaussian_rgba(uint *id,
+                                         uint *od,
+                                         int   w,
+                                         int   h,
+                                         float a0,
+                                         float a1,
+                                         float a2,
+                                         float a3,
+                                         float b1,
+                                         float b2,
+                                         float coefp,
+                                         float coefn)
+{
+    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (x >= w) // threads beyond the last column have nothing to do
+        return;
+
+    id += x; // advance pointers to correct column
+    od += x;
+
+    // forward pass: each output uses the current and previous input and the previous two outputs
+    float4 xp = make_float4(0.0f); // previous input
+    float4 yp = make_float4(0.0f); // previous output
+    float4 yb = make_float4(0.0f); // previous output by 2
+#if CLAMP_TO_EDGE
+    xp = rgbaIntToFloat(*id); // seed the history from the first pixel instead of zero
+    yb = coefp * xp;
+    yp = yb;
+#endif
+
+    for (int y = 0; y < h; y++) {
+        float4 xc = rgbaIntToFloat(*id);
+        float4 yc = a0 * xc + a1 * xp - b1 * yp - b2 * yb;
+        *od       = rgbaFloatToInt(yc);
+        id += w;
+        od += w; // move to next row
+        xp = xc;
+        yb = yp;
+        yp = yc;
+    }
+
+    // reset pointers to point to last element in column
+    id -= w;
+    od -= w;
+
+    // reverse pass
+    // ensures response is symmetrical
+    float4 xn = make_float4(0.0f); // input one row below the current row
+    float4 xa = make_float4(0.0f); // input two rows below
+    float4 yn = make_float4(0.0f); // reverse output one row below
+    float4 ya = make_float4(0.0f); // reverse output two rows below
+#if CLAMP_TO_EDGE
+    xn = xa = rgbaIntToFloat(*id); // seed the history from the last pixel instead of zero
+    yn      = coefn * xn;
+    ya      = yn;
+#endif
+
+    for (int y = h - 1; y >= 0; y--) {
+        float4 xc = rgbaIntToFloat(*id);
+        float4 yc = a2 * xn + a3 * xa - b1 * yn - b2 * ya; // uses only rows below; xc enters the history next
+        xa        = xn;
+        xn        = xc;
+        ya        = yn;
+        yn        = yc;
+        *od       = rgbaFloatToInt(rgbaIntToFloat(*od) + yc); // add to the stored (8-bit) forward result
+        id -= w;
+        od -= w; // move to previous row
+    }
+}
+
+#endif // #ifndef _RECURSIVEGAUSSIAN_KERNEL_CU_
diff --git a/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py b/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py
new file mode 100644
index 000000000..0ae69119e
--- /dev/null
+++ b/src/cuda/HPC/newton/diffsim_ball/example_diffsim_ball.py
@@ -0,0 +1,284 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 The Newton Developers
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Diffsim Ball
+#
+# Shows how to use Newton to optimize the initial velocity of a particle
+# such that it bounces off the wall and floor in order to hit a target.
+#
+# This example uses the built-in wp.Tape() object to compute gradients of
+# the distance to target (loss) w.r.t the initial velocity, followed by
+# a simple gradient-descent optimization step.
+#
+# Command: python -m newton.examples diffsim_ball
+#
+###########################################################################
+import numpy as np
+import warp as wp
+
+import newton
+import newton.examples
+from newton.tests.unittest_utils import assert_np_equal
+from newton.utils import bourke_color_map
+
+
+@wp.kernel
+def loss_kernel(pos: wp.array(dtype=wp.vec3), target: wp.vec3, loss: wp.array(dtype=float)):
+    # distance to target
+    delta = pos[0] - target
+    loss[0] = wp.dot(delta, delta)
+
+
+@wp.kernel
+def step_kernel(x: wp.array(dtype=wp.vec3), grad: wp.array(dtype=wp.vec3), alpha: float):
+    tid = wp.tid()
+
+    # gradient descent step
+    x[tid] = x[tid] - grad[tid] * alpha
+
+
+class Example:
+    def __init__(self, viewer, args):
+        # setup simulation parameters first
+        self.fps = 60
+        self.frame = 0
+        self.frame_dt = 1.0 / self.fps
+        self.sim_steps = 36
+        self.sim_substeps = 8
+        self.sim_dt = self.frame_dt / self.sim_substeps
+
+        self.verbose = args.verbose
+
+        self.train_iter = 0
+        self.train_rate = 0.02
+        self.target = (0.0, -2.0, 1.5)
+        self.loss = wp.zeros(1, dtype=wp.float32, requires_grad=True)
+        self.loss_history = []
+
+        self.viewer = viewer
+        self.viewer.show_particles = True
+
+        # setup simulation scene
+        scene = newton.ModelBuilder(up_axis=newton.Axis.Z)
+
+        scene.add_particle(pos=wp.vec3(0.0, -0.5, 1.0), vel=wp.vec3(0.0, 5.0, -5.0), mass=1.0)
+
+        # add wall and ground plane
+        ke = 1.0e4
+        kf = 0.0
+        kd = 1.0e1
+        mu = 0.2
+
+        scene.add_shape_box(
+            body=-1,
+            xform=wp.transform(wp.vec3(0.0, 2.0, 1.0), wp.quat_identity()),
+            hx=1.0,
+            hy=0.25,
+            hz=1.0,
+            cfg=newton.ModelBuilder.ShapeConfig(ke=ke, kf=kf, kd=kd, mu=mu),
+        )
+
+        scene.add_ground_plane(cfg=newton.ModelBuilder.ShapeConfig(ke=ke, kf=kf, kd=kd, mu=mu))
+
+        # finalize model
+        # use `requires_grad=True` to create a model for differentiable simulation
+        self.model = scene.finalize(requires_grad=True)
+
+        self.model.soft_contact_ke = ke
+        self.model.soft_contact_kf = kf
+        self.model.soft_contact_kd = kd
+        self.model.soft_contact_mu = mu
+        self.model.soft_contact_restitution = 1.0
+
+        self.solver = newton.solvers.SolverSemiImplicit(self.model)
+
+        # allocate sim states, initialize control and one-shot contacts (valid for simple collisions against constant plane)
+        self.states = [self.model.state() for _ in range(self.sim_steps * self.sim_substeps + 1)]
+        self.control = self.model.control()
+
+        # Create collision pipeline (requires_grad for differentiable simulation)
+        self.collision_pipeline = newton.CollisionPipeline(
+            self.model,
+            broad_phase="explicit",
+            soft_contact_margin=10.0,
+            requires_grad=True,
+        )
+        self.contacts = self.collision_pipeline.contacts()
+        self.collision_pipeline.collide(self.states[0], self.contacts)
+
+        self.viewer.set_model(self.model)
+
+        # capture forward/backward passes
+        self.capture()
+
+    def capture(self):
+        # if wp.get_device().is_cuda:
+        #     with wp.ScopedCapture() as capture:
+        #         self.forward_backward()
+        #     self.graph = capture.graph
+        # else:
+            self.graph = None
+
+    def forward_backward(self):
+        self.tape = wp.Tape()
+        with self.tape:
+            self.forward()
+        self.tape.backward(self.loss)
+
+    def forward(self):
+        # run simulation loop
+        for sim_step in range(self.sim_steps):
+            self.simulate(sim_step)
+
+        # compute loss on final state
+        wp.launch(loss_kernel, dim=1, inputs=[self.states[-1].particle_q, self.target, self.loss])
+
+        return self.loss
+
+    def simulate(self, sim_step):
+        for i in range(self.sim_substeps):
+            t = sim_step * self.sim_substeps + i
+            self.states[t].clear_forces()
+            self.solver.step(self.states[t], self.states[t + 1], self.control, self.contacts, self.sim_dt)
+
+    def step(self):
+        if self.graph:
+            wp.capture_launch(self.graph)
+        else:
+            self.forward_backward()
+
+        x = self.states[0].particle_qd
+
+        if self.verbose:
+            print(f"Train iter: {self.train_iter} Loss: {self.loss}")
+            print(f"    x: {x} g: {x.grad}")
+
+        # gradient descent step
+        wp.launch(step_kernel, dim=len(x), inputs=[x, x.grad, self.train_rate])
+
+        # clear grads for next iteration
+        self.tape.zero()
+
+        self.train_iter += 1
+        self.loss_history.append(self.loss.numpy()[0])
+
+    def test_final(self):
+        x_grad_numeric, x_grad_analytic = self.check_grad()
+        assert_np_equal(x_grad_numeric, x_grad_analytic, tol=5e-2)
+        assert all(np.array(self.loss_history) < 10.0)
+        # skip the last loss because there could be some bouncing around the optimum
+        assert all(np.diff(self.loss_history[:-1]) < -1e-3)
+
+    def render(self):
+        if self.viewer.is_paused():
+            self.viewer.begin_frame(self.viewer.time)
+            self.viewer.end_frame()
+            return
+
+        if self.frame > 0 and self.train_iter % 16 != 0:
+            return
+
+        # draw trajectory
+        traj_verts = [self.states[0].particle_q.numpy()[0].tolist()]
+
+        for i in range(self.sim_steps + 1):
+            state = self.states[i * self.sim_substeps]
+            traj_verts.append(state.particle_q.numpy()[0].tolist())
+
+            self.viewer.begin_frame(self.frame * self.frame_dt)
+            self.viewer.log_scalar("/loss", self.loss.numpy()[0])
+            self.viewer.log_state(state)
+            self.viewer.log_contacts(self.contacts, state)
+            self.viewer.log_shapes(
+                "/target",
+                newton.GeoType.BOX,
+                (0.1, 0.1, 0.1),
+                wp.array([wp.transform(self.target, wp.quat_identity())], dtype=wp.transform),
+                wp.array([wp.vec3(0.0, 0.0, 0.0)], dtype=wp.vec3),
+            )
+            self.viewer.log_lines(
+                f"/traj_{self.train_iter - 1}",
+                wp.array(traj_verts[0:-1], dtype=wp.vec3),
+                wp.array(traj_verts[1:], dtype=wp.vec3),
+                bourke_color_map(0.0, 7.0, self.loss.numpy()[0]),
+            )
+            self.viewer.end_frame()
+
+            self.frame += 1
+
+    def check_grad(self):
+        param = self.states[0].particle_qd
+
+        # initial value
+        x_c = param.numpy().flatten()
+
+        # compute numeric gradient
+        x_grad_numeric = np.zeros_like(x_c)
+
+        for i in range(len(x_c)):
+            eps = 1.0e-3
+
+            step = np.zeros_like(x_c)
+            step[i] = eps
+
+            x_1 = x_c + step
+            x_0 = x_c - step
+
+            param.assign(x_1)
+            l_1 = self.forward().numpy()[0]
+
+            param.assign(x_0)
+            l_0 = self.forward().numpy()[0]
+
+            dldx = (l_1 - l_0) / (eps * 2.0)
+
+            x_grad_numeric[i] = dldx
+
+        # reset initial state
+        param.assign(x_c)
+
+        # compute analytic gradient
+        tape = wp.Tape()
+        with tape:
+            l = self.forward()
+
+        tape.backward(l)
+
+        x_grad_analytic = param.grad.numpy()[0].copy()
+
+        print(f"numeric grad: {x_grad_numeric}")
+        print(f"analytic grad: {x_grad_analytic}")
+
+        tape.zero()
+
+        return x_grad_numeric, x_grad_analytic
+
+    @staticmethod
+    def create_parser():
+        parser = newton.examples.create_parser()
+        parser.add_argument(
+            "--verbose", action="store_true", help="Print out additional status messages during execution."
+        )
+        return parser
+
+
+if __name__ == "__main__":
+    parser = Example.create_parser()
+    viewer, args = newton.examples.init(parser)
+
+    example = Example(viewer, args)
+    example.check_grad()
+    newton.examples.run(example, args)
diff --git a/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py b/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py
new file mode 100644
index 000000000..485c865f0
--- /dev/null
+++ b/src/cuda/HPC/newton/robot_cartpole/example_robot_cartpole.py
@@ -0,0 +1,206 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 The Newton Developers
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Robot Cartpole
+#
+# Shows how to set up a simulation of a rigid-body cartpole articulation
+# from a USD stage using newton.ModelBuilder.add_usd().
+#
+# Command: python -m newton.examples robot_cartpole --world-count 100
+#
+###########################################################################
+
+import warp as wp
+
+import newton
+import newton.examples
+
+
+class Example:
+    def __init__(self, viewer, args):
+        self.fps = 60
+        self.frame_dt = 1.0 / self.fps
+        self.sim_time = 0.0
+        self.sim_substeps = 10
+        self.sim_dt = self.frame_dt / self.sim_substeps
+
+        self.world_count = args.world_count
+
+        self.viewer = viewer
+
+        cartpole = newton.ModelBuilder()
+        newton.solvers.SolverMuJoCo.register_custom_attributes(cartpole)
+        cartpole.default_shape_cfg.density = 100.0
+        cartpole.default_joint_cfg.armature = 0.1
+        cartpole.default_body_armature = 0.1
+
+        cartpole.add_usd(
+            newton.examples.get_asset("cartpole.usda"),
+            enable_self_collisions=False,
+            collapse_fixed_joints=True,
+        )
+        # set initial joint positions
+        cartpole.joint_q[-3:] = [0.0, 0.3, 0.0]
+
+        builder = newton.ModelBuilder()
+        builder.replicate(cartpole, self.world_count, spacing=(1.0, 2.0, 0.0))
+
+        # finalize model
+        self.model = builder.finalize()
+
+        self.solver = newton.solvers.SolverMuJoCo(self.model)
+        # self.solver = newton.solvers.SolverSemiImplicit(self.model, joint_attach_ke=1600.0, joint_attach_kd=20.0)
+        # self.solver = newton.solvers.SolverFeatherstone(self.model)
+
+        self.state_0 = self.model.state()
+        self.state_1 = self.model.state()
+        self.control = self.model.control()
+        # we do not need to evaluate contacts for this example
+        self.contacts = None
+
+        # Evaluating forward kinematics is needed only for maximal-coordinate solvers
+        newton.eval_fk(self.model, self.model.joint_q, self.model.joint_qd, self.state_0)
+
+        self.viewer.set_model(self.model)
+        self.viewer.set_world_offsets((0.0, 0.0, 0.0))
+
+        # Set camera to view all the cartpoles
+        self.viewer.set_camera(
+            pos=wp.vec3(7.3, -14.0, 2.3),
+            pitch=-5.0,
+            yaw=-225.0,
+        )
+        if hasattr(self.viewer, "camera") and hasattr(self.viewer.camera, "fov"):
+            self.viewer.camera.fov = 90.0
+
+        self.capture()
+
+    def capture(self):
+        """Leave CUDA graph replay disabled so step() always calls simulate() directly.
+
+        The wp.ScopedCapture() path that would record simulate() into a graph is
+        switched off in this copy of the example."""
+        self.graph = None
+
+    def simulate(self):
+        for _ in range(self.sim_substeps):
+            self.state_0.clear_forces()
+
+            # apply forces to the model for picking, wind, etc
+            self.viewer.apply_forces(self.state_0)
+
+            self.solver.step(self.state_0, self.state_1, self.control, self.contacts, self.sim_dt)
+
+            # swap states
+            self.state_0, self.state_1 = self.state_1, self.state_0
+
+    def step(self):
+        if self.graph:
+            wp.capture_launch(self.graph)
+        else:
+            self.simulate()
+
+        self.sim_time += self.frame_dt
+
+    def render(self):
+        self.viewer.begin_frame(self.sim_time)
+        self.viewer.log_state(self.state_0)
+        self.viewer.end_frame()
+
+    def test_final(self):
+        num_bodies_per_world = self.model.body_count // self.world_count
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "cart is at ground level and has correct orientation",
+            lambda q, qd: q[2] == 0.0 and newton.math.vec_allclose(q.q, wp.quat_identity()),
+            indices=[i * num_bodies_per_world for i in range(self.world_count)],
+        )
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "cart only moves along y direction",
+            lambda q, qd: qd[0] == 0.0
+            and abs(qd[1]) > 0.05
+            and qd[2] == 0.0
+            and wp.length_sq(wp.spatial_bottom(qd)) == 0.0,
+            indices=[i * num_bodies_per_world for i in range(self.world_count)],
+        )
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "pole1 only has y-axis linear velocity and x-axis angular velocity",
+            lambda q, qd: qd[0] == 0.0
+            and abs(qd[1]) > 0.05
+            and qd[2] == 0.0
+            and abs(qd[3]) > 0.3
+            and qd[4] == 0.0
+            and qd[5] == 0.0,
+            indices=[i * num_bodies_per_world + 1 for i in range(self.world_count)],
+        )
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "pole2 only has yz-plane linear velocity and x-axis angular velocity",
+            lambda q, qd: qd[0] == 0.0
+            and abs(qd[1]) > 0.05
+            and abs(qd[2]) > 0.05
+            and abs(qd[3]) > 0.2
+            and qd[4] == 0.0
+            and qd[5] == 0.0,
+            indices=[i * num_bodies_per_world + 2 for i in range(self.world_count)],
+        )
+        qd = self.state_0.body_qd.numpy()
+        world0_cart_vel = wp.spatial_vector(*qd[0])
+        world0_pole1_vel = wp.spatial_vector(*qd[1])
+        world0_pole2_vel = wp.spatial_vector(*qd[2])
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "cart velocities match across worlds",
+            lambda q, qd: newton.math.vec_allclose(qd, world0_cart_vel),
+            indices=[i * num_bodies_per_world for i in range(self.world_count)],
+        )
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "pole1 velocities match across worlds",
+            lambda q, qd: newton.math.vec_allclose(qd, world0_pole1_vel),
+            indices=[i * num_bodies_per_world + 1 for i in range(self.world_count)],
+        )
+        newton.examples.test_body_state(
+            self.model,
+            self.state_0,
+            "pole2 velocities match across worlds",
+            lambda q, qd: newton.math.vec_allclose(qd, world0_pole2_vel, atol=1e-6),
+            indices=[i * num_bodies_per_world + 2 for i in range(self.world_count)],
+        )
+
+    @staticmethod
+    def create_parser():
+        parser = newton.examples.create_parser()
+        newton.examples.add_world_count_arg(parser)
+        parser.set_defaults(world_count=100)
+        return parser
+
+
+if __name__ == "__main__":
+    parser = Example.create_parser()
+    viewer, args = newton.examples.init(parser)
+
+    example = Example(viewer, args)
+
+    newton.examples.run(example, args)
diff --git a/src/cuda/HPC/newton/setup_newton.sh b/src/cuda/HPC/newton/setup_newton.sh
new file mode 100755
index 000000000..7ec1090f4
--- /dev/null
+++ b/src/cuda/HPC/newton/setup_newton.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Setup Newton environment (similar to huggingface/setup_environment.sh).
+# Source this file (rather than executing it) if the activation and exports should persist.
+NEWTON_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+NEWTON_SUBMODULE="$NEWTON_DIR/../external/newton"
+NEWTON_STAMP="$NEWTON_DIR/newton_venv/.newton_installed"
+
+# Create Python virtual environment if it doesn't exist, then activate it
+if [ ! -d "$NEWTON_DIR/newton_venv" ]; then
+    echo "Creating Python virtual environment for Newton..."
+    python3 -m venv "$NEWTON_DIR/newton_venv"
+fi
+source "$NEWTON_DIR/newton_venv/bin/activate"
+
+# Install Newton and dependencies until one attempt fully succeeds (the stamp file
+# records success), so a missing/empty submodule or failed pip run is retried next time.
+if [ ! -f "$NEWTON_STAMP" ]; then
+    if [ -n "$(ls -A "$NEWTON_SUBMODULE" 2>/dev/null)" ]; then
+        echo "Installing Newton from submodule..."
+        pip install --upgrade pip && pip install -e "$NEWTON_SUBMODULE" &&
+            pip install usd-core mujoco-warp numpy warp-lang && touch "$NEWTON_STAMP"
+    else
+        echo "WARNING: Newton submodule not found at $NEWTON_SUBMODULE"
+        echo "Run: git submodule update --init --recursive"
+    fi
+fi
+
+export NEWTON_ENV="$NEWTON_DIR/newton_venv"
+export PYTHONPATH="$NEWTON_SUBMODULE${PYTHONPATH:+:$PYTHONPATH}"
diff --git a/src/cuda/HPC/setup_vpi.sh b/src/cuda/HPC/setup_vpi.sh
new file mode 100755
index 000000000..1f38c4f52
--- /dev/null
+++ b/src/cuda/HPC/setup_vpi.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# VPI Setup Script
+# Installs VPI library if not already present
+# Requires sudo for system-wide installation
+
+set -eo pipefail  # pipefail: a failed key download must abort instead of feeding gpg nothing
+
+echo "VPI Setup Script"
+echo "================"
+
+# Check if VPI is already installed
+if command -v vpi-config &> /dev/null; then
+    VPI_VERSION=$(vpi-config --version)
+    VPI_PATH=$(vpi-config --prefix)
+    echo "VPI already installed: version $VPI_VERSION at $VPI_PATH"
+    exit 0
+fi
+
+# Check if running on x86_64 Linux
+if [[ "$(uname -m)" != "x86_64" ]] || [[ "$(uname -s)" != "Linux" ]]; then
+    echo "ERROR: VPI installation is only supported on Linux x86_64"
+    exit 1
+fi
+
+# Detect Ubuntu version
+if [[ -f /etc/os-release ]]; then
+    . /etc/os-release
+    UBUNTU_VERSION=$VERSION_ID
+else
+    echo "ERROR: Cannot detect Ubuntu version"
+    exit 1
+fi
+
+echo "Detected Ubuntu $UBUNTU_VERSION"
+
+# Check for sudo
+if ! command -v sudo &> /dev/null; then
+    echo "ERROR: sudo is required for VPI installation"
+    exit 1
+fi
+
+echo ""
+echo "Installing VPI via apt..."
+echo "This requires sudo privileges and will install system packages."
+echo ""
+
+# Install prerequisites (curl is needed for the repository key download below)
+sudo apt-get update
+sudo apt-get install -y curl gnupg software-properties-common
+
+# Add NVIDIA repository key (modern method); --batch --yes overwrites a keyring left by an earlier run
+sudo mkdir -p /etc/apt/keyrings
+curl -fsSL https://repo.download.nvidia.com/jetson/jetson-ota-public.asc | sudo gpg --batch --yes --dearmor -o /etc/apt/keyrings/nvidia-jetson.gpg
+
+# Add repository based on Ubuntu version
+if [[ "$UBUNTU_VERSION" == "22.04" ]]; then
+    echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/jammy r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list
+elif [[ "$UBUNTU_VERSION" == "24.04" ]]; then
+    echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/noble r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list
+else
+    echo "WARNING: Ubuntu $UBUNTU_VERSION not officially supported. Trying jammy repository..."
+    echo "deb [signed-by=/etc/apt/keyrings/nvidia-jetson.gpg] https://repo.download.nvidia.com/jetson/x86_64/jammy r38.4 main" | sudo tee /etc/apt/sources.list.d/nvidia-jetson.list
+fi
+
+# Install VPI packages
+sudo apt-get update
+sudo apt-get install -y libnvvpi4 vpi4-dev vpi4-samples
+
+# Detect Python version and install Python bindings (a missing python3 falls through to the warning)
+PYTHON_VERSION=$(python3 --version 2>/dev/null | awk '{print $2}' | cut -d. -f1,2 || true)
+if [[ "$PYTHON_VERSION" == "3.10" ]]; then
+    sudo apt-get install -y python3.10-vpi4
+elif [[ "$PYTHON_VERSION" == "3.12" ]]; then
+    sudo apt-get install -y python3.12-vpi4
+else
+    echo "WARNING: Python VPI bindings not available for Python $PYTHON_VERSION"
+    echo "Supported versions: 3.10, 3.12"
+fi
+
+echo ""
+echo "VPI installation complete!"
+echo "VPI installed at: $(vpi-config --prefix)"
+echo "VPI version: $(vpi-config --version)"
diff --git a/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp b/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp
new file mode 100644
index 000000000..9364f7eed
--- /dev/null
+++ b/src/cuda/HPC/vpi/vpi_background_subtractor/main.cpp
@@ -0,0 +1,276 @@
+/*
+* Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions
+* are met:
+*  * Redistributions of source code must retain the above copyright
+*    notice, this list of conditions and the following disclaimer.
+*  * Redistributions in binary form must reproduce the above copyright
+*    notice, this list of conditions and the following disclaimer in the
+*    documentation and/or other materials provided with the distribution.
+*  * Neither the name of NVIDIA CORPORATION nor the names of its
+*    contributors may be used to endorse or promote products derived
+*    from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+* PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include <opencv2/core/version.hpp>
+#include <opencv2/imgcodecs.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#include <opencv2/videoio.hpp>
+#include <vpi/OpenCVInterop.hpp>
+
+#include <vpi/Array.h>
+#include <vpi/Image.h>
+#include <vpi/ImageFormat.h>
+#include <vpi/Pyramid.h>
+#include <vpi/Status.h>
+#include <vpi/Stream.h>
+#include <vpi/algo/BackgroundSubtractor.h>
+#include <vpi/algo/ConvertImageFormat.h>
+
+#include <iostream>
+#include <sstream>
+
+// Evaluates STMT once; if the resulting VPIStatus is not VPI_SUCCESS, throws std::runtime_error
+// carrying the status name and VPI's last status message. No trailing ';' so it is if/else-safe.
+#define CHECK_STATUS(STMT)                                    \
+    do {                                                      \
+        VPIStatus status = (STMT);                            \
+        if (status != VPI_SUCCESS) {                          \
+            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
+            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
+            std::ostringstream ss;                            \
+            ss << vpiStatusGetName(status) << ": " << buffer; \
+            throw std::runtime_error(ss.str());               \
+        }                                                     \
+    } while (0)
+
+// Runs VPI background subtraction over a video on the chosen backend and writes the foreground
+// mask and background image to fgmask_<backend>.mp4 / bgimage_<backend>.mp4. Returns 1 on error.
+int main(int argc, char *argv[])
+{
+    // OpenCV image that will be wrapped by a VPIImage.
+    // Define it here so that it's destroyed *after* wrapper is destroyed
+    cv::Mat cvCurFrame;
+
+    // VPI objects that will be used
+    VPIStream stream     = NULL;
+    VPIImage imgCurFrame = NULL;
+    VPIImage bgimage     = NULL;
+    VPIImage fgmask      = NULL;
+    VPIPayload payload   = NULL;
+    int retval = 0;
+
+    try
+    {
+        // Parse named arguments
+        std::string strBackend;
+        std::string strInputVideo;
+        int numFramesToProcess = -1; // -1 means process all frames
+
+        for (int i = 1; i < argc; i++)
+        {
+            std::string arg = argv[i];
+
+            if (arg == "--backend" || arg == "-b")
+            {
+                if (i + 1 < argc)
+                {
+                    strBackend = argv[++i];
+                }
+                else
+                {
+                    throw std::runtime_error("--backend requires a value");
+                }
+            }
+            else if (arg == "--input" || arg == "-i")
+            {
+                if (i + 1 < argc)
+                {
+                    strInputVideo = argv[++i];
+                }
+                else
+                {
+                    throw std::runtime_error("--input requires a value");
+                }
+            }
+            else if (arg == "--num-frames" || arg == "-n")
+            {
+                if (i + 1 < argc)
+                {
+                    numFramesToProcess = std::atoi(argv[++i]);
+                    if (numFramesToProcess <= 0)
+                    {
+                        throw std::runtime_error("--num-frames must be a positive integer");
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("--num-frames requires a value");
+                }
+            }
+            else if (arg == "--help" || arg == "-h")
+            {
+                std::cout << "Usage: " << argv[0] << " [OPTIONS]\n"
+                          << "Options:\n"
+                          << "  --backend, -b <cpu|cuda>    Backend to use (required)\n"
+                          << "  --input, -i <video_file>    Input video file (required)\n"
+                          << "  --num-frames, -n <count>    Number of frames to process (optional, default: all)\n"
+                          << "  --help, -h                  Show this help message\n";
+                return 0;
+            }
+            else
+            {
+                throw std::runtime_error("Unknown argument: " + arg);
+            }
+        }
+
+        // Validate required arguments
+        if (strBackend.empty())
+        {
+            throw std::runtime_error("--backend is required\n\nUse --help for usage information");
+        }
+        if (strInputVideo.empty())
+        {
+            throw std::runtime_error("--input is required\n\nUse --help for usage information");
+        }
+
+        VPIBackend backend;
+        if (strBackend == "cpu")
+        {
+            backend = VPI_BACKEND_CPU;
+        }
+        else if (strBackend == "cuda")
+        {
+            backend = VPI_BACKEND_CUDA;
+        }
+        else
+        {
+            throw std::runtime_error("Backend '" + strBackend + "' not recognized.");
+        }
+
+        // Load the input video
+        cv::VideoCapture invid;
+        if (!invid.open(strInputVideo))
+        {
+            throw std::runtime_error("Can't open '" + strInputVideo + "'");
+        }
+
+        int32_t width  = invid.get(cv::CAP_PROP_FRAME_WIDTH);
+        int32_t height = invid.get(cv::CAP_PROP_FRAME_HEIGHT);
+
+        // Create the stream where processing will happen. We'll use user-provided backend.
+        CHECK_STATUS(vpiStreamCreate(backend, &stream));
+
+        // Create background subtractor payload to be executed on the given backend
+        // OpenCV delivers us BGR8 images, so the algorithm is configured to accept that.
+        CHECK_STATUS(vpiCreateBackgroundSubtractor(backend, width, height, VPI_IMAGE_FORMAT_BGR8, &payload));
+
+        // Create foreground image
+        CHECK_STATUS(vpiImageCreate(width, height, VPI_IMAGE_FORMAT_U8, 0, &fgmask));
+
+        // Create background image
+        CHECK_STATUS(vpiImageCreate(width, height, VPI_IMAGE_FORMAT_BGR8, 0, &bgimage));
+
+        int fourcc = cv::VideoWriter::fourcc('M', 'P', 'E', 'G');
+        double fps = invid.get(cv::CAP_PROP_FPS);
+
+        cv::VideoWriter outVideo("fgmask_" + strBackend + ".mp4", fourcc, fps, cv::Size(width, height), false);
+        if (!outVideo.isOpened())
+        {
+            throw std::runtime_error("Can't create output video");
+        }
+
+        cv::VideoWriter bgimageVideo("bgimage_" + strBackend + ".mp4", fourcc, fps, cv::Size(width, height));
+        if (!bgimageVideo.isOpened())
+        {
+            throw std::runtime_error("Can't create output video");
+        }
+
+        // Fetch a new frame until video ends or desired frame count is reached
+        int idxFrame = 1;
+
+        while (invid.read(cvCurFrame))
+        {
+            // Check if we've reached the desired number of frames
+            if (numFramesToProcess > 0 && idxFrame > numFramesToProcess)
+            {
+                printf("Processed %d frames (limit reached)\n", idxFrame - 1);
+                break;
+            }
+
+            printf("Processing frame %d\n", idxFrame++);
+            // Wrap frame into a VPIImage
+            if (imgCurFrame == NULL)
+            {
+                CHECK_STATUS(vpiImageCreateWrapperOpenCVMat(cvCurFrame, 0, &imgCurFrame));
+            }
+            else
+            {
+                CHECK_STATUS(vpiImageSetWrappedOpenCVMat(imgCurFrame, cvCurFrame));
+            }
+
+            VPIBackgroundSubtractorParams params;
+            CHECK_STATUS(vpiInitBackgroundSubtractorParams(&params));
+            params.learningRate = 0.01;
+
+            CHECK_STATUS(
+                vpiSubmitBackgroundSubtractor(stream, backend, payload, imgCurFrame, fgmask, bgimage, &params));
+
+            // Wait for processing to finish.
+            CHECK_STATUS(vpiStreamSync(stream));
+
+            {
+                // Now add it to the output video stream
+                VPIImageData imgdata;
+                CHECK_STATUS(vpiImageLockData(fgmask, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &imgdata));
+
+                cv::Mat outFrame;
+                CHECK_STATUS(vpiImageDataExportOpenCVMat(imgdata, &outFrame));
+
+                outVideo << outFrame;
+
+                CHECK_STATUS(vpiImageUnlock(fgmask));
+            }
+
+            {
+                VPIImageData bgdata;
+                CHECK_STATUS(vpiImageLockData(bgimage, VPI_LOCK_READ, VPI_IMAGE_BUFFER_HOST_PITCH_LINEAR, &bgdata));
+
+                cv::Mat outFrame;
+                CHECK_STATUS(vpiImageDataExportOpenCVMat(bgdata, &outFrame));
+
+                bgimageVideo << outFrame;
+
+                CHECK_STATUS(vpiImageUnlock(bgimage));
+            }
+        }
+    }
+    catch (std::exception &e)
+    {
+        std::cerr << e.what() << std::endl;
+        retval = 1;
+    }
+
+    // Destroy all resources used
+    vpiStreamDestroy(stream);
+    vpiPayloadDestroy(payload);
+    vpiImageDestroy(imgCurFrame);
+    vpiImageDestroy(fgmask);
+    vpiImageDestroy(bgimage);
+
+    return retval;
+}
diff --git a/src/cuda/common/common.mk b/src/cuda/common/common.mk
index cd5f3cefc..14885532b 100644
--- a/src/cuda/common/common.mk
+++ b/src/cuda/common/common.mk
@@ -21,7 +21,9 @@
 .SUFFIXES : .cu .cu_dbg.o .c_dbg.o .cpp_dbg.o .cu_rel.o .c_rel.o .cpp_rel.o .cubin .ptx
 
 INCLUDES += -I$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/common/inc
-ADDITIONAL_LIBS += -L$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/lib -lcutil_x86_64
+ifneq ($(strip $(OMIT_CUTIL_LIB)),1) # set OMIT_CUTIL_LIB=1 to leave the SDK cutil library off the link line
+  ADDITIONAL_LIBS += -L$(NVIDIA_COMPUTE_SDK_LOCATION)/../4.2/C/lib -lcutil_x86_64
+endif
 
 # Add new SM Versions here as devices with new Compute Capability are released
 SM_VERSIONS   :=  70 75
@@ -56,6 +58,9 @@ BINDIR     ?= $(ROOTBINDIR)/$(OSLOWER)
 ROOTOBJDIR ?= obj
 LIBDIR     ?= $(ROOTDIR)/../lib
 
+# Set CUDA_VERSION_MAJOR if not already set
+CUDA_VERSION_MAJOR ?= 12
+
 ifeq ($(shell test ${CUDA_VERSION_MAJOR} -lt 5; echo $$?), 0)
   LIBDIRSDK     := $(NVIDIA_COMPUTE_SDK_LOCATION)/C/lib
   COMMONDIR  := $(NVIDIA_COMPUTE_SDK_LOCATION)/C/common
diff --git a/src/cuda/cutlass-bench b/src/cuda/cutlass-bench
index e51efbfe1..291300fff 160000
--- a/src/cuda/cutlass-bench
+++ b/src/cuda/cutlass-bench
@@ -1 +1 @@
-Subproject commit e51efbfe18fe4f4cbb66ab814c55bf4aa0185491
+Subproject commit 291300ffffa3533a78ee104f08a8490a29ce9ccb
diff --git a/src/cuda/pytorch_examples b/src/cuda/pytorch_examples
index 30d1a73e5..1bef748fa 160000
--- a/src/cuda/pytorch_examples
+++ b/src/cuda/pytorch_examples
@@ -1 +1 @@
-Subproject commit 30d1a73e5ab3bb92e5834dc11805e53ee8aee2c7
+Subproject commit 1bef748fab064e2fc3beddcbda60fd51cb9612d2
diff --git a/src/setup_environment b/src/setup_environment
index 21b42af72..29a1dbba4 100755
--- a/src/setup_environment
+++ b/src/setup_environment
@@ -208,4 +208,11 @@ else
     fi
 fi
 
+# Check for VPI (optional for H100 VPI apps)
+if command -v vpi-config &> /dev/null; then
+    export VPI_INSTALL_PATH=$(vpi-config --prefix)
+    export VPI_VERSION=$(vpi-config --version)
+    echo "VPI found: version $VPI_VERSION at $VPI_INSTALL_PATH"
+fi
+
 export GPUAPPS_SETUP_ENVIRONMENT_WAS_RUN=1