Skip to content

Commit 351374d

Browse files
committed
Add NVRTC testing and fix missing header
1 parent 097601c commit 351374d

File tree

4 files changed

+384
-0
lines changed

4 files changed

+384
-0
lines changed

include/boost/math/special_functions/pow1p.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <boost/math/tools/is_detected.hpp>
1414
#include <boost/math/tools/precision.hpp>
1515
#include <boost/math/special_functions/fpclassify.hpp>
16+
#include <boost/math/special_functions/sign.hpp>
1617
#include <boost/math/policies/error_handling.hpp>
1718

1819
// For cuda we would rather use builtin nextafter than unsupported boost::math::nextafter

test/nvrtc_jamfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,3 +123,4 @@ run test_sin_pi_nvrtc_float.cpp ;
123123
run test_trigamma_nvrtc_double.cpp ;
124124
run test_trigamma_nvrtc_float.cpp ;
125125
run test_trunc_nvrtc_double.cpp ;
126+
run test_pow1p_nvrtc_double.cpp ;

test/test_pow1p_nvrtc_double.cpp

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Copyright John Maddock 2016.
2+
// Copyright Matt Borland 2024.
3+
// Use, modification and distribution are subject to the
4+
// Boost Software License, Version 1.0. (See accompanying file
5+
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6+
7+
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
8+
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
9+
10+
// Must be included first
11+
#include <nvrtc.h>
12+
#include <cuda.h>
13+
#include <cuda_runtime.h>
14+
15+
#include <iostream>
16+
#include <iomanip>
17+
#include <vector>
18+
#include <random>
19+
#include <exception>
20+
#include <boost/math/special_functions/pow1p.hpp>
21+
#include <boost/math/special_functions/relative_difference.hpp>
22+
23+
typedef double float_type;
24+
25+
const char* cuda_kernel = R"(
26+
typedef double float_type;
27+
#include <cuda/std/type_traits>
28+
#include <boost/math/special_functions/pow1p.hpp>
29+
extern "C" __global__
30+
void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
31+
{
32+
int i = blockDim.x * blockIdx.x + threadIdx.x;
33+
if (i < numElements)
34+
{
35+
out[i] = boost::math::pow1p(in1[i], in2[i]);
36+
}
37+
}
38+
)";
39+
40+
void checkCUDAError(cudaError_t result, const char* msg)
41+
{
42+
if (result != cudaSuccess)
43+
{
44+
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
45+
exit(EXIT_FAILURE);
46+
}
47+
}
48+
49+
void checkCUError(CUresult result, const char* msg)
50+
{
51+
if (result != CUDA_SUCCESS)
52+
{
53+
const char* errorStr;
54+
cuGetErrorString(result, &errorStr);
55+
std::cerr << msg << ": " << errorStr << std::endl;
56+
exit(EXIT_FAILURE);
57+
}
58+
}
59+
60+
void checkNVRTCError(nvrtcResult result, const char* msg)
61+
{
62+
if (result != NVRTC_SUCCESS)
63+
{
64+
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
65+
exit(EXIT_FAILURE);
66+
}
67+
}
68+
69+
int main()
70+
{
71+
try
72+
{
73+
// Initialize CUDA driver API
74+
checkCUError(cuInit(0), "Failed to initialize CUDA");
75+
76+
// Create CUDA context
77+
CUcontext context;
78+
CUdevice device;
79+
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
80+
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
81+
82+
nvrtcProgram prog;
83+
nvrtcResult res;
84+
85+
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
86+
checkNVRTCError(res, "Failed to create NVRTC program");
87+
88+
nvrtcAddNameExpression(prog, "test_trunc_kernel");
89+
90+
#ifdef BOOST_MATH_NVRTC_CI_RUN
91+
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
92+
#else
93+
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
94+
#endif
95+
96+
// Compile the program
97+
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
98+
if (res != NVRTC_SUCCESS)
99+
{
100+
size_t log_size;
101+
nvrtcGetProgramLogSize(prog, &log_size);
102+
char* log = new char[log_size];
103+
nvrtcGetProgramLog(prog, log);
104+
std::cerr << "Compilation failed:\n" << log << std::endl;
105+
delete[] log;
106+
exit(EXIT_FAILURE);
107+
}
108+
109+
// Get PTX from the program
110+
size_t ptx_size;
111+
nvrtcGetPTXSize(prog, &ptx_size);
112+
char* ptx = new char[ptx_size];
113+
nvrtcGetPTX(prog, ptx);
114+
115+
// Load PTX into CUDA module
116+
CUmodule module;
117+
CUfunction kernel;
118+
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
119+
checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
120+
121+
int numElements = 5000;
122+
float_type *h_in1, *h_in2, *h_out;
123+
float_type *d_in1, *d_in2, *d_out;
124+
125+
// Allocate memory on the host
126+
h_in1 = new float_type[numElements];
127+
h_in2 = new float_type[numElements];
128+
h_out = new float_type[numElements];
129+
130+
// Initialize input arrays
131+
std::mt19937_64 rng(42);
132+
std::uniform_real_distribution<float_type> x_vals(-1, 1);
133+
std::uniform_real_distribution<float_type> y_vals(-10, 10);
134+
for (int i = 0; i < numElements; ++i)
135+
{
136+
h_in1[i] = static_cast<float_type>(x_vals(rng));
137+
h_in2[i] = static_cast<float_type>(y_vals(rng));
138+
}
139+
140+
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
141+
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
142+
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
143+
144+
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
145+
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
146+
147+
int blockSize = 256;
148+
int numBlocks = (numElements + blockSize - 1) / blockSize;
149+
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
150+
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
151+
152+
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
153+
154+
// Verify Result
155+
for (int i = 0; i < numElements; ++i)
156+
{
157+
const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
158+
159+
if (std::isfinite(res))
160+
{
161+
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
162+
{
163+
std::cout << "error at line: " << i
164+
<< "\nParallel: " << h_out[i]
165+
<< "\n Serial: " << res
166+
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
167+
}
168+
}
169+
}
170+
171+
cudaFree(d_in1);
172+
cudaFree(d_in2);
173+
cudaFree(d_out);
174+
delete[] h_in1;
175+
delete[] h_in2;
176+
delete[] h_out;
177+
178+
nvrtcDestroyProgram(&prog);
179+
delete[] ptx;
180+
181+
cuCtxDestroy(context);
182+
183+
std::cout << "Kernel executed successfully." << std::endl;
184+
return 0;
185+
}
186+
catch(const std::exception& e)
187+
{
188+
std::cerr << "Stopped with exception: " << e.what() << std::endl;
189+
return EXIT_FAILURE;
190+
}
191+
}

test/test_pow1p_nvrtc_float.cpp

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
// Copyright John Maddock 2016.
2+
// Copyright Matt Borland 2024.
3+
// Use, modification and distribution are subject to the
4+
// Boost Software License, Version 1.0. (See accompanying file
5+
// LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
6+
7+
#define BOOST_MATH_OVERFLOW_ERROR_POLICY ignore_error
8+
#define BOOST_MATH_PROMOTE_DOUBLE_POLICY false
9+
10+
// Must be included first
11+
#include <nvrtc.h>
12+
#include <cuda.h>
13+
#include <cuda_runtime.h>
14+
15+
#include <iostream>
16+
#include <iomanip>
17+
#include <vector>
18+
#include <random>
19+
#include <exception>
20+
#include <boost/math/special_functions/pow1p.hpp>
21+
#include <boost/math/special_functions/relative_difference.hpp>
22+
23+
typedef double float_type;
24+
25+
const char* cuda_kernel = R"(
26+
typedef double float_type;
27+
#include <cuda/std/type_traits>
28+
#include <boost/math/special_functions/pow1p.hpp>
29+
extern "C" __global__
30+
void test_trunc_kernel(const float_type *in1, const float_type *in2, float_type *out, int numElements)
31+
{
32+
int i = blockDim.x * blockIdx.x + threadIdx.x;
33+
if (i < numElements)
34+
{
35+
out[i] = boost::math::pow1p(in1[i], in2[i]);
36+
}
37+
}
38+
)";
39+
40+
void checkCUDAError(cudaError_t result, const char* msg)
41+
{
42+
if (result != cudaSuccess)
43+
{
44+
std::cerr << msg << ": " << cudaGetErrorString(result) << std::endl;
45+
exit(EXIT_FAILURE);
46+
}
47+
}
48+
49+
void checkCUError(CUresult result, const char* msg)
50+
{
51+
if (result != CUDA_SUCCESS)
52+
{
53+
const char* errorStr;
54+
cuGetErrorString(result, &errorStr);
55+
std::cerr << msg << ": " << errorStr << std::endl;
56+
exit(EXIT_FAILURE);
57+
}
58+
}
59+
60+
void checkNVRTCError(nvrtcResult result, const char* msg)
61+
{
62+
if (result != NVRTC_SUCCESS)
63+
{
64+
std::cerr << msg << ": " << nvrtcGetErrorString(result) << std::endl;
65+
exit(EXIT_FAILURE);
66+
}
67+
}
68+
69+
int main()
70+
{
71+
try
72+
{
73+
// Initialize CUDA driver API
74+
checkCUError(cuInit(0), "Failed to initialize CUDA");
75+
76+
// Create CUDA context
77+
CUcontext context;
78+
CUdevice device;
79+
checkCUError(cuDeviceGet(&device, 0), "Failed to get CUDA device");
80+
checkCUError(cuCtxCreate(&context, 0, device), "Failed to create CUDA context");
81+
82+
nvrtcProgram prog;
83+
nvrtcResult res;
84+
85+
res = nvrtcCreateProgram(&prog, cuda_kernel, "test_trunc_kernel.cu", 0, nullptr, nullptr);
86+
checkNVRTCError(res, "Failed to create NVRTC program");
87+
88+
nvrtcAddNameExpression(prog, "test_trunc_kernel");
89+
90+
#ifdef BOOST_MATH_NVRTC_CI_RUN
91+
const char* opts[] = {"--std=c++14", "--gpu-architecture=compute_75", "--include-path=/home/runner/work/cuda-math/boost-root/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
92+
#else
93+
const char* opts[] = {"--std=c++14", "--include-path=/home/mborland/Documents/boost/libs/cuda-math/include/", "-I/usr/local/cuda/include"};
94+
#endif
95+
96+
// Compile the program
97+
res = nvrtcCompileProgram(prog, sizeof(opts) / sizeof(const char*), opts);
98+
if (res != NVRTC_SUCCESS)
99+
{
100+
size_t log_size;
101+
nvrtcGetProgramLogSize(prog, &log_size);
102+
char* log = new char[log_size];
103+
nvrtcGetProgramLog(prog, log);
104+
std::cerr << "Compilation failed:\n" << log << std::endl;
105+
delete[] log;
106+
exit(EXIT_FAILURE);
107+
}
108+
109+
// Get PTX from the program
110+
size_t ptx_size;
111+
nvrtcGetPTXSize(prog, &ptx_size);
112+
char* ptx = new char[ptx_size];
113+
nvrtcGetPTX(prog, ptx);
114+
115+
// Load PTX into CUDA module
116+
CUmodule module;
117+
CUfunction kernel;
118+
checkCUError(cuModuleLoadDataEx(&module, ptx, 0, 0, 0), "Failed to load module");
119+
checkCUError(cuModuleGetFunction(&kernel, module, "test_trunc_kernel"), "Failed to get kernel function");
120+
121+
int numElements = 5000;
122+
float_type *h_in1, *h_in2, *h_out;
123+
float_type *d_in1, *d_in2, *d_out;
124+
125+
// Allocate memory on the host
126+
h_in1 = new float_type[numElements];
127+
h_in2 = new float_type[numElements];
128+
h_out = new float_type[numElements];
129+
130+
// Initialize input arrays
131+
std::mt19937_64 rng(42);
132+
std::uniform_real_distribution<float_type> x_vals(-1, 1);
133+
std::uniform_real_distribution<float_type> y_vals(-10, 10);
134+
for (int i = 0; i < numElements; ++i)
135+
{
136+
h_in1[i] = static_cast<float_type>(x_vals(rng));
137+
h_in2[i] = static_cast<float_type>(y_vals(rng));
138+
}
139+
140+
checkCUDAError(cudaMalloc(&d_in1, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in1");
141+
checkCUDAError(cudaMalloc(&d_in2, numElements * sizeof(float_type)), "Failed to allocate device memory for d_in2");
142+
checkCUDAError(cudaMalloc(&d_out, numElements * sizeof(float_type)), "Failed to allocate device memory for d_out");
143+
144+
checkCUDAError(cudaMemcpy(d_in1, h_in1, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in1");
145+
checkCUDAError(cudaMemcpy(d_in2, h_in2, numElements * sizeof(float_type), cudaMemcpyHostToDevice), "Failed to copy data to device for d_in2");
146+
147+
int blockSize = 256;
148+
int numBlocks = (numElements + blockSize - 1) / blockSize;
149+
void* args[] = { &d_in1, &d_in2, &d_out, &numElements };
150+
checkCUError(cuLaunchKernel(kernel, numBlocks, 1, 1, blockSize, 1, 1, 0, 0, args, 0), "Kernel launch failed");
151+
152+
checkCUDAError(cudaMemcpy(h_out, d_out, numElements * sizeof(float_type), cudaMemcpyDeviceToHost), "Failed to copy data back to host for h_out");
153+
154+
// Verify Result
155+
for (int i = 0; i < numElements; ++i)
156+
{
157+
const auto res = boost::math::pow1p(h_in1[i], h_in2[i]);
158+
159+
if (std::isfinite(res))
160+
{
161+
if (boost::math::epsilon_difference(res, h_out[i]) > 300)
162+
{
163+
std::cout << "error at line: " << i
164+
<< "\nParallel: " << h_out[i]
165+
<< "\n Serial: " << res
166+
<< "\n Dist: " << boost::math::epsilon_difference(res, h_out[i]) << std::endl;
167+
}
168+
}
169+
}
170+
171+
cudaFree(d_in1);
172+
cudaFree(d_in2);
173+
cudaFree(d_out);
174+
delete[] h_in1;
175+
delete[] h_in2;
176+
delete[] h_out;
177+
178+
nvrtcDestroyProgram(&prog);
179+
delete[] ptx;
180+
181+
cuCtxDestroy(context);
182+
183+
std::cout << "Kernel executed successfully." << std::endl;
184+
return 0;
185+
}
186+
catch(const std::exception& e)
187+
{
188+
std::cerr << "Stopped with exception: " << e.what() << std::endl;
189+
return EXIT_FAILURE;
190+
}
191+
}

0 commit comments

Comments
 (0)