Support cuda 13 (#6316)

jmackay2 · web-flow · commit a0be844981dd · 2025-08-11T17:29:09.000+02:00
* Support cuda 13

* formatting

* formatting

---------

Co-authored-by: jmackay2 <jmackay2>
diff --git a/cuda/common/include/pcl/cuda/cutil_inline_runtime.h b/cuda/common/include/pcl/cuda/cutil_inline_runtime.h
@@ -110,6 +110,7 @@ inline int cutGetMaxGflopsDeviceId()
     int max_perf_device  = 0;
     int device_count     = 0;
     int best_SM_arch     = 0;
+    int clock_rate       = 0;
 
 	cudaGetDeviceCount( &device_count );
 	// Find the best major SM Architecture GPU device
@@ -129,7 +130,8 @@ inline int cutGetMaxGflopsDeviceId()
 		cudaGetDeviceProperties( &deviceProp, current_device );
               int sm_per_multiproc = (deviceProp.major == 9999 && deviceProp.minor == 9999) ? 1 : _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
 
-		int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+        cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
+        int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
 		if( compute_perf  > max_compute_perf ) {
             // If we find GPU with SM major > 2, search only these
 			if ( best_SM_arch > 2 ) {
@@ -156,7 +158,8 @@ inline int cutGetMaxGflopsGraphicsDeviceId()
     int max_perf_device  = 0;
     int device_count     = 0;
     int best_SM_arch     = 0;
-    int bTCC = 0;
+    int bTCC             = 0;
+    int clock_rate       = 0;
 
 	cudaGetDeviceCount( &device_count );
 	// Find the best major SM Architecture GPU device that is graphics capable
@@ -185,7 +188,8 @@ inline int cutGetMaxGflopsGraphicsDeviceId()
 
 		if (!bTCC) // Is this GPU running the TCC driver?  If so we pass on this
 		{
-			int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * deviceProp.clockRate;
+            cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, current_device);
+            int compute_perf  = deviceProp.multiProcessorCount * sm_per_multiproc * clock_rate;
 			if( compute_perf  > max_compute_perf ) {
 				// If we find GPU with SM major > 2, search only these
 				if ( best_SM_arch > 2 ) {
diff --git a/cuda/sample_consensus/src/sac_model_1point_plane.cu b/cuda/sample_consensus/src/sac_model_1point_plane.cu
@@ -326,7 +326,7 @@ namespace pcl
       //thrust::counting_iterator<int> first (0);
       // Input: Point Cloud, Indices
       // Output: Hypotheses
-      transform (//first, first + max_iterations,
+      thrust::transform (//first, first + max_iterations,
                  //index_sequence_begin, 
                  //index_sequence_begin + max_iterations, 
                  randoms.begin (), randoms.begin () + max_iterations,
@@ -360,7 +360,7 @@ namespace pcl
       //thrust::counting_iterator<int> first (0);
       // Input: Point Cloud, Indices
       // Output: Hypotheses
-      transform (//first, first + max_iterations,
+      thrust::transform (//first, first + max_iterations,
                  //index_sequence_begin, 
                  //index_sequence_begin + max_iterations, 
                  randoms.begin (), randoms.begin () + max_iterations,
@@ -555,7 +555,7 @@ namespace pcl
       coefficients.z = model_coefficients[2];
       coefficients.w = model_coefficients[3];
 
-      return (int) count_if (
+      return (int) thrust::count_if (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              indices_->size (),
@@ -608,7 +608,7 @@ namespace pcl
       {
     //  pcl::ScopeTime t ("transform");
       // Send the data to the device
-      transform (
+      thrust::transform (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              nr_points,
@@ -667,7 +667,7 @@ namespace pcl
       {
     //  pcl::ScopeTime t ("transform");
       // Send the data to the device
-      transform (
+      thrust::transform (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              nr_points,
diff --git a/cuda/sample_consensus/src/sac_model_plane.cu b/cuda/sample_consensus/src/sac_model_plane.cu
@@ -238,7 +238,7 @@ namespace pcl
       coefficients.z = model_coefficients[2];
       coefficients.w = model_coefficients[3];
 
-      return (int) count_if (
+      return (int) thrust::count_if (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              indices_->size (),
@@ -286,7 +286,7 @@ namespace pcl
       coefficients.w = model_coefficients[3];
 
       // Send the data to the device
-      transform (
+      thrust::transform (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              nr_points,
@@ -331,7 +331,7 @@ namespace pcl
       coefficients.w = ((float4)h[idx]).w;
 
       // Send the data to the device
-      transform (
+      thrust::transform (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              nr_points,
@@ -372,7 +372,7 @@ namespace pcl
       coefficients.z = ((float4)h[idx]).z;
       coefficients.w = ((float4)h[idx]).w;
 
-      transform (
+      thrust::transform (
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())),
           make_zip_iterator (make_tuple (input_->points.begin (), indices_->begin ())) + 
                              nr_points,
diff --git a/gpu/containers/src/initialization.cpp b/gpu/containers/src/initialization.cpp
@@ -229,8 +229,10 @@ pcl::gpu::printCudaDeviceInfo(int device)
            prop.multiProcessorCount,
            sm_cores,
            sm_cores * prop.multiProcessorCount);
+    int clockRate;
+    cudaSafeCall(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, dev));
     printf("  GPU Clock Speed:                               %.2f GHz\n",
-           prop.clockRate * 1e-6f);
+           clockRate * 1e-6f);
 
     // This is not available in the CUDA Runtime API, so we make the necessary calls the
     // driver API to support this for output
@@ -285,10 +287,13 @@ pcl::gpu::printCudaDeviceInfo(int device)
 
     printf(
         "  Concurrent copy and execution:                 %s with %d copy engine(s)\n",
-        (prop.deviceOverlap ? "Yes" : "No"),
+        (prop.asyncEngineCount ? "Yes" : "No"),
         prop.asyncEngineCount);
+    int kernelExecTimeoutEnabled;
+    cudaSafeCall(cudaDeviceGetAttribute(
+        &kernelExecTimeoutEnabled, cudaDevAttrKernelExecTimeout, dev));
     printf("  Run time limit on kernels:                     %s\n",
-           prop.kernelExecTimeoutEnabled ? "Yes" : "No");
+           kernelExecTimeoutEnabled ? "Yes" : "No");
     printf("  Integrated GPU sharing Host Memory:            %s\n",
            prop.integrated ? "Yes" : "No");
     printf("  Support host page-locked memory mapping:       %s\n",
@@ -307,8 +312,10 @@ pcl::gpu::printCudaDeviceInfo(int device)
     printf("  Device PCI Bus ID / PCI location ID:           %d / %d\n",
            prop.pciBusID,
            prop.pciDeviceID);
+    int propComputeMode;
+    cudaSafeCall(cudaDeviceGetAttribute(&propComputeMode, cudaDevAttrComputeMode, dev));
     printf("  Compute Mode:\n");
-    printf("      %s \n", computeMode[prop.computeMode]);
+    printf("      %s \n", computeMode[propComputeMode]);
   }
 
   printf("\n");
diff --git a/gpu/utils/include/pcl/gpu/utils/device/functional.hpp b/gpu/utils/include/pcl/gpu/utils/device/functional.hpp
@@ -39,14 +39,12 @@
 #define PCL_DEVICE_FUNCTIONAL_HPP_
 
 #include <thrust/functional.h>
+#include <cuda.h>
 
 namespace pcl
 {
     namespace device
     {
-        // Function Objects
-
-        using thrust::binary_function;
 
         // Arithmetic Operations
 
@@ -87,7 +85,11 @@ namespace pcl
 
         // Generalized Identity Operations
 
-        using thrust::identity;    
+        #if CUDA_VERSION >= 13000
+        using cuda::std::identity;
+        #else
+        using thrust::identity;
+        #endif
         using thrust::project1st;
         using thrust::project2nd;