Remove the thread pool size limitation

thomasRoglin · thomasRoglin · commit b18f59987ce0 · 2025-08-05T15:39:33.000+02:00
The thread pool is now a dynamic vector, allowing one thread per core on
CPUs with many cores.

Since the maximum number of threads is not known at compile time, the
`signals` array is now a vector.
diff --git a/modules/mux/targets/host/include/host/thread_pool.h b/modules/mux/targets/host/include/host/thread_pool.h
@@ -26,6 +26,7 @@
 #include <map>
 #include <mutex>
 #include <new>
+#include <vector>
 
 #ifdef CA_HOST_ENABLE_PAPI_COUNTERS
 #include <papi.h>
@@ -95,17 +96,14 @@ struct thread_pool_s final {
   /// @param[in] user_data User data to pass to the function.
   /// @param[in] user_data2 A second user data to pass to the function.
   /// @param[in] user_data3 A third user data to pass to the function.
-  /// @param[in,out] signals A list of bools that will be signalled when each
+  /// @param[in,out] signals A vector of bools that will be signalled when each
   /// slice of the enqueue range has completed.
   /// @param[in,out] count A number that is incremented immediately, and
   /// decremented when the enqueued function has completed.
   /// @param[in] slices The number of pieces that the work is to be divided into
   /// when it is enqueued on the thread pool.
-  ///
-  /// @tparam N Length of `signals`.
-  template <size_t N>
   void enqueue_range(function_t function, void *user_data, void *user_data2,
-                     std::array<std::atomic<bool>, N> &signals,
+                     std::vector<std::atomic<bool>> &signals,
                      std::atomic<uint32_t> *count, size_t slices) {
     const tracer::TraceGuard<tracer::Impl> traceGuard(__func__);
 
@@ -173,20 +171,15 @@ struct thread_pool_s final {
   /// enqueue, wait() will wait for the counter to reach zero.
   void wait(std::atomic<uint32_t> *count);
 
-  /// The maximum number of threads our thread pool supports. Useful for
-  /// allocating memory (you know the max size of allocations required).
-  static const size_t max_num_threads = 32;
-
   /// The maximum number of work that can be enqueued.
   static const size_t queue_max = 4096;
 
-  /// The number of threads actually initialized in the thread pool.  General
-  /// the lower of the number of cores or max_num_threads, but could be lower in
-  /// the presence of debug settings.
+  /// The number of threads actually initialized in the thread pool. In general
+  /// the number of cores, but could be lower in the presence of debug settings.
   size_t initialized_threads;
 
   /// The pool of threads to use for execution.
-  std::array<cargo::thread, max_num_threads> pool;
+  std::vector<cargo::thread> pool;
 
   /// The buffer to hold the queue of work.
   std::array<thread_pool_work_item_s, queue_max> queue;
diff --git a/modules/mux/targets/host/source/queue.cpp b/modules/mux/targets/host/source/queue.cpp
@@ -249,9 +249,10 @@ void commandNDRange(host::queue_s *queue, host::command_info_s *info) {
     return;
   }
 
-  constexpr size_t signal_count =
-      host::thread_pool_s::max_num_threads * slice_multiplier;
-  std::array<std::atomic<bool>, signal_count> signals;
+  const size_t signal_count =
+      host_device->thread_pool.num_threads() * slice_multiplier;
+
+  std::vector<std::atomic<bool>> signals(signal_count);
   std::atomic<uint32_t> queued(0);
   host_device->thread_pool.enqueue_range(
       [](void *const in, void *const info, void *, size_t index) {
diff --git a/modules/mux/targets/host/source/thread_pool.cpp b/modules/mux/targets/host/source/thread_pool.cpp
@@ -87,8 +87,7 @@ thread_pool_s::thread_pool_s() : stayAlive(true) {
   const size_t hw_threads = cargo::thread::hardware_concurrency();
   const size_t desired_threads =
       clamp(hw_threads - ca_free_hw_threads, 2, hw_threads);
-  const size_t max_threads = thread_pool_s::max_num_threads;
-  size_t debug_threads = thread_pool_s::max_num_threads;
+  size_t debug_threads = hw_threads;
 
   // Register the value of the CA_HOST_NUM_THREADS environment variable.
   // If the programmer has provided an override to the number of threads that
@@ -102,7 +101,8 @@ thread_pool_s::thread_pool_s() : stayAlive(true) {
   }
 
   // Must be set before num_threads() is called.
-  initialized_threads = std::min({desired_threads, max_threads, debug_threads});
+  initialized_threads = std::min({desired_threads, debug_threads});
+  pool.resize(num_threads());
   for (size_t i = 0, e = num_threads(); i < e; i++) {
     pool[i] = cargo::thread(threadFunc, this);
     pool[i].set_name("host:pool:" + std::to_string(i));