|
4 | 4 | #include <stan/math/prim/meta.hpp> |
5 | 5 | #include <stan/math/prim/functor.hpp> |
6 | 6 | #include <stan/math/rev/core.hpp> |
| 7 | + |
7 | 8 | #include <tbb/task_arena.h> |
8 | 9 | #include <tbb/parallel_reduce.h> |
9 | 10 | #include <tbb/blocked_range.h> |
@@ -249,15 +250,23 @@ struct reduce_sum_impl<ReduceFunction, require_var_t<ReturnType>, ReturnType, |
249 | 250 | std::forward<Vec>(vmapped), |
250 | 251 | std::forward<Args>(args)...); |
251 | 252 |
|
252 | | - if (auto_partitioning) { |
253 | | - tbb::parallel_reduce( |
254 | | - tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker); |
255 | | - } else { |
256 | | - tbb::simple_partitioner partitioner; |
257 | | - tbb::parallel_deterministic_reduce( |
258 | | - tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker, |
259 | | - partitioner); |
260 | | - } |
| 253 | + // we must use task isolation as described here: |
| 254 | + // https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html |
| 255 | + // this is to ensure that the thread local AD tape resource is |
| 256 | + // not being modified from a different task which may happen |
| 257 | + // whenever this function is being used itself in a parallel |
| 258 | + // context (like running multiple chains for Stan) |
| 259 | + tbb::this_task_arena::isolate([&] { |
| 260 | + if (auto_partitioning) { |
| 261 | + tbb::parallel_reduce( |
| 262 | + tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker); |
| 263 | + } else { |
| 264 | + tbb::simple_partitioner partitioner; |
| 265 | + tbb::parallel_deterministic_reduce( |
| 266 | + tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker, |
| 267 | + partitioner); |
| 268 | + } |
| 269 | + }); |
261 | 270 |
|
262 | 271 | for (size_t i = 0; i < num_vars_shared_terms; ++i) { |
263 | 272 | partials[num_vars_sliced_terms + i] = worker.args_adjoints_.coeff(i); |
|
0 commit comments