Skip to content

Commit 00e82e3

Browse files
authored
Merge pull request #2445 from stan-dev/feature/issue-2444-nested-paralellism
Feature/issue 2444 nested paralellism
2 parents 91e965a + cc3084a commit 00e82e3

4 files changed

Lines changed: 33 additions & 19 deletions

File tree

stan/math/fwd/functor/reduce_sum.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@
44
#include <stan/math/prim/meta.hpp>
55
#include <stan/math/prim/err.hpp>
66

7-
#include <tbb/task_arena.h>
8-
#include <tbb/parallel_reduce.h>
9-
#include <tbb/blocked_range.h>
10-
117
#include <algorithm>
128
#include <tuple>
139
#include <vector>

stan/math/rev/core/init_chainablestack.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,9 @@ namespace math {
2020
* hook ensures that each worker thread has an initialized AD tape
2121
* ready for use.
2222
*
23-
* Refer to https://software.intel.com/en-us/node/506314 for details
24-
* on the observer concept.
23+
* Refer to
24+
* https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-reference/task-scheduler/taskschedulerobserver.html
25+
* for details on the observer concept.
2526
*/
2627
class ad_tape_observer final : public tbb::task_scheduler_observer {
2728
using stack_ptr = std::unique_ptr<ChainableStack>;

stan/math/rev/functor/map_rect_concurrent.hpp

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,10 +45,18 @@ map_rect_concurrent(
4545
};
4646

4747
#ifdef STAN_THREADS
48-
tbb::parallel_for(tbb::blocked_range<std::size_t>(0, num_jobs),
49-
[&](const tbb::blocked_range<size_t>& r) {
50-
execute_chunk(r.begin(), r.end());
51-
});
48+
// we must use task isolation as described here:
49+
// https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html
50+
// this is to ensure that the thread local AD tape ressource is
51+
// not being modified from a different task which may happen
52+
// whenever this function is being used itself in a parallel
53+
// context (like running multiple chains for Stan)
54+
tbb::this_task_arena::isolate([&] {
55+
tbb::parallel_for(tbb::blocked_range<std::size_t>(0, num_jobs),
56+
[&](const tbb::blocked_range<size_t>& r) {
57+
execute_chunk(r.begin(), r.end());
58+
});
59+
});
5260
#else
5361
execute_chunk(0, num_jobs);
5462
#endif

stan/math/rev/functor/reduce_sum.hpp

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <stan/math/prim/meta.hpp>
55
#include <stan/math/prim/functor.hpp>
66
#include <stan/math/rev/core.hpp>
7+
78
#include <tbb/task_arena.h>
89
#include <tbb/parallel_reduce.h>
910
#include <tbb/blocked_range.h>
@@ -249,15 +250,23 @@ struct reduce_sum_impl<ReduceFunction, require_var_t<ReturnType>, ReturnType,
249250
std::forward<Vec>(vmapped),
250251
std::forward<Args>(args)...);
251252

252-
if (auto_partitioning) {
253-
tbb::parallel_reduce(
254-
tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker);
255-
} else {
256-
tbb::simple_partitioner partitioner;
257-
tbb::parallel_deterministic_reduce(
258-
tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker,
259-
partitioner);
260-
}
253+
// we must use task isolation as described here:
254+
// https://software.intel.com/content/www/us/en/develop/documentation/tbb-documentation/top/intel-threading-building-blocks-developer-guide/task-isolation.html
255+
// this is to ensure that the thread local AD tape ressource is
256+
// not being modified from a different task which may happen
257+
// whenever this function is being used itself in a parallel
258+
// context (like running multiple chains for Stan)
259+
tbb::this_task_arena::isolate([&] {
260+
if (auto_partitioning) {
261+
tbb::parallel_reduce(
262+
tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker);
263+
} else {
264+
tbb::simple_partitioner partitioner;
265+
tbb::parallel_deterministic_reduce(
266+
tbb::blocked_range<std::size_t>(0, num_terms, grainsize), worker,
267+
partitioner);
268+
}
269+
});
261270

262271
for (size_t i = 0; i < num_vars_shared_terms; ++i) {
263272
partials[num_vars_sliced_terms + i] = worker.args_adjoints_.coeff(i);

0 commit comments

Comments
 (0)