Skip to content

Commit a499866

Browse files
committed
Final text and code until worker pool
1 parent 39c0780 commit a499866

2 files changed

Lines changed: 75 additions & 54 deletions

File tree

animation/parallelism/src/scenes/code.tsx

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ import { parser as parser_cpp } from '@lezer/cpp';
1313

1414
import blockingCode from '../lectures/parallelism.md?snippet=parallelism_blocking/main.cpp';
1515
import asyncCode from '../lectures/parallelism.md?snippet=parallelism_async/main.cpp';
16-
import algorithmsCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main.cpp';
17-
import tbbCode from '../lectures/parallelism.md?snippet=parallelism_raw_tbb/main.cpp';
16+
import algorithmsSequentialCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_sequential.cpp';
17+
import algorithmsParallelCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_parallel.cpp';
18+
import tbbCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_tbb.cpp';
1819
import jthreadCode from '../lectures/parallelism.md?snippet=parallelism_jthread/main.cpp';
1920
import threadpool17Code from '../lectures/parallelism.md?snippet=parallelism_threadpool_17/main.cpp';
2021

@@ -125,57 +126,84 @@ export default makeScene2D(function* (view) {
125126
// Focus on loading an image
126127
yield* centerOn(codeRef(), [lines(8, 10)], duration, 30);
127128
yield* waitFor(duration);
128-
yield* centerOn(codeRef(), [lines(11, 16), lines(5, 5)], duration, 30);
129+
yield* centerOn(codeRef(), [lines(12, 16), lines(6, 11)], duration, 30);
129130
yield* waitFor(duration);
130-
yield* centerOn(codeRef(), lines(17, 28), duration, 30);
131+
yield* centerOn(codeRef(), lines(18, 28), duration, 30);
131132
yield* waitFor(duration);
132133
yield* centerOn(codeRef(), DEFAULT, duration, 20);
133134
yield* waitFor(duration);
134135

135136
// 1. std::async Background Task
136137
yield* all(
137138
codeRef().code(asyncCode, duration),
138-
centerOn(codeRef(), DEFAULT, duration, 12)
139+
centerOn(codeRef(), DEFAULT, duration, 14)
139140
);
140141
yield* waitFor(duration);
141142

142143
// Focus on loading an image
143-
yield* centerOn(codeRef(), lines(20, 44), duration, 30);
144+
yield* centerOn(codeRef(), lines(21, 38), duration, 30);
144145
yield* waitFor(duration);
145146

146147
// Focus on std::async
147-
yield* centerOn(codeRef(), lines(46, 49), duration, 30);
148+
yield* centerOn(codeRef(), lines(42, 45), duration, 30);
148149
yield* waitFor(duration);
149150

150151
// Focus on future polling
151-
yield* centerOn(codeRef(), lines(50, 58), duration, 30);
152+
yield* centerOn(codeRef(), lines(46, 54), duration, 30);
152153
yield* waitFor(duration);
153154

154155
// Focus on getting
155-
yield* centerOn(codeRef(), lines(59, 61), duration, 30);
156+
yield* centerOn(codeRef(), lines(55, 58), duration, 30);
156157
yield* waitFor(duration);
157158

158-
yield* centerOn(codeRef(), lines(48, 49), duration, 40);
159+
yield* centerOn(codeRef(), lines(44, 45), duration, 40);
159160
yield* waitFor(duration);
160161

161162
// 2. Parallel Algorithms
162-
yield* centerOn(codeRef(), DEFAULT, duration, 20);
163-
yield* codeRef().code(algorithmsCode, duration);
163+
yield* centerOn(codeRef(), DEFAULT, duration, 18);
164+
yield* codeRef().code(algorithmsSequentialCode, duration);
164165
yield* waitFor(duration);
165166

166-
// Focus on execution policy
167-
yield* centerOn(codeRef(), lines(32, 35), duration, 25);
167+
yield* centerOn(codeRef(), lines(16, 22), duration, 30);
168168
yield* waitFor(duration);
169169

170-
yield* centerOn(codeRef(), DEFAULT, duration, 20);
170+
yield* centerOn(codeRef(), lines(8, 10), duration, 30);
171+
yield* waitFor(duration);
172+
173+
yield* centerOn(codeRef(), lines(12, 14), duration, 30);
174+
yield* waitFor(duration);
175+
176+
yield* centerOn(codeRef(), lines(26, 28), duration, 30);
177+
yield* waitFor(duration);
178+
179+
yield* centerOn(codeRef(), lines(32, 35), duration, 30);
180+
yield* waitFor(duration);
181+
182+
yield* centerOn(codeRef(), lines(29, 39), duration, 30);
183+
yield* waitFor(duration);
184+
185+
// 2. Parallel Algorithms
186+
yield* centerOn(codeRef(), DEFAULT, duration, 18);
187+
yield* codeRef().code(algorithmsParallelCode, duration);
188+
yield* waitFor(duration);
189+
190+
yield* centerOn(codeRef(), lines(32, 37), duration, 30);
191+
yield* waitFor(duration);
192+
193+
yield* centerOn(codeRef(), lines(2, 2), duration, 30);
194+
yield* waitFor(duration);
195+
196+
yield* centerOn(codeRef(), lines(29, 41), duration, 30);
197+
yield* waitFor(duration);
198+
199+
yield* centerOn(codeRef(), DEFAULT, duration, 17);
171200
yield* waitFor(duration);
172201

173202
// 3. Raw TBB
174203
yield* codeRef().code(tbbCode, duration);
175204
yield* waitFor(duration);
176205

177-
// Focus on parallel_for
178-
yield* centerOn(codeRef(), lines(17, 26), duration, 25);
206+
yield* centerOn(codeRef(), lines(34, 41), duration, 30);
179207
yield* waitFor(duration);
180208

181209
yield* centerOn(codeRef(), DEFAULT, duration, 20);

lectures/parallelism.md

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ $PLACEHOLDER
7070

7171
namespace {
7272
constexpr std::chrono::milliseconds kLoadTime{5000};
73-
} // namespace
7473

7574
struct Image {
7675
std::string data = "massive_image_data";
@@ -81,6 +80,7 @@ Image LoadMassiveImage(/* some path would go here */) {
8180
std::this_thread::sleep_for(kLoadTime);
8281
return Image{};
8382
}
83+
} // namespace
8484

8585
int main() {
8686
std::cout << "Loading massive image..." << std::flush;
@@ -109,11 +109,11 @@ $PLACEHOLDER
109109
#include <future>
110110
#include <iostream>
111111
#include <string>
112+
#include <thread>
112113
113114
namespace {
114115
constexpr std::chrono::milliseconds kLoadTime{5000};
115116
constexpr std::chrono::milliseconds kSpinInterval{100};
116-
} // namespace
117117
118118
struct Image {
119119
std::string data = "massive_image_data";
@@ -130,24 +130,20 @@ class Spinner {
130130
static inline const std::array<std::string, 10> kFrames = {
131131
"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"};
132132
133-
Spinner(std::string message) : message_{std::move(message)} {
134-
std::cout << message_ << ' ' << kFrames[idx_++] << std::flush;
135-
idx_ %= kFrames.size();
136-
}
133+
Spinner(std::string message) : message_{std::move(message)} { Spin(); }
137134
138135
void Spin() {
139136
std::cout << "\r" << message_ << ' ' << kFrames[idx_++] << std::flush;
140137
idx_ %= kFrames.size();
141138
}
142139
143-
~Spinner() {
144-
std::cout << "\r" << message_ << " ✅" << std::endl;
145-
}
140+
~Spinner() { std::cout << "\r" << message_ << " ✅\n"; }
146141
147142
private:
148143
int idx_{};
149144
std::string message_{};
150145
};
146+
} // namespace
151147
152148
int main() {
153149
// 🚀 Kick off the heavy task in the background.
@@ -158,7 +154,8 @@ int main() {
158154
// ⏳ Start the spinner while we wait.
159155
{
160156
Spinner spinner("Loading massive image...");
161-
while (future_image.wait_for(kSpinInterval) == std::future_status::timeout) {
157+
while (future_image.wait_for(kSpinInterval) ==
158+
std::future_status::timeout) {
162159
spinner.Spin();
163160
}
164161
}
@@ -190,7 +187,7 @@ Since C++17, many algorithms in the `<algorithm>` and `<numeric>` headers accept
190187

191188
Imagine we want to apply a simple filter, e.g. color inversion, to every pixel of that "massive image" we've just loaded. To make it a complete example, we'll add some details to our `Image` struct from before, but we'll still keep it extremely simple.
192189

193-
Our image now holds a vector of pixels, with each pixel holding an RGB value. A function for inverting the color of a pixel only needs that pixel as an input and so is completely independent of other pixels. Tasks like this are called [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel), which means we don't have to worry about any data collisions during parallel execution. More on that later.
190+
Our image now holds a vector of pixels, with each pixel holding an RGB value. A function for inverting the color of a pixel only needs that pixel as an input and so is completely independent of other pixels. Tasks like these are called [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel), which means we don't have to worry about any data collisions during parallel execution. More on that a bit later.
194191

195192
<!--
196193
`CPP_COPY_SNIPPET` parallelism_algorithms/main_sequential.cpp
@@ -200,14 +197,13 @@ Our image now holds a vector of pixels, with each pixel holding an RGB value. A
200197
#include <algorithm>
201198
#include <chrono>
202199
#include <iostream>
203-
#include <thread>
204200
#include <vector>
205201

206202
namespace {
207203
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
208204

209205
struct Pixel {
210-
std::uint8_t r, g, b;
206+
int r, g, b;
211207
};
212208

213209
Pixel Invert(const Pixel& pixel) {
@@ -216,13 +212,12 @@ Pixel Invert(const Pixel& pixel) {
216212

217213
// Using a struct to keep the example code short.
218214
struct Image {
219-
Image(std::size_t width, std::size_t height, const Pixel value)
220-
: pixels(width * height, value) {}
215+
Image(std::size_t width, std::size_t height, const Pixel value)
216+
: pixels(width * height, value) {}
221217

222218
std::vector<Pixel> pixels;
223219
};
224-
225-
} // namespace
220+
} // namespace
226221

227222
int main() {
228223
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -266,14 +261,13 @@ Now let's make this program run in parallel!
266261
#include <chrono>
267262
#include <execution>
268263
#include <iostream>
269-
#include <thread>
270264
#include <vector>
271265

272266
namespace {
273267
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
274268

275269
struct Pixel {
276-
std::uint8_t r, g, b;
270+
int r, g, b;
277271
};
278272

279273
Pixel Invert(const Pixel& pixel) {
@@ -282,13 +276,12 @@ Pixel Invert(const Pixel& pixel) {
282276

283277
// Using a struct to keep the example code short.
284278
struct Image {
285-
Image(std::size_t width, std::size_t height, const Pixel value)
286-
: pixels(width * height, value) {}
279+
Image(std::size_t width, std::size_t height, const Pixel value)
280+
: pixels(width * height, value) {}
287281

288282
std::vector<Pixel> pixels;
289283
};
290-
291-
} // namespace
284+
} // namespace
292285

293286
int main() {
294287
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -310,7 +303,7 @@ int main() {
310303
}
311304
```
312305
313-
The code didn't change much at all! We only added the `std::execution::par` parameter to the `std::transform` algorithm. We also need to slightly change that compile command from before by adding `-ltbb` to it:
306+
The code didn't change much at all! We only added the `std::execution::par` parameter to the `std::transform` algorithm as well as the `<execution>` header needed for it. We also need to slightly change that compile command from before by adding `-ltbb` to it:
314307
315308
```
316309
c++ -std=c++17 -O3 main.cpp -ltbb
@@ -339,7 +332,7 @@ Now let's talk about that `std::execution::par` parameter. Similar to launch pol
339332
> [!NOTE]
340333
> This is a good time to talk about this `-ltbb` linker option! We also used it in the previous compilation command. The reason why we often need it to enable the parallel versions of the standard algorithms is that, under the hood, compilers often use **Intel Threading Building Blocks (oneTBB)** as the backend for these parallel algorithms. TBB is an industry-standard library for task-based parallelism but, again, if you're on Apple Clang you'll need to switch to Clang (non-Apple) or GCC to use it.
341334
342-
This also then means that we are not confined to the limits of standard library when we want to write code that runs in parallel. If we need more control than the standard library algorithms provide, for example if we want to specify how many threads to use, we can drop down an abstraction level and use Intel TBB directly. It provides a rich set of algorithms like `tbb::parallel_for`, `tbb::parallel_reduce`, and concurrent data structures.
335+
This also then means that we are not confined to the limits of the standard library when we want to write code that runs in parallel. If we need more control than the standard library algorithms provide, we can drop down an abstraction level and use Intel TBB directly. It provides a rich set of algorithms like `tbb::parallel_for`, `tbb::parallel_reduce`, and concurrent data structures.
343336
344337
Let's rewrite our color inversion example using `tbb::parallel_for`. This explicitly tells TBB to split our vector index range into chunks ("blocked ranges") and process them across available worker threads:
345338
@@ -348,17 +341,18 @@ Let's rewrite our color inversion example using `tbb::parallel_for`. This explic
348341
`CPP_RUN_CMD` CWD:parallelism_algorithms bash -c 'g++-15 -std=c++17 -O3 -I/opt/homebrew/include -L/opt/homebrew/lib main_tbb.cpp -ltbb -o tbb 2>/dev/null || c++ -std=c++17 -O3 main_tbb.cpp -ltbb -o tbb'
349342
-->
350343
```cpp
351-
#include <chrono>
352-
#include <iostream>
353344
#include <tbb/blocked_range.h>
354345
#include <tbb/parallel_for.h>
346+
347+
#include <chrono>
348+
#include <iostream>
355349
#include <vector>
356350
357351
namespace {
358352
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
359353
360354
struct Pixel {
361-
std::uint8_t r, g, b;
355+
int r, g, b;
362356
};
363357
364358
Pixel Invert(const Pixel& pixel) {
@@ -367,13 +361,12 @@ Pixel Invert(const Pixel& pixel) {
367361
368362
// Using a struct to keep the example code short.
369363
struct Image {
370-
Image(std::size_t width, std::size_t height, const Pixel value)
371-
: pixels(width * height, value) {}
364+
Image(std::size_t width, std::size_t height, const Pixel value)
365+
: pixels(width * height, value) {}
372366
373367
std::vector<Pixel> pixels;
374368
};
375-
376-
} // namespace
369+
} // namespace
377370
378371
int main() {
379372
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -384,23 +377,23 @@ int main() {
384377
385378
// tbb::parallel_for takes a range, and a lambda to process that sub-range
386379
tbb::parallel_for(tbb::blocked_range<size_t>(0, image.pixels.size()),
387-
[&](const tbb::blocked_range<size_t> &r) {
380+
[&](const tbb::blocked_range<size_t>& r) {
388381
// This loop processes ONE chunk assigned to a specific thread
389382
for (size_t i = r.begin(); i != r.end(); ++i) {
390383
image.pixels[i] = Invert(image.pixels[i]);
391384
}
392385
});
393386
394387
const auto end = std::chrono::high_resolution_clock::now();
395-
const DoubleMilliseconds tbb_ms = end - start;
396-
std::cout << "Raw TBB time: " << tbb_ms.count() << " ms\n";
388+
const DoubleMilliseconds time_taken = end - start;
389+
std::cout << "Raw TBB time: " << time_taken.count() << " ms\n";
397390
return 0;
398391
}
399392
```
400393

401-
All in all, TBB gives us explicit control over the chunks, which is very useful for more complex loops where standard library algorithms might not fit perfectly. But, as for this example, we can compile it just as we did before and it should run in about the same time as the parallel version of the standard algorithms, in around 5ms on my machine.
394+
We can compile this example just as we compiled the previous one and it should run in about the same time as the parallel version of the standard algorithms, in around 5ms on my machine.
402395

403-
If you want a challenge, go ahead and find a way to only use, say, 2 threads rather than all available ones with this version of the code!
396+
All in all, TBB is a very powerful library that gives us much more control over how our code runs in parallel. For example, there is no way to select how many threads to use with the standard library parallel algorithms, but TBB allows us to change that. If you want a small challenge, go ahead and find a way to only use, say, 2 threads rather than all available ones with our TBB example!
404397

405398
### Worker Threads and Thread Pools
406399
So now we know how to kick off long-running tasks and how to use parallel algorithms to process many tiny tasks. Is that it? Not quite. Imagine we receive a stream of thousands of tiny images that all need their colors inverted before they can be displayed.

0 commit comments

Comments
 (0)