Skip to content

Commit a499866

Browse files
committed
Final text and code until worker pool
1 parent 39c0780 commit a499866

2 files changed

Lines changed: 75 additions & 54 deletions

File tree

animation/parallelism/src/scenes/code.tsx

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@ import { parser as parser_cpp } from '@lezer/cpp';
1313

1414
import blockingCode from '../lectures/parallelism.md?snippet=parallelism_blocking/main.cpp';
1515
import asyncCode from '../lectures/parallelism.md?snippet=parallelism_async/main.cpp';
16-
import algorithmsCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main.cpp';
17-
import tbbCode from '../lectures/parallelism.md?snippet=parallelism_raw_tbb/main.cpp';
16+
import algorithmsSequentialCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_sequential.cpp';
17+
import algorithmsParallelCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_parallel.cpp';
18+
import tbbCode from '../lectures/parallelism.md?snippet=parallelism_algorithms/main_tbb.cpp';
1819
import jthreadCode from '../lectures/parallelism.md?snippet=parallelism_jthread/main.cpp';
1920
import threadpool17Code from '../lectures/parallelism.md?snippet=parallelism_threadpool_17/main.cpp';
2021

@@ -125,57 +126,84 @@ export default makeScene2D(function* (view) {
125126
// Focus on loading an image
126127
yield* centerOn(codeRef(), [lines(8, 10)], duration, 30);
127128
yield* waitFor(duration);
128-
yield* centerOn(codeRef(), [lines(11, 16), lines(5, 5)], duration, 30);
129+
yield* centerOn(codeRef(), [lines(12, 16), lines(6, 11)], duration, 30);
129130
yield* waitFor(duration);
130-
yield* centerOn(codeRef(), lines(17, 28), duration, 30);
131+
yield* centerOn(codeRef(), lines(18, 28), duration, 30);
131132
yield* waitFor(duration);
132133
yield* centerOn(codeRef(), DEFAULT, duration, 20);
133134
yield* waitFor(duration);
134135

135136
// 1. std::async Background Task
136137
yield* all(
137138
codeRef().code(asyncCode, duration),
138-
centerOn(codeRef(), DEFAULT, duration, 12)
139+
centerOn(codeRef(), DEFAULT, duration, 14)
139140
);
140141
yield* waitFor(duration);
141142

142143
// Focus on loading an image
143-
yield* centerOn(codeRef(), lines(20, 44), duration, 30);
144+
yield* centerOn(codeRef(), lines(21, 38), duration, 30);
144145
yield* waitFor(duration);
145146

146147
// Focus on std::async
147-
yield* centerOn(codeRef(), lines(46, 49), duration, 30);
148+
yield* centerOn(codeRef(), lines(42, 45), duration, 30);
148149
yield* waitFor(duration);
149150

150151
// Focus on future polling
151-
yield* centerOn(codeRef(), lines(50, 58), duration, 30);
152+
yield* centerOn(codeRef(), lines(46, 54), duration, 30);
152153
yield* waitFor(duration);
153154

154155
// Focus on getting
155-
yield* centerOn(codeRef(), lines(59, 61), duration, 30);
156+
yield* centerOn(codeRef(), lines(55, 58), duration, 30);
156157
yield* waitFor(duration);
157158

158-
yield* centerOn(codeRef(), lines(48, 49), duration, 40);
159+
yield* centerOn(codeRef(), lines(44, 45), duration, 40);
159160
yield* waitFor(duration);
160161

161162
// 2. Parallel Algorithms
162-
yield* centerOn(codeRef(), DEFAULT, duration, 20);
163-
yield* codeRef().code(algorithmsCode, duration);
163+
yield* centerOn(codeRef(), DEFAULT, duration, 18);
164+
yield* codeRef().code(algorithmsSequentialCode, duration);
164165
yield* waitFor(duration);
165166

166-
// Focus on execution policy
167-
yield* centerOn(codeRef(), lines(32, 35), duration, 25);
167+
yield* centerOn(codeRef(), lines(16, 22), duration, 30);
168168
yield* waitFor(duration);
169169

170-
yield* centerOn(codeRef(), DEFAULT, duration, 20);
170+
yield* centerOn(codeRef(), lines(8, 10), duration, 30);
171+
yield* waitFor(duration);
172+
173+
yield* centerOn(codeRef(), lines(12, 14), duration, 30);
174+
yield* waitFor(duration);
175+
176+
yield* centerOn(codeRef(), lines(26, 28), duration, 30);
177+
yield* waitFor(duration);
178+
179+
yield* centerOn(codeRef(), lines(32, 35), duration, 30);
180+
yield* waitFor(duration);
181+
182+
yield* centerOn(codeRef(), lines(29, 39), duration, 30);
183+
yield* waitFor(duration);
184+
185+
// 2. Parallel Algorithms
186+
yield* centerOn(codeRef(), DEFAULT, duration, 18);
187+
yield* codeRef().code(algorithmsParallelCode, duration);
188+
yield* waitFor(duration);
189+
190+
yield* centerOn(codeRef(), lines(32, 37), duration, 30);
191+
yield* waitFor(duration);
192+
193+
yield* centerOn(codeRef(), lines(2, 2), duration, 30);
194+
yield* waitFor(duration);
195+
196+
yield* centerOn(codeRef(), lines(29, 41), duration, 30);
197+
yield* waitFor(duration);
198+
199+
yield* centerOn(codeRef(), DEFAULT, duration, 17);
171200
yield* waitFor(duration);
172201

173202
// 3. Raw TBB
174203
yield* codeRef().code(tbbCode, duration);
175204
yield* waitFor(duration);
176205

177-
// Focus on parallel_for
178-
yield* centerOn(codeRef(), lines(17, 26), duration, 25);
206+
yield* centerOn(codeRef(), lines(34, 41), duration, 30);
179207
yield* waitFor(duration);
180208

181209
yield* centerOn(codeRef(), DEFAULT, duration, 20);

lectures/parallelism.md

Lines changed: 30 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,6 @@ $PLACEHOLDER
7070

7171
namespace {
7272
constexpr std::chrono::milliseconds kLoadTime{5000};
73-
} // namespace
7473

7574
struct Image {
7675
std::string data = "massive_image_data";
@@ -81,6 +80,7 @@ Image LoadMassiveImage(/* some path would go here */) {
8180
std::this_thread::sleep_for(kLoadTime);
8281
return Image{};
8382
}
83+
} // namespace
8484

8585
int main() {
8686
std::cout << "Loading massive image..." << std::flush;
@@ -109,11 +109,11 @@ $PLACEHOLDER
109109
#include <future>
110110
#include <iostream>
111111
#include <string>
112+
#include <thread>
112113
113114
namespace {
114115
constexpr std::chrono::milliseconds kLoadTime{5000};
115116
constexpr std::chrono::milliseconds kSpinInterval{100};
116-
} // namespace
117117
118118
struct Image {
119119
std::string data = "massive_image_data";
@@ -130,24 +130,20 @@ class Spinner {
130130
static inline const std::array<std::string, 10> kFrames = {
131131
"⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"};
132132
133-
Spinner(std::string message) : message_{std::move(message)} {
134-
std::cout << message_ << ' ' << kFrames[idx_++] << std::flush;
135-
idx_ %= kFrames.size();
136-
}
133+
Spinner(std::string message) : message_{std::move(message)} { Spin(); }
137134
138135
void Spin() {
139136
std::cout << "\r" << message_ << ' ' << kFrames[idx_++] << std::flush;
140137
idx_ %= kFrames.size();
141138
}
142139
143-
~Spinner() {
144-
std::cout << "\r" << message_ << " ✅" << std::endl;
145-
}
140+
~Spinner() { std::cout << "\r" << message_ << " ✅\n"; }
146141
147142
private:
148143
int idx_{};
149144
std::string message_{};
150145
};
146+
} // namespace
151147
152148
int main() {
153149
// 🚀 Kick off the heavy task in the background.
@@ -158,7 +154,8 @@ int main() {
158154
// ⏳ Start the spinner while we wait.
159155
{
160156
Spinner spinner("Loading massive image...");
161-
while (future_image.wait_for(kSpinInterval) == std::future_status::timeout) {
157+
while (future_image.wait_for(kSpinInterval) ==
158+
std::future_status::timeout) {
162159
spinner.Spin();
163160
}
164161
}
@@ -190,7 +187,7 @@ Since C++17, many algorithms in the `<algorithm>` and `<numeric>` headers accept
190187

191188
Imagine we want to apply a simple filter, e.g. color inversion, to every pixel of that "massive image" we've just loaded. To make it a complete example, we'll add some details to our `Image` struct from before, but we'll still keep it extremely simple.
192189

193-
Our image now holds a vector of pixels, with each pixel holding an RGB value. A function for inverting the color of a pixel only needs that pixel as an input and so is completely independent of other pixels. Tasks like this are called [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel), which means we don't have to worry about any data collisions during parallel execution. More on that later.
190+
Our image now holds a vector of pixels, with each pixel holding an RGB value. A function for inverting the color of a pixel only needs that pixel as an input and so is completely independent of other pixels. Tasks like these are called [embarrassingly parallel](https://en.wikipedia.org/wiki/Embarrassingly_parallel), which means we don't have to worry about any data collisions during parallel execution. More on that a bit later.
194191

195192
<!--
196193
`CPP_COPY_SNIPPET` parallelism_algorithms/main_sequential.cpp
@@ -200,14 +197,13 @@ Our image now holds a vector of pixels, with each pixel holding an RGB value. A
200197
#include <algorithm>
201198
#include <chrono>
202199
#include <iostream>
203-
#include <thread>
204200
#include <vector>
205201

206202
namespace {
207203
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
208204

209205
struct Pixel {
210-
std::uint8_t r, g, b;
206+
int r, g, b;
211207
};
212208

213209
Pixel Invert(const Pixel& pixel) {
@@ -216,13 +212,12 @@ Pixel Invert(const Pixel& pixel) {
216212

217213
// Using a struct to keep the example code short.
218214
struct Image {
219-
Image(std::size_t width, std::size_t height, const Pixel value)
220-
: pixels(width * height, value) {}
215+
Image(std::size_t width, std::size_t height, const Pixel value)
216+
: pixels(width * height, value) {}
221217

222218
std::vector<Pixel> pixels;
223219
};
224-
225-
} // namespace
220+
} // namespace
226221

227222
int main() {
228223
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -266,14 +261,13 @@ Now let's make this program run in parallel!
266261
#include <chrono>
267262
#include <execution>
268263
#include <iostream>
269-
#include <thread>
270264
#include <vector>
271265

272266
namespace {
273267
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
274268

275269
struct Pixel {
276-
std::uint8_t r, g, b;
270+
int r, g, b;
277271
};
278272

279273
Pixel Invert(const Pixel& pixel) {
@@ -282,13 +276,12 @@ Pixel Invert(const Pixel& pixel) {
282276

283277
// Using a struct to keep the example code short.
284278
struct Image {
285-
Image(std::size_t width, std::size_t height, const Pixel value)
286-
: pixels(width * height, value) {}
279+
Image(std::size_t width, std::size_t height, const Pixel value)
280+
: pixels(width * height, value) {}
287281

288282
std::vector<Pixel> pixels;
289283
};
290-
291-
} // namespace
284+
} // namespace
292285

293286
int main() {
294287
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -310,7 +303,7 @@ int main() {
310303
}
311304
```
312305
313-
The code didn't change much at all! We only added the `std::execution::par` parameter to the `std::transform` algorithm. We also need to slightly change that compile command from before by adding `-ltbb` to it:
306+
The code didn't change much at all! We only added the `std::execution::par` parameter to the `std::transform` algorithm as well as the `<execution>` header needed for it. We also need to slightly change that compile command from before by adding `-ltbb` to it:
314307
315308
```
316309
c++ -std=c++17 -O3 main.cpp -ltbb
@@ -339,7 +332,7 @@ Now let's talk about that `std::execution::par` parameter. Similar to launch pol
339332
> [!NOTE]
340333
> This is a good time to talk about this `-ltbb` linker option! We also used it in the previous compilation command. The reason why we often need it to enable the parallel versions of the standard algorithms is that, under the hood, compilers often use **Intel Threading Building Blocks (oneTBB)** as the backend for these parallel algorithms. TBB is an industry-standard library for task-based parallelism but, again, if you're on Apple Clang you'll need to switch to Clang (non-Apple) or GCC to use it.
341334
342-
This also then means that we are not confined to the limits of standard library when we want to write code that runs in parallel. If we need more control than the standard library algorithms provide, for example if we want to specify how many threads to use, we can drop down an abstraction level and use Intel TBB directly. It provides a rich set of algorithms like `tbb::parallel_for`, `tbb::parallel_reduce`, and concurrent data structures.
335+
This also then means that we are not confined to the limits of the standard library when we want to write code that runs in parallel. If we need more control than the standard library algorithms provide, we can drop down an abstraction level and use Intel TBB directly. It provides a rich set of algorithms like `tbb::parallel_for`, `tbb::parallel_reduce`, and concurrent data structures.
343336
344337
Let's rewrite our color inversion example using `tbb::parallel_for`. This explicitly tells TBB to split our vector index range into chunks ("blocked ranges") and process them across available worker threads:
345338
@@ -348,17 +341,18 @@ Let's rewrite our color inversion example using `tbb::parallel_for`. This explic
348341
`CPP_RUN_CMD` CWD:parallelism_algorithms bash -c 'g++-15 -std=c++17 -O3 -I/opt/homebrew/include -L/opt/homebrew/lib main_tbb.cpp -ltbb -o tbb 2>/dev/null || c++ -std=c++17 -O3 main_tbb.cpp -ltbb -o tbb'
349342
-->
350343
```cpp
351-
#include <chrono>
352-
#include <iostream>
353344
#include <tbb/blocked_range.h>
354345
#include <tbb/parallel_for.h>
346+
347+
#include <chrono>
348+
#include <iostream>
355349
#include <vector>
356350
357351
namespace {
358352
using DoubleMilliseconds = std::chrono::duration<double, std::milli>;
359353
360354
struct Pixel {
361-
std::uint8_t r, g, b;
355+
int r, g, b;
362356
};
363357
364358
Pixel Invert(const Pixel& pixel) {
@@ -367,13 +361,12 @@ Pixel Invert(const Pixel& pixel) {
367361
368362
// Using a struct to keep the example code short.
369363
struct Image {
370-
Image(std::size_t width, std::size_t height, const Pixel value)
371-
: pixels(width * height, value) {}
364+
Image(std::size_t width, std::size_t height, const Pixel value)
365+
: pixels(width * height, value) {}
372366
373367
std::vector<Pixel> pixels;
374368
};
375-
376-
} // namespace
369+
} // namespace
377370
378371
int main() {
379372
// A massive 100-megapixel image! Imagine it is filled with useful data.
@@ -384,23 +377,23 @@ int main() {
384377
385378
// tbb::parallel_for takes a range, and a lambda to process that sub-range
386379
tbb::parallel_for(tbb::blocked_range<size_t>(0, image.pixels.size()),
387-
[&](const tbb::blocked_range<size_t> &r) {
380+
[&](const tbb::blocked_range<size_t>& r) {
388381
// This loop processes ONE chunk assigned to a specific thread
389382
for (size_t i = r.begin(); i != r.end(); ++i) {
390383
image.pixels[i] = Invert(image.pixels[i]);
391384
}
392385
});
393386
394387
const auto end = std::chrono::high_resolution_clock::now();
395-
const DoubleMilliseconds tbb_ms = end - start;
396-
std::cout << "Raw TBB time: " << tbb_ms.count() << " ms\n";
388+
const DoubleMilliseconds time_taken = end - start;
389+
std::cout << "Raw TBB time: " << time_taken.count() << " ms\n";
397390
return 0;
398391
}
399392
```
400393

401-
All in all, TBB gives us explicit control over the chunks, which is very useful for more complex loops where standard library algorithms might not fit perfectly. But, as for this example, we can compile it just as we did before and it should run in about the same time as the parallel version of the standard algorithms, in around 5ms on my machine.
394+
We can compile this example just as we compiled the previous one and it should run in about the same time as the parallel version of the standard algorithms, in around 5ms on my machine.
402395

403-
If you want a challenge, go ahead and find a way to only use, say, 2 threads rather than all available ones with this version of the code!
396+
All in all, TBB is a very powerful library that gives us much more control over how our code runs in parallel. For example, there is no way to select how many threads to use with the standard library parallel algorithms, but TBB allows us to change that. If you want a small challenge, go ahead and find a way to only use, say, 2 threads rather than all available ones with our TBB example!
404397

405398
### Worker Threads and Thread Pools
406399
So now we know how to kick off long-running tasks and how to use parallel algorithms to process many tiny tasks. Is that it? Not quite. Imagine we receive a stream of thousands of tiny images that all need their colors inverted before they can be displayed.

0 commit comments

Comments
 (0)