diff --git a/.clang-format b/.clang-format index 42912faec..ad9f4bb18 100644 --- a/.clang-format +++ b/.clang-format @@ -2,19 +2,35 @@ Language: Cpp ColumnLimit: 110 IndentPPDirectives: BeforeHash -AlwaysBreakTemplateDeclarations : true -PackConstructorInitializers : CurrentLine +AlwaysBreakTemplateDeclarations: Yes +BreakAfterAttributes: Always +PackConstructorInitializers: Never AccessModifierOffset: -1 -IndentCaseLabels : true +IndentCaseLabels: true AllowShortLambdasOnASingleLine: Empty RequiresExpressionIndentation: OuterScope -BinPackArguments : false -BinPackParameters : false -LambdaBodyIndentation : Signature -PenaltyReturnTypeOnItsOwnLine : 1 +LambdaBodyIndentation: Signature +PenaltyReturnTypeOnItsOwnLine: 1 + +# TODO: update for clang-format 23 +BinPackArguments: false +BinPackParameters: OnePerLine + +BreakBeforeConceptDeclarations: Always + +Macros: + - LF_TRY=if + - LF_CATCH_ALL=else + - LF_CATCH(x)=else + - LF_HOF(x)={x;} + - LF_HOF(x,y)={x,y;} + - LF_HOF(x,y,z)={x,y,z;} + - LF_HOF(x,y,z,w)={x,y,z,w;} + - LF_HOF(a,b,c,d,e)={a;} + - LF_HOF(a,b,c,d,e,f)={a,b,c,d,e,f;} SpaceBeforeParens: Custom SpaceBeforeParensOptions: - AfterRequiresInClause: true - AfterRequiresInExpression : true + AfterRequiresInClause: true + AfterRequiresInExpression: true ... 
diff --git a/.clang-tidy b/.clang-tidy index 5d813fd55..d05681165 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -10,148 +10,150 @@ Checks: "*,\ -llvm-header-guard,\ -llvm-include-order,\ -llvmlibc-*,\ - -modernize-use-nodiscard,\ + -readability-identifier-length,\ -misc-non-private-member-variables-in-classes" -WarningsAsErrors: '' +WarningsAsErrors: "" CheckOptions: - key: readability-function-cognitive-complexity.IgnoreMacros - value: 'true' - - key: 'bugprone-argument-comment.StrictMode' - value: 'true' -# Prefer using enum classes with 2 values for parameters instead of bools - - key: 'bugprone-argument-comment.CommentBoolLiterals' - value: 'true' - - key: 'bugprone-misplaced-widening-cast.CheckImplicitCasts' - value: 'true' - - key: 'bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression' - value: 'true' - - key: 'bugprone-suspicious-string-compare.WarnOnLogicalNotComparison' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalReturn' - value: 'true' - - key: 'readability-simplify-boolean-expr.ChainedConditionalAssignment' - value: 'true' - - key: 'readability-uniqueptr-delete-release.PreferResetCall' - value: 'true' - - key: 'cppcoreguidelines-init-variables.MathHeader' - value: '' - - key: 'cppcoreguidelines-narrowing-conversions.PedanticMode' - value: 'true' - - key: 'readability-else-after-return.WarnOnUnfixable' - value: 'true' - - key: 'readability-else-after-return.WarnOnConditionVariables' - value: 'true' - - key: 'readability-inconsistent-declaration-parameter-name.Strict' - value: 'true' - - key: 'readability-qualified-auto.AddConstToQualified' - value: 'true' - - key: 'readability-redundant-access-specifiers.CheckFirstDeclaration' - value: 'true' -# These seem to be the most common identifier styles - - key: 'readability-identifier-naming.AbstractClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassConstantCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.ClassMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ClassMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstantPointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ConstexprVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumCase' - value: 'lower_case' - - key: 'readability-identifier-naming.EnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.FunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalFunctionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.GlobalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.InlineNamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalConstantPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalPointerCase' - value: 'lower_case' - - key: 'readability-identifier-naming.LocalVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.MacroDefinitionCase' - value: 'UPPER_CASE' - - key: 'readability-identifier-naming.MemberCase' - value: 'lower_case' - - key: 
'readability-identifier-naming.MethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.NamespaceCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ParameterPackCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PointerParameterCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PrivateMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.PrivateMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ProtectedMemberPrefix' - value: 'm_' - - key: 'readability-identifier-naming.ProtectedMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMemberCase' - value: 'lower_case' - - key: 'readability-identifier-naming.PublicMethodCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ScopedEnumConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticConstantCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StaticVariableCase' - value: 'lower_case' - - key: 'readability-identifier-naming.StructCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TemplateTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.TypeAliasCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypedefCase' - value: 'lower_case' - - key: 'readability-identifier-naming.TypeTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.UnionCase' - value: 'lower_case' - - key: 'readability-identifier-naming.ValueTemplateParameterCase' - value: 'CamelCase' - - key: 'readability-identifier-naming.VariableCase' - value: 
'lower_case' - - key: 'readability-identifier-naming.VirtualMethodCase' - value: 'lower_case' + value: "true" + - key: "cppcoreguidelines-avoid-do-while.IgnoreMacros" + value: "true" + - key: "bugprone-argument-comment.StrictMode" + value: "true" + # Prefer using enum classes with 2 values for parameters instead of bools + - key: "bugprone-argument-comment.CommentBoolLiterals" + value: "true" + - key: "bugprone-misplaced-widening-cast.CheckImplicitCasts" + value: "true" + - key: "bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression" + value: "true" + - key: "bugprone-suspicious-string-compare.WarnOnLogicalNotComparison" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalReturn" + value: "true" + - key: "readability-simplify-boolean-expr.ChainedConditionalAssignment" + value: "true" + - key: "readability-uniqueptr-delete-release.PreferResetCall" + value: "true" + - key: "cppcoreguidelines-init-variables.MathHeader" + value: "" + - key: "cppcoreguidelines-narrowing-conversions.PedanticMode" + value: "true" + - key: "readability-else-after-return.WarnOnUnfixable" + value: "true" + - key: "readability-else-after-return.WarnOnConditionVariables" + value: "true" + - key: "readability-inconsistent-declaration-parameter-name.Strict" + value: "true" + - key: "readability-qualified-auto.AddConstToQualified" + value: "true" + - key: "readability-redundant-access-specifiers.CheckFirstDeclaration" + value: "true" + # These seem to be the most common identifier styles + - key: "readability-identifier-naming.AbstractClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ClassMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ConstantMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstantPointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ConstexprVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumCase" + value: "lower_case" + - key: "readability-identifier-naming.EnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.FunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalFunctionCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.GlobalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.InlineNamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalConstantPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalPointerCase" + value: "lower_case" + - key: "readability-identifier-naming.LocalVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.MacroDefinitionCase" + value: "UPPER_CASE" + - key: "readability-identifier-naming.MemberCase" + value: "lower_case" + - key: "readability-identifier-naming.MethodCase" + value: "lower_case" + - key: "readability-identifier-naming.NamespaceCase" + value: "lower_case" + - key: "readability-identifier-naming.ParameterCase" + value: "lower_case" + - key: 
"readability-identifier-naming.ParameterPackCase" + value: "lower_case" + - key: "readability-identifier-naming.PointerParameterCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PrivateMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.PrivateMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.ProtectedMemberPrefix" + value: "m_" + - key: "readability-identifier-naming.ProtectedMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMemberCase" + value: "lower_case" + - key: "readability-identifier-naming.PublicMethodCase" + value: "lower_case" + - key: "readability-identifier-naming.ScopedEnumConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticConstantCase" + value: "lower_case" + - key: "readability-identifier-naming.StaticVariableCase" + value: "lower_case" + - key: "readability-identifier-naming.StructCase" + value: "lower_case" + - key: "readability-identifier-naming.TemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TemplateTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.TypeAliasCase" + value: "lower_case" + - key: "readability-identifier-naming.TypedefCase" + value: "lower_case" + - key: "readability-identifier-naming.TypeTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.UnionCase" + value: "lower_case" + - key: "readability-identifier-naming.ValueTemplateParameterCase" + value: "CamelCase" + - key: "readability-identifier-naming.VariableCase" + value: "lower_case" + - key: "readability-identifier-naming.VirtualMethodCase" + value: "lower_case" ... 
diff --git a/.clangd b/.clangd index ef86cb6b0..fd3d2a8f4 100644 --- a/.clangd +++ b/.clangd @@ -1,2 +1,2 @@ CompileFlags: - CompilationDatabase: build/dev \ No newline at end of file + CompilationDatabase: build/dev diff --git a/.codespellrc b/.codespellrc index c3920f351..86730201c 100644 --- a/.codespellrc +++ b/.codespellrc @@ -1,7 +1,7 @@ [codespell] -builtin = clear,rare,en-GB_to_en-US,names,informal,code +builtin = clear,rare,names,informal,code check-filenames = check-hidden = ignore-words-list = deque,warmup,stdio,copyable,combinate -skip = */.git,*/build,*/prefix,*/vcpkg,*/_build,*/bench +skip = */.git,*/build,*/.legacy quiet-level = 2 diff --git a/.gemini/settings.json b/.gemini/settings.json new file mode 100644 index 000000000..b8dce87f3 --- /dev/null +++ b/.gemini/settings.json @@ -0,0 +1,8 @@ +{ + "context": { + "fileName": "AGENTS.md" + }, + "ui": { + "hideBanner": true + } +} \ No newline at end of file diff --git a/.github/workflows/linear.yml b/.github/workflows/linear.yml new file mode 100644 index 000000000..8d1ba0975 --- /dev/null +++ b/.github/workflows/linear.yml @@ -0,0 +1,33 @@ +name: Linear History + +on: + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + check-linear-history: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v6 + with: + ref: ${{ github.event.pull_request.head.sha || github.sha }} + fetch-depth: 0 + + - name: Check for merge commits + run: | + BASE_REF=${{ github.base_ref || 'modules' }} + echo "Comparing against base: $BASE_REF" + git fetch origin $BASE_REF:$BASE_REF + MERGE_COMMITS=$(git rev-list --merges $BASE_REF..HEAD) + if [ -n "$MERGE_COMMITS" ]; then + echo "Error: Merge commits detected. libfork requires a linear history." + echo "Please rebase your branch onto $BASE_REF to remove merge commits." + echo "" + echo "Merge commits found:" + git log --merges --oneline $BASE_REF..HEAD + exit 1 + else + echo "No merge commits detected. Linear history check passed." 
+ fi diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 000000000..82cfdc9a7 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,28 @@ +name: Lint + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + lint: + runs-on: macos-latest + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install clang-format codespell + + - name: Run codespell + run: codespell + + - name: Run clang-format + run: | + find src include test benchmark/src -name "*.cpp" -o -name "*.hpp" -o -name "*.cxx" | xargs clang-format --dry-run --Werror diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml new file mode 100644 index 000000000..4904ae1e5 --- /dev/null +++ b/.github/workflows/linux.yml @@ -0,0 +1,35 @@ +name: Linux + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja gcc binutils catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.github/workflows/macos.yml b/.github/workflows/macos.yml new file mode 100644 index 000000000..17eb53779 --- /dev/null +++ b/.github/workflows/macos.yml @@ -0,0 +1,35 @@ +name: MacOS + +on: + push: + branches: ["modules"] + pull_request: + branches: ["modules"] + workflow_dispatch: + +jobs: + build-and-test: + runs-on: macos-latest + 
strategy: + fail-fast: false + matrix: + preset: [ci-hardened, ci-release, ci-no-except-rtti, ci-sanitize] + + steps: + - uses: actions/checkout@v6 + + - name: Set up Homebrew + uses: Homebrew/actions/setup-homebrew@main + + - name: Install Dependencies + run: brew install cmake ninja llvm catch2 google-benchmark + + - name: Configure + run: cmake --preset ${{ matrix.preset }} + -DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake + + - name: Build + run: cmake --build --preset ${{ matrix.preset }} + + - name: Test + run: ctest --preset ${{ matrix.preset }} diff --git a/.gitignore b/.gitignore index 47a6b5d9d..41a66b83c 100644 --- a/.gitignore +++ b/.gitignore @@ -26,5 +26,3 @@ output.png **/.DS_Store compile_commands.json -CMakeLists.txt.user -CMakeUserPresets.json diff --git a/.legacy/include/libfork/core/macro.hpp b/.legacy/include/libfork/core/macro.hpp index 0944c1644..e42fcfc18 100644 --- a/.legacy/include/libfork/core/macro.hpp +++ b/.legacy/include/libfork/core/macro.hpp @@ -61,17 +61,6 @@ #define LF_STATIC_CONST const #endif -// clang-format off - -/** - * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/requires/decltype specifiers. - * - * This macro is not truly variadic but the ``...`` allows commas in the macro argument. - */ -#define LF_HOF_RETURNS(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) requires requires { __VA_ARGS__; } { return __VA_ARGS__;} - -// clang-format on - /** * @brief __[public]__ Detects if the compiler has exceptions enabled. * @@ -192,28 +181,6 @@ using std::unreachable; #define LF_ASSERT(expr) LF_ASSUME(expr) #endif -/** - * @brief Macro to prevent a function to be inlined. 
- */ -#if !defined(LF_NOINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_NOINLINE __declspec(noinline) - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #if defined(__CUDACC__) - // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392 - #define LF_NOINLINE __attribute__((noinline)) - #elif defined(__HIP__) - // See https://github.com/boostorg/config/issues/392 - #define LF_NOINLINE __attribute__((noinline)) - #else - #define LF_NOINLINE __attribute__((__noinline__)) - #endif - #else - #define LF_NOINLINE - #endif -#endif - /** * @brief Force no-inline for clang, works-around https://github.com/llvm/llvm-project/issues/63022. * @@ -229,28 +196,6 @@ using std::unreachable; #define LF_CLANG_TLS_NOINLINE #endif -/** - * @brief Macro to use next to 'inline' to force a function to be inlined. - * - * \rst - * - * .. note:: - * - * This does not imply the c++'s `inline` keyword which also has an effect on linkage. - * - * \endrst - */ -#if !defined(LF_FORCEINLINE) - #if defined(_MSC_VER) && !defined(__clang__) - #define LF_FORCEINLINE __forceinline - #elif defined(__GNUC__) && __GNUC__ > 3 - // Clang also defines __GNUC__ (as 4) - #define LF_FORCEINLINE __attribute__((__always_inline__)) - #else - #define LF_FORCEINLINE - #endif -#endif - #if defined(__clang__) && defined(__has_attribute) /** * @brief Compiler specific attribute. diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..cb1e1f47d --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,206 @@ +# Libfork Copilot Instructions + +## Project Overview + +**libfork** is a continuation-stealing coroutine-tasking library implementing +strict fork-join parallelism using C++20 coroutines. 
+ +- **Type**: C++ library with module/`import std` support +- **Languages**: C++26 + +## Critical Build Requirements + +### Compiler & Module Support + +This project **requires C++23's `import std`** and **MUST** use the appropriate +toolchain file: + +- **MacOS**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` +- **Linux**: Use `-DCMAKE_TOOLCHAIN_FILE=cmake/gcc-brew-toolchain.cmake` + +**Common Error**: Without the toolchain file, CMake will fail. + +**Always include the toolchain file** in configure commands. + +### Dependencies (Homebrew) + +Make sure Homebrew is installed and `brew` is in your `PATH`: + +```bash +brew --version +``` + +**Required for building/testing:** + +- `cmake` +- `ninja` +- `catch2` +- `google-benchmark` +- `clang-format` +- `codespell` + +If on MacOS, also require: + +- `llvm` + +If on Linux, also require: + +- `gcc` +- `binutils` + +Install all at once (MacOS): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell llvm +``` + +Install all at once (Linux): + +```bash +brew install cmake ninja catch2 google-benchmark clang-format codespell gcc binutils +``` + +## Build & Test Workflow + +### 1. Configure + +Always use presets with the toolchain file: + +```bash +cmake --preset -DCMAKE_TOOLCHAIN_FILE=cmake/.cmake +``` + +**Relevant available presets** (from `CMakePresets.json`): + +- `ci-hardened` - Debug build with warnings and hardening flags +- `ci-release` - Optimized release build + +All presets enable developer mode (`libfork_DEV_MODE=ON`) and use Ninja generator. + +You should use the `ci-hardened` preset for development/testing and +`ci-release` for benchmarking. + +### 2. Build + +```bash +cmake --build --preset +``` + +**Build warnings** (expected and safe): + +- "It is recommended to build benchmarks in Release mode" - only relevant for `ci-hardened` +- CMake experimental `import std;` warning - expected for C++23's `import std` + +### 3. 
Test + +```bash +ctest --preset +``` + +All tests should pass. If tests fail, check that: + +- Configuration used the correct toolchain file +- Build completed without errors +- Any changes you have made are correct + +## Project Structure + +### Source Layout + +```sh +libfork/ +├── cmake/ # CMake utilities +├── include/libfork/**/*.hpp # Public headers (macros, version) +├── src/ # C++26 module source files (.cxx) and impl (.cpp) +│ ├── libfork.cxx # libfork — meta-module, re-exports all public modules +│ ├── utils/ # libfork.utils — internal utilities (not public API) +│ │ ├── utils.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── core/ # libfork.core — core task/scheduler primitives +│ │ ├── core.cxx # aggregator +│ │ └── *.cxx # :partitions +│ ├── batteries/ # libfork.batteries — stacks, contexts, adaptors +│ │ ├── batteries.cxx # aggregator +│ │ └── *.cxx # :partitions +│ └── schedulers/ # libfork.schedulers — concrete schedulers +│ │ ├── schedulers.cxx # aggregator +│ │ └── *.cxx # :partitions +├── test/src/**/ # Test suite (Catch2) — uses `import libfork;` +│ └── *.cpp +├── benchmark/ # Benchmarking suite (google-benchmark) +│ ├── lib/ # Shared benchmark utilities and definitions +│ │ ├── *.hpp # headers +│ │ └── *.cpp # common source +│ ├── src/ # Implementation-specific benchmarks +│ │ ├── libfork/ # libfork-based benchmarks +│ │ ├── serial/ # serial benchmarks +│ │ └── */ # Other library benchmarks (e.g. OpenMP, TBB, Cilk Plus) +│ └── external/ # External benchmark code (e.g. UTS) +├── .github/workflows/ # CI workflows +│ ├── linux.yml # Linux builds +│ ├── macos.yml # MacOS builds +│ ├── lint.yml # Linting +│ └── linear.yml # Enforces linear history (no merge commits) +└── CMakeLists.txt # Main build configuration +``` + +## Workflows + +### Workflow Command Pattern + +All workflows follow this pattern: + +```yaml +- Install Dependencies: brew install ... 
+- Configure: cmake --preset -DCMAKE_TOOLCHAIN_FILE=.cmake +- Build: cmake --build --preset +- Test: ctest --preset +``` + +## Common Development Tasks + +### Making Code Changes + +1. **Modify source files** in `src/`, `include/`, `test/`, or `benchmark/` +2. **Rebuild**: `cmake --build --preset ` +3. **Test**: `ctest --preset ` + +#### Adding/removing files from `src/` or `include/` + +- Update the root `CMakeLists.txt` with new/removed files. + +#### Adding/removing files from benchmarks + +- Update the relevant `CMakeLists.txt` in `benchmark/lib/` or `benchmark/src//`. + +### Adding Tests + +Strive to add tests for new features/bug fixes. + +- Add `.cpp` files to `test/src/` +- Tests auto-discovered by CMake (GLOB_RECURSE) +- Links against `libfork::libfork` and `Catch2::Catch2WithMain` + +### Modifying Build Configuration + +**Warning**: Module-related changes are complex. Test thoroughly with clean builds. + +## Troubleshooting + +### Build Failures + +**Problem**: Configuration/Build fails after adding/removing files or modifying CMakeLists.txt +**Solution**: Try a clean build directory: + +```bash +rm -rf build/ +``` + +**Problem**: "compiler does not provide a way to discover the import graph" +**Solution**: Add `-DCMAKE_TOOLCHAIN_FILE=cmake/llvm-brew-toolchain.cmake` to configure + +**Problem**: "Could not find 'brew' executable" +**Solution**: Install Homebrew + +**Problem**: "Could not automatically find libc++.modules.json" +**Solution**: Ensure LLVM is installed via Homebrew; toolchain auto-detects the path diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..68490f0c1 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,3 @@ +# In ./CLAUDE.md + +@AGENTS.md diff --git a/CMakeLists.txt b/CMakeLists.txt index d193bc879..577173f29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,38 +1,121 @@ -cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) +cmake_minimum_required(VERSION 4.3 FATAL_ERROR) # See `Help/dev/experimental.rst` 
-set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "d0edc3af-4c50-42ea-a356-e2862fe7a444") +set(CMAKE_EXPERIMENTAL_CXX_IMPORT_STD "451f2fe2-a8a2-47c3-bc32-94786d8fc91b") include(cmake/read_version.cmake) -read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/core/macro.hpp) +read_version(${CMAKE_CURRENT_SOURCE_DIR}/include/libfork/version.hpp) project( libfork VERSION ${version_major}.${version_minor}.${version_patch} - DESCRIPTION "A bleeding-edge, lock-free, wait-free, continuation-stealing fork-join library built on C++20's coroutines." LANGUAGES CXX ) +# ---- Project options ---- + +option(libfork_DEV_MODE "Enable developer build (tests/benchmarks/etc) for libfork" OFF) + +# ---- System dependencies ---- + +find_package(Threads REQUIRED) + +# =========================== + # Tell CMake that we explicitly want `import std`. This will initialize the # property on all targets declared after this to 1 # TODO: set property per target set(CMAKE_CXX_MODULE_STD 1) -# Make a library. -add_library(uses_std STATIC) +add_library(libfork_libfork) +add_library(libfork::libfork ALIAS libfork_libfork) + +target_link_libraries(libfork_libfork PUBLIC Threads::Threads) -# Add sources. -target_sources(uses_std PRIVATE uses_std.cxx) +set_property(TARGET libfork_libfork PROPERTY EXPORT_NAME libfork) -# Tell CMake we're using C++23 but only C++20 is needed to consume it. -target_compile_features(uses_std INTERFACE cxx_std_23) +target_compile_features(libfork_libfork PUBLIC cxx_std_26) -# Make an executable. 
-add_executable(main) +# Public headers, __impl must be public because consumers need +# them to build the module BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET HEADERS FILES + include/libfork/version.hpp + include/libfork/__impl/compiler.hpp + include/libfork/__impl/exception.hpp + include/libfork/__impl/utils.hpp + include/libfork/__impl/assume.hpp + BASE_DIRS + include +) -target_sources(main PRIVATE main.cxx) -target_link_libraries(main PRIVATE uses_std) +# Add the module files to the library, must be public because +# consumers will need to build the BMI +target_sources(libfork_libfork + PUBLIC + FILE_SET CXX_MODULES FILES + # libfork (meta) + src/libfork.cxx + # libfork.utils + src/utils/utils.cxx + src/utils/utility.cxx + src/utils/constants.cxx + src/utils/tuple.cxx + src/utils/concepts.cxx + src/utils/defer.cxx + src/utils/uninitialized.cxx + # libfork.core + src/core/core.cxx + src/core/exception.cxx + src/core/concepts/stack.cxx + src/core/concepts/context.cxx + src/core/concepts/scheduler.cxx + src/core/concepts/invocable.cxx + src/core/frame.cxx + src/core/task.cxx + src/core/ops.cxx + src/core/poly_context.cxx + src/core/thread_locals.cxx + src/core/schedule.cxx + src/core/handles.cxx + src/core/root.cxx + src/core/execute.cxx + src/core/receiver.cxx + src/core/final_suspend.cxx + src/core/awaitables.cxx + src/core/promise.cxx + src/core/stop.cxx + # libfork.batteries + src/batteries/batteries.cxx + src/batteries/deque.cxx + src/batteries/adaptors.cxx + src/batteries/contexts.cxx + src/batteries/geometric_stack.cxx + src/batteries/adaptor_stack.cxx + src/batteries/slab_stack.cxx + # libfork.schedulers + src/schedulers/schedulers.cxx + src/schedulers/inline.cxx + src/schedulers/busy.cxx + PRIVATE + src/exception.cpp +) + +# ====================== + +if(libfork_DEV_MODE) + + include(CTest) # Enables the BUILD_TESTING option + + if(BUILD_TESTING) + add_subdirectory(test) + endif() + + add_subdirectory(benchmark) + +endif() # list(APPEND 
CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") # @@ -43,9 +126,6 @@ target_link_libraries(main PRIVATE uses_std) # # message(STATUS "CMAKE_BUILD_TYPE is set to '${CMAKE_BUILD_TYPE}'") # -# # ---- System dependencies ---- -# -# find_package(Threads REQUIRED) # # # ------ Declare library ------ # diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 000000000..ac23b1e37 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,126 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "cmake-pedantic", + "hidden": true, + "warnings": { + "dev": true, + "deprecated": true, + "uninitialized": true, + "unusedCli": true, + "systemVars": false + }, + "errors": { + "deprecated": true + } + }, + { + "name": "ci-base", + "inherits": "cmake-pedantic", + "hidden": true, + "generator": "Ninja", + "binaryDir": "${sourceDir}/build/${presetName}", + "cacheVariables": { + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON", + "libfork_DEV_MODE": "ON" + } + }, + { + "name": "ci-hardened", + "inherits": "ci-base", + "displayName": "Debug with warnings and hardening", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_CXX_FLAGS": "-O2 -Wall -Wextra -Wpedantic -Wconversion -Wsign-conversion -Wcast-qual -Wformat -Wformat=2 -Wundef -Werror=float-equal -Wshadow -Wcast-align -Wunused -Wnull-dereference -Wdouble-promotion -Wimplicit-fallthrough -Wextra-semi -Woverloaded-virtual -Wnon-virtual-dtor -Wold-style-cast -Werror=format-security -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS -fstrict-flex-arrays=3 -fstack-protector-strong -Wno-missing-braces -Wno-missing-field-initializers -Wno-c2y-extensions" + } + }, + { + "name": "ci-release", + "inherits": "ci-base", + "displayName": "Release", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -falign-functions=64" + } + }, + { + "name": "ci-no-except-rtti", + "inherits": "ci-base", + "displayName": "Release no RTTI or exceptions", + 
"cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_CXX_FLAGS": "-O3 -DNDEBUG -flto=auto -march=native -fno-exceptions -fno-rtti -falign-functions=64" + } + }, + { + "name": "ci-sanitize", + "inherits": "ci-base", + "displayName": "Debug with sanitizers", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Sanitize", + "CMAKE_CXX_FLAGS": "-O2 -g -fsanitize=address,undefined -fno-omit-frame-pointer -fno-common -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=3 -D_GLIBCXX_ASSERTIONS" + } + } + ], + "buildPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened" + }, + { + "name": "ci-release", + "configurePreset": "ci-release" + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti" + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize" + } + ], + "testPresets": [ + { + "name": "ci-hardened", + "configurePreset": "ci-hardened", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-release", + "configurePreset": "ci-release", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-no-except-rtti", + "configurePreset": "ci-no-except-rtti", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + }, + { + "name": "ci-sanitize", + "configurePreset": "ci-sanitize", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ] +} diff --git a/CMakeUserPresets.json b/CMakeUserPresets.json new file mode 100644 index 000000000..26e650168 --- /dev/null +++ b/CMakeUserPresets.json @@ -0,0 +1,79 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "dev", + "inherits": "ci-hardened", + "displayName": "Hardened development build", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + }, + { + "name": "bench", + "inherits": "ci-release", + "displayName": "Release build 
for benchmarks", + "toolchainFile": "${sourceDir}/cmake/llvm-brew-toolchain.cmake", + "cacheVariables": { + "CMAKE_COLOR_DIAGNOSTICS": "ON" + } + } + ], + "buildPresets": [ + { + "name": "dev", + "configurePreset": "dev" + }, + { + "name": "bench", + "configurePreset": "bench" + } + ], + "testPresets": [ + { + "name": "dev", + "configurePreset": "dev", + "output": { + "outputOnFailure": true + }, + "execution": { + "stopOnFailure": true + } + } + ], + "workflowPresets": [ + { + "name": "dev", + "displayName": "Development Debug Hardened Workflow", + "steps": [ + { + "type": "configure", + "name": "dev" + }, + { + "type": "build", + "name": "dev" + }, + { + "type": "test", + "name": "dev" + } + ] + }, + { + "name": "bench", + "displayName": "Release Build (including Benchmarks)", + "steps": [ + { + "type": "configure", + "name": "bench" + }, + { + "type": "build", + "name": "bench" + } + ] + } + ] +} diff --git a/.legacy/LICENSE.md b/LICENSE.md similarity index 100% rename from .legacy/LICENSE.md rename to LICENSE.md diff --git a/actions/setup/action.yaml b/actions/setup/action.yaml deleted file mode 100644 index cd451a14b..000000000 --- a/actions/setup/action.yaml +++ /dev/null @@ -1,52 +0,0 @@ -name: 'setup' -description: 'setup vcpkg/cmake/ninja and caching' - -runs: - using: "composite" - - steps: - # Set env vars needed for vcpkg to leverage the GitHub Action cache as a storage for Binary Caching. - - uses: actions/github-script@v6 - with: - script: | - core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || ''); - core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || ''); - - - uses: actions/checkout@v3 - with: - submodules: true - - name: "Create directory '${{ env.VCPKG_DEFAULT_BINARY_CACHE }}'" - run: mkdir -p $VCPKG_DEFAULT_BINARY_CACHE - shell: bash - - # Setup the build machine with the most recent versions of CMake and Ninja. 
- # Both are cached if not already: on subsequent runs both will be quickly restored from GitHub cache service. - - uses: lukka/get-cmake@latest - - # Restore vcpkg from the GitHub Action cache service. - # Note that packages are restored by vcpkg's binary caching when it is being run afterwards by CMake. - - name: Restore vcpkg - uses: actions/cache@v3 - with: - # The first path is the location of vcpkg: it contains the vcpkg executable and data files, as long as the - # built package archives (aka binary cache) which are located by VCPKG_DEFAULT_BINARY_CACHE env var. - # The other paths starting with '!' are exclusions: they contain temporary files generated - # during the build of the installed packages. - path: | - ${{ env.VCPKG_ROOT_DIR }} - !${{ env.VCPKG_ROOT_DIR }}/buildtrees - !${{ env.VCPKG_ROOT_DIR }}/packages - !${{ env.VCPKG_ROOT_DIR }}/downloads - !${{ env.VCPKG_ROOT_DIR }}/installed - # The key is composed in a way that it gets properly invalidated whenever a different version of vcpkg is being used. - key: | - ${{ hashFiles( '.git/modules/vcpkg/HEAD' )}} - - # On Windows runners, let's ensure to have the Developer Command Prompt environment setup correctly. - # As used here the Developer Command Prompt created is targeting x64 and using the default the Windows SDK. 
- - uses: ilammy/msvc-dev-cmd@v1 - - - name: Setup xcode - if: matrix.os == 'macos-13' - shell: bash - run: sudo xcode-select --switch /Applications/Xcode_15.0.app/Contents/Developer \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt new file mode 100644 index 000000000..5ce1a7480 --- /dev/null +++ b/benchmark/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(libfork_benchmark LANGUAGES CXX) + +if(NOT CMAKE_BUILD_TYPE STREQUAL "Release") + message(WARNING "It is recommended to build benchmarks in Release mode for accurate results.") +endif() + +# ---- Dependencies ---- + +find_package(benchmark REQUIRED) + +# ---- Benchmarks ---- + +add_subdirectory(lib) + +add_subdirectory(src/serial) +add_subdirectory(src/baremetal) +add_subdirectory(src/libfork) + +# WHOLE_ARCHIVE ensures benchmark registrations (global initialisers) are not +# dropped by the linker when pulling objects from the static libraries above. 
+add_executable(libfork_benchmark src/benchmarks.cpp) + +target_link_libraries(libfork_benchmark + PRIVATE + $ + benchmark::benchmark_main +) + +if(BUILD_TESTING) + add_test(NAME Benchmark + COMMAND libfork_benchmark --benchmark_dry_run --benchmark_filter=^test/ + ) +endif() + +# ---- OpenMP Benchmarks ---- + +find_package(OpenMP REQUIRED) + +if(OpenMP_CXX_FOUND) + + add_subdirectory(src/openmp) + + target_link_libraries(libfork_benchmark + PRIVATE + $ + ) +endif() + + diff --git a/benchmark/external/uts/CMakeLists.txt b/benchmark/external/uts/CMakeLists.txt new file mode 100644 index 000000000..89c8b5e6e --- /dev/null +++ b/benchmark/external/uts/CMakeLists.txt @@ -0,0 +1,19 @@ +cmake_minimum_required(VERSION 4.2.1 FATAL_ERROR) + +project(uts_external LANGUAGES C) + +add_library(uts_c) + +target_sources(uts_c + PRIVATE + src/uts.c + src/rng/brg_sha1.c + PUBLIC + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/include + FILES + include/uts/uts.h + include/uts/rng/rng.h + include/uts/rng/brg_sha1.h + include/uts/rng/brg_types.h +) diff --git a/benchmark/external/uts/include/uts/rng/brg_sha1.h b/benchmark/external/uts/include/uts/rng/brg_sha1.h new file mode 100644 index 000000000..d30f12c0d --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_sha1.h @@ -0,0 +1,100 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. 
the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue Date: 01/08/2005 +*/ + +#ifndef _SHA1_H +#define _SHA1_H + +#include "uts/rng/brg_types.h" + +#define SHA1_BLOCK_SIZE 64 +#define SHA1_DIGEST_SIZE 20 + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +#define POS_MASK 0x7fffffff +#define HIGH_BITS 0x80000000 + +#define sha1_context sha1_ctx_s + +/**********************************/ +/* random number generator state */ +/**********************************/ +struct state_t { + uint_8t state[20]; +}; + +typedef uint_8t RNG_state; + +/***************************************/ +/* random number generator operations */ +/***************************************/ +void rng_init(RNG_state *state, int seed); +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnNumber); +int rng_rand(RNG_state *mystate); +int rng_nextrand(RNG_state *mystate); +char *rng_showstate(RNG_state *state, char *s); +int rng_showtype(char *strBuf, int ind); + +/** END: UTS RNG Harness **/ +/* type to hold the SHA256 context */ + +struct sha1_ctx_s { + uint_32t count[2]; + uint_32t hash[5]; + uint_32t wbuf[16]; +}; + +typedef struct sha1_ctx_s sha1_ctx; + +/* Note that these prototypes are the same for both bit and */ +/* byte oriented implementations. 
However the length fields */ +/* are in bytes or bits as appropriate for the version used */ +/* and bit sequences are input as arrays of bytes in which */ +/* bit sequences run from the most to the least significant */ +/* end of each byte */ + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]); + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]); +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]); +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]); +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len); + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/brg_types.h b/benchmark/external/uts/include/uts/rng/brg_types.h new file mode 100644 index 000000000..9532acce6 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/brg_types.h @@ -0,0 +1,214 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 1998-2006, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. 
+ + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 09/09/2006 + + The unsigned integer types defined here are of the form uint_t where + is the length of the type; for example, the unsigned 32-bit type is + 'uint_32t'. These are NOT the same as the 'C99 integer types' that are + defined in the inttypes.h and stdint.h headers since attempts to use these + types have shown that support for them is still highly variable. However, + since the latter are of the form unit_t, a regular expression search + and replace (in VC++ search on 'uint_{:z}t' and replace with 'uint\1_t') + can be used to convert the types used here to the C99 standard types. +*/ + +#ifndef BRG_TYPES_H +#define BRG_TYPES_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include + +/* Try one of these if things don't work automatically */ +#ifdef BRG_C99_TYPES + #include + #include + #define BRG_UI8 +typedef uint8_t uint_8t; + #define BRG_UI16 +typedef uint16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef uint32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef uint64_t uint_64t; + +#elif defined(BRG_STD_TYPES) + #include + #define BRG_UI8 +typedef u_int8_t uint_8t; + #define BRG_UI16 +typedef u_int16_t uint_16t; + #define BRG_UI32 + #define li_32(h) 0x##h##u +typedef u_int32_t uint_32t; + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef u_int64_t uint_64t; + +#endif + +#ifndef BRG_UI8 + #define BRG_UI8 + #if UCHAR_MAX == 255u +typedef unsigned char uint_8t; + #else + #error Please define uint_8t as an 8-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI16 + #define BRG_UI16 + #if USHRT_MAX == 65535u +typedef unsigned short uint_16t; + #else + #error Please define uint_16t as a 16-bit unsigned short 
type in brg_types.h + #endif +#endif + +#ifndef BRG_UI32 + #define BRG_UI32 + #if UINT_MAX == 4294967295u + #define li_32(h) 0x##h##u +typedef unsigned int uint_32t; + #elif ULONG_MAX == 4294967295u + #define li_32(h) 0x##h##ul +typedef unsigned long uint_32t; + #elif defined(_CRAY) + #error This code needs 32-bit data types, which Cray machines do not provide + #else + #error Please define uint_32t as a 32-bit unsigned integer type in brg_types.h + #endif +#endif + +#ifndef BRG_UI64 + #if defined(__BORLANDC__) && !defined(__MSDOS__) + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned __int64 uint_64t; + #elif defined(_MSC_VER) && (_MSC_VER < 1300) /* 1300 == VC++ 7.0 */ + #define BRG_UI64 + #define li_64(h) 0x##h##ui64 +typedef unsigned __int64 uint_64t; + #elif defined(__sun) && defined(ULONG_MAX) && ULONG_MAX == 0xfffffffful + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #elif defined(UINT_MAX) && UINT_MAX > 4294967295u + #if UINT_MAX == 18446744073709551615u + #define BRG_UI64 + #define li_64(h) 0x##h##u +typedef unsigned int uint_64t; + #endif + #elif defined(ULONG_MAX) && ULONG_MAX > 4294967295u + #if ULONG_MAX == 18446744073709551615ul + #define BRG_UI64 + #define li_64(h) 0x##h##ul +typedef unsigned long uint_64t; + #endif + #elif defined(ULLONG_MAX) && ULLONG_MAX > 4294967295u + #if ULLONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #elif defined(ULONG_LONG_MAX) && ULONG_LONG_MAX > 4294967295u + #if ULONG_LONG_MAX == 18446744073709551615ull + #define BRG_UI64 + #define li_64(h) 0x##h##ull +typedef unsigned long long uint_64t; + #endif + #endif +#endif + +#if defined(NEED_UINT_64T) && !defined(BRG_UI64) + #error Please define uint_64t as an unsigned 64 bit type in brg_types.h +#endif + +#ifndef RETURN_VALUES + #define RETURN_VALUES + #if defined(DLL_EXPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + 
#define VOID_RETURN __declspec(dllexport) void __stdcall + #define INT_RETURN __declspec(dllexport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllexport__) void + #define INT_RETURN __declspec(__dllexport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(DLL_IMPORT) + #if defined(_MSC_VER) || defined(__INTEL_COMPILER) + #define VOID_RETURN __declspec(dllimport) void __stdcall + #define INT_RETURN __declspec(dllimport) int __stdcall + #elif defined(__GNUC__) + #define VOID_RETURN __declspec(__dllimport__) void + #define INT_RETURN __declspec(__dllimport__) int + #else + #error Use of the DLL is only available on the Microsoft, Intel and GCC compilers + #endif + #elif defined(__WATCOMC__) + #define VOID_RETURN void __cdecl + #define INT_RETURN int __cdecl + #else + #define VOID_RETURN void + #define INT_RETURN int + #endif +#endif + +/* These defines are used to declare buffers in a way that allows + faster operations on longer variables to be used. 
In all these + defines 'size' must be a power of 2 and >= 8 + + dec_unit_type(size,x) declares a variable 'x' of length + 'size' bits + + dec_bufr_type(size,bsize,x) declares a buffer 'x' of length 'bsize' + bytes defined as an array of variables + each of 'size' bits (bsize must be a + multiple of size / 8) + + ptr_cast(x,size) casts a pointer to a pointer to a + variable of length 'size' bits +*/ + +#define ui_type(size) uint_##size##t +#define dec_unit_type(size, x) typedef ui_type(size) x +#define dec_bufr_type(size, bsize, x) typedef ui_type(size) x[bsize / (size >> 3)] +#define ptr_cast(x, size) ((ui_type(size) *)(x)) + +#if defined(__cplusplus) +} +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/rng/rng.h b/benchmark/external/uts/include/uts/rng/rng.h new file mode 100644 index 000000000..105c40466 --- /dev/null +++ b/benchmark/external/uts/include/uts/rng/rng.h @@ -0,0 +1,6 @@ +#ifndef _RNG_H +#define _RNG_H + +#include "uts/rng/brg_sha1.h" + +#endif /* _RNG_H */ \ No newline at end of file diff --git a/benchmark/external/uts/include/uts/uts.h b/benchmark/external/uts/include/uts/uts.h new file mode 100644 index 000000000..e86e68f3e --- /dev/null +++ b/benchmark/external/uts/include/uts/uts.h @@ -0,0 +1,120 @@ +#ifndef A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +#define A0179FFF_4078_4EEB_BB6E_1E8C75CC694C +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. 
+ * + */ + +#ifndef _UTS_H + #define _UTS_H + + #ifdef __cplusplus +extern "C" { + #endif + + #include "uts/rng/rng.h" + + #define UTS_VERSION "2.1" + + /*********************************************************** + * Tree node descriptor and statistics * + ***********************************************************/ + + #define MAXNUMCHILDREN 100 // cap on children (BIN root is exempt) + +struct node_t { + int type; // distribution governing number of children + int height; // depth of this node in the tree + int numChildren; // number of children, -1 => not yet determined + + /* for RNG state associated with this node */ + struct state_t state; +}; + +typedef struct node_t Node; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +enum uts_trees_e { BIN = 0, GEO, HYBRID, BALANCED }; +enum uts_geoshape_e { LINEAR = 0, EXPDEC, CYCLIC, FIXED }; + +typedef enum uts_trees_e tree_t; +typedef enum uts_geoshape_e geoshape_t; + +/* Strings for the above enums */ +extern char *uts_trees_str[]; +extern char *uts_geoshapes_str[]; + +/* Tree parameters */ +extern tree_t type; +extern double b_0; +extern int rootId; +extern int nonLeafBF; +extern double nonLeafProb; +extern int gen_mx; +extern geoshape_t shape_fn; +extern double shiftDepth; + +/* Benchmark parameters */ +extern int computeGranularity; +extern int debug; +extern int verbose; + +/* For stats generation: */ +typedef unsigned long long counter_t; + + /* Utility Functions */ + #define max(a, b) (((a) > (b)) ? (a) : (b)) + #define min(a, b) (((a) < (b)) ? 
(a) : (b)) + +void uts_error(char *str); +void uts_parseParams(int argc, char **argv); +int uts_paramsToStr(char *strBuf, int ind); +void uts_printParams(); +void uts_helpMessage(); + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth); +double uts_wctime(); + +double rng_toProb(int n); + +/* Common tree routines */ +void uts_initRoot(Node *root, int type); +int uts_numChildren(Node *parent); +int uts_numChildren_bin(Node *parent); +int uts_numChildren_geo(Node *parent); +int uts_childType(Node *parent); + +/* Implementation Specific Functions */ +char *impl_getName(); +int impl_paramsToStr(char *strBuf, int ind); +int impl_parseParam(char *param, char *value); +void impl_helpMessage(); +void impl_abort(int err); + + #ifdef __cplusplus +} + #endif + +#endif /* _UTS_H */ + +#endif /* A0179FFF_4078_4EEB_BB6E_1E8C75CC694C */ diff --git a/benchmark/external/uts/src/rng/brg_endian.h b/benchmark/external/uts/src/rng/brg_endian.h new file mode 100644 index 000000000..96082e57b --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_endian.h @@ -0,0 +1,132 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. 
+ + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- + Issue 20/10/2006 +*/ + +#ifndef BRG_ENDIAN_H +#define BRG_ENDIAN_H + +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ + +/* Include files where endian defines and byteswap functions may reside */ +#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) + #include +#elif defined(BSD) && (BSD >= 199103) || defined(__APPLE__) || defined(__CYGWIN32__) || \ + defined(__DJGPP__) || defined(__osf__) + #include +#elif defined(__linux__) || defined(__GNUC__) || defined(__GNU_LIBRARY__) + #if !defined(__MINGW32__) && !defined(__sun__) + #include + #if !defined(__BEOS__) + #include + #endif + #endif +#endif + +/* Now attempt to set the define for platform byte order using any */ +/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ +/* seem to encompass most endian symbol definitions */ + +#if defined(BIG_ENDIAN) && defined(LITTLE_ENDIAN) + #if defined(BYTE_ORDER) && BYTE_ORDER == BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(BYTE_ORDER) && BYTE_ORDER == LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(_BIG_ENDIAN) && defined(_LITTLE_ENDIAN) + #if defined(_BYTE_ORDER) && _BYTE_ORDER == _BIG_ENDIAN + #define PLATFORM_BYTE_ORDER 
IS_BIG_ENDIAN + #elif defined(_BYTE_ORDER) && _BYTE_ORDER == _LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(_BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN) && defined(__LITTLE_ENDIAN) + #if defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +#if defined(__BIG_ENDIAN__) && defined(__LITTLE_ENDIAN__) + #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __BIG_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #elif defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #endif +#elif defined(__BIG_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN__) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#endif + +/* if the platform byte order could not be determined, then try to */ +/* set this define using common machine defines */ +#if !defined(PLATFORM_BYTE_ORDER) + + #if defined(__alpha__) || defined(__alpha) || defined(i386) || defined(__i386__) || defined(_M_I86) || \ + defined(_M_IX86) || defined(__OS2__) || defined(sun386) || defined(__TURBOC__) || defined(vax) || \ + defined(vms) || defined(VMS) || defined(__VMS) || defined(_M_X64) + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + + #elif defined(AMIGA) || defined(applec) || defined(__AS400__) || defined(_CRAY) || defined(__hppa) || \ + defined(__hp9000) || defined(ibm370) || defined(mc68000) || defined(m68k) || defined(__MRC__) || \ + defined(__MVS__) || defined(__MWERKS__) || defined(sparc) || defined(__sparc) || \ + defined(SYMANTEC_C) || 
defined(__VOS__) || defined(__TIGCC__) || defined(__TANDEM) || \ + defined(THINK_C) || defined(__VMCMS__) + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN + #elif 0 /* **** EDIT HERE IF NECESSARY **** */ + #define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN + #else + #error Please edit lines 126 or 128 in brg_endian.h to set the platform byte order + #endif + +#endif + +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/rng/brg_sha1.c b/benchmark/external/uts/src/rng/brg_sha1.c new file mode 100644 index 000000000..f6757bafc --- /dev/null +++ b/benchmark/external/uts/src/rng/brg_sha1.c @@ -0,0 +1,340 @@ +/* + --------------------------------------------------------------------------- + Copyright (c) 2002, Dr Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software in both source and binary + form is allowed (with or without changes) provided that: + + 1. distributions of this source code include the above copyright + notice, this list of conditions and the following disclaimer; + + 2. distributions in binary form include the above copyright + notice, this list of conditions and the following disclaimer + in the documentation and/or other associated materials; + + 3. the copyright holder's name is not used to endorse products + built using this software without specific written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this product + may be distributed under the terms of the GNU General Public License (GPL), + in which case the provisions of the GPL apply INSTEAD OF those given above. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. 
+ --------------------------------------------------------------------------- + Issue Date: 01/08/2005 + + This is a byte oriented version of SHA1 that operates on arrays of bytes + stored in memory. +*/ + +#include +#include /* for memcpy() etc. */ + +#include "brg_endian.h" +#include "uts/rng/brg_sha1.h" + +#if defined(__cplusplus) +extern "C" { +#endif + +/** BEGIN: UTS RNG Harness **/ + +void rng_init(RNG_state *newstate, int seed) { + struct sha1_context ctx; + struct state_t gen; + int i; + + for (i = 0; i < 16; i++) + gen.state[i] = 0; + gen.state[16] = 0xFF & (seed >> 24); + gen.state[17] = 0xFF & (seed >> 16); + gen.state[18] = 0xFF & (seed >> 8); + gen.state[19] = 0xFF & (seed >> 0); + + sha1_begin(&ctx); + sha1_hash(gen.state, 20, &ctx); + sha1_end(newstate, &ctx); +} + +void rng_spawn(RNG_state *mystate, RNG_state *newstate, int spawnnumber) { + struct sha1_context ctx; + uint_8t bytes[4]; + + bytes[0] = 0xFF & (spawnnumber >> 24); + bytes[1] = 0xFF & (spawnnumber >> 16); + bytes[2] = 0xFF & (spawnnumber >> 8); + bytes[3] = 0xFF & spawnnumber; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_hash(bytes, 4, &ctx); + sha1_end(newstate, &ctx); +} + +int rng_rand(RNG_state *mystate) { + int r; + uint_32t b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + // printf("b: %d\t, r: %d\n", b, r); + return r; +} + +int rng_nextrand(RNG_state *mystate) { + struct sha1_context ctx; + int r; + uint_32t b; + + sha1_begin(&ctx); + sha1_hash(mystate, 20, &ctx); + sha1_end(mystate, &ctx); + b = (mystate[16] << 24) | (mystate[17] << 16) | (mystate[18] << 8) | (mystate[19] << 0); + b = b & POS_MASK; + + r = (int)b; + return r; +} + +/* condense state into string to display during debugging */ +char *rng_showstate(RNG_state *state, char *s) { + sprintf(s, "%.2X%.2X...", state[0], state[1]); + return s; +} + +/* describe random number generator type into string */ +int rng_showtype(char 
*strBuf, int ind) { + ind += sprintf(strBuf + ind, "SHA-1 (state size = %uB)", (unsigned)sizeof(struct state_t)); + return ind; +} + +/** END: UTS RNG Harness **/ + +#if defined(_MSC_VER) && (_MSC_VER > 800) + #pragma intrinsic(memcpy) +#endif + +#if 0 && defined(_MSC_VER) + #define rotl32 _lrotl + #define rotr32 _lrotr +#else + #define rotl32(x, n) (((x) << n) | ((x) >> (32 - n))) + #define rotr32(x, n) (((x) >> n) | ((x) << (32 - n))) +#endif + +#if !defined(bswap_32) + #define bswap_32(x) ((rotr32((x), 24) & 0x00ff00ff) | (rotr32((x), 8) & 0xff00ff00)) +#endif + +#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) + #define SWAP_BYTES +#else + #undef SWAP_BYTES +#endif + +#if defined(SWAP_BYTES) + #define bsw_32(p, n) \ + { \ + int _i = (n); \ + while (_i--) \ + ((uint_32t *)p)[_i] = bswap_32(((uint_32t *)p)[_i]); \ + } +#else + #define bsw_32(p, n) +#endif + +#define SHA1_MASK (SHA1_BLOCK_SIZE - 1) + +#if 0 + + #define ch(x, y, z) (((x) & (y)) ^ (~(x) & (z))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#else /* Discovered by Rich Schroeppel and Colin Plumb */ + + #define ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) + #define parity(x, y, z) ((x) ^ (y) ^ (z)) + #define maj(x, y, z) (((x) & (y)) | ((z) & ((x) ^ (y)))) + +#endif + +/* Compile 64 bytes of hash data into SHA1 context. 
Note */ +/* that this routine assumes that the byte order in the */ +/* ctx->wbuf[] at this point is in such an order that low */ +/* address bytes in the ORIGINAL byte stream will go in */ +/* this buffer to the high end of 32-bit words on BOTH big */ +/* and little endian systems */ + +#ifdef ARRAY + #define q(v, n) v[n] +#else + #define q(v, n) v##n +#endif + +#define one_cycle(v, a, b, c, d, e, f, k, h) \ + q(v, e) += rotr32(q(v, a), 27) + f(q(v, b), q(v, c), q(v, d)) + k + h; \ + q(v, b) = rotr32(q(v, b), 2) + +#define five_cycle(v, f, k, i) \ + one_cycle(v, 0, 1, 2, 3, 4, f, k, hf(i)); \ + one_cycle(v, 4, 0, 1, 2, 3, f, k, hf(i + 1)); \ + one_cycle(v, 3, 4, 0, 1, 2, f, k, hf(i + 2)); \ + one_cycle(v, 2, 3, 4, 0, 1, f, k, hf(i + 3)); \ + one_cycle(v, 1, 2, 3, 4, 0, f, k, hf(i + 4)) + +VOID_RETURN sha1_compile(sha1_ctx ctx[1]) { + uint_32t *w = ctx->wbuf; + +#ifdef ARRAY + uint_32t v[5]; + memcpy(v, ctx->hash, 5 * sizeof(uint_32t)); +#else + uint_32t v0, v1, v2, v3, v4; + v0 = ctx->hash[0]; + v1 = ctx->hash[1]; + v2 = ctx->hash[2]; + v3 = ctx->hash[3]; + v4 = ctx->hash[4]; +#endif + +#define hf(i) w[i] + + five_cycle(v, ch, 0x5a827999, 0); + five_cycle(v, ch, 0x5a827999, 5); + five_cycle(v, ch, 0x5a827999, 10); + one_cycle(v, 0, 1, 2, 3, 4, ch, 0x5a827999, hf(15)); + +#undef hf +#define hf(i) (w[(i)&15] = rotl32(w[((i) + 13) & 15] ^ w[((i) + 8) & 15] ^ w[((i) + 2) & 15] ^ w[(i)&15], 1)) + + one_cycle(v, 4, 0, 1, 2, 3, ch, 0x5a827999, hf(16)); + one_cycle(v, 3, 4, 0, 1, 2, ch, 0x5a827999, hf(17)); + one_cycle(v, 2, 3, 4, 0, 1, ch, 0x5a827999, hf(18)); + one_cycle(v, 1, 2, 3, 4, 0, ch, 0x5a827999, hf(19)); + + five_cycle(v, parity, 0x6ed9eba1, 20); + five_cycle(v, parity, 0x6ed9eba1, 25); + five_cycle(v, parity, 0x6ed9eba1, 30); + five_cycle(v, parity, 0x6ed9eba1, 35); + + five_cycle(v, maj, 0x8f1bbcdc, 40); + five_cycle(v, maj, 0x8f1bbcdc, 45); + five_cycle(v, maj, 0x8f1bbcdc, 50); + five_cycle(v, maj, 0x8f1bbcdc, 55); + + five_cycle(v, parity, 0xca62c1d6, 60); 
+ five_cycle(v, parity, 0xca62c1d6, 65); + five_cycle(v, parity, 0xca62c1d6, 70); + five_cycle(v, parity, 0xca62c1d6, 75); + +#ifdef ARRAY + ctx->hash[0] += v[0]; + ctx->hash[1] += v[1]; + ctx->hash[2] += v[2]; + ctx->hash[3] += v[3]; + ctx->hash[4] += v[4]; +#else + ctx->hash[0] += v0; + ctx->hash[1] += v1; + ctx->hash[2] += v2; + ctx->hash[3] += v3; + ctx->hash[4] += v4; +#endif +} + +VOID_RETURN sha1_begin(sha1_ctx ctx[1]) { + ctx->count[0] = ctx->count[1] = 0; + ctx->hash[0] = 0x67452301; + ctx->hash[1] = 0xefcdab89; + ctx->hash[2] = 0x98badcfe; + ctx->hash[3] = 0x10325476; + ctx->hash[4] = 0xc3d2e1f0; +} + +/* SHA1 hash data in an array of bytes into hash buffer and */ +/* call the hash_compile function as required. */ + +VOID_RETURN sha1_hash(const unsigned char data[], unsigned long len, sha1_ctx ctx[1]) { + uint_32t pos = (uint_32t)(ctx->count[0] & SHA1_MASK), space = SHA1_BLOCK_SIZE - pos; + const unsigned char *sp = data; + + if ((ctx->count[0] += len) < len) + ++(ctx->count[1]); + + while (len >= space) /* transfer whole blocks if possible */ + { + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, space); + sp += space; + len -= space; + space = SHA1_BLOCK_SIZE; + pos = 0; + bsw_32(ctx->wbuf, SHA1_BLOCK_SIZE >> 2); + sha1_compile(ctx); + } + + memcpy(((unsigned char *)ctx->wbuf) + pos, sp, len); +} + +/* SHA1 final padding and digest calculation */ + +VOID_RETURN sha1_end(unsigned char hval[], sha1_ctx ctx[1]) { + uint_32t i = (uint_32t)(ctx->count[0] & SHA1_MASK); + + /* put bytes in the buffer in an order in which references to */ + /* 32-bit words will put bytes with lower addresses into the */ + /* top of 32 bit words on BOTH big and little endian machines */ + bsw_32(ctx->wbuf, (i + 3) >> 2); + + /* we now need to mask valid bytes and add the padding which is */ + /* a single 1 bit and as many zero bits as necessary. 
Note that */ + /* we can always add the first padding byte here because the */ + /* buffer always has at least one empty slot */ + ctx->wbuf[i >> 2] &= 0xffffff80 << 8 * (~i & 3); + ctx->wbuf[i >> 2] |= 0x00000080 << 8 * (~i & 3); + + /* we need 9 or more empty positions, one for the padding byte */ + /* (above) and eight for the length count. If there is not */ + /* enough space, pad and empty the buffer */ + if (i > SHA1_BLOCK_SIZE - 9) { + if (i < 60) + ctx->wbuf[15] = 0; + sha1_compile(ctx); + i = 0; + } else /* compute a word index for the empty buffer positions */ + i = (i >> 2) + 1; + + while (i < 14) /* and zero pad all but last two positions */ + ctx->wbuf[i++] = 0; + + /* the following 32-bit length fields are assembled in the */ + /* wrong byte order on little endian machines but this is */ + /* corrected later since they are only ever used as 32-bit */ + /* word values. */ + ctx->wbuf[14] = (ctx->count[1] << 3) | (ctx->count[0] >> 29); + ctx->wbuf[15] = ctx->count[0] << 3; + sha1_compile(ctx); + + /* extract the hash value as bytes in case the hash buffer is */ + /* misaligned for 32-bit words */ + for (i = 0; i < SHA1_DIGEST_SIZE; ++i) + hval[i] = (unsigned char)(ctx->hash[i >> 2] >> (8 * (~i & 3))); +} + +VOID_RETURN sha1(unsigned char hval[], const unsigned char data[], unsigned long len) { + sha1_ctx cx[1]; + + sha1_begin(cx); + sha1_hash(data, len, cx); + sha1_end(hval, cx); +} + +#if defined(__cplusplus) +} +#endif \ No newline at end of file diff --git a/benchmark/external/uts/src/uts.c b/benchmark/external/uts/src/uts.c new file mode 100644 index 000000000..507915bea --- /dev/null +++ b/benchmark/external/uts/src/uts.c @@ -0,0 +1,474 @@ +/* + * ---- The Unbalanced Tree Search (UTS) Benchmark ---- + * + * Copyright (c) 2010 See AUTHORS file for copyright holders + * + * This file is part of the unbalanced tree search benchmark. This + * project is licensed under the MIT Open Source license. 
See the LICENSE + * file for copyright and licensing information. + * + * UTS is a collaborative project between researchers at the University of + * Maryland, the University of North Carolina at Chapel Hill, and the Ohio + * State University. See AUTHORS file for more information. + * + */ + +#include +#include +#include +#include +#include + +#include "uts/uts.h" + +/*********************************************************** + * tree generation and search parameters * + * * + * Tree generation strategy is controlled via various * + * parameters set from the command line. The parameters * + * and their default values are given below. * + ***********************************************************/ + +char *uts_trees_str[] = {"Binomial", "Geometric", "Hybrid", "Balanced"}; +char *uts_geoshapes_str[] = {"Linear decrease", "Exponential decrease", "Cyclic", "Fixed branching factor"}; + +/* Tree type + * Trees are generated using a Galton-Watson process, in + * which the branching factor of each node is a random + * variable. + * + * The random variable can follow a binomial distribution + * or a geometric distribution. Hybrid tree are + * generated with geometric distributions near the + * root and binomial distributions towards the leaves. + */ +tree_t type = GEO; // Default tree type +double b_0 = 4.0; // default branching factor at the root +int rootId = 0; // default seed for RNG state at root + +/* Tree type BIN (BINOMIAL) + * The branching factor at the root is specified by b_0. + * The branching factor below the root follows an + * identical binomial distribution at all nodes. + * A node has m children with prob q, or no children with + * prob (1-q). The expected branching factor is q * m. + * + * Default parameter values + */ +int nonLeafBF = 4; // m +double nonLeafProb = 15.0 / 64.0; // q + +/* Tree type GEO (GEOMETRIC) + * The branching factor follows a geometric distribution with + * expected value b. 
+ * The probability that a node has 0 <= n children is p(1-p)^n for + * 0 < p <= 1. The distribution is truncated at MAXNUMCHILDREN. + * The expected number of children b = (1-p)/p. Given b (the + * target branching factor) we can solve for p. + * + * A shape function computes a target branching factor b_i + * for nodes at depth i as a function of the root branching + * factor b_0 and a maximum depth gen_mx. + * + * Default parameter values + */ +int gen_mx = 6; // default depth of tree +geoshape_t shape_fn = LINEAR; // default shape function (b_i decr linearly) + +/* In type HYBRID trees, each node is either type BIN or type + * GEO, with the generation strategy changing from GEO to BIN + * at a fixed depth, expressed as a fraction of gen_mx + */ +double shiftDepth = 0.5; + +/* compute granularity - number of rng evaluations per tree node */ +int computeGranularity = 1; + +/* display parameters */ +int debug = 0; +int verbose = 1; + +/*********************************************************** + * * + * FUNCTIONS * + * * + ***********************************************************/ + +/* fatal error */ +void uts_error(char *str) { + printf("*** Error: %s\n", str); + impl_abort(1); +} + +/* + * wall clock time + * for detailed accounting of work, this needs + * high resolution + */ +double uts_wctime() { + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return (tv.tv_sec + 1E-9 * tv.tv_nsec); +} + +// Interpret 32 bit positive integer as value on [0,1) +double rng_toProb(int n) { + if (n < 0) { + printf("*** toProb: rand n = %d out of range\n", n); + } + return ((n < 0) ? 
0.0 : ((double)n) / 2147483648.0); +} + +void uts_initRoot(Node *root, int type) { + root->type = type; + root->height = 0; + root->numChildren = -1; // means not yet determined + rng_init(root->state.state, rootId); + + if (debug & 1) + printf("root node of type %d at %p\n", type, root); +} + +int uts_numChildren_bin(Node *parent) { + // distribution is identical everywhere below root + int v = rng_rand(parent->state.state); + double d = rng_toProb(v); + + return (d < nonLeafProb) ? nonLeafBF : 0; +} + +int uts_numChildren_geo(Node *parent) { + double b_i = b_0; + int depth = parent->height; + int numChildren, h; + double p, u; + + // use shape function to compute target b_i + if (depth > 0) { + switch (shape_fn) { + + // expected size polynomial in depth + case EXPDEC: + b_i = b_0 * pow((double)depth, -log(b_0) / log((double)gen_mx)); + break; + + // cyclic tree size + case CYCLIC: + if (depth > 5 * gen_mx) { + b_i = 0.0; + break; + } + b_i = pow(b_0, sin(2.0 * 3.141592653589793 * (double)depth / (double)gen_mx)); + break; + + // identical distribution at all nodes up to max depth + case FIXED: + b_i = (depth < gen_mx) ? b_0 : 0; + break; + + // linear decrease in b_i + case LINEAR: + default: + b_i = b_0 * (1.0 - (double)depth / (double)gen_mx); + break; + } + } + + // given target b_i, find prob p so expected value of + // geometric distribution is b_i. 
+ p = 1.0 / (1.0 + b_i); + + // get uniform random number on [0,1) + h = rng_rand(parent->state.state); + u = rng_toProb(h); + + // max number of children at this cumulative probability + // (from inverse geometric cumulative density function) + numChildren = (int)floor(log(1 - u) / log(1 - p)); + + return numChildren; +} + +int uts_numChildren(Node *parent) { + int numChildren = 0; + + /* Determine the number of children */ + switch (type) { + case BIN: + if (parent->height == 0) + numChildren = (int)floor(b_0); + else + numChildren = uts_numChildren_bin(parent); + break; + + case GEO: + numChildren = uts_numChildren_geo(parent); + break; + + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + numChildren = uts_numChildren_geo(parent); + else + numChildren = uts_numChildren_bin(parent); + break; + case BALANCED: + if (parent->height < gen_mx) + numChildren = (int)b_0; + break; + default: + uts_error("parTreeSearch(): Unknown tree type"); + } + + // limit number of children + // only a BIN root can have more than MAXNUMCHILDREN + if (parent->height == 0 && parent->type == BIN) { + int rootBF = (int)ceil(b_0); + if (numChildren > rootBF) { + printf("*** Number of children of root truncated from %d to %d\n", numChildren, rootBF); + numChildren = rootBF; + } + } else if (type != BALANCED) { + if (numChildren > MAXNUMCHILDREN) { + printf("*** Number of children truncated from %d to %d\n", numChildren, MAXNUMCHILDREN); + numChildren = MAXNUMCHILDREN; + } + } + + return numChildren; +} + +int uts_childType(Node *parent) { + switch (type) { + case BIN: + return BIN; + case GEO: + return GEO; + case HYBRID: + if (parent->height < shiftDepth * gen_mx) + return GEO; + else + return BIN; + case BALANCED: + return BALANCED; + default: + uts_error("uts_get_childtype(): Unknown tree type"); + return -1; + } +} + +// construct string with all parameter settings +int uts_paramsToStr(char *strBuf, int ind) { + // version + execution model + ind += sprintf(strBuf + ind, "UTS 
- Unbalanced Tree Search %s (%s)\n", UTS_VERSION, impl_getName()); + + // tree type + ind += sprintf(strBuf + ind, "Tree type: %d (%s)\n", type, uts_trees_str[type]); + + // tree shape parameters + ind += sprintf(strBuf + ind, "Tree shape parameters:\n"); + ind += sprintf(strBuf + ind, " root branching factor b_0 = %.1f, root seed = %d\n", b_0, rootId); + + if (type == GEO || type == HYBRID) { + ind += sprintf(strBuf + ind, + " GEO parameters: gen_mx = %d, shape function = %d (%s)\n", + gen_mx, + shape_fn, + uts_geoshapes_str[shape_fn]); + } + + if (type == BIN || type == HYBRID) { + double q = nonLeafProb; + int m = nonLeafBF; + double es = (1.0 / (1.0 - q * m)); + ind += + sprintf(strBuf + ind, " BIN parameters: q = %f, m = %d, E(n) = %f, E(s) = %.2f\n", q, m, q * m, es); + } + + if (type == HYBRID) { + ind += sprintf( + strBuf + ind, " HYBRID: GEO from root to depth %d, then BIN\n", (int)ceil(shiftDepth * gen_mx)); + } + + if (type == BALANCED) { + ind += sprintf(strBuf + ind, " BALANCED parameters: gen_mx = %d\n", gen_mx); + ind += sprintf(strBuf + ind, + " Expected size: %llu nodes, %llu leaves\n", + (counter_t)((pow(b_0, gen_mx + 1) - 1.0) / (b_0 - 1.0)) /* geometric series */, + (counter_t)pow(b_0, gen_mx)); + } + + // random number generator + ind += sprintf(strBuf + ind, "Random number generator: "); + ind = rng_showtype(strBuf, ind); + ind += sprintf(strBuf + ind, "\nCompute granularity: %d\n", computeGranularity); + + return ind; +} + +// show parameter settings +void uts_printParams() { + char strBuf[5000] = ""; + int ind = 0; + + if (verbose > 0) { + ind = uts_paramsToStr(strBuf, ind); + ind = impl_paramsToStr(strBuf, ind); + printf("%s\n", strBuf); + } +} + +void uts_parseParams(int argc, char *argv[]) { + int i = 1; + int err = -1; + while (i < argc && err == -1) { + if (argv[i][0] == '-' && argv[i][1] == 'h') { + uts_helpMessage(); + impl_abort(0); + + } else if (argv[i][0] != '-' || strlen(argv[i]) != 2 || argc <= i + 1) { + err = i; + break; + } + 
+ // Matched by implementation -- return 0 on success + // This is fragile, don't override parameters in impl_parseParam()! + if (!impl_parseParam(argv[i], argv[i + 1])) { + i += 2; + continue; + } + + switch (argv[i][1]) { + case 'q': + nonLeafProb = atof(argv[i + 1]); + break; + case 'm': + nonLeafBF = atoi(argv[i + 1]); + break; + case 'r': + rootId = atoi(argv[i + 1]); + break; + case 'x': + debug = atoi(argv[i + 1]); + break; + case 'v': + verbose = atoi(argv[i + 1]); + break; + case 't': + type = (tree_t)atoi(argv[i + 1]); + if (type != BIN && type != GEO && type != HYBRID && type != BALANCED) + err = i; + break; + case 'a': + shape_fn = (geoshape_t)atoi(argv[i + 1]); + if (shape_fn > FIXED) + err = i; + break; + case 'b': + b_0 = atof(argv[i + 1]); + break; + case 'd': + gen_mx = atoi(argv[i + 1]); + break; + case 'f': + shiftDepth = atof(argv[i + 1]); + break; + case 'g': + computeGranularity = max(1, atoi(argv[i + 1])); + break; + default: + err = i; + } + + if (err != -1) + break; + + i += 2; + } + + if (err != -1) { + printf("Unrecognized parameter or incorrect/missing value: '%s %s'\n", + argv[i], + (i + 1 < argc) ? 
argv[i + 1] : "[none]"); + printf("Try -h for help.\n"); + impl_abort(4); + } +} + +void uts_helpMessage() { + printf(" UTS - Unbalanced Tree Search %s (%s)\n\n", UTS_VERSION, impl_getName()); + printf(" usage: uts-bin [parameter value] ...\n\n"); + printf(" parameter type description\n"); + printf(" ==== ==== =========================================\n"); + printf("\n Benchmark Parameters:\n"); + printf(" -t int tree type (0: BIN, 1: GEO, 2: HYBRID, 3: BALANCED)\n"); + printf(" -b dble root branching factor\n"); + printf(" -r int root seed 0 <= r < 2^31 \n"); + printf(" -a int GEO: tree shape function \n"); + printf(" -d int GEO, BALANCED: tree depth\n"); + printf(" -q dble BIN: probability of non-leaf node\n"); + printf(" -m int BIN: number of children for non-leaf node\n"); + printf(" -f dble HYBRID: fraction of depth for GEO -> BIN transition\n"); + printf(" -g int compute granularity: number of rng_spawns per node\n"); + printf(" -v int nonzero to set verbose output\n"); + printf(" -x int debug level\n"); + + // Get help message from the implementation + printf("\n Additional Implementation Parameters:\n"); + impl_helpMessage(); + printf("\n"); +} + +void uts_showStats( + int nPes, int chunkSize, double walltime, counter_t nNodes, counter_t nLeaves, counter_t maxDepth) { + // summarize execution info for machine consumption + if (verbose == 0) { + printf("%4d %7.3f %9llu %7.0llu %7.0llu %d %d %.2f %d %d %1d %f %3d\n", + nPes, + walltime, + nNodes, + (long long)(nNodes / walltime), + (long long)((nNodes / walltime) / nPes), + chunkSize, + type, + b_0, + rootId, + gen_mx, + shape_fn, + nonLeafProb, + nonLeafBF); + } + + // summarize execution info for human consumption + else { + printf("Tree size = %llu, tree depth = %llu, num leaves = %llu (%.2f%%)\n", + nNodes, + maxDepth, + nLeaves, + nLeaves / (float)nNodes * 100.0); + printf("Wallclock time = %.3f sec, performance = %.0f nodes/sec (%.0f nodes/sec per PE)\n\n", + walltime, + (nNodes / walltime), + (nNodes / 
walltime / nPes)); + } +} + +// --------------------------------------------------------------------- // + +// The name of this implementation +char *impl_getName() { return "Sequential Recursive Search"; } + +int impl_paramsToStr(char *strBuf, int ind) { + ind += sprintf(strBuf + ind, "Execution strategy: %s\n", impl_getName()); + return ind; +} + +// Not using UTS command line params, return non-success +int impl_parseParam(char *param, char *value) { + return 1; + (void)param; + (void)value; +} + +void impl_helpMessage() { printf(" none.\n"); } + +void impl_abort(int err) { exit(err); } \ No newline at end of file diff --git a/benchmark/lib/CMakeLists.txt b/benchmark/lib/CMakeLists.txt new file mode 100644 index 000000000..46fd98c80 --- /dev/null +++ b/benchmark/lib/CMakeLists.txt @@ -0,0 +1,23 @@ +add_library(benchmark_common) + +target_compile_features(benchmark_common PUBLIC cxx_std_26) + +target_sources(benchmark_common + PRIVATE + uts.cpp + PUBLIC + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + fib.hpp + uts.hpp + macros.hpp +) + +add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/../external/uts external/uts) + +target_link_libraries(benchmark_common + PUBLIC + benchmark::benchmark + uts_c +) diff --git a/benchmark/lib/fib.hpp b/benchmark/lib/fib.hpp new file mode 100644 index 000000000..08e11ec80 --- /dev/null +++ b/benchmark/lib/fib.hpp @@ -0,0 +1,31 @@ +#pragma once + +#ifdef LF_BENCH_NO_IMPORT_STD + #include +#else +import std; +#endif + +inline constexpr int fib_test = 8; +inline constexpr int fib_base = 37; + +/** + * @brief Non-recursive Fibonacci calculation + */ +constexpr auto fib_ref(std::int64_t n) -> std::int64_t { + + if (n < 2) { + return n; + } + + std::int64_t prev = 0; + std::int64_t curr = 1; + + for (std::int64_t i = 2; i <= n; ++i) { + std::int64_t next = prev + curr; + prev = curr; + curr = next; + } + + return curr; +} diff --git a/benchmark/lib/macros.hpp b/benchmark/lib/macros.hpp new file mode 100644 index 
000000000..a32642f43 --- /dev/null +++ b/benchmark/lib/macros.hpp @@ -0,0 +1,172 @@ +#pragma once + +#include + +// Use `import std;` by default. Textually `#include ` drags in +// ``, which triggers a libc++ 22 link-time bug (undefined +// `__atomic_unique_lock::__set_locked_bit`) in TUs that later instantiate +// anything touching std::stop_*. Targets that can't use modules (e.g. the +// openmp benchmarks, see benchmark/src/openmp/CMakeLists.txt) define +// LF_BENCH_NO_IMPORT_STD and get textual includes instead. +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include + #include + #include +#else +import std; +#endif + +#define BENCH_GET_FN(bench_fn, ...) bench_fn __VA_OPT__(<__VA_ARGS__>) + +namespace lf_bench { + +inline void bench_thread_args(benchmark::Benchmark *bench, auto make_args) { + unsigned hw = std::max(1U, std::thread::hardware_concurrency()); + for (unsigned t : {1U, 2U, 4U, 6U, 8U, 12U, 16U, 24U, 32U, 48U, 64U, 96U}) { + if (t > hw) { + return; + } + make_args(bench, t); + } +} + +inline auto sanitize(std::string s) -> std::string { + s.erase(std::remove(s.begin(), s.end(), ' '), s.end()); + return s; +} + +inline auto +format_name(std::string mode, std::string category, std::string name, std::string args) -> std::string { + std::string res = sanitize(mode) + "/" + sanitize(category) + "/" + sanitize(name); + std::string s_args = sanitize(args); + if (!s_args.empty()) { + res += "/" + s_args; + } + return res; +} + +inline auto inverse_complexity(benchmark::IterationCount n) -> double { return 1.0 / static_cast(n); } + +inline void setup_single(benchmark::Benchmark *b, std::int64_t size) { b->Arg(size)->UseRealTime(); } + +inline void setup_mt(benchmark::Benchmark *b, std::int64_t size) { + b->Apply([size](benchmark::Benchmark *bm) { + bench_thread_args(bm, [size](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Args({size, static_cast(t)}); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +inline void 
setup_uts_mt(benchmark::Benchmark *b) { + b->Apply([](benchmark::Benchmark *bm) { + bench_thread_args(bm, [](benchmark::Benchmark *inner_b, unsigned t) { + inner_b->Arg(static_cast(t)); + }); + }) + ->Complexity(inverse_complexity) + ->UseRealTime(); +} + +} // namespace lf_bench + +// --- Standard Benchmarks --- + +#define BENCH_ONE_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_single(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_HIDDEN(id, ...) BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL(bench_fn, category, name, prefix, ...) \ + BENCH_ONE(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- Multi-Threaded Benchmarks --- + +#define BENCH_ONE_MT_WITH_ID(id, bench_fn, category, name, mode, prefix, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark(lf_bench::format_name(#mode, #category, #name, #__VA_ARGS__), \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)); \ + lf_bench::setup_mt(b, prefix##_##mode); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define BENCH_ONE_MT_HIDDEN(id, ...) BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define BENCH_ONE_MT(bench_fn, category, name, mode, prefix, ...) \ + BENCH_ONE_MT_HIDDEN(__COUNTER__, bench_fn, category, name, mode, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define BENCH_ALL_MT(bench_fn, category, name, prefix, ...) 
\ + BENCH_ONE_MT(bench_fn, category, name, test, prefix __VA_OPT__(, ) __VA_ARGS__) \ + BENCH_ONE_MT(bench_fn, category, name, base, prefix __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Benchmarks --- + +#define UTS_BENCH_ONE_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) \ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + b->UseRealTime(); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_HIDDEN(id, ...) UTS_BENCH_ONE_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_HIDDEN(__COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL(bench_fn, category, ...) \ + UTS_BENCH_ONE(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) + +// --- UTS Multi-Threaded Benchmarks --- + +#define UTS_BENCH_ONE_MT_WITH_ID(id, bench_fn, category, mode, tree_name, tree_id, ...) 
\ + namespace { \ + struct benchmark_reg_##id { \ + benchmark_reg_##id() { \ + auto *b = benchmark::RegisterBenchmark( \ + lf_bench::format_name(#mode, #category, "uts/" tree_name, #__VA_ARGS__), \ + [=](benchmark::State &state) { \ + BENCH_GET_FN(bench_fn __VA_OPT__(, ) __VA_ARGS__)(state, tree_id); \ + }); \ + lf_bench::setup_uts_mt(b); \ + } \ + } benchmark_reg_inst_##id; \ + } + +#define UTS_BENCH_ONE_MT_HIDDEN(id, ...) UTS_BENCH_ONE_MT_WITH_ID(id __VA_OPT__(, ) __VA_ARGS__) +#define UTS_BENCH_ONE_MT(bench_fn, category, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT_HIDDEN( \ + __COUNTER__, bench_fn, category, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define UTS_BENCH_ALL_MT(bench_fn, category, ...) \ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T1_mini", uts_t1_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, test, "T3_mini", uts_t3_mini __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T1", uts_t1 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, base, "T3", uts_t3 __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T1L", uts_t1l __VA_OPT__(, ) __VA_ARGS__) \ + UTS_BENCH_ONE_MT(bench_fn, category, large, "T3L", uts_t3l __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/lib/uts.cpp b/benchmark/lib/uts.cpp new file mode 100644 index 000000000..90964d48c --- /dev/null +++ b/benchmark/lib/uts.cpp @@ -0,0 +1,159 @@ +#include "uts.hpp" + +#ifdef LF_BENCH_NO_IMPORT_STD + #include +#else +import std; +#endif + +namespace { + +void reset_uts() { + type = GEO; + b_0 = 4.0; + rootId = 0; + nonLeafBF = 4; + nonLeafProb = 15.0 / 64.0; + gen_mx = 6; + shape_fn = LINEAR; + shiftDepth = 0.5; + computeGranularity = 1; + debug = 0; + verbose = 1; +} + +// (T1 mini) Geometric +void setup_t1_mini() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 7; + b_0 = 4; + rootId = 19; +} + +// (T1) Geometric +void setup_t1() { + reset_uts(); 
+ type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 10; + b_0 = 4; + rootId = 19; +} + +// (T1L) Geometric +void setup_t1l() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 13; + b_0 = 4; + rootId = 29; +} + +// (T1XXL) +void setup_t1xxl() { + reset_uts(); + type = static_cast(1); + shape_fn = static_cast(3); + gen_mx = 15; + b_0 = 4; + rootId = 19; +} + +// (T3 mini) +void setup_t3_mini() { + reset_uts(); + type = static_cast(0); + b_0 = 20; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3) Binomial +void setup_t3() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 8; + nonLeafProb = 0.124875; + rootId = 42; +} + +// (T3L) Binomial +void setup_t3l() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 5; + nonLeafProb = 0.200014; + rootId = 7; +} + +// (T3XXL) Binomial +void setup_t3xxl() { + reset_uts(); + type = static_cast(0); + b_0 = 2000; + nonLeafBF = 2; + nonLeafProb = 0.499995; + rootId = 316; +} + +} // namespace + +void setup_tree(uts_tree tree) { + switch (tree) { + case uts_t1_mini: + setup_t1_mini(); + break; + case uts_t1: + setup_t1(); + break; + case uts_t1l: + setup_t1l(); + break; + case uts_t1xxl: + setup_t1xxl(); + break; + case uts_t3_mini: + setup_t3_mini(); + break; + case uts_t3: + setup_t3(); + break; + case uts_t3l: + setup_t3l(); + break; + case uts_t3xxl: + setup_t3xxl(); + break; + default: + std::terminate(); + } +} + +auto expected_result(uts_tree tree) -> result { + switch (tree) { + case uts_t1_mini: + return {.maxdepth = 7, .size = 63914, .leaves = 51124}; + case uts_t1: + return {.maxdepth = 10, .size = 4130071, .leaves = 3305118}; + case uts_t1l: + return {.maxdepth = 13, .size = 102181082, .leaves = 81746377}; + case uts_t1xxl: + return {.maxdepth = 15, .size = 4230646601, .leaves = 3384495738}; + case uts_t3_mini: + return {.maxdepth = 67, .size = 6213, .leaves = 5438}; + case uts_t3: + return {.maxdepth = 1572, .size = 
4112897, .leaves = 3599034}; + case uts_t3l: + return {.maxdepth = 17844, .size = 111345631, .leaves = 89076904}; + case uts_t3xxl: + return {.maxdepth = 99049, .size = 2793220501, .leaves = 1396611250}; + default: + std::terminate(); + } +} diff --git a/benchmark/lib/uts.hpp b/benchmark/lib/uts.hpp new file mode 100644 index 000000000..2d3a2065f --- /dev/null +++ b/benchmark/lib/uts.hpp @@ -0,0 +1,50 @@ +#pragma once + +// Include the C UTS library header first (it defines max/min macros that would +// clash with std::max/std::min after import std). +#include "uts/uts.h" + +#undef max +#undef min + +#ifdef LF_BENCH_NO_IMPORT_STD + #include + #include +#else +import std; +#endif + +struct result { + counter_t maxdepth; + counter_t size; + counter_t leaves; + auto operator<=>(const result &) const = default; +}; + +template <> +struct std::formatter : std::formatter { + auto format(const result &r, auto &ctx) const { + return std::formatter::format( + std::format("{{maxdepth={}, size={}, leaves={}}}", r.maxdepth, r.size, r.leaves), ctx); + } +}; + +struct pair { + result res; + Node child; +}; + +enum uts_tree : char { + uts_t1_mini, // Geometric [fixed], ~64K nodes (test only) + uts_t1, // Geometric [fixed], ~4M nodes + uts_t1l, // Geometric [fixed], ~102M nodes + uts_t1xxl, // Geometric [fixed], ~4.2B nodes + uts_t3_mini, // Binomial, ~6K nodes (test only) + uts_t3, // Binomial, ~4M nodes + uts_t3l, // Binomial, ~111M nodes + uts_t3xxl, // Binomial, ~2.8B nodes +}; + +void setup_tree(uts_tree tree); + +auto expected_result(uts_tree tree) -> result; diff --git a/benchmark/src/baremetal/CMakeLists.txt b/benchmark/src/baremetal/CMakeLists.txt new file mode 100644 index 000000000..ea6f1e69f --- /dev/null +++ b/benchmark/src/baremetal/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(baremetal_benchmarks) + +target_sources(baremetal_benchmarks PRIVATE fib.cpp) + +target_link_libraries(baremetal_benchmarks PUBLIC benchmark_common libfork::libfork) diff --git 
a/benchmark/src/baremetal/fib.cpp b/benchmark/src/baremetal/fib.cpp new file mode 100644 index 000000000..247336594 --- /dev/null +++ b/benchmark/src/baremetal/fib.cpp @@ -0,0 +1,177 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// ==== Allocators ==== // + +[[nodiscard]] +inline auto fib_align_size(std::size_t n) -> std::size_t { + constexpr std::size_t align = __STDCPP_DEFAULT_NEW_ALIGNMENT__; + return (n + align - 1) & ~(align - 1); +} + +constinit inline thread_local std::byte *tls_bump_ptr = nullptr; + +struct task { + struct promise_type { + + static auto operator new(std::size_t sz) -> void * { + auto *prev = tls_bump_ptr; + tls_bump_ptr += fib_align_size(sz); + return prev; + } + + static auto operator delete(void *p, [[maybe_unused]] std::size_t sz) noexcept -> void { + tls_bump_ptr = std::bit_cast(p); + } + + auto get_return_object() -> task { return {std::coroutine_handle::from_promise(*this)}; } + + auto initial_suspend() -> std::suspend_always { return {}; } + + auto final_suspend() noexcept { + struct final_awaitable : std::suspend_always { + auto await_suspend(std::coroutine_handle h) noexcept -> std::coroutine_handle<> { + + std::coroutine_handle<> cont = h.promise().continuation; + + h.destroy(); + + if (cont) { + return cont; + } + + return std::noop_coroutine(); + } + }; + + return final_awaitable{}; + } + + void return_value(std::int64_t val) { *value = val; } + void unhandled_exception() { std::terminate(); } + + std::int64_t *value = nullptr; + std::coroutine_handle<> continuation = nullptr; + }; + + std::coroutine_handle coro; + + auto set(std::int64_t &out) -> task & { + coro.promise().value = &out; + return *this; + } + + auto await_ready() noexcept -> bool { return false; } + + auto await_suspend(std::coroutine_handle<> h) -> std::coroutine_handle { + coro.promise().continuation = h; + return coro; + } + + void await_resume() noexcept {} +}; + +auto 
fib(std::int64_t n) -> task { + if (n <= 1) { + co_return n; + } + std::int64_t a = 0; + std::int64_t b = 0; + co_await fib(n - 2).set(a); + co_await fib(n - 1).set(b); + co_return a + b; +} + +template +void fib_coro_no_queue(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + // 8MB stack + std::unique_ptr buffer = std::make_unique(1024 * 1024 * 8); + tls_bump_ptr = buffer.get(); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = 0; + fib(n).set(result).coro.resume(); + + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + + benchmark::DoNotOptimize(result); + } + + if (tls_bump_ptr != buffer.get()) { + std::terminate(); // Stack leak + } +} + +// === Recursive with Deque overhead + +constinit inline thread_local lf::deque *tls_deque = nullptr; + +auto deque() -> lf::deque & { return *tls_deque; } + +auto fib_recursive_deque_impl(std::int64_t n) -> std::int64_t { + if (n <= 1) { + return n; + } + + // Emulate work item creation/scheduling overhead + deque().push(n); + std::int64_t a = fib_recursive_deque_impl(n - 2); + deque().pop(); + + std::int64_t b = fib_recursive_deque_impl(n - 1); + + return a + b; +} + +template +void fib_recursive_deque(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + lf::deque deque{64}; + tls_deque = &deque; + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = fib_recursive_deque_impl(n); + + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + + benchmark::DoNotOptimize(result); + } + + tls_deque = nullptr; +} + +} // namespace + +// Minimal coroutine, bump allocated (thread-local) stack +BENCH_ALL(fib_coro_no_queue, baremetal, coro, fib) + 
+BENCH_ALL(fib_recursive_deque, baremetal, deque, fib) diff --git a/benchmark/src/benchmarks.cpp b/benchmark/src/benchmarks.cpp new file mode 100644 index 000000000..d6cf26f54 --- /dev/null +++ b/benchmark/src/benchmarks.cpp @@ -0,0 +1 @@ +// Benchmarks are registered in the linked sub-libraries. diff --git a/benchmark/src/libfork/CMakeLists.txt b/benchmark/src/libfork/CMakeLists.txt new file mode 100644 index 000000000..2e2def872 --- /dev/null +++ b/benchmark/src/libfork/CMakeLists.txt @@ -0,0 +1,17 @@ +add_library(libfork_benchmarks) + +target_sources(libfork_benchmarks + PRIVATE + fib.cpp uts.cpp + PRIVATE + FILE_SET HEADERS + BASE_DIRS ${CMAKE_CURRENT_SOURCE_DIR} + FILES + helpers.hpp +) + +target_link_libraries(libfork_benchmarks + PUBLIC + benchmark_common + libfork::libfork +) diff --git a/benchmark/src/libfork/fib.cpp b/benchmark/src/libfork/fib.cpp new file mode 100644 index 000000000..beaace9c7 --- /dev/null +++ b/benchmark/src/libfork/fib.cpp @@ -0,0 +1,94 @@ +#include + +#include "fib.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +struct fib { + template + static auto operator()(lf::env, std::int64_t n) -> lf::task { + if (n < 2) { + co_return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + auto sc = co_await lf::scope(); + + co_await sc.fork(&rhs, fib{}, n - 2); + co_await sc.call(&lhs, fib{}, n - 1); + + co_await sc.join(); + + co_return lhs + rhs; + } +}; + +template +void run(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + state.counters["p"] = static_cast(thread_count(state)); + state.SetComplexityN(static_cast(thread_count(state))); + + Sch scheduler = make_scheduler(state); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + lf::receiver recv = lf::schedule(scheduler, fib{}, n); + std::int64_t return_value = std::move(recv).get(); + + if (return_value != expect) { + 
state.SkipWithError(std::format("incorrect result: {} != {}", return_value, expect)); + break; + } + + benchmark::DoNotOptimize(return_value); + } +} + +} // namespace + +using lf::adapt_deque; +using lf::adapt_vector; + +using lf::adaptor_stack; +using lf::geometric_stack; +using lf::slab_stack; + +// -- Vector + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_vector<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_vector<>>) + +// -- Deque + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL(run, fib, fib, lf::mono_inline_scheduler, adapt_deque<>>) +LIBFORK_BENCH_ALL(run, fib, fib, lf::poly_inline_scheduler, adapt_deque<>>) + +LIBFORK_BENCH_ALL_MT(run, fib, fib, mono_busy_pool) +LIBFORK_BENCH_ALL_MT(run, fib, fib, poly_busy_pool) diff --git a/benchmark/src/libfork/helpers.hpp b/benchmark/src/libfork/helpers.hpp new file mode 100644 index 000000000..8fbef45dc --- /dev/null +++ b/benchmark/src/libfork/helpers.hpp @@ -0,0 +1,41 @@ +#pragma once + +#include + +#include "macros.hpp" + +import std; + +import libfork; + +template +auto thread_count(benchmark::State &state) -> std::size_t { + if constexpr (std::constructible_from) { + return static_cast(state.range(1)); + } else { + return 1; + } +} + +template +auto make_scheduler(benchmark::State &state) -> Sch { + if constexpr (std::constructible_from) { + return 
Sch{static_cast(state.range(1))}; + } else { + return Sch{}; + } +} + +using mono_busy_pool = lf::mono_busy_pool>; +using poly_busy_pool = lf::poly_busy_pool>; + +#define LIBFORK_BENCH_ALL(bench_fn, name, prefix, ...) \ + BENCH_ALL(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_BENCH_ALL_MT(bench_fn, name, prefix, ...) \ + BENCH_ALL_MT(bench_fn, libfork, name, prefix __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ONE_MT(bench_fn, mode, tree_name, tree_id, ...) \ + UTS_BENCH_ONE_MT(bench_fn, libfork, mode, tree_name, tree_id __VA_OPT__(, ) __VA_ARGS__) + +#define LIBFORK_UTS_BENCH_ALL_MT(bench_fn, ...) UTS_BENCH_ALL_MT(bench_fn, libfork __VA_OPT__(, ) __VA_ARGS__) diff --git a/benchmark/src/libfork/uts.cpp b/benchmark/src/libfork/uts.cpp new file mode 100644 index 000000000..91ef3f793 --- /dev/null +++ b/benchmark/src/libfork/uts.cpp @@ -0,0 +1,93 @@ +#include + +#include "uts.hpp" + +#include "helpers.hpp" + +import std; + +import libfork; + +// === Coroutine + +namespace { + +// TODO: try a version that uses try_fork + +struct uts_fn { + template + static auto operator()(lf::env, int depth, Node *parent) -> lf::task { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + auto sc = co_await lf::scope(); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + co_await sc.call(&cs[i].res, uts_fn{}, depth + 1, &cs[i].child); + } else { + co_await sc.fork(&cs[i].res, uts_fn{}, depth + 1, 
&cs[i].child); + } + } + + co_await sc.join(); + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + co_return r; + } +}; + +template +void run(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + std::size_t threads = static_cast(state.range(0)); + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + Sch scheduler = Sch{threads}; + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + lf::receiver recv = lf::schedule(scheduler, uts_fn{}, 0, &root); + result r = std::move(recv).get(); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +LIBFORK_UTS_BENCH_ALL_MT(run, mono_busy_pool) +LIBFORK_UTS_BENCH_ALL_MT(run, poly_busy_pool) diff --git a/benchmark/src/openmp/CMakeLists.txt b/benchmark/src/openmp/CMakeLists.txt new file mode 100644 index 000000000..035d5326f --- /dev/null +++ b/benchmark/src/openmp/CMakeLists.txt @@ -0,0 +1,23 @@ +add_library(openmp_benchmarks) + +# OpenMP compiles with -fopenmp which conflicts with the shared std.pcm (built +# without OpenMP). Disable module scanning so CMake doesn't inject the +# incompatible modmap for this target. +set_target_properties(openmp_benchmarks PROPERTIES CXX_SCAN_FOR_MODULES OFF) + +# TODO: remove this hack when we have LLVM 23 + +# Signal to shared benchmark headers that this target cannot `import std;` +# and must use textual standard headers instead. 
+target_compile_definitions(openmp_benchmarks PRIVATE LF_BENCH_NO_IMPORT_STD) + +target_sources(openmp_benchmarks + PRIVATE + fib.cpp uts.cpp +) + +target_link_libraries(openmp_benchmarks + PUBLIC + benchmark_common + OpenMP::OpenMP_CXX +) diff --git a/benchmark/src/openmp/fib.cpp b/benchmark/src/openmp/fib.cpp new file mode 100644 index 000000000..4643ee67d --- /dev/null +++ b/benchmark/src/openmp/fib.cpp @@ -0,0 +1,59 @@ +#include +#include + +#include + +#include "fib.hpp" +#include "macros.hpp" + +namespace { + +auto fib(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + +#pragma omp task untied shared(lhs) firstprivate(n) default(none) + lhs = fib(n - 2); + + rhs = fib(n - 1); + +#pragma omp taskwait + return lhs + rhs; +} + +template +void fib_run(benchmark::State &state) { + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + int threads = static_cast(state.range(1)); + + state.counters["n"] = static_cast(n); + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t return_value = 0; + +#pragma omp parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + return_value = fib(n); + } + + if (return_value != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", return_value, expect)); + break; + } + + benchmark::DoNotOptimize(return_value); + } +} + +} // namespace + +BENCH_ALL_MT(fib_run, openmp, fib, fib) diff --git a/benchmark/src/openmp/uts.cpp b/benchmark/src/openmp/uts.cpp new file mode 100644 index 000000000..b91d9e1e5 --- /dev/null +++ b/benchmark/src/openmp/uts.cpp @@ -0,0 +1,86 @@ +#include +#include +#include +#include + +#include + +#include "macros.hpp" +#include "uts.hpp" + +namespace { + +auto uts(int depth, Node *parent) -> result { + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = 
counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + if (i + 1 == static_cast(num_children)) { + cs[i].res = uts(depth + 1, &cs[i].child); + } else { +#pragma omp task untied shared(cs) firstprivate(depth, i) default(none) + cs[i].res = uts(depth + 1, &cs[i].child); + } + } + +#pragma omp taskwait + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_run(benchmark::State &state, uts_tree tree) { + int threads = static_cast(state.range(0)); + + setup_tree(tree); + auto expect = expected_result(tree); + + state.counters["p"] = static_cast(threads); + state.SetComplexityN(static_cast(threads)); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r; + +#pragma omp parallel num_threads(threads) default(shared) +#pragma omp single nowait + { + r = uts(0, &root); + } + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL_MT(uts_run, openmp) diff --git a/benchmark/src/serial/CMakeLists.txt b/benchmark/src/serial/CMakeLists.txt new file mode 100644 index 000000000..80c53454c --- /dev/null +++ b/benchmark/src/serial/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(serial_benchmarks) + +target_sources(serial_benchmarks PRIVATE fib.cpp uts.cpp) + +target_link_libraries(serial_benchmarks PUBLIC benchmark_common) 
diff --git a/benchmark/src/serial/fib.cpp b/benchmark/src/serial/fib.cpp new file mode 100644 index 000000000..2b65104ef --- /dev/null +++ b/benchmark/src/serial/fib.cpp @@ -0,0 +1,78 @@ +#include + +#include "fib.hpp" +#include "macros.hpp" + +import std; + +namespace { + +auto fib_impl(std::int64_t &ret, std::int64_t n) -> void { + if (n < 2) { + ret = n; + return; + } + + std::int64_t lhs = 0; + std::int64_t rhs = 0; + + fib_impl(lhs, n - 2); + fib_impl(rhs, n - 1); + + ret = lhs + rhs; +} + +template +void fib_serial(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = 0; + fib_impl(result, n); + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + benchmark::DoNotOptimize(result); + } +} + +auto fib_ret_impl(std::int64_t n) -> std::int64_t { + if (n < 2) { + return n; + } + + std::int64_t lhs = fib_ret_impl(n - 1); + std::int64_t rhs = fib_ret_impl(n - 2); + + return lhs + rhs; +} + +template +void fib_serial_return(benchmark::State &state) { + + std::int64_t n = state.range(0); + std::int64_t expect = fib_ref(n); + + state.counters["n"] = static_cast(n); + + for (auto _ : state) { + benchmark::DoNotOptimize(n); + std::int64_t result = fib_ret_impl(n); + if (result != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", result, expect)); + break; + } + benchmark::DoNotOptimize(result); + } +} + +} // namespace + +BENCH_ALL(fib_serial, serial, fib, fib) +BENCH_ALL(fib_serial_return, serial, fib / return, fib) diff --git a/benchmark/src/serial/uts.cpp b/benchmark/src/serial/uts.cpp new file mode 100644 index 000000000..291934e79 --- /dev/null +++ b/benchmark/src/serial/uts.cpp @@ -0,0 +1,125 @@ +#include + +#include "macros.hpp" +#include "uts.hpp" + +import std; + +namespace { + +auto uts_traverse(int 
depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + std::vector cs(static_cast(num_children)); + + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + cs[i].child.type = child_type; + cs[i].child.height = parent->height + 1; + cs[i].child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs[i].child.state.state, static_cast(i)); + } + + cs[i].res = uts_traverse(depth + 1, &cs[i].child); + } + + for (auto &&elem : cs) { + r.maxdepth = std::max(r.maxdepth, elem.res.maxdepth); + r.size += elem.res.size; + r.leaves += elem.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r = uts_traverse(0, &root); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL(uts_serial, serial) + +namespace { + +auto uts_traverse_no_alloc(int depth, Node *parent) -> result { + + result r{.maxdepth = static_cast(depth), .size = counter_t{1}, .leaves = counter_t{0}}; + + int num_children = uts_numChildren(parent); + int child_type = uts_childType(parent); + + parent->numChildren = num_children; + + if (num_children > 0) { + for (std::size_t i = 0; i < static_cast(num_children); ++i) { + + pair cs; + + cs.child.type = child_type; + cs.child.height = parent->height + 1; + cs.child.numChildren = -1; + + for (int j = 0; j < computeGranularity; ++j) { + rng_spawn(parent->state.state, cs.child.state.state, static_cast(i)); + } + + cs.res = uts_traverse(depth + 1, 
&cs.child); + + r.maxdepth = std::max(r.maxdepth, cs.res.maxdepth); + r.size += cs.res.size; + r.leaves += cs.res.leaves; + } + } else { + r.leaves = 1; + } + + return r; +} + +void uts_serial_no_alloc(benchmark::State &state, uts_tree tree) { + setup_tree(tree); + auto expect = expected_result(tree); + + for (auto _ : state) { + Node root; + uts_initRoot(&root, type); + result r = uts_traverse_no_alloc(0, &root); + + if (r != expect) { + state.SkipWithError(std::format("incorrect result: {} != {}", r, expect)); + break; + } + + benchmark::DoNotOptimize(r); + } +} + +} // namespace + +UTS_BENCH_ALL(uts_serial_no_alloc, serial / no_alloc) diff --git a/cmake/gcc-brew-toolchain.cmake b/cmake/gcc-brew-toolchain.cmake new file mode 100644 index 000000000..aa67ccaaf --- /dev/null +++ b/cmake/gcc-brew-toolchain.cmake @@ -0,0 +1,92 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew GCC@15 & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- GCC + +execute_process( + COMMAND ${BREW_EXE} --prefix gcc + OUTPUT_VARIABLE GCC_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES gcc-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES g++-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES gcc-ar-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES gcc-ranlib-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES gcc-nm-HEAD + HINTS "${GCC_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- Binutils + +execute_process( + COMMAND ${BREW_EXE} --prefix binutils + OUTPUT_VARIABLE BINUTILS_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE) +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -B${BINUTILS_PREFIX}/bin/" CACHE STRING "" FORCE) + + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/cmake/llvm-brew-toolchain.cmake b/cmake/llvm-brew-toolchain.cmake new file mode 100644 index 000000000..199bdae34 --- /dev/null +++ b/cmake/llvm-brew-toolchain.cmake @@ -0,0 +1,88 @@ +cmake_minimum_required(VERSION 4.2.1) + +# Set up Homebrew LLVM & Ninja toolchain for CMake + +find_program(BREW_EXE brew) + +if(NOT BREW_EXE) + message(FATAL_ERROR "Could not find 'brew' executable. 
Please install Homebrew.") +endif() + +# --- Ninja + +execute_process( + COMMAND ${BREW_EXE} --prefix ninja + OUTPUT_VARIABLE NINJA_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_MAKE_PROGRAM + NAMES ninja + HINTS "${NINJA_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# --- LLVM + +execute_process( + COMMAND ${BREW_EXE} --prefix llvm + OUTPUT_VARIABLE LLVM_PREFIX + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY +) + +find_program(CMAKE_C_COMPILER + NAMES clang + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_CXX_COMPILER + NAMES clang++ + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_AR + NAMES llvm-ar + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_RANLIB + NAMES llvm-ranlib + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +find_program(CMAKE_NM + NAMES llvm-nm + HINTS "${LLVM_PREFIX}/bin" + NO_DEFAULT_PATH + REQUIRED +) + +# Dynamically find the standard library modules JSON, brew puts it in the wrong place +file(GLOB_RECURSE LIBCXX_MODULES_JSON "${LLVM_PREFIX}/lib/**/libc++.modules.json") + +if(LIBCXX_MODULES_JSON) + set(CMAKE_CXX_STDLIB_MODULES_JSON "${LIBCXX_MODULES_JSON}") +else() + message(FATAL_ERROR "Could not automatically find libc++.modules.json in ${LLVM_PREFIX}") +endif() + +# Get macOS SDK path (only on macOS) +if(APPLE) + execute_process( + COMMAND xcrun --show-sdk-path + OUTPUT_VARIABLE CMAKE_OSX_SYSROOT + OUTPUT_STRIP_TRAILING_WHITESPACE + COMMAND_ERROR_IS_FATAL ANY + ) +endif() diff --git a/docs/api.md b/docs/api.md new file mode 100644 index 000000000..edf767d82 --- /dev/null +++ b/docs/api.md @@ -0,0 +1,287 @@ +# libfork Public API + +All symbols live in the `lf` namespace. Access them via `import libfork;`. + +--- + +## Core concepts + +### `concept returnable` — `:task` + +`T` is `void` or a `std::movable` plain object type. 
Used as the return-type constraint on async functions. + +### `concept worker_stack` — `:concepts_stack` + +A type that provides a contiguous stack with `push`, `pop`, `checkpoint`, `prepare_release`, `release`, and `acquire`. + +### `concept lifo_stack` — `:concepts_context` + +`T` is a plain object type supporting `push(U)` and a `noexcept pop() -> U`. Used to define `worker_context`. + +### `concept worker_context` — `:concepts_context` + +A type that satisfies `lifo_stack>` and exposes a `worker_stack` via a `noexcept stack()`. + +### `using stack_t` — `:concepts_context` + +Extracts the stack type from a `worker_context`. + +### `concept has_context_typedef` — `:concepts_scheduler` + +`T` has a `context_type` member typedef. Used to define `scheduler` and constrain `context_t`. + +### `concept scheduler` — `:concepts_scheduler` + +A type satisfying `has_context_typedef` with a `post(sched_handle)` method. + +### `using context_t` — `:concepts_scheduler` + +Extracts `T::context_type`. Requires `has_context_typedef`. + +### `concept async_invocable` — `:concepts_invocable` + +`Fn` is callable with an `env` (or without one) and `Args...`, returning an `lf::task`. + +### `concept async_nothrow_invocable` — `:concepts_invocable` + +Subsumes `async_invocable` and requires the call to be `noexcept`. + +### `using async_result_t` — `:concepts_invocable` + +The `value_type` of the `task` returned by invoking `Fn`. + +### `concept async_invocable_to` — `:concepts_invocable` + +Subsumes `async_invocable` and constrains the result type to `R`. + +### `concept async_nothrow_invocable_to` — `:concepts_invocable` + +Subsumes both `async_nothrow_invocable` and `async_invocable_to`. + +--- + +## Coroutine types + +### `struct env` — `:task` + +The Y-combinator environment. Passed as the first argument to every async function, allowing recursive self-calls. Users declare it but never construct it directly. + +### `class task` — `:task` + +The return type of all async functions. 
`T` must satisfy `returnable`. Users never store or manipulate instances — the type exists solely to identify libfork coroutines. + +--- + +## Handles + +### `struct unsafe_steal_handle` — `:handles` + +Untyped steal handle. Used by `deque_policy` implementations to store handles without knowing the full context type. + +### `struct unsafe_sched_handle` — `:handles` + +Untyped schedule handle. Used when type-erasing across context types. + +### `struct steal_handle` — `:handles` + +Typed handle to a task suspended at a fork point; passed to `context.push()` and returned by `context.pop()` / `context.steal()`. + +### `struct sched_handle` — `:handles` + +Typed handle to a task ready to be started or resumed; passed to `scheduler::post()` and `execute()`. + +--- + +## Scope and task operations + +### `constexpr auto scope() -> scope_type` — `:ops` + +Primary entry point for fork/call/join. `co_await` this to obtain a `scope_ops` which provides: + +- `.fork(ret, fn, args...)` / `.fork(fn, args...)` / `.fork_drop(fn, args...)` — spawn concurrent child +- `.call(ret, fn, args...)` / `.call(fn, args...)` / `.call_drop(fn, args...)` — inline child call +- `.join()` — wait for all outstanding children + +### `constexpr auto child_scope() -> child_scope_type` — `:ops` + +Entry point for a cancellable scope. `co_await` this to obtain a `child_scope_ops`, which extends `scope_ops` and also inherits from `stop_source`. Tasks forked/called through this scope receive the scope's stop token. + +--- + +## Cancellation + +### `class stop_source` — `:stop` + +A non-copyable, non-movable stop source. Methods: `token()`, `stop_possible()`, `stop_requested()`, `request_stop()`, `race_request_stop()`. + +### `class stop_source::stop_token` — `:stop` + +Lightweight copyable token. Methods: `stop_possible()`, `stop_requested()`. + +--- + +## Scheduling + +### `class recv_state` — `:receiver` + +Pre-allocated shared state for a root task. 
Constructors mirror `make_shared` / `allocate_shared`: + +```cpp +recv_state s; // default-init +recv_state s{42}; // in-place init +recv_state s{std::allocator_arg, alloc}; // custom allocator +recv_state s{std::allocator_arg, alloc, 42}; // custom allocator + in-place init +recv_state s; // cancellable variant +``` + +Move-only. Pass to `schedule()` to get back a `receiver`. + +### `class receiver` — `:receiver` + +Handle to the result of a scheduled root task. Methods: + +- `.valid()` — whether the receiver is connected to state +- `.ready()` — whether the task has completed +- `.wait()` — block until complete (may be called multiple times) +- `.stop_source()` — access the stop source (only when `Stoppable = true`) +- `.get()` — consume the result, rethrowing any stored exception; throws `operation_cancelled_error` if cancelled + +### `auto schedule(Sch&&, recv_state, Fn&&, Args&&...) -> receiver` — `:schedule` + +Schedule an async function as a root task using a pre-allocated `recv_state`. + +### `auto schedule(Sch&&, Fn&&, Args&&...) -> receiver` — `:schedule` + +Convenience overload: default-constructs a non-cancellable `recv_state`. + +### `void execute(Context&, sched_handle)` — `:execute` + +Bind the calling thread to `context` and resume the scheduled task. Used by scheduler implementations. + +### `void execute(Context&, steal_handle)` — `:execute` + +Bind the calling thread to `context` and resume a stolen task. Used by scheduler implementations. + +--- + +## Polymorphic context base classes + +### `class base_context` — `:poly_context` + +CRTP base providing `stack()` -> `Stack&`. Inherit from this (or `poly_context`) when implementing a custom context. + +### `class poly_context` — `:poly_context` + +Abstract base for polymorphic contexts. Provides pure-virtual `push(steal_handle)`, `pop()`, and a defaulting `post(sched_handle)` that throws `post_error`. 
+ +--- + +## Exception hierarchy + +All exceptions derive from `lf::libfork_exception : std::exception`. + +| Type | Thrown by | Condition | +| --------------------------- | ---------------------- | ---------------------------------------- | +| `libfork_exception` | — | Base type; catch-all for libfork errors | +| `schedule_error` | `schedule()` | Called from a worker thread | +| `execute_error` | `execute()` | Called from a worker thread | +| `steal_overflow_error` | `execute()` | A single task stolen > 65,535 times | +| `root_alloc_error` | `schedule()` | Root frame too large for inline buffer | +| `broken_receiver_error` | `receiver` methods | Receiver is in an invalid state | +| `operation_cancelled_error` | `receiver::get()` | Task was cancelled via stop token | +| `post_error` | `poly_context::post()` | Derived context does not override `post` | +| `deque_full_error` | `deque::push()` | Deque has reached maximum capacity | + +--- + +## Batteries: stacks + +All stacks satisfy `worker_stack`. Template parameter is an allocator for `std::byte`. + +### `class geometric_stack` — `:geometric_stack` + +Segmented stack with geometric growth and segment caching. Recommended default. + +### `class adaptor_stack` — `:adaptor_stack` + +Thin allocator-backed stack; allocates/deallocates on every push/pop. + +### `class slab_stack` — `:slab_stack` + +Fixed-capacity slab stack; throws on overflow. + +--- + +## Batteries: deque and adaptors + +### `class deque` — `:deque` + +Lock-free Chase-Lev work-stealing deque. `T` must be `lock_free` and `default_initializable`. Methods: `push(T)`, `pop() -> std::optional`, `get_thief() -> thief_handle`. + +### `class deque::thief_handle` — `:deque` + +Non-owning steal handle obtained via `deque::get_thief()`. Method: `steal(Fn on_empty) -> std::optional`. + +### `enum class err` — `:deque` + +Return code from low-level steal operations: `none`, `lost`, `empty`. 
+ +### `struct steal_t` — `:deque` + +Steal result wrapper returned by `thief_handle::steal`. Has `err code` and `T val` fields; `operator bool` tests `code == err::none`. + +### `class adapt_vector` — `:adaptors` + +`std::vector`-backed LIFO deque policy. Satisfies `deque_policy`. + +### `class adapt_deque` — `:adaptors` + +Lock-free deque-backed policy. Satisfies both `deque_policy` and `stealable_deque_policy`. + +--- + +## Batteries: context policies and contexts + +### `concept deque_policy` — `:contexts` + +A type that is a LIFO stack over `unsafe_steal_handle` (has `push` and `pop`). + +### `concept stealable_deque_policy` — `:contexts` + +Extends `deque_policy` with a `steal() -> unsafe_steal_handle` method for FIFO work stealing. + +### `class mono_context` — `:contexts` + +Monomorphic worker context. Composes a `worker_stack` and a `deque_policy`. Satisfies `worker_context`. Exposes `steal()` when `Deque` satisfies `stealable_deque_policy`. + +### `class derived_poly_context` — `:contexts` + +Polymorphic worker context. Derives from `poly_context` and implements `push`/`pop` via `Deque`. Exposes `steal()` when `Deque` satisfies `stealable_deque_policy`. The `context_type` alias is `poly_context`. + +--- + +## Schedulers + +### `concept derived_worker_context` — `:inline_scheduler` + +`Context` has a `context_type` typedef and is derived from it (i.e., it is a concrete subclass of its own context type). + +### `class inline_scheduler` — `:inline_scheduler` + +Single-threaded synchronous scheduler. Stores one `Context` instance; `post()` calls `execute()` directly on the calling thread. + +### `enum class pool_kind` — `:basic_busy_pool` + +`mono` — uses `mono_context`; `poly` — uses `derived_poly_context`. + +### `class basic_busy_pool` — `:basic_busy_pool` + +Work-stealing thread pool using busy-wait. Spawns `N` worker threads (default: `std::thread::hardware_concurrency()`). Constructor: `basic_busy_pool(n_threads)`. 
+ +### `using mono_busy_pool` — `:basic_busy_pool` + +Alias for `basic_busy_pool`. + +### `using poly_busy_pool` — `:basic_busy_pool` + +Alias for `basic_busy_pool`. diff --git a/docs/structure.md b/docs/structure.md new file mode 100644 index 000000000..cb1e5203d --- /dev/null +++ b/docs/structure.md @@ -0,0 +1,24 @@ +# Structure of libfork + +Libfork is organized into several modules: + +- `libfork`: Meta module that re-exports all public modules + - tuple + - etc +- `libfork.utils`: Independent internal utilities, not part of the public API +- `libfork.core`: Core functionality of libfork including: + - Task template + - Task handles + - Concepts for context/stack/scheduler + - Fork/call primitives + - Execute primitives (for starting work) + - Schedule primitive (for launching work) + - Polymorphic context ABC + - \[internal\] Promise/frame + - \[internal\] Thread locals +- `libfork.batteries`: Collection of context, stack and other types + - The `::stacks` namespace + - Contexts + - adaptors +- `libfork.schedulers`: Collection of schedulers + - Inline scheduler diff --git a/docs/tour.md b/docs/tour.md index 4c1800dd3..45b7b1fe5 100644 --- a/docs/tour.md +++ b/docs/tour.md @@ -25,7 +25,7 @@ Definitions: - __Parent:__ A task that spawns other tasks. - __Child:__ A task that is spawned by another task. -The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. The best way to learn is by example, lets start with the canonical introduction to fork-join, the recursive Fibonacci function, in regular C++ it looks like this: +The tasking/fork-join interface is designed to mirror [Cilk](https://en.wikipedia.org/wiki/Cilk) and other fork-join frameworks. 
The best way to learn is by example, let's start with the canonical introduction to fork-join, the recursive Fibonacci function, in regular C++ it looks like this: ```cpp auto fib(int n) -> int { diff --git a/include/libfork/__impl/assume.hpp b/include/libfork/__impl/assume.hpp new file mode 100644 index 000000000..b40cd4fab --- /dev/null +++ b/include/libfork/__impl/assume.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include "libfork/__impl/exception.hpp" + +/** + * @file assume.hpp + * + * @brief A collection of internal macros. + * + * These macros are not safe to use unless `import std` is in scope. + */ + +/** + * @brief If expr evaluates to `false`, terminates the program with an error message. + * + * This macro is always active, regardless of optimization settings or `NDEBUG`. + */ +#define LF_ENSURE(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + LF_TERMINATE("Assumption '" #__VA_ARGS__ "' failed!"); \ + } \ + } while (false) + +/** + * @brief Invokes undefined behavior if ``expr`` evaluates to `false`. + * + * \rst + * + * .. warning:: + * + * This has different semantics than ``[[assume(expr)]]`` as it WILL evaluate the + * expression at runtime. Hence you should conservatively only use this macro + * if ``expr`` is side-effect free and cheap to evaluate. + * + * \endrst + */ +#ifdef NDEBUG + #define LF_ASSUME(...) \ + do { \ + if (!(__VA_ARGS__)) { \ + ::std::unreachable(); \ + } \ + } while (false) +#else + #define LF_ASSUME(...) 
LF_ENSURE(__VA_ARGS__) +#endif + +#ifdef NDEBUG + #define LF_UNREACHABLE() \ + do { \ + ::std::unreachable(); \ + } while (false) +#else + #define LF_UNREACHABLE() LF_TERMINATE("This code should be unreachable!"); +#endif diff --git a/include/libfork/__impl/compiler.hpp b/include/libfork/__impl/compiler.hpp new file mode 100644 index 000000000..8e71083fc --- /dev/null +++ b/include/libfork/__impl/compiler.hpp @@ -0,0 +1,57 @@ +#pragma once + +#include "libfork/__impl/exception.hpp" + +/** + * @file compiler.hpp + * + * @brief A collection of internal macros. + * + * These macros are standalone i.e. they can be used without importing/including anything else. + */ + +// =============== Inlining/optimization =============== // + +/** + * @brief Macro to use next to 'inline' to force a function to be inlined. + * + * \rst + * + * .. note:: + * + * This does not imply the c++'s `inline` keyword which also has an effect on linkage. + * + * \endrst + */ +#if !defined(LF_FORCE_INLINE) + #if defined(_MSC_VER) && !defined(__clang__) + #define LF_FORCE_INLINE __forceinline + #elif defined(__GNUC__) && __GNUC__ > 3 + // Clang also defines __GNUC__ (as 4) + #define LF_FORCE_INLINE __attribute__((__always_inline__)) + #else + #define LF_FORCE_INLINE + #endif +#endif + +/** + * @brief Macro to prevent a function to be inlined. 
+ */ +#if !defined(LF_NO_INLINE) + #if defined(_MSC_VER) && !defined(__clang__) + #define LF_NO_INLINE __declspec(noinline) + #elif defined(__GNUC__) && __GNUC__ > 3 + // Clang also defines __GNUC__ (as 4) + #if defined(__CUDACC__) + // nvcc doesn't always parse __noinline__, see: https://svn.boost.org/trac/boost/ticket/9392 + #define LF_NO_INLINE __attribute__((noinline)) + #elif defined(__HIP__) + // See https://github.com/boostorg/config/issues/392 + #define LF_NO_INLINE __attribute__((noinline)) + #else + #define LF_NO_INLINE __attribute__((__noinline__)) + #endif + #else + #define LF_NO_INLINE + #endif +#endif diff --git a/include/libfork/__impl/exception.hpp b/include/libfork/__impl/exception.hpp new file mode 100644 index 000000000..9868d7e75 --- /dev/null +++ b/include/libfork/__impl/exception.hpp @@ -0,0 +1,78 @@ +#pragma once + +/** + * @file exception.hpp + * + * @brief A collection of internal macros for exception handling. + * + * These macros are standalone i.e. they can be used without importing/including anything else. + */ + +/** + * @brief Detects if the compiler has exceptions enabled. + * + * Overridable by defining `LF_COMPILER_EXCEPTIONS` globally. + */ +#ifndef LF_COMPILER_EXCEPTIONS + #if defined(__cpp_exceptions) || (defined(_MSC_VER) && defined(_CPPUNWIND)) || defined(__EXCEPTIONS) + #define LF_COMPILER_EXCEPTIONS 1 + #else + #define LF_COMPILER_EXCEPTIONS 0 + #endif +#endif + +namespace lf::impl { + +/** + * @brief Calls `std::terminate` after printing `msg`. + */ +[[noreturn]] +void terminate_with(char const *message, char const *file, int line) noexcept; + +} // namespace lf::impl + +#define LF_TERMINATE(message) ::lf::impl::terminate_with((message), __FILE__, __LINE__) + +#if LF_COMPILER_EXCEPTIONS + /** + * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``. + */ + #define LF_TRY try + /** + * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``. 
+ */ + #define LF_CATCH_ALL catch (...) + /** + * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH(...) catch (__VA_ARGS__) + /** + * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_THROW(X) throw X + /** + * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_RETHROW throw +#else + /** + * @brief Expands to ``try`` if exceptions are enabled, otherwise expands to ``if constexpr (true)``. + */ + #define LF_TRY if constexpr (true) + /** + * @brief Expands to ``catch (...)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH_ALL if constexpr (false) + /** + * @brief Expands to ``catch (__VA_ARGS__)`` if exceptions are enabled, otherwise ``if constexpr (false)``. + */ + #define LF_CATCH(...) if constexpr (false) + /** + * @brief Expands to ``throw X`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_THROW(X) LF_TERMINATE("Tried to throw '" #X "' without compiler exceptions") + /** + * @brief Expands to ``throw`` if exceptions are enabled, otherwise terminates the program. + */ + #define LF_RETHROW LF_TERMINATE("Tried to rethrow without compiler exceptions") +#endif diff --git a/include/libfork/__impl/utils.hpp b/include/libfork/__impl/utils.hpp new file mode 100644 index 000000000..2adbf49c9 --- /dev/null +++ b/include/libfork/__impl/utils.hpp @@ -0,0 +1,32 @@ +#pragma once + +/** + * @file utils.hpp + * + * @brief A collection of internal utility macros. + * + * These macros are not safe to use unless `import std` is in scope. + */ + +// =============== Utility =============== // + +// clang-format off + +/** + * @brief Use like `BOOST_HOF_RETURNS` to define a function/lambda with all the noexcept/decltype specifiers. + * + * This macro is not truly variadic but the ``...`` allows commas in the macro argument. 
+ */ +#define LF_HOF(...) noexcept(noexcept(__VA_ARGS__)) -> decltype(__VA_ARGS__) { return __VA_ARGS__;} + +// clang-format on + +/** + * @brief Use like `std::forward` to perfectly forward an expression. + */ +#define LF_FWD(...) ::std::forward(__VA_ARGS__) + +/** + * @brief Use to define a `T` that is aligned to the required alignment of `std::atomic_ref`. + */ +#define ATOMIC_ALIGN(T) alignas(std::atomic_ref::required_alignment) T diff --git a/include/libfork/version.hpp b/include/libfork/version.hpp index d048ad622..03f18744e 100644 --- a/include/libfork/version.hpp +++ b/include/libfork/version.hpp @@ -1,3 +1,5 @@ +#pragma once + /** * @brief __[public]__ The major version of libfork. * diff --git a/src/batteries/adaptor_stack.cxx b/src/batteries/adaptor_stack.cxx new file mode 100644 index 000000000..2c333b7b7 --- /dev/null +++ b/src/batteries/adaptor_stack.cxx @@ -0,0 +1,117 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:adaptor_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief An adaptor_stack wraps a standard allocator to satisfy the worker_stack concept. + * + * Every push/pop directly allocates/deallocates through the allocator. This is the simplest + * possible stack implementation — no caching, no geometric growth — just a thin wrapper. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *`. 
+ */ +export template Allocator = std::allocator> +class adaptor_stack { + + struct alignas(k_new_align) aligned {}; + + static_assert(sizeof(aligned) == k_new_align); + + using align_trait = std::allocator_traits::template rebind_traits; + using align_alloc = align_trait::allocator_type; + + using alloc_ptr = align_trait::pointer; + using void_ptr = align_trait::void_pointer; + + using size_type = align_trait::size_type; + + struct release_t { + explicit constexpr release_t(key_t /*unused*/) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend adaptor_stack; + explicit constexpr checkpoint_t(align_alloc const &alloc) noexcept + : m_alloc(alloc) {} + + struct empty { + constexpr empty() noexcept = default; + constexpr auto operator==(empty const &) const noexcept -> bool = default; + explicit constexpr empty(align_alloc const & /*unused*/) noexcept {} + }; + + std::conditional_t m_alloc; + }; + + public: + constexpr adaptor_stack() noexcept(noexcept(Allocator())) + : adaptor_stack(Allocator()) {} + explicit constexpr adaptor_stack(Allocator const &alloc) noexcept + : m_alloc(alloc) {} + + // TODO: drop constexpr for =default and =delete across the lib + + constexpr adaptor_stack(adaptor_stack const &) = delete; + constexpr adaptor_stack(adaptor_stack &&) = delete; + + constexpr auto operator=(adaptor_stack const &) -> adaptor_stack & = delete; + constexpr auto operator=(adaptor_stack &&) -> adaptor_stack & = delete; + + /** + * @brief Get a checkpoint of the stack. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_alloc}; + } + + /** + * @brief Allocate size bytes and return a pointer to the allocation. 
+ */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + LF_ASSUME(size > 0); + size_type num_aligned = safe_cast(((size - 1) / k_new_align) + 1); + return static_cast(align_trait::allocate(m_alloc, num_aligned)); + } + + /** + * @brief Deallocate the allocation at ptr of size n. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t size) noexcept { + LF_ASSUME(size > 0); + size_type num_aligned = safe_cast(((size - 1) / k_new_align) + 1); + align_trait::deallocate(m_alloc, static_cast(ptr), num_aligned); + } + + [[nodiscard]] + constexpr auto prepare_release() const noexcept -> release_t { + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t key) noexcept {} + + constexpr void acquire(checkpoint_t const &ckpt) noexcept { + if constexpr (!align_trait::is_always_equal::value) { + m_alloc = ckpt.m_alloc; + } + } + + private: + align_alloc m_alloc; +}; + +} // namespace lf diff --git a/src/batteries/adaptors.cxx b/src/batteries/adaptors.cxx new file mode 100644 index 000000000..243fe043a --- /dev/null +++ b/src/batteries/adaptors.cxx @@ -0,0 +1,71 @@ +export module libfork.batteries:adaptors; + +import std; + +import libfork.core; +import libfork.utils; + +import :deque; + +namespace lf { + +export template Allocator = std::allocator> +class adapt_vector { + public: + constexpr adapt_vector() noexcept(noexcept(Allocator())) + : adapt_vector(Allocator()) {} + + explicit constexpr adapt_vector(Allocator const &alloc) noexcept + : m_vector(alloc) {} + + constexpr void push(unsafe_steal_handle value) { m_vector.push_back(value); } + + constexpr auto pop() noexcept -> unsafe_steal_handle { + if (!m_vector.empty()) { + unsafe_steal_handle value = m_vector.back(); + m_vector.pop_back(); + return value; + } + return {}; + } + + private: + std::vector m_vector; +}; + +export template > Allocator = + std::allocator>> +class adapt_deque { + public: + using size_type = deque::size_type; + + private: + static 
constexpr size_type k_default_capacity = 1024 * 32; + + public: + constexpr adapt_deque() + : adapt_deque(k_default_capacity, Allocator()) {} + + explicit constexpr adapt_deque(size_type capacity, Allocator const &alloc = Allocator()) + : m_deque{capacity, alloc} {} + + constexpr void push(unsafe_steal_handle value) { m_deque.push(value); } + + constexpr auto pop() noexcept -> unsafe_steal_handle { + return m_deque.pop([] static noexcept -> unsafe_steal_handle { + return {}; + }); + } + + constexpr auto steal() noexcept -> unsafe_steal_handle { + if (auto [_, result] = m_deque.thief().steal()) { + return result; + } + return {}; + } + + private: + deque m_deque; +}; + +} // namespace lf diff --git a/src/batteries/batteries.cxx b/src/batteries/batteries.cxx new file mode 100644 index 000000000..c58f0e23b --- /dev/null +++ b/src/batteries/batteries.cxx @@ -0,0 +1,8 @@ +export module libfork.batteries; + +export import :deque; +export import :geometric_stack; +export import :adaptor_stack; +export import :slab_stack; +export import :adaptors; +export import :contexts; diff --git a/src/batteries/contexts.cxx b/src/batteries/contexts.cxx new file mode 100644 index 000000000..15319beee --- /dev/null +++ b/src/batteries/contexts.cxx @@ -0,0 +1,115 @@ +module; +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:contexts; + +import std; + +import libfork.core; +import libfork.utils; + +namespace lf { + +// =================== Context Policies =================== // + +/** + * @brief The simplest context policy is just a LIFO stack of type-erased handles. + * + * Context policies (unlike full contexts) are not aware of the full context + * type hence, operate on untyped handles. This is inherently unsafe. To + * prevent UB a policy must not give-out the handles it receives. All + * operations must be managed through either `derived_poly_context` or + * `mono_context`. 
+ */ +export template +concept deque_policy = lifo_stack; + +// TODO: consider the methods/concepts needed for a auto/scheduling worker +// context that has a `post` method. + +/** + * @brief An extension of `deque_policy` that supports FIFO stealing of handles. + */ +export template +concept stealable_deque_policy = deque_policy && requires (T &policy) { + { policy.steal() } -> std::same_as; +}; + +// =================== Contexts =================== // + +template +class context_base : public Base { + public: + constexpr context_base() = default; + + template + requires std::constructible_from && std::constructible_from + constexpr context_base( + std::piecewise_construct_t, + std::tuple stack_args, + std::tuple deque_args) noexcept(std::is_nothrow_constructible_v && + std::is_nothrow_constructible_v) + : context_base(std::move(stack_args), + std::move(deque_args), + std::index_sequence_for{}, + std::index_sequence_for{}) {} + + [[nodiscard]] + constexpr auto steal() noexcept(noexcept(m_container.steal())) -> steal_handle + requires stealable_deque_policy + { + return {key(), get(key(), m_container.steal())}; + } + + protected: + Deque m_container; + + private: + template + constexpr context_base( + std::tuple stack_args, + std::tuple deque_args, + std::index_sequence, + std::index_sequence) noexcept(std::is_nothrow_constructible_v && + std::is_nothrow_constructible_v) + : Base(std::get(std::move(stack_args))...), + m_container(std::get(std::move(deque_args))...) {} +}; + +/** + * @brief A polymorphic worker context composed of a `worker_stack` and a `deque_policy`. 
+ */ +export template +class derived_poly_context : public context_base, Stack, Deque, poly_context> { + using base = context_base, Stack, Deque, poly_context>; + + public: + using context_type = poly_context; + + using base::base; + + constexpr void push(steal_handle handle) final { this->m_container.push(handle); } + + constexpr auto pop() noexcept -> steal_handle final { + return {key(), get(key(), this->m_container.pop())}; + } +}; + +export template +class mono_context : public context_base, Stack, Deque, mono_context> { + using base = context_base, Stack, Deque, mono_context>; + + public: + using context_type = mono_context; + + using base::base; + + constexpr void push(steal_handle handle) noexcept(noexcept(this->m_container.push(handle))) { + this->m_container.push(handle); + } + + constexpr auto pop() noexcept -> steal_handle { + return {key(), get(key(), this->m_container.pop())}; + } +}; + +} // namespace lf diff --git a/src/batteries/deque.cxx b/src/batteries/deque.cxx new file mode 100644 index 000000000..bdc17dcc9 --- /dev/null +++ b/src/batteries/deque.cxx @@ -0,0 +1,462 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:deque; + +import std; + +import libfork.core; +import libfork.utils; + +namespace lf { + +/** + * @brief Test if a type is suitable for use with `lf::deque`. + * + * This requires it to be `lf::lock_free` and `std::default_initializable`. + */ +export template +concept dequeable = lock_free && std::default_initializable; + +/** + * @brief Thrown when a push operation fails because the deque is full. + */ +export struct deque_full_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "push failed because deque is full"; + } +}; + +/** + * @brief A basic wrapper around a c-style array that provides modulo load/stores. + * + * This class is designed for internal use only.
It provides a c-style API that is + * used efficiently by deque for low level atomic operations. + * + * @tparam T The type of the elements in the array. + */ +template > Allocator> +struct atomic_ring_buf { + private: + using traits = std::allocator_traits; + using pointer = traits::pointer; + + public: + using diff_type = traits::difference_type; + using size_type = traits::size_type; + + /** + * @brief Construct a new ring buff object + * + * @param cap The capacity of the buffer, MUST be a power of 2. + * @param alloc The allocator used to allocate the buffer. + */ + constexpr atomic_ring_buf(diff_type cap, Allocator const &alloc) + : m_alloc{alloc}, + m_cap{cap}, + m_mask{cap - 1} { + + LF_ASSUME(cap > 0 && std::has_single_bit(safe_cast(cap))); + + m_buf = traits::allocate(m_alloc, safe_cast(cap)); + + diff_type i = 0; + + LF_TRY { + // Begin the lifetime of each atomic. + for (; i < cap; ++i) { + traits::construct(m_alloc, std::to_address(m_buf + i)); + } + } LF_CATCH_ALL { + clean_up(i); + LF_RETHROW; + } + } + + atomic_ring_buf(atomic_ring_buf const &) = delete; + atomic_ring_buf(atomic_ring_buf &&) = delete; + auto operator=(atomic_ring_buf const &) -> atomic_ring_buf & = delete; + auto operator=(atomic_ring_buf &&) -> atomic_ring_buf & = delete; + + constexpr ~atomic_ring_buf() noexcept { clean_up(m_cap); } + + /** + * @brief Get the capacity of the buffer. + */ + [[nodiscard]] + constexpr auto capacity() const noexcept -> diff_type { + return m_cap; + } + /** + * @brief Store ``val`` at ``index % this->capacity()``. + */ + constexpr auto store(diff_type index, T const &val) noexcept -> void { + LF_ASSUME(index >= 0); + std::to_address(m_buf + (index & m_mask))->store(val, std::memory_order_relaxed); + } + /** + * @brief Load value at ``index % this->capacity()``. 
+ */ + [[nodiscard]] + constexpr auto load(diff_type index) const noexcept -> T { + LF_ASSUME(index >= 0); + return std::to_address(m_buf + (index & m_mask))->load(std::memory_order_relaxed); + } + + private: + /** + * @brief Destroy the first `n` elements and deallocate the buffer. + */ + constexpr void clean_up(diff_type n) noexcept { + + LF_ASSUME(0 <= n && n <= m_cap); + + for (diff_type i = n - 1; i >= 0; --i) { + traits::destroy(m_alloc, std::to_address(m_buf + i)); + } + traits::deallocate(m_alloc, m_buf, safe_cast(m_cap)); + } + + [[no_unique_address]] + Allocator m_alloc; + pointer m_buf{}; + diff_type m_cap; + diff_type m_mask; +}; + +/** + * @brief Error codes for ``deque`` 's ``steal()`` operation. + */ +export enum class err : std::uint8_t { + /** + * @brief The ``steal()`` operation succeeded. + */ + none = 0, + /** + * @brief Lost the ``steal()`` race hence, the ``steal()`` operation failed. + */ + lost, + /** + * @brief The deque is empty and hence, the ``steal()`` operation failed. + */ + empty, +}; + +/** + * @brief The return type of a `lf::deque` `steal()` operation. + * + * This type is suitable for structured bindings. We return a custom type instead of a + * `std::optional` to allow for more information to be returned as to why a steal may fail. + */ +export template +struct steal_t { + /** + * @brief Check if the operation succeeded. + */ + [[nodiscard]] + constexpr explicit operator bool() const noexcept { + return code == err::none; + } + /** + * @brief Get the value like ``std::optional``. + * + * Requires ``code == err::none`` . + */ + [[nodiscard]] + constexpr auto operator*() const noexcept -> T { + LF_ASSUME(code == err::none); + return val; + } + /** + * @brief Get the value ``like std::optional``. + * + * Requires ``code == err::none`` . 
+ */ + [[nodiscard]] + constexpr auto operator->() const noexcept -> T const * { + LF_ASSUME(code == err::none); + return std::addressof(val); + } + /** + * @brief The error code of the ``steal()`` operation. + */ + err code; + /** + * @brief The value stolen from the deque. Only valid if ``code == err::none``. + */ + T val; +}; + +/** + * @brief A functor that returns ``std::nullopt``. + */ +template +struct return_nullopt { + /** + * @brief Returns ``std::nullopt``. + */ + [[nodiscard]] + static constexpr auto operator()() noexcept -> std::optional { + return {}; + } +}; + +/** + * @brief A bounded lock-free single-producer multiple-consumer work-stealing deque. + * + * Implements the "Chase-Lev" deque described in the papers, `"Dynamic Circular Work-Stealing deque" + * `_ and `"Correct and Efficient Work-Stealing for Weak + * Memory Models" `_. + * + * Only the deque owner can perform ``pop()`` and ``push()`` operations where the deque behaves + * like a LIFO stack. Others can (only) ``steal()`` data from the deque; they see a FIFO deque. + * All threads must have finished using the deque before it is destructed. + * + * Also see: + + * - Rust: https://github.com/crossbeam-rs/crossbeam/blob/master/crossbeam-deque/src/deque.rs + * - CDSC: https://dl.acm.org/doi/epdf/10.1145/2544173.2509514 + * + * @tparam T The type of the elements in the deque. + */ +export template > Allocator = std::allocator>> +class deque { + public: + using diff_type = atomic_ring_buf::diff_type; + using size_type = atomic_ring_buf::size_type; + + using value_type = T; + using allocator_type = Allocator; + + deque(deque const &) = delete; + deque(deque &&) = delete; + auto operator=(deque const &) -> deque & = delete; + auto operator=(deque &&) -> deque & = delete; + + /** + * @brief A non-owning handle that can be used to steal items from the deque. + * + * All non-owner interactions with the deque should be made through this handle.
+ */ + class thief_handle { + + friend class deque; + + explicit thief_handle(deque *queue) noexcept + : m_queue{queue} { + LF_ASSUME(queue != nullptr); + } + + public: + /** + * @brief Check if the deque is empty. + */ + [[nodiscard]] + constexpr auto empty(this thief_handle self) noexcept -> bool { + diff_type const top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + return top >= bottom; + } + /** + * @brief Get the number of elements in the deque. + */ + [[nodiscard]] + constexpr auto size(this thief_handle self) noexcept -> size_type { + return safe_cast(self.ssize()); + } + /** + * @brief Get the number of elements in the deque as a signed integer. + */ + [[nodiscard]] + constexpr auto ssize(this thief_handle self) noexcept -> diff_type { + diff_type const top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + return std::max(bottom - top, diff_type{0}); + } + /** + * @brief Get the capacity of the deque. + */ + [[nodiscard]] + constexpr auto capacity(this thief_handle self) noexcept -> diff_type { + return self.m_queue->capacity(); + } + /** + * @brief Steal an item from the deque. + * + * Any threads can try to steal an item from the deque. This operation can + * fail if the deque is empty or if another thread simultaneously stole an + * item from the deque. + */ + constexpr auto steal(this thief_handle self) noexcept -> steal_t { + // + diff_type top = self.m_queue->m_top.load(acquire); + std::atomic_thread_fence(seq_cst); + diff_type const bottom = self.m_queue->m_bottom.load(acquire); + + if (top < bottom) { + // Must load *before* acquiring the slot as slot may be overwritten immediately after + // acquiring. 
This load is NOT required to be atomic even though it may race with an overwrite + // as we only return the value if we win the race below guaranteeing we had no race during our + // read. If we lose the race then 'x' could be corrupt due to read-during-write race but as T + // is trivially destructible this does not matter. + T tmp = self.m_queue->m_buf.load(top); + + static_assert(std::is_trivially_destructible_v, "'atomicable' should guarantee this already"); + + if (!self.m_queue->m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) { + return {.code = err::lost, .val = {}}; + } + return {.code = err::none, .val = tmp}; + } + return {.code = err::empty, .val = {}}; + } + + private: + deque *m_queue; + }; + + /** + * @brief Construct a new empty deque object. + * + * @param cap The capacity of the deque (will be rounded to the next power of two). + * @param alloc Allocator used to allocate the internal buffer. + */ + constexpr explicit deque(size_type cap, Allocator const &alloc = Allocator()) + : m_buf(round_capacity(cap), alloc) {} + + /** + * @brief Check if the deque is empty. + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(seq_cst); + return top >= bottom; + } + /** + * @brief Get the number of elements in the deque. + */ + [[nodiscard]] + constexpr auto size() const noexcept -> size_type { + return safe_cast(ssize()); + } + /** + * @brief Get the number of elements in the deque as a signed integer. + */ + [[nodiscard]] + constexpr auto ssize() const noexcept -> diff_type { + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(seq_cst); + return std::max(bottom - top, diff_type{0}); + } + /** + * @brief Get the capacity of the deque.
+ */ + [[nodiscard]] + constexpr auto capacity() const noexcept -> diff_type { + return m_buf.capacity(); + } + /** + * @brief Get a non-owning `thief_handle` that can be used to steal items from the deque. + */ + constexpr auto thief() noexcept -> thief_handle { return thief_handle{this}; } + + /** + * @brief Push an item into the deque. + * + * Only the owner thread can insert an item into the deque. This will throw + * an exception if the deque is full. This returns the number of elements in + * the deque before the push. + * + * @param val Value to add to the deque. + */ + constexpr auto push(T val) -> diff_type { + + diff_type const bottom = m_bottom.load(relaxed); + diff_type const top = m_top.load(acquire); + diff_type const ssize = bottom - top; + + if (m_buf.capacity() < ssize + 1) { + LF_THROW(deque_full_error{}); + } + + // Construct new object, this does not have to be atomic as no one can steal + // this item until after we store the new value of bottom, ordering is + // maintained by surrounding atomics. + m_buf.store(bottom, val); + + std::atomic_thread_fence(release); + m_bottom.store(bottom + 1, relaxed); + + // This was the size just before the push, upon return the size could be any + // smaller number, down to zero, as stealers could have stolen all the + // tasks. + return ssize; + } + + /** + * @brief Pop an item from the deque. + * + * Only the owner thread can pop out an item from the deque. If the buffer is + * empty calls `when_empty` and returns the result. By default, `when_empty` + * is a no-op that returns a null `std::optional`. + */ + template > + requires std::convertible_to> + constexpr auto + pop(Fn &&when_empty = {}) noexcept(std::is_nothrow_invocable_v) -> std::invoke_result_t { + + diff_type const bottom = m_bottom.load(relaxed) - 1; // + m_bottom.store(bottom, relaxed); // Stealers can no longer steal. 
+ + std::atomic_thread_fence(seq_cst); + + diff_type top = m_top.load(relaxed); + + if (top <= bottom) { + // Non-empty deque + + // This load is not required to be atomic as we are the exclusive writer. + T val = m_buf.load(bottom); + + if (top == bottom) { + // The last item could get stolen, by a stealer that loaded bottom before our write above. + if (!m_top.compare_exchange_strong(top, top + 1, seq_cst, relaxed)) { + // Failed race, thief got the last item. + m_bottom.store(bottom + 1, relaxed); + return std::invoke(std::forward(when_empty)); + } + m_bottom.store(bottom + 1, relaxed); + } + return val; + } + m_bottom.store(bottom + 1, relaxed); + return std::invoke(std::forward(when_empty)); + } + + private: + alignas(k_cache_line) atomic_ring_buf m_buf; + alignas(k_cache_line) std::atomic m_top{0}; + alignas(k_cache_line) std::atomic m_bottom{0}; + + // Convenience aliases. + static constexpr std::memory_order relaxed = std::memory_order_relaxed; + static constexpr std::memory_order consume = std::memory_order_consume; + static constexpr std::memory_order acquire = std::memory_order_acquire; + static constexpr std::memory_order release = std::memory_order_release; + static constexpr std::memory_order seq_cst = std::memory_order_seq_cst; + + /** + * @brief Round `cap` up to the next power of two as a `diff_type`. 
+ */ + static constexpr auto round_capacity(size_type cap) -> diff_type { + constexpr auto max_cap = std::bit_floor(safe_cast(std::numeric_limits::max())); + LF_ASSUME(0 < cap && cap <= max_cap); + return safe_cast(std::bit_ceil(cap)); + } +}; + +} // namespace lf diff --git a/src/batteries/geometric_stack.cxx b/src/batteries/geometric_stack.cxx new file mode 100644 index 000000000..c34648442 --- /dev/null +++ b/src/batteries/geometric_stack.cxx @@ -0,0 +1,449 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:geometric_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief A geometric_stack is a user-space (geometric-growth) segmented program stack. + * + * This protects against hot-splitting by keeping a single cached segment. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *` + */ +export template Allocator = std::allocator> +class geometric_stack { + + struct ctrl; + struct node; + + using ctrl_traits = std::allocator_traits::template rebind_traits; + using node_traits = std::allocator_traits::template rebind_traits; + + using ctrl_ptr = ctrl_traits::pointer; + using node_ptr = node_traits::pointer; + + using void_ptr = node_traits::void_pointer; + + using size_type = node_traits::size_type; + using diff_type = node_traits::difference_type; + + struct release_t { + explicit constexpr release_t(key_t) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend geometric_stack; + explicit constexpr checkpoint_t(ctrl_ptr ptr) noexcept + : m_ctrl(ptr) {} + ctrl_ptr m_ctrl = nullptr; + }; + + public: + constexpr geometric_stack() noexcept(noexcept(Allocator())) + : geometric_stack(Allocator()) {} + explicit constexpr 
geometric_stack(Allocator const &alloc) noexcept + : m_ctrl_alloc(alloc) {} + + constexpr geometric_stack(geometric_stack const &other) = delete; + constexpr geometric_stack(geometric_stack &&other) = delete; + + constexpr auto operator=(geometric_stack const &other) -> geometric_stack & = delete; + constexpr auto operator=(geometric_stack &&other) -> geometric_stack & = delete; + + constexpr ~geometric_stack() noexcept { + LF_ASSUME(empty()); + delete_ctrl(m_ctrl); + } + + /** + * @brief Test if the stack is empty (all pushes have been popped). + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + + if (m_ctrl != nullptr) { + LF_ASSUME(m_ctrl->top != nullptr); + } else { + return true; + } + + if (m_ctrl->top->prev != nullptr) { + return false; + } + + return m_sp == m_lo; + } + + /** + * @brief Get a checkpoint of the stack that can be used to acquire it from another stack allocator. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_ctrl}; + } + + /** + * @brief Allocate size bytes on the stack and return a pointer to the base of the allocation. + */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + // Zero sized pushed are an error + LF_ASSUME(size > 0); + + // Very careful math to avoid superfluous instructions on this (very) hot path. + diff_type push_bytes = safe_cast(round_to_multiple(size)); + + constexpr diff_type node_size = sizeof(node); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + // Optimized to just the subtraction because multiplication cancels the implicit division. + diff_type free_bytes = node_size * (m_hi - m_sp); + + if (push_bytes > free_bytes) [[unlikely]] { + return push_cached(push_bytes); + } + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + + // Compiler should optimize this division away when it fuses it with the + // implicit multiplication in the pointer arithmetic below. 
+ diff_type num_nodes = push_bytes / node_size; + + // node_ptr -> void_ptr + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + /** + * @brief Deallocate the most recent allocation of size bytes at ptr, which + * must be the most recent allocation returned by push and not yet popped. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept { + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + LF_ASSUME(!empty()); + LF_ASSUME(m_sp != nullptr); + LF_ASSUME(ptr != nullptr); + + // Inverse of push: void_ptr -> node_ptr + auto sp = static_cast(ptr); + + if (m_sp == m_lo) [[unlikely]] { + return pop_shuffle(sp); + } + + m_sp = sp; + } + + [[nodiscard]] + constexpr auto prepare_release() noexcept -> release_t { + + // Guard against null release + if (m_ctrl != nullptr) { + m_ctrl->sp_cache = m_sp; + } + + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t) noexcept { + + // Don't delete, will be resumed from a checkpoint. + m_ctrl = nullptr; + + m_lo = nullptr; + m_sp = nullptr; + m_hi = nullptr; + } + + constexpr void acquire(checkpoint_t ckpt) noexcept { + + LF_ASSUME(empty()); + LF_ASSUME(ckpt.m_ctrl != m_ctrl); + + if (ckpt.m_ctrl == nullptr) { + return; + } + + delete_ctrl(m_ctrl); + + m_ctrl = ckpt.m_ctrl; + + if constexpr (!node_traits::is_always_equal::value) { + // Need to propagate allocator + m_ctrl_alloc = typename ctrl_traits::allocator_type{std::as_const(m_ctrl->node_alloc)}; + } + + LF_ASSUME(m_ctrl->top != nullptr); + + load_local(); + } + + private: + // ============== Types ============== // + + enum class from : char { + top, + cache, + none, + }; + + struct alignas(k_new_align) node { + node_ptr prev; // Linked list (past) + diff_type size; // Usable-size of the stacklet + }; + + struct ctrl { + [[no_unique_address]] + typename node_traits::allocator_type node_alloc; + + node_ptr top = nullptr; // Most recent stacklet i.e. the top of the stack. 
+ node_ptr cache = nullptr; // Cached (empty) stacklet for hot-split guarding. + node_ptr sp_cache = nullptr; // Cached stack pointer for this stacklet. + }; + + // ============== Members ============== // + + [[no_unique_address]] + typename ctrl_traits::allocator_type m_ctrl_alloc; + + ctrl_ptr m_ctrl = nullptr; // The control block for the stack. + + node_ptr m_lo = nullptr; // The base pointer for the current stacklet. + node_ptr m_sp = nullptr; // The stack pointer for the current stacklet. + node_ptr m_hi = nullptr; // The one-past-the-end pointer for the current stacklet. + + // ============== Methods ============== // + + /** + * @brief Make local pointers point to the current stacklet in the control block. + * + * Assumes that the control block and top stacklet are non-nullptr. + */ + template + constexpr auto load_local() noexcept -> void { + + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); + + constexpr diff_type one{1}; + + m_lo = m_ctrl->top + one; + m_hi = m_lo + m_ctrl->top->size; + + if constexpr (StackPtr == from::cache) { + m_sp = m_ctrl->sp_cache; + } else if constexpr (StackPtr == from::top) { + m_sp = m_lo; + } else { + static_assert(StackPtr == from::none); + } + } + + /** + * @brief Allocate and construct a new control block with a single stacklet of size bytes. + */ + [[nodiscard]] + constexpr auto new_ctrl(this geometric_stack &self, diff_type num_nodes) -> ctrl_ptr { + + ctrl_ptr new_ctrl = ctrl_traits::allocate(self.m_ctrl_alloc, 1); + + LF_TRY { + // Propagate ctrl allocator to control blocks node allocator. 
+ ctrl_traits::construct(self.m_ctrl_alloc, std::to_address(new_ctrl), std::as_const(self.m_ctrl_alloc)); + LF_TRY { + new_ctrl->top = new_node(new_ctrl, num_nodes); + } LF_CATCH_ALL { + // Clean up construction + ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(new_ctrl)); + LF_RETHROW; + } + } LF_CATCH_ALL { + // Clean up allocation + ctrl_traits::deallocate(self.m_ctrl_alloc, new_ctrl, 1); + LF_RETHROW; + } + + return new_ctrl; + } + + /** + * @brief Clean and delete the control block and all stacklets. + */ + constexpr void delete_ctrl(this geometric_stack &self, ctrl_ptr ctrl) noexcept { + if (ctrl != nullptr) { + LF_ASSUME(ctrl->top != nullptr); + LF_ASSUME(ctrl->top->prev == nullptr); + + // Clea-up stacklets + delete_node(ctrl, ctrl->top); + delete_node(ctrl, ctrl->cache); + + // Finally delete the control block. + ctrl_traits::destroy(self.m_ctrl_alloc, std::to_address(ctrl)); + ctrl_traits::deallocate(self.m_ctrl_alloc, ctrl, 1); + } + } + + /** + * @brief Allocate node with size bytes for stacklet. + * + * This function is strongly exception-safe. + */ + [[nodiscard]] + static constexpr auto new_node(ctrl_ptr ctrl, diff_type num_nodes) -> node_ptr { + + // Allocation should be a multiple of the node size + LF_ASSUME(num_nodes > 0); + LF_ASSUME(ctrl != nullptr); + + // Allocation/deallocation requires size_type, +1 for the header node + size_type allocate_nodes = 1 + safe_cast(num_nodes); + + node_ptr next_node = node_traits::allocate(ctrl->node_alloc, allocate_nodes); + + LF_TRY { + // Construct the header + node_traits::construct(ctrl->node_alloc, std::to_address(next_node), nullptr, num_nodes); + } LF_CATCH_ALL { + node_traits::deallocate(ctrl->node_alloc, next_node, allocate_nodes); + LF_RETHROW; + } + + return next_node; + } + + /** + * @brief Delete a (possibly null) node and it's associated stacklet. 
+ */ + static constexpr auto delete_node(ctrl_ptr ctrl, node_ptr ptr) noexcept -> void { + if (ptr != nullptr) { + LF_ASSUME(ctrl != nullptr); + // Size doesn't include the header node so we +1 here. + size_type allocated_nodes = 1 + safe_cast(ptr->size); + node_traits::destroy(ctrl->node_alloc, std::to_address(ptr)); + node_traits::deallocate(ctrl->node_alloc, ptr, allocated_nodes); + } + } + + [[nodiscard]] + constexpr auto push_cached(diff_type push_bytes) -> void_ptr { + + // Have to be very careful in this function to be strongly exception-safe! + + constexpr diff_type node_size = sizeof(node); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + diff_type num_nodes = safe_cast(push_bytes / node_size); + + LF_ASSUME(num_nodes > 0); + + if (m_ctrl == nullptr) { + // Initial stacklet wants to be quite large + constexpr diff_type min_nodes = (k_page_size / sizeof(node)) - 1; + + m_ctrl = new_ctrl(std::max(min_nodes, num_nodes)); + + // Local copies of the new top + load_local(); + // Do the allocation. + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + LF_ASSUME(m_ctrl->top != nullptr); + + if (m_ctrl->cache != nullptr && m_ctrl->cache->size >= num_nodes) { + + // We have space in the cache. No allocations on this path, nothing cam throw. + + if (m_sp == m_lo) { + // There is nothing allocated on the current stacklet/top but it doesn't + // have enough space hence, we need to delete top such that we don't end up + // with an empty stacklet in the chain. This would break deletion otherwise. + node_ptr empty_top = m_ctrl->top; + m_ctrl->top = m_ctrl->top->prev; // top could be null now + delete_node(m_ctrl, empty_top); + } + + // Shuffle cache to the top. + m_ctrl->cache->prev = m_ctrl->top; + m_ctrl->top = m_ctrl->cache; + m_ctrl->cache = nullptr; + + // Local copies of the new top + load_local(); + // Do the allocation. 
+ return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + // We need to allocate a new stacklet to fit this allocation, we choose to + // grow geometrically to try to avoid too many allocations. Fine if this + // throws + node_ptr new_top = new_node(m_ctrl, std::max(num_nodes, 2 * m_ctrl->top->size)); + + // Nothing can throw after this point + + // We didn't use the cache because it wasn't big enough, we should delete it + // now because we had to grow the stack. We couldn't do this until now because + // new_node may have thrown. + delete_node(m_ctrl, std::exchange(m_ctrl->cache, nullptr)); + + if (m_sp == m_lo) { + // There is nothing allocated on the current stacklet/top but it doesn't + // have enough space hence, we need to delete top such that we don't end up + // with an empty stacklet in the chain. This would break deletion otherwise. + node_ptr empty_top = m_ctrl->top; + m_ctrl->top = m_ctrl->top->prev; // top could be null now + delete_node(m_ctrl, empty_top); + } + + // Commit the new/node + new_top->prev = m_ctrl->top; + m_ctrl->top = new_top; + + // Local copies of the new top + load_local(); + // Do the allocation. 
+ return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + constexpr void pop_shuffle(node_ptr sp) noexcept { + + // Shuffle top/cache + LF_ASSUME(!empty()); + LF_ASSUME(m_ctrl != nullptr); + LF_ASSUME(m_ctrl->top != nullptr); // Pop from empty stack + LF_ASSUME(m_ctrl->top->prev != nullptr); // ^ + + // Shuffle top to cache + node_ptr old_cache = m_ctrl->cache; + m_ctrl->cache = m_ctrl->top; + delete_node(m_ctrl, old_cache); + + // Go back one stacklet + m_ctrl->top = m_ctrl->top->prev; + + // Local copies of the new top + load_local(); + m_sp = sp; + } +}; + +} // namespace lf diff --git a/src/batteries/slab_stack.cxx b/src/batteries/slab_stack.cxx new file mode 100644 index 000000000..7145b8336 --- /dev/null +++ b/src/batteries/slab_stack.cxx @@ -0,0 +1,236 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.batteries:slab_stack; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief A slab_stack is a user-space stack backed by a single fixed-size slab of memory. + * + * For this to conform to `worker_stack` the allocators void pointer type must be `void *` + */ +export template Allocator = std::allocator> +class slab_stack { + + struct node; + + using node_traits = std::allocator_traits::template rebind_traits; + using node_alloc_t = node_traits::allocator_type; + + using node_ptr = node_traits::pointer; + using void_ptr = node_traits::void_pointer; + + using size_type = node_traits::size_type; + using diff_type = node_traits::difference_type; + + struct alignas(k_new_align) node { + [[no_unique_address]] + node_alloc_t node_alloc; // Propagated to new owners on acquire. + node_ptr sp_cache; // Stack pointer saved across release/acquire. + diff_type size; // Usable node count following this header. 
+ }; + + static constexpr diff_type k_default_nodes = safe_cast(4 * k_page_size / sizeof(node)) - 1; + + static_assert(k_default_nodes > 0); + + struct release_t { + explicit constexpr release_t(key_t) noexcept {} + }; + + class checkpoint_t { + public: + constexpr checkpoint_t() noexcept = default; + constexpr auto operator==(checkpoint_t const &) const noexcept -> bool = default; + + private: + friend slab_stack; + explicit constexpr checkpoint_t(node_ptr ptr) noexcept + : m_ctrl(ptr) {} + node_ptr m_ctrl = nullptr; + }; + + public: + constexpr slab_stack() + : slab_stack(k_default_nodes, Allocator()) {} + + // TODO: what is appropriate unit for initialisation + // TODO: remove default constructor? + + explicit constexpr slab_stack(diff_type num_nodes, Allocator const &alloc = Allocator()) + : m_alloc(alloc) { + init_slab(num_nodes); + } + + constexpr slab_stack(slab_stack const &) = delete; + constexpr slab_stack(slab_stack &&) = delete; + + constexpr auto operator=(slab_stack const &) -> slab_stack & = delete; + constexpr auto operator=(slab_stack &&) -> slab_stack & = delete; + + constexpr ~slab_stack() noexcept { + LF_ASSUME(empty()); + delete_ctrl(m_ctrl); + } + + /** + * @brief Test if the stack is empty (all pushes have been popped). + */ + [[nodiscard]] + constexpr auto empty() const noexcept -> bool { + return m_sp == m_lo; + } + + /** + * @brief Get a checkpoint of the stack for transfer to another stack instance. + */ + [[nodiscard]] + constexpr auto checkpoint() noexcept -> checkpoint_t { + return checkpoint_t{m_ctrl}; + } + + /** + * @brief Allocate size bytes on the stack and return a pointer to the base of the allocation. 
+ */ + [[nodiscard]] + constexpr auto push(std::size_t size) -> void_ptr { + LF_ASSUME(size > 0); + + constexpr diff_type node_size = sizeof(node); + + diff_type push_bytes = safe_cast(round_to_multiple(size)); + + LF_ASSUME(push_bytes >= node_size); + LF_ASSUME(push_bytes % node_size == 0); + + // Optimized to just the subtraction because multiplication cancels the implicit division. + diff_type free_bytes = node_size * (m_hi - m_sp); + + if (push_bytes > free_bytes) [[unlikely]] { + LF_THROW(std::bad_alloc{}); + } + + diff_type num_nodes = push_bytes / node_size; + + // node_ptr -> void_ptr + return static_cast(std::exchange(m_sp, m_sp + num_nodes)); + } + + /** + * @brief Deallocate the most recent allocation of n bytes at ptr. + */ + constexpr void pop(void_ptr ptr, [[maybe_unused]] std::size_t n) noexcept { + LF_ASSUME(!empty()); + LF_ASSUME(m_sp != nullptr); + LF_ASSUME(ptr != nullptr); + + // Inverse of push: void_ptr -> node_ptr + m_sp = static_cast(ptr); + } + + /** + * @brief Make ready for a call to release(). + */ + [[nodiscard]] + constexpr auto prepare_release() noexcept -> release_t { + // Guard against null ctrl (failed prior allocation in release()). + if (m_ctrl != nullptr) { + m_ctrl->sp_cache = m_sp; + } + return release_t{key()}; + } + + constexpr void release([[maybe_unused]] release_t) noexcept { + + diff_type next_size = (m_ctrl != nullptr) ? m_ctrl->size : k_default_nodes; + + // Hand off the current slab to whoever holds the checkpoint; clear local state. + m_ctrl = nullptr; + m_lo = nullptr; + m_sp = nullptr; + m_hi = nullptr; + + // Pre-allocate a fresh slab for our next use. + + LF_TRY { + init_slab(next_size); + } LF_CATCH_ALL { + // If ^ throws, swallow the exception — push will see no space + // i.e. (m_hi - m_sp == 0) and throw instead. 
+ } + } + + constexpr void acquire(checkpoint_t ckpt) noexcept { + + LF_ASSUME(empty()); + LF_ASSUME(ckpt.m_ctrl != m_ctrl); + + if (ckpt.m_ctrl == nullptr) { + return; + } + + // Discard the fresh empty slab we prepared during release() (may be null on alloc failure). + delete_ctrl(m_ctrl); + + m_ctrl = ckpt.m_ctrl; + + if constexpr (!node_traits::is_always_equal::value) { + m_alloc = node_alloc_t{std::as_const(m_ctrl->node_alloc)}; + } + + load_local(); + } + + private: + [[no_unique_address]] + node_alloc_t m_alloc; + + node_ptr m_ctrl = nullptr; // Header node (fused ctrl+first-node of the slab). + node_ptr m_lo = nullptr; // Base of usable space (m_ctrl + 1). + node_ptr m_sp = nullptr; // Stack pointer for the current slab. + node_ptr m_hi = nullptr; // One-past-the-end of usable space. + + // Restore local pointers from the header node, taking sp from the cache. + constexpr void load_local() noexcept { + LF_ASSUME(m_ctrl != nullptr); + m_lo = m_ctrl + 1; + m_hi = m_lo + m_ctrl->size; + m_sp = m_ctrl->sp_cache; + } + + // Allocate and construct a fresh slab with num_nodes usable nodes. + constexpr void init_slab(diff_type num_nodes) { + LF_ASSUME(num_nodes > 0); + + size_type total = safe_cast(1 + num_nodes); + m_ctrl = node_traits::allocate(m_alloc, total); + + LF_TRY { + node_traits::construct(m_alloc, std::to_address(m_ctrl), m_alloc, nullptr, num_nodes); + } LF_CATCH_ALL { + node_traits::deallocate(m_alloc, m_ctrl, total); + m_ctrl = nullptr; + LF_RETHROW; + } + + m_lo = m_sp = m_ctrl + 1; + m_hi = m_lo + num_nodes; + } + + // Destroy and deallocate a slab (no-op if null). 
+ constexpr void delete_ctrl(node_ptr ctrl) noexcept { + if (ctrl != nullptr) { + size_type total = safe_cast(1 + ctrl->size); + node_traits::destroy(m_alloc, std::to_address(ctrl)); + node_traits::deallocate(m_alloc, ctrl, total); + } + } +}; + +} // namespace lf diff --git a/src/core/awaitables.cxx b/src/core/awaitables.cxx new file mode 100644 index 000000000..83cf411bf --- /dev/null +++ b/src/core/awaitables.cxx @@ -0,0 +1,239 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:awaitables; + +import std; + +import libfork.utils; + +import :concepts_context; +import :frame; +import :handles; +import :task; +import :thread_locals; +import :final_suspend; + +namespace lf { + +// =============== Fork/Call =============== // + +/** + * @brief Call inside a catch block, stash current exception in `frame`. + */ +template +constexpr void stash_current_exception(frame_type *frame) noexcept { + // No synchronization is done via exception_bit, hence we can use relaxed atomics + // and rely on the usual fork/join synchronization to ensure memory ordering. + if (frame->atomic_except().exchange(1, std::memory_order_relaxed) == 0) { + + frame->except.construct(std::current_exception()); + + // Should have been called from inside a catch block + LF_ASSUME(*frame->except != nullptr); + } +} + +/** + * @brief In a separate function to allow it to be placed in cold block. + */ +template +constexpr void +destroy_child_stash_exception(frame_t *child, coro> parent) noexcept { + // Clean-up the child that will never be resumed. + child->handle().destroy(); + // Stash in the parent's frame which will then be resumed. + stash_current_exception(&parent.promise().frame); +} + +/** + * @brief Awaitable for forking/calling an async function. 
+ */ +template +struct async_awaitable : std::suspend_always { + + static_assert(Cat == category::call || Cat == category::fork, "Invalid category for awaitable"); + + frame_t *child; + + template + constexpr auto + await_suspend(this async_awaitable self, coro> parent) noexcept -> coro<> { + + // TODO: test of having a dedicated is_stopped awaitable is quicker + + if (!self.child) [[unlikely]] { + // Noop if an exception was thrown. + return parent; + } + + if (self.child->stop_requested()) [[unlikely]] { + // Noop if stopped, must clean-up the child that will never be resumed. + return self.child->handle().destroy(), parent; + } + + // Propagate parent->child relationships + self.child->parent = &parent.promise().frame; + + if constexpr (Cat == category::call) { + // Should be the default + LF_ASSUME(self.child->kind == category::call); + } else { + self.child->kind = Cat; + } + + if constexpr (Cat == category::fork) { + // It is critical to pass self by-value here, after the call to push() + // the object `*this` may be destroyed, if passing by ref it would be + // use-after-free to then access self in the following line to fetch the + // handle. + LF_TRY { + get_tls_context().push(steal_handle{key(), &parent.promise().frame}); + } LF_CATCH_ALL { + return destroy_child_stash_exception(self.child, parent), parent; + } + } + + return self.child->handle(); + } +}; + +// =============== Join =============== // + +template +struct join_awaitable { + + frame_t *frame; + + constexpr auto await_ready(this join_awaitable self) noexcept -> bool { + if (not_null(self.frame)->steals == 0) [[likely]] { + if (self.frame->stop_requested()) [[unlikely]] { + // Must unconditionally suspended if stopped + return false; + } + // If no steals then we are the only owner of the parent and we are + // ready to join. Therefore, no need to reset the control block. 
+ return true; + } + return false; + } + + constexpr auto await_suspend(this join_awaitable self, std::coroutine_handle<> task) noexcept -> coro<> { + // Currently self.joins = k_u16_max - num_joined + // + // We set joins = self->joins - (k_u16_max - num_steals) + // = num_steals - num_joined + // + // Hence joined = k_u16_max - num_joined + // k_u16_max - joined = num_joined + + // Lemma: + // + // If a thread is at a join and steals have occurred then the + // thread can never own the stack of the current frame. + // + // This is because threads follow the work-first principle, so for the + // owner to be running this task it would have to have re-stolen it from a + // thief. Which implies it would have run the final suspend of the child + // that had it's continuation stolen, where it would have had to release + // the stack, because the parent was at not at the join. + + LF_ASSUME(self.frame); + + std::uint32_t steals = self.frame->steals; + std::uint32_t offset = k_u16_max - steals; + std::uint32_t joined = self.frame->atomic_joins().fetch_sub(offset, std::memory_order_release); + + // If this was a stop: + // + // steals = 0, joins = k_u16_max then: + // + // steals = 0 + // offset = k_u16_max + // joined = k_u16_max, (self.frame->joins is now 0) + // + // k_u16_max - joined = 0 = steals, hence win the if + + if (steals == k_u16_max - joined) { + // We set joins after all children had completed therefore we can resume the task. + // Need to acquire to ensure we see all writes by other threads to the result. + std::atomic_thread_fence(std::memory_order_acquire); + + if (self.frame->stop_requested()) [[unlikely]] { + return self.handle_stop(); + } + + // We must reset the control block and take the stack. We should never + // own the stack at this point because we must have stolen the stack. + self.take_stack(); + self.frame->reset_counters(); + return task; + } + // Someone else is responsible for running this task. 
+ + // We cannot touch *this or dereference self as someone may have resumed already! + // We cannot currently own this stack (checking would violate above). + + // If no explicit scheduling then we must have an empty WSQ as we stole this task. + + // If explicit scheduling then we may have tasks on our WSQ if we performed a self-steal + // in a switch awaitable. In this case we can/must do another self-steal. + + // return try_self_stealing(); + + return std::noop_coroutine(); + } + + constexpr void await_resume(this join_awaitable self) { + // We should have been reset + LF_ASSUME(self.frame->steals == 0); + LF_ASSUME(self.frame->joins == k_u16_max); + + // Outside parallel regions so can touch non-atomically. + // + // A task that completes by responding to cancellation will drop any + // exceptions however, a task may still throw exceptions even if cancelled. + // Here we must rethrow even if cancelled because we can't re-suspend at + // this point. + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (self.frame->exception_bit) [[unlikely]] { + self.rethrow_exception(); + } + } + + LF_ASSUME(self.frame->exception_bit == 0); + } + + constexpr auto take_stack(this join_awaitable self) noexcept -> void { + stack_t &stack = get_tls_stack(); + LF_ASSUME(self.frame->stack_ckpt != stack.checkpoint()); + stack.acquire(std::as_const(self.frame->stack_ckpt)); + } + + [[nodiscard]] + constexpr auto handle_stop(this join_awaitable self) noexcept -> coro<> { + // Only need to take the stack if there were steals + if (self.frame->steals > 0) { + self.take_stack(); + } + + // We always need to reset the connters as we modified + self.frame->reset_counters(); + + // Drop any exceptions in the now-stopped task + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (self.frame->exception_bit) [[unlikely]] { + std::ignore = extract_exception(self.frame); + } + } + + return final_suspend_leading(self.frame); + } + + [[noreturn]] + constexpr void rethrow_exception(this join_awaitable self) { + 
std::rethrow_exception(extract_exception(self.frame)); + } +}; + +} // namespace lf diff --git a/src/core/concepts/context.cxx b/src/core/concepts/context.cxx new file mode 100644 index 000000000..359da3ae0 --- /dev/null +++ b/src/core/concepts/context.cxx @@ -0,0 +1,49 @@ +export module libfork.core:concepts_context; + +import std; + +import libfork.utils; + +import :concepts_stack; +import :handles; + +namespace lf { + +template +concept ref_to_worker_stack = std::is_lvalue_reference_v && worker_stack>; + +/** + * @brief Specifies that a type acts as a LIFO stack over U. + */ +export template +concept lifo_stack = plain_object && requires (T context, U val) { + { context.push(val) } -> std::same_as; + { context.pop() } noexcept -> std::same_as; +}; + +/** + * @brief Defines the API for a libfork compatible worker context. + * + * This requires that `T` is an object type and supports the following operations: + * + * - Push/pop a steal handle onto the context in a LIFO manner. + * - Have a `worker_stack` that can be accessed via `stack()`. + */ +export template +concept worker_context = lifo_stack> && requires (T context) { + { context.stack() } noexcept -> ref_to_worker_stack; +}; + +/** + * @brief Fetch the stack type of a worker context `T`. + */ +export template +using stack_t = std::remove_reference_t().stack())>; + +/** + * @brief Fetch the checkpoint type of a worker context `T`. 
+ */ +template +using checkpoint_t = decltype(std::declval &>().checkpoint()); + +} // namespace lf diff --git a/src/core/concepts/invocable.cxx b/src/core/concepts/invocable.cxx new file mode 100644 index 000000000..c6da4b016 --- /dev/null +++ b/src/core/concepts/invocable.cxx @@ -0,0 +1,66 @@ +module; +#include "libfork/__impl/utils.hpp" +export module libfork.core:concepts_invocable; + +import std; + +import libfork.utils; + +import :task; +import :concepts_context; + +namespace lf { + +template +struct ctx_invoke_t { + // Explicitly constrained so overload resolution selects prefers + template + requires std::invocable, Args...> + static constexpr auto operator()(Fn &&fn, Args &&...args) + LF_HOF(std::invoke(std::forward(fn), env{key()}, std::forward(args)...)) + + template + static constexpr auto operator()(Fn &&fn, Args &&...args) + LF_HOF(std::invoke(std::forward(fn), std::forward(args)...)) +}; + +template +concept task_from = specialization_of && std::same_as; + +/** + * @brief Test if a callable `Fn` when invoked with `Args...` returns an `lf::task`. + */ +export template +concept async_invocable = worker_context && // + std::invocable, Fn, Args...> && // + task_from, Fn, Args...>>; // + +/** + * @brief Subsumes `async_invocable` and checks that the invocation is `noexcept`. + */ +export template +concept async_nothrow_invocable = + async_invocable && std::is_nothrow_invocable_v, Fn, Args...>; + +/** + * @brief The result type of invoking an async function `Fn` with `Args...`. + */ +export template + requires async_invocable +using async_result_t = std::invoke_result_t, Fn, Args...>::value_type; + +/** + * @brief Subsumes `async_invocable` and checks the result type is `R`. + */ +export template +concept async_invocable_to = + async_invocable && std::same_as>; + +/** + * @brief Subsumes `async_nothrow_invocable` and `async_invocable_to`. 
+ */ +export template +concept async_nothrow_invocable_to = + async_nothrow_invocable && async_invocable_to; + +} // namespace lf diff --git a/src/core/concepts/scheduler.cxx b/src/core/concepts/scheduler.cxx new file mode 100644 index 000000000..bd081ce6d --- /dev/null +++ b/src/core/concepts/scheduler.cxx @@ -0,0 +1,29 @@ +export module libfork.core:concepts_scheduler; + +import std; + +import :handles; + +namespace lf { + +export template +concept has_context_typedef = requires { typename std::remove_cvref_t::context_type; }; + +export template +using context_t = typename std::remove_cvref_t::context_type; + +/** + * @brief An object capable of scheduling a libfork task for execution. + * + * These are typed to a context, the `post` method must: + * + * - Satisfy the strong exception guarantee. + * - Guarantee eventual execution of the task associated with `handle`. + */ +export template +concept scheduler = + has_context_typedef && requires (Sch &&scheduler, sched_handle> handle) { + { static_cast(scheduler).post(handle) } -> std::same_as; + }; + +} // namespace lf diff --git a/src/core/concepts/stack.cxx b/src/core/concepts/stack.cxx new file mode 100644 index 000000000..a85efab37 --- /dev/null +++ b/src/core/concepts/stack.cxx @@ -0,0 +1,52 @@ +export module libfork.core:concepts_stack; + +import std; + +import libfork.utils; + +namespace lf { + +template + requires std::is_object_v +consteval auto constify(T &&x) noexcept -> std::add_const_t &; + +/** + * @brief Defines the API for a libfork compatible stack. + * + * - After construction `this` is empty and push is valid. + * - Pop is valid provided the FILO order is respected. + * - Push produces pointers aligned to __STDCPP_DEFAULT_NEW_ALIGNMENT__. + * - Destruction is expected to only occur when the stack is empty. + * - Result of `.checkpoint()` is expected to: + * - Be "cheap to copy". + * - Have a null state (default constructed) that only compares equal to itself. 
+ * - Is allowed to return null if push has never been called. + * - Compare equal if and only if they are both null or they allocate from the same stack. + * - Have no preconditions about when it's called. + * - Prepare release puts the stack into a state which another thread can acquire it. + * - Release detaches the current stack and leaves `this` empty. + * - This may be called concurrently with acquire + * - Acquire attaches to the stack that the checkpoint came from: + * - It is only called the stack is empty. + * - It is only called with a checkpoint not equal to the current checkpoint. + * - It is called after prepare release (and no other functions in between) + * + * Fast-path operations: empty, push, pop, checkpoint + * Slow-path operations: release, acquire + */ +export template +concept worker_stack = plain_object && requires (T stack, std::size_t n, void *ptr) { + { stack.push(n) } -> std::same_as; + { stack.pop(ptr, n) } noexcept -> std::same_as; + { stack.checkpoint() } noexcept -> std::regular; + { stack.prepare_release() } noexcept -> std::movable; + { stack.release(stack.prepare_release()) } noexcept -> std::same_as; + { stack.acquire(constify(stack.checkpoint())) } noexcept -> std::same_as; +}; + +// TODO: Allocator aware stack + +// export template +// concept aa_worker_stack = worker_stack && true; + +} // namespace lf diff --git a/src/core/core.cxx b/src/core/core.cxx new file mode 100644 index 000000000..3d983b95c --- /dev/null +++ b/src/core/core.cxx @@ -0,0 +1,26 @@ +export module libfork.core; + +// This module contains the core components of libfork, this includes: +// +// task/promise +// schedule +// concepts +// polymorphic context ABC + +export import :concepts_invocable; +export import :concepts_scheduler; +export import :concepts_context; +export import :concepts_stack; +export import :frame; +export import :task; +export import :thread_locals; +export import :poly_context; +export import :ops; +export import :handles; +export 
import :promise; +export import :schedule; +export import :root; +export import :execute; +export import :receiver; +export import :stop; +export import :exception; diff --git a/src/core/exception.cxx b/src/core/exception.cxx new file mode 100644 index 000000000..102447051 --- /dev/null +++ b/src/core/exception.cxx @@ -0,0 +1,12 @@ +export module libfork.core:exception; + +import std; + +namespace lf { + +/** + * @brief Base class for all libfork exceptions. + */ +export struct libfork_exception : std::exception {}; + +} // namespace lf diff --git a/src/core/execute.cxx b/src/core/execute.cxx new file mode 100644 index 000000000..5eb503b0c --- /dev/null +++ b/src/core/execute.cxx @@ -0,0 +1,78 @@ + +module; +#include "libfork/__impl/assume.hpp" +export module libfork.core:execute; + +import std; + +import :frame; +import :thread_locals; +import :concepts_context; +import :handles; +import :exception; + +namespace lf { + +export struct execute_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "execute called from within a worker thread!"; + } +}; + +/** + * @brief Bind this thread to a context and execute the scheduled tasks on that context/thread. + * + * This should not be called from a thread already bound to a context, once this call returns + * the thread is unbound from the context. 
+ */ +export template +constexpr void execute(Context &context, sched_handle handle) { + + if (thread_local_context != nullptr) { + LF_THROW(execute_error{}); + } + + thread_local_context = std::addressof(context); + + defer _ = [] static noexcept -> void { + thread_local_context = nullptr; + }; + + auto *frame = static_cast> *>(get(key(), handle)); + + frame->handle().resume(); +} + +export struct steal_overflow_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "a single task has been stolen 65,535 times"; + } +}; + +export template +constexpr void execute(Context &context, steal_handle handle) { + + if (thread_local_context != nullptr) { + LF_THROW(execute_error{}); + } + + thread_local_context = std::addressof(context); + + defer _ = [] static noexcept -> void { + thread_local_context = nullptr; + }; + + auto *frame = static_cast> *>(get(key(), handle)); + + // TODO: bench if we should do this in debug only + if (frame->steals == k_u16_max) { + LF_THROW(steal_overflow_error{}); + } + + frame->steals += 1; + frame->handle().resume(); +} + +} // namespace lf diff --git a/src/core/final_suspend.cxx b/src/core/final_suspend.cxx new file mode 100644 index 000000000..db6c91fd3 --- /dev/null +++ b/src/core/final_suspend.cxx @@ -0,0 +1,243 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +export module libfork.core:final_suspend; + +import std; + +import libfork.utils; + +import :concepts_context; +import :frame; +import :handles; +import :thread_locals; + +namespace lf { + +template +using coro = std::coroutine_handle; + +template +using frame_t = frame_type>; + +// =============== Extract exception =============== // + +/** + * @brief Pull an exception out of a frame and clean-up the union/allocation. 
+ */ +template +[[nodiscard]] +constexpr auto extract_exception(frame_type *frame) noexcept -> std::exception_ptr { + + LF_ASSUME(frame->exception_bit); // Should only be called if an exception was thrown. + + // Local copy + std::exception_ptr except = std::move(*frame->except); + + // Should have been set by stash_current_exception + LF_ASSUME(except != nullptr); + + // Clean-up exception state + frame->exception_bit = 0; + frame->except.destroy(); + + return except; // NRVO +} + +// =============== Final =============== // + +/** + * @brief The full final suspend logic. + * + * The final suspend logic is fully expressed in this function in brief: + * + * - Try to resume parent if a call. + * - Try to resume parent if a fork with no stealing. + * - Try to resume a stolen forked task if last to complete. + * + * This function also handles cancellation (of the parent) by iteratively + * climbing up the parent chain. + * + * This function is split and repeated as two separate functions to allow the + * hot-path code to be inlined more easily into the final suspend. + */ +template +[[nodiscard]] +constexpr auto final_suspend_full(Context &context, frame_t *frame) noexcept -> coro<> { + for (;;) { + // Validate final state + LF_ASSUME(frame); + LF_ASSUME(frame->kind != category::root); + LF_ASSUME(frame->steals == 0); + LF_ASSUME(frame->joins == k_u16_max); + LF_ASSUME(frame->exception_bit == 0); + + // Local copies (before we destroy frame) + category const kind = frame->kind; + + frame_t *parent = not_null(frame->parent); + + // Before resuming the next (or exiting) we should clean-up the current frame. + // Can't use frame from this point onwards + frame->handle().destroy(); + + if (kind == category::call) { + return parent->handle(); + } + + // Given we are not a call we must be a fork hence, our + // parent can't be a root as they can only call. 
+ LF_ASSUME(kind == category::fork); + LF_ASSUME(parent->kind != category::root); + + if (steal_handle last_pushed = context.pop()) { + // No-one stole continuation, we are the exclusive owner of parent -> just keep ripping! + LF_ASSUME(last_pushed == steal_handle{key(), parent}); + // This is not a join point so no state (i.e. counters) is guaranteed. + return parent->handle(); + } + + // An owner is a worker who: + // + // - Created the task. + // - OR had the task submitted to them. + // - OR won the task at a join. + // + // An owner of a task owns the stack the task is on. + // + // As the worker who completed the child task this thread owns the stack the child task was on. + // + // Either: + // + // 1. The parent is on the same stack as the child. + // 2. OR the parent is on a different stack to the child. + // + // Case (1) implies: we owned the parent; forked the child task; then the parent was then stolen. + // Case (2) implies: we stole the parent task; then forked the child; then the parent was stolen. + // + // Case (2) implies that our stack is empty. + + // As soon as we do the `fetch_sub` below the parent task is no longer safe + // to access as it may be resumed and then destroyed by another thread. Hence + // we must make copies on-the-stack of any data we may need if we lose the + // join race. + bool const owner = parent->stack_ckpt == context.stack().checkpoint(); + + // As soon as we do the fetch_sub (if we loose) someone may acquire + // the stack so we must prepare it for release now. + auto release_key = context.stack().prepare_release(); + + // Register with parent we have completed this child task. + if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) { + // Parent has reached join and we are the last child task to complete. We + // are the exclusive owner of the parent and therefore, we must continue + // parent. As we won the race, acquire all writes before resuming. 
+ std::atomic_thread_fence(std::memory_order_acquire); + + if (!owner) { + // In case of scenario (2) we must acquire the parent's stack. + context.stack().acquire(std::as_const(parent->stack_ckpt)); + } + + // Must reset parent's control block before resuming parent. + parent->reset_counters(); + + if (parent->stop_requested()) [[unlikely]] { + // Don't resume if stopped + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (parent->exception_bit) [[unlikely]] { + std::ignore = extract_exception(parent); + } + } + frame = parent; + continue; + } + + return parent->handle(); + } + + if (owner) { + // We were unable to resume the parent and we were its owner, as the + // resuming thread will take ownership of the parent's we must give it up. + context.stack().release(std::move(release_key)); + } + + // We did not win the join-race, we cannot dereference the parent pointer now + // as the frame may now be freed by the winner. Parent has not reached join + // or we are not the last child to complete. We are now out of jobs, we must + // yield to the executor. + + // Else, case (2), our stack has no allocations on it, it may be used later. 
+ return std::noop_coroutine(); + } +} + +template +[[nodiscard]] +constexpr auto final_suspend_trailing(Context &context, frame_t *parent) noexcept -> coro<> { + + bool const owner = parent->stack_ckpt == context.stack().checkpoint(); + + auto release_key = context.stack().prepare_release(); + + if (parent->atomic_joins().fetch_sub(1, std::memory_order_release) == 1) { + + std::atomic_thread_fence(std::memory_order_acquire); + + if (!owner) { + context.stack().acquire(std::as_const(parent->stack_ckpt)); + } + + parent->reset_counters(); + + if (parent->stop_requested()) [[unlikely]] { + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (parent->exception_bit) [[unlikely]] { + std::ignore = extract_exception(parent); + } + } + return final_suspend_full(context, parent); + } + + return parent->handle(); + } + + if (owner) { + context.stack().release(std::move(release_key)); + } + + return std::noop_coroutine(); +} + +template +[[nodiscard]] +constexpr auto final_suspend_leading(frame_t *frame) noexcept -> coro<> { + + LF_ASSUME(frame); + LF_ASSUME(frame->steals == 0); + LF_ASSUME(frame->joins == k_u16_max); + LF_ASSUME(frame->exception_bit == 0); + + category const kind = frame->kind; + + frame_t *parent = not_null(frame->parent); + + frame->handle().destroy(); + + if (kind == category::call) { + return parent->handle(); + } + + LF_ASSUME(kind == category::fork); + + Context &context = get_tls_context(); + + if (steal_handle last_pushed = context.pop()) { + LF_ASSUME(last_pushed == steal_handle{key(), parent}); + return parent->handle(); + } + + return final_suspend_trailing(context, parent); +} + +} // namespace lf diff --git a/src/core/frame.cxx b/src/core/frame.cxx new file mode 100644 index 000000000..3a12696e2 --- /dev/null +++ b/src/core/frame.cxx @@ -0,0 +1,80 @@ +module; +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/utils.hpp" +export module libfork.core:frame; + +import std; + +import libfork.utils; + +import :stop; + +namespace lf { + +enum 
class category : std::uint8_t { + call = 0, + fork, + root, +}; + +struct frame_base {}; + +// TODO: make everything (deque etc) allocator aware... + +template +struct frame_type : frame_base { + + // == Member variables == // + + // TODO: add checked accessors for all the things (including except etc) + + // Only set if an exception is thrown, otherwise uninit + uninitialized except; + + frame_type *parent; + stop_source::stop_token stop_token; + + [[no_unique_address]] + Checkpoint stack_ckpt; + + ATOMIC_ALIGN(std::uint32_t) joins = 0; // Atomic is 32 bits for speed + std::uint16_t steals = 0; // In debug do overflow checking + category kind = static_cast(0); // Fork/Call + ATOMIC_ALIGN(std::uint8_t) exception_bit = 0; // Atomically set + + // == Member functions == // + + // Explicitly post construction, this allows the compiler to emit a single + // instruction for the zero init then an instruction for the joins init, + // instead of three instructions. + explicit constexpr frame_type(Checkpoint &&ckpt) noexcept(std::is_nothrow_move_constructible_v) + : stack_ckpt(std::move(ckpt)) { + joins = k_u16_max; + } + + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + // TODO: Should exception trigger stop? 
+ return stop_token.stop_requested(); + } + + [[nodiscard]] + constexpr auto handle() LF_HOF(std::coroutine_handle::from_promise(*this)) + + [[nodiscard]] + constexpr auto atomic_joins() noexcept -> std::atomic_ref { + return std::atomic_ref{joins}; + } + + [[nodiscard]] + constexpr auto atomic_except() noexcept -> std::atomic_ref { + return std::atomic_ref{exception_bit}; + } + + constexpr void reset_counters() noexcept { + joins = k_u16_max; + steals = 0; + } +}; + +} // namespace lf diff --git a/src/core/handles.cxx b/src/core/handles.cxx new file mode 100644 index 000000000..ef646354b --- /dev/null +++ b/src/core/handles.cxx @@ -0,0 +1,68 @@ +export module libfork.core:handles; + +import libfork.utils; + +import :frame; + +namespace lf { + +// =================== Untyped handles =================== // + +class handle { + public: + constexpr handle() = default; + constexpr handle(key_t, frame_base *ptr) noexcept + : m_ptr{ptr} {} + constexpr auto operator==(handle const &) const noexcept -> bool = default; + constexpr explicit operator bool() const noexcept { return m_ptr != nullptr; } + + private: + [[nodiscard]] + constexpr friend auto get(key_t, handle h) noexcept -> frame_base * { + return h.m_ptr; + } + + frame_base *m_ptr = nullptr; +}; + +/** + * @brief An untyped steal-handle. + * + * For use by context policies that need to store handles in an untyped manner. + */ +export struct unsafe_steal_handle : handle { + using handle::handle; +}; + +/** + * @brief An untyped schedule-handle. + * + * For use by context policies that need to store handles in an untyped manner. + */ +export struct unsafe_sched_handle : handle { + using handle::handle; +}; + +// =================== Tagged handles =================== // + +/** + * @brief A handle to a task that can be stolen and resumed with `execute`. + * + * The coroutine behind this task is always suspended at fork point. 
+ */ +export template +struct steal_handle : unsafe_steal_handle { + using unsafe_steal_handle::unsafe_steal_handle; +}; + +/** + * @brief A handle to a task that can be resumed with `execute`. + * + * The coroutine behind this task is either not-yet-started or suspended at a context-switch. + */ +export template +struct sched_handle : unsafe_sched_handle { + using unsafe_sched_handle::unsafe_sched_handle; +}; + +} // namespace lf diff --git a/src/core/ops.cxx b/src/core/ops.cxx new file mode 100644 index 000000000..d88a09724 --- /dev/null +++ b/src/core/ops.cxx @@ -0,0 +1,228 @@ +module; +#include "libfork/__impl/utils.hpp" +export module libfork.core:ops; + +import std; + +import libfork.utils; + +import :concepts_invocable; +import :frame; +import :stop; + +namespace lf { + +// Placeholder types for absent optional fields. +struct no_stop_t {}; +struct no_ret_t {}; + +// =============== Value-or-reference storage policy =============== // + +// For rvalue-reference arguments that are trivially copyable and fit in two +// pointer-sized words, store by value inside pkg instead of keeping a reference. +// This lets [[no_unique_address]] collapse empty functors to zero bytes and +// allows the compiler to treat the stored values as local data (no aliasing). +template +concept small_trivially_copyable = !std::is_reference_v // + && std::is_trivially_copyable_v // + && sizeof(T) <= 2 * sizeof(void *) // + && alignof(T) <= alignof(std::max_align_t); // + +// Only collapses rvalue refs; lvalue refs are kept as-is to preserve reference semantics. 
+template +using store_as_t = + std::conditional_t && small_trivially_copyable>, + std::remove_cvref_t, + T>; + +// clang-format off + +template +struct [[nodiscard("You should immediately co_await this!")]] pkg { + [[no_unique_address]] std::conditional_t stop_token; + [[no_unique_address]] std::conditional_t, no_ret_t, R *> return_addr; + [[no_unique_address]] Fn fn; + [[no_unique_address]] tuple args; +}; + +// clang-format on + +/** + * @brief Forward the function member of a pkg correctly. + * + * Handles three cases: + * - rvalue reference Fn: move it. + * - lvalue reference Fn: return by reference. + * - value type Fn (small trivially-copyable stored directly): return by value. + */ +template +constexpr auto fwd_fn(auto &&fn) noexcept -> Fn { + if constexpr (std::is_rvalue_reference_v) { + return std::move(fn); + } else if constexpr (std::is_lvalue_reference_v || small_trivially_copyable) { + return fn; + } else { + static_assert(false, "Invalid Fn type in fwd_fn"); + } +} + +// =============== Join =============== // + +struct join_type {}; + +/** + * @brief Base class shared by scope_ops and child_scope_ops. + * + * Provides a member `join()` so that `co_await sc.join()` works on any scope type. 
+ */ +struct scope_base { + [[nodiscard("You should immediately co_await this!")]] + static constexpr auto join() noexcept -> join_type { + return {}; + } +}; + +// =============== Scope ops (no embedded stop source) =============== // + +template +struct scope_ops : scope_base { + private: + template + using call_pkg = pkg, store_as_t...>; + + template + using fork_pkg = pkg, store_as_t...>; + + public: + // Default constructible + scope_ops() noexcept = default; + + // Immovable + scope_ops(const scope_ops &) = delete; + scope_ops(scope_ops &&) = delete; + auto operator=(const scope_ops &) -> scope_ops & = delete; + auto operator=(scope_ops &&) -> scope_ops & = delete; + + // === Fork === // + + template Fn> + static constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + + // === Call === // + + template Fn> + static constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + static constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } +}; + +// ==== Scope awaitable ==== // + +template +struct scope_awaitable : std::suspend_never { + static constexpr auto await_resume() noexcept -> scope_ops { return {}; } +}; + +struct scope_type {}; + +export 
[[nodiscard("You should immediately co_await this!")]] +constexpr auto scope() noexcept -> scope_type { + return {}; +} + +// =============== Child scope ops (with embedded stop source) =============== // + +/** + * @brief A scope that is a stop_source. + */ +template +struct child_scope_ops : scope_base, stop_source { + private: + template + using call_pkg = pkg, store_as_t...>; + + template + using fork_pkg = pkg, store_as_t...>; + + public: + /** + * @brief Construct the scope, chaining its stop source onto the parent's token. + */ + explicit constexpr child_scope_ops(stop_source::stop_token parent) noexcept + : stop_source(parent) {} + + // Immovable (stop_source base is immovable) + child_scope_ops(const child_scope_ops &) = delete; + child_scope_ops(child_scope_ops &&) = delete; + auto operator=(const child_scope_ops &) -> child_scope_ops & = delete; + auto operator=(child_scope_ops &&) -> child_scope_ops & = delete; + + // === Fork (binds this scope's stop source as child stop source) === // + + template Fn> + constexpr auto fork(R *ret, Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto fork_drop(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto fork(Fn &&fn, Args &&...args) noexcept -> fork_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + + // === Call (binds this scope's stop source as child stop source) === // + + template Fn> + constexpr auto call(R *ret, Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), .return_addr = ret, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto call_drop(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), 
.return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } + template Fn> + constexpr auto call(Fn &&fn, Args &&...args) noexcept -> call_pkg { + return {.stop_token = token(), .return_addr = {}, .fn = LF_FWD(fn), .args = {LF_FWD(args)...}}; + } +}; + +// =============== child_scope_awaitable =============== // + +template +struct child_scope_awaitable : std::suspend_never { + + stop_source::stop_token parent_stop_token; + + constexpr auto await_resume(this child_scope_awaitable self) noexcept -> child_scope_ops { + return child_scope_ops{self.parent_stop_token}; + } +}; + +struct child_scope_type {}; + +export [[nodiscard("You should immediately co_await this!")]] +constexpr auto child_scope() noexcept -> child_scope_type { + return {}; +} + +} // namespace lf diff --git a/src/core/poly_context.cxx b/src/core/poly_context.cxx new file mode 100644 index 000000000..f651d11b0 --- /dev/null +++ b/src/core/poly_context.cxx @@ -0,0 +1,58 @@ +module; +#include "libfork/__impl/exception.hpp" +export module libfork.core:poly_context; + +import std; + +import :concepts_stack; +import :handles; +import :exception; + +namespace lf { + +export template +class base_context { + public: + auto stack() noexcept -> Stack & { return m_stack; } + + protected: + constexpr base_context() = default; + + template + requires std::constructible_from + explicit(sizeof...(Args) == + 1) constexpr base_context(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : m_stack(std::forward(args)...) {} + + private: + Stack m_stack; +}; + +export struct post_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "derived context does not support posting tasks."; + } +}; + +/** + * @brief A worker context polymorphic in push/pop/post. + * + * This is the canonical/blessed base class in libfork for polymorphic uses + * cases. 
Although possible, libfork does not recommend contexts polymorphic + * in the `.stack` member + */ +export template +class poly_context : public base_context { + public: + using base_context::base_context; + + virtual void push(steal_handle) = 0; + virtual auto pop() noexcept -> steal_handle = 0; + + virtual void post([[maybe_unused]] sched_handle handle) { LF_THROW(post_error{}); } + + virtual ~poly_context() noexcept = default; +}; + +} // namespace lf diff --git a/src/core/promise.cxx b/src/core/promise.cxx new file mode 100644 index 000000000..3ed44def8 --- /dev/null +++ b/src/core/promise.cxx @@ -0,0 +1,228 @@ +module; +#include + +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +#include "libfork/__impl/utils.hpp" +export module libfork.core:promise; + +import std; + +import libfork.utils; + +import :concepts_context; +import :concepts_invocable; +import :frame; +import :stop; +import :task; +import :thread_locals; +import :ops; +import :handles; +import :final_suspend; +import :awaitables; + +// TODO: vet constexpr usage in the library + +namespace lf { + +// =============== Final awaitable =============== // + +struct final_awaitable : std::suspend_always { + template + constexpr static auto await_suspend(coro> handle) noexcept -> coro<> { + return final_suspend_leading(&handle.promise().frame); + } +}; + +// =============== Frame mixin =============== // + +template +struct mixin_frame { + + // === For internal use === // + + using enum category; + + template + requires (!std::is_const_v) + [[nodiscard]] + constexpr auto handle(this Self &self) + LF_HOF(coro::from_promise(self)) + + // === Called by the compiler === // + + // --- Allocation + + static auto operator new(std::size_t sz) noexcept(noexcept(get_tls_stack().push(sz))) -> void * { + void *ptr = get_tls_stack().push(sz); + LF_ASSUME(is_sufficiently_aligned(ptr)); + return std::assume_aligned(ptr); + } + + static auto operator delete(void *p, std::size_t sz) noexcept 
-> void { + get_tls_stack().pop(p, sz); + } + + // --- Await transformations + + template + constexpr auto + await_transform_pkg(this auto const &self, pkg &&pkg) noexcept( + async_nothrow_invocable) -> async_awaitable { + + using U = async_result_t; + + // clang-format off + + promise_type *child_promise = get(key(), std::move(pkg.args).apply( + [&](auto &&...args) LF_HOF(ctx_invoke_t{}(fwd_fn(pkg.fn), LF_FWD(args)...)) + )); + + // clang-format on + + LF_ASSUME(child_promise); + + // void can signal drop return. + static_assert(std::same_as || std::is_void_v); + + if constexpr (!std::is_void_v) { + child_promise->return_address = not_null(pkg.return_addr); + } else if constexpr (!std::is_void_v) { + // Set child's return address to null to inhibit the return + // TODO: add test for this + child_promise->return_address = nullptr; + } + + if constexpr (StopToken) { + // TODO: need some kind of API to launch an unstoppable task? + LF_ASSUME(pkg.stop_token.stop_possible()); + child_promise->frame.stop_token = pkg.stop_token; + } else { + child_promise->frame.stop_token = self.frame.stop_token; + } + + return {.child = &child_promise->frame}; + } + + template + constexpr auto await_transform(this auto &self, pkg &&pkg) noexcept + -> async_awaitable { + LF_TRY { + return self.await_transform_pkg(std::move(pkg)); + } LF_CATCH_ALL { + stash_current_exception(&self.frame); + } + return {.child = nullptr}; + } + + constexpr auto await_transform(this auto &self, join_type) noexcept -> join_awaitable { + return {.frame = &self.frame}; + } + + static constexpr auto await_transform(scope_type) noexcept -> scope_awaitable { return {}; } + + constexpr auto + await_transform(this auto const &self, child_scope_type) noexcept -> child_scope_awaitable { + return {.parent_stop_token = self.frame.stop_token}; + } + + constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; } + + constexpr static auto final_suspend() noexcept -> final_awaitable { return {}; 
} + + constexpr void unhandled_exception(this auto &self) noexcept { + // Stash the exception in the parent which will rethrow at the join. + stash_current_exception(self.frame.parent); + } +}; + +// =============== Promise (void) =============== // + +template +struct promise_type : mixin_frame { + + // Putting init here allows: + // 1. Frame not to need to know about the checkpoint type + // 2. Compiler merge double read of thread local here and in allocator + frame_t frame{get_tls_stack().checkpoint()}; + + constexpr auto get_return_object() noexcept -> task { return {key(), this}; } + + constexpr static void return_void() noexcept {} +}; + +// =============== Promise (non-void) =============== // + +template +struct promise_type : mixin_frame { + + // Putting init here allows: + // 1. Frame not to need to know about the checkpoint type + // 2. Compiler merge double read of thread local here and in allocator + frame_t frame{get_tls_stack().checkpoint()}; + T *return_address; + + constexpr auto get_return_object() noexcept -> task { return {key(), this}; } + + template + requires std::assignable_from + constexpr void return_value(U &&value) noexcept(std::is_nothrow_assignable_v) { + if (return_address) { + *return_address = LF_FWD(value); + } + } +}; + +} // namespace lf + +// =============== std specialization =============== // + +template +struct std::coroutine_traits, Args...> { + using promise_type = ::lf::promise_type; +}; + +template +struct std::coroutine_traits, Self, Args...> { + using promise_type = ::lf::promise_type; +}; + +// =============== Layout invariants =============== // + +namespace { + +struct unit_checkpoint { + auto operator==(unit_checkpoint const &) const -> bool = default; +}; + +struct unit_stack { + static auto push(std::size_t) -> void *; + static auto pop(void *, std::size_t) noexcept -> void; + static auto checkpoint() noexcept -> unit_checkpoint; + static auto prepare_release() noexcept -> int; + static auto release(int) noexcept 
-> void; + static auto acquire(unit_checkpoint) noexcept -> void; +}; + +struct unit_context { + void push(lf::steal_handle); + auto pop() noexcept -> lf::steal_handle; + auto stack() noexcept -> unit_stack &; +}; + +static_assert(lf::worker_context); + +using frame_t = lf::frame_type; + +static_assert(std::is_standard_layout_v); +static_assert(alignof(lf::promise_type) == alignof(frame_t)); +static_assert(alignof(lf::promise_type) == alignof(frame_t)); +static_assert(std::is_standard_layout_v>); +static_assert(std::is_standard_layout_v>); + +#ifdef __cpp_lib_is_pointer_interconvertible +static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type::frame)); +static_assert(std::is_pointer_interconvertible_with_class(&lf::promise_type::frame)); +#endif + +} // namespace diff --git a/src/core/receiver.cxx b/src/core/receiver.cxx new file mode 100644 index 000000000..bfa0f42c7 --- /dev/null +++ b/src/core/receiver.cxx @@ -0,0 +1,214 @@ + +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:receiver; + +import std; + +import :stop; +import :exception; + +namespace lf { + +export struct broken_receiver_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "receiver is in invalid state"; + } +}; + +export struct operation_cancelled_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "operation was cancelled"; + } +}; + +/** + * @brief Shared state between a scheduled task and its receiver handle. 
+ */ +template +struct hidden_receiver_state { + + struct empty_1 {}; + struct empty_2 {}; + + alignas(k_new_align) std::array buffer{}; + + [[no_unique_address]] + std::conditional_t, empty_1, T> return_value{}; + + std::exception_ptr exception; + std::atomic_flag ready; + + [[no_unique_address]] + std::conditional_t stop; + + constexpr hidden_receiver_state() = default; + + template + requires std::constructible_from + constexpr explicit(sizeof...(Args) == 1) + hidden_receiver_state(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : return_value(std::forward(args)...) {} +}; + +/// Convenience alias — used throughout the core partitions. +template +using state_handle = std::shared_ptr>; + +/** + * @brief Lightweight move-only handle owning a pre-allocated root task state. + * + * Construction allocates a `hidden_receiver_state` which embeds a + * 1 KiB aligned buffer; the root coroutine frame is placement-constructed + * into that buffer by `schedule`. + * + * Constructors mirror `make_shared` / `allocate_shared`: + * + * recv_state s; // default-init return value + * recv_state s{v1, v2}; // in-place init: T{v1, v2} + * recv_state s{allocator_arg, alloc}; // default-init, custom allocator + * recv_state s{allocator_arg, alloc, v1, v2}; // in-place init + custom allocator + */ +export template +class recv_state { + using state_type = hidden_receiver_state; + + public: + /// Default: value-initialise via `std::make_shared`. + constexpr recv_state() + : m_ptr(std::make_shared()) {} + + /// Value-init from args: forwards `args` to `hidden_receiver_state`'s constructor + /// (in-place construction of the return value) via `std::make_shared`. + template + requires std::constructible_from + constexpr explicit(sizeof...(Args) == 1) recv_state(Args &&...args) + : m_ptr(std::make_shared(std::forward(args)...)) {} + + /// Allocator-aware, default return value: allocate via `std::allocate_shared`. 
+ template + constexpr recv_state(std::allocator_arg_t, Alloc const &alloc) + : m_ptr(std::allocate_shared(alloc)) {} + + /// Allocator-aware with value-init args. + template + requires std::constructible_from + constexpr recv_state(std::allocator_arg_t, Alloc const &alloc, Args &&...args) + : m_ptr(std::allocate_shared(alloc, std::forward(args)...)) {} + + // Move-only. + constexpr recv_state(recv_state &&) noexcept = default; + constexpr auto operator=(recv_state &&) noexcept -> recv_state & = default; + constexpr recv_state(recv_state const &) = delete; + constexpr auto operator=(recv_state const &) -> recv_state & = delete; + + private: + [[nodiscard]] + friend constexpr auto get(key_t, recv_state &&self) noexcept -> state_handle { + return std::move(self.m_ptr); + } + + state_handle m_ptr; +}; + +export template +class receiver { + + using state_type = hidden_receiver_state; + + public: + constexpr receiver(key_t, state_handle state) noexcept + : m_state(std::move(state)) {} + + // Move only + constexpr receiver(receiver &&) noexcept = default; + constexpr receiver(const receiver &) = delete; + constexpr auto operator=(receiver &&) noexcept -> receiver & = default; + constexpr auto operator=(const receiver &) -> receiver & = delete; + + /** + * @brief Test if connected to a receiver state. + */ + [[nodiscard]] + constexpr auto valid() const noexcept -> bool { + return m_state != nullptr; + } + + /** + * @brief Test if the associated task has completed (either successfully or with an exception/cancellation). + */ + [[nodiscard]] + constexpr auto ready() const -> bool { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + return m_state->ready.test(); + } + + /** + * @brief Wait for the associated task to complete (either successfully or with an exception/cancellation). + * + * May be called multiple times. 
+ */ + constexpr void wait() const { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + m_state->ready.wait(false); + } + + /** + * @brief Get a reference to the stop_source for this task, allowing the caller to request cancellation. + * + * Only available when Stoppable=true. + */ + [[nodiscard]] + constexpr auto stop_source() -> stop_source & + requires Stoppable + { + if (!valid()) { + LF_THROW(broken_receiver_error{}); + } + return m_state->stop; + } + + /** + * @brief Wait for the associated task to complete and return its result, or rethrow. + * + * If the receiver was cancelled this will throw an exception. + * + * This may only be called once; the state is consumed and the receiver becomes invalid. + */ + [[nodiscard]] + constexpr auto get() && -> T { + + wait(); + + // State will be cleaned up on unwind + std::shared_ptr state = std::exchange(m_state, nullptr); + + LF_ASSUME(state != nullptr); + + if (state->exception) { + std::rethrow_exception(state->exception); + } + + if constexpr (Stoppable) { + if (state->stop.stop_requested()) { + LF_THROW(operation_cancelled_error{}); + } + } + + if constexpr (!std::is_void_v) { + return std::move(state->return_value); + } + } + + private: + state_handle m_state; +}; + +} // namespace lf diff --git a/src/core/root.cxx b/src/core/root.cxx new file mode 100644 index 000000000..12c9b955b --- /dev/null +++ b/src/core/root.cxx @@ -0,0 +1,204 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:root; + +import std; + +import :concepts_context; +import :concepts_invocable; +import :frame; +import :promise; +import :receiver; +import :thread_locals; +import :task; +import :exception; + +namespace lf { + +/** + * @brief Thrown if the root coroutine frame is too large for the embedded buffer. 
+ */ +export struct root_alloc_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "root coroutine frame exceeds hidden_receiver_state buffer size"; + } +}; + +struct get_frame_t {}; + +template +struct root_task { + struct promise_type { + + frame_type frame{Checkpoint{}}; + + /// Owns a ref to the hidden_receiver_state hosting this frame's buffer. + std::shared_ptr keep_alive; + + template + constexpr explicit promise_type(state_handle const &recv, Args const &...) noexcept + : keep_alive(recv) {} + + template + static auto + operator new(std::size_t size, state_handle const &recv, Args const &...) -> void * { + + LF_ASSUME(recv != nullptr); + + if (size > recv->buffer.size()) { + LF_THROW(root_alloc_error{}); + } + + return recv->buffer.data(); + } + + /// No-op: the buffer is owned by the hidden_receiver_state, not the frame. + static auto operator delete(void * /*ptr*/, std::size_t /*size*/) noexcept -> void {} + + struct frame_awaitable : std::suspend_never { + frame_type *frame; + [[nodiscard]] + constexpr auto await_resume() const noexcept -> frame_type * { + return frame; + } + }; + + constexpr auto await_transform([[maybe_unused]] get_frame_t tag) noexcept -> frame_awaitable { + return {.frame = &frame}; + } + + struct call_awaitable : std::suspend_always { + frame_type *child; + constexpr auto await_suspend([[maybe_unused]] coro root) const noexcept -> coro<> { + return child->handle(); + } + }; + + constexpr auto await_transform(frame_type *child) noexcept -> call_awaitable { + return {.child = child}; + } + + constexpr auto get_return_object() noexcept -> root_task { return {.promise = this}; } + + constexpr static auto initial_suspend() noexcept -> std::suspend_always { return {}; } + + /** + * @brief Custom final_suspend. + * + * The root coroutine frame lives inside the hidden_receiver_state's embedded + * buffer, so the hidden_receiver_state must outlive the frame teardown. 
+ * + * 1. `std::exchange` the keep-alive shared_ptr into a local on the + * host stack, leaving the promise member null. + * 2. `handle.destroy()` — runs parameter + promise destructors (including + * the now-null `keep_alive`) and our no-op `operator delete`. + * No frame-memory access occurs after the handle returns. + * 3. On return, the stack-local `shared_ptr` dies; if its ref + * was the last, it destroys the hidden_receiver_state cleanly — we are + * no longer executing inside the buffer. + */ + struct final_awaiter : std::suspend_always { + void await_suspend(std::coroutine_handle handle) const noexcept { + std::shared_ptr local = std::exchange(handle.promise().keep_alive, nullptr); + LF_ASSUME(local != nullptr); + handle.destroy(); + // `local` released here — possibly freeing hidden_receiver_state on return. + } + }; + + constexpr static auto final_suspend() noexcept -> final_awaiter { return {}; } + + constexpr static void return_void() noexcept {} + + [[noreturn]] + constexpr void unhandled_exception() noexcept { + // Any exceptions escaping the root task are a bug. + LF_UNREACHABLE(); + } + }; + + promise_type *promise; +}; + +template + requires async_invocable_to +[[nodiscard]] +auto // +root_pkg(state_handle recv, Fn fn, Args... args) -> root_task> { + + // This should be resumed on a valid context. + LF_ASSUME(thread_local_context != nullptr); + + using checkpoint = checkpoint_t; + + // Pointer to this root_task's own frame. + frame_type *root = not_null(co_await get_frame_t{}); + + // Manual "call" invocation of the user-supplied task. + + using result_type = async_result_t; + using promise_type = promise_type; + + promise_type *child = nullptr; + + if (root->stop_requested()) { + // The root task was cancelled before it even started, we can skip + // straight to cleanup. 
+ goto cleanup; + } + + LF_TRY { + // Potentially throwing + child = get(key(), ctx_invoke_t{}(std::move(fn), std::move(args)...)); + } LF_CATCH_ALL { + recv->exception = std::current_exception(); + goto cleanup; + } + + LF_ASSUME(child != nullptr); + + // Propagate parent/stop info to child + child->frame.parent = root; + child->frame.stop_token = root->stop_token; + + LF_ASSUME(child->frame.kind == category::call); + + if constexpr (!std::is_void_v>) { + child->return_address = std::addressof(recv->return_value); + } + + // Begin normal execution of the child task, it will clean itself + // up (i.e. .destroy()) at the final suspend + co_await &child->frame; + + // Now we have been resumed the child is done, it could have completed via: + // + // - Normal return + // - Exception + // - Cancellation (in which case it would have dropped any exceptions) + // + // For symmetry with a normal task we unconditionally propagate exceptions here, + // effectively this is an `await_resume`. + + if constexpr (LF_COMPILER_EXCEPTIONS) { + if (root->exception_bit) { + // The child threw an exception, propagate it to the receiver. + recv->exception = extract_exception(root); + } + } + +cleanup: + // Notify the receiver that the task is done. 
+ recv->ready.test_and_set(); + recv->ready.notify_one(); + + LF_ASSUME(root->steals == 0); + LF_ASSUME(root->joins == k_u16_max); + LF_ASSUME(root->exception_bit == 0); + + co_return; +} + +} // namespace lf diff --git a/src/core/schedule.cxx b/src/core/schedule.cxx new file mode 100644 index 000000000..0a328063f --- /dev/null +++ b/src/core/schedule.cxx @@ -0,0 +1,114 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +#include "libfork/__impl/exception.hpp" +export module libfork.core:schedule; + +import std; + +import :concepts_invocable; +import :concepts_scheduler; +import :frame; +import :stop; +import :thread_locals; +import :promise; +import :root; +import :handles; +import :receiver; +import :exception; + +namespace lf { + +export struct schedule_error final : libfork_exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "schedule called from within a worker thread!"; + } +}; + +template +concept decay_copyable = std::convertible_to>; + +/** + * @brief Schedule a function using a caller-provided `recv_state`. + * + * This will create a root task that stores decayed copies of `Fn` and + * `Args...` in its frame, then post it to the scheduler. The root task must + * then be resumed by a worker which will perform the invocation of `Fn`. + * + * The return address/exception and possibly stop token of the root task are + * bound to the provided `recv_state` and can be observed by the caller via the + * returned `receiver`. + * + * Strongly exception safe. 
+ */ +export template + requires async_invocable_to, R, context_t, std::decay_t...> +[[nodiscard("Fire and forget is an anti-pattern")]] +constexpr auto +schedule(Sch &&sch, recv_state state, Fn &&fn, Args &&...args) -> receiver { + + using context_type = context_t; + + if (thread_local_context != nullptr) { + LF_THROW(schedule_error{}); + } + + state_handle state_ptr = get(key(), std::move(state)); + + LF_ASSUME(state_ptr != nullptr); + + // root_pkg's operator new may throw root_alloc_error if the frame is + // too large; if so, `state_ptr` goes out of scope and destroys the state. + root_task task = root_pkg(state_ptr, std::forward(fn), std::forward(args)...); + + LF_ASSUME(task.promise != nullptr); + + task.promise->frame.kind = category::root; + task.promise->frame.parent = nullptr; + + if constexpr (Stoppable) { + task.promise->frame.stop_token = state_ptr->stop.token(); + } else { + task.promise->frame.stop_token = stop_source::stop_token{}; // non-cancellable root + } + + LF_TRY { + std::forward(sch).post(sched_handle{key(), &task.promise->frame}); + // If ^ didn't throw then the root_task will destroy itself at the final suspend. + } LF_CATCH_ALL { + // Otherwise, if it did throw, we must clean up + task.promise->frame.handle().destroy(); + LF_RETHROW; + } + + return {key(), std::move(state_ptr)}; +} + +template +concept schedulable_return = std::is_void_v || (std::default_initializable && std::movable); + +template +concept default_schedulable = + async_invocable && schedulable_return>; + +template +using async_decay_result_t = async_result_t, Context, std::decay_t...>; + +/** + * @brief Convenience overload: default-constructs a non-cancellable recv_state. + * + * Uses the default allocator (`make_shared`) for all allocations. 
+ */ +export template + requires default_schedulable, context_t, std::decay_t...> +[[nodiscard("Fire and forget is an anti-pattern")]] +constexpr auto +schedule(Sch &&sch, Fn &&fn, Args &&...args) -> receiver, Args...>> { + using result_type = async_decay_result_t, Args...>; + recv_state state; + return schedule( + std::forward(sch), std::move(state), std::forward(fn), std::forward(args)...); +} + +} // namespace lf diff --git a/src/core/stop.cxx b/src/core/stop.cxx new file mode 100644 index 000000000..149b0adde --- /dev/null +++ b/src/core/stop.cxx @@ -0,0 +1,126 @@ +export module libfork.core:stop; + +import std; + +import libfork.utils; + +namespace lf { + +/** + * @brief Similar to a linked-list of std::stop_source but with an embedded stop_state. + */ +export class stop_source { + public: + /** + * @brief Lightweight public handle to a stop_source chain. + * + * A stop_token is a non-owning pointer-sized wrapper around a stop_source. + */ + class stop_token { + public: + /** + * @brief Construct a null (unstoppable) token. + */ + constexpr stop_token() noexcept = default; + + /** + * @brief Returns true if a stop source is associated (stopping is possible). + */ + [[nodiscard]] + constexpr auto stop_possible() const noexcept -> bool { + return m_src != nullptr; + } + + /** + * @brief Returns true if any stop source in the ancestor chain has been stopped. + * + * A null token always returns false. + * + * Complexity: O(chain depth). Every task that creates a child_scope adds one + * node to the chain, so deeply-nested task hierarchies pay proportionally more + * per stop check. + */ + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + return deep_stop_requested(m_src); + } + + private: + friend class stop_source; + + explicit constexpr stop_token(stop_source const *src) noexcept + : m_src(src) {} + + stop_source const *m_src = nullptr; + }; + + /** + * @brief Construct a root stop source with no parent. 
+ */ + constexpr stop_source() noexcept = default; + + /** + * @brief Construct a stop source chained onto the given parent token. + */ + constexpr explicit stop_source(stop_token parent) noexcept + : m_parent(parent.m_src) {} + + // Immovable + constexpr stop_source(const stop_source &) noexcept = delete; + constexpr stop_source(stop_source &&) noexcept = delete; + constexpr auto operator=(const stop_source &) noexcept -> stop_source & = delete; + constexpr auto operator=(stop_source &&) noexcept -> stop_source & = delete; + + /** + * @brief Get a handle to this stop source. + */ + [[nodiscard]] + constexpr auto token() const noexcept -> stop_token { + return stop_token{this}; + } + + /** + * @brief Returns true if any stop source in the ancestor chain has been stopped. + * + * Complexity: O(chain depth). Every task that creates a child_scope adds one + * node to the chain, so deeply-nested task hierarchies pay proportionally more + * per stop check. + */ + [[nodiscard]] + constexpr auto stop_requested() const noexcept -> bool { + return deep_stop_requested(this); + } + + /** + * @brief Request that this stop source (and all its children) stop. + */ + constexpr auto request_stop() noexcept -> void { m_stop.store(1, std::memory_order_release); } + + /** + * @brief Same as `request_stop`, but returns true if this is the first time stop has been requested. + */ + [[nodiscard("You can use request_stop() if you don't need the return value")]] + constexpr auto race_request_stop() noexcept -> bool { + return m_stop.exchange(1, std::memory_order_release) == 0; + } + + private: + /** + * @brief Test if any stop request has been made in the current chain. + * + * Safe to call with a null pointer, in which case it returns false. 
+ */ + [[nodiscard]] + friend constexpr auto deep_stop_requested(stop_source const *src) noexcept -> bool { + for (stop_source const *ptr = src; ptr != nullptr; ptr = ptr->m_parent) { + if (ptr->m_stop.load(std::memory_order_acquire) == 1) { + return true; + } + } + return false; + } + + stop_source const *m_parent = nullptr; + std::atomic m_stop = 0; +}; +} // namespace lf diff --git a/src/core/task.cxx b/src/core/task.cxx new file mode 100644 index 000000000..424910b4e --- /dev/null +++ b/src/core/task.cxx @@ -0,0 +1,59 @@ +export module libfork.core:task; + +import std; + +import libfork.utils; + +import :concepts_context; + +namespace lf { + +/** + * @brief A type returnable from libfork's async functions/coroutines. + * + * This requires that `T` is `void` or a `std::movable` type. + */ +export template +concept returnable = std::is_void_v || (plain_object && std::movable); + +export template +struct env { + explicit constexpr env(key_t) noexcept {} +}; + +// Forward-declare promise_type so task can reference it as a pointer. +template +struct promise_type; + +/** + * @brief The return type for libfork's async functions/coroutines. + * + * This predominantly exists to disambiguate `libfork`s coroutines from other + * coroutines and specify `T` the async function's return type which is + * required to be `void` or a `std::movable` type. + * + * \rst + * + * .. note:: + * + * No consumer of this library should ever touch an instance of this type, + * it is used for specifying the return type of an `async` function only. 
+ * + * \endrst + */ +export template +class task { + public: + using value_type = T; + using context_type = Context; + + constexpr task(key_t, promise_type *promise) noexcept + : m_promise(promise) {} + + private: + friend constexpr auto get(key_t, task t) noexcept -> promise_type * { return t.m_promise; } + + promise_type *m_promise; +}; + +} // namespace lf diff --git a/src/core/thread_locals.cxx b/src/core/thread_locals.cxx new file mode 100644 index 000000000..67a278378 --- /dev/null +++ b/src/core/thread_locals.cxx @@ -0,0 +1,33 @@ +export module libfork.core:thread_locals; + +import libfork.utils; + +import :concepts_context; + +namespace lf { + +/** + * @brief Thread-local pointer to the current worker context. + */ +template +constinit inline thread_local Context *thread_local_context = nullptr; + +// TODO: implications of thread local on constexpr + +/** + * @brief A getter for the current worker context, checks for null in debug. + */ +template +constexpr auto get_tls_context() noexcept -> Context & { + return *not_null(thread_local_context); +} + +/** + * @brief A getter for the current worker context's stack, checks for null in debug. + */ +template +constexpr auto get_tls_stack() noexcept -> stack_t & { + return get_tls_context().stack(); +} + +} // namespace lf diff --git a/src/exception.cpp b/src/exception.cpp new file mode 100644 index 000000000..a05e60c0e --- /dev/null +++ b/src/exception.cpp @@ -0,0 +1,20 @@ +#include + +#include "libfork/__impl/exception.hpp" + +import std; + +namespace lf::impl { + +[[noreturn]] +void terminate_with(char const *message, char const *file, int line) noexcept { + LF_TRY { + std::println(stderr, "{} {}:{}: {}", std::this_thread::get_id(), file, line, message); + } LF_CATCH_ALL { + // Drop exceptions during termination + } + // TODO: can we get a stack trace here? 
+ std::terminate(); +} + +} // namespace lf::impl diff --git a/src/libfork.cxx b/src/libfork.cxx new file mode 100644 index 000000000..25e255874 --- /dev/null +++ b/src/libfork.cxx @@ -0,0 +1,5 @@ +export module libfork; + +export import libfork.core; +export import libfork.batteries; +export import libfork.schedulers; diff --git a/src/schedulers/busy.cxx b/src/schedulers/busy.cxx new file mode 100644 index 000000000..f7bc92955 --- /dev/null +++ b/src/schedulers/busy.cxx @@ -0,0 +1,142 @@ +module; +#include "libfork/__impl/assume.hpp" +#include "libfork/__impl/compiler.hpp" +export module libfork.schedulers:basic_busy_pool; + +import std; + +import libfork.utils; +import libfork.core; +import libfork.batteries; + +namespace lf { + +struct invalid_workers_error : std::exception { + [[nodiscard]] + constexpr auto what() const noexcept -> const char * override { + return "A thread pool must have at least one worker."; + } +}; + +export enum class pool_kind { mono, poly }; + +export template , + simple_allocator Alloc = std::allocator> +class basic_busy_pool { + + using context = std::conditional_t< // + Kind == pool_kind::poly, // + derived_poly_context, // + mono_context // + >; + + public: + using context_type = context::context_type; + + // TODO: sleep when zero work + + explicit basic_busy_pool(std::size_t n = std::thread::hardware_concurrency(), Alloc const &alloc = Alloc()) + : m_contexts(n) { + + // TODO: propagate alloc to m_contexts, m_posted, etc. + (void)alloc; + + if (n < 1) { + LF_THROW(invalid_workers_error{}); + } + + LF_TRY{ + for (std::size_t id = 0; id < n; ++id) { + m_threads.emplace_back([this, id](std::stop_token stop) -> void { + worker(std::move(stop), id); + }); + } + } LF_CATCH_ALL { + // Force joins before members (which threads reference) are destroyed. 
+ join_all(); + LF_RETHROW; + } + } + + basic_busy_pool(basic_busy_pool const &) = delete; + basic_busy_pool(basic_busy_pool &&) = delete; + + auto operator=(basic_busy_pool const &) -> basic_busy_pool & = delete; + auto operator=(basic_busy_pool &&) -> basic_busy_pool & = delete; + + ~basic_busy_pool() { join_all(); } + + void post(sched_handle handle) { + // TODO: use a lock-free queue here + auto lock = std::unique_lock(m_mutex); + m_posted.push_back(handle); + } + + private: + void worker(std::stop_token stop, std::size_t id) { + + LF_ASSUME(id < m_contexts.size()); + + context &ctx = m_contexts[id]; + + std::size_t const n = m_contexts.size(); + + std::default_random_engine rng(safe_cast(id + 1)); + std::uniform_int_distribution dist(0, n - 2); + + constexpr int k_steal_attempts = 1024; + + while (!stop.stop_requested()) { + + if (auto lock = std::unique_lock(m_mutex); !m_posted.empty()) { + sched_handle task = m_posted.back(); + m_posted.pop_back(); + lock.unlock(); + execute(static_cast(ctx), task); + continue; + } + + if (n > 1) { + for (int i = 0; i < k_steal_attempts; ++i) { + + std::size_t victim = dist(rng); + + if (victim >= id) { + victim += 1; + } + + LF_ASSUME(victim < n); + LF_ASSUME(victim != id); + + if (auto result = m_contexts[victim].steal()) { + execute(static_cast(ctx), result); + continue; + } + } + } + } + } + + void join_all() { + m_threads.clear(); // jthread calls stop and joins in destructor + } + + std::vector m_contexts; + std::vector m_threads; + std::mutex m_mutex; + std::vector> m_posted; +}; + +export template , + simple_allocator Alloc = std::allocator> +using mono_busy_pool = basic_busy_pool; + +export template , + simple_allocator Alloc = std::allocator> +using poly_busy_pool = basic_busy_pool; + +} // namespace lf diff --git a/src/schedulers/inline.cxx b/src/schedulers/inline.cxx new file mode 100644 index 000000000..c0d58bd5b --- /dev/null +++ b/src/schedulers/inline.cxx @@ -0,0 +1,51 @@ +export module 
libfork.schedulers:inline_scheduler; + +import std; + +import libfork.core; + +import libfork.batteries; + +namespace lf { + +// TODO: think about initialization: +// - do we need default initializable on stack/context? +// - with allocators + +// TODO: Can we store the context directly in TLS? + +template +concept derived_context_from = worker_context && std::derived_from; + +export template +concept derived_worker_context = + has_context_typedef && derived_context_from>; + +export template +class inline_scheduler { + public: + using context_type = Context::context_type; + + inline_scheduler() = default; + + template + requires std::constructible_from + explicit(sizeof...(Args) == 1) + inline_scheduler(Args &&...args) noexcept(std::is_nothrow_constructible_v) + : m_context(std::forward(args)...) {} + + void post(lf::sched_handle handle) { + execute(static_cast(m_context), handle); + } + + private: + Context m_context; +}; + +export template +using mono_inline_scheduler = inline_scheduler>; + +export template +using poly_inline_scheduler = inline_scheduler>; + +} // namespace lf diff --git a/src/schedulers/schedulers.cxx b/src/schedulers/schedulers.cxx new file mode 100644 index 000000000..0ab5c0a27 --- /dev/null +++ b/src/schedulers/schedulers.cxx @@ -0,0 +1,4 @@ +export module libfork.schedulers; + +export import :inline_scheduler; +export import :basic_busy_pool; diff --git a/src/utils/concepts.cxx b/src/utils/concepts.cxx new file mode 100644 index 000000000..ad1d7a11b --- /dev/null +++ b/src/utils/concepts.cxx @@ -0,0 +1,65 @@ +export module libfork.utils:concepts; + +import std; + +namespace lf { + +// =========== Atomic related concepts =========== // + +export template +concept plain_object = std::is_object_v && std::same_as>; + +/** + * @brief Verify a type is suitable for use with `std::atomic` + * + * This requires a `TriviallyCopyable` type satisfying both `CopyConstructible` and `CopyAssignable`. 
+ */ +export template +concept atomicable = plain_object && // + std::is_trivially_copyable_v && // + std::is_copy_constructible_v && // + std::is_move_constructible_v && // + std::is_copy_assignable_v && // + std::is_move_assignable_v; // + +/** + * @brief A concept that verifies a type is lock-free when used with `std::atomic`. + */ +export template +concept lock_free = atomicable && std::atomic::is_always_lock_free; + +// ========== Specialization ========== // + +template typename Template> +struct is_specialization_of : std::false_type {}; + +template